Tuesday, September 18, 2012

Smallest Repeating Prefix in a non empty String.


Below is the implementation of method
int find_smallest_repeating_prefix_length(string &s);


/**
-------------------------------------------------------------------
Problem :
-------------------------------------------------------------------
Finding Smallest Repeating Prefix "p" in a non-empty string "s". 
Length of "s" = slen.
Length of "p" = plen.

Following equation should be satisfied
slen = plen * k, where k > 0. 
-------------------------------------------------------------------

-------------------------------------------------------------------
Constraints :
-------------------------------------------------------------------
time  : O(n)
space : O(n)
-------------------------------------------------------------------

-------------------------------------------------------------------
Example :
-------------------------------------------------------------------
[1]
s = ababab
slen = 6
p = ab
plen = 2
k = 3


[2]
s = abh
slen = 3
p = abh
plen = 3
k = 1
-------------------------------------------------------------------
/**/

#include <iostream>
#include <conio.h> 

#include <vector>
#include <string>

using namespace std;


void kmpPreprocess( string &str,  int slen, vector &barray)
{
    int i = 0;
    int j = -1;
    barray[i] = j;
    while(i < slen)
    {
        while( (j >= 0) && (str[i] != str[j]) )
        {
            j = barray[j];
        }
        ++i;
        ++j;
        barray[i] = j;
    }
}

void print_barray( string &s, vector &barray)
{
    int i;
    int slen = s.length();

    cout << endl;
    cout << "index  :  ";
    for(i = 0; i < (slen+1); ++i)
    {
    cout << i << " ";
    }
    cout << endl;

    cout << "string :    ";
    for(i = 0; i < slen; ++i)
    {
    cout << s[i] << " ";
    }
    cout << endl;
    cout << "barray : ";
    for(i = 0; i < (slen+1); ++i)
    {
    cout << barray[i] << " ";
    }
    cout << endl;
}


int using_barray_method_1(string &s, vector &barray)
{        
    string srp_curr = s;    
    int srp_length_curr;
    int srp_length_prev;

    srp_length_curr = srp_curr.length();
    srp_length_prev = srp_length_curr;
    
    while( (srp_length_curr = barray[srp_length_curr]) > 0)
    {    
        if( (srp_length_prev % srp_length_curr) == 0)
        {     
            int k = srp_length_prev / srp_length_curr;

            string chain_of_k_parts;
            chain_of_k_parts.clear();
            while(k--)
            {
                chain_of_k_parts.append(srp_curr.c_str(), srp_length_curr); 
            }

            if(srp_curr.compare(chain_of_k_parts) == 0)
            {
                srp_length_prev = srp_length_curr;                         
                
                srp_curr = srp_curr.substr(0, srp_length_prev);
            }            
        }
    }

    return srp_length_prev;
}


int using_barray_method_2(string &s, vector &barray)
{
    int slen = s.length();

    int srp_length = slen;

    if( (slen % (slen - barray[slen])) == 0 ) 
    {
        srp_length = slen - barray[slen];        
    }

    return srp_length;
}


int find_smallest_repeating_prefix_length(string &s) 
{
    int slen = s.length();

    if(slen == 0)
    {
        return -1;
    }

    vector barray(slen+1);
    
    kmpPreprocess(s, slen, barray);    
    print_barray(s, barray);

        
    int srp_length = -1;

    srp_length = using_barray_method_1(s, barray);
    srp_length = using_barray_method_2(s, barray);
                
    return srp_length;
}


bool check(int slen, int plen, int k)
{
    if( slen == (plen * k) )
    {
        return true;
    }
    else
    {
        return false;
    }
}


void process_string(string &s)
{
    int plen = find_smallest_repeating_prefix_length(s);

    if(plen == -1)
    {
        cout << "error in input string" << endl;
        getch();
        return;
    }

    int i;
    int slen = s.length();    

    int k = slen / plen;    

    if(check(slen, plen, k) == true)
    {
        cout << "[successfully satisfied]  slen == (plen * k)" << endl;
    }
    else
    {
        cout << "[fail to satisfy]  slen != (plen * k)" << endl;
        getch();
        return;
    }

    cout << "s = " << s << endl;
    cout << "slen = " << slen << endl;

    cout << "p = ";
    for(i = 0; i < plen; ++i)
    {
        cout << s[i]; 
    }
    cout << endl;

    cout << "plen = " << plen << endl;    
    cout << "k = " << k << endl;    
}


void main()
{
    string s;
    
    s = "ababab";
    process_string(s);    
    cout << endl << endl;

    s = "abh";
    process_string(s);
    cout << endl << endl;    

    getch();
}


/**
-------------------------------------------------------------------
Output :
-------------------------------------------------------------------

index  :  0 1 2 3 4 5 6
string :    a b a b a b
barray : -1 0 0 1 2 3 4
[successfully satisfied]  slen == (plen * k)
s = ababab
slen = 6
p = ab
plen = 2
k = 3



index  :  0 1 2 3
string :    a b h
barray : -1 0 0 0
[successfully satisfied]  slen == (plen * k)
s = abh
slen = 3
p = abh
plen = 3
k = 1
-------------------------------------------------------------------
/**/

These methods works on following logic :
"barray" stores lengths of various Borders of the string.
Border "b" of a string "s" is a proper prefix which is also the proper suffix of a string "s".
Therefore we can see a border of a string "s" as a prefix "p" which is repeated atleast two times in that string "s".

For more information on Border and how barray is created, please refer Knuth-Morris-Pratt Algorithm.
KMP Algorithm

-----------------------------------------------------------------------------------------------------
Method 1: (using_barray_method_1)
reference : Utilization of KMP Algorithm
-----------------------------------------------------------------------------------------------------
We recursively try to find smallest repeating prefix inside an existing\current smallest repeating prefix.
We start by considering the original string "s" itself as the smallest repeating prefix.
[1] The next smallest prefix "x" (and not the smallest 'REPEATING' prefix) which is repeated in the string "s" can be found
using the "barray" [refer Knuth-Morris-Pratt Algorithm].
[2] Now we check whether this smallest prefix "x" is the smallest repeating prefix in the string "s".
This is done by
[a] checking whether the (length of "x") fully divides (length of "s")
i.e [((strlen(s) % strlen(x)) == 0]
[b] if "x" fully divides "s" then let k = strlen(s) / strlen(x);
[3] Now we try to confirm whether repeating string "x", 'k' number of time give us string "s".
i.e, if string "xxxx...[k times]" == string "s", then we can say string "x" can be the smallest repeating prefix which is repeated 'k' number of times in string "s".
[4] Now if there is a smallest repeating prefix "y" in "x" then it will be our new smallest repeating prefix in "s".
This is true because length of "y" will be smaller then "x".
So we make "s" = "x" and repeat the above steps to find smallest repeating prefix in "x".
-----------------------------------------------------------------------------------------------------

-----------------------------------------------------------------------------------------------------
Method 2: (using_barray_method_2)
-----------------------------------------------------------------------------------------------------
There is one more direct and fast method to find smallest repeating prefix after computing "barray" of string "s".
Came to know about this direct method when discussing this problem on one of the forums Smallest Repeating Prefix
Dateng LIN suggested this method.
-----------------------------------------------------------------------------------------------------

Monday, March 12, 2012

Substring Matching Algorithm : BMH

Here is a very good reference link and code for an algorithm to find all occurrences of a pattern(sub-string) in a given text.


/**
---------------------------------------------------------------------------
problem :
----------
- find all occurrences of a provided pattern(sub-string) in a given text. 
---------------------------------------------------------------------------
input :
----------
- unsigned char* text : the text string to be searched.
- int n : length of text string.
- unsigned char* pattern : the pattern(sub-string) to be searched in the text.        
- int m : length of pattern.
---------------------------------------------------------------------------
output :
----------
- all position within text from where the pattern can be found.
---------------------------------------------------------------------------
constraints :
-------------
- for each element or letter 't' of text : 0 <= t <= 254.
- for each element or letter 'p' of pattern : 0 <= p <= 254.
- text of pattern cannot contain any letter or element with value 255. 
---------------------------------------------------------------------------
/**/
 
//-------------------------------------------------------------------------
#include <iostream>
#include <conio.h>

#define MAX_ALPHABET_SIZE 256
#define CHARACTER_NOT_IN_THE_TEXT 255

void boyer_moore_horspool(unsigned char* text, int n, unsigned char* pattern, int m)
{
 int d[MAX_ALPHABET_SIZE], i, j, k, lim;

 // Preprocessing
 for(k = 0; k < MAX_ALPHABET_SIZE; ++k) 
 {
  d[k] = m; 
 }

 for(k = 0; k < m; ++k )
 {
  d[pattern[k]] = m - k;
 }

 // To avoid having code
 pattern[m] = CHARACTER_NOT_IN_THE_TEXT; 

 lim = n - m;

 // Searching
 for(k = 0; k <= lim; k += d[text[k + m]]) 
 {
  i = k;
  j = 0;
  while(text[i] == pattern[j])
  {
   ++i;
   ++j;
  }

  if(j == m)
  {
   cout << "pattern found in text at position : " << k << endl;
   if( (k + (m<<1)) >= n)
   {
    break;
   }
  }
 }
}
//-------------------------------------------------------------------------

/**
---------------------------------------------------------------------------
solution :
------------
- complexity : 
average case = O(n) , O(length of text, n).
worst case = O(nm), O([length of text:n]*[length of pattern:m]). 
---------------------------------------------------------------------------
/**/


void main()
{
//positions.....012345678901234567890
 char text[] = "ZZACCABCDBDABCDABCDQQ";
 char pattern[] = "ABCD";
 boyer_moore_horspool(text, strlen(text), pattern, strlen(pattern));

 getch();
}

/**
---------------------------------------------------------------------------
output :
------------
pattern found in text at position : 5
pattern found in text at position : 11
pattern found in text at position : 15
---------------------------------------------------------------------------
/**/

Have used the above code from the following link :
Boyer-Moore-Horspool Algorithm

Friday, March 9, 2012

Substring Matching Algorithm : KMP

Here is the link to a good algorithm for finding all occurrences
of a sub-string(or pattern) in a given text(or string).

Knuth-Morris-Pratt Algorithm.

Monday, February 20, 2012

finding duplicate entry in an array.


/**
---------------------------------------------------------------------------
problem : 
----------
- find first duplicate value in the given array without modifying the array.
---------------------------------------------------------------------------
input : 
----------
- int *a : array of integers
- int l : number of elements in a
- int m : loose upper bound for value of an element of array a.  
        : 0 <= value_of_element_of_a < m
- relation between l and m : l > m
---------------------------------------------------------------------------
output : 
----------
- first duplicate value occuring in the array.
---------------------------------------------------------------------------
constraints : 
------------- 
- space requirement should be O(1)
- after the function returns the array should be same as original.
---------------------------------------------------------------------------
/**/

//-------------------------------------------------------------------------
int find_firstduplicate(int *a, int l, int m)
{
 int original = 0;
 int index = 0;
 int is_index_already_used = 0;
 const int adjustment_for_zero_value = 1;
 int duplicate = -1;

 for(index = 0; index <= m; ++index)
 {
  is_index_already_used = 0;
  original = a[index];

  if(a[index] < 0)
  {
   //means there is already an element with value = index
   is_index_already_used = 1;

   //reversing the step of procedure to handle zero value
   a[index] += adjustment_for_zero_value;

   //original value of the element
   original = abs(a[index]);
  }
  
  if(a[original] >= 0)
  {
   //negating
   a[original] = (- a[original]);
   
   //substracting 1, to handle zero value, since (minus ZERO) = (ZERO) 
   a[original] = a[original] - adjustment_for_zero_value;

   //restoring the fact that index is used.
   a[index] -= is_index_already_used;   
  }
  else
  {
   //duplicate found.
   duplicate = original;
   break;   
  }
 }

 //readjusting the values of array so that they resemble the original.
 for(index = 0; index <= m; ++index)
 {
  if(a[index] < 0)
  {   
   a[index] += adjustment_for_zero_value;
   a[index] = - a[index];
  }
 } 

 return duplicate;
}
//-------------------------------------------------------------------------

/**
---------------------------------------------------------------------------
solution :
- hint : use the value of element as an index 'i' and negate the value at  
index i.
- complexity : O(m) 
---------------------------------------------------------------------------
/**/

void main()
{
 int m = 5;
 int a[] = {1, 0, 3, 0, 4, 5};
 int l = sizeof(a) / sizeof(a[0]);
 int duplicate = find_firstduplicate(a, l, m);
 if(duplicate != -1)
 {
  cout << "first duplicate = " << duplicate << endl;
 }
 else
 {
  cout << "duplicate does not exist." << endl;
 }
 getch();
}

/**
output :
--------
first duplicate = 0
/**/