Tuesday, September 18, 2012

Smallest Repeating Prefix in a non empty String.


/**
-------------------------------------------------------------------
Problem :
-------------------------------------------------------------------
Finding Smallest Repeating Prefix "p" in a non-empty string "s". 
Length of "s" = slen.
Length of "p" = plen.

Following equation should be satisfied
slen = plen * k, where k > 0. 
-------------------------------------------------------------------

-------------------------------------------------------------------
Constraints :
-------------------------------------------------------------------
time  : O(n)
space : O(n)
-------------------------------------------------------------------

-------------------------------------------------------------------
Example :
-------------------------------------------------------------------
[1]
s = ababab
slen = 6
p = ab
plen = 2
k = 3


[2]
s = abh
slen = 3
p = abh
plen = 3
k = 1
-------------------------------------------------------------------
/**/

#include <iostream>
#include <conio.h> 

#include <vector>
#include <string>

using namespace std;


void kmpPreprocess( string &str,  int slen, vector &barray)
{
    int i = 0;
    int j = -1;
    barray[i] = j;
    while(i < slen)
    {
        while( (j >= 0) && (str[i] != str[j]) )
        {
            j = barray[j];
        }
        ++i;
        ++j;
        barray[i] = j;
    }
}

void print_barray( string &s, vector &barray)
{
    int i;
    int slen = s.length();

    cout << endl;
    cout << "index  :  ";
    for(i = 0; i < (slen+1); ++i)
    {
    cout << i << " ";
    }
    cout << endl;

    cout << "string :    ";
    for(i = 0; i < slen; ++i)
    {
    cout << s[i] << " ";
    }
    cout << endl;
    cout << "barray : ";
    for(i = 0; i < (slen+1); ++i)
    {
    cout << barray[i] << " ";
    }
    cout << endl;
}


int using_barray_method_1(string &s, vector &barray)
{        
    string srp_curr = s;    
    int srp_length_curr;
    int srp_length_prev;

    srp_length_curr = srp_curr.length();
    srp_length_prev = srp_length_curr;
    
    while( (srp_length_curr = barray[srp_length_curr]) > 0)
    {    
        if( (srp_length_prev % srp_length_curr) == 0)
        {     
            int k = srp_length_prev / srp_length_curr;

            string chain_of_k_parts;
            chain_of_k_parts.clear();
            while(k--)
            {
                chain_of_k_parts.append(srp_curr.c_str(), srp_length_curr); 
            }

            if(srp_curr.compare(chain_of_k_parts) == 0)
            {
                srp_length_prev = srp_length_curr;                         
                
                srp_curr = srp_curr.substr(0, srp_length_prev);
            }            
        }
    }

    return srp_length_prev;
}


int using_barray_method_2(string &s, vector &barray)
{
    int slen = s.length();

    int srp_length = slen;

    if( (slen % (slen - barray[slen])) == 0 ) 
    {
        srp_length = slen - barray[slen];        
    }

    return srp_length;
}


int find_smallest_repeating_prefix_length(string &s) 
{
    int slen = s.length();

    if(slen == 0)
    {
        return -1;
    }

    vector barray(slen+1);
    
    kmpPreprocess(s, slen, barray);    
    print_barray(s, barray);

        
    int srp_length = -1;

    srp_length = using_barray_method_1(s, barray);
    srp_length = using_barray_method_2(s, barray);
                
    return srp_length;
}


bool check(int slen, int plen, int k)
{
    if( slen == (plen * k) )
    {
        return true;
    }
    else
    {
        return false;
    }
}


void process_string(string &s)
{
    int plen = find_smallest_repeating_prefix_length(s);

    if(plen == -1)
    {
        cout << "error in input string" << endl;
        getch();
        return;
    }

    int i;
    int slen = s.length();    

    int k = slen / plen;    

    if(check(slen, plen, k) == true)
    {
        cout << "[successfully satisfied]  slen == (plen * k)" << endl;
    }
    else
    {
        cout << "[fail to satisfy]  slen != (plen * k)" << endl;
        getch();
        return;
    }

    cout << "s = " << s << endl;
    cout << "slen = " << slen << endl;

    cout << "p = ";
    for(i = 0; i < plen; ++i)
    {
        cout << s[i]; 
    }
    cout << endl;

    cout << "plen = " << plen << endl;    
    cout << "k = " << k << endl;    
}


void main()
{
    string s;
    
    s = "ababab";
    process_string(s);    
    cout << endl << endl;

    s = "abh";
    process_string(s);
    cout << endl << endl;    

    getch();
}


/**
-------------------------------------------------------------------
Output :
-------------------------------------------------------------------

index  :  0 1 2 3 4 5 6
string :    a b a b a b
barray : -1 0 0 1 2 3 4
[successfully satisfied]  slen == (plen * k)
s = ababab
slen = 6
p = ab
plen = 2
k = 3



index  :  0 1 2 3
string :    a b h
barray : -1 0 0 0
[successfully satisfied]  slen == (plen * k)
s = abh
slen = 3
p = abh
plen = 3
k = 1
-------------------------------------------------------------------
/**/


These methods works on following logic :

"barray" stores lengths of various Borders of the string.

Border "b" of a string "s" is a proper prefix which is also the proper suffix of a string "s".

Therefore we can see a border of a string "s" as a prefix "p" which is repeated atleast two times in that string "s".



For more information on Border and how barray is created, please refer Knuth-Morris-Pratt Algorithm.

KMP Algorithm



-----------------------------------------------------------------------------------------------------

Method 1: (using_barray_method_1)

reference : Utilization of KMP Algorithm

-----------------------------------------------------------------------------------------------------

We recursively try to find smallest repeating prefix inside an existing\current smallest repeating prefix.

We start by considering the original string "s" itself as the smallest repeating prefix.

[1] The next smallest prefix "x" (and not the smallest 'REPEATING' prefix) which is repeated in the string "s" can be found

using the "barray" [refer Knuth-Morris-Pratt Algorithm].

[2] Now we check whether this smallest prefix "x" is the smallest repeating prefix in the string "s".

This is done by

[a] checking whether the (length of "x") fully divides (length of "s")

i.e [((strlen(s) % strlen(x)) == 0]

[b] if "x" fully divides "s" then let k = strlen(s) / strlen(x);

[3] Now we try to confirm whether repeating string "x", 'k' number of time give us string "s".

i.e, if string "xxxx...[k times]" == string "s", then we can say string "x" can be the smallest repeating prefix which is repeated 'k' number of times in string "s".

[4] Now if there is a smallest repeating prefix "y" in "x" then it will be our new smallest repeating prefix in "s".

This is true because length of "y" will be smaller then "x".

So we make "s" = "x" and repeat the above steps to find smallest repeating prefix in "x".

-----------------------------------------------------------------------------------------------------



-----------------------------------------------------------------------------------------------------

Method 2: (using_barray_method_2)

-----------------------------------------------------------------------------------------------------

There is one more direct and fast method to find smallest repeating prefix after computing "barray" of string "s".

Came to know about this direct method when discussing this problem on one of the forums Smallest Repeating Prefix

Dateng LIN suggested this method.

-----------------------------------------------------------------------------------------------------