## Wednesday, July 9, 2014

### a bit of string matching: Suffix Automaton

I will not have the time to expand the derivation of this data structure yet, but here I will present the algorithm as written in a paper by Blumer et. al. on smallest suffix automaton that recognizes all subwords.

Problem Statement:
Given a text (fixed), build a deterministic finite automaton (DFA) which accepting states are substrings in the given text.

Algorithm: Construction of the DFA (called Suffix Automaton / Directed acyclic word graphs (DAWG) )

Build-DAWG (string s):
1. Create a node u representing the empty state
2. Set root as u, sink points to u
3. For i from 0 to length of s:
3.1 Create a new node called newsink
3.2 Create a primary edge from sink to newsink with label s[i]
3.3 Set currentstate points to sink, and suffixstate as null
3.4 While currentstate is not root and suffixstate is null:
3.4.1 Set currentstate = currentstate.suff
3.4.2 If currentstate has a primary edge labeled s[i]:
3.4.2.1 Set suffixstate to the state that edge points to
3.4.3 Else if currentstate has a secondary edge labeled s[i]:
3.4.3.1 Let child be the state that edge points to
3.4.3.2 Set newstate = Split( currentstate, child )
3.4.3.3 Set suffixstate = newstate
3.4.4 Else:
3.4.4.1 Create a secondary edge from currentstate to newsink with label s[i]
3.5 If suffixstate is still null:
3.5.1 Set suffixstate = root
3.6 Set newsink.suff = suffixstate
3.7 Set sink = newsink

Split (parent, child):
1. Create a new node newstate
2. Set that secondary edge to a primary edge from parent to newstate
3. For all outgoing primary and secondary edge from child:
3.1 Create a similar outgoing secondary edge from newstate pointing to the same state
4. Set newstate.suff = child.suff
5. Reset child.suff = newstate
6. Set currentstate = parent
7. While currentstate is not root:
7.1 Set currentstate = currentstate.suff
7.2 If currentstate has a secondary edge to child:
7.2.1 Reset the secondary edge to point to newstate
7.3 Else:
7.3.1 Break from the loop
8. Return newstate

Here is an implementation (a very inefficient one :D haha) in C++ :

#include <cstdio>
#include <iostream>
#include <vector>
#include <string>
#include <algorithm>
using namespace std;
//Implementation of Suffix Automaton
//First Try, Must be very INEFFICIENT
//Using pointers maybe

struct state;
struct edge;
state* split(state* parent, state* child, vector<edge*>::iterator it);

struct edge
{
state* child;
char label;
};

struct state
{
//information regarding the state
int index;
int pos;
state* suff;

//prim edge
vector<edge*> prim_edge;

//sec edge
vector<edge*> sec_edge;
};

state *root, *sink;
int idx = 1;

void init(string str)
{
//create the root state
root = new state;
root->suff = NULL;
root->prim_edge.clear();
root->sec_edge.clear();
root->index = 0;
root->pos = -1;

//initialize sink
sink = root;

int len = str.size();

for(int i=0;i<len;++i)
{
//edge label
char a = str[i];

state* newsink = new state;
newsink->index = idx++;
newsink->pos = i;
newsink->suff = NULL;

//initialize new edge to the new sink
edge* to_newsink = new edge;
to_newsink->child = newsink;
to_newsink->label = a;

sink->prim_edge.push_back(to_newsink);

state* currstate = sink;

while( currstate->suff != NULL && newsink->suff == NULL)
{
currstate = currstate->suff;

bool finished = false;

//if exist prim_edge of currstate with label == a
for(vector<edge*>::iterator it = currstate->prim_edge.begin(); it != currstate->prim_edge.end(); ++it)
{
if( (*it)->label == a )
{
//update suff link of newsink to currstate
newsink->suff = (*it)->child;
finished = true;
break;
}
}

if( finished ) break;

//if exist secondary edge
for(vector<edge*>::iterator it = currstate->sec_edge.begin(); it != currstate->sec_edge.end(); ++it )
{
if( (*it)->label == a )
{
//split the state
state* child = (*it)->child;

state* clone = split(currstate, child, it);

//update suffix link of newsink to clone
newsink->suff = clone;
finished = true;
break;
}
}

if( finished ) break;

//Else, add secondary edge to newsink and continue
currstate->sec_edge.push_back(to_newsink);
}

//if newsink still does not have suffix link
if (newsink->suff == NULL)
{
//point to source
newsink->suff = root;
}

//update sink
sink = newsink;
}

}

state* split(state* parent, state* child, vector<edge*>::iterator iter)
{
state* clone = new state;
clone->index = idx++;
clone->pos = child->pos;

char label = (*iter)->label;
//copy all outgoing edge

for(vector<edge*>::iterator it = child->prim_edge.begin(); it != child->prim_edge.end();++it)
{
edge* e = new edge;
e->label = (*it)->label;
e->child = (*it)->child;
clone->sec_edge.push_back(e);
}

for(vector<edge*>::iterator it = child->sec_edge.begin(); it != child->sec_edge.end();++it)
{
edge* e = new edge;
e->label = (*it)->label;
e->child = (*it)->child;
clone->sec_edge.push_back(e);
}

//update suffix link of clone
clone->suff = child->suff;

//update suffix link of child
child->suff = clone;

//update parent's sec edge to child to prim edge to clone
parent->sec_edge.erase(iter);

edge* to_clone = new edge;
to_clone->label = label;
to_clone->child = clone;

parent->prim_edge.push_back(to_clone);

//update all edge to child to point to clone
state* currstate = parent;

while( currstate->suff != NULL )
{
currstate = currstate->suff;

bool update = false;
for(vector<edge*>::iterator it = currstate->sec_edge.begin(); it != currstate->sec_edge.end(); ++it)
{
if( (*it)->child == child )
{
(*it)->child = clone;
update = true;
break;
}
}

if( !update ) break;
}

return clone;
}

void print(state* s)
{
printf("state with index: %d\n", s->index);

vector<edge*>::iterator it;

for(it = s->prim_edge.begin(); it != s->prim_edge.end(); ++it)
{
printf( "==%c (%d)" , (*it)->label, (*it)->child->index);
printf("\n");
}

for(it = s->sec_edge.begin(); it != s->sec_edge.end(); ++it)
{
printf( "--%c (%d)" , (*it)->label, (*it)->child->index);
printf("\n");
}

for(it = s->prim_edge.begin(); it != s->prim_edge.end(); ++it)
{
print((*it)->child);
}

for(it = s->sec_edge.begin(); it != s->sec_edge.end(); ++it)
{
print((*it)->child);
}
}

int search(string str)
{
int len = str.size();
state* currstate = root;

for(int i=0;i<len;++i)
{
vector<edge*>::iterator it;
bool found = false;
for(it = currstate->prim_edge.begin(); it!= currstate->prim_edge.end(); ++it)
{
if( (*it)->label == str[i] )
{
currstate = (*it)->child;
found = true;
break;
}
}

if( found ) continue;

for(it = currstate->sec_edge.begin(); it!= currstate->sec_edge.end(); ++it)
{
if( (*it)->label == str[i] )
{
currstate = (*it)->child;
found = true;
break;
}
}

if (!found)
return -1;
}

return currstate->pos;
}

int main()
{
printf("Welcome to SUFFIX AUTOMATON SEARCH\n");

string str;
getline(cin,str);

init(str);

string src;
while (1)
{
printf("Enter your search term:\n");

getline(cin,src);

int pos = search(src);

if(pos < 0)
{