a bit of cs: a bit of string matching: Suffix Automaton

I will not have the time to expand the derivation of this data structure yet, but here I will present the algorithm as written in a paper by Blumer et. al. on smallest suffix automaton that recognizes all subwords.

Problem Statement:
Given a text (fixed), build a deterministic finite automaton (DFA) which accepting states are substrings in the given text.

Algorithm: Construction of the DFA (called Suffix Automaton / Directed acyclic word graphs (DAWG) )

Build-DAWG (string s):
1. Create a node u representing the empty state
2. Set root as u, sink points to u
3. For i from 0 to length of s:
3.1 Create a new node called newsink
3.2 Create a primary edge from sink to newsink with label s[i]
3.3 Set currentstate points to sink, and suffixstate as null
3.4 While currentstate is not root and suffixstate is null:
3.4.1 Set currentstate = currentstate.suff
3.4.2 If currentstate has a primary edge labeled s[i]:
3.4.2.1 Set suffixstate to the state that edge points to
3.4.3 Else if currentstate has a secondary edge labeled s[i]:
3.4.3.1 Let child be the state that edge points to
3.4.3.2 Set newstate = Split( currentstate, child )
3.4.3.3 Set suffixstate = newstate
3.4.4 Else:
3.4.4.1 Create a secondary edge from currentstate to newsink with label s[i]
3.5 If suffixstate is still null:
3.5.1 Set suffixstate = root
3.6 Set newsink.suff = suffixstate
3.7 Set sink = newsink

Split (parent, child):
1. Create a new node newstate
2. Set that secondary edge to a primary edge from parent to newstate
3. For all outgoing primary and secondary edge from child:
3.1 Create a similar outgoing secondary edge from newstate pointing to the same state
4. Set newstate.suff = child.suff
5. Reset child.suff = newstate
6. Set currentstate = parent
7. While currentstate is not root:
7.1 Set currentstate = currentstate.suff
7.2 If currentstate has a secondary edge to child:
7.2.1 Reset the secondary edge to point to newstate
7.3 Else:
7.3.1 Break from the loop
8. Return newstate

Here is an implementation (a very inefficient one :D haha) in C++ :

#include <cstdio>
#include <iostream>
#include <vector>
#include <string>
#include <algorithm>
using namespace std;
//Implementation of Suffix Automaton
//First Try, Must be very INEFFICIENT
//Using pointers maybe

struct state;
struct edge;
state* split(state* parent, state* child, vector<edge*>::iterator it);

struct edge
{
 state* child;
 char label;
};

struct state
{
 //information regarding the state
 int index;
 int pos;
 //suffix link
 state* suff;

 //prim edge
 vector<edge*> prim_edge;

 //sec edge
 vector<edge*> sec_edge;
};

state *root, *sink;
int idx = 1;

void init(string str)
{
 //create the root state
 root = new state;
 root->suff = NULL;
 root->prim_edge.clear();
 root->sec_edge.clear();
 root->index = 0;
 root->pos = -1;

 //initialize sink
 sink = root;

 int len = str.size();

 for(int i=0;i<len;++i)
 {
  //edge label
  char a = str[i];
  
  state* newsink = new state;
  newsink->index = idx++;
  newsink->pos = i;
  newsink->suff = NULL;

  //initialize new edge to the new sink
  edge* to_newsink = new edge;
  to_newsink->child = newsink;
  to_newsink->label = a;

  sink->prim_edge.push_back(to_newsink);

  state* currstate = sink;

  while( currstate->suff != NULL && newsink->suff == NULL) 
  {
   currstate = currstate->suff;

   bool finished = false;

   //if exist prim_edge of currstate with label == a
   for(vector<edge*>::iterator it = currstate->prim_edge.begin(); it != currstate->prim_edge.end(); ++it)
   {
    if( (*it)->label == a )
    {
     //update suff link of newsink to currstate
     newsink->suff = (*it)->child;
     finished = true;
     break;
    }
   }

   if( finished ) break;

   //if exist secondary edge
   for(vector<edge*>::iterator it = currstate->sec_edge.begin(); it != currstate->sec_edge.end(); ++it )
   {
    if( (*it)->label == a )
    { 
     //split the state
     state* child = (*it)->child;

     state* clone = split(currstate, child, it);

     //update suffix link of newsink to clone
     newsink->suff = clone;
     finished = true;
     break;
    }
   }

   if( finished ) break;

   //Else, add secondary edge to newsink and continue
   currstate->sec_edge.push_back(to_newsink);
  }

  //if newsink still does not have suffix link
  if (newsink->suff == NULL)
  {
   //point to source
   newsink->suff = root;
  }

  //update sink
  sink = newsink;
 }

}


state* split(state* parent, state* child, vector<edge*>::iterator iter)
{
 state* clone = new state;
 clone->index = idx++;
 clone->pos = child->pos;

 char label = (*iter)->label;
 //copy all outgoing edge

 for(vector<edge*>::iterator it = child->prim_edge.begin(); it != child->prim_edge.end();++it)
 {
  edge* e = new edge;
  e->label = (*it)->label;
  e->child = (*it)->child;
  clone->sec_edge.push_back(e);
 }

 for(vector<edge*>::iterator it = child->sec_edge.begin(); it != child->sec_edge.end();++it)
 {
  edge* e = new edge;
  e->label = (*it)->label;
  e->child = (*it)->child;
  clone->sec_edge.push_back(e);
 }

 //update suffix link of clone
 clone->suff = child->suff;

 //update suffix link of child
 child->suff = clone;

 //update parent's sec edge to child to prim edge to clone
 parent->sec_edge.erase(iter);

 edge* to_clone = new edge;
 to_clone->label = label;
 to_clone->child = clone;

 parent->prim_edge.push_back(to_clone);

 //update all edge to child to point to clone
 state* currstate = parent;

 while( currstate->suff != NULL )
 {
  currstate = currstate->suff;

  bool update = false;
  for(vector<edge*>::iterator it = currstate->sec_edge.begin(); it != currstate->sec_edge.end(); ++it)
  {
   if( (*it)->child == child )
   {
    (*it)->child = clone;
    update = true;
    break;
   }
  }

  if( !update ) break;
 }

 return clone;
}

void print(state* s)
{
 printf("state with index: %d\n", s->index);

 vector<edge*>::iterator it;

 for(it = s->prim_edge.begin(); it != s->prim_edge.end(); ++it)
 {
  printf( "==%c (%d)" , (*it)->label, (*it)->child->index);
  printf("\n");
 }

 for(it = s->sec_edge.begin(); it != s->sec_edge.end(); ++it)
 {
  printf( "--%c (%d)" , (*it)->label, (*it)->child->index);
  printf("\n");
 }

 for(it = s->prim_edge.begin(); it != s->prim_edge.end(); ++it)
 {
  print((*it)->child);
 }

 for(it = s->sec_edge.begin(); it != s->sec_edge.end(); ++it)
 {
  print((*it)->child);
 }
}

int search(string str)
{
 int len = str.size();
 state* currstate = root;

 for(int i=0;i<len;++i)
 {
  vector<edge*>::iterator it;
  bool found = false;
  for(it = currstate->prim_edge.begin(); it!= currstate->prim_edge.end(); ++it)
  {
   if( (*it)->label == str[i] )
   {
    currstate = (*it)->child;
    found = true;
    break;
   }
  }

  if( found ) continue;

  for(it = currstate->sec_edge.begin(); it!= currstate->sec_edge.end(); ++it)
  {
   if( (*it)->label == str[i] )
   {
    currstate = (*it)->child;
    found = true;
    break;
   }
  }

  if (!found)
   return -1;
 }

 return currstate->pos;
}

int main()
{
 printf("Welcome to SUFFIX AUTOMATON SEARCH\n");
 printf("Enter your string/dictionary\n");

 string str;
 getline(cin,str);

 init(str);

 string src;
 while (1)
 {
  printf("Enter your search term:\n");

  getline(cin,src);

  int pos = search(src);

  if(pos < 0) 
  {
   printf("The term is not found\n");
  }
  else
  {
   printf("The term is found at end-pos: %d\n", pos);
  }
 }
 return 0;
}

a bit of cs

Wednesday, July 9, 2014

a bit of string matching: Suffix Automaton

No comments:

Post a Comment