Problem Statement:
Given a text (fixed), build a deterministic finite automaton (DFA) which accepting states are substrings in the given text.
Algorithm: Construction of the DFA (called Suffix Automaton / Directed acyclic word graphs (DAWG) )
Build-DAWG (string s):
1. Create a node u representing the empty state
2. Set root as u, sink points to u
3. For i from 0 to length of s:
3.1 Create a new node called newsink
3.2 Create a primary edge from sink to newsink with label s[i]
3.3 Set currentstate points to sink, and suffixstate as null
3.4 While currentstate is not root and suffixstate is null:
3.4.1 Set currentstate = currentstate.suff
3.4.2 If currentstate has a primary edge labeled s[i]:
3.4.2.1 Set suffixstate to the state that edge points to
3.4.3 Else if currentstate has a secondary edge labeled s[i]:
3.4.3.1 Let child be the state that edge points to
3.4.3.2 Set newstate = Split( currentstate, child )
3.4.3.3 Set suffixstate = newstate
3.4.4 Else:
3.4.4.1 Create a secondary edge from currentstate to newsink with label s[i]
3.5 If suffixstate is still null:
3.5.1 Set suffixstate = root
3.6 Set newsink.suff = suffixstate
3.7 Set sink = newsink
Split (parent, child):
1. Create a new node newstate
2. Set that secondary edge to a primary edge from parent to newstate
3. For all outgoing primary and secondary edge from child:
3.1 Create a similar outgoing secondary edge from newstate pointing to the same state
4. Set newstate.suff = child.suff
5. Reset child.suff = newstate
6. Set currentstate = parent
7. While currentstate is not root:
7.1 Set currentstate = currentstate.suff
7.2 If currentstate has a secondary edge to child:
7.2.1 Reset the secondary edge to point to newstate
7.3 Else:
7.3.1 Break from the loop
8. Return newstate
Here is an implementation (a very inefficient one :D haha) in C++ :
#include <cstdio> #include <iostream> #include <vector> #include <string> #include <algorithm> using namespace std; //Implementation of Suffix Automaton //First Try, Must be very INEFFICIENT //Using pointers maybe struct state; struct edge; state* split(state* parent, state* child, vector<edge*>::iterator it); struct edge { state* child; char label; }; struct state { //information regarding the state int index; int pos; //suffix link state* suff; //prim edge vector<edge*> prim_edge; //sec edge vector<edge*> sec_edge; }; state *root, *sink; int idx = 1; void init(string str) { //create the root state root = new state; root->suff = NULL; root->prim_edge.clear(); root->sec_edge.clear(); root->index = 0; root->pos = -1; //initialize sink sink = root; int len = str.size(); for(int i=0;i<len;++i) { //edge label char a = str[i]; state* newsink = new state; newsink->index = idx++; newsink->pos = i; newsink->suff = NULL; //initialize new edge to the new sink edge* to_newsink = new edge; to_newsink->child = newsink; to_newsink->label = a; sink->prim_edge.push_back(to_newsink); state* currstate = sink; while( currstate->suff != NULL && newsink->suff == NULL) { currstate = currstate->suff; bool finished = false; //if exist prim_edge of currstate with label == a for(vector<edge*>::iterator it = currstate->prim_edge.begin(); it != currstate->prim_edge.end(); ++it) { if( (*it)->label == a ) { //update suff link of newsink to currstate newsink->suff = (*it)->child; finished = true; break; } } if( finished ) break; //if exist secondary edge for(vector<edge*>::iterator it = currstate->sec_edge.begin(); it != currstate->sec_edge.end(); ++it ) { if( (*it)->label == a ) { //split the state state* child = (*it)->child; state* clone = split(currstate, child, it); //update suffix link of newsink to clone newsink->suff = clone; finished = true; break; } } if( finished ) break; //Else, add secondary edge to newsink and continue currstate->sec_edge.push_back(to_newsink); } //if newsink still does not have suffix link if (newsink->suff == NULL) { //point to source newsink->suff = root; } //update sink sink = newsink; } } state* split(state* parent, state* child, vector<edge*>::iterator iter) { state* clone = new state; clone->index = idx++; clone->pos = child->pos; char label = (*iter)->label; //copy all outgoing edge for(vector<edge*>::iterator it = child->prim_edge.begin(); it != child->prim_edge.end();++it) { edge* e = new edge; e->label = (*it)->label; e->child = (*it)->child; clone->sec_edge.push_back(e); } for(vector<edge*>::iterator it = child->sec_edge.begin(); it != child->sec_edge.end();++it) { edge* e = new edge; e->label = (*it)->label; e->child = (*it)->child; clone->sec_edge.push_back(e); } //update suffix link of clone clone->suff = child->suff; //update suffix link of child child->suff = clone; //update parent's sec edge to child to prim edge to clone parent->sec_edge.erase(iter); edge* to_clone = new edge; to_clone->label = label; to_clone->child = clone; parent->prim_edge.push_back(to_clone); //update all edge to child to point to clone state* currstate = parent; while( currstate->suff != NULL ) { currstate = currstate->suff; bool update = false; for(vector<edge*>::iterator it = currstate->sec_edge.begin(); it != currstate->sec_edge.end(); ++it) { if( (*it)->child == child ) { (*it)->child = clone; update = true; break; } } if( !update ) break; } return clone; } void print(state* s) { printf("state with index: %d\n", s->index); vector<edge*>::iterator it; for(it = s->prim_edge.begin(); it != s->prim_edge.end(); ++it) { printf( "==%c (%d)" , (*it)->label, (*it)->child->index); printf("\n"); } for(it = s->sec_edge.begin(); it != s->sec_edge.end(); ++it) { printf( "--%c (%d)" , (*it)->label, (*it)->child->index); printf("\n"); } for(it = s->prim_edge.begin(); it != s->prim_edge.end(); ++it) { print((*it)->child); } for(it = s->sec_edge.begin(); it != s->sec_edge.end(); ++it) { print((*it)->child); } } int search(string str) { int len = str.size(); state* currstate = root; for(int i=0;i<len;++i) { vector<edge*>::iterator it; bool found = false; for(it = currstate->prim_edge.begin(); it!= currstate->prim_edge.end(); ++it) { if( (*it)->label == str[i] ) { currstate = (*it)->child; found = true; break; } } if( found ) continue; for(it = currstate->sec_edge.begin(); it!= currstate->sec_edge.end(); ++it) { if( (*it)->label == str[i] ) { currstate = (*it)->child; found = true; break; } } if (!found) return -1; } return currstate->pos; } int main() { printf("Welcome to SUFFIX AUTOMATON SEARCH\n"); printf("Enter your string/dictionary\n"); string str; getline(cin,str); init(str); string src; while (1) { printf("Enter your search term:\n"); getline(cin,src); int pos = search(src); if(pos < 0) { printf("The term is not found\n"); } else { printf("The term is found at end-pos: %d\n", pos); } } return 0; }
No comments:
Post a Comment