123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629(* this file is part of Spelll. See `opam` for the license. *)(** {1 Levenshtein distance} *)moduletypeSTRING=sigtypechar_typetvalof_list:char_list->tvalget:t->int->char_vallength:t->intvalcompare_char:char_->char_->intendletlist_of_seqs=letl=Seq.fold_left(funaccx->x::acc)[]sinList.revlmoduletypeS=sigtypechar_typestring_(** {6 Edit Distance} *)valedit_distance:string_->string_->int(** Edition distance between two strings. This satisfies the classical
distance axioms: it is always positive, symmetric, and satisfies
the formula [distance a b + distance b c >= distance a c] *)(** {6 Automaton}
An automaton, built from a string [s] and a limit [n], that accepts
every string that is at distance at most [n] from [s]. *)typeautomaton(** Levenshtein automaton *)valof_string:limit:int->string_->automaton(** Build an automaton from a string, with a maximal distance [limit].
The automaton will accept strings whose {!edit_distance} to the
parameter is at most [limit]. *)valof_list:limit:int->char_list->automaton(** Build an automaton from a list, with a maximal distance [limit] *)valdebug_print:(out_channel->char_->unit)->out_channel->automaton->unit(** Output the automaton's structure on the given channel. *)valmatch_with:automaton->string_->bool(** [match_with a s] matches the string [s] against [a], and returns
[true] if the distance from [s] to the word represented by [a] is smaller
than the limit used to build [a] *)(** {6 Index for one-to-many matching} *)moduleIndex:sigtype'bt(** Index that maps strings to values of type 'b. Internally it is
based on a trie. A string can only map to one value. *)valempty:'bt(** Empty index *)valis_empty:_t->boolvaladd:'bt->string_->'b->'bt(** Add a pair string/value to the index. If a value was already present
for this string it is replaced. *)valremove:'bt->string_->'bt(** Remove a string (and its associated value, if any) from the index. *)valretrieve:limit:int->'bt->string_->'bSeq.t(** Lazy list of objects associated to strings close to the query string *)valretrieve_l:limit:int->'bt->string_->'blist(** List of objects associated to strings close to the query string
@since 0.3 *)valof_list:(string_*'b)list->'bt(** Build an index from a list of pairs of strings and values *)valto_list:'bt->(string_*'b)list(** Extract a list of pairs from an index *)valfold:('a->string_->'b->'a)->'a->'bt->'a(** Fold over the stored pairs string/value *)valiter:(string_->'b->unit)->'bt->unit(** Iterate on the pairs *)valto_seq:'bt->(string_*'b)Seq.t(** Conversion to an iterator
@since 0.3 *)endendmoduleMake(Str:STRING)=structtypestring_=Str.ttypechar_=Str.char_letedit_distances1s2=ifStr.lengths1=0thenStr.lengths2elseifStr.lengths2=0thenStr.lengths1elseifs1=s2then0elsebegin(* distance vectors (v0=previous, v1=current) *)letv0=Array.make(Str.lengths2+1)0inletv1=Array.make(Str.lengths2+1)0in(* initialize v0: v0(i) = A(0)(i) = delete i chars from t *)fori=0toStr.lengths2dov0.(i)<-idone;(* main loop for the bottom up dynamic algorithm *)fori=0toStr.lengths1-1do(* first edit distance is the deletion of i+1 elements from s *)v1.(0)<-i+1;(* try add/delete/replace operations *)forj=0toStr.lengths2-1doletcost=ifStr.compare_char(Str.gets1i)(Str.gets2j)=0then0else1inv1.(j+1)<-min(v1.(j)+1)(min(v0.(j+1)+1)(v0.(j)+cost));done;(* copy v1 into v0 for next iteration *)Array.blitv10v00(Str.lengths2+1);done;v1.(Str.lengths2)endmoduleNDA=structtypechar=|Any|Charofchar_typetransition=|Success|Uponofchar*int*int|Epsilonofint*int(* non deterministic automaton *)typet=transitionlistarrayarrayletlength(nda:t)=Array.lengthndaletrecmem_trtrl=matchtr,lwith|_,[]->false|Success,Success::_->true|Epsilon(i,j),Epsilon(i',j')::_->i=i'&&j=j'|Upon(Any,i,j),Upon(Any,i',j')::_wheni=i'&&j=j'->true|Upon(Charc,i,j),Upon(Charc',i',j')::_whenStr.compare_charcc'=0&&i=i'&&j=j'->true|_,_::l'->mem_trtrl'(* build NDA from the string *)letmake~limits=letlen=Str.lengthsinletm=Array.make_matrix(len+1)(limit+1)[]inletadd_transitionijtr=ifnot(mem_trtrm.(i).(j))thenm.(i).(j)<-tr::m.(i).(j)in(* internal transitions *)fori=0tolen-1doforj=0tolimitdo(* correct char *)add_transitionij(Upon(Char(Str.getsi),i+1,j));(* other transitions *)ifj<limitthenbegin(* substitution *)add_transitionij(Upon(Any,i+1,j+1));(* deletion in indexed string *)add_transitionij(Upon(Any,i,j+1));(* addition to indexed string *)add_transitionij(Epsilon(i+1,j+1));enddonedone;forj=0tolimitdo(* deletions at the end *)ifj<limitthenadd_transitionlenj(Upon(Any,len,j+1));(* win in any case *)add_transitionlenjSuccess;done;mletgetnda(i,j)=nda.(i).(j)letis_finalnda(i,j)=List.exists(functionSuccess->true|_->false)(getnda(i,j))end(** deterministic automaton *)moduleDFA=structtypet={mutabletransitions:(char_*int)listarray;mutableis_final:boolarray;mutableotherwise:intarray;(* transition by default *)mutablelen:int;}letcreatesize={len=0;transitions=Array.makesize[];is_final=Array.makesizefalse;otherwise=Array.makesize~-1;}let_double_array~inita=leta'=Array.make(2*Array.lengtha)initinArray.blita0a'0(Array.lengtha);a'(* add a new state *)letadd_statedfa=letn=dfa.lenin(* resize *)ifn=Array.lengthdfa.transitionsthenbegindfa.transitions<-_double_array~init:[]dfa.transitions;dfa.is_final<-_double_array~init:falsedfa.is_final;dfa.otherwise<-_double_array~init:~-1dfa.otherwise;end;dfa.len<-n+1;nletrec__mem_trtrl=matchtr,lwith|_,[]->false|(c,i),(c',i')::l'->(i=i'&&comparecc'=0)||__mem_trtrl'(* add transition *)letadd_transitiondfaitr=ifnot(__mem_trtrdfa.transitions.(i))thendfa.transitions.(i)<-tr::dfa.transitions.(i)letadd_otherwisedfaij=dfa.otherwise.(i)<-jletset_finaldfai=dfa.is_final.(i)<-true(* set of pairs of ints: used for representing a set of states of the NDA *)moduleNDAStateSet=Set.Make(structtypet=int*intletcompare=Stdlib.compareend)let_set_to_strings=letb=Buffer.create15inBuffer.add_charb'{';NDAStateSet.iter(fun(x,y)->Printf.bprintfb"(%d,%d)"xy)s;Buffer.add_charb'}';Buffer.contentsb(* list of characters that can specifically be followed from the given set *)letchars_from_setndaset=NDAStateSet.fold(funstateacc->lettransitions=NDA.getndastateinList.fold_left(funacctr->matchtrwith|NDA.Upon(NDA.Charc,_,_)->ifList.exists(func'->Str.compare_charcc'=0)accthenaccelsec::acc(* new char! *)|_->acc)acctransitions)set[](* saturate current set w.r.t epsilon links *)letsaturate_epsilonndaset=letq=Queue.create()inNDAStateSet.iter(funs->Queue.pushsq)set;letset=refsetinwhilenot(Queue.is_emptyq)doletstate=Queue.popqin(*Printf.printf "saturate epsilon: add state %d,%d\n" (fst state)(snd state);*)set:=NDAStateSet.addstate!set;List.iter(funtr'->matchtr'with|NDA.Epsilon(i,j)->ifnot(NDAStateSet.mem(i,j)!set)thenQueue.push(i,j)q|_->())(NDA.getndastate)done;!set(* find the transition that matches the given char (if any), or "*";
may raise exceptions Not_found or LeadToSuccess. *)letrecget_transition_for_charndacacctransitions=matchtransitionswith|NDA.Upon(NDA.Charc',i,j)::transitions'whenStr.compare_charcc'=0->(* follow same char *)letacc=NDAStateSet.add(i,j)accinget_transition_for_charndacacctransitions'|NDA.Upon(NDA.Any,i,j)::transitions'->(* follow '*' *)letacc=NDAStateSet.add(i,j)accinget_transition_for_charndacacctransitions'|_::transitions'->get_transition_for_charndacacctransitions'|[]->accletrecget_transitions_for_anyndaacctransitions=matchtransitionswith|NDA.Upon(NDA.Char_,_,_)::transitions'->get_transitions_for_anyndaacctransitions'|NDA.Upon(NDA.Any,i,j)::transitions'->letacc=NDAStateSet.add(i,j)accinget_transitions_for_anyndaacctransitions'|_::transitions'->get_transitions_for_anyndaacctransitions'|[]->acc(* follow transition for given NDA.char, returns a new state
and a boolean indicating whether it's final *)letfollow_transitionndasetc=letset'=NDAStateSet.fold(funstateacc->lettransitions=NDA.getndastatein(* among possible transitions, follow the one that matches c
the most closely *)get_transition_for_charndacacctransitions)setNDAStateSet.emptyinsaturate_epsilonndaset'letfollow_transition_anyndaset=letset'=NDAStateSet.fold(funstateacc->lettransitions=NDA.getndastatein(* among possible transitions, follow the ones that are labelled with "*" *)get_transitions_for_anyndaacctransitions)setNDAStateSet.emptyinsaturate_epsilonndaset'(* call [k] with every [transition'] that can be reached from [set], with
a bool that states whether it's final *)letiterate_transition_setndasetk=(*Printf.printf "iterate_transition at set %s\n" (_set_to_string set);*)(* all possible "fixed char" transitions *)letchars=chars_from_setndasetinList.iter(func->(*Printf.printf "iterate_transition follows %c (at %s)\n"
(Obj.magic c) (_set_to_string set);*)letset'=follow_transitionndasetcinifnot(NDAStateSet.is_emptyset')thenk(NDA.Charc)set';)chars;(* remaining transitions, with only "Any" *)(*Printf.printf "iterate transition follows * (at %s)\n" (_set_to_string set);*)letset'=follow_transition_anyndasetinifnot(NDAStateSet.is_emptyset')thenkNDA.Anyset'moduleStateSetMap=Map.Make(NDAStateSet)(* get the state that corresponds to the given set of NDA states.
[states] is a map [nda set] -> [nfa state] *)letget_statedfastatesset=tryStateSetMap.findset!stateswithNot_found->leti=add_statedfainstates:=StateSetMap.addseti!states;i(* traverse the NDA. Currently we're at [set] *)letrectraversendadfastatesset=letset_i=get_statedfastatessetin(* does this set lead to success? *)letis_final=NDAStateSet.exists(NDA.is_finalnda)setinifis_finalthenset_finaldfaset_i;iterate_transition_setndaset(funcset'->(*Printf.printf "traverse %s --%c--> %s\n" (_set_to_string set)
(match c with NDA.Char c' -> Obj.magic c' | NDA.Any -> '*')
(_set_to_string set');*)letset_i'=get_statedfastatesset'in(* link set -> set' *)matchcwith|NDA.Charc'->add_transitiondfaset_i(c',set_i');traversendadfastatesset'|NDA.Any->add_otherwisedfaset_iset_i';traversendadfastatesset')letof_ndanda=letdfa=create(NDA.lengthnda)in(* map (set of NDA states) to int (state in DFA) *)letstates=refStateSetMap.emptyin(* traverse the NDA to build the NFA *)letset=NDAStateSet.singleton(0,0)inletset=saturate_epsilonndasetintraversendadfastatesset;(*StateSetMap.iter
(fun set i ->
Printf.printf "set %s --> state %d\n" (_set_to_string set) i
) !states;*)dfaletgetdfai=dfa.transitions.(i)letotherwisedfai=dfa.otherwise.(i)letis_finaldfai=dfa.is_final.(i)endletdebug_printpp_charocdfa=Printf.fprintfoc"automaton of %d states\n"dfa.DFA.len;fori=0todfa.DFA.len-1dolettransitions=DFA.getdfaiinifDFA.is_finaldfaithenPrintf.fprintfoc" success %d\n"i;List.iter(fun(c,j)->Printf.fprintfoc" %d --%a--> %d\n"ipp_charcj)transitions;leto=DFA.otherwisedfaiinifo>=0thenPrintf.fprintfoc" %d --*--> %d\n"iodonetypeautomaton=DFA.tletof_string~limits=letnda=NDA.make~limitsinletdfa=DFA.of_ndandaindfaletof_list~limitl=of_string~limit(Str.of_listl)letrec__find_charcl=matchlwith|[]->raiseNot_found|(c',next)::l'->ifcomparecc'=0thennextelse__find_charcl'(* transition for [c] in state [i] of [dfa];
@raise Not_found if no transition matches *)let__transitiondfaic=lettransitions=DFA.getdfaiintry__find_charctransitionswithNot_found->leto=DFA.otherwisedfaiinifo>=0thenoelseraiseNot_foundletmatch_withdfaa=letlen=Str.lengthainletrecsearchistate=(*Printf.printf "at state %d (dist %d)\n" i dist;*)ifi=lenthenDFA.is_finaldfastateelsebegin(* current char *)letc=Str.getaiintryletnext=__transitiondfastatecinsearch(i+1)nextwithNot_found->falseendinsearch00(** {6 Index for one-to-many matching} *)moduleIndex=structtypekey=char_moduleM=Map.Make(structtypet=keyletcompare=Str.compare_charend)type'bt=|Nodeof'boption*'btM.tletempty=Node(None,M.empty)letis_empty=function|Node(None,m)->M.is_emptym|_->falselet()=assert(is_emptyempty)(** get/add/remove the leaf for the given array.
the continuation k takes the leaf, and returns a leaf option
that replaces the old leaf.
This function returns the new trie. *)letgoto_leafsnodek=letlen=Str.lengthsin(* insert the value in given [node], assuming the current index
in [arr] is [i]. [k] is given the resulting tree. *)letrecgotonodeirebuild=matchnodewith|_wheni=len->letnode'=knodeinrebuildnode'|Node(opt,m)->letc=Str.getsiinlett'=tryM.findcmwithNot_found->emptyingotot'(i+1)(funt''->ifis_emptyt''thenrebuild(Node(opt,M.removecm))elserebuild(Node(opt,M.addct''m)))ingotonode0(funt->t)letaddtriesvalue=goto_leafstrie(function|Node(_,m)->Node(Somevalue,m))letremovetries=goto_leafstrie(function|Node(_,m)->Node(None,m))(* traverse the automaton and the idx, yielding a klist of values *)letretrieve~limitidxs=letdfa=of_string~limitsin(* traverse at index i in automaton, with
[fk] the failure continuation *)letrectraversenodei~(fk:'aSeq.t)()=matchnodewith|Node(opt,m)->(* all alternatives: continue exploring [m], or call [fk] *)letfk=M.fold(funcnode'fk->tryletnext=__transitiondfaicintraversenode'next~fkwithNot_found->fk)mfkinmatchoptwith|SomevwhenDFA.is_finaldfai->(* yield one solution now *)Seq.Cons(v,fk)|_->fk()(* fail... or explore subtrees *)intraverseidx0~fk:Seq.emptyletretrieve_l~limitidxs=list_of_seq@@retrieve~limitidxsletof_listl=List.fold_left(funacc(arr,v)->addaccarrv)emptylletfoldfaccidx=letrecexploreacctrailnode=matchnodewith|Node(opt,m)->(* first, yield current value, if any *)letacc=matchoptwith|None->acc|Somev->letstr=Str.of_list(List.revtrail)infaccstrvinM.fold(funcnode'acc->exploreacc(c::trail)node')maccinexploreacc[]idxletiterfidx=fold(fun()strv->fstrv)()idxletto_listidx=fold(funaccstrv->(str,v)::acc)[]idxletto_seqidx=letrectraversenodetrail~(fk:(string_*'a)Seq.t)()=matchnodewith|Node(opt,m)->(* all alternatives: continue exploring [m], or call [fk] *)letfk=M.fold(funcnode'fk->traversenode'(c::trail)~fk)mfkinmatchoptwith|Somev->letstr=Str.of_list(List.revtrail)inSeq.Cons((str,v),fk)|_->fk()(* fail... or explore subtrees *)intraverseidx[]~fk:Seq.emptyendendincludeMake(structtypet=stringtypechar_=charletcompare_char=Char.compareletlength=String.lengthletget=String.getletof_listl=letbuf=Bytes.make(List.lengthl)' 'inList.iteri(funic->Bytes.setbufic)l;Bytes.to_stringbufend)letdebug_print=debug_printoutput_char