123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686(*
RE - A regular expression library
Copyright (C) 2001 Jerome Vouillon
email: Jerome.Vouillon@pps.jussieu.fr
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation, with
linking exception; either version 2.1 of the License, or (at
your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*)moduleCset=Re_csettypesem=[`Longest|`Shortest|`First]typerep_kind=[`Greedy|`Non_greedy]typecategory=inttypemark=inttypeidx=intmodulePmark:sigtypet=privateintvalequal:t->t->boolvalcompare:t->t->intvalgen:unit->tvalpp:Format.formatter->t->unitend=structtypet=intletequal(x:int)(y:int)=x=yletcompare(x:int)(y:int)=comparexyletr=ref0letgen()=incrr;!rletpp=Format.pp_print_intendtypeexpr={id:int;def:def}anddef=CstofCset.t|Altofexprlist|Seqofsem*expr*expr|Eps|Repofrep_kind*sem*expr|Markofint|Eraseofint*int|Beforeofcategory|Afterofcategory|PmarkofPmark.tmodulePmarkSet=Set.Make(Pmark)lethash_combinehaccu=accu*65599+hmoduleMarks=structtypet={marks:(int*int)list;pmarks:PmarkSet.t}letempty={marks=[];pmarks=PmarkSet.empty}letrecmerge_marks_offsetold=function|[]->old|(i,v)::rem->letnw'=merge_marks_offset(List.remove_assqiold)reminifv=-2thennw'else(i,v)::nw'letmergeoldnw={marks=merge_marks_offsetold.marksnw.marks;pmarks=PmarkSet.unionold.pmarksnw.pmarks}letrechash_marks_offsetlaccu=matchlwith[]->accu|(a,i)::r->hash_marks_offsetr(hash_combinea(hash_combineiaccu))lethashmaccu=hash_marks_offsetm.marks(hash_combine(Hashtbl.hashm.pmarks)accu)letrecmarks_set_idxidx=function|(a,-1)::rem->(a,idx)::marks_set_idxidxrem|marks->marksletmarks_set_idxmarksidx={markswithmarks=marks_set_idxidxmarks.marks}letpp_markscht=matcht.markswith|[]->()|(a,i)::r->Format.fprintfch"%d-%d"ai;List.iter(fun(a,i)->Format.fprintfch" %d-%d"ai)rend(****)letpp_semchk=Format.pp_print_stringch(matchkwith`Shortest->"short"|`Longest->"long"|`First->"first")letpp_rep_kindfmt=function|`Greedy->Format.pp_print_stringfmt"Greedy"|`Non_greedy->Format.pp_print_stringfmt"Non_greedy"letrecppche=letopenRe_fmtinmatche.defwithCstl->sexpch"cst"Cset.ppl;|Altl->sexpch"alt"(listpp)l|Seq(k,e,e')->sexpch"seq"(triplepp_sempppp)(k,e,e')|Eps->strch"eps"|Rep(_rk,k,e)->sexpch"rep"(pairpp_sempp)(k,e)|Marki->sexpch"mark"inti|Pmarki->sexpch"pmark"int(i:>int)|Erase(b,e)->sexpch"erase"(pairintint)(b,e)|Beforec->sexpch"before"intc|Afterc->sexpch"after"intc(****)letrecfirstf=function|[]->None|x::r->matchfxwithNone->firstfr|Some_asres->res(****)typeids=intrefletcreate_ids()=ref0leteps_expr={id=0;def=Eps}letmk_expridsdef=incrids;{id=!ids;def=def}letemptyids=mk_exprids(Alt[])letcstidss=ifRe_cset.is_emptysthenemptyidselsemk_exprids(Csts)letaltids=function|[]->emptyids|[c]->c|l->mk_exprids(Altl)letseqidskindxy=matchx.def,y.defwithAlt[],_->x|_,Alt[]->y|Eps,_->y|_,Epswhenkind=`First->x|_->mk_exprids(Seq(kind,x,y))letis_epsexpr=matchexpr.defwith|Eps->true|_->falseletepsids=mk_expridsEpsletrepidskindsemx=mk_exprids(Rep(kind,sem,x))letmarkidsm=mk_exprids(Markm)letpmarkidsi=mk_exprids(Pmarki)leteraseidsmm'=mk_exprids(Erase(m,m'))letbeforeidsc=mk_exprids(Beforec)letafteridsc=mk_exprids(Afterc)(****)letrecrenameidsx=matchx.defwithCst_|Eps|Mark_|Pmark_|Erase_|Before_|After_->mk_expridsx.def|Altl->mk_exprids(Alt(List.map(renameids)l))|Seq(k,y,z)->mk_exprids(Seq(k,renameidsy,renameidsz))|Rep(g,k,y)->mk_exprids(Rep(g,k,renameidsy))(****)typehash=inttypemark_infos=intarraytypestatus=Failed|Matchofmark_infos*PmarkSet.t|RunningmoduleE=structtypet=|TSeqoftlist*expr*sem|TExpofMarks.t*expr|TMatchofMarks.tletrecequall1l2=matchl1,l2with|[],[]->true|TSeq(l1',e1,_)::r1,TSeq(l2',e2,_)::r2->e1.id=e2.id&&equall1'l2'&&equalr1r2|TExp(marks1,e1)::r1,TExp(marks2,e2)::r2->e1.id=e2.id&&marks1=marks2&&equalr1r2|TMatchmarks1::r1,TMatchmarks2::r2->marks1=marks2&&equalr1r2|_->falseletrechashlaccu=matchlwith|[]->accu|TSeq(l',e,_)::r->hashr(hash_combine0x172a1bce(hash_combinee.id(hashl'accu)))|TExp(marks,e)::r->hashr(hash_combine0x2b4c0d77(hash_combinee.id(Marks.hashmarksaccu)))|TMatchmarks::r->hashr(hash_combine0x1c205ad5(Marks.hashmarksaccu))lettexpmarksx=TExp(marks,x)lettseqkindxyrem=matchxwith[]->rem|[TExp(marks,{def=Eps;_})]->TExp(marks,y)::rem|_->TSeq(x,y,kind)::remletrecprint_state_recchey=matchewith|TMatchmarks->Format.fprintfch"@[<2>(Match@ %a)@]"Marks.pp_marksmarks|TSeq(l',x,_kind)->Format.fprintfch"@[<2>(Seq@ ";print_state_lstchl'x;Format.fprintfch" %a)@]"ppx|TExp(marks,{def=Eps;_})->Format.fprintfch"(Exp %d (%a) (eps))"y.idMarks.pp_marksmarks|TExp(marks,x)->Format.fprintfch"(Exp %d (%a) %a)"x.idMarks.pp_marksmarksppxandprint_state_lstchly=matchlwith[]->Format.fprintfch"()"|e::rem->print_state_recchey;List.iter(fune->Format.fprintfch" | ";print_state_recchey)remletppcht=print_state_lstch[t]{id=0;def=Eps}endmoduleState=structtypet={idx:idx;category:category;desc:E.tlist;mutablestatus:statusoption;hash:hash}letdummy={idx=-1;category=-1;desc=[];status=None;hash=-1}lethashidxcatdesc=E.hashdesc(hash_combineidx(hash_combinecat0))land0x3FFFFFFFletmkidxcatdesc={idx;category=cat;desc;status=None;hash=hashidxcatdesc}letcreatecate=mk0cat[E.TExp(Marks.empty,e)]letequalxy=(x.hash:int)=y.hash&&(x.idx:int)=y.idx&&(x.category:int)=y.category&&E.equalx.descy.descletcomparexy=letc=compare(x.hash:int)y.hashinifc<>0thencelseletc=compare(x.category:int)y.categoryinifc<>0thencelsecomparex.descy.desctypet'=tmoduleTable=Hashtbl.Make(structtypet=t'letequal=equallethasht=t.hashend)end(**** Find a free index ****)typeworking_area=boolarrayrefletcreate_working_area()=ref[|false|]letindex_countw=Array.length!wletreset_tablea=Array.filla0(Array.lengtha)falseletrecmark_used_indicestbl=List.iter(function|E.TSeq(l,_,_)->mark_used_indicestbll|E.TExp(marks,_)|E.TMatchmarks->List.iter(fun(_,i)->ifi>=0thentbl.(i)<-true)marks.Marks.marks)letrecfind_freetblidxlen=ifidx=len||nottbl.(idx)thenidxelsefind_freetbl(idx+1)lenletfree_indextbl_refl=lettbl=!tbl_refinreset_tabletbl;mark_used_indicestbll;letlen=Array.lengthtblinletidx=find_freetbl0leninifidx=lenthentbl_ref:=Array.make(2*len)false;idx(**** Computation of the next state ****)letremove_matches=List.filter(functionE.TMatch_->false|_->true)letrecsplit_at_match_recl'=function|[]->assertfalse|E.TMatch_::r->(List.revl',remove_matchesr)|x::r->split_at_match_rec(x::l')rletsplit_at_matchl=split_at_match_rec[]lletrecremove_duplicatesprevly=matchlwith[]->([],prev)|E.TMatch_asx::_->(* Truncate after first match *)([x],prev)|E.TSeq(l',x,kind)::r->let(l'',prev')=remove_duplicatesprevl'xinlet(r',prev'')=remove_duplicatesprev'ryin(E.tseqkindl''xr',prev'')|E.TExp(_marks,{def=Eps;_})ase::r->ifList.memqy.idprevthenremove_duplicatesprevryelselet(r',prev')=remove_duplicates(y.id::prev)ryin(e::r',prev')|E.TExp(_marks,x)ase::r->ifList.memqx.idprevthenremove_duplicatesprevryelselet(r',prev')=remove_duplicates(x.id::prev)ryin(e::r',prev')letrecset_idxidx=function|[]->[]|E.TMatchmarks::r->E.TMatch(Marks.marks_set_idxmarksidx)::set_idxidxr|E.TSeq(l',x,kind)::r->E.TSeq(set_idxidxl',x,kind)::set_idxidxr|E.TExp(marks,x)::r->E.TExp((Marks.marks_set_idxmarksidx),x)::set_idxidxrletfilter_marksbemarks={markswithMarks.marks=List.filter(fun(i,_)->i<b||i>e)marks.Marks.marks}letrecdelta_1marksccat'catxrem=(*Format.eprintf "%d@." x.id;*)matchx.defwithCsts->ifCset.memcsthenE.texpmarkseps_expr::remelserem|Altl->delta_2marksccat'catlrem|Seq(kind,y,z)->lety'=delta_1marksccat'caty[]indelta_seqccat'catkindy'zrem|Rep(rep_kind,kind,y)->lety'=delta_1marksccat'caty[]inlet(y'',marks')=matchfirst(functionE.TMatchmarks->Somemarks|_->None)y'withNone->(y',marks)|Somemarks'->(remove_matchesy',marks')inbeginmatchrep_kindwith`Greedy->E.tseqkindy''x(E.TMatchmarks'::rem)|`Non_greedy->E.TMatchmarks::E.tseqkindy''xremend|Eps->E.TMatchmarks::rem|Marki->letmarks={markswithMarks.marks=(i,-1)::List.remove_assqimarks.Marks.marks}inE.TMatchmarks::rem|Pmarki->letmarks={markswithMarks.pmarks=PmarkSet.addimarks.Marks.pmarks}inE.TMatchmarks::rem|Erase(b,e)->E.TMatch(filter_marksbemarks)::rem|Beforecat''->ifcatlandcat''<>0thenE.TMatchmarks::remelserem|Aftercat''->ifcat'landcat''<>0thenE.TMatchmarks::remelseremanddelta_2marksccat'catlrem=matchlwith[]->rem|y::r->delta_1marksccat'caty(delta_2marksccat'catrrem)anddelta_seqccat'catkindyzrem=matchfirst(functionE.TMatchmarks->Somemarks|_->None)ywithNone->E.tseqkindyzrem|Somemarks->matchkindwith`Longest->E.tseqkind(remove_matchesy)z(delta_1marksccat'catzrem)|`Shortest->delta_1marksccat'catz(E.tseqkind(remove_matchesy)zrem)|`First->let(y',y'')=split_at_matchyinE.tseqkindy'z(delta_1marksccat'catz(E.tseqkindy''zrem))letrecdelta_3ccat'catxrem=matchxwithE.TSeq(y,z,kind)->lety'=delta_4ccat'caty[]indelta_seqccat'catkindy'zrem|E.TExp(marks,e)->delta_1marksccat'caterem|E.TMatch_->x::remanddelta_4ccat'catlrem=matchlwith[]->rem|y::r->delta_3ccat'caty(delta_4ccat'catrrem)letdeltatbl_refcat'charst=let(expr',_)=remove_duplicates[](delta_4charst.State.categorycat'st.State.desc[])eps_exprinletidx=free_indextbl_refexpr'inletexpr''=set_idxidxexpr'inState.mkidxcat'expr''(****)letrecred_tr=function|[]|[_]asl->l|((s1,st1)astr1)::((s2,st2)astr2)::rem->ifState.equalst1st2thenred_tr((Cset.unions1s2,st1)::rem)elsetr1::red_tr(tr2::rem)letsimpl_trl=List.sort(fun(s1,_)(s2,_)->compares1s2)(red_tr(List.sort(fun(_,st1)(_,st2)->State.comparest1st2)l))(****)letprepend_deriv=List.fold_right(fun(s,x)l->Cset.prependsxl)letrecrestricts=function|[]->[]|(s',x')::rem->lets''=Cset.interss'inifCset.is_emptys''thenrestrictsremelse(s'',x')::restrictsremletrecremove_marksberem=ifb>ethenremelseremove_marksb(e-1)((e,-2)::rem)letrecprepend_marks_exprm=function|E.TSeq(l,e',s)->E.TSeq(prepend_marks_expr_lstml,e',s)|E.TExp(m',e')->E.TExp(Marks.mergemm',e')|E.TMatchm'->E.TMatch(Marks.mergemm')andprepend_marks_expr_lstml=List.map(prepend_marks_exprm)lletprepend_marksm=List.map(fun(s,x)->(s,prepend_marks_expr_lstmx))letrecderiv_1all_charscategoriesmarkscatxrem=matchx.defwith|Csts->Cset.prepends[E.texpmarkseps_expr]rem|Altl->deriv_2all_charscategoriesmarkscatlrem|Seq(kind,y,z)->lety'=deriv_1all_charscategoriesmarkscaty[(all_chars,[])]inderiv_seqall_charscategoriescatkindy'zrem|Rep(rep_kind,kind,y)->lety'=deriv_1all_charscategoriesmarkscaty[(all_chars,[])]inList.fold_right(fun(s,z)rem->let(z',marks')=matchfirst(functionE.TMatchmarks->Somemarks|_->None)zwithNone->(z,marks)|Somemarks'->(remove_matchesz,marks')inCset.prepends(matchrep_kindwith`Greedy->E.tseqkindz'x[E.TMatchmarks']|`Non_greedy->E.TMatchmarks::E.tseqkindz'x[])rem)y'rem|Eps->Cset.prependall_chars[E.TMatchmarks]rem|Marki->Cset.prependall_chars[E.TMatch{markswithMarks.marks=((i,-1)::List.remove_assqimarks.Marks.marks)}]rem|Pmark_->Cset.prependall_chars[E.TMatchmarks]rem|Erase(b,e)->Cset.prependall_chars[E.TMatch{markswithMarks.marks=(remove_marksbe(filter_marksbemarks).Marks.marks)}]rem|Beforecat'->Cset.prepend(List.assqcat'categories)[E.TMatchmarks]rem|Aftercat'->ifcatlandcat'<>0thenCset.prependall_chars[E.TMatchmarks]remelseremandderiv_2all_charscategoriesmarkscatlrem=matchlwith[]->rem|y::r->deriv_1all_charscategoriesmarkscaty(deriv_2all_charscategoriesmarkscatrrem)andderiv_seqall_charscategoriescatkindyzrem=ifList.exists(fun(_s,xl)->List.exists(functionE.TMatch_->true|_->false)xl)ythenletz'=deriv_1all_charscategoriesMarks.emptycatz[(all_chars,[])]inList.fold_right(fun(s,y)rem->matchfirst(functionE.TMatchmarks->Somemarks|_->None)ywithNone->Cset.prepends(E.tseqkindyz[])rem|Somemarks->letz''=prepend_marksmarksz'inmatchkindwith`Longest->Cset.prepends(E.tseqkind(remove_matchesy)z[])(prepend_deriv(restrictsz'')rem)|`Shortest->prepend_deriv(restrictsz'')(Cset.prepends(E.tseqkind(remove_matchesy)z[])rem)|`First->let(y',y'')=split_at_matchyinCset.prepends(E.tseqkindy'z[])(prepend_deriv(restrictsz'')(Cset.prepends(E.tseqkindy''z[])rem)))yremelseList.fold_right(fun(s,xl)rem->Cset.prepends(E.tseqkindxlz[])rem)yremletrecderiv_3all_charscategoriescatxrem=matchxwithE.TSeq(y,z,kind)->lety'=deriv_4all_charscategoriescaty[(all_chars,[])]inderiv_seqall_charscategoriescatkindy'zrem|E.TExp(marks,e)->deriv_1all_charscategoriesmarkscaterem|E.TMatch_->Cset.prependall_chars[x]remandderiv_4all_charscategoriescatlrem=matchlwith[]->rem|y::r->deriv_3all_charscategoriescaty(deriv_4all_charscategoriescatrrem)letderivtbl_refall_charscategoriesst=letder=deriv_4all_charscategoriesst.State.categoryst.State.desc[(all_chars,[])]insimpl_tr(List.fold_right(fun(s,expr)rem->let(expr',_)=remove_duplicates[]expreps_exprin(*
Format.eprintf "@[<3>@[%a@]: %a / %a@]@." Cset.print s print_state expr print_state expr';
*)letidx=free_indextbl_refexpr'inletexpr''=set_idxidxexpr'inList.fold_right(fun(cat',s')rem->lets''=Cset.interss'inifCset.is_emptys''thenremelse(s'',State.mkidxcat'expr'')::rem)categoriesrem)der[])(****)letflatten_matchm=letma=List.fold_left(funma(i,_)->maxmai)(-1)minletres=Array.make(ma+1)(-1)inList.iter(fun(i,v)->res.(i)<-v)m;resletstatuss=matchs.State.statuswithSomest->st|None->letst=matchs.State.descwith[]->Failed|E.TMatchm::_->Match(flatten_matchm.Marks.marks,m.Marks.pmarks)|_->Runningins.State.status<-Somest;st