123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669openImport(*
RE - A regular expression library
Copyright (C) 2001 Jerome Vouillon
email: Jerome.Vouillon@pps.jussieu.fr
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation, with
linking exception; either version 2.1 of the License, or (at
your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*)moduleIds=structtypet=intrefletcreate()=ref0letnextt=incrt;!t;;endmoduleSem=structtypet=[`Longest|`Shortest|`First]letequal=Poly.equalletppchk=Format.pp_print_stringch(matchkwith|`Shortest->"short"|`Longest->"long"|`First->"first");;endmoduleRep_kind=structtypet=[`Greedy|`Non_greedy]letppfmt=function|`Greedy->Format.pp_print_stringfmt"Greedy"|`Non_greedy->Format.pp_print_stringfmt"Non_greedy";;endmoduleMark=structtypet=intletstart=0letprevx=predxletnextx=succxletnext2x=x+2letgroup_countx=x/2endtypeidx=inttypeexpr={id:int;def:def}anddef=|CstofCset.t|Altofexprlist|SeqofSem.t*expr*expr|Eps|RepofRep_kind.t*Sem.t*expr|Markofint|Eraseofint*int|BeforeofCategory.t|AfterofCategory.t|PmarkofPmark.tlethash_combinehaccu=(accu*65599)+hmoduleMarks=structtypet={marks:(int*int)list;pmarks:Pmark.Set.t}letequal{marks;pmarks}t=List.equal~eq:(fun(x,y)(x',y')->Int.equalxx'&&Int.equalyy')markst.marks&&Pmark.Set.equalpmarkst.pmarks;;letempty={marks=[];pmarks=Pmark.Set.empty}letmerge=letrecmerge_marks_offsetold=function|[]->old|(i,v)::rem->letnw'=merge_marks_offset(List.remove_assqiold)reminifv=-2thennw'else(i,v)::nw'infunoldnw->{marks=merge_marks_offsetold.marksnw.marks;pmarks=Pmark.Set.unionold.pmarksnw.pmarks};;letrechash_marks_offsetlaccu=matchlwith|[]->accu|(a,i)::r->hash_marks_offsetr(hash_combinea(hash_combineiaccu));;lethashmaccu=hash_marks_offsetm.marks(hash_combine(Hashtbl.hashm.pmarks)accu)letmarks_set_idx=letrecmarks_set_idxidx=function|(a,-1)::rem->(a,idx)::marks_set_idxidxrem|marks->marksinfunmarksidx->{markswithmarks=marks_set_idxidxmarks.marks};;letrecremove_marksberem=ifb>ethenremelseremove_marksb(e-1)((e,-2)::rem);;letfiltertbe={twithmarks=List.filter~f:(fun(i,_)->i<b||i>e)t.marks};;leterasetbe={twithmarks=remove_marksbe(filtertbe).marks}letset_markti={twithmarks=(i,-1)::List.remove_assqit.marks}letset_pmarkti={twithpmarks=Pmark.Set.addit.pmarks}letpp_markscht=matcht.markswith|[]->()|(a,i)::r->Format.fprintfch"%d-%d"ai;List.iter~f:(fun(a,i)->Format.fprintfch" %d-%d"ai)r;;end(****)letrecppche=letopenFmtinmatche.defwith|Cstl->sexpch"cst"Cset.ppl|Altl->sexpch"alt"(listpp)l|Seq(k,e,e')->sexpch"seq"(tripleSem.pppppp)(k,e,e')|Eps->strch"eps"|Rep(_rk,k,e)->sexpch"rep"(pairSem.pppp)(k,e)|Marki->sexpch"mark"inti|Pmarki->sexpch"pmark"int(i:>int)|Erase(b,e)->sexpch"erase"(pairintint)(b,e)|Beforec->sexpch"before"Category.ppc|Afterc->sexpch"after"Category.ppc;;(****)leteps_expr={id=0;def=Eps}letmk_expridsdef={id=Ids.nextids;def}letemptyids=mk_exprids(Alt[])letcstidss=ifCset.is_emptysthenemptyidselsemk_exprids(Csts)letaltids=function|[]->emptyids|[c]->c|l->mk_exprids(Altl);;letseqids(kind:Sem.t)xy=matchx.def,y.defwith|Alt[],_->x|_,Alt[]->y|Eps,_->y|_,EpswhenSem.equalkind`First->x|_->mk_exprids(Seq(kind,x,y));;letis_epsexpr=matchexpr.defwith|Eps->true|_->false;;letepsids=mk_expridsEpsletrepidskindsemx=mk_exprids(Rep(kind,sem,x))letmarkidsm=mk_exprids(Markm)letpmarkidsi=mk_exprids(Pmarki)leteraseidsmm'=mk_exprids(Erase(m,m'))letbeforeidsc=mk_exprids(Beforec)letafteridsc=mk_exprids(Afterc)(****)letrecrenameidsx=matchx.defwith|Cst_|Eps|Mark_|Pmark_|Erase_|Before_|After_->mk_expridsx.def|Altl->mk_exprids(Alt(List.map~f:(renameids)l))|Seq(k,y,z)->mk_exprids(Seq(k,renameidsy,renameidsz))|Rep(g,k,y)->mk_exprids(Rep(g,k,renameidsy));;(****)typehash=inttypestatus=|Failed|MatchofMark_infos.t*Pmark.Set.t|RunningmoduleE=structtypet=|TSeqoftlist*expr*Sem.t|TExpofMarks.t*expr|TMatchofMarks.tletis_tmatch=function|TMatch_->true|TSeq_|TExp_->false;;letrecequall1l2=matchl1,l2with|[],[]->true|TSeq(l1',e1,_)::r1,TSeq(l2',e2,_)::r2->e1.id=e2.id&&equall1'l2'&&equalr1r2|TExp(marks1,e1)::r1,TExp(marks2,e2)::r2->e1.id=e2.id&&Marks.equalmarks1marks2&&equalr1r2|TMatchmarks1::r1,TMatchmarks2::r2->Marks.equalmarks1marks2&&equalr1r2|_->false;;letrechashlaccu=matchlwith|[]->accu|TSeq(l',e,_)::r->hashr(hash_combine0x172a1bce(hash_combinee.id(hashl'accu)))|TExp(marks,e)::r->hashr(hash_combine0x2b4c0d77(hash_combinee.id(Marks.hashmarksaccu)))|TMatchmarks::r->hashr(hash_combine0x1c205ad5(Marks.hashmarksaccu));;lettexpmarksx=TExp(marks,x)lettseqkindxyrem=matchxwith|[]->rem|[TExp(marks,{def=Eps;_})]->TExp(marks,y)::rem|_->TSeq(x,y,kind)::rem;;letrecprint_state_recchey=matchewith|TMatchmarks->Format.fprintfch"@[<2>(Match@ %a)@]"Marks.pp_marksmarks|TSeq(l',x,_kind)->Format.fprintfch"@[<2>(Seq@ ";print_state_lstchl'x;Format.fprintfch"@ %a)@]"ppx|TExp(marks,{def=Eps;_})->Format.fprintfch"@[<2>(Exp@ %d@ (%a)@ (eps))@]"y.idMarks.pp_marksmarks|TExp(marks,x)->Format.fprintfch"@[<2>(Exp@ %d@ (%a)@ %a)@]"x.idMarks.pp_marksmarksppxandprint_state_lstchly=matchlwith|[]->Format.fprintfch"()"|e::rem->print_state_recchey;List.iterrem~f:(fune->Format.fprintfch"@ | ";print_state_recchey);;let_ppcht=print_state_lstch[t]{id=0;def=Eps}endmoduleDesc=structtypet=E.tlistopenEletrecfirst_match=function|[]->None|TMatchmarks::_->Somemarks|_::r->first_matchr;;letremove_matches=List.filter~f:(function|TMatch_->false|_->true);;letsplit_at_match=letrecsplit_at_match_recl=function|[]->assertfalse|TMatch_::r->List.revl,remove_matchesr|x::r->split_at_match_rec(x::l)rinfunl->split_at_match_rec[]l;;letexists_tmatch=List.exists~f:is_tmatchletrecset_idxidx=function|[]->[]|TMatchmarks::r->TMatch(Marks.marks_set_idxmarksidx)::set_idxidxr|TSeq(l,x,kind)::r->TSeq(set_idxidxl,x,kind)::set_idxidxr|TExp(marks,x)::r->TExp(Marks.marks_set_idxmarksidx,x)::set_idxidxr;;endmoduleState=structtypet={idx:idx;category:Category.t;desc:Desc.t;mutablestatus:statusoption;hash:hash}let[@inline]idxt=t.idxletdummy={idx=-1;category=Category.dummy;desc=[];status=None;hash=-1}lethashidxcatdesc=E.hashdesc(hash_combineidx(hash_combine(Category.to_intcat)0))land0x3FFFFFFF;;letmkidxcatdesc={idx;category=cat;desc;status=None;hash=hashidxcatdesc};;letcreatecate=mk0cat[E.TExp(Marks.empty,e)]letequalxy=(x.hash:int)=y.hash&&(x.idx:int)=y.idx&&Category.equalx.categoryy.category&&E.equalx.descy.desc;;letcomparexy=letc=compare(x.hash:int)y.hashinifc<>0thencelse(letc=Category.comparex.categoryy.categoryinifc<>0thencelsecomparex.descy.desc);;letstatuss=matchs.statuswith|Somest->st|None->letst=matchs.descwith|[]->Failed|E.TMatchm::_->Match(Mark_infos.makem.marks,m.pmarks)|_->Runningins.status<-Somest;st;;moduleTable=Hashtbl.Make(structtypenonrect=tletequal=equallethasht=t.hashend)end(**** Find a free index ****)moduleWorking_area=structtypet=Bit_vector.trefletcreate()=ref(Bit_vector.create_zero1)letindex_countw=Bit_vector.length!wletrecmark_used_indicestbl=List.iter~f:(function|E.TSeq(l,_,_)->mark_used_indicestbll|E.TExp(marks,_)|E.TMatchmarks->List.itermarks.marks~f:(fun(_,i)->ifi>=0thenBit_vector.settblitrue));;letrecfind_freetblidxlen=ifidx=len||not(Bit_vector.gettblidx)thenidxelsefind_freetbl(idx+1)len;;letfree_indextbl_refl=lettbl=!tbl_refinBit_vector.reset_zerotbl;mark_used_indicestbll;letlen=Bit_vector.lengthtblinletidx=find_freetbl0leninifidx=lenthentbl_ref:=Bit_vector.create_zero(2*len);idx;;end(**** Computation of the next state ****)letrecremove_duplicatesprevly=matchlwith|[]->[],prev|(E.TMatch_asx)::_->(* Truncate after first match *)[x],prev|E.TSeq(l,x,kind)::r->letl,prev=remove_duplicatesprevlxinletr,prev=remove_duplicatesprevryinE.tseqkindlxr,prev|(E.TExp(_marks,{def=Eps;_})ase)::r->ifList.memqy.id~set:prevthenremove_duplicatesprevryelse(letr,prev=remove_duplicates(y.id::prev)ryine::r,prev)|(E.TExp(_marks,x)ase)::r->ifList.memqx.id~set:prevthenremove_duplicatesprevryelse(letr,prev=remove_duplicates(x.id::prev)ryine::r,prev);;typectx={c:Cset.c;prev_cat:Category.t;next_cat:Category.t;marks:Marks.t}letrecdelta_1({c;marks;_}asctx)xrem=(*Format.eprintf "%d@." x.id;*)matchx.defwith|Csts->ifCset.memcsthenE.texpmarkseps_expr::remelserem|Altl->delta_altctxlrem|Seq(kind,y,z)->lety=delta_1ctxy[]indelta_seqctxkindyzrem|Rep(rep_kind,kind,y)->lety,marks'=lety=delta_1ctxy[]inmatchDesc.first_matchywith|None->y,marks|Somemarks->Desc.remove_matchesy,marksin(matchrep_kindwith|`Greedy->E.tseqkindyx(E.TMatchmarks'::rem)|`Non_greedy->E.TMatchmarks::E.tseqkindyxrem)|Eps->E.TMatchmarks::rem|Marki->letmarks=Marks.set_markmarksiinE.TMatchmarks::rem|Pmarki->letmarks=Marks.set_pmarkmarksiinE.TMatchmarks::rem|Erase(b,e)->E.TMatch(Marks.filtermarksbe)::rem|Beforecat->ifCategory.intersectctx.next_catcatthenE.TMatchmarks::remelserem|Aftercat->ifCategory.intersectctx.prev_catcatthenE.TMatchmarks::remelseremanddelta_altctxlrem=matchlwith|[]->rem|y::r->delta_1ctxy(delta_altctxrrem)anddelta_seqctx(kind:Sem.t)yzrem=matchDesc.first_matchywith|None->E.tseqkindyzrem|Somemarks->letctx={ctxwithmarks}in(matchkindwith|`Longest->E.tseqkind(Desc.remove_matchesy)z(delta_1ctxzrem)|`Shortest->delta_1ctxz(E.tseqkind(Desc.remove_matchesy)zrem)|`First->lety,y'=Desc.split_at_matchyinE.tseqkindyz(delta_1ctxz(E.tseqkindy'zrem)));;letrecdelta_3ctxxrem=matchxwith|E.TSeq(y,z,kind)->lety=delta_4ctxy[]indelta_seqctxkindyzrem|E.TExp(marks,e)->delta_1{ctxwithmarks}erem|E.TMatch_->x::remanddelta_4ctxlrem=matchlwith|[]->rem|y::r->delta_3ctxy(delta_4ctxrrem);;letdeltatbl_refnext_catchar(st:State.t)=letexpr,_=letprev_cat=st.categoryinletctx={c=char;next_cat;prev_cat;marks=Marks.empty}inremove_duplicates[](delta_4ctxst.desc[])eps_exprinletidx=Working_area.free_indextbl_refexprinletexpr=Desc.set_idxidxexprinState.mkidxnext_catexpr;;(****)letrecred_tr=function|([]|[_])asl->l|((s1,st1)astr1)::((s2,st2)astr2)::rem->ifState.equalst1st2thenred_tr((Cset.unions1s2,st1)::rem)elsetr1::red_tr(tr2::rem);;letsimpl_trl=List.sort~cmp:(fun(s1,_)(s2,_)->compares1s2)(red_tr(List.sort~cmp:(fun(_,st1)(_,st2)->State.comparest1st2)l));;(****)letprepend_derivinit=List.fold_right~init~f:(fun(s,x)l->Cset.prependsxl)letrecrestricts=function|[]->[]|(s',x)::rem->lets'=Cset.interss'inifCset.is_emptys'thenrestrictsremelse(s',x)::restrictsrem;;letprepend_marks=letrecprepend_marks_exprm=function|E.TSeq(l,e,s)->E.TSeq(prepend_marks_expr_lstml,e,s)|E.TExp(m',e)->E.TExp(Marks.mergemm',e)|E.TMatchm'->E.TMatch(Marks.mergemm')andprepend_marks_expr_lstml=List.map~f:(prepend_marks_exprm)linfunm->List.map~f:(fun(s,x)->s,prepend_marks_expr_lstmx);;letrecderiv_1all_charscategoriesmarkscatxrem=matchx.defwith|Csts->Cset.prepends[E.texpmarkseps_expr]rem|Altl->deriv_2all_charscategoriesmarkscatlrem|Seq(kind,y,z)->lety=deriv_1all_charscategoriesmarkscaty[all_chars,[]]inderiv_seqall_charscategoriescatkindyzrem|Rep(rep_kind,kind,y)->lety=deriv_1all_charscategoriesmarkscaty[all_chars,[]]inList.fold_right~init:remy~f:(fun(s,z)rem->letz',marks'=matchDesc.first_matchzwith|None->z,marks|Somemarks'->Desc.remove_matchesz,marks'inCset.prepends(matchrep_kindwith|`Greedy->E.tseqkindz'x[E.TMatchmarks']|`Non_greedy->E.TMatchmarks::E.tseqkindz'x[])rem)|Eps->Cset.prependall_chars[E.TMatchmarks]rem|Marki->Cset.prependall_chars[E.TMatch(Marks.set_markmarksi)]rem|Pmark_->Cset.prependall_chars[E.TMatchmarks]rem|Erase(b,e)->Cset.prependall_chars[E.TMatch(Marks.erasemarksbe)]rem|Beforecat->Cset.prepend(List.assqcatcategories)[E.TMatchmarks]rem|Aftercat'->ifCategory.intersectcatcat'thenCset.prependall_chars[E.TMatchmarks]remelseremandderiv_2all_charscategoriesmarkscatlrem=matchlwith|[]->rem|y::r->deriv_1all_charscategoriesmarkscaty(deriv_2all_charscategoriesmarkscatrrem)andderiv_seqall_charscategoriescatkindyzrem=ifList.existsy~f:(fun(_s,xl)->Desc.exists_tmatchxl)then(letz'=deriv_1all_charscategoriesMarks.emptycatz[all_chars,[]]inList.fold_right~init:remy~f:(fun(s,y)rem->matchDesc.first_matchywith|None->Cset.prepends(E.tseqkindyz[])rem|Somemarks->letz''=prepend_marksmarksz'|>restrictsin(matchkindwith|`Longest->Cset.prepends(E.tseqkind(Desc.remove_matchesy)z[])(prepend_derivz''rem)|`Shortest->prepend_derivz''(Cset.prepends(E.tseqkind(Desc.remove_matchesy)z[])rem)|`First->lety',y''=Desc.split_at_matchyinCset.prepends(E.tseqkindy'z[])(prepend_derivz''(Cset.prepends(E.tseqkindy''z[])rem)))))elseList.fold_righty~init:rem~f:(fun(s,xl)rem->Cset.prepends(E.tseqkindxlz[])rem);;letrecderiv_3all_charscategoriescatxrem=matchxwith|E.TSeq(y,z,kind)->lety'=deriv_4all_charscategoriescaty[all_chars,[]]inderiv_seqall_charscategoriescatkindy'zrem|E.TExp(marks,e)->deriv_1all_charscategoriesmarkscaterem|E.TMatch_->Cset.prependall_chars[x]remandderiv_4all_charscategoriescatlrem=matchlwith|[]->rem|y::r->deriv_3all_charscategoriescaty(deriv_4all_charscategoriescatrrem);;letderivtbl_refall_charscategories(st:State.t)=letder=deriv_4all_charscategoriesst.categoryst.desc[all_chars,[]]insimpl_tr(List.fold_rightder~init:[]~f:(fun(s,expr)rem->letexpr',_=remove_duplicates[]expreps_exprin(*
Format.eprintf "@[<3>@[%a@]: %a / %a@]@." Cset.print s print_state expr print_state expr';
*)letidx=Working_area.free_indextbl_refexpr'inletexpr''=Desc.set_idxidxexpr'inList.fold_rightcategories~init:rem~f:(fun(cat',s')rem->lets''=Cset.interss'inifCset.is_emptys''thenremelse(s'',State.mkidxcat'expr'')::rem)));;(****)