123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781openImport(*
RE - A regular expression library
Copyright (C) 2001 Jerome Vouillon
email: Jerome.Vouillon@pps.jussieu.fr
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation, with
linking exception; either version 2.1 of the License, or (at
your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*)lethash_combinehaccu=(accu*65599)+hmoduleIds:sigmoduleId:sigtypetvalequal:t->t->boolvalzero:tvalhash:t->intvalpp:tFmt.tmoduleHash_set:sigtypeid:=ttypetvalcreate:unit->tvalmem:t->id->boolvaladd:t->id->unitvalclear:t->unitendendtypetvalcreate:unit->tvalnext:t->Id.tend=structmoduleId=structtypet=intmoduleHash_set=Hash_setletequal=Int.equalletzero=0lethashx=xletpp=Fmt.intendtypet=intrefletcreate()=ref0letnextt=incrt;!t;;endmoduleId=Ids.IdmoduleSem=structtypet=[`Longest|`Shortest|`First]letto_string=function|`Shortest->"short"|`Longest->"long"|`First->"first";;letto_dynt=Dyn.enum(to_stringt)letequal=Poly.equalletppchk=Format.pp_print_stringch(to_stringk)endmoduleRep_kind=structtypet=[`Greedy|`Non_greedy]letto_string=function|`Greedy->"Greedy"|`Non_greedy->"Non_greedy";;letto_dynt=Dyn.enum(to_stringt)letppfmtt=Format.pp_print_stringfmt(to_stringt)endmoduleMark:sigtypet=privateintvalcompare:t->t->intvalequal:t->t->boolvalpp:tFmt.tvalto_dyn:t->Dyn.tvalstart:tvalprev:t->tvalnext:t->tvalnext2:t->tvalgroup_count:t->intvaloutside_range:t->start_inclusive:t->stop_inclusive:t->boolend=structtypet=intletequal=Int.equalletcompare=Int.compareletpp=Format.pp_print_intletto_dyn=Dyn.intletstart=0letprevx=predxletnextx=succxletnext2x=x+2letgroup_countx=x/2letoutside_ranget~start_inclusive~stop_inclusive=t<start_inclusive||t>stop_inclusive;;endmoduleIdx:sigtypet=privateintvalpp:tFmt.tvalto_dyn:t->Dyn.tvalto_int:t->intvalunknown:tvalinitial:tvalused:t->boolvalmake:int->tvalequal:t->t->boolend=structtypet=intletto_dyn=Dyn.intletto_intx=xletpp=Format.pp_print_intletusedt=t>=0letmakex=xletequal=Int.equalletunknown=-1letinitial=0endmoduleExpr=structtypet={id:Id.t;def:def}anddef=|CstofCset.t|Altoftlist|SeqofSem.t*t*t|Eps|RepofRep_kind.t*Sem.t*t|MarkofMark.t|EraseofMark.t*Mark.t|BeforeofCategory.t|AfterofCategory.t|PmarkofPmark.tletwrap_semsemsem'v=letopenDyninletname=Sem.to_stringsem'inmatchsemwith|SomesemwhenSem.equalsemsem'->v|None|Some_->(matchvwith|Listv->variantnamev|_->variantname[v]);;letrecseq_as_listsem=function|Eps->[]|Cstcs->[Cstcs]|Seq(sem',x,y)->ifSem.equalsemsem'thenx.def::seq_as_listsemy.defelseraise_notraceNot_found|_->raise_notraceNot_found;;letseq_as_listsemt=matchseq_as_listsemtwith|exceptionNot_found->None|s->Somes;;letrecdyn_of_defsem=letopenDyninfunction|Cstcset->Cset.to_dyncset|Altalt->variant"Alt"(List.map~f:(to_dynsem)alt)|Seq(sem',x,y)->letto_dyn=to_dyn(Somesem')inletx=matchseq_as_listsem'y.defwith|None->variant"Seq"[to_dynx;to_dyny]|Somey->variant"Seq"(to_dynx::List.mapy~f:(dyn_of_defsem))inwrap_semsemsem'x|Eps->Enum"Eps"|Rep(_,sem',t)->wrap_semsemsem'(variant"Rep"[to_dyn(Somesem')t])|Markm->variant"Mark"[Mark.to_dynm]|Pmarkm->variant"Pmark"[Pmark.to_dynm]|Erase(x,y)->variant"Erase"[Mark.to_dynx;Mark.to_dyny]|Beforec->variant"Before"[Category.to_dync]|Afterc->variant"After"[Category.to_dync]andto_dynsem{id=_;def}=dyn_of_defsemdefletrecpp_with_semsemche=letopenFmtinmatche.defwith|Cstl->sexpch"cst"Cset.ppl|Altl->sexpch"alt"(list(pp_with_semsem))l|Seq(k,e,e')->sexpch"seq"(tripleSem.pp(pp_with_semsem)(pp_with_semsem))(k,e,e')|Eps->strch"eps"|Rep(_rk,k,e)->sexpch"rep"(pairSem.pp(pp_with_sem(Somek)))(k,e)|Marki->sexpch"mark"Mark.ppi|Pmarki->sexpch"pmark"Pmark.ppi|Erase(b,e)->sexpch"erase"(pairMark.ppMark.pp)(b,e)|Beforec->sexpch"before"Category.ppc|Afterc->sexpch"after"Category.ppc;;letpp=pp_with_semNoneleteps_expr={id=Id.zero;def=Eps}letmkidsdef={id=Ids.nextids;def}letemptyids=mkids(Alt[])letcstidss=ifCset.is_emptysthenemptyidselsemkids(Csts)letepsids=mkidsEpsletrepidskindsemx=mkids(Rep(kind,sem,x))letmarkidsm=mkids(Markm)letpmarkidsi=mkids(Pmarki)leteraseidsmm'=mkids(Erase(m,m'))letbeforeidsc=mkids(Beforec)letafteridsc=mkids(Afterc)letaltids=function|[]->emptyids|[c]->c|l->mkids(Altl);;letseqids(kind:Sem.t)xy=matchx.def,y.defwith|Alt[],_->x|_,Alt[]->y|Eps,_->y|_,EpswhenSem.equalkind`First->x|_->mkids(Seq(kind,x,y));;letis_epsexpr=matchexpr.defwith|Eps->true|_->false;;letrecrenameidsx=matchx.defwith|Cst_|Eps|Mark_|Pmark_|Erase_|Before_|After_->mkidsx.def|Altl->mkids(Alt(List.map~f:(renameids)l))|Seq(k,y,z)->mkids(Seq(k,renameidsy,renameidsz))|Rep(g,k,y)->mkids(Rep(g,k,renameidsy));;endtypeexpr=Expr.tincludeExprmoduleMarks=structtypet={marks:(Mark.t*Idx.t)list;pmarks:Pmark.Set.t}letto_dyn{marks;pmarks}:Dyn.t=letopenDyninrecord[("marks",List.mapmarks~f:(fun(m,idx)->pair(Mark.to_dynm)(Idx.to_dynidx))|>list);"pmarks",Pmark.Set.to_listpmarks|>List.map~f:Pmark.to_dyn|>list];;letequal{marks;pmarks}t=List.equal~eq:(fun(x,y)(x',y')->Mark.equalxx'&&Idx.equalyy')markst.marks&&Pmark.Set.equalpmarkst.pmarks;;letempty={marks=[];pmarks=Pmark.Set.empty}lethash_marks_offset=letfacc((a:Mark.t),(i:Idx.t))=hash_combine(a:>int)(hash_combine(i:>int)acc)infunlinit->List.fold_leftl~init~f;;lethashmaccu=hash_marks_offsetm.marks(hash_combine(Hashtbl.hashm.pmarks)accu)letmarks_set_idx=letrecmarks_set_idxidxmarks=matchmarkswith|[]->[]|(a,idx')::rem->ifIdx.equalidx'Idx.unknownthen(a,idx)::marks_set_idxidxremelsemarksinfunmarksidx->{markswithmarks=marks_set_idxidxmarks.marks};;letfiltert(b:Mark.t)(e:Mark.t)={twithmarks=List.filtert.marks~f:(fun((i:Mark.t),_)->Mark.outside_rangei~start_inclusive:b~stop_inclusive:e)};;letset_markt(i:Mark.t)={twithmarks=(i,Idx.unknown)::List.remove_assqit.marks};;letset_pmarkti={twithpmarks=Pmark.Set.addit.pmarks}letppfmt{marks;pmarks}=Format.pp_open_boxfmt1;(matchmarkswith|[]->()|_::_->Format.fprintffmt"@[<2>marks@ %a@]"(Format.pp_print_list(funfmt(a,i)->Format.fprintffmt"%a-%a"Mark.ppaIdx.ppi))marks);(matchPmark.Set.to_listpmarkswith|[]->()|pmarks->Format.fprintffmt"@[<2>pmarks %a@]"(Format.pp_print_listPmark.pp)pmarks);Format.pp_close_boxfmt();;endmoduleStatus=structtypet=|Failed|MatchofMark_infos.t*Pmark.Set.t|RunningendmoduleDesc:sigtypetvalpp:tFmt.tmoduleE:sigtypenonrect=private|TSeqofSem.t*t*Expr.t|TExpofMarks.t*Expr.t|TMatchofMarks.tendvalto_dyn:t->Dyn.tvalfold_right:t->init:'acc->f:(E.t->'acc->'acc)->'accvaltseq:Sem.t->t->Expr.t->t->tvalinitial:Expr.t->tvalempty:tvalset_idx:Idx.t->t->tvalhash:t->int->intvalequal:t->t->boolvalstatus:t->Status.tvalfirst_match:t->Marks.toptionvalremove_matches:t->tvalsplit_at_match:t->t*tvaladd_match:t->Marks.t->tvaladd_eps:t->Marks.t->tvaladd_expr:t->E.t->tvaliter_marks:t->f:(Marks.t->unit)->unitvalremove_duplicates:Id.Hash_set.t->t->Expr.t->tend=structmoduleE=structtypet=|TSeqofSem.t*tlist*Expr.t|TExpofMarks.t*Expr.t|TMatchofMarks.tletrecequal_listl1l2=List.equal~eq:equall1l2andequalxy=matchx,ywith|TSeq(_,l1,e1),TSeq(_,l2,e2)->Id.equale1.ide2.id&&equal_listl1l2|TExp(marks1,e1),TExp(marks2,e2)->Id.equale1.ide2.id&&Marks.equalmarks1marks2|TMatchmarks1,TMatchmarks2->Marks.equalmarks1marks2|_,_->false;;letrechash(t:t)accu=matchtwith|TSeq(_,l,e)->hash_combine0x172a1bce(hash_combine(Id.hashe.id)(hash_listlaccu))|TExp(marks,e)->hash_combine0x2b4c0d77(hash_combine(Id.hashe.id)(Marks.hashmarksaccu))|TMatchmarks->hash_combine0x1c205ad5(Marks.hashmarksaccu)andhash_list=letfaccx=hashxaccinfunlinit->List.fold_leftl~init~f;;endtypet=E.tlistletrecto_dynsemt=Dyn.list(List.map~f:(dyn_of_esem)t)anddyn_of_esem=letopenDyninfunction|E.TSeq(sem',x,y)->wrap_semsemsem'(variant"TSeq"[to_dyn(Somesem')x;Expr.to_dyn(Somesem')y])|TExp(marks,e)->lete=letbase=[Expr.to_dynseme]inifMarks.(equalemptymarks)thenbaseelseMarks.to_dynmarks::baseinvariant"TExp"e|TMatchm->variant"TMarks"[Marks.to_dynm];;letto_dynt=to_dynNonetopenEletequal=E.equal_listlethash=E.hash_listlettseq'kindxy=matchxwith|[]->[]|[TExp(marks,{def=Eps;_})]->[TExp(marks,y)]|_->[TSeq(kind,x,y)];;lettseqkindxyrem=tseq'kindxy@remletrecfold_rightt~init~f=matchtwith|[]->init|x::xs->fx(fold_rightxs~init~f);;letreciter_markst~f=List.itert~f:(fun(e:E.t)->matchewith|TSeq(_,l,_)->iter_marksl~f|TExp(marks,_)|TMatchmarks->fmarks);;letrecprint_state_recche(y:Expr.t)=matchewith|TMatchmarks->Format.fprintfch"@[<2>(TMatch@ %a)@]"Marks.ppmarks|TSeq(sem,l',x)->Format.fprintfch"@[<2>(TSeq@ %a@ "Sem.ppsem;print_state_lstchl'x;Format.fprintfch"@ %a)@]"Expr.ppx|TExp(marks,{def=Eps;_})->Format.fprintfch"@[<2>(TExp@ %a@ (%a)@ (eps))@]"Id.ppy.idMarks.ppmarks|TExp(marks,x)->Format.fprintfch"@[<2>(TExp@ %a@ (%a)@ %a)@]"Id.ppx.idMarks.ppmarksExpr.ppxandprint_state_lstchly=matchlwith|[]->Format.fprintfch"()"|e::rem->print_state_recchey;List.iterrem~f:(fune->Format.fprintfch"@ | ";print_state_recchey);;letppcht=print_state_lstch[t]{id=Id.zero;def=Eps}letrecfirst_match=function|[]->None|TMatchmarks::_->Somemarks|_::r->first_matchr;;letremove_matchest=List.filtert~f:(function|TMatch_->false|_->true);;letsplit_at_match=letrecsplit_at_match_recl=function|[]->assertfalse|TMatch_::r->List.revl,remove_matchesr|x::r->split_at_match_rec(x::l)rinfunl->split_at_match_rec[]l;;letstatus:_->Status.t=function|[]->Failed|TMatchm::_->Match(Mark_infos.make(m.marks:>(int*int)list),m.pmarks)|_->Running;;letset_idx=letrecfidx=function|TMatchmarks->TMatch(Marks.marks_set_idxmarksidx)|TSeq(kind,l,x)->TSeq(kind,set_idxidxl,x)|TExp(marks,x)->TExp(Marks.marks_set_idxmarksidx,x)andset_idxidxxs=List.mapxs~f:(fidx)inset_idx;;let[@ocaml.warning"-32"]ppfmtt=Format.fprintffmt"[%a]"(Format.pp_print_list~pp_sep:(Fmt.lit"; ")pp)t;;letempty=[]letinitialexpr=[TExp(Marks.empty,expr)]letadd_matchtmarks=TMatchmarks::tletadd_epstmarks=TExp(marks,eps_expr)::tletadd_exprtexpr=expr::tletremove_duplicates=letrecloopseenly=matchlwith|[]->[]|(TMatch_asx)::_->(* Truncate after first match *)[x]|TSeq(kind,l,x)::r->letl=loopseenlxinletr=loopseenryintseqkindlxr|(TExp(_marks,{def=Eps;_})ase)::r->ifId.Hash_set.memseeny.idthenloopseenryelse(Id.Hash_set.addseeny.id;e::loopseenry)|(TExp(_marks,x)ase)::r->ifId.Hash_set.memseenx.idthenloopseenryelse(Id.Hash_set.addseenx.id;e::loopseenry)infunseenly->Id.Hash_set.clearseen;loopseenly;;endmoduleE=Desc.EmoduleState=structtypet={idx:Idx.t;category:Category.t;desc:Desc.t;mutablestatus:Status.toption;hash:int}(* Thread-safety: We use double-checked locking to access field
[status] in function [status] below. *)letppfmtt=Desc.ppfmtt.desclet[@inline]idxt=t.idxletto_dynt=Desc.to_dynt.descletdummy={idx=Idx.unknown;category=Category.dummy;desc=Desc.empty;status=None;hash=-1};;lethashidxcatdesc=Desc.hashdesc(hash_combineidx(hash_combine(Category.to_intcat)0))land0x3FFFFFFF;;letmkidxcatdesc={idx;category=cat;desc;status=None;hash=hash(idx:>int)catdesc};;letcreatecate=mkIdx.initialcat(Desc.initiale)letequal{idx;category;desc;status=_;hash}t=Int.equalhasht.hash&&Idx.equalidxt.idx&&Category.equalcategoryt.category&&Desc.equaldesct.desc;;(* To be called when the mutex has already been acquired *)letstatus_no_mutexs=matchs.statuswith|Somes->s|None->letst=Desc.statuss.descins.status<-Somest;st;;letstatusms=matchs.statuswith|Somes->s|None->Mutex.lockm;letst=status_no_mutexsinMutex.unlockm;st;;moduleTable=Hashtbl.Make(structtypenonrect=tletequal=equallethasht=t.hashend)end(**** Find a free index ****)moduleWorking_area=structtypet={mutableids:Bit_vector.t;seen:Id.Hash_set.t;index_count:intAtomic.t}letcreate()={ids=Bit_vector.create_zero1;seen=Id.Hash_set.create();index_count=Atomic.make0};;letindex_countw=Atomic.getw.index_countletmark_used_indicestbl=Desc.iter_marks~f:(funmarks->List.itermarks.marks~f:(fun(_,i)->ifIdx.usedithenBit_vector.settbl(i:>int)true));;letrecfind_freetblidxlen=ifidx=len||not(Bit_vector.gettblidx)thenidxelsefind_freetbl(idx+1)len;;letfree_indextl=Bit_vector.reset_zerot.ids;mark_used_indicest.idsl;letlen=Bit_vector.lengtht.idsinletidx=find_freet.ids0leninifidx=lenthen(t.ids<-Bit_vector.create_zero(2*len);(* This function is only called when the mutex is locked. So we
are sure that this is always coherent with the length of
[t.ids]. *)Atomic.sett.index_count(2*len));Idx.makeidx;;end(**** Computation of the next state ****)typectx={c:Cset.c;prev_cat:Category.t;next_cat:Category.t}letrecdelta_expr({c;_}asctx)marks(x:Expr.t)rem=(*Format.eprintf "%d@." x.id;*)matchx.defwith|Csts->ifCset.memcsthenDesc.add_epsremmarkselserem|Altl->delta_altctxmarkslrem|Seq(kind,y,z)->lety=delta_exprctxmarksyDesc.emptyindelta_seqctxkindyzrem|Rep(rep_kind,kind,y)->delta_repctxmarksxrep_kindkindyrem|Eps->Desc.add_matchremmarks|Marki->Desc.add_matchrem(Marks.set_markmarksi)|Pmarki->Desc.add_matchrem(Marks.set_pmarkmarksi)|Erase(b,e)->Desc.add_matchrem(Marks.filtermarksbe)|Beforecat->ifCategory.intersectctx.next_catcatthenDesc.add_matchremmarkselserem|Aftercat->ifCategory.intersectctx.prev_catcatthenDesc.add_matchremmarkselseremanddelta_repctxmarksxrep_kindkindyrem=lety,marks'=lety=delta_exprctxmarksyDesc.emptyinmatchDesc.first_matchywith|None->y,marks|Somemarks->Desc.remove_matchesy,marksinmatchrep_kindwith|`Greedy->Desc.tseqkindyx(Desc.add_matchremmarks')|`Non_greedy->Desc.add_match(Desc.tseqkindyxrem)marksanddelta_altctxmarkslrem=List.fold_rightl~init:rem~f:(delta_exprctxmarks)anddelta_seqctx(kind:Sem.t)yzrem=matchDesc.first_matchywith|None->Desc.tseqkindyzrem|Somemarks->(matchkindwith|`Longest->Desc.tseqkind(Desc.remove_matchesy)z(delta_exprctxmarkszrem)|`Shortest->delta_exprctxmarksz(Desc.tseqkind(Desc.remove_matchesy)zrem)|`First->lety,y'=Desc.split_at_matchyinDesc.tseqkindyz(delta_exprctxmarksz(Desc.tseqkindy'zrem)));;letrecdelta_ectxmarks(x:E.t)rem=matchxwith|TSeq(kind,y,z)->lety=delta_descctxmarksyDesc.emptyindelta_seqctxkindyzrem|TExp(marks,e)->delta_exprctxmarkserem|TMatch_->Desc.add_exprremxanddelta_descctxmarks(l:Desc.t)rem=Desc.fold_rightl~init:rem~f:(funyacc->delta_ectxmarksyacc);;letdelta(tbl_ref:Working_area.t)next_catchar(st:State.t)=letexpr=letprev_cat=st.categoryinletctx={c=char;next_cat;prev_cat}inDesc.remove_duplicatestbl_ref.seen(delta_descctxMarks.emptyst.descDesc.empty)Expr.eps_exprinletidx=Working_area.free_indextbl_refexprinletexpr=Desc.set_idxidxexprinState.mkidxnext_catexpr;;