123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266(*
RE - A regular expression library
Copyright (C) 2001 Jerome Vouillon
email: Jerome.Vouillon@pps.jussieu.fr
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation, with
linking exception; either version 2.1 of the License, or (at
your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*)letreciternfv=ifn=0thenvelseiter(n-1)f(fv)(****)letunknown=-2letbreak=-3typematch_info=|MatchofGroup.t|Failed|Runningof{no_match_starts_before:int}typestate={idx:int;(* Index of the current position in the position table.
Not yet computed transitions point to a dummy state where
[idx] is set to [unknown];
If [idx] is set to [break] for states that either always
succeed or always fail. *)real_idx:int;(* The real index, in case [idx] is set to [break] *)next:statearray;(* Transition table, indexed by color *)mutablefinal:(Category.t*(Automata.idx*Automata.status))list;(* Mapping from the category of the next character to
- the index where the next position should be saved
- possibly, the list of marks (and the corresponding indices)
corresponding to the best match *)desc:Automata.State.t(* Description of this state of the automata *)}(* Automata (compiled regular expression) *)typere={initial:Automata.expr;(* The whole regular expression *)mutableinitial_states:(Category.t*state)list;(* Initial states, indexed by initial category *)colors:string;(* Color table *)color_repr:string;(* Table from colors to one character of this color *)ncolor:int;(* Number of colors. *)lnl:int;(* Color of the last newline. -1 if unnecessary *)tbl:Automata.working_area;(* Temporary table used to compute the first available index
when computing a new state *)states:stateAutomata.State.Table.t;(* States of the deterministic automata *)group_names:(string*int)list;(* Named groups in the regular expression *)group_count:int(* Number of groups in the regular expression *)}letpp_rechre=Automata.ppchre.initialletprint_re=pp_reletgroup_countre=re.group_countletgroup_namesre=re.group_names(* Information used during matching *)typeinfo={re:re;(* The automata *)colors:string;(* Color table ([x.colors = x.re.colors])
Shortcut used for performance reasons *)mutablepositions:intarray;(* Array of mark positions
The mark are off by one for performance reasons *)pos:int;(* Position where the match is started *)last:int(* Position where the match should stop *)}(****)letcategoryre~color=ifcolor=-1thenCategory.inexistant(* Special category for the last newline *)elseifcolor=re.lnlthenCategory.(lastnewline++newline++not_letter)elseCategory.from_char(re.color_repr.[color])(****)letdummy_next=[||]letunknown_state={idx=unknown;real_idx=0;next=dummy_next;final=[];desc=Automata.State.dummy}letmk_statencoldesc=letbreak_state=matchAutomata.statusdescwith|Automata.Running->false|Automata.Failed|Automata.Match_->truein{idx=ifbreak_statethenbreakelsedesc.Automata.State.idx;real_idx=desc.Automata.State.idx;next=ifbreak_statethendummy_nextelseArray.makencolunknown_state;final=[];desc}letfind_stateredesc=tryAutomata.State.Table.findre.statesdescwithNot_found->letst=mk_statere.ncolordescinAutomata.State.Table.addre.statesdescst;st(**** Match with marks ****)letdeltainfocat~colorst=letdesc=Automata.deltainfo.re.tblcatcolorst.descinletlen=Array.lengthinfo.positionsinifdesc.Automata.State.idx=len&&len>0thenbeginletpos=info.positionsininfo.positions<-Array.make(2*len)0;Array.blitpos0info.positions0lenend;descletvalidateinfo(s:string)~posst=letcolor=Char.code(info.colors.[Char.codes.[pos]])inletcat=categoryinfo.re~colorinletdesc'=deltainfocat~colorstinletst'=find_stateinfo.redesc'inst.next.(color)<-st'letrecloopinfos~posst=ifpos<info.lastthenletst'=st.next.(Char.codeinfo.colors.[Char.codes.[pos]])inletidx=st'.idxinifidx>=0thenbegininfo.positions.(idx)<-pos;loopinfos~pos:(pos+1)st'endelseifidx=breakthenbegininfo.positions.(st'.real_idx)<-pos;st'endelsebegin(* Unknown *)validateinfos~posst;loopinfos~posstendelsestletrecloop_no_markinfos~pos~lastst=ifpos<lastthenletst'=st.next.(Char.codeinfo.colors.[Char.codes.[pos]])inifst'.idx>=0thenloop_no_markinfos~pos:(pos+1)~lastst'elseifst'.idx=breakthenst'elsebegin(* Unknown *)validateinfos~posst;loop_no_markinfos~pos~laststendelsestletfinalinfostcat=tryList.assqcatst.finalwithNot_found->letst'=deltainfocat~color:(-1)stinletres=(st'.Automata.State.idx,Automata.statusst')inst.final<-(cat,res)::st.final;resletfind_initial_staterecat=tryList.assqcatre.initial_stateswithNot_found->letst=find_statere(Automata.State.createcatre.initial)inre.initial_states<-(cat,st)::re.initial_states;stletget_colorre(s:string)pos=ifpos<0then-1elseletslen=String.lengthsinifpos>=slenthen-1elseifpos=slen-1&&re.lnl<>-1&&s.[pos]='\n'then(* Special case for the last newline *)re.lnlelseChar.codere.colors.[Char.codes.[pos]]letrechandle_last_newlineinfo~posst~groups=letst'=st.next.(info.re.lnl)inifst'.idx>=0thenbeginifgroupstheninfo.positions.(st'.idx)<-pos;st'endelseifst'.idx=breakthenbeginifgroupstheninfo.positions.(st'.real_idx)<-pos;st'endelsebegin(* Unknown *)letcolor=info.re.lnlinletreal_c=Char.codeinfo.colors.[Char.code'\n']inletcat=categoryinfo.re~colorinletdesc'=deltainfocat~color:real_cstinletst'=find_stateinfo.redesc'inst.next.(color)<-st';handle_last_newlineinfo~posst~groupsendletrecscan_strinfo(s:string)initial_state~groups=letpos=info.posinletlast=info.lastinif(last=String.lengths&&info.re.lnl<>-1&&last>pos&&String.gets(last-1)='\n')thenbeginletinfo={infowithlast=last-1}inletst=scan_strinfosinitial_state~groupsinifst.idx=breakthenstelsehandle_last_newlineinfo~pos:(last-1)st~groupsendelseifgroupsthenloopinfos~posinitial_stateelseloop_no_markinfos~pos~lastinitial_state(* This function adds a final boundary check on the input.
This is useful to indicate that the output failed because
of insufficient input, or to verify that the output actually
matches for regex that have boundary conditions with respect
to the input string.
*)letfinal_boundary_check~last~slenres~info~st~groups=letfinal_cat=iflast=slenthenCategory.(search_boundary++inexistant)elseCategory.(search_boundary++categoryre~color:(get_colorreslast))inlet(idx,res)=finalinfostfinal_catin(matchgroups,reswith|true,Match_->info.positions.(idx)<-last|_->());resletmatch_str~groups~partialres~pos~len=letslen=String.lengthsinletlast=iflen=-1thenslenelsepos+leninletinfo={re;colors=re.colors;pos;last;positions=ifgroupsthenbeginletn=Automata.index_countre.tbl+1inifn<=10then[|0;0;0;0;0;0;0;0;0;0|]elseArray.maken0endelse[||]}inletinitial_cat=ifpos=0thenCategory.(search_boundary++inexistant)elseCategory.(search_boundary++categoryre~color:(get_colorres(pos-1)))inletinitial_state=find_initial_statereinitial_catinletst=scan_strinfosinitial_state~groupsinletres=ifst.idx=break||(partial&¬groups)thenAutomata.statusst.descelseifpartial&&groupsthenmatchAutomata.statusst.descwith|Match_|Failedasstatus->status|Running->(* This could be because it's still not fully matched, or it
could be that because we need to run special end of input
checks. *)(matchfinal_boundary_check~last~slenres~info~st~groupswith|Match_asstatus->status|Failed|Running->(* A failure here just means that we need more data, i.e.
it's a partial match. *)Running)elsefinal_boundary_check~last~slenres~info~st~groupsinmatchreswithAutomata.Match(marks,pmarks)->Match{s;marks;pmarks;gpos=info.positions;gcount=re.group_count}|Automata.Failed->Failed|Automata.Running->letno_match_starts_before=ifgroupstheninfo.positions.(0)else0inRunning{no_match_starts_before}letmk_re~initial~colors~color_repr~ncolor~lnl~group_names~group_count={initial;initial_states=[];colors;color_repr;ncolor;lnl;tbl=Automata.create_working_area();states=Automata.State.Table.create97;group_names;group_count}(**** Character sets ****)letcseqcc'=Cset.seq(Char.codec)(Char.codec')letcaddcs=Cset.add(Char.codec)slettrans_setcachecms=matchCset.one_charswith|Somei->Cset.csinglecm.[i]|None->letv=(Cset.hash_recs,s)intryCset.CSetMap.findv!cachewithNot_found->letl=Cset.fold_rights~f:(fun(i,j)l->Cset.union(cseqcm.[i]cm.[j])l)~init:Cset.emptyincache:=Cset.CSetMap.addvl!cache;l(****)typeregexp=SetofCset.t|Sequenceofregexplist|Alternativeofregexplist|Repeatofregexp*int*intoption|Beg_of_line|End_of_line|Beg_of_word|End_of_word|Not_bound|Beg_of_str|End_of_str|Last_end_of_line|Start|Stop|SemofAutomata.sem*regexp|Sem_greedyofAutomata.rep_kind*regexp|Groupofstringoption*regexp|No_groupofregexp|Nestofregexp|Caseofregexp|No_caseofregexp|Intersectionofregexplist|Complementofregexplist|Differenceofregexp*regexp|PmarkofPmark.t*regexpmoduleView=structtypet=regexp=SetofCset.t|Sequenceofregexplist|Alternativeofregexplist|Repeatofregexp*int*intoption|Beg_of_line|End_of_line|Beg_of_word|End_of_word|Not_bound|Beg_of_str|End_of_str|Last_end_of_line|Start|Stop|SemofAutomata.sem*regexp|Sem_greedyofAutomata.rep_kind*regexp|Groupofstringoption*regexp|No_groupofregexp|Nestofregexp|Caseofregexp|No_caseofregexp|Intersectionofregexplist|Complementofregexplist|Differenceofregexp*regexp|PmarkofPmark.t*regexpletviewt=tendletrecppfmtt=letopenFmtinletvarsre=sexpfmtsppreinletseqsrel=sexpfmts(listpp)relinmatchtwith|Sets->sexpfmt"Set"Cset.pps|Sequencesq->seq"Sequence"sq|Alternativealt->seq"Alternative"alt|Repeat(re,start,stop)->letpp'fmt()=fprintffmt"%a@ %d%a"pprestartoptintstopinsexpfmt"Repeat"pp'()|Beg_of_line->strfmt"Beg_of_line"|End_of_line->strfmt"End_of_line"|Beg_of_word->strfmt"Beg_of_word"|End_of_word->strfmt"End_of_word"|Not_bound->strfmt"Not_bound"|Beg_of_str->strfmt"Beg_of_str"|End_of_str->strfmt"End_of_str"|Last_end_of_line->strfmt"Last_end_of_line"|Start->strfmt"Start"|Stop->strfmt"Stop"|Sem(sem,re)->sexpfmt"Sem"(pairAutomata.pp_sempp)(sem,re)|Sem_greedy(k,re)->sexpfmt"Sem_greedy"(pairAutomata.pp_rep_kindpp)(k,re)|Group(None,c)->var"Group"c|Group(Somen,c)->sexpfmt"Named_group"(pairstrpp)(n,c)|No_groupc->var"No_group"c|Nestc->var"Nest"c|Casec->var"Case"c|No_casec->var"No_case"c|Intersectionc->seq"Intersection"c|Complementc->seq"Complement"c|Difference(a,b)->sexpfmt"Difference"(pairpppp)(a,b)|Pmark(m,r)->sexpfmt"Pmark"(pairPmark.pppp)(m,r)letrecis_charset=function|Set_->true|Alternativel|Intersectionl|Complementl->List.for_allis_charsetl|Difference(r,r')->is_charsetr&&is_charsetr'|Sem(_,r)|Sem_greedy(_,r)|No_groupr|Caser|No_caser->is_charsetr|Sequence_|Repeat_|Beg_of_line|End_of_line|Beg_of_word|End_of_word|Beg_of_str|End_of_str|Not_bound|Last_end_of_line|Start|Stop|Group_|Nest_|Pmark(_,_)->false(*XXX Use a better algorithm allowing non-contiguous regions? *)letcupper=Cset.union(cseq'A''Z')(Cset.union(cseq'\192''\214')(cseq'\216''\222'))letclower=Cset.offset32cupperletcalpha=List.fold_rightcadd['\170';'\181';'\186';'\223';'\255'](Cset.unionclowercupper)letcdigit=cseq'0''9'letcalnum=Cset.unioncalphacdigitletcword=cadd'_'calnumletcolorizecregexp=letlnl=reffalseinletreccolorizeregexp=matchregexpwithSets->Color_map.splitsc|Sequencel->List.itercolorizel|Alternativel->List.itercolorizel|Repeat(r,_,_)->colorizer|Beg_of_line|End_of_line->Color_map.split(Cset.csingle'\n')c|Beg_of_word|End_of_word|Not_bound->Color_map.splitcwordc|Beg_of_str|End_of_str|Start|Stop->()|Last_end_of_line->lnl:=true|Sem(_,r)|Sem_greedy(_,r)|Group(_,r)|No_groupr|Nestr|Pmark(_,r)->colorizer|Case_|No_case_|Intersection_|Complement_|Difference_->assertfalseincolorizeregexp;!lnl(**** Compilation ****)letrecequalx1x2=matchx1,x2withSets1,Sets2->s1=s2|Sequencel1,Sequencel2->eq_listl1l2|Alternativel1,Alternativel2->eq_listl1l2|Repeat(x1',i1,j1),Repeat(x2',i2,j2)->i1=i2&&j1=j2&&equalx1'x2'|Beg_of_line,Beg_of_line|End_of_line,End_of_line|Beg_of_word,Beg_of_word|End_of_word,End_of_word|Not_bound,Not_bound|Beg_of_str,Beg_of_str|End_of_str,End_of_str|Last_end_of_line,Last_end_of_line|Start,Start|Stop,Stop->true|Sem(sem1,x1'),Sem(sem2,x2')->sem1=sem2&&equalx1'x2'|Sem_greedy(k1,x1'),Sem_greedy(k2,x2')->k1=k2&&equalx1'x2'|Group_,Group_->(* Do not merge groups! *)false|No_groupx1',No_groupx2'->equalx1'x2'|Nestx1',Nestx2'->equalx1'x2'|Casex1',Casex2'->equalx1'x2'|No_casex1',No_casex2'->equalx1'x2'|Intersectionl1,Intersectionl2->eq_listl1l2|Complementl1,Complementl2->eq_listl1l2|Difference(x1',x1''),Difference(x2',x2'')->equalx1'x2'&&equalx1''x2''|Pmark(m1,r1),Pmark(m2,r2)->Pmark.equalm1m2&&equalr1r2|_->falseandeq_listl1l2=matchl1,l2with[],[]->true|x1::r1,x2::r2->equalx1x2&&eq_listr1r2|_->falseletsequence=function|[x]->x|l->Sequencelletrecmerge_sequences=function|[]->[]|Alternativel'::r->merge_sequences(l'@r)|Sequence(x::y)::r->beginmatchmerge_sequencesrwithSequence(x'::y')::r'whenequalxx'->Sequence[x;Alternative[sequencey;sequencey']]::r'|r'->Sequence(x::y)::r'end|x::r->x::merge_sequencesrmoduleA=Automataletenforce_kindidskindkind'cr=matchkind,kind'with`First,`First->cr|`First,k->A.seqidskcr(A.epsids)|_->cr(* XXX should probably compute a category mask *)letrectranslateidskindign_groupign_casegreedyposnamescachec=function|Sets->(A.cstids(trans_setcachecs),kind)|Sequencel->(trans_seqidskindign_groupign_casegreedyposnamescachecl,kind)|Alternativel->beginmatchmerge_sequenceslwith[r']->let(cr,kind')=translateidskindign_groupign_casegreedyposnamescachecr'in(enforce_kindidskindkind'cr,kind)|merged_sequences->(A.altids(List.map(funr'->let(cr,kind')=translateidskindign_groupign_casegreedyposnamescachecr'inenforce_kindidskindkind'cr)merged_sequences),kind)end|Repeat(r',i,j)->let(cr,kind')=translateidskindign_groupign_casegreedyposnamescachecr'inletrem=matchjwithNone->A.repidsgreedykind'cr|Somej->letf=matchgreedywith`Greedy->funrem->A.altids[A.seqidskind'(A.renameidscr)rem;A.epsids]|`Non_greedy->funrem->A.altids[A.epsids;A.seqidskind'(A.renameidscr)rem]initer(j-i)f(A.epsids)in(iteri(funrem->A.seqidskind'(A.renameidscr)rem)rem,kind)|Beg_of_line->(A.afteridsCategory.(inexistant++newline),kind)|End_of_line->(A.beforeidsCategory.(inexistant++newline),kind)|Beg_of_word->(A.seqids`First(A.afteridsCategory.(inexistant++not_letter))(A.beforeidsCategory.letter),kind)|End_of_word->(A.seqids`First(A.afteridsCategory.letter)(A.beforeidsCategory.(inexistant++not_letter)),kind)|Not_bound->(A.altids[A.seqids`First(A.afteridsCategory.letter)(A.beforeidsCategory.letter);A.seqids`First(A.afteridsCategory.(inexistant++not_letter))(A.beforeidsCategory.(inexistant++not_letter))],kind)|Beg_of_str->(A.afteridsCategory.inexistant,kind)|End_of_str->(A.beforeidsCategory.inexistant,kind)|Last_end_of_line->(A.beforeidsCategory.(inexistant++lastnewline),kind)|Start->(A.afteridsCategory.search_boundary,kind)|Stop->(A.beforeidsCategory.search_boundary,kind)|Sem(kind',r')->let(cr,kind'')=translateidskind'ign_groupign_casegreedyposnamescachecr'in(enforce_kindidskind'kind''cr,kind')|Sem_greedy(greedy',r')->translateidskindign_groupign_casegreedy'posnamescachecr'|Group(n,r')->ifign_groupthentranslateidskindign_groupign_casegreedyposnamescachecr'elseletp=!posinlet()=matchnwith|Somename->names:=(name,p/2)::!names|None->()inpos:=!pos+2;let(cr,kind')=translateidskindign_groupign_casegreedyposnamescachecr'in(A.seqids`First(A.markidsp)(A.seqids`Firstcr(A.markids(p+1))),kind')|No_groupr'->translateidskindtrueign_casegreedyposnamescachecr'|Nestr'->letb=!posinlet(cr,kind')=translateidskindign_groupign_casegreedyposnamescachecr'inlete=!pos-1inife<bthen(cr,kind')else(A.seqids`First(A.eraseidsbe)cr,kind')|Difference_|Complement_|Intersection_|No_case_|Case_->assertfalse|Pmark(i,r')->let(cr,kind')=translateidskindign_groupign_casegreedyposnamescachecr'in(A.seqids`First(A.pmarkidsi)cr,kind')andtrans_seqidskindign_groupign_casegreedyposnamescachec=function|[]->A.epsids|[r]->let(cr',kind')=translateidskindign_groupign_casegreedyposnamescachecrinenforce_kindidskindkind'cr'|r::rem->let(cr',kind')=translateidskindign_groupign_casegreedyposnamescachecrinletcr''=trans_seqidskindign_groupign_casegreedyposnamescachecreminifA.is_epscr''thencr'elseifA.is_epscr'thencr''elseA.seqidskind'cr'cr''(**** Case ****)letcase_insenss=Cset.unions(Cset.union(Cset.offset32(Cset.interscupper))(Cset.offset(-32)(Cset.intersclower)))letas_set=function|Sets->s|_->assertfalse(* XXX Should split alternatives into (1) charsets and (2) more
complex regular expressions; alternative should therefore probably
be flatten here *)letrechandle_caseign_case=function|Sets->Set(ifign_casethencase_insensselses)|Sequencel->Sequence(List.map(handle_caseign_case)l)|Alternativel->letl'=List.map(handle_caseign_case)linifis_charset(Alternativel')thenSet(List.fold_left(funsr->Cset.unions(as_setr))Cset.emptyl')elseAlternativel'|Repeat(r,i,j)->Repeat(handle_caseign_caser,i,j)|Beg_of_line|End_of_line|Beg_of_word|End_of_word|Not_bound|Beg_of_str|End_of_str|Last_end_of_line|Start|Stopasr->r|Sem(k,r)->letr'=handle_caseign_caserinifis_charsetr'thenr'elseSem(k,r')|Sem_greedy(k,r)->letr'=handle_caseign_caserinifis_charsetr'thenr'elseSem_greedy(k,r')|Group(n,r)->Group(n,handle_caseign_caser)|No_groupr->letr'=handle_caseign_caserinifis_charsetr'thenr'elseNo_groupr'|Nestr->letr'=handle_caseign_caserinifis_charsetr'thenr'elseNestr'|Caser->handle_casefalser|No_caser->handle_casetruer|Intersectionl->letl'=List.map(funr->handle_caseign_caser)linSet(List.fold_left(funsr->Cset.inters(as_setr))Cset.canyl')|Complementl->letl'=List.map(funr->handle_caseign_caser)linSet(Cset.diffCset.cany(List.fold_left(funsr->Cset.unions(as_setr))Cset.emptyl'))|Difference(r,r')->Set(Cset.inter(as_set(handle_caseign_caser))(Cset.diffCset.cany(as_set(handle_caseign_caser'))))|Pmark(i,r)->Pmark(i,handle_caseign_caser)(****)letcompile_1regexp=letregexp=handle_casefalseregexpinletc=Color_map.make()inletneed_lnl=colorizecregexpinlet(colors,color_repr,ncolor)=Color_map.flattencinletlnl=ifneed_lnlthenncolorelse-1inletncolor=ifneed_lnlthenncolor+1elsencolorinletids=A.create_ids()inletpos=ref0inletnames=ref[]inlet(r,kind)=translateids`Firstfalsefalse`Greedyposnames(refCset.CSetMap.empty)colorsregexpinletr=enforce_kindids`Firstkindrin(*Format.eprintf "<%d %d>@." !ids ncol;*)mk_re~initial:r~colors~color_repr~ncolor~lnl~group_names:(List.rev!names)~group_count:(!pos/2)(****)letrecanchored=function|Sequencel->List.existsanchoredl|Alternativel->List.for_allanchoredl|Repeat(r,i,_)->i>0&&anchoredr|Set_|Beg_of_line|End_of_line|Beg_of_word|End_of_word|Not_bound|End_of_str|Last_end_of_line|Stop|Intersection_|Complement_|Difference_->false|Beg_of_str|Start->true|Sem(_,r)|Sem_greedy(_,r)|Group(_,r)|No_groupr|Nestr|Caser|No_caser|Pmark(_,r)->anchoredr(****)typet=regexpletstrs=letl=ref[]infori=String.lengths-1downto0dol:=Set(Cset.csingles.[i])::!ldone;Sequence!lletcharc=Set(Cset.csinglec)letalt=function|[r]->r|l->Alternativelletseq=function|[r]->r|l->Sequencelletempty=alt[]letepsilon=seq[]letrepnrij=ifi<0theninvalid_arg"Re.repn";beginmatchjwith|Somejwhenj<i->invalid_arg"Re.repn"|_->()end;Repeat(r,i,j)letrepr=repnr0Noneletrep1r=repnr1Noneletoptr=repnr0(Some1)letbol=Beg_of_lineleteol=End_of_lineletbow=Beg_of_wordleteow=End_of_wordletwordr=seq[bow;r;eow]letnot_boundary=Not_boundletbos=Beg_of_strleteos=End_of_strletwhole_stringr=seq[bos;r;eos]letleol=Last_end_of_lineletstart=Startletstop=Stopletlongestr=Sem(`Longest,r)letshortestr=Sem(`Shortest,r)letfirstr=Sem(`First,r)letgreedyr=Sem_greedy(`Greedy,r)letnon_greedyr=Sem_greedy(`Non_greedy,r)letgroup?namer=Group(name,r)letno_groupr=No_grouprletnestr=Nestrletmarkr=leti=Pmark.gen()in(i,Pmark(i,r))letsetstr=lets=refCset.emptyinfori=0toString.lengthstr-1dos:=Cset.union(Cset.csinglestr.[i])!sdone;Set!sletrgcc'=Set(cseqcc')letinterl=letr=Intersectionlinifis_charsetrthenrelseinvalid_arg"Re.inter"letcompll=letr=Complementlinifis_charsetrthenrelseinvalid_arg"Re.compl"letdiffrr'=letr''=Difference(r,r')inifis_charsetr''thenr''elseinvalid_arg"Re.diff"letany=SetCset.canyletnotnl=Set(Cset.diffCset.cany(Cset.csingle'\n'))letlower=alt[rg'a''z';char'\181';rg'\223''\246';rg'\248''\255']letupper=alt[rg'A''Z';rg'\192''\214';rg'\216''\222']letalpha=alt[lower;upper;char'\170';char'\186']letdigit=rg'0''9'letalnum=alt[alpha;digit]letwordc=alt[alnum;char'_']letascii=rg'\000''\127'letblank=set"\t "letcntrl=alt[rg'\000''\031';rg'\127''\159']letgraph=alt[rg'\033''\126';rg'\160''\255']letprint=alt[rg'\032''\126';rg'\160''\255']letpunct=alt[rg'\033''\047';rg'\058''\064';rg'\091''\096';rg'\123''\126';rg'\160''\169';rg'\171''\180';rg'\182''\185';rg'\187''\191';char'\215';char'\247']letspace=alt[char' ';rg'\009''\013']letxdigit=alt[digit;rg'a''f';rg'A''F']letcaser=Caserletno_caser=No_caser(****)letcompiler=compile_1(ifanchoredrthengrouprelseseq[shortest(repany);groupr])letexec_internalname?(pos=0)?(len=-1)~partial~groupsres=ifpos<0||len<-1||pos+len>String.lengthstheninvalid_argname;match_str~groups~partialres~pos~lenletexec?pos?lenres=matchexec_internal"Re.exec"?pos?len~groups:true~partial:falsereswithMatchsubstr->substr|_->raiseNot_foundletexec_opt?pos?lenres=matchexec_internal"Re.exec_opt"?pos?len~groups:true~partial:falsereswithMatchsubstr->Somesubstr|_->Noneletexecp?pos?lenres=matchexec_internal~groups:false~partial:false"Re.execp"?pos?lenreswithMatch_substr->true|_->falseletexec_partial?pos?lenres=matchexec_internal~groups:false~partial:true"Re.exec_partial"?pos?lenreswithMatch_->`Full|Running_->`Partial|Failed->`Mismatchletexec_partial_detailed?pos?lenres=matchexec_internal~groups:true~partial:true"Re.exec_partial_detailed"?pos?lenreswithMatchgroup->`Fullgroup|Running{no_match_starts_before}->`Partialno_match_starts_before|Failed->`MismatchmoduleMark=structtypet=Pmark.tlettest(g:Group.t)p=Pmark.Set.mempg.pmarksletall(g:Group.t)=g.pmarksmoduleSet=Pmark.Setletequal=Pmark.equalletcompare=Pmark.compareendtypesplit_token=[`Textofstring|`DelimofGroup.t]moduleRseq=structletall?(pos=0)?lenres:_Seq.t=ifpos<0theninvalid_arg"Re.all";(* index of the first position we do not consider.
!pos < limit is an invariant *)letlimit=matchlenwith|None->String.lengths|Somel->ifl<0||pos+l>String.lengthstheninvalid_arg"Re.all";pos+lin(* iterate on matches. When a match is found, search for the next
one just after its end *)letrecauxpos()=ifpos>=limitthenSeq.Nil(* no more matches *)elsematchmatch_str~groups:true~partial:falseres~pos~len:(limit-pos)with|Matchsubstr->letp1,p2=Group.offsetsubstr0inletpos=ifp1=p2thenp2+1elsep2inSeq.Cons(substr,auxpos)|Running_|Failed->Seq.Nilinauxposletmatches?pos?lenres:_Seq.t=all?pos?lenres|>Seq.map(funsub->Group.getsub0)letsplit_full?(pos=0)?lenres:_Seq.t=ifpos<0theninvalid_arg"Re.split";letlimit=matchlenwith|None->String.lengths|Somel->ifl<0||pos+l>String.lengthstheninvalid_arg"Re.split";pos+lin(* i: start of delimited string
pos: first position after last match of [re]
limit: first index we ignore (!pos < limit is an invariant) *)letpos0=posinletrecauxstateipos()=matchstatewith|`Idlewhenpos>=limit->ifi<limitthen(letsub=String.subsi(limit-i)inSeq.Cons(`Textsub,auxstate(i+1)pos))elseSeq.Nil|`Idle->beginmatchmatch_str~groups:true~partial:falseres~pos~len:(limit-pos)with|Matchsubstr->letp1,p2=Group.offsetsubstr0inletpos=ifp1=p2thenp2+1elsep2inletold_i=iinleti=p2inifp1>pos0then((* string does not start by a delimiter *)lettext=String.subsold_i(p1-old_i)inletstate=`Yield(`Delimsubstr)inSeq.Cons(`Texttext,auxstateipos))elseSeq.Cons(`Delimsubstr,auxstateipos)|Running_->Seq.Nil|Failed->ifi<limitthen(lettext=String.subsi(limit-i)in(* yield last string *)Seq.Cons(`Texttext,auxstatelimitpos))elseSeq.Nilend|`Yieldx->Seq.Cons(x,aux`Idleipos)inaux`Idleposposletsplit?pos?lenres:_Seq.t=letseq=split_full?pos?lenresinletrecfilterseq()=matchseq()with|Seq.Nil->Seq.Nil|Seq.Cons(`Delim_,tl)->filtertl()|Seq.Cons(`Texts,tl)->Seq.Cons(s,filtertl)infilterseqendmoduleRlist=structletlist_of_seq(s:'aSeq.t):'alist=Seq.fold_left(funlx->x::l)[]s|>List.revletall?pos?lenres=Rseq.all?pos?lenres|>list_of_seqletmatches?pos?lenres=Rseq.matches?pos?lenres|>list_of_seqletsplit_full?pos?lenres=Rseq.split_full?pos?lenres|>list_of_seqletsplit?pos?lenres=Rseq.split?pos?lenres|>list_of_seqendmoduleGen=structtype'agen=unit->'aoptionletgen_of_seq(s:'aSeq.t):'agen=letr=refsinfun()->match!r()with|Seq.Nil->None|Seq.Cons(x,tl)->r:=tl;Somexletsplit?pos?lenres:_gen=Rseq.split?pos?lenres|>gen_of_seqletsplit_full?pos?lenres:_gen=Rseq.split_full?pos?lenres|>gen_of_seqletall?pos?lenres=Rseq.all?pos?lenres|>gen_of_seqletmatches?pos?lenres=Rseq.matches?pos?lenres|>gen_of_seqendletreplace?(pos=0)?len?(all=true)re~fs=ifpos<0theninvalid_arg"Re.replace";letlimit=matchlenwith|None->String.lengths|Somel->ifl<0||pos+l>String.lengthstheninvalid_arg"Re.replace";pos+lin(* buffer into which we write the result *)letbuf=Buffer.create(String.lengths)in(* iterate on matched substrings. *)letreciterpos=ifpos<limitthenmatchmatch_str~groups:true~partial:falseres~pos~len:(limit-pos)with|Matchsubstr->letp1,p2=Group.offsetsubstr0in(* add string between previous match and current match *)Buffer.add_substringbufspos(p1-pos);(* what should we replace the matched group with? *)letreplacing=fsubstrinBuffer.add_stringbufreplacing;ifallthen(* if we matched a non-char e.g. ^ we must manually advance by 1 *)iter(ifp1=p2then((* a non char could be past the end of string. e.g. $ *)ifp2<limitthenBuffer.add_charbufs.[p2];p2+1)elsep2)elseBuffer.add_substringbufsp2(limit-p2)|Running_->()|Failed->Buffer.add_substringbufspos(limit-pos)initerpos;Buffer.contentsbufletreplace_string?pos?len?allre~bys=replace?pos?len?allres~f:(fun_->by)letwitnesst=letrecwitness=function|Setc->String.make1(Char.chr(Cset.pickc))|Sequencexs->String.concat""(List.mapwitnessxs)|Alternative(x::_)->witnessx|Alternative[]->assertfalse|Repeat(r,from,_to)->letw=witnessrinletb=Buffer.create(String.lengthw*from)infor_i=1tofromdoBuffer.add_stringbwdone;Buffer.contentsb|No_caser->witnessr|Intersection_|Complement_|Difference(_,_)->assertfalse|Group(_,r)|No_groupr|Nestr|Sem(_,r)|Pmark(_,r)|Caser|Sem_greedy(_,r)->witnessr|Beg_of_line|End_of_line|Beg_of_word|End_of_word|Not_bound|Beg_of_str|Last_end_of_line|Start|Stop|End_of_str->""inwitness(handle_casefalset)moduleSeq=RseqmoduleList=RlistmoduleGroup=Group(** {2 Deprecated functions} *)letsplit_full_seq=Seq.split_fullletsplit_seq=Seq.splitletmatches_seq=Seq.matchesletall_seq=Seq.alltype'agen='aGen.genletall_gen=Gen.allletmatches_gen=Gen.matchesletsplit_gen=Gen.splitletsplit_full_gen=Gen.split_fulltypesubstrings=Group.tletget=Group.getletget_ofs=Group.offsetletget_all=Group.allletget_all_ofs=Group.all_offsetlettest=Group.testtypemarkid=Mark.tletmarked=Mark.testletmark_set=Mark.all(**********************************)(*
Information about the previous character:
- does not exists
- is a letter
- is not a letter
- is a newline
- is last newline
Beginning of word:
- previous is not a letter or does not exist
- current is a letter or does not exist
End of word:
- previous is a letter or does not exist
- current is not a letter or does not exist
Beginning of line:
- previous is a newline or does not exist
Beginning of buffer:
- previous does not exist
End of buffer
- current does not exist
End of line
- current is a newline or does not exist
*)(*
Rep: e = T,e | ()
- semantics of the comma (shortest/longest/first)
- semantics of the union (greedy/non-greedy)
Bounded repetition
a{0,3} = (a,(a,a?)?)?
*)typegroups=Group.tincludeRlist