123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254(*
RE - A regular expression library
Copyright (C) 2001 Jerome Vouillon
email: Jerome.Vouillon@pps.jussieu.fr
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation, with
linking exception; either version 2.1 of the License, or (at
your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*)letreciternfv=ifn=0thenvelseiter(n-1)f(fv)(****)letunknown=-2letbreak=-3typematch_info=|MatchofGroup.t|Failed|Runningtypestate={idx:int;(* Index of the current position in the position table.
Not yet computed transitions point to a dummy state where
[idx] is set to [unknown];
If [idx] is set to [break] for states that either always
succeed or always fail. *)real_idx:int;(* The real index, in case [idx] is set to [break] *)next:statearray;(* Transition table, indexed by color *)mutablefinal:(Category.t*(Automata.idx*Automata.status))list;(* Mapping from the category of the next character to
- the index where the next position should be saved
- possibly, the list of marks (and the corresponding indices)
corresponding to the best match *)desc:Automata.State.t(* Description of this state of the automata *)}(* Automata (compiled regular expression) *)typere={initial:Automata.expr;(* The whole regular expression *)mutableinitial_states:(Category.t*state)list;(* Initial states, indexed by initial category *)colors:Bytes.t;(* Color table *)color_repr:Bytes.t;(* Table from colors to one character of this color *)ncolor:int;(* Number of colors. *)lnl:int;(* Color of the last newline. -1 if unnecessary *)tbl:Automata.working_area;(* Temporary table used to compute the first available index
when computing a new state *)states:stateAutomata.State.Table.t;(* States of the deterministic automata *)group_count:int(* Number of groups in the regular expression *)}letpp_rechre=Automata.ppchre.initialletprint_re=pp_re(* Information used during matching *)typeinfo={re:re;(* The automata *)colors:Bytes.t;(* Color table ([x.colors = x.re.colors])
Shortcut used for performance reasons *)mutablepositions:intarray;(* Array of mark positions
The mark are off by one for performance reasons *)pos:int;(* Position where the match is started *)last:int(* Position where the match should stop *)}(****)letcategoryre~color=ifcolor=-1thenCategory.inexistant(* Special category for the last newline *)elseifcolor=re.lnlthenCategory.(lastnewline++newline++not_letter)elseCategory.from_char(Bytes.getre.color_reprcolor)(****)letdummy_next=[||]letunknown_state={idx=unknown;real_idx=0;next=dummy_next;final=[];desc=Automata.State.dummy}letmk_statencoldesc=letbreak_state=matchAutomata.statusdescwith|Automata.Running->false|Automata.Failed|Automata.Match_->truein{idx=ifbreak_statethenbreakelsedesc.Automata.State.idx;real_idx=desc.Automata.State.idx;next=ifbreak_statethendummy_nextelseArray.makencolunknown_state;final=[];desc}letfind_stateredesc=tryAutomata.State.Table.findre.statesdescwithNot_found->letst=mk_statere.ncolordescinAutomata.State.Table.addre.statesdescst;st(**** Match with marks ****)letdeltainfocat~colorst=letdesc=Automata.deltainfo.re.tblcatcolorst.descinletlen=Array.lengthinfo.positionsinifdesc.Automata.State.idx=len&&len>0thenbeginletpos=info.positionsininfo.positions<-Array.make(2*len)0;Array.blitpos0info.positions0lenend;descletvalidateinfo(s:string)~posst=letcolor=Char.code(Bytes.getinfo.colors(Char.codes.[pos]))inletcat=categoryinfo.re~colorinletdesc'=deltainfocat~colorstinletst'=find_stateinfo.redesc'inst.next.(color)<-st'(*
let rec loop info s pos st =
if pos < info.last then
let st' = st.next.(Char.code info.cols.[Char.code s.[pos]]) in
let idx = st'.idx in
if idx >= 0 then begin
info.positions.(idx) <- pos;
loop info s (pos + 1) st'
end else if idx = break then begin
info.positions.(st'.real_idx) <- pos;
st'
end else begin (* Unknown *)
validate info s pos st;
loop info s pos st
end
else
st
*)letrecloopinfo(s:string)~posst=ifpos<info.lastthenletst'=st.next.(Char.code(Bytes.getinfo.colors(Char.codes.[pos])))inloop2infos~posstst'elsestandloop2infos~posstst'=ifst'.idx>=0thenbeginletpos=pos+1inifpos<info.lastthenbegin(* It is important to place these reads before the write *)(* But then, we don't have enough registers left to store the
right position. So, we store the position plus one. *)letst''=st'.next.(Char.code(Bytes.getinfo.colors(Char.codes.[pos])))ininfo.positions.(st'.idx)<-pos;loop2infos~posst'st''endelsebegininfo.positions.(st'.idx)<-pos;st'endendelseifst'.idx=breakthenbegininfo.positions.(st'.real_idx)<-pos+1;st'endelsebegin(* Unknown *)validateinfos~posst;loopinfos~posstendletrecloop_no_markinfos~pos~lastst=ifpos<lastthenletst'=st.next.(Char.code(Bytes.getinfo.colors(Char.codes.[pos])))inifst'.idx>=0thenloop_no_markinfos~pos:(pos+1)~lastst'elseifst'.idx=breakthenst'elsebegin(* Unknown *)validateinfos~posst;loop_no_markinfos~pos~laststendelsestletfinalinfostcat=tryList.assqcatst.finalwithNot_found->letst'=deltainfocat~color:(-1)stinletres=(st'.Automata.State.idx,Automata.statusst')inst.final<-(cat,res)::st.final;resletfind_initial_staterecat=tryList.assqcatre.initial_stateswithNot_found->letst=find_statere(Automata.State.createcatre.initial)inre.initial_states<-(cat,st)::re.initial_states;stletget_colorre(s:string)pos=ifpos<0then-1elseletslen=String.lengthsinifpos>=slenthen-1elseifpos=slen-1&&re.lnl<>-1&&s.[pos]='\n'then(* Special case for the last newline *)re.lnlelseChar.code(Bytes.getre.colors(Char.codes.[pos]))letrechandle_last_newlineinfo~posst~groups=letst'=st.next.(info.re.lnl)inifst'.idx>=0thenbeginifgroupstheninfo.positions.(st'.idx)<-pos+1;st'endelseifst'.idx=breakthenbeginifgroupstheninfo.positions.(st'.real_idx)<-pos+1;st'endelsebegin(* Unknown *)letcolor=info.re.lnlinletreal_c=Char.code(Bytes.getinfo.colors(Char.code'\n'))inletcat=categoryinfo.re~colorinletdesc'=deltainfocat~color:real_cstinletst'=find_stateinfo.redesc'inst.next.(color)<-st';handle_last_newlineinfo~posst~groupsendletrecscan_strinfo(s:string)initial_state~groups=letpos=info.posinletlast=info.lastinif(last=String.lengths&&info.re.lnl<>-1&&last>pos&&String.gets(last-1)='\n')thenbeginletinfo={infowithlast=last-1}inletst=scan_strinfosinitial_state~groupsinifst.idx=breakthenstelsehandle_last_newlineinfo~pos:(last-1)st~groupsendelseifgroupsthenloopinfos~posinitial_stateelseloop_no_markinfos~pos~lastinitial_stateletmatch_str~groups~partialres~pos~len=letslen=String.lengthsinletlast=iflen=-1thenslenelsepos+leninletinfo={re;colors=re.colors;pos;last;positions=ifgroupsthenbeginletn=Automata.index_countre.tbl+1inifn<=10then[|0;0;0;0;0;0;0;0;0;0|]elseArray.maken0endelse[||]}inletinitial_cat=ifpos=0thenCategory.(search_boundary++inexistant)elseCategory.(search_boundary++categoryre~color:(get_colorres(pos-1)))inletinitial_state=find_initial_statereinitial_catinletst=scan_strinfosinitial_state~groupsinletres=ifst.idx=break||partialthenAutomata.statusst.descelseletfinal_cat=iflast=slenthenCategory.(search_boundary++inexistant)elseCategory.(search_boundary++categoryre~color:(get_colorreslast))inlet(idx,res)=finalinfostfinal_catinifgroupstheninfo.positions.(idx)<-last+1;resinmatchreswithAutomata.Match(marks,pmarks)->Match{s;marks;pmarks;gpos=info.positions;gcount=re.group_count}|Automata.Failed->Failed|Automata.Running->Runningletmk_re~initial~colors~color_repr~ncolor~lnl~group_count={initial;initial_states=[];colors;color_repr;ncolor;lnl;tbl=Automata.create_working_area();states=Automata.State.Table.create97;group_count}(**** Character sets ****)letcseqcc'=Cset.seq(Char.codec)(Char.codec')letcaddcs=Cset.add(Char.codec)slettrans_setcachecms=matchCset.one_charswith|Somei->Cset.csingle(Bytes.getcmi)|None->letv=(Cset.hash_recs,s)intryCset.CSetMap.findv!cachewithNot_found->letl=Cset.fold_rights~f:(fun(i,j)l->Cset.union(cseq(Bytes.getcmi)(Bytes.getcmj))l)~init:Cset.emptyincache:=Cset.CSetMap.addvl!cache;l(****)typeregexp=SetofCset.t|Sequenceofregexplist|Alternativeofregexplist|Repeatofregexp*int*intoption|Beg_of_line|End_of_line|Beg_of_word|End_of_word|Not_bound|Beg_of_str|End_of_str|Last_end_of_line|Start|Stop|SemofAutomata.sem*regexp|Sem_greedyofAutomata.rep_kind*regexp|Groupofregexp|No_groupofregexp|Nestofregexp|Caseofregexp|No_caseofregexp|Intersectionofregexplist|Complementofregexplist|Differenceofregexp*regexp|PmarkofPmark.t*regexpmoduleView=structtypet=regexp=SetofCset.t|Sequenceofregexplist|Alternativeofregexplist|Repeatofregexp*int*intoption|Beg_of_line|End_of_line|Beg_of_word|End_of_word|Not_bound|Beg_of_str|End_of_str|Last_end_of_line|Start|Stop|SemofAutomata.sem*regexp|Sem_greedyofAutomata.rep_kind*regexp|Groupofregexp|No_groupofregexp|Nestofregexp|Caseofregexp|No_caseofregexp|Intersectionofregexplist|Complementofregexplist|Differenceofregexp*regexp|PmarkofPmark.t*regexpletviewt=tendletrecppfmtt=letopenFmtinletvarsre=sexpfmtsppreinletseqsrel=sexpfmts(listpp)relinmatchtwith|Sets->sexpfmt"Set"Cset.pps|Sequencesq->seq"Sequence"sq|Alternativealt->seq"Alternative"alt|Repeat(re,start,stop)->letpp'fmt()=fprintffmt"%a@ %d%a"pprestartoptintstopinsexpfmt"Repeat"pp'()|Beg_of_line->strfmt"Beg_of_line"|End_of_line->strfmt"End_of_line"|Beg_of_word->strfmt"Beg_of_word"|End_of_word->strfmt"End_of_word"|Not_bound->strfmt"Not_bound"|Beg_of_str->strfmt"Beg_of_str"|End_of_str->strfmt"End_of_str"|Last_end_of_line->strfmt"Last_end_of_line"|Start->strfmt"Start"|Stop->strfmt"Stop"|Sem(sem,re)->sexpfmt"Sem"(pairAutomata.pp_sempp)(sem,re)|Sem_greedy(k,re)->sexpfmt"Sem_greedy"(pairAutomata.pp_rep_kindpp)(k,re)|Groupc->var"Group"c|No_groupc->var"No_group"c|Nestc->var"Nest"c|Casec->var"Case"c|No_casec->var"No_case"c|Intersectionc->seq"Intersection"c|Complementc->seq"Complement"c|Difference(a,b)->sexpfmt"Difference"(pairpppp)(a,b)|Pmark(m,r)->sexpfmt"Pmark"(pairPmark.pppp)(m,r)letrecis_charset=function|Set_->true|Alternativel|Intersectionl|Complementl->List.for_allis_charsetl|Difference(r,r')->is_charsetr&&is_charsetr'|Sem(_,r)|Sem_greedy(_,r)|No_groupr|Caser|No_caser->is_charsetr|Sequence_|Repeat_|Beg_of_line|End_of_line|Beg_of_word|End_of_word|Beg_of_str|End_of_str|Not_bound|Last_end_of_line|Start|Stop|Group_|Nest_|Pmark(_,_)->false(*XXX Use a better algorithm allowing non-contiguous regions? *)letcupper=Cset.union(cseq'A''Z')(Cset.union(cseq'\192''\214')(cseq'\216''\222'))letclower=Cset.offset32cupperletcalpha=List.fold_rightcadd['\170';'\181';'\186';'\223';'\255'](Cset.unionclowercupper)letcdigit=cseq'0''9'letcalnum=Cset.unioncalphacdigitletcword=cadd'_'calnumletcolorizecregexp=letlnl=reffalseinletreccolorizeregexp=matchregexpwithSets->Color_map.splitsc|Sequencel->List.itercolorizel|Alternativel->List.itercolorizel|Repeat(r,_,_)->colorizer|Beg_of_line|End_of_line->Color_map.split(Cset.csingle'\n')c|Beg_of_word|End_of_word|Not_bound->Color_map.splitcwordc|Beg_of_str|End_of_str|Start|Stop->()|Last_end_of_line->lnl:=true|Sem(_,r)|Sem_greedy(_,r)|Groupr|No_groupr|Nestr|Pmark(_,r)->colorizer|Case_|No_case_|Intersection_|Complement_|Difference_->assertfalseincolorizeregexp;!lnl(**** Compilation ****)letrecequalx1x2=matchx1,x2withSets1,Sets2->s1=s2|Sequencel1,Sequencel2->eq_listl1l2|Alternativel1,Alternativel2->eq_listl1l2|Repeat(x1',i1,j1),Repeat(x2',i2,j2)->i1=i2&&j1=j2&&equalx1'x2'|Beg_of_line,Beg_of_line|End_of_line,End_of_line|Beg_of_word,Beg_of_word|End_of_word,End_of_word|Not_bound,Not_bound|Beg_of_str,Beg_of_str|End_of_str,End_of_str|Last_end_of_line,Last_end_of_line|Start,Start|Stop,Stop->true|Sem(sem1,x1'),Sem(sem2,x2')->sem1=sem2&&equalx1'x2'|Sem_greedy(k1,x1'),Sem_greedy(k2,x2')->k1=k2&&equalx1'x2'|Group_,Group_->(* Do not merge groups! *)false|No_groupx1',No_groupx2'->equalx1'x2'|Nestx1',Nestx2'->equalx1'x2'|Casex1',Casex2'->equalx1'x2'|No_casex1',No_casex2'->equalx1'x2'|Intersectionl1,Intersectionl2->eq_listl1l2|Complementl1,Complementl2->eq_listl1l2|Difference(x1',x1''),Difference(x2',x2'')->equalx1'x2'&&equalx1''x2''|Pmark(m1,r1),Pmark(m2,r2)->Pmark.equalm1m2&&equalr1r2|_->falseandeq_listl1l2=matchl1,l2with[],[]->true|x1::r1,x2::r2->equalx1x2&&eq_listr1r2|_->falseletsequence=function|[x]->x|l->Sequencelletrecmerge_sequences=function|[]->[]|Alternativel'::r->merge_sequences(l'@r)|Sequence(x::y)::r->beginmatchmerge_sequencesrwithSequence(x'::y')::r'whenequalxx'->Sequence[x;Alternative[sequencey;sequencey']]::r'|r'->Sequence(x::y)::r'end|x::r->x::merge_sequencesrmoduleA=Automataletenforce_kindidskindkind'cr=matchkind,kind'with`First,`First->cr|`First,k->A.seqidskcr(A.epsids)|_->cr(* XXX should probably compute a category mask *)letrectranslateidskindign_groupign_casegreedyposcachec=function|Sets->(A.cstids(trans_setcachecs),kind)|Sequencel->(trans_seqidskindign_groupign_casegreedyposcachecl,kind)|Alternativel->beginmatchmerge_sequenceslwith[r']->let(cr,kind')=translateidskindign_groupign_casegreedyposcachecr'in(enforce_kindidskindkind'cr,kind)|merged_sequences->(A.altids(List.map(funr'->let(cr,kind')=translateidskindign_groupign_casegreedyposcachecr'inenforce_kindidskindkind'cr)merged_sequences),kind)end|Repeat(r',i,j)->let(cr,kind')=translateidskindign_groupign_casegreedyposcachecr'inletrem=matchjwithNone->A.repidsgreedykind'cr|Somej->letf=matchgreedywith`Greedy->funrem->A.altids[A.seqidskind'(A.renameidscr)rem;A.epsids]|`Non_greedy->funrem->A.altids[A.epsids;A.seqidskind'(A.renameidscr)rem]initer(j-i)f(A.epsids)in(iteri(funrem->A.seqidskind'(A.renameidscr)rem)rem,kind)|Beg_of_line->(A.afteridsCategory.(inexistant++newline),kind)|End_of_line->(A.beforeidsCategory.(inexistant++newline),kind)|Beg_of_word->(A.seqids`First(A.afteridsCategory.(inexistant++not_letter))(A.beforeidsCategory.(inexistant++letter)),kind)|End_of_word->(A.seqids`First(A.afteridsCategory.(inexistant++letter))(A.beforeidsCategory.(inexistant++not_letter)),kind)|Not_bound->(A.altids[A.seqids`First(A.afteridsCategory.letter)(A.beforeidsCategory.letter);A.seqids`First(A.afteridsCategory.letter)(A.beforeidsCategory.letter)],kind)|Beg_of_str->(A.afteridsCategory.inexistant,kind)|End_of_str->(A.beforeidsCategory.inexistant,kind)|Last_end_of_line->(A.beforeidsCategory.(inexistant++lastnewline),kind)|Start->(A.afteridsCategory.search_boundary,kind)|Stop->(A.beforeidsCategory.search_boundary,kind)|Sem(kind',r')->let(cr,kind'')=translateidskind'ign_groupign_casegreedyposcachecr'in(enforce_kindidskind'kind''cr,kind')|Sem_greedy(greedy',r')->translateidskindign_groupign_casegreedy'poscachecr'|Groupr'->ifign_groupthentranslateidskindign_groupign_casegreedyposcachecr'elseletp=!posinpos:=!pos+2;let(cr,kind')=translateidskindign_groupign_casegreedyposcachecr'in(A.seqids`First(A.markidsp)(A.seqids`Firstcr(A.markids(p+1))),kind')|No_groupr'->translateidskindtrueign_casegreedyposcachecr'|Nestr'->letb=!posinlet(cr,kind')=translateidskindign_groupign_casegreedyposcachecr'inlete=!pos-1inife<bthen(cr,kind')else(A.seqids`First(A.eraseidsbe)cr,kind')|Difference_|Complement_|Intersection_|No_case_|Case_->assertfalse|Pmark(i,r')->let(cr,kind')=translateidskindign_groupign_casegreedyposcachecr'in(A.seqids`First(A.pmarkidsi)cr,kind')andtrans_seqidskindign_groupign_casegreedyposcachec=function|[]->A.epsids|[r]->let(cr',kind')=translateidskindign_groupign_casegreedyposcachecrinenforce_kindidskindkind'cr'|r::rem->let(cr',kind')=translateidskindign_groupign_casegreedyposcachecrinletcr''=trans_seqidskindign_groupign_casegreedyposcachecreminifA.is_epscr''thencr'elseifA.is_epscr'thencr''elseA.seqidskind'cr'cr''(**** Case ****)letcase_insenss=Cset.unions(Cset.union(Cset.offset32(Cset.interscupper))(Cset.offset(-32)(Cset.intersclower)))letas_set=function|Sets->s|_->assertfalse(* XXX Should split alternatives into (1) charsets and (2) more
complex regular expressions; alternative should therefore probably
be flatten here *)letrechandle_caseign_case=function|Sets->Set(ifign_casethencase_insensselses)|Sequencel->Sequence(List.map(handle_caseign_case)l)|Alternativel->letl'=List.map(handle_caseign_case)linifis_charset(Alternativel')thenSet(List.fold_left(funsr->Cset.unions(as_setr))Cset.emptyl')elseAlternativel'|Repeat(r,i,j)->Repeat(handle_caseign_caser,i,j)|Beg_of_line|End_of_line|Beg_of_word|End_of_word|Not_bound|Beg_of_str|End_of_str|Last_end_of_line|Start|Stopasr->r|Sem(k,r)->letr'=handle_caseign_caserinifis_charsetr'thenr'elseSem(k,r')|Sem_greedy(k,r)->letr'=handle_caseign_caserinifis_charsetr'thenr'elseSem_greedy(k,r')|Groupr->Group(handle_caseign_caser)|No_groupr->letr'=handle_caseign_caserinifis_charsetr'thenr'elseNo_groupr'|Nestr->letr'=handle_caseign_caserinifis_charsetr'thenr'elseNestr'|Caser->handle_casefalser|No_caser->handle_casetruer|Intersectionl->letl'=List.map(funr->handle_caseign_caser)linSet(List.fold_left(funsr->Cset.inters(as_setr))Cset.canyl')|Complementl->letl'=List.map(funr->handle_caseign_caser)linSet(Cset.diffCset.cany(List.fold_left(funsr->Cset.unions(as_setr))Cset.emptyl'))|Difference(r,r')->Set(Cset.inter(as_set(handle_caseign_caser))(Cset.diffCset.cany(as_set(handle_caseign_caser'))))|Pmark(i,r)->Pmark(i,handle_caseign_caser)(****)letcompile_1regexp=letregexp=handle_casefalseregexpinletc=Color_map.make()inletneed_lnl=colorizecregexpinlet(colors,color_repr,ncolor)=Color_map.flattencinletlnl=ifneed_lnlthenncolorelse-1inletncolor=ifneed_lnlthenncolor+1elsencolorinletids=A.create_ids()inletpos=ref0inlet(r,kind)=translateids`Firstfalsefalse`Greedypos(refCset.CSetMap.empty)colorsregexpinletr=enforce_kindids`Firstkindrin(*Format.eprintf "<%d %d>@." !ids ncol;*)mk_re~initial:r~colors~color_repr~ncolor~lnl~group_count:(!pos/2)(****)letrecanchored=function|Sequencel->List.existsanchoredl|Alternativel->List.for_allanchoredl|Repeat(r,i,_)->i>0&&anchoredr|Set_|Beg_of_line|End_of_line|Beg_of_word|End_of_word|Not_bound|End_of_str|Last_end_of_line|Stop|Intersection_|Complement_|Difference_->false|Beg_of_str|Start->true|Sem(_,r)|Sem_greedy(_,r)|Groupr|No_groupr|Nestr|Caser|No_caser|Pmark(_,r)->anchoredr(****)typet=regexpletstrs=letl=ref[]infori=String.lengths-1downto0dol:=Set(Cset.csingles.[i])::!ldone;Sequence!lletcharc=Set(Cset.csinglec)letalt=function|[r]->r|l->Alternativelletseq=function|[r]->r|l->Sequencelletempty=alt[]letepsilon=seq[]letrepnrij=ifi<0theninvalid_arg"Re.repn";beginmatchjwith|Somejwhenj<i->invalid_arg"Re.repn"|_->()end;Repeat(r,i,j)letrepr=repnr0Noneletrep1r=repnr1Noneletoptr=repnr0(Some1)letbol=Beg_of_lineleteol=End_of_lineletbow=Beg_of_wordleteow=End_of_wordletwordr=seq[bow;r;eow]letnot_boundary=Not_boundletbos=Beg_of_strleteos=End_of_strletwhole_stringr=seq[bos;r;eos]letleol=Last_end_of_lineletstart=Startletstop=Stopletlongestr=Sem(`Longest,r)letshortestr=Sem(`Shortest,r)letfirstr=Sem(`First,r)letgreedyr=Sem_greedy(`Greedy,r)letnon_greedyr=Sem_greedy(`Non_greedy,r)letgroupr=Grouprletno_groupr=No_grouprletnestr=Nestrletmarkr=leti=Pmark.gen()in(i,Pmark(i,r))letsetstr=lets=refCset.emptyinfori=0toString.lengthstr-1dos:=Cset.union(Cset.csinglestr.[i])!sdone;Set!sletrgcc'=Set(cseqcc')letinterl=letr=Intersectionlinifis_charsetrthenrelseinvalid_arg"Re.inter"letcompll=letr=Complementlinifis_charsetrthenrelseinvalid_arg"Re.compl"letdiffrr'=letr''=Difference(r,r')inifis_charsetr''thenr''elseinvalid_arg"Re.diff"letany=SetCset.canyletnotnl=Set(Cset.diffCset.cany(Cset.csingle'\n'))letlower=alt[rg'a''z';char'\181';rg'\223''\246';rg'\248''\255']letupper=alt[rg'A''Z';rg'\192''\214';rg'\216''\222']letalpha=alt[lower;upper;char'\170';char'\186']letdigit=rg'0''9'letalnum=alt[alpha;digit]letwordc=alt[alnum;char'_']letascii=rg'\000''\127'letblank=set"\t "letcntrl=alt[rg'\000''\031';rg'\127''\159']letgraph=alt[rg'\033''\126';rg'\160''\255']letprint=alt[rg'\032''\126';rg'\160''\255']letpunct=alt[rg'\033''\047';rg'\058''\064';rg'\091''\096';rg'\123''\126';rg'\160''\169';rg'\171''\180';rg'\182''\185';rg'\187''\191';char'\215';char'\247']letspace=alt[char' ';rg'\009''\013']letxdigit=alt[digit;rg'a''f';rg'A''F']letcaser=Caserletno_caser=No_caser(****)letcompiler=compile_1(ifanchoredrthengrouprelseseq[shortest(repany);groupr])letexec_internalname?(pos=0)?(len=-1)~partial~groupsres=ifpos<0||len<-1||pos+len>String.lengthstheninvalid_argname;match_str~groups~partialres~pos~lenletexec?pos?lenres=matchexec_internal"Re.exec"?pos?len~groups:true~partial:falsereswithMatchsubstr->substr|_->raiseNot_foundletexec_opt?pos?lenres=matchexec_internal"Re.exec_opt"?pos?len~groups:true~partial:falsereswithMatchsubstr->Somesubstr|_->Noneletexecp?pos?lenres=matchexec_internal~groups:false~partial:false"Re.execp"?pos?lenreswithMatch_substr->true|_->falseletexec_partial?pos?lenres=matchexec_internal~groups:false~partial:true"Re.exec_partial"?pos?lenreswithMatch_->`Full|Running->`Partial|Failed->`MismatchmoduleMark=structtypet=Pmark.tlettest(g:Group.t)p=Pmark.Set.mempg.pmarksletall(g:Group.t)=g.pmarksmoduleSet=Pmark.Setletequal=Pmark.equalletcompare=Pmark.compareendtypesplit_token=[`Textofstring|`DelimofGroup.t]moduleRseq=structletall?(pos=0)?lenres:_Seq.t=ifpos<0theninvalid_arg"Re.all";(* index of the first position we do not consider.
!pos < limit is an invariant *)letlimit=matchlenwith|None->String.lengths|Somel->ifl<0||pos+l>String.lengthstheninvalid_arg"Re.all";pos+lin(* iterate on matches. When a match is found, search for the next
one just after its end *)letrecauxpos()=ifpos>=limitthenSeq.Nil(* no more matches *)elsematchmatch_str~groups:true~partial:falseres~pos~len:(limit-pos)with|Matchsubstr->letp1,p2=Group.offsetsubstr0inletpos=ifp1=p2thenp2+1elsep2inSeq.Cons(substr,auxpos)|Running|Failed->Seq.Nilinauxposletmatches?pos?lenres:_Seq.t=all?pos?lenres|>Seq.map(funsub->Group.getsub0)letsplit_full?(pos=0)?lenres:_Seq.t=ifpos<0theninvalid_arg"Re.split";letlimit=matchlenwith|None->String.lengths|Somel->ifl<0||pos+l>String.lengthstheninvalid_arg"Re.split";pos+lin(* i: start of delimited string
pos: first position after last match of [re]
limit: first index we ignore (!pos < limit is an invariant) *)letpos0=posinletrecauxstateipos()=matchstatewith|`Idlewhenpos>=limit->ifi<limitthen(letsub=String.subsi(limit-i)inSeq.Cons(`Textsub,auxstate(i+1)pos))elseSeq.Nil|`Idle->beginmatchmatch_str~groups:true~partial:falseres~pos~len:(limit-pos)with|Matchsubstr->letp1,p2=Group.offsetsubstr0inletpos=ifp1=p2thenp2+1elsep2inletold_i=iinleti=p2inifp1>pos0then((* string does not start by a delimiter *)lettext=String.subsold_i(p1-old_i)inletstate=`Yield(`Delimsubstr)inSeq.Cons(`Texttext,auxstateipos))elseSeq.Cons(`Delimsubstr,auxstateipos)|Running->Seq.Nil|Failed->ifi<limitthen(lettext=String.subsi(limit-i)in(* yield last string *)Seq.Cons(`Texttext,auxstatelimitpos))elseSeq.Nilend|`Yieldx->Seq.Cons(x,aux`Idleipos)inaux`Idleposposletsplit?pos?lenres:_Seq.t=letseq=split_full?pos?lenresinletrecfilterseq()=matchseq()with|Seq.Nil->Seq.Nil|Seq.Cons(`Delim_,tl)->filtertl()|Seq.Cons(`Texts,tl)->Seq.Cons(s,filtertl)infilterseqendmoduleRlist=structletlist_of_seq(s:'aSeq.t):'alist=Seq.fold_left(funlx->x::l)[]s|>List.revletall?pos?lenres=Rseq.all?pos?lenres|>list_of_seqletmatches?pos?lenres=Rseq.matches?pos?lenres|>list_of_seqletsplit_full?pos?lenres=Rseq.split_full?pos?lenres|>list_of_seqletsplit?pos?lenres=Rseq.split?pos?lenres|>list_of_seqendmoduleGen=structtype'agen=unit->'aoptionletgen_of_seq(s:'aSeq.t):'agen=letr=refsinfun()->match!r()with|Seq.Nil->None|Seq.Cons(x,tl)->r:=tl;Somexletsplit?pos?lenres:_gen=Rseq.split?pos?lenres|>gen_of_seqletsplit_full?pos?lenres:_gen=Rseq.split_full?pos?lenres|>gen_of_seqletall?pos?lenres=Rseq.all?pos?lenres|>gen_of_seqletmatches?pos?lenres=Rseq.matches?pos?lenres|>gen_of_seqendletreplace?(pos=0)?len?(all=true)re~fs=ifpos<0theninvalid_arg"Re.replace";letlimit=matchlenwith|None->String.lengths|Somel->ifl<0||pos+l>String.lengthstheninvalid_arg"Re.replace";pos+lin(* buffer into which we write the result *)letbuf=Buffer.create(String.lengths)in(* iterate on matched substrings. *)letreciterpos=ifpos<limitthenmatchmatch_str~groups:true~partial:falseres~pos~len:(limit-pos)with|Matchsubstr->letp1,p2=Group.offsetsubstr0in(* add string between previous match and current match *)Buffer.add_substringbufspos(p1-pos);(* what should we replace the matched group with? *)letreplacing=fsubstrinBuffer.add_stringbufreplacing;ifallthen(* if we matched a non-char e.g. ^ we must manually advance by 1 *)iter(ifp1=p2then((* a non char could be past the end of string. e.g. $ *)ifp2<limitthenBuffer.add_charbufs.[p2];p2+1)elsep2)elseBuffer.add_substringbufsp2(limit-p2)|Running->()|Failed->Buffer.add_substringbufspos(limit-pos)initerpos;Buffer.contentsbufletreplace_string?pos?len?allre~bys=replace?pos?len?allres~f:(fun_->by)letwitnesst=letrecwitness=function|Setc->String.make1(Char.chr(Cset.pickc))|Sequencexs->String.concat""(List.mapwitnessxs)|Alternative(x::_)->witnessx|Alternative[]->assertfalse|Repeat(r,from,_to)->letw=witnessrinletb=Buffer.create(String.lengthw*from)infor_i=1tofromdoBuffer.add_stringbwdone;Buffer.contentsb|No_caser->witnessr|Intersection_|Complement_|Difference(_,_)->assertfalse|Groupr|No_groupr|Nestr|Sem(_,r)|Pmark(_,r)|Caser|Sem_greedy(_,r)->witnessr|Beg_of_line|End_of_line|Beg_of_word|End_of_word|Not_bound|Beg_of_str|Last_end_of_line|Start|Stop|End_of_str->""inwitness(handle_casefalset)type'aseq='aSeq.tmoduleSeq=RseqmoduleList=RlistmoduleGroup=Group(** {2 Deprecated functions} *)type'agen='aGen.genletall_gen=Gen.allletmatches_gen=Gen.matchesletsplit_gen=Gen.splitletsplit_full_gen=Gen.split_fullletall_seq=Seq.allletmatches_seq=Seq.matchesletsplit_seq=Seq.splitletsplit_full_seq=Seq.split_fulltypesubstrings=Group.tletget=Group.getletget_ofs=Group.offsetletget_all=Group.allletget_all_ofs=Group.all_offsetlettest=Group.testtypemarkid=Mark.tletmarked=Mark.testletmark_set=Mark.all(**********************************)(*
Information about the previous character:
- does not exists
- is a letter
- is not a letter
- is a newline
- is last newline
Beginning of word:
- previous is not a letter or does not exist
- current is a letter or does not exist
End of word:
- previous is a letter or does not exist
- current is not a letter or does not exist
Beginning of line:
- previous is a newline or does not exist
Beginning of buffer:
- previous does not exist
End of buffer
- current does not exist
End of line
- current is a newline or does not exist
*)(*
Rep: e = T,e | ()
- semantics of the comma (shortest/longest/first)
- semantics of the union (greedy/non-greedy)
Bounded repetition
a{0,3} = (a,(a,a?)?)?
*)typegroups=Group.tincludeRlist