12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343(*
RE - A regular expression library
Copyright (C) 2001 Jerome Vouillon
email: Jerome.Vouillon@pps.jussieu.fr
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation, with
linking exception; either version 2.1 of the License, or (at
your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*)moduleCset=Re_csetmoduleAutomata=Re_automatamoduleMarkSet=Automata.PmarkSetletreciternfv=ifn=0thenvelseiter(n-1)f(fv)(****)letunknown=-2letbreak=-3(* Result of a successful match. *)typegroups={s:string(* Input string. Matched strings are substrings of s *);marks:Automata.mark_infos(* Mapping from group indices to positions in gpos. group i has positions 2*i
- 1, 2*i + 1 in gpos. If the group wasn't matched, then its corresponding
values in marks will be -1,-1 *);pmarks:MarkSet.t(* Marks positions. i.e. those marks created with Re.marks *);gpos:intarray(* Group positions. Adjacent elements are (start, stop) of group match.
indexed by the values in marks. So group i in an re would be the substring:
start = t.gpos.(marks.(2*i)) - 1
stop = t.gpos.(marks.(2*i + 1)) - 1 *);gcount:int(* Number of groups the regular expression contains. Matched or not *)}typematch_info=|Matchofgroups|Failed|Runningtypestate={idx:int;(* Index of the current position in the position table.
Not yet computed transitions point to a dummy state where
[idx] is set to [unknown];
If [idx] is set to [break] for states that either always
succeed or always fail. *)real_idx:int;(* The real index, in case [idx] is set to [break] *)next:statearray;(* Transition table, indexed by color *)mutablefinal:(Automata.category*(Automata.idx*Automata.status))list;(* Mapping from the category of the next character to
- the index where the next position should be saved
- possibly, the list of marks (and the corresponding indices)
corresponding to the best match *)desc:Automata.State.t(* Description of this state of the automata *)}(* Automata (compiled regular expression) *)typere={initial:Automata.expr;(* The whole regular expression *)mutableinitial_states:(Automata.category*state)list;(* Initial states, indexed by initial category *)cols:Bytes.t;(* Color table *)col_repr:Bytes.t;(* Table from colors to one character of this color *)ncol:int;(* Number of colors. *)lnl:int;(* Color of the last newline *)tbl:Automata.working_area;(* Temporary table used to compute the first available index
when computing a new state *)states:stateAutomata.State.Table.t;(* States of the deterministic automata *)group_count:int(* Number of groups in the regular expression *)}letpp_rechre=Automata.ppchre.initialletprint_re=pp_re(* Information used during matching *)typeinfo={re:re;(* The automata *)i_cols:Bytes.t;(* Color table ([x.i_cols = x.re.cols])
Shortcut used for performance reasons *)mutablepositions:intarray;(* Array of mark positions
The mark are off by one for performance reasons *)pos:int;(* Position where the match is started *)last:int(* Position where the match should stop *)}(****)letcat_inexistant=1letcat_letter=2letcat_not_letter=4letcat_newline=8letcat_lastnewline=16letcat_search_boundary=32letcategoryrec=ifc=-1thencat_inexistant(* Special category for the last newline *)elseifc=re.lnlthencat_lastnewlinelorcat_newlinelorcat_not_letterelsematchBytes.getre.col_reprcwith(* Should match [cword] definition *)'a'..'z'|'A'..'Z'|'0'..'9'|'_'|'\170'|'\181'|'\186'|'\192'..'\214'|'\216'..'\246'|'\248'..'\255'->cat_letter|'\n'->cat_not_letterlorcat_newline|_->cat_not_letter(****)letdummy_next=[||]letunknown_state={idx=unknown;real_idx=0;next=dummy_next;final=[];desc=Automata.State.dummy}letmk_statencoldesc=letbreak_state=matchAutomata.statusdescwith|Automata.Running->false|Automata.Failed|Automata.Match_->truein{idx=ifbreak_statethenbreakelsedesc.Automata.State.idx;real_idx=desc.Automata.State.idx;next=ifbreak_statethendummy_nextelseArray.makencolunknown_state;final=[];desc=desc}letfind_stateredesc=tryAutomata.State.Table.findre.statesdescwithNot_found->letst=mk_statere.ncoldescinAutomata.State.Table.addre.statesdescst;st(**** Match with marks ****)letdeltainfocatcst=letdesc=Automata.deltainfo.re.tblcatcst.descinletlen=Array.lengthinfo.positionsinifdesc.Automata.State.idx=len&&len>0thenbeginletpos=info.positionsininfo.positions<-Array.make(2*len)0;Array.blitpos0info.positions0lenend;descletvalidateinfo(s:string)posst=letc=Char.code(Bytes.getinfo.i_cols(Char.codes.[pos]))inletcat=categoryinfo.recinletdesc'=deltainfocatcstinletst'=find_stateinfo.redesc'inst.next.(c)<-st'(*
let rec loop info s pos st =
if pos < info.last then
let st' = st.next.(Char.code info.i_cols.[Char.code s.[pos]]) in
let idx = st'.idx in
if idx >= 0 then begin
info.positions.(idx) <- pos;
loop info s (pos + 1) st'
end else if idx = break then begin
info.positions.(st'.real_idx) <- pos;
st'
end else begin (* Unknown *)
validate info s pos st;
loop info s pos st
end
else
st
*)letrecloopinfo(s:string)posst=ifpos<info.lastthenletst'=st.next.(Char.code(Bytes.getinfo.i_cols(Char.codes.[pos])))inloop2infosposstst'elsestandloop2infosposstst'=ifst'.idx>=0thenbeginletpos=pos+1inifpos<info.lastthenbegin(* It is important to place these reads before the write *)(* But then, we don't have enough registers left to store the
right position. So, we store the position plus one. *)letst''=st'.next.(Char.code(Bytes.getinfo.i_cols(Char.codes.[pos])))ininfo.positions.(st'.idx)<-pos;loop2infosposst'st''endelsebegininfo.positions.(st'.idx)<-pos;st'endendelseifst'.idx=breakthenbegininfo.positions.(st'.real_idx)<-pos+1;st'endelsebegin(* Unknown *)validateinfosposst;loopinfosposstendletrecloop_no_markinfosposlastst=ifpos<lastthenletst'=st.next.(Char.code(Bytes.getinfo.i_cols(Char.codes.[pos])))inifst'.idx>=0thenloop_no_markinfos(pos+1)lastst'elseifst'.idx=breakthenst'elsebegin(* Unknown *)validateinfosposst;loop_no_markinfosposlaststendelsestletfinalinfostcat=tryList.assqcatst.finalwithNot_found->letst'=deltainfocat(-1)stinletres=(st'.Automata.State.idx,Automata.statusst')inst.final<-(cat,res)::st.final;resletfind_initial_staterecat=tryList.assqcatre.initial_stateswithNot_found->letst=find_statere(Automata.State.createcatre.initial)inre.initial_states<-(cat,st)::re.initial_states;stletget_colorre(s:string)pos=ifpos<0then-1elseletslen=String.lengthsinifpos>=slenthen-1elseifpos=slen-1&&re.lnl<>-1&&s.[pos]='\n'then(* Special case for the last newline *)re.lnlelseChar.code(Bytes.getre.cols(Char.codes.[pos]))letrechandle_last_newlineinfoposstgroups=letst'=st.next.(info.re.lnl)inifst'.idx>=0thenbeginifgroupstheninfo.positions.(st'.idx)<-pos+1;st'endelseifst'.idx=breakthenbeginifgroupstheninfo.positions.(st'.real_idx)<-pos+1;st'endelsebegin(* Unknown *)letc=info.re.lnlinletreal_c=Char.code(Bytes.getinfo.i_cols(Char.code'\n'))inletcat=categoryinfo.recinletdesc'=deltainfocatreal_cstinletst'=find_stateinfo.redesc'inst.next.(c)<-st';handle_last_newlineinfoposstgroupsendletrecscan_strinfo(s:string)initial_stategroups=letpos=info.posinletlast=info.lastinif(last=String.lengths&&info.re.lnl<>-1&&last>pos&&String.gets(last-1)='\n')thenbeginletinfo={infowithlast=last-1}inletst=scan_strinfosinitial_stategroupsinifst.idx=breakthenstelsehandle_last_newlineinfo(last-1)stgroupsendelseifgroupsthenloopinfosposinitial_stateelseloop_no_markinfosposlastinitial_stateletmatch_str~groups~partialres~pos~len=letslen=String.lengthsinletlast=iflen=-1thenslenelsepos+leninletinfo={re=re;i_cols=re.cols;pos=pos;last=last;positions=ifgroupsthenbeginletn=Automata.index_countre.tbl+1inifn<=10then[|0;0;0;0;0;0;0;0;0;0|]elseArray.maken0endelse[||]}inletinitial_cat=ifpos=0thencat_search_boundarylorcat_inexistantelsecat_search_boundarylorcategoryre(get_colorres(pos-1))inletinitial_state=find_initial_statereinitial_catinletst=scan_strinfosinitial_stategroupsinletres=ifst.idx=break||partialthenAutomata.statusst.descelseletfinal_cat=iflast=slenthencat_search_boundarylorcat_inexistantelsecat_search_boundarylorcategoryre(get_colorreslast)inlet(idx,res)=finalinfostfinal_catinifgroupstheninfo.positions.(idx)<-last+1;resinmatchreswithAutomata.Match(marks,pmarks)->Match{s;marks;pmarks;gpos=info.positions;gcount=re.group_count}|Automata.Failed->Failed|Automata.Running->Runningletmk_reinitcolscol_reprncollnlgroup_count={initial=init;initial_states=[];cols=cols;col_repr=col_repr;ncol=ncol;lnl=lnl;tbl=Automata.create_working_area();states=Automata.State.Table.create97;group_count=group_count}(**** Character sets ****)letcseqcc'=Cset.seq(Char.codec)(Char.codec')letcaddcs=Cset.add(Char.codec)slettrans_setcachecms=matchCset.one_charswith|Somei->Cset.csingle(Bytes.getcmi)|None->letv=(Cset.hash_recs,s)intryCset.CSetMap.findv!cachewithNot_found->letl=Cset.fold_rights~f:(fun(i,j)l->Cset.union(cseq(Bytes.getcmi)(Bytes.getcmj))l)~init:Cset.emptyincache:=Cset.CSetMap.addvl!cache;l(****)typeregexp=SetofCset.t|Sequenceofregexplist|Alternativeofregexplist|Repeatofregexp*int*intoption|Beg_of_line|End_of_line|Beg_of_word|End_of_word|Not_bound|Beg_of_str|End_of_str|Last_end_of_line|Start|Stop|SemofAutomata.sem*regexp|Sem_greedyofAutomata.rep_kind*regexp|Groupofregexp|No_groupofregexp|Nestofregexp|Caseofregexp|No_caseofregexp|Intersectionofregexplist|Complementofregexplist|Differenceofregexp*regexp|PmarkofAutomata.Pmark.t*regexpletrecppfmtt=letopenRe_fmtinletvarsre=sexpfmtsppreinletseqsrel=sexpfmts(listpp)relinmatchtwith|Sets->sexpfmt"Set"Cset.pps|Sequencesq->seq"Sequence"sq|Alternativealt->seq"Alternative"alt|Repeat(re,start,stop)->letpp'fmt()=fprintffmt"%a@ %d%a"pprestartoptintstopinsexpfmt"Repeat"pp'()|Beg_of_line->strfmt"Beg_of_line"|End_of_line->strfmt"End_of_line"|Beg_of_word->strfmt"Beg_of_word"|End_of_word->strfmt"End_of_word"|Not_bound->strfmt"Not_bound"|Beg_of_str->strfmt"Beg_of_str"|End_of_str->strfmt"End_of_str"|Last_end_of_line->strfmt"Last_end_of_line"|Start->strfmt"Start"|Stop->strfmt"Stop"|Sem(sem,re)->sexpfmt"Sem"(pairAutomata.pp_sempp)(sem,re)|Sem_greedy(k,re)->sexpfmt"Sem_greedy"(pairAutomata.pp_rep_kindpp)(k,re)|Groupc->var"Group"c|No_groupc->var"No_group"c|Nestc->var"Nest"c|Casec->var"Case"c|No_casec->var"No_case"c|Intersectionc->seq"Intersection"c|Complementc->seq"Complement"c|Difference(a,b)->sexpfmt"Difference"(pairpppp)(a,b)|Pmark(m,r)->sexpfmt"Pmark"(pairAutomata.Pmark.pppp)(m,r)letrecis_charset=function|Set_->true|Alternativel|Intersectionl|Complementl->List.for_allis_charsetl|Difference(r,r')->is_charsetr&&is_charsetr'|Sem(_,r)|Sem_greedy(_,r)|No_groupr|Caser|No_caser->is_charsetr|Sequence_|Repeat_|Beg_of_line|End_of_line|Beg_of_word|End_of_word|Beg_of_str|End_of_str|Not_bound|Last_end_of_line|Start|Stop|Group_|Nest_|Pmark(_,_)->false(**** Colormap ****)(*XXX Use a better algorithm allowing non-contiguous regions? *)letsplitscm=Re_cset.iters~f:(funij->Bytes.setcmi'\001';Bytes.setcm(j+1)'\001';)letcupper=Cset.union(cseq'A''Z')(Cset.union(cseq'\192''\214')(cseq'\216''\222'))letclower=Cset.offset32cupperletcalpha=List.fold_rightcadd['\170';'\181';'\186';'\223';'\255'](Cset.unionclowercupper)letcdigit=cseq'0''9'letcalnum=Cset.unioncalphacdigitletcword=cadd'_'calnumletcolorizecregexp=letlnl=reffalseinletreccolorizeregexp=matchregexpwithSets->splitsc|Sequencel->List.itercolorizel|Alternativel->List.itercolorizel|Repeat(r,_,_)->colorizer|Beg_of_line|End_of_line->split(Cset.csingle'\n')c|Beg_of_word|End_of_word|Not_bound->splitcwordc|Beg_of_str|End_of_str|Start|Stop->()|Last_end_of_line->lnl:=true|Sem(_,r)|Sem_greedy(_,r)|Groupr|No_groupr|Nestr|Pmark(_,r)->colorizer|Case_|No_case_|Intersection_|Complement_|Difference_->assertfalseincolorizeregexp;!lnlletmake_cmap()=Bytes.make257'\000'letflatten_cmapcm=letc=Bytes.create256inletcol_repr=Bytes.create256inletv=ref0inBytes.setc0'\000';Bytes.setcol_repr0'\000';fori=1to255doifBytes.getcmi<>'\000'thenincrv;Bytes.setci(Char.chr!v);Bytes.setcol_repr!v(Char.chri)done;(c,Bytes.subcol_repr0(!v+1),!v+1)(**** Compilation ****)letrecequalx1x2=matchx1,x2withSets1,Sets2->s1=s2|Sequencel1,Sequencel2->eq_listl1l2|Alternativel1,Alternativel2->eq_listl1l2|Repeat(x1',i1,j1),Repeat(x2',i2,j2)->i1=i2&&j1=j2&&equalx1'x2'|Beg_of_line,Beg_of_line|End_of_line,End_of_line|Beg_of_word,Beg_of_word|End_of_word,End_of_word|Not_bound,Not_bound|Beg_of_str,Beg_of_str|End_of_str,End_of_str|Last_end_of_line,Last_end_of_line|Start,Start|Stop,Stop->true|Sem(sem1,x1'),Sem(sem2,x2')->sem1=sem2&&equalx1'x2'|Sem_greedy(k1,x1'),Sem_greedy(k2,x2')->k1=k2&&equalx1'x2'|Group_,Group_->(* Do not merge groups! *)false|No_groupx1',No_groupx2'->equalx1'x2'|Nestx1',Nestx2'->equalx1'x2'|Casex1',Casex2'->equalx1'x2'|No_casex1',No_casex2'->equalx1'x2'|Intersectionl1,Intersectionl2->eq_listl1l2|Complementl1,Complementl2->eq_listl1l2|Difference(x1',x1''),Difference(x2',x2'')->equalx1'x2'&&equalx1''x2''|Pmark(m1,r1),Pmark(m2,r2)->Automata.Pmark.equalm1m2&&equalr1r2|_->falseandeq_listl1l2=matchl1,l2with[],[]->true|x1::r1,x2::r2->equalx1x2&&eq_listr1r2|_->falseletsequence=function|[x]->x|l->Sequencelletrecmerge_sequences=function|[]->[]|Alternativel'::r->merge_sequences(l'@r)|Sequence(x::y)::r->beginmatchmerge_sequencesrwithSequence(x'::y')::r'whenequalxx'->Sequence[x;Alternative[sequencey;sequencey']]::r'|r'->Sequence(x::y)::r'end|x::r->x::merge_sequencesrmoduleA=Automataletenforce_kindidskindkind'cr=matchkind,kind'with`First,`First->cr|`First,k->A.seqidskcr(A.epsids)|_->cr(* XXX should probably compute a category mask *)letrectranslateidskindign_groupign_casegreedyposcachec=function|Sets->(A.cstids(trans_setcachecs),kind)|Sequencel->(trans_seqidskindign_groupign_casegreedyposcachecl,kind)|Alternativel->beginmatchmerge_sequenceslwith[r']->let(cr,kind')=translateidskindign_groupign_casegreedyposcachecr'in(enforce_kindidskindkind'cr,kind)|merged_sequences->(A.altids(List.map(funr'->let(cr,kind')=translateidskindign_groupign_casegreedyposcachecr'inenforce_kindidskindkind'cr)merged_sequences),kind)end|Repeat(r',i,j)->let(cr,kind')=translateidskindign_groupign_casegreedyposcachecr'inletrem=matchjwithNone->A.repidsgreedykind'cr|Somej->letf=matchgreedywith`Greedy->funrem->A.altids[A.seqidskind'(A.renameidscr)rem;A.epsids]|`Non_greedy->funrem->A.altids[A.epsids;A.seqidskind'(A.renameidscr)rem]initer(j-i)f(A.epsids)in(iteri(funrem->A.seqidskind'(A.renameidscr)rem)rem,kind)|Beg_of_line->(A.afterids(cat_inexistantlorcat_newline),kind)|End_of_line->(A.beforeids(cat_inexistantlorcat_newline),kind)|Beg_of_word->(A.seqids`First(A.afterids(cat_inexistantlorcat_not_letter))(A.beforeids(cat_inexistantlorcat_letter)),kind)|End_of_word->(A.seqids`First(A.afterids(cat_inexistantlorcat_letter))(A.beforeids(cat_inexistantlorcat_not_letter)),kind)|Not_bound->(A.altids[A.seqids`First(A.afteridscat_letter)(A.beforeidscat_letter);A.seqids`First(A.afteridscat_letter)(A.beforeidscat_letter)],kind)|Beg_of_str->(A.afteridscat_inexistant,kind)|End_of_str->(A.beforeidscat_inexistant,kind)|Last_end_of_line->(A.beforeids(cat_inexistantlorcat_lastnewline),kind)|Start->(A.afteridscat_search_boundary,kind)|Stop->(A.beforeidscat_search_boundary,kind)|Sem(kind',r')->let(cr,kind'')=translateidskind'ign_groupign_casegreedyposcachecr'in(enforce_kindidskind'kind''cr,kind')|Sem_greedy(greedy',r')->translateidskindign_groupign_casegreedy'poscachecr'|Groupr'->ifign_groupthentranslateidskindign_groupign_casegreedyposcachecr'elseletp=!posinpos:=!pos+2;let(cr,kind')=translateidskindign_groupign_casegreedyposcachecr'in(A.seqids`First(A.markidsp)(A.seqids`Firstcr(A.markids(p+1))),kind')|No_groupr'->translateidskindtrueign_casegreedyposcachecr'|Nestr'->letb=!posinlet(cr,kind')=translateidskindign_groupign_casegreedyposcachecr'inlete=!pos-1inife<bthen(cr,kind')else(A.seqids`First(A.eraseidsbe)cr,kind')|Difference_|Complement_|Intersection_|No_case_|Case_->assertfalse|Pmark(i,r')->let(cr,kind')=translateidskindign_groupign_casegreedyposcachecr'in(A.seqids`First(A.pmarkidsi)cr,kind')andtrans_seqidskindign_groupign_casegreedyposcachec=function|[]->A.epsids|[r]->let(cr',kind')=translateidskindign_groupign_casegreedyposcachecrinenforce_kindidskindkind'cr'|r::rem->let(cr',kind')=translateidskindign_groupign_casegreedyposcachecrinletcr''=trans_seqidskindign_groupign_casegreedyposcachecreminifA.is_epscr''thencr'elseifA.is_epscr'thencr''elseA.seqidskind'cr'cr''(**** Case ****)letcase_insenss=Cset.unions(Cset.union(Cset.offset32(Cset.interscupper))(Cset.offset(-32)(Cset.intersclower)))letas_set=function|Sets->s|_->assertfalse(* XXX Should split alternatives into (1) charsets and (2) more
complex regular expressions; alternative should therefore probably
be flatten here *)letrechandle_caseign_case=function|Sets->Set(ifign_casethencase_insensselses)|Sequencel->Sequence(List.map(handle_caseign_case)l)|Alternativel->letl'=List.map(handle_caseign_case)linifis_charset(Alternativel')thenSet(List.fold_left(funsr->Cset.unions(as_setr))Cset.emptyl')elseAlternativel'|Repeat(r,i,j)->Repeat(handle_caseign_caser,i,j)|Beg_of_line|End_of_line|Beg_of_word|End_of_word|Not_bound|Beg_of_str|End_of_str|Last_end_of_line|Start|Stopasr->r|Sem(k,r)->letr'=handle_caseign_caserinifis_charsetr'thenr'elseSem(k,r')|Sem_greedy(k,r)->letr'=handle_caseign_caserinifis_charsetr'thenr'elseSem_greedy(k,r')|Groupr->Group(handle_caseign_caser)|No_groupr->letr'=handle_caseign_caserinifis_charsetr'thenr'elseNo_groupr'|Nestr->letr'=handle_caseign_caserinifis_charsetr'thenr'elseNestr'|Caser->handle_casefalser|No_caser->handle_casetruer|Intersectionl->letl'=List.map(funr->handle_caseign_caser)linSet(List.fold_left(funsr->Cset.inters(as_setr))Cset.canyl')|Complementl->letl'=List.map(funr->handle_caseign_caser)linSet(Cset.diffCset.cany(List.fold_left(funsr->Cset.unions(as_setr))Cset.emptyl'))|Difference(r,r')->Set(Cset.inter(as_set(handle_caseign_caser))(Cset.diffCset.cany(as_set(handle_caseign_caser'))))|Pmark(i,r)->Pmark(i,handle_caseign_caser)(****)letcompile_1regexp=letregexp=handle_casefalseregexpinletc=make_cmap()inletneed_lnl=colorizecregexpinlet(col,col_repr,ncol)=flatten_cmapcinletlnl=ifneed_lnlthenncolelse-1inletncol=ifneed_lnlthenncol+1elsencolinletids=A.create_ids()inletpos=ref0inlet(r,kind)=translateids`Firstfalsefalse`Greedypos(refCset.CSetMap.empty)colregexpinletr=enforce_kindids`Firstkindrin(*Format.eprintf "<%d %d>@." !ids ncol;*)mk_rercolcol_reprncollnl(!pos/2)(****)letrecanchored=function|Sequencel->List.existsanchoredl|Alternativel->List.for_allanchoredl|Repeat(r,i,_)->i>0&&anchoredr|Set_|Beg_of_line|End_of_line|Beg_of_word|End_of_word|Not_bound|End_of_str|Last_end_of_line|Stop|Intersection_|Complement_|Difference_->false|Beg_of_str|Start->true|Sem(_,r)|Sem_greedy(_,r)|Groupr|No_groupr|Nestr|Caser|No_caser|Pmark(_,r)->anchoredr(****)typet=regexpletstrs=letl=ref[]infori=String.lengths-1downto0dol:=Set(Cset.csingles.[i])::!ldone;Sequence!lletcharc=Set(Cset.csinglec)letalt=function|[r]->r|l->Alternativelletseq=function|[r]->r|l->Sequencelletempty=alt[]letepsilon=seq[]letrepnrij=ifi<0theninvalid_arg"Re.repn";beginmatchjwith|Somejwhenj<i->invalid_arg"Re.repn"|_->()end;Repeat(r,i,j)letrepr=repnr0Noneletrep1r=repnr1Noneletoptr=repnr0(Some1)letbol=Beg_of_lineleteol=End_of_lineletbow=Beg_of_wordleteow=End_of_wordletwordr=seq[bow;r;eow]letnot_boundary=Not_boundletbos=Beg_of_strleteos=End_of_strletwhole_stringr=seq[bos;r;eos]letleol=Last_end_of_lineletstart=Startletstop=Stopletlongestr=Sem(`Longest,r)letshortestr=Sem(`Shortest,r)letfirstr=Sem(`First,r)letgreedyr=Sem_greedy(`Greedy,r)letnon_greedyr=Sem_greedy(`Non_greedy,r)letgroupr=Grouprletno_groupr=No_grouprletnestr=Nestrletmarkr=leti=Automata.Pmark.gen()in(i,Pmark(i,r))letsetstr=lets=refCset.emptyinfori=0toString.lengthstr-1dos:=Cset.union(Cset.csinglestr.[i])!sdone;Set!sletrgcc'=Set(cseqcc')letinterl=letr=Intersectionlinifis_charsetrthenrelseinvalid_arg"Re.inter"letcompll=letr=Complementlinifis_charsetrthenrelseinvalid_arg"Re.compl"letdiffrr'=letr''=Difference(r,r')inifis_charsetr''thenr''elseinvalid_arg"Re.diff"letany=SetCset.canyletnotnl=Set(Cset.diffCset.cany(Cset.csingle'\n'))letlower=alt[rg'a''z';char'\181';rg'\223''\246';rg'\248''\255']letupper=alt[rg'A''Z';rg'\192''\214';rg'\216''\222']letalpha=alt[lower;upper;char'\170';char'\186']letdigit=rg'0''9'letalnum=alt[alpha;digit]letwordc=alt[alnum;char'_']letascii=rg'\000''\127'letblank=set"\t "letcntrl=alt[rg'\000''\031';rg'\127''\159']letgraph=alt[rg'\033''\126';rg'\160''\255']letprint=alt[rg'\032''\126';rg'\160''\255']letpunct=alt[rg'\033''\047';rg'\058''\064';rg'\091''\096';rg'\123''\126';rg'\160''\169';rg'\171''\180';rg'\182''\185';rg'\187''\191';char'\215';char'\247']letspace=alt[char' ';rg'\009''\013']letxdigit=alt[digit;rg'a''f';rg'A''F']letcaser=Caserletno_caser=No_caser(****)letcompiler=compile_1(ifanchoredrthengrouprelseseq[shortest(repany);groupr])letexec_internalname?(pos=0)?(len=-1)~groupsres=ifpos<0||len<-1||pos+len>String.lengthstheninvalid_argname;match_str~groups~partial:falseres~pos~lenletexec?pos?lenres=matchexec_internal"Re.exec"?pos?len~groups:truereswithMatchsubstr->substr|_->raiseNot_foundletexec_opt?pos?lenres=matchexec_internal"Re.exec_opt"?pos?len~groups:truereswithMatchsubstr->Somesubstr|_->Noneletexecp?pos?lenres=matchexec_internal~groups:false"Re.execp"?pos?lenreswithMatch_substr->true|_->falseletexec_partial?pos?lenres=matchexec_internal~groups:false"Re.exec_partial"?pos?lenreswithMatch_->`Full|Running->`Partial|Failed->`MismatchmoduleGroup=structtypet=groupsletoffsetti=if2*i+1>=Array.lengtht.marksthenraiseNot_found;letm1=t.marks.(2*i)inifm1=-1thenraiseNot_found;letp1=t.gpos.(m1)-1inletp2=t.gpos.(t.marks.(2*i+1))-1in(p1,p2)letgetti=let(p1,p2)=offsettiinString.subt.sp1(p2-p1)letstartsubsi=fst(offsetsubsi)letstopsubsi=snd(offsetsubsi)lettestti=if2*i>=Array.lengtht.marksthenfalseelseletidx=t.marks.(2*i)inidx<>-1letdummy_offset=(-1,-1)letall_offsett=letres=Array.maket.gcountdummy_offsetinfori=0toArray.lengtht.marks/2-1doletm1=t.marks.(2*i)inifm1<>-1thenbeginletp1=t.gpos.(m1)inletp2=t.gpos.(t.marks.(2*i+1))inres.(i)<-(p1-1,p2-1)enddone;resletdummy_string=""letallt=letres=Array.maket.gcountdummy_stringinfori=0toArray.lengtht.marks/2-1doletm1=t.marks.(2*i)inifm1<>-1thenbeginletp1=t.gpos.(m1)inletp2=t.gpos.(t.marks.(2*i+1))inres.(i)<-String.subt.s(p1-1)(p2-p1)enddone;resletppfmtt=letmatches=letoffsets=all_offsettinletstrs=alltinArray.to_list(Array.init(Array.lengthstrs)(funi->strs.(i),offsets.(i)))inletopenRe_fmtinletpp_matchfmt(str,(start,stop))=fprintffmt"@[(%s (%d %d))@]"strstartstopinsexpfmt"Group"(listpp_match)matchesletnb_groupst=t.gcountendmoduleMark=structtypet=Automata.Pmark.tlettest{pmarks;_}p=Automata.PmarkSet.memppmarksletalls=s.pmarksmoduleSet=MarkSetletequal=Automata.Pmark.equalletcompare=Automata.Pmark.compareendtype'agen=unit->'aoptionletall_gen?(pos=0)?lenres=ifpos<0theninvalid_arg"Re.all";(* index of the first position we do not consider.
!pos < limit is an invariant *)letlimit=matchlenwith|None->String.lengths|Somel->ifl<0||pos+l>String.lengthstheninvalid_arg"Re.all";pos+lin(* iterate on matches. When a match is found, search for the next
one just after its end *)letpos=refposinfun()->if!pos>=limitthenNone(* no more matches *)elsematchmatch_str~groups:true~partial:falseres~pos:!pos~len:(limit-!pos)with|Matchsubstr->letp1,p2=Group.offsetsubstr0inpos:=ifp1=p2thenp2+1elsep2;Somesubstr|Running|Failed->Noneletall?pos?lenres=letl=ref[]inletg=all_gen?pos?lenresinletreciter()=matchg()with|None->List.rev!l|Somesub->l:=sub::!l;iter()initer()letmatches_gen?pos?lenres=letg=all_gen?pos?lenresinfun()->matchg()with|None->None|Somesub->Some(Group.getsub0)letmatches?pos?lenres=letl=ref[]inletg=all_gen?pos?lenresinletreciter()=matchg()with|None->List.rev!l|Somesub->l:=Group.getsub0::!l;iter()initer()typesplit_token=[`Textofstring|`Delimofgroups]letsplit_full_gen?(pos=0)?lenres=ifpos<0theninvalid_arg"Re.split";letlimit=matchlenwith|None->String.lengths|Somel->ifl<0||pos+l>String.lengthstheninvalid_arg"Re.split";pos+lin(* i: start of delimited string
pos: first position after last match of [re]
limit: first index we ignore (!pos < limit is an invariant) *)letpos0=posinletstate=ref`Idleinleti=refposandpos=refposinletnext()=match!statewith|`Idlewhen!pos>=limit->if!i<limitthen(letsub=String.subs!i(limit-!i)inincri;Some(`Textsub))elseNone|`Idle->beginmatchmatch_str~groups:true~partial:falseres~pos:!pos~len:(limit-!pos)with|Matchsubstr->letp1,p2=Group.offsetsubstr0inpos:=ifp1=p2thenp2+1elsep2;letold_i=!iini:=p2;ifp1>pos0then((* string does not start by a delimiter *)lettext=String.subsold_i(p1-old_i)instate:=`Yield(`Delimsubstr);Some(`Texttext))elseSome(`Delimsubstr)|Running->None|Failed->if!i<limitthen(lettext=String.subs!i(limit-!i)ini:=limit;Some(`Texttext)(* yield last string *))elseNoneend|`Yieldx->state:=`Idle;Somexinnextletsplit_full?pos?lenres=letl=ref[]inletg=split_full_gen?pos?lenresinletreciter()=matchg()with|None->List.rev!l|Somes->l:=s::!l;iter()initer()letsplit_gen?pos?lenres=letg=split_full_gen?pos?lenresinletrecnext()=matchg()with|None->None|Some(`Delim_)->next()|Some(`Texts)->Somesinnextletsplit?pos?lenres=letl=ref[]inletg=split_full_gen?pos?lenresinletreciter()=matchg()with|None->List.rev!l|Some(`Delim_)->iter()|Some(`Texts)->l:=s::!l;iter()initer()letreplace?(pos=0)?len?(all=true)re~fs=ifpos<0theninvalid_arg"Re.replace";letlimit=matchlenwith|None->String.lengths|Somel->ifl<0||pos+l>String.lengthstheninvalid_arg"Re.replace";pos+lin(* buffer into which we write the result *)letbuf=Buffer.create(String.lengths)in(* iterate on matched substrings. *)letreciterpos=ifpos<limitthenmatchmatch_str~groups:true~partial:falseres~pos~len:(limit-pos)with|Matchsubstr->letp1,p2=Group.offsetsubstr0in(* add string between previous match and current match *)Buffer.add_substringbufspos(p1-pos);(* what should we replace the matched group with? *)letreplacing=fsubstrinBuffer.add_stringbufreplacing;ifallthen(* if we matched a non-char e.g. ^ we must manually advance by 1 *)iter(ifp1=p2then((* a non char could be past the end of string. e.g. $ *)ifp2<limitthenBuffer.add_charbufs.[p2];p2+1)elsep2)elseBuffer.add_substringbufsp2(limit-p2)|Running->()|Failed->Buffer.add_substringbufspos(limit-pos)initerpos;Buffer.contentsbufletreplace_string?pos?len?allre~bys=replace?pos?len?allres~f:(fun_->by)letwitnesst=letrecwitness=function|Setc->String.make1(Char.chr(Cset.pickc))|Sequencexs->String.concat""(List.mapwitnessxs)|Alternative(x::_)->witnessx|Alternative[]->assertfalse|Repeat(r,from,_to)->letw=witnessrinletb=Buffer.create(String.lengthw*from)infor_i=1tofromdoBuffer.add_stringbwdone;Buffer.contentsb|No_caser->witnessr|Intersection_|Complement_|Difference(_,_)->assertfalse|Groupr|No_groupr|Nestr|Sem(_,r)|Pmark(_,r)|Caser|Sem_greedy(_,r)->witnessr|Beg_of_line|End_of_line|Beg_of_word|End_of_word|Not_bound|Beg_of_str|Last_end_of_line|Start|Stop|End_of_str->""inwitness(handle_casefalset)(** {2 Deprecated functions} *)typesubstrings=groupsletget=Group.getletget_ofs=Group.offsetletget_all=Group.allletget_all_ofs=Group.all_offsetlettest=Group.testtypemarkid=Mark.tletmarked=Mark.testletmark_set=Mark.all(**********************************)(*
Information about the previous character:
- does not exists
- is a letter
- is not a letter
- is a newline
- is last newline
Beginning of word:
- previous is not a letter or does not exist
- current is a letter or does not exist
End of word:
- previous is a letter or does not exist
- current is not a letter or does not exist
Beginning of line:
- previous is a newline or does not exist
Beginning of buffer:
- previous does not exist
End of buffer
- current does not exist
End of line
- current is a newline or does not exist
*)(*
Rep: e = T,e | ()
- semantics of the comma (shortest/longest/first)
- semantics of the union (greedy/non-greedy)
Bounded repetition
a{0,3} = (a,(a,a?)?)?
*)