123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558openImportletreciternfv=ifInt.equaln0thenvelseiter(n-1)f(fv)letunknown=-2letbreak=-3typematch_info=|MatchofGroup.t|Failed|Runningof{no_match_starts_before:int}typestate_info={idx:int;(* Index of the current position in the position table.
Not yet computed transitions point to a dummy state where
[idx] is set to [unknown];
If [idx] is set to [break] for states that either always
succeed or always fail. *)real_idx:int;(* The real index, in case [idx] is set to [break] *)mutablefinal:(Category.t*(Automata.idx*Automata.status))list;(* Mapping from the category of the next character to
- the index where the next position should be saved
- possibly, the list of marks (and the corresponding indices)
corresponding to the best match *)desc:Automata.State.t(* Description of this state of the automata *)}(* A state [t] is a pair composed of some information about the
state [state_info] and a transition table [t array], indexed by
color. For performance reason, to avoid an indirection, we manually
unbox the transition table: we allocate a single array, with the
state information at index 0, followed by the transitions. *)moduleState:sigtypetvalmake:ncol:int->state_info->tvalget_info:t->state_infovalfollow_transition:t->color:Cset.c->tvalset_transition:t->color:Cset.c->t->unitend=structtypet=Tableoftarray[@@unboxed]letget_info(Tablest):state_info=Obj.magic(Array.unsafe_getst0)[@@inlinealways];;letset_info(Tablest)(info:state_info)=st.(0)<-Obj.magicinfoletfollow_transition(Tablest)~color=Array.unsafe_getst(1+Cset.to_intcolor)[@@inlinealways];;letset_transition(Tablest)~colorst'=st.(1+Cset.to_intcolor)<-st'letdummy(info:state_info)=Table[|Obj.magicinfo|]letunknown_state=dummy{idx=unknown;real_idx=0;final=[];desc=Automata.State.dummy};;letmake~ncolstate=letst=Table(Array.make(ncol+1)unknown_state)inset_infoststate;st;;end(* Automata (compiled regular expression) *)typere={initial:Automata.expr;(* The whole regular expression *)mutableinitial_states:(Category.t*State.t)list;(* Initial states, indexed by initial category *)colors:Color_map.Table.t;(* Color table *)color_repr:Color_map.Repr.t;(* Table from colors to one character of this color *)ncolor:int;(* Number of colors. *)lnl:int;(* Color of the last newline. -1 if unnecessary *)tbl:Automata.Working_area.t;(* Temporary table used to compute the first available index
when computing a new state *)states:State.tAutomata.State.Table.t;(* States of the deterministic automata *)group_names:(string*int)list;(* Named groups in the regular expression *)group_count:int(* Number of groups in the regular expression *)}letpp_rechre=Automata.ppchre.initialletprint_re=pp_reletgroup_countre=re.group_countletgroup_namesre=re.group_names(* Information used during matching *)typeinfo={re:re;(* The automata *)mutablepositions:intarray;(* Array of mark positions
The mark are off by one for performance reasons *)pos:int;(* Position where the match is started *)last:int(* Position where the match should stop *)}(****)letcategoryre~color=ifCset.to_intcolor=-1thenCategory.inexistant(* Special category for the last newline *)elseifCset.to_intcolor=re.lnlthenCategory.(lastnewline++newline++not_letter)elseCategory.from_char(Color_map.Repr.reprre.color_reprcolor);;(****)letmk_statencoldesc=letbreak_state=matchAutomata.State.statusdescwith|Automata.Running->false|Automata.Failed|Automata.Match_->trueinletst=letreal_idx=Automata.State.idxdescin{idx=(ifbreak_statethenbreakelsereal_idx);real_idx;final=[];desc}inState.make~ncol:(ifbreak_statethen0elsencol)st;;letfind_stateredesc=tryAutomata.State.Table.findre.statesdescwith|Not_found->letst=mk_statere.ncolordescinAutomata.State.Table.addre.statesdescst;st;;(**** Match with marks ****)letdeltainfocat~colorst=letdesc=Automata.deltainfo.re.tblcatcolorst.descinletlen=Array.lengthinfo.positionsinifAutomata.State.idxdesc=len&&len>0then(letpos=info.positionsininfo.positions<-Array.make(2*len)0;Array.blitpos0info.positions0len);desc;;letvalidateinfo(s:string)~posst=letcolor=Color_map.Table.getinfo.re.colorss.[pos]inletst'=letdesc'=letcat=categoryinfo.re~colorindeltainfocat~color(State.get_infost)infind_stateinfo.redesc'inState.set_transitionst~colorst';;letnextcolorsstspos=State.follow_transitionst~color:(Color_map.Table.getcolors(String.unsafe_getspos));;letrecloopinfo~colors~positionss~pos~lastst0st=ifpos<lastthen(letst'=nextcolorsstsposinletstate_info=State.get_infost'inletidx=state_info.idxinifidx>=0then(Array.unsafe_setpositionsidxpos;loopinfo~colors~positionss~pos:(pos+1)~lastst'st')elseifidx=breakthen(Array.unsafe_setpositionsstate_info.real_idxpos;st')else((* Unknown *)validateinfos~posst0;loopinfo~colors~positions:info.positionss~pos~lastst0st0))elsest;;letrecloop_no_markinfo~colorss~pos~lastst0st=ifpos<lastthen(letst'=nextcolorsstsposinletstate_info=State.get_infost'inletidx=state_info.idxinifidx>=0thenloop_no_markinfo~colorss~pos:(pos+1)~lastst'st'elseifidx=breakthenst'else((* Unknown *)validateinfos~posst0;loop_no_markinfo~colorss~pos~lastst0st0))elsest;;letfinalinfostcat=tryList.assqcatst.finalwith|Not_found->letst'=deltainfocat~color:(Cset.of_int(-1))stinletres=Automata.State.idxst',Automata.State.statusst'inst.final<-(cat,res)::st.final;res;;letfind_initial_staterecat=tryList.assqcatre.initial_stateswith|Not_found->letst=find_statere(Automata.State.createcatre.initial)inre.initial_states<-(cat,st)::re.initial_states;st;;letget_colorre(s:string)pos=ifpos<0thenCset.of_int@@-1else(letslen=String.lengthsinifpos>=slenthenCset.of_int(-1)elseifpos=slen-1&&re.lnl<>-1&&Char.equals.[pos]'\n'then(* Special case for the last newline *)Cset.of_intre.lnlelseColor_map.Table.getre.colorss.[pos]);;letrechandle_last_newlineinfo~posst~groups=letst'=State.follow_transitionst~color:(Cset.of_intinfo.re.lnl)inletinfo'=State.get_infost'inifinfo'.idx>=0then(ifgroupstheninfo.positions.(info'.idx)<-pos;st')elseifinfo'.idx=breakthen(ifgroupstheninfo.positions.(info'.real_idx)<-pos;st')else((* Unknown *)letcolor=Cset.of_intinfo.re.lnlinletst'=letdesc'=letcat=categoryinfo.re~colorinletreal_c=Color_map.Table.getinfo.re.colors'\n'indeltainfocat~color:real_c(State.get_infost)infind_stateinfo.redesc'inState.set_transitionst~colorst';handle_last_newlineinfo~posst~groups);;letrecscan_strinfo(s:string)initial_state~groups=letpos=info.posinletlast=info.lastiniflast=String.lengths&&info.re.lnl<>-1&&last>pos&&Char.equal(String.gets(last-1))'\n'then(letinfo={infowithlast=last-1}inletst=scan_strinfosinitial_state~groupsinif(State.get_infost).idx=breakthenstelsehandle_last_newlineinfo~pos:(last-1)st~groups)elseifgroupsthenloopinfo~colors:info.re.colors~positions:info.positionss~pos~lastinitial_stateinitial_stateelseloop_no_markinfo~colors:info.re.colorss~pos~lastinitial_stateinitial_state;;(* This function adds a final boundary check on the input.
This is useful to indicate that the output failed because
of insufficient input, or to verify that the output actually
matches for regex that have boundary conditions with respect
to the input string.
*)letfinal_boundary_check~last~slenres~info~st~groups=letidx,res=letfinal_cat=Category.(search_boundary++iflast=slentheninexistantelsecategoryre~color:(get_colorreslast))infinalinfo(State.get_infost)final_catin(matchgroups,reswith|true,Match_->info.positions.(idx)<-last|_->());res;;letmatch_str~groups~partialres~pos~len=letslen=String.lengthsinletlast=iflen=-1thenslenelsepos+leninletinfo={re;pos;last;positions=(ifgroupsthen(letn=Automata.Working_area.index_countre.tbl+1inifn<=10then[|0;0;0;0;0;0;0;0;0;0|]elseArray.maken0)else[||])}inletst=letinitial_state=letinitial_cat=Category.(search_boundary++ifpos=0theninexistantelsecategoryre~color:(get_colorres(pos-1)))infind_initial_statereinitial_catinscan_strinfosinitial_state~groupsinmatchletstate_info=State.get_infostinifstate_info.idx=break||(partial&¬groups)thenAutomata.State.statusstate_info.descelseifpartial&&groupsthen(matchAutomata.State.statusstate_info.descwith|(Match_|Failed)asstatus->status|Running->(* This could be because it's still not fully matched, or it
could be that because we need to run special end of input
checks. *)(matchfinal_boundary_check~last~slenres~info~st~groupswith|Match_asstatus->status|Failed|Running->(* A failure here just means that we need more data, i.e.
it's a partial match. *)Running))elsefinal_boundary_check~last~slenres~info~st~groupswith|Match(marks,pmarks)->Match{s;marks;pmarks;gpos=info.positions;gcount=re.group_count}|Failed->Failed|Running->letno_match_starts_before=ifgroupstheninfo.positions.(0)else0inRunning{no_match_starts_before};;letmk_re~initial~colors~color_repr~ncolor~lnl~group_names~group_count={initial;initial_states=[];colors;color_repr;ncolor;lnl;tbl=Automata.Working_area.create();states=Automata.State.Table.create97;group_names;group_count};;(**** Compilation ****)moduleA=Automataletenforce_kindidskindkind'cr=matchkind,kind'with|`First,`First->cr|`First,k->A.seqidskcr(A.epsids)|_->cr;;typecontext={ids:A.Ids.t;kind:A.Sem.t;ign_group:bool;greedy:A.Rep_kind.t;pos:A.Mark.tref;names:(string*int)listref;cache:Cset.tCset.CSetMap.tref;colors:Color_map.Table.t}lettrans_setcache(cm:Color_map.Table.t)s=matchCset.one_charswith|Somei->Cset.csingle(Color_map.Table.get_charcmi)|None->letv=Cset.hash_recs,sin(tryCset.CSetMap.findv!cachewith|Not_found->letl=Color_map.Table.translate_colorscmsincache:=Cset.CSetMap.addvl!cache;l);;letmake_repeateridscrkindgreedy=matchgreedywith|`Greedy->funrem->A.altids[A.seqidskind(A.renameidscr)rem;A.epsids]|`Non_greedy->funrem->A.altids[A.epsids;A.seqidskind(A.renameidscr)rem];;(* XXX should probably compute a category mask *)letrectranslate({ids;kind;ign_group;greedy;pos;names;cache;colors}asctx)(ast:Ast.no_case)=matchastwith|Sets->A.cstids(trans_setcachecolorss),kind|Sequencel->trans_seqctxl,kind|Ast(Alternativel)->(matchAst.merge_sequenceslwith|[r']->letcr,kind'=translatectxr'inenforce_kindidskindkind'cr,kind|merged_sequences->(A.altids(List.mapmerged_sequences~f:(funr'->letcr,kind'=translatectxr'inenforce_kindidskindkind'cr)),kind))|Repeat(r',i,j)->letcr,kind'=translatectxr'inletrem=matchjwith|None->A.repidsgreedykind'cr|Somej->letf=make_repeateridscrkind'greedyiniter(j-i)f(A.epsids)initeri(funrem->A.seqidskind'(A.renameidscr)rem)rem,kind|Beg_of_line->A.afteridsCategory.(inexistant++newline),kind|End_of_line->A.beforeidsCategory.(inexistant++newline),kind|Beg_of_word->(A.seqids`First(A.afteridsCategory.(inexistant++not_letter))(A.beforeidsCategory.letter),kind)|End_of_word->(A.seqids`First(A.afteridsCategory.letter)(A.beforeidsCategory.(inexistant++not_letter)),kind)|Not_bound->(A.altids[A.seqids`First(A.afteridsCategory.letter)(A.beforeidsCategory.letter);(letcat=Category.(inexistant++not_letter)inA.seqids`First(A.afteridscat)(A.beforeidscat))],kind)|Beg_of_str->A.afteridsCategory.inexistant,kind|End_of_str->A.beforeidsCategory.inexistant,kind|Last_end_of_line->A.beforeidsCategory.(inexistant++lastnewline),kind|Start->A.afteridsCategory.search_boundary,kind|Stop->A.beforeidsCategory.search_boundary,kind|Sem(kind',r')->letcr,kind''=translate{ctxwithkind=kind'}r'inenforce_kindidskind'kind''cr,kind'|Sem_greedy(greedy',r')->translate{ctxwithgreedy=greedy'}r'|Group(n,r')->ifign_groupthentranslatectxr'else(letp=!posinlet()=matchnwith|Somename->names:=(name,A.Mark.group_countp)::!names|None->()inpos:=A.Mark.next2!pos;letcr,kind'=translatectxr'in(A.seqids`First(A.markidsp)(A.seqids`Firstcr(A.markids(A.Mark.nextp))),kind'))|No_groupr'->translate{ctxwithign_group=true}r'|Nestr'->letb=!posinletcr,kind'=translatectxr'inlete=A.Mark.prev!posinife<bthencr,kind'elseA.seqids`First(A.eraseidsbe)cr,kind'|Pmark(i,r')->letcr,kind'=translatectxr'inA.seqids`First(A.pmarkidsi)cr,kind'andtrans_seq({ids;kind;_}asctx)=function|[]->A.epsids|[r]->letcr',kind'=translatectxrinenforce_kindidskindkind'cr'|r::rem->letcr',kind'=translatectxrinletcr''=trans_seqctxreminifA.is_epscr''thencr'elseifA.is_epscr'thencr''elseA.seqidskind'cr'cr'';;letcompile_1regexp=letregexp=Ast.handle_casefalseregexpinletcolor_map=Color_map.make()inletneed_lnl=Ast.colorizecolor_mapregexpinletcolors,color_repr=Color_map.flattencolor_mapinletncolor=Color_map.Repr.lengthcolor_reprinletlnl=ifneed_lnlthenncolorelse-1inletncolor=ifneed_lnlthenncolor+1elsencolorinletctx={ids=A.Ids.create();kind=`First;ign_group=false;greedy=`Greedy;pos=refA.Mark.start;names=ref[];cache=refCset.CSetMap.empty;colors}inletr,kind=translatectxregexpinletr=enforce_kindctx.ids`Firstkindrin(*Format.eprintf "<%d %d>@." !ids ncol;*)mk_re~initial:r~colors~color_repr~ncolor~lnl~group_names:(List.rev!(ctx.names))~group_count:(A.Mark.group_count!(ctx.pos));;letcompiler=letopenAst.Exportincompile_1(ifAst.anchoredrthengrouprelseseq[shortest(rep(Ast.csetCset.cany));groupr]);;