123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615openCoreopenInt.Replace_polymorphic_compareincludeParser_intfmoduleBody=structmoduleT=struct(* Requirements on [regex_string]:
- it must have a valid Re2 syntax;
- it must not change meaning if concatenated with another allowed regex_string:
in particular, 'a|b' is not allowed (use e.g. '(?:a|b)' instead).
*)type'at={regex_string:Rope.t;num_submatches:int;to_result:int->stringoptionarray->'a}letreturnx={regex_string=Rope.of_string"";num_submatches=0;to_result=(fun__->x)};;letmap=`Custom(funt~f->{twithto_result=(funshiftmatches->f(t.to_resultshiftmatches))});;letapplytftx=letto_resultshiftmatches=letf=tf.to_resultshiftmatchesinletx=tx.to_result(shift+tf.num_submatches)matchesinfxin{regex_string=Rope.(tf.regex_string^tx.regex_string);num_submatches=tf.num_submatches+tx.num_submatches;to_result};;endincludeTincludeApplicative.Make(T)letto_regex_stringt=Rope.to_stringt.regex_stringletsexp_of_t_sexp_of_at=Sexp.Atom(to_regex_stringt)letto_re2?(case_sensitive=true)t=letoptions={Options.defaultwithcase_sensitive;encoding=Latin1;dot_nl=true}inlets=to_regex_stringtin(* We created [t.regex_string] ourselves, so the syntax ought to be good. *)matchRegex.create~optionsswith|Okrex->rex|Errore->failwiths~here:[%here]"Re2.Parser.to_re2 BUG: failed to compile"(s,e)[%sexp_of:string*Error.t];;letcompile?case_sensitivet=letrex=to_re2?case_sensitivetinletto_result=t.to_resultinStaged.stage(funs->matchRegex.get_matches_exn~max:1rexswith|[]->None|m::_->letm=Regex.without_trailing_noneminSome(to_result1(Regex.Match.get_allm)));;letrun?case_sensitivet=Staged.unstage(compile?case_sensitivet)letignore_mt={regex_string=t.regex_string;num_submatches=t.num_submatches;to_result=(fun__->())};;letmatches?case_sensitivet=letr=to_re2?case_sensitive(ignore_mt)infuninput->Regex.matchesrinput;;moduleFor_test=structletshould_match_with_case~case_sensitivesexp_of_rrexinpresult=[%test_pred:rt*string]~message:"should_match"(fun(rex,inp)->matches~case_sensitiverexinp)(rex,inp);[%test_result:Sexp.toption]~expect:(Some(sexp_of_rresult))(Option.map~f:sexp_of_r(run~case_sensitiverexinp));;letshould_not_match_with_case~case_sensitiverexinp=[%test_pred:_t*string]~message:"should_not_match"(fun(rex,inp)->not(matches~case_sensitiverexinp))(rex,inp);[%test_pred:stringt*string](fun(rex,inp)->Option.is_none(run~case_sensitiverexinp))(rex,inp);;letmatch_only_if_case_insensitivesexp_of_rrexinpresult=should_match_with_case~case_sensitive:falsesexp_of_rrexinpresult;should_not_match_with_case~case_sensitive:truerexinp;;letshould_matchsexp_of_rrexinpresult=should_match_with_case~case_sensitive:truesexp_of_rrexinpresult;should_match_with_case~case_sensitive:falsesexp_of_rrexinpresult;;letshould_match_string=should_matchString.sexp_of_tletshould_match_unitrexinp=should_matchUnit.sexp_of_trexinp()letshould_not_matchrexinp=should_not_match_with_case~case_sensitive:truerexinp;should_not_match_with_case~case_sensitive:falserexinp;;endopenFor_testletfail={regex_string=Rope.of_string"$x";num_submatches=0;to_result=(fun__->failwith"Re2.Parser.fail BUG: something matched regex '$x'")};;let%test_unit_=should_not_matchfail""let%test_unit_=should_not_matchfail"$x"let%test_unit_=should_not_matchfail"foo\nxyz"letof_captureless_stringregex_string={regex_string=Rope.of_stringregex_string;num_submatches=0;to_result=(fun__->())};;letstrings=of_captureless_string(Regex.escapes)let%test_unit_=should_match_unit(string"blah")"bloblahba"let%test_unit_=should_not_match(string".")"x"let%test_unit_=letnasty_string="^(he+l*l.o)[abc]{{\\\\$"inshould_match_unit(stringnasty_string)("before"^nasty_string^"after");;let%test_unit_=should_match_unit(string"blah\000blee")"blah\000blee"let%test_unit_=should_not_match(string"blah\000nope")"blah\000blee"letand_capturet={regex_string=Rope.(of_string"("^t.regex_string^of_string")");num_submatches=t.num_submatches+1;to_result=(funshiftmatches->(t.to_result(shift+1)matches,Option.value_exn~message:"Re2.Parser.capture bug: failed to capture"matches.(shift)))};;letcapturet=map(and_capturet)~f:(fun((),s)->s)let%test_unit_=match_only_if_case_insensitive[%sexp_of:string](capture(string"bLaH"))"baBLAHba""BLAH";;let%test_unit_=should_match[%sexp_of:string*string](both(capture(string"a")<*ignore_m(capture(string"b")))(capture(string"c")))"abc"("a","c");;letor_=function|[]->fail|[x]->x|ts->(* We do a dummy capture for each branch so that we can tell which actually matched,
for the purpose of calling the correct [to_result] callback (if any).
This extra match is the cause of all the +1s below. *){regex_string=Rope.(of_string"(?:"^List.reduce_exn~f:(funxy->x^of_string"|"^y)(List.mapts~f:(funt->Rope.(of_string"()"^t.regex_string)))^of_string")");num_submatches=List.sum(moduleInt)ts~f:(funt->t.num_submatches+1);to_result=(letrecgoimatches=function|[]->failwith"Re2.Parser.or_.to_result bug: called on non-match"|t::ts->ifOption.is_somematches.(i)thent.to_result(i+1)matcheselsego(i+1+t.num_submatches)matchestsinfunshiftmatches->goshiftmatchests)};;let%test_unit_=should_match_string(or_[capture(string"a");capture(string"b")])"a""a";;let%test_unit_=should_match_string(capture(or_[string"a";string"b"]))"a""a"let%test_unit_=should_not_match(or_[string"a";string"b"])"c"let%test_unit_=should_match_string(or_[string"b";string"c"]*>capture(string"a"))"ca""a";;let%test_unit_=should_match_string(capture(string"a")<*or_[string"b";string"c"])"ac""a";;let%test_unit_=should_match_string(ignore_m(or_[string"a";string"b"])*>capture(string"c"))"ac""c";;(* This is not subsumed by the [with_quantity] stuff because we can capture here, but we
can't there. *)letoptional?(greedy=true)t=letq=Rope.of_string(ifgreedythen"?"else"??")in{regex_string=Rope.(of_string"("^t.regex_string^of_string")"^q);num_submatches=t.num_submatches+1;to_result=(funshiftmatches->Option.mapmatches.(shift)~f:(fun_->t.to_result(shift+1)matches))};;let%test_unit_=should_match[%sexp_of:unitoption](optional(string"x"))"x"(Some());;let%test_unit_=should_match[%sexp_of:unitoption](optional~greedy:false(string"x"))"x"None;;letwith_quantityt~greedy~quantity=letq=Rope.of_string(ifgreedythen""else"?")in{regex_string=Rope.(of_string"(?:"^t.regex_string^of_string")"^of_stringquantity^q);num_submatches=t.num_submatches;to_result=(fun__->())};;letrepeat?(greedy=true)?(min=0)?(max=None)t=letvalidate_if_somevalidatex=Option.validate~none:Validate.pass_unit~some:validatexinValidate.of_list[Validate.name"min"(Int.validate_non_negativemin);Validate.name"max"(validate_if_some(Int.validate_lbound~min:(Inclmin))max);Validate.name"re2 implementation restrictions"(Validate.of_list[Int.validate_ubound~max:(Incl1000)min;validate_if_some(Int.validate_ubound~max:(Incl1000))max])]|>Validate.maybe_raise;matchmin,maxwith(* special cases *)|0,None->with_quantityt~greedy~quantity:"*"|1,None->with_quantityt~greedy~quantity:"+"|0,Some1->with_quantityt~greedy~quantity:"?"(* silly cases *)|0,Some0->of_captureless_string""|1,Some1->ignore_mt(* actual cases *)|min,None->with_quantityt~greedy~quantity:(sprintf"{%d,}"min)|min,Somemax->(* actually, [~greedy:false] is fine here as well -- r{n}? is permitted *)ifmin=maxthenwith_quantityt~greedy:true~quantity:(sprintf"{%d}"min)elsewith_quantityt~greedy~quantity:(sprintf"{%d,%d}"minmax);;let%test_module"repeat"=(modulestructlet%test_unit_=List.iter~f:([%test_pred:intoption*intoption*string*stringoption](fun(min,max,inp,result)->leta's=capture(repeat?min~max(string"a"))in0=[%compare:stringoption]result(run(string"c"*>a's<*string"b")inp)))[None,None,"caaab",Some"aaa";None,None,"cb",Some"";Some0,None,"cb",Some"";Some1,None,"cb",None;Some1,None,"cab",Some"a";Some1,Some2,"caaab",None;Some2,Some2,"caaab",None;Some3,Some3,"caaab",Some"aaa";Some4,Some4,"caaab",None;Some2,None,"caaab",Some"aaa";Some0,Some0,"cb",Some"";Some1,Some1,"cab",Some"a";None,Some0,"cb",Some""];;let%test_=Exn.does_raise(fun()->repeat~min:3~max:(Some2)fail)let%test_=Exn.does_raise(fun()->repeat~min:(-1)fail)let%test_=Exn.does_raise(fun()->repeat~max:(Some(-1))fail)let%test_=Exn.does_raise(fun()->repeat~min:1001fail)let%test_unit_=should_match_string(capture(repeat(or_[string"a";string"b"])*>string"a"))"baba""baba";;let%test_unit_=should_match_string(capture(repeat~greedy:false(or_[string"a";string"b"])*>string"a"))"baba""ba";;end);;lettimestn=repeat~min:n~max:(Somen)tlet%test_unit_=should_match_string(capture(times(or_[string"a";string"b"])3))"cabbage""abb";;let%test_unit_=should_match_unit(times(string"hello")0)""let%test_unit_=should_match_string(repeat(map~f:(fun_->())(capture(string"x")))*>capture(string"y"))"xxy""y";;letstart_of_input=of_captureless_string"^"let%test_unit_=should_match_unit(start_of_input*>string"blah")"blahblee"let%test_unit_=should_not_match(start_of_input*>string"blah")"bloblahba"letend_of_input=of_captureless_string"$"let%test_unit_=should_match_unit(string"blee"*>end_of_input)"blahblee"let%test_unit_=should_not_match(string"blah"*>end_of_input)"bloblahba"let%test_unit_=should_match[%sexp_of:(unitoption*unitoption)*string](and_capture(both(optional(string"a")<*start_of_input<*string"b")(end_of_input*>optional(string"c"))))"b"((None,None),"b");;let%test_unit_=should_match_string(or_[start_of_input*>capture(string"a");capture(string"b")])"ba""b";;letof_re2r=letregex_string=Regex.patternrin(* [Regex.num_submatches] includes 1 for the whole match, which we omit *)letnum_submatches=Regex.num_submatchesr-1in{regex_string=Rope.(of_string"(?:"^of_stringregex_string^of_string")");num_submatches;to_result=(funshiftmatches->Array.submatches~pos:shift~len:num_submatches)};;let%test_module_=(modulestructletmks=of_re2(Regex.create_exns)let%test_unit_=letr=mk"a(b)(?:c([de])|(?P<foo>f)g)"inshould_match[%sexp_of:stringoptionarray]r"abcd"[|Some"b";Some"d";None|];should_match[%sexp_of:stringoptionarraylist](all[r;r;r])"abcdabfgabce"[[|Some"b";Some"d";None|];[|Some"b";None;Some"f"|];[|Some"b";Some"e";None|]];;let%test_unit"messing with options"=should_match_unit(ignore_m(mk"abc(?i)def"))"abcDEF";match_only_if_case_insensitivesexp_of_string(capture(string"abc"*>ignore_m(mk"(?i)")*>string"def"))"abcDEF""abcDEF";should_not_match(mk"(?-i)abcdef")"abcDEF";;end);;let%test_unit_=letr=of_re2(Regex.create_exn"a|b")inletrs=[start_of_input*>ignore_m(r<*string"x")<*end_of_input;start_of_input*>ignore_m(capture(ignore_m(r<*string"x")))<*end_of_input]inList.iterrs~f:(funr->should_match_unitr"ax";should_match_unitr"bx";should_not_matchr"axq";should_not_matchr"qbx");;moduleChar=structletcapture_chart=map(capturet)~f:(funs->s.[0])letcr=capture_char(of_captureless_stringr)letupper=c"[[:upper:]]"letlower=c"[[:lower:]]"letalpha=c"[[:alpha:]]"letdigit=c"[0-9]"letalnum=c"[[:alnum:]]"letspace=c"[[:space:]]"letany=c"."let%test_module_=(modulestructletall_chars=List.init(Char.to_intChar.max_value+1)~f:Char.of_int_exnletmatches_predregexpred=List.iterall_chars~f:(func->ifpredcthenshould_match_with_case~case_sensitive:truesexp_of_charregex(String.of_charc)celseshould_not_match_with_case~case_sensitive:trueregex(String.of_charc));;let%test_unit_=matches_predupperChar.is_uppercaselet%test_unit_=matches_predlowerChar.is_lowercaselet%test_unit_=matches_predalphaChar.is_alphalet%test_unit_=matches_preddigitChar.is_digitlet%test_unit_=matches_predalnumChar.is_alphanumlet%test_unit_=matches_predspaceChar.is_whitespacelet%test_unit_=matches_predany(Fn.consttrue)end);;letchar_list_to_regex_stringchars=Regex.escape(String.of_char_listchars)letone_of=function|[]->fail|[x]->capture_char(string(String.of_charx))|chars->capture_char(of_captureless_string("["^char_list_to_regex_stringchars^"]"));;letnot_one_of=function(* this case is necessary because "[^]blah]" means "none of ]blah" *)|[]->any|chars->capture_char(of_captureless_string("[^"^char_list_to_regex_stringchars^"]"));;let%test_module_=(modulestructletshould_match_char=should_matchsexp_of_charlet%test_unit_=should_match_char(one_of['\xe2';'\x82';'\xac'])"\x82"'\x82';;let%test_unit_=should_match_char(one_of['x';'x'])"x"'x'let%test_unit_=should_not_match(one_of[])"x"let%test_unit_=should_match_char(or_[one_of[];one_of['x']])"x"'x'let%test_unit_=should_match_char(one_of['0';'-';'9'])"-"'-'let%test_unit_=should_not_match(one_of['0';'-';'9'])"5"let%test_unit_=letdifficult_char=one_of['^';'[';']']inletr=all[difficult_char;difficult_char;difficult_char]inshould_match[%sexp_of:charlist]r"^[]"['^';'[';']'];should_not_matchr"]]x";;let%test_unit_=should_not_match(one_of['^';']'])"\\"let%test_unit_=should_match_char(one_of['\\';'n'])"n"'n'let%test_unit_=should_match_char(one_of['x';'\\'])"\\"'\\'let%test_unit_=should_not_match(one_of['.'])"a"let%test_unit_=should_match_string(capture(ignore_m(all[any;one_of['^'];any])))"a^c""a^c";;let%test_unit_=should_not_match(not_one_of['x'])"x"let%test_unit_=should_match_char(not_one_of[']'])"x"'x'let%test_unit_=should_match_char(one_of['\000'])"\000"'\000'let%test_unit_=should_match_char(one_of['a';'\000';'b'])"b"'b'let%test_unit_=letdifficult_char=not_one_of['^';'[';']']inletr=bothdifficult_chardifficult_charinshould_match[%sexp_of:char*char]r"ab"('a','b');should_not_matchr"a^";;end);;endmoduleDecimal=structletdigit=mapChar.digit~f:(func->Int.of_string(String.of_charc))letsign=map(optional(Char.one_of['+';'-']))~f:(function|None|Some'+'->1|Some'-'->-1|Somec->failwiths~here:[%here]"matched unexpected character"c[%sexp_of:char]);;letunsigned=map(capture(repeat~min:1Char.digit))~f:Int.of_stringletint=map2signunsigned~f:(*)let%test_unit"Parsing an empty string shouldn't raise"=runint""|>Core.ignorelet%test_unit_=should_not_matchint""let%test_unit_=should_matchInt.sexp_of_tint"-10"(-10)let%test_unit_=should_matchInt.sexp_of_tint"+005"5let%test_unit_=should_matchInt.sexp_of_tint"42"42endletany_string=capture(repeat(ignore_mChar.any))let%bench_module"big regex"=(modulestructletbig_regex_benchmarkn=letregex=compile(Fn.apply_n_times~n(funx->map(or_[x;capture(string"boo")])~f:(funx->x))any_string)infun()->[%test_result:stringoption](Staged.unstageregex(String.maken'x'))~expect:(Some(String.maken'x'));;let%bench_fun("compilation"[@indexedn=[500;1000;2000;10000]])=fun()->let(_:unit->unit)=big_regex_benchmarknin();;let%bench_fun("matching only"[@indexedn=[500;1000;2000]])=big_regex_benchmarkn;;end);;endincludeBodymoduleOpen_on_rhs_intf=structmoduletypeS=Swithtype'at='atendincludeApplicative.Make_let_syntax(Body)(Open_on_rhs_intf)(Body)