123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434# 1 "Camomile/public/uRe.ml"(** Regular expression engine. *)(* Copyright (C) 2003 Yamagata Yoriyuki. distributed with LGPL *)(* This library is free software; you can redistribute it and/or *)(* modify it under the terms of the GNU Lesser General Public License *)(* as published by the Free Software Foundation; either version 2 of *)(* the License, or (at your option) any later version. *)(* As a special exception to the GNU Library General Public License, you *)(* may link, statically or dynamically, a "work that uses this library" *)(* with a publicly distributed version of this library to produce an *)(* executable file containing portions of this library, and distribute *)(* that executable file under terms of your choice, without any of the *)(* additional requirements listed in clause 6 of the GNU Library General *)(* Public License. By "a publicly distributed version of this library", *)(* we mean either the unmodified Library as distributed by the authors, *)(* or a modified version of this library that is distributed under the *)(* conditions defined in clause 3 of the GNU Library General Public *)(* License. This exception does not however invalidate any other reasons *)(* why the executable file might be covered by the GNU Library General *)(* Public License . *)(* This library is distributed in the hope that it will be useful, *)(* but WITHOUT ANY WARRANTY; without even the implied warranty of *)(* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU *)(* Lesser General Public License for more details. *)(* You should have received a copy of the GNU Lesser General Public *)(* License along with this library; if not, write to the Free Software *)(* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 *)(* USA *)(* You can contact the authour by sending email to *)(* yoriyuki.y@gmail.com *)typeregexp=[`Altofregexp*regexp|`Seqofregexp*regexp|`Repofregexp|`Repnofregexp*int*intoption|`Afterofregexp|`Beforeofregexp|`Epsilon|`Groupofregexp|`OneChar|`StringofUChar.tlist|`SetofUSet.t|`BoS|`EoS]typematch_semantics=[`First|`Shortest|`Longest]letrecno_group=function`Alt(r1,r2)->`Alt(no_groupr1,no_groupr2)|`Seq(r1,r2)->`Alt(no_groupr1,no_groupr2)|`Repr->`Rep(no_groupr)|`Groupr->r|r->rmoduletypeType=sigtypetexttypeindextypecompiled_regexpmoduleSubText:SubText.Typewithtypeur_text=textandtypeur_index=indexvalcompile:regexp->compiled_regexpvalregexp_match:?sem:match_semantics->compiled_regexp->text->index->SubText.toptionarrayoptionvalstring_match:compiled_regexp->text->index->boolvalsearch_forward:?sem:match_semantics->compiled_regexp->text->index->SubText.toptionarrayoptionendmoduleMake(Text:UnicodeString.Type)=structtypetext=Text.ttypeindex=Text.indexmoduleSubText=SubText.Make(Text)typeinstr=StringofUChar.tlist|OneChar|SetofUSet.t|Parofinstr*instr|Seqofinstr*instr|Repofinstr|Repnofinstr*int*intoption|Epsilon|Groupofint*instr|BoS|EoS|Beforeofinstr|Afterofinstr|Group_endofint*Text.indextypecompiled_regexp=int*instrletcompile(r:regexp)=letrecloopn=function`Alt(r1,r2)->letn,r1=loopnr1inletn,r2=loopnr2inn,Par(r1,r2)|`Seq(r1,r2)->letn,r1=loopnr1inletn,r2=loopnr2inn,Seq(r1,r2)|`Repr->letn,r=loopnrinn,Repr|`Repn(r,n,m)->letn,r=loopnrinn,Repn(r,n,m)|`Groupr->letn',r=loop(n+1)rinn',Group(n,r)|`Epsilon->n,Epsilon|`Strings->n,Strings|`OneChar->n,OneChar|`Sets->n,Sets|`BoS->n,BoS|`EoS->n,EoS|`Afterr->letn,r=loopnrinn,Afterr|`Beforer->letn,r=loopnrinn,Beforerinloop1rletrecstring_matchti=function[]->i|u::rest->ifText.out_of_rangetithenraiseExitelseifUChar.eq(Text.lookti)uthenstring_matcht(Text.nextti)restelseraiseExitletreverse_string_matchtius=letus=List.revusinletrecloopi=function[]->i|u::rest->ifText.out_of_rangetithenraiseExitelseifUChar.eq(Text.lookti)uthenloop(Text.prevti)restelseraiseExitinloopiusletdec_opt=functionNone->None|Somem->Some(m-1)letrecexec_firstgroupsti=function|[]->i,groups|Seq(r1,r2)::rest->exec_firstgroupsti(r1::r2::rest)|Repr::restasrs->(tryexec_firstgroupstirestwithExit->exec_firstgroupsti(r::rs))|Repn(r,n,m)::rest->ifn>0thenexec_firstgroupsti(r::Repn(r,n-1,dec_optm)::rest)else(matchmwithNone->exec_firstgroupsti(Repr::rest)|Somem->ifm<=0thenexec_firstgroupstirestelselets=Repn(r,0,Some(m-1))intryexec_firstgroupstirestwithExit->exec_firstgroupsti(r::s::rest))|Par(r1,r2)::rest->(tryexec_firstgroupsti(r1::rest)withExit->exec_firstgroupsti(r2::rest))|Group(n,r)::rest->exec_firstgroupsti(r::Group_end(n,i)::rest)|Group_end(n,i0)::rest->lets=SubText.referti0iinexec_first((n,s)::groups)tirest|Stringt0::rest->exec_firstgroupst(string_matchtit0)rest|OneChar::rest->ifText.out_of_rangetithenraiseExitelseexec_firstgroupst(Text.nextti)rest|Sets::rest->(ifText.out_of_rangetithenraiseExitelsematchUSet.mem(Text.lookti)swithtrue->exec_firstgroupst(Text.nextti)rest|false->raiseExit)|Epsilon::rest->exec_firstgroupstirest|BoS::rest->ifText.compare_indexti(Text.ntht0)>0thenraiseExitelseexec_firstgroupstirest|EoS::rest->ifText.compare_indexti(Text.lastt)<=0thenraiseExitelseexec_firstgroupstirest|Afterr::rest->let_,groups=reversegroupsti[r]inexec_firstgroupstirest|Beforer::rest->let_,groups=exec_firstgroupsti[r]inexec_firstgroupstirestandreversegti=function|[]->i,g|Seq(r1,r2)::rest->reversegti(r2::r1::rest)|Repr::restasrs->(tryreversegtirestwithExit->reversegti(r::rs))|Repn(r,n,m)::rest->ifn>0thenreversegti(r::Repn(r,n-1,dec_optm)::rest)else(matchmwithNone->reversegti(Repr::rest)|Somem->ifm<=0thenreversegtirestelselets=Repn(r,0,Some(m-1))intryreversegtirestwithExit->reversegti(r::s::rest))|Par(r1,r2)::rest->(tryreversegti(r1::rest)withExit->reversegti(r2::rest))|Group(n,r)::rest->reversegti(r::Group_end(n,i)::rest)|Group_end(n,j)::rest->lets=SubText.refertijinreverse((n,s)::g)tirest|Stringt0::rest->reversegt(reverse_string_matchtit0)rest|OneChar::rest->ifText.out_of_rangetithenraiseExitelsereversegt(Text.prevti)rest|Sets::rest->(ifText.out_of_rangetithenraiseExitelsematchUSet.mem(Text.lookti)swithtrue->reversegt(Text.prevti)rest|false->raiseExit)|Epsilon::rest->reversegtirest|BoS::rest->ifText.compare_indexti(Text.ntht0)>0thenraiseExitelsereversegtirest|EoS::rest->ifText.compare_indexti(Text.lastt)<=0thenraiseExitelsereversegtirest|Afterr::rest->let_,g=reversegti[r]inreversegtirest|Beforer::rest->let_,g=exec_firstgti[r]inreversegtirestletrecexec_longestgroupsti=function|[]->i,groups|Seq(r1,r2)::rest->exec_longestgroupsti(r1::r2::rest)|Repr::restasrs->(tryleti1,g1=exec_longestgroupstirestintryleti2,g2=exec_longestgroupsti(r::rs)inifText.compare_indexti1i2>=0theni1,g1elsei2,g2withExit->i1,g1withExit->exec_longestgroupsti(r::rs))|Repn(r,n,m)::rest->ifn>0thenexec_longestgroupsti(r::Repn(r,n-1,dec_optm)::rest)else(matchmwithNone->exec_longestgroupsti(Repr::rest)|Somem->ifm<=0thenexec_longestgroupstirestelselets=Repn(r,0,Some(m-1))intryleti1,g1=exec_longestgroupstirestintryleti2,g2=exec_longestgroupsti(r::s::rest)inifText.compare_indexti1i2>=0theni1,g1elsei2,g2withExit->i1,g1withExit->exec_longestgroupsti(r::s::rest))|Par(r1,r2)::rest->(tryexec_longestgroupsti(r1::rest)withExit->exec_longestgroupsti(r2::rest))|Group(n,r)::rest->exec_longestgroupsti(r::Group_end(n,i)::rest)|Group_end(n,i0)::rest->lets=SubText.referti0iinexec_longest((n,s)::groups)tirest|Stringt0::rest->exec_longestgroupst(string_matchtit0)rest|OneChar::rest->ifText.out_of_rangetithenraiseExitelseexec_longestgroupst(Text.nextti)rest|Sets::rest->(ifText.out_of_rangetithenraiseExitelsematchUSet.mem(Text.lookti)swithtrue->exec_longestgroupst(Text.nextti)rest|false->raiseExit)|Epsilon::rest->exec_longestgroupstirest|BoS::rest->ifText.compare_indexti(Text.ntht0)>0thenraiseExitelseexec_longestgroupstirest|EoS::rest->ifText.compare_indexti(Text.lastt)<=0thenraiseExitelseexec_longestgroupstirest|Afterr::rest->let_,g=reversegroupsti[r]inexec_longestgtirest|Beforer::rest->let_,g=exec_firstgroupsti[r]inexec_longestgtirestletrecexec_shortestgroupsti=function|[]->i,groups|Seq(r1,r2)::rest->exec_shortestgroupsti(r1::r2::rest)|Repr::restasrs->(tryleti1,g1=exec_shortestgroupstirestintryleti2,g2=exec_shortestgroupsti(r::rs)inifText.compare_indexti1i2<=0theni1,g1elsei2,g2withExit->i1,g1withExit->exec_shortestgroupsti(r::rs))|Repn(r,n,m)::rest->ifn>0thenexec_shortestgroupsti(r::Repn(r,n-1,dec_optm)::rest)else(matchmwithNone->exec_shortestgroupsti(Repr::rest)|Somem->ifm<=0thenexec_shortestgroupstirestelselets=Repn(r,0,Some(m-1))intryleti1,g1=exec_shortestgroupstirestintryleti2,g2=exec_shortestgroupsti(r::s::rest)inifText.compare_indexti1i2<=0theni1,g1elsei2,g2withExit->i1,g1withExit->exec_shortestgroupsti(r::s::rest))|Par(r1,r2)::rest->(tryexec_shortestgroupsti(r1::rest)withExit->exec_shortestgroupsti(r2::rest))|Group(n,r)::rest->exec_shortestgroupsti(r::Group_end(n,i)::rest)|Group_end(n,i0)::rest->lets=SubText.referti0iinexec_shortest((n,s)::groups)tirest|Stringt0::rest->exec_shortestgroupst(string_matchtit0)rest|OneChar::rest->ifText.out_of_rangetithenraiseExitelseexec_shortestgroupst(Text.nextti)rest|Sets::rest->(ifText.out_of_rangetithenraiseExitelsematchUSet.mem(Text.lookti)swithtrue->exec_shortestgroupst(Text.nextti)rest|false->raiseExit)|Epsilon::rest->exec_shortestgroupstirest|BoS::rest->ifText.compare_indexti(Text.ntht0)>0thenraiseExitelseexec_shortestgroupstirest|EoS::rest->ifText.compare_indexti(Text.lastt)<=0thenraiseExitelseexec_shortestgroupstirest|Afterr::rest->let_,g=reversegroupsti[r]inexec_shortestgtirest|Beforer::rest->let_,g=exec_firstgroupsti[r]inexec_shortestgtirestletset_groupsgroupsg=letrecloop=function[]->()|(n,s)::rest->groups.(n)<-Somes;looprestinloop(List.revg)letregexp_match?(sem=`Longest)(n,r)ti=letgroups=Array.makenNoneintrymatchsemwith`First->letj,g=exec_first[]ti[r]inset_groupsgroupsg;groups.(0)<-Some(SubText.refertij);Somegroups|`Shortest->letj,g=exec_shortest[]ti[r]inset_groupsgroupsg;groups.(0)<-Some(SubText.refertij);Somegroups|`Longest->letj,g=exec_longest[]ti[r]inset_groupsgroupsg;groups.(0)<-Some(SubText.refertij);SomegroupswithExit->Noneletstring_match(_,r)ti=tryignore(exec_first[]ti[r]);truewithExit->falseletsearch_forward?(sem=`Longest)((n,r)asc)ti=letgroups=Array.makenNoneinletrecscani=ifText.out_of_rangetithenNoneelsetryletj,g=exec_first[]ti[r]inSome(i,j,g)withExit->scan(Text.nextti)inmatchscaniwithSome(i,j,g)->(matchsemwith`First->set_groupsgroupsg;groups.(0)<-Some(SubText.refertij);Somegroups|_->regexp_match~semcti)|None->Noneend