(*
RE - A regular expression library
Copyright (C) 2001 Jerome Vouillon
email: Jerome.Vouillon@pps.jussieu.fr
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation, with
linking exception; either version 2.1 of the License, or (at
your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*)

module Re = Core

(** Raised when the pattern is syntactically malformed. *)
exception Parse_error

(** Raised when the pattern uses a construct that the parser recognises but
    does not implement: a backslash followed by a digit, a bracket
    equivalence class, or a multi-character collating element. *)
exception Not_supported

(** Maps a POSIX character-class name (the text between [[:] and [:]]) to the
    corresponding set from [Re].  Calls [invalid_arg] on an unknown name. *)
let posix_class_of_string = function
  | "alpha" -> Re.alpha
  | "alnum" -> Re.alnum
  | "ascii" -> Re.ascii
  | "blank" -> Re.blank
  | "cntrl" -> Re.cntrl
  | "digit" -> Re.digit
  | "lower" -> Re.lower
  | "print" -> Re.print
  | "space" -> Re.space
  | "upper" -> Re.upper
  | "word" -> Re.wordc
  | "punct" -> Re.punct
  | "graph" -> Re.graph
  | "xdigit" -> Re.xdigit
  | class_ -> invalid_arg ("Invalid pcre class: " ^ class_)

(** Every name accepted by [posix_class_of_string].  The parser tries them in
    this order against the input. *)
let posix_class_strings =
  [ "alpha"
  ; "alnum"
  ; "ascii"
  ; "blank"
  ; "cntrl"
  ; "digit"
  ; "lower"
  ; "print"
  ; "space"
  ; "upper"
  ; "word"
  ; "punct"
  ; "graph"
  ; "xdigit"
  ]

(** Recursive-descent parser turning the pattern [s] into a [Re] expression.

    - [multiline]: caret and dollar become [Re.bol] / [Re.eol].
    - [dollar_endonly]: when not [multiline], dollar becomes [Re.leol]
      instead of [Re.eos].
    - [dotall]: dot becomes [Re.any] instead of [Re.notnl].
    - [ungreedy]: inverts the meaning of a question mark placed after a
      quantifier.

    Raises [Parse_error] on malformed input and [Not_supported] on
    recognised but unimplemented constructs. *)
let parse multiline dollar_endonly dotall ungreedy s =
  (* Cursor into [s]; all helpers below read or move it. *)
  let i = ref 0 in
  let l = String.length s in
  let eos () = !i = l in
  (* Peek: is the next character [c]?  Never consumes. *)
  let test c = not (eos ()) && s.[!i] = c in
  (* Consume the next character if it is [c]; report whether it was. *)
  let accept c =
    let r = test c in
    if r then incr i;
    r
  in
  (* Consume the literal string [s'] if the input continues with it.  The
     length is checked up front so no out-of-bounds access can happen and
     no exception handler is needed; the cursor moves only on success. *)
  let accept_s s' =
    let len = String.length s' in
    let rec same_from j = j = len || (s'.[j] = s.[!i + j] && same_from (j + 1)) in
    if len <= l - !i && same_from 0
    then begin
      i := !i + len;
      true
    end
    else false
  in
  (* Unconditionally consume one character; callers check [eos] first. *)
  let get () =
    let r = s.[!i] in
    incr i;
    r
  in
  let unget () = decr i in
  (* Applied right after a quantifier: an optional question mark selects the
     non-greedy variant, or the greedy one when [ungreedy] is set. *)
  let greedy_mod r =
    let gr = accept '?' in
    let gr = if ungreedy then not gr else gr in
    if gr then Re.non_greedy r else Re.greedy r
  in
  (* regexp ::= branch ('|' branch)...  -- alternation, left-nested. *)
  let rec regexp () = regexp' (branch ())
  and regexp' left =
    if accept '|' then regexp' (Re.alt [ left; branch () ]) else left
  (* branch ::= piece...  -- stops at end of input, '|' or ')'. *)
  and branch () = branch' []
  and branch' left =
    if eos () || test '|' || test ')'
    then Re.seq (List.rev left)
    else branch' (piece () :: left)
  (* piece ::= atom followed by an optional quantifier. *)
  and piece () =
    let r = atom () in
    if accept '*'
    then greedy_mod (Re.rep r)
    else if accept '+'
    then greedy_mod (Re.rep1 r)
    else if accept '?'
    then greedy_mod (Re.opt r)
    else if accept '{'
    then begin
      match integer () with
      | Some lo ->
        (* A comma introduces an optional upper bound; without a comma the
           repetition count is exact. *)
        let hi = if accept ',' then integer () else Some lo in
        if not (accept '}') then raise Parse_error;
        begin
          match hi with
          | Some hi when hi < lo -> raise Parse_error
          | _ -> ()
        end;
        greedy_mod (Re.repn r lo hi)
      | None ->
        (* No count after the brace: put the brace back and return the bare
           atom.  The next call to [atom] then rejects the brace. *)
        unget ();
        r
    end
    else r
  and atom () =
    if accept '.'
    then begin
      if dotall then Re.any else Re.notnl
    end
    else if accept '('
    then begin
      if accept '?'
      then begin
        if accept ':'
        then begin
          (* Non-capturing group. *)
          let r = regexp () in
          if not (accept ')') then raise Parse_error;
          r
        end
        else if accept '#'
        then begin
          comment ()
        end
        else if accept '<'
        then begin
          (* Named capturing group. *)
          let name = name () in
          let r = regexp () in
          if not (accept ')') then raise Parse_error;
          Re.group ~name r
        end
        else raise Parse_error
      end
      else begin
        let r = regexp () in
        if not (accept ')') then raise Parse_error;
        Re.group r
      end
    end
    else if accept '^'
    then begin
      if multiline then Re.bol else Re.bos
    end
    else if accept '$'
    then begin
      (* NOTE(review): PCRE documents DOLLAR_ENDONLY as restricting dollar to
         the very end of the subject.  Here the flag selects [Re.leol] and
         its absence selects [Re.eos], which looks like the opposite mapping.
         Left unchanged because callers may depend on it -- confirm against
         the definitions in Core. *)
      if multiline then Re.eol else if dollar_endonly then Re.leol else Re.eos
    end
    else if accept '['
    then begin
      if accept '^' then Re.compl (bracket []) else Re.alt (bracket [])
    end
    else if accept '\\'
    then begin
      (* XXX
         - Back-references
         - \cx (control-x), \ddd
      *)
      if eos () then raise Parse_error;
      match get () with
      | 'w' -> Re.alt [ Re.alnum; Re.char '_' ]
      | 'W' -> Re.compl [ Re.alnum; Re.char '_' ]
      | 's' -> Re.space
      | 'S' -> Re.compl [ Re.space ]
      | 'd' -> Re.digit
      | 'D' -> Re.compl [ Re.digit ]
      | 'b' -> Re.alt [ Re.bow; Re.eow ]
      | 'B' -> Re.not_boundary
      | 'A' -> Re.bos
      | 'Z' -> Re.leol
      | 'z' -> Re.eos
      | 'G' -> Re.start
      | 'e' -> Re.char '\x1b'
      | 'f' -> Re.char '\x0c'
      | 'n' -> Re.char '\n'
      | 'r' -> Re.char '\r'
      | 't' -> Re.char '\t'
      | 'x' ->
        (* Exactly two hexadecimal digits are required. *)
        let c1 = hexdigit () in
        let c2 = hexdigit () in
        let code = (c1 * 16) + c2 in
        Re.char (char_of_int code)
      | 'a' .. 'z' | 'A' .. 'Z' -> raise Parse_error
      | '0' .. '9' -> raise Not_supported
      | c -> Re.char c
    end
    else begin
      if eos () then raise Parse_error;
      match get () with
      (* A quantifier or a lone backslash cannot start an atom. *)
      | '*' | '+' | '?' | '{' | '\\' -> raise Parse_error
      | c -> Re.char c
    end
  (* Consume one hexadecimal digit and return its value (0 to 15). *)
  and hexdigit () =
    if eos () then raise Parse_error;
    match get () with
    | '0' .. '9' as d -> Char.code d - Char.code '0'
    | 'a' .. 'f' as d -> Char.code d - Char.code 'a' + 10
    | 'A' .. 'F' as d -> Char.code d - Char.code 'A' + 10
    | _ -> raise Parse_error
  (* Parse an optional decimal integer; [None] (nothing consumed) when the
     next character is not a digit. *)
  and integer () =
    if eos ()
    then None
    else (
      match get () with
      | '0' .. '9' as d -> integer' (Char.code d - Char.code '0')
      | _ ->
        unget ();
        None)
  and integer' acc =
    if eos ()
    then Some acc
    else (
      match get () with
      | '0' .. '9' as d ->
        let acc' = (10 * acc) + (Char.code d - Char.code '0') in
        (* Guard against overflow: reject when the accumulated value
           decreases. *)
        if acc' < acc then raise Parse_error;
        integer' acc'
      | _ ->
        unget ();
        Some acc)
  (* Parse a group name up to and including the closing '>'.  The first
     character must be a letter or underscore; later ones may be digits. *)
  and name () =
    if eos ()
    then raise Parse_error
    else (
      match get () with
      | ('_' | 'a' .. 'z' | 'A' .. 'Z') as c ->
        let b = Buffer.create 32 in
        Buffer.add_char b c;
        name' b
      | _ -> raise Parse_error)
  and name' b =
    if eos ()
    then raise Parse_error
    else (
      match get () with
      | ('_' | 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9') as c ->
        Buffer.add_char b c;
        name' b
      | '>' -> Buffer.contents b
      | _ -> raise Parse_error)
  (* Parse the inside of a bracket expression, accumulating its members in
     [s] (most recent first).  A ']' closes the expression only once at
     least one member has been read, so a leading ']' is a literal. *)
  and bracket s =
    if s <> [] && accept ']'
    then s
    else begin
      match char () with
      | `Char c ->
        if accept '-'
        then begin
          if accept ']'
          then
            (* Trailing '-' just before the closing bracket is a literal. *)
            Re.char c :: Re.char '-' :: s
          else begin
            match char () with
            | `Char c' -> bracket (Re.rg c c' :: s)
            | `Set st' ->
              (* A set cannot end a range: keep [c] and '-' as literals. *)
              bracket (Re.char c :: Re.char '-' :: st' :: s)
          end
        end
        else bracket (Re.char c :: s)
      | `Set st -> bracket (st :: s)
    end
  (* Read one member of a bracket expression: either a single character or
     a whole set (POSIX class or backslash class). *)
  and char () =
    if eos () then raise Parse_error;
    let c = get () in
    if c = '['
    then begin
      if accept '=' then raise Not_supported;
      if accept ':'
      then begin
        (* POSIX class, optionally negated with a caret after the colon. *)
        let compl = accept '^' in
        let cls =
          try List.find accept_s posix_class_strings with
          | Not_found -> raise Parse_error
        in
        if not (accept_s ":]") then raise Parse_error;
        let re =
          let posix_class = posix_class_of_string cls in
          if compl then Re.compl [ posix_class ] else posix_class
        in
        `Set re
      end
      else if accept '.'
      then begin
        (* Collating element: only the single-character form is handled. *)
        if eos () then raise Parse_error;
        let c = get () in
        if not (accept '.') then raise Not_supported;
        if not (accept ']') then raise Parse_error;
        `Char c
      end
      else `Char c
    end
    else if c = '\\'
    then begin
      if eos () then raise Parse_error;
      let c = get () in
      (* XXX
         \127, ...
      *)
      match c with
      | 'b' -> `Char '\008'
      | 'n' -> `Char '\n' (*XXX*)
      | 'r' -> `Char '\r' (*XXX*)
      | 't' -> `Char '\t' (*XXX*)
      | 'w' -> `Set (Re.alt [ Re.alnum; Re.char '_' ])
      | 'W' -> `Set (Re.compl [ Re.alnum; Re.char '_' ])
      | 's' -> `Set Re.space
      | 'S' -> `Set (Re.compl [ Re.space ])
      | 'd' -> `Set Re.digit
      | 'D' -> `Set (Re.compl [ Re.digit ])
      | 'a' .. 'z' | 'A' .. 'Z' -> raise Parse_error
      | '0' .. '9' -> raise Not_supported
      | _ -> `Char c
    end
    else `Char c
  (* Skip the body of an inline comment up to and including the first ')';
     the comment itself contributes [Re.epsilon]. *)
  and comment () =
    if eos () then raise Parse_error;
    if accept ')'
    then Re.epsilon
    else begin
      incr i;
      comment ()
    end
  in
  let res = regexp () in
  (* Leftover input here means an unmatched ')'. *)
  if not (eos ()) then raise Parse_error;
  res

(** Flags accepted by [re] and [compile_pat]. *)
type opt =
  [ `Ungreedy
  | `Dotall
  | `Dollar_endonly
  | `Multiline
  | `Anchored
  | `Caseless
  ]

(** Parses the pattern [s] under the flags in [opts] (default: none).
    [`Anchored] prefixes the result with [Re.start]; [`Caseless] wraps it in
    [Re.no_case].  The other flags are passed to [parse].
    Raises [Parse_error] or [Not_supported] as [parse] does. *)
let re ?(opts = []) s =
  let r =
    parse
      (List.memq `Multiline opts)
      (List.memq `Dollar_endonly opts)
      (List.memq `Dotall opts)
      (List.memq `Ungreedy opts)
      s
  in
  let r = if List.memq `Anchored opts then Re.seq [ Re.start; r ] else r in
  let r = if List.memq `Caseless opts then Re.no_case r else r in
  r

(** Alias for [Re.compile]. *)
let compile = Re.compile

(** Parses [s] with [re] and compiles the result in one step. *)
let compile_pat ?(opts = []) s = compile (re ~opts s)