(*
RE - A regular expression library
Copyright (C) 2001 Jerome Vouillon
email: Jerome.Vouillon@pps.jussieu.fr
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation, with
linking exception; either version 2.1 of the License, or (at
your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*)

module Re = Core

exception Parse_error = Parse_buffer.Parse_error
exception Not_supported

(* [parse multiline dollar_endonly dotall ungreedy s] is a recursive-descent
   parser that turns the Perl-style pattern [s] into a [Re] expression.

   - [multiline]: [^] and [$] become [Re.bol] / [Re.eol] instead of
     [Re.bos] / end-of-string anchors.
   - [dollar_endonly]: outside multiline mode, [$] becomes [Re.eos] when set
     and [Re.leol] otherwise.
   - [dotall]: [.] becomes [Re.any] instead of [Re.notnl].
   - [ungreedy]: inverts the meaning of a [?] following a quantifier.

   Raises [Parse_error] on malformed input (including trailing unparsed
   characters) and [Not_supported] for constructs this parser rejects
   explicitly: a backslash followed by a digit, and the bracket forms
   starting with [[=] or a [[.x] not closed by a dot.

   NOTE(review): the exact consume/peek semantics of the [Parse_buffer]
   primitives are defined elsewhere; the comments below assume [accept]
   consumes the character only when it matches -- confirm against that
   module. *)
let parse multiline dollar_endonly dotall ungreedy s =
  let pbuf = Parse_buffer.create s in
  let accept = Parse_buffer.accept pbuf in
  let at_end () = Parse_buffer.eos pbuf in
  let peek_is c = Parse_buffer.test pbuf c in
  let push_back () = Parse_buffer.unget pbuf in
  let next () = Parse_buffer.get pbuf in
  (* Require character [c] at the current position. *)
  let need c = if not (accept c) then raise Parse_error in
  (* Require that some input is left before calling [next]. *)
  let need_input () = if at_end () then raise Parse_error in
  (* Wrap a freshly quantified expression as greedy or non-greedy, looking
     at an optional [?] suffix; [ungreedy] flips the outcome. *)
  let with_greediness quantified =
    let marked = accept '?' in
    let reluctant = marked <> ungreedy in
    if reluctant then Re.non_greedy quantified else Re.greedy quantified
  in
  (* regexp ::= branch ('|' branch)* *)
  let rec regexp () = alternatives (branch ())
  and alternatives acc =
    if accept '|' then alternatives (Re.alt [ acc; branch () ]) else acc
  (* branch ::= piece*, stopping at end of input, '|' or ')' *)
  and branch () = pieces []
  and pieces rev_acc =
    if at_end () || peek_is '|' || peek_is ')' then Re.seq (List.rev rev_acc)
    else pieces (piece () :: rev_acc)
  (* piece ::= atom followed by an optional quantifier *)
  and piece () =
    let base = atom () in
    if accept '*' then with_greediness (Re.rep base)
    else if accept '+' then with_greediness (Re.rep1 base)
    else if accept '?' then with_greediness (Re.opt base)
    else if not (accept '{') then base
    else
      match Parse_buffer.integer pbuf with
      | None ->
        (* No count after the brace: put the brace back and return the
           atom unquantified. *)
        push_back ();
        base
      | Some lo ->
        (* {n} repeats exactly n times; {n,m} and {n,} take whatever
           [Parse_buffer.integer] yields after the comma as upper bound. *)
        let hi = if accept ',' then Parse_buffer.integer pbuf else Some lo in
        need '}';
        (match hi with
         | Some upper when upper < lo -> raise Parse_error
         | _ -> ());
        with_greediness (Re.repn base lo hi)
  and atom () =
    if accept '.' then (if dotall then Re.any else Re.notnl)
    else if accept '(' then group ()
    else if accept '^' then (if multiline then Re.bol else Re.bos)
    else if accept '$' then
      (if multiline then Re.eol
       else if dollar_endonly then Re.leol
       else Re.eos)
    else if accept '[' then
      (if accept '^' then Re.compl (bracket []) else Re.alt (bracket []))
    else if accept '\\' then escape ()
    else begin
      need_input ();
      match next () with
      (* a quantifier or backslash cannot start an atom here *)
      | '*' | '+' | '?' | '{' | '\\' -> raise Parse_error
      | c -> Re.char c
    end
  (* Called right after an opening parenthesis has been consumed. *)
  and group () =
    if not (accept '?') then begin
      (* plain capturing group *)
      let inner = regexp () in
      need ')';
      Re.group inner
    end
    else if accept ':' then begin
      (* (?:...) groups without capturing *)
      let inner = regexp () in
      need ')';
      inner
    end
    else if accept '#' then comment ()
    else if accept '<' then begin
      (* (?<name>...) named capturing group *)
      let label = name () in
      let inner = regexp () in
      need ')';
      Re.group ~name:label inner
    end
    else raise Parse_error
  (* Called right after a backslash has been consumed outside brackets.
     XXX still missing:
     - Back-references
     - \cx (control-x), \ddd *)
  and escape () =
    need_input ();
    match next () with
    | 'w' -> Re.alt [ Re.alnum; Re.char '_' ]
    | 'W' -> Re.compl [ Re.alnum; Re.char '_' ]
    | 's' -> Re.space
    | 'S' -> Re.compl [ Re.space ]
    | 'd' -> Re.digit
    | 'D' -> Re.compl [ Re.digit ]
    | 'b' -> Re.alt [ Re.bow; Re.eow ]
    | 'B' -> Re.not_boundary
    | 'A' -> Re.bos
    | 'Z' -> Re.leol
    | 'z' -> Re.eos
    | 'G' -> Re.start
    | 'e' -> Re.char '\x1b'
    | 'f' -> Re.char '\x0c'
    | 'n' -> Re.char '\n'
    | 'r' -> Re.char '\r'
    | 't' -> Re.char '\t'
    | 'x' ->
      (* exactly two hexadecimal digits, most significant first *)
      let high = hexdigit () in
      let low = hexdigit () in
      Re.char (char_of_int ((high * 16) + low))
    | 'a' .. 'z' | 'A' .. 'Z' -> raise Parse_error
    | '0' .. '9' -> raise Not_supported
    | c -> Re.char c
  (* Read one hexadecimal digit and return its value (0-15). *)
  and hexdigit () =
    let offset_from origin d = Char.code d - Char.code origin in
    need_input ();
    match next () with
    | '0' .. '9' as d -> offset_from '0' d
    | 'a' .. 'f' as d -> offset_from 'a' d + 10
    | 'A' .. 'F' as d -> offset_from 'A' d + 10
    | _ -> raise Parse_error
  (* Read a group name up to and including the closing '>': one letter or
     underscore, then letters, digits or underscores. *)
  and name () =
    need_input ();
    match next () with
    | ('_' | 'a' .. 'z' | 'A' .. 'Z') as first ->
      let acc = Buffer.create 32 in
      Buffer.add_char acc first;
      name_rest acc
    | _ -> raise Parse_error
  and name_rest acc =
    need_input ();
    match next () with
    | ('_' | 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9') as c ->
      Buffer.add_char acc c;
      name_rest acc
    | '>' -> Buffer.contents acc
    | _ -> raise Parse_error
  (* Collect the members of a bracket expression, most recent first, until
     the closing bracket. A closing bracket is only looked for once at least
     one member has been collected. *)
  and bracket acc =
    if acc <> [] && accept ']' then acc
    else
      match char () with
      | `Set set -> bracket (set :: acc)
      | `Char c ->
        if not (accept '-') then bracket (Re.char c :: acc)
        else if accept ']' then
          (* a dash right before the closing bracket is a literal dash *)
          Re.char c :: Re.char '-' :: acc
        else
          let extended =
            match char () with
            | `Char upper -> Re.rg c upper :: acc
            | `Set set ->
              (* a set cannot end a range: keep the dash literal *)
              Re.char c :: Re.char '-' :: set :: acc
          in
          bracket extended
  (* Read one bracket-expression member: either a single character
     ([`Char]) or a whole class ([`Set]). *)
  and char () =
    need_input ();
    match next () with
    | '[' ->
      if accept '=' then raise Not_supported;
      (match Posix_class.parse pbuf with
       | Some set -> `Set set
       | None ->
         if accept '.' then begin
           (* [.x.] form: only a single character between the dots *)
           need_input ();
           let symbol = next () in
           if not (accept '.') then raise Not_supported;
           need ']';
           `Char symbol
         end
         else `Char '[')
    | '\\' ->
      need_input ();
      (* XXX
         \127, ... *)
      (match next () with
       | 'b' -> `Char '\008'
       | 'n' -> `Char '\n' (* XXX *)
       | 'r' -> `Char '\r' (* XXX *)
       | 't' -> `Char '\t' (* XXX *)
       | 'w' -> `Set (Re.alt [ Re.alnum; Re.char '_' ])
       | 'W' -> `Set (Re.compl [ Re.alnum; Re.char '_' ])
       | 's' -> `Set Re.space
       | 'S' -> `Set (Re.compl [ Re.space ])
       | 'd' -> `Set Re.digit
       | 'D' -> `Set (Re.compl [ Re.digit ])
       | 'a' .. 'z' | 'A' .. 'Z' -> raise Parse_error
       | '0' .. '9' -> raise Not_supported
       | c -> `Char c)
    | c -> `Char c
  (* Skip the body of a (?#...) comment up to its closing parenthesis; the
     comment contributes [Re.epsilon]. *)
  and comment () =
    need_input ();
    if accept ')' then Re.epsilon
    else begin
      Parse_buffer.junk pbuf;
      comment ()
    end
  in
  let result = regexp () in
  (* anything left over (e.g. an unmatched closing parenthesis) is an error *)
  if not (at_end ()) then raise Parse_error;
  result

type opt =
  [ `Ungreedy
  | `Dotall
  | `Dollar_endonly
  | `Multiline
  | `Anchored
  | `Caseless
  ]

(* [re ?opts s] parses the pattern [s] under the flags listed in [opts]
   (default: none). [`Anchored] prefixes the result with [Re.start] and
   [`Caseless] wraps it in [Re.no_case]; the other flags are passed to
   [parse]. *)
let re ?(opts = []) s =
  let enabled flag = List.memq flag opts in
  let parsed =
    parse
      (enabled `Multiline)
      (enabled `Dollar_endonly)
      (enabled `Dotall)
      (enabled `Ungreedy)
      s
  in
  let anchored =
    if enabled `Anchored then Re.seq [ Re.start; parsed ] else parsed
  in
  if enabled `Caseless then Re.no_case anchored else anchored

let compile = Re.compile

(* [compile_pat ?opts s] is [compile (re ?opts s)]. *)
let compile_pat ?(opts = []) s =
  let expr = re ~opts s in
  compile expr