123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156(*
RE - A regular expression library
Copyright (C) 2001 Jerome Vouillon
email: Jerome.Vouillon@pps.jussieu.fr
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation, with
linking exception; either version 2.1 of the License, or (at
your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*)(*
What we could (should?) do:
- a* ==> longest ((shortest (no_group a)* ), a | ()) (!!!)
- abc understood as (ab)c
- "((a?)|b)" against "ab" should not bind the first subpattern to anything
Note that it should be possible to handle "(((ab)c)d)e" efficiently
*)moduleRe=CoreexceptionParse_errorexceptionNot_supportedletparsenewlines=leti=ref0inletl=String.lengthsinleteos()=!i=linlettestc=not(eos())&&s.[!i]=cinletacceptc=letr=testcinifrthenincri;rinletget()=letr=s.[!i]inincri;rinletunget()=decriinletrecregexp()=regexp'(branch())andregexp'left=ifaccept'|'thenregexp'(Re.alt[left;branch()])elseleftandbranch()=branch'[]andbranch'left=ifeos()||test'|'||test')'thenRe.seq(List.revleft)elsebranch'(piece()::left)andpiece()=letr=atom()inifaccept'*'thenRe.rep(Re.nestr)elseifaccept'+'thenRe.rep1(Re.nestr)elseifaccept'?'thenRe.optrelseifaccept'{'thenmatchinteger()withSomei->letj=ifaccept','theninteger()elseSomeiinifnot(accept'}')thenraiseParse_error;beginmatchjwithSomejwhenj<i->raiseParse_error|_->()end;Re.repn(Re.nestr)ij|None->unget();relserandatom()=ifaccept'.'thenbeginifnewlinethenRe.notnlelseRe.anyendelseifaccept'('thenbeginletr=regexp()inifnot(accept')')thenraiseParse_error;Re.grouprendelseifaccept'^'thenbeginifnewlinethenRe.bolelseRe.bosendelseifaccept'$'thenbeginifnewlinethenRe.eolelseRe.eosendelseifaccept'['thenbeginifaccept'^'thenRe.diff(Re.compl(bracket[]))(Re.char'\n')elseRe.alt(bracket[])endelseifaccept'\\'thenbeginifeos()thenraiseParse_error;matchget()with'|'|'('|')'|'*'|'+'|'?'|'['|'.'|'^'|'$'|'{'|'\\'asc->Re.charc|_->raiseParse_errorendelsebeginifeos()thenraiseParse_error;matchget()with'*'|'+'|'?'|'{'|'\\'->raiseParse_error|c->Re.charcendandinteger()=ifeos()thenNoneelsematchget()with'0'..'9'asd->integer'(Char.coded-Char.code'0')|_->unget();Noneandinteger'i=ifeos()thenSomeielsematchget()with'0'..'9'asd->leti'=10*i+(Char.coded-Char.code'0')inifi'<ithenraiseParse_error;integer'i'|_->unget();Someiandbrackets=ifs<>[]&&accept']'thenselsebeginletc=char()inifaccept'-'thenbeginifaccept']'thenRe.charc::Re.char'-'::selsebeginletc'=char()inbracket(Re.rgcc'::s)endendelsebracket(Re.charc::s)endandchar()=ifeos()thenraiseParse_error;letc=get()inifc='['thenbeginifaccept'='thenraiseNot_supportedelseifaccept':'thenbeginraiseNot_supported(*XXX*)endelseifaccept'.'thenbeginifeos()thenraiseParse_error;letc=get()inifnot(accept'.')thenraiseNot_supported;ifnot(accept']')thenraiseParse_error;cendelsecendelsecinletres=regexp()inifnot(eos())thenraiseParse_error;restypeopt=[`ICase|`NoSub|`Newline]letre?(opts=[])s=letr=parse(List.memq`Newlineopts)sinletr=ifList.memq`ICaseoptsthenRe.no_caserelserinletr=ifList.memq`NoSuboptsthenRe.no_grouprelserinrletcompilere=Re.compile(Re.longestre)letcompile_pat?(opts=[])s=compile(re~optss)