123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537(* The package sedlex is released under the terms of an MIT-like license. *)(* See the attached LICENSE file. *)(* Copyright 2005, 2013 by Alain Frisch and LexiFi. *)exceptionInvalidCodepointofintexceptionMalFormedletgen_of_channelchan=letf()=trySome(input_charchan)withEnd_of_file->Noneinflet(>>=)of=matchowith|Somex->fx|None->None(* For legacy purposes. *)letgen_of_streamstream=letf()=trySome(Stream.nextstream)withStream.Failure->Noneinf(* Absolute position from the beginning of the stream *)typeapos=inttypelexbuf={refill:(Uchar.tarray->int->int->int);mutablebuf:Uchar.tarray;mutablelen:int;(* Number of meaningful char in buffer *)mutableoffset:apos;(* Position of the first char in buffer
in the input stream *)mutablepos:int;(* pos is the index in the buffer *)mutablecurr_bol:int;(* bol is the index in the input stream but not buffer *)mutablecurr_line:int;(* start from 1, if it is 0, we would not track postion info for you *)mutablestart_pos:int;(* First char we need to keep visible *)mutablestart_bol:int;mutablestart_line:int;mutablemarked_pos:int;mutablemarked_bol:int;mutablemarked_line:int;mutablemarked_val:int;mutablefilename:string;mutablefinished:bool;}letchunk_size=512letempty_lexbuf={refill=(fun___->assertfalse);buf=[||];len=0;offset=0;pos=0;curr_bol=0;curr_line=0;start_pos=0;start_bol=0;start_line=0;marked_pos=0;marked_bol=0;marked_line=0;marked_val=0;filename="";finished=false;}letcreatef={empty_lexbufwithrefill=f;buf=Array.makechunk_size(Uchar.of_int0);curr_line=1;}letset_positionlexbufposition=lexbuf.offset<-position.Lexing.pos_cnum-lexbuf.pos;lexbuf.curr_bol<-position.Lexing.pos_bol;lexbuf.curr_line<-position.Lexing.pos_lnumletset_filenamelexbuffname=lexbuf.filename<-fnameletfill_buf_from_genfgenbufposlen=letrecauxi=ifi>=lenthenlenelsematchgen()with|Somec->buf.(pos+i)<-fc;aux(i+1)|None->iinaux0letfrom_gens=create(fill_buf_from_gen(funid->id)s)letfrom_streams=from_gen@@gen_of_streamsletfrom_int_arraya=letlen=Array.lengthain{empty_lexbufwithbuf=Array.initlen(funi->Uchar.of_inta.(i));len=len;finished=true;}letfrom_uchar_arraya=letlen=Array.lengthain{empty_lexbufwithbuf=Array.initlen(funi->a.(i));len=len;finished=true;}letrefilllexbuf=iflexbuf.len+chunk_size>Array.lengthlexbuf.bufthenbeginlets=lexbuf.start_posinletls=lexbuf.len-sinifls+chunk_size<=Array.lengthlexbuf.bufthenArray.blitlexbuf.bufslexbuf.buf0lselsebeginletnewlen=(Array.lengthlexbuf.buf+chunk_size)*2inletnewbuf=Array.makenewlen(Uchar.of_int0)inArray.blitlexbuf.bufsnewbuf0ls;lexbuf.buf<-newbufend;lexbuf.len<-ls;lexbuf.offset<-lexbuf.offset+s;lexbuf.pos<-lexbuf.pos-s;lexbuf.marked_pos<-lexbuf.marked_pos-s;lexbuf.start_pos<-0end;letn=lexbuf.refilllexbuf.buflexbuf.poschunk_sizeinifn=0thenlexbuf.finished<-trueelselexbuf.len<-lexbuf.len+nletnew_linelexbuf=iflexbuf.curr_line!=0thenlexbuf.curr_line<-lexbuf.curr_line+1;lexbuf.curr_bol<-lexbuf.pos+lexbuf.offsetletnextlexbuf=if(notlexbuf.finished)&&(lexbuf.pos=lexbuf.len)thenrefilllexbuf;iflexbuf.finished&&(lexbuf.pos=lexbuf.len)thenNoneelsebeginletret=lexbuf.buf.(lexbuf.pos)inlexbuf.pos<-lexbuf.pos+1;ifret=(Uchar.of_int10)thennew_linelexbuf;Someretendlet__private__next_intlexbuf:int=if(notlexbuf.finished)&&(lexbuf.pos=lexbuf.len)thenrefilllexbuf;iflexbuf.finished&&(lexbuf.pos=lexbuf.len)then-1elsebeginletret=lexbuf.buf.(lexbuf.pos)inlexbuf.pos<-lexbuf.pos+1;ifret=(Uchar.of_int10)thennew_linelexbuf;Uchar.to_intretendletmarklexbufi=lexbuf.marked_pos<-lexbuf.pos;lexbuf.marked_bol<-lexbuf.curr_bol;lexbuf.marked_line<-lexbuf.curr_line;lexbuf.marked_val<-iletstartlexbuf=lexbuf.start_pos<-lexbuf.pos;lexbuf.start_bol<-lexbuf.curr_bol;lexbuf.start_line<-lexbuf.curr_line;marklexbuf(-1)letbacktracklexbuf=lexbuf.pos<-lexbuf.marked_pos;lexbuf.curr_bol<-lexbuf.marked_bol;lexbuf.curr_line<-lexbuf.marked_line;lexbuf.marked_valletrollbacklexbuf=lexbuf.pos<-lexbuf.start_pos;lexbuf.curr_bol<-lexbuf.start_bol;lexbuf.curr_line<-lexbuf.start_lineletlexeme_startlexbuf=lexbuf.start_pos+lexbuf.offsetletlexeme_endlexbuf=lexbuf.pos+lexbuf.offsetletloclexbuf=(lexbuf.start_pos+lexbuf.offset,lexbuf.pos+lexbuf.offset)letlexeme_lengthlexbuf=lexbuf.pos-lexbuf.start_posletsub_lexemelexbufposlen=Array.sublexbuf.buf(lexbuf.start_pos+pos)lenletlexemelexbuf=Array.sublexbuf.buf(lexbuf.start_pos)(lexbuf.pos-lexbuf.start_pos)letlexeme_charlexbufpos=lexbuf.buf.(lexbuf.start_pos+pos)letlexing_positionslexbuf=letstart_p={Lexing.pos_fname=lexbuf.filename;pos_lnum=lexbuf.start_line;pos_cnum=lexbuf.start_pos+lexbuf.offset;pos_bol=lexbuf.start_bol;}andcurr_p={Lexing.pos_fname=lexbuf.filename;pos_lnum=lexbuf.curr_line;pos_cnum=lexbuf.pos+lexbuf.offset;pos_bol=lexbuf.curr_bol;}in(start_p,curr_p)letwith_tokenizerlexer'lexbuf=letlexer()=lettoken=lexer'lexbufinlet(start_p,curr_p)=lexing_positionslexbufin(token,start_p,curr_p)inlexermoduleLatin1=structletfrom_gens=create(fill_buf_from_genUchar.of_chars)letfrom_streams=from_gen@@gen_of_streamsletfrom_strings=letlen=String.lengthsin{empty_lexbufwithbuf=Array.initlen(funi->Uchar.of_chars.[i]);len=len;finished=true;}letfrom_channelic=from_gen(gen_of_channelic)letto_latin1c=ifUchar.is_charcthenUchar.to_charcelseraise(InvalidCodepoint(Uchar.to_intc))letlexeme_charlexbufpos=to_latin1(lexeme_charlexbufpos)letsub_lexemelexbufposlen=lets=Bytes.createleninfori=0tolen-1doBytes.setsi(to_latin1lexbuf.buf.(lexbuf.start_pos+pos+i))done;Bytes.to_stringsletlexemelexbuf=sub_lexemelexbuf0(lexbuf.pos-lexbuf.start_pos)endmoduleUtf8=structmoduleHelper=struct(* http://www.faqs.org/rfcs/rfc3629.html *)letwidth=Array.make256(-1)let()=fori=0to127dowidth.(i)<-1done;fori=192to223dowidth.(i)<-2done;fori=224to239dowidth.(i)<-3done;fori=240to247dowidth.(i)<-4doneletnextsi=matchs.[i]with|'\000'..'\127'asc->Char.codec|'\192'..'\223'asc->letn1=Char.codecinletn2=Char.codes.[i+1]inif(n2lsr6!=0b10)thenraiseMalFormed;((n1land0x1f)lsl6)lor(n2land0x3f)|'\224'..'\239'asc->letn1=Char.codecinletn2=Char.codes.[i+1]inletn3=Char.codes.[i+2]inif(n2lsr6!=0b10)||(n3lsr6!=0b10)thenraiseMalFormed;letp=((n1land0x0f)lsl12)lor((n2land0x3f)lsl6)lor(n3land0x3f)inif(p>=0xd800)&&(p<=0xdf00)thenraiseMalFormed;p|'\240'..'\247'asc->letn1=Char.codecinletn2=Char.codes.[i+1]inletn3=Char.codes.[i+2]inletn4=Char.codes.[i+3]inif(n2lsr6!=0b10)||(n3lsr6!=0b10)||(n4lsr6!=0b10)thenraiseMalFormed;((n1land0x07)lsl18)lor((n2land0x3f)lsl12)lor((n3land0x3f)lsl6)lor(n4land0x3f)|_->raiseMalFormedletfrom_gens=Gen.nexts>>=function|'\000'..'\127'asc->Some(Uchar.of_charc)|'\192'..'\223'asc->letn1=Char.codecinGen.nexts>>=func2->letn2=Char.codec2inif(n2lsr6!=0b10)thenraiseMalFormed;Some(Uchar.of_int(((n1land0x1f)lsl6)lor(n2land0x3f)))|'\224'..'\239'asc->letn1=Char.codecinGen.nexts>>=func2->letn2=Char.codec2inGen.nexts>>=func3->letn3=Char.codec3inif(n2lsr6!=0b10)||(n3lsr6!=0b10)thenraiseMalFormed;Some(Uchar.of_int(((n1land0x0f)lsl12)lor((n2land0x3f)lsl6)lor(n3land0x3f)))|'\240'..'\247'asc->letn1=Char.codecinGen.nexts>>=func2->letn2=Char.codec2inGen.nexts>>=func3->letn3=Char.codec3inGen.nexts>>=func4->letn4=Char.codec4inif(n2lsr6!=0b10)||(n3lsr6!=0b10)||(n4lsr6!=0b10)thenraiseMalFormed;Some(Uchar.of_int(((n1land0x07)lsl18)lor((n2land0x3f)lsl12)lor((n3land0x3f)lsl6)lor(n4land0x3f)))|_->raiseMalFormedletcompute_lensposbytes=letrecauxni=ifi>=pos+bytesthenifi=pos+bytesthennelseraiseMalFormedelseletw=width.(Char.codes.[i])inifw>0thenaux(succn)(i+w)elseraiseMalFormedinaux0posletrecblit_to_intssposaaposn=ifn>0thenbegina.(apos)<-nextsspos;blit_to_ints(spos+width.(Char.codes.[spos]))a(succapos)(predn)endletto_int_arraysposbytes=letn=compute_lensposbytesinleta=Array.maken0inblit_to_intsposa0n;a(**************************)letstorebp=ifp<=0x7fthenBuffer.add_charb(Char.chrp)elseifp<=0x7ffthen(Buffer.add_charb(Char.chr(0xc0lor(plsr6)));Buffer.add_charb(Char.chr(0x80lor(pland0x3f))))elseifp<=0xffffthen(if(p>=0xd800&&p<0xe000)thenraiseMalFormed;Buffer.add_charb(Char.chr(0xe0lor(plsr12)));Buffer.add_charb(Char.chr(0x80lor((plsr6)land0x3f)));Buffer.add_charb(Char.chr(0x80lor(pland0x3f))))elseifp<=0x10ffffthen(Buffer.add_charb(Char.chr(0xf0lor(plsr18)));Buffer.add_charb(Char.chr(0x80lor((plsr12)land0x3f)));Buffer.add_charb(Char.chr(0x80lor((plsr6)land0x3f)));Buffer.add_charb(Char.chr(0x80lor(pland0x3f))))elseraiseMalFormedletfrom_uchar_arrayaaposlen=letb=Buffer.create(len*4)inletrecauxaposlen=iflen>0then(storeb(Uchar.to_inta.(apos));aux(succapos)(predlen))elseBuffer.contentsbinauxaposlenletgen_from_char_gens=(fun()->from_gens)endletfrom_channelic=from_gen(Helper.gen_from_char_gen(gen_of_channelic))letfrom_gens=create(fill_buf_from_gen(funid->id)(Helper.gen_from_char_gens))letfrom_streams=from_gen@@gen_of_streamsletfrom_strings=from_int_array(Helper.to_int_arrays0(String.lengths))letsub_lexemelexbufposlen=Helper.from_uchar_arraylexbuf.buf(lexbuf.start_pos+pos)lenletlexemelexbuf=sub_lexemelexbuf0(lexbuf.pos-lexbuf.start_pos)endmoduleUtf16=structtypebyte_order=Little_endian|Big_endianmoduleHelper=struct(* http://www.ietf.org/rfc/rfc2781.txt *)letnumber_of_char_pairboc1c2=matchbowith|Little_endian->((Char.codec2)lsl8)+(Char.codec1)|Big_endian->((Char.codec1)lsl8)+(Char.codec2)letchar_pair_of_numberbonum=matchbowith|Little_endian->(Char.chr(numland0xFF),Char.chr((numlsr8)land0xFF))|Big_endian->(Char.chr((numlsr8)land0xFF),Char.chr(numland0xFF))letnext_in_genbos=Gen.nexts>>=func1->Gen.nexts>>=func2->Some(number_of_char_pairboc1c2)letfrom_genbosw1=ifw1=0xfffethenraise(InvalidCodepointw1);ifw1<0xd800||0xdfff<w1thenSome(Uchar.of_intw1)elseifw1<=0xdbffthennext_in_genbos>>=funw2->ifw2<0xdc00||w2>0xdfffthenraiseMalFormed;letupper10=(w1land0x3ff)lsl10andlower10=w2land0x3ffinSome(Uchar.of_int(0x10000+upper10+lower10))elseraiseMalFormedletgen_from_char_genopt_bos=letbo=refopt_boinfun()->Gen.nexts>>=func1->Gen.nexts>>=func2->leto=match!bowith|Someo->o|None->leto=match(Char.codec1,Char.codec2)with|(0xff,0xfe)->Little_endian|_->Big_endianinbo:=Someo;oinfrom_genos(number_of_char_pairoc1c2)letcompute_lenopt_bostrposbytes=lets=gen_from_char_genopt_bo(Gen.init~limit:(bytes-pos)(funi->(str.[i+pos])))inletl=ref0inGen.iter(fun_->incrl)s;!lletblit_to_intopt_bossposaaposbytes=lets=gen_from_char_genopt_bo(Gen.init~limit:(bytes-spos)(funi->(s.[i+spos])))inletp=refaposinGen.iter(funx->a.(!p)<-x;incrp)sletto_uchar_arrayopt_bosposbytes=letlen=compute_lenopt_bosposbytesinleta=Array.makelen(Uchar.of_int0)inblit_to_intopt_bosposa0bytes;aletstorebobufcode=ifcode<0x10000then(let(c1,c2)=char_pair_of_numberbocodeinBuffer.add_charbufc1;Buffer.add_charbufc2)else(letu'=code-0x10000inletw1=0xd800+(u'lsr10)andw2=0xdc00+(u'land0x3ff)inlet(c1,c2)=char_pair_of_numberbow1and(c3,c4)=char_pair_of_numberbow2inBuffer.add_charbufc1;Buffer.add_charbufc2;Buffer.add_charbufc3;Buffer.add_charbufc4)letfrom_uchar_arrayboaaposlenbom=letb=Buffer.create(len*4+2)in(* +2 for the BOM *)ifbomthenstorebob0xfeff;(* first, store the BOM *)letrecauxaposlen=iflen>0then(storebob(Uchar.to_inta.(apos));aux(succapos)(predlen))elseBuffer.contentsbinauxaposlenendletfrom_gensopt_bo=from_gen(Helper.gen_from_char_genopt_bos)letfrom_streams=from_gen@@gen_of_streamsletfrom_channelicopt_bo=from_gen(gen_of_channelic)opt_boletfrom_stringsopt_bo=leta=Helper.to_uchar_arrayopt_bos0(String.lengths)infrom_uchar_arrayaletsub_lexemelbposlenbobom=Helper.from_uchar_arraybolb.buf(lb.start_pos+pos)lenbomletlexemelbbobom=sub_lexemelb0(lb.pos-lb.start_pos)bobomend