123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659(* The package sedlex is released under the terms of an MIT-like license. *)(* See the attached LICENSE file. *)(* Copyright 2005, 2013 by Alain Frisch and LexiFi. *)exceptionInvalidCodepointofintexceptionMalFormedmoduleUchar=struct(* This for compatibility with ocaml < 4.14.0 *)letutf_8_byte_lengthu=matchUchar.to_intuwith|uwhenu<0->assertfalse|uwhenu<=0x007F->1|uwhenu<=0x07FF->2|uwhenu<=0xFFFF->3|uwhenu<=0x10FFFF->4|_->assertfalseletutf_16_byte_lengthu=matchUchar.to_intuwith|uwhenu<0->assertfalse|uwhenu<=0xFFFF->2|uwhenu<=0x10FFFF->4|_->assertfalselet()=ignoreutf_8_byte_length;ignoreutf_16_byte_lengthincludeUcharletof_intx=ifUchar.is_validxthenUchar.unsafe_of_intxelseraiseMalFormedend(* shadow polymorphic equal *)let(=)(a:int)b=a=blet(>>|)of=matchowithSomex->Some(fx)|None->None(* Absolute position from the beginning of the stream *)typeapos=inttypelexbuf={refill:Uchar.tarray->int->int->int;bytes_per_char:Uchar.t->int;mutablebuf:Uchar.tarray;mutablelen:int;(* Number of meaningful uchar in buffer *)mutableoffset:apos;(* Number of meaningful bytes in buffer *)mutablebytes_offset:apos;(* Position of the first uchar in buffer
in the input stream *)mutablepos:int;(* Position of the first byte in buffer
in the input stream *)mutablebytes_pos:int;(* Position of the beginning of the line in the buffer, in uchar *)mutablecurr_bol:int;(* Position of the beginning of the line in the buffer, in bytes *)mutablecurr_bytes_bol:int;(* Index of the current line in the input stream. *)mutablecurr_line:int;(* starting position, in uchar. *)mutablestart_pos:int;(* starting position, in bytes. *)mutablestart_bytes_pos:int;(* First uchar we need to keep visible *)mutablestart_bol:int;(* First byte we need to keep visible *)mutablestart_bytes_bol:int;(* start from 1 *)mutablestart_line:int;mutablemarked_pos:int;mutablemarked_bytes_pos:int;mutablemarked_bol:int;mutablemarked_bytes_bol:int;mutablemarked_line:int;mutablemarked_val:int;mutablefilename:string;mutablefinished:bool;}letchunk_size=512letempty_lexbufbytes_per_char={refill=(fun___->assertfalse);bytes_per_char;buf=[||];len=0;offset=0;bytes_offset=0;pos=0;bytes_pos=0;curr_bol=0;curr_bytes_bol=0;curr_line=1;start_pos=0;start_bytes_pos=0;start_bol=0;start_bytes_bol=0;start_line=0;marked_pos=0;marked_bytes_pos=0;marked_bol=0;marked_bytes_bol=0;marked_line=0;marked_val=0;filename="";finished=false;}letdummy_uchar=Uchar.of_int0letnl_uchar=Uchar.of_int10letcreate?(bytes_per_char=fun_->1)refill={(empty_lexbufbytes_per_char)withrefill;buf=Array.makechunk_sizedummy_uchar;}letset_position?bytes_positionlexbufposition=lexbuf.offset<-position.Lexing.pos_cnum-lexbuf.pos;lexbuf.curr_bol<-position.Lexing.pos_bol;lexbuf.curr_line<-position.Lexing.pos_lnum;letbytes_position=Option.value~default:positionbytes_positioninlexbuf.bytes_offset<-bytes_position.Lexing.pos_cnum-lexbuf.bytes_pos;lexbuf.curr_bytes_bol<-bytes_position.Lexing.pos_bolletset_filenamelexbuffname=lexbuf.filename<-fnameletfrom_gen?bytes_per_chargen=letmalformed=reffalseinletrefillbufposlen=letrecloopi=if!malformedthenraiseMalFormed;ifi>=lenthenlenelse(matchgen()with|Somec->buf.(pos+i)<-c;loop(i+1)|None->i|exceptionMalFormedwheni<>0->malformed:=true;i)inloop0increate?bytes_per_charrefillletfrom_int_array?bytes_per_chara=from_gen?bytes_per_char(Gen.init~limit:(Array.lengtha)(funi->Uchar.of_inta.(i)))letfrom_uchar_array?(bytes_per_char=fun_->1)a=letlen=Array.lengthain{(empty_lexbufbytes_per_char)withbuf=Array.initlen(funi->a.(i));len;finished=true;}letrefilllexbuf=iflexbuf.len+chunk_size>Array.lengthlexbuf.bufthenbeginlets=lexbuf.start_posinlets_bytes=lexbuf.start_bytes_posinletls=lexbuf.len-sinifls+chunk_size<=Array.lengthlexbuf.bufthenArray.blitlexbuf.bufslexbuf.buf0lselsebeginletnewlen=(Array.lengthlexbuf.buf+chunk_size)*2inletnewbuf=Array.makenewlendummy_ucharinArray.blitlexbuf.bufsnewbuf0ls;lexbuf.buf<-newbufend;lexbuf.len<-ls;lexbuf.offset<-lexbuf.offset+s;lexbuf.bytes_offset<-lexbuf.bytes_offset+s_bytes;lexbuf.pos<-lexbuf.pos-s;lexbuf.bytes_pos<-lexbuf.bytes_pos-s_bytes;lexbuf.marked_pos<-lexbuf.marked_pos-s;lexbuf.marked_bytes_pos<-lexbuf.marked_bytes_pos-s_bytes;lexbuf.start_pos<-0;lexbuf.start_bytes_pos<-0end;letn=lexbuf.refilllexbuf.buflexbuf.poschunk_sizeinifn=0thenlexbuf.finished<-trueelselexbuf.len<-lexbuf.len+nletnew_linelexbuf=lexbuf.curr_line<-lexbuf.curr_line+1;lexbuf.curr_bol<-lexbuf.pos+lexbuf.offset;lexbuf.curr_bytes_bol<-lexbuf.bytes_pos+lexbuf.bytes_offsetlet[@inlinealways]next_auxsomenonelexbuf=if(notlexbuf.finished)&&lexbuf.pos=lexbuf.lenthenrefilllexbuf;iflexbuf.finished&&lexbuf.pos=lexbuf.lenthennoneelsebeginletret=lexbuf.buf.(lexbuf.pos)inlexbuf.pos<-lexbuf.pos+1;lexbuf.bytes_pos<-lexbuf.bytes_pos+lexbuf.bytes_per_charret;ifUchar.equalretnl_ucharthennew_linelexbuf;someretendletnextlexbuf=(next_aux[@inlined])(funx->Somex)Nonelexbuflet__private__next_intlexbuf=(next_aux[@inlined])Uchar.to_int(-1)lexbufletmarklexbufi=lexbuf.marked_pos<-lexbuf.pos;lexbuf.marked_bytes_pos<-lexbuf.bytes_pos;lexbuf.marked_bol<-lexbuf.curr_bol;lexbuf.marked_bytes_bol<-lexbuf.curr_bytes_bol;lexbuf.marked_line<-lexbuf.curr_line;lexbuf.marked_val<-iletstartlexbuf=lexbuf.start_pos<-lexbuf.pos;lexbuf.start_bytes_pos<-lexbuf.bytes_pos;lexbuf.start_bol<-lexbuf.curr_bol;lexbuf.start_bytes_bol<-lexbuf.curr_bytes_bol;lexbuf.start_line<-lexbuf.curr_line;marklexbuf(-1)letbacktracklexbuf=lexbuf.pos<-lexbuf.marked_pos;lexbuf.bytes_pos<-lexbuf.marked_bytes_pos;lexbuf.curr_bol<-lexbuf.marked_bol;lexbuf.curr_bytes_bol<-lexbuf.marked_bytes_bol;lexbuf.curr_line<-lexbuf.marked_line;lexbuf.marked_valletrollbacklexbuf=lexbuf.pos<-lexbuf.start_pos;lexbuf.bytes_pos<-lexbuf.start_bytes_pos;lexbuf.curr_bol<-lexbuf.start_bol;lexbuf.curr_bytes_bol<-lexbuf.start_bytes_bol;lexbuf.curr_line<-lexbuf.start_lineletlexeme_startlexbuf=lexbuf.start_pos+lexbuf.offsetletlexeme_bytes_startlexbuf=lexbuf.start_bytes_pos+lexbuf.bytes_offsetletlexeme_endlexbuf=lexbuf.pos+lexbuf.offsetletlexeme_bytes_endlexbuf=lexbuf.bytes_pos+lexbuf.bytes_offsetletloclexbuf=(lexbuf.start_pos+lexbuf.offset,lexbuf.pos+lexbuf.offset)letbytes_loclexbuf=(lexbuf.start_bytes_pos+lexbuf.bytes_offset,lexbuf.bytes_pos+lexbuf.bytes_offset)letlexeme_lengthlexbuf=lexbuf.pos-lexbuf.start_posletlexeme_bytes_lengthlexbuf=lexbuf.bytes_pos-lexbuf.start_bytes_posletsub_lexemelexbufposlen=Array.sublexbuf.buf(lexbuf.start_pos+pos)lenletlexemelexbuf=Array.sublexbuf.buflexbuf.start_pos(lexbuf.pos-lexbuf.start_pos)letlexeme_charlexbufpos=lexbuf.buf.(lexbuf.start_pos+pos)letlexing_position_startlexbuf={Lexing.pos_fname=lexbuf.filename;pos_lnum=lexbuf.start_line;pos_cnum=lexbuf.start_pos+lexbuf.offset;pos_bol=lexbuf.start_bol;}letlexing_position_currlexbuf={Lexing.pos_fname=lexbuf.filename;pos_lnum=lexbuf.curr_line;pos_cnum=lexbuf.pos+lexbuf.offset;pos_bol=lexbuf.curr_bol;}letlexing_positionslexbuf=letstart_p=lexing_position_startlexbufandcurr_p=lexing_position_currlexbufin(start_p,curr_p)letlexing_bytes_position_startlexbuf={Lexing.pos_fname=lexbuf.filename;pos_lnum=lexbuf.start_line;pos_cnum=lexbuf.start_bytes_pos+lexbuf.bytes_offset;pos_bol=lexbuf.start_bytes_bol;}letlexing_bytes_position_currlexbuf={Lexing.pos_fname=lexbuf.filename;pos_lnum=lexbuf.curr_line;pos_cnum=lexbuf.bytes_pos+lexbuf.bytes_offset;pos_bol=lexbuf.curr_bytes_bol;}letlexing_bytes_positionslexbuf=letstart_p=lexing_bytes_position_startlexbufandcurr_p=lexing_bytes_position_currlexbufin(start_p,curr_p)letwith_tokenizerlexer'lexbuf=letlexer()=lettoken=lexer'lexbufinletstart_p,curr_p=lexing_positionslexbufin(token,start_p,curr_p)inlexermoduleChan=structexceptionMissing_inputtypet={b:Bytes.t;ic:in_channel;mutablelen:int;mutablepos:int;}letmin_buffer_size=64letcreateiclen:t=letlen=maxlenmin_buffer_sizein{b=Bytes.createlen;ic;len=0;pos=0}letavailable(t:t)=t.len-t.posletrecensure_bytes_available(t:t)~can_refilln=ifavailablet>=nthen()elseifcan_refillthen(letlen=t.len-t.posiniflen>0thenBytes.blitt.bt.post.b0len;letread=inputt.ict.blen(Bytes.lengtht.b-len)int.len<-len+read;t.pos<-0;ifread=0thenraiseMissing_inputelseensure_bytes_availablet~can_refilln)elseraiseMissing_inputletensure_bytes_availablet~can_refilln=(* [n] should not exceed the size of the buffer. Here we are
conservative and make sure it doesn't exceed the mininum size
for the buffer. *)ifn<=0||n>min_buffer_sizetheninvalid_arg"Sedlexing.Chan.ensure";ensure_bytes_availablet~can_refillnletget(t:t)i=Bytes.gett.b(t.pos+i)letadvance(t:t)n=ift.pos+n>t.lentheninvalid_arg"advance";t.pos<-t.pos+nletraw_buf(t:t)=t.bletraw_pos(t:t)=t.posendletmake_from_channel?bytes_per_charic~max_bytes_per_uchar~min_bytes_per_uchar~read_uchar=lett=Chan.createic(chunk_size*max_bytes_per_uchar)inletmalformed=reffalseinletrefillbufposlen=letrecloopi=if!malformedthenraiseMalFormed;ifi=lenthenielse(match(* we refill our bytes buffer only if we haven't refilled any uchar yet. *)letcan_refill=i=0inChan.ensure_bytes_availablet~can_refillmin_bytes_per_uchar;read_uchar~can_refilltwith|c->buf.(pos+i)<-c;loop(i+1)|exceptionMalFormedwheni<>0->malformed:=true;i|exceptionChan.Missing_input->ifi=0&&Chan.availablet>0thenraiseMalFormed;i)inloop0increate?bytes_per_charrefillmoduleLatin1=structletfrom_gens=from_gen~bytes_per_char:(fun_->1)(Gen.mapUchar.of_chars)letfrom_strings=letlen=String.lengthsin{(empty_lexbuf(fun_->1))withbuf=Array.initlen(funi->Uchar.of_chars.[i]);len;finished=true;}letfrom_channelic=make_from_channelic~bytes_per_char:(fun_->1)~min_bytes_per_uchar:1~max_bytes_per_uchar:1~read_uchar:(fun~can_refill:_t->letc=Chan.gett0inChan.advancet1;Uchar.of_charc)letto_latin1c=ifUchar.is_charcthenUchar.to_charcelseraise(InvalidCodepoint(Uchar.to_intc))letlexeme_charlexbufpos=to_latin1(lexeme_charlexbufpos)letsub_lexemelexbufposlen=lets=Bytes.createleninfori=0tolen-1doBytes.setsi(to_latin1lexbuf.buf.(lexbuf.start_pos+pos+i))done;Bytes.to_stringsletlexemelexbuf=sub_lexemelexbuf0(lexbuf.pos-lexbuf.start_pos)endmoduleUtf8=structmoduleHelper=struct(* http://www.faqs.org/rfcs/rfc3629.html *)letwidth=function|'\000'..'\127'->1|'\192'..'\223'->2|'\224'..'\239'->3|'\240'..'\247'->4|_->raiseMalFormed(* https://www.unicode.org/versions/corrigendum1.html *)letcheck_twon1n2=ifn1<0xc2||0xdf<n1thenraiseMalFormed;ifn2<0x80||0xbf<n2thenraiseMalFormed;ifn2lsr6!=0b10thenraiseMalFormed;((n1land0x1f)lsl6)lor(n2land0x3f)letcheck_threen1n2n3=ifn1=0xe0then(ifn2<0xa0||0xbf<n2thenraiseMalFormed;ifn3<0x80||0xbf<n3thenraiseMalFormed)else(ifn1<0xe1||0xef<n1thenraiseMalFormed;ifn2<0x80||0xbf<n2thenraiseMalFormed;ifn3<0x80||0xbf<n3thenraiseMalFormed);ifn2lsr6!=0b10||n3lsr6!=0b10thenraiseMalFormed;letp=((n1land0x0f)lsl12)lor((n2land0x3f)lsl6)lor(n3land0x3f)inifp>=0xd800&&p<=0xdf00thenraiseMalFormed;pletcheck_fourn1n2n3n4=ifn1=0xf0then(ifn2<0x90||0xbf<n2thenraiseMalFormed;ifn3<0x80||0xbf<n3thenraiseMalFormed;ifn4<0x80||0xbf<n4thenraiseMalFormed)elseifn1=0xf4then(ifn2<0x80||0x8f<n2thenraiseMalFormed;ifn3<0x80||0xbf<n3thenraiseMalFormed;ifn4<0x80||0xbf<n4thenraiseMalFormed)else(ifn1<0xf1||0xf3<n1thenraiseMalFormed;ifn2<0x80||0xbf<n2thenraiseMalFormed;ifn3<0x80||0xbf<n3thenraiseMalFormed;ifn4<0x80||0xbf<n4thenraiseMalFormed);ifn2lsr6!=0b10||n3lsr6!=0b10||n4lsr6!=0b10thenraiseMalFormed;((n1land0x07)lsl18)lor((n2land0x3f)lsl12)lor((n3land0x3f)lsl6)lor(n4land0x3f)letnextsi=letc1=s.[i]inmatchwidthc1with|1->Char.codec1|2->letn1=Char.codec1inletn2=Char.codes.[i+1]incheck_twon1n2|3->letn1=Char.codec1inletn2=Char.codes.[i+1]inletn3=Char.codes.[i+2]incheck_threen1n2n3|4->letn1=Char.codec1inletn2=Char.codes.[i+1]inletn3=Char.codes.[i+2]inletn4=Char.codes.[i+3]incheck_fourn1n2n3n4|_->assertfalseletgen_from_char_gens=letnext_or_fail()=matchGen.nextswithNone->raiseMalFormed|Somex->Char.codexinfun()->Gen.nexts>>|func1->matchwidthc1with|1->Uchar.of_charc1|2->letn1=Char.codec1inletn2=next_or_fail()inUchar.of_int(check_twon1n2)|3->letn1=Char.codec1inletn2=next_or_fail()inletn3=next_or_fail()inUchar.of_int(check_threen1n2n3)|4->letn1=Char.codec1inletn2=next_or_fail()inletn3=next_or_fail()inletn4=next_or_fail()inUchar.of_int(check_fourn1n2n3n4)|_->raiseMalFormed(**************************)letto_bufferaaposlenb=fori=apostoapos+len-1doBuffer.add_utf_8_ucharba.(i)doneendletfrom_channelic=make_from_channelic~bytes_per_char:Uchar.utf_8_byte_length~min_bytes_per_uchar:1~max_bytes_per_uchar:4~read_uchar:(fun~can_refillt->letw=Helper.width(Chan.gett0)inChan.ensure_bytes_availablet~can_refillw;letc=Helper.next(Bytes.unsafe_to_string(Chan.raw_buft))(Chan.raw_post)inChan.advancetw;Uchar.of_intc)letfrom_gens=from_gen~bytes_per_char:Uchar.utf_8_byte_length(Helper.gen_from_char_gens)letfrom_strings=from_gen(Gen.init~limit:(String.lengths)(funi->String.getsi))letsub_lexemelexbufposlen=letbuf=Buffer.create(len*4)inHelper.to_bufferlexbuf.buf(lexbuf.start_pos+pos)lenbuf;Buffer.contentsbufletlexemelexbuf=sub_lexemelexbuf0(lexbuf.pos-lexbuf.start_pos)endmoduleUtf16=structtypebyte_order=Little_endian|Big_endianmoduleHelper=struct(* http://www.ietf.org/rfc/rfc2781.txt *)letnumber_of_pairboc1c2=matchbowith|Little_endian->(c2lsl8)+c1|Big_endian->(c1lsl8)+c2letget_boboc1c2=match!bowith|Someo->o|None->leto=match(c1,c2)with|0xff,0xfe->Little_endian|_->Big_endianinbo:=Someo;oletgen_from_char_genopt_bos=letnext_or_fail()=matchGen.nextswithNone->raiseMalFormed|Somex->Char.codexinletbo=refopt_boinfun()->Gen.nexts>>|func1->letn1=Char.codec1inletn2=next_or_fail()inleto=get_bobon1n2inletw1=number_of_pairon1n2inifw1=0xfffethenraise(InvalidCodepointw1);ifw1<0xd800||0xdfff<w1thenUchar.of_intw1elseifw1<=0xdbffthen(letn3=next_or_fail()inletn4=next_or_fail()inletw2=number_of_pairon3n4inifw2<0xdc00||w2>0xdfffthenraiseMalFormed;letupper10=(w1land0x3ff)lsl10andlower10=w2land0x3ffinUchar.of_int(0x10000+upper10+lower10))elseraiseMalFormedletto_bufferboaaposlenbomb=letstore=matchbowith|Big_endian->Buffer.add_utf_16be_ucharb|Little_endian->Buffer.add_utf_16le_ucharbinifbomthenstore(Uchar.of_int0xfeff);(* first, store the BOM *)fori=apostoapos+len-1dostorea.(i)doneendletfrom_channelicopt_bo=letbo=refopt_boinmake_from_channelic~bytes_per_char:Uchar.utf_16_byte_length~min_bytes_per_uchar:2~max_bytes_per_uchar:4~read_uchar:(fun~can_refillt->letn1=Char.code(Chan.gett0)inletn2=Char.code(Chan.gett1)inleto=Helper.get_bobon1n2inletw1=Helper.number_of_pairon1n2inifw1=0xfffethenraise(InvalidCodepointw1);ifw1<0xd800||0xdfff<w1then(Chan.advancet2;Uchar.of_intw1)elseifw1<=0xdbffthen(Chan.ensure_bytes_availablet~can_refill4;letn3=Char.code(Chan.gett2)inletn4=Char.code(Chan.gett3)inletw2=Helper.number_of_pairon3n4inifw2<0xdc00||w2>0xdfffthenraiseMalFormed;letupper10=(w1land0x3ff)lsl10andlower10=w2land0x3ffinChan.advancet4;Uchar.of_int(0x10000+upper10+lower10))elseraiseMalFormed)letfrom_gensopt_bo=from_gen~bytes_per_char:Uchar.utf_16_byte_length(Helper.gen_from_char_genopt_bos)letfrom_strings=from_gen(Gen.init~limit:(String.lengths)(funi->String.getsi))letsub_lexemelbposlenbobom=letbuf=Buffer.create((len*4)+2)in(* +2 for the BOM *)Helper.to_bufferbolb.buf(lb.start_pos+pos)lenbombuf;Buffer.contentsbufletlexemelbbobom=sub_lexemelb0(lb.pos-lb.start_pos)bobomend