(*---------------------------------------------------------------------------
Copyright (c) 2012 The uutf programmers. All rights reserved.
Distributed under the ISC license, see terms at the end of the file.
---------------------------------------------------------------------------*)

(* Size, in bytes, of the buffers allocated for channel sources and for
   buffer or channel destinations (IO_BUFFER_SIZE 4.0.0). *)
let io_buffer_size = 65536

(* Shorthand for formatted printing on a formatter. *)
let pp ppf fmt = Format.fprintf ppf fmt

(* Raised when an encoder that returned [`Partial] is given something other
   than [`Await]. *)
let invalid_encode () = raise (Invalid_argument "expected `Await encode")

(* Raised when an (index, length) pair does not designate a valid range. *)
let invalid_bounds j l =
  Printf.ksprintf invalid_arg "invalid bounds (index %d, length %d)" j l

(* Unsafe string byte manipulations. If you don't believe the author's
invariants, replacing with safe versions makes everything safe in
the module. He won't be upset. *)

let unsafe_chr = Char.unsafe_chr
let unsafe_blit = Bytes.unsafe_blit
let unsafe_array_get = Array.unsafe_get

(* [unsafe_byte s j] is the integer value of byte [j] of [s], without any
   bounds check. *)
let unsafe_byte s j = Char.code (Bytes.unsafe_get s j)

(* [unsafe_set_byte s j byte] writes [byte] at index [j] of [s], without any
   bounds or range check. *)
let unsafe_set_byte s j byte = Bytes.unsafe_set s j (Char.unsafe_chr byte)

(* Unicode characters *)

let u_bom = Uchar.unsafe_of_int 0xFEFF                                (* BOM. *)
let u_rep = Uchar.unsafe_of_int 0xFFFD              (* replacement character. *)

(* Unicode encoding schemes *)

type encoding = [ `UTF_8 | `UTF_16 | `UTF_16BE | `UTF_16LE ]
type decoder_encoding = [ encoding | `US_ASCII | `ISO_8859_1 ]

(* [encoding_of_string s] maps a charset name to an encoding. The match is
   done on the ASCII-uppercased name, hence case insensitive. Returns [None]
   if the name is not one of those listed below. *)
let encoding_of_string s = match String.uppercase_ascii s with (* IANA names. *)
| "UTF-8" -> Some `UTF_8
| "UTF-16" -> Some `UTF_16
| "UTF-16LE" -> Some `UTF_16LE
| "UTF-16BE" -> Some `UTF_16BE
| "ANSI_X3.4-1968" | "ISO-IR-6" | "ANSI_X3.4-1986" | "ISO_646.IRV:1991"
| "ASCII" | "ISO646-US" | "US-ASCII" | "US" | "IBM367" | "CP367" | "CSASCII" ->
    Some `US_ASCII
| "ISO_8859-1:1987" | "ISO-IR-100" | "ISO_8859-1" | "ISO-8859-1" | "LATIN1"
| "L1" | "IBM819" | "CP819" | "CSISOLATIN1" ->
    Some `ISO_8859_1
| _ -> None

(* [encoding_to_string e] is the name used for [e]; each returned name is
   accepted by [encoding_of_string]. *)
let encoding_to_string = function
| `UTF_8 -> "UTF-8" | `UTF_16 -> "UTF-16" | `UTF_16BE -> "UTF-16BE"
| `UTF_16LE -> "UTF-16LE" | `US_ASCII -> "US-ASCII"
| `ISO_8859_1 -> "ISO-8859-1"

(* Base character decoders. They assume enough data. *)

(* [malformed s j l] wraps a copy of the [l] bytes of [s] starting at [j]
   in a [`Malformed] value. *)
let malformed s j l = `Malformed (Bytes.sub_string s j l)

(* [malformed_pair be hi s j l] is a [`Malformed] value made of the two bytes
   of the surrogate [hi] (most significant byte first if [be] is [true],
   last otherwise) followed by the [l] bytes of [s] starting at [j]. *)
let malformed_pair be hi s j l =    (* missing or half low surrogate at eoi. *)
  let bs1 = Bytes.(sub s j l) in
  let bs0 = Bytes.create 2 in
  let j0, j1 = if be then (0, 1) else (1, 0) in
  unsafe_set_byte bs0 j0 (hi lsr 8);
  unsafe_set_byte bs0 j1 (hi land 0xFF);
  `Malformed Bytes.(unsafe_to_string (cat bs0 bs1))

(* Reads one US-ASCII character at [j]; bytes above 127 are malformed. *)
let r_us_ascii s j =
  (* assert (0 <= j && j < String.length s); *)
  let b0 = unsafe_byte s j in
  if b0 <= 127 then `Uchar (Uchar.unsafe_of_int b0) else malformed s j 1

(* Reads one ISO-8859-1 character at [j]; the byte value is used directly as
   the scalar value, so this never yields [`Malformed]. *)
let r_iso_8859_1 s j =
  (* assert (0 <= j && j < String.length s); *)
  `Uchar (Uchar.unsafe_of_int @@ unsafe_byte s j)

(* Indexed by the value of the first byte of a sequence. [0] marks a byte
   that cannot start a sequence. *)
let utf_8_len = [| (* uchar byte length according to first UTF-8 byte. *)
  1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1;
  1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1;
  1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1;
  1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1;
  1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1;
  1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1;
  1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1;
  1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1;
  0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0;
  0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0;
  0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0;
  0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0;
  0; 0; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2;
  2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2;
  3; 3; 3; 3; 3; 3; 3; 3; 3; 3; 3; 3; 3; 3; 3; 3;
  4; 4; 4; 4; 4; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0 |]

(* [r_utf_8 s j l] decodes the [l]-byte UTF-8 sequence of [s] starting at
   [j]. [l] must be in 1..4 (any other value hits [assert false]). Returns
   [`Uchar] on success and [`Malformed] with the [l] bytes otherwise. The
   second byte is range-checked according to the first one (0xE0, 0xED,
   0xF0 and 0xF4 get a narrower range than the generic continuation check). *)
let r_utf_8 s j l =
  (* assert (0 <= j && 0 <= l && j + l <= String.length s); *)
  let uchar c = `Uchar (Uchar.unsafe_of_int c) in
  match l with
  | 1 -> uchar (unsafe_byte s j)
  | 2 ->
      let b0 = unsafe_byte s j in let b1 = unsafe_byte s (j + 1) in
      if b1 lsr 6 != 0b10 then malformed s j l else
      uchar (((b0 land 0x1F) lsl 6) lor (b1 land 0x3F))
  | 3 ->
      let b0 = unsafe_byte s j in let b1 = unsafe_byte s (j + 1) in
      let b2 = unsafe_byte s (j + 2) in
      let c = ((b0 land 0x0F) lsl 12) lor
              ((b1 land 0x3F) lsl 6) lor
              (b2 land 0x3F)
      in
      if b2 lsr 6 != 0b10 then malformed s j l else
      begin match b0 with
      | 0xE0 -> if b1 < 0xA0 || 0xBF < b1 then malformed s j l else uchar c
      | 0xED -> if b1 < 0x80 || 0x9F < b1 then malformed s j l else uchar c
      | _ -> if b1 lsr 6 != 0b10 then malformed s j l else uchar c
      end
  | 4 ->
      let b0 = unsafe_byte s j in let b1 = unsafe_byte s (j + 1) in
      let b2 = unsafe_byte s (j + 2) in let b3 = unsafe_byte s (j + 3) in
      let c = (((b0 land 0x07) lsl 18) lor
               ((b1 land 0x3F) lsl 12) lor
               ((b2 land 0x3F) lsl 6) lor
               (b3 land 0x3F))
      in
      if b3 lsr 6 != 0b10 || b2 lsr 6 != 0b10 then malformed s j l else
      begin match b0 with
      | 0xF0 -> if b1 < 0x90 || 0xBF < b1 then malformed s j l else uchar c
      | 0xF4 -> if b1 < 0x80 || 0x8F < b1 then malformed s j l else uchar c
      | _ -> if b1 lsr 6 != 0b10 then malformed s j l else uchar c
      end
  | _ -> assert false

(* [r_utf_16 s j0 j1] reads a 16-bit code unit whose most significant byte is
   at [j0] and least significant byte at [j1]. Returns [`Uchar] for a
   non-surrogate, [`Hi u] for a high surrogate and [`Malformed] (2 bytes)
   for a lone low surrogate. *)
let r_utf_16 s j0 j1 =                       (* May return a high surrogate. *)
  (* assert (0 <= j0 && 0 <= j1 && max j0 j1 < String.length s); *)
  let b0 = unsafe_byte s j0 in let b1 = unsafe_byte s j1 in
  let u = (b0 lsl 8) lor b1 in
  if u < 0xD800 || u > 0xDFFF then `Uchar (Uchar.unsafe_of_int u) else
  if u > 0xDBFF then malformed s (min j0 j1) 2 else `Hi u

(* [r_utf_16_lo hi s j0 j1] reads the code unit at ([j0], [j1]) as above and,
   if it is a low surrogate, combines it with [hi]. Otherwise the result is
   [`Malformed] with the bytes of [hi] followed by the two bytes read. *)
let r_utf_16_lo hi s j0 j1 =          (* Combines [hi] with a low surrogate. *)
  (* assert (0 <= j0 && 0 <= j1 && max j0 j1 < String.length s); *)
  let b0 = unsafe_byte s j0 in let b1 = unsafe_byte s j1 in
  let lo = (b0 lsl 8) lor b1 in
  if lo < 0xDC00 || lo > 0xDFFF
  then malformed_pair (j0 < j1 (* true => be *)) hi s (min j0 j1) 2
  else `Uchar (Uchar.unsafe_of_int ((((hi land 0x3FF) lsl 10) lor
                                     (lo land 0x3FF)) + 0x10000))

(* [r_encoding s j l] inspects at most the first three of the [l] bytes of
   [s] starting at [j]. The result tags the guessed encoding with what was
   recognized: a [`BOM], an [`ASCII p] character paired with a zero byte,
   [`Decode] (nothing consumed, bytes must be decoded) or [`End] (no byte).
   Bytes at index [>= l] are never read. *)
let r_encoding s j l =                  (* guess encoding with max. 3 bytes. *)
  (* assert (0 <= j && 0 <= l && j + l <= String.length s) *)
  let some i = if i < l then Some (unsafe_byte s (j + i)) else None in
  match (some 0), (some 1), (some 2) with
  | Some 0xEF, Some 0xBB, Some 0xBF                 -> `UTF_8 `BOM
  | Some 0xFE, Some 0xFF, _                         -> `UTF_16BE `BOM
  | Some 0xFF, Some 0xFE, _                         -> `UTF_16LE `BOM
  | Some 0x00, Some    p, _ when p > 0              -> `UTF_16BE (`ASCII p)
  | Some    p, Some 0x00, _ when p > 0              -> `UTF_16LE (`ASCII p)
  | Some    u,         _, _ when utf_8_len.(u) <> 0 -> `UTF_8 `Decode
  | Some    _, Some    _, _                         -> `UTF_16BE `Decode
  | Some    _, None     , None                      -> `UTF_8 `Decode
  | None     , None     , None                      -> `UTF_8 `End
  | None     , Some    _, _                         -> assert false
  | Some    _, None     , Some _                    -> assert false
  | None     , None     , Some _                    -> assert false

(* Decode *)

type src = [ `Channel of in_channel | `String of string | `Manual ]
type nln = [ `ASCII of Uchar.t | `NLF of Uchar.t | `Readline of Uchar.t ]
type decode = [ `Await | `End | `Malformed of string | `Uchar of Uchar.t]

(* Prints a decode result; the bytes of a [`Malformed] are printed as
   space separated, two digit, uppercase hexadecimal numbers. *)
let pp_decode ppf = function
| `Uchar u -> pp ppf "@[`Uchar U+%04X@]" (Uchar.to_int u)
| `End -> pp ppf "`End"
| `Await -> pp ppf "`Await"
| `Malformed bs ->
    let l = String.length bs in
    pp ppf "@[`Malformed (";
    if l > 0 then pp ppf "%02X" (Char.code (bs.[0]));
    for i = 1 to l - 1 do pp ppf " %02X" (Char.code (bs.[i])) done;
    pp ppf ")@]"

type decoder =
  { src : src;                                              (* input source. *)
    mutable encoding : decoder_encoding;                (* decoded encoding. *)
    nln : nln option;                     (* newline normalization (if any). *)
    nl : Uchar.t;                        (* newline normalization character. *)
    mutable i : Bytes.t;                             (* current input chunk. *)
    mutable i_pos : int;                          (* input current position. *)
    mutable i_max : int;                          (* input maximal position. *)
    t : Bytes.t;        (* four bytes temporary buffer for overlapping reads. *)
    mutable t_len : int;                      (* current byte length of [t]. *)
    mutable t_need : int;                  (* number of bytes needed in [t]. *)
    mutable removed_bom : bool;     (* [true] if an initial BOM was removed. *)
    mutable last_cr : bool;                   (* [true] if last char was CR. *)
    mutable line : int;                                      (* line number. *)
    mutable col : int;                                     (* column number. *)
    mutable byte_count : int;                                 (* byte count. *)
    mutable count : int;                                      (* char count. *)
    mutable pp :        (* decoder post-processor for BOM, position and nln. *)
      decoder -> [ `Malformed of string | `Uchar of Uchar.t ] -> decode;
    mutable k : decoder -> decode }                 (* decoder continuation. *)

(* On decodes that overlap two (or more) [d.i] buffers, we use [t_fill] to copy
the input data to [d.t] and decode from there. If the [d.i] buffers are not
too small this is faster than continuation based byte per byte writes.
End of input (eoi) is signalled by [d.i_pos = 0] and [d.i_max = min_int]
which implies that [i_rem d < 0] is [true]. *)

let i_rem d = d.i_max - d.i_pos + 1     (* remaining bytes to read in [d.i]. *)
let eoi d = d.i <- Bytes.empty; d.i_pos <- 0; d.i_max <- min_int (* set eoi in [d]. *)

(* [src d s j l] makes the [l] bytes of [s] starting at [j] the input chunk
   of [d]. A zero [l] signals end of input. Raises [Invalid_argument] if
   [j] and [l] do not designate a valid range of [s]. *)
let src d s j l =                                     (* set [d.i] with [s]. *)
  if (j < 0 || l < 0 || j + l > Bytes.length s) then invalid_bounds j l else
  if (l = 0) then eoi d else
  (d.i <- s; d.i_pos <- j; d.i_max <- j + l - 1)

(* For a [`Manual] source the continuation is stored in [d.k] and [`Await]
   is returned. A [`String] source has no more data once its single chunk
   is consumed, so end of input is set. A [`Channel] source is read into
   [d.i]; a zero byte read sets end of input through [src]. *)
let refill k d = match d.src with  (* get new input in [d.i] and [k]ontinue. *)
| `Manual -> d.k <- k; `Await
| `String _ -> eoi d; k d
| `Channel ic ->
    let rc = input ic d.i 0 (Bytes.length d.i) in
    (src d d.i 0 rc; k d)

(* Resets the temporary buffer and records that [need] bytes are wanted. *)
let t_need d need = d.t_len <- 0; d.t_need <- need

(* Copies input to [d.t] until it holds [d.t_need] bytes, refilling [d.i] as
   many times as needed. On end of input [k] is called with
   [d.t_len < d.t_need]. *)
let rec t_fill k d =      (* get [d.t_need] bytes (or less if eoi) in [d.t]. *)
  let blit d l =
    unsafe_blit d.i d.i_pos d.t d.t_len (* write pos. *) l;
    d.i_pos <- d.i_pos + l; d.t_len <- d.t_len + l;
  in
  let rem = i_rem d in
  if rem < 0 (* eoi *) then k d else
  let need = d.t_need - d.t_len in
  if rem < need then (blit d rem; refill (t_fill k) d) else (blit d need; k d)

(* [ret k v byte_count d] sets [k] as the next continuation, accounts for
   [byte_count] consumed bytes and hands [v] to the post-processor [d.pp]. *)
let ret k v byte_count d =                    (* return post-processed [v]. *)
  d.k <- k; d.byte_count <- d.byte_count + byte_count; d.pp d v

(* Decoders. *)

(* In the decoders below [rem < 0] means end of input and [rem = 0] means
   the current chunk is exhausted and must be refilled. *)

let rec decode_us_ascii d =
  let rem = i_rem d in
  if rem <= 0 then (if rem < 0 then `End else refill decode_us_ascii d) else
  let j = d.i_pos in
  d.i_pos <- d.i_pos + 1; ret decode_us_ascii (r_us_ascii d.i j) 1 d

let rec decode_iso_8859_1 d =
  let rem = i_rem d in
  if rem <= 0 then (if rem < 0 then `End else refill decode_iso_8859_1 d) else
  let j = d.i_pos in
  d.i_pos <- d.i_pos + 1; ret decode_iso_8859_1 (r_iso_8859_1 d.i j) 1 d

(* UTF-8 decoder *)

(* A short [d.t] (end of input hit while filling) is reported as malformed
   with the bytes gathered so far. *)
let rec t_decode_utf_8 d =                             (* decode from [d.t]. *)
  if d.t_len < d.t_need
  then ret decode_utf_8 (malformed d.t 0 d.t_len) d.t_len d
  else ret decode_utf_8 (r_utf_8 d.t 0 d.t_len) d.t_len d

(* The sequence length is looked up from the first byte. If the sequence
   straddles the end of the chunk it is gathered in [d.t]; a first byte with
   length [0] is reported alone as malformed. *)
and decode_utf_8 d =
  let rem = i_rem d in
  if rem <= 0 then (if rem < 0 then `End else refill decode_utf_8 d) else
  let need = unsafe_array_get utf_8_len (unsafe_byte d.i d.i_pos) in
  if rem < need then (t_need d need; t_fill t_decode_utf_8 d) else
  let j = d.i_pos in
  if need = 0
  then (d.i_pos <- d.i_pos + 1; ret decode_utf_8 (malformed d.i j 1) 1 d)
  else (d.i_pos <- d.i_pos + need; ret decode_utf_8 (r_utf_8 d.i j need) need d)

(* UTF-16BE decoder *)

(* Decodes the low surrogate expected after [hi] from [d.t]. The byte count
   includes the two bytes of [hi], which were not yet accounted for. *)
let rec t_decode_utf_16be_lo hi d =                    (* decode from [d.t]. *)
  let bcount = d.t_len + 2 (* hi count *) in
  if d.t_len < d.t_need
  then ret decode_utf_16be (malformed_pair true hi d.t 0 d.t_len) bcount d
  else ret decode_utf_16be (r_utf_16_lo hi d.t 0 1) bcount d

and t_decode_utf_16be d =                              (* decode from [d.t]. *)
  if d.t_len < d.t_need
  then ret decode_utf_16be (malformed d.t 0 d.t_len) d.t_len d
  else decode_utf_16be_lo (r_utf_16 d.t 0 1) d

(* Returns [v] directly unless it is a high surrogate, in which case the
   following code unit is read (from [d.i] or, if it straddles chunks, via
   [d.t]) and combined with it. *)
and decode_utf_16be_lo v d = match v with
| `Uchar _ | `Malformed _ as v -> ret decode_utf_16be v 2 d
| `Hi hi ->
    let rem = i_rem d in
    if rem < 2 then (t_need d 2; t_fill (t_decode_utf_16be_lo hi) d) else
    let j = d.i_pos in
    d.i_pos <- d.i_pos + 2;
    ret decode_utf_16be (r_utf_16_lo hi d.i j (j + 1)) 4 d

and decode_utf_16be d =
  let rem = i_rem d in
  if rem <= 0 then (if rem < 0 then `End else refill decode_utf_16be d) else
  if rem < 2 then (t_need d 2; t_fill t_decode_utf_16be d) else
  let j = d.i_pos in
  d.i_pos <- d.i_pos + 2; decode_utf_16be_lo (r_utf_16 d.i j (j + 1)) d

(* UTF-16LE decoder, same as UTF-16BE with byte swapped. *)

let rec t_decode_utf_16le_lo hi d =                    (* decode from [d.t]. *)
  let bcount = d.t_len + 2 (* hi count *) in
  if d.t_len < d.t_need
  then ret decode_utf_16le (malformed_pair false hi d.t 0 d.t_len) bcount d
  else ret decode_utf_16le (r_utf_16_lo hi d.t 1 0) bcount d

and t_decode_utf_16le d =                              (* decode from [d.t]. *)
  if d.t_len < d.t_need
  then ret decode_utf_16le (malformed d.t 0 d.t_len) d.t_len d
  else decode_utf_16le_lo (r_utf_16 d.t 1 0) d

and decode_utf_16le_lo v d = match v with
| `Uchar _ | `Malformed _ as v -> ret decode_utf_16le v 2 d
| `Hi hi ->
    let rem = i_rem d in
    if rem < 2 then (t_need d 2; t_fill (t_decode_utf_16le_lo hi) d) else
    let j = d.i_pos in
    d.i_pos <- d.i_pos + 2;
    ret decode_utf_16le (r_utf_16_lo hi d.i (j + 1) j) 4 d

and decode_utf_16le d =
  let rem = i_rem d in
  if rem <= 0 then (if rem < 0 then `End else refill decode_utf_16le d) else
  if rem < 2 then (t_need d 2; t_fill t_decode_utf_16le d) else
  let j = d.i_pos in
  d.i_pos <- d.i_pos + 2; decode_utf_16le_lo (r_utf_16 d.i (j + 1) j) d

(* Encoding guessing. The guess is simple but starting the decoder
after is tedious, uutf's decoders are not designed to put bytes
back in the stream. *)

(* On entry [d.t] holds the (up to three) bytes read for the guess and
   [d.t_len] their number. Since bytes cannot be put back, the bytes are
   decoded from [d.t] one character at a time: [b2] and [b3] are the
   continuations used to resume at the second and third byte of [d.t]. *)
let guessed_utf_8 d =                   (* start decoder after `UTF_8 guess. *)
  let b3 d =                                (* handles the third read byte. *)
    let b3 = unsafe_byte d.t 2 in
    match utf_8_len.(b3) with
    | 0 -> ret decode_utf_8 (malformed d.t 2 1) 1 d
    | n ->
        (* move the byte to the front of [d.t] and get the rest of the
           sequence (if any) from the input. *)
        d.t_need <- n; d.t_len <- 1; unsafe_set_byte d.t 0 b3;
        t_fill t_decode_utf_8 d
  in
  let b2 d =                                     (* handle second read byte. *)
    let b2 = unsafe_byte d.t 1 in
    let b3 = if d.t_len > 2 then b3 else decode_utf_8 (* decodes `End *) in
    match utf_8_len.(b2) with
    | 0 -> ret b3 (malformed d.t 1 1) 1 d
    | 1 -> ret b3 (r_utf_8 d.t 1 1) 1 d
    | n ->                         (* copy d.t.(1-2) to d.t.(0-1) and decode *)
        d.t_need <- n;
        unsafe_set_byte d.t 0 b2;
        if (d.t_len < 3) then d.t_len <- 1 else
        (d.t_len <- 2; unsafe_set_byte d.t 1 (unsafe_byte d.t 2); );
        t_fill t_decode_utf_8 d
  in
  let b1 = unsafe_byte d.t 0 in                   (* handle first read byte. *)
  let b2 = if d.t_len > 1 then b2 else decode_utf_8 (* decodes `End *) in
  match utf_8_len.(b1) with
  | 0 -> ret b2 (malformed d.t 0 1) 1 d
  | 1 -> ret b2 (r_utf_8 d.t 0 1) 1 d
  | 2 ->
      if d.t_len < 2 then ret decode_utf_8 (malformed d.t 0 1) 1 d else
      if d.t_len < 3 then ret decode_utf_8 (r_utf_8 d.t 0 2) 2 d else
      ret b3 (r_utf_8 d.t 0 2) 2 d
  | 3 ->
      if d.t_len < 3
      then ret decode_utf_8 (malformed d.t 0 d.t_len) d.t_len d
      else ret decode_utf_8 (r_utf_8 d.t 0 3) 3 d
  | 4 ->
      (* fewer than three bytes here means end of input was reached. *)
      if d.t_len < 3
      then ret decode_utf_8 (malformed d.t 0 d.t_len) d.t_len d
      else (d.t_need <- 4; t_fill t_decode_utf_8 d)
  | n -> assert false

(* [be] selects the big endian ([true]) or little endian ([false]) decoding
   functions and byte indices. [v] is the tag computed by [r_encoding]. The
   first two bytes of [d.t] are consumed here; [b3] resumes decoding with
   the third byte of [d.t], if there is one. *)
let guessed_utf_16 d be v =     (* start decoder after `UTF_16{BE,LE} guess. *)
  let decode_utf_16, t_decode_utf_16, t_decode_utf_16_lo, j0, j1 =
    if be then decode_utf_16be, t_decode_utf_16be, t_decode_utf_16be_lo, 0, 1
    else decode_utf_16le, t_decode_utf_16le, t_decode_utf_16le_lo, 1, 0
  in
  let b3 k d =
    if d.t_len < 3 then decode_utf_16 d (* decodes `End *) else
    begin (* copy d.t.(2) to d.t.(0) and decode. *)
      d.t_need <- 2; d.t_len <- 1;
      unsafe_set_byte d.t 0 (unsafe_byte d.t 2);
      t_fill k d
    end
  in
  match v with
  | `BOM -> ret (b3 t_decode_utf_16) (`Uchar u_bom) 2 d
  | `ASCII u -> ret (b3 t_decode_utf_16) (`Uchar (Uchar.unsafe_of_int u)) 2 d
  | `Decode ->
      match r_utf_16 d.t j0 j1 with
      | `Malformed _ | `Uchar _ as v -> ret (b3 t_decode_utf_16) v 2 d
      | `Hi hi ->
          if d.t_len < 3
          then ret decode_utf_16 (malformed_pair be hi Bytes.empty 0 0) d.t_len d
          else (b3 (t_decode_utf_16_lo hi)) d

(* Reads up to three bytes in [d.t], sets [d.encoding] and [d.k] according to
   the guess made by [r_encoding] and starts decoding. *)
let guess_encoding d =                  (* guess encoding and start decoder. *)
  let setup d = match r_encoding d.t 0 d.t_len with
  | `UTF_8 r ->
      d.encoding <- `UTF_8; d.k <- decode_utf_8;
      begin match r with
      | `BOM -> ret decode_utf_8 (`Uchar u_bom) 3 d
      | `Decode -> guessed_utf_8 d
      | `End -> `End
      end
  | `UTF_16BE r ->
      d.encoding <- `UTF_16BE; d.k <- decode_utf_16be; guessed_utf_16 d true r
  | `UTF_16LE r ->
      d.encoding <- `UTF_16LE; d.k <- decode_utf_16le; guessed_utf_16 d false r
  in
  (t_need d 3; t_fill setup d)

(* Character post-processors. Used for BOM handling, newline
normalization and position tracking. The [pp_remove_bom] is only
used for the first character to remove a possible initial BOM and
handle UTF-16 endianness recognition. *)

let nline d = d.col <- 0; d.line <- d.line + 1                   (* inlined. *)
let ncol d = d.col <- d.col + 1                                  (* inlined. *)
let ncount d = d.count <- d.count + 1                            (* inlined. *)
let cr d b = d.last_cr <- b                                      (* inlined. *)

(* Post-processor for the first decode only: whatever the outcome it installs
   [pp] as the post-processor for the following ones. An initial BOM is
   dropped (decoding continues with [d.k]) and, if [utf16] is [true], it also
   fixes the byte order: U+FEFF selects UTF-16BE and the byte swapped
   U+FFFE selects UTF-16LE. Anything else is handed to [pp]. *)
let pp_remove_bom utf16 pp d = function(* removes init. BOM, handles UTF-16. *)
| `Malformed _ as v -> d.removed_bom <- false; d.pp <- pp; d.pp d v
| `Uchar u as v ->
    match Uchar.to_int u with
    | 0xFEFF (* BOM *) ->
        if utf16 then (d.encoding <- `UTF_16BE; d.k <- decode_utf_16be);
        d.removed_bom <- true; d.pp <- pp; d.k d
    | 0xFFFE (* BOM reversed from decode_utf_16be *) when utf16 ->
        d.encoding <- `UTF_16LE; d.k <- decode_utf_16le;
        d.removed_bom <- true; d.pp <- pp; d.k d
    | _ ->
        d.removed_bom <- false; d.pp <- pp; d.pp d v

(* No normalization: characters are returned as is, only positions are
   tracked. An LF directly following a CR does not start a new line (the CR
   already did). *)
let pp_nln_none d = function
| `Malformed _ as v -> cr d false; ncount d; ncol d; v
| `Uchar u as v ->
    match Uchar.to_int u with
    | 0x000A (* LF *) ->
        let last_cr = d.last_cr in
        cr d false; ncount d; if last_cr then v else (nline d; v)
    | 0x000D (* CR *) -> cr d true; ncount d; nline d; v
    | (0x0085 | 0x000C | 0x2028 | 0x2029) (* NEL | FF | LS | PS *) ->
        cr d false; ncount d; nline d; v
    | _ ->
        cr d false; ncount d; ncol d; v

(* CR, LF, CR LF, NEL, FF, LS and PS are all returned as [d.nl]. An LF
   directly following a CR is skipped (not counted, decoding continues). *)
let pp_nln_readline d = function
| `Malformed _ as v -> cr d false; ncount d; ncol d; v
| `Uchar u as v ->
    match Uchar.to_int u with
    | 0x000A (* LF *) ->
        let last_cr = d.last_cr in
        cr d false; if last_cr then d.k d else (ncount d; nline d; `Uchar d.nl)
    | 0x000D (* CR *) -> cr d true; ncount d; nline d; `Uchar d.nl
    | (0x0085 | 0x000C | 0x2028 | 0x2029) (* NEL | FF | LS | PS *) ->
        cr d false; ncount d; nline d; `Uchar d.nl
    | _ ->
        cr d false; ncount d; ncol d; v

(* CR, LF, CR LF and NEL are returned as [d.nl]; FF, LS and PS start a new
   line but are returned unchanged. *)
let pp_nln_nlf d = function
| `Malformed _ as v -> cr d false; ncount d; ncol d; v
| `Uchar u as v ->
    match Uchar.to_int u with
    | 0x000A (* LF *) ->
        let last_cr = d.last_cr in
        cr d false; if last_cr then d.k d else (ncount d; nline d; `Uchar d.nl)
    | 0x000D (* CR *) -> cr d true; ncount d; nline d; `Uchar d.nl
    | 0x0085 (* NEL *) -> cr d false; ncount d; nline d; `Uchar d.nl
    | (0x000C | 0x2028 | 0x2029) (* FF | LS | PS *) ->
        cr d false; ncount d; nline d; v
    | _ ->
        cr d false; ncount d; ncol d; v

(* CR, LF and CR LF are returned as [d.nl]; NEL, FF, LS and PS start a new
   line but are returned unchanged. *)
let pp_nln_ascii d = function
| `Malformed _ as v -> cr d false; ncount d; ncol d; v
| `Uchar u as v ->
    match Uchar.to_int u with
    | 0x000A (* LF *) ->
        let last_cr = d.last_cr in
        cr d false; if last_cr then d.k d else (ncount d; nline d; `Uchar d.nl)
    | 0x000D (* CR *) -> cr d true; ncount d; nline d; `Uchar d.nl
    | (0x0085 | 0x000C | 0x2028 | 0x2029) (* NEL | FF | LS | PS *) ->
        cr d false; ncount d; nline d; v
    | _ ->
        cr d false; ncount d; ncol d; v

(* Maps an encoding to the decoding function that starts it. *)
let decode_fun = function
| `UTF_8 -> decode_utf_8
| `UTF_16 -> decode_utf_16be                        (* see [pp_remove_bom]. *)
| `UTF_16BE -> decode_utf_16be
| `UTF_16LE -> decode_utf_16le
| `US_ASCII -> decode_us_ascii
| `ISO_8859_1 -> decode_iso_8859_1

(* [decoder ?nln ?encoding src] is a decoder reading from [src]. Without
   [encoding] the decoder starts with [guess_encoding] (and a provisional
   [`UTF_8] encoding). Without [nln] no newline normalization is done. The
   first decode always goes through [pp_remove_bom]. Lines start at 1,
   columns and counts at 0. *)
let decoder ?nln ?encoding src =
  let pp, nl = match nln with
  | None -> pp_nln_none, Uchar.unsafe_of_int 0x000A (* not used. *)
  | Some (`ASCII nl) -> pp_nln_ascii, nl
  | Some (`NLF nl) -> pp_nln_nlf, nl
  | Some (`Readline nl) -> pp_nln_readline, nl
  in
  let encoding, k = match encoding with
  | None -> `UTF_8, guess_encoding
  | Some e -> (e :> decoder_encoding), decode_fun e
  in
  let i, i_pos, i_max = match src with
  | `Manual -> Bytes.empty, 1, 0                   (* implies src_rem d = 0. *)
  | `Channel _ -> Bytes.create io_buffer_size, 1, 0                 (* idem. *)
  | `String s -> Bytes.unsafe_of_string s, 0, String.length s - 1
  in
  { src = (src :> src); encoding; nln = (nln :> nln option); nl;
    i; i_pos; i_max; t = Bytes.create 4; t_len = 0; t_need = 0;
    removed_bom = false; last_cr = false; line = 1; col = 0;
    byte_count = 0; count = 0;
    pp = pp_remove_bom (encoding = `UTF_16) pp; k }

(* Runs the current continuation of the decoder. *)
let decode d = d.k d
let decoder_line d = d.line
let decoder_col d = d.col
let decoder_byte_count d = d.byte_count
let decoder_count d = d.count
let decoder_removed_bom d = d.removed_bom
let decoder_src d = d.src
let decoder_nln d = d.nln
let decoder_encoding d = d.encoding

(* Changes the decoded encoding; the next decode restarts with the decoding
   function of [e]. *)
let set_decoder_encoding d e =
  d.encoding <- (e :> decoder_encoding); d.k <- decode_fun e

(* Encode *)

type dst = [ `Channel of out_channel | `Buffer of Buffer.t | `Manual ]
type encode = [ `Await | `End | `Uchar of Uchar.t ]
type encoder =
  { dst : dst;                                        (* output destination. *)
    encoding : encoding;                                (* encoded encoding. *)
    mutable o : Bytes.t;                            (* current output chunk. *)
    mutable o_pos : int;                   (* next output position to write. *)
    mutable o_max : int;                (* maximal output position to write. *)
    t : Bytes.t;                 (* four bytes buffer for overlapping writes. *)
    mutable t_pos : int;                    (* next position to read in [t]. *)
    mutable t_max : int;                 (* maximal position to read in [t]. *)
    mutable k :                                     (* encoder continuation. *)
      encoder -> encode -> [ `Ok | `Partial ] }

(* On encodes that overlap two (or more) [e.o] buffers, we encode the
character to the temporary buffer [o.t] and continue with
[tmp_flush] to write this data on the different [e.o] buffers. If
the [e.o] buffers are not too small this is faster than
continuation based byte per byte writes. *)

let o_rem e = e.o_max - e.o_pos + 1    (* remaining bytes to write in [e.o]. *)

(* [dst e s j l] makes the [l] bytes of [s] starting at [j] the output chunk
   of [e]. Raises [Invalid_argument] if [j] and [l] do not designate a valid
   range of [s]. *)
let dst e s j l =                                     (* set [e.o] with [s]. *)
  if (j < 0 || l < 0 || j + l > Bytes.length s) then invalid_bounds j l;
  e.o <- s; e.o_pos <- j; e.o_max <- j + l - 1

(* Continuation installed after a [`Partial]: only [`Await] is accepted, any
   other value raises [Invalid_argument]. *)
let partial k e = function `Await -> k e
| `Uchar _ | `End -> invalid_encode ()

(* For a [`Manual] destination the continuation is stored and [`Partial] is
   returned. For the other destinations the first [e.o_pos] bytes of [e.o]
   are written out and [e.o_pos] is reset to [0]. *)
let flush k e = match e.dst with  (* get free storage in [e.o] and [k]ontinue. *)
| `Manual -> e.k <- partial k; `Partial
| `Channel oc -> output oc e.o 0 e.o_pos; e.o_pos <- 0; k e
| `Buffer b ->
    let o = Bytes.unsafe_to_string e.o in
    Buffer.add_substring b o 0 e.o_pos; e.o_pos <- 0; k e

(* Declares that bytes [0] to [max] of [e.t] are to be flushed. *)
let t_range e max = e.t_pos <- 0; e.t_max <- max

(* Copies the pending bytes of [e.t] to [e.o], flushing [e.o] as many times
   as needed, then continues with [k]. *)
let rec t_flush k e =           (* flush [e.t] up to [e.t_max] in [e.o]. *)
  let blit e l =
    unsafe_blit e.t e.t_pos e.o e.o_pos l;
    e.o_pos <- e.o_pos + l; e.t_pos <- e.t_pos + l
  in
  let rem = o_rem e in
  let len = e.t_max - e.t_pos + 1 in
  if rem < len then (blit e rem; flush (t_flush k) e) else (blit e len; k e)

(* Encoders. *)

(* In the encoders below a multi-byte character is written directly in [e.o]
   if there is enough room. Otherwise it is written in [e.t] and the
   continuation is wrapped with [t_flush] so that the bytes get copied to
   the output across chunks. [`End] flushes the output. *)

let rec encode_utf_8 e v =
  let k e = e.k <- encode_utf_8; `Ok in
  match v with
  | `Await -> k e
  | `End -> flush k e
  | `Uchar u as v ->
      let u = Uchar.to_int u in
      let rem = o_rem e in
      if u <= 0x007F then
        (* single byte: no need for [e.t], flush and retry if there is no
           room. *)
        if rem < 1 then flush (fun e -> encode_utf_8 e v) e else
        (unsafe_set_byte e.o e.o_pos u; e.o_pos <- e.o_pos + 1; k e)
      else if u <= 0x07FF then
        begin
          let s, j, k =
            if rem < 2 then (t_range e 1; e.t, 0, t_flush k) else
            let j = e.o_pos in (e.o_pos <- e.o_pos + 2; e.o, j, k)
          in
          unsafe_set_byte s j (0xC0 lor (u lsr 6));
          unsafe_set_byte s (j + 1) (0x80 lor (u land 0x3F));
          k e
        end
      else if u <= 0xFFFF then
        begin
          let s, j, k =
            if rem < 3 then (t_range e 2; e.t, 0, t_flush k) else
            let j = e.o_pos in (e.o_pos <- e.o_pos + 3; e.o, j, k)
          in
          unsafe_set_byte s j (0xE0 lor (u lsr 12));
          unsafe_set_byte s (j + 1) (0x80 lor ((u lsr 6) land 0x3F));
          unsafe_set_byte s (j + 2) (0x80 lor (u land 0x3F));
          k e
        end
      else
        begin
          let s, j, k =
            if rem < 4 then (t_range e 3; e.t, 0, t_flush k) else
            let j = e.o_pos in (e.o_pos <- e.o_pos + 4; e.o, j, k)
          in
          unsafe_set_byte s j (0xF0 lor (u lsr 18));
          unsafe_set_byte s (j + 1) (0x80 lor ((u lsr 12) land 0x3F));
          unsafe_set_byte s (j + 2) (0x80 lor ((u lsr 6) land 0x3F));
          unsafe_set_byte s (j + 3) (0x80 lor (u land 0x3F));
          k e
        end

let rec encode_utf_16be e v =
  let k e = e.k <- encode_utf_16be; `Ok in
  match v with
  | `Await -> k e
  | `End -> flush k e
  | `Uchar u ->
      let u = Uchar.to_int u in
      let rem = o_rem e in
      if u < 0x10000 then
        begin
          let s, j, k =
            if rem < 2 then (t_range e 1; e.t, 0, t_flush k) else
            let j = e.o_pos in (e.o_pos <- e.o_pos + 2; e.o, j, k)
          in
          unsafe_set_byte s j (u lsr 8);
          unsafe_set_byte s (j + 1) (u land 0xFF);
          k e
        end
      else
        begin
          let s, j, k =
            if rem < 4 then (t_range e 3; e.t, 0, t_flush k) else
            let j = e.o_pos in (e.o_pos <- e.o_pos + 4; e.o, j, k)
          in
          (* surrogate pair. *)
          let u' = u - 0x10000 in
          let hi = (0xD800 lor (u' lsr 10)) in
          let lo = (0xDC00 lor (u' land 0x3FF)) in
          unsafe_set_byte s j (hi lsr 8);
          unsafe_set_byte s (j + 1) (hi land 0xFF);
          unsafe_set_byte s (j + 2) (lo lsr 8);
          unsafe_set_byte s (j + 3) (lo land 0xFF);
          k e
        end

let rec encode_utf_16le e v =         (* encode_utf_16be with bytes swapped. *)
  let k e = e.k <- encode_utf_16le; `Ok in
  match v with
  | `Await -> k e
  | `End -> flush k e
  | `Uchar u ->
      let u = Uchar.to_int u in
      let rem = o_rem e in
      if u < 0x10000 then
        begin
          let s, j, k =
            if rem < 2 then (t_range e 1; e.t, 0, t_flush k) else
            let j = e.o_pos in (e.o_pos <- e.o_pos + 2; e.o, j, k)
          in
          unsafe_set_byte s j (u land 0xFF);
          unsafe_set_byte s (j + 1) (u lsr 8);
          k e
        end
      else
        begin
          let s, j, k =
            if rem < 4 then (t_range e 3; e.t, 0, t_flush k) else
            let j = e.o_pos in (e.o_pos <- e.o_pos + 4; e.o, j, k)
          in
          (* surrogate pair. *)
          let u' = u - 0x10000 in
          let hi = (0xD800 lor (u' lsr 10)) in
          let lo = (0xDC00 lor (u' land 0x3FF)) in
          unsafe_set_byte s j (hi land 0xFF);
          unsafe_set_byte s (j + 1) (hi lsr 8);
          unsafe_set_byte s (j + 2) (lo land 0xFF);
          unsafe_set_byte s (j + 3) (lo lsr 8);
          k e
        end

(* Maps an encoding to its encoding function; [`UTF_16] is encoded as
   UTF-16BE. *)
let encode_fun = function
| `UTF_8 -> encode_utf_8
| `UTF_16 -> encode_utf_16be
| `UTF_16BE -> encode_utf_16be
| `UTF_16LE -> encode_utf_16le

(* [encoder encoding dst] is an encoder writing [encoding] on [dst]. Buffer
   and channel destinations get an internal chunk of [io_buffer_size] bytes;
   a manual destination starts with no room to write. *)
let encoder encoding dst =
  let o, o_pos, o_max = match dst with
  | `Manual -> Bytes.empty, 1, 0                     (* implies o_rem e = 0. *)
  | `Buffer _
  | `Channel _ -> Bytes.create io_buffer_size, 0, io_buffer_size - 1
  in
  { dst = (dst :> dst); encoding = (encoding :> encoding); o; o_pos; o_max;
    t = Bytes.create 4; t_pos = 1; t_max = 0; k = encode_fun encoding}

(* Runs the current continuation of the encoder on [v]. *)
let encode e v = e.k e (v :> encode)
let encoder_encoding e = e.encoding
let encoder_dst e = e.dst

(* Manual sources and destinations.
*)

(* [src] provides a decoder with its next input chunk, [dst] provides an
   encoder with its next output chunk and [dst_rem] is the number of bytes
   still free in the current output chunk. *)
module Manual = struct
  let src = src
  let dst = dst
  let dst_rem = o_rem
end

(* Strings folders and Buffer encoders *)

module String = struct

  (* [encoding_guess s] is the encoding guessed from at most the first three
     bytes of [s], paired with [true] if and only if the guess was made on a
     byte order mark.

     The length handed to [r_encoding] is capped with [min]: [r_encoding]
     reads bytes without bounds checks up to the length it is given, so
     passing more than [Bytes.length s] would read past the end of strings
     shorter than three bytes (the empty string included). *)
  let encoding_guess s =
    let s = Bytes.unsafe_of_string s in
    match r_encoding s 0 (min (Bytes.length s) 3) with
    | `UTF_8 d -> `UTF_8, (d = `BOM)
    | `UTF_16BE d -> `UTF_16BE, (d = `BOM)
    | `UTF_16LE d -> `UTF_16LE, (d = `BOM)

  (* A folder gets the accumulator, the byte index at which the decode
     starts and the result of the decode. *)
  type 'a folder =
    'a -> int -> [ `Uchar of Uchar.t | `Malformed of string ] -> 'a

  (* [fold_utf_8 ?pos ?len f acc s] folds [f] over the characters decoded in
     the [len] bytes of [s] starting at [pos]. [pos] defaults to [0] and
     [len] to [String.length s - pos]. A byte that cannot start a sequence is
     reported alone as [`Malformed]; a sequence truncated by the end of the
     range is reported as [`Malformed] with the remaining bytes and ends the
     fold.

     NOTE(review): [pos] and [len] are not validated and bytes are read
     without bounds checks; callers must pass a valid range. *)
  let fold_utf_8 ?(pos = 0) ?len f acc s =
    let rec loop acc f s i last =
      if i > last then acc else
      let need = unsafe_array_get utf_8_len (unsafe_byte s i) in
      if need = 0 then loop (f acc i (malformed s i 1)) f s (i + 1) last else
      let rem = last - i + 1 in
      if rem < need then f acc i (malformed s i rem) else
      loop (f acc i (r_utf_8 s i need)) f s (i + need) last
    in
    let len = match len with None -> String.length s - pos | Some l -> l in
    let last = pos + len - 1 in
    loop acc f (Bytes.unsafe_of_string s) pos last

  (* Same as [fold_utf_8] for UTF-16BE. A trailing single byte, or a high
     surrogate with fewer than two bytes after it, is reported as
     [`Malformed] and ends the fold.

     NOTE(review): [pos] and [len] are not validated, see [fold_utf_8]. *)
  let fold_utf_16be ?(pos = 0) ?len f acc s =
    let rec loop acc f s i last =
      if i > last then acc else
      let rem = last - i + 1 in
      if rem < 2 then f acc i (malformed s i 1) else
      match r_utf_16 s i (i + 1) with
      | `Uchar _ | `Malformed _ as v -> loop (f acc i v) f s (i + 2) last
      | `Hi hi ->
          if rem < 4 then f acc i (malformed s i rem) else
          loop (f acc i (r_utf_16_lo hi s (i + 2) (i + 3))) f s (i + 4) last
    in
    let len = match len with None -> String.length s - pos | Some l -> l in
    let last = pos + len - 1 in
    loop acc f (Bytes.unsafe_of_string s) pos last

  let fold_utf_16le ?(pos = 0) ?len f acc s = (* [fold_utf_16be], bytes swapped. *)
    let rec loop acc f s i last =
      if i > last then acc else
      let rem = last - i + 1 in
      if rem < 2 then f acc i (malformed s i 1) else
      match r_utf_16 s (i + 1) i with
      | `Uchar _ | `Malformed _ as v -> loop (f acc i v) f s (i + 2) last
      | `Hi hi ->
          if rem < 4 then f acc i (malformed s i rem) else
          loop (f acc i (r_utf_16_lo hi s (i + 3) (i + 2))) f s (i + 4) last
    in
    let len = match len with None -> String.length s - pos | Some l -> l in
    let last = pos + len - 1 in
    loop acc f (Bytes.unsafe_of_string s) pos last
end

module Buffer = struct

  (* [add_utf_8 b u] appends the UTF-8 encoding of [u] to [b]. *)
  let add_utf_8 b u =
    let u = Uchar.to_int u in
    let w byte = Buffer.add_char b (unsafe_chr byte) in          (* inlined. *)
    if u <= 0x007F then
    (w u)
    else if u <= 0x07FF then
    (w (0xC0 lor (u lsr 6));
     w (0x80 lor (u land 0x3F)))
    else if u <= 0xFFFF then
    (w (0xE0 lor (u lsr 12));
     w (0x80 lor ((u lsr 6) land 0x3F));
     w (0x80 lor (u land 0x3F)))
    else
    (w (0xF0 lor (u lsr 18));
     w (0x80 lor ((u lsr 12) land 0x3F));
     w (0x80 lor ((u lsr 6) land 0x3F));
     w (0x80 lor (u land 0x3F)))

  (* [add_utf_16be b u] appends the UTF-16BE encoding of [u] to [b], as a
     surrogate pair if [u] is above U+FFFF. *)
  let add_utf_16be b u =
    let u = Uchar.to_int u in
    let w byte = Buffer.add_char b (unsafe_chr byte) in          (* inlined. *)
    if u < 0x10000 then (w (u lsr 8); w (u land 0xFF)) else
    let u' = u - 0x10000 in
    let hi = (0xD800 lor (u' lsr 10)) in
    let lo = (0xDC00 lor (u' land 0x3FF)) in
    w (hi lsr 8); w (hi land 0xFF);
    w (lo lsr 8); w (lo land 0xFF)

  let add_utf_16le b u =                            (* swapped add_utf_16be. *)
    let u = Uchar.to_int u in
    let w byte = Buffer.add_char b (unsafe_chr byte) in          (* inlined. *)
    if u < 0x10000 then (w (u land 0xFF); w (u lsr 8)) else
    let u' = u - 0x10000 in
    let hi = (0xD800 lor (u' lsr 10)) in
    let lo = (0xDC00 lor (u' land 0x3FF)) in
    w (hi land 0xFF); w (hi lsr 8);
    w (lo land 0xFF); w (lo lsr 8)
end

(*---------------------------------------------------------------------------
Copyright (c) 2012 The uutf programmers
Permission to use, copy, modify, and/or distribute this software for any
purpose with or without fee is hereby granted, provided that the above
copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
---------------------------------------------------------------------------*)