(*---------------------------------------------------------------------------
Copyright (c) 2012 The uutf programmers. All rights reserved.
SPDX-License-Identifier: ISC
---------------------------------------------------------------------------*)

(* Size in bytes of the chunks allocated for channel sources and for
   buffer and channel destinations (IO_BUFFER_SIZE 4.0.0). *)
let io_buffer_size = 65536

(* Shorthand for formatted printing on a formatter. *)
let pp = Format.fprintf

(* Raises [Invalid_argument]. Used when an encoder that is waiting for
   [`Await] is handed another value. *)
let invalid_encode () = invalid_arg "expected `Await encode"

(* Raises [Invalid_argument] with a message reporting the index [j] and
   the length [l]. *)
let invalid_bounds j l =
  let msg = Printf.sprintf "invalid bounds (index %d, length %d)" j l in
  invalid_arg msg

(* Unsafe string byte manipulations. If you don't believe the author's
invariants, replacing with safe versions makes everything safe in
the module. He won't be upset. *)

let unsafe_chr = Char.unsafe_chr
let unsafe_blit = Bytes.unsafe_blit
let unsafe_array_get = Array.unsafe_get

(* [unsafe_byte s j] is the integer value of byte [j] of [s], without
   bounds check. *)
let unsafe_byte s j = Char.code (Bytes.unsafe_get s j)

(* [unsafe_set_byte s j byte] stores [byte] at index [j] of [s], without
   bounds check and without range check on [byte]. *)
let unsafe_set_byte s j byte = Bytes.unsafe_set s j (Char.unsafe_chr byte)

(* Unicode characters *)

let u_bom = Uchar.unsafe_of_int 0xFEFF                                (* BOM. *)
let u_rep = Uchar.unsafe_of_int 0xFFFD              (* replacement character. *)

(* Unicode encoding schemes *)

type encoding = [ `UTF_8 | `UTF_16 | `UTF_16BE | `UTF_16LE ]
type decoder_encoding = [ encoding | `US_ASCII | `ISO_8859_1 ]

(* Maps an IANA character set name (matched case insensitively on ASCII
   letters) to the corresponding encoding, [None] if unknown. *)
let encoding_of_string s =
  match String.uppercase_ascii s with
  | "UTF-8" -> Some `UTF_8
  | "UTF-16" -> Some `UTF_16
  | "UTF-16LE" -> Some `UTF_16LE
  | "UTF-16BE" -> Some `UTF_16BE
  | "ANSI_X3.4-1968" | "ISO-IR-6" | "ANSI_X3.4-1986" | "ISO_646.IRV:1991"
  | "ASCII" | "ISO646-US" | "US-ASCII" | "US" | "IBM367" | "CP367"
  | "CSASCII" ->
      Some `US_ASCII
  | "ISO_8859-1:1987" | "ISO-IR-100" | "ISO_8859-1" | "ISO-8859-1"
  | "LATIN1" | "L1" | "IBM819" | "CP819" | "CSISOLATIN1" ->
      Some `ISO_8859_1
  | _ -> None

(* Canonical name of an encoding. *)
let encoding_to_string = function
  | `UTF_8 -> "UTF-8"
  | `UTF_16 -> "UTF-16"
  | `UTF_16BE -> "UTF-16BE"
  | `UTF_16LE -> "UTF-16LE"
  | `US_ASCII -> "US-ASCII"
  | `ISO_8859_1 -> "ISO-8859-1"

(* Base character decoders. They assume enough data. *)

(* [malformed s j l] wraps the [l] bytes of [s] starting at [j] as a
   malformed sequence. *)
let malformed s j l = `Malformed (Bytes.sub_string s j l)

(* [malformed_pair be hi s j l] is the malformed sequence made of the two
   bytes of the high surrogate [hi], serialized big endian iff [be], followed
   by the [l] bytes of [s] at [j] (missing or half low surrogate at eoi). *)
let malformed_pair be hi s j l =
  let hi_bytes = Bytes.create 2 in
  let msb_pos, lsb_pos = if be then (0, 1) else (1, 0) in
  unsafe_set_byte hi_bytes msb_pos (hi lsr 8);
  unsafe_set_byte hi_bytes lsb_pos (hi land 0xFF);
  let rest = Bytes.sub s j l in
  `Malformed (Bytes.unsafe_to_string (Bytes.cat hi_bytes rest))

(* Decodes the byte at [j] as US-ASCII; bytes above 127 are malformed.
   Assumes [0 <= j < Bytes.length s]. *)
let r_us_ascii s j =
  let b0 = unsafe_byte s j in
  if b0 > 127 then malformed s j 1 else `Uchar (Uchar.unsafe_of_int b0)

(* Decodes the byte at [j] as ISO-8859-1, every byte is a character.
   Assumes [0 <= j < Bytes.length s]. *)
let r_iso_8859_1 s j = `Uchar (Uchar.unsafe_of_int (unsafe_byte s j))

(* Byte length of a UTF-8 sequence according to its first byte, [0] if
   the byte cannot start a sequence:
   0x00-0x7F -> 1, 0x80-0xC1 -> 0, 0xC2-0xDF -> 2, 0xE0-0xEF -> 3,
   0xF0-0xF4 -> 4, 0xF5-0xFF -> 0. *)
let utf_8_len =
  Array.init 256 (fun b ->
      if b < 0x80 then 1
      else if b < 0xC2 then 0
      else if b < 0xE0 then 2
      else if b < 0xF0 then 3
      else if b < 0xF5 then 4
      else 0)

(* [r_utf_8 s j l] decodes the [l] bytes long UTF-8 sequence of [s] starting
   at [j], with [l] in the range 1-4 (any other value is an assertion
   failure). The result is [`Uchar] or [`Malformed] of those [l] bytes.
   Assumes [0 <= j], [0 <= l] and [j + l <= Bytes.length s]. *)
let r_utf_8 s j l =
  let is_cont b = b lsr 6 = 0b10 in
  let within lo hi b = lo <= b && b <= hi in
  let result ok c =
    if ok then `Uchar (Uchar.unsafe_of_int c) else malformed s j l
  in
  match l with
  | 1 -> result true (unsafe_byte s j)
  | 2 ->
      let b0 = unsafe_byte s j in
      let b1 = unsafe_byte s (j + 1) in
      result (is_cont b1) (((b0 land 0x1F) lsl 6) lor (b1 land 0x3F))
  | 3 ->
      let b0 = unsafe_byte s j in
      let b1 = unsafe_byte s (j + 1) in
      let b2 = unsafe_byte s (j + 2) in
      let c =
        ((b0 land 0x0F) lsl 12) lor ((b1 land 0x3F) lsl 6) lor (b2 land 0x3F)
      in
      (* The range allowed for the second byte depends on the first one. *)
      let second_ok =
        match b0 with
        | 0xE0 -> within 0xA0 0xBF b1
        | 0xED -> within 0x80 0x9F b1
        | _ -> is_cont b1
      in
      result (is_cont b2 && second_ok) c
  | 4 ->
      let b0 = unsafe_byte s j in
      let b1 = unsafe_byte s (j + 1) in
      let b2 = unsafe_byte s (j + 2) in
      let b3 = unsafe_byte s (j + 3) in
      let c =
        ((b0 land 0x07) lsl 18) lor ((b1 land 0x3F) lsl 12)
        lor ((b2 land 0x3F) lsl 6) lor (b3 land 0x3F)
      in
      let second_ok =
        match b0 with
        | 0xF0 -> within 0x90 0xBF b1
        | 0xF4 -> within 0x80 0x8F b1
        | _ -> is_cont b1
      in
      result (is_cont b3 && is_cont b2 && second_ok) c
  | _ -> assert false

(* [r_utf_16 s j0 j1] decodes the 16-bit unit whose most significant byte is
   at [j0] and least significant byte at [j1]. May return a high surrogate
   as [`Hi]; a lone low surrogate is malformed.
   Assumes both indices are valid in [s]. *)
let r_utf_16 s j0 j1 =
  let msb = unsafe_byte s j0 in
  let lsb = unsafe_byte s j1 in
  let u = (msb lsl 8) lor lsb in
  if u < 0xD800 || u > 0xDFFF then `Uchar (Uchar.unsafe_of_int u)
  else if u > 0xDBFF then malformed s (min j0 j1) 2
  else `Hi u

(* [r_utf_16_lo hi s j0 j1] combines the high surrogate [hi] with the unit
   read as in [r_utf_16]. If that unit is not a low surrogate the result is
   the malformed sequence made of both units.
   Assumes both indices are valid in [s]. *)
let r_utf_16_lo hi s j0 j1 =
  let msb = unsafe_byte s j0 in
  let lsb = unsafe_byte s j1 in
  let lo = (msb lsl 8) lor lsb in
  if lo < 0xDC00 || lo > 0xDFFF
  then malformed_pair (j0 < j1 (* true => be *)) hi s (min j0 j1) 2
  else
    let c = (((hi land 0x3FF) lsl 10) lor (lo land 0x3FF)) + 0x10000 in
    `Uchar (Uchar.unsafe_of_int c)

(* [r_encoding s j l] guesses the encoding by looking at the first
   [min l 3] bytes of [s] starting at [j].
   Assumes [0 <= j], [0 <= l] and [j + l <= Bytes.length s]. *)
let r_encoding s j l =
  let some i = if i < l then Some (unsafe_byte s (j + i)) else None in
  match (some 0), (some 1), (some 2) with
  | Some 0xEF, Some 0xBB, Some 0xBF                 -> `UTF_8 `BOM
  | Some 0xFE, Some 0xFF, _                         -> `UTF_16BE `BOM
  | Some 0xFF, Some 0xFE, _                         -> `UTF_16LE `BOM
  | Some 0x00, Some    p, _ when p > 0              -> `UTF_16BE (`ASCII p)
  | Some    p, Some 0x00, _ when p > 0              -> `UTF_16LE (`ASCII p)
  | Some    u,         _, _ when utf_8_len.(u) <> 0 -> `UTF_8 `Decode
  | Some    _, Some    _, _                         -> `UTF_16BE `Decode
  | Some    _, None     , None                      -> `UTF_8 `Decode
  | None     , None     , None                      -> `UTF_8 `End
  | None     , Some    _, _                         -> assert false
  | Some    _, None     , Some _                    -> assert false
  | None     , None     , Some _                    -> assert false

(* Decode *)

type src = [ `Channel of in_channel | `String of string | `Manual ]
type nln = [ `ASCII of Uchar.t | `NLF of Uchar.t | `Readline of Uchar.t ]
type decode = [ `Await | `End | `Malformed of string | `Uchar of Uchar.t ]

(* Prints a decode result; malformed bytes are printed in hexadecimal,
   separated by spaces. *)
let pp_decode ppf = function
  | `Uchar u -> pp ppf "@[`Uchar U+%04X@]" (Uchar.to_int u)
  | `End -> pp ppf "`End"
  | `Await -> pp ppf "`Await"
  | `Malformed bs ->
      pp ppf "@[`Malformed (";
      String.iteri
        (fun i c ->
           if i = 0 then pp ppf "%02X" (Char.code c)
           else pp ppf " %02X" (Char.code c))
        bs;
      pp ppf ")@]"

type decoder =
  { src : src;                                              (* input source. *)
    mutable encoding : decoder_encoding;                (* decoded encoding. *)
    nln : nln option;                    (* newline normalization (if any). *)
    nl : Uchar.t;                       (* newline normalization character. *)
    mutable i : Bytes.t;                            (* current input chunk. *)
    mutable i_pos : int;                         (* input current position. *)
    mutable i_max : int;                         (* input maximal position. *)
    t : Bytes.t;       (* four bytes temporary buffer for overlapping reads. *)
    mutable t_len : int;                    (* current byte length of [t]. *)
    mutable t_need : int;                 (* number of bytes needed in [t]. *)
    mutable removed_bom : bool;     (* [true] if an initial BOM was removed. *)
    mutable last_cr : bool;                   (* [true] if last char was CR. *)
    mutable line : int;                                      (* line number. *)
    mutable col : int;                                     (* column number. *)
    mutable byte_count : int;                                 (* byte count. *)
    mutable count : int;                                      (* char count. *)
    mutable pp :       (* decoder post-processor for BOM, position and nln. *)
      decoder -> [ `Malformed of string | `Uchar of Uchar.t ] -> decode;
    mutable k : decoder -> decode }                 (* decoder continuation. *)

(* On decodes that overlap two (or more) [d.i] buffers, we use [t_fill] to copy
the input data to [d.t] and decode from there. If the [d.i] buffers are not
too small this is faster than continuation based byte per byte writes.
End of input (eoi) is signalled by [d.i_pos = 0] and [d.i_max = min_int]
which implies that [i_rem d < 0] is [true]. *)

(* Remaining bytes to read in [d.i]. *)
let i_rem d = d.i_max - d.i_pos + 1

(* Sets end of input in [d]. *)
let eoi d =
  d.i <- Bytes.empty;
  d.i_pos <- 0;
  d.i_max <- min_int

(* [src d s j l] sets [d.i] to the [l] bytes of [s] starting at [j]. A
   length of [0] signals end of input. Raises [Invalid_argument] if the
   range is not valid for [s]. *)
let src d s j l =
  if j < 0 || l < 0 || j + l > Bytes.length s then invalid_bounds j l
  else if l = 0 then eoi d
  else begin
    d.i <- s;
    d.i_pos <- j;
    d.i_max <- j + l - 1
  end

(* [refill k d] gets new input in [d.i] and continues with [k]. Manual
   sources store [k] and return [`Await], string sources reach end of
   input, channel sources read a new chunk. *)
let refill k d =
  match d.src with
  | `Manual -> d.k <- k; `Await
  | `String _ -> eoi d; k d
  | `Channel ic ->
      let rc = input ic d.i 0 (Bytes.length d.i) in
      src d d.i 0 rc;
      k d

(* Empties [d.t] and declares that [need] bytes are wanted in it. *)
let t_need d need =
  d.t_len <- 0;
  d.t_need <- need

(* [t_fill k d] gets [d.t_need] bytes (or less if eoi) in [d.t] and
   continues with [k]. *)
let rec t_fill k d =
  let take n =
    unsafe_blit d.i d.i_pos d.t d.t_len (* write pos. *) n;
    d.i_pos <- d.i_pos + n;
    d.t_len <- d.t_len + n
  in
  let rem = i_rem d in
  if rem < 0 (* eoi *) then k d
  else
    let need = d.t_need - d.t_len in
    if rem < need then (take rem; refill (t_fill k) d) else (take need; k d)

(* [ret k v byte_count d] installs [k] as the next continuation, accounts
   for [byte_count] consumed bytes and returns the post-processed [v]. *)
let ret k v byte_count d =
  d.k <- k;
  d.byte_count <- d.byte_count + byte_count;
  d.pp d v

(* Decoders. *)

let rec decode_us_ascii d =
  match i_rem d with
  | rem when rem < 0 -> `End
  | 0 -> refill decode_us_ascii d
  | _ ->
      let j = d.i_pos in
      d.i_pos <- j + 1;
      ret decode_us_ascii (r_us_ascii d.i j) 1 d

let rec decode_iso_8859_1 d =
  match i_rem d with
  | rem when rem < 0 -> `End
  | 0 -> refill decode_iso_8859_1 d
  | _ ->
      let j = d.i_pos in
      d.i_pos <- j + 1;
      ret decode_iso_8859_1 (r_iso_8859_1 d.i j) 1 d

(* UTF-8 decoder *)

let rec t_decode_utf_8 d =                           (* decode from [d.t]. *)
  let v =
    if d.t_len < d.t_need            (* eoi hit before the sequence end. *)
    then malformed d.t 0 d.t_len
    else r_utf_8 d.t 0 d.t_len
  in
  ret decode_utf_8 v d.t_len d

and decode_utf_8 d =
  let rem = i_rem d in
  if rem < 0 then `End
  else if rem = 0 then refill decode_utf_8 d
  else
    let need = unsafe_array_get utf_8_len (unsafe_byte d.i d.i_pos) in
    if rem < need then (t_need d need; t_fill t_decode_utf_8 d)
    else
      let j = d.i_pos in
      if need = 0 then begin
        (* Invalid first byte, report it alone. *)
        d.i_pos <- j + 1;
        ret decode_utf_8 (malformed d.i j 1) 1 d
      end else begin
        d.i_pos <- j + need;
        ret decode_utf_8 (r_utf_8 d.i j need) need d
      end

(* UTF-16BE decoder *)

let rec t_decode_utf_16be_lo hi d =                  (* decode from [d.t]. *)
  let bcount = d.t_len + 2 (* hi count *) in
  let v =
    if d.t_len < d.t_need
    then malformed_pair true hi d.t 0 d.t_len
    else r_utf_16_lo hi d.t 0 1
  in
  ret decode_utf_16be v bcount d

and t_decode_utf_16be d =                            (* decode from [d.t]. *)
  if d.t_len < d.t_need
  then ret decode_utf_16be (malformed d.t 0 d.t_len) d.t_len d
  else decode_utf_16be_lo (r_utf_16 d.t 0 1) d

and decode_utf_16be_lo v d =
  match v with
  | `Uchar _ | `Malformed _ as v -> ret decode_utf_16be v 2 d
  | `Hi hi ->
      (* A low surrogate is expected next. *)
      let rem = i_rem d in
      if rem < 2 then (t_need d 2; t_fill (t_decode_utf_16be_lo hi) d)
      else
        let j = d.i_pos in
        d.i_pos <- j + 2;
        ret decode_utf_16be (r_utf_16_lo hi d.i j (j + 1)) 4 d

and decode_utf_16be d =
  let rem = i_rem d in
  if rem < 0 then `End
  else if rem = 0 then refill decode_utf_16be d
  else if rem < 2 then (t_need d 2; t_fill t_decode_utf_16be d)
  else
    let j = d.i_pos in
    d.i_pos <- j + 2;
    decode_utf_16be_lo (r_utf_16 d.i j (j + 1)) d

(* UTF-16LE decoder, same as UTF-16BE with byte swapped. *)

let rec t_decode_utf_16le_lo hi d =                  (* decode from [d.t]. *)
  let bcount = d.t_len + 2 (* hi count *) in
  let v =
    if d.t_len < d.t_need
    then malformed_pair false hi d.t 0 d.t_len
    else r_utf_16_lo hi d.t 1 0
  in
  ret decode_utf_16le v bcount d

and t_decode_utf_16le d =                            (* decode from [d.t]. *)
  if d.t_len < d.t_need
  then ret decode_utf_16le (malformed d.t 0 d.t_len) d.t_len d
  else decode_utf_16le_lo (r_utf_16 d.t 1 0) d

and decode_utf_16le_lo v d =
  match v with
  | `Uchar _ | `Malformed _ as v -> ret decode_utf_16le v 2 d
  | `Hi hi ->
      let rem = i_rem d in
      if rem < 2 then (t_need d 2; t_fill (t_decode_utf_16le_lo hi) d)
      else
        let j = d.i_pos in
        d.i_pos <- j + 2;
        ret decode_utf_16le (r_utf_16_lo hi d.i (j + 1) j) 4 d

and decode_utf_16le d =
  let rem = i_rem d in
  if rem < 0 then `End
  else if rem = 0 then refill decode_utf_16le d
  else if rem < 2 then (t_need d 2; t_fill t_decode_utf_16le d)
  else
    let j = d.i_pos in
    d.i_pos <- j + 2;
    decode_utf_16le_lo (r_utf_16 d.i (j + 1) j) d

(* Encoding guessing. The guess is simple but starting the decoder
after is tedious, uutf's decoders are not designed to put bytes
back in the stream. *)

(* Starts the decoder after a [`UTF_8] guess. At that point [d.t] holds the
   [d.t_len] (at most three) bytes that were read to make the guess; they
   are decoded from there before handing over to [decode_utf_8]. *)
let guessed_utf_8 d =
  let third d =                              (* handles the third read byte. *)
    let byte = unsafe_byte d.t 2 in
    match utf_8_len.(byte) with
    | 0 -> ret decode_utf_8 (malformed d.t 2 1) 1 d
    | n ->
        (* Move the byte to the start of [d.t] and get the rest. *)
        d.t_need <- n;
        d.t_len <- 1;
        unsafe_set_byte d.t 0 byte;
        t_fill t_decode_utf_8 d
  in
  let second d =                            (* handles the second read byte. *)
    let byte = unsafe_byte d.t 1 in
    let next = if d.t_len > 2 then third else decode_utf_8 (* `End *) in
    match utf_8_len.(byte) with
    | 0 -> ret next (malformed d.t 1 1) 1 d
    | 1 -> ret next (r_utf_8 d.t 1 1) 1 d
    | n ->                        (* copy d.t.(1-2) to d.t.(0-1) and decode. *)
        d.t_need <- n;
        unsafe_set_byte d.t 0 byte;
        if d.t_len < 3 then d.t_len <- 1 else begin
          d.t_len <- 2;
          unsafe_set_byte d.t 1 (unsafe_byte d.t 2)
        end;
        t_fill t_decode_utf_8 d
  in
  let first = unsafe_byte d.t 0 in               (* handle first read byte. *)
  let next = if d.t_len > 1 then second else decode_utf_8 (* `End *) in
  match utf_8_len.(first) with
  | 0 -> ret next (malformed d.t 0 1) 1 d
  | 1 -> ret next (r_utf_8 d.t 0 1) 1 d
  | 2 ->
      if d.t_len < 2 then ret decode_utf_8 (malformed d.t 0 1) 1 d
      else if d.t_len < 3 then ret decode_utf_8 (r_utf_8 d.t 0 2) 2 d
      else ret third (r_utf_8 d.t 0 2) 2 d
  | 3 ->
      if d.t_len < 3
      then ret decode_utf_8 (malformed d.t 0 d.t_len) d.t_len d
      else ret decode_utf_8 (r_utf_8 d.t 0 3) 3 d
  | 4 ->
      if d.t_len < 3
      then ret decode_utf_8 (malformed d.t 0 d.t_len) d.t_len d
      else (d.t_need <- 4; t_fill t_decode_utf_8 d)
  | _ -> assert false

(* Starts the decoder after a [`UTF_16BE] ([be] is [true]) or [`UTF_16LE]
   guess. [v] is the information [r_encoding] attached to the guess. *)
let guessed_utf_16 d be v =
  let decode_utf_16, t_decode_utf_16, t_decode_utf_16_lo, j0, j1 =
    if be
    then decode_utf_16be, t_decode_utf_16be, t_decode_utf_16be_lo, 0, 1
    else decode_utf_16le, t_decode_utf_16le, t_decode_utf_16le_lo, 1, 0
  in
  let third k d =
    if d.t_len < 3 then decode_utf_16 d (* decodes `End *) else begin
      (* copy d.t.(2) to d.t.(0) and decode. *)
      d.t_need <- 2;
      d.t_len <- 1;
      unsafe_set_byte d.t 0 (unsafe_byte d.t 2);
      t_fill k d
    end
  in
  match v with
  | `BOM -> ret (third t_decode_utf_16) (`Uchar u_bom) 2 d
  | `ASCII u ->
      ret (third t_decode_utf_16) (`Uchar (Uchar.unsafe_of_int u)) 2 d
  | `Decode ->
      match r_utf_16 d.t j0 j1 with
      | `Malformed _ | `Uchar _ as v -> ret (third t_decode_utf_16) v 2 d
      | `Hi hi ->
          if d.t_len < 3
          then
            ret decode_utf_16 (malformed_pair be hi Bytes.empty 0 0) d.t_len d
          else (third (t_decode_utf_16_lo hi)) d

(* Guesses the encoding from up to three input bytes and starts the
   corresponding decoder. *)
let guess_encoding d =
  let setup d =
    match r_encoding d.t 0 d.t_len with
    | `UTF_8 r ->
        d.encoding <- `UTF_8;
        d.k <- decode_utf_8;
        begin match r with
        | `BOM -> ret decode_utf_8 (`Uchar u_bom) 3 d
        | `Decode -> guessed_utf_8 d
        | `End -> `End
        end
    | `UTF_16BE r ->
        d.encoding <- `UTF_16BE;
        d.k <- decode_utf_16be;
        guessed_utf_16 d true r
    | `UTF_16LE r ->
        d.encoding <- `UTF_16LE;
        d.k <- decode_utf_16le;
        guessed_utf_16 d false r
  in
  t_need d 3;
  t_fill setup d

(* Character post-processors. Used for BOM handling, newline
normalization and position tracking. The [pp_remove_bom] is only
used for the first character to remove a possible initial BOM and
handle UTF-16 endianness recognition. *)

(* Position and state updates used by the post-processors. *)

let nline d =                                        (* starts a new line. *)
  d.col <- 0;
  d.line <- d.line + 1

let ncol d = d.col <- d.col + 1                      (* advances the column. *)
let ncount d = d.count <- d.count + 1            (* counts one more character. *)
let cr d b = d.last_cr <- b           (* records if the last char was CR. *)

(* [pp_remove_bom utf16 pp d v] is the post-processor of the first decode.
   It installs [pp] for the following ones. An initial BOM is dropped and,
   if [utf16] is [true], used to select the UTF-16 byte order. *)
let pp_remove_bom utf16 pp d =
  let install removed =
    d.removed_bom <- removed;
    d.pp <- pp
  in
  function
  | `Malformed _ as v -> install false; d.pp d v
  | `Uchar u as v ->
      match Uchar.to_int u with
      | 0xFEFF (* BOM *) ->
          if utf16 then begin
            d.encoding <- `UTF_16BE;
            d.k <- decode_utf_16be
          end;
          install true;
          d.k d
      | 0xFFFE (* BOM reversed from decode_utf_16be *) when utf16 ->
          d.encoding <- `UTF_16LE;
          d.k <- decode_utf_16le;
          install true;
          d.k d
      | _ -> install false; d.pp d v

(* Post-processor without newline normalization: only tracks the position.
   The LF of a CR LF pair does not start a second line. *)
let pp_nln_none d = function
  | `Malformed _ as v -> cr d false; ncount d; ncol d; v
  | `Uchar u as v ->
      match Uchar.to_int u with
      | 0x000A (* LF *) ->
          let last_cr = d.last_cr in
          cr d false;
          ncount d;
          if last_cr then v else (nline d; v)
      | 0x000D (* CR *) -> cr d true; ncount d; nline d; v
      | (0x0085 | 0x000C | 0x2028 | 0x2029) (* NEL | FF | LS | PS *) ->
          cr d false; ncount d; nline d; v
      | _ -> cr d false; ncount d; ncol d; v

(* Post-processor for [`Readline]: CR, LF, CR LF, NEL, FF, LS and PS are
   all returned as [d.nl]. The LF of a CR LF pair is skipped. *)
let pp_nln_readline d = function
  | `Malformed _ as v -> cr d false; ncount d; ncol d; v
  | `Uchar u as v ->
      match Uchar.to_int u with
      | 0x000A (* LF *) ->
          let last_cr = d.last_cr in
          cr d false;
          if last_cr then d.k d else (ncount d; nline d; `Uchar d.nl)
      | 0x000D (* CR *) -> cr d true; ncount d; nline d; `Uchar d.nl
      | (0x0085 | 0x000C | 0x2028 | 0x2029) (* NEL | FF | LS | PS *) ->
          cr d false; ncount d; nline d; `Uchar d.nl
      | _ -> cr d false; ncount d; ncol d; v

(* Post-processor for [`NLF]: CR, LF, CR LF and NEL are returned as [d.nl].
   FF, LS and PS start a new line but are returned unchanged. *)
let pp_nln_nlf d = function
  | `Malformed _ as v -> cr d false; ncount d; ncol d; v
  | `Uchar u as v ->
      match Uchar.to_int u with
      | 0x000A (* LF *) ->
          let last_cr = d.last_cr in
          cr d false;
          if last_cr then d.k d else (ncount d; nline d; `Uchar d.nl)
      | 0x000D (* CR *) -> cr d true; ncount d; nline d; `Uchar d.nl
      | 0x0085 (* NEL *) -> cr d false; ncount d; nline d; `Uchar d.nl
      | (0x000C | 0x2028 | 0x2029) (* FF | LS | PS *) ->
          cr d false; ncount d; nline d; v
      | _ -> cr d false; ncount d; ncol d; v

(* Post-processor for [`ASCII]: CR, LF and CR LF are returned as [d.nl].
   NEL, FF, LS and PS start a new line but are returned unchanged. *)
let pp_nln_ascii d = function
  | `Malformed _ as v -> cr d false; ncount d; ncol d; v
  | `Uchar u as v ->
      match Uchar.to_int u with
      | 0x000A (* LF *) ->
          let last_cr = d.last_cr in
          cr d false;
          if last_cr then d.k d else (ncount d; nline d; `Uchar d.nl)
      | 0x000D (* CR *) -> cr d true; ncount d; nline d; `Uchar d.nl
      | (0x0085 | 0x000C | 0x2028 | 0x2029) (* NEL | FF | LS | PS *) ->
          cr d false; ncount d; nline d; v
      | _ -> cr d false; ncount d; ncol d; v

(* Decoding function for an encoding. [`UTF_16] starts as big endian, see
   [pp_remove_bom]. *)
let decode_fun = function
  | `UTF_8 -> decode_utf_8
  | `UTF_16 -> decode_utf_16be
  | `UTF_16BE -> decode_utf_16be
  | `UTF_16LE -> decode_utf_16le
  | `US_ASCII -> decode_us_ascii
  | `ISO_8859_1 -> decode_iso_8859_1

(* [decoder ?nln ?encoding src] is a decoder reading from [src]. Without
   [encoding] the encoding is guessed on the first decode. Without [nln]
   no newline normalization is performed. *)
let decoder ?nln ?encoding src =
  let pp, nl =
    match nln with
    | None -> pp_nln_none, Uchar.unsafe_of_int 0x000A (* not used. *)
    | Some (`ASCII nl) -> pp_nln_ascii, nl
    | Some (`NLF nl) -> pp_nln_nlf, nl
    | Some (`Readline nl) -> pp_nln_readline, nl
  in
  let encoding, k =
    match encoding with
    | None -> `UTF_8, guess_encoding
    | Some e -> (e :> decoder_encoding), decode_fun e
  in
  let i, i_pos, i_max =
    match src with
    | `Manual -> Bytes.empty, 1, 0                 (* implies i_rem d = 0. *)
    | `Channel _ -> Bytes.create io_buffer_size, 1, 0               (* idem. *)
    | `String s -> Bytes.unsafe_of_string s, 0, String.length s - 1
  in
  { src = (src :> src);
    encoding;
    nln = (nln :> nln option);
    nl;
    i; i_pos; i_max;
    t = Bytes.create 4; t_len = 0; t_need = 0;
    removed_bom = false;
    last_cr = false;
    line = 1; col = 0;
    byte_count = 0; count = 0;
    pp = pp_remove_bom (encoding = `UTF_16) pp;
    k }

(* Runs the current continuation of the decoder. *)
let decode d = d.k d

(* Decoder accessors. *)

let decoder_line d = d.line
let decoder_col d = d.col
let decoder_byte_count d = d.byte_count
let decoder_count d = d.count
let decoder_removed_bom d = d.removed_bom
let decoder_src d = d.src
let decoder_nln d = d.nln
let decoder_encoding d = d.encoding

(* Changes the decoded encoding; the next decode uses the decoding function
   of [e]. *)
let set_decoder_encoding d e =
  d.encoding <- (e :> decoder_encoding);
  d.k <- decode_fun e

(* Encode *)

type dst = [ `Channel of out_channel | `Buffer of Buffer.t | `Manual ]
type encode = [ `Await | `End | `Uchar of Uchar.t ]
type encoder =
  { dst : dst;                                        (* output destination. *)
    encoding : encoding;                                (* encoded encoding. *)
    mutable o : Bytes.t;                           (* current output chunk. *)
    mutable o_pos : int;                  (* next output position to write. *)
    mutable o_max : int;               (* maximal output position to write. *)
    t : Bytes.t;                (* four bytes buffer for overlapping writes. *)
    mutable t_pos : int;                  (* next position to read in [t]. *)
    mutable t_max : int;               (* maximal position to read in [t]. *)
    mutable k :                                     (* encoder continuation. *)
      encoder -> encode -> [ `Ok | `Partial ] }

(* On encodes that overlap two (or more) [e.o] buffers, we encode the
character to the temporary buffer [e.t] and continue with
[t_flush] to write this data on the different [e.o] buffers. If
the [e.o] buffers are not too small this is faster than
continuation based byte per byte writes. *)

(* Remaining bytes to write in [e.o]. *)
let o_rem e = e.o_max - e.o_pos + 1

(* [dst e s j l] sets [e.o] to the [l] bytes of [s] starting at [j].
   Raises [Invalid_argument] if the range is not valid for [s]. *)
let dst e s j l =
  if j < 0 || l < 0 || j + l > Bytes.length s then invalid_bounds j l;
  e.o <- s;
  e.o_pos <- j;
  e.o_max <- j + l - 1

(* Continuation installed while a manual destination is full: only
   [`Await] is accepted and resumes with [k]. *)
let partial k e = function
  | `Await -> k e
  | `Uchar _ | `End -> invalid_encode ()

(* [flush k e] gets free storage in [e.o] and continues with [k]. Manual
   destinations return [`Partial]; channels and buffers are written with
   the bytes encoded so far. *)
let flush k e =
  match e.dst with
  | `Manual -> e.k <- partial k; `Partial
  | `Channel oc ->
      output oc e.o 0 e.o_pos;
      e.o_pos <- 0;
      k e
  | `Buffer b ->
      let o = Bytes.unsafe_to_string e.o in
      Buffer.add_substring b o 0 e.o_pos;
      e.o_pos <- 0;
      k e

(* Declares that [e.t] holds data from index [0] to [max]. *)
let t_range e max =
  e.t_pos <- 0;
  e.t_max <- max

(* [t_flush k e] writes [e.t] up to [e.t_max] in [e.o], flushing as
   needed, and continues with [k]. *)
let rec t_flush k e =
  let blit n =
    unsafe_blit e.t e.t_pos e.o e.o_pos n;
    e.o_pos <- e.o_pos + n;
    e.t_pos <- e.t_pos + n
  in
  let rem = o_rem e in
  let len = e.t_max - e.t_pos + 1 in
  if rem < len then (blit rem; flush (t_flush k) e) else (blit len; k e)

(* Encoders. *)

let rec encode_utf_8 e v =
  let k e = e.k <- encode_utf_8; `Ok in
  match v with
  | `Await -> k e
  | `End -> flush k e
  | `Uchar u as v ->
      let u = Uchar.to_int u in
      let rem = o_rem e in
      (* [reserve n] is the buffer and index where [n] bytes can be written
         and the continuation to use afterwards. If [e.o] is too short the
         bytes go through [e.t]. *)
      let reserve n =
        if rem < n then begin
          t_range e (n - 1);
          (e.t, 0, t_flush k)
        end else begin
          let j = e.o_pos in
          e.o_pos <- e.o_pos + n;
          (e.o, j, k)
        end
      in
      if u <= 0x007F then begin
        if rem < 1 then flush (fun e -> encode_utf_8 e v) e else begin
          unsafe_set_byte e.o e.o_pos u;
          e.o_pos <- e.o_pos + 1;
          k e
        end
      end else if u <= 0x07FF then begin
        let s, j, k = reserve 2 in
        unsafe_set_byte s j (0xC0 lor (u lsr 6));
        unsafe_set_byte s (j + 1) (0x80 lor (u land 0x3F));
        k e
      end else if u <= 0xFFFF then begin
        let s, j, k = reserve 3 in
        unsafe_set_byte s j (0xE0 lor (u lsr 12));
        unsafe_set_byte s (j + 1) (0x80 lor ((u lsr 6) land 0x3F));
        unsafe_set_byte s (j + 2) (0x80 lor (u land 0x3F));
        k e
      end else begin
        let s, j, k = reserve 4 in
        unsafe_set_byte s j (0xF0 lor (u lsr 18));
        unsafe_set_byte s (j + 1) (0x80 lor ((u lsr 12) land 0x3F));
        unsafe_set_byte s (j + 2) (0x80 lor ((u lsr 6) land 0x3F));
        unsafe_set_byte s (j + 3) (0x80 lor (u land 0x3F));
        k e
      end

let rec encode_utf_16be e v =
  let k e = e.k <- encode_utf_16be; `Ok in
  match v with
  | `Await -> k e
  | `End -> flush k e
  | `Uchar u ->
      let u = Uchar.to_int u in
      let rem = o_rem e in
      (* Same role as in [encode_utf_8]. *)
      let reserve n =
        if rem < n then begin
          t_range e (n - 1);
          (e.t, 0, t_flush k)
        end else begin
          let j = e.o_pos in
          e.o_pos <- e.o_pos + n;
          (e.o, j, k)
        end
      in
      if u < 0x10000 then begin
        let s, j, k = reserve 2 in
        unsafe_set_byte s j (u lsr 8);
        unsafe_set_byte s (j + 1) (u land 0xFF);
        k e
      end else begin
        let s, j, k = reserve 4 in
        (* Surrogate pair. *)
        let u' = u - 0x10000 in
        let hi = 0xD800 lor (u' lsr 10) in
        let lo = 0xDC00 lor (u' land 0x3FF) in
        unsafe_set_byte s j (hi lsr 8);
        unsafe_set_byte s (j + 1) (hi land 0xFF);
        unsafe_set_byte s (j + 2) (lo lsr 8);
        unsafe_set_byte s (j + 3) (lo land 0xFF);
        k e
      end

let rec encode_utf_16le e v =       (* encode_utf_16be with bytes swapped. *)
  let k e = e.k <- encode_utf_16le; `Ok in
  match v with
  | `Await -> k e
  | `End -> flush k e
  | `Uchar u ->
      let u = Uchar.to_int u in
      let rem = o_rem e in
      (* Same role as in [encode_utf_8]. *)
      let reserve n =
        if rem < n then begin
          t_range e (n - 1);
          (e.t, 0, t_flush k)
        end else begin
          let j = e.o_pos in
          e.o_pos <- e.o_pos + n;
          (e.o, j, k)
        end
      in
      if u < 0x10000 then begin
        let s, j, k = reserve 2 in
        unsafe_set_byte s j (u land 0xFF);
        unsafe_set_byte s (j + 1) (u lsr 8);
        k e
      end else begin
        let s, j, k = reserve 4 in
        (* Surrogate pair. *)
        let u' = u - 0x10000 in
        let hi = 0xD800 lor (u' lsr 10) in
        let lo = 0xDC00 lor (u' land 0x3FF) in
        unsafe_set_byte s j (hi land 0xFF);
        unsafe_set_byte s (j + 1) (hi lsr 8);
        unsafe_set_byte s (j + 2) (lo land 0xFF);
        unsafe_set_byte s (j + 3) (lo lsr 8);
        k e
      end

(* Encoding function for an encoding, [`UTF_16] is encoded big endian. *)
let encode_fun = function
  | `UTF_8 -> encode_utf_8
  | `UTF_16 -> encode_utf_16be
  | `UTF_16BE -> encode_utf_16be
  | `UTF_16LE -> encode_utf_16le

(* [encoder encoding dst] is an encoder for [encoding] writing on [dst]. *)
let encoder encoding dst =
  let o, o_pos, o_max =
    match dst with
    | `Manual -> Bytes.empty, 1, 0                 (* implies o_rem e = 0. *)
    | `Buffer _ | `Channel _ ->
        Bytes.create io_buffer_size, 0, io_buffer_size - 1
  in
  { dst = (dst :> dst);
    encoding = (encoding :> encoding);
    o; o_pos; o_max;
    t = Bytes.create 4; t_pos = 1; t_max = 0;
    k = encode_fun encoding }

(* Runs the current continuation of the encoder on [v]. *)
let encode e v = e.k e (v :> encode)

let encoder_encoding e = e.encoding
let encoder_dst e = e.dst

(* Manual sources and destinations.
*)

module Manual = struct
  let src = src
  let dst = dst
  let dst_rem = o_rem
end

(* Strings folders and Buffer encoders *)

module String = struct

  (* [encoding_guess s] is the encoding guessed from the first bytes of [s]
     and [true] iff [s] starts with a BOM.

     At most three bytes are inspected and never more than [s] holds:
     [r_encoding] performs unchecked reads below the length it is given, so
     that length has to be bounded by the string length. *)
  let encoding_guess s =
    let s = Bytes.unsafe_of_string s in
    match r_encoding s 0 (min (Bytes.length s) 3) with
    | `UTF_8 d -> `UTF_8, (d = `BOM)
    | `UTF_16BE d -> `UTF_16BE, (d = `BOM)
    | `UTF_16LE d -> `UTF_16LE, (d = `BOM)

  type 'a folder =
    'a -> int -> [ `Uchar of Uchar.t | `Malformed of string ] -> 'a

  (* [fold_utf_8 ?pos ?len f acc s] folds [f] over the UTF-8 decodes of the
     [len] bytes of [s] starting at [pos]. [f] is given the accumulator, the
     byte index at which the decode starts and the decode. [pos] defaults to
     [0] and [len] to [String.length s - pos].

     The range is not checked and bytes are read without bounds check:
     callers must pass a range that is valid for [s]. *)
  let fold_utf_8 ?(pos = 0) ?len f acc s =
    let rec loop acc f s i last =
      if i > last then acc
      else
        let need = unsafe_array_get utf_8_len (unsafe_byte s i) in
        if need = 0
        then loop (f acc i (malformed s i 1)) f s (i + 1) last
        else
          let rem = last - i + 1 in
          (* A truncated final sequence is reported as a whole and ends
             the fold. *)
          if rem < need then f acc i (malformed s i rem)
          else loop (f acc i (r_utf_8 s i need)) f s (i + need) last
    in
    let len = match len with None -> String.length s - pos | Some l -> l in
    let last = pos + len - 1 in
    loop acc f (Bytes.unsafe_of_string s) pos last

  (* Same as [fold_utf_8] for UTF-16BE encoded data. The same requirement
     on the range applies. *)
  let fold_utf_16be ?(pos = 0) ?len f acc s =
    let rec loop acc f s i last =
      if i > last then acc
      else
        let rem = last - i + 1 in
        if rem < 2 then f acc i (malformed s i 1)
        else
          match r_utf_16 s i (i + 1) with
          | `Uchar _ | `Malformed _ as v -> loop (f acc i v) f s (i + 2) last
          | `Hi hi ->
              (* The low surrogate needs two more bytes. *)
              if rem < 4 then f acc i (malformed s i rem)
              else
                loop (f acc i (r_utf_16_lo hi s (i + 2) (i + 3)))
                  f s (i + 4) last
    in
    let len = match len with None -> String.length s - pos | Some l -> l in
    let last = pos + len - 1 in
    loop acc f (Bytes.unsafe_of_string s) pos last

  (* Same as [fold_utf_16be] with the bytes of each unit swapped. *)
  let fold_utf_16le ?(pos = 0) ?len f acc s =
    let rec loop acc f s i last =
      if i > last then acc
      else
        let rem = last - i + 1 in
        if rem < 2 then f acc i (malformed s i 1)
        else
          match r_utf_16 s (i + 1) i with
          | `Uchar _ | `Malformed _ as v -> loop (f acc i v) f s (i + 2) last
          | `Hi hi ->
              if rem < 4 then f acc i (malformed s i rem)
              else
                loop (f acc i (r_utf_16_lo hi s (i + 3) (i + 2)))
                  f s (i + 4) last
    in
    let len = match len with None -> String.length s - pos | Some l -> l in
    let last = pos + len - 1 in
    loop acc f (Bytes.unsafe_of_string s) pos last
end

(* Encoders adding a character to a standard library buffer. *)
module Buffer = struct
  let add_utf_8 = Buffer.add_utf_8_uchar
  let add_utf_16be = Buffer.add_utf_16be_uchar
  let add_utf_16le = Buffer.add_utf_16le_uchar
end