(**
 * Copyright (c) 2017-present, Facebook, Inc.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 *)

(*
 * WTF-8 is a superset of UTF-8 that allows unpaired surrogates.
 *
 * From ES6 6.1.4, "The String Type":
 *
 * Where ECMAScript operations interpret String values, each element is
 * interpreted as a single UTF-16 code unit. However, ECMAScript does not
 * place any restrictions or requirements on the sequence of code units in
 * a String value, so they may be ill-formed when interpreted as UTF-16 code
 * unit sequences. Operations that do not interpret String contents treat
 * them as sequences of undifferentiated 16-bit unsigned integers.
 *
 * If we try to encode these ill-formed code units into UTF-8, we similarly
 * get ill-formed UTF-8. WTF-8 is a fun name for that encoding.
 *
 * https://simonsapin.github.io/wtf-8/
 *)

(* One decoded element of a WTF-8 string: either the integer value of a
   decoded sequence, or a marker for a byte that cannot start a sequence
   (or a sequence cut short by the end of the input). *)
type codepoint =
  | Point of int
  | Malformed

(* Callback used by [fold_wtf_8]: receives the accumulator, the byte offset
   at which the element starts, and the decoded element; returns the new
   accumulator. *)
type 'a folder = 'a -> int -> codepoint -> 'a

(* WTF-8 is a variable length encoding. The first byte in each codepoint
   determines how many other bytes follow.

   [needed_bytes c] is the total length (1 to 4) of the sequence introduced
   by the lead byte [c], or 0 when [c] is not accepted as a lead byte
   (0x80-0xC1 and anything above 0xF4). *)
let needed_bytes c =
  if 0x00 <= c && c <= 0x7F then 1
  else if 0xC2 <= c && c <= 0xDF then 2
  else if 0xE0 <= c && c <= 0xEF then 3
  else if 0xF0 <= c && c <= 0xF4 then 4
  else 0

(* [unsafe_char s i] is the integer value of byte [i] of [s]. No bounds
   check is performed: the caller must guarantee that [i] is in range. *)
let unsafe_char s i = Char.code (Bytes.unsafe_get s i)

(* [codepoint s i n] decodes the [n]-byte sequence starting at offset [i] of
   [s] by stripping the length marker from the lead byte and the two tag
   bits from each following byte. The following bytes are not checked to be
   continuation bytes, and [i + n - 1] must be a valid offset. Any [n]
   outside 1..4 is a programming error. *)
let codepoint s i = function
  | 1 -> unsafe_char s i
  | 2 ->
    let b0 = unsafe_char s i in
    let b1 = unsafe_char s (i + 1) in
    ((b0 land 0x1F) lsl 6) lor (b1 land 0x3F)
  | 3 ->
    let b0 = unsafe_char s i in
    let b1 = unsafe_char s (i + 1) in
    let b2 = unsafe_char s (i + 2) in
    ((b0 land 0x0F) lsl 12) lor ((b1 land 0x3F) lsl 6) lor (b2 land 0x3F)
  | 4 ->
    let b0 = unsafe_char s i in
    let b1 = unsafe_char s (i + 1) in
    let b2 = unsafe_char s (i + 2) in
    let b3 = unsafe_char s (i + 3) in
    ((b0 land 0x07) lsl 18)
    lor ((b1 land 0x3F) lsl 12)
    lor ((b2 land 0x3F) lsl 6)
    lor (b3 land 0x3F)
  | _ -> assert false

(* Fold over the WTF-8 code units in a string.

   [fold_wtf_8 ?pos ?len f acc s] walks the [len] bytes of [s] starting at
   byte offset [pos] (default 0; [len] defaults to the rest of the string)
   and calls [f acc offset element] for each element found, threading the
   accumulator through.

   - A byte that cannot start a sequence is reported as [Malformed] and
     decoding resumes at the next byte.
   - A sequence that would extend past the end of the window is reported
     as a single [Malformed] and the fold stops there.

   Raises [Invalid_argument] if [pos] and [len] do not designate a valid
   range of [s]; the bytes are read without bounds checks, so the window has
   to be validated up front. *)
let fold_wtf_8 ?(pos = 0) ?len f acc s =
  let total = String.length s in
  let len =
    match len with
    | None -> total - pos
    | Some l -> l
  in
  if pos < 0 || len < 0 || pos > total - len then
    invalid_arg "fold_wtf_8: pos/len out of bounds";
  (* The walk starts at [pos], so it must end at [pos + len], not at [len]. *)
  let stop = pos + len in
  let bytes = Bytes.unsafe_of_string s in
  let rec loop acc i =
    if i = stop then acc
    else
      let need = needed_bytes (unsafe_char bytes i) in
      if need = 0 then (loop [@tailcall]) (f acc i Malformed) (i + 1)
      else if stop - i < need then f acc i Malformed
      else
        (loop [@tailcall])
          (f acc i (Point (codepoint bytes i need)))
          (i + need)
  in
  loop acc pos

(* Add a code point to a buffer, encoded in WTF-8.

   [add_wtf_8 buf code] appends 1 to 4 bytes to [buf] depending on the
   magnitude of [code]. Surrogate values (0xD800-0xDFFF) are not rejected
   and are written as ordinary 3-byte sequences. [code] is not range
   checked. *)
let add_wtf_8 buf code =
  let[@inline] w byte = Buffer.add_char buf (Char.unsafe_chr byte) in
  if code >= 0x10000 then begin
    (* 4 bytes *)
    w (0xf0 lor (code lsr 18));
    w (0x80 lor ((code lsr 12) land 0x3F));
    w (0x80 lor ((code lsr 6) land 0x3F));
    w (0x80 lor (code land 0x3F))
  end else if code >= 0x800 then begin
    (* 3 bytes *)
    w (0xe0 lor (code lsr 12));
    w (0x80 lor ((code lsr 6) land 0x3F));
    w (0x80 lor (code land 0x3F))
  end else if code >= 0x80 then begin
    (* 2 bytes *)
    w (0xc0 lor (code lsr 6));
    w (0x80 lor (code land 0x3F))
  end else
    (* 1 byte *)
    w code