Source: wtf8.ml (p.melange.5.0.1-53.doc.src.melange.js

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103(** * Copyright (c) 2017-present, Facebook, Inc. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. *) (* * WTF-8 is a superset of UTF-8 that allows unpaired surrogates. * * From ES6 6.1.4, "The String Type": * * Where ECMAScript operations interpret String values, each element is * interpreted as a single UTF-16 code unit. However, ECMAScript does not * place any restrictions or requirements on the sequence of code units in * a String value, so they may be ill-formed when interpreted as UTF-16 code * unit sequences. Operations that do not interpret String contents treat * them as sequences of undifferentiated 16-bit unsigned integers. * * If we try to encode these ill-formed code units into UTF-8, we similarly * get ill-formed UTF-8. WTF-8 is a fun name for that encoding. * * https://simonsapin.github.io/wtf-8/ *) type codepoint = | Point of int | Malformed type 'a folder = 'a -> int -> codepoint -> 'a (* WTF-8 is a variable length encoding. The first byte in each codepoint determines how many other bytes follow. *) let needed_bytes c = if 0x00 <= c && c <= 0x7F then 1 else if 0xC2 <= c && c <= 0xDF then 2 else if 0xE0 <= c && c <= 0xEF then 3 else if 0xF0 <= c && c <= 0xF4 then 4 else 0 let unsafe_char s i = Char.code (Bytes.unsafe_get s i) let codepoint s i = function | 1 -> unsafe_char s i | 2 -> let b0 = unsafe_char s i in let b1 = unsafe_char s (i + 1) in ((b0 land 0x1F) lsl 6) lor (b1 land 0x3F) | 3 -> let b0 = unsafe_char s (i) in let b1 = unsafe_char s (i + 1) in let b2 = unsafe_char s (i + 2) in ((b0 land 0x0F) lsl 12) lor ((b1 land 0x3F) lsl 6) lor (b2 land 0x3F) | 4 -> let b0 = unsafe_char s (i) in let b1 = unsafe_char s (i + 1) in let b2 = unsafe_char s (i + 2) in let b3 = unsafe_char s (i + 3) in ((b0 land 0x07) lsl 18) lor ((b1 land 0x3F) lsl 12) lor ((b2 land 0x3F) lsl 6) lor (b3 land 0x3F) | _ -> assert false (* Fold over the WTF-8 code units in a string *) let fold_wtf_8 ?(pos = 0) ?len f acc s = let rec loop acc f s i l = if i = l then acc else let need = needed_bytes (unsafe_char s i) in if need = 0 then (loop [@tailcall]) (f acc i Malformed) f s (i + 1) l else let rem = l - i in if rem < need then f acc i Malformed else (loop [@tailcall]) (f acc i (Point (codepoint s i need))) f s (i + need) l in let len = match len with | None -> String.length s - pos | Some l -> l in loop acc f (Bytes.unsafe_of_string s) pos len (* Add a UTF-16 code unit to a buffer, encoded in WTF-8. *) let add_wtf_8 buf code = let[@inline] w byte = Buffer.add_char buf (Char.unsafe_chr byte) in if code >= 0x10000 then begin (* 4 bytes *) w (0xf0 lor (code lsr 18)); w (0x80 lor ((code lsr 12) land 0x3F)); w (0x80 lor ((code lsr 6) land 0x3F)); w (0x80 lor (code land 0x3F)) end else if code >= 0x800 then begin (* 3 bytes *) w (0xe0 lor (code lsr 12)); w (0x80 lor ((code lsr 6) land 0x3F)); w (0x80 lor (code land 0x3F)) end else if code >= 0x80 then begin (* 2 bytes *) w (0xc0 lor (code lsr 6)); w (0x80 lor (code land 0x3F)) end else (* 1 byte *) w code