Source file bytesrw_utf.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
open Bytesrw
(* Upper bound on the number of bytes a single UTF-8 encoded character
   can occupy (UTF-8 uses at most 4 bytes per Unicode scalar value). *)
let uchar_max_utf_8_byte_length = 4
(* [uchar_utf_8_byte_decode_length first] is the total number of bytes
   (1 to 4) of a UTF-8 sequence whose first byte is [first], or [0] when
   [first] cannot start a sequence. *)
let[@inline] uchar_utf_8_byte_decode_length first =
  match first with
  | '\x00' .. '\x7F' -> 1
  | '\xC2' .. '\xDF' -> 2
  | '\xE0' .. '\xEF' -> 3
  | '\xF0' .. '\xF4' -> 4
  (* Everything else maps to 0: the 0x80-0xC1 range and the 0xF5-0xFF
     range are rejected as first bytes. *)
  | '\x80' .. '\xC1' | '\xF5' .. '\xFF' -> 0
(* Text encodings handled by this module and their textual names. *)
module Encoding = struct
  (* The set of concrete encodings. *)
  type t = [ `Utf_8 | `Utf_16be | `Utf_16le ]

  (* [to_iana_charset enc] is the charset name string for [enc]. Besides
     the cases of [t] it also accepts [`Utf_16], which maps to "UTF-16". *)
  let to_iana_charset enc =
    match enc with
    | `Utf_8 -> "UTF-8"
    | `Utf_16 -> "UTF-16"
    | `Utf_16be -> "UTF-16BE"
    | `Utf_16le -> "UTF-16LE"

  (* [pp ppf enc] prints the charset name of [enc] on [ppf]. *)
  let pp ppf enc = enc |> to_iana_charset |> Format.pp_print_string ppf
end
(* [guess_reader_encoding r] guesses the encoding of the data of [r] from
   the string returned by [Bytes.Reader.sniff 3 r] (presumably a peek at
   up to three leading bytes that leaves them available to later reads;
   confirm against the [Bytes.Reader] documentation).

   Decision order:
   - fewer than two bytes available: UTF-8;
   - UTF-8 byte order mark (EF BB BF): UTF-8;
   - UTF-16 byte order mark (FE FF or FF FE): matching UTF-16 flavour;
   - exactly one of the first two bytes is zero: UTF-16, with the
     position of the zero byte selecting big or little endian;
   - the first byte can start a UTF-8 sequence: UTF-8;
   - otherwise: UTF-16BE. *)
let guess_reader_encoding r =
  let prefix = Bytes.Reader.sniff 3 r in
  if String.length prefix <= 1 then `Utf_8
  else if String.equal prefix "\xEF\xBB\xBF" then `Utf_8
  else
    match prefix.[0], prefix.[1] with
    | '\xFE', '\xFF' -> `Utf_16be
    | '\xFF', '\xFE' -> `Utf_16le
    | '\x00', '\x01' .. '\xFF' -> `Utf_16be
    | '\x01' .. '\xFF', '\x00' -> `Utf_16le
    | first, _ when uchar_utf_8_byte_decode_length first <> 0 -> `Utf_8
    | _, _ -> `Utf_16be