1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
open StdLabels
(* A single character value, represented as a plain [int].  The decoders in
   this file produce these from UTF-8 bytes and the encoders consume them. *)
type unichar = int
(* A decoded string: one array cell per [unichar]. *)
type unistring = unichar array
(* Error reporting used by the conversion routines of this file. *)
module Error = struct
(* Kinds of failure.  Only [ILLEGAL_SEQUENCE] and [PARTIAL_INPUT] are raised
   by the code visible in this file.
   NOTE(review): the other constructors look like they mirror the GLib
   conversion and filename error codes; confirm against the callers. *)
type error =
| NO_CONVERSION
| ILLEGAL_SEQUENCE
| FAILED
| PARTIAL_INPUT
| BAD_URI
| NOT_ABSOLUTE_PATH
(* Carries the failure kind together with a human-readable message. *)
exception Error of error * string
(* Always raises [Error] with kind [ILLEGAL_SEQUENCE]; never returns, so the
   call can be used in any type context. *)
let raise_bad_utf8 () =
raise (Error (ILLEGAL_SEQUENCE, "Invalid byte sequence for UTF-8 string"))
end
open Error
(* [log64 n] is the number of 5-bit right shifts needed to bring [n] down to
   zero, i.e. the number of 5-bit groups occupied by [n] (0 for [n = 0]).
   Despite the name, each step discards 5 bits, not 6. *)
let log64 n =
  let rec count steps rest =
    if rest = 0 then steps else count (steps + 1) (rest lsr 5)
  in
  count 0 n
(* [utf8_storage_len n] is the number of bytes used to encode [n]:
   1 for values below 0x80, otherwise [log64 (n lsr 1)].
   A k-byte sequence carries (7 - k) + 6 * (k - 1) = 5k + 1 payload bits,
   so after dropping one bit the count of 5-bit groups equals k. *)
let utf8_storage_len n =
  if n >= 0x80 then log64 (n lsr 1) else 1
(* [write_unichar s ~pos c] stores the UTF-8 encoding of [c] into [s] starting
   at offset [!pos], then advances [pos] by the number of bytes written
   (1 to 6).

   The bytes are stored with [Bytes.unsafe_set], so the caller must guarantee
   that [s] has room for the whole sequence; the callers in this file reserve
   6 bytes per character.

   Raises [Invalid_argument] when [c] is negative or would need more than
   6 bytes.  Such values used to be written anyway: a negative value as one
   arbitrary byte, an oversized one as a malformed sequence running past the
   6 bytes reserved per character.  Nothing is written and [pos] is left
   untouched in that case. *)
let write_unichar s ~pos (c : unichar) =
  let len = utf8_storage_len c in
  (* Checking the computed length rather than comparing against a literal
     upper bound keeps the test independent of the native int width. *)
  if c < 0 || len > 6 then
    invalid_arg "write_unichar: code point out of range";
  let p = !pos in
  if len = 1 then Bytes.unsafe_set s p (Char.unsafe_chr c)
  else begin
    (* Lead byte: [len] one-bits at the top, then the highest payload bits. *)
    let marker = (1 lsl len - 1) lsl (8 - len) in
    let lead = marker lor (c lsr ((len - 1) * 6)) in
    Bytes.unsafe_set s p (Char.unsafe_chr lead);
    (* Continuation bytes: 10xxxxxx, six payload bits each, high bits first. *)
    for i = 1 to len - 1 do
      let payload = (c lsr ((len - 1 - i) * 6)) land 0x3f in
      Bytes.unsafe_set s (p + i) (Char.unsafe_chr (payload lor 0x80))
    done
  end;
  pos := p + len
(* [sub_string s ~pos ~len] copies [len] bytes of [s] starting at [pos] into a
   fresh string.  Thin labelled wrapper over [Bytes.sub_string]. *)
let sub_string (s : bytes) ~(pos : int) ~(len : int) : string =
  Bytes.sub_string s pos len
(* [from_unichar n] is the UTF-8 encoding of the single character [n] as a
   string.  A 6-byte scratch buffer is filled by [write_unichar] and only the
   bytes actually written are returned. *)
let from_unichar (n : unichar) =
  let pos = ref 0 in
  let scratch = Bytes.create 6 in
  write_unichar scratch ~pos n;
  sub_string scratch ~pos:0 ~len:!pos
(* [from_unistring s] encodes every character of [s], in order, and returns
   the concatenated UTF-8 bytes as a string.  The work buffer reserves 6 bytes
   per character; the result is trimmed to the bytes actually written. *)
let from_unistring (s : unistring) =
  let count = Array.length s in
  let buf = Bytes.create (count * 6) in
  let pos = ref 0 in
  let rec emit i =
    if i < count then begin
      write_unichar buf ~pos s.(i);
      emit (i + 1)
    end
  in
  emit 0;
  sub_string buf ~pos:0 ~len:!pos
(* [hi_bits n] counts the consecutive one-bits of [n] starting at bit 7 and
   going downwards (for a byte: the number of leading ones).  For a UTF-8 lead
   byte this is the length of the sequence; 0 means a plain 7-bit byte and
   1 means a continuation byte. *)
let hi_bits n =
  let rec scan ones v =
    if v land 0x80 <> 0 then scan (ones + 1) (v lsl 1) else ones
  in
  scan 0 n
(* [to_unichar s ~pos] decodes the character that starts at [s.[!pos]] and
   leaves [pos] just after it.  No validation is performed: continuation
   bytes are not checked and any bit pattern is accepted.  Reading past the
   end of [s] raises the usual out-of-bounds [Invalid_argument]. *)
let to_unichar s ~pos : unichar =
  (* Read the byte under the cursor, then step over it. *)
  let take () =
    let b = Char.code s.[!pos] in
    incr pos;
    b
  in
  let lead = take () in
  match hi_bits lead with
  | 0 -> lead
  | n ->
    (* Keep only the payload bits of the lead byte (below the length marker). *)
    let acc = ref (lead land (1 lsl (7-n) - 1)) in
    for _i = 1 to n - 1 do
      acc := (!acc lsl 6) + (take () land 0x3f)
    done;
    !acc
(* [first_char s] decodes, without validation, the character starting at
   offset 0 of [s]. *)
let first_char s =
  let pos = ref 0 in
  to_unichar s ~pos
(* [validate c] tells whether [c] is acceptable as a character value:
   - it lies in the range 0 .. 0x10FFFF (negative integers are rejected;
     they previously slipped through every test below),
   - it is not in the surrogate block 0xD800 .. 0xDFFF,
   - it is not in 0xFDD0 .. 0xFDEF,
   - its low 16 bits are neither 0xFFFE nor 0xFFFF. *)
let validate c =
  c >= 0 && c < 0x110000
  && (c land 0x7FFFF800) <> 0xD800
  && (c < 0xFDD0 || c > 0xFDEF)
  && (c land 0xFFFE) <> 0xFFFE
(* [to_unichar_validated s ~pos] decodes the character starting at
   [s.[!pos]], checking that it is well formed, and leaves [pos] just after
   it on success.

   Raises [Error (ILLEGAL_SEQUENCE, _)] when the lead byte is a continuation
   byte or announces more than 6 bytes, when a following byte is not of the
   form 10xxxxxx, when the value could have been encoded in fewer bytes, or
   when [validate] rejects the decoded value.
   Raises [Error (PARTIAL_INPUT, _)] when [s] ends before the announced number
   of bytes.
   On failure [pos] has already moved past the bytes consumed so far. *)
let to_unichar_validated s ~pos : unichar =
  let c = Char.code s.[!pos] in
  incr pos;
  let n = hi_bits c in
  if n = 0 then c
  else begin
    if n = 1 || n > 6 then raise_bad_utf8 ();
    (* [pos] is already past the lead byte, so only [n - 1] more bytes are
       required.  Requiring [n] rejected a valid multi-byte character sitting
       at the very end of the string. *)
    if !pos + (n - 1) > String.length s then
      raise (Error (PARTIAL_INPUT, "partial UTF-8 character"));
    (* Payload bits of the lead byte, below its length marker. *)
    let u = ref (c land (1 lsl (7 - n) - 1)) in
    for _i = 1 to n - 1 do
      let c = Char.code s.[!pos] in
      if c lsr 6 <> 0b10 then raise_bad_utf8 ();
      u := (!u lsl 6) + (c land 0x3f);
      incr pos
    done;
    let v = !u in
    (* Reject overlong encodings and values refused by [validate]. *)
    if utf8_storage_len v <> n || not (validate v) then raise_bad_utf8 ();
    v
  end
(* [end_of_char s ~pos] skips the continuation bytes (10xxxxxx) found from
   offset [pos] on and returns the offset of the first byte that is not one,
   or [String.length s] when the string ends first.

   Reaching the end of the string used to raise an out-of-bounds
   [Invalid_argument], which made scanning fail on input that ends with
   continuation bytes.  An offset beyond the end of [s] (or a negative one)
   still raises. *)
let rec end_of_char s ~pos =
  if pos = String.length s then pos
  else if (Char.code s.[pos]) land 0xc0 = 0x80 then
    end_of_char s ~pos:(pos + 1)
  else pos
(* [next s ~pos] is the offset of the character following the one at [pos].
   The step is taken from the lead byte alone: one byte for a 7-bit value,
   the announced length for a multi-byte lead, and for a stray continuation
   byte everything up to the next non-continuation byte.  The result may
   exceed [String.length s] when the last sequence is truncated. *)
let next s ~pos =
  match hi_bits (Char.code s.[pos]) with
  | 0 -> pos + 1
  | 1 -> end_of_char s ~pos:(pos + 1)
  | n -> pos + n
(* [length s] is the number of characters in [s], obtained by hopping from
   character to character with [next] until the end of the string is reached
   or passed.  The bytes are not validated. *)
let length s =
  let limit = String.length s in
  let count = ref 0 in
  let pos = ref 0 in
  while !pos < limit do
    pos := next s ~pos:!pos;
    incr count
  done;
  !count
(* [to_unistring s] decodes [s] into an array holding one cell per character.
   The array is sized with [length s] and filled left to right with
   [to_unichar], so no validation takes place. *)
let to_unistring s : unistring =
  let count = length s in
  let out = Array.make count 0 in
  let pos = ref 0 in
  let rec fill i =
    if i < count then begin
      out.(i) <- to_unichar s ~pos;
      fill (i + 1)
    end
  in
  fill 0;
  out