1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
exception UCS_value_out_of_range
exception UCS_value_prohibited_in_UTF8
exception UCS_value_prohibited_in_XML
exception UTF8_character_incomplete
exception UTF8_continuation_byte_invalid
exception UTF8_encoding_not_canonical
exception String_incomplete
let ( +++ ) = Int32.add
let ( --- ) = Int32.sub
let ( &&& ) = Int32.logand
let ( ||| ) = Int32.logor
let ( <<< ) = Int32.shift_left
let ( >>> ) = Int32.shift_right_logical
module UCS = struct
let min_value = 0x000000l
let max_value = 0x1fffffl
let is_non_character value = false
|| (0xfdd0l <= value && value <= 0xfdefl)
|| (Int32.logand 0xfffel value = 0xfffel)
let is_out_of_range value =
value < min_value || value > max_value
let is_surrogate value =
(0xd800l <= value && value <= 0xdfffl)
end
module XML = struct
let is_forbidden_control_character value = value < 0x20l
&& value <> 0x09l
&& value <> 0x0al
&& value <> 0x0dl
end
module type UCS_VALIDATOR = sig
val validate : int32 -> unit
end
module UTF8_UCS_validator : UCS_VALIDATOR = struct
let validate value =
if UCS.is_out_of_range value then raise UCS_value_out_of_range;
if UCS.is_non_character value then raise UCS_value_prohibited_in_UTF8;
if UCS.is_surrogate value then raise UCS_value_prohibited_in_UTF8
end
module XML_UTF8_UCS_validator : UCS_VALIDATOR = struct
let validate value =
UTF8_UCS_validator.validate value;
if XML.is_forbidden_control_character value
then raise UCS_value_prohibited_in_XML
end
module type CHARACTER_DECODER = sig
val decode_character : string -> int -> int32 * int
end
module type CHARACTER_ENCODER = sig
val encode_character : int32 -> string
end
module UTF8_CODEC (UCS_validator : UCS_VALIDATOR) = struct
let width_required_for_ucs_value value =
if value < 0x000080l then 1 else
if value < 0x000800l then 2 else
if value < 0x010000l then 3 else 4
let byte =
if byte land 0b10000000 = 0b00000000 then (byte , 1) else
if byte land 0b11100000 = 0b11000000 then (byte land 0b0011111, 2) else
if byte land 0b11110000 = 0b11100000 then (byte land 0b0001111, 3) else
if byte land 0b11111000 = 0b11110000 then (byte land 0b0000111, 4) else
raise UTF8_header_byte_invalid
let decode_continuation_byte byte =
if byte land 0b11000000 = 0b10000000 then byte land 0b00111111 else
raise UTF8_continuation_byte_invalid
let decode_character string index =
let value, width = decode_header_byte (Char.code string.[index]) in
let value = if width = 1 then (Int32.of_int value)
else begin
let value = ref (Int32.of_int value) in
for index = index + 1 to index + width - 1 do
let chunk = decode_continuation_byte (Char.code string.[index]) in
value := (!value <<< 6) ||| (Int32.of_int chunk)
done;
if width > (width_required_for_ucs_value !value)
then raise UTF8_encoding_not_canonical;
!value
end in
UCS_validator.validate value;
(value, width)
let width value =
match width with
| 1 -> value
| 2 -> value ||| 0b11000000l
| 3 -> value ||| 0b11100000l
| 4 -> value ||| 0b11110000l
| _ -> raise UCS_value_out_of_range
let encode_continuation_byte value =
((value &&& 0b00111111l) ||| 0b10000000l, value >>> 6)
let encode_character value =
UCS_validator.validate value;
let width = width_required_for_ucs_value value in
let b = Bytes.make width ' ' in
let rec encode_continuation_bytes remainder index =
if index = 0 then remainder else
let byte, remainder = encode_continuation_byte remainder in
Bytes.set b index @@ Char.chr (Int32.to_int byte);
encode_continuation_bytes remainder (index - 1) in
let remainder = encode_continuation_bytes value (width - 1) in
let byte = encode_header_byte width remainder in
Bytes.set b 0 @@ Char.chr (Int32.to_int byte);
Bytes.unsafe_to_string b
end
module UTF8_codec = UTF8_CODEC ( UTF8_UCS_validator)
module XML_UTF8_codec = UTF8_CODEC (XML_UTF8_UCS_validator)
module type STRING_VALIDATOR = sig
val is_valid : string -> bool
val validate : string -> unit
val longest_valid_prefix : string -> string
end
exception Validation_error of int * exn
module String_validator (Decoder : CHARACTER_DECODER) : STRING_VALIDATOR = struct
let validate string =
let index = ref 0 and length = String.length string in
begin try
while !index < length do
let _, width = Decoder.decode_character string !index in
index := !index + width
done;
with
| Invalid_argument _ -> raise String_incomplete
| error -> raise (Validation_error (!index, error))
end; assert (!index = length)
let is_valid string =
try validate string; true with _ -> false
let longest_valid_prefix string =
try validate string; string
with Validation_error (index, _) -> String.sub string 0 index
end
module UTF8 = String_validator ( UTF8_codec)
module UTF8_XML = String_validator (XML_UTF8_codec)