Source file ocaml_util.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
# 1 "ocaml_util.cppo.ml"
let print_loc ppf loc =
Location.print_loc ppf loc
let print_error loc f ppf x =
# 17 "ocaml_util.cppo.ml"
let error = Location.error_of_printer ~loc f x in
Location.print_report ppf error
# 265 "ocaml_util.cppo.ml"
(** {1 Minimal support for Unicode characters in identifiers} *)
module Utf8_lexeme = struct
type t = string
type case = Upper of Uchar.t | Lower of Uchar.t
let known_chars : (Uchar.t, case) Hashtbl.t = Hashtbl.create 32
let _ =
List.iter
(fun (upper, lower) ->
let upper = Uchar.of_int upper and lower = Uchar.of_int lower in
Hashtbl.add known_chars upper (Upper lower);
Hashtbl.add known_chars lower (Lower upper))
[
(0xc0, 0xe0); (0xc1, 0xe1);
(0xc2, 0xe2); (0xc3, 0xe3);
(0xc4, 0xe4); (0xc5, 0xe5);
(0xc6, 0xe6); (0xc7, 0xe7);
(0xc8, 0xe8); (0xc9, 0xe9);
(0xca, 0xea); (0xcb, 0xeb);
(0xcc, 0xec); (0xcd, 0xed);
(0xce, 0xee); (0xcf, 0xef);
(0xd0, 0xf0); (0xd1, 0xf1);
(0xd2, 0xf2); (0xd3, 0xf3);
(0xd4, 0xf4); (0xd5, 0xf5);
(0xd6, 0xf6); (0xd8, 0xf8);
(0xd9, 0xf9); (0xda, 0xfa);
(0xdb, 0xfb); (0xdc, 0xfc);
(0xdd, 0xfd); (0xde, 0xfe);
(0x160, 0x161); (0x17d, 0x17e);
(0x152, 0x153); (0x178, 0xff);
(0x1e9e, 0xdf);
]
let known_pairs : (Uchar.t * Uchar.t, Uchar.t) Hashtbl.t = Hashtbl.create 32
let _ =
List.iter
(fun (c1, n2, n) ->
Hashtbl.add known_pairs
(Uchar.of_char c1, Uchar.of_int n2) (Uchar.of_int n))
[
('A', 0x300, 0xc0); ('A', 0x301, 0xc1);
('A', 0x302, 0xc2); ('A', 0x303, 0xc3);
('A', 0x308, 0xc4); ('A', 0x30a, 0xc5);
('C', 0x327, 0xc7); ('E', 0x300, 0xc8);
('E', 0x301, 0xc9); ('E', 0x302, 0xca);
('E', 0x308, 0xcb); ('I', 0x300, 0xcc);
('I', 0x301, 0xcd); ('I', 0x302, 0xce);
('I', 0x308, 0xcf); ('N', 0x303, 0xd1);
('O', 0x300, 0xd2); ('O', 0x301, 0xd3);
('O', 0x302, 0xd4); ('O', 0x303, 0xd5);
('O', 0x308, 0xd6);
('U', 0x300, 0xd9); ('U', 0x301, 0xda);
('U', 0x302, 0xdb); ('U', 0x308, 0xdc);
('Y', 0x301, 0xdd); ('Y', 0x308, 0x178);
('S', 0x30c, 0x160); ('Z', 0x30c, 0x17d);
('a', 0x300, 0xe0); ('a', 0x301, 0xe1);
('a', 0x302, 0xe2); ('a', 0x303, 0xe3);
('a', 0x308, 0xe4); ('a', 0x30a, 0xe5);
('c', 0x327, 0xe7); ('e', 0x300, 0xe8);
('e', 0x301, 0xe9); ('e', 0x302, 0xea);
('e', 0x308, 0xeb); ('i', 0x300, 0xec);
('i', 0x301, 0xed); ('i', 0x302, 0xee);
('i', 0x308, 0xef); ('n', 0x303, 0xf1);
('o', 0x300, 0xf2); ('o', 0x301, 0xf3);
('o', 0x302, 0xf4); ('o', 0x303, 0xf5);
('o', 0x308, 0xf6);
('u', 0x300, 0xf9); ('u', 0x301, 0xfa);
('u', 0x302, 0xfb); ('u', 0x308, 0xfc);
('y', 0x301, 0xfd); ('y', 0x308, 0xff);
('s', 0x30c, 0x161); ('z', 0x30c, 0x17e);
]
let normalize_generic ~keep_ascii transform s =
let rec norm check buf prev i =
if i >= String.length s then begin
Buffer.add_utf_8_uchar buf (transform prev)
end else begin
let d = String.get_utf_8_uchar s i in
let u = Uchar.utf_decode_uchar d in
check d u;
let i' = i + Uchar.utf_decode_length d in
match Hashtbl.find_opt known_pairs (prev, u) with
| Some u' ->
norm check buf u' i'
| None ->
Buffer.add_utf_8_uchar buf (transform prev);
norm check buf u i'
end in
let ascii_limit = 128 in
if s = ""
|| keep_ascii && String.for_all (fun x -> Char.code x < ascii_limit) s
then Ok s
else
let buf = Buffer.create (String.length s) in
let valid = ref true in
let check d u =
valid := !valid && Uchar.utf_decode_is_valid d && u <> Uchar.rep
in
let d = String.get_utf_8_uchar s 0 in
let u = Uchar.utf_decode_uchar d in
check d u;
norm check buf u (Uchar.utf_decode_length d);
let contents = Buffer.contents buf in
if !valid then
Ok contents
else
Error contents
let normalize s =
normalize_generic ~keep_ascii:true (fun u -> u) s
let uchar_is_uppercase u =
let c = Uchar.to_int u in
if c < 0x80 then c >= 65 && c <= 90 else
match Hashtbl.find_opt known_chars u with
| Some(Upper _) -> true
| _ -> false
let uchar_lowercase u =
let c = Uchar.to_int u in
if c < 0x80 then
if c >= 65 && c <= 90 then Uchar.of_int (c + 32) else u
else
match Hashtbl.find_opt known_chars u with
| Some(Upper u') -> u'
| _ -> u
let uchar_uppercase u =
let c = Uchar.to_int u in
if c < 0x80 then
if c >= 97 && c <= 122 then Uchar.of_int (c - 32) else u
else
match Hashtbl.find_opt known_chars u with
| Some(Lower u') -> u'
| _ -> u
let capitalize s =
let first = ref true in
normalize_generic ~keep_ascii:false
(fun u -> if !first then (first := false; uchar_uppercase u) else u)
s
let uncapitalize s =
let first = ref true in
normalize_generic ~keep_ascii:false
(fun u -> if !first then (first := false; uchar_lowercase u) else u)
s
let is_capitalized s =
s <> "" &&
uchar_is_uppercase (Uchar.utf_decode_uchar (String.get_utf_8_uchar s 0))
let uchar_valid_in_identifier ~with_dot u =
let c = Uchar.to_int u in
if c < 0x80 then
c >= 97 && c <= 122
|| c >= 65 && c <= 90
|| c >= 48 && c <= 57
|| c = 95
|| c = 39
|| (with_dot && c = 46)
else
Hashtbl.mem known_chars u
let uchar_not_identifier_start u =
let c = Uchar.to_int u in
c >= 48 && c <= 57
|| c = 39
type validation_result =
| Valid
| Invalid_character of Uchar.t (** Character not allowed *)
| Invalid_beginning of Uchar.t (** Character not allowed as first char *)
let validate_identifier ?(with_dot=false) s =
let rec check i =
if i >= String.length s then Valid else begin
let d = String.get_utf_8_uchar s i in
let u = Uchar.utf_decode_uchar d in
let i' = i + Uchar.utf_decode_length d in
if not (uchar_valid_in_identifier ~with_dot u) then
Invalid_character u
else if i = 0 && uchar_not_identifier_start u then
Invalid_beginning u
else
check i'
end
in check 0
let is_valid_identifier s =
validate_identifier s = Valid
let starts_like_a_valid_identifier s =
s <> "" &&
(let u = Uchar.utf_decode_uchar (String.get_utf_8_uchar s 0) in
uchar_valid_in_identifier ~with_dot:false u
&& not (uchar_not_identifier_start u))
let is_lowercase s =
let rec is_lowercase_at len s n =
if n >= len then true
else
let d = String.get_utf_8_uchar s n in
let u = Uchar.utf_decode_uchar d in
(uchar_valid_in_identifier ~with_dot:false u)
&& not (uchar_is_uppercase u)
&& is_lowercase_at len s (n+Uchar.utf_decode_length d)
in
is_lowercase_at (String.length s) s 0
end