1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
module Int = struct type t = int let compare : int -> int -> int = compare end
module Imap = Map.Make(Int)
module Iset = Set.Make(Int)
let latin_uchar_to_base_map =
let add map (k, v) = Imap.add k v map in
let map1 = List.fold_left add Imap.empty Ubase_data.latin_uchar_to_base_alist in
List.fold_left add map1 Ubase_custom.misc_to_ascii_alist
let uchar_to_string u =
let x = Uchar.to_int u in
if x <= 126 then Char.chr x |> String.make 1
else Imap.find (Uchar.to_int u) latin_uchar_to_base_map
let uchar_replacement u =
Imap.find_opt (Uchar.to_int u) latin_uchar_to_base_map
let string_to_char ?(unknown='?') s =
if String.length s > 2 then unknown
else s.[0]
let uchar_to_char ?(unknown='?') u =
string_to_char ~unknown (uchar_to_string u)
let from_utf8 ?(malformed="?") ?strip s =
let len = String.length s in
let b = Buffer.create len in
let strip = match strip with
| None -> Buffer.add_utf_8_uchar b
| Some strip -> fun _ -> Buffer.add_string b strip in
let rec iterator i =
if i < len then
let dec = String.get_utf_8_uchar s i in
let () = if Uchar.utf_decode_is_valid dec
then let u = Uchar.utf_decode_uchar dec in
if Uchar.to_int u <= 127
then Buffer.add_utf_8_uchar b u
else match uchar_replacement u with
| Some t -> Buffer.add_string b t
| None -> strip u
else
Buffer.add_string b malformed
in iterator (i + Uchar.utf_decode_length dec)
in
iterator 0;
Buffer.contents b
let from_utf8_string = from_utf8
let cp1252_to_utf8 ?(undefined=0xFFFD) s =
let len = String.length s in
let b = Buffer.create len in
for i = 0 to len - 1 do
let x = Char.code s.[i] in
let y =
if x <= 0x7F then x
else match Ubase_custom.cp1252_to_utf8_array.(x-0x80) with
| 0 -> undefined
| y -> y in
Buffer.add_utf_8_uchar b (Uchar.of_int y)
done;
Buffer.contents b
let isolatin_to_utf8 ?(control=0x0080) s =
let len = String.length s in
let b = Buffer.create len in
for i = 0 to len - 1 do
let x = Char.code s.[i] in
if x <= 0x7F then Buffer.add_char b (Char.chr x)
else let y = if x >= 0x80 && x <= 0x9F
then control
else Ubase_custom.cp1252_to_utf8_array.(x-0x80) in
Buffer.add_utf_8_uchar b (Uchar.of_int y)
done;
Buffer.contents b
let white_space_set = Iset.of_list Ubase_data.white_space
let is_space u =
Iset.mem (Uchar.to_int u) white_space_set