Source file metadataCharEncoding.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
type recode =
?source:[ `ISO_8859_1 | `UTF_8 | `UTF_16 | `UTF_16LE | `UTF_16BE ] ->
?target:[ `UTF_8 | `UTF_16 | `UTF_16LE | `UTF_16BE ] ->
string ->
string
module type T = sig
val convert :
?source:[ `ISO_8859_1 | `UTF_8 | `UTF_16 | `UTF_16LE | `UTF_16BE ] ->
?target:[ `UTF_8 | `UTF_16 | `UTF_16LE | `UTF_16BE ] ->
string ->
string
end
module Naive : T = struct
let convert ?(source = `UTF_8) ?(target = `UTF_8) s =
match (source, target) with
| `UTF_8, `UTF_8
| `UTF_16, `UTF_16
| `UTF_16LE, `UTF_16LE
| `UTF_16BE, `UTF_16BE
| _ ->
let buf = Buffer.create 10 in
let len = String.length s in
let add_unicode_char =
match target with
| `UTF_8 -> Buffer.add_utf_8_uchar
| `UTF_16BE -> Buffer.add_utf_16be_uchar
| _ -> Buffer.add_utf_16le_uchar
in
let unicode_byte_length =
match source with
| `ISO_8859_1 -> fun _ -> 1
| `UTF_8 -> Uchar.utf_8_byte_length
| _ -> Uchar.utf_16_byte_length
in
let s, get_unicode_char =
match source with
| `ISO_8859_1 -> (s, fun s pos -> Uchar.of_char s.[pos])
| `UTF_8 ->
( s,
fun s pos ->
Uchar.utf_decode_uchar (String.get_utf_8_uchar s pos) )
| `UTF_16BE ->
( s,
fun s pos ->
Uchar.utf_decode_uchar (String.get_utf_16be_uchar s pos)
)
| `UTF_16LE ->
( s,
fun s pos ->
Uchar.utf_decode_uchar (String.get_utf_16le_uchar s pos)
)
| `UTF_16 ->
let default =
( "",
fun s pos ->
Uchar.utf_decode_uchar (String.get_utf_16be_uchar s pos)
)
in
if len < 2 then default
else (
let rem = String.sub s 2 (len - 2) in
match (s.[0], s.[1]) with
| '\xfe', '\xff' ->
( rem,
fun s pos ->
Uchar.utf_decode_uchar
(String.get_utf_16be_uchar s pos) )
| '\xff', '\xfe' ->
( rem,
fun s pos ->
Uchar.utf_decode_uchar
(String.get_utf_16le_uchar s pos) )
| _ -> default)
in
if target = `UTF_16 then add_unicode_char buf Uchar.bom;
let len = String.length s in
let rec f pos =
if pos = len then Buffer.contents buf
else (
let c = get_unicode_char s pos in
add_unicode_char buf c;
f (pos + unicode_byte_length c))
in
f 0
end