Source file metadataCharEncoding.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
(** Signature of a character-encoding converter. *)
module type T = sig
(** [convert ?source s] converts the string [s], interpreted in the [source]
    character encoding, and returns the converted string.

    The signature itself fixes neither the default for [source] nor the
    target encoding. The [Naive] implementation in this file defaults
    [source] to [`UTF_8] and emits UTF-8 for UTF-16 input; other
    implementations presumably do the same (to be confirmed). *)
val convert :
?source:[ `ISO_8859_1 | `UTF_8 | `UTF_16 | `UTF_16LE | `UTF_16BE ] ->
string ->
string
end
(** Dependency-free converter built only on the standard library.

    UTF-16 input is transcoded to UTF-8; every other source encoding is
    returned unchanged. *)
module Naive : T = struct
  (** [convert ?source s] returns [s] re-encoded as UTF-8.

      - [source] defaults to [`UTF_8].
      - [`UTF_16LE] / [`UTF_16BE]: [s] is decoded with the given byte order.
      - [`UTF_16]: decoding starts big-endian and switches whenever a byte
        order mark is met ([FE FF] selects big-endian, [FF FE] selects
        little-endian).
      - A byte order mark is recognised at any code-unit position, not only
        at the start of [s], and is never copied to the output. With an
        explicit [`UTF_16LE] / [`UTF_16BE] source it is skipped but does not
        override the requested byte order.
      - Malformed input (lone surrogate, truncated trailing unit) is replaced
        by U+FFFD, as produced by [Uchar.utf_decode_uchar] on an invalid
        decode.
      - [`UTF_8] and [`ISO_8859_1] are returned as is.
        NOTE(review): [`ISO_8859_1] input is therefore not transcoded, so
        bytes >= 0x80 are not valid UTF-8 in the result; confirm that
        callers expect this. *)
  let convert ?source s =
    match Option.value source ~default:`UTF_8 with
    | (`UTF_16 | `UTF_16LE | `UTF_16BE) as source ->
        let len = String.length s in
        let buf = Buffer.create len in
        (* True when the two bytes at [pos] are exactly [b0] [b1]. *)
        let is_bom pos b0 b1 =
          pos + 2 <= len && s.[pos] = b0 && s.[pos + 1] = b1
        in
        (* The decoder must be picked per code unit: for [`UTF_16] the byte
           order is only known once a BOM has been seen. *)
        let decoder endianness =
          match source with
          | `UTF_16LE -> String.get_utf_16le_uchar
          | `UTF_16BE -> String.get_utf_16be_uchar
          | `UTF_16 -> (
              match endianness with
              | `LE -> String.get_utf_16le_uchar
              | `BE -> String.get_utf_16be_uchar)
        in
        let rec f endianness pos =
          if pos >= len then Buffer.contents buf
          else if is_bom pos '\xfe' '\xff' then f `BE (pos + 2)
          else if is_bom pos '\xff' '\xfe' then f `LE (pos + 2)
          else begin
            let d = decoder endianness s pos in
            Buffer.add_utf_8_uchar buf (Uchar.utf_decode_uchar d);
            (* The decode length is at least 1 and never runs past the end
               of [s], so the loop always terminates. *)
            f endianness (pos + Uchar.utf_decode_length d)
          end
        in
        f `BE 0
    | `UTF_8 | `ISO_8859_1 -> s
end