Source file rosetta.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
(** Encodings decoded through [Yuscii]; the only member is [`UTF_7]. *)
type yuscii_encoding = [`UTF_7]

(** Every encoding accepted by this module: the union of
    {!yuscii_encoding}, [Uuuu.encoding] and [Coin.encoding]. *)
type encoding = [yuscii_encoding | Uuuu.encoding | Coin.encoding]

(** [invalid_arg fmt arg1 ... argN] formats its arguments according to [fmt]
    and raises [Invalid_argument] carrying the resulting string. It shadows
    the standard [invalid_arg], which only accepts a plain string.
    @raise Invalid_argument always. *)
let invalid_arg fmt = Format.ksprintf invalid_arg fmt

(** [try_or f g x] returns [f x]; when [f x] raises, it falls back to [g x].

    [Out_of_memory], [Stack_overflow] and [Sys.Break] are never a reason to
    fall back: they are re-raised unchanged, so that resource exhaustion or a
    user interrupt is not silently turned into a call to [g]. Any exception
    raised by [g x] itself propagates to the caller. *)
let try_or f g x =
  try f x with
  | (Out_of_memory | Stack_overflow | Sys.Break) as fatal -> raise fatal
  | _ -> g x

(** [f <.> g] is the composition of [f] after [g]: [(f <.> g) x] applies [g]
    to [x] first, then [f] to the result. *)
let ( <.> ) f g = fun x -> g x |> f

(** [cast v] widens [v] to the full {!encoding} type. It is a pure type
    coercion: the value itself is returned unchanged. *)
let cast = fun v -> (v :> encoding)

(** [yuscii_encoding_of_string name] recognises exactly the names ["UTF-7"]
    and ["csUTF7"] (the comparison is case-sensitive) and returns [`UTF_7].
    @raise Invalid_argument for any other string, with the offending name in
    the message. *)
let yuscii_encoding_of_string name =
  match name with
  | "UTF-7" | "csUTF7" -> `UTF_7
  | unknown -> invalid_arg "Invalid_character-sets: %s" unknown

(** [encoding_of_string name] turns a character-set name into an
    {!encoding}. The parsers are tried in this order, each through
    [try_or]: {!yuscii_encoding_of_string}, [Uuuu.encoding_of_string], then
    [Coin.encoding_of_string]. When a parser raises, the next one is tried
    with the same string.
    @raise Invalid_argument when none of the three parsers accepts [name]. *)
let encoding_of_string =
  (* Last resort: nobody recognised the name. *)
  let reject = invalid_arg "Invalid_character-sets: %s" in
  let from_coin = try_or (cast <.> Coin.encoding_of_string) reject in
  let from_uuuu = try_or (cast <.> Uuuu.encoding_of_string) from_coin in
  try_or (cast <.> yuscii_encoding_of_string) from_uuuu

(** [encoding_to_string enc] is the name of [enc]. [Uuuu] and [Coin]
    encodings are delegated to the [encoding_to_string] of their own
    library; [`UTF_7] is rendered as ["UTF-7"]. *)
let encoding_to_string enc =
  match enc with
  | #Uuuu.encoding as e -> Uuuu.encoding_to_string e
  | #Coin.encoding as e -> Coin.encoding_to_string e
  | `UTF_7 -> "UTF-7"

(** Witness linking a family of encodings (first parameter) to the type of
    the decoder that handles it (second parameter). Matching on a tag tells
    the type checker which concrete decoder type is stored next to it. *)
type ('kind, 'decoder) tag =
  | UTF_7 : ([> yuscii_encoding], Yuscii.decoder) tag
      (** The associated decoder is a [Yuscii.decoder]. *)
  | ISO8859 : ([> Uuuu.encoding], 'kind Uuuu.decoder) tag
      (** The associated decoder is a [Uuuu.decoder]. *)
  | KOI8 : ([> Coin.encoding], 'kind Coin.decoder) tag
      (** The associated decoder is a [Coin.decoder]. *)

(** Existential package: a {!tag} together with a decoder of the type that
    tag designates. The decoder type is hidden by the constructor and is
    recovered by matching on the tag.

    NOTE(review): the unboxed attribute is written with a single [@] after
    the result type, and [V] carries two arguments; presumably the attribute
    has no effect on the representation. Confirm before relying on it. *)
type 'kind pack =
  | V : ('kind, 'decoder) tag * 'decoder -> 'kind pack [@unboxed]

(** Input source of a decoder: an input channel, a string, or [`Manual].
    The value is stored in the decoder record and also handed to the
    underlying [Yuscii], [Uuuu] or [Coin] decoder when it is created.

    NOTE(review): [`Manual] presumably means that the caller supplies the
    bytes itself through the [src] function. Confirm against the underlying
    libraries. *)
type src = [`Channel of in_channel | `String of string | `Manual]

(** Shape of a decoding result, as matched in [String.fold]: a decoded
    character ([`Uchar]), a malformed input described by a string
    ([`Malformed]), the end of the input ([`End]), or [`Await].

    NOTE(review): [`Await] presumably asks for more input on a [`Manual]
    source. Confirm against the underlying libraries. *)
type decode = [`Await | `End | `Uchar of Uchar.t | `Malformed of string]

(** A decoder: the source it was created with ([src]) and the packed
    underlying decoder ([pack]). The parameter ['kind] is constrained to be
    a subset of {!encoding}. *)
type 'kind decoder =
  {src: src; pack: 'kind pack}
  constraint 'kind = [< encoding]

(** [src t buf off len] forwards [buf], [off] and [len] to the [src]
    function of the library backing [t]: [Yuscii.src], [Uuuu.src] or
    [Coin.src], chosen by the tag stored in [t]. *)
let src {pack= V (tag, d); _} buf off len =
  match tag with
  | UTF_7 -> Yuscii.src d buf off len
  | ISO8859 -> Uuuu.src d buf off len
  | KOI8 -> Coin.src d buf off len

(** [decode t] performs one decoding step by calling the [decode] function
    of the library backing [t] ([Yuscii.decode], [Uuuu.decode] or
    [Coin.decode]) and returns its result as is. *)
let decode {pack= V (tag, d); _} =
  match tag with
  | UTF_7 -> Yuscii.decode d
  | ISO8859 -> Uuuu.decode d
  | KOI8 -> Coin.decode d

(** [decoder encoding source] creates a decoder for [encoding] reading from
    [source]. The underlying decoder comes from [Uuuu], [Coin] or [Yuscii]
    depending on which family [encoding] belongs to, and is stored together
    with the tag identifying that family. *)
let decoder : ([< encoding] as 'kind) -> src -> 'kind decoder =
 fun encoding source ->
  (* Build the tagged underlying decoder first, then wrap it once. *)
  let pack =
    match encoding with
    | #Uuuu.encoding as e -> V (ISO8859, Uuuu.decoder e source)
    | #Coin.encoding as e -> V (KOI8, Coin.decoder e source)
    | #yuscii_encoding -> V (UTF_7, Yuscii.decoder source)
  in
  {src= source; pack}

(** [decoder_byte_count t] is the byte count reported by the library backing
    [t] ([Yuscii.decoder_byte_count], [Uuuu.decoder_byte_count] or
    [Coin.decoder_byte_count]). *)
let decoder_byte_count {pack= V (tag, d); _} =
  match tag with
  | UTF_7 -> Yuscii.decoder_byte_count d
  | ISO8859 -> Uuuu.decoder_byte_count d
  | KOI8 -> Coin.decoder_byte_count d

(** [decoder_src t] is the source [t] was created with. *)
let decoder_src t = t.src

(** [decoder_kind t] is the encoding decoded by [t]: [`UTF_7] for a
    [Yuscii]-backed decoder, otherwise the kind reported by
    [Uuuu.decoder_kind] or [Coin.decoder_kind], widened to {!encoding}. *)
let decoder_kind {pack= V (tag, d); _} =
  match tag with
  | UTF_7 -> `UTF_7
  | ISO8859 -> (Uuuu.decoder_kind d :> encoding)
  | KOI8 -> (Coin.decoder_kind d :> encoding)

module String = struct
  (** Callback accepted by {!fold}. It receives the accumulator so far, an
      integer position and either a decoded character or a malformed-input
      report, and returns the new accumulator. *)
  type 'a folder =
    'a -> int -> [`Malformed of string | `Uchar of Uchar.t] -> 'a

  (** [fold kind ?off ?len folder acc str] decodes a slice of [str] as
      [kind] and folds [folder] over every decoding result, starting from
      [acc]. It returns the final accumulator once the decoder reports
      [`End].

      [off] defaults to [0] and [len] to [String.length str - off].

      The integer handed to [folder] is [decoder_byte_count] read right
      after the step that produced the item. The decoder is fed
      [String.sub str off len], so that count is relative to the selected
      slice, not to [str].

      @raise Invalid_argument if [off] and [len] do not designate a valid
      substring of [str] (raised by [String.sub]). *)
  let fold kind ?off ?len folder acc str =
    (* [String] here is still the standard library module: the module being
       defined is not in scope inside its own body. *)
    let off = match off with Some o -> o | None -> 0 in
    let len = match len with Some l -> l | None -> String.length str - off in
    let dec = decoder kind (`String (String.sub str off len)) in
    let rec loop acc =
      match decode dec with
      | (`Uchar _ | `Malformed _) as item ->
          loop (folder acc (decoder_byte_count dec) item)
      | `End -> acc
      (* Presumably never returned for a [`String] source. *)
      | `Await -> assert false
    in
    loop acc
end