Source file `unicode.ml`

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
open Piece_types

let create_offsets utf8_pos utf16_pos utf32_pos line_num =
  { utf8_pos; utf16_pos; utf32_pos; line_num }

(** Given the first byte of a UTF-8 code point, returns the length of that character in UTF-8. *)
let utf8_length chr =
  match chr with
  | '\x00' .. '\x7f' -> 1
  | '\xc0' .. '\xdf' -> 2
  | '\xe0' .. '\xef' -> 3
  | '\xf0' .. '\xf7' -> 4
  | _ -> failwith "invalid utf-8 start"

(** 
    Given the first byte of a UTF-8 code point, returns the length of that character in UTF-16. 
    This is almost identical to the utf8_length function above and that function can support a tuple,
    but the additional allocation a tuple has bothered me.
 *)
let utf16_length chr =
  match chr with
  | '\x00' .. '\xef' -> 1
  | '\xf0' .. '\xf7' -> 2
  | _ -> failwith "invalid utf-8 start"

(** Counts the length of the string in UTF-16 and Unicode code points, 
    and builds an array of line breaks in terms of UTF-8 as that is OCaml's native string encoding. *)
let count_string_stats (str : string) (buffer_length : int) =
  let rec get utf8_pos utf16_cntr utf32_cntr line_breaks prev_is_cr =
    if utf8_pos >= String.length str then
      (utf16_cntr, utf32_cntr, line_breaks |> List.rev |> Array.of_list)
    else
      let chr = String.unsafe_get str utf8_pos in
      let utf8_length = utf8_length chr in
      let utf16_length = utf16_length chr in

      let line_breaks =
        if chr = '\r' || (chr = '\n' && not prev_is_cr) then
          (utf32_cntr + buffer_length) :: line_breaks
        else line_breaks
      in
      get (utf8_pos + utf8_length)
        (utf16_cntr + utf16_length)
        (utf32_cntr + 1) line_breaks (chr = '\r')
  in
  get 0 0 0 [] false

let utf32_sub (str : string) (start : int) (length : int) =
  let finish = start + length in
  let rec sub str_pos cd_pos str_start str_finish =
    if str_pos = String.length str then
      String.sub str str_start (String.length str - str_start)
    else
      let str_start = if cd_pos = start then str_pos else str_start in
      if cd_pos = finish then String.sub str str_start (str_pos - str_start)
      else
        let utf8_length = utf8_length (String.unsafe_get str str_pos) in
        sub (str_pos + utf8_length) (cd_pos + 1) str_start str_finish
  in
  sub 0 0 0 0

(*
    The count_to functions duplicate some code intentionally.
    A previous generic version existed,
    but I was bothered by additional if-statements on each recursion
    to handle UTF-8/16 cases when the specified index is inside a code point.
*)
let count_to_utf32 (str : string) (count_to : int) =
  let rec cnt utf8_pos utf16_pos utf32_pos line_count prev_is_cr =
    if utf32_pos = count_to then
      create_offsets utf8_pos utf16_pos utf32_pos line_count
    else
      let chr = String.unsafe_get str utf8_pos in
      let u8_length = utf8_length chr in
      let u16_length = utf16_length chr in

      let nextUtf8 = utf8_pos + u8_length in
      let nextUtf16 = utf16_pos + u16_length in
      let nextUtf32 = utf32_pos + 1 in
      let line_count =
        if chr = '\r' || (chr = '\n' && not prev_is_cr) then line_count + 1
        else line_count
      in
      cnt nextUtf8 nextUtf16 nextUtf32 line_count (chr = '\r')
  in
  cnt 0 0 0 0 false

let count_to_utf16 (str : string) (count_to : int) =
  let rec cnt utf8_pos utf16_pos utf32_pos line_count prev_is_cr =
    if utf16_pos = count_to then
      create_offsets utf8_pos utf16_pos utf32_pos line_count
    else
      let chr = String.unsafe_get str utf8_pos in
      let u8_length = utf8_length chr in
      let u16_length = utf16_length chr in

      let next_u8 = utf8_pos + u8_length in
      let next_u16 = utf16_pos + u16_length in
      let next_u32 = utf32_pos + 1 in
      let next_line_count =
        if chr = '\r' || (chr = '\n' && not prev_is_cr) then line_count + 1
        else line_count
      in
      if next_u16 > count_to then
        create_offsets utf8_pos utf16_pos utf32_pos line_count
      else cnt next_u8 next_u16 next_u32 next_line_count (chr = '\r')
  in
  cnt 0 0 0 0 false

let count_to_utf8 (str : string) (count_to : int) =
  let rec cnt utf8_pos utf16_pos utf32_pos line_count prev_is_cr =
    if utf8_pos = count_to then
      create_offsets utf8_pos utf16_pos utf32_pos line_count
    else
      let chr = String.unsafe_get str utf8_pos in
      let u8_length = utf8_length chr in
      let u16_length = utf16_length chr in

      let next_u8 = utf8_pos + u8_length in
      let next_u16 = utf16_pos + u16_length in
      let next_u32 = utf32_pos + 1 in
      let next_line_count =
        if chr = '\r' || (chr = '\n' && not prev_is_cr) then line_count + 1
        else line_count
      in
      if next_u8 > count_to then
        create_offsets utf8_pos utf16_pos utf32_pos line_count
      else cnt next_u8 next_u16 next_u32 next_line_count (chr = '\r')
  in
  cnt 0 0 0 0 false

let count_to (str : string) (count_towards : int) (enc : encoding) =
  match enc with
  | Utf8 -> count_to_utf8 str count_towards
  | Utf16 -> count_to_utf16 str count_towards
  | Utf32 -> count_to_utf32 str count_towards