1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
open Piece_types
let create_offsets utf8_pos utf16_pos utf32_pos line_num =
{ utf8_pos; utf16_pos; utf32_pos; line_num }
(** Given the first byte of a UTF-8 code point, returns the length of that character in UTF-8. *)
let utf8_length chr =
match chr with
| '\x00' .. '\x7f' -> 1
| '\xc0' .. '\xdf' -> 2
| '\xe0' .. '\xef' -> 3
| '\xf0' .. '\xf7' -> 4
| _ -> failwith "invalid utf-8 start"
(**
Given the first byte of a UTF-8 code point, returns the length of that character in UTF-16.
This is almost identical to the utf8_length function above and that function can support a tuple,
but the additional allocation a tuple has bothered me.
*)
let utf16_length chr =
match chr with
| '\x00' .. '\xef' -> 1
| '\xf0' .. '\xf7' -> 2
| _ -> failwith "invalid utf-8 start"
(** Counts the length of the string in UTF-16 and Unicode code points,
and builds an array of line breaks in terms of UTF-8 as that is OCaml's native string encoding. *)
let count_string_stats (str : string) (buffer_length : int) =
let rec get utf8_pos utf16_cntr utf32_cntr line_breaks prev_is_cr =
if utf8_pos >= String.length str then
(utf16_cntr, utf32_cntr, line_breaks |> List.rev |> Array.of_list)
else
let chr = String.unsafe_get str utf8_pos in
let utf8_length = utf8_length chr in
let utf16_length = utf16_length chr in
let line_breaks =
if chr = '\r' || (chr = '\n' && not prev_is_cr) then
(utf32_cntr + buffer_length) :: line_breaks
else line_breaks
in
get (utf8_pos + utf8_length)
(utf16_cntr + utf16_length)
(utf32_cntr + 1) line_breaks (chr = '\r')
in
get 0 0 0 [] false
let utf32_sub (str : string) (start : int) (length : int) =
let finish = start + length in
let rec sub str_pos cd_pos str_start str_finish =
if str_pos = String.length str then
String.sub str str_start (String.length str - str_start)
else
let str_start = if cd_pos = start then str_pos else str_start in
if cd_pos = finish then String.sub str str_start (str_pos - str_start)
else
let utf8_length = utf8_length (String.unsafe_get str str_pos) in
sub (str_pos + utf8_length) (cd_pos + 1) str_start str_finish
in
sub 0 0 0 0
let count_to_utf32 (str : string) (count_to : int) =
let rec cnt utf8_pos utf16_pos utf32_pos line_count prev_is_cr =
if utf32_pos = count_to then
create_offsets utf8_pos utf16_pos utf32_pos line_count
else
let chr = String.unsafe_get str utf8_pos in
let u8_length = utf8_length chr in
let u16_length = utf16_length chr in
let nextUtf8 = utf8_pos + u8_length in
let nextUtf16 = utf16_pos + u16_length in
let nextUtf32 = utf32_pos + 1 in
let line_count =
if chr = '\r' || (chr = '\n' && not prev_is_cr) then line_count + 1
else line_count
in
cnt nextUtf8 nextUtf16 nextUtf32 line_count (chr = '\r')
in
cnt 0 0 0 0 false
let count_to_utf16 (str : string) (count_to : int) =
let rec cnt utf8_pos utf16_pos utf32_pos line_count prev_is_cr =
if utf16_pos = count_to then
create_offsets utf8_pos utf16_pos utf32_pos line_count
else
let chr = String.unsafe_get str utf8_pos in
let u8_length = utf8_length chr in
let u16_length = utf16_length chr in
let next_u8 = utf8_pos + u8_length in
let next_u16 = utf16_pos + u16_length in
let next_u32 = utf32_pos + 1 in
let next_line_count =
if chr = '\r' || (chr = '\n' && not prev_is_cr) then line_count + 1
else line_count
in
if next_u16 > count_to then
create_offsets utf8_pos utf16_pos utf32_pos line_count
else cnt next_u8 next_u16 next_u32 next_line_count (chr = '\r')
in
cnt 0 0 0 0 false
let count_to_utf8 (str : string) (count_to : int) =
let rec cnt utf8_pos utf16_pos utf32_pos line_count prev_is_cr =
if utf8_pos = count_to then
create_offsets utf8_pos utf16_pos utf32_pos line_count
else
let chr = String.unsafe_get str utf8_pos in
let u8_length = utf8_length chr in
let u16_length = utf16_length chr in
let next_u8 = utf8_pos + u8_length in
let next_u16 = utf16_pos + u16_length in
let next_u32 = utf32_pos + 1 in
let next_line_count =
if chr = '\r' || (chr = '\n' && not prev_is_cr) then line_count + 1
else line_count
in
if next_u8 > count_to then
create_offsets utf8_pos utf16_pos utf32_pos line_count
else cnt next_u8 next_u16 next_u32 next_line_count (chr = '\r')
in
cnt 0 0 0 0 false
let count_to (str : string) (count_towards : int) (enc : encoding) =
match enc with
| Utf8 -> count_to_utf8 str count_towards
| Utf16 -> count_to_utf16 str count_towards
| Utf32 -> count_to_utf32 str count_towards