1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
(* Mutable bidirectional token <-> index mapping. [size] is both the number
   of registered tokens and the next free index to hand out. The two tables
   are kept in lockstep by [Vocab.add]. *)
type vocab = {
token_to_idx : (string, int) Hashtbl.t;
idx_to_token : (int, string) Hashtbl.t;
mutable size : int;
}
(* How [tokenize] splits text: whitespace/punctuation-aware words, one token
   per Unicode scalar value, or every match of a Perl-style regex. *)
type tokenizer_method = [ `Words | `Chars | `Regex of string ]
(** [tokenize_words text] splits [text] into word tokens. A word is a maximal
    run of letters, digits, apostrophes or hyphens. Any other non-whitespace
    byte becomes its own single-character token; spaces, tabs, newlines and
    carriage returns are dropped. Tokens are returned in input order. *)
let tokenize_words text =
  let is_word_char = function
    | 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9' | '\'' | '-' -> true
    | _ -> false
  in
  let is_blank = function ' ' | '\t' | '\n' | '\r' -> true | _ -> false in
  let collected = ref [] in
  let current = Buffer.create 16 in
  (* Emit the word accumulated so far, if any. *)
  let flush_word () =
    if Buffer.length current > 0 then begin
      collected := Buffer.contents current :: !collected;
      Buffer.clear current
    end
  in
  String.iter
    (fun c ->
      if is_word_char c then Buffer.add_char current c
      else begin
        flush_word ();
        (* Non-word, non-blank bytes (punctuation etc.) are kept as
           one-character tokens. *)
        if not (is_blank c) then collected := String.make 1 c :: !collected
      end)
    text;
  flush_word ();
  List.rev !collected
(** [tokenize_chars text] splits [text] into one UTF-8-encoded string per
    Unicode scalar value. Malformed byte sequences are skipped silently. *)
let tokenize_chars text =
  let decoder = Uutf.decoder (`String text) in
  let rec collect acc =
    match Uutf.decode decoder with
    | `Uchar u ->
        (* Re-encode the scalar value as a standalone UTF-8 string. *)
        let b = Buffer.create 4 in
        Uutf.Buffer.add_utf_8 b u;
        collect (Buffer.contents b :: acc)
    | `Malformed _ -> collect acc
    | `End -> List.rev acc
    (* A `String source is fully available up front, so the decoder never
       asks for more input. *)
    | `Await -> assert false
  in
  collect []
(** [tokenize_regex pattern text] returns every match of the Perl-style
    [pattern] in [text], in order. Raises a structured error when [pattern]
    cannot be compiled. *)
let tokenize_regex pattern text =
  try
    let re = Re.Perl.re pattern |> Re.compile in
    Re.all re text |> List.map (fun g -> Re.Group.get g 0)
  with
  | Re.Perl.Parse_error ->
      Nx_core.Error.invalid ~op:"tokenize"
        ~what:(Printf.sprintf "regex pattern '%s'" pattern)
        ~reason:"invalid regex pattern" ()
  | Re.Perl.Not_supported ->
      (* Re.Perl also raises Not_supported for syntactically valid PCRE
         constructs it does not implement; surface it as a structured error
         instead of letting the raw exception escape to callers. *)
      Nx_core.Error.invalid ~op:"tokenize"
        ~what:(Printf.sprintf "regex pattern '%s'" pattern)
        ~reason:"unsupported regex feature" ()
module Vocab = struct
  type t = vocab

  (* Special tokens, registered by [create] in this order so their indices
     are fixed: pad = 0, unk = 1, bos = 2, eos = 3. *)
  let pad_token = "<pad>"
  let unk_token = "<unk>"
  let bos_token = "<bos>"
  let eos_token = "<eos>"

  (** [create ()] returns a vocabulary pre-populated with the four special
      tokens above. *)
  let create () =
    let v =
      {
        token_to_idx = Hashtbl.create 1024;
        idx_to_token = Hashtbl.create 1024;
        size = 0;
      }
    in
    List.iter
      (fun token ->
        Hashtbl.add v.token_to_idx token v.size;
        Hashtbl.add v.idx_to_token v.size token;
        v.size <- v.size + 1)
      [ pad_token; unk_token; bos_token; eos_token ];
    v

  (** [add t token] registers [token] under the next free index; a no-op when
      [token] is already present. *)
  let add t token =
    if not (Hashtbl.mem t.token_to_idx token) then (
      Hashtbl.add t.token_to_idx token t.size;
      Hashtbl.add t.idx_to_token t.size token;
      t.size <- t.size + 1)

  let add_batch t tokens = List.iter (add t) tokens
  let get_index t token = Hashtbl.find_opt t.token_to_idx token
  let get_token t idx = Hashtbl.find_opt t.idx_to_token idx
  let size t = t.size

  (* These lookups never raise: [create] always registers the specials. *)
  let pad_idx t = Hashtbl.find t.token_to_idx pad_token
  let unk_idx t = Hashtbl.find t.token_to_idx unk_token
  let bos_idx t = Hashtbl.find t.token_to_idx bos_token
  let eos_idx t = Hashtbl.find t.token_to_idx eos_token

  (** [from_tokens ?max_size ?min_freq tokens] builds a vocabulary from the
      most frequent tokens (at least [min_freq] occurrences), keeping at most
      [max_size] entries including the four special tokens. Raises a
      structured error when [min_freq < 1] or when the result would exceed
      [max_size] (i.e. [max_size < 4]). *)
  let from_tokens ?(max_size = max_int) ?(min_freq = 1) tokens =
    if min_freq < 1 then
      Nx_core.Error.invalid ~op:"vocab"
        ~what:(Printf.sprintf "min_freq %d" min_freq)
        ~reason:"must be >= 1" ();
    let freq_table = Hashtbl.create 1024 in
    List.iter
      (fun token ->
        let count =
          Option.value (Hashtbl.find_opt freq_table token) ~default:0
        in
        Hashtbl.replace freq_table token (count + 1))
      tokens;
    let vocab = create () in
    (* Most frequent first; ties broken alphabetically so index assignment is
       deterministic (Hashtbl.fold enumeration order is unspecified). *)
    let sorted_tokens =
      Hashtbl.fold (fun token count acc -> (token, count) :: acc) freq_table []
      |> List.filter (fun (_, count) -> count >= min_freq)
      |> List.sort (fun (t1, c1) (t2, c2) ->
             match Int.compare c2 c1 with
             | 0 -> String.compare t1 t2
             | n -> n)
      |> List.map fst
    in
    let rec add_tokens tokens remaining =
      match (tokens, remaining) with
      | _, 0 -> ()
      | [], _ -> ()
      | token :: rest, n ->
          add vocab token;
          add_tokens rest (n - 1)
    in
    (* The budget excludes the 4 special tokens; clamp at 0 so that a
       max_size < 4 adds nothing rather than counting past a negative budget
       and admitting every token before the size check below fires. *)
    add_tokens sorted_tokens (max 0 (max_size - 4));
    if size vocab > max_size then
      Nx_core.Error.invalid ~op:"vocab"
        ~what:(Printf.sprintf "vocab size %d" (size vocab))
        ~reason:(Printf.sprintf "exceeds maximum %d" max_size)
        ();
    vocab
end
(** [tokenize ?method_ text] splits [text] using the requested strategy
    (word-level by default). *)
let tokenize ?(method_ = `Words) text =
  let tokenizer =
    match method_ with
    | `Words -> tokenize_words
    | `Chars -> tokenize_chars
    | `Regex pattern -> tokenize_regex pattern
  in
  tokenizer text
(* Top-level aliases for the corresponding [Vocab] functions; [vocab] keeps
   the ?max_size/?min_freq optional arguments of [Vocab.from_tokens]. *)
let vocab = Vocab.from_tokens
let vocab_size = Vocab.size
(** [encode ?vocab text] tokenizes [text] with the default word tokenizer and
    maps each token to its vocabulary index; tokens absent from the
    vocabulary map to the [unk] index. When [vocab] is omitted, one is built
    from this text's own tokens (so nothing is unknown). *)
let encode ?vocab text =
  let tokens = tokenize text in
  let v =
    match vocab with Some v -> v | None -> Vocab.from_tokens tokens
  in
  let index_of token =
    match Vocab.get_index v token with
    | Some idx -> idx
    | None -> Vocab.unk_idx v
  in
  List.map index_of tokens
(** [encode_batch ?vocab ?max_len ?pad texts] encodes every text and packs
    the indices into a [batch_size x width] int32 tensor. [width] is
    [max_len] when [pad] is true, otherwise the longest encoded sequence
    (capped by [max_len]). Raises a structured error if any sequence exceeds
    the chosen width. *)
let encode_batch ?vocab ?(max_len = 512) ?(pad = true) texts =
(* Build one shared vocabulary from all texts when none is supplied. *)
let vocab =
match vocab with
| Some v -> v
| None ->
let all_tokens = List.concat_map tokenize texts in
Vocab.from_tokens all_tokens
in
let encoded = List.map (encode ~vocab) texts in
let batch_size = List.length texts in
(* Fixed width when padding; otherwise shrink to the longest row (still
   bounded above by [max_len]). *)
let actual_max_len =
if pad then max_len
else min max_len (List.fold_left max 0 (List.map List.length encoded))
in
(* Fail loudly rather than silently truncating over-long sequences. *)
List.iter
(fun seq ->
if List.length seq > actual_max_len then
Nx_core.Error.cannot ~op:"encode_batch" ~what:"encode sequence"
~from:(Printf.sprintf "length %d" (List.length seq))
~to_:(Printf.sprintf "max_length %d" actual_max_len)
~hint:"increase max_length or truncate input" ())
encoded;
let arr = Nx.zeros Nx.int32 [| batch_size; actual_max_len |] in
let pad_idx = Vocab.pad_idx vocab in
(* NOTE(review): Nx.fill appears to fill the tensor in place and return it;
   the result is deliberately discarded here — TODO confirm against Nx docs.
   When [pad] is false the tensor keeps Nx.zeros' zeros; since [Vocab.create]
   assigns index 0 to <pad>, both branches end up padding with the pad
   index. *)
if pad then ignore (Nx.fill (Int32.of_int pad_idx) arr);
(* Write each sequence into its row; the [j < seq_len] guard is redundant
   with the length check above but kept as a safety net. *)
List.iteri
(fun i seq ->
let seq_len = min (List.length seq) actual_max_len in
List.iteri
(fun j idx ->
if j < seq_len then Nx.set_item [ i; j ] (Int32.of_int idx) arr)
seq)
encoded;
arr
(** [decode vocab indices] maps each index back to its token — silently
    dropping indices with no binding — and joins the tokens with single
    spaces. *)
let decode vocab indices =
  indices |> List.filter_map (Vocab.get_token vocab) |> String.concat " "
(** [decode_batch vocab tensor] decodes a 2D [batch_size x seq_len] int32
    tensor into one string per row, dropping padding positions. Raises a
    structured error when the tensor is not 2D. *)
let decode_batch vocab tensor =
  let shape = Nx.shape tensor in
  match Array.to_list shape with
  | [ batch_size; seq_len ] ->
      (* Hoisted out of the loops: previously this hashtable lookup ran once
         per tensor element. *)
      let pad = Vocab.pad_idx vocab in
      let results = ref [] in
      for i = 0 to batch_size - 1 do
        let indices = ref [] in
        for j = 0 to seq_len - 1 do
          let idx = Nx.get_item [ i; j ] tensor |> Int32.to_int in
          (* Skip padding before decoding. *)
          if idx <> pad then indices := idx :: !indices
        done;
        results := decode vocab (List.rev !indices) :: !results
      done;
      List.rev !results
  | _ ->
      Nx_core.Error.invalid ~op:"decode_batch" ~what:"tensor shape"
        ~reason:"expected 2D tensor [batch_size; seq_len]" ()
(** [normalize ?lowercase ?strip_accents ?collapse_whitespace text] applies
    the requested Unicode normalizations, in that order; every step defaults
    to off, so the default call returns [text] unchanged. *)
let normalize ?(lowercase = false) ?(strip_accents = false)
    ?(collapse_whitespace = false) text =
  (* Apply [f] only when its flag is set. *)
  let apply enabled f s = if enabled then f s else s in
  text
  |> apply lowercase Unicode.case_fold
  |> apply strip_accents Unicode.strip_accents
  |> apply collapse_whitespace
       (Unicode.clean_text ~remove_control:false ~normalize_whitespace:true)
(** [vocab_save vocab path] writes the vocabulary to [path], one token per
    line in index order (so line [i] is the token with index [i]). Raises a
    structured error when the file cannot be opened or written. *)
let vocab_save vocab path =
  try
    let oc = open_out path in
    (* Fun.protect guarantees the channel is closed even if a write raises
       mid-loop; the previous version leaked [oc] on that path. *)
    Fun.protect
      ~finally:(fun () -> close_out_noerr oc)
      (fun () ->
        for i = 0 to Vocab.size vocab - 1 do
          match Vocab.get_token vocab i with
          | Some token -> Printf.fprintf oc "%s\n" token
          | None -> ()
        done)
  with Sys_error msg ->
    Nx_core.Error.failed ~op:"vocab_save"
      ~what:(Printf.sprintf "save to '%s'" path)
      ~reason:msg ()
(** [vocab_load path] reads a vocabulary saved by [vocab_save]: one token per
    line, added in order after the standard special tokens. Raises a
    structured error when the file cannot be opened or read. *)
let vocab_load path =
  try
    let ic = open_in path in
    (* Fun.protect closes the channel on every exit path; the previous
       version leaked [ic] on any exception other than End_of_file. *)
    Fun.protect
      ~finally:(fun () -> close_in_noerr ic)
      (fun () ->
        let vocab = Vocab.create () in
        try
          while true do
            Vocab.add vocab (input_line ic)
          done;
          (* Unreachable; present only to type the [while] expression. *)
          vocab
        with End_of_file -> vocab)
  with Sys_error msg ->
    (* Report the real system message (permissions, missing file, ...)
       instead of the previous hardcoded "file not found"; this matches
       vocab_save's error style. *)
    Nx_core.Error.failed ~op:"vocab_load"
      ~what:(Printf.sprintf "load vocab from '%s'" path)
      ~reason:msg ()
module Tokenizer = struct
  (* The ['a] parameter is phantom (unused by the representation); kept for
     interface compatibility with existing callers. *)
  type 'a t = {
    tokenize : string -> string list;
    normalizer : (string -> string) option;
    pre_tokenizer : (string -> string list) option;
  }

  let words =
    { tokenize = tokenize_words; normalizer = None; pre_tokenizer = None }

  let chars =
    { tokenize = tokenize_chars; normalizer = None; pre_tokenizer = None }

  let regex pattern =
    {
      tokenize = tokenize_regex pattern;
      normalizer = None;
      pre_tokenizer = None;
    }

  (* Not implemented yet: both constructors fail with a structured error. *)
  let bpe ~vocab:_ ~merges:_ =
    Nx_core.Error.failed ~op:"Tokenizer.bpe"
      ~what:"BPE tokenizer not implemented yet" ()

  let wordpiece ~vocab:_ ~unk_token:_ =
    Nx_core.Error.failed ~op:"Tokenizer.wordpiece"
      ~what:"WordPiece tokenizer not implemented yet" ()

  (** [run t text] normalizes [text] (when a normalizer is set), optionally
      pre-splits it, then tokenizes each piece in order. *)
  let run t text =
    let text = match t.normalizer with Some f -> f text | None -> text in
    match t.pre_tokenizer with
    | Some pre -> List.concat_map t.tokenize (pre text)
    | None -> t.tokenize text

  (** [run_with_offsets t text] tokenizes [text] and pairs each token with
      its [(start, end)] byte offsets in [text]. The search is left-to-right
      and non-overlapping; tokens that cannot be located (e.g. because the
      normalizer rewrote the text) are skipped. *)
  let run_with_offsets t text =
    let tokens = run t text in
    let text_len = String.length text in
    (* First position >= [pos] where the WHOLE of [tok] occurs in [text].
       The previous implementation matched only the token's first character,
       reporting wrong offsets whenever the rest of the token differed, and
       raised Invalid_argument on an empty token. *)
    let rec find_from pos tok =
      let n = String.length tok in
      if n = 0 || pos + n > text_len then None
      else if String.sub text pos n = tok then Some pos
      else find_from (pos + 1) tok
    in
    let rec collect tokens pos acc =
      match tokens with
      | [] -> List.rev acc
      | tok :: rest -> (
          match find_from pos tok with
          | None -> collect rest pos acc
          | Some start ->
              let end_ = start + String.length tok in
              collect rest end_ ((tok, start, end_) :: acc))
    in
    collect tokens 0 []

  let with_normalizer f t = { t with normalizer = Some f }
  let with_pre_tokenizer f t = { t with pre_tokenizer = Some f }
end
(* Re-export the [Unicode] helper module (case folding, accent stripping,
   text cleaning) alongside the tokenizer API. *)
module Unicode = Unicode