Source file stringCodepointSplitter.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
open Stdlib
(**
This module depends on the [Uutf] library.
It contains a single function, [split_string_by_unicode_codepoint], which splits a UTF-8 encoded OCaml string [str] into a [string list] holding one substring per decoded code point.
*)
(** [split_string_by_unicode_codepoint str] splits the UTF-8 encoded string
[str] into a [string list], one substring per unit reported by
[Uutf.String.fold_utf_8], in their original order.
Concatenating the result gives back [str]. The empty string yields [[]].
Only the byte offsets reported by the decoder are used, the decoded values
are ignored, so every unit the decoder reports (well formed or not) becomes
one substring.
{b Arguments}
{ul
{- [str] the string to be split.
}}
{b Example}
{[let example = "m̄知 who you're." (* do not know who you are *) in
List.iter (fun x -> print_string (x ^ ", ")) (split_string_by_unicode_codepoint example);;
(* it will output : "m, ̄, 知,  , w, h, o,  , y, o, u, ', r, e, ., " *)]}
*)
let split_string_by_unicode_codepoint str =
  (* Fold state: the byte offset where the previously reported unit starts
     (None until the decoder reports a first unit) and the substrings cut
     so far, most recent first. *)
  let cut_previous (prev_start, rev_pieces) pos _decoded =
    match prev_start with
    | None -> (Some pos, rev_pieces)
    | Some start ->
      (* The previous unit spans from its own start up to the start of the
         unit being reported now. *)
      let piece = Stdlib.String.sub str start (pos - start) in
      (Some pos, piece :: rev_pieces)
  in
  match Uutf.String.fold_utf_8 cut_previous (None, []) str with
  | None, _ ->
    (* The decoder reported nothing (e.g. empty input): there is no pending
       unit, so no substring must be taken. *)
    []
  | Some last_start, rev_pieces ->
    (* The last unit has no successor to delimit it; it runs to the end of
       the string. *)
    let tail_len = Stdlib.String.length str - last_start in
    let rev_pieces =
      if tail_len > 0 then
        Stdlib.String.sub str last_start tail_len :: rev_pieces
      else rev_pieces
    in
    List.rev rev_pieces
;;