Source file owl_nlp_utils.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# 1 "src/owl/nlp/owl_nlp_utils.ml"
(** Compiled [Str] regular expression matching a run of one or more
    separator characters: space, tab, semicolon, comma, full stop,
    apostrophe, exclamation mark, question mark, round brackets, curly
    quotes, backslash, forward slash, ampersand, em dash and hyphen.

    None of the loaders visible in this file use it; they split on a single
    space instead.

    NOTE(review): the curly quotes and the em dash are multi-byte in UTF-8
    and [Str] presumably matches byte by byte, so this class may also match
    individual bytes of other non-ASCII characters -- confirm before relying
    on it for non-ASCII text. *)
let regexp_split = Str.regexp "[ \t;,.'!?()’“”\\/&—\\-]+"
(** [_allocate_space x] logs a message and returns a fresh array made of the
    elements of [x] followed by [Array.length x] empty arrays, i.e. a copy of
    [x] with twice as many slots. *)
let _allocate_space x =
  Owl_log.info "allocate more space";
  let padding = Array.make (Array.length x) [||] in
  Array.append x padding
(** [load_from_file ?stopwords f] reads the text file [f] one line at a time
    and returns one array of words per line, in file order.

    Each line is split on the single-space pattern and every word that is a
    key of [stopwords] is dropped. When [stopwords] is omitted an empty table
    is used, so nothing is dropped. The number of documents and of retained
    words is logged once the whole file has been read.

    The channel is closed on every exit path, including when an exception
    escapes.

    @raise Sys_error if [f] cannot be opened. *)
let load_from_file ?stopwords f =
  Owl_log.info "load text corpus";
  let stopwords =
    match stopwords with
    | Some tbl -> tbl
    | None -> Hashtbl.create 2
  in
  (* Compile the separator once instead of once per input line. *)
  let separator = Str.regexp " " in
  let docs = ref (Array.make (64 * 1024) [||]) in
  let n_docs = ref 0 in
  let n_words = ref 0 in
  let h = open_in f in
  Fun.protect
    ~finally:(fun () -> close_in h)
    (fun () ->
      (try
         while true do
           (* Grow the buffer before the next slot is written. *)
           if !n_docs = Array.length !docs - 1 then docs := _allocate_space !docs;
           let words =
             Str.split separator (input_line h)
             |> List.filter (fun w -> not (Hashtbl.mem stopwords w))
             |> Array.of_list
           in
           (!docs).(!n_docs) <- words;
           incr n_docs;
           n_words := !n_words + Array.length words
         done
       with
       | End_of_file -> ());
      Owl_log.info "load %i docs, %i words" !n_docs !n_words;
      (* Drop the unused tail of the buffer. *)
      Array.sub !docs 0 !n_docs)
(** [load_from_string ?stopwords s] splits [s] on the single-space pattern
    and returns the resulting words as an array, leaving out every word that
    is a key of [stopwords]. Without [stopwords] an empty table is used and
    no word is removed. *)
let load_from_string ?stopwords s =
  let stopwords =
    match stopwords with
    | None -> Hashtbl.create 2
    | Some tbl -> tbl
  in
  let keep word = not (Hashtbl.mem stopwords word) in
  let words = Str.split (Str.regexp " ") s in
  Array.of_list (List.filter keep words)
(** [load_stopwords f] reads the file [f] and returns a hash table whose keys
    are the distinct lines of [f], each bound to [0]. The channel is closed
    on every exit path.

    @raise Sys_error if [f] cannot be opened. *)
let load_stopwords f =
  Owl_log.info "load stopwords";
  let table = Hashtbl.create (64 * 1024) in
  let ic = open_in f in
  (* Consume the channel until [End_of_file]; the recursive call sits in
     tail position, so arbitrarily long files do not grow the stack. *)
  let rec read_all () =
    match input_line ic with
    | line ->
      if not (Hashtbl.mem table line) then Hashtbl.add table line 0;
      read_all ()
    | exception End_of_file -> ()
  in
  Fun.protect
    ~finally:(fun () -> close_in ic)
    (fun () ->
      read_all ();
      table)
(** [build_vocabulary x] collects the distinct words found in [x] (an array
    of word arrays), sorts them with [String.compare] and numbers them from
    [0] in that order.

    Returns the pair [(w2i, i2w)] where [w2i] maps each word to its index and
    [i2w] maps each index back to its word. *)
let build_vocabulary x =
  Owl_log.info "build up vocabulary";
  let w2i = Hashtbl.create (64 * 1024) in
  (* First pass: use the table as a set of distinct words. *)
  let register word = if not (Hashtbl.mem w2i word) then Hashtbl.add w2i word 0 in
  Array.iter (fun doc -> Array.iter register doc) x;
  (* Collection order is irrelevant: the keys are distinct and sorted next. *)
  let words = Hashtbl.fold (fun word _ acc -> word :: acc) w2i [] |> Array.of_list in
  Array.sort String.compare words;
  let i2w = Hashtbl.create (Hashtbl.length w2i) in
  (* Second pass: drop the placeholder bindings and store the real indices. *)
  Hashtbl.reset w2i;
  Array.iteri
    (fun idx word ->
      Hashtbl.add w2i word idx;
      Hashtbl.add i2w idx word)
    words;
  w2i, i2w
(** [tokenise dict data] replaces every element of [data] by the value it is
    bound to in [dict] and returns the results as a new array.

    @raise Not_found if an element of [data] has no binding in [dict]. *)
let tokenise dict data =
  let lookup word = Hashtbl.find dict word in
  Array.map lookup data
(** [tokenise_all dict data] applies the [dict] lookup to every element of
    every inner array of [data], preserving the nesting.

    @raise Not_found if any element has no binding in [dict]. *)
let tokenise_all dict data =
  let lookup word = Hashtbl.find dict word in
  Array.map (fun doc -> Array.map lookup doc) data
(** [save_vocabulary x f] passes [x] and [f] unchanged to
    [Owl_io.marshal_to_file].

    NOTE(review): presumably [f] is the destination path and [x] is written
    with [Marshal] -- confirm against [Owl_io]. *)
let save_vocabulary x f = Owl_io.marshal_to_file x f
(** [load_vocabulary f] returns whatever [Owl_io.marshal_from_file f] yields.

    NOTE(review): presumably this unmarshals a value written by
    [save_vocabulary]; nothing here constrains the result type, so the caller
    likely has to annotate it -- confirm against [Owl_io]. *)
let load_vocabulary f = Owl_io.marshal_from_file f
(** [save_lda_model m f] logs a message, then hands [m] to
    [Owl_io.marshal_to_file] with the target name [f ^ ".model"]. *)
let save_lda_model m f =
  Owl_log.info "save LDA model";
  let path = f ^ ".model" in
  Owl_io.marshal_to_file m path
(** [load_lda_model f] logs a message, then returns the result of
    [Owl_io.marshal_from_file] applied to [f ^ ".model"]. *)
let load_lda_model f =
  Owl_log.info "load LDA model";
  let path = f ^ ".model" in
  Owl_io.marshal_from_file path
(** [simple_process s] returns [s] unchanged. *)
let simple_process s = Fun.id s