1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
open Printf
module A = BatArray
module BA = Bigarray
module BA1 = BA.Array1
module Fp = Fingerprint
module L = BatList
type dense = (int, BA.int8_unsigned_elt, BA.c_layout) BA1.t
type hashed = int array
let get_seeds k =
let global_seed = 314159265 in
let rng = Random.State.make [|global_seed|] in
let bound = (BatInt.pow 2 30) - 1 in
Array.init k (fun _ -> Random.State.int rng bound)
let to_dense (feat_id_bound: int) (fp: Fp.t): dense =
let res = BA1.create BA.int8_unsigned BA.C_layout feat_id_bound in
BA1.fill res 0;
let n = BA1.dim fp in
let i = ref 0 in
while !i < n do
let k = BA1.get fp !i in
let v = BA1.get fp (!i + 1) in
assert(k < feat_id_bound && v < 256);
BA1.set res k v;
i := !i + 2
done;
res
let string_of_dense x =
let n = BA1.dim x in
let buff = Buffer.create 80 in
for i = 0 to n - 1 do
let j = BA1.get x i in
bprintf buff " %d:%d" i j
done;
Buffer.contents buff
let update_bounds (bounds: int array) (fp: Fp.t): unit =
let n = BA1.dim fp in
let i = ref 0 in
while !i < n do
let k = BA1.get fp !i in
let v = BA1.get fp (!i + 1) in
bounds.(k) <- max (bounds.(k)) v;
i := !i + 2
done
let bounds (max_feat_id: int) (train: Fp.t array): int array =
let bounds = A.make max_feat_id 0 in
A.iter (update_bounds bounds) train;
bounds
let lookup_table (bounds: int array): int array =
let total = A.sum bounds in
let res = A.create total 0 in
let j = ref 0 in
A.iteri (fun i bound ->
for _ = 1 to bound do
res.(!j) <- i;
incr j
done
) bounds;
res
let acc_bounds_table (bounds: int array): int array =
let n = A.length bounds in
let res = A.create n 0 in
let acc = ref 0 in
A.iteri (fun i bound ->
res.(i) <- !acc;
acc := !acc + bound
) bounds;
res
let gen_rands seeds rand_bound =
A.map (fun seed ->
let rng = Random.State.make [|seed|] in
let ints = A.init rand_bound (fun i -> i) in
A.shuffle ~state:rng ints;
ints
) seeds
let hash pregen_rands idx2feat feat2acc_bound (dense_fp: dense): hashed =
let k = A.length pregen_rands in
let res = A.make k 0 in
for i = 0 to k - 1 do
let misses = ref 0 in
let j = ref 0 in
let rands = pregen_rands.(i) in
let rand' = rands.(!j) in
let test_feat_id = ref (idx2feat.(rand')) in
let test_feat_val = ref (rand' - feat2acc_bound.(!test_feat_id)) in
while !test_feat_val >= (BA1.unsafe_get dense_fp !test_feat_id) do
incr misses;
incr j;
let rand = rands.(!j) in
test_feat_id := idx2feat.(rand);
test_feat_val := rand - feat2acc_bound.(!test_feat_id)
done;
A.unsafe_set res i !misses
done;
res
let estimate_jaccard (hash1: hashed) (hash2: hashed): float =
let res = ref 0 in
let k = A.length hash1 in
for i = 0 to k - 1 do
if (A.unsafe_get hash1 i) = (A.unsafe_get hash2 i) then
incr res
done;
(float !res) /. (float k)
let estimate_distance (h1: hashed) (h2: hashed): float =
1.0 -. (estimate_jaccard h1 h2)