Source file fingerprint.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
module A = Array
module BA = Bigarray
module BA1 = BA.Array1
module IntMap = BatMap.Int
module L = MyList
type t = (int, BA.int_elt, BA.c_layout) BA1.t
let of_string s: t =
let previous = ref (-1) in
let n = ref 0 in
let kvs =
L.of_string (fun str ->
Scanf.sscanf str "%d:%d"
(fun k v ->
Utls.enforce_f (k >= 0 && k > !previous && v > 0)
(fun () -> "Fingerprint.of_string: invalid line: " ^ s);
previous := k;
incr n;
(k, v)
)
) s in
let res = BA1.create BA.Int BA.C_layout (2 * !n) in
let i = ref 0 in
L.iter (fun (k, v) ->
BA1.unsafe_set res !i k;
incr i;
BA1.unsafe_set res !i v;
incr i
) kvs;
res
let nb_features x =
let n = BA1.dim x in
1 + (BA1.get x (n - 2))
let tanimoto (m1: t) (m2: t): float =
let icard = ref 0 in
let ucard = ref 0 in
let len1 = BA1.dim m1 in
let len2 = BA1.dim m2 in
let i = ref 0 in
let j = ref 0 in
while !i < len1 && !j < len2 do
let k1 = BA1.unsafe_get m1 !i in
let v1 = BA1.unsafe_get m1 (!i + 1) in
let k2 = BA1.unsafe_get m2 !j in
let v2 = BA1.unsafe_get m2 (!j + 1) in
if k1 < k2 then
(ucard := !ucard + v1;
i := !i + 2)
else if k2 < k1 then
(ucard := !ucard + v2;
j := !j + 2)
else
if v1 <= v2 then
(icard := !icard + v1;
ucard := !ucard + v2;
i := !i + 2;
j := !j + 2)
else
(icard := !icard + v2;
ucard := !ucard + v1;
i := !i + 2;
j := !j + 2)
done;
incr i;
while !i < len1 do
ucard := !ucard + (BA1.unsafe_get m1 !i);
i := !i + 2
done;
incr j;
while !j < len2 do
ucard := !ucard + (BA1.unsafe_get m2 !j);
j := !j + 2
done;
if !ucard = 0 then 0.0
else (float !icard) /. (float !ucard)
let distance x y =
1.0 -. (tanimoto x y)