Source file fingerprint.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86

(* Logging module for this file, instantiated with the section name "Fprt". *)
module Log = Dolog.Log.Make(struct let section = "Fprt" end)

(* Short local aliases for the string and integer-set modules used below. *)
module S = BatString
module IntSet = MyIntSet

(* A fingerprint. Three variants carry a bit vector ([Bitv.t]); the MOP2D
   variant carries a set of integers ([IntSet.t]) instead, so functions that
   need a bit vector (e.g. [get_bits], [size]) fail on it. *)
type t = MACCS of Bitv.t (* built by [of_maccs_string] from a 166-char 0/1 string *)
       | ECFP4 of Bitv.t (* built by [of_ecfp4_string] from a 2048-char 0/1 string *)
       | PUBCH of Bitv.t (* built by [of_pubch_string] from a 881-char 0/1 string *)
       | MOP2D of IntSet.t (* built by [of_mop2d_string]; no fixed length *)

(* Render a fingerprint as a string: bit-vector variants are printed with
   [Bitv.M.to_string], the MOP2D variant with [IntSet.to_string]. *)
let to_string (fp: t): string =
  match fp with
  | MACCS bv | PUBCH bv | ECFP4 bv -> Bitv.M.to_string bv
  | MOP2D set -> IntSet.to_string set

(* Return the lowercase name of the fingerprint's variant
   ("maccs", "ecfp4", "pubch" or "mop2d"). *)
let identify (fp: t): string =
  match fp with
  | MACCS _ -> "maccs"
  | ECFP4 _ -> "ecfp4"
  | PUBCH _ -> "pubch"
  | MOP2D _ -> "mop2d"

(* Extract the underlying bit vector of a MACCS, PUBCH or ECFP4 fingerprint.
   Raises [Failure] for MOP2D, which is not backed by a bit vector. *)
let get_bits (fp: t): Bitv.t =
  match fp with
  | MACCS bv | PUBCH bv | ECFP4 bv -> bv
  | MOP2D _ -> failwith "Fp.get_bits: MOP2D"

(* Extract the underlying integer set of a MOP2D fingerprint.
   Raises [Failure] (naming the offending variant) for the bit-vector
   variants MACCS, PUBCH and ECFP4. *)
let get_ints (fp: t): IntSet.t =
  match fp with
  | MACCS _ -> failwith "Fp.get_ints: MACCS"
  | PUBCH _ -> failwith "Fp.get_ints: PUBCH"
  | ECFP4 _ -> failwith "Fp.get_ints: ECFP4"
  | MOP2D set -> set

(* Number of features present in the fingerprint: [Bitv.pop] of the bit
   vector for MACCS/PUBCH/ECFP4, cardinal of the integer set for MOP2D. *)
let count_set_bits (fp: t): int =
  match fp with
  | MACCS bv | PUBCH bv | ECFP4 bv -> Bitv.pop bv
  | MOP2D set -> IntSet.cardinal set

(* Length of the bit vector backing a MACCS, PUBCH or ECFP4 fingerprint.
   Raises [Failure] for MOP2D, for which no length is available here. *)
let size (fp: t): int =
  match fp with
  | MACCS bv | PUBCH bv | ECFP4 bv -> Bitv.length bv
  | MOP2D _ -> failwith "Fp.size: MOP2D"

(* [union fp1 fp2] combines two fingerprints of the same kind and returns a
   fingerprint of that kind: set union for MOP2D, bitwise OR ([Bitv.bw_or])
   for the bit-vector kinds MACCS, ECFP4 and PUBCH.
   Raises [Failure "Fp.union: incompatible FPs"] when the two arguments are
   of different kinds. *)
let union fp1 fp2 = match fp1, fp2 with
  | MOP2D m1, MOP2D m2 -> MOP2D (IntSet.union m1 m2)
  | MACCS m1, MACCS m2 -> MACCS (Bitv.bw_or m1 m2)
  | ECFP4 m1, ECFP4 m2 -> ECFP4 (Bitv.bw_or m1 m2)
  (* PUBCH is a bit-vector kind like MACCS/ECFP4 and must be handled too *)
  | PUBCH m1, PUBCH m2 -> PUBCH (Bitv.bw_or m1 m2)
  (* Mixed kinds. Constructors are listed explicitly (instead of [_, _]) so
     that adding a new variant to [t] triggers an exhaustiveness warning. *)
  | (MOP2D _ | MACCS _ | ECFP4 _ | PUBCH _), _ ->
    failwith "Fp.union: incompatible FPs"

(* Raised by [of_string] when its input string is rejected; the fixed-length
   [of_*_string] parsers below catch it and re-raise as [Failure]. *)
exception Failed

(* [of_string s size] parses [s] into a bit vector with [Bitv.M.of_string].
   [s] is accepted only if [MyUtils.string_contains_only_zeros_or_ones]
   holds for it and it is exactly [size] characters long; otherwise
   [Failed] is raised. *)
let of_string s size =
  let rejected =
    (* same evaluation order as a short-circuit conjunction of the checks *)
    not (MyUtils.string_contains_only_zeros_or_ones s)
    || String.length s <> size in
  if rejected then raise Failed
  else Bitv.M.of_string s

(* Exact input string lengths required by the fixed-size parsers below:
   [of_maccs_string], [of_ecfp4_string] and [of_pubch_string] respectively. *)
let maccs_length = 166
let ecfp4_length = 2048
let pubch_length = 881

(* Parse a MACCS fingerprint from a 0/1 string of length [maccs_length].
   Raises [Failure] (with the offending string in the message) if [of_string]
   rejects the input. *)
let of_maccs_string (s: string): t =
  match of_string s maccs_length with
  | bits -> MACCS bits
  | exception Failed ->
    failwith ("of_maccs_string: non MACCS string: " ^ s)

(* Parse an ECFP4 fingerprint from a 0/1 string of length [ecfp4_length].
   Raises [Failure] (with the offending string in the message) if [of_string]
   rejects the input. *)
let of_ecfp4_string (s: string): t =
  match of_string s ecfp4_length with
  | bits -> ECFP4 bits
  | exception Failed ->
    failwith ("of_ecfp4_string: non ECFP4 string: " ^ s)

(* Parse a PUBCH fingerprint from a 0/1 string of length [pubch_length].
   Raises [Failure] (with the offending string in the message) if [of_string]
   rejects the input. *)
let of_pubch_string (s: string): t =
  match of_string s pubch_length with
  | bits -> PUBCH bits
  | exception Failed ->
    failwith ("of_pubch_string: non PUBCH string: " ^ s)

(* Parse a MOP2D fingerprint. Unlike the fixed-size parsers above, this one
   has a dataset-dependent, variable length, so no length check is done.
   The input is split on ":"; the left part is ignored and the right part
   is parsed with [IntSet.of_string].
   NOTE(review): a string without ":" presumably makes [S.split] raise
   rather than producing a [Failure] -- confirm against BatString. *)
let of_mop2d_string (s: string): t =
  match S.split s ~by:":" with
  | (_size, indexes) -> MOP2D (IntSet.of_string indexes)