Source file sdf.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53

(* One molecule in SDF format, i.e. the consecutive lines of a single
   record from a .sdf file, kept together in one string.
   Values returned by [read_one] have every line terminated by '\n',
   including the closing "$$$$" line when the record has one. *)
type t = string

(* Never escapes from [read_one]; the declaration is kept so that any code
   referring to this exception still compiles. *)
exception Read_one

(* Read the next molecule from [input].
   Lines are consumed up to and including the "$$$$" terminator line, or up
   to the end of the channel if no terminator is found. Every line read is
   stored followed by '\n'.
   Raises [End_of_file] if no line at all could be read. *)
let read_one (input: in_channel): t =
  let buff = Buffer.create 10240 in
  let rec consume () =
    match input_line input with
    | exception End_of_file ->
      () (* channel exhausted: keep whatever was accumulated so far *)
    | line ->
      Buffer.add_string buff line;
      Buffer.add_char buff '\n';
      (* "$$$$" marks the end of a molecule in SDF format *)
      if line <> "$$$$" then consume ()
  in
  consume ();
  if Buffer.length buff = 0 then
    raise End_of_file
  else
    Buffer.contents buff

(* [get_data_line ~header ~err_prefix mol] returns the text located between
   the end of the first occurrence of [header] in [mol] and the next '\n'
   (the '\n' itself is not included).
   Callers pass a complete data-header line, trailing '\n' included, as
   [header], so the result is the line that follows that header.
   Raises [Failure] with message [err_prefix ^ mol] if [header] does not
   occur in [mol], or if no '\n' is found after it. *)
let get_data_line ~(header: string) ~(err_prefix: string) (mol: t): string =
  try
    let start = BatString.find mol header + String.length header in
    let stop = BatString.find_from mol start "\n" in
    BatString.sub mol start (stop - start)
  with Not_found ->
    failwith (err_prefix ^ mol)

(* return the inchi string, no trailing '\n'
   Raises [Failure] if [mol] has no "> <PUBCHEM_IUPAC_INCHI>" line
   followed by a '\n'-terminated line. *)
let get_inchi (mol: t): string =
  get_data_line mol
    ~header:"> <PUBCHEM_IUPAC_INCHI>\n"
    ~err_prefix:"Sdf.get_inchi: no inchi for: "

(* return the inchikey string, no trailing '\n'
   Raises [Failure] if [mol] has no "> <PUBCHEM_IUPAC_INCHIKEY>" line
   followed by a '\n'-terminated line. *)
let get_inchikey (mol: t): string =
  get_data_line mol
    ~header:"> <PUBCHEM_IUPAC_INCHIKEY>\n"
    ~err_prefix:"Sdf.get_inchikey: no inchikey for: "

(* Return the first line of [m]: everything before its first '\n'
   (the '\n' itself is not included).
   Raises [Not_found] if [m] contains no '\n'. *)
let get_fst_line m =
  let eol = String.index m '\n' in
  String.sub m 0 eol