Source file `MakeROC.ml`

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501

module A = BatArray
module L = MyList

module type SCORE_LABEL = sig
  type t
  val get_score: t -> float
  val get_label: t -> bool
end

module type ROC_FUNCTOR = functor (SL: SCORE_LABEL) ->
sig
  (** sort score labels putting high scores first *)
  val rank_order_by_score: SL.t list -> SL.t list

  (** in-place sort of score labels; putting high scores first *)
  val rank_order_by_score_a: SL.t array -> unit

  (** cumulated actives curve given an already sorted list of score labels *)
  val cumulated_actives_curve: SL.t list -> int list

  (** ROC curve (list of (FPR,TPR) values) corresponding to
      those score labels *)
  val roc_curve: SL.t list -> (float * float) list

  (** same as [roc_curve] but for an already sorted array of score-labels *)
  val fast_roc_curve_a: SL.t array -> (float * float) array

  (** Precision Recall curve (list of (recall,precision) pairs)
      corresponding to given score labels *)
  val pr_curve: SL.t list -> (float * float) list

  (** compute Area Under the ROC curve given an already sorted list of
      score labels *)
  val fast_auc: SL.t list -> float

  (** ROC AUC: Area Under the ROC curve given an unsorted list
      of score labels *)
  val auc: SL.t list -> float

  (** PR AUC: Area Under the Precision-Recall curve given an unsorted list
      of score labels *)
  val pr_auc: SL.t list -> float

  (** Area Under the ROC curve given an unsorted array
      of score labels; WARNING: array will be modified (sorted) *)
  val auc_a: SL.t array -> float

  (** Area Under the ROC curve given an already sorted array
      of score labels *)
  val fast_auc_a: SL.t array -> float

  (** (early) enrichment factor at given threshold (database percentage)
      given an unsorted list of score labels *)
  val enrichment_factor: float -> SL.t list -> float

  (** (early) enrichment factor at given threshold (database percentage)
      given an already sorted array of score labels *)
  val fast_enrichment_factor: float -> SL.t array -> float

  (** [initial_enhancement a score_labels] will compute
      S = sum_over_i (exp (-rank(active_i) / a))
      given an unsorted list of score labels.
      Robust Initial Enhancement (RIE) = S/<S> where <S> is
      the average S for randomly ordered score labels.
      RIE = 1.0 <=> random performance. Cf. DOI:10.1021/ci0100144
      for details. *)
  val initial_enhancement: float -> SL.t list -> float

  (** same as [initial_enhancement] but does not reorder the list
      of score_labels *)
  val fast_initial_enhancement: float -> SL.t list -> float

  (** power metric at given threshold given an unsorted list of score labels *)
  val power_metric: float -> SL.t list -> float

  (** power metric at given threshold for an already decr. sorted list of score labels *)
  val fast_power_metric_a: float -> SL.t array -> float

  (** bedroc_auc at given alpha. Default alpha = 20.0. *)
  val bedroc_auc: ?alpha:float -> SL.t list -> float

  (** bedroc_auc at given alpha for an array of score-labels.
      Default alpha = 20.0.
      WARNING: the array will be modified (sorted by decrasing scores)
      if [sorted = false] which is the default. *)
  val bedroc_auc_a: ?alpha:float -> ?sorted:bool -> SL.t array -> float

  (** equivalent to [bedroc_auc_a ~alpha ~sorted:true arr]. *)
  val fast_bedroc_auc_a: ?alpha:float -> SL.t array -> float

  (** Matthews' Correlation Coefficient (MCC)
      use: [mcc classif_threshold score_labels].
      scores >= threshold are predicted as targets;
      scores < threshold are predicted as non targets. *)
  val mcc: float -> SL.t list -> float

  (** [a, b = platt_scaling ~debug score_labels]
      Fit a logistic curve (1 / (1 + exp (ax + b))) to [score_labels]
      and return its [(a, b)] parameters.
      Gnuplot it used underneath to do the fitting.
      Biblio:
      Platt, J. (1999). Probabilistic outputs for support vector machines
      and comparisons to regularized likelihood methods.
      Advances in large margin classifiers, 10(3), 61-74. *)
  val platt_scaling: ?debug:bool -> SL.t list -> (float * float)

  (** [platt_probability a b score] transform [score] into
      a probability, given logistic function parameters [a] and [b]
      obtained from a prior call to [platt_scaling]. *)
  val platt_probability: float -> float -> float -> float

end

(* functions for ROC analysis *)
module Make: ROC_FUNCTOR = functor (SL: SCORE_LABEL) ->
struct

  let trapezoid_surface x1 x2 y1 y2 =
    let base = abs_float (x1 -. x2) in
    let height = 0.5 *. (y1 +. y2) in
    base *. height

  let rev_compare_scores x y =
    BatFloat.compare (SL.get_score y) (SL.get_score x)

  (* put molecules with the highest scores at the top of the list *)
  let rank_order_by_score (score_labels: SL.t list) =
    L.stable_sort rev_compare_scores score_labels

  (* put molecules with the highest scores at the top of the array *)
  let rank_order_by_score_a (score_labels: SL.t array) =
    A.stable_sort rev_compare_scores score_labels

  (* compute the cumulated number of actives curve,
     given an already sorted list of score labels *)
  let cumulated_actives_curve (high_scores_first: SL.t list) =
    let sum = ref 0 in
    L.map (fun sl ->
        if SL.get_label sl then incr sum;
        !sum
      ) high_scores_first

  let roc_curve (score_labels: SL.t list) =
    let high_scores_first = rank_order_by_score score_labels in
    let nacts = ref 0 in
    let ndecs = ref 0 in
    let nb_act_decs =
      L.fold_left (fun acc x ->
          if SL.get_label x then incr nacts
          else incr ndecs;
          (!nacts, !ndecs) :: acc
        ) [(0, 0)] high_scores_first in
    let nb_actives = float !nacts in
    let nb_decoys = float !ndecs in
    L.rev_map (fun (na, nd) ->
        let tpr = float na /. nb_actives in
        let fpr = float nd /. nb_decoys in
        (fpr, tpr)
      ) nb_act_decs

  let fast_roc_curve_a (score_labels: SL.t array) =
    let nacts = ref 0 in
    let ndecs = ref 0 in
    let nb_act_decs =
      A.map (fun x ->
          if SL.get_label x then incr nacts
          else incr ndecs;
          (!nacts, !ndecs)
        ) score_labels in
    let nb_actives = float !nacts in
    let nb_decoys = float !ndecs in
    A.map (fun (na, nd) ->
        let tpr = float na /. nb_actives in
        let fpr = float nd /. nb_decoys in
        (fpr, tpr)
      ) nb_act_decs

  (* Saito, T., & Rehmsmeier, M. (2015).
     The precision-recall plot is more informative than the ROC plot when
     evaluating binary classifiers on imbalanced datasets.
     PloS one, 10(3), e0118432. *)
  (* Davis, J., & Goadrich, M. (2006, June).
     The relationship between Precision-Recall and ROC curves.
     In Proceedings of the 23rd international conference on Machine learning
     (pp. 233-240). ACM. *)
  let pr_curve (score_labels: SL.t list) =
    let precision tp fp =
      tp /. (tp +. fp) in
    let recall tp fn =
      tp /. (tp +. fn) in
    let negate p x =
      not (p x) in
    let high_scores_first_uniq =
      let all_scores = L.map SL.get_score score_labels in
      L.sort_uniq (fun x y -> BatFloat.compare y x) all_scores in
    (* L.iter (Printf.printf "threshold: %f\n") high_scores_first_uniq; *)
    let high_scores_first = rank_order_by_score score_labels in
    let before = ref [] in
    let after = ref high_scores_first in
    let res =
      L.map (fun threshold ->
          let higher, lower =
            L.partition_while (fun x -> (SL.get_score x) >= threshold) !after in
          before := L.rev_append higher !before;
          after := lower;
          (* TP <=> (score >= t) && label *)
          let tp = float (L.filter_count (SL.get_label) !before) in
          (* FN <=> (score < t) && label *)
          let fn = float (L.filter_count (SL.get_label) !after) in
          (* FP <=> (score >= t) && (not label) *)
          let fp = float (L.filter_count (negate SL.get_label) !before) in
          let r = recall tp fn in
          let p = precision tp fp in
          (r, p)
        ) high_scores_first_uniq in
    (0.0, 1.0) :: res (* add missing first point *)

  let fast_auc_common fold_fun high_scores_first =
    let fp, tp, fp_prev, tp_prev, a, _p_prev =
      fold_fun (fun (fp, tp, fp_prev, tp_prev, a, p_prev) sl ->
          let si = SL.get_score sl in
          let li = SL.get_label sl in
          let new_a, new_p_prev, new_fp_prev, new_tp_prev =
            if si <> p_prev then
              a +. trapezoid_surface fp fp_prev tp tp_prev,
              si,
              fp,
              tp
            else
              a,
              p_prev,
              fp_prev,
              tp_prev
          in
          let new_tp, new_fp =
            if li then
              tp +. 1., fp
            else
              tp, fp +. 1.
          in
          (new_fp, new_tp, new_fp_prev, new_tp_prev, new_a, new_p_prev)
        ) (0., 0., 0., 0., 0., neg_infinity)
        high_scores_first
    in
    (a +. trapezoid_surface fp fp_prev tp tp_prev) /. (fp *. tp)

  (* area under the ROC curve given an already sorted list of score-labels *)
  let fast_auc high_scores_first =
    fast_auc_common L.fold_left high_scores_first

  let fast_auc_a high_scores_first =
    fast_auc_common A.fold_left high_scores_first

  (* area under the ROC curve given an unsorted list of score-labels
     TP cases have the label set to true
     TN cases have the label unset *)
  let auc (score_labels: SL.t list) =
    let high_scores_first = rank_order_by_score score_labels in
    fast_auc high_scores_first

  let pr_auc (score_labels: SL.t list) =
    let curve = pr_curve score_labels in
    let rec loop acc = function
      | [] -> acc
      | [_] -> acc
      | (x1, y1) :: (x2, y2) :: xs ->
        let area = trapezoid_surface x1 x2 y1 y2 in
        loop (area +. acc) ((x2, y2) :: xs) in
    loop 0.0 curve

  let auc_a (score_labels: SL.t array) =
    rank_order_by_score_a score_labels;
    fast_auc_a score_labels

  (* proportion of actives given an unsorted list of score-labels
     TP cases have the label set to true
     TN cases have the label unset
     returns: (nb_molecules, actives_rate) *)
  let actives_rate (score_labels: SL.t list) =
    let tp_count, fp_count =
      L.fold_left
        (fun (tp_c, fp_c) sl ->
           if SL.get_label sl then
             (tp_c + 1, fp_c)
           else
             (tp_c, fp_c + 1)
        ) (0, 0) score_labels
    in
    let nb_molecules = tp_count + fp_count in
    (nb_molecules, (float tp_count) /. (float nb_molecules))

  (* enrichment rate at x (e.g. x = 0.01 --> ER @ 1%) given a list
     of unsorted score-labels *)
  let enrichment_factor (p: float) (score_labels: SL.t list) =
    let nb_molecules, rand_actives_rate = actives_rate score_labels in
    let top_n = BatFloat.round_to_int (p *. (float nb_molecules)) in
    let top_p_percent_molecules =
      L.take top_n (rank_order_by_score score_labels) in
    let _, top_actives_rate = actives_rate top_p_percent_molecules in
    let enr_rate = top_actives_rate /. rand_actives_rate in
    enr_rate

  (* this should land in batteries, not here... *)
  let array_filter_count p a =
    let count = ref 0 in
    A.iter (fun x ->
        if p x then incr count
      ) a;
    !count

  let array_actives_rate a =
    let nb_actives = array_filter_count SL.get_label a in
    let n = A.length a in
    (float nb_actives) /. (float n)

  let fast_enrichment_factor p score_labels =
    let nb_molecules = A.length score_labels in
    let rand_actives_rate = array_actives_rate score_labels in
    let top_n = BatFloat.round_to_int (p *. (float nb_molecules)) in
    let top_p_percent_molecules = A.sub score_labels 0 top_n in
    let top_actives_rate = array_actives_rate top_p_percent_molecules in
    (top_actives_rate /. rand_actives_rate)

  let fast_initial_enhancement (a: float) (l: SL.t list) =
    L.fold_lefti (fun acc i x ->
        if SL.get_label x then
          let rank = float i in
          acc +. exp (-. rank /. a)
        else
          acc
      ) 0.0 l

  let initial_enhancement (a: float) (l: SL.t list) =
    fast_initial_enhancement a (rank_order_by_score l)

  let nb_actives l =
    float
      (L.fold_left (fun acc x ->
           if SL.get_label x then acc + 1
           else acc
         ) 0 l)

  let nb_actives_a a =
    let res = ref 0 in
    A.iter (fun x -> if SL.get_label x then incr res) a;
    !res

  (* Cf. http://jcheminf.springeropen.com/articles/10.1186/s13321-016-0189-4
     for formulas:
     The power metric: a new statistically robust enrichment-type metric for
     virtual screening applications with early recovery capability
     Lopes et. al. Journal of Cheminformatics 2017 *)
  let power_metric (cutoff: float) (scores_tot: SL.t list): float =
    assert(cutoff > 0.0 && cutoff <= 1.0);
    let size_tot = float (L.length scores_tot) in
    let x = BatFloat.round (cutoff *. size_tot) in
    let size_x = int_of_float x in
    assert(size_x >= 1);
    let sorted = rank_order_by_score scores_tot in
    let scores_x = L.take size_x sorted in
    let actives_x = nb_actives scores_x in
    let actives_tot = nb_actives scores_tot in
    let tpr_x = actives_x /. actives_tot in
    let fpr_x = (x -. actives_x) /. (size_tot -. actives_tot) in
    tpr_x /. (tpr_x +. fpr_x)

  (* Same as [power_metric], but for an already sorted array of score-labels. *)
  let fast_power_metric_a (cutoff: float) (scores_tot: SL.t array): float =
    assert(cutoff > 0.0 && cutoff <= 1.0);
    let size_tot = float (A.length scores_tot) in
    let x = BatFloat.round (cutoff *. size_tot) in
    let size_x = int_of_float x in
    assert(size_x >= 1);
    let scores_x = A.sub scores_tot 0 size_x in
    let actives_x = float (nb_actives_a scores_x) in
    let actives_tot = float (nb_actives_a scores_tot) in
    let tpr_x = actives_x /. actives_tot in
    let fpr_x = (x -. actives_x) /. (size_tot -. actives_tot) in
    tpr_x /. (tpr_x +. fpr_x)

  (* formula comes from
     "Evaluating Virtual Screening Methods:
     Good and Bad Metrics for the “Early Recognition” Problem"
     Jean-François Truchon * and Christopher I. Bayly. DOI: 10.1021/ci600426e
     Reference implementation in Python:
     ---
     def calculateBEDROC(self, alpha = 20.0 ):
           if alpha < 0.00001:
               os.stderr.write( "In method calculatBEDROC,
                                 the alpha parameter argument must be
                                 greater than zero." )
               sys.exit(1)
           N = float( self.getNbrTotal() )
           n = float( self.getNbrActives() )
           sum = 0.0
           for rank in self.ranks:
               sum += math.exp( -alpha * rank / N )
           ra = n/N
           factor1 =
             ra * math.sinh( alpha/2.0 )/( math.cosh(alpha/2.0) -
                                           math.cosh(alpha/2.0 - ra*alpha ) )
           factor2 =
             1.0 / ra * (math.exp(alpha/N) - 1.0)/( 1.0 - math.exp(-alpha))
           constant = 1.0 / ( 1.0 - math.exp( alpha * ( 1.0 - ra ) ) )
           bedroc = sum * factor1 * factor2 + constant
           return bedroc
     --- *)
  let bedroc_auc_a ?alpha:(alpha = 20.0) ?sorted:(sorted = false)
      (score_labels: SL.t array): float =
    let half_alpha = 0.5 *. alpha in
    let n_tot = float (A.length score_labels) in
    let n_act = float (nb_actives_a score_labels) in
    (if not sorted then rank_order_by_score_a score_labels);
    let sum =
      A.fold_lefti (fun acc rank x ->
          if SL.get_label x then
            (* ranks must start at 1 *)
            acc +. exp (-.alpha *. (1.0 +. float rank) /. n_tot)
          else
            acc
        ) 0.0 score_labels in
    let r_a = n_act /. n_tot in
    let factor1 = r_a *. sinh half_alpha /.
                  (cosh half_alpha -. cosh (half_alpha -. r_a *. alpha)) in
    let factor2 = 1.0 /. r_a *.
                  (exp (alpha /. n_tot) -. 1.0) /. (1.0 -. exp (-.alpha)) in
    let constant = 1.0 /. (1.0 -. exp (alpha *. ( 1.0 -. r_a))) in
    sum *. factor1 *. factor2 +. constant

  let fast_bedroc_auc_a ?alpha:(alpha = 20.0) (score_labels: SL.t array) =
    bedroc_auc_a ~alpha ~sorted:true score_labels

  let bedroc_auc ?alpha:(alpha = 20.0) (score_labels: SL.t list): float =
    bedroc_auc_a ~alpha (A.of_list score_labels)

  (* accumulator type for the mcc metric *)
  type mcc_accum = { tp: int ;
                     tn: int ;
                     fp: int ;
                     fn: int }

  (* Matthews' Correlation Coefficient (MCC)
     Biblio: Matthews, B. W. (1975).
     "Comparison of the predicted and observed secondary structure of T4 phage
     lysozyme". Biochimica et Biophysica Acta (BBA)-Protein Structure,
     405(2), 442-451. *)
  let mcc classif_threshold score_labels =
    (* formula:
       mcc = (tp * tn - fp * fn) /
             sqrt ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) *)
    let acc =
      L.fold_left (fun acc sl ->
          let truth = SL.get_label sl in
          let score = SL.get_score sl in
          let prediction = score >= classif_threshold in
          match (truth, prediction) with
          | (true, true)   -> { acc with tp = acc.tp + 1 } (* TP *)
          | (false, false) -> { acc with tn = acc.tn + 1 } (* TN *)
          | (true, false)  -> { acc with fn = acc.fn + 1 } (* FN *)
          | (false, true)  -> { acc with fp = acc.fp + 1 } (* FP *)
        ) { tp = 0; tn = 0; fp = 0; fn = 0 } score_labels in
    let tp = float acc.tp in
    let tn = float acc.tn in
    let fp = float acc.fp in
    let fn = float acc.fn in
    let denum' = (tp +. fp) *. (tp +. fn) *. (tn +. fp) *. (tn +. fn) in
    if denum' = 0.0 then 0.0 (* div by 0 protection *)
    else
      let num = (tp *. tn) -. (fp *. fn) in
      let denum = sqrt denum' in
      num /. denum

  let platt_scaling ?(debug = false) score_labels =
    let scores_fn = Filename.temp_file "scores_" ".txt" in
    Utls.with_out_file scores_fn (fun out ->
        L.iter (fun sl ->
            let score = SL.get_score sl in
            let label = SL.get_label sl in
            Printf.fprintf out "%f %d\n" score (if label then 1 else 0)
          ) score_labels
      );
    let gnuplot_script_fn = Filename.temp_file "gnuplot_" ".gpl" in
    Utls.with_out_file gnuplot_script_fn (fun out ->
        Printf.fprintf out
          "g(x) = 1 / (1 + exp(a * x + b))\n\
           fit g(x) '%s' using 1:2 via a, b\n\
           print a, b\n"
          scores_fn
      );
    let a_b_str =
      Utls.get_command_output
        (Printf.sprintf "gnuplot %s 2>&1 | tail -1" gnuplot_script_fn) in
    if not debug then
      L.iter Sys.remove [scores_fn; gnuplot_script_fn];
    Scanf.sscanf a_b_str "%f %f" (fun a b -> (a, b))

  let platt_probability a b x =
    1.0 /. (1.0 +. exp (a *. x +. b))

end