123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501moduleA=BatArraymoduleL=MyListmoduletypeSCORE_LABEL=sigtypetvalget_score:t->floatvalget_label:t->boolendmoduletypeROC_FUNCTOR=functor(SL:SCORE_LABEL)->sig(** sort score labels putting high scores first *)valrank_order_by_score:SL.tlist->SL.tlist(** in-place sort of score labels; putting high scores first *)valrank_order_by_score_a:SL.tarray->unit(** cumulated actives curve given an already sorted list of score labels *)valcumulated_actives_curve:SL.tlist->intlist(** ROC curve (list of (FPR,TPR) values) corresponding to
those score labels *)valroc_curve:SL.tlist->(float*float)list(** same as [roc_curve] but for an already sorted array of score-labels *)valfast_roc_curve_a:SL.tarray->(float*float)array(** Precision Recall curve (list of (recall,precision) pairs)
corresponding to given score labels *)valpr_curve:SL.tlist->(float*float)list(** compute Area Under the ROC curve given an already sorted list of
score labels *)valfast_auc:SL.tlist->float(** ROC AUC: Area Under the ROC curve given an unsorted list
of score labels *)valauc:SL.tlist->float(** PR AUC: Area Under the Precision-Recall curve given an unsorted list
of score labels *)valpr_auc:SL.tlist->float(** Area Under the ROC curve given an unsorted array
of score labels; WARNING: array will be modified (sorted) *)valauc_a:SL.tarray->float(** Area Under the ROC curve given an already sorted array
of score labels *)valfast_auc_a:SL.tarray->float(** (early) enrichment factor at given threshold (database percentage)
given an unsorted list of score labels *)valenrichment_factor:float->SL.tlist->float(** (early) enrichment factor at given threshold (database percentage)
given an already sorted array of score labels *)valfast_enrichment_factor:float->SL.tarray->float(** [initial_enhancement a score_labels] will compute
S = sum_over_i (exp (-rank(active_i) / a))
given an unsorted list of score labels.
Robust Initial Enhancement (RIE) = S/<S> where <S> is
the average S for randomly ordered score labels.
RIE = 1.0 <=> random performance. Cf. DOI:10.1021/ci0100144
for details. *)valinitial_enhancement:float->SL.tlist->float(** same as [initial_enhancement] but does not reorder the list
of score_labels *)valfast_initial_enhancement:float->SL.tlist->float(** power metric at given threshold given an unsorted list of score labels *)valpower_metric:float->SL.tlist->float(** power metric at given threshold for an already decr. sorted list of score labels *)valfast_power_metric_a:float->SL.tarray->float(** bedroc_auc at given alpha. Default alpha = 20.0. *)valbedroc_auc:?alpha:float->SL.tlist->float(** bedroc_auc at given alpha for an array of score-labels.
Default alpha = 20.0.
WARNING: the array will be modified (sorted by decrasing scores)
if [sorted = false] which is the default. *)valbedroc_auc_a:?alpha:float->?sorted:bool->SL.tarray->float(** equivalent to [bedroc_auc_a ~alpha ~sorted:true arr]. *)valfast_bedroc_auc_a:?alpha:float->SL.tarray->float(** Matthews' Correlation Coefficient (MCC)
use: [mcc classif_threshold score_labels].
scores >= threshold are predicted as targets;
scores < threshold are predicted as non targets. *)valmcc:float->SL.tlist->float(** [a, b = platt_scaling ~debug score_labels]
Fit a logistic curve (1 / (1 + exp (ax + b))) to [score_labels]
and return its [(a, b)] parameters.
Gnuplot it used underneath to do the fitting.
Biblio:
Platt, J. (1999). Probabilistic outputs for support vector machines
and comparisons to regularized likelihood methods.
Advances in large margin classifiers, 10(3), 61-74. *)valplatt_scaling:?debug:bool->SL.tlist->(float*float)(** [platt_probability a b score] transform [score] into
a probability, given logistic function parameters [a] and [b]
obtained from a prior call to [platt_scaling]. *)valplatt_probability:float->float->float->floatend(* functions for ROC analysis *)moduleMake:ROC_FUNCTOR=functor(SL:SCORE_LABEL)->structlettrapezoid_surfacex1x2y1y2=letbase=abs_float(x1-.x2)inletheight=0.5*.(y1+.y2)inbase*.heightletrev_compare_scoresxy=BatFloat.compare(SL.get_scorey)(SL.get_scorex)(* put molecules with the highest scores at the top of the list *)letrank_order_by_score(score_labels:SL.tlist)=L.stable_sortrev_compare_scoresscore_labels(* put molecules with the highest scores at the top of the array *)letrank_order_by_score_a(score_labels:SL.tarray)=A.stable_sortrev_compare_scoresscore_labels(* compute the cumulated number of actives curve,
given an already sorted list of score labels *)letcumulated_actives_curve(high_scores_first:SL.tlist)=letsum=ref0inL.map(funsl->ifSL.get_labelslthenincrsum;!sum)high_scores_firstletroc_curve(score_labels:SL.tlist)=lethigh_scores_first=rank_order_by_scorescore_labelsinletnacts=ref0inletndecs=ref0inletnb_act_decs=L.fold_left(funaccx->ifSL.get_labelxthenincrnactselseincrndecs;(!nacts,!ndecs)::acc)[(0,0)]high_scores_firstinletnb_actives=float!nactsinletnb_decoys=float!ndecsinL.rev_map(fun(na,nd)->lettpr=floatna/.nb_activesinletfpr=floatnd/.nb_decoysin(fpr,tpr))nb_act_decsletfast_roc_curve_a(score_labels:SL.tarray)=letnacts=ref0inletndecs=ref0inletnb_act_decs=A.map(funx->ifSL.get_labelxthenincrnactselseincrndecs;(!nacts,!ndecs))score_labelsinletnb_actives=float!nactsinletnb_decoys=float!ndecsinA.map(fun(na,nd)->lettpr=floatna/.nb_activesinletfpr=floatnd/.nb_decoysin(fpr,tpr))nb_act_decs(* Saito, T., & Rehmsmeier, M. (2015).
The precision-recall plot is more informative than the ROC plot when
evaluating binary classifiers on imbalanced datasets.
PloS one, 10(3), e0118432. *)(* Davis, J., & Goadrich, M. (2006, June).
The relationship between Precision-Recall and ROC curves.
In Proceedings of the 23rd international conference on Machine learning
(pp. 233-240). ACM. *)letpr_curve(score_labels:SL.tlist)=letprecisiontpfp=tp/.(tp+.fp)inletrecalltpfn=tp/.(tp+.fn)inletnegatepx=not(px)inlethigh_scores_first_uniq=letall_scores=L.mapSL.get_scorescore_labelsinL.sort_uniq(funxy->BatFloat.compareyx)all_scoresin(* L.iter (Printf.printf "threshold: %f\n") high_scores_first_uniq; *)lethigh_scores_first=rank_order_by_scorescore_labelsinletbefore=ref[]inletafter=refhigh_scores_firstinletres=L.map(funthreshold->lethigher,lower=L.partition_while(funx->(SL.get_scorex)>=threshold)!afterinbefore:=L.rev_appendhigher!before;after:=lower;(* TP <=> (score >= t) && label *)lettp=float(L.filter_count(SL.get_label)!before)in(* FN <=> (score < t) && label *)letfn=float(L.filter_count(SL.get_label)!after)in(* FP <=> (score >= t) && (not label) *)letfp=float(L.filter_count(negateSL.get_label)!before)inletr=recalltpfninletp=precisiontpfpin(r,p))high_scores_first_uniqin(0.0,1.0)::res(* add missing first point *)letfast_auc_commonfold_funhigh_scores_first=letfp,tp,fp_prev,tp_prev,a,_p_prev=fold_fun(fun(fp,tp,fp_prev,tp_prev,a,p_prev)sl->letsi=SL.get_scoreslinletli=SL.get_labelslinletnew_a,new_p_prev,new_fp_prev,new_tp_prev=ifsi<>p_prevthena+.trapezoid_surfacefpfp_prevtptp_prev,si,fp,tpelsea,p_prev,fp_prev,tp_previnletnew_tp,new_fp=iflithentp+.1.,fpelsetp,fp+.1.in(new_fp,new_tp,new_fp_prev,new_tp_prev,new_a,new_p_prev))(0.,0.,0.,0.,0.,neg_infinity)high_scores_firstin(a+.trapezoid_surfacefpfp_prevtptp_prev)/.(fp*.tp)(* area under the ROC curve given an already sorted list of score-labels *)letfast_auchigh_scores_first=fast_auc_commonL.fold_lefthigh_scores_firstletfast_auc_ahigh_scores_first=fast_auc_commonA.fold_lefthigh_scores_first(* area under the ROC curve given an unsorted list of score-labels
TP cases have the label set to true
TN cases have the label unset *)letauc(score_labels:SL.tlist)=lethigh_scores_first=rank_order_by_scorescore_labelsinfast_auchigh_scores_firstletpr_auc(score_labels:SL.tlist)=letcurve=pr_curvescore_labelsinletrecloopacc=function|[]->acc|[_]->acc|(x1,y1)::(x2,y2)::xs->letarea=trapezoid_surfacex1x2y1y2inloop(area+.acc)((x2,y2)::xs)inloop0.0curveletauc_a(score_labels:SL.tarray)=rank_order_by_score_ascore_labels;fast_auc_ascore_labels(* proportion of actives given an unsorted list of score-labels
TP cases have the label set to true
TN cases have the label unset
returns: (nb_molecules, actives_rate) *)letactives_rate(score_labels:SL.tlist)=lettp_count,fp_count=L.fold_left(fun(tp_c,fp_c)sl->ifSL.get_labelslthen(tp_c+1,fp_c)else(tp_c,fp_c+1))(0,0)score_labelsinletnb_molecules=tp_count+fp_countin(nb_molecules,(floattp_count)/.(floatnb_molecules))(* enrichment rate at x (e.g. x = 0.01 --> ER @ 1%) given a list
of unsorted score-labels *)letenrichment_factor(p:float)(score_labels:SL.tlist)=letnb_molecules,rand_actives_rate=actives_ratescore_labelsinlettop_n=BatFloat.round_to_int(p*.(floatnb_molecules))inlettop_p_percent_molecules=L.taketop_n(rank_order_by_scorescore_labels)inlet_,top_actives_rate=actives_ratetop_p_percent_moleculesinletenr_rate=top_actives_rate/.rand_actives_rateinenr_rate(* this should land in batteries, not here... *)letarray_filter_countpa=letcount=ref0inA.iter(funx->ifpxthenincrcount)a;!countletarray_actives_ratea=letnb_actives=array_filter_countSL.get_labelainletn=A.lengthain(floatnb_actives)/.(floatn)letfast_enrichment_factorpscore_labels=letnb_molecules=A.lengthscore_labelsinletrand_actives_rate=array_actives_ratescore_labelsinlettop_n=BatFloat.round_to_int(p*.(floatnb_molecules))inlettop_p_percent_molecules=A.subscore_labels0top_ninlettop_actives_rate=array_actives_ratetop_p_percent_moleculesin(top_actives_rate/.rand_actives_rate)letfast_initial_enhancement(a:float)(l:SL.tlist)=L.fold_lefti(funaccix->ifSL.get_labelxthenletrank=floatiinacc+.exp(-.rank/.a)elseacc)0.0lletinitial_enhancement(a:float)(l:SL.tlist)=fast_initial_enhancementa(rank_order_by_scorel)letnb_activesl=float(L.fold_left(funaccx->ifSL.get_labelxthenacc+1elseacc)0l)letnb_actives_aa=letres=ref0inA.iter(funx->ifSL.get_labelxthenincrres)a;!res(* Cf. http://jcheminf.springeropen.com/articles/10.1186/s13321-016-0189-4
for formulas:
The power metric: a new statistically robust enrichment-type metric for
virtual screening applications with early recovery capability
Lopes et. al. Journal of Cheminformatics 2017 *)letpower_metric(cutoff:float)(scores_tot:SL.tlist):float=assert(cutoff>0.0&&cutoff<=1.0);letsize_tot=float(L.lengthscores_tot)inletx=BatFloat.round(cutoff*.size_tot)inletsize_x=int_of_floatxinassert(size_x>=1);letsorted=rank_order_by_scorescores_totinletscores_x=L.takesize_xsortedinletactives_x=nb_activesscores_xinletactives_tot=nb_activesscores_totinlettpr_x=actives_x/.actives_totinletfpr_x=(x-.actives_x)/.(size_tot-.actives_tot)intpr_x/.(tpr_x+.fpr_x)(* Same as [power_metric], but for an already sorted array of score-labels. *)letfast_power_metric_a(cutoff:float)(scores_tot:SL.tarray):float=assert(cutoff>0.0&&cutoff<=1.0);letsize_tot=float(A.lengthscores_tot)inletx=BatFloat.round(cutoff*.size_tot)inletsize_x=int_of_floatxinassert(size_x>=1);letscores_x=A.subscores_tot0size_xinletactives_x=float(nb_actives_ascores_x)inletactives_tot=float(nb_actives_ascores_tot)inlettpr_x=actives_x/.actives_totinletfpr_x=(x-.actives_x)/.(size_tot-.actives_tot)intpr_x/.(tpr_x+.fpr_x)(* formula comes from
"Evaluating Virtual Screening Methods:
Good and Bad Metrics for the “Early Recognition” Problem"
Jean-François Truchon * and Christopher I. Bayly. DOI: 10.1021/ci600426e
Reference implementation in Python:
---
def calculateBEDROC(self, alpha = 20.0 ):
if alpha < 0.00001:
os.stderr.write( "In method calculatBEDROC,
the alpha parameter argument must be
greater than zero." )
sys.exit(1)
N = float( self.getNbrTotal() )
n = float( self.getNbrActives() )
sum = 0.0
for rank in self.ranks:
sum += math.exp( -alpha * rank / N )
ra = n/N
factor1 =
ra * math.sinh( alpha/2.0 )/( math.cosh(alpha/2.0) -
math.cosh(alpha/2.0 - ra*alpha ) )
factor2 =
1.0 / ra * (math.exp(alpha/N) - 1.0)/( 1.0 - math.exp(-alpha))
constant = 1.0 / ( 1.0 - math.exp( alpha * ( 1.0 - ra ) ) )
bedroc = sum * factor1 * factor2 + constant
return bedroc
--- *)letbedroc_auc_a?alpha:(alpha=20.0)?sorted:(sorted=false)(score_labels:SL.tarray):float=lethalf_alpha=0.5*.alphainletn_tot=float(A.lengthscore_labels)inletn_act=float(nb_actives_ascore_labels)in(ifnotsortedthenrank_order_by_score_ascore_labels);letsum=A.fold_lefti(funaccrankx->ifSL.get_labelxthen(* ranks must start at 1 *)acc+.exp(-.alpha*.(1.0+.floatrank)/.n_tot)elseacc)0.0score_labelsinletr_a=n_act/.n_totinletfactor1=r_a*.sinhhalf_alpha/.(coshhalf_alpha-.cosh(half_alpha-.r_a*.alpha))inletfactor2=1.0/.r_a*.(exp(alpha/.n_tot)-.1.0)/.(1.0-.exp(-.alpha))inletconstant=1.0/.(1.0-.exp(alpha*.(1.0-.r_a)))insum*.factor1*.factor2+.constantletfast_bedroc_auc_a?alpha:(alpha=20.0)(score_labels:SL.tarray)=bedroc_auc_a~alpha~sorted:truescore_labelsletbedroc_auc?alpha:(alpha=20.0)(score_labels:SL.tlist):float=bedroc_auc_a~alpha(A.of_listscore_labels)(* accumulator type for the mcc metric *)typemcc_accum={tp:int;tn:int;fp:int;fn:int}(* Matthews' Correlation Coefficient (MCC)
Biblio: Matthews, B. W. (1975).
"Comparison of the predicted and observed secondary structure of T4 phage
lysozyme". Biochimica et Biophysica Acta (BBA)-Protein Structure,
405(2), 442-451. *)letmccclassif_thresholdscore_labels=(* formula:
mcc = (tp * tn - fp * fn) /
sqrt ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) *)letacc=L.fold_left(funaccsl->lettruth=SL.get_labelslinletscore=SL.get_scoreslinletprediction=score>=classif_thresholdinmatch(truth,prediction)with|(true,true)->{accwithtp=acc.tp+1}(* TP *)|(false,false)->{accwithtn=acc.tn+1}(* TN *)|(true,false)->{accwithfn=acc.fn+1}(* FN *)|(false,true)->{accwithfp=acc.fp+1}(* FP *)){tp=0;tn=0;fp=0;fn=0}score_labelsinlettp=floatacc.tpinlettn=floatacc.tninletfp=floatacc.fpinletfn=floatacc.fninletdenum'=(tp+.fp)*.(tp+.fn)*.(tn+.fp)*.(tn+.fn)inifdenum'=0.0then0.0(* div by 0 protection *)elseletnum=(tp*.tn)-.(fp*.fn)inletdenum=sqrtdenum'innum/.denumletplatt_scaling?(debug=false)score_labels=letscores_fn=Filename.temp_file"scores_"".txt"inUtls.with_out_filescores_fn(funout->L.iter(funsl->letscore=SL.get_scoreslinletlabel=SL.get_labelslinPrintf.fprintfout"%f %d\n"score(iflabelthen1else0))score_labels);letgnuplot_script_fn=Filename.temp_file"gnuplot_"".gpl"inUtls.with_out_filegnuplot_script_fn(funout->Printf.fprintfout"g(x) = 1 / (1 + exp(a * x + b))\n\
fit g(x) '%s' using 1:2 via a, b\n\
print a, b\n"scores_fn);leta_b_str=Utls.get_command_output(Printf.sprintf"gnuplot %s 2>&1 | tail -1"gnuplot_script_fn)inifnotdebugthenL.iterSys.remove[scores_fn;gnuplot_script_fn];Scanf.sscanfa_b_str"%f %f"(funab->(a,b))letplatt_probabilityabx=1.0/.(1.0+.exp(a*.x+.b))end