123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221(* Copyright (C) 2020, Francois Berenger
Yamanishi laboratory,
Department of Bioscience and Bioinformatics,
Faculty of Computer Science and Systems Engineering,
Kyushu Institute of Technology,
680-4 Kawazu, Iizuka, Fukuoka, 820-8502, Japan. *)moduleA=ArraymoduleBA=BigarraymoduleBA1=BA.Array1moduleHt=HashtblmoduleIntMap=BatMap.IntmoduleL=MyList(* an unfolded-counted FP *)(* the int16 bigarray trick reduces memory consumption by four times compared
* to regular 64 bits OCaml integers; thanks to Oleg for suggesting it
* and to Chet Murthy for suggesting arrays *)typet=(int,BA.int_elt,BA.c_layout)BA1.tletcreate_BA1n=BA1.createBA.IntBA.C_layoutnletof_strings:t=letprevious=ref(-1)inletn=ref0inletkvs=L.of_string(funstr->Scanf.sscanfstr"%d:%d"(funkv->(* indices are >= 0 *)(* indices are incr. sorted *)(* feature counts are > 0 *)Utls.enforce_f(k>=0&&k>!previous&&v>0)(fun()->"Fingerprint.of_string: invalid line: "^s);previous:=k;incrn;(k,v)))sinletres=create_BA1(2*!n)inleti=ref0inL.iter(fun(k,v)->BA1.unsafe_setres!ik;incri;BA1.unsafe_setres!iv;incri)kvs;resletto_string(x:t):string=letbuff=Buffer.create80inletn=BA1.dimxinleti=ref0inwhile!i<ndoletk=BA1.unsafe_getx!iinletv=BA1.unsafe_getx(!i+1)inif!i=0thenPrintf.bprintfbuff"%d:%d"kvelsePrintf.bprintfbuff";%d:%d"kv;i:=!i+2done;Buffer.contentsbuffletmax_feat_idx=letn=BA1.dimxinBA1.getx(n-2)letnb_featuresx=1+(max_feat_idx)(* sparse to dense conversion *)letto_dense(max_len:int)(x:t):intarray=letres=A.makemax_len0inletn=BA1.dimxinleti=ref0inwhile!i<ndores.(BA1.unsafe_getx!i)<-BA1.unsafe_getx(!i+1);i:=!i+2done;res(* sparse to dense printf;
without creation of the intermediate dense array (for perfs) *)letto_dense_printfnb_features(x:t):unit=letn=BA1.dimxinleti=ref0inletj=ref0inwhile!i<ndoletk=BA1.unsafe_getx!iinwhile!j<kdoPrintf.printf" 0";incrjdone;letv=BA1.unsafe_getx(!i+1)inPrintf.printf" %d"v;incrj;i:=!i+2done;while!j<nb_featuresdoPrintf.printf" 0";incrjdoneletsum_min_max(m1:t)(m2:t):(int*int)=leticard=ref0inletucard=ref0inletlen1=BA1.dimm1inletlen2=BA1.dimm2inleti=ref0inletj=ref0inwhile!i<len1&&!j<len2do(* unsafe *)letk1=BA1.unsafe_getm1!iinletv1=BA1.unsafe_getm1(!i+1)inletk2=BA1.unsafe_getm2!jinletv2=BA1.unsafe_getm2(!j+1)in(* process keys in increasing order *)ifk1<k2then(ucard:=!ucard+v1;i:=!i+2)elseifk2<k1then(ucard:=!ucard+v2;j:=!j+2)else(* k1 = k2 *)ifv1<=v2then(icard:=!icard+v1;ucard:=!ucard+v2;i:=!i+2;j:=!j+2)else(icard:=!icard+v2;ucard:=!ucard+v1;i:=!i+2;j:=!j+2)done;incri;(* go to value *)while!i<len1do(* finish m1; unsafe *)ucard:=!ucard+(BA1.unsafe_getm1!i);i:=!i+2done;incrj;(* go to value *)while!j<len2do(* finish m2; unsafe *)ucard:=!ucard+(BA1.unsafe_getm2!j);j:=!j+2done;(!icard,!ucard)(* tani(A,B) = |inter(A,B)| / |union(A,B)|
= sum(min_i) / sum(max_i) *)lettanimoto(m1:t)(m2:t):float=leticard,ucard=sum_min_maxm1m2inifucard=0then0.0else(floaticard)/.(floatucard)(* tanimoto distance (this _is_ a metric) *)letdistancexy=1.0-.(tanimotoxy)(* convert to int map: feat_id -> feat_val *)letkey_valuesfp=letres=refIntMap.emptyinletlen=BA1.dimfpinleti=ref0inwhile!i<lendoletk=BA1.unsafe_getfp!iinletv=BA1.unsafe_getfp(!i+1)inres:=IntMap.addkv!res;i:=!i+2done;!resletkey_value_pairsfp=letres=ref[]inletlen=BA1.dimfpinleti=ref0inwhile!i<lendoletk=BA1.unsafe_getfp!iinletv=BA1.unsafe_getfp(!i+1)inres:=(k,v)::!res;i:=!i+2done;!res(* iterate given function on all key-value pairs *)letkv_iterffp=letlen=BA1.dimfpinleti=ref0inwhile!i<lendof(BA1.unsafe_getfp!i)(BA1.unsafe_getfp(!i+1));i:=!i+2doneletdrop_featuresto_dropfp=letkept=letkvs=key_valuesfpinIntMap.filter(funk_v->not(Ht.memto_dropk))kvsinletn=IntMap.cardinalkeptinletres=create_BA1(2*n)inleti=ref0inIntMap.iter(funkv->BA1.unsafe_setres!ik;incri;BA1.unsafe_setres!iv;incri)kept;resletsum_valuesfp=letlen=BA1.dimfpinleti=ref1in(* values start at 1 *)lettotal=ref0inwhile!i<lendototal:=!total+(BA1.unsafe_getfp!i);i:=!i+2done;!total