123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990(* Copyright (C) 2020, Francois Berenger
Yamanishi laboratory,
Department of Bioscience and Bioinformatics,
Faculty of Computer Science and Systems Engineering,
Kyushu Institute of Technology,
680-4 Kawazu, Iizuka, Fukuoka, 820-8502, Japan. *)(* mini molecule module *)moduleA=BatArraymoduleIntSet=BatSet.IntmoduleHt=BatHashtblmoduleL=BatListmoduleStringMap=BatMap.Stringtypet={name:string;graph:Node.tarray;diameter:int;matrix:intarrayarray}letget_namem=m.nameletget_graphm=m.graphletnb_atomsm=A.lengthm.graphletcreatenamegraphdiametermatrix={name;graph;diameter;matrix}letget_typ(m:t)(i:int)=Node.get_typ(A.unsafe_getm.graphi)letget_succs(m:t)(i:int)=Node.get_succs(A.unsafe_getm.graphi)(* list (sorted-uniq-counted) atom types of all atoms
at given distance from center atom *)lettypes_at_distance(center:int)(curr_height:int)(mol:t)=letmatrix_line=mol.matrix.(center)inletunsorted=A.fold_lefti(funaccix->ifx=curr_heightthen(get_typmoli)::accelseacc)[]matrix_linein(* layer at 'curr_height' *)(curr_height,Utls.list_uniq_countunsorted)letencode(max_height:int)(mol:t):(Atom_env.t*int)list=(* compute atom envs. of given atom up to maximum height allowed *)(* we cannot go deeper than 'maxi' on this molecule *)letmaxi=minmax_heightmol.diameterinletencode_atom(n_i:int):Atom_env.t=letdepths=L.range0`Tomaxiinletlayers=L.map(funheight->types_at_distancen_iheightmol)depthsin(* non empty layers *)L.filter(fun(_h,typs)->typs<>[])layersinletnb_atoms=A.lengthmol.graphinletatom_indexes=L.range0`To(nb_atoms-1)in(* canonicalize the encoding of the molecule by sorting its atom envs
and counting duplicates *)letatom_envs=L.mapencode_atomatom_indexesinUtls.list_uniq_countatom_envs(* encode the molecule to counted atom pairs *)letatom_pairs(mol:t):(Atom_pair.t*int)list=letn=nb_atomsmolinassert(n>=1);(* at least one heavy atom *)letmax_nb_pairs=max1(n*(n-1)/2)inletpair2count=Ht.createmax_nb_pairsinfori=0ton-1dolettype_i=get_typmoliinforj=iton-1dolettype_j=get_typmoljinletdist=A.unsafe_get(A.unsafe_getmol.matrixi)jinletpair=Atom_pair.createtype_itype_jdistinletprev_count=Ht.find_defaultpair2countpair0inHt.replacepair2countpair(prev_count+1)done;done;(* canonicalization will be done later; when the features (string) are
* converted to feature ids (int) *)Ht.bindingspair2count