(* Copyright (C) 2019, Francois Berenger
Yamanishi laboratory,
Department of Bioscience and Bioinformatics,
Faculty of Computer Science and Systems Engineering,
Kyushu Institute of Technology,
680-4 Kawazu, Iizuka, Fukuoka, 820-8502, Japan. *)

(* Multi-Scale-Encoded molecule *)

open Printf

module L = MyList
module Log = Dolog.Log
module String = BatString
module StringMap = BatMap.String

(* a molecule: its name, plus a map from feature string to integer count *)
type t = { name: string;
           map: int StringMap.t }

(* build a molecule from its name and its feature-count map *)
let create name map =
  { name; map }

(* name of the molecule *)
let get_name x =
  x.name

(* feature-count map of the molecule *)
let get_map x =
  x.map

(* parse one "<feature> <count>" line into a (feature, count) pair.
   On failure, the offending line is reported on stderr and the exception
   coming out of Scanf is re-raised unchanged. *)
let feat_count_of_string s =
  try Scanf.sscanf s "%s %d" (fun feat count -> (feat, count))
  with exn ->
    (* the message is newline-terminated so that it does not run into
       whatever gets printed on stderr next *)
    (eprintf "MSE_mol.feat_count_of_string: cannot parse: %s\n" s;
     raise exn)

(* read one molecule from its lines: a header line starting with '#',
   followed by zero or more "<feature> <count>" lines.
   Raises Failure on an empty list; the header check is an assertion;
   any parse error from [feat_count_of_string] propagates. *)
let read_one = function
  | [] -> failwith "MSE_mol.read_one: empty list"
  | name_line :: feat_count_strs ->
    (* molecule separator is a line starting with a '#' char *)
    assert(String.get name_line 0 = '#');
    let name = String.lchop name_line in (* remove it *)
    let map =
      List.fold_left (fun acc line ->
          let feat, count = feat_count_of_string line in
          (* feature cannot already be here; otherwise,
             there was a problem during encoding of the molecule.
             A duplicate is only warned about: the feature is still
             added to the map just below. *)
          if StringMap.mem feat acc then
            Log.warn "mol: %s dup feat: %s" name feat;
          StringMap.add feat count acc
        ) StringMap.empty feat_count_strs in
    create name map

(* read all molecules from a list of lines; molecules are returned in
   the order in which their header lines appear in [lines].
   NOTE(review): [L.fold_while] comes from MyList, which is not visible
   here; from its use below it presumably accumulates elements while the
   predicate holds and returns (accumulator, remaining elements) -- TODO
   confirm. *)
let of_lines lines =
  let rec loop acc ls =
    match ls with
    | [] -> L.rev acc
    | _ ->
      (* header line(s) at the front of what remains to be read *)
      let name_l, rest =
        L.fold_while (fun l -> String.starts_with l "#")
          (fun acc x -> x :: acc) [] ls in
      (match name_l with
       | [name] ->
         (* all lines up to the next header belong to this molecule *)
         (let feat_counts, remaining_mols =
            L.fold_while (fun l -> not (String.starts_with l "#"))
              (fun acc x -> x :: acc) [] rest in
          let mol = read_one (name :: feat_counts) in
          loop (mol :: acc) remaining_mols)
       | _ ->
         (* anything other than exactly one header line here (none, or
            several in a row) is treated as a broken invariant *)
         assert(false)) in
  loop [] lines