(* Copyright (C) 2020, Francois Berenger
Yamanishi laboratory,
Department of Bioscience and Bioinformatics,
Faculty of Computer Science and Systems Engineering,
Kyushu Institute of Technology,
680-4 Kawazu, Iizuka, Fukuoka, 820-8502, Japan. *)

(* Multi-Scale-Encoded molecule *)

open Printf

module L = MyList
module Log = Dolog.Log
module String = BatString
module StringMap = BatMap.String

(* A molecule: its name and a map from feature (a string) to its
   integer count *)
type t = { name: string;
           map: int StringMap.t }

(* Build a molecule from its name and its feature-to-count map *)
let create name map =
  { name; map }

(* Name of the molecule *)
let get_name x =
  x.name

(* Feature-to-count map of the molecule *)
let get_map x =
  x.map

(* Parse one feature line: a whitespace-free feature string, one space,
   then an integer count. Returns the (feature, count) pair.
   On a parse failure, the offending line is reported on stderr and the
   exception raised by Scanf is re-raised unchanged. *)
let feat_count_of_string s =
  try Scanf.sscanf s "%s %d" (fun feat count -> (feat, count))
  with exn ->
    (* trailing newline: keep this message on its own line on stderr *)
    (eprintf "MSE_mol.feat_count_of_string: cannot parse: %s\n" s;
     raise exn)

(* Construct one molecule from all its constituent lines, already read
   from the input file: the name line (starting with a '#' char)
   followed by its feature lines.
   Raises Failure if the list is empty. *)
let read_one = function
  | [] -> failwith "MSE_mol.read_one: empty list"
  | name_line :: feat_count_strs ->
    (* molecule separator is a line starting with a '#' char;
       starts_with also gives a clean assertion failure on an empty line,
       where indexing the first char would raise Invalid_argument *)
    assert(String.starts_with name_line "#");
    let name = String.lchop name_line in (* remove the leading '#' *)
    let map =
      List.fold_left (fun acc line ->
          let feat, count = feat_count_of_string line in
          (* feature cannot already be here; otherwise,
             there was a problem during encoding of the molecule *)
          if StringMap.mem feat acc then
            Log.warn "mol: %s dup feat: %s" name feat;
          (* on a duplicate, the count seen last replaces the previous one *)
          StringMap.add feat count acc
        ) StringMap.empty feat_count_strs in
    create name map

(* Name line of the next molecule, already consumed from the input by the
   previous call to get_lines; the empty string means no pending name line *)
let previous_name = ref ""

(* Raised internally by get_lines to leave its reading loop *)
exception Break

(* Get the lines for just one molecule (i.e. for one later call to
   read_one): the name line followed by its feature lines, in input order.
   Since the name line of the following molecule must be read in order to
   detect the end of the current one, it is stashed in previous_name
   for the next call.
   End_of_file propagates to the caller only when it is raised while
   reading a name line, i.e. when there is no molecule left. *)
let get_lines input =
  if !previous_name = "" then
    begin
      (* no pending name line: read it from the input *)
      let line = input_line input in
      assert(BatString.starts_with line "#"); (* enforce name line *)
      previous_name := line
    end;
  (* lines are accumulated in reverse order *)
  let acc = ref [!previous_name] in
  try
    while true do
      let line' = input_line input in
      if BatString.starts_with line' "#" then
        (* this is the start of another molecule *)
        begin
          previous_name := line';
          raise Break
        end
      else
        acc := line' :: !acc
    done;
    assert(false) (* for typing: should never be reached at exec *)
  with Break -> L.rev !acc
     | End_of_file ->
       begin
         (* last molecule of the input: nothing is pending anymore *)
         previous_name := "";
         L.rev !acc
       end

(* Decode all the molecules found in a list of lines already in memory.
   Each molecule is one name line (starting with a '#' char) followed by
   its feature lines, if any. Molecules are returned in input order.
   The assertion fails if a molecule does not start with a name line. *)
let of_lines lines =
  let is_name_line l = String.starts_with l "#" in
  let rec loop acc = function
    | [] -> L.rev acc
    | name :: rest when is_name_line name ->
      (* exactly one name line per molecule, so that a molecule without
         any feature line followed by another molecule is accepted,
         as get_lines followed by read_one would do *)
      (* NOTE(review): L.fold_while is assumed to consume the longest
         prefix satisfying the predicate and to return the accumulator
         along with the unconsumed lines -- confirm in MyList *)
      let feat_counts, remaining_mols =
        L.fold_while (fun l -> not (is_name_line l))
          (fun acc x -> x :: acc) [] rest in
      let mol = read_one (name :: feat_counts) in
      loop (mol :: acc) remaining_mols
    | _ -> assert(false) (* a molecule must start with a name line *)
  in
  loop [] lines