123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416(* ********************************* *)(* Preliminary stuff for xml parsing *)(* ********************************* *)typetree=Eofstring*Xmlm.attributelist*treelist|Dofstringletin_treei=letel((_,tag),attrs)children=E(tag,attrs,children)inletdatad=DdinXmlm.input_doc_tree~el~datailettree_of_stringstr=in_tree(Xmlm.make_input(`String(0,str)))letrecstring_of_tree=function|E(tag,_,children)->sprintf"<%s>%s<%s/>"tag(String.concat~sep:""(List.map~f:string_of_treechildren))tag|Ds->sletattrk=function|E(_,attrs,_)->List.find_mapattrs~f:(fun((_,k'),value)->ifString.(k=k')thenSomevalueelseNone)|_->Noneletbattrkx=Option.map(attrkx)~f:bool_of_stringletleaf_exnfkx=trymatchxwith|E(_,_,children)->(matchList.find_mapchildren~f:(function|E(tag,_,[Dd])whenString.(tag=k)->Some(fd)|_->None)with|Somex->x|None->raiseCaml.Not_found)|D_->raiseCaml.Not_foundwithCaml.Not_found->invalid_arg(sprintf"Entrez.leaf: no %s child"k)letileaf_exn=leaf_exnint_of_stringletsleaf_exn=leaf_exnFn.idletsleafkx=trySome(sleaf_exnkx)withInvalid_argument_->Noneletleavesfkt=matchtwithE(_,_,children)->List.filter_mapchildren~f:(function|E(tag,_,[Dd])whenString.equaltagk->Some(fd)|_->None)|_->[]letsleaves=leavesFn.idlettag_of_tree=function|E(tag,_,_)->Sometag|D_->Noneletechild_exnk=function|E(_,_,children)->begintry(matchList.find_mapchildren~f:(function|E(tag,_,_)asrwhenString.equaltagk->Somer|_->None)with|Somex->x|None->raiseCaml.Not_found)withCaml.Not_found->(lettags=List.filter_map~f:tag_of_treechildreninletmsg=sprintf"child: looked for %s but only got %s children"k(String.concat~sep:","tags)inraise(Invalid_argumentmsg))end|D_->(letmsg=sprintf"child: looked for %s tag but only got a PCDATA node"kinraise(Invalid_argumentmsg))letechildkx=trySome(echild_exnkx)with_->Noneletfold_echildren?tagf=letpred=Option.value_maptag~default:(fun_->true)~f:String.(=)infunxinit->matchxwith|E(_,_,children)->List.fold_rightchildren~init~f:(funxaccu->matchxwith|E(tag,_,_)asxwhenpredtag->fxaccu|_->accu)|D_->initletmap_echildren?tagfx=fold_echildren?tag(funxaccu->(fx)::accu)x[](*
exhaustive list of databases:
http://www.ncbi.nlm.nih.gov/books/NBK25497/table/chapter2.chapter2_table1/?report=objectonly
*)typedatabase=[|`gene|`genome|`geodatasets|`geoprofiles|`protein|`pubmed|`pubmedcentral|`sra|`unigene|`taxonomy]letid_of_database=function|`pubmed->"pubmed"|`gene->"gene"|`unigene->"unigene"|`genome->"genome"|`geoprofiles->"geoprofiles"|`geodatasets->"geodatasets"|`pubmedcentral->"pmc"|`protein->"protein"|`sra->"sra"|`taxonomy->"taxonomy"letsearch_base_url="http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"letparametersl=List.filter_map~f:Fn.idl|>List.map~f:(fun(k,v)->sprintf"%s=%s"kv)|>String.concat~sep:"&"letstring_of_datetype=function|`pdat->"pdat"|`mdat->"mdat"|`edat->"edat"letesearch_url?retstart?retmax?rettype?field?datetype?reldate?mindate?maxdatedatabasequery=search_base_url^"?"^parametersOption.([Some("db",id_of_databasedatabase);Some("term",Uri.pct_encodequery);map~f:(funi->"retstart",string_of_inti)retstart;map~f:(funi->"reldate",string_of_inti)reldate;map~f:(funi->"retmax",string_of_inti)retmax;map~f:(function`uilist->("rettype","uilist")|`count->("rettype","count"))rettype;map~f:(funs->"field",s)field;map~f:(fundt->"datetype",string_of_datetypedt)datetype;map~f:(fund->"mindate",d)mindate;map~f:(fund->"maxdate",d)maxdate;])typeesearch_answer={count:int;retmax:int;retstart:int;ids:stringlist}letesearch_answer_of_tree=function|E("eSearchResult",_,_)ast->{count=ileaf_exn"Count"t;retmax=ileaf_exn"RetMax"t;retstart=ileaf_exn"RetStart"t;ids=echild_exn"IdList"t|>sleaves"Id"}|_->assertfalseletesearch_answer_of_stringstr=tree_of_stringstr|>snd|>esearch_answer_of_treeletsummary_base_url="http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"letesummary_url?retstart?retmaxdbids=ifList.lengthids>200thenraise(Invalid_argument"Entrez.esummary_url: cannot fetch more than 200 summaries");summary_base_url^"?"^parametersOption.([Some("db",id_of_databasedb);Some("id",String.concat~sep:","ids);Some("version","2.0");map~f:(funi->"retstart",string_of_inti)retstart;map~f:(funi->"retmax",string_of_inti)retmax;])letfetch_base_url="http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"letstring_of_retmode=function|`xml->"xml"|`asn_1->"asn.1"|`text->"text"letefetch_url?rettype?retmode?retstart?retmax?strand?seq_start?seq_stopdbids=ifList.lengthids>200thenraise(Invalid_argument"Entrez.efetch_url: cannot fetch more than 200 records");fetch_base_url^"?"^parametersOption.([Some("db",id_of_databasedb);Some("id",String.concat~sep:","ids);map~f:(funr->"rettype",r)rettype;map~f:(funr->"retmode",string_of_retmoder)retmode;map~f:(funi->"retstart",string_of_inti)retstart;map~f:(funi->"retmax",string_of_inti)retmax;map~f:(funs->"strand",matchswith`plus->"1"|`minus->"2")strand;map~f:(funi->"seq_start",string_of_inti)seq_start;map~f:(funi->"seq_stop",string_of_inti)seq_stop;])moduletypeFetch=sigtype'afetchedvalfetch:string->(string->'a)->'afetchedval(>>=):'afetched->('a->'bfetched)->'bfetchedval(>|=):'afetched->('a->'b)->'bfetchedendmoduleMake(F:Fetch)=structopenF(* DTD for the databases can be found at http://www.ncbi.nlm.nih.gov/data_specs/dtd/NCBI_Entrezgene.dtd *)letsearch_and_fetchdatabaseof_xmlquery=letquery_url=esearch_urldatabasequeryinfetchquery_urlesearch_answer_of_string>>=funanswer->letobject_url=efetch_url~retmode:`xmldatabaseanswer.idsinfetchobject_url(funx->x|>tree_of_string|>snd|>of_xml)letsearch_and_summarydatabaseof_xmlquery=letquery_url=esearch_urldatabasequeryinfetchquery_urlesearch_answer_of_string>>=funanswer->letobject_url=esummary_urldatabaseanswer.idsinfetchobject_url(funx->x|>tree_of_string|>snd|>of_xml)moduleObject_id=structtypet=[`intofint|`stringofstring]letto_string=function|`inti->string_of_inti|`strings->sletof_xmlx=try`int(ileaf_exn"Object-id_id"x)with_->(try`string(sleaf_exn"Object-id_str"x)with_->invalid_arg(sprintf"Entrez.Make.Object_id.of_xml: %s"(string_of_treex)))endmoduleDbtag=structtypet={db:string;tag:Object_id.t;}letof_xmlx={db=sleaf_exn"Dbtag_db"x;tag=Object_id.of_xml(x|>echild_exn"Dbtag_tag"|>echild_exn"Object-id")}endmoduleGene_ref=structtypet={locus:stringoption;allele:stringoption;desc:stringoption;maploc:stringoption;pseudo:booloption;db:Dbtag.tlist;}letof_xmlt=lett=echild_exn"Gene-ref"tin{locus=sleaf"Gene-ref_locus"t;allele=sleaf"Gene-ref_allele"t;desc=sleaf"Gene-ref_desc"t;maploc=sleaf"Gene-ref_maploc"t;pseudo=Option.bind(echild"Gene-ref_pseudo"t)~f:(battr"value");db=Option.value_map(echild"Gene-ref_db"t)~default:[]~f:(map_echildren~tag:"Dbtag"Dbtag.of_xml);}endmodulePubmedSummary=struct(* DTD is at http://www.ncbi.nlm.nih.gov/entrez/query/DTD/eSummaryDTD/eSummary_pubmed.dtd *)typet={pmid:int;doi:stringoption;pubdate:stringoption;source:stringoption;title:string;}letparse_article_idsx=map_echildren~tag:"ArticleId"(funx->sleaf_exn"IdType"x,sleaf_exn"Value"x)xletparse_document_summaryx=letarticle_ids=parse_article_ids(echild_exn"ArticleIds"x)in{pmid=int_of_string(List.Assoc.find_exn~equal:String.equalarticle_ids"pubmed");doi=List.Assoc.find~equal:String.equalarticle_ids"doi";pubdate=sleaf"PubDate"x;source=sleaf"Source"x;title=sleaf_exn"Title"x}letparse_eSummaryResultx=map_echildren~tag:"DocumentSummary"parse_document_summary(echild_exn"DocumentSummarySet"x)letsearch=search_and_summary`pubmedparse_eSummaryResultendmodulePubmed=structtypet={pmid:int;title:string;abstract:string;}letparse_book_documentbd={pmid=ileaf_exn"PMID"bd;title=sleaf_exn"ArticleTitle"bd;abstract=echild_exn"Abstract"bd|>sleaf_exn"AbstractText"}letparse_medline_citationmc=letarticle=echild_exn"Article"mcin{pmid=ileaf_exn"PMID"mc;title=sleaf_exn"ArticleTitle"article;abstract=echild_exn"Abstract"article|>sleaf_exn"AbstractText"}letparse_pubmed_article_set_elementx=matchtag_of_treexwith|Some"PubmedArticle"->Some(parse_medline_citation(echild_exn"MedlineCitation"x))|Some"PubmedBookArticle"->Some(parse_book_document(echild_exn"BookDocument"x))|Somet->failwith(sprintf"Unexpected %s tag while parsing PubmedArticleSet element"t)|None->Noneletparse_pubmed_article_set=function|E("PubmedArticleSet",_,children)->List.filter_map~f:parse_pubmed_article_set_elementchildren|_->assertfalseletsearch=search_and_fetch`pubmedparse_pubmed_article_setend(* http://www.ncbi.nlm.nih.gov/data_specs/dtd/NCBI_Entrezgene.mod.dtd *)moduleGene=structtypet={_type:[`unknown|`tRNA|`rRNA|`snRNA|`scRNA|`snoRNA|`protein_coding|`pseudo|`transposon|`miscRNA|`ncRNA|`other];summary:stringoption;gene:Gene_ref.t;}lettype_of_int=function|0->`unknown|1->`tRNA|2->`rRNA|3->`snRNA|4->`scRNA|5->`snoRNA|6->`protein_coding|7->`pseudo|8->`transposon|9->`miscRNA|10->`ncRNA|11->`ncRNA|n->invalid_arg(sprintf"Entrez.Make.Gene.type_of_int: %d"n)letparse_entrez_gene=function|E("Entrezgene",_,_)asx->Some{summary=sleaf"Entrezgene_summary"x;_type=type_of_int(ileaf_exn"Entrezgene_type"x);gene=Gene_ref.of_xml(echild_exn"Entrezgene_gene"x);}|_->Noneletparse_entrez_gene_set=function|E("Entrezgene-Set",_,children)->List.filter_map~f:parse_entrez_genechildren|_->assertfalseletsearchquery=letdatabase=`geneinletof_xml=parse_entrez_gene_setinletquery_url=esearch_urldatabasequeryin(* print_endline query_url ; *)fetchquery_urlesearch_answer_of_string>>=funanswer->letobject_url=efetch_url~retmode:`xmldatabaseanswer.idsin(* print_endline object_url ; *)fetchobject_url(funx->x|>tree_of_string|>snd|>of_xml)endend