123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537moduleSafe=structtypeerror=[`invalid_intofstring|`invalid_floatofstring]letint_of_strings=tryOk(Int.of_strings)with_->Error(`invalid_ints)letfloat_of_strings=tryOk(Float.of_strings)with_->Error(`invalid_floats)endletis_valid_dna=String.for_all~f:(String.contains"ACGTN")typevcf_id=stringtypevcf_description=stringtypevcf_number=|Numberofint|OnePerAllele|OnePerGenotype|Unknowntypevcf_format_type=[`integer_value|`float_value|`character_value|`string_value]typevcf_info_type=[vcf_format_type|`flag_value]typevcf_info_meta=Infoofvcf_number*vcf_info_type*vcf_descriptiontypevcf_filter_meta=Filterofvcf_descriptiontypevcf_format_meta=Formatofvcf_number*vcf_format_type*vcf_descriptiontypevcf_alt_meta=Altofvcf_descriptiontypevcf_meta={vcfm_version:string;vcfm_id_cache:vcf_idSet.Poly.t;vcfm_info:(vcf_id,vcf_info_meta)Hashtbl.t;vcfm_filters:(vcf_id*vcf_filter_meta)list;vcfm_format:(vcf_id,vcf_format_meta)Hashtbl.t;vcfm_alt:(string,vcf_alt_meta)Hashtbl.t;vcfm_arbitrary:(string,string)Hashtbl.t;vcfm_header:stringlist;vcfm_samples:stringlist}typevcf_format=[`integerofint|`floatoffloat|`characterofchar|`stringofstring|`missing(* FIXME(superbobry): use option? *)]typevcf_info=[vcf_format|`flagofstring]typevcf_row={vcfr_chrom:string;(* FIXME(superbobry): Chr.t *)vcfr_pos:int;vcfr_ids:stringlist;vcfr_ref:string;vcfr_alts:stringlist;(* FIXME(superbobry): proper typing! *)vcfr_qual:floatoption;vcfr_filter:vcf_idlist;vcfr_info:(vcf_id,vcf_infolist)Hashtbl.t;vcfr_samples:(vcf_id,(vcf_id*vcf_formatlist)list)Hashtbl.t}(* +--------------+
| Error types. |
+--------------+ *)typevcf_parse_row_error=[Safe.error|`info_type_coersion_failureofvcf_info_type*string|`format_type_coersion_failureofvcf_format_type*string|`invalid_dnaofstring|`unknown_infoofvcf_id|`unknown_filterofvcf_id|`unknown_altofstring|`duplicate_idsofvcf_idlist|`invalid_arguments_lengthofvcf_id*int*int|`invalid_row_lengthofint*int|`malformed_sampleofstring|`unknown_formatofvcf_id]typevcf_parse_error=[`malformed_metaofPos.t*string|`malformed_rowofPos.t*vcf_parse_row_error*string|`malformed_headerofPos.t*string|`incomplete_inputofPos.t*stringlist*stringoption|`not_ready](* +---------------------------------------------------------+
| Functions for converting VCF types to and from strings. |
+---------------------------------------------------------+ *)letstring_to_vcf_number=function|"A"->OnePerAllele|"G"->OnePerGenotype|"."->Unknown|arity->Number(int_of_stringarity)letstring_to_vcf_format_types=matchString.lowercaseswith|"integer"->`integer_value|"float"->`float_value|"character"->`character_value|"string"->`string_value|v->failwith("string_to_vcf_format_type: invalid format: "^v)letvcf_format_type_to_string=function|`integer_value->"integer"|`float_value->"float"|`character_value->"character"|`string_value->"string"letcoerce_to_vcf_format_typets=ifString.equals"."then(* Note(superbobry): any value might be missing, according to VCF4.1
specification. *)Ok`missingelsematchtwith|`integer_value->Result.map(Safe.int_of_strings)~f:(funx->`integerx)|`float_value->Result.map(Safe.float_of_strings)~f:(funx->`floatx)|`character_valuewhenString.lengths=1->Ok(`characters.[0])|`string_value->Ok(`strings)|_->Error(`format_type_coersion_failure(t,s))letcoerce_n~fkeyns=letopenResult.Monad_infixinletres=lazy(Result.all(List.map~f(String.split~on:','s)))inmatchnwith|Numbern->Lazy.forceres>>=funvalues->ifList.lengthvalues=nthenOkvalueselseError(`invalid_arguments_length(key,List.lengthvalues,n))|Unknown(* TODO(superbobry): how do we know the nr. of alleles? *)|OnePerAllele(* TODO(superbobry): how to make sure that we have exactly _one_
value per genotype? *)|OnePerGenotype->Lazy.forceresletstring_to_vcf_info_types=matchString.lowercaseswith|"flag"->`flag_value|s->string_to_vcf_format_typesletvcf_info_type_to_string=function|`flag_value->"flag"|#vcf_format_typeast->vcf_format_type_to_stringtletcoerce_to_vcf_info_typets=letres=matchtwith|`flag_value->Ok(`flags)|#vcf_format_type->coerce_to_vcf_format_typetsinResult.map_errorres~f:(fun_exn->`info_type_coersion_failure(t,s))(* +-------------------+
| Error formatting. |
+-------------------+ *)letparse_row_error_to_string=function|`invalid_ints->sprintf"invalid_integer (%s)"s|`invalid_floats->sprintf"invalid_float (%s)"s|`info_type_coersion_failure(t,s)->sprintf"info_type_coersion_failure (%s, %S)"(vcf_info_type_to_stringt)s|`format_type_coersion_failure(t,s)->sprintf"format_type_coersion_failure (%s, %S)"(vcf_format_type_to_stringt)s|`invalid_dnas->sprintf"invalid_dna (%s)"s|`unknown_infos->sprintf"unknown_info (%s)"s|`unknown_filterf->sprintf"unknown_filter (%s)"f|`unknown_alts->sprintf"unknown_alt (%s)"s|`duplicate_idsids->sprintf"duplicate_ids (%s)"(String.concat~sep:", "ids)|`invalid_arguments_length(key,got,expected)->sprintf"invalid_arguments_length (%s, %i, %i)"keygotexpected|`invalid_row_length(got,expected)->sprintf"invalid_row_length (%i, %i)"gotexpected|`malformed_samples->sprintf"malformed_sample (%s)"s|`unknown_formatf->sprintf"unknown_formar (%s)"fletparse_error_to_string=letpos()a=Pos.to_stringainfunction|`malformed_meta(p,s)->sprintf"malformed_meta (%a, %S)"posps|`malformed_row(p,err,s)->sprintf"malformed_row (%s, %a, %S)"(parse_row_error_to_stringerr)posps|`malformed_header(p,s)->sprintf"malformed_header (%a, %s)"posps|_->sprintf"unknown_error"(* +-------------------------------------------------+
| Types for reserved INFO, FORMAT and ALT entries |
+-------------------------------------------------+ *)(** Note(superbobry): this is mostly taken from PyVCF module by
James Casbon. See https://github.com/jamescasbon/PyVCF. The
standard does _NOT_ specify how many arguments are allowed
for each reserved item, so we assume 'Unknown'. *)letreserved_info=Hashtbl.Poly.of_alist_exn[("AA",`string_value);("AC",`integer_value);("AF",`float_value);("AN",`integer_value);("BQ",`float_value);("CIGAR",`string_value);("DB",`flag_value);("DP",`integer_value);("H2",`flag_value);("MQ",`float_value);("MQ0",`integer_value);("NS",`integer_value);("SB",`string_value);("SOMATIC",`flag_value);("VALIDATED",`flag_value);(* VCF 4.1 Additions *)("IMPRECISE",`flag_value);("NOVEL",`flag_value);("END",`integer_value);("SVTYPE",`string_value);("CIPOS",`integer_value);("CIEND",`integer_value);("HOMLEN",`integer_value);("HOMSEQ",`integer_value);("BKPTID",`string_value);("MEINFO",`string_value);("METRANS",`string_value);("DGVID",`string_value);("DBVARID",`string_value);("MATEID",`string_value);("PARID",`string_value);("EVENT",`string_value);("CILEN",`integer_value);("CN",`integer_value);("CNADJ",`integer_value);("CICN",`integer_value);("CICNADJ",`integer_value)]andreserved_format=Hashtbl.Poly.of_alist_exn[("GT",`string_value);("DP",`integer_value);("FT",`string_value);("GL",`float_value);("GQ",`float_value);("HQ",`float_value);(* VCF 4.1 Additions *)("CN",`integer_value);("CNQ",`float_value);("CNL",`float_value);("NQ",`integer_value);("HAP",`integer_value);("AHAP",`integer_value)]andreserved_alt=Hashtbl.Poly.of_alist_exn[("DEL",Alt"Deletion relative to the reference");("INS",Alt"Insertion of novel sequence relative to the reference");("DUP",Alt"Region of elevated copy number relative to the reference");("INV",Alt"Inversion of reference sequence");("CNV",Alt"Copy number variable region");("DUP:TANDEM",Alt"TANDEM Tandem duplication");("DEL:ME",Alt"ME Deletion of mobile element relative to the reference");("INS:ME",Alt"ME Insertion of a mobile element relative to the reference");]letdefault_meta={vcfm_version="<unknown>";vcfm_id_cache=Set.Poly.empty;vcfm_info=Hashtbl.Poly.create();vcfm_filters=[];vcfm_format=Hashtbl.Poly.create();vcfm_alt=reserved_alt;vcfm_arbitrary=Hashtbl.Poly.create();vcfm_header=[];vcfm_samples=[]}(* +--------------------+
| VCF heavy-lifting. |
+--------------------+ *)letstring_to_vcfr_refs=lets=String.uppercasesinifis_valid_dnasthenOkselseError(`invalid_dnas)letstring_to_vcfr_info{vcfm_info;_}s=letgovalues=List.map(String.split~on:';'s)~f:(funchunk->let(key,raw_value)=Option.value~default:(chunk,"")(String.lsplit2~on:'='chunk)inletchunk_values=matchHashtbl.findvcfm_infokeywith|Some(Info(_t,`flag_value,_description))whenString.equalraw_value""->Ok[`flagkey]|Some(Info(n,t,_description))->coerce_n~f:(coerce_to_vcf_info_typet)keynraw_value|None->Error(`unknown_infokey)inResult.mapchunk_values~f:(fundata->Hashtbl.setvalues~key~data))andvalues=Hashtbl.Poly.create()inResult.(map(all_unit(govalues))~f:(Fn.constvalues))letstring_to_vcfr_filter{vcfm_filters;_}s=matchString.split~on:';'swith|["PASS"]->Ok[]|chunks->matchList.findchunks~f:(funchunk->not(List.Assoc.mem~equal:String.equalvcfm_filterschunk))with|Someunknown_filter->Error(`unknown_filterunknown_filter)|None->Okchunksletstring_to_vcfr_ids{vcfm_id_cache;_}s=matchString.split~on:';'swith|["."]->Ok[]|chunks->letduplicate_ids=List.filterchunks~f:(Set.memvcfm_id_cache)inifList.is_emptyduplicate_idsthenOkchunkselseError(`duplicate_idsduplicate_ids)letstring_to_vcfr_alts{vcfm_alt;_}s=matchString.split~on:','swith|["."]->Ok[]|chunks->letres=List.mapchunks~f:(funchunk->letn=String.lengthchunkinmatchChar.(chunk.[0]='<'&&chunk.[n-1]='>',is_valid_dnachunk)with|(true,_)->ifHashtbl.memvcfm_alt(String.sub~pos:1~len:(n-2)chunk)thenOkchunkelseError(`unknown_altchunk)|(false,true)->Okchunk|(false,false)->Error(`invalid_dnachunk))(* Impossible. *)inResult.allresletlist_to_vcfr_samples{vcfm_format;vcfm_samples;_}chunks=letopenResult.Monad_infixinletsamples=Hashtbl.Poly.create()inletgosample_keysidraw_sample=letsample_chunks=String.split~on:':'raw_sampleinifList.(lengthsample_keys<>lengthsample_chunks)thenError(`malformed_sampleraw_sample)elseletres=List.map2_exnsample_keyssample_chunks~f:(funkeyraw_value->matchHashtbl.findvcfm_formatkeywith|Some(Format(n,t,_description))->coerce_n~f:(coerce_to_vcf_format_typet)keynraw_value>>=funvalue->Ok(key,value)|None->Error(`unknown_formatkey))inResult.(map(allres)~f:(fundata->Hashtbl.setsamples~key:id~data))inmatchchunkswith|raw_sample_keys::raw_samples->letsample_keys=String.split~on:':'raw_sample_keysinletres=List.map2_exnvcfm_samplesraw_samples~f:(gosample_keys)inResult.map(Result.all_unitres)~f:(Fn.constsamples)|[]->Oksamplesletlist_to_vcf_rowmetachunks=letopenResult.Monad_infixinletn_chunks=List.lengthchunksandn_columns=List.(lengthmeta.vcfm_header+lengthmeta.vcfm_samples)inmatchchunkswith|vcfr_chrom::raw_pos::raw_id::raw_ref::raw_alt::raw_qual::raw_filter::raw_info::raw_sampleswhenn_chunks=n_columns->Safe.int_of_stringraw_pos>>=funvcfr_pos->string_to_vcfr_idsmetaraw_id>>=funvcfr_ids->string_to_vcfr_refraw_ref>>=funvcfr_ref->string_to_vcfr_altsmetaraw_alt>>=funvcfr_alts->string_to_vcfr_infometaraw_info>>=funvcfr_info->string_to_vcfr_filtermetaraw_filter>>=funvcfr_filter->(* FIXME(superbobry): actually, we need monadic 'when here',
because if there's no samples, there's nothing to parse. *)list_to_vcfr_samplesmetaraw_samples>>=funvcfr_samples->letrow={vcfr_chrom;vcfr_pos;vcfr_ids;vcfr_ref;vcfr_alts;vcfr_qual=Result.ok(Safe.float_of_stringraw_qual);vcfr_filter;vcfr_info;vcfr_samples}inOkrow|_->Error(`invalid_row_length(n_chunks,n_columns))(** +-------------------+
| Parser interface. |
+-------------------+ *)typeitem=vcf_rowmoduleTransform=structletnext_vcf_headermetap=letopenLines.Bufferinlet{vcfm_info;vcfm_format;_}=metainletl=Option.value_exn(next_linep:>stringoption)inletchunks=List.filter~f:String.(funs->s<>"")(String.split~on:'\t'l)inbeginmatchchunkswith|"#CHROM"::"POS"::"ID"::"REF"::"ALT"::"QUAL"::"FILTER"::"INFO"::rest->beginmatchrestwith(* Note(superbobry): FORMAT only makes sense if we have at least
a single sample. *)|"FORMAT"::((_::_)assamples)|([]assamples)->letmerge_with_reserved~c=Hashtbl.merge~f:(fun~key:_v->matchvwith|`Leftt->Some(cUnknownt"<reserved>")|`Rightparsed->Someparsed|`Both(_t,parsed)->Someparsed)in(* Note(superbobry): merge parsed INFO and FORMAT entries with
predefined ones; a VCF file _may_ override any of the
reserved fields, in that case, default definition won't be
used. *)letvcfm_info=merge_with_reservedreserved_infovcfm_info~c:(funntdescription->Info(n,t,description));andvcfm_format=merge_with_reservedreserved_formatvcfm_format~c:(funntdescription->Format(n,t,description));and(vcfm_header,vcfm_samples)=ifList.is_emptysamplesthen(chunks,samples)elseList.split_nchunksList.(lengthchunks-lengthsamples)inOk(`complete{metawithvcfm_info;vcfm_format;vcfm_header;vcfm_samples})|_::_->Error(`malformed_header(current_positionp,l))end|_->Error(`malformed_header(current_positionp,l))endletnext_vcf_metametap=letopenLines.Bufferinlet{vcfm_info;vcfm_filters;vcfm_format;vcfm_alt;_}=metainmatch(peek_linep:>stringoption)with|SomelwhenString.is_prefixl~prefix:"##"->let_l=next_linepinlets=String.suffixl(String.lengthl-2)inbeginmatchString.lsplit2s~on:'='with|Some("fileformat",v)->Ok(`partial{metawithvcfm_version=v})|Some("INFO",v)->Scanf.sscanfv"<ID=%s@,Number=%s@,Type=%s@,Description=%S>"(funidntdescription->letinfo_meta=Info(string_to_vcf_numbern,string_to_vcf_info_typet,description)inHashtbl.setvcfm_info~key:id~data:info_meta);Ok(`partialmeta)|Some("FILTER",v)->Scanf.sscanfv"<ID=%s@,Description=%S>"(funiddescription->letfilter_meta=Filterdescriptioninletmeta={metawithvcfm_filters=(id,filter_meta)::vcfm_filters}inOk(`partialmeta))|Some("FORMAT",v)->Scanf.sscanfv"<ID=%s@,Number=%s@,Type=%s@,Description=%S>"(funidntdescription->letformat_meta=Format(string_to_vcf_numbern,string_to_vcf_format_typet,description)inHashtbl.setvcfm_format~key:id~data:format_meta);Ok(`partialmeta)|Some("ALT",v)->Scanf.sscanfv"<ID=%s@,Description=%S>"(funiddescription->letalt_meta=AltdescriptioninHashtbl.setvcfm_alt~key:id~data:alt_meta);Ok(`partialmeta)|Some(k,v)->beginHashtbl.setmeta.vcfm_arbitrary~key:k~data:v;Ok(`partialmeta)end|None->Error(`malformed_meta(current_positionp,s))end|Some_l->(* Note(superbobry): if the line *does not* start with '##' it
must be a header. *)next_vcf_headermetap|None->Error`not_readyletnext_vcf_rowmetap=letopenLines.Bufferinmatch(next_linep:>stringoption)with|Somelwhennot(String.is_emptyl)->letchunks=List.filter~f:String.(funs->s<>"")(String.split~on:'\t'l)inbeginmatchlist_to_vcf_rowmetachunkswith|Okrow->`output(Okrow)|Errorerr->`output(Error(`malformed_row(current_positionp,err,l)))end|Some_|None->`not_ready(* All done, boss! *)letnextmeta_refp=match!meta_refwith|`completemeta->next_vcf_rowmetap|`partialmeta->beginmatchnext_vcf_metametapwith|Okmeta->meta_ref:=meta;`not_ready|Error`not_ready->`not_ready|Errorerr->`output(Errorerr)endletstring_to_item?filename()=letname=sprintf"vcf_parser:%s"Option.(value~default:"<>"filename)inletmeta_ref=ref(`partialdefault_meta)inLines.Transform.make_merge_error~name?filename~next:(nextmeta_ref)()end