1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126moduleResult=Biocaml_resultopenResult.Monad_infixlet(>>?~)(x:'aoptionOr_error.t)(f:'a->'bOr_error.t):'boptionOr_error.t=letopenResult.Monad_infixinx>>=function|None->OkNone|Somex->fx>>|Option.some(******************************************************************************)(* Header Types *)(******************************************************************************)typeheader_item_tag=[|`HD|`SQ|`RG|`PG|`CO|`Otherofstring][@@derivingsexp]typetag_value=string*string[@@derivingsexp]typesort_order=[`Unknown|`Unsorted|`Query_name|`Coordinate][@@derivingsexp]typegroup_order=[`None|`Query|`Reference][@@derivingsexp]typeheader_line={version:string;sort_order:sort_orderoption;group_order:group_orderoption;}[@@derivingsexp]typeref_seq={name:string;length:int;assembly:stringoption;md5:stringoption;species:stringoption;uri:stringoption;}[@@derivingsexp]typeplatform=[|`Capillary|`LS454|`Illumina|`Solid|`Helicos|`Ion_Torrent|`Pac_Bio][@@derivingsexp]typeread_group={id:string;seq_center:stringoption;description:stringoption;run_date:[`Dateofstring|`Timeofstring]option;flow_order:stringoption;key_seq:stringoption;library:stringoption;program:stringoption;predicted_median_insert_size:intoption;platform:platformoption;platform_unit:stringoption;sample:stringoption;}[@@derivingsexp]typeprogram={id:string;name:stringoption;command_line:stringoption;previous_id:stringoption;description:stringoption;version:stringoption;}[@@derivingsexp]typeheader_item=[|`HDofheader_line|`SQofref_seq|`RGofread_group|`PGofprogram|`COofstring|`Otherofstring*tag_valuelist][@@derivingsexp]typeheader={version:stringoption;sort_order:sort_orderoption;group_order:group_orderoption;ref_seqs:ref_seqlist;read_groups:read_grouplist;programs:programlist;comments:stringlist;others:(string*tag_valuelist)list;}letempty_header={version=None;sort_order=None;group_order=None;ref_seqs=[];read_groups=[];programs=[];comments=[];others=[];}(******************************************************************************)(* Alignment Types *)(******************************************************************************)moduleFlags=structtypet=int[@@derivingsexp]letof_intx=if(0<=x)&&(x<=65535)thenOkxelseerror"flag out of range"xsexp_of_intletflag_is_setsf=(flands)<>0lethas_multiple_segments=flag_is_set0x1leteach_segment_properly_aligned=flag_is_set0x2letsegment_unmapped=flag_is_set0x4letnext_segment_unmapped=flag_is_set0x8letseq_is_reverse_complemented=flag_is_set0x10letnext_seq_is_reverse_complemented=flag_is_set0x20letfirst_segment=flag_is_set0x40letlast_segment=flag_is_set0x80letsecondary_alignment=flag_is_set0x100letnot_passing_quality_controls=flag_is_set0x200letpcr_or_optical_duplicate=flag_is_set0x400letsupplementary_alignment=flag_is_set0x800endtypecigar_op=[|`Alignment_matchofint|`Insertionofint|`Deletionofint|`Skippedofint|`Soft_clippingofint|`Hard_clippingofint|`Paddingofint|`Seq_matchofint|`Seq_mismatchofint][@@derivingsexp]typeoptional_field_value=[|`Aofchar|`iofInt64.t|`foffloat|`Zofstring|`Hofstring|`Bofchar*stringlist][@@derivingsexp]typeoptional_field={tag:string;value:optional_field_value}[@@derivingsexp]typernext=[`Valueofstring|`Equal_to_RNAME][@@derivingsexp]typealignment={qname:stringoption;flags:Flags.t;rname:stringoption;pos:intoption;mapq:intoption;cigar:cigar_oplist;rnext:rnextoption;pnext:intoption;tlen:intoption;seq:stringoption;qual:Phred_score.tlist;optional_fields:optional_fieldlist;}[@@derivingsexp](******************************************************************************)(* Header Parsers and Constructors *)(******************************************************************************)letparse_header_versions=leterr=error"invalid version"(`HD,s)[%sexp_of:header_item_tag*string]inmatchString.lsplit2~on:'.'swith|None->err|Some(a,b)->if(String.for_alla~f:Char.is_digit)&&(String.for_allb~f:Char.is_digit)thenOkselseerrletheader_line~version?sort_order?group_order()=parse_header_versionversion>>|funversion->{version;sort_order;group_order}letref_seq~name~length?assembly?md5?species?uri()=letis_name_first_char_ok=function|'!'..')'|'+'..'<'|'>'..'~'->true|_->falseinletis_name_other_char_ok=function'!'..'~'->true|_->falsein(if(1<=length)&&(length<=2147483647)thenOklengthelseerror"invalid reference sequence length"lengthsexp_of_int)>>=funlength->(if(String.lengthname>0)&&(String.foldiname~init:true~f:(funiaccumc->accum&&(ifi=0thenis_name_first_char_okcelseis_name_other_char_okc)))thenOknameelseerror"invalid ref seq name"namesexp_of_string)>>=funname->Ok{name;length;assembly;md5;species;uri}letread_group~id?seq_center?description?run_date?flow_order?key_seq?library?program?predicted_median_insert_size?platform?platform_unit?sample()=(matchrun_datewith|None->OkNone|Somerun_date->tryOk(Some(`Daterun_date))with_->tryOk(Some(`Timerun_date))with_->error"invalid run date/time"run_datesexp_of_string)>>=funrun_date->(matchflow_orderwith|None->OkNone|Some""->Or_error.error_string"invalid empty flow order"|Some"*"->Okflow_order|Somex->ifString.for_allx~f:(function|'A'|'C'|'M'|'G'|'R'|'S'|'V'|'T'|'W'|'Y'|'H'|'K'|'D'|'B'|'N'->true|_->false)thenOkflow_orderelseerror"invalid flow order"xsexp_of_string)>>|funflow_order->{id;seq_center;description;run_date;flow_order;key_seq;library;program;predicted_median_insert_size;platform;platform_unit;sample;}letheader?version?sort_order?group_order?(ref_seqs=[])?(read_groups=[])?(programs=[])?(comments=[])?(others=[])()=[(matchversionwith|None->None|Somex->matchparse_header_versionxwith|Errore->Somee|Ok_->None);(ifOption.is_somesort_order&&Poly.(version=None)thenSome(Error.create"sort order cannot be defined without version"(sort_order,version)[%sexp_of:sort_orderoption*stringoption])elseNone);(List.mapref_seqs~f:(fun(x:ref_seq)->x.name)|>List.find_a_dup~compare:String.compare|>Option.map~f:(funname->Error.create"duplicate ref seq name"namesexp_of_string));]|>List.filter_map~f:Fn.id|>function|[]->Ok{version;sort_order;group_order;ref_seqs;read_groups;programs;comments;others;}|errs->Error(Error.of_listerrs)letparse_header_item_tags=letis_letter=function'A'..'Z'|'a'..'z'->true|_->falseinmatchString.chop_prefixs~prefix:"@"with|None->error"header item tag must begin with @"ssexp_of_string|Some"HD"->Ok`HD|Some"SQ"->Ok`SQ|Some"RG"->Ok`RG|Some"PG"->Ok`PG|Some"CO"->Ok`CO|Somex->if(String.lengthx=2)&&(String.for_allx~f:is_letter)thenOk(`Otherx)elseerror"invalid header item tag"ssexp_of_stringletparse_tag_values=letparse_tags=if(String.lengths=2)&&(matchs.[0]with'A'..'Z'|'a'..'z'->true|_->false)&&(matchs.[1]with|'A'..'Z'|'a'..'z'|'0'..'9'->true|_->false)thenOkselseerror"invalid tag"ssexp_of_stringinletparse_valuetags=ifString.(s<>"")&&(String.for_alls~f:(function' '..'~'->true|_->false))thenOkselseerror"tag has invalid value"(tag,s)[%sexp_of:string*string]inmatchString.lsplit2s~on:':'with|None->error"tag-value not colon separated"ssexp_of_string|Some(tag,value)->parse_tagtag>>=funtag->parse_valuetagvalue>>=funvalue->Ok(tag,value)(** Find all occurrences of [x'] in the association list [l]. *)letfind_alllx'=letrecloopaccum=function|[]->accum|(x,y)::l->letaccum=ifPoly.(x=x')theny::accumelseaccuminloopaccumlinList.rev(loop[]l)(** Find exactly 1 occurrence [x] in association list [l]. Return
error if [x] is not defined exactly once. *)letfind1header_item_taglx=matchfind_alllxwith|[]->error"required tag not found"(header_item_tag,x)[%sexp_of:header_item_tag*string]|y::[]->Oky|ys->error"tag found multiple times"(header_item_tag,x,ys)[%sexp_of:header_item_tag*string*stringlist](** Find 0 or 1 occurrence [x] in association list [l]. Return
error if [x] is defined more than once. *)letfind01header_item_taglx=matchfind_alllxwith|[]->OkNone|y::[]->Ok(Somey)|ys->error"tag found multiple times"(header_item_tag,x,ys)[%sexp_of:header_item_tag*string*stringlist](** Assert that [tvl] contains at most the given [tags]. *)letassert_tagsheader_item_tagtvltags=letexpected_tags=String.Set.of_listtagsinletgot_tags=List.maptvl~f:fst|>String.Set.of_listinletunexpected_tags=Set.diffgot_tagsexpected_tagsinifSet.lengthunexpected_tags=0thenOk()elseerror"unexpected tag for given header item type"(header_item_tag,unexpected_tags)[%sexp_of:header_item_tag*String.Set.t]letparse_sort_order=function|"unknown"->Ok`Unknown|"unsorted"->Ok`Unsorted|"queryname"->Ok`Query_name|"coordinate"->Ok`Coordinate|x->error"invalid sort order"xsexp_of_stringletparse_group_order=function|"none"->Ok`None|"query"->Ok`Query|"reference"->Ok`Reference|x->error"invalid group order"xsexp_of_stringletparse_header_linetvl=find1`HDtvl"VN">>=funversion->find01`HDtvl"SO">>?~parse_sort_order>>=funsort_order->find01`HDtvl"GO">>?~parse_group_order>>=fungroup_order->assert_tags`HDtvl["VN";"SO";"GO"]>>=fun()->header_line~version?sort_order?group_order()letparse_ref_seqtvl=find1`SQtvl"SN">>=funname->find1`SQtvl"LN">>=funlength->(tryOk(Int.of_stringlength)with_->error"invalid ref seq length"lengthsexp_of_string)>>=funlength->find01`SQtvl"AS">>=funassembly->find01`SQtvl"M5">>=funmd5->find01`SQtvl"SP">>=funspecies->find01`SQtvl"UR">>=funuri->assert_tags`SQtvl["SN";"LN";"AS";"M5";"SP";"UR"]>>=fun()->ref_seq~name~length?assembly?md5?species?uri()letparse_platform=function|"CAPILLARY"->Ok`Capillary|"LS454"->Ok`LS454|"ILLUMINA"->Ok`Illumina|"SOLID"->Ok`Solid|"HELICOS"->Ok`Helicos|"IONTORRENT"->Ok`Ion_Torrent|"PACBIO"->Ok`Pac_Bio|x->error"unknown platform"xsexp_of_stringletparse_read_grouptvl=find1`RGtvl"ID">>=funid->find01`RGtvl"CN">>=funseq_center->find01`RGtvl"DS">>=fundescription->find01`RGtvl"DT">>=funrun_date->find01`RGtvl"FO">>=funflow_order->find01`RGtvl"KS">>=funkey_seq->find01`RGtvl"LB">>=funlibrary->find01`RGtvl"PG">>=funprogram->find01`RGtvl"PI">>?~(funpredicted_median_insert_size->tryOk(Int.of_stringpredicted_median_insert_size)with_->error"invalid predicted median insert size"predicted_median_insert_sizesexp_of_string)>>=funpredicted_median_insert_size->find01`RGtvl"PL">>?~parse_platform>>=funplatform->find01`RGtvl"PU">>=funplatform_unit->find01`RGtvl"SM">>=funsample->assert_tags`RGtvl["ID";"CN";"DS";"DT";"FO";"KS";"LB";"PG";"PI";"PL";"PU";"SM"]>>=fun()->read_group~id?seq_center?description?run_date?flow_order?key_seq?library?program?predicted_median_insert_size?platform?platform_unit?sample()letparse_programtvl=find1`PGtvl"ID">>=funid->find01`PGtvl"PN">>=funname->find01`PGtvl"CL">>=funcommand_line->find01`PGtvl"PP">>=funprevious_id->find01`PGtvl"DS">>=fundescription->find01`PGtvl"VN">>=funversion->assert_tags`PGtvl["ID";"PN";"CL";"PP";"DS";"VN"]>>|fun()->{id;name;command_line;previous_id;description;version}letparse_header_itemline=letparse_datatagtvl=matchtagwith|`HD->parse_header_linetvl>>|funx->`HDx|`SQ->parse_ref_seqtvl>>|funx->`SQx|`RG->parse_read_grouptvl>>|funx->`RGx|`PG->parse_programtvl>>|funx->`PGx|`Othertag->Ok(`Other(tag,tvl))|`CO->assertfalseinmatchString.lsplit2~on:'\t'(line:Line.t:>string)with|None->error"header line contains no tabs"lineLine.sexp_of_t|Some(tag,data)->parse_header_item_tagtag>>=function|`CO->Ok(`COdata)|tag->matchString.split~on:'\t'datawith|[]->assertfalse|""::[]->error"header contains no data"tagsexp_of_header_item_tag|tvl->Result.List.maptvl~f:parse_tag_value>>=funtvl->parse_datatagtvl(******************************************************************************)(* Alignment Parsers and Constructors *)(******************************************************************************)letalignment?ref_seqs?qname~flags?rname?pos?mapq?(cigar=[])?rnext?pnext?tlen?seq?(qual=[])?(optional_fields=[])()=[(matchref_seqs,rnamewith|(None,_)|(_,None)->None|Someref_seqs,Somername->ifSet.memref_seqsrnamethenNoneelseSome(Error.create"RNAME not defined in any SQ line"rnamesexp_of_string));(matchref_seqs,rnextwith|(None,_)|(_,None)->None|Some_,Some`Equal_to_RNAME->None(* error will already be detected in RNAME check above *)|Someref_seqs,Some(`Valuernext)->ifSet.memref_seqsrnextthenNoneelseSome(Error.create"RNEXT not defined in any SQ line"rnextsexp_of_string));(matchseq,qualwith|_,[]->None|None,_->Some(Error.of_string"QUAL provided without SEQ")|Someseq,_->lets=String.lengthseqinletq=List.lengthqualinifs=qthenNoneelseSome(Error.create"SEQ and QUAL lengths differ"(s,q)[%sexp_of:int*int]));(List.mapoptional_fields~f:(funx->x.tag)|>List.find_a_dup~compare:String.compare|>Option.map~f:(fundup->Error.create"TAG occurs more than once"dupsexp_of_string));]|>List.filter_map~f:Fn.id|>function|[]->Ok{qname;flags;rname;pos;mapq;cigar;rnext;pnext;tlen;seq;qual;optional_fields}|errs->Error(Error.of_listerrs)letparse_int_rangefieldlohis=letout_of_range=sprintf"%s out of range"fieldinletnot_an_int=sprintf"%s not an int"fieldintryletn=Int.of_stringsinif(lo<=n)&&(n<=hi)thenOknelseerrorout_of_range(n,lo,hi)[%sexp_of:int*int*int]with_->errornot_an_intssexp_of_string(** Parse a string that can either by "*" or some other regexp, with
"*" denoting [None]. The given regexp [re] should include "*" as
one of the alternatives. *)letparse_opt_stringfieldres=ifnot(Re.execpres)thenerror(sprintf"invalid %s"field)ssexp_of_stringelsematchswith|"*"->OkNone|_->Ok(Somes)letqname_re=letopenReinalt[char'*';repn(alt[rg'!''?';rg'A''~'])1(Some255);]|>compileletparse_qnames=parse_opt_string"QNAME"qname_resletparse_flagss=tryFlags.of_int(Int.of_strings)with_->error"invalid FLAG"ssexp_of_stringletrname_re=Re.Perl.compile_pat"^\\*|[!-()+-<>-~][!-~]*$"letparse_rnames=parse_opt_string"RNAME"rname_resletparse_poss=parse_int_range"POS"02147483647s>>|function|0->None|x->Somexletparse_mapqs=parse_int_range"MAPQ"0255s>>|function|255->None|x->Somexletpositivei=letopenOr_errorinifi>0thenreturnielseerror_string"positive argument expected for cigar operation"letcigar_op_alignment_matchi=Or_error.(positivei>>|funi->`Alignment_matchi)letcigar_op_insertioni=Or_error.(positivei>>|funi->`Insertioni)letcigar_op_deletioni=Or_error.(positivei>>|funi->`Deletioni)letcigar_op_skippedi=Or_error.(positivei>>|funi->`Skippedi)letcigar_op_soft_clippingi=Or_error.(positivei>>|funi->`Soft_clippingi)letcigar_op_hard_clippingi=Or_error.(positivei>>|funi->`Hard_clippingi)letcigar_op_paddingi=Or_error.(positivei>>|funi->`Paddingi)letcigar_op_seq_matchi=Or_error.(positivei>>|funi->`Seq_matchi)letcigar_op_seq_mismatchi=Or_error.(positivei>>|funi->`Seq_mismatchi)letparse_cigartext=matchtextwith|"*"->Ok[]|""->error"invalid cigar string"textsexp_of_string|_->letch=Scanf.Scanning.from_stringtextinletrecloopaccum=ifScanf.Scanning.end_of_inputchthenOkaccumelsetryletn=Scanf.bscanfch"%d"identinletc=Scanf.bscanfch"%c"identinletx=matchcwith|'M'->cigar_op_alignment_matchn|'I'->cigar_op_insertionn|'D'->cigar_op_deletionn|'N'->cigar_op_skippedn|'S'->cigar_op_soft_clippingn|'H'->cigar_op_hard_clippingn|'P'->cigar_op_paddingn|'='->cigar_op_seq_matchn|'X'->cigar_op_seq_mismatchn|other->Or_error.error"invalid cigar operation type"otherChar.sexp_of_tinOr_error.tagx~tag:"Sam.parse_cigar: invalid cigar string">>=funx->loop(x::accum)with_->error"invalid cigar string"textsexp_of_stringinloop[]>>|List.revletrnext_re=Re.Perl.compile_pat"^\\*|=|[!-()+-<>-~][!-~]*$"letparse_rnexts=ifnot(Re.execprnext_res)thenerror"invalid RNEXT"ssexp_of_stringelsematchswith|"*"->OkNone|"="->Ok(Some`Equal_to_RNAME)|_->Ok(Some(`Values))letparse_pnexts=parse_int_range"PNEXT"02147483647s>>|function|0->None|x->Somexletparse_tlens=parse_int_range"TLEN"~-21474836472147483647s>>|function|0->None|x->Somexletseq_re=Re.Perl.compile_pat"^\\*|[A-Za-z=.]+$"letparse_seqs=parse_opt_string"SEQ"seq_resletparse_quals=matchswith|""->Or_error.error_string"invalid empty QUAL"|"*"->Ok[]|_->String.to_lists|>Result.List.map~f:(Phred_score.of_char~offset:`Offset33)letopt_field_tag_re=Re.Perl.compile_pat"^[A-Za-z][A-Za-z0-9]$"letopt_field_Z_re=Re.Perl.compile_pat"^[ !-~]+$"letopt_field_H_re=Re.Perl.compile_pat"^[0-9A-F]+$"letopt_field_int_re=Re.Perl.compile_pat"^-?[0-9]+$"letopt_field_float_re=Re.Perl.compile_pat"^[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?$"letoptional_field_value_errtypvalue=error"invalid value"(typ,value)[%sexp_of:string*string]letoptional_field_value_Avalue=ifList.mem~equal:Char.equal['!';'-';'~']valuethenoptional_field_value_err"A"(Char.to_stringvalue)elseOk(`Avalue)letoptional_field_value_ii=`iiletoptional_field_value_ff=`ffletoptional_field_value_Zvalue=ifRe.execpopt_field_Z_revaluethenOk(`Zvalue)elseoptional_field_value_err"Z"valueletoptional_field_value_Hvalue=ifRe.execpopt_field_H_revaluethenOk(`Hvalue)elseoptional_field_value_err"H"valueletoptional_field_value_Belt_typeelts=letvalid_args=matchelt_typewith|'c'|'C'|'s'|'S'|'i'|'I'->List.for_allelts~f:(Re.execpopt_field_int_re)|'f'->List.for_allelts~f:(Re.execpopt_field_float_re)|_->falseinifvalid_argsthenOk(`B(elt_type,elts))elseerror"invalid value"("B",elt_type,elts)[%sexp_of:string*char*stringlist]letoptional_fieldtagvalue=ifnot(Re.execpopt_field_tag_retag)thenerror"invalid TAG"tagsexp_of_stringelseOk{tag;value}letparse_optional_field_values=matchString.lsplit2s~on:':'with|None->error"missing TYPE in optional field"ssexp_of_string|Some(typ,value)->matchtypwith|"A"->ifString.lengthvalue=1thenoptional_field_value_Avalue.[0]elseoptional_field_value_errtypvalue|"i"->(tryifnot(Re.execpopt_field_int_revalue)thenfailwith"";Ok(optional_field_value_i(Int64.of_stringvalue))(* matching the regular expression is not enough: the number could not fit in 64 bits *)with_->optional_field_value_errtypvalue)|"f"->(tryifnot(Re.execpopt_field_float_revalue)thenfailwith"";Ok(optional_field_value_f(Float.of_stringvalue))(* matching the regular expression is not enough: the number could not fit in native floats *)with_->optional_field_value_errtypvalue)|"Z"->optional_field_value_Zvalue|"H"->optional_field_value_Hvalue|"B"->(matchString.split~on:','valuewith|num_typ::values->ifString.lengthnum_typ=1thenoptional_field_value_Bnum_typ.[0]valueselseerror"invalid array type"num_typsexp_of_string|_->assertfalse(* [String.split] cannot return an empty list *))|_->error"invalid type"typsexp_of_stringletparse_optional_fields=matchString.lsplit2s~on:':'with|None->error"missing TAG in optional field"ssexp_of_string|Some(tag,s)->parse_optional_field_values>>=funvalue->optional_fieldtagvalueletparse_alignment?ref_seqsline=matchString.split~on:'\t'(line:Line.t:>string)with|qname::flags::rname::pos::mapq::cigar::rnext::pnext::tlen::seq::qual::optional_fields->(parse_qnameqname>>=funqname->parse_flagsflags>>=funflags->parse_rnamername>>=funrname->parse_pospos>>=funpos->parse_mapqmapq>>=funmapq->parse_cigarcigar>>=funcigar->parse_rnextrnext>>=funrnext->parse_pnextpnext>>=funpnext->parse_tlentlen>>=funtlen->parse_seqseq>>=funseq->parse_qualqual>>=funqual->Result.List.mapoptional_fields~f:parse_optional_field>>=funoptional_fields->alignment?ref_seqs?qname~flags?rname?pos?mapq~cigar?rnext?pnext?tlen?seq~qual~optional_fields())|_->Or_error.error_string"alignment line contains < 12 fields"(******************************************************************************)(* Header Printers *)(******************************************************************************)letprint_header_item_tag=function|`HD->"@HD"|`SQ->"@SQ"|`RG->"@RG"|`PG->"@PG"|`CO->"@CO"|`Otherx->sprintf"@%s"xletprint_tag_value(tag,value)=sprintf"%s:%s"tagvalueletprint_tag_value'=sprintf"%s:%s"letprint_header_versionx=print_tag_value'"VN"xletprint_sort_orderx=print_tag_value'"SO"(matchxwith|`Unknown->"unknown"|`Unsorted->"unsorted"|`Query_name->"queryname"|`Coordinate->"coordinate")letprint_group_orderx=print_tag_value'"GO"(matchxwith|`None->"none"|`Query->"query"|`Reference->"reference")letprint_header_line({version;sort_order;group_order}:header_line)=sprintf"@HD\tVN:%s%s%s"version(matchsort_orderwith|None->""|Somex->sprintf"\t%s"(print_sort_orderx))(matchgroup_orderwith|None->""|Somex->sprintf"\t%s"(print_group_orderx))letprint_ref_seq(x:ref_seq)=sprintf"@SQ\tSN:%s\tLN:%d%s%s%s%s"x.namex.length(matchx.assemblywithNone->""|Somex->sprintf"\tAS:%s"x)(matchx.md5withNone->""|Somex->sprintf"\tM5:%s"x)(matchx.specieswithNone->""|Somex->sprintf"\tSP:%s"x)(matchx.uriwithNone->""|Somex->sprintf"\tUR:%s"x)letprint_platform=function|`Capillary->"CAPILLARY"|`LS454->"LS454"|`Illumina->"ILLUMINA"|`Solid->"SOLID"|`Helicos->"HELICOS"|`Ion_Torrent->"IONTORRENT"|`Pac_Bio->"PACBIO"letprint_read_group(x:read_group)=letstagvalue=matchvaluewith|None->""|Somex->sprintf"\t%s:%s"tagxinsprintf"@RG\tID:%s%s%s%s%s%s%s%s%s%s%s%s"x.id(s"CN"x.seq_center)(s"DS"x.description)(s"DT"(Option.mapx.run_date~f:(function|`Datex|`Timex->x)))(s"FO"x.flow_order)(s"KS"x.key_seq)(s"LB"x.library)(s"PG"x.program)(s"PI"(Option.mapx.predicted_median_insert_size~f:Int.to_string))(s"PL"(Option.mapx.platform~f:print_platform))(s"PU"x.platform_unit)(s"SM"x.sample)letprint_program(x:program)=letstagvalue=matchvaluewith|None->""|Somex->sprintf"\t%s:%s"tagxinsprintf"@PG\tID:%s%s%s%s%s%s"x.id(s"PN"x.name)(s"CL"x.command_line)(s"PP"x.previous_id)(s"DS"x.description)(s"VN"x.version)letprint_other((tag,l):string*tag_valuelist)=sprintf"@%s%s"tag(List.mapl~f:(fun(x,y)->sprintf"\t%s:%s"xy)|>String.concat~sep:"")(******************************************************************************)(* Alignment Printers *)(******************************************************************************)letprint_qname=functionSomex->x|None->"*"letprint_flags=Int.to_stringletprint_rname=functionSomex->x|None->"*"letprint_pos=functionSomex->Int.to_stringx|None->"0"letprint_mapq=functionSomex->Int.to_stringx|None->"255"letprint_cigar_op=function|`Alignment_matchx->sprintf"%dM"x|`Insertionx->sprintf"%dI"x|`Deletionx->sprintf"%dD"x|`Skippedx->sprintf"%dN"x|`Soft_clippingx->sprintf"%dS"x|`Hard_clippingx->sprintf"%dH"x|`Paddingx->sprintf"%dP"x|`Seq_matchx->sprintf"%d="x|`Seq_mismatchx->sprintf"%dX"xletprint_cigar=function|[]->"*"|cigar_ops->List.mapcigar_ops~f:print_cigar_op|>String.concat~sep:""letprint_rnext=function|None->"*"|Some`Equal_to_RNAME->"="|Some(`Valuex)->xletprint_pnext=functionSomex->Int.to_stringx|None->"0"letprint_tlen=functionSomex->Int.to_stringx|None->"0"letprint_seq=functionSomex->x|None->"*"letprint_qual=function|[]->"*"|quals->List.mapquals~f:(funx->ok_exn(Phred_score.to_char~offset:`Offset33x))|>String.of_char_listletprint_optional_field(x:optional_field)=lettyp,value=matchx.valuewith|`Ax->'A',Char.to_stringx|`ix->'i',Int64.to_stringx|`fx->'f',Float.to_stringx|`Zx->'Z',x|`Hx->'H',x|`B(c,l)->'B',(String.concat~sep:","((String.of_charc)::l))insprintf"%s:%c:%s"x.tagtypvalueletprint_alignmenta=sprintf"%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s"(print_qnamea.qname)(print_flagsa.flags)(print_rnamea.rname)(print_posa.pos)(print_mapqa.mapq)(print_cigara.cigar)(print_rnexta.rnext)(print_pnexta.pnext)(print_tlena.tlen)(print_seqa.seq)(print_quala.qual)(List.mapa.optional_fields~f:print_optional_field|>String.concat~sep:"\t")(******************************************************************************)(* Input/Output *)(******************************************************************************)moduleMakeIO(Future:Future.S)=structopenFuturemoduleLines=structincludeLinesincludeMakeIO(Future)endletread_headerlines=letrecloophdr:headerOr_error.tDeferred.t=Pipe.peek_deferredlines>>=(function|`Eof->return(Okhdr)|`Okline->ifString.length(line:Line.t:>string)=0thenreturn(Or_error.error_string"invalid empty line")elseifChar.((line:Line.t:>string).[0]<>'@')thenreturn(Okhdr)else(Pipe.junklines>>=fun()->parse_header_itemline|>function|Error_ase->returne|Ok(`HD({version;sort_order;group_order}:header_line))->(matchhdr.versionwith|Some_->return(Or_error.error_string"multiple @HD lines not allowed")|None->loop{hdrwithversion=Someversion;sort_order;group_order})|Ok(`SQx)->loop{hdrwithref_seqs=x::hdr.ref_seqs}|Ok(`RGx)->loop{hdrwithread_groups=x::hdr.read_groups}|Ok(`PGx)->loop{hdrwithprograms=x::hdr.programs}|Ok(`COx)->loop{hdrwithcomments=x::hdr.comments}|Ok(`Otherx)->loop{hdrwithothers=x::hdr.others}))inloopempty_header>>|function|Error_ase->e|Ok({version;sort_order;group_order;_}asx)->letref_seqs=List.revx.ref_seqsinletread_groups=List.revx.read_groupsinletprograms=List.revx.programsinletcomments=List.revx.commentsinletothers=List.revx.othersinheader?version?sort_order?group_order~ref_seqs~read_groups~programs~comments~others()letread?(start=Pos.(incr_lineunknown))r=letpos=refstartinletlines=Pipe.map(Lines.readr)~f:(funline->pos:=Pos.incr_line!pos;line)inread_headerlines>>|function|Error_ase->Or_error.tag_arge"position"!posPos.sexp_of_t|Okhdr->letalignments=Pipe.maplines~f:(funline->Or_error.tag_arg(parse_alignmentline)"position"!posPos.sexp_of_t)inOk(hdr,alignments)letwrite_headerw(h:header)=letopenWriterin(matchh.versionwith|None->Deferred.unit|Someversion->write_linew(print_header_line{version;sort_order=h.sort_order;group_order=h.group_order}))>>=fun()->Deferred.List.iterh.ref_seqs~f:(funx->write_linew(print_ref_seqx))>>=fun()->Deferred.List.iterh.read_groups~f:(funx->write_linew(print_read_groupx))>>=fun()->Deferred.List.iterh.programs~f:(funx->write_linew(print_programx))>>=fun()->Deferred.List.iterh.comments~f:(funx->writew"@CO\t">>=fun()->write_linewx)>>=fun()->Deferred.List.iterh.others~f:(funx->write_linew(print_otherx))letwritew?(header=empty_header)alignments=write_headerwheader>>=fun()->Pipe.iteralignments~f:(funa->Writer.write_linew(print_alignmenta))letwrite_file?perm?appendfile?headeralignments=Writer.with_file?perm?appendfile~f:(funw->writew?headeralignments)endincludeMakeIO(Future_unix)letparse_headertext=read_header(Lines.of_stringtext)