123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310(* FIXME: max_line_length and alphabet format options are not implemented *)openRresulttypeheader=stringlisttypeitem={description:string;sequence:string;}[@@derivingsexp](* FIXME: should check there is no newline in the arguments *)letitem~description~sequence={description;sequence;}typefmt={allow_sharp_comments:bool;allow_semicolon_comments:bool;allow_empty_lines:bool;max_line_length:intoption;alphabet:stringoption;}letfmt?(allow_sharp_comments=true)?(allow_semicolon_comments=false)?(allow_empty_lines=false)?max_line_length?alphabet()={allow_sharp_comments;allow_semicolon_comments;allow_empty_lines;max_line_length;alphabet;}letdefault_fmt=fmt()typeitem0=[|`Commentofstring|`Empty_line|`Descriptionofstring|`Partial_sequenceofstring][@@derivingsexp]letsequence_to_int_lists=tryString.splits~on:' '|>List.map~f:Int.of_string|>R.okwithFailuremsg->R.error_msgmsgtypeparser_error=[`Fasta_parser_errorofint*string][@@derivingsexp]moduleParser0=structtypestate={fmt:fmt;line:int;line_start:bool;started_first_item:bool;symbol:symbol;}andsymbol=|S(* start of comment or description *)|Commentofstring|Descriptionofstring|Sequenceof{empty:bool}|Terminalletinitial_state?(fmt=default_fmt)()={fmt;line=0;line_start=true;started_first_item=false;symbol=S}letfailstmsg=R.fail(`Fasta_parser_error(st.line,msg))letfailfstfmt=letkx=failstxinPrintf.ksprintfkfmtletnewline?symst={stwithline=st.line+1;line_start=true;symbol=matchsymwith|None->st.symbol|Somes->s}letstepst=function|None->(matchst.symbolwith|S->R.ok({stwithsymbol=Terminal},[])|Commentc->assert(notst.started_first_item);R.ok({stwithsymbol=Terminal},[`Commentc])|Description_->failst"Missing sequence in last item"|Sequence{empty=true}->failst"Missing sequence in last item"|Sequence{empty=false}->R.ok({stwithsymbol=Terminal},[])|Terminal->R.ok(st,[]))|Somebuf->(letallowed_comment_charc=letopenCharin(c='#'&&st.fmt.allow_sharp_comments)||(c=';'&&st.fmt.allow_semicolon_comments)inletn=String.lengthbufinletrecloopstaccuij=ifj<nthenmatchbuf.[j],st.line_start,st.symbolwith|_,_,Terminal->R.ok(st,[])|_,false,S->assertfalse(* unreachable state *)|'>',true,S->loop{stwithline_start=false;started_first_item=true;symbol=Description""}accu(j+1)(j+1)|_,true,Comment_|_,true,Description_->assertfalse(* unreachable states *)|'>',true,Sequence{empty=true}->failst"Expected sequence, not description"|'>',true,Sequence{empty=false}->loop{stwithline_start=false;symbol=Description""}accu(j+1)(j+1)|(';'|'#'asc),true,S->assert(i=j&&i=0);ifallowed_comment_charcthenloop{stwithline_start=false;symbol=Comment""}accu(i+1)(j+1)elsefailfst"Character %c not allowed for comments"c|(';'|'#'),true,Sequence_->failst"Comment after first item"|'\n',true,(Sequence_|S)->ifst.fmt.allow_empty_linesthenloop(newlinest)(`Empty_line::accu)(j+1)(j+1)elsefailst"Empty line"|c,true,S->failfst"Unexpected character %c at beginning of line"c|'\n',false,Commentc->letc'=String.subbuf~pos:i~len:(j-i)inloop(newlinest~sym:S)(`Comment(c^c')::accu)(j+1)(j+1)|'\n',false,Descriptiond->letd'=String.subbuf~pos:i~len:(j-i)inloop(newlinest~sym:(Sequence{empty=true}))(`Description(d^d')::accu)(j+1)(j+1)|'\n',false,Sequence_->letseq=String.subbuf~pos:i~len:(j-i)inloop(newlinest~sym:(Sequence{empty=false}))(`Partial_sequenceseq::accu)(j+1)(j+1)|_,false,(Comment_|Description_|Sequence{empty=false})->loopstaccui(j+1)|_,false,Sequence{empty=true}->assertfalse(* unreachable state *)|_,true,Sequence{empty=true}->loop{stwithline_start=false;symbol=Sequence{empty=false}}accui(j+1)|_,true,Sequence{empty=false}->loop{stwithline_start=false}accui(j+1)elsematchst.symbolwith|S|Terminal->R.ok(st,accu)|Commentc->letc'=String.subbuf~pos:i~len:(j-i)inR.ok({stwithsymbol=Comment(c^c')},accu)|Descriptiond->letd'=String.subbuf~pos:i~len:(j-i)inR.ok({stwithsymbol=Description(d^d')},accu)|Sequence_assym->letsymbol,res=ifi=jthensym,accuelseletseq=String.subbuf~pos:i~len:(j-i)inSequence{empty=false},(`Partial_sequenceseq::accu)inR.ok({stwithsymbol},res)inloopst[]00>>|fun(st,res)->st,List.revres)endletunparser0=function|`Commentc->"#"^c|`Empty_line->""|`Descriptiond->">"^d|`Partial_sequences->s(* This could probably be optimized *)letrev_concatxs=String.concat~sep:""(List.revxs)moduleParser=structtypestate={state0:Parser0.state;symbol:symbol;}andsymbol=|Init|Itemofstring*stringlist|Terminalletinitial_state?fmt()={state0=Parser0.initial_state?fmt();symbol=Init}letstep_aux(sym,accu)item0=matchitem0,symwith|_,Terminal->Terminal,accu|(`Comment_|`Empty_line),_->sym,accu|`Descriptiond,Init->Item(d,[]),accu|`Description_,Item(_,[])->assertfalse(* should be detected by Parser0.step *)|`Descriptiond',Item(d,xs)->letitem=item~description:d~sequence:(rev_concatxs)inItem(d',[]),item::accu|`Partial_sequence_,Init->assertfalse(* should be detected by Parser0.step *)|`Partial_sequences,Item(d,xs)->Item(d,s::xs),acculetstep_finalinput((symbol,items)asres)=matchinput,symbolwith|Some_,_|None,(Init|Terminal)->res|None,Item(d,xs)->Terminal,item~description:d~sequence:(rev_concatxs)::itemsletstepstinput=Parser0.stepst.state0input>>|fun(state0,items0)->letinit=st.symbol,[]inletsymbol,items=List.fold_leftitems0~init~f:step_aux|>step_finalinputin{state0;symbol},List.revitemsendletunparseritem=Printf.sprintf">%s\n%s\n"item.descriptionitem.sequence