(** Parsing btab files. Treats query as the record unit rather than individual
hits. *)open!Base(** A record type for Btab homology search files
{1 Overview}
Unlike the {!Btab} module, the query sequence is the basis for the record.
All hits are grouped by the query sequence. For example, in a btab file with
query-target pairs, [q1-t1, q1-t2, q2-t2] then you will have two records.
One for [q1] and one for [q2]. *)
module Record : sig
  (** {1 API} *)

  (** A btab query record: the query name together with the list of all of
      that query's [Btab.Record.t] hits. *)
  type t [@@deriving sexp_of]

  (** {2 Creating} *)

  val create : string -> Btab.Record.t list -> t
  (** [create query_name hit_list] builds a [t] from the name of the query and
      its list of hits. You probably won't use this function directly. *)

  (** {2 Accessing} *)

  val query : t -> string
  (** [query t] is the name of the query whose hits are stored in [t]. *)

  val hits : t -> Btab.Record.t list
  (** [hits t] is the list of hits associated with this query sequence. *)
end = struct
  [@@@coverage off]

  type t = {query: string; hits: Btab.Record.t list} [@@deriving sexp_of]

  [@@@coverage on]

  (* Both arguments are stored as given; no validation or copying happens. *)
  let create query hits = {query; hits}

  let query {query; _} = query

  let hits {hits; _} = hits
end

(** [In_channel] for Btab files where each query in the file is a single record.
{1 Overview}
{b WARNING}: This module assumes that queries are sorted. One case in which
this assumption does not hold is with [mmseqs] when using more than one
iteration. E.g., [mmseqs easy-search --num-iterations 3]. This behavior will
likely change in the future.
You should consider this module experimental.
{1 Example}
Here is a short example program. It reads a btab file and prints out the
records.
{[
open! Base
open! Bio_io.Btab_queries
let parse_argv () =
match Sys.get_argv () with
| [|_; file_name|] ->
file_name
| _ ->
failwith "missing file_name"
let () =
let file_name = parse_argv () in
In_channel.with_file_iter_records file_name ~f:(fun r ->
Stdio.print_endline "===" ;
Stdio.print_endline @@ Record.query r ;
let hits = List.map ~f:Bio_io.Btab.Record.parse @@ Record.hits r in
Stdio.print_s @@ [%sexp_of: Bio_io.Btab.Record.Parsed.t list] hits )
]}
The output will be something like:
{[
===
Q 1
(((query "Q 1") (target q1t1) (pident 0.1) (alnlen 2) (mismatch 3)
(gapopen 4) (qstart 5) (qend 6) (tstart 7) (tend 8) (evalue 9.99E-05)
(bits 10) (qlen ()) (tlen ()))
((query "Q 1") (target q1t2) (pident 0.11) (alnlen 12) (mismatch 13)
(gapopen 14) (qstart 15) (qend 16) (tstart 17) (tend 18) (evalue 1.9E-05)
(bits 20) (qlen ()) (tlen ())))
===
Q_2
(((query Q_2) (target q2t1) (pident 0.21) (alnlen 22) (mismatch 23)
(gapopen 24) (qstart 25) (qend 26) (tstart 27) (tend 28) (evalue 2.9E-05)
(bits 30) (qlen ()) (tlen ())))
]} *)
module In_channel = struct
  module T = struct
    include Private.Peekable_in_channel

    type record = Record.t

    (* NOTE: grouping only looks at adjacent lines, so it assumes that all
       hits of a query are contiguous (sorted) in the input -- see the warning
       in the module documentation. One specific case in which they are not is
       with mmseqs searches with multiple iterations. *)

    (** [input_record ic] collects the next run of consecutive lines of [ic]
        that share a query name and returns them as one record, with the hits
        in the order they were read. It returns [None] if [peek_line] yields
        nothing before any line has been read. A line whose query differs
        from the current one is only peeked at, never consumed here. *)
    let input_record ic =
      (* Take one line off the channel and wrap it as a hit. *)
      let next_hit () =
        Btab.Record.of_string (input_line_exn ~fix_win_eol:true ic)
      in
      (* [rev_hits] is built newest-first, so reverse it on the way out. *)
      let finish name rev_hits =
        Some (Record.create name (List.rev rev_hits))
      in
      let rec gather current rev_hits =
        let upcoming = peek_line ~fix_win_eol:true ic in
        match current with
        | None -> (
          match upcoming with
          | None ->
              None
          | Some _ ->
              (* First line of a record: consume it right away; its query
                 becomes the name every following line is compared against. *)
              let hit = next_hit () in
              gather (Some (Btab.Record.query hit)) (hit :: rev_hits) )
        | Some name -> (
          match upcoming with
          | None ->
              finish name rev_hits
          | Some line ->
              (* Inspect the peeked line to decide whether it still belongs
                 to the current query. *)
              let peeked = Btab.Record.of_string line in
              if not (String.equal name (Btab.Record.query peeked)) then
                finish name rev_hits
              else
                (* Same query: consume the line for real and keep going. *)
                let hit = next_hit () in
                gather current (hit :: rev_hits) )
      in
      gather None []
  end

  include T
  include Record_in_channel.Make (T)
end