1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
open Core
type item = {
description : string;
sequence : string;
}
[@@ deriving sexp, show]
module Parser = struct
open Angstrom
let is_not_eol = function
| '\n' | '\r' -> false
| _ -> true
let eolf = end_of_line <|> end_of_input
let =
choice [ char ';' ; char '#' ] >>= fun ->
take_while is_not_eol >>= fun ->
eolf *>
return (comment_char, comment_msg)
let = many comment_line
let description_line =
char '>' *> take_while is_not_eol <* eolf
let sequence_line =
peek_char_fail >>= (function
| '>' -> fail "Expected sequence line, not description"
| _ -> take_while is_not_eol
) <* eolf
let item =
description_line >>= fun description ->
many sequence_line >>= fun seqs ->
let sequence = String.concat seqs in
return { description ; sequence }
let fasta =
let p =
header >>= fun ->
many item >>= fun items ->
end_of_input >>| fun () ->
List.map ~f:snd header, items
in
p <?> "fasta"
end
let from_string s = Angstrom.(parse_string ~consume:All) Parser.fasta s
let%expect_test "parse FASTA without header" =
let fa = from_string ">s1\nAA\n>s2\nA\n" in
print_endline @@ [%show: (string list * item list, string) result] fa ;
[%expect {|
(Ok ([],
[{ Fasta.description = "s1"; sequence = "AA" };
{ Fasta.description = "s2"; sequence = "A" }])) |}]
let%expect_test "parse FASTA with header" =
let fa = from_string ";a few\n#sequences\n>s1\nAA\n>s2\nA\n" in
print_endline @@ [%show: (string list * item list, string) result] fa ;
[%expect {|
(Ok (["a few"; "sequences"],
[{ Fasta.description = "s1"; sequence = "AA" };
{ Fasta.description = "s2"; sequence = "A" }])) |}]
let from_file fn =
In_channel.with_file fn ~f:(fun ic ->
let Angstrom.Buffered.{ buf ; off ; len }, res = Angstrom_unix.parse Parser.fasta ic in
match res with
| Ok r -> Ok r
| Error _ ->
let snippet = Bigstringaf.substring buf ~off ~len:(Int.min 30 len) in
let msg = sprintf "Failed to parse: %s" snippet in
Error msg
)
let from_file_exn fn =
from_file fn
|> Result.ok_or_failwith
let sequences_from_file_exn fn =
match from_file fn with
| Ok (_, items) -> List.map items ~f:(fun i -> i.sequence)
| Error msg -> failwith msg
let to_file fn items =
Out_channel.with_file fn ~f:(fun oc ->
List.iter items ~f:(fun it ->
fprintf oc ">%s\n%s\n" it.description it.sequence
)
)