1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
open CFStream
type probe = {org_name:string; version:string; chr_name:string; start_pos:int; sequence:Seq.t}
type row = {pmcoord:int*int; mmcoord:int*int; probe:probe}
type t = row list
exception Bad of string
let raise_bad msg = raise (Bad msg)
let col_names = ["PMX";"PMY";"MMX";"MMY";"Seq";"Pos";"Probe"]
let num_probes = List.length
let iter f l = List.iter ~f l
let fold f init l = List.fold_left ~f ~init l
let to_list t = t
module Parser = struct
let (s:string) : string list =
let sl = String.split s ~on:'\t' in
if Poly.(sl = col_names) then sl
else raise_bad "incorrectly formatted header"
let row ~chr_map (s:string) : row =
match String.split s ~on:'\t' with
| [pmcx; pmcy; mmcx; mmcy; org_ver_chr; pos; seq] ->
let org, ver_chr =
match String.split org_ver_chr ~on:':' with
| [o; ver_chr] -> o, ver_chr
| _ -> raise_bad "expecting exactly one colon in Seq column" in
let ver, chr =
match String.split ver_chr ~on:';' with
| [v; c] -> v, chr_map c
| _ -> raise_bad "expecting exactly one semicolon in Seq column" in
{
pmcoord = int_of_string pmcx, int_of_string pmcy;
mmcoord = int_of_string mmcx, int_of_string mmcy;
probe =
{
org_name = org;
version = ver;
chr_name = chr;
start_pos = int_of_string pos;
sequence = try Seq.of_string seq
with Seq.Bad m -> raise_bad m
}
}
| _ -> raise_bad "expecting 7 columns"
let bpmap ~chr_map file =
let parse file cin =
let lines = Lines.of_channel cin in
let err msg = Msg.err ~pos:(Pos.make ~source:file ~line:(Stream.count lines) ()) msg in
try
ignore (header ((Stream.next_exn lines) : Lines.item :> string) : string list);
Stream.to_list (Stream.map ~f:(fun (x : Lines.item) -> row ~chr_map (x :> string)) lines)
with
Failure msg | Bad msg -> raise_bad (err msg)
in
In_channel.with_file file ~f:(parse file)
end
let of_file ?(chr_map=ident) file = Parser.bpmap ~chr_map file
let row_to_string r =
let (pmx,pmy) = r.pmcoord in
let (mmx,mmy) = r.mmcoord in
String.concat ~sep:"\t"
[string_of_int pmx;
string_of_int pmy;
string_of_int mmx;
string_of_int mmy;
r.probe.org_name ^ ":" ^ r.probe.version ^ ";" ^ r.probe.chr_name;
string_of_int (r.probe.start_pos);
Seq.to_string (r.probe.sequence)
]
let to_file file t =
let print cout =
fprintf cout "%s\n" (String.concat ~sep:"\t" col_names);
List.iter ~f:(fun x -> fprintf cout "%s\n" (row_to_string x)) t
in
Out_channel.with_file file ~f:print