Source file alignment_stats.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
open Core_kernel
open Bistro
open Formats

let bamstats (bam : bam file) =
  let f = fun%workflow dest ->
    let open Biocaml_ez in
    let open CFStream in
    Bam.with_file [%path bam] ~f:(fun _ als ->
        Stream.fold als ~init:Bamstats.zero ~f:Bamstats.update
      )
    |> Bamstats.sexp_of_t
    |> Sexp.to_string_hum
    |> fun data -> Out_channel.write_all dest ~data
  in
  Workflow.path_plugin ~descr:"alignment_stats.bamstats" f

let fragment_length_stats (bam : bam file) =
  let f = fun%workflow dest ->
    let open Biocaml_ez in
    let open CFStream in
    Bam.with_file0 [%path bam] ~f:Bamstats.Fragment_length_histogram.(fun _ als ->
        let h = create ~min_mapq:5 () in
        Stream.iter als ~f:(fun al -> ok_exn (update0 h al)) ;
        Biocaml_unix.Accu.Counter.to_alist h.counts
      ) ;
    |> [%sexp_of: (int * int) list]
    |> Sexp.to_string_hum
    |> fun data -> Out_channel.write_all dest ~data
  in
  Workflow.path_plugin ~descr:"alignment_stats.fragment_length_histogram" f
  
let chrstats (bam : bam file) =
  let f = fun%workflow dest ->
    let open Biocaml_ez in
    let open CFStream in
    Bam.with_file0 [%path bam] ~f:Bamstats.Chr_histogram.(fun header als ->
        let h = create ~min_mapq:5 header in
        Stream.iter als ~f:(fun al -> ok_exn (update0 h al)) ;
        Biocaml_unix.Accu.Counter.to_alist h.counts
      ) ;
    |> [%sexp_of: (string * int) list]
    |> Sexp.to_string_hum
    |> fun data -> Out_channel.write_all dest ~data
  in
  Workflow.path_plugin ~descr:"alignment_stats.chrstats" f
  
let summary ~sample_name ~mapped_reads samples =
  let stat_files =
    List.map samples ~f:(fun s -> bamstats (mapped_reads s))
    |> Workflow.path_list
 
  and samples = List.map samples ~f:sample_name

  and chrstat_files =
    List.map samples ~f:(fun s -> chrstats (mapped_reads s))
    |> Workflow.path_list
  in
  let f = fun%workflow dest ->
    let    stat_files = [%eval stat_files] in
    let       samples = [%param samples] in
    let chrstat_files = [%eval chrstat_files] in

    let open Biocaml_ez in
    let open Tyxml_html in
    let k = txt in
    let stats =
      List.map stat_files ~f:(fun fn ->
          In_channel.read_all fn
          |> Sexp.of_string
          |> Bamstats.t_of_sexp
        )
    in
    let chrstats = match chrstat_files with
      | [] -> None
      | xs ->
        Some (
          List.map xs ~f:(fun fn ->
              In_channel.read_all fn
              |> Sexp.of_string
              |> [%of_sexp: (string * int) list]
            )
        )
    in
    let chrstat_table samples = function
      | [] -> raise (Invalid_argument "chrstat_table: needs at least one sample")
      | stats ->
        let all_chr =
          List.map stats ~f:(List.map ~f:fst)
          |> List.concat
          |> List.dedup_and_sort ~compare:String.compare
        in
        let header =
          thead [
            tr (
              th [k "Sample"] :: List.map all_chr ~f:(fun chr -> th [k chr])
            ) ;
          ]
        in
        let line sample stats =
          let n = List.fold stats ~init:0 ~f:(fun acc (_, n) -> acc + n) in
          let cols = List.map stats ~f:(fun (_, k) ->
              let p = Float.(of_int k / of_int n *. 100.) in
              td [ txt (sprintf "%.1f%%" p) ] ;
            )
          in
          tr (td [ k sample ] :: cols)
        in
        let lines = List.map2_exn samples stats ~f:line in
        table ~thead:header ~a:[a_class ["table"]] lines
    in
    let flagstat_table samples stats =
      let fraction n k =
        let f =
          if n <= 0 then 0.
          else Float.(of_int k / of_int n * 100.)
        in
        txt (sprintf "%d (%.1f%%)" k f)
      in
      let header =
        thead [
          tr [
            th [k "Sample"] ;
            th [k "Total"] ;
            th [k "QC pass"] ;
            th [k "Mapped reads"] ;
            th [k "Read pairs"] ;
            th [k "Mapped pairs"] ;
          ]
        ]
      in
      let line sample { Bamstats.total ;
                        qc_pass ;
                        read_pairs ;
                        mapped_reads ;
                        mapped_pairs ; _ } =
        tr [
          td [ k sample ] ;
          td [ k Int.(to_string total) ] ;
          td [ fraction total qc_pass ] ;
          td [ fraction total mapped_reads ] ;
          td [ k Int.(to_string read_pairs) ] ;
          td [ fraction read_pairs mapped_pairs ] ;
        ]
      in
      let lines = List.map2_exn samples stats ~f:line in
      table ~thead:header ~a:[a_class ["table"]] lines
    in
    let contents = [
      flagstat_table samples stats ;
      br () ;
      Option.value_map chrstats ~default:(txt "") ~f:(chrstat_table samples) ;
    ]
    in
    let bootstrap_head t =
      let open Tyxml_html in
      head (title (txt t)) [
        link ~rel:[`Stylesheet] ~href:"http://netdna.bootstrapcdn.com/bootstrap/3.0.2/css/bootstrap.min.css" () ;
        link ~rel:[`Stylesheet] ~href:"http://netdna.bootstrapcdn.com/bootstrap/3.0.2/css/bootstrap-theme.min.css" () ;
        script ~a:[a_src "https://code.jquery.com/jquery.js"] (txt "") ;
        script ~a:[a_src "http://netdna.bootstrapcdn.com/bootstrap/3.0.2/js/bootstrap.min.js"] (txt "") ;
      ]
    in
    let page ~title:t contents =
      html (bootstrap_head t) (body [ div ~a:[a_class ["container"]] contents ])
    in
    let to_file doc fn =
      Out_channel.with_file fn ~f:(fun oc ->
          Tyxml_html.pp () (Format.formatter_of_out_channel oc) doc
        )
    in
    to_file
      (page ~title:"Alignment summary" contents)
      dest
  in
  Workflow.path_plugin ~descr:"alignment_stats.summary" f