1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
let max_block_size = 0x10000
let max_isize = 0xff00
exception Error of string
type in_channel = {
ic : Stdlib.in_channel ;
in_bufz : bytes ;
in_buf : bytes ;
mutable in_block_offset : Int64.t ;
mutable in_pos : int ;
mutable in_avail : int ;
mutable in_eof : bool ;
mutable in_stream : Zlib.stream;
}
let of_in_channel ic = {
ic ;
in_bufz = Bytes.make max_block_size '\000' ;
in_buf = Bytes.make max_block_size '\000' ;
in_block_offset = Int64.zero ;
in_pos = 0 ;
in_avail = 0 ;
in_stream = Zlib.inflate_init false ;
in_eof = false
}
let open_in fn = of_in_channel (Stdlib.open_in_bin fn)
let dispose_in iz =
iz.in_eof <- true ;
Zlib.inflate_end iz.in_stream
let close_in iz =
dispose_in iz ;
In_channel.close iz.ic
let input_byte t = Caml.input_byte t
let input_u16 ic =
let b1 = input_byte ic in
let b2 = input_byte ic in
b1 + b2 lsl 8
let input_s32 ic =
let b1 = input_byte ic in
let b2 = input_byte ic in
let b3 = input_byte ic in
let b4 = input_byte ic in
let open Int32 in
bit_or (of_int_exn b1)
(bit_or (shift_left (of_int_exn b2) 8)
(bit_or (shift_left (of_int_exn b3) 16)
(shift_left (of_int_exn b4) 24)))
let iz =
match In_channel.input_byte iz.ic with
| None -> iz.in_eof <- true ; raise End_of_file
| Some id1 ->
try
let id2 = input_byte iz.ic in
if id1 <> 0x1F || id2 <> 0x8B then raise (Error "bad magic number, not a bgzf file") ;
let cm = input_byte iz.ic in
if cm <> 8 then raise(Error "unknown compression method") ;
let flags = input_byte iz.ic in
if flags <> 0x04 then raise(Error("bad flags, not a bgzf file"));
for _ = 1 to 6 do ignore (input_byte iz.ic : int) done;
let xlen = input_u16 iz.ic in
let si1 = input_byte iz.ic in
let si2 = input_byte iz.ic in
let slen = input_u16 iz.ic in
if si1 <> 66 || si2 <> 67 || slen <> 2 then raise (Error "bad extra subfield") ;
let bsize = input_u16 iz.ic in
for _ = 1 to xlen - 6 do ignore (input_byte iz.ic : int) done ;
bsize - xlen - 19
with End_of_file -> raise (Error "premature end of file, not a bgzf file")
let read_block iz =
let rec loop posz lenz pos len crc size =
let (finished, used_in, used_out) =
try
Zlib.inflate iz.in_stream iz.in_bufz posz lenz iz.in_buf pos len Zlib.Z_SYNC_FLUSH
with Zlib.Error (_, _) ->
raise (Error "error during decompression") in
let posz = posz + used_in in
let lenz = lenz - used_in in
let crc = Zlib.update_crc crc iz.in_buf pos used_out in
let size = size + used_out in
if finished then crc, size
else loop posz lenz (pos + used_out) (len - used_out) crc size
in
try
iz.in_block_offset <- In_channel.pos iz.ic ;
let cdata_size = read_header iz in
try
Stdlib.really_input iz.ic iz.in_bufz 0 cdata_size ;
let ref_crc = input_s32 iz.ic in
let ref_size = input_s32 iz.ic |> Int32.to_int_exn in
Zlib.inflate_end iz.in_stream ;
iz.in_stream <- Zlib.inflate_init false ;
let crc, size = loop 0 cdata_size 0 max_block_size Int32.zero 0 in
if Int32.(crc <> ref_crc) then raise (Error "CRC mismatch, data corrupted") ;
if size <> ref_size then raise (Error "size mismatch, data corrupted") ;
iz.in_pos <- 0 ;
iz.in_avail <- size
with End_of_file -> raise (Error "premature end of file, not a bgzf file")
with End_of_file -> iz.in_eof <- true
let input iz buf pos len =
let n = Bytes.length buf in
if pos < 0 || len < 0 || pos + len > n then raise (Invalid_argument "Bgzf.input") ;
if iz.in_eof then 0
else (
let rec loop pos len read =
if len = 0 then read
else (
if iz.in_pos = iz.in_avail then read_block iz ;
if iz.in_eof then read
else (
let n = min (iz.in_avail - iz.in_pos) len in
Stdlib.Bytes.blit iz.in_buf iz.in_pos buf pos n ;
iz.in_pos <- iz.in_pos + n ;
loop (pos + n) (len - n) (read + n)
)
)
in
loop pos len 0
)
let rec really_input iz buf pos len =
if len <= 0 then ()
else (
let n = input iz buf pos len in
if n = 0 then raise End_of_file
else really_input iz buf (pos + n) (len - n)
)
let input_string iz n =
if n < 0 then raise (Invalid_argument "Bgzf.input_string iz n: n should be non negative") ;
let r = Bytes.make n '@' in
really_input iz r 0 n ;
Bytes.unsafe_to_string ~no_mutation_while_string_reachable:r
let input_char =
let buf = Bytes.create 1 in
fun iz ->
if input iz buf 0 1 = 0 then raise End_of_file
else Bytes.get buf 0
let input_u8 iz =
Char.to_int (input_char iz)
let input_s8 iz =
let b = input_u8 iz in
if b land 128 <> 0 then b - 256
else b
let input_u16 iz =
let b1 = input_u8 iz in
let b2 = input_u8 iz in
b1 lor (b2 lsl 8)
let input_s16 iz =
let i = input_u16 iz in
if i land 32768 <> 0 then i - 65536
else i
let input_s32 iz =
let b1 = input_u8 iz in
let b2 = input_u8 iz in
let b3 = input_u8 iz in
let b4 = input_u8 iz in
Int32.bit_or (Int32.of_int_exn b1)
(Int32.bit_or (Int32.shift_left (Int32.of_int_exn b2) 8)
(Int32.bit_or (Int32.shift_left (Int32.of_int_exn b3) 16)
(Int32.shift_left (Int32.of_int_exn b4) 24)))
let seek_in iz i =
let coffset = Int64.shift_right i 16 in
let uoffset = Int64.(to_int_exn (bit_and 0xFFFFL i)) in
In_channel.seek iz.ic coffset ;
iz.in_block_offset <- coffset ;
iz.in_eof <- false ;
if uoffset = 0 then (
iz.in_pos <- 0 ;
iz.in_avail <- 0
)
else (
read_block iz ;
iz.in_pos <- iz.in_pos + uoffset
)
let virtual_offset iz =
if iz.in_pos = iz.in_avail then
Int64.(shift_left (In_channel.pos iz.ic) 16)
else
Int64.(shift_left iz.in_block_offset 16 + of_int_exn iz.in_pos)
let with_file_in fn ~f =
let iz = open_in fn in
let r =
try `Ok (f iz)
with e -> `Error e
in
close_in iz ;
match r with
| `Ok y -> y
| `Error exn -> raise exn
exception Unparser_error of string
type out_channel = {
out_chan : Stdlib.out_channel ;
out_ubuffer : bytes ;
out_cbuffer : bytes ;
mutable out_pos : int ;
out_level : int ;
}
let output_int16 oc n =
Out_channel.output_byte oc n ;
Out_channel.output_byte oc (n lsr 8)
let output_int32 oc n =
let r = ref n in
for _ = 1 to 4 do
Out_channel.output_byte oc (Int32.to_int_exn !r);
r := Int32.shift_right_logical !r 8
done
let write_block oc buf len ~isize ~crc32 =
let xlen = 6 in
let bsize = 20 + xlen + len in
assert (bsize < 0x10000) ;
Out_channel.output_byte oc 0x1F;
Out_channel.output_byte oc 0x8B;
Out_channel.output_byte oc 8;
Out_channel.output_byte oc 4;
for _ = 1 to 4 do
Out_channel.output_byte oc 0
done ;
Out_channel.output_byte oc 0;
Out_channel.output_byte oc 0xFF;
output_int16 oc xlen ;
Out_channel.output_byte oc 0x42 ;
Out_channel.output_byte oc 0x43 ;
output_int16 oc 2 ;
output_int16 oc (bsize - 1);
Caml.output oc buf 0 len ;
output_int32 oc crc32 ;
output_int32 oc isize
let of_out_channel ?(level = 6) oc =
if level < 1 || level > 9 then raise (invalid_arg "Bgzf: bad compression level") ;
{
out_chan = oc;
out_ubuffer = Bytes.create max_isize ;
out_cbuffer = Bytes.create max_block_size ;
out_pos = 0 ;
out_level = level ;
}
let open_out ?(level = 6) filename =
of_out_channel ~level (Stdlib.open_out_bin filename)
let push_block oz =
let stream = Zlib.deflate_init oz.out_level false in
let (_, used_in, used_out) =
try
Zlib.deflate
stream
oz.out_ubuffer 0 oz.out_pos
oz.out_cbuffer 0 (Bytes.length oz.out_cbuffer)
Zlib.Z_FINISH
with Zlib.Error(_, _) -> raise (Unparser_error("error during compression"))
in
assert (used_in = oz.out_pos) ;
let crc32 = Zlib.update_crc Int32.zero oz.out_ubuffer 0 used_in in
Zlib.deflate_end stream ;
write_block oz.out_chan oz.out_cbuffer used_out
~isize:(Int32.of_int_exn used_in) ~crc32 ;
oz.out_pos <- 0
let rec output ~length ~blit oz buf ~pos ~len =
if pos < 0 || len < 0 || pos + len > length buf then invalid_arg "Bgzf.output";
if oz.out_pos = Bytes.length oz.out_ubuffer then push_block oz ;
let available = Bytes.length oz.out_ubuffer - oz.out_pos in
let ncopy = min len available in
blit buf pos oz.out_ubuffer oz.out_pos ncopy ;
oz.out_pos <- oz.out_pos + ncopy ;
let remaining = len - ncopy in
if remaining > 0 then output ~length ~blit oz buf ~pos:(pos + ncopy) ~len:remaining
let output_from_string = output ~length:String.length ~blit:Caml.Bytes.blit_string
let output = output ~length:Bytes.length ~blit:Caml.Bytes.blit
let output_char =
let buf = Bytes.make 1 ' ' in
fun oz c ->
Bytes.set buf 0 c ;
output oz buf ~pos:0 ~len:1
let output_u8 oz n =
output_char oz (Char.unsafe_of_int (n land 0xFF))
let output_s8 oz n =
if n < -0x80 || n > 0x7F then raise (Invalid_argument "Bgzf.output_s8") ;
if n < 0 then
output_u8 oz (n + 256)
else
output_u8 oz n
let output_u16 oz n =
output_u8 oz n ;
output_u8 oz (n lsr 8)
let output_s16 oz n =
if n < -0x8000 || n > 0x7FFF then raise (Invalid_argument "Bgzf.output_s16") ;
if n < 0 then
output_u16 oz (65536 + n)
else
output_u16 oz n
let output_s32 oz n =
let base = Int32.to_int_exn n in
let big = Int32.to_int_exn (Int32.shift_right_logical n 24) in
output_u8 oz base ;
output_u8 oz (base lsr 8) ;
output_u8 oz (base lsr 16) ;
output_u8 oz big
let output_string oz s =
output_from_string oz s ~pos:0 ~len:(String.length s)
let bgzf_eof = "\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00BC\x02\x00\x1b\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00"
let dispose_out oz =
if oz.out_pos > 0 then push_block oz ;
Stdlib.output_string oz.out_chan bgzf_eof
let close_out oz =
dispose_out oz ;
Stdlib.close_out oz.out_chan
let with_file_out ?level fn ~f =
let oz = open_out ?level fn in
let r =
try `Ok (f oz)
with e -> `Error e
in
close_out oz ;
match r with
| `Ok y -> y
| `Error exn -> raise exn