1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
open Core
(** [seq_of_sequence xs] views the Core sequence [xs] as a standard-library
    [Seq.t]. Nothing is computed up front: every time a node of the result
    is forced, [Sequence.next] is called once on the remaining sequence. *)
let seq_of_sequence xs = Seq.unfold Sequence.next xs
(** Maps keyed by strings ([Map.Make (String)]), extended with conversions
    to and from the standard library's [Seq.t]. *)
module String_map = struct
  include Map.Make (String)

  (** [to_seq m] enumerates the bindings of [m] as [(key, data)] pairs, in
      the order in which [Map.to_sequence] yields them. *)
  let to_seq m =
    let bindings = Map.to_sequence m in
    seq_of_sequence bindings

  (** [of_seq bindings] inserts every [(key, data)] pair of [bindings] into
      an initially empty map with [Map.set]; a key seen several times keeps
      the data of its last occurrence. *)
  let of_seq bindings =
    let insert m (key, data) = Map.set m ~key ~data in
    Seq.fold_left insert empty bindings
end
(** Selections of genomic positions, stored as one [ISet.t] of integer
    positions per chromosome name. *)
module Selection = struct
  type t = ISet.t String_map.t

  (** The selection that contains no chromosome at all. *)
  let empty = String_map.empty

  (** [add sel loc] adds the range [loc.lo], [loc.hi] (passed unchanged to
      [ISet.add_range]) to the set kept for chromosome [loc.chr]; the set is
      created when the chromosome is not present yet. *)
  let add sel GLoc.{ chr ; lo ; hi } =
    let set_chr = Option.value (Map.find sel chr) ~default:ISet.empty in
    Map.set sel ~key:chr ~data:(ISet.add_range set_chr lo hi)

  (** [inter u v] keeps the chromosomes bound in both [u] and [v], each
      mapped to the intersection of its two sets. A chromosome whose
      intersection is empty is still bound (to an empty set). *)
  let inter u v =
    Map.fold u ~init:String_map.empty ~f:(fun ~key:k ~data:set_u accu ->
        match Map.find v k with
        | Some set_v -> Map.set accu ~key:k ~data:(ISet.inter set_u set_v)
        | None -> accu)

  (** [union u v] binds every chromosome present in [u] or [v] to the union
      of the sets it has on either side.

      Implemented with [Map.merge], which walks both maps once, instead of
      building a sorted, de-duplicated key list and looking each key up in
      both maps. *)
  let union (u : t) (v : t) : t =
    Map.merge u v ~f:(fun ~key:_ sets ->
        match sets with
        | `Left only | `Right only -> Some only
        | `Both (set_u, set_v) -> Some (ISet.union set_u set_v))

  (** [diff u v] keeps every chromosome of [u]; positions also selected in
      [v] on the same chromosome are removed with [ISet.diff]. *)
  let diff u v =
    Map.fold u ~init:String_map.empty ~f:(fun ~key:k ~data:set_u accu ->
        let set_u' =
          match Map.find v k with
          | Some set_v -> ISet.diff set_u set_v
          | None -> set_u
        in
        Map.set ~key:k ~data:set_u' accu)

  (** [size x] is the sum of [ISet.cardinal] over all chromosomes of [x]. *)
  let size x =
    Map.fold x ~init:0 ~f:(fun ~key:_ ~data:set accu -> ISet.cardinal set + accu)

  (** [overlap sel loc] is the number of positions of the range
      [loc.lo], [loc.hi] that are selected on chromosome [loc.chr];
      [0] when the chromosome is absent from [sel]. *)
  let overlap sel GLoc.{ chr ; lo ; hi } =
    match Map.find sel chr with
    | Some x -> ISet.cardinal (ISet.inter (ISet.add_range ISet.empty lo hi) x)
    | None -> 0

  (** [intersects sel loc] tells whether [ISet.intersects_range] holds for
      the set of chromosome [loc.chr] and the range [loc.lo], [loc.hi];
      [false] when the chromosome is absent from [sel]. *)
  let intersects sel GLoc.{ chr ; lo ; hi } =
    Option.value_map
      (Map.find sel chr)
      ~default:false
      ~f:(fun x -> ISet.intersects_range x lo hi)

  (** [to_seq sel] lists the selection as locations: for each chromosome,
      one [GLoc.t] per [(lo, hi)] pair yielded by [ISet.to_seq]. *)
  let to_seq sel =
    String_map.to_seq sel
    |> Seq.map (fun (chr, s) ->
        Seq.map
          (fun (lo, hi) -> GLoc.{ chr ; lo ; hi })
          (ISet.to_seq s))
    |> Seq.concat

  (** [of_seq e] builds the selection covering all the locations of [e].
      Locations are grouped by chromosome with a [Binning] accumulator whose
      per-chromosome value starts at [ISet.empty] and grows through
      [ISet.add_range]. *)
  let of_seq e =
    let accu =
      Binning.create
        ~bin:(fun x -> x.GLoc.chr)
        ~zero:ISet.empty
        ~add:GLoc.(fun loc x -> ISet.add_range x loc.lo loc.hi)
        ()
    in
    Seq.iter (fun loc -> Binning.add accu loc loc) e ;
    String_map.of_seq (Binning.seq accu)
end
(** Maps from genomic locations to values, stored as one [Interval_tree.t]
    per chromosome name. *)
module LMap = struct
  type 'a t = 'a Interval_tree.t String_map.t

  (** The map with no binding. *)
  let empty = String_map.empty

  (** [intersects lmap loc] is the answer of [Interval_tree.intersects] on
      the tree of chromosome [loc.chr] for the bounds [loc.lo], [loc.hi];
      [false] when that chromosome has no tree. *)
  let intersects lmap { GLoc.chr ; lo ; hi } =
    match Map.find lmap chr with
    | None -> false
    | Some tree -> Interval_tree.intersects tree ~low:lo ~high:hi

  (** [closest lmap loc] returns the binding found by
      [Interval_tree.find_closest] on the tree of [loc.chr], as
      [(location, value, d)] where [d] is the last component returned by
      [find_closest]. The result is [None] when the chromosome is unknown
      or when its tree raises [Interval_tree.Empty_tree]. *)
  let closest lmap { GLoc.chr ; lo ; hi } =
    match Map.find lmap chr with
    | None -> None
    | Some tree ->
      (match Interval_tree.find_closest tree lo hi with
       | lo, hi, label, d -> Some ({ GLoc.chr ; lo ; hi }, label, d)
       | exception Interval_tree.Empty_tree -> None)

  (** [intersecting_elems lmap loc] enumerates, as [(location, value)]
      pairs, what [Interval_tree.find_intersecting_elem] returns for
      [loc.lo], [loc.hi] on the tree of [loc.chr]; the empty sequence when
      that chromosome has no tree. *)
  let intersecting_elems lmap { GLoc.chr ; lo ; hi } =
    let relocate (lo, hi, x) = ({ GLoc.chr ; lo ; hi }, x) in
    match Map.find lmap chr with
    | None -> Seq.empty
    | Some tree ->
      Interval_tree.find_intersecting_elem tree lo hi |> Seq.map relocate

  (** [to_seq lmap] enumerates all bindings as [(location, value)] pairs,
      chromosome by chromosome. *)
  let to_seq lmap =
    let bindings_of (chr, tree) =
      Interval_tree.to_seq tree
      |> Seq.map (fun (lo, hi, x) -> ({ GLoc.chr ; lo ; hi }, x))
    in
    String_map.to_seq lmap |> Seq.map bindings_of |> Seq.concat

  (** [of_seq e] builds a map from [(location, value)] pairs. Pairs are
      grouped by chromosome with a [Binning] accumulator and inserted one by
      one into that chromosome's interval tree. *)
  let of_seq e =
    let accu =
      Binning.create
        ~bin:(fun (l : GLoc.t) -> l.chr)
        ~zero:Interval_tree.empty
        ~add:(fun ((l : GLoc.t), v) tree ->
            Interval_tree.add tree ~data:v ~low:l.lo ~high:l.hi)
        ()
    in
    let register (loc, value) = Binning.add accu loc (loc, value) in
    Seq.iter register e ;
    String_map.of_seq (Binning.seq accu)

  (** [add m k v] inserts value [v] at location [k], starting from an empty
      tree when chromosome [k.chr] is not bound in [m] yet. *)
  let add m k v =
    let { GLoc.chr ; lo ; hi } = k in
    let tree =
      match Map.find m chr with
      | Some t -> t
      | None -> Interval_tree.empty
    in
    Map.set m ~key:chr ~data:(Interval_tree.add tree ~data:v ~low:lo ~high:hi)
end
(** Sets of genomic locations: an [LMap] whose values are all [()]. Every
    operation delegates to [LMap] and drops the unit payload. *)
module LSet = struct
  module T = Interval_tree

  type t = unit Interval_tree.t String_map.t

  (** The set with no location. *)
  let empty = String_map.empty

  (** Same as [LMap.intersects]. *)
  let intersects lset loc = LMap.intersects lset loc

  (** [closest lset loc] is the result of [LMap.closest] without its unit
      payload: [Some (location, d)], or [None]. *)
  let closest lset loc =
    match LMap.closest lset loc with
    | None -> None
    | Some (nearest, (), d) -> Some (nearest, d)

  (** Locations returned by [LMap.intersecting_elems] for [loc]. *)
  let intersecting_elems lset loc =
    Seq.map fst (LMap.intersecting_elems lset loc)

  (** All the locations of the set. *)
  let to_seq lset = Seq.map fst (LMap.to_seq lset)

  (** [of_seq e] builds the set containing the locations of [e]. *)
  let of_seq e = LMap.of_seq (Seq.map (fun loc -> (loc, ())) e)
end
module LAssoc = struct
(** Association list from locations to values. Lists built with [of_alist]
    or [of_list] are sorted with [compare]. *)
type 'a t = (GLoc.t * 'a) list

(** Compares two bindings by location only ([GLoc.compare]); the attached
    values are ignored. *)
let compare (loc_a, _) (loc_b, _) = GLoc.compare loc_a loc_b

(** [of_alist xs] is [xs] sorted with [compare]. *)
let of_alist xs = List.sort xs ~compare

(** [of_list xs ~f] pairs every element [x] of [xs] with its location [f x]
    and sorts the resulting bindings. *)
let of_list xs ~f =
  xs
  |> List.map ~f:(fun x -> (f x, x))
  |> of_alist

(** [to_alist xs] exposes the underlying list, unchanged. *)
let to_alist xs = xs

(** [filter xs ~f] keeps the bindings [(loc, value)] for which [f loc value]
    holds, in their original order. *)
let filter xs ~f =
  let keep (loc, value) = f loc value in
  List.filter xs ~f:keep
(** [fold_neighbors xs ys ~init ~f] returns, for every binding
    [(x_loc, x_val)] of [xs] and in the same order, the pair [(x_loc, r)]
    where [r] is obtained by starting from [init x_loc x_val] and applying
    [f y_loc y_val acc] to each binding of [ys] whose location satisfies
    [GLoc.intersects y_loc x_loc].

    Both lists are expected to be sorted by location, as [of_alist]
    produces them; [ys] is then swept only once across all of [xs].

    Fix over the previous version: the scan of [ys] for a given [x] used to
    stop at the first non-intersecting element. A short interval lying
    before [x] but sorted after a long interval overlapping [x] therefore
    hid every later neighbour. For example, with [ys] at 0-100, 10-20 and
    50-60 and [x] at 40-70, the element 50-60 was never visited. Such
    elements are now skipped, and the scan only stops on the first element
    that neither intersects [x] nor lies strictly before it.

    NOTE(review): assumes [GLoc.strictly_before a b] means "[a] ends before
    [b] starts, or is on an earlier chromosome" and that [GLoc.compare]
    orders by chromosome then start position -- confirm against [GLoc]. *)
let fold_neighbors xs ys ~init ~f =
  (* Drops the leading elements of [ys] located strictly before [x_loc]:
     they cannot be neighbours of the current x nor of any later one. *)
  let rec drop_until ys x_loc =
    match ys with
    | (y_loc, _) :: tail_ys when GLoc.strictly_before y_loc x_loc ->
      drop_until tail_ys x_loc
    | _ -> ys
  in
  (* Folds [f] over the elements of [ys] intersecting [x_loc]. *)
  let rec neighbor_loop x_loc ys acc =
    match ys with
    | [] -> acc
    | (y_loc, y_val) :: tail_ys ->
      if GLoc.intersects y_loc x_loc then
        neighbor_loop x_loc tail_ys (f y_loc y_val acc)
      else if GLoc.strictly_before y_loc x_loc then
        (* [y] is entirely before [x] but was kept in [ys] because an
           earlier, longer element still overlaps: skip it and go on. *)
        neighbor_loop x_loc tail_ys acc
      else
        (* [y] is past [x]; since [ys] is sorted, so is everything after. *)
        acc
  in
  let rec main_loop acc xs ys =
    match xs with
    | [] -> List.rev acc
    | (x_loc, x_val) :: xs_tail ->
      let ys = drop_until ys x_loc in
      let r = neighbor_loop x_loc ys (init x_loc x_val) in
      main_loop ((x_loc, r) :: acc) xs_tail ys
  in
  main_loop [] xs ys
(* Fixture for the inline test below: five locations, four on chromosome
   "a" (three of them mutually overlapping) and one on "b", each carrying
   the value 1. *)
let example1 =
  List.map
    [ ("a", 0, 4) ; ("a", 40, 400) ; ("a", 90, 110) ; ("a", 91, 92) ; ("b", 91, 92) ]
    ~f:(fun (chr, lo, hi) -> (GLoc.{ chr ; lo ; hi }, 1))

(* Counting, for each location, its neighbours in the very same list must
   give 11 in total. *)
let%test "fold_neighbors_1" =
  let counts =
    fold_neighbors example1 example1
      ~init:(fun _ _ -> 0)
      ~f:(fun _ _ acc -> acc + 1)
  in
  let total = List.fold counts ~init:0 ~f:(fun sum (_, d) -> sum + d) in
  Int.equal total 11
(** Distance and ordering on genomic locations; this is the parameter of
    the [Matching] functor. *)
module type Topology = sig
(** Locations are plain [GLoc.t] values. *)
type t = GLoc.t
(** [dist u v] is the distance between [u] and [v], or [None] when no
    distance is defined for the pair (for instance [Point_topology] returns
    [None] for locations on different chromosomes). *)
val dist : t -> t -> int option
(** Ordering on locations; [Matching] uses it to decide which of two
    locations that cannot be matched comes first. *)
val compare : t -> t -> int
end
(** Topology in which locations are treated as intervals: distance and
    ordering are delegated to [GLoc.dist] and [GLoc.compare]. *)
module Interval_topology = struct
  type t = GLoc.t

  let dist u v = GLoc.dist u v

  let compare u v = GLoc.compare u v
end
(** Topology in which every location is reduced to its midpoint. *)
module Point_topology = struct
  type t = GLoc.t

  (** Integer midpoint of a location: [(hi + lo) / 2]. *)
  let midpoint (u : GLoc.t) = (u.hi + u.lo) / 2

  (** [dist u v] is the absolute difference between the two midpoints when
      [u] and [v] are on the same chromosome, [None] otherwise. *)
  let dist (u : GLoc.t) (v : GLoc.t) =
    match String.equal u.chr v.chr with
    | true -> Some (Int.abs (midpoint u - midpoint v))
    | false -> None

  (** The pair (chromosome name, midpoint) of a location. *)
  let midloc (u : GLoc.t) = (u.chr, midpoint u)

  (** Orders locations by chromosome name first, then by midpoint. *)
  let compare (u : GLoc.t) (v : GLoc.t) =
    let chr_u, mid_u = midloc u
    and chr_v, mid_v = midloc v in
    match String.compare chr_u chr_v with
    | 0 -> Int.compare mid_u mid_v
    | c -> c
end
(** Score of a partial matching: [value] is incremented by one for each
    matched pair and [weight] accumulates the distances of those pairs. *)
module Score = struct
  type t = {
    value : int ;
    weight : int ;
  }

  (** A score is greater when its [value] is greater or, for equal
      [value]s, when its [weight] is smaller. *)
  let compare x y =
    match Int.compare x.value y.value with
    | 0 -> Int.compare y.weight x.weight
    | c -> c

  let%test "matching_score_compare" =
    compare { value = 3 ; weight = 6 } { value = 2 ; weight = 3 } > 0

  (** [gt x y] holds when [x] is strictly greater than [y] for [compare]. *)
  let gt x y = 0 < compare x y

  (** Score of the matching with no pair. *)
  let zero = { value = 0 ; weight = 0 }

  (** [add_match s d] is [s] with one more matched pair, at distance [d]. *)
  let add_match s d =
    let { value ; weight } = s in
    { value = value + 1 ; weight = weight + d }
end
(** Pairwise matching of two arrays of located items, computed by a
    memoised recursion over pairs of indices [(i, j)] -- the last item of
    each array still to be processed. Partial solutions are ranked with
    [Score.compare]. *)
module Matching(T : Topology) = struct
  (** Decision taken at a cell [(i, j)]: emit [xs.(i)] alone ([Left]), emit
      [ys.(j)] alone ([Right]) or pair the two ([Match]). *)
  type choice = Left | Match | Right

  (** Score reached at a cell together with the decision leading to it. *)
  type trace = {
    score : Score.t ;
    choice : choice ;
  }

  (** Distance between two items, read from their locations only. *)
  let dist (u, _) (v, _) = T.dist u v

  (** [is_before a b] holds when [a]'s location is not after [b]'s
      according to [T.compare]. *)
  let is_before (u, _) (v, _) = T.compare u v <= 0

  (** [score_step max_dist xs ys score_rec i j] computes the trace of cell
      [(i, j)], reading the neighbouring cells through [score_rec] (the
      memoised version of this very function). A negative index means that
      the corresponding array is exhausted. *)
  let score_step max_dist xs ys score_rec i j =
    if i < 0 then { score = Score.zero ; choice = Right }
    else if j < 0 then { score = Score.zero ; choice = Left }
    else
      let best i j = (score_rec i j).score in
      (* When xs.(i) and ys.(j) cannot be paired, the later of the two is
         emitted alone. *)
      let skip_last () =
        if is_before xs.(i) ys.(j) then
          { score = best i (j - 1) ; choice = Right }
        else
          { score = best (i - 1) j ; choice = Left }
      in
      match dist xs.(i) ys.(j) with
      | Some d when d <= max_dist ->
        begin
          let left_score = best (i - 1) j in
          let right_score = best i (j - 1) in
          let match_score = Score.add_match (best (i - 1) (j - 1)) d in
          let left_beats_match = Score.gt left_score match_score in
          let left_beats_right = Score.gt left_score right_score in
          let match_beats_right = Score.gt match_score right_score in
          if left_beats_match && left_beats_right then
            { score = left_score ; choice = Left }
          else if (not left_beats_match) && match_beats_right then
            { score = match_score ; choice = Match }
          else if (not left_beats_right) && not match_beats_right then
            { score = right_score ; choice = Right }
          else
            (* The two remaining combinations would form a cycle in the
               strict order [Score.gt]. *)
            assert false
        end
      | Some _ | None -> skip_last ()

  (** [memo_rec ff] ties the recursive knot of the open-recursive function
      [ff] and caches each result in a hash table keyed by the pair of
      arguments, so every [(x, y)] is computed at most once. *)
  let memo_rec ff =
    let cache = Stdlib.Hashtbl.create 0 in
    let rec f x y =
      let key = (x, y) in
      match Stdlib.Hashtbl.find_opt cache key with
      | Some v -> v
      | None ->
        let v = ff f x y in
        Stdlib.Hashtbl.add cache key v ;
        v
    in
    f

  (** [matching ~max_dist xs ys] walks back from the last cell to the first
      one following the recorded choices. Each item of [xs] and [ys] appears
      exactly once in the result, as [`Left x], [`Right y] or
      [`Match (x, y)], listed from the first items to the last. Two items
      are paired only when their distance is defined and does not exceed
      [max_dist]. *)
  let matching ~max_dist xs ys =
    let trace_at = memo_rec (score_step max_dist xs ys) in
    let rec backtrack i j acc =
      if i < 0 && j < 0 then acc
      else
        match (trace_at i j).choice with
        | Left -> backtrack (i - 1) j (`Left xs.(i) :: acc)
        | Right -> backtrack i (j - 1) (`Right ys.(j) :: acc)
        | Match -> backtrack (i - 1) (j - 1) (`Match (xs.(i), ys.(j)) :: acc)
    in
    backtrack (Array.length xs - 1) (Array.length ys - 1) []
end
(** [matching ~mode ~max_dist xs ys] matches the items of [xs] with those
    of [ys] using the [Matching] functor. [mode] selects the topology:
    [`Interval] uses [Interval_topology] and [`Point] uses
    [Point_topology]. The result lists every item once, as [`Left x],
    [`Right y] or [`Match (x, y)]. *)
let matching ~mode ~max_dist xs ys =
  let topology : (module Topology) =
    match mode with
    | `Interval -> (module Interval_topology)
    | `Point -> (module Point_topology)
  in
  let module T = (val topology) in
  let module M = Matching(T) in
  let xs = Array.of_list xs in
  let ys = Array.of_list ys in
  M.matching ~max_dist xs ys
(** Result of [matching] on items that carry no payload (unit). The type
    exists to derive s-expression converters; the inline tests use
    [sexp_of_matching] to print results. *)
type matching =
[ `Match of (GLoc.t * unit) * (GLoc.t * unit)
| `Left of (GLoc.t * unit)
| `Right of (GLoc.t * unit)] list
[@@deriving sexp]
let%test_module "MATCHING" = (module struct
  (* Structural equality on matching results, taken explicitly from
     [Stdlib]. *)
  let ( = ) = Stdlib.( = )

  (* Payload-free item located at [lo, hi] on a single test chromosome. *)
  let loc lo hi = (GLoc.{ chr = "chr" ; lo ; hi }, ())

  let i1 = loc 1 3
  let i2 = loc 2 3
  let i3 = loc 7 8
  let i4 = loc 3 4
  let i5 = loc 12 15

  (* Debugging helper: prints a matching as an s-expression. *)
  let _print_match m =
    let rendered = Sexp.to_string_hum (sexp_of_matching m) in
    print_endline rendered

  let%test "matching_1" =
    let result = matching ~mode:`Point ~max_dist:10 [] [] in
    result = []

  let%test "matching_2" =
    let result = matching ~mode:`Point ~max_dist:10 [i1] [] in
    result = [`Left i1]

  let%test "matching_3" =
    let result = matching ~mode:`Point ~max_dist:10 [i1] [i2] in
    result = [`Match (i1, i2)]

  let%test "matching_4" =
    let result = matching ~mode:`Interval ~max_dist:10 [i1;i3] [i2] in
    result = [`Match (i1, i2) ; `Left i3]

  let%test "matching_5" =
    let result = matching ~mode:`Point ~max_dist:10 [i1;i3] [i2] in
    result = [`Match (i1, i2) ; `Left i3]

  let%test "matching_6" =
    let result = matching ~mode:`Interval ~max_dist:10 [i1;i4;i5] [i2;i3] in
    result = [`Match (i1, i2) ; `Match (i4, i3) ; `Left i5]
end)
end