Source file qcow_recycler.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
(* Securely erase and then recycle clusters *)

(* Log source shared by this module. It is registered under the name "qcow"
   and its reporting level is forced to Info at creation time. *)
let src =
  let log_source =
    Logs.Src.create ~doc:"qcow2-formatted BLOCK device" "qcow"
  in
  Logs.Src.set_level log_source (Some Logs.Info) ;
  log_source

(* Logging functions (Log.debug, Log.info, ...) bound to the source above. *)
module Log = (val Logs.src_log src : Logs.LOG)

open Qcow_types

(* [value <| bits] shifts the 64-bit [value] left by [bits] positions. *)
let ( <| ) value bits = Int64.shift_left value bits

(* [value |> bits] shifts the 64-bit [value] right (arithmetic shift) by
   [bits] positions.
   WARNING: from this point on in the file this definition shadows the
   standard reverse-application (pipe) operator of the same name. *)
let ( |> ) value bits = Int64.shift_right value bits

module Cache = Qcow_cache
module Error = Qcow_error
module Locks = Qcow_locks
module Metadata = Qcow_metadata
module Physical = Qcow_physical

module Make (B : Qcow_s.RESIZABLE_BLOCK) = struct
  (* State of the block recycler. A value is built by [create]; the cluster
     map is attached afterwards with [set_cluster_map] and every operation
     that needs it asserts that it is present. *)
  type t = {
      base: B.t (* underlying block device: read, written, flushed, resized *)
    ; sector_size: int
          (* size of one sector of [base] in bytes; used to turn byte offsets
             into sector numbers *)
    ; cluster_bits: int (* a cluster is [1 lsl cluster_bits] bytes long *)
    ; mutable cluster_map: Qcow_cluster_map.t option (* free/ used space map *)
    ; cache: Cache.t
          (* cluster cache; only consulted here by a debug assertion after a
             copy *)
    ; locks: Locks.t (* per-cluster read/write locks *)
    ; metadata: Metadata.t
          (* used to rewrite references inside metadata clusters *)
    ; zero_buffer: Cstruct.t
          (* zero-filled buffer allocated by [create]; written over clusters
             by [erase] *)
    ; mutable background_thread: unit Lwt.t
          (* initialised to an already-resolved promise by [create];
             NOTE(review): presumably replaced when the background thread is
             started -- confirm in the rest of the file *)
    ; mutable need_to_flush: bool
          (* set when a flush should be performed by the background flusher *)
    ; need_to_flush_c: unit Lwt_condition.t
          (* signalled whenever [need_to_flush] is set *)
    ; flush_m: Lwt_mutex.t
          (* held by [flush] and while references are being rewritten, so the
             two never interleave *)
    ; runtime_asserts: bool
          (* NOTE(review): not read in the visible part of the file;
             presumably enables extra consistency checks -- confirm *)
  }

  (* Build a recycler over [base]. The result has no cluster map yet (see
     [set_cluster_map]), no pending flush request and an already-resolved
     background thread. *)
  let create ~base ~sector_size ~cluster_bits ~cache ~locks ~metadata
      ~runtime_asserts =
    (* 256 pages, i.e. 1 MiB assuming 4 KiB pages, explicitly zero-filled so
       that it can be used as the data source when erasing clusters. *)
    let zero_buffer = Io_page.to_cstruct (Io_page.get 256) in
    Cstruct.memset zero_buffer 0 ;
    {
      base
    ; sector_size
    ; cluster_bits
    ; cluster_map= None
    ; cache
    ; locks
    ; metadata
    ; zero_buffer
    ; background_thread= Lwt.return_unit
    ; need_to_flush= false
    ; need_to_flush_c= Lwt_condition.create ()
    ; flush_m= Lwt_mutex.create ()
    ; runtime_asserts
    }

  (* Attach the free/used space map to the recycler. This must happen before
     any operation that asserts the presence of the map is called. *)
  let set_cluster_map t cluster_map =
    let attached = Some cluster_map in
    t.cluster_map <- attached

  (* Try to take [n] clusters out of the Available set. On success the chosen
     clusters are moved from the Available state to the Roots state in the
     cluster map and returned; [None] is returned when the take fails.
     Fails an assertion if no cluster map has been attached. *)
  let allocate t n =
    match t.cluster_map with
    | None ->
        assert false
    | Some cluster_map -> (
        let available = Qcow_cluster_map.Available.get cluster_map in
        match Cluster.IntervalSet.take available n with
        | None ->
            None
        | Some (taken, _remaining) ->
            Log.debug (fun f ->
                f "Allocated %a clusters from free list: %a"
                  (Fmt.of_to_string Cluster.to_string) n
                  Cluster.IntervalSet.pp taken
            ) ;
            Qcow_cluster_map.set_cluster_state cluster_map taken
              Qcow_cluster_map.Available Qcow_cluster_map.Roots ;
            Some taken
      )

  (* Copy the contents of cluster [src] to cluster [dst] on the underlying
     device. The caller must already hold the appropriate locks on both
     clusters (see [copy] and [move]).

     Returns [Ok ()] on success, [Error `Disconnected] or
     [Error `Is_read_only] when the device reports those conditions, and
     fails the promise with [Failure] for any other device error.
     After a successful write it asserts that [dst] is a member of the
     Copies set and is not itself being moved; a violation is logged and
     then aborts with an assertion failure. *)
  let copy_already_locked t src dst =
    let src = Cluster.to_int64 src and dst = Cluster.to_int64 dst in
    let cluster_map =
      match t.cluster_map with Some x -> x | None -> assert false
    in
    Log.debug (fun f -> f "Copy cluster %Ld to %Ld" src dst) ;
    let cluster_size = 1 lsl t.cluster_bits in
    (* Number of 4 KiB pages needed to hold one cluster, rounded up. The
       previous [1 lsl (t.cluster_bits - 12)] is a shift by a negative amount
       (unspecified result) for clusters smaller than a page; for
       [cluster_bits >= 12] both expressions give the same value. *)
    let npages = (cluster_size + 4095) / 4096 in
    let pages = Io_page.(to_cstruct @@ get npages) in
    let cluster = Cstruct.sub pages 0 cluster_size in

    let sectors_per_cluster =
      Int64.(div (1L <| t.cluster_bits) (of_int t.sector_size))
    in

    let src_sector = Int64.mul src sectors_per_cluster in
    let dst_sector = Int64.mul dst sectors_per_cluster in
    let open Lwt.Infix in
    B.read t.base src_sector [cluster] >>= function
    | Error `Disconnected ->
        Lwt.return (Error `Disconnected)
    | Error e ->
        Format.kasprintf Lwt.fail_with "Unknown error: %a" B.pp_error e
    | Ok () -> (
        B.write t.base dst_sector [cluster] >>= function
        | Error `Disconnected ->
            Lwt.return (Error `Disconnected)
        | Error `Is_read_only ->
            Lwt.return (Error `Is_read_only)
        | Error e ->
            Format.kasprintf Lwt.fail_with "Unknown error: %a" B.pp_write_error
              e
        | Ok () ->
            let dst' = Cluster.of_int64 dst in
            (* The destination was written behind the cache's back, so it
               must not have a cached copy. *)
            Cache.Debug.assert_not_cached t.cache dst' ;
            (* Sanity check: the destination must be registered as a copy
               target in the cluster map. *)
            if not @@ Qcow_cluster_map.Copies.mem cluster_map dst' then (
              Log.err (fun f ->
                  f "Copy cluster %Ld to %Ld: but %Ld is not Junk" src dst dst
              ) ;
              Qcow_cluster_map.Debug.assert_no_leaked_blocks cluster_map ;
              assert false
            ) ;
            (* Sanity check: the destination must not be the source of
               another move. *)
            if Qcow_cluster_map.is_moving cluster_map dst' then (
              Log.err (fun f ->
                  f "Copy cluster from %Ld to %Ld: but %Ld is also moving" src
                    dst dst
              ) ;
              Qcow_cluster_map.Debug.assert_no_leaked_blocks cluster_map ;
              assert false
            ) ;
            Lwt.return (Ok ())
      )

  (* Copy cluster [src] to cluster [dst], taking a read lock on [src] and
     then a write lock on [dst] for the duration of the copy. The result is
     that of [copy_already_locked]. *)
  let copy t src dst =
    let perform_copy () = copy_already_locked t src dst in
    let with_dst_write_locked () =
      Locks.Write.with_lock t.locks dst perform_copy
    in
    Locks.Read.with_lock t.locks src with_dst_write_locked

  (* Perform the data copy for one [move] (source and destination clusters
     taken from the move record) while holding a read lock on the source and
     a write lock on the destination.

     If the source is no longer marked as moving in the cluster map once the
     locks are held, the move is considered cancelled and nothing is copied.
     Otherwise the cluster is copied and the move state is advanced to
     Copied.

     Returns [Ok ()] on success or cancellation, [Error `Disconnected] or
     [Error `Is_read_only] when the device reports them, and fails the
     promise with [Failure] on any other error. *)
  let move t move =
    let cluster_map =
      match t.cluster_map with Some x -> x | None -> assert false
    in
    let src, dst = Qcow_cluster_map.Move.(move.src, move.dst) in
    Log.debug (fun f ->
        f "move %s -> %s" (Cluster.to_string src) (Cluster.to_string dst)
    ) ;
    let open Lwt.Infix in
    Locks.Read.with_lock t.locks src (fun () ->
        Locks.Write.with_lock t.locks dst (fun () ->
            (* Consider that a discard might have arrived and removed the src
               cluster. *)
            if not (Qcow_cluster_map.is_moving cluster_map src) then (
              Log.info (fun f ->
                  f "Copy of cluster %s prevented: move operation cancelled"
                    (Cluster.to_string src)
              ) ;
              Lwt.return (Ok ())
            ) else (
              copy_already_locked t src dst >>= function
              | Error `Disconnected ->
                  Lwt.return (Error `Disconnected)
              | Error `Is_read_only ->
                  Lwt.return (Error `Is_read_only)
              | Error _ ->
                  (* The message used to misspell the file name, which made
                     it impossible to find by searching for the module. *)
                  Lwt.fail_with "Unknown error in qcow_recycler.ml"
              | Ok () ->
                  Qcow_cluster_map.(set_move_state cluster_map move Copied) ;
                  Lwt.return (Ok ())
            )
        )
    )

  (* Run [move] on every element of [moves], in list order, stopping at the
     first error, which is returned unchanged.

     After each successful move [progress_cb] is called with
     [100 * i / total], where [i] is the zero-based position of the move that
     has just completed and [total] the length of [moves]. The default
     callback ignores the value. *)
  let move_all ?(progress_cb = fun ~percent:_ -> ()) t moves =
    let open Lwt.Infix in
    let total = List.length moves in
    let rec go index pending =
      match pending with
      | [] ->
          Lwt.return (Ok ())
      | next :: rest -> (
          move t next >>= function
          | Ok () ->
              progress_cb ~percent:(100 * index / total) ;
              go (index + 1) rest
          | Error e ->
              Lwt.return_error e
        )
    in
    go 0 moves

  (* Overwrite every cluster of the interval set [remaining] with zeroes by
     writing (a prefix of) [t.zero_buffer] to the underlying device.

     Intervals are processed one after the other; processing stops at the
     first write error, which is returned. No locks are taken: the caller
     must guarantee that nobody else is writing to these clusters.

     Each interval is written in chunks no larger than the zero buffer. The
     chunking is done in bytes rather than in whole clusters so that it also
     terminates when a single cluster is larger than the buffer: with
     cluster-granular chunks the chunk size would be 0 clusters in that case
     and the loop would issue empty writes forever. Whenever the buffer holds
     at least one cluster the writes issued are the same as with
     cluster-granular chunks. *)
  let erase t remaining =
    let open Lwt.Infix in
    let intervals =
      Cluster.IntervalSet.fold (fun i acc -> i :: acc) remaining []
    in
    let buffer_bytes = Int64.of_int (Cstruct.length t.zero_buffer) in
    (* NB: in this file the operator used below is Int64.shift_right, not
       the standard pipe. *)
    let buffer_size_clusters = buffer_bytes |> t.cluster_bits in
    (* Largest number of bytes written at once: a whole number of clusters
       when at least one fits in the buffer, otherwise the whole buffer
       (assumed to be a multiple of the sector size, as the buffer allocated
       by [create] is). *)
    let chunk_bytes =
      if buffer_size_clusters > 0L then
        buffer_size_clusters <| t.cluster_bits
      else
        buffer_bytes
    in
    let sector_size = Int64.of_int t.sector_size in
    let write_zeroes offset len =
      (* Zero [len] bytes starting at byte [offset] of the device *)
      assert (len <= buffer_bytes) ;
      let buf = Cstruct.sub t.zero_buffer 0 (Int64.to_int len) in
      let sector = Int64.div offset sector_size in
      (* No-one else is writing to this cluster so no locking is needed *)
      B.write t.base sector [buf]
    in
    let rec zero_range offset len =
      if len = 0L then
        Lwt.return (Ok ())
      else if len > chunk_bytes then (
        write_zeroes offset chunk_bytes >>= function
        | Error e ->
            Lwt.return (Error e)
        | Ok () ->
            zero_range (Int64.add offset chunk_bytes)
              (Int64.sub len chunk_bytes)
      ) else
        write_zeroes offset len
    in
    Lwt_list.fold_left_s
      (fun acc i ->
        match acc with
        | Error e ->
            Lwt.return (Error e)
        | Ok () ->
            let x, y = Cluster.IntervalSet.Interval.(x i, y i) in
            let x = Cluster.to_int64 x and y = Cluster.to_int64 y in
            (* Intervals are inclusive at both ends *)
            let n = Int64.(succ @@ sub y x) in
            Log.debug (fun f -> f "erasing %Ld clusters (%Ld -> %Ld)" n x y) ;
            zero_range (x <| t.cluster_bits) (n <| t.cluster_bits)
      )
      (Ok ()) intervals

  (* Rewrite the on-disk references of every move whose state is Flushed so
     that they point at the destination cluster instead of the source.

     The moves are first grouped by the cluster that holds the reference to
     their source (as reported by [Qcow_cluster_map.find]); moves whose
     source has no known reference are ignored. Each referring cluster is
     then handled in turn:
     - if its write lock cannot be taken immediately, all the moves of the
       group are cancelled in the cluster map and processing carries on with
       the next group;
     - otherwise, while holding [t.flush_m], the cluster is updated through
       [Metadata.update]: each reference is re-validated, rewritten with the
       flags of the old reference preserved, and the move state is set to
       Referenced.

     On success [t.need_to_flush] is set, the condition is signalled and the
     number of references rewritten is returned as an int64. An error
     produced while processing a group is returned as is and the remaining
     groups are skipped. Exceptions raised while rewriting are re-raised
     after [Qcow_cluster_map.Debug.assert_no_leaked_blocks] has run; an
     inconsistent existing reference aborts with an assertion failure. *)
  let update_references t =
    let cluster_map =
      match t.cluster_map with
      | None ->
          assert false (* by construction, see `make` *)
      | Some x ->
          x
    in
    let open Qcow_cluster_map in
    (* Build a list of moves per referring cluster, so we can take the referring
       cluster lock once, make all the updates and release it. *)
    let flushed' =
      Cluster.Map.fold
        (fun src move acc ->
          assert (src = move.Qcow_cluster_map.move.Qcow_cluster_map.Move.src) ;
          match move.state with
          | Flushed -> (
            match Qcow_cluster_map.find cluster_map src with
            | exception Not_found ->
                acc
            | ref_cluster, _ ->
                let existing =
                  if Cluster.Map.mem ref_cluster acc then
                    Cluster.Map.find ref_cluster acc
                  else
                    []
                in
                Cluster.Map.add ref_cluster (move :: existing) acc
          )
          | _ ->
              acc
        )
        (moves cluster_map) Cluster.Map.empty
    in
    let flushed = Cluster.Map.bindings flushed' in

    (* Number of references actually rewritten, reported to the caller *)
    let nr_updated = ref 0L in
    let open Lwt.Infix in
    (* If I can't acquire a write lock on the metadata cluster then skip
       this update and do it later. *)
    let client =
      Locks.Client.make (fun () -> "Rewriting references after a block copy")
    in
    (* The accumulator [subst] maps the source of every move rewritten so far
       to its destination. *)
    Lwt_list.fold_left_s
      (fun acc (ref_cluster', moves) ->
        match acc with
        | Error e ->
            Lwt.return (Error e)
        | Ok subst -> (
            (* If the referring cluster is itself the source of a move that
               has already been rewritten, use its new location instead. *)
            let ref_cluster =
              try Cluster.Map.find ref_cluster' subst
              with Not_found -> ref_cluster'
            in
            match Locks.Write.try_lock ~client t.locks ref_cluster with
            | None ->
                (* The lock is not available: cancel every move of this group
                   rather than wait. *)
                List.iter
                  (fun {move= {Move.src; dst}; _} ->
                    Log.debug (fun f ->
                        f
                          "Not rewriting reference in %s from %s to %s: \
                           metadata cluster is locked"
                          (Cluster.to_string ref_cluster)
                          (Cluster.to_string src) (Cluster.to_string dst)
                    ) ;
                    cancel_move cluster_map src
                  )
                  moves ;
                Lwt.return (Ok subst)
            | Some lock ->
                (* The lock is released by the finaliser below whatever the
                   outcome of the update. *)
                Lwt.finalize
                  (fun () ->
                    (* The flush function will call complete move for all moves with state Referenced.
                       However these won't actually have hit the disk until Metadata.update returns
                       and the disk write has been performed. *)
                    Lwt_mutex.with_lock t.flush_m (fun () ->
                        Metadata.update ~client t.metadata ref_cluster (fun c ->
                            Log.info (fun f ->
                                f "Updating %d references in cluster %s"
                                  (List.length moves)
                                  (Cluster.to_string ref_cluster)
                            ) ;
                            let addresses = Metadata.Physical.of_contents c in
                            try
                              let result =
                                List.fold_left
                                  (fun acc ({move= {Move.src; dst}; _} as move) ->
                                    match acc with
                                    | Error e ->
                                        Error e
                                    | Ok subst -> (
                                      (* Look the reference up again: the map
                                         may have changed since the moves were
                                         grouped. *)
                                      match
                                        Qcow_cluster_map.find cluster_map src
                                      with
                                      | exception Not_found ->
                                          (* Block was probably discarded after we started running. *)
                                          Log.warn (fun f ->
                                              f
                                                "Not copying cluster %s to %s: \
                                                 %s has been discarded"
                                                (Cluster.to_string src)
                                                (Cluster.to_string dst)
                                                (Cluster.to_string src)
                                          ) ;
                                          Ok subst
                                      | ref_cluster', ref_cluster_within ->
                                          if ref_cluster' <> ref_cluster then (
                                            (* The reference now lives in a
                                               different cluster from the one
                                               being updated: leave it alone. *)
                                            Log.info (fun f ->
                                                f
                                                  "Reference to %s moved from \
                                                   %s:%d to %s:%d"
                                                  (Cluster.to_string src)
                                                  (Cluster.to_string ref_cluster)
                                                  ref_cluster_within
                                                  (Cluster.to_string
                                                     ref_cluster'
                                                  )
                                                  ref_cluster_within
                                            ) ;
                                            Ok subst
                                          ) else if
                                              not
                                                (Cluster.Map.mem src
                                                   (Qcow_cluster_map.moves
                                                      cluster_map
                                                   )
                                                )
                                            then (
                                            (* The move is no longer recorded
                                               in the cluster map. *)
                                            Log.debug (fun f ->
                                                f
                                                  "Not rewriting reference in \
                                                   %s :%d from %s to %s: move \
                                                   as been cancelled"
                                                  (Cluster.to_string ref_cluster)
                                                  ref_cluster_within
                                                  (Cluster.to_string src)
                                                  (Cluster.to_string dst)
                                            ) ;
                                            Ok subst
                                          ) else
                                            (* Read the current value in the referencing cluster as a sanity check *)
                                            let old_reference =
                                              Metadata.Physical.get addresses
                                                ref_cluster_within
                                            in
                                            let old_cluster =
                                              Qcow_physical.cluster
                                                ~cluster_bits:t.cluster_bits
                                                old_reference
                                            in
                                            if old_cluster <> src then (
                                              Log.err (fun f ->
                                                  f
                                                    "Rewriting reference in %s \
                                                     :%d from %s to %s, old \
                                                     reference actually \
                                                     pointing to %s"
                                                    (Cluster.to_string
                                                       ref_cluster
                                                    )
                                                    ref_cluster_within
                                                    (Cluster.to_string src)
                                                    (Cluster.to_string dst)
                                                    (Cluster.to_string
                                                       old_cluster
                                                    )
                                              ) ;
                                              assert false
                                            ) ;
                                            Log.debug (fun f ->
                                                f
                                                  "Rewriting reference in %s \
                                                   :%d from %s to %s"
                                                  (Cluster.to_string ref_cluster)
                                                  ref_cluster_within
                                                  (Cluster.to_string src)
                                                  (Cluster.to_string dst)
                                            ) ;
                                            (* Preserve any flags but update the pointer *)
                                            let dst' =
                                              Cluster.to_int dst
                                              lsl t.cluster_bits
                                            in
                                            let new_reference =
                                              Qcow_physical.make
                                                ~is_mutable:
                                                  (Qcow_physical.is_mutable
                                                     old_reference
                                                  )
                                                ~is_compressed:
                                                  (Qcow_physical.is_compressed
                                                     old_reference
                                                  )
                                                dst'
                                            in
                                            set_move_state cluster_map move.move
                                              Referenced ;
                                            Metadata.Physical.set addresses
                                              ref_cluster_within new_reference ;
                                            nr_updated := Int64.succ !nr_updated ;
                                            (* The move cannot be cancelled now that the metadata has
                                               been updated. *)
                                            Ok (Cluster.Map.add src dst subst)
                                    )
                                  )
                                  (Ok subst) moves
                              in
                              match result with
                              | Error e ->
                                  Lwt.return (Error e)
                              | Ok subst ->
                                  (* If `ref_cluster` is an L1 table entry then `src` must be an
                                     L2 block, and the values in `cluster_map.refs` will point to it.
                                     These need to be redirected to `dst` otherwise the `cluster_map`
                                     will be out-of-sync. This only happens because we bypass the
                                     `Metadata.Physical.set` function in the block copier. *)
                                  if
                                    Qcow_cluster_map.is_immovable cluster_map
                                      ref_cluster
                                  then (
                                    Log.info (fun f ->
                                        f
                                          "Cluster %s is L1: we must remap L2 \
                                           references"
                                          (Cluster.to_string ref_cluster)
                                    ) ;
                                    Qcow_cluster_map.update_references
                                      cluster_map subst
                                  ) ;
                                  Lwt.return (Ok subst)
                            with
                            | Error.Duplicate_reference
                                ((c, w), (c', w'), (target : int64)) as e ->
                                (* Log every move that was being processed,
                                   run the duplicate-reference diagnostics,
                                   then fail with the original exception. *)
                                Log.err (fun f ->
                                    f
                                      "Duplicate_reference during \
                                       update_references of %s"
                                      (String.concat ", "
                                      @@ List.map
                                           Qcow_cluster_map.string_of_move
                                      @@ List.concat
                                      @@ List.map snd flushed
                                      )
                                ) ;
                                let open Error.Lwt_write_error.Infix in
                                Qcow_debug.on_duplicate_reference t.metadata
                                  cluster_map ~cluster_bits:t.cluster_bits
                                  (c, w) (c', w') target
                                >>= fun () ->
                                Qcow_cluster_map.Debug.assert_no_leaked_blocks
                                  cluster_map ;
                                Lwt.fail e
                            | e ->
                                Qcow_cluster_map.Debug.assert_no_leaked_blocks
                                  cluster_map ;
                                raise e
                        )
                    )
                  )
                  (fun () -> Locks.unlock lock ; Lwt.return_unit)
          )
      )
      (Ok Cluster.Map.empty) flushed
    >>= function
    | Ok _subst ->
        (* Ask the background flusher to run so that the rewritten
           references can be completed. *)
        t.need_to_flush <- true ;
        Lwt_condition.signal t.need_to_flush_c () ;
        Lwt.return (Ok !nr_updated)
    | Error e ->
        Lwt.return (Error e)

  (* Flush the underlying device and then advance the recycler state machine.

     While holding [t.flush_m], the set of Erased clusters and the table of
     moves are snapshotted, the device is flushed, and then, using only the
     snapshots:
     - every move in state Copied becomes Flushed;
     - every move in state Referenced is completed;
     - the snapshotted Erased clusters become Available.
     A summary is logged when at least one of these did something. A flush
     error from the device is returned unchanged and nothing is advanced. *)
  let flush t =
    let open Qcow_cluster_map in
    let cluster_map =
      match t.cluster_map with
      | Some m ->
          m
      | None ->
          assert false (* set_cluster_map must have been called first *)
    in
    let open Lwt.Infix in
    (* Advance a single move and keep count of what was done: the pair is
       (copies newly flushed, copies newly completed). *)
    let advance (flushed, completed) (entry : move) =
      match entry.state with
      | Copied ->
          Qcow_cluster_map.set_move_state cluster_map entry.move Flushed ;
          (flushed + 1, completed)
      | Referenced ->
          Qcow_cluster_map.complete_move cluster_map entry.move ;
          (flushed, completed + 1)
      | Copying | Flushed ->
          (* nothing to do for these states *)
          (flushed, completed)
    in
    (* The mutex is needed because both the user and the background flusher
       thread may call this function concurrently. *)
    Lwt_mutex.with_lock t.flush_m (fun () ->
        (* Snapshots taken before the flush: only what existed at this point
           is guaranteed to be on disk afterwards, so state that appears
           later must not be advanced. *)
        let erased = Qcow_cluster_map.Erased.get cluster_map in
        let snapshot = Qcow_cluster_map.moves cluster_map in
        B.flush t.base >>= function
        | Error e ->
            Lwt.return (Error e)
        | Ok () ->
            let nr_flushed, nr_completed =
              Cluster.Map.fold
                (fun _src entry counts -> advance counts entry)
                snapshot (0, 0)
            in
            let nr_erased =
              Cluster.to_int (Cluster.IntervalSet.cardinal erased)
            in
            Qcow_cluster_map.set_cluster_state cluster_map erased Erased
              Available ;
            let nothing_happened =
              nr_flushed = 0 && nr_completed = 0 && nr_erased = 0
            in
            if not nothing_happened then (
              Log.info (fun f ->
                  f
                    "block recycler: %d cluster copies flushed; %d cluster \
                     copies complete; %d clusters erased"
                    nr_flushed nr_completed nr_erased
              ) ;
              Log.info (fun f ->
                  f "block recycler: flush: %s"
                    (Qcow_cluster_map.to_summary_string cluster_map)
              )
            ) ;
            Lwt.return (Ok ())
    )

  let start_background_thread t ~keep_erased ?compact_after_unmaps () =
    let th, _ = Lwt.task () in
    Lwt.on_cancel th (fun () ->
        Log.info (fun f -> f "cancellation of block recycler not implemented")
    ) ;
    let cluster_map =
      match t.cluster_map with Some x -> x | None -> assert false
    in
    Log.info (fun f ->
        f "block recycler starting with keep_erased = %Ld" keep_erased
    ) ;
    let open Lwt.Infix in
    let rec background_flusher () =
      let rec wait () =
        match t.need_to_flush with
        | true ->
            Lwt.return_unit
        | false ->
            Lwt_condition.wait t.need_to_flush_c >>= fun () -> wait ()
      in
      wait () >>= fun () ->
      t.need_to_flush <- false ;
      Mirage_sleep.ns 5_000_000_000L >>= fun () ->
      Log.info (fun f ->
          f "block recycler: triggering background flush: %s"
            (Qcow_cluster_map.to_summary_string cluster_map)
      ) ;
      flush t >>= function
      | Error _ ->
          Log.err (fun f -> f "block recycler: flush failed") ;
          Lwt.return_unit
      | Ok () ->
          background_flusher ()
    in
    Lwt.async background_flusher ;

    let last_block = ref (Qcow_cluster_map.get_last_block cluster_map) in
    let rec wait_for_work () =
      let junk = Qcow_cluster_map.Junk.get cluster_map in
      let nr_junk = Cluster.to_int64 @@ Cluster.IntervalSet.cardinal junk in
      let erased = Qcow_cluster_map.Erased.get cluster_map in
      let nr_erased = Cluster.to_int64 @@ Cluster.IntervalSet.cardinal erased in
      let available = Qcow_cluster_map.Available.get cluster_map in
      let nr_available =
        Cluster.to_int64 @@ Cluster.IntervalSet.cardinal available
      in
      (* Apply the threshold to the total clusters erased, which includes those
         marked as available *)
      let total_erased = Int64.add nr_erased nr_available in
      (* Prioritise cluster reuse because it's more efficient not to have to
         move a cluster at all U*)
      let highest_priority =
        if total_erased < keep_erased && nr_junk > 0L then
          (* Take some of the junk and erase it *)
          let n =
            Cluster.of_int64 @@ min nr_junk (Int64.sub keep_erased total_erased)
          in
          if Cluster.IntervalSet.cardinal junk < n then
            None
          else
            Some (`Erase n)
        else
          None
      in
      (* If we need to update references, do that next *)
      let moves = Qcow_cluster_map.moves cluster_map in
      let middle_priority =
        let flushed =
          Cluster.Map.fold
            (fun _src move acc ->
              match move.Qcow_cluster_map.state with
              | Qcow_cluster_map.Flushed ->
                  true
              | _ ->
                  acc
            )
            moves false
        in
        if flushed then Some `Update_references else None
      in
      ( match (highest_priority, middle_priority, compact_after_unmaps) with
      | Some x, _, _ ->
          Lwt.return (Some x)
      | _, Some x, _ ->
          Lwt.return (Some x)
      | None, _, Some x when x < nr_junk ->
          if not (Cluster.Map.is_empty moves) then
            Lwt.return None
          else (
            (* Wait for the junk data to stabilise before starting to copy *)
            Log.info (fun f ->
                f
                  "Discards (%Ld) over threshold (%Ld): waiting for discards \
                   to finish before beginning compaction"
                  nr_junk x
            ) ;
            let rec wait nr_junk n =
              Mirage_sleep.ns 5_000_000_000L >>= fun () ->
              let nr_junk' =
                Cluster.to_int64
                @@ Cluster.IntervalSet.cardinal
                @@ Qcow_cluster_map.Junk.get cluster_map
              in
              if nr_junk = nr_junk' then (
                Log.info (fun f ->
                    f "Discards have finished, %Ld clusters have been discarded"
                      nr_junk
                ) ;
                Lwt.return ()
              ) else (
                if n mod 60 = 0 then
                  Log.info (fun f ->
                      f "Total discards %Ld, still waiting" nr_junk'
                  ) ;
                wait nr_junk' (n + 1)
              )
            in
            wait nr_junk 0 >>= fun () -> Lwt.return (Some `Junk)
          )
      | _ ->
          let last_block' = Qcow_cluster_map.get_last_block cluster_map in
          let result =
            if last_block' < !last_block then Some `Resize else None
          in
          last_block := last_block' ;
          Lwt.return result
      )
      >>= function
      | None ->
          Qcow_cluster_map.wait cluster_map >>= fun () -> wait_for_work ()
      | Some work ->
          Lwt.return work
    in

    (* [resize ()] truncates the cluster map and the underlying block device
       so that they end one cluster after the last block currently reported
       by the cluster map. The whole operation runs under the metadata lock.
       The returned thread fails with [Failure] if the device refuses to
       resize. *)
    let resize () =
      Locks.with_metadata_lock t.locks (fun () ->
          (* [get_last_block] is an index, so the cluster count is one more *)
          let new_last_block =
            1 + (Cluster.to_int @@ Qcow_cluster_map.get_last_block cluster_map)
          in
          Log.info (fun f ->
              f "block recycler: resize to %d clusters" new_last_block
          ) ;
          (* New physical size in bytes, then expressed in sectors (for the
             device) and in clusters (for the cluster map). *)
          let new_size = Physical.make (new_last_block lsl t.cluster_bits) in
          let sector = Physical.sector ~sector_size:t.sector_size new_size in
          let cluster =
            Physical.cluster ~cluster_bits:t.cluster_bits new_size
          in
          (* The cluster map is resized before the device itself. *)
          Qcow_cluster_map.resize cluster_map cluster ;
          B.resize t.base sector >>= function
          | Error e ->
              (* Keep the underlying cause in the failure message instead of
                 discarding it. *)
              Format.kasprintf Lwt.fail_with "resize: %a" B.pp_write_error e
          | Ok () ->
              (* Report the sector count actually passed to [B.resize]; the
                 previous message printed the byte size labelled as sectors. *)
              Log.debug (fun f ->
                  f "Resized device to %Ld sectors of size %d" sector
                    t.sector_size
              ) ;
              Lwt.return_unit
      )
    in
    (* Main loop of the background recycler. Each iteration requests a flush,
       blocks in [wait_for_work] until there is a job, dispatches the job to
       one of the helpers below and then starts over. Every branch either
       re-enters [loop] or fails the thread. *)
    let rec loop () =
      (* trigger a flush later *)
      t.need_to_flush <- true ;
      Lwt_condition.signal t.need_to_flush_c () ;
      wait_for_work () >>= fun work ->
      match work with
      | `Resize ->
          resize () >>= loop
      | `Update_references ->
          refresh_references ()
      | `Junk ->
          compact_junk ()
      | `Erase n ->
          erase_some n
    (* Take [n] clusters from the junk set and erase them. While the erase is
       running the clusters are held in the Roots state; they become Erased
       on success and are handed back to Junk if anything fails. *)
    and erase_some n =
      let junk = Qcow_cluster_map.Junk.get cluster_map in
      match Cluster.IntervalSet.take junk n with
      | None ->
          loop ()
      | Some (to_erase, _) ->
          Log.debug (fun f ->
              let count = Cluster.IntervalSet.cardinal to_erase in
              f "block recycler: should erase %s clusters"
                (Cluster.to_string count)
          ) ;
          Qcow_cluster_map.(set_cluster_state cluster_map to_erase Junk Roots) ;
          let attempt () =
            erase t to_erase >>= fun result ->
            match result with
            | Ok () ->
                Qcow_cluster_map.(
                  set_cluster_state cluster_map to_erase Roots Erased
                ) ;
                Lwt.return_unit
            | Error e ->
                Format.kasprintf Lwt.fail_with "%a" B.pp_write_error e
          in
          (* Undo the Junk -> Roots transition, then propagate the failure *)
          let rollback exn =
            Qcow_cluster_map.(
              set_cluster_state cluster_map to_erase Roots Junk
            ) ;
            Lwt.fail exn
          in
          Lwt.catch attempt rollback >>= loop
    (* Start a new round of cluster moves, perform them all, then resize. *)
    and compact_junk () =
      if t.runtime_asserts then
        Qcow_cluster_map.Debug.assert_no_leaked_blocks cluster_map ;
      (* Starting new moves while others are still in progress could move the
         same block twice, possibly to different locations, so insist that
         the move table is empty. *)
      assert (Cluster.Map.is_empty (Qcow_cluster_map.moves cluster_map)) ;
      (* Count the junk before [start_moves] is called *)
      let nr_junk =
        Qcow_cluster_map.Junk.get cluster_map
        |> Cluster.IntervalSet.cardinal
        |> Cluster.to_int64
      in
      let moves = Qcow_cluster_map.start_moves cluster_map in
      Log.info (fun f ->
          f "block recycler: %Ld clusters are junk, %d moves are possible"
            nr_junk (List.length moves)
      ) ;
      Qcow_error.Lwt_write_error.or_fail_with (move_all t moves) >>= fun () ->
      resize () >>= loop
    (* Run [update_references] and log how many references were updated; any
       error it reports fails the thread. *)
    and refresh_references () =
      Log.info (fun f ->
          f "block recycler: need to update references to blocks"
      ) ;
      update_references t >>= fun outcome ->
      match outcome with
      | Ok nr_updated ->
          Log.info (fun f ->
              f "block recycler: %Ld block references updated" nr_updated
          ) ;
          loop ()
      | Error `Is_read_only ->
          Lwt.fail_with "Is_read_only"
      | Error `Disconnected ->
          Lwt.fail_with "Disconnected"
      | Error (`Msg msg) ->
          Lwt.fail_with msg
    in

    Lwt.async loop ;
    t.background_thread <- th
end