Source file lvt.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
(*
   Multiport memories for Hardcaml
   ===============================

   The basic memory primitive provide by Hardcaml has one write (wr) and
   one read (rd) port.  The write is synchronous and the read is asynchronous.
   By adding a register to either the read address or output data we can
   generate standard read-before-write and write-before-read synchronous
   memories.  The read and write clocks can be from different domains.

   A simple way to build a multi-port memory is out of simple registers
   This is, of course, very inefficient.  One restriction that
   still exists is that each write port must be from the same clock
   domain.  I (think) the reads ports could potentially be in differing
   clock domains.

   LVT
   ===

   To do a bit better we can instead build multi-port memories using
   a Live Value Table (LVT).  The idea here is in 3 steps;

   1] To make N rd ports, we replicate a 1 rd ram N times each written
   with the same data.  Note each replicated RAM will contain exactly
   the same data.  The replication provides N access ports to this data.
   Call this memory_nrd.

   2] To make M wr ports we initially replicate memory_nrd M times.
   Each write port is connected to one memory_nrd instance.  Note that
   each memory_nrd bank will this time contain different data.
   We now have M * N ram outputs to select between to for the N read
   ports.

   3] Build the live value table.  This tracks the bank which was most
   recently written for each address in the RAM.  This is built as
   a multi-port memory itself, specifically using the simple register
   scheme described before.  The outputs of the LVT (one for each read
   port) selects the bank built in step 2 that contains the most
   recently written value for a particular address.

   Consider 2 read, 4 write ports on a 256 element x 32 bit memory.
   Using the pure register scheme we will need to generate 256x32
   register bits, write port selection logic at the input to each
   register, and 2 256x32 muxes to select the read data.

   With the LVT scheme we still need a register based multiport
   memory - in this case 2 read, 4 write ports on a 256 element x 2 bit
   memory.  Note the change here - we went from a 32 bit memory (to
   store the data) to a 2 bit memory (to store the index of the latest
   write bank ie it's of width log2(number of write ports).  In this
   case we have save approx 16x logic resources.

   Of course we also need 2*4 256 x 32 memories (with 1 read and 1 write
   port each) to store the data.
*)

open! Base
open Hardcaml
open Signal

module type Config = sig
  val abits : int
  val dbits : int
  val size : int
end

module Wr = struct
  type 'a t =
    { we : 'a
    ; wa : 'a
    ; d : 'a
    }
  [@@deriving hardcaml]
end

module Rd = struct
  type 'a t =
    { re : 'a
    ; ra : 'a
    }
  [@@deriving hardcaml]
end

(* fallthrough = wbr (write before read) *)
type mode =
  [ `async_rbw
  | `async_wbr
  | `sync_rbw
  | `sync_wbr
  ]

let is_sync = function
  | `sync_wbr | `sync_rbw -> true
  | _ -> false
;;

let is_async m = not (is_sync m)

let is_rbw = function
  | `sync_rbw | `async_rbw -> true
  | _ -> false
;;

let is_wbr m = not (is_rbw m)

type wr_port =
  { wr : t Wr.t
  ; ram_spec : Reg_spec.t
  ; reg_spec : Reg_spec.t
  }

type rd_port =
  { rd : t Rd.t
  ; reg_spec : Reg_spec.t
  ; mode : mode
  }

module Ports (C : Config) = struct
  module Wr = struct
    include Wr

    let bits = { we = 1; wa = C.abits; d = C.dbits }
    let port_names_and_widths = zip port_names bits
  end

  module Rd = struct
    include Rd

    let bits = { re = 1; ra = C.abits }
    let port_names_and_widths = zip port_names bits
  end
end

module Multiport_regs (C : Config) = struct
  (* async read memories with multiple read and write ports, implemented as registers *)

  open C
  include Ports (C)
  open Wr
  open Rd

  let pri =
    tree ~arity:2 ~f:(function
      | [ a ] -> a
      | [ (s0, d0); (s1, d1) ] -> s1 |: s0, mux2 s1 d1 d0
      | _ -> empty, empty)
  ;;

  let reg_we_enable ~we ~wa =
    select (binary_to_onehot wa) (size - 1) 0 &: mux2 we (ones size) (zero size)
  ;;

  let memory_nwr_array ~(wr : wr_port array) =
    let reg_spec = wr.(0).reg_spec in
    let wr = List.map ~f:(fun wr -> wr.wr) @@ Array.to_list wr in
    let we1h = List.map ~f:(fun wr -> reg_we_enable ~we:wr.we ~wa:wr.wa) wr in
    Array.to_list
    @@ Array.init size ~f:(fun elt ->
         let wed = List.map2_exn ~f:(fun we1h wr -> bit we1h elt, wr.d) we1h wr in
         let we, d = pri wed in
         (* last d with write enable set *)
         let r = reg reg_spec ~enable:we d in
         we, d, r)
  ;;

  (* n write, n read ports *)
  let memory ~(wr : wr_port array) ~(rd : rd_port array) =
    let base = memory_nwr_array ~wr in
    Array.init (Array.length rd) ~f:(fun i ->
      let reg_spec = rd.(i).reg_spec in
      let mr = List.map ~f:(fun (we, d, r) -> mux2 we d r) base in
      let r = List.map ~f:(fun (_, _, r) -> r) base in
      match rd.(i).mode with
      | `async_wbr -> mux rd.(i).rd.ra mr
      | `async_rbw -> mux rd.(i).rd.ra r
      | `sync_wbr -> mux (reg reg_spec ~enable:rd.(i).rd.re rd.(i).rd.ra) r
      | `sync_rbw -> reg reg_spec ~enable:rd.(i).rd.re (mux rd.(i).rd.ra r))
  ;;
end

module Make (C : Config) = struct
  include Ports (C)
  open Wr
  open Rd

  (* compatibility shim *)
  let memory ram_spec size ~we ~wa ~d ~ra =
    memory
      size
      ~write_port:
        { write_clock = Reg_spec.clock ram_spec
        ; write_enable = we
        ; write_address = wa
        ; write_data = d
        }
      ~read_address:ra
  ;;

  let memory_1rd ~wr ~rd =
    let ram_spec = wr.ram_spec in
    let reg_spec = rd.reg_spec in
    let mode = rd.mode in
    let wr, rd = wr.wr, rd.rd in
    match mode with
    | `sync_rbw ->
      reg
        reg_spec
        ~enable:rd.re
        (memory ram_spec C.size ~we:wr.we ~wa:wr.wa ~d:wr.d ~ra:rd.ra)
    | `sync_wbr ->
      memory
        ram_spec
        C.size
        ~we:wr.we
        ~wa:wr.wa
        ~d:wr.d
        ~ra:(reg reg_spec ~enable:rd.re rd.ra)
    | `async_rbw -> memory ram_spec C.size ~we:wr.we ~wa:wr.wa ~d:wr.d ~ra:rd.ra
    | `async_wbr ->
      mux2
        (wr.we &: (wr.wa ==: rd.ra))
        wr.d
        (memory ram_spec C.size ~we:wr.we ~wa:wr.wa ~d:wr.d ~ra:rd.ra)
  ;;

  let memory_nrd ~wr ~rd =
    let nrd = Array.length rd in
    Array.init nrd ~f:(fun i -> memory_1rd ~wr ~rd:rd.(i))
  ;;

  let memory ~wr ~rd =
    let nwr, nrd = Array.length wr, Array.length rd in
    (* create the live value table *)
    let module Lvt_cfg = struct
      let abits = C.abits
      let dbits = Int.ceil_log2 nwr
      let size = C.size
    end
    in
    let module Lvt = Multiport_regs (Lvt_cfg) in
    let lvt =
      if nwr = 1
      then [||]
      else (
        let lvt_wr =
          Array.init nwr ~f:(fun i ->
            { (wr.(i)) with wr = { wr.(i).wr with d = of_int ~width:Lvt_cfg.dbits i } })
        in
        Lvt.memory ~wr:lvt_wr ~rd)
    in
    (* create the memory banks *)
    let mem = Array.init nwr ~f:(fun i -> memory_nrd ~wr:wr.(i) ~rd) in
    (* select the correct memory bank *)
    Array.init nrd ~f:(fun rd ->
      let mem = Array.init nwr ~f:(fun wr -> mem.(wr).(rd)) in
      if nwr = 1 then mem.(0) else mux lvt.(rd) (Array.to_list mem))
  ;;
end

module Make_wren (C : Config) = struct
  module Wr = struct
    include Wr

    let bits = { we = C.dbits; wa = C.abits; d = C.dbits }
    let port_names_and_widths = zip port_names bits
  end

  module Rd = struct
    include Rd

    let bits = { re = 1; ra = C.abits }
    let port_names_and_widths = zip port_names bits
  end

  module L = Make (C)

  let get_layout wrnets =
    let runs list =
      let rec f acc prev l =
        match prev, l with
        | None, [] -> []
        | None, h :: t -> f acc (Some (h, 1)) t
        | Some (prev, run), [] -> List.rev ((prev, run) :: acc)
        | Some (prev, run), h :: t ->
          if Poly.equal prev h
          then f acc (Some (prev, run + 1)) t
          else f ((prev, run) :: acc) (Some (h, 1)) t
      in
      f [] None list
    in
    let transpose a =
      let i0 = Array.length a in
      let i1 = Array.length a.(0) in
      Array.init i1 ~f:(fun i1 -> Array.init i0 ~f:(fun i0 -> a.(i0).(i1)))
    in
    let rec starts pos = function
      | [] -> []
      | (h, r) :: t -> (h, pos, r) :: starts (pos + r) t
    in
    starts 0 @@ runs @@ Array.to_list @@ transpose wrnets
  ;;

  let memory ~layout =
    let layout = get_layout layout in
    let memory ~wr ~rd =
      let nrd = Array.length rd in
      let concat l =
        Array.init nrd ~f:(fun i -> concat_lsb @@ List.map ~f:(fun x -> x.(i)) l)
      in
      concat
      @@ List.map
           ~f:(fun (_, n, bits) ->
             let sel_wr wr =
               { wr with
                 wr =
                   { wr.wr with
                     Wr.we = select wr.wr.Wr.we n n
                   ; d = select wr.wr.Wr.d (n + bits - 1) n
                   }
               }
             in
             let wr = Array.map ~f:sel_wr wr in
             L.memory ~wr ~rd)
           layout
    in
    memory
  ;;
end