Source file bytes_intf.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
open! Import

(** Interface for Unicode encodings, such as UTF-8.

    The signature operates directly on [bytes]: [t] is destructively
    substituted ([type t := bytes]), so ascribing or including this signature
    does not introduce a new type named [t]. *)
module type Utf = sig
  type t := bytes

  (** [set t pos uchar] writes the Unicode character [uchar] into [t] at
      position [pos] using this encoding.

      NOTE(review): the meaning of the returned [int] is not stated in this
      interface; it is presumably the number of bytes written -- confirm
      against the implementation before relying on it. *)
  val set : t -> int -> Uchar0.t -> int
end

module type Bytes = sig
  (** OCaml's byte sequence type, semantically similar to a [char array], but
      taking less space in memory.

      A byte sequence is a mutable data structure that contains a fixed-length
      sequence of bytes (of type [char]). Each byte can be indexed in constant
      time for reading or writing. *)

  open! Import

  (* The declarations between the [deriving_inline] attribute and the [end]
     marker below are the expansion of the listed derivers; regenerate them
     with the ppx rather than editing them by hand. *)
  type t = bytes [@@deriving_inline globalize, sexp, sexp_grammar]

  val globalize : t -> t

  include Sexplib0.Sexpable.S with type t := t

  val t_sexp_grammar : t Sexplib0.Sexp_grammar.t

  [@@@end]

  (** {1 Common Interfaces} *)

  include Blit.S with type t := t
  include Comparable.S with type t := t
  include Ppx_compare_lib.Comparable.S_local with type t := t
  include Ppx_compare_lib.Equal.S_local with type t := t
  include Stringable.S with type t := t

  (** Note that [pp] allocates in order to preserve the state of the byte
      sequence it was initially called with. *)
  include Pretty_printer.S with type t := t

  include Invariant.S with type t := t

  (** Extraction of a [string] from a (sub-range of a) byte sequence. *)
  module To_string : sig
    val sub : (t, string) Blit.sub
    val subo : (t, string) Blit.subo
  end

  (** Blitting with a [string] as the source and a byte sequence as the
      destination. *)
  module From_string : Blit.S_distinct with type src := string and type dst := t

  (** [create len] returns a newly-allocated and uninitialized byte sequence of
      length [len].  No guarantees are made about the contents of the return
      value. *)
  val create : int -> t

  (** [create_local] is like [create], but returns a stack-allocated [Bytes.t]. *)
  val create_local : int -> t

  (** [make len c] returns a newly-allocated byte sequence of length [len] filled
      with the byte [c]. *)
  val make : int -> char -> t

  (** [map t ~f] applies function [f] to every byte of [t], in order, and builds
      a byte sequence from the results returned by [f]. *)
  val map : t -> f:(char -> char) -> t

  (** Like [map], but passes each character's index to [f] along with the char. *)
  val mapi : t -> f:(int -> char -> char) -> t

  (** [copy t] returns a newly-allocated byte sequence that contains the same
      bytes as [t]. *)
  val copy : t -> t

  (** [init len ~f] returns a newly-allocated byte sequence of length [len] with
      index [i] in the sequence being initialized with the result of [f i]. *)
  val init : int -> f:(int -> char) -> t

  (** [of_char_list l] returns a newly-allocated byte sequence where each byte in
      the sequence corresponds to the byte in [l] at the same index. *)
  val of_char_list : char list -> t

  (** [length t] returns the number of bytes in [t]. *)
  external length : (t[@local_opt]) -> int = "%bytes_length"

  (** [get t i] returns the [i]th byte of [t]. *)
  val get : t -> int -> char

  (** [unsafe_get t i] returns the [i]th byte of [t] via the compiler primitive
      ["%bytes_unsafe_get"].

      NOTE(review): as the primitive name suggests, this presumably performs no
      bounds check, so the caller must guarantee that [i] is a valid index --
      confirm. *)
  external unsafe_get : (t[@local_opt]) -> (int[@local_opt]) -> char = "%bytes_unsafe_get"

  (** [set t i c] sets the [i]th byte of [t] to [c]. *)
  external set
    :  (t[@local_opt])
    -> (int[@local_opt])
    -> (char[@local_opt])
    -> unit
    = "%bytes_safe_set"

  (** [unsafe_set t i c] sets the [i]th byte of [t] to [c] via the compiler
      primitive ["%bytes_unsafe_set"].

      NOTE(review): presumably the unchecked counterpart of [set]; the caller
      must guarantee that [i] is a valid index -- confirm. *)
  external unsafe_set
    :  (t[@local_opt])
    -> (int[@local_opt])
    -> (char[@local_opt])
    -> unit
    = "%bytes_unsafe_set"

  (** {2 Unsafe multi-byte accessors}

      The six externals below bind the compiler primitives for reading and
      writing 16-, 32- and 64-bit integers inside a byte sequence.

      NOTE(review): the [int] argument is presumably a byte offset, the
      accesses are presumably not bounds-checked, and the byte order is not
      specified in this interface -- confirm all three before relying on
      them. *)

  (** [unsafe_get_int64 t pos] reads an [int64] from [t] at [pos]. *)
  external unsafe_get_int64
    :  (t[@local_opt])
    -> (int[@local_opt])
    -> int64
    = "%caml_bytes_get64u"

  (** [unsafe_set_int64 t pos v] writes the [int64] [v] into [t] at [pos]. *)
  external unsafe_set_int64
    :  (t[@local_opt])
    -> (int[@local_opt])
    -> (int64[@local_opt])
    -> unit
    = "%caml_bytes_set64u"

  (** [unsafe_get_int32 t pos] reads an [int32] from [t] at [pos]. *)
  external unsafe_get_int32
    :  (t[@local_opt])
    -> (int[@local_opt])
    -> int32
    = "%caml_bytes_get32u"

  (** [unsafe_set_int32 t pos v] writes the [int32] [v] into [t] at [pos]. *)
  external unsafe_set_int32
    :  (t[@local_opt])
    -> (int[@local_opt])
    -> (int32[@local_opt])
    -> unit
    = "%caml_bytes_set32u"

  (** [unsafe_get_int16 t pos] reads a 16-bit value from [t] at [pos] and
      returns it as an [int]. *)
  external unsafe_get_int16
    :  (t[@local_opt])
    -> (int[@local_opt])
    -> int
    = "%caml_bytes_get16u"

  (** [unsafe_set_int16 t pos v] writes [v], given as an [int], into [t] at
      [pos] as a 16-bit value. *)
  external unsafe_set_int16
    :  (t[@local_opt])
    -> (int[@local_opt])
    -> (int[@local_opt])
    -> unit
    = "%caml_bytes_set16u"

  (** [fill t ~pos ~len c] modifies [t] in place, replacing the [len] bytes
      starting at index [pos] with [c]. *)
  val fill : t -> pos:int -> len:int -> char -> unit

  (** [tr ~target ~replacement t] modifies [t] in place, replacing every instance
      of [target] in [t] with [replacement]. *)
  val tr : target:char -> replacement:char -> t -> unit

  (** [tr_multi ~target ~replacement] returns a staged function that, once
      unstaged and applied to a byte sequence, modifies it in place, replacing
      every instance of a character in [target] with the corresponding character
      in [replacement].

      If [replacement] is shorter than [target], it is lengthened by repeating
      its last character. Empty [replacement] is illegal unless [target] also is.

      If [target] contains multiple copies of the same character, the last
      corresponding [replacement] character is used. Note that character ranges
      are {b not} supported, so [~target:"a-z"] means the literal characters ['a'],
      ['-'], and ['z']. *)
  val tr_multi : target:string -> replacement:string -> (t -> unit) Staged.t

  (** [to_list t] returns the bytes in [t] as a list of chars. *)
  val to_list : t -> char list

  (** [to_array t] returns the bytes in [t] as an array of chars. *)
  val to_array : t -> char array

  (** [fold t ~init ~f] is [f (... (f (f init t1) t2) ...) tn], where
      [t1 ... tn] are the bytes of [t] in increasing index order. The
      accumulator is the first argument of [f]. *)
  val fold : t -> init:'acc -> f:('acc -> char -> 'acc) -> 'acc

  (** [foldi] works similarly to [fold], but also passes the index of each
      character to [f], as its first argument. *)
  val foldi : t -> init:'acc -> f:(int -> 'acc -> char -> 'acc) -> 'acc

  (** [contains ?pos ?len t c] returns [true] iff [c] appears among the [len]
      bytes of [t] starting at index [pos].

      NOTE(review): the defaults of [pos] and [len] are not stated in this
      interface; presumably they select the whole of [t] -- confirm. *)
  val contains : ?pos:int -> ?len:int -> t -> char -> bool

  (** Maximum length of a byte sequence, which is architecture-dependent.  Attempting to
      create a [Bytes] larger than this will raise an exception. *)
  val max_length : int

  (** {2:unsafe Unsafe conversions (for advanced users)}

      This section describes unsafe, low-level conversion functions between
      [bytes] and [string]. They might not copy the internal data; used
      improperly, they can break the immutability invariant on strings provided
      by the [-safe-string] option. They are available for expert library
      authors, but for most purposes you should use the always-correct
      {!Bytes.to_string} and {!Bytes.of_string} instead.
  *)

  (** Unsafely convert a byte sequence into a string.

      To reason about the use of [unsafe_to_string], it is convenient to
      consider an "ownership" discipline. A piece of code that
      manipulates some data "owns" it; there are several disjoint ownership
      modes, including:
      {ul
      {- Unique ownership: the data may be accessed and mutated}
      {- Shared ownership: the data has several owners, that may only
      access it, not mutate it.}}
      Unique ownership is linear: passing the data to another piece of
      code means giving up ownership (we cannot access the
      data again). A unique owner may decide to make the data shared
      (giving up mutation rights on it), but shared data may not become
      uniquely-owned again.
      [unsafe_to_string s] can only be used when the caller owns the byte
      sequence [s] -- either uniquely or as shared immutable data. The
      caller gives up ownership of [s], and gains (the same mode of) ownership
      of the returned string.
      There are two valid use-cases that respect this ownership
      discipline:
      {ol
      {- The first is creating a string by initializing and mutating a byte
      sequence that is never changed after initialization is performed.
      {[
        let string_init len f : string =
          let s = Bytes.create len in
          for i = 0 to len - 1 do Bytes.set s i (f i) done;
          Bytes.unsafe_to_string ~no_mutation_while_string_reachable:s
      ]}
      This function is safe because the byte sequence [s] will never be
      accessed or mutated after [unsafe_to_string] is called. The
      [string_init] code gives up ownership of [s], and returns the
      ownership of the resulting string to its caller.

      Note that it would be unsafe if [s] was passed as an additional
      parameter to the function [f] as it could escape this way and be
      mutated in the future -- [string_init] would give up ownership of
      [s] to pass it to [f], and could not call [unsafe_to_string]
      safely.

      We have provided the {!String.init}, {!String.map} and
      {!String.mapi} functions to cover most cases of building
      new strings. You should prefer those over [to_string] or
      [unsafe_to_string] whenever applicable.}
      {- The second is temporarily giving ownership of a byte sequence to
      a function that expects a uniquely owned string and returns ownership
      back, so that we can mutate the sequence again after the call ended.
      {[
        let bytes_length (s : bytes) =
          String.length
            (Bytes.unsafe_to_string ~no_mutation_while_string_reachable:s)
      ]}
      In this use-case, we do not promise that [s] will never be mutated
      after the call to [bytes_length s]. The {!String.length} function
      temporarily borrows unique ownership of the byte sequence
      (and sees it as a [string]), but returns this ownership back to
      the caller, which may assume that [s] is still a valid byte
      sequence after the call. Note that this is only correct because we
      know that {!String.length} does not capture its argument -- it could
      escape by a side-channel such as a memoization combinator.
      The caller may not mutate [s] while the string is borrowed (it has
      temporarily given up ownership). This affects concurrent programs,
      but also higher-order functions: if {!String.length} returned
      a closure to be called later, [s] should not be mutated until this
      closure is fully applied and returns ownership.}}
  *)
  external unsafe_to_string
    :  no_mutation_while_string_reachable:(t[@local_opt])
    -> (string[@local_opt])
    = "%bytes_to_string"

  (** Unsafely convert a shared string to a byte sequence that should
      not be mutated.

      The same ownership discipline that makes [unsafe_to_string]
      correct applies to [unsafe_of_string_promise_no_mutation],
      however unique ownership of string values is extremely difficult
      to reason about correctly in practice. As such, one should always
      assume strings are shared, never uniquely owned. (For example,
      string literals are implicitly shared by the compiler, so you
      never uniquely own them.)

      The only case we have reasonable confidence is safe is if the
      produced [bytes] is shared -- used as an immutable byte
      sequence. This is possibly useful for incremental migration of
      low-level programs that manipulate immutable sequences of bytes
      (for example {!Marshal.from_bytes}) and previously used the
      [string] type for this purpose.
  *)
  external unsafe_of_string_promise_no_mutation
    :  (string[@local_opt])
    -> (t[@local_opt])
    = "%bytes_of_string"

  (** UTF-8 encoding. See [Utf] interface. *)
  module Utf8 : Utf

  (** UTF-16 little-endian encoding. See [Utf] interface. *)
  module Utf16le : Utf

  (** UTF-16 big-endian encoding. See [Utf] interface. *)
  module Utf16be : Utf

  (** UTF-32 little-endian encoding. See [Utf] interface. *)
  module Utf32le : Utf

  (** UTF-32 big-endian encoding. See [Utf] interface. *)
  module Utf32be : Utf

  (** Re-export of the [Utf] module type, so that it can be named through any
      module implementing this signature. *)
  module type Utf = Utf
end