Source file character_intf.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
(** Signature of a character parser.

    It includes [Interfaces.FULL_PARSER] where an expectation is a pair of a
    string and an optional indentation expectation. In addition it offers
    queries for the current position and functions to run the parser on
    strings and input channels.
*)
module type CHARACTER_PARSER =
sig
    include Interfaces.FULL_PARSER
        with type expect = string * Indent.expectation option
    (** @inline *)


    (** {1 Position Information} *)

    val position: t -> Position.t
    (** [position p] The current position in the input stream.

        Can be called at any time.
    *)


    val line:   t -> int
    (** [line p] The current line in the input stream.

        Can be called at any time.
    *)


    val column: t -> int
    (** [column p] The current column in the input stream.

        Can be called at any time.
    *)


    val byte_column: t -> int
    (** [byte_column p] The current byte column in the input stream.

        Can be called at any time.
    *)



    (** {1 Run the Parser on Streams} *)

    val run_on_string: string -> t -> t
    (** [run_on_string str p] Run the parser [p] on the string [str] and return
        the resulting parser. *)


    val run_on_string_at: int -> string -> t -> int * t
    (** [run_on_string_at start str p] Run the parser [p] on the string [str]
        starting at index [start]. Return the index next to be pushed in
        together with the resulting parser. *)

    val run_on_channel: in_channel -> t -> t
    (** [run_on_channel ic p] Run the parser [p] on the input channel [ic] and
        return the resulting parser. *)
end



(** Signature of the combinator which expects the end of input. *)
module type END_OF_INPUT_COMBINATOR =
sig
    type _ t
    (** Type of a combinator. The type parameter is the type of the value
        returned in case of success. *)

    val expect_end: 'a -> 'a t
    (** [expect_end a] Expect the end of the token stream.

        In case of success return [a].

        In case of failure return a syntax error with the expectation "end of
        input".

        {b CAUTION}: There is usually no need to use this combinator! This
        combinator is needed only for partial parsers.

        {b Never ever} backtrack over this combinator.
    *)
end







(** Signature of the combinators which parse base64 encodings. *)
module type BASE_64_COMBINATORS =
sig
    type _ t


    val base64: (string -> 'r) -> (string -> 'r -> 'r) -> 'r t
    (** [base64 start next] Parse a base64 encoding into an object of type ['r].


        A base64 encoding is a sequence of zero or more base64 characters
        (A-Za-z0-9+/) grouped into sequences of 4 characters and optionally
        padded with the character [=]. Each group of 2-4 base64 characters is
        decoded into a string of 1-3 bytes.

        [start] gets the first 1-3 bytes and [next] gets each subsequent group
        of 1-3 bytes, together with the object built so far, until the end of
        the encoding is reached.
    *)


    val string_of_base64: string t
    (** Parse a base64 encoding and decode it into a string. *)
end


(** Signature of the lexer combinator. *)
module type LEXER_COMBINATOR =
sig
    type _ t

    val lexer: 'a t -> 'tok ->'tok t -> (Position.range * 'tok) t
    (** [lexer whitespace end_token tok]

        A lexer combinator. It returns the recognized token together with its
        range in the input.

        - The [whitespace] combinator recognizes a possibly empty sequence of
        whitespace (usually blanks, tabs, newlines, comments, ...).

        - [end_token] is a token which the lexer returns when it has successfully
        consumed the end of input.

        - [tok] is a combinator recognizing tokens
        (usually [tok1 </> tok2 </> ... </> tokn]).

        The lexer combinator recognizes tokens in an input stream of the form
        {v
           WS Token WS Token .... WS EOF
        v}

        Note: If a combinator fails to recognize a token after having
        consumed some input, then the subsequent combinators are not used
        anymore as alternatives. Therefore if there are tokens which can begin
        with the same prefix, then it is necessary to make the recognition of
        the common prefixes backtrackable in all but the last combinator
        recognizing a token with the same prefix. The same applies to whitespace
        if part of the whitespace can begin like a token.

        Examples:
        - comment: "// ...."
        - division operator: "/"

        In this case the recognition of at least the first slash of the comment
        has to be backtrackable.
    *)
end






(** Signature of the combinators which give access to position information. *)
module type LOCATION_COMBINATORS =
sig
    type _ t

    val located: 'a t -> 'a Located.t t
    (** [located p] Parse [p] and return its result together with its start and
        end position.

        Note: If [p] removes whitespace at the end, the returned end position is
        at the end of the whitespace. This is not what you usually want.
        Therefore first parse the essential part with [located] and then remove
        the whitespace.
    *)


    val position: Position.t t
    (** Return the current position in the file. *)
end






(** Signature of the combinators which handle indentation and alignment. *)
module type INDENTATION_COMBINATORS =
sig
    type _ t



    (** The indentation of a normal construct is the indentation of its leftmost
        token.  The indentation of a vertically aligned construct is the
        indentation of its first token.
    *)


    val indent: int -> 'a t -> 'a t
    (** [indent i p] Indent [p] by [i] columns relative to its parent.

        Precondition: [0 <= i]

        The indentation of [p] is defined by the indentation of its first token.
        The first token has to be indented at least [i] columns relative to the
        parent of [p]. After the first token of [p] has been parsed
        successfully, all subsequent tokens must have at least the same
        indentation.

        Note: Indentation of [p] relative to its parent only makes sense if the
        first token of [p] is not the first token of its parent! I.e. the parent
        of [p] should have consumed at least one token before the parsing of [p]
        starts.
    *)


    (** CAUTION WITH ALIGNMENT !!

        If you want to align a certain number of constructs vertically it is {e
        mandatory} to indent the whole block of constructs. Do not indent the
        individual items to be aligned. Indent the whole block.

        Reason: The parent of the block usually has already consumed some token
        and the indentation of a construct is the position of the leftmost
        token. If you don't indent the aligned block, then it will be aligned
        with the leftmost token of the parent construct. This is usually not
        intended and a common pitfall. Any indentation, e.g. zero indentation,
        is ok.
    *)

    val align: 'a t -> 'a t
    (** [align p]

        Use the start position of the first token of [p] to align it with other
        constructs. If [p] does not consume any token, then [align p] has no
        effect.

        Alignment makes sense if there are at least two combinators which
        are aligned and indented. E.g. suppose there are two combinators [p] and
        [q]. Then we can form
        {[
        indent 1 (
                let* a = align p in
                let* b = align q in
                return (a,b)
        )
        ]}

        This combinator parses [p] whose first token has to be indented at least
        one column relative to its parent. And then it parses [q] whose first
        token must be aligned with the first token of [p].

        The indentation decouples the alignment of [p] and [q] from other
        aligned siblings or parents. [indent 0 ...] can be used to make the
        indentation optional.
    *)


    val left_align: 'a t -> 'a t
    (** [left_align p]

        Align a construct described by [p] at its leftmost possible column. If a
        whole block of constructs has to be vertically left aligned, then it is
        important that at least the first construct is left aligned. The
        subsequent constructs will be aligned exactly vertically. For the
        subsequent constructs [left_align] has the same effect as {!align}.
    *)


    val detach: 'a t -> 'a t
    (** [detach p] Parse [p] without any indentation and alignment restrictions.

        Detachment is needed to parse whitespace. The whitespace at the
        beginning of a line never satisfies any nontrivial indentation or
        alignment requirements.
    *)

end






(** Signature of the basic combinators which parse characters, strings,
    letters, digits and words. *)
module type CHARACTER_COMBINATORS =
sig
    type _ t


    val charp: (char -> bool) -> string -> char t
    (** [charp p expect] Parse a character which satisfies the predicate [p].

        In case of failure, report the failed expectation [expect].
    *)


    val range: char -> char -> char t
    (** [range c1 c2] Parse a character in the range between [c1] and [c2], i.e.
        a character [c] which satisfies [c1 <= c && c <= c2]. *)



    val char: char -> char t
    (** [char c] Parse the character [c]. *)


    val one_of_chars: string -> string -> char t
    (** [one_of_chars str expect]

        Parse one of the characters in the string [str]. In case of failure,
        report the failed expectation [expect].
    *)


    val string: string -> string t
    (** [string str] Parse the string [str]. *)


    val uppercase_letter: char t
    (** Parse an uppercase letter. *)


    val lowercase_letter: char t
    (** Parse a lowercase letter. *)


    val letter: char t
    (** Parse a letter. *)


    val digit_char: char t
    (** Parse a digit [0..9] and return it as a character. *)


    val digit: int t
    (** Parse a digit and return it as a number. *)


    val word: (char -> bool) -> (char -> bool) -> string -> string t
    (** [word first inner error]

        Parse a word which starts with a character satisfying the predicate
        [first] followed by zero or more characters satisfying the predicate
        [inner]. In case of failure add the expectation [error].
    *)


    val hex_uppercase: int t
    (** Equivalent to [range 'A' 'F'] with the result converted to the
        corresponding number between [10] and [15]. *)


    val hex_lowercase: int t
    (** Equivalent to [range 'a' 'f'] with the result converted to the
        corresponding number between [10] and [15]. *)



    val hex_digit: int t
    (** Parse a hexadecimal digit and return the corresponding number between
        [0] and [15]. *)

end



(** Signature of the functions which make a parser from a combinator. *)
module type MAKE_FINAL_COMBINATORS =
sig
    type _ t
    (** Type of a combinator. *)

    type state
    (** Type of the state in which a parser is started. *)

    type final
    (** Result type of the combinator from which a parser is made. *)

    type parser
    (** Type of the parser made by {!make} and {!make_partial}. *)

    val make: state -> final t -> parser
    (** [make state c]

        Make a parser which starts in state [state] and parses a construct
        defined by the combinator [c]. The token stream must be ended by
        [put_end], otherwise the parse won't succeed.

        {b CAUTION}: [c] must not be a combinator containing [expect_end].
        Moreover it must not have been constructed by {!lexer}.
    *)


    val make_partial: Position.t -> state -> final t -> parser
    (** [make_partial pos state c]

        Make a parser which analyzes a part of the input stream.
        The parser starts at position [pos] in state [state] and
        parses a construct defined by the combinator [c]. The parser can succeed
        even if no end token has been pushed into the parser.
    *)
end