Source file lexer.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
open Let_syntax.Result

type token =
  | OPEN_PAREN
  | CLOSE_PAREN
  | OPEN_BRACE
  | CLOSE_BRACE
  | OPEN_BRACKET
  | CLOSE_BRACKET
  | COLON
  | COMMA
  | TRUE
  | FALSE
  | NULL
  | FLOAT of string
  | INT_OR_FLOAT of string
  | INT of string
  | STRING of string
  | IDENTIFIER_NAME of string
  | EOF

let pp_token ppf =
  let ps = Format.pp_print_string ppf in
  let pf = Format.fprintf ppf in
  function
  | OPEN_PAREN -> ps "'('"
  | CLOSE_PAREN -> ps "')'"
  | OPEN_BRACE -> ps "'{'"
  | CLOSE_BRACE -> ps "'}'"
  | OPEN_BRACKET -> ps "'['"
  | CLOSE_BRACKET -> ps "']'"
  | COLON -> ps "':'"
  | COMMA -> ps "','"
  | TRUE -> ps "'true'"
  | FALSE -> ps "'false'"
  | NULL -> ps "'null'"
  | FLOAT s -> pf "FLOAT %S" s
  | INT_OR_FLOAT s -> pf "INT_OR_STRING %S" s
  | INT s -> pf "INT %S" s
  | STRING s -> pf "%S" s
  | IDENTIFIER_NAME s -> pf "IDENTIFIER_NAME %S" s
  | EOF -> ps "EOF"

let lexer_error lexbuf =
  let pos_start, _pos_end = Sedlexing.lexing_positions lexbuf in
  let location = Errors.string_of_position pos_start in
  let msg =
    Printf.sprintf "%s: Unexpected character '%s'" location
      (Sedlexing.Utf8.lexeme lexbuf)
  in
  Error msg

let source_character = [%sedlex.regexp? any]
let line_terminator = [%sedlex.regexp? 0x000A | 0x000D | 0x2028 | 0x2029]

let line_terminator_sequence =
  [%sedlex.regexp? 0x000A | 0x000D, Opt 0x000A | 0x2028 | 0x2029]

(* NUMBERS, 7.8.3 *)
let non_zero_digit = [%sedlex.regexp? '1' .. '9']
let decimal_digit = [%sedlex.regexp? '0' .. '9']
let decimal_digits = [%sedlex.regexp? Plus decimal_digit]
let hex_digit = [%sedlex.regexp? '0' .. '9' | 'a' .. 'f' | 'A' .. 'F']
let exponent_indicator = [%sedlex.regexp? 'e' | 'E']

let signed_integer =
  [%sedlex.regexp? decimal_digits | '+', decimal_digits | '-', decimal_digits]

let exponent_part = [%sedlex.regexp? exponent_indicator, signed_integer]

let decimal_integer_literal =
  [%sedlex.regexp? '0' | non_zero_digit, Opt decimal_digits]

let hex_integer_literal =
  [%sedlex.regexp? "0x", Plus hex_digit | "0X", Plus hex_digit]

(* float *)
let float_literal =
  [%sedlex.regexp?
    ( decimal_integer_literal, '.', Opt decimal_digits, Opt exponent_part
    | '.', decimal_digits, Opt exponent_part )]

let json5_float =
  [%sedlex.regexp? float_literal | '+', float_literal | '-', float_literal]

(* int_or_float *)
let int_or_float_literal =
  [%sedlex.regexp? decimal_integer_literal, Opt exponent_part]

let json5_int_or_float =
  [%sedlex.regexp?
    int_or_float_literal | '+', int_or_float_literal | '-', int_or_float_literal]

(* int/hex *)
let int_literal =
  [%sedlex.regexp? decimal_digits | '+', decimal_digits | '-', decimal_digits]

let json5_int =
  [%sedlex.regexp?
    ( hex_integer_literal
    | '+', hex_integer_literal
    | '-', hex_integer_literal
    | int_literal )]

(* STRING LITERALS, 7.8.4 *)
let unicode_escape_sequence =
  [%sedlex.regexp? 'u', hex_digit, hex_digit, hex_digit, hex_digit]

let single_escape_character = [%sedlex.regexp? Chars {|'"\\bfnrtv|}]

let escape_character =
  [%sedlex.regexp? single_escape_character | decimal_digit | 'x' | 'u']

let non_escape_character =
  [%sedlex.regexp? Sub (source_character, (escape_character | line_terminator))]

let character_escape_sequence =
  [%sedlex.regexp? single_escape_character | non_escape_character]

let line_continuation = [%sedlex.regexp? '\\', line_terminator_sequence]
let hex_escape_sequence = [%sedlex.regexp? 'x', hex_digit, hex_digit]

let escape_sequence =
  [%sedlex.regexp?
    ( character_escape_sequence
    | '0', Opt (decimal_digit, decimal_digit)
    | hex_escape_sequence | unicode_escape_sequence )]

let single_string_character =
  [%sedlex.regexp?
    ( Sub (source_character, ('\'' | '\\' | line_terminator))
    | '\\', escape_sequence
    | line_continuation )]

let double_string_character =
  [%sedlex.regexp?
    ( Sub (source_character, ('"' | '\\' | line_terminator))
    | '\\', escape_sequence
    | line_continuation )]

let string_literal =
  [%sedlex.regexp?
    ( '"', Star double_string_character, '"'
    | '\'', Star single_string_character, '\'' )]

(* IDENTIFIER_NAME (keys in objects) *)
let unicode_combining_mark = [%sedlex.regexp? mn | mc]
let unicode_digit = [%sedlex.regexp? nd]
let unicode_connector_punctuation = [%sedlex.regexp? pc]
let unicode_letter = [%sedlex.regexp? lu | ll | lt | lm | lo | nl]
let zwnj = [%sedlex.regexp? 0x200C]
let zwj = [%sedlex.regexp? 0x200D]

let identifier_start =
  [%sedlex.regexp? unicode_letter | '$' | '_' | '\\', unicode_escape_sequence]

let identifier_part =
  [%sedlex.regexp?
    ( identifier_start | unicode_combining_mark | unicode_digit
    | unicode_connector_punctuation | zwnj | zwj )]

let identifier_name = [%sedlex.regexp? identifier_start, Star identifier_part]

(* COMMENTS 7.4 *)
let single_line_comment_char =
  [%sedlex.regexp? Sub (source_character, line_terminator)]

let single_line_comment = [%sedlex.regexp? "//", Star single_line_comment_char]
let multi_line_not_asterisk_char = [%sedlex.regexp? Sub (source_character, '*')]
let multi_line_not_slash_char = [%sedlex.regexp? Sub (source_character, '/')]

let multi_line_comment_char =
  [%sedlex.regexp?
    multi_line_not_asterisk_char | '*', Plus multi_line_not_slash_char]

let multi_line_comment =
  [%sedlex.regexp? "/*", Star multi_line_comment_char, Opt '*', "*/"]

let comment = [%sedlex.regexp? multi_line_comment | single_line_comment]

let white_space =
  [%sedlex.regexp? 0x0009 | 0x000B | 0x000C | 0x0020 | 0x00A0 | 0xFEFF | zs]

let string_lex_single lexbuf strbuf =
  let lexeme = Sedlexing.Utf8.lexeme in
  let rec lex lexbuf strbuf =
    match%sedlex lexbuf with
    | '\'' -> Ok (Buffer.contents strbuf)
    | '\\', escape_sequence ->
        let* s = Unescape.unescape (lexeme lexbuf) in
        Buffer.add_string strbuf s;
        lex lexbuf strbuf
    | line_continuation -> lex lexbuf strbuf
    | Sub (source_character, ('\'' | line_terminator)) ->
        Buffer.add_string strbuf (lexeme lexbuf);
        lex lexbuf strbuf
    | _ -> lexer_error lexbuf
  in
  lex lexbuf strbuf

let string_lex_double lexbuf strbuf =
  let lexeme = Sedlexing.Utf8.lexeme in
  let rec lex lexbuf strbuf =
    match%sedlex lexbuf with
    | '"' -> Ok (Buffer.contents strbuf)
    | '\\', escape_sequence ->
        let* s = Unescape.unescape (lexeme lexbuf) in
        Buffer.add_string strbuf s;
        lex lexbuf strbuf
    | line_continuation -> lex lexbuf strbuf
    | Sub (source_character, ('"' | line_terminator)) ->
        Buffer.add_string strbuf (lexeme lexbuf);
        lex lexbuf strbuf
    | _ -> lexer_error lexbuf
  in
  lex lexbuf strbuf

let string_lex lexbuf quote =
  let strbuf = Buffer.create 200 in
  match quote with
  | "'" -> string_lex_single lexbuf strbuf
  | {|"|} -> string_lex_double lexbuf strbuf
  | _ -> Error (Printf.sprintf "Invalid string quote %S" quote)

let rec lex tokens buf =
  let lexeme = Sedlexing.Utf8.lexeme in
  let pos, _ = Sedlexing.lexing_positions buf in
  match%sedlex buf with
  | '(' -> lex ((OPEN_PAREN, pos) :: tokens) buf
  | ')' -> lex ((CLOSE_PAREN, pos) :: tokens) buf
  | '{' -> lex ((OPEN_BRACE, pos) :: tokens) buf
  | '}' -> lex ((CLOSE_BRACE, pos) :: tokens) buf
  | '[' -> lex ((OPEN_BRACKET, pos) :: tokens) buf
  | ']' -> lex ((CLOSE_BRACKET, pos) :: tokens) buf
  | ':' -> lex ((COLON, pos) :: tokens) buf
  | ',' -> lex ((COMMA, pos) :: tokens) buf
  | Chars {|"'|} ->
      let* s = string_lex buf (lexeme buf) in
      lex ((STRING s, pos) :: tokens) buf
  | multi_line_comment | single_line_comment | white_space | line_terminator ->
      lex tokens buf
  | "true" -> lex ((TRUE, pos) :: tokens) buf
  | "false" -> lex ((FALSE, pos) :: tokens) buf
  | "null" -> lex ((NULL, pos) :: tokens) buf
  | json5_float ->
      let s = lexeme buf in
      lex ((FLOAT s, pos) :: tokens) buf
  | json5_int ->
      let s = lexeme buf in
      lex ((INT s, pos) :: tokens) buf
  | json5_int_or_float ->
      let s = lexeme buf in
      lex ((INT_OR_FLOAT s, pos) :: tokens) buf
  | identifier_name ->
      let s = lexeme buf in
      lex ((IDENTIFIER_NAME s, pos) :: tokens) buf
  | eof -> Ok (List.rev ((EOF, pos) :: tokens))
  | _ -> lexer_error buf