Source file tokenizer.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
open Common

type token = {
  ending : int;
  scopes : string list;
}

let ending token = token.ending

let scopes token = token.scopes

type stack_elem = {
  stack_delim : delim;
  stack_region : Oniguruma.Region.t;
  stack_begin_line : string;
  stack_grammar : grammar;
  stack_repos : (string, repo_item) Hashtbl.t list;
  stack_scopes : string list;
  stack_prev_scopes : string list;
}

type stack = stack_elem list

let empty = []

let rec add_scopes scopes = function
  | [] -> scopes
  | None :: xs -> add_scopes scopes xs
  | Some x :: xs -> add_scopes (x :: scopes) xs

(* If the stack is empty, returns the main patterns associated with the
   grammar. Otherwise, returns the patterns associated with the delimiter at
   the top of the stack. *)
let next_pats grammar = function
  | [] -> grammar.patterns
  | s :: _ -> s.stack_delim.delim_patterns

(* Should the character be escaped in a regex? *)
let is_special = function
  | '|' | '.' | '*' | '+' | '?' | '^' | '$' | '-' | ':' | '~' | '#' | '&'
  | '(' | ')' | '[' | ']' | '{' | '}' | '<' | '>' | '\\' | '\'' -> true
  | _ -> false

(* Insert the substring of [line] from [beg] to [end_] into [buf]. *)
let insert_capture buf line beg end_ =
  let rec loop i =
    if i = end_ then
      ()
    else
      let ch = line.[i] in
      if is_special ch then
        Buffer.add_char buf '\\';
      Buffer.add_char buf ch;
      loop (i + 1)
  in
  loop beg

(* Substitute the begin pattern's captures for the backreferences in the end
   delimiter. *)
let subst_backrefs stack_top =
  let { stack_delim = { delim_end = regex_str; delim_begin = begin_re; _ }
      ; stack_begin_line = line
      ; stack_region = region
      ; _ } = stack_top in
  let buf = Buffer.create (String.length regex_str) in
  let num_beg_captures = Oniguruma.num_captures begin_re in
  let rec loop i escaped =
    if i < String.length regex_str then
      match regex_str.[i], escaped with
      | '\\', true ->
        Buffer.add_string buf "\\\\";
        loop (i + 1) false
      | '\\', false ->
        loop (i + 1) true
      | char, true ->
        if char >= '0' && char <= '9' then (
          let idx = Char.code char - Char.code '0' in
          if idx < num_beg_captures then
            let beg = Oniguruma.Region.capture_beg region idx in
            let end_ = Oniguruma.Region.capture_end region idx in
            if beg <> -1 then
              insert_capture buf line beg end_
        ) else (
          Buffer.add_char buf '\\';
          Buffer.add_char buf char
        );
        loop (i + 1) false
      | char, false ->
        Buffer.add_char buf char;
        loop (i + 1) false
  in
  loop 0 false;
  Buffer.contents buf

let match_subst se =
  match
    Oniguruma.create (subst_backrefs se)
      Oniguruma.Options.none Oniguruma.Encoding.utf8
      Oniguruma.Syntax.default
  with
  | Error e -> error ("End pattern: " ^ se.stack_delim.delim_end ^ ": " ^ e)
  | Ok re -> re

let rec find_nested scope = function
  | [] -> None
  | repo :: repos ->
    match Hashtbl.find_opt repo scope with
    | Some x -> Some x
    | None -> find_nested scope repos

(* Discard zero-length tokens. *)
let remove_empties =
  let rec go acc = function
    | [] -> acc
    | tok :: toks ->
      let prev = match toks with
        | [] -> 0
        | tok :: _ -> tok.ending
      in
      if tok.ending = prev then
        go acc toks
      else
        go (tok :: acc) toks
  in go []

(* Emit tokens for the match region's captures. *)
let handle_captures
    re scopes default mat_start mat_end region captures tokens =
  let int_map =
    Hashtbl.fold
      (fun k v acc ->
         match k with
         | Capture_idx int -> IntMap.add int v acc
         | Capture_name str ->
           Array.fold_left (fun acc idx -> IntMap.add idx v acc)
             IntMap.empty (Oniguruma.name_to_group_numbers re str))
      captures IntMap.empty
  in
  let _, stack, tokens =
    (* Regex captures are ordered by their left parentheses. Do a depth-first
       preorder traversal by keeping a stack of captures. *)
    IntMap.fold (fun idx capture (start, stack, tokens) ->
        (* If the capture mentions a lookahead, it can go past the bounds of
           its parent. The match is capped at the boundary for the parent. Is
           this the right decision to make? Clearly the writer of the grammar
           intended for the capture to exceed the parent in this case. *)
        if idx < 0 || idx >= Oniguruma.Region.length region then
          (start, stack, tokens)
        else
          let cap_start = Oniguruma.Region.capture_beg region idx in
          let cap_end = Oniguruma.Region.capture_end region idx in
          if cap_start = -1 then
            (* Capture wasn't found, ignore *)
            (start, stack, tokens)
          else
            let rec pop start tokens = function
              | [] ->
                ( { scopes = add_scopes scopes [default]
                  ; ending = start } :: tokens, [])
              | ((ending, scopes) :: stack') as stack ->
                if start >= ending then
                  (* The next capture comes after the previous one *)
                  pop start ({ scopes; ending } :: tokens) stack'
                else
                  (* The next capture goes on top of the previous one *)
                  ({ scopes; ending = start } :: tokens, stack)
            in
            let cap_start = if cap_start < start then start else cap_start in
            let cap_end = if cap_end > mat_end then mat_end else cap_end in
            let tokens, stack = pop cap_start tokens stack in
            ( cap_start
            , (cap_end, add_scopes scopes [capture.capture_name]) :: stack
            , tokens )
      ) int_map (mat_start, [], tokens)
  in
  let rec pop tokens = function
    | [] -> tokens
    | (ending, scopes) :: stack -> pop ({ scopes; ending } :: tokens) stack
  in pop tokens stack

let get_whiles =
  let rec loop acc = function
    | [] -> acc
    | se :: stack ->
      match se.delim_kind with
      | End -> loop acc stack
      | While -> loop acc stack
  in
  loop []

(* Tokenizes a line according to the grammar.

   [t]: The collection of grammars.
   [grammar]: The language grammar.
   [stack]: The stack that keeps track of nested delimiters
   [pos]: The current index into the string.
   [toks]: The list of tokens, with the rightmost ones at the front.
   [line]: The string that is being matched and tokenized.
   [rem_pats]: The remaining patterns yet to be tried *)
let rec match_line ~t ~grammar ~stack ~pos ~toks ~line rem_pats =
  let len = String.length line in
  let scopes, stk_pats, repos, cur_grammar = match stack with
    | [] ->
      ([grammar.scope_name], grammar.patterns, [grammar.repository], grammar)
    | se :: _ ->
      let d = se.stack_delim in
      (se.stack_scopes, d.delim_patterns, se.stack_repos, se.stack_grammar)
  in
  (* Try each pattern in the list until one matches. If none match, increment
     [pos] and try all the patterns again. *)
  let rec try_pats repos cur_grammar ~k = function
    | [] -> k () (* No patterns have matched, so call the continuation *)
    | Match m :: pats ->
      let match_result =
        Oniguruma.match_ m.pattern line pos Oniguruma.Options.none
      in
      begin match match_result with
        | None -> try_pats repos cur_grammar ~k pats
        | Some region ->
          let start = Oniguruma.Region.capture_beg region 0 in
          let end_ = Oniguruma.Region.capture_end region 0 in
          assert (start = pos);
          let toks = { scopes; ending = pos } :: toks in
          let toks =
            handle_captures
              m.pattern scopes m.name pos end_ region m.captures toks
          in
          let toks =
            { scopes = add_scopes scopes [m.name]; ending = end_ } :: toks
          in
          match_line ~t ~grammar ~stack ~pos:end_ ~toks ~line
            (next_pats grammar stack)
      end
    | Delim d :: pats ->
      (* Try to match the delimiter's begin pattern *)
      let match_result =
        Oniguruma.match_ d.delim_begin line pos Oniguruma.Options.none
      in
      begin match match_result with
        | None -> try_pats repos cur_grammar ~k pats
        | Some region ->
          let start = Oniguruma.Region.capture_beg region 0 in
          let end_ = Oniguruma.Region.capture_end region 0 in
          assert (start = pos);
          let toks = { scopes; ending = pos } :: toks in
          let toks =
            handle_captures d.delim_begin scopes d.delim_name pos end_ region
              d.delim_begin_captures toks
          in
          let toks =
            { scopes = add_scopes scopes [d.delim_name]
            ; ending = end_ } :: toks
          in
          let se =
            { stack_delim = d
            ; stack_region = region
            ; stack_begin_line = line
            ; stack_repos = repos
            ; stack_grammar = cur_grammar
            ; stack_scopes =
                add_scopes scopes [d.delim_name; d.delim_content_name]
            ; stack_prev_scopes = scopes }
          in
          match d.delim_kind with
          | End ->
            (* Push the delimiter on the stack and continue *)
            match_line ~t ~grammar ~stack:(se :: stack) ~pos:end_ ~toks ~line
              d.delim_patterns
          | While ->
            (* Subsume the remainder of the line into a span *)
            ( remove_empties
                ({ scopes =
                     add_scopes scopes [d.delim_name; d.delim_content_name]
                 ; ending = len } :: toks)
            , se :: stack )
      end
    | Include_scope name :: pats ->
      begin match find_by_scope_name t name with
        | None ->
          (* Grammar not found; try the next pattern. *)
          try_pats repos cur_grammar ~k pats
        | Some nested_grammar ->
          let k () = try_pats repos cur_grammar ~k pats in
          try_pats [nested_grammar.repository] nested_grammar
            nested_grammar.patterns ~k
      end
    | Include_base :: pats ->
      let k () = try_pats repos cur_grammar ~k pats in
      try_pats [grammar.repository] grammar grammar.patterns ~k
    | Include_self :: pats ->
      let k () = try_pats repos cur_grammar ~k pats in
      try_pats [cur_grammar.repository] cur_grammar cur_grammar.patterns ~k
    | Include_local key :: pats ->
      match find_nested key repos with
      | None -> error ("Unknown repository key " ^ key ^ ".")
      | Some item ->
        match item.repo_item_kind with
        | Repo_rule rule ->
          try_pats (item.repo_inner :: repos) cur_grammar (rule :: pats) ~k
        | Repo_patterns pats' ->
          let k () = try_pats repos cur_grammar ~k pats in
          try_pats (item.repo_inner :: repos) cur_grammar pats' ~k
  in
  let try_delim stack_top stack' ~k =
    (* Try to match the delimiter's end pattern *)
    let delim = stack_top.stack_delim in
    let end_match =
      let re = match_subst stack_top in
      match Oniguruma.match_ re line pos Oniguruma.Options.none with
      | None -> None
      | Some region ->
        let start = Oniguruma.Region.capture_beg region 0 in
        let end_ = Oniguruma.Region.capture_end region 0 in
        assert (start = pos);
        let toks =
          { scopes =
              add_scopes
                stack_top.stack_prev_scopes
                [delim.delim_name; delim.delim_content_name]
          ; ending = pos } :: toks in
        let toks =
          handle_captures
            re stack_top.stack_prev_scopes delim.delim_name pos end_ region
            delim.delim_end_captures toks
        in Some (end_, toks)
    in
    match delim.delim_kind, end_match with
    | End, None -> k ()
    | End, Some (end_, toks) ->
      let toks =
        { scopes = add_scopes scopes [delim.delim_name]
        ; ending = end_ } :: toks
      in
      (* Pop the delimiter off the stack and continue *)
      match_line ~t ~grammar ~stack:stack' ~pos:end_ ~toks ~line
        (next_pats grammar stack')
    | While, _ -> error "Unreachable"
  in
  if pos > len then
    (* End of string reached *)
    match stack with
    | [] -> (remove_empties ({ scopes; ending = len } :: toks), stack)
    | se :: _stack' ->
      let d = se.stack_delim in
      ( remove_empties
          ({ scopes = add_scopes scopes [d.delim_name]
           ; ending = len } :: toks)
      , stack )
  else
    (* No patterns have matched, so increment the position and try again *)
    let k () =
      match_line ~t ~grammar:cur_grammar ~stack ~pos:(pos + 1) ~toks ~line
        stk_pats
    in
    match stack with
    | [] -> try_pats repos grammar rem_pats ~k
    | se :: stack' ->
      match se.stack_delim.delim_kind with
      | While -> try_pats repos se.stack_grammar rem_pats ~k
      | End ->
        if se.stack_delim.delim_apply_end_pattern_last then
          try_pats repos se.stack_grammar rem_pats
            ~k:(fun () -> try_delim se stack' ~k)
        else
          try_delim se stack'
            ~k:(fun () -> try_pats repos se.stack_grammar rem_pats ~k)

let tokenize_exn t grammar stack line =
  (* See https://github.com/Microsoft/vscode-textmate/issues/25 for how to
     handle while rules. This is important for the Markdown grammar. *)
  let rec try_while_rules pos toks rem_stack = function
    | [] -> (toks, pos, rem_stack)
    | se :: stack ->
      match se.stack_delim.delim_kind with
      | End -> try_while_rules pos toks (se :: rem_stack) stack
      | While ->
        let rec loop pos' =
          if pos' = String.length line then
            (toks, pos, rem_stack)
          else
            let re = match_subst se in
            match Oniguruma.match_ re line pos' Oniguruma.Options.none with
            | None -> loop (pos' + 1)
            | Some region ->
              let start = Oniguruma.Region.capture_beg region 0 in
              let end_ = Oniguruma.Region.capture_end region 0 in
              assert (start = pos');
              let toks =
                { scopes = se.stack_prev_scopes; ending = pos' } :: toks
              in
              let toks =
                handle_captures
                  re se.stack_prev_scopes se.stack_delim.delim_name pos end_
                  region se.stack_delim.delim_end_captures toks
              in
              let toks =
                { scopes =
                    add_scopes se.stack_prev_scopes [se.stack_delim.delim_name]
                ; ending = end_ } :: toks
              in
              try_while_rules end_ toks (se :: rem_stack) stack
        in loop pos
  in
  let toks, pos, stack = try_while_rules 0 [] [] (List.rev stack) in
  match_line ~t ~grammar ~stack ~pos ~toks ~line (next_pats grammar stack)