Source file tokenize.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
open Errors
include TokenTypes

let regexps =
  [ (Str.regexp "(", fun _ -> LParen)
  ; (Str.regexp "{", fun _ -> LBrace)
  ; (Str.regexp "\\[", fun _ -> LBracket)
  ; (Str.regexp ")", fun _ -> RParen)
  ; (Str.regexp "}", fun _ -> RBrace)
  ; (Str.regexp "]", fun _ -> RBracket)
  ; (Str.regexp "[-+]?[0-9]+\\.[0-9]*\\([eE][-+]?[0-9]+\\)?", (fun s -> Float (float_of_string s)))
  ; (Str.regexp "[-+]?[0-9]+[eE][-+]?[0-9]+", (fun s -> Float (float_of_string s)))
  ; (Str.regexp "[-+]?[0-9]+", (fun s -> Integer (int_of_string s)))
  ; (Str.regexp {r|"[^"]*"|r}, (fun s -> String (String.sub s 1 ((String.length s) - 2))))
  ; (Str.regexp "[^])}[({ \t\n0-9][^])}[({ \t\n]*", (fun s -> Symbol s))
  ]

let whitespace = Str.regexp "[ \t\n]*"

let tokenize (input : string) : token list =
  let length = String.length input in
  let pos = ref 0 in
  let advance (offset : int) =
    pos := !pos + offset in
  let spaces () =
    let _ = Str.string_match whitespace input !pos in
    advance (String.length (Str.matched_string input)) in
  let try_regexp (regexp, confun) : (int * token) option =
    if Str.string_match regexp input !pos
    then
      let s = Str.matched_string input in
      Some (String.length s, confun s)
    else None in
  let tokens = ref [] in
  while !pos < length do
    spaces ();
    let matches = List.filter_map try_regexp regexps in
    let (length, token) = List.fold_left
        (fun p c -> if fst c > fst p then c else p)
        (-1, LParen) matches in
    if length < 0
    then raise (CamlrackError (("Could not lex input at position " ^ string_of_int !pos ^
                                ", beginning with character: " ^ String.sub input !pos 1)))
    else (tokens := token::!tokens; advance length)
  done;
  List.rev !tokens