1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
(** *)
type token_text = string * int
type token =
(** block comment *)
| Constant of token_text
| Directive of token_text
| Escape of token_text (** Escape sequence like [\123] *)
| Id of token_text
| Keyword of int * token_text
(** one line comment *)
| Numeric of token_text
| String of token_text
| Symbol of int * token_text
| Text of token_text (** Used for everything else *)
| Title of int * token_text
let string_of_token = function
| Bcomment (s,_) -> Printf.sprintf "Bcomment(%S)" s
| Constant (s,_) -> Printf.sprintf "Constant(%S)" s
| Directive (s,_) -> Printf.sprintf "Directive(%S)" s
| Escape (s,_) -> Printf.sprintf "Escape(%S)" s
| Id (s,_) -> Printf.sprintf "Id(%S)" s
| Keyword (n, (s, _)) -> Printf.sprintf "Keyword(%d, %S)" n s
| Lcomment (s,_) -> Printf.sprintf "Lcomment(%S)" s
| Numeric (s,_) -> Printf.sprintf "Numeric(%S)" s
| String (s,_) -> Printf.sprintf "String(%S)" s
| Symbol (n, (s, _)) -> Printf.sprintf "Symbol(%d, %S)" n s
| Text (s,_) -> Printf.sprintf "Text(%S)" s
| Title (n, (s,_)) -> Printf.sprintf "Title(%d, %S)" n s
let string_of_token_with_length = function
| Bcomment (s,len) -> Printf.sprintf "Bcomment(%S,%d)" s len
| Constant (s,len) -> Printf.sprintf "Constant(%S,%d)" s len
| Directive (s,len) -> Printf.sprintf "Directive(%S,%d)" s len
| Escape (s,len) -> Printf.sprintf "Escape(%S,%d)" s len
| Id (s,len) -> Printf.sprintf "Id(%S,%d)" s len
| Keyword (n, (s, len)) -> Printf.sprintf "Keyword(%d, %S,%d)" n s len
| Lcomment (s,len) -> Printf.sprintf "Lcomment(%S,%d)" s len
| Numeric (s,len) -> Printf.sprintf "Numeric(%S,%d)" s len
| String (s,len) -> Printf.sprintf "String(%S,%d)" s len
| Symbol (n, (s, len)) -> Printf.sprintf "Symbol(%d, %S,%d)" n s len
| Text (s,len) -> Printf.sprintf "Text(%S,%d)" s len
| Title (n, (s,len)) -> Printf.sprintf "Title(%d, %S,%d)" n s len
module Smap = Map.Make (String)
type error =
| Unknown_lang of string
| Lex_error of Location.t * string
exception Error of error
let string_of_error = function
| Unknown_lang s -> Printf.sprintf "Unknown language %S" s
| Lex_error (loc, s) ->
let b = Buffer.create 256 in
let fmt = Format.formatter_of_buffer b in
Location.print_loc fmt loc ;
Format.pp_print_flush fmt ();
let msg = Printf.sprintf "Lexing error at %s: %s"
(Buffer.contents b) s
in
msg
let pp fmt e = Format.pp_print_string fmt (string_of_error e)
let () = Printexc.register_printer
(function Error e -> Some (string_of_error e) | _ -> None)
type lexer = Sedlexing.lexbuf -> token list
let langs = ref Smap.empty
let get_lexer lang =
try Smap.find lang !langs
with Not_found -> raise (Error (Unknown_lang lang))
;;
let registered_langs () = Smap.bindings !langs
let register_lang name f = langs := Smap.add name f !langs ;;
let concat_texts l =
let f (acc_text, acc_len) (s,len) =
s :: acc_text, acc_len + len
in
let (strings, len) = List.fold_left f ([],0) l in
(String.concat "" strings, len)
let parse_lexbuf ?on_exn ~lang lexbuf =
let lexer = get_lexer lang in
let rec merge_text_tokens acc text_acc = function
| [] ->
let l =
match text_acc with
| [] -> acc
| l -> (Text (concat_texts l)) :: acc
in
List.rev l
| Text s :: q -> merge_text_tokens acc (s :: text_acc) q
| t :: q ->
let t1 =
match text_acc with
| [] -> None
| l -> Some (Text (concat_texts l))
in
let acc = match t1 with
| None -> t :: acc
| Some t1 -> t :: t1 :: acc
in
merge_text_tokens acc [] q
in
let rec iter acc =
match lexer lexbuf with
| [] -> List.rev acc
| tokens -> iter ((List.rev tokens) @ acc)
in
try
let tokens = iter [] in
merge_text_tokens [] [] tokens
with e ->
match on_exn with
| Some s -> [Text (s, -1)]
| None ->
match e with
| Failure s ->
let (loc_start, loc_end) = Sedlexing.lexing_positions lexbuf in
let loc = { Location.loc_start ; loc_end ; loc_ghost = false } in
raise (Error (Lex_error (loc, s)))
| e -> raise e
;;
let parse ?(raise_exn=false) ~lang s =
let lexbuf = Sedlexing.Utf8.from_string s in
let on_exn = if raise_exn then Some s else None in
parse_lexbuf ?on_exn ~lang lexbuf