Source file lang.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
(*********************************************************************************)
(*                Higlo                                                          *)
(*                                                                               *)
(*    Copyright (C) 2014-2021 Institut National de Recherche en Informatique     *)
(*    et en Automatique. All rights reserved.                                    *)
(*                                                                               *)
(*    This program is free software; you can redistribute it and/or modify       *)
(*    it under the terms of the GNU Lesser General Public License version        *)
(*    3 as published by the Free Software Foundation.                            *)
(*                                                                               *)
(*    This program is distributed in the hope that it will be useful,            *)
(*    but WITHOUT ANY WARRANTY; without even the implied warranty of             *)
(*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *)
(*    GNU Library General Public License for more details.                       *)
(*                                                                               *)
(*    You should have received a copy of the GNU Lesser General Public           *)
(*    License along with this program; if not, write to the Free Software        *)
(*    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA                   *)
(*    02111-1307  USA                                                            *)
(*                                                                               *)
(*    Contact: Maxence.Guesdon@inria.fr                                          *)
(*                                                                               *)
(*                                                                               *)
(*********************************************************************************)

(** *)

type token_text = string * int

type token =
| Bcomment of token_text (** block comment *)
| Constant of token_text
| Directive of token_text
| Escape of token_text (** Escape sequence like [\123] *)
| Id of token_text
| Keyword of int * token_text
| Lcomment of token_text (** one line comment *)
| Numeric of token_text
| String of token_text
| Symbol of int * token_text
| Text of token_text (** Used for everything else *)
| Title of int * token_text

let string_of_token = function
| Bcomment (s,_) -> Printf.sprintf "Bcomment(%S)" s
| Constant (s,_) -> Printf.sprintf "Constant(%S)" s
| Directive (s,_) -> Printf.sprintf "Directive(%S)" s
| Escape (s,_) -> Printf.sprintf "Escape(%S)" s
| Id (s,_) -> Printf.sprintf "Id(%S)" s
| Keyword (n, (s, _)) -> Printf.sprintf "Keyword(%d, %S)" n s
| Lcomment (s,_) -> Printf.sprintf "Lcomment(%S)" s
| Numeric (s,_) -> Printf.sprintf "Numeric(%S)" s
| String (s,_) -> Printf.sprintf "String(%S)" s
| Symbol (n, (s, _)) -> Printf.sprintf "Symbol(%d, %S)" n s
| Text (s,_) -> Printf.sprintf "Text(%S)" s
| Title (n, (s,_)) -> Printf.sprintf "Title(%d, %S)" n s

let string_of_token_with_length = function
| Bcomment (s,len) -> Printf.sprintf "Bcomment(%S,%d)" s len
| Constant (s,len) -> Printf.sprintf "Constant(%S,%d)" s len
| Directive (s,len) -> Printf.sprintf "Directive(%S,%d)" s len
| Escape (s,len) -> Printf.sprintf "Escape(%S,%d)" s len
| Id (s,len) -> Printf.sprintf "Id(%S,%d)" s len
| Keyword (n, (s, len)) -> Printf.sprintf "Keyword(%d, %S,%d)" n s len
| Lcomment (s,len) -> Printf.sprintf "Lcomment(%S,%d)" s len
| Numeric (s,len) -> Printf.sprintf "Numeric(%S,%d)" s len
| String (s,len) -> Printf.sprintf "String(%S,%d)" s len
| Symbol (n, (s, len)) -> Printf.sprintf "Symbol(%d, %S,%d)" n s len
| Text (s,len) -> Printf.sprintf "Text(%S,%d)" s len
| Title (n, (s,len)) -> Printf.sprintf "Title(%d, %S,%d)" n s len

module Smap = Map.Make (String)
type error =
| Unknown_lang of string
| Lex_error of Location.t * string

exception Error of error
let string_of_error = function
| Unknown_lang s -> Printf.sprintf "Unknown language %S" s
| Lex_error (loc, s) ->
    let b = Buffer.create 256 in
    let fmt = Format.formatter_of_buffer b in
    Location.print_loc fmt loc ;
    Format.pp_print_flush fmt ();
    let msg = Printf.sprintf "Lexing error at %s: %s"
      (Buffer.contents b) s
    in
    msg
let pp fmt e = Format.pp_print_string fmt (string_of_error e)
let () = Printexc.register_printer
  (function Error e -> Some (string_of_error e) | _ -> None)

type lexer = Sedlexing.lexbuf -> token list

let langs = ref Smap.empty

let get_lexer lang =
  try Smap.find lang !langs
  with Not_found -> raise (Error (Unknown_lang lang))
;;

let registered_langs () = Smap.bindings !langs

let register_lang name f = langs := Smap.add name f !langs ;;

let concat_texts l =
  let f (acc_text, acc_len) (s,len) =
    s :: acc_text, acc_len + len
  in
  let (strings, len) = List.fold_left f ([],0) l in
  (String.concat "" strings, len)

let parse_lexbuf ?on_exn ~lang lexbuf =
  let lexer = get_lexer lang in
  let rec merge_text_tokens acc text_acc = function
  | [] ->
      let l =
        match text_acc with
        | [] -> acc
        | l -> (Text (concat_texts l)) :: acc
      in
      List.rev l
    | Text s :: q -> merge_text_tokens acc (s :: text_acc) q
  | t :: q ->
      let t1 =
        match text_acc with
        | [] -> None
        | l -> Some (Text (concat_texts l))
      in
      let acc = match t1 with
        | None -> t :: acc
        | Some t1 -> t :: t1 :: acc
      in
      merge_text_tokens acc [] q
  in
  let rec iter acc =
    match lexer lexbuf with
    | [] -> List.rev acc
    | tokens -> iter ((List.rev tokens) @ acc)
  in
  try
    let tokens =  iter [] in
    merge_text_tokens [] [] tokens
  with e ->
      match on_exn with
      | Some s -> [Text (s, -1)]
      | None ->
         match e with
         | Failure s ->
             let (loc_start, loc_end) = Sedlexing.lexing_positions lexbuf in
             let loc = { Location.loc_start ; loc_end ; loc_ghost = false } in
             raise (Error (Lex_error (loc, s)))
         | e -> raise e
;;

let parse ?(raise_exn=false) ~lang s =
  let lexbuf = Sedlexing.Utf8.from_string s in
  let on_exn = if raise_exn then Some s else None in
  parse_lexbuf ?on_exn ~lang lexbuf