Source file lang.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
(*********************************************************************************)
(*                Higlo                                                          *)
(*                                                                               *)
(*    Copyright (C) 2014-2021 Institut National de Recherche en Informatique     *)
(*    et en Automatique. All rights reserved.                                    *)
(*                                                                               *)
(*    This program is free software; you can redistribute it and/or modify       *)
(*    it under the terms of the GNU Lesser General Public License version        *)
(*    3 as published by the Free Software Foundation.                            *)
(*                                                                               *)
(*    This program is distributed in the hope that it will be useful,            *)
(*    but WITHOUT ANY WARRANTY; without even the implied warranty of             *)
(*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *)
(*    GNU Library General Public License for more details.                       *)
(*                                                                               *)
(*    You should have received a copy of the GNU Lesser General Public           *)
(*    License along with this program; if not, write to the Free Software        *)
(*    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA                   *)
(*    02111-1307  USA                                                            *)
(*                                                                               *)
(*    Contact: Maxence.Guesdon@inria.fr                                          *)
(*                                                                               *)
(*                                                                               *)
(*********************************************************************************)

(** *)

type token =
| Bcomment of string (** block comment *)
| Constant of string
| Directive of string
| Escape of string (** Escape sequence like [\123] *)
| Id of string
| Keyword of int * string
| Lcomment of string (** one line comment *)
| Numeric of string
| String of string
| Symbol of int * string
| Text of string (** Used for everything else *)

let string_of_token = function
| Bcomment s -> Printf.sprintf "Bcomment(%S)" s
| Constant s -> Printf.sprintf "Constant(%S)" s
| Directive s -> Printf.sprintf "Directive(%S)" s
| Escape s -> Printf.sprintf "Escape(%S)" s
| Id s -> Printf.sprintf "Id(%S)" s
| Keyword (n, s) -> Printf.sprintf "Keyword(%d, %S)" n s
| Lcomment s -> Printf.sprintf "Lcomment(%S)" s
| Numeric s -> Printf.sprintf "Numeric(%S)" s
| String s -> Printf.sprintf "String(%S)" s
| Symbol (n, s) -> Printf.sprintf "Symbol(%d, %S)" n s
| Text s -> Printf.sprintf "Text(%S)" s

module Smap = Map.Make (String)
type error =
| Unknown_lang of string
| Lex_error of Location.t * string

exception Error of error
let string_of_error = function
| Unknown_lang s -> Printf.sprintf "Unknown language %S" s
| Lex_error (loc, s) ->
    let b = Buffer.create 256 in
    let fmt = Format.formatter_of_buffer b in
    Location.print_loc fmt loc ;
    Format.pp_print_flush fmt ();
    let msg = Printf.sprintf "Lexing error at %s: %s"
      (Buffer.contents b) s
    in
    msg
let pp fmt e = Format.pp_print_string fmt (string_of_error e)
let () = Printexc.register_printer
  (function Error e -> Some (string_of_error e) | _ -> None)

type lexer = Sedlexing.lexbuf -> token list

let langs = ref Smap.empty

let get_lexer lang =
  try Smap.find lang !langs
  with Not_found -> raise (Error (Unknown_lang lang))
;;
let register_lang name f = langs := Smap.add name f !langs ;;

let parse ?(raise_exn=false) ~lang s =
    let lexer = get_lexer lang in
    let lexbuf = Sedlexing.Utf8.from_string s in
    let rec merge_text_tokens acc text_acc = function
    | [] ->
      let l =
        match text_acc with
        | [] -> acc
        | l -> (Text (String.concat "" (List.rev l))) :: acc
      in
      List.rev l
    | Text s :: q -> merge_text_tokens acc (s :: text_acc) q
    | t :: q ->
      let t1 =
        match text_acc with
        | [] -> None
        | l -> Some (Text (String.concat "" (List.rev l)))
      in
      let acc = match t1 with
        | None -> t :: acc
        | Some t1 -> t :: t1 :: acc
      in
      merge_text_tokens acc [] q
    in
    let rec iter acc =
      match lexer lexbuf with
      | [] -> List.rev acc
      | tokens -> iter ((List.rev tokens) @ acc)
    in
    try
      let tokens =  iter [] in
      merge_text_tokens [] [] tokens
    with e ->
      if raise_exn then
        (
         match e with
         | Failure s ->
             let (loc_start, loc_end) = Sedlexing.lexing_positions lexbuf in
             let loc = { Location.loc_start ; loc_end ; loc_ghost = false } in
             raise (Error (Lex_error (loc, s)))
         | e -> raise e
        )
      else
        [Text s]
;;