Source file ocaml.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
(*********************************************************************************)
(*                Higlo                                                          *)
(*                                                                               *)
(*    Copyright (C) 2014-2021 Institut National de Recherche en Informatique     *)
(*    et en Automatique. All rights reserved.                                    *)
(*                                                                               *)
(*    This program is free software; you can redistribute it and/or modify       *)
(*    it under the terms of the GNU Lesser General Public License version        *)
(*    3 as published by the Free Software Foundation.                            *)
(*                                                                               *)
(*    This program is distributed in the hope that it will be useful,            *)
(*    but WITHOUT ANY WARRANTY; without even the implied warranty of             *)
(*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *)
(*    GNU Library General Public License for more details.                       *)
(*                                                                               *)
(*    You should have received a copy of the GNU Lesser General Public           *)
(*    License along with this program; if not, write to the Free Software        *)
(*    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA                   *)
(*    02111-1307  USA                                                            *)
(*                                                                               *)
(*    Contact: Maxence.Guesdon@inria.fr                                          *)
(*                                                                               *)
(*                                                                               *)
(*********************************************************************************)

open Lang

(* [lexeme lb] pairs the UTF-8 text of the token last matched on [lb] with
   the value [Sedlexing.lexeme_length lb] reports for that same token. *)
let lexeme lb =
  let text = Sedlexing.Utf8.lexeme lb in
  let length = Sedlexing.lexeme_length lb in
  (text, length)

(* [sedlexeme lb] is only the UTF-8 text of the token last matched on [lb]. *)
let sedlexeme lb = Sedlexing.Utf8.lexeme lb

(* Numeric literals.

   [digit] and [hex] deliberately include the underscore separator, so they
   describe the characters allowed *inside* a literal. They must not be used
   for the first character of a literal: a literal has to start with a real
   digit, otherwise a lone underscore would be classified as a number. *)
let digit = [%sedlex.regexp? '0'..'9' | '_']
let hex = [%sedlex.regexp? digit | 'A'..'F' | 'a'..'f']

(* Decimal integer: one real digit, then digits or underscores. *)
let integer = [%sedlex.regexp? ('0'..'9'), Star(digit)]

(* Fixed-point number: optional integer part, a dot, a mandatory fraction. *)
let decimal = [%sedlex.regexp? Star('0'..'9'), '.', Plus('0'..'9')]

(* Exponent suffix: e or E, optional sign, at least one digit. *)
let exponent = [%sedlex.regexp? ('e'|'E'), Opt('+'|'-'), Plus('0'..'9')]

(* Floating-point number with a mandatory exponent. *)
let double = [%sedlex.regexp? (Plus('0'..'9'), '.', Star('0'..'9'), exponent) | ('.', Plus('0'..'9'), exponent) | (Plus('0'..'9'), exponent)]

(* Explicitly signed variants of the three forms above. *)
let integer_positive = [%sedlex.regexp? '+',integer]
let decimal_positive = [%sedlex.regexp? '+',decimal]
let double_positive = [%sedlex.regexp? '+',double]
let integer_negative = [%sedlex.regexp? '-',integer]
let decimal_negative = [%sedlex.regexp? '-',decimal]
let double_negative = [%sedlex.regexp? '-',double]

(* Prefixed integers. The prefix letter may be lower or upper case; the
   first character after the prefix must be a real digit of the base, and
   the following ones may also be underscores. *)
let binary = [%sedlex.regexp? ("0b" | "0B"), ('0'|'1'), Star('0'|'1'|'_')]
let octal = [%sedlex.regexp? ("0o" | "0O"), ('0'..'7'), Star('0'..'7'|'_')]
let hexa = [%sedlex.regexp? ("0x" | "0X"), ('0'..'9' | 'A'..'F' | 'a'..'f'), Star(hex)]

(* Any numeric literal recognised by the lexer. *)
let numeric = [%sedlex.regexp? integer_positive | decimal_positive | double_positive | integer_negative | decimal_negative | double_negative | integer | decimal | double | binary | octal | hexa]

(* The two boolean constants. *)
let boolean = [%sedlex.regexp? "true" | "false"]

(* Single-letter escapes, used for character literals. *)
let echar = [%sedlex.regexp? 't' | 'b' | 'n' | 'r' | 'f' | '\\' | '"' | '\'']

let escaped_char = [%sedlex.regexp? '\\', echar]

(* String literal. A backslash is never matched as an ordinary character:
   it is always consumed together with the character that follows it.
   Otherwise, with longest-match semantics, a string ending in an escaped
   backslash could be re-read as a backslash followed by an escaped double
   quote and the match would run on to the next double quote in the input.
   Pairing the backslash with any character also covers numeric, hexadecimal
   and line-continuation escapes without listing them. *)
let string = [%sedlex.regexp? '"', Star(Compl('"' | '\\') | ('\\', any)), '"']

(* Character literal: one plain character or one single-letter escape
   between two single quotes. *)
let char = [%sedlex.regexp? "'", (Compl(0x27)| escaped_char), "'"]

(* A run of blanks: spaces, tabs, carriage returns and newlines. *)
let space = [%sedlex.regexp? Plus(' ' | '\n' | '\t' | '\r') ]

(* ASCII upper-case and lower-case letters. *)
let capchar = [%sedlex.regexp? 'A'..'Z']
let lowchar = [%sedlex.regexp? 'a'..'z']

(* Characters allowed after the first character of an identifier: letters,
   digits, underscore and the prime character. The prime is part of OCaml
   identifiers, so it must be kept inside the identifier token instead of
   being left over as a stray quote. *)
let idchar = [%sedlex.regexp? lowchar | capchar | '_' | digit | '\'']

(* Capitalised identifier; the lexer reports it as a module name. *)
let modname = [%sedlex.regexp? capchar, Star(idchar)]

(* Opening and closing delimiters of a block comment. *)
let start_comment = [%sedlex.regexp? "(*"]
let end_comment = [%sedlex.regexp? "*)"]

(* Text inside a comment that carries no structure of its own.
   0x28 is the left parenthesis, 0x29 is the right parenthesis.
   Three shapes are accepted:
   - one character that is neither a parenthesis, a star nor a quote;
   - a left parenthesis followed by a character that cannot start anything
     meaningful;
   - a star followed by a character that cannot start or end anything
     meaningful.
   The second character of the two-character shapes is never a char or
   string delimiter: those are handled by the comment lexer itself, and
   consuming them here would swallow the start of a string or char.
   It is never a left parenthesis either: that parenthesis may be the first
   character of a nested comment opener, and consuming it here would hide
   the nested comment and break the nesting count. *)
let comment_char = [%sedlex.regexp?
    ( Compl(0x28 | 0x29 | '"' | '\'' | '*')
    | (0x28, Compl(0x28 | '\'' | '"' | '*'))
    | ('*', Compl(0x28 | '\'' | '"' | 0x29)) ) ]

(* Value identifier: starts with a lower-case letter or an underscore. *)
let id = [%sedlex.regexp? (lowchar | '_'), Star(idchar)]

(* An upper-case letter immediately followed by a value identifier. *)
let cap_id = [%sedlex.regexp? capchar, id]

(* Dotted name as used after a percent sign: a first component, then any
   number of groups made of a dot and one or more components. *)
let attr_id = [%sedlex.regexp?
  (cap_id | id), Star('.', Plus(cap_id | id)) ]

(* A percent sign followed by a dotted name. *)
let percent_id = [%sedlex.regexp? '%', attr_id]

(* Keywords that introduce or qualify declarations (keyword class 0 in the
   lexer below). *)
let decl_kw = [%sedlex.regexp?
    "and" | "class" | "constraint" | "exception" | "external" | "let"
  | "fun" | "function" | "functor" | "in" | "include" | "inherit"
  | "initializer" | "method" | "module" | "mutable" | "nonrec" | "of"
  | "open" | "private" | "rec" | "type" | "val" | "virtual" ]

(* Expression-level keywords (keyword class 1 in the lexer below).
   The list is split in three because of a sedlex bug:
   https://github.com/ocaml-community/sedlex/issues/97
   NOTE(review): a few words are listed more than once across the three
   lists (do, else, for, if, while). All three lists map to the same token
   class, so the repetition has no visible effect. *)
let expr_kw = [%sedlex.regexp?
    "asr" | "do" | "else" | "for" | "if" | "while"
  | "as" | "assert" | "begin" | "do" | "done" | "downto" ]

let expr_kw2 = [%sedlex.regexp?
    "else" | "end" | "for" | "if" | "land" | "lazy"
  | "lor" | "lsl" | "lsr" | "lxor" | "match" | "mod" ]

let expr_kw3 = [%sedlex.regexp?
    "new" | "object" | "or" | "ref" | "sig" | "struct" | "then"
  | "to" | "try" | "when" | "while" | "with" | "#" ]

(* Names of common built-in types (keyword class 3 in the lexer below). *)
let type_kw = [%sedlex.regexp?
    "bool" | "int" | "string" | "list"
  | "array" | "float" | "char" | "unit" ]

(* Lwt-specific keywords and operators (keyword class 10 in the lexer
   below). *)
let lwt_kw = [%sedlex.regexp?
    "lwt" | "raise_lwt" | ">>=" | ">>" | "=<<"
  | "for_lwt" | "assert_lwt" | "match_lwt" | "while_lwt" ]

(* Labelled argument: a tilde followed by a value identifier. *)
let label = [%sedlex.regexp? '~', id]

(* A hash sign followed by a lower-case name, optionally preceded by a
   newline (itself optionally followed by a carriage return). The lexer
   decides afterwards whether the match is a toplevel directive or a hash
   keyword followed by an identifier. *)
let directive = [%sedlex.regexp?
  Opt('\n', Opt('\r')), '#', lowchar, Star(idchar) ]

(* [bcomment (b, len) level lexbuf] reads the remainder of a block comment
   whose opening delimiter has already been consumed.
   - [b] receives the text of everything that is read;
   - [len] is the length accumulated so far, as reported by [lexeme];
   - [level] is the current nesting depth (1 for the outermost comment).
   The result is the total accumulated length once the outermost comment is
   closed, or once the input is exhausted if the comment is never closed.
   Strings and character literals inside the comment are read as a whole,
   so comment delimiters they contain are not interpreted. *)
let rec bcomment (b,len) level lexbuf =
  (* Append the token just matched to [b]; return the updated length. *)
  let grow () =
    let (txt, n) = lexeme lexbuf in
    Buffer.add_string b txt ;
    len + n
  in
  match%sedlex lexbuf with
  | eof -> len
  | start_comment ->
      (* nested comment: one level deeper *)
      bcomment (b, grow ()) (level + 1) lexbuf
  | Opt('*'),end_comment ->
      let len = grow () in
      (* closing the outermost level ends the whole comment *)
      if level > 1 then bcomment (b, len) (level - 1) lexbuf else len
  | string
  | char
  | comment_char ->
      bcomment (b, grow ()) level lexbuf
  | any ->
      bcomment (b, grow ()) level lexbuf
  | _ -> len

(* [main lexbuf] reads one lexical element from [lexbuf] and returns the
   corresponding tokens: the empty list at end of input, two tokens for a
   hash sign followed by a name that is not a directive, one token
   otherwise.
   Keyword classes: 0 declarations, 1 expressions, 2 capitalised names,
   3 built-in type names, 4 labels, 5 percent identifiers, 10 Lwt. *)
let main lexbuf =
  match%sedlex lexbuf with
  | eof -> []
  | space -> [Text (lexeme lexbuf)]
  | numeric -> [Numeric (lexeme lexbuf)]
  | boolean -> [Constant (lexeme lexbuf)]
  | directive ->
      let (txt, n) = lexeme lexbuf in
      (* A directive either carries its leading newline or sits at the very
         beginning of the input; anything else is a hash keyword followed
         by an identifier. *)
      (match String.get txt 0, Sedlexing.lexeme_start lexbuf with
       | '\n', _ | _, 0 -> [Directive (txt, n)]
       | _ ->
           let name = String.sub txt 1 (String.length txt - 1) in
           [Keyword (1, ("#", 1)) ; Id (name, n - 1)])
  | decl_kw -> [Keyword (0, lexeme lexbuf)]
  | expr_kw -> [Keyword (1, lexeme lexbuf)]
  | expr_kw2 -> [Keyword (1, lexeme lexbuf)]
  | expr_kw3 -> [Keyword (1, lexeme lexbuf)]
  | modname -> [Keyword (2, lexeme lexbuf)]
  | type_kw -> [Keyword (3, lexeme lexbuf)]
  | percent_id -> [Keyword (5, lexeme lexbuf)]
  | lwt_kw -> [Keyword (10, lexeme lexbuf)]
  | label -> [Keyword (4, lexeme lexbuf)]
  | id -> [Id (lexeme lexbuf)]
  | string -> [String (lexeme lexbuf)]
  | char -> [String (lexeme lexbuf)]
  | start_comment ->
      let (opening, n) = lexeme lexbuf in
      let buf = Buffer.create 256 in
      Buffer.add_string buf opening ;
      let total = bcomment (buf, n) 1 lexbuf in
      [Bcomment (Buffer.contents buf, total)]
  | any -> [Text (lexeme lexbuf)]
  | _ -> failwith "Invalid state"


(* Register [main] as the lexer for the language named [ocaml]. *)
let () = Lang.register_lang "ocaml" main