Source file iri_lexer.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
(*********************************************************************************)
(*                OCaml-IRI                                                      *)
(*                                                                               *)
(*    Copyright (C) 2016 Institut National de Recherche en Informatique          *)
(*    et en Automatique. All rights reserved.                                    *)
(*                                                                               *)
(*    This program is free software; you can redistribute it and/or modify       *)
(*    it under the terms of the GNU Lesser General Public License version        *)
(*    3 as published by the Free Software Foundation.                            *)
(*                                                                               *)
(*    This program is distributed in the hope that it will be useful,            *)
(*    but WITHOUT ANY WARRANTY; without even the implied warranty of             *)
(*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *)
(*    GNU Library General Public License for more details.                       *)
(*                                                                               *)
(*    You should have received a copy of the GNU Lesser General Public           *)
(*    License along with this program; if not, write to the Free Software        *)
(*    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA                   *)
(*    02111-1307  USA                                                            *)
(*                                                                               *)
(*    Contact: Maxence.Guesdon@inria.fr                                          *)
(*                                                                               *)
(*                                                                               *)
(*********************************************************************************)

(* Core rules from ABNF http://tools.ietf.org/html/rfc2234 *)

let digit = [%sedlex.regexp? '0'..'9']
let alpha = [%sedlex.regexp? 'a'..'z'|'A'..'Z']
let bit = [%sedlex.regexp? '0' | '1']
let char = [%sedlex.regexp? 0x01 .. 0x7F]
  (* any 7-bit US-ASCII character, excluding NUL *)

let cr = [%sedlex.regexp? 0x0D] (* carriage return *)
let lf = [%sedlex.regexp? 0x0A] (* line feed *)
let crlf = [%sedlex.regexp? cr,lf] (* Internet standard newline *)
let ctl = [%sedlex.regexp? 0x00 .. 0x1F | 0x7F] (* controls *)
let dquote = [%sedlex.regexp? '"'] (* Double Quote, \x22 *)
let hexdig = [%sedlex.regexp? digit | 'A'..'F' | 'a'..'f']
   (* we tolerate lowercase *)
let htab = [%sedlex.regexp? 0x09] (* horizontal tab *)
let sp = [%sedlex.regexp? ' '] (* space, \x20 *)
let wsp = [%sedlex.regexp? sp | htab] (* white space *)
let lwsp = [%sedlex.regexp? Star((wsp | crlf), wsp)]
   (* linear white space (past newline) *)

let octect = [%sedlex.regexp? 0x00 .. 0xFF] (* 8 bits of data *)
let vchar = [%sedlex.regexp? 0x21 .. 0x7E]
  (* visible (printing) characters *)

(* tools to handle locations in lexbuf *)

let pos ?(file="") ~line ~bol ~char () =
  Lexing.{ pos_lnum = line ; pos_bol = bol ; pos_cnum = char ; pos_fname = file }

type loc = { loc_start: Lexing.position; loc_stop: Lexing.position }
type 'a with_loc = 'a * loc option

type error = loc * string
exception Error of error
let error ?(msg="Parse error") loc = raise (Error (loc, msg))

let string_of_loc loc =
  let open Lexing in
  let start = loc.loc_start in
  let stop = loc.loc_stop in
  let line = start.pos_lnum in
  let char = start.pos_cnum - start.pos_bol in
  let len =
    if start.pos_fname = stop.pos_fname then
      stop.pos_cnum - start.pos_cnum
    else
      1
  in
  let file = start.pos_fname in
  Printf.sprintf "%sline %d, character%s %d%s"
    (match file with
     | "" -> ""
     | _ -> Printf.sprintf "File %S, " file)
    line
    (if len > 1 then "s" else "")
    char
    (if len > 1 then Printf.sprintf "-%d" (char + len) else "")

let loc_sprintf loc fmt =
  match loc with
  | None -> Printf.sprintf fmt
  | Some loc -> Printf.ksprintf
      (fun s -> Printf.sprintf "%s:\n%s" (string_of_loc loc) s)
      fmt

let string_of_error (loc, str) =
  Printf.sprintf "%s: %s" (string_of_loc loc) str

let loc loc_start loc_stop = { loc_start ; loc_stop }
let loc_of_pos pos len =
  { loc_start = pos ;
    loc_stop = Lexing.{ pos with pos_cnum = pos.pos_cnum + len } ;
  }
let error_pos ?msg pos = error ?msg (loc_of_pos pos 1)

let nl_char = Uchar.of_char '\n'

let update_pos pos str =
  let open Lexing in
  let f pos i = function
  | `Malformed msg -> error ~msg (loc_of_pos pos 1)
  | `Uchar c when Uchar.equal c nl_char ->
      let bol = pos.pos_cnum in
      { pos with
        pos_lnum = pos.pos_lnum + 1;
        pos_bol = bol ;
        pos_cnum = pos.pos_cnum + 1 ;
      }
  | _ -> { pos with pos_cnum = pos.pos_cnum + 1}
  in
  Uutf.String.fold_utf_8 f pos str

let lexeme pos lexbuf =
  try Sedlexing.Utf8.lexeme lexbuf
  with Sedlexing.MalFormed ->
    error_pos ~msg:"Malformed character in lexeme" pos

let upd pos lexbuf = update_pos pos (lexeme pos lexbuf)

(* rules from IRI RFC *)

let ucschar = [%sedlex.regexp?
    0xA0 .. 0xD7FF | 0xF900 .. 0xFDCF | 0xFDF0 .. 0xFFEF
  | 0x10000 .. 0x1FFFD | 0x20000 .. 0x2FFFD | 0x30000 .. 0x3FFFD
  | 0x40000 .. 0x4FFFD | 0x50000 .. 0x5FFFD | 0x60000 .. 0x6FFFD
  | 0x70000 .. 0x7FFFD | 0x80000 .. 0x8FFFD | 0x90000 .. 0x9FFFD
  | 0xA0000 .. 0xAFFFD | 0xB0000 .. 0xBFFFD | 0xC0000 .. 0xCFFFD
  | 0xD0000 .. 0xDFFFD | 0xE1000 .. 0xEFFFD ]

let iprivate = [%sedlex.regexp?
    0xE000 .. 0xF8FF | 0xF0000 .. 0xFFFFD | 0x100000 .. 0x10FFFD]

let iunreserved = [%sedlex.regexp? alpha | digit | Chars "-._~" | ucschar]
let pct_encoded = [%sedlex.regexp? '%', hexdig, hexdig ]
let gen_delims = [%sedlex.regexp? Chars ":/?#[]@"]
let sub_delims = [%sedlex.regexp? Chars "!$&'()*+,;="]
let iuserinfo = [%sedlex.regexp? Star(iunreserved|pct_encoded|sub_delims|':')]

let unreserved = [%sedlex.regexp? alpha|digit|Chars "-._~"]
let dec_octet = [%sedlex.regexp?
    digit | ('1'..'9',digit) | ('1',digit,digit) | ('2','0'..'4',digit) | ("25",'0'..'5')]
let ipv4address = [%sedlex.regexp? dec_octet,'.',dec_octet,'.',dec_octet,'.',dec_octet]
let ipvfuture = [%sedlex.regexp? 'v', Plus(hexdig), '.', Plus(unreserved|sub_delims|':')]

let h16 = [%sedlex.regexp?
    hexdig | (hexdig,hexdig) | (hexdig,hexdig,hexdig) | (hexdig,hexdig,hexdig,hexdig) ]
let ls32 = [%sedlex.regexp? (h16, ':', h16) | ipv4address]
let ipv6address = [%sedlex.regexp?
    ( h16,':',h16,':',h16,':',h16,':',h16,':',h16,':',ls32 )
  | ("::" ,h16,':',h16,':',h16,':',h16,':',h16,':',ls32)
  | (h16,"::",h16,':',h16,':',h16,':',h16,':',ls32)
  | (Opt(h16,':'),h16,"::",h16,':',h16,':',h16,':',ls32)
  | (Opt(h16,':'),Opt(h16,':'),h16,"::",h16,':',h16,':',ls32)
  | (Opt(h16,':'),Opt(h16,':'),Opt(h16,':'),h16,"::",h16,':',ls32)
  | (Opt(h16,':'),Opt(h16,':'),Opt(h16,':'),Opt(h16,':'),h16,"::",ls32)
  | (Opt(h16,':'),Opt(h16,':'),Opt(h16,':'),Opt(h16,':'),Opt(h16,':'),h16,"::",h16)
  | (Opt(h16,':'),Opt(h16,':'),Opt(h16,':'),Opt(h16,':'),Opt(h16,':'),Opt(h16,':'),h16,"::")
]

let ip_literal = [%sedlex.regexp? '[', (ipv6address|ipvfuture), ']']
let ireg_name = [%sedlex.regexp? Star(iunreserved|pct_encoded|sub_delims)]
let ihost = [%sedlex.regexp? ip_literal | ipv4address | ireg_name]
let port = [%sedlex.regexp? Star(digit)]

let ipchar_noat = [%sedlex.regexp? iunreserved|pct_encoded|sub_delims|':']
let ipchar = [%sedlex.regexp? ipchar_noat|'@']

open Iri_types

let fragment_opt pos lexbuf =
  match%sedlex lexbuf with
    '#', Star(ipchar|'/'|'?') ->
      let str = lexeme pos lexbuf in
      let len = String.length str in
      let pos = upd pos lexbuf in
      (pos, Some (String.sub str 1 (len-1)))
  | '#', any ->
      error_pos pos
  | _ ->
      Sedlexing.rollback lexbuf;
      (pos, None)

let query_opt pos lexbuf =
  match%sedlex lexbuf with
    '?', Star(ipchar|iprivate|'/'|'?') ->
      let str = lexeme pos lexbuf in
      let len = String.length str in
      let pos = upd pos lexbuf in
      (pos, Some (String.sub str 1 (len-1)))
  | '?', any ->
      error_pos pos
  | _ ->
      Sedlexing.rollback lexbuf;
      (pos, None)

let rec isegment_list acc pos lexbuf =
  match%sedlex lexbuf with
    '/', Star(ipchar) ->
      let str = lexeme pos lexbuf in
      let len = String.length str in
      let pos = upd pos lexbuf in
      isegment_list ((String.sub str 1 (len-1)) :: acc) pos lexbuf
  | _ ->
      Sedlexing.rollback lexbuf ;
      (pos, List.rev acc)

let ipath_abempty pos lexbuf =
  let (pos, path) = isegment_list [] pos lexbuf in
  (pos, Absolute path)

let iauthority pos lexbuf =
  match%sedlex lexbuf with
    ihost, ':', port ->
      begin
        let str = lexeme pos lexbuf in
        let len = String.length str in
        let p = String.rindex str ':' in
        let port =
          match String.sub str (p+1) (len - p - 1) with
            "" -> None
          | s -> Some (int_of_string s)
        in
        let host = String.sub str 0 p in
        let pos = upd pos lexbuf in
        (pos, host, port)
      end
  | ihost ->
      let host = lexeme pos lexbuf in
      let pos = upd pos lexbuf in
      (pos, host, None)
  | _ ->
      error_pos pos

let iauthority_with_user pos lexbuf =
  match%sedlex lexbuf with
  | iuserinfo, '@' ->
      let str = lexeme pos lexbuf in
      let len = String.length str in
      let user = String.sub str 0 (len - 1) in
      let pos = upd pos lexbuf in
      let (pos, h, p) = iauthority pos lexbuf in
      (pos, Some user, h, p)
  | _ ->
      Sedlexing.rollback lexbuf ;
      let (pos, h, p) = iauthority pos lexbuf in
      (pos, None, h, p)

let ipath_absolute pos lexbuf =
  let str = lexeme pos lexbuf in
  let len = String.length str in
  let str = String.sub str 1 (len - 1) in
  let pos = upd pos lexbuf in
  let (pos, path) = isegment_list [str] pos lexbuf in
  (pos, None, None, None, Absolute path)

let ihier_part pos lexbuf =
  match%sedlex lexbuf with
  | "//" ->
      let pos = upd pos lexbuf in
      let (pos, u, h, p) = iauthority_with_user pos lexbuf in
      let (pos, path) = ipath_abempty pos lexbuf in
      (pos, u, Some h, p, path)
  | '/', Plus(ipchar) -> ipath_absolute pos lexbuf
  | iuserinfo, '@' ->
      Sedlexing.rollback lexbuf ;
      let (pos, u, h, p) = iauthority_with_user pos lexbuf in
      let (pos, path) = ipath_abempty pos lexbuf in
      (pos, u, Some h, p, path)
  | Plus(ipchar_noat) ->
      (* ipath_rootless *)
      let str = lexeme pos lexbuf in
      let pos = upd pos lexbuf in
      let (pos, path) = isegment_list [str] pos lexbuf in
      (pos, None, None, None, Relative path)
  | '?' | '#' -> (* ipath-empty *)
      Sedlexing.rollback lexbuf ;
      (pos, None, None, None, Relative [])
  | eof -> (* ipath-empty *)
      (pos, None, None, None, Relative [])
  | _ ->
      let pos = upd pos lexbuf in
      error_pos pos


let assert_eof pos lexbuf =
  match%sedlex lexbuf with
    eof -> ()
  | _ -> error_pos pos

let pct_decode_path =
  let f = List.map Iri_types.pct_decode in
  function
    Absolute l -> Absolute (f l)
  | Relative l -> Relative (f l)

let irelative_part pos lexbuf =
  match%sedlex lexbuf with
  | "//" ->
      let pos = upd pos lexbuf in
      let (pos, u, h, p) = iauthority_with_user pos lexbuf in
      let (pos, path) = ipath_abempty pos lexbuf in
      (pos, u, Some h, p, path)
  | '/', Star(ipchar) -> ipath_absolute pos lexbuf
  | Plus(iunreserved|pct_encoded|sub_delims|'@') -> (* ipath-noscheme *)
      let str = lexeme pos lexbuf in
      let pos = upd pos lexbuf in
      let (pos, path) = isegment_list [str] pos lexbuf in
      (pos, None, None, None, Relative path)
  | '?' | '#' -> (* ipath-empty *)
      Sedlexing.rollback lexbuf ;
      (pos, None, None, None, Relative [])
  | eof ->
      let pos = upd pos lexbuf in
      (pos, None, None, None, Relative [])
  | _ ->
      let pos = upd pos lexbuf in
      error_pos pos

let relative_iri pctdecode pos lexbuf =
  let (pos, user, host, port, path) = irelative_part pos lexbuf in
  let (pos, query) = query_opt pos lexbuf in
  let (pos, fragment) = fragment_opt pos lexbuf in
  let user = if pctdecode then Iri_types.map_opt Iri_types.pct_decode user else user in
  let host = if pctdecode then Iri_types.map_opt Iri_types.pct_decode host else host in
  let path = if pctdecode then pct_decode_path path else path in
  let query =
    (* since query will be %-decoded, let's %-encode it *)
    if pctdecode then Iri_types.map_opt Iri_types.pct_encode_query query else query
  in
  let fragment = if pctdecode then Iri_types.map_opt Iri_types.pct_decode fragment else fragment in
  assert_eof pos lexbuf ;
  let iri = Iri_types.iri
    ~scheme: ""  ?user ?host ?port ~path
      ?query ?fragment ()
  in
  iri

let iri ?(pctdecode=true) ?(pos=pos ~line: 1 ~bol: 0 ~char: 1 ()) lexbuf =
  match%sedlex lexbuf with
    alpha, Star(alpha|digit|Chars"+-."), ':' ->
      let str = lexeme pos lexbuf in
      let len = String.length str in
      let scheme = String.sub str 0 (len - 1) in
      let pos = upd pos lexbuf in
      let (pos, user, host, port, path) = ihier_part pos lexbuf in
      let (pos, query) = query_opt pos lexbuf in
      let (pos, fragment) = fragment_opt pos lexbuf in
      let scheme = if pctdecode then Iri_types.pct_decode scheme else scheme in
      let user = if pctdecode then Iri_types.map_opt Iri_types.pct_decode user else user in
      let host = if pctdecode then Iri_types.map_opt Iri_types.pct_decode host else host in
      let path = if pctdecode then pct_decode_path path else path in
      let query =
        (* since query will be %-decoded, let's %-encode it *)
        if pctdecode then Iri_types.map_opt Iri_types.pct_encode_query query else query
      in
      let fragment = if pctdecode then Iri_types.map_opt Iri_types.pct_decode fragment else fragment in
      assert_eof pos lexbuf ;
      let iri = Iri_types.iri
        ~scheme  ?user ?host ?port ~path
          ?query ?fragment ()
      in
      iri
  |  _ ->
      Sedlexing.rollback lexbuf ;
      relative_iri pctdecode pos lexbuf

let rec link acc pos lexbuf =
  match%sedlex lexbuf with
  | Star(wsp),'<',Plus(Compl('>')), '>', Star(wsp), ';', Star(wsp) ->
      let str = lexeme pos lexbuf in
      let p1 = String.index str '<' in
      let p2 = String.index_from str p1 '>' in
      let pos2 = Lexing.{ pos with pos_cnum = pos.pos_cnum + p1 + 1 } in
      let lb =
        try Sedlexing.Utf8.from_string (String.sub str (p1 + 1) (p2 - p1 - 1))
        with Sedlexing.MalFormed ->
          error_pos ~msg: "Malformed character in link" pos
      in
      (* FIXME: pct-decode iri ? *)
      let iri = iri ~pos: pos2 lb in
      let pos = upd pos lexbuf in
      rel acc iri pos lexbuf
  | _ -> List.rev acc

and rel acc iri pos lexbuf =
  match%sedlex lexbuf with
  | "rel=\"",Plus(alpha),'"',Star(wsp),Opt(',') ->
      let str = lexeme pos lexbuf in
      let p1 = String.index str '"' in
      let p2 = String.index_from str (p1+1) '"' in
      let str = String.sub str (p1+1) (p2 - p1 - 1) in
      let pos = upd pos lexbuf in
      link ((str, iri) :: acc) pos lexbuf
  | Star(Compl(',')),Star(wsp),Opt(',') ->
      let pos = upd pos lexbuf in
      link (("", iri)::acc) pos lexbuf
  | _ ->
      let pos = upd pos lexbuf in
      link (("", iri)::acc) pos lexbuf


let http_link ?(pos=pos ~line: 1 ~bol: 0 ~char: 1 ()) lexbuf =
  link [] pos lexbuf