Source file lang.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
(*********************************************************************************)
(*                OCaml-RDF                                                      *)
(*                                                                               *)
(*    Copyright (C) 2012-2024 Institut National de Recherche en Informatique     *)
(*    et en Automatique. All rights reserved.                                    *)
(*                                                                               *)
(*    This program is free software; you can redistribute it and/or modify       *)
(*    it under the terms of the GNU Lesser General Public License version        *)
(*    3 as published by the Free Software Foundation.                            *)
(*                                                                               *)
(*    This program is distributed in the hope that it will be useful,            *)
(*    but WITHOUT ANY WARRANTY; without even the implied warranty of             *)
(*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *)
(*    GNU General Public License for more details.                               *)
(*                                                                               *)
(*    You should have received a copy of the GNU General Public License          *)
(*    along with this program; if not, write to the Free Software                *)
(*    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA                   *)
(*    02111-1307  USA                                                            *)
(*                                                                               *)
(*    Contact: Maxence.Guesdon@inria.fr                                          *)
(*                                                                               *)
(*********************************************************************************)

(** *)

(* https://www.rfc-editor.org/rfc/rfc5646.html#section-2.1 *)

module L = Sedlexing.Utf8

let digit = [%sedlex.regexp? '0'..'9']
let alpha = [%sedlex.regexp? 'a'..'z' | 'A'..'Z']
let alphanum = [%sedlex.regexp? digit | alpha]

let script = [%sedlex.regexp? alpha,alpha,alpha,alpha]
let region = [%sedlex.regexp? (alpha,alpha) | (digit,digit,digit)]

let singleton = [%sedlex.regexp? digit
  | 0x41 .. 0x57 | 0x59 .. 0x5A | 0x61 .. 0x77 | 0x79 .. 0x7A ]

let privateuse = [%sedlex.regexp? "x", Plus("-",alphanum,alphanum,Opt(alphanum),Opt(alphanum),Opt(alphanum),Opt(alphanum),Opt(alphanum),Opt(alphanum))]

let extension = [%sedlex.regexp?
    singleton, Plus("-", alphanum,alphanum,Opt(alphanum),Opt(alphanum),Opt(alphanum),Opt(alphanum),Opt(alphanum),Opt(alphanum))]

let variant = [%sedlex.regexp?
    (alphanum,alphanum,alphanum,alphanum,alphanum,Opt(alphanum),Opt(alphanum),Opt(alphanum))
  | (digit, alphanum, alphanum, alphanum)
  ]

let extlang = [%sedlex.regexp?
    (alpha,alpha,alpha)
  | (("-",alpha,alpha,alpha),Plus("-",alpha,alpha,alpha))
  ]

let language = [%sedlex.regexp?
    (alpha,alpha,Opt(alpha),Opt("-",extlang))
  | (alpha,alpha,alpha,alpha),Opt(alpha),Opt(alpha),Opt(alpha),Opt(alpha)]

let langtag = [%sedlex.regexp? language,
    Opt("-", script),
    Opt('-', region),
    Star("-", variant),
    Star("-", extension),
    Opt("-", privateuse) ]

let irregular = [%sedlex.regexp?
    "en-GB-oed" | "i-ami" | "i-bnn" | "i-default"
  | "i-enochian" | "i-hak" | "i-klingon" | "i-lux"
  | "i-mingo" | "i-navajo" | "i-pwn" | "i-tao"
  | "i-tay" | "i-tsu" | "sgn-BE-FR" | "sgn-BE-NL"
  | "sgn-CH-DE" ]

let regular = [%sedlex.regexp?
    "art-lojban"
  | "cel-gaulish"
  | "no-bok"
  | "no-nyn"
  | "zh-guoyu"
  | "zh-hakka"
  | "zh-min"
  | "zh-min-nan"
  | "zh-xiang"
  ]

let grandfathered = [%sedlex.regexp? irregular | regular ]

let language_tag = [%sedlex.regexp? (langtag | privateuse | grandfathered), eof ]

let language_tag lexbuf =
  match%sedlex lexbuf with
  | language_tag -> ()
  | _ -> failwith "Invalid language tag"

let is_valid_language_tag str =
  let lexbuf = Sedlexing.Utf8.from_string str in
  try language_tag lexbuf; true
  with Failure _ -> false