1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
type t = {
bc : Unsigned.uint8 Ctypes_static.ptr;
source : string;
flags : int;
mutable lastIndex : int;
}
type result = { captures : string array; input : string; index : int }
let lre_flag_global = 0b01
let lre_flag_ignorecase = 0b10
let lre_flag_multiline = 0b100
let lre_flag_dotall = 0b1000
let lre_flag_unicode = 0b10000
let lre_flag_sticky = 0b100000
let has_flag flags flag = flags land flag != 0
let global regexp = has_flag regexp.flags lre_flag_global
let ignorecase regexp = has_flag regexp.flags lre_flag_ignorecase
let multiline regexp = has_flag regexp.flags lre_flag_multiline
let dotall regexp = has_flag regexp.flags lre_flag_dotall
let sticky regexp = has_flag regexp.flags lre_flag_sticky
let unicode regexp = has_flag regexp.flags lre_flag_unicode
let parse_flags flags =
let rec parse_flags' flags acc =
match flags with
| [] -> acc
| 'g' :: rest -> parse_flags' rest (acc lor lre_flag_global)
| 'i' :: rest -> parse_flags' rest (acc lor lre_flag_ignorecase)
| 'm' :: rest -> parse_flags' rest (acc lor lre_flag_multiline)
| 's' :: rest -> parse_flags' rest (acc lor lre_flag_dotall)
| 'u' :: rest -> parse_flags' rest (acc lor lre_flag_unicode)
| 'y' :: rest -> parse_flags' rest (acc lor lre_flag_sticky)
| _ :: rest -> parse_flags' rest acc
in
parse_flags' (String.to_seq flags |> List.of_seq) 0
let flags_to_string flags =
let rec flags_to_string' flags acc =
match flags with
| 0 -> acc
| _ when has_flag flags lre_flag_global ->
flags_to_string' (flags land lre_flag_global lxor flags) (acc ^ "g")
| _ when has_flag flags lre_flag_ignorecase ->
flags_to_string' (flags land lre_flag_ignorecase lxor flags) (acc ^ "i")
| _ when has_flag flags lre_flag_multiline ->
flags_to_string' (flags land lre_flag_multiline lxor flags) (acc ^ "m")
| _ when has_flag flags lre_flag_dotall ->
flags_to_string' (flags land lre_flag_dotall lxor flags) (acc ^ "s")
| _ when has_flag flags lre_flag_unicode ->
flags_to_string' (flags land lre_flag_unicode lxor flags) (acc ^ "u")
| _ when has_flag flags lre_flag_sticky ->
flags_to_string' (flags land lre_flag_sticky lxor flags) (acc ^ "y")
| _ -> acc
in
flags_to_string' flags ""
let compile re flags =
let compiled_byte_code_len = Ctypes.allocate Ctypes.int 0 in
let size_of_error_msg = 64 in
let error_msg = Ctypes.allocate_n ~count:size_of_error_msg Ctypes.char in
let input = Ctypes.ocaml_string_start re in
let input_length = String.length re |> Unsigned.Size_t.of_int in
let flags = parse_flags flags in
let compiled_byte_code =
Bindings.C.Functions.lre_compile compiled_byte_code_len error_msg
size_of_error_msg input input_length flags Ctypes.null
in
match Ctypes.is_null compiled_byte_code with
| false -> { bc = compiled_byte_code; flags; lastIndex = 0; source = re }
| true ->
let error = Ctypes.string_from_ptr ~length:64 error_msg in
raise (Invalid_argument (Printf.sprintf "Compilation failed %s" error))
let index result = result.index
let lastIndex regexp = regexp.lastIndex
let source regexp = regexp.source
let input result = result.input
let setLastIndex regexp lastIndex = regexp.lastIndex <- lastIndex
let captures regexp = regexp.captures
let flags regexp = flags_to_string regexp.flags
let exec regexp input =
let capture_count = Bindings.C.Functions.lre_get_capture_count regexp.bc in
let capture_size = capture_count * 2 in
let capture = Ctypes.CArray.make (Ctypes.ptr Ctypes.uint8_t) capture_size in
let start_capture = Ctypes.CArray.start capture in
let matching_length = String.length input in
let _matching = Ctypes.ocaml_string_start input in
let bufp =
Ctypes.CArray.of_list Ctypes.char (input |> String.to_seq |> List.of_seq)
in
let buffer =
Ctypes.coerce (Ctypes.ptr Ctypes.char)
(Ctypes.ptr Ctypes.uint8_t)
(Ctypes.CArray.start bufp)
in
let lastIndex =
match global regexp || sticky regexp with
| true -> regexp.lastIndex
| false -> 0
in
let shift = 0 in
let exec_result =
Bindings.C.Functions.lre_exec start_capture regexp.bc buffer lastIndex
matching_length shift Ctypes.null
in
match exec_result with
| 1 ->
let substrings = Array.make capture_count "" in
let i = ref 0 in
let index = ref 0 in
while !i < capture_size - 1 do
let start_ptr = Ctypes.CArray.get capture !i in
let end_ptr = Ctypes.CArray.get capture (!i + 1) in
let start_index = Ctypes.ptr_diff buffer start_ptr in
let length = Ctypes.ptr_diff start_ptr end_ptr in
index := start_index;
let substring =
match String.sub input start_index length with
| sub -> sub
| exception _ -> ""
in
substrings.(!i / 2) <- substring;
regexp.lastIndex <- start_index + length;
i := !i + 2
done;
{ captures = substrings; input; index = !index }
| 0 ->
(match sticky regexp || global regexp with
| true -> regexp.lastIndex <- 0
| false -> ());
{ captures = [||]; input; index = 0 }
| _ -> raise (Invalid_argument "Error")
let test regexp input =
let result = exec regexp input in
Array.length result.captures > 0