1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
type t = {
bc : Unsigned.uint8 Ctypes_static.ptr;
source : string;
flags : int;
mutable lastIndex : int;
}
type matchResult = {
captures : string array;
input : string;
index : int;
groups : string list; [@warning "-69"]
}
type result = (matchResult, string) Stdlib.result
let lre_flag_global = 0b01
let lre_flag_ignorecase = 0b10
let lre_flag_multiline = 0b100
let lre_flag_dotall = 0b1000
let lre_flag_unicode = 0b10000
let lre_flag_sticky = 0b100000
let has_flag flags flag = flags land flag != 0
let global regexp = has_flag regexp.flags lre_flag_global
let ignorecase regexp = has_flag regexp.flags lre_flag_ignorecase
let multiline regexp = has_flag regexp.flags lre_flag_multiline
let dotall regexp = has_flag regexp.flags lre_flag_dotall
let sticky regexp = has_flag regexp.flags lre_flag_sticky
let unicode regexp = has_flag regexp.flags lre_flag_unicode
let parse_flags flags =
let rec parse_flags' flags acc =
match flags with
| [] -> acc
| 'g' :: rest -> parse_flags' rest (acc lor lre_flag_global)
| 'i' :: rest -> parse_flags' rest (acc lor lre_flag_ignorecase)
| 'm' :: rest -> parse_flags' rest (acc lor lre_flag_multiline)
| 's' :: rest -> parse_flags' rest (acc lor lre_flag_dotall)
| 'u' :: rest -> parse_flags' rest (acc lor lre_flag_unicode)
| 'y' :: rest -> parse_flags' rest (acc lor lre_flag_sticky)
| _ :: rest -> parse_flags' rest acc
in
parse_flags' (String.to_seq flags |> List.of_seq) 0
let flags_to_string flags =
let rec flags_to_string' flags acc =
match flags with
| 0 -> acc
| _ when has_flag flags lre_flag_global ->
flags_to_string' (flags land lre_flag_global lxor flags) (acc ^ "g")
| _ when has_flag flags lre_flag_ignorecase ->
flags_to_string' (flags land lre_flag_ignorecase lxor flags) (acc ^ "i")
| _ when has_flag flags lre_flag_multiline ->
flags_to_string' (flags land lre_flag_multiline lxor flags) (acc ^ "m")
| _ when has_flag flags lre_flag_dotall ->
flags_to_string' (flags land lre_flag_dotall lxor flags) (acc ^ "s")
| _ when has_flag flags lre_flag_unicode ->
flags_to_string' (flags land lre_flag_unicode lxor flags) (acc ^ "u")
| _ when has_flag flags lre_flag_sticky ->
flags_to_string' (flags land lre_flag_sticky lxor flags) (acc ^ "y")
| _ -> acc
in
flags_to_string' flags ""
let strlen ptr =
let rec aux ptr len =
let c = Ctypes.( !@ ) ptr in
if c = char_of_int 0 then len else aux (Ctypes.( +@ ) ptr 1) (len + 1)
in
aux ptr 0
let string_from_ptr ptr = Ctypes.string_from_ptr ~length:(strlen ptr) ptr
let compile ~flags re =
let compiled_byte_code_len = Ctypes.allocate Ctypes.int 0 in
let size_of_error_msg = 64 in
let error_msg = Ctypes.allocate_n ~count:size_of_error_msg Ctypes.char in
let input = Ctypes.ocaml_string_start re in
let input_length = String.length re |> Unsigned.Size_t.of_int in
let flags = parse_flags flags in
let compiled_byte_code =
Bindings.C.Functions.lre_compile compiled_byte_code_len error_msg
size_of_error_msg input input_length flags Ctypes.null
in
match Ctypes.is_null compiled_byte_code with
| false -> Ok { bc = compiled_byte_code; flags; lastIndex = 0; source = re }
| true ->
let length = strlen error_msg in
let error = Ctypes.string_from_ptr ~length error_msg in
Error
( (match error with
| "unexpected end" -> `Unexpected_end
| "malformed unicode char" -> `Malformed_unicode_char
| "nothing to repeat" -> `Nothing_to_repeat
| "invalid escape sequence in regular expression" ->
`Invalid_escape_sequence
| unknown -> `Unknown unknown),
error )
let index result = match result with Ok result -> result.index | Error _ -> 0
let lastIndex regexp = regexp.lastIndex
let source regexp = regexp.source
let input result = match result with Ok result -> result.input | Error _ -> ""
let setLastIndex regexp lastIndex = regexp.lastIndex <- lastIndex
let captures result =
match result with Ok result -> result.captures | Error _ -> [||]
let flags regexp = flags_to_string regexp.flags
let exec regexp input =
let capture_count = Bindings.C.Functions.lre_get_capture_count regexp.bc in
let capture_size = capture_count * 2 in
let capture = Ctypes.CArray.make (Ctypes.ptr Ctypes.uint8_t) capture_size in
let start_capture = Ctypes.CArray.start capture in
let matching_length = String.length input in
let bufp =
Ctypes.CArray.of_list Ctypes.char (input |> String.to_seq |> List.of_seq)
in
let buffer =
Ctypes.coerce (Ctypes.ptr Ctypes.char)
(Ctypes.ptr Ctypes.uint8_t)
(Ctypes.CArray.start bufp)
in
let lastIndex =
match global regexp || sticky regexp with
| true -> regexp.lastIndex
| false -> 0
in
let shift = 0 in
let exec_result =
Bindings.C.Functions.lre_exec start_capture regexp.bc buffer lastIndex
matching_length shift Ctypes.null
in
match exec_result with
| 1 ->
let substrings = Array.make capture_count "" in
let i = ref 0 in
let index = ref 0 in
let groups = ref [] in
let group_name_ptr =
ref (Bindings.C.Functions.lre_get_groupnames regexp.bc)
in
while !i < capture_size - 1 do
let start_ptr = Ctypes.CArray.get capture !i in
let end_ptr = Ctypes.CArray.get capture (!i + 1) in
let start_index = Ctypes.ptr_diff buffer start_ptr in
let length = Ctypes.ptr_diff start_ptr end_ptr in
index := start_index;
let substring =
match String.sub input start_index length with
| sub -> sub
| exception _ -> ""
in
substrings.(!i / 2) <- substring;
regexp.lastIndex <- start_index + length;
(match !group_name_ptr with
| Some pointer when !i > 0 ->
let current_group_name = string_from_ptr pointer in
groups := current_group_name :: !groups;
let next_group_name_ptr =
Ctypes.( +@ ) pointer (strlen pointer + 1)
in
if Ctypes.is_null next_group_name_ptr then group_name_ptr := None
else group_name_ptr := Some next_group_name_ptr
| None | Some _ -> ());
i := !i + 2
done;
Ok { captures = substrings; input; index = !index; groups = !groups }
| 0 ->
(match sticky regexp || global regexp with
| true -> regexp.lastIndex <- 0
| false -> ());
Ok { captures = [||]; input; index = 0; groups = [] }
| _ -> Error "Error"
let test regexp input =
let result = exec regexp input in
match result with
| Ok result -> Array.length result.captures > 0
| Error _ -> false