1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
open Import
let rec iter n f v = if Int.equal n 0 then v else iter (n - 1) f (f v)
module Idx : sig
type t [@@immediate]
val unknown : t
val make_break : Automata.Idx.t -> t
val of_idx : Automata.Idx.t -> t
val is_idx : t -> bool
val is_break : t -> bool
val idx : t -> int
val break_idx : t -> int
end = struct
type t = int
let unknown = -2
let break = -3
let of_idx (x : Automata.Idx.t) = Automata.Idx.to_int x [@@inline always]
let is_idx t = t >= 0 [@@inline always]
let is_break x = x <= break [@@inline always]
let idx t = t [@@inline always]
let make_break (idx : Automata.Idx.t) = -5 - Automata.Idx.to_int idx [@@inline always]
let break_idx t = (t + 5) * -1 [@@inline always]
end
type match_info =
| Match of Group.t
| Failed
| Running of { no_match_starts_before : int }
type state_info =
{ idx : Idx.t
;
mutable final : (Category.t * (Automata.Idx.t * Automata.Status.t)) list
;
desc : Automata.State.t
}
module State : sig
type t
val make : ncol:int -> state_info -> t
val make_break : state_info -> t
val get_info : t -> state_info
val follow_transition : t -> color:Cset.c -> t
val set_transition : t -> color:Cset.c -> t -> unit
end = struct
type t = Table of t array [@@unboxed]
let get_info (Table st) : state_info = Obj.magic (Array.unsafe_get st 0)
[@@inline always]
;;
let set_info (Table st) (info : state_info) = st.(0) <- Obj.magic info
let follow_transition (Table st) ~color = Array.unsafe_get st (1 + Cset.to_int color)
[@@inline always]
;;
let set_transition (Table st) ~color st' = st.(1 + Cset.to_int color) <- st'
let dummy (info : state_info) = Table [| Obj.magic info |]
let unknown_state = dummy { idx = Idx.unknown; final = []; desc = Automata.State.dummy }
let make ~ncol state =
let st = Table (Array.make (ncol + 1) unknown_state) in
set_info st state;
st
;;
let make_break state = Table [| Obj.magic state |]
end
type re =
{ initial : Automata.expr
;
mutable initial_states : (Category.t * State.t) list
;
colors : Color_map.Table.t
;
color_repr : Color_map.Repr.t
;
ncolor : int
;
lnl : Cset.c
;
tbl : Automata.Working_area.t
;
states : State.t Automata.State.Table.t
;
group_names : (string * int) list
;
group_count : int
}
let pp_re ch re = Automata.pp ch re.initial
let group_count re = re.group_count
let group_names re = re.group_names
module Positions = struct
type t =
{ mutable positions : int array
}
let empty = { positions = [||] }
let length t = Array.length t.positions
let set t idx pos = Array.unsafe_set t.positions idx pos
let resize t =
let len = Array.length t.positions in
let pos = t.positions in
t.positions <- Array.make (2 * len) 0;
Array.blit pos 0 t.positions 0 len
;;
let all t = t.positions
let first t = t.positions.(0)
let make ~groups re =
if groups
then
{ positions =
(let n = Automata.Working_area.index_count re.tbl + 1 in
if n <= 10 then [| 0; 0; 0; 0; 0; 0; 0; 0; 0; 0 |] else Array.make n 0)
}
else empty
;;
end
let category re ~color =
if Cset.equal_c color Cset.null_char
then Category.inexistant
else if Cset.equal_c color re.lnl
then Category.(lastnewline ++ newline ++ not_letter)
else Category.from_char (Color_map.Repr.repr re.color_repr color)
;;
let find_state re desc =
try Automata.State.Table.find re.states desc with
| Not_found ->
let st =
let break_state =
match Automata.State.status desc with
| Running -> false
| Failed | Match _ -> true
in
let st =
{ idx =
(let idx = Automata.State.idx desc in
if break_state then Idx.make_break idx else Idx.of_idx idx)
; final = []
; desc
}
in
if break_state then State.make_break st else State.make ~ncol:re.ncolor st
in
Automata.State.Table.add re.states desc st;
st
;;
let delta re positions cat ~color st =
let desc = Automata.delta re.tbl cat color st.desc in
let len = Positions.length positions in
if len > 0 && Automata.State.idx desc |> Automata.Idx.to_int = len
then Positions.resize positions;
desc
;;
let validate re positions (s : string) ~pos st =
let color = Color_map.Table.get re.colors s.[pos] in
let st' =
let desc' =
let cat = category re ~color in
delta re positions cat ~color (State.get_info st)
in
find_state re desc'
in
State.set_transition st ~color st'
;;
let next colors st s pos =
State.follow_transition st ~color:(Color_map.Table.get colors (String.unsafe_get s pos))
;;
let rec loop re ~colors ~positions s ~pos ~last st0 st =
if pos < last
then (
let st' = next colors st s pos in
let idx = (State.get_info st').idx in
if Idx.is_idx idx
then (
Positions.set positions (Idx.idx idx) pos;
loop re ~colors ~positions s ~pos:(pos + 1) ~last st' st')
else if Idx.is_break idx
then (
Positions.set positions (Idx.break_idx idx) pos;
st')
else (
validate re positions s ~pos st0;
loop re ~colors ~positions s ~pos ~last st0 st0))
else st
;;
let rec loop_no_mark re ~colors s ~pos ~last st0 st =
if pos < last
then (
let st' = next colors st s pos in
let idx = (State.get_info st').idx in
if Idx.is_idx idx
then loop_no_mark re ~colors s ~pos:(pos + 1) ~last st' st'
else if Idx.is_break idx
then st'
else (
validate re Positions.empty s ~pos st0;
loop_no_mark re ~colors s ~pos ~last st0 st0))
else st
;;
let final re positions st cat =
try List.assq cat st.final with
| Not_found ->
let st' = delta re positions cat ~color:Cset.null_char st in
let res = Automata.State.idx st', Automata.State.status st' in
st.final <- (cat, res) :: st.final;
res
;;
let find_initial_state re cat =
try List.assq cat re.initial_states with
| Not_found ->
let st = find_state re (Automata.State.create cat re.initial) in
re.initial_states <- (cat, st) :: re.initial_states;
st
;;
let get_color re (s : string) pos =
if pos < 0
then Cset.null_char
else (
let slen = String.length s in
if pos >= slen
then Cset.null_char
else if pos = slen - 1
&& (not (Cset.equal_c re.lnl Cset.null_char))
&& Char.equal (String.unsafe_get s pos) '\n'
then
re.lnl
else Color_map.Table.get re.colors (String.unsafe_get s pos))
;;
let rec handle_last_newline re positions ~pos st ~groups =
let st' = State.follow_transition st ~color:re.lnl in
let info = State.get_info st' in
if Idx.is_idx info.idx
then (
if groups then Positions.set positions (Idx.idx info.idx) pos;
st')
else if Idx.is_break info.idx
then (
if groups then Positions.set positions (Idx.break_idx info.idx) pos;
st')
else (
let color = re.lnl in
let st' =
let desc =
let cat = category re ~color in
let real_c = Color_map.Table.get re.colors '\n' in
delta re positions cat ~color:real_c (State.get_info st)
in
find_state re desc
in
State.set_transition st ~color st';
handle_last_newline re positions ~pos st ~groups)
;;
let rec scan_str re positions (s : string) initial_state ~last ~pos ~groups =
if last = String.length s
&& (not (Cset.equal_c re.lnl Cset.null_char))
&& last > pos
&& Char.equal (String.get s (last - 1)) '\n'
then (
let last = last - 1 in
let st = scan_str re positions ~pos s initial_state ~last ~groups in
if Idx.is_break (State.get_info st).idx
then st
else handle_last_newline re positions ~pos:last st ~groups)
else if groups
then loop re ~colors:re.colors ~positions s ~pos ~last initial_state initial_state
else loop_no_mark re ~colors:re.colors s ~pos ~last initial_state initial_state
;;
let final_boundary_check re positions ~last ~slen s state_info ~groups =
let idx, res =
let final_cat =
Category.(
search_boundary
++ if last = slen then inexistant else category re ~color:(get_color re s last))
in
final re positions state_info final_cat
in
(match groups, res with
| true, Match _ -> Positions.set positions (Automata.Idx.to_int idx) last
| _ -> ());
res
;;
let make_match_str re positions ~len ~groups ~partial s ~pos =
let slen = String.length s in
let last = if len = -1 then slen else pos + len in
let st =
let initial_state =
let initial_cat =
Category.(
search_boundary
++ if pos = 0 then inexistant else category re ~color:(get_color re s (pos - 1)))
in
find_initial_state re initial_cat
in
scan_str re positions s initial_state ~pos ~last ~groups
in
let state_info = State.get_info st in
if Idx.is_break state_info.idx || (partial && not groups)
then Automata.State.status state_info.desc
else if partial && groups
then (
match Automata.State.status state_info.desc with
| (Match _ | Failed) as status -> status
| Running ->
(match final_boundary_check re positions ~last ~slen s state_info ~groups with
| Match _ as status -> status
| Failed | Running ->
Running))
else final_boundary_check re positions ~last ~slen s state_info ~groups
;;
let match_str_no_bounds ~groups ~partial re s ~pos ~len =
let positions = Positions.make ~groups re in
match make_match_str re positions ~len ~groups ~partial s ~pos with
| Match (marks, pmarks) ->
Match
(Group.create s marks pmarks ~gpos:(Positions.all positions) ~gcount:re.group_count)
| Failed -> Failed
| Running ->
let no_match_starts_before = if groups then Positions.first positions else 0 in
Running { no_match_starts_before }
;;
let match_str_p re s ~pos ~len =
if pos < 0 || len < -1 || pos + len > String.length s
then invalid_arg "Re.exec: out of bounds";
match make_match_str re Positions.empty ~len ~groups:false ~partial:false s ~pos with
| Match _ -> true
| _ -> false
;;
let match_str ~groups ~partial re s ~pos ~len =
if pos < 0 || len < -1 || pos + len > String.length s
then invalid_arg "Re.exec: out of bounds";
match_str_no_bounds ~groups ~partial re s ~pos ~len
;;
let mk_re ~initial ~colors ~color_repr ~ncolor ~lnl ~group_names ~group_count =
{ initial
; initial_states = []
; colors
; color_repr
; ncolor
; lnl
; tbl = Automata.Working_area.create ()
; states = Automata.State.Table.create 97
; group_names
; group_count
}
;;
module A = Automata
let enforce_kind ids kind kind' cr =
match kind, kind' with
| `First, `First -> cr
| `First, k -> A.seq ids k cr (A.eps ids)
| _ -> cr
;;
type context =
{ ids : A.Ids.t
; kind : A.Sem.t
; ign_group : bool
; greedy : A.Rep_kind.t
; pos : A.Mark.t ref
; names : (string * int) list ref
; cache : Cset.t Cset.CSetMap.t ref
; colors : Color_map.Table.t
}
let trans_set cache (cm : Color_map.Table.t) s =
match Cset.one_char s with
| Some i -> Cset.csingle (Color_map.Table.get_char cm i)
| None ->
let v = Cset.hash s, s in
(try Cset.CSetMap.find v !cache with
| Not_found ->
let l = Color_map.Table.translate_colors cm s in
cache := Cset.CSetMap.add v l !cache;
l)
;;
let make_repeater ids cr kind greedy =
match greedy with
| `Greedy -> fun rem -> A.alt ids [ A.seq ids kind (A.rename ids cr) rem; A.eps ids ]
| `Non_greedy ->
fun rem -> A.alt ids [ A.eps ids; A.seq ids kind (A.rename ids cr) rem ]
;;
let rec translate
({ ids; kind; ign_group; greedy; pos; names; cache; colors } as ctx)
(ast : Ast.no_case)
=
match ast with
| Set s -> A.cst ids (trans_set cache colors s), kind
| Sequence l -> trans_seq ctx l, kind
| Ast (Alternative l) ->
(match Ast.merge_sequences l with
| [ r' ] ->
let cr, kind' = translate ctx r' in
enforce_kind ids kind kind' cr, kind
| merged_sequences ->
( A.alt
ids
(List.map merged_sequences ~f:(fun r' ->
let cr, kind' = translate ctx r' in
enforce_kind ids kind kind' cr))
, kind ))
| Repeat (r', i, j) ->
let cr, kind' = translate ctx r' in
let rem =
match j with
| None -> A.rep ids greedy kind' cr
| Some j ->
let f = make_repeater ids cr kind' greedy in
iter (j - i) f (A.eps ids)
in
iter i (fun rem -> A.seq ids kind' (A.rename ids cr) rem) rem, kind
| Beg_of_line -> A.after ids Category.(inexistant ++ newline), kind
| End_of_line -> A.before ids Category.(inexistant ++ newline), kind
| Beg_of_word ->
( A.seq
ids
`First
(A.after ids Category.(inexistant ++ not_letter))
(A.before ids Category.letter)
, kind )
| End_of_word ->
( A.seq
ids
`First
(A.after ids Category.letter)
(A.before ids Category.(inexistant ++ not_letter))
, kind )
| Not_bound ->
( A.alt
ids
[ A.seq ids `First (A.after ids Category.letter) (A.before ids Category.letter)
; (let cat = Category.(inexistant ++ not_letter) in
A.seq ids `First (A.after ids cat) (A.before ids cat))
]
, kind )
| Beg_of_str -> A.after ids Category.inexistant, kind
| End_of_str -> A.before ids Category.inexistant, kind
| Last_end_of_line -> A.before ids Category.(inexistant ++ lastnewline), kind
| Start -> A.after ids Category.search_boundary, kind
| Stop -> A.before ids Category.search_boundary, kind
| Sem (kind', r') ->
let cr, kind'' = translate { ctx with kind = kind' } r' in
enforce_kind ids kind' kind'' cr, kind'
| Sem_greedy (greedy', r') -> translate { ctx with greedy = greedy' } r'
| Group (n, r') ->
if ign_group
then translate ctx r'
else (
let p = !pos in
let () =
match n with
| Some name -> names := (name, A.Mark.group_count p) :: !names
| None -> ()
in
pos := A.Mark.next2 !pos;
let cr, kind' = translate ctx r' in
( A.seq ids `First (A.mark ids p) (A.seq ids `First cr (A.mark ids (A.Mark.next p)))
, kind' ))
| No_group r' -> translate { ctx with ign_group = true } r'
| Nest r' ->
let b = !pos in
let cr, kind' = translate ctx r' in
let e = A.Mark.prev !pos in
if A.Mark.compare e b = -1
then cr, kind'
else A.seq ids `First (A.erase ids b e) cr, kind'
| Pmark (i, r') ->
let cr, kind' = translate ctx r' in
A.seq ids `First (A.pmark ids i) cr, kind'
and trans_seq ({ ids; kind; _ } as ctx) = function
| [] -> A.eps ids
| [ r ] ->
let cr', kind' = translate ctx r in
enforce_kind ids kind kind' cr'
| r :: rem ->
let cr', kind' = translate ctx r in
let cr'' = trans_seq ctx rem in
if A.is_eps cr'' then cr' else if A.is_eps cr' then cr'' else A.seq ids kind' cr' cr''
;;
let compile_1 regexp =
let regexp = Ast.handle_case false regexp in
let color_map = Color_map.make () in
let need_lnl = Ast.colorize color_map regexp in
let colors, color_repr = Color_map.flatten color_map in
let ncolor = Color_map.Repr.length color_repr in
let lnl = if need_lnl then Cset.of_int ncolor else Cset.null_char in
let ncolor = if need_lnl then ncolor + 1 else ncolor in
let ctx =
{ ids = A.Ids.create ()
; kind = `First
; ign_group = false
; greedy = `Greedy
; pos = ref A.Mark.start
; names = ref []
; cache = ref Cset.CSetMap.empty
; colors
}
in
let r, kind = translate ctx regexp in
let r = enforce_kind ctx.ids `First kind r in
mk_re
~initial:r
~colors
~color_repr
~ncolor
~lnl
~group_names:(List.rev !(ctx.names))
~group_count:(A.Mark.group_count !(ctx.pos))
;;
let compile r =
let open Ast.Export in
compile_1 (if Ast.anchored r then group r else seq [ shortest (rep any); group r ])
;;