1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
open StdLabels
type t = int
let[@inline] to_int (x : t) : int = x
let[@inline] unsafe_of_int (x : int) : t = x
let empty = 0
type width_method = [ `Unicode | `Wcwidth | `No_zwj ]
type pool = {
mutable storage : bytes;
mutable offsets : int array;
mutable lengths : int array;
mutable capacities : int array;
mutable refcounts : int array;
mutable generations : int array;
mutable free_stack : int array;
mutable free_count : int;
mutable next_id : int;
mutable storage_cursor : int;
segmenter : Uuseg_grapheme_cluster.t;
}
let flag_grapheme = if Sys.word_size = 64 then 1 lsl 62 else 0
let flag_continuation = if Sys.word_size = 64 then 1 lsl 61 else 0
let shift_right_extent = 59
let shift_left_extent = 57
let shift_generation = 18
let mask_generation = 0x7F
let mask_index = 0x3FFFF
let shift_width = 21
let mask_codepoint = 0x1FFFFF
let default_tab_width = 2
let initial_pool_ids = 4096
let initial_pool_bytes = 4096 * 8
let () =
if Sys.word_size <> 64 then
failwith "Glyph: 64-bit OCaml required (63-bit integer packing)"
let[@inline] normalize_tab_width w = if w <= 0 then default_tab_width else w
let[@inline] ascii_width ~tab_width b =
if b = 0x09 then tab_width else if b >= 0x20 && b <= 0x7E then 1 else 0
let[@inline] is_ascii_4 str i =
let c0 = Char.code (String.unsafe_get str i) in
let c1 = Char.code (String.unsafe_get str (i + 1)) in
let c2 = Char.code (String.unsafe_get str (i + 2)) in
let c3 = Char.code (String.unsafe_get str (i + 3)) in
c0 lor c1 lor c2 lor c3 < 128
let rec is_ascii_only_tail str len j =
j >= len
|| Char.code (String.unsafe_get str j) < 128
&& is_ascii_only_tail str len (j + 1)
let rec is_ascii_only str len i =
if i + 4 <= len then is_ascii_4 str i && is_ascii_only str len (i + 4)
else is_ascii_only_tail str len i
let[@inline] is_regional_indicator cp = cp >= 0x1F1E6 && cp <= 0x1F1FF
let[@inline] is_virama cp = cp land 0x7F = 0x4D && cp >= 0x094D && cp <= 0x0D4D
let[@inline] is_devanagari_base cp =
(cp >= 0x0915 && cp <= 0x0939) || (cp >= 0x0958 && cp <= 0x095F)
let[@inline] codepoint_width_wcwidth ~tab_width cp =
if cp < 0x80 then
if cp = 0x09 then tab_width else if cp < 32 || cp = 127 then 0 else 1
else
let w = Unicode.tty_width_hint (Uchar.unsafe_of_int cp) in
if w = -1 then 0 else w
let[@inline] codepoint_width_unicode ~tab_width cp =
if cp < 0x80 then
if cp = 0x09 then tab_width else if cp < 32 || cp = 127 then -1 else 1
else Unicode.tty_width_hint (Uchar.unsafe_of_int cp)
let[@inline] codepoint_width ~method_ ~tab_width cp =
match method_ with
| `Wcwidth -> codepoint_width_wcwidth ~tab_width cp
| `Unicode | `No_zwj -> codepoint_width_unicode ~tab_width cp
let width_flag_has_width = 1
let width_flag_ri_pair = 2
let width_flag_virama = 4
let rec grapheme_width_unicode_loop str limit tab_width i width flags =
if i >= limit then width
else
let d = String.get_utf_8_uchar str i in
let cp = Uchar.to_int (Uchar.utf_decode_uchar d) in
let next = i + Uchar.utf_decode_length d in
let cp_width = codepoint_width_unicode ~tab_width cp in
let has_width = flags land width_flag_has_width <> 0 in
let is_ri_pair = flags land width_flag_ri_pair <> 0 in
let has_virama = flags land width_flag_virama <> 0 in
if cp = 0xFE0F then
let new_width = if has_width && width = 1 then 2 else width in
grapheme_width_unicode_loop str limit tab_width next new_width flags
else if is_virama cp then
grapheme_width_unicode_loop str limit tab_width next width
(flags lor width_flag_virama)
else if is_regional_indicator cp then
if is_ri_pair then
grapheme_width_unicode_loop str limit tab_width next (width + cp_width)
(flags lor width_flag_has_width land lnot width_flag_ri_pair
land lnot width_flag_virama)
else
let new_w = if not has_width then cp_width else width in
grapheme_width_unicode_loop str limit tab_width next new_w
(flags lor width_flag_has_width lor width_flag_ri_pair
land lnot width_flag_virama)
else if has_width && has_virama && is_devanagari_base cp then
let add = if cp <> 0x0930 && cp_width > 0 then cp_width else 0 in
grapheme_width_unicode_loop str limit tab_width next (width + add)
(flags lor width_flag_has_width land lnot width_flag_virama)
else if (not has_width) && cp_width > 0 then
grapheme_width_unicode_loop str limit tab_width next cp_width
(flags lor width_flag_has_width land lnot width_flag_virama)
else
grapheme_width_unicode_loop str limit tab_width next width
(flags land lnot width_flag_virama)
let rec grapheme_width_wcwidth_loop str limit tab_width i acc =
if i >= limit then acc
else
let d = String.get_utf_8_uchar str i in
let cp = Uchar.to_int (Uchar.utf_decode_uchar d) in
let next = i + Uchar.utf_decode_length d in
grapheme_width_wcwidth_loop str limit tab_width next
(acc + codepoint_width_wcwidth ~tab_width cp)
let cluster_width ~method_ ~tab_width str off len =
let limit = off + len in
match method_ with
| `Wcwidth -> grapheme_width_wcwidth_loop str limit tab_width off 0
| `Unicode | `No_zwj ->
grapheme_width_unicode_loop str limit tab_width off 0 0
let rec find_boundary_loop seg str limit pos =
if pos >= limit then limit
else
let d = String.get_utf_8_uchar str pos in
let u = Uchar.utf_decode_uchar d in
if Uuseg_grapheme_cluster.check_boundary seg u then pos
else find_boundary_loop seg str limit (pos + Uchar.utf_decode_length d)
let next_boundary seg ~ignore_zwj str start limit =
if start >= limit then limit
else (
Uuseg_grapheme_cluster.reset seg;
Uuseg_grapheme_cluster.set_ignore_zwj seg ignore_zwj;
let d = String.get_utf_8_uchar str start in
let u = Uchar.utf_decode_uchar d in
let _ = Uuseg_grapheme_cluster.check_boundary seg u in
find_boundary_loop seg str limit (start + Uchar.utf_decode_length d))
let[@inline] clamp_extent v = if v < 0 then 0 else if v > 3 then 3 else v
let[@inline] pack_start idx gen width =
let w = if width < 1 then 1 else width in
let right = if w > 4 then 3 else w - 1 in
flag_grapheme
lor (right lsl shift_right_extent)
lor (gen lsl shift_generation) lor (idx land mask_index)
let[@inline] pack_continuation ~idx ~gen ~left ~right =
flag_grapheme lor flag_continuation
lor (clamp_extent left lsl shift_left_extent)
lor (clamp_extent right lsl shift_right_extent)
lor (gen lsl shift_generation) lor (idx land mask_index)
let[@inline] pack_simple cp w = (w lsl shift_width) lor cp
let[@inline] is_inline c = c land flag_grapheme = 0
let[@inline] is_complex c = c land flag_grapheme <> 0
let[@inline] is_start c = is_inline c || c land flag_continuation = 0
let[@inline] is_continuation c =
(not (is_inline c)) && c land flag_continuation <> 0
let[@inline] is_empty c = c = 0
let[@inline] right_extent c = (c lsr shift_right_extent) land 3
let[@inline] left_extent c = (c lsr shift_left_extent) land 3
let[@inline] codepoint c = c land mask_codepoint
let[@inline] pool_payload c = c land 0x01FFFFFF
let[@inline] pool_index c = c land mask_index
let[@inline] unpack_idx c = c land mask_index
let[@inline] unpack_gen c = (c lsr shift_generation) land mask_generation
let[@inline] validate_complex pool c =
let idx = unpack_idx c in
let gen = unpack_gen c in
if
idx > 0 && idx < pool.next_id
&& Array.unsafe_get pool.generations idx = gen
&& Array.unsafe_get pool.refcounts idx >= 0
then idx
else -1
let space = pack_simple 0x20 1
let[@inline] grapheme_width ?(tab_width = default_tab_width) c =
let tab_width = normalize_tab_width tab_width in
if is_empty c then 0
else if is_inline c then
let cp = c land mask_codepoint in
if cp = 0x09 then tab_width else (c lsr shift_width) land 3
else
let l = left_extent c in
let r = right_extent c in
if is_continuation c then l + 1 + r else if l <> 0 then 0 else r + 1
let[@inline] pool_key c =
if is_inline c then None
else
let idx = pool_index c in
if idx = 0 then None else Some (pool_payload c)
let[@inline] cell_width c =
if c = 0 then 0
else if is_inline c then
let w = (c lsr shift_width) land 3 in
if w = 0 then 1 else w
else if is_continuation c then 0
else right_extent c + 1
let make_continuation ~code ~left ~right =
let payload = if is_inline code then 0 else pool_payload code in
let l_enc = clamp_extent left in
let r_enc = clamp_extent right in
flag_grapheme lor flag_continuation lor payload
lor (l_enc lsl shift_left_extent)
lor (r_enc lsl shift_right_extent)
let of_uchar uchar =
let u = Uchar.to_int uchar in
let tab_width = default_tab_width in
if u < 128 then
let w = ascii_width ~tab_width u in
if w <= 0 then 0 else if u = 0x09 then pack_simple u 0 else pack_simple u w
else
let w = codepoint_width ~method_:`Unicode ~tab_width u in
if w <= 0 then 0 else pack_simple u w
module Pool = struct
type t = pool
let create () =
{
storage = Bytes.create initial_pool_bytes;
offsets = Array.make initial_pool_ids 0;
lengths = Array.make initial_pool_ids 0;
capacities = Array.make initial_pool_ids 0;
refcounts = Array.make initial_pool_ids 0;
generations = Array.make initial_pool_ids 0;
free_stack = Array.make initial_pool_ids 0;
free_count = 0;
next_id = 1;
storage_cursor = 0;
segmenter = Uuseg_grapheme_cluster.create ();
}
let clear pool =
let used = pool.next_id in
pool.next_id <- 1;
pool.storage_cursor <- 0;
pool.free_count <- 0;
Array.fill pool.lengths ~pos:0 ~len:used 0;
Array.fill pool.capacities ~pos:0 ~len:used 0;
Array.fill pool.generations ~pos:0 ~len:used 0
let ensure_id_capacity pool =
let cap = Array.length pool.offsets in
if pool.next_id >= cap then (
let new_cap = cap * 2 in
if new_cap > mask_index + 1 then failwith "Glyph pool ID exhaustion";
let resize arr def =
let new_arr = Array.make new_cap def in
Array.blit ~src:arr ~src_pos:0 ~dst:new_arr ~dst_pos:0 ~len:cap;
new_arr
in
pool.offsets <- resize pool.offsets 0;
pool.lengths <- resize pool.lengths 0;
pool.capacities <- resize pool.capacities 0;
pool.refcounts <- resize pool.refcounts 0;
pool.generations <- resize pool.generations 0;
pool.free_stack <- resize pool.free_stack 0)
let ensure_storage_capacity pool needed =
let cap = Bytes.length pool.storage in
if pool.storage_cursor + needed > cap then (
let new_cap = max (cap * 2) (pool.storage_cursor + needed) in
let new_bytes = Bytes.create new_cap in
Bytes.blit ~src:pool.storage ~src_pos:0 ~dst:new_bytes ~dst_pos:0
~len:pool.storage_cursor;
pool.storage <- new_bytes)
let[@inline] next_free_id pool =
if pool.free_count > 0 then (
let i = pool.free_count - 1 in
pool.free_count <- i;
let id = Array.unsafe_get pool.free_stack i in
let g = (Array.unsafe_get pool.generations id + 1) land mask_generation in
Array.unsafe_set pool.generations id g;
id)
else
let id = pool.next_id in
pool.next_id <- id + 1;
Array.unsafe_set pool.generations id 0;
id
let[@inline] push_free pool idx =
Array.unsafe_set pool.free_stack pool.free_count idx;
pool.free_count <- pool.free_count + 1
let alloc_string pool str off len =
ensure_id_capacity pool;
let id = next_free_id pool in
let cap = Array.unsafe_get pool.capacities id in
let cursor =
if cap >= len then Array.unsafe_get pool.offsets id
else (
ensure_storage_capacity pool len;
let cur = pool.storage_cursor in
pool.storage_cursor <- cur + len;
Array.unsafe_set pool.capacities id len;
cur)
in
Bytes.blit_string ~src:str ~src_pos:off ~dst:pool.storage ~dst_pos:cursor
~len;
Array.unsafe_set pool.offsets id cursor;
Array.unsafe_set pool.lengths id len;
Array.unsafe_set pool.refcounts id 0;
id
let incref pool c =
if is_inline c then ()
else
let idx = validate_complex pool c in
if idx >= 0 then
Array.unsafe_set pool.refcounts idx
(Array.unsafe_get pool.refcounts idx + 1)
let decref pool c =
if is_inline c then ()
else
let idx = validate_complex pool c in
if idx < 0 then ()
else
let rc = Array.unsafe_get pool.refcounts idx in
if rc < 0 then ()
else
let rc' = rc - 1 in
if rc' > 0 then Array.unsafe_set pool.refcounts idx rc'
else (
Array.unsafe_set pool.refcounts idx (-1);
push_free pool idx)
let rec ascii_width_loop_tail str limit tab_width i acc =
if i >= limit then acc
else
let b = Char.code (String.unsafe_get str i) in
if b >= 128 then -1
else
ascii_width_loop_tail str limit tab_width (i + 1)
(acc + ascii_width ~tab_width b)
let rec ascii_width_loop str limit tab_width i acc =
if i + 4 <= limit then
if not (is_ascii_4 str i) then -1
else
let w0 = ascii_width ~tab_width (Char.code (String.unsafe_get str i)) in
let w1 =
ascii_width ~tab_width (Char.code (String.unsafe_get str (i + 1)))
in
let w2 =
ascii_width ~tab_width (Char.code (String.unsafe_get str (i + 2)))
in
let w3 =
ascii_width ~tab_width (Char.code (String.unsafe_get str (i + 3)))
in
ascii_width_loop str limit tab_width (i + 4) (acc + w0 + w1 + w2 + w3)
else ascii_width_loop_tail str limit tab_width i acc
let intern_core pool method_ tab_width precomputed_width off len str =
if len = 0 then 0
else if len = 1 then
let b = Char.code (String.unsafe_get str off) in
if b < 0x80 then
let w =
match precomputed_width with
| Some w -> w
| None -> ascii_width ~tab_width b
in
if w <= 0 then 0
else if b = 0x09 then pack_simple b 0
else pack_simple b w
else
let d = String.get_utf_8_uchar str off in
let cp = Uchar.to_int (Uchar.utf_decode_uchar d) in
let w =
match precomputed_width with
| Some w -> w
| None -> codepoint_width ~method_ ~tab_width cp
in
if w <= 0 then 0 else pack_simple cp w
else
let d = String.get_utf_8_uchar str off in
let cp_len = Uchar.utf_decode_length d in
if cp_len = len then
let cp = Uchar.to_int (Uchar.utf_decode_uchar d) in
let w =
match precomputed_width with
| Some w -> w
| None -> codepoint_width ~method_ ~tab_width cp
in
if w <= 0 then 0 else pack_simple cp w
else
let w =
match precomputed_width with
| Some w -> w
| None ->
let first_b = Char.code (String.unsafe_get str off) in
if first_b >= 128 then
cluster_width ~method_ ~tab_width str off len
else
let ascii_w =
ascii_width_loop str (off + len) tab_width off 0
in
if ascii_w >= 0 then ascii_w
else cluster_width ~method_ ~tab_width str off len
in
if w <= 0 then 0
else
let idx = alloc_string pool str off len in
pack_start idx (Array.unsafe_get pool.generations idx) w
let intern pool ?(width_method = `Unicode) ?(tab_width = default_tab_width)
str =
let tab_width = normalize_tab_width tab_width in
intern_core pool width_method tab_width None 0 (String.length str) str
let intern_sub pool ~width_method ~tab_width str ~pos ~len ~width =
let tab_width = normalize_tab_width tab_width in
intern_core pool width_method tab_width (Some width) pos len str
let encode pool ~width_method ~tab_width f str =
let tab_width = normalize_tab_width tab_width in
let len = String.length str in
if is_ascii_only str len 0 then
for i = 0 to len - 1 do
let b = Char.code (String.unsafe_get str i) in
if b = 0x09 then f (pack_simple 0x09 0)
else if b >= 0x20 && b <= 0x7E then f (pack_simple b 1)
done
else
let ignore_zwj = width_method = `No_zwj in
let seg = pool.segmenter in
Uuseg_grapheme_cluster.reset seg;
Uuseg_grapheme_cluster.set_ignore_zwj seg ignore_zwj;
let emit_complex ~off ~clus_len ~width =
let idx = alloc_string pool str off clus_len in
let gen = Array.unsafe_get pool.generations idx in
f (pack_start idx gen width);
if width > 1 then
let max_span = min 4 width - 1 in
for k = 1 to max_span do
f (pack_continuation ~idx ~gen ~left:k ~right:(max_span - k))
done
in
let emit_ascii b =
if b = 0x09 then f (pack_simple 0x09 0)
else if b >= 0x20 && b <= 0x7E then f (pack_simple b 1)
in
let rec loop i =
if i >= len then ()
else if i + 4 <= len && is_ascii_4 str i then (
emit_ascii (Char.code (String.unsafe_get str i));
emit_ascii (Char.code (String.unsafe_get str (i + 1)));
emit_ascii (Char.code (String.unsafe_get str (i + 2)));
emit_ascii (Char.code (String.unsafe_get str (i + 3)));
loop (i + 4))
else
let c = String.unsafe_get str i in
if Char.code c < 128 then (
emit_ascii (Char.code c);
loop (i + 1))
else
let end_pos = next_boundary seg ~ignore_zwj str i len in
let clus_len = end_pos - i in
let w =
cluster_width ~method_:width_method ~tab_width str i clus_len
in
(if w > 0 then
let d = String.get_utf_8_uchar str i in
if Uchar.utf_decode_length d = clus_len then (
let cp = Uchar.to_int (Uchar.utf_decode_uchar d) in
f (pack_simple cp w);
if w > 1 then
let max_span = min 4 w - 1 in
for k = 1 to max_span do
f
(pack_continuation ~idx:0 ~gen:0 ~left:k
~right:(max_span - k))
done)
else emit_complex ~off:i ~clus_len ~width:w);
loop end_pos
in
loop 0
let length pool c =
if is_inline c then
Uchar.utf_8_byte_length (Uchar.unsafe_of_int (c land mask_codepoint))
else
let idx = validate_complex pool c in
if idx < 0 then 0 else Array.unsafe_get pool.lengths idx
let blit pool c buf ~pos =
if is_inline c then
let u = Uchar.unsafe_of_int (c land mask_codepoint) in
let len = Uchar.utf_8_byte_length u in
if len > Bytes.length buf - pos then 0
else Bytes.set_utf_8_uchar buf pos u
else
let idx = validate_complex pool c in
if idx < 0 then 0
else
let len = Array.unsafe_get pool.lengths idx in
if len > Bytes.length buf - pos then 0
else
let src_off = Array.unsafe_get pool.offsets idx in
Bytes.blit ~src:pool.storage ~src_pos:src_off ~dst:buf ~dst_pos:pos
~len;
len
let copy ~src c ~dst =
if is_inline c then c
else
let idx = validate_complex src c in
if idx < 0 then 0
else
let len = Array.unsafe_get src.lengths idx in
let src_off = Array.unsafe_get src.offsets idx in
if src_off + len > Bytes.length src.storage then 0
else (
ensure_id_capacity dst;
let dst_id = next_free_id dst in
let cap = Array.unsafe_get dst.capacities dst_id in
let cursor =
if cap >= len then Array.unsafe_get dst.offsets dst_id
else (
ensure_storage_capacity dst len;
let cur = dst.storage_cursor in
dst.storage_cursor <- cur + len;
Array.unsafe_set dst.capacities dst_id len;
cur)
in
Bytes.blit ~src:src.storage ~src_pos:src_off ~dst:dst.storage
~dst_pos:cursor ~len;
Array.unsafe_set dst.offsets dst_id cursor;
Array.unsafe_set dst.lengths dst_id len;
Array.unsafe_set dst.refcounts dst_id 0;
let dst_gen = Array.unsafe_get dst.generations dst_id in
if is_continuation c then
pack_continuation ~idx:dst_id ~gen:dst_gen ~left:(left_extent c)
~right:(right_extent c)
else pack_start dst_id dst_gen (grapheme_width c))
let to_string pool c =
if is_inline c then (
let u = Uchar.unsafe_of_int (c land mask_codepoint) in
let len = Uchar.utf_8_byte_length u in
let buf = Bytes.create len in
ignore (Bytes.set_utf_8_uchar buf 0 u);
Bytes.unsafe_to_string buf)
else
let idx = validate_complex pool c in
if idx < 0 then ""
else
let len = Array.unsafe_get pool.lengths idx in
let off = Array.unsafe_get pool.offsets idx in
Bytes.sub_string pool.storage ~pos:off ~len
end
type line_break_kind = [ `LF | `CR | `CRLF ]
module String = struct
let rec iter_graphemes_ascii str len f i =
if i >= len then ()
else if
Char.code (Stdlib.String.unsafe_get str i) = 0x0D
&& i + 1 < len
&& Stdlib.String.unsafe_get str (i + 1) = '\n'
then (
f ~offset:i ~len:2;
iter_graphemes_ascii str len f (i + 2))
else (
f ~offset:i ~len:1;
iter_graphemes_ascii str len f (i + 1))
let rec iter_graphemes_unicode seg str len f i start =
if i >= len then (if start < len then f ~offset:start ~len:(len - start))
else
let d = Stdlib.String.get_utf_8_uchar str i in
let u = Uchar.utf_decode_uchar d in
let next = i + Uchar.utf_decode_length d in
if Uuseg_grapheme_cluster.check_boundary seg u then (
f ~offset:start ~len:(i - start);
iter_graphemes_unicode seg str len f next i)
else iter_graphemes_unicode seg str len f next start
let iter_graphemes ?(ignore_zwj = false) f str =
let len = Stdlib.String.length str in
if len = 0 then ()
else if is_ascii_only str len 0 then iter_graphemes_ascii str len f 0
else
let seg = Uuseg_grapheme_cluster.create ~ignore_zwj () in
let d = Stdlib.String.get_utf_8_uchar str 0 in
let _ =
Uuseg_grapheme_cluster.check_boundary seg (Uchar.utf_decode_uchar d)
in
iter_graphemes_unicode seg str len f (Uchar.utf_decode_length d) 0
let iter_grapheme_info ~width_method ~tab_width f str =
let tab_width = normalize_tab_width tab_width in
let len = Stdlib.String.length str in
if len = 0 then ()
else
let seg = Uuseg_grapheme_cluster.create () in
let ignore_zwj = width_method = `No_zwj in
let emit_ascii i =
let b = Char.code (Stdlib.String.unsafe_get str i) in
let w = ascii_width ~tab_width b in
if w > 0 then f ~offset:i ~len:1 ~width:w
in
let rec loop i =
if i >= len then ()
else if i + 4 <= len && is_ascii_4 str i then (
emit_ascii i;
emit_ascii (i + 1);
emit_ascii (i + 2);
emit_ascii (i + 3);
loop (i + 4))
else
let c = Stdlib.String.unsafe_get str i in
if Char.code c < 128 then (
emit_ascii i;
loop (i + 1))
else
let end_pos = next_boundary seg ~ignore_zwj str i len in
let clus_len = end_pos - i in
let w =
cluster_width ~method_:width_method ~tab_width str i clus_len
in
if w > 0 then (
f ~offset:i ~len:clus_len ~width:w;
loop end_pos)
else loop end_pos
in
loop 0
let rec measure_ascii_tail str len tab_width i total =
if i >= len then total
else
let w =
ascii_width ~tab_width (Char.code (Stdlib.String.unsafe_get str i))
in
measure_ascii_tail str len tab_width (i + 1) (total + w)
let rec measure_ascii str len tab_width i total =
if i + 4 <= len && is_ascii_4 str i then
let w0 =
ascii_width ~tab_width (Char.code (Stdlib.String.unsafe_get str i))
in
let w1 =
ascii_width ~tab_width
(Char.code (Stdlib.String.unsafe_get str (i + 1)))
in
let w2 =
ascii_width ~tab_width
(Char.code (Stdlib.String.unsafe_get str (i + 2)))
in
let w3 =
ascii_width ~tab_width
(Char.code (Stdlib.String.unsafe_get str (i + 3)))
in
measure_ascii str len tab_width (i + 4) (total + w0 + w1 + w2 + w3)
else measure_ascii_tail str len tab_width i total
let rec measure_wcwidth str len tab_width i total =
if i >= len then total
else
let d = Stdlib.String.get_utf_8_uchar str i in
let cp = Uchar.to_int (Uchar.utf_decode_uchar d) in
let w = codepoint_width_wcwidth ~tab_width cp in
measure_wcwidth str len tab_width
(i + Uchar.utf_decode_length d)
(total + w)
let ms_has_width = 1
let ms_ri_pair = 2
let ms_virama = 4
let rec measure_segmented seg str len tab_width i total g_w flags =
if i >= len then if flags land ms_has_width <> 0 then total + g_w else total
else
let d = Stdlib.String.get_utf_8_uchar str i in
let cp = Uchar.to_int (Uchar.utf_decode_uchar d) in
let next = i + Uchar.utf_decode_length d in
let bw =
Uuseg_grapheme_cluster.check_boundary_with_width seg
(Uchar.unsafe_of_int cp)
in
let cp_w = if cp = 0x09 then tab_width else (bw land 3) - 1 in
if bw land 4 <> 0 then
let new_total =
if flags land ms_has_width <> 0 then total + g_w else total
in
if cp = 0xFE0F then
measure_segmented seg str len tab_width next new_total 0 0
else if is_virama cp then
measure_segmented seg str len tab_width next new_total 0 ms_virama
else if is_regional_indicator cp then
measure_segmented seg str len tab_width next new_total cp_w
(ms_has_width lor ms_ri_pair)
else if cp_w > 0 then
measure_segmented seg str len tab_width next new_total cp_w
ms_has_width
else measure_segmented seg str len tab_width next new_total 0 0
else if cp = 0xFE0F then
let new_w =
if flags land ms_has_width <> 0 && g_w = 1 then 2 else g_w
in
measure_segmented seg str len tab_width next total new_w flags
else if is_virama cp then
measure_segmented seg str len tab_width next total g_w
(flags lor ms_virama)
else if is_regional_indicator cp then
if flags land ms_ri_pair <> 0 then
measure_segmented seg str len tab_width next total (g_w + cp_w)
(ms_has_width land lnot ms_virama)
else
let new_w = if flags land ms_has_width = 0 then cp_w else g_w in
measure_segmented seg str len tab_width next total new_w
(flags lor ms_has_width lor ms_ri_pair land lnot ms_virama)
else if
flags land ms_has_width <> 0
&& flags land ms_virama <> 0
&& is_devanagari_base cp
then
let add = if cp <> 0x0930 && cp_w > 0 then cp_w else 0 in
measure_segmented seg str len tab_width next total (g_w + add)
(flags lor ms_has_width land lnot ms_virama)
else if flags land ms_has_width = 0 && cp_w > 0 then
measure_segmented seg str len tab_width next total cp_w
(flags lor ms_has_width land lnot ms_virama)
else
measure_segmented seg str len tab_width next total g_w
(flags land lnot ms_virama)
let measure ~width_method ~tab_width str =
let tab_width = normalize_tab_width tab_width in
let len = Stdlib.String.length str in
if len = 0 then 0
else if is_ascii_only str len 0 then measure_ascii str len tab_width 0 0
else
match width_method with
| `Wcwidth -> measure_wcwidth str len tab_width 0 0
| `Unicode | `No_zwj ->
let seg = Uuseg_grapheme_cluster.create () in
Uuseg_grapheme_cluster.set_ignore_zwj seg (width_method = `No_zwj);
let d = Stdlib.String.get_utf_8_uchar str 0 in
let cp = Uchar.to_int (Uchar.utf_decode_uchar d) in
let bw =
Uuseg_grapheme_cluster.check_boundary_with_width seg
(Uchar.unsafe_of_int cp)
in
let w = if cp = 0x09 then tab_width else (bw land 3) - 1 in
let init_w = if w > 0 then w else 0 in
let init_flags =
(if w > 0 then ms_has_width else 0)
lor (if is_regional_indicator cp then ms_ri_pair else 0)
lor if is_virama cp then ms_virama else 0
in
measure_segmented seg str len tab_width
(Uchar.utf_decode_length d)
0 init_w init_flags
let measure_sub ~width_method ~tab_width str ~pos ~len:sub_len =
let tab_width = normalize_tab_width tab_width in
let end_pos = pos + sub_len in
if sub_len <= 0 then 0
else if is_ascii_only str end_pos pos then
measure_ascii str end_pos tab_width pos 0
else
match width_method with
| `Wcwidth -> measure_wcwidth str end_pos tab_width pos 0
| `Unicode | `No_zwj ->
let seg = Uuseg_grapheme_cluster.create () in
Uuseg_grapheme_cluster.set_ignore_zwj seg (width_method = `No_zwj);
let d = Stdlib.String.get_utf_8_uchar str pos in
let cp = Uchar.to_int (Uchar.utf_decode_uchar d) in
let bw =
Uuseg_grapheme_cluster.check_boundary_with_width seg
(Uchar.unsafe_of_int cp)
in
let w = if cp = 0x09 then tab_width else (bw land 3) - 1 in
let init_w = if w > 0 then w else 0 in
let init_flags =
(if w > 0 then ms_has_width else 0)
lor (if is_regional_indicator cp then ms_ri_pair else 0)
lor if is_virama cp then ms_virama else 0
in
measure_segmented seg str end_pos tab_width
(pos + Uchar.utf_decode_length d)
0 init_w init_flags
let grapheme_count str =
let n = ref 0 in
iter_graphemes (fun ~offset:_ ~len:_ -> incr n) str;
!n
let[@inline] is_ascii_wrap_break b =
match b with
| 0x20 | 0x09 | 0x2D | 0x2F | 0x5C | 0x2E | 0x2C | 0x3B | 0x3A | 0x21 | 0x3F
| 0x28 | 0x29 | 0x5B | 0x5D | 0x7B | 0x7D ->
true
| _ -> false
let[@inline] is_unicode_wrap_break cp =
match cp with
| 0x00A0 | 0x1680 | 0x202F | 0x205F | 0x3000 | 0x200B | 0x00AD | 0x2010 ->
true
| cp when cp >= 0x2000 && cp <= 0x200A -> true
| _ -> false
let iter_wrap_breaks_core ?(width_method = `Unicode) f s =
let len = Stdlib.String.length s in
let ignore_zwj = width_method = `No_zwj in
let seg = Uuseg_grapheme_cluster.create () in
let rec has_break i limit =
if i >= limit then false
else
let b0 = Char.code (Stdlib.String.unsafe_get s i) in
if b0 < 0x80 then is_ascii_wrap_break b0 || has_break (i + 1) limit
else
let d = Stdlib.String.get_utf_8_uchar s i in
let cp = Uchar.to_int (Uchar.utf_decode_uchar d) in
is_unicode_wrap_break cp
|| has_break (i + Uchar.utf_decode_length d) limit
in
let rec loop byte_off g_off =
if byte_off >= len then ()
else
let next = next_boundary seg ~ignore_zwj s byte_off len in
if has_break byte_off next then
f ~byte_off ~next_off:next ~grapheme_off:g_off;
loop next (g_off + 1)
in
loop 0 0
let iter_wrap_breaks ?(width_method = `Unicode) f s =
iter_wrap_breaks_core ~width_method
(fun ~byte_off ~next_off ~grapheme_off ->
f ~break_byte_offset:byte_off ~next_byte_offset:next_off
~grapheme_offset:grapheme_off)
s
let iter_line_breaks f s =
let len = Stdlib.String.length s in
let rec loop i =
if i < len then
let b = Char.code (Stdlib.String.unsafe_get s i) in
if b = 0x0D then
if
i + 1 < len && Char.code (Stdlib.String.unsafe_get s (i + 1)) = 0x0A
then (
f ~pos:(i + 1) ~kind:`CRLF;
loop (i + 2))
else (
f ~pos:i ~kind:`CR;
loop (i + 1))
else if b = 0x0A then (
f ~pos:i ~kind:`LF;
loop (i + 1))
else loop (i + 1)
in
loop 0
end