[@@@ocamlformat "disable"]
(*---------------------------------------------------------------------------
Copyright (c) 2014 The uuseg programmers. All rights reserved.
SPDX-License-Identifier: ISC
---------------------------------------------------------------------------*)(* Vendored from uuseg v17.0.0 with the following modifications:
- Added [reset] function for segmenter reuse (zero allocation)
- Added [ignore_zwj] option to disable GB11 (emoji ZWJ sequences)
- Added [set_ignore_zwj] to change the option after creation
- Added [check_boundary] for zero-allocation direct boundary checks
- Changed [break] and [update_left] to take pre-computed [is_extpic] flag
*)(* These are the rules as found in [1], with property values aliases [2]
substituted.
GB1. sot ÷ Any
GB2. Any ÷ eot
GB3. CR × LF
GB4. (CN|CR|LF) ÷
GB5. ÷ (CN|CR|LF)
GB6. L × (L|V|LV|LVT)
GB7. (LV|V) × (V|T)
GB8. (LVT|T) × T
GB9. × (EX|ZWJ)
GB9a. × SM
GB9b. PP ×
GB9c. \p{InCB=Consonant} [\p{InCB=Extend}\p{InCB=Linker}]*
\p{InCB=Linker} [\p{InCB=Extend}\p{InCB=Linker}]*
×
\p{InCB=Consonant}
GB11. \p{Extended_Pictographic} EX* ZWJ × \p{Extended_Pictographic}
GB12. sot (RI RI)* RI × RI
GB13. [^RI] (RI RI)* × RI
GB999. Any ÷ Any
[1]: http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
[2]: http://www.unicode.org/Public/7.0.0/ucd/PropertyValueAliases.txt
[3]: http://www.unicode.org/Public/7.0.0/ucd/auxiliary/GraphemeBreakTest.html
By the structure of the rules we see that grapheme clusters
boundaries can *mostly* be determined by simply looking at the
grapheme cluster break property value of the character on the left
and on the right of a boundary. The exceptions are GB9c, GB11 and GB12-13
which are handled specially by enriching the segmenter state in
a horribly ad-hoc fashion. *)

(* Grapheme cluster break values, named with the short aliases used in the
   rule table above. [Sot] stands for start of text: it is the value the
   segmenter stores as its left context on creation and after a reset, it is
   what rule GB1 matches on, and it has no entry in [byte_to_gcb]. *)
type gcb =
  | CN | CR | EX | EB | EBG | EM | GAZ | L | LF | LV | LVT | PP | RI | SM | T
  | V | XX | ZWJ | Sot

(* Indic conjunct break values, consumed by the GB9c matching.
   NOTE(review): the trailing quote on the last constructor presumably avoids
   shadowing the option constructor of the same name - confirm. *)
type incb = Consonant | Extend | Linker | None'

(* WARNING. The indexes used here need to be synchronized with those
in Unicode_data (generated from uucp).
*)

(* Decoding table: position [i] holds the [gcb] value for property code [i].
   [Sot] is not a property value and therefore has no slot here. *)
let byte_to_gcb =
  [| CN; CR; EX; EB; EBG; EM; GAZ; L; LF; LV; LVT; PP; RI; SM; T; V; XX; ZWJ |]

(* [gcb u] decodes the grapheme cluster break code that [Unicode] reports
   for [u]. *)
let gcb u =
  let code = Unicode.grapheme_cluster_break u in
  byte_to_gcb.(code)

(* Decoding table: position [i] holds the [incb] value for property code
   [i]. *)
let byte_to_incb = [| Consonant; Extend; Linker; None' |]

(* [incb u] decodes the indic conjunct break code that [Unicode] reports
   for [u]. *)
let incb u =
  let code = Unicode.indic_conjunct_break u in
  byte_to_incb.(code)

(* Ad-hoc state for matching GB9c. It summarises what has been seen on the
   left of the current position. *)
type left_gb9c_state =
  | Reset          (* no usable consonant prefix *)
  | Has_consonant  (* a consonant, possibly followed by extenders *)
  | Has_linker     (* a consonant followed by at least one linker *)

(* Progress of the push-based [add] protocol. *)
type state =
  | Fill   (* get next uchar to decide boundary. *)
  | Flush  (* an uchar is buffered, client needs to get it out with `Await. *)
  | End    (* `End was added. *)

(* Segmenter. Every field is mutable so that one value can be reused for
   many inputs without allocating. *)
type t =
  { mutable state : state;
    (* current state. *)
    mutable left_gb9c : left_gb9c_state;
    (* state for matching gb9c. *)
    mutable left : gcb;
    (* break property value left of boundary. *)
    mutable left_odd_ri : bool;
    (* odd number of RI on the left. *)
    mutable left_emoji_seq : bool;
    (* emoji seq on the left. *)
    mutable buf : [ `Uchar of Uchar.t ];
    (* bufferized add. *)
    mutable ignore_zwj : bool
    (* if true, disable GB11 rule. *)
  }

(* Placeholder stored in [buf] until a real character gets buffered. *)
let nul_buf = `Uchar (Uchar.unsafe_of_int 0x0000)

(* [create ?ignore_zwj ()] is a fresh segmenter positioned at start of text.
   [ignore_zwj] defaults to [false]; when [true], rule GB11 is disabled. *)
let create ?(ignore_zwj = false) () =
  { state = Fill;
    left_gb9c = Reset;
    left = Sot;
    left_odd_ri = false;
    left_emoji_seq = false;
    buf = nul_buf; (* overwritten before it is ever read *)
    ignore_zwj }

(* [copy s] is a new, independent segmenter holding the same field values
   as [s]. *)
let copy { state; left_gb9c; left; left_odd_ri; left_emoji_seq; buf;
           ignore_zwj } =
  { state; left_gb9c; left; left_odd_ri; left_emoji_seq; buf; ignore_zwj }

(* Structural equality over every field of the segmenter. *)
let equal s0 s1 = s0 = s1

(* [reset s] puts [s] back at start of text so that it can be reused.
   Note: [ignore_zwj] is preserved across reset, and [buf] is left as is. *)
let reset s =
  s.left <- Sot;
  s.left_gb9c <- Reset;
  s.left_odd_ri <- false;
  s.left_emoji_seq <- false;
  s.state <- Fill

(* [set_ignore_zwj s v] enables ([v = false]) or disables ([v = true]) rule
   GB11 for subsequent boundary decisions on [s]. *)
let set_ignore_zwj s v = s.ignore_zwj <- v

(* [gb9c_match s right_incb] is [true] iff the left context recorded in [s]
   has reached a linker and the character on the right is a consonant. *)
let gb9c_match s right_incb =
  match right_incb with
  | Consonant ->
      (match s.left_gb9c with
       | Has_linker -> true
       | Reset | Has_consonant -> false)
  | Extend | Linker | None' -> false

(* Core break check - takes pre-computed is_extpic for efficiency.

   [break s right right_incb ~is_extpic] is [true] iff there is a boundary
   between the left context stored in [s] and the character described by
   [right], [right_incb] and [is_extpic]. It only reads [s]; the state is
   advanced separately by [update_left]. The arms are tried top to bottom,
   so their order implements the precedence of the rules. *)
let break s right right_incb ~is_extpic =
  match s.left, right with
  | Sot, _ -> true                                           (* GB1 *)
  (* GB2 is handled by `End *)
  | CR, LF -> false                                          (* GB3 *)
  | (CN | CR | LF), _ -> true                                (* GB4 *)
  | _, (CN | CR | LF) -> true                                (* GB5 *)
  | L, (L | V | LV | LVT) -> false                           (* GB6 *)
  | (LV | V), (V | T) -> false                               (* GB7 *)
  | (LVT | T), T -> false                                    (* GB8 *)
  | _, (EX | ZWJ | SM) -> false                              (* GB9+a *)
  | PP, _ -> false                                           (* GB9b *)
  | _, _ when gb9c_match s right_incb -> false               (* GB9c *)
  | ZWJ, _ ->
      (* GB11 when enabled and applicable, GB999 otherwise. *)
      not (is_extpic && s.left_emoji_seq && not s.ignore_zwj)
  | RI, RI ->
      (* GB12+13 when an odd number of RI precedes, GB999 otherwise. *)
      not s.left_odd_ri
  | _, _ -> true                                             (* GB999 *)

(* Core state update - takes pre-computed is_extpic for
efficiency *)

(* [update_left s right right_incb ~is_extpic] makes the character described
   by [right], [right_incb] and [is_extpic] the new left context of [s]. It
   must run after [break] has been evaluated for that same character, since
   [break] reads the fields written here. *)
let update_left s right right_incb ~is_extpic =
  s.left <- right;
  (match right with
   | EX | ZWJ ->
       (* Extenders and ZWJ end any RI run but keep s.left_emoji_seq as is,
          so that an emoji sequence survives them. *)
       s.left_odd_ri <- false
   | RI ->
       s.left_emoji_seq <- false;
       s.left_odd_ri <- not s.left_odd_ri
   | CN | CR | EB | EBG | EM | GAZ | L | LF | LV | LVT | PP | SM | T | V | XX
   | Sot ->
       (* Anything else starts an emoji sequence exactly when it is
          Extended_Pictographic. *)
       s.left_odd_ri <- false;
       s.left_emoji_seq <- is_extpic);
  let next_gb9c =
    match right_incb, s.left_gb9c with
    | None', _ -> Reset
    | Consonant, _ -> Has_consonant
    | Linker, Has_consonant -> Has_linker
    | (Extend | Linker), unchanged -> unchanged
  in
  s.left_gb9c <- next_gb9c

(* Direct boundary check - zero allocation.
Returns true if there is a boundary BEFORE this character.
The first character always has a boundary before it (GB1).
Uses combined property lookup for efficiency. *)
let[@inline] check_boundary s u =
  (* Single lookup: bits 0-4 = gcb, bits 5-6 = incb, bit 7 = extpic *)
  let bits = Unicode.grapheme_props u in
  let gcb_right = byte_to_gcb.(bits land 0x1F) in
  let incb_right = byte_to_incb.((bits lsr 5) land 0x03) in
  let pictographic = bits land 0x80 <> 0 in
  (* Decide first, against the old left context, and only then advance it. *)
  let boundary = break s gcb_right incb_right ~is_extpic:pictographic in
  update_left s gcb_right incb_right ~is_extpic:pictographic;
  boundary

(* Combined boundary check + width extraction in one property lookup.
Returns packed int: bit 2 = is_boundary, bits 0-1 = width_enc.
Width encoding: 0 → -1, 1 → 0, 2 → 1, 3 → 2. *)
let[@inline] check_boundary_with_width s u =
  (* Same layout as in [check_boundary] for bits 0-7; bits 8-9 carry the
     encoded width. *)
  let bits = Unicode.all_props u in
  let gcb_right = byte_to_gcb.(bits land 0x1F) in
  let incb_right = byte_to_incb.((bits lsr 5) land 0x03) in
  let pictographic = bits land 0x80 <> 0 in
  let boundary = break s gcb_right incb_right ~is_extpic:pictographic in
  update_left s gcb_right incb_right ~is_extpic:pictographic;
  let width_enc = (bits lsr 8) land 0x03 in
  let boundary_bit = if boundary then 4 else 0 in
  width_enc lor boundary_bit

(* [add s v] is the push-based entry point.

   - [`Uchar u] while filling: returns the same [`Uchar u] when [u] extends
     the current cluster; otherwise buffers it, switches to flushing and
     returns [`Boundary].
   - [`Await] while flushing: returns the buffered character and goes back
     to filling. While filling it returns [`Await], after the end [`End].
   - [`End] while filling: ends the segmenter and returns [`Boundary], or
     [`End] if no character was ever added.

   Any other combination is delegated to the [Uuseg_base.err_*] helpers
   (presumably raising - see that module). *)
let add s = function
  | `Uchar u as uchar_add ->
      (match s.state with
       | Fill ->
           let gcb_right = gcb u in
           let incb_right = incb u in
           let pictographic = Unicode.is_extended_pictographic u in
           let boundary =
             break s gcb_right incb_right ~is_extpic:pictographic
           in
           update_left s gcb_right incb_right ~is_extpic:pictographic;
           if boundary then begin
             (* Hold the character back: the client gets the boundary now
                and retrieves the character with `Await. *)
             s.state <- Flush;
             s.buf <- uchar_add;
             `Boundary
           end
           else uchar_add
       | Flush -> Uuseg_base.err_exp_await uchar_add
       | End -> Uuseg_base.err_ended uchar_add)
  | `Await ->
      (match s.state with
       | Flush ->
           s.state <- Fill;
           (s.buf :> Uuseg_base.ret)
       | End -> `End
       | Fill -> `Await)
  | `End ->
      (match s.state with
       | Fill ->
           s.state <- End;
           (* No final boundary when nothing was ever added. *)
           (match s.left with
            | Sot -> `End
            | _ -> `Boundary)
       | Flush -> Uuseg_base.err_exp_await `End
       | End -> Uuseg_base.err_ended `End)