123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254(*---------------------------------------------------------------------------
Copyright (c) 2012 The uunf programmers. All rights reserved.
SPDX-License-Identifier: ISC
---------------------------------------------------------------------------*)typeret=[`UcharofUchar.t|`End|`Await]letpp_retppfv=match(v:>ret)with|`Ucharu->Format.fprintfppf"`Uchar U+%04X"(Uchar.to_intu)|`End->Format.fprintfppf"`End"|`Await->Format.fprintfppf"`Await"leterr_exp_awaitadd=invalid_arg(Format.asprintf"can't add %a, expected `Await"pp_retadd)leterr_endedadd=invalid_arg(Format.asprintf"can't add %a, `End already added"pp_retadd)(* The normalization process is implemented as described in UAX #15
section 9.1 for normalizing the concatenation of normalized
strings. We detect ranges of characters in the input sequence
enclosed between two characters for which NFX_quick_check=YES *and*
ccc = 0 (6.1.0 wrongly claims that quick_check=YES implies ccc = 0,
we therefore call this property nfx_boundary). Only these ranges
(including the left boundary) need to be bufferized to perform the
normalization process. *)(* Characters *)letux_none=max_int(* no char, outside unicode range. *)letu_dumb=(* placeholder, overwritten. *)`Uchar(Uchar.of_int0x0000)(* Normalization properties. *)letunicode_version=Uunf_data.unicode_versionletnfc_boundaryu=Uunf_tmapbool.getUunf_data.nfc_boundary_mapuletnfd_boundaryu=Uunf_tmapbool.getUunf_data.nfd_boundary_mapuletnfkc_boundaryu=Uunf_tmapbool.getUunf_data.nfkc_boundary_mapuletnfkd_boundaryu=Uunf_tmapbool.getUunf_data.nfkd_boundary_mapulet_cccu=Uunf_tmapbyte.getUunf_data.ccc_mapuletcccu=_ccc(Uchar.to_intu)letdecomp_propu=Uunf_tmap.getUunf_data.decomp_mapuletcompose_propu=Uunf_tmap.getUunf_data.compose_mapumoduleH=struct(* Hangul arithmetic constants. *)letsbase=0xAC00letlbase=0x1100letvbase=0x1161lettbase=0x11A7letscount=11172letlcount=19letvcount=21lettcount=28letncount=588letscount=11172endletdecompu=letu=Uchar.to_intuinifu<0xAC00||0xD7A3<uthendecomp_propuelsebegin(* LV or LVT hangul composite *)letsindex=u-H.sbaseinletl=H.lbase+(sindex/H.ncount)inletv=H.vbase+(sindexmodH.ncount)/H.tcountinlett=H.tbase+(sindexmodH.tcount)inift=H.tbasethen[|l;v|]else[|l;v;t|]end(* N.B. to help stream-safe text implementers we *could* use the bits
25-27 of [(decomp u).(0)] to indicate the number of initial non
starters in the NFKD decomposition of [u] and bits and 28-30 to
indicate the non starter count increment. *)letd_compatibilityi=iland(1lsl24)>0let_d_uchari=iland0x1FFFFFletd_uchari=Uchar.unsafe_of_int(_d_uchari)let_compositeu1u2=if0x1100<=u1&&u1<=0x1112thenbeginifu2<0x1161||0x1175<u2thenux_noneelseletl=u1-H.lbasein(* LV hangul composite *)letv=u2-H.vbaseinH.sbase+l*H.ncount+v*H.tcountendelseif0xAC00<=u1&&u1<=0xD788&&(u1-0x0AC00)modH.tcount=0thenbeginifu2<0x11A8||u2>0x11C3thenux_noneelse(u1+u2-H.tbase)(* LVT hangul composite *)endelsematchcompose_propu1with|[||]->ux_none|a(* [u2; c; u2'; c'; ...] sorted *)->letlen=Array.lengtha/2inleti=ref0intrywhile(!i<len)doifa.(!i*2)=u2thenraiseExitelseincri;done;ux_nonewithExit->(a.(!i*2+1))letcompositeu1u2=letu=_composite(Uchar.to_intu1)(Uchar.to_intu2)inifu=ux_nonethenNoneelseSome(Uchar.unsafe_of_intu)(* Normalize *)typeform=[`NFC|`NFD|`NFKC|`NFKD]typestate=(* normalizer state. *)|Start(* no cp seen yet. *)|Boundary(* cp with boundary = true found in n.uc, no accumulation yet. *)|Acc(* accumulate until next cp with boundary = true. *)|Flush(* next cp with boundary = true found, flush previous data. *)|End(* end of normalization sequence. *)typet={form:form;(* normalization form. *)compat:bool;(* true if compatibility decomposition needed. *)compose:bool;(* true if composition needed. *)boundary:int->bool;(* nfx_boundary. *)mutablestate:state;(* normalizer state. *)mutableuc:[`UcharofUchar.t];(* last cp with boundary = true. *)mutableacc:intarray;(* code point accumulator. *)mutablefirst:int;(* index of first code point in acc. *)mutablelast:int;(* index of last code point in acc. *)mutableis_end:bool;}(* [true] if `End was seen. *)letcreate_acc()=Array.make35ux_noneletcreateform=letboundary,compat,compose=matchformwith|`NFC->nfc_boundary,false,true|`NFD->nfd_boundary,false,false|`NFKC->nfkc_boundary,true,true|`NFKD->nfkd_boundary,true,falsein{form=(form:>form);compat;compose;boundary;state=Start;uc=u_dumb;acc=create_acc();first=0;last=-1;is_end=false}letget_un=let`Ucharu=n.ucinUchar.to_intuletacc_emptyn=n.first>n.lastletformn=n.formletcopyn={nwithacc=Array.copyn.acc}letresetn=n.state<-Start;n.uc<-u_dumb;n.acc<-create_acc();n.first<-0;n.last<--1;n.is_end<-falseletgrow_accn=letlen=Array.lengthn.accinletacc'=Array.make(2*len)ux_noneinArray.blitn.acc0acc'0len;n.acc<-acc'letordered_addnu=(* canonical ordering algorithm via insertion sort. *)n.last<-n.last+1;ifn.last=Array.lengthn.accthengrow_accn;letc=_cccuinifc=0thenn.acc.(n.last)<-uelsebeginleti=ref(n.last-1)inwhile(!i>=0&&_ccc(n.acc.(!i))>c)don.acc.(!i+1)<-n.acc.(!i);decri;(* shift right. *)done;n.acc.(!i+1)<-uendletrecaddnu=if0xAC00<=u&&u<=0xD7A3thenbegin(* LV or LVT hangul composite, copied from decomp to avoid alloc. *)letsindex=u-H.sbaseinletl=H.lbase+(sindex/H.ncount)inletv=H.vbase+(sindexmodH.ncount)/H.tcountinlett=H.tbase+(sindexmodH.tcount)inift=H.tbasethen(ordered_addnl;ordered_addnv)else(ordered_addnl;ordered_addnv;ordered_addnt)endelsebeginmatchdecomp_propuwith|[||]->ordered_addnu|d->ifd_compatibilityd.(0)&¬n.compatthenordered_addnuelsebeginaddn(_d_uchard.(0));fori=1toArray.lengthd-1doaddnd.(i)doneendendletcomposen=(* canonical composition algorithm. *)letrecloop~last_starter~prev_ccci=ifi>n.lastthen()elseletccc_i=_cccn.acc.(i)inletu_comp=_compositen.acc.(last_starter)n.acc.(i)inmatch(u_comp=ux_none||(ccc_i=0&&last_starter<>i-1))with|true->letlast_starter=ifccc_i=0thenielselast_starterinloop~last_starter~prev_ccc:ccc_i(i+1)|false->matchprev_ccc<>0&&prev_ccc>=ccc_iwith|true->loop~last_starter~prev_ccc:ccc_i(i+1)|false->n.acc.(last_starter)<-u_comp;Array.blitn.acc(i+1)n.acci(n.last-i);n.last<-n.last-1;letprev_ccc=_cccn.acc.(last_starter)inloop~last_starter~prev_ccc(last_starter+1)inletlast_starter=n.firstinletprev_ccc=_cccn.acc.(last_starter)inloop~last_starter~prev_ccc(last_starter+1)letflush_nextn=letret=`Uchar(Uchar.unsafe_of_intn.acc.(n.first))inifn.first=n.lastthen(n.first<-0;n.last<--1)else(n.first<-n.first+1);retletflush_startn=ifn.composethencomposen;flush_nextnletaddn=function|`Ucharuasuc->letu=Uchar.to_intuinbeginmatchn.statewith|Boundary->ifn.boundaryuthen(letprev=n.ucinn.uc<-uc;(prev:>ret))else(n.state<-Acc;addn(get_un);addnu;`Await)|Acc->ifn.boundaryuthen(n.state<-Flush;n.uc<-uc;flush_startn)else(addnu;`Await)|Start->ifn.boundaryuthen(n.state<-Boundary;n.uc<-uc;`Await)else(n.state<-Acc;addnu;`Await)|Flush->err_exp_awaituc|End->err_endeducend|`Await->beginmatchn.statewith|Flush->ifnot(acc_emptyn)thenflush_nextnelseifn.is_endthen(n.state<-End;`End)else(n.state<-Boundary;`Await)|Start|Boundary|Acc->`Await|End->`Endend|`End->n.is_end<-true;beginmatchn.statewith|Boundary->n.state<-End;(n.uc:>ret)|Acc->n.state<-Flush;flush_startn|Start->n.state<-End;`End|Flush->err_exp_await`End|End->err_ended`Endend