123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270(*---------------------------------------------------------------------------
Copyright (c) 2012 The uunf programmers. All rights reserved.
Distributed under the ISC license, see terms at the end of the file.
---------------------------------------------------------------------------*)typeret=[`UcharofUchar.t|`End|`Await]letpp_retppfv=match(v:>ret)with|`Ucharu->Format.fprintfppf"`Uchar U+%04X"(Uchar.to_intu)|`End->Format.fprintfppf"`End"|`Await->Format.fprintfppf"`Await"leterr_exp_awaitadd=invalid_arg(Format.asprintf"can't add %a, expected `Await"pp_retadd)leterr_endedadd=invalid_arg(Format.asprintf"can't add %a, `End already added"pp_retadd)(* The normalization process is implemented as described in UAX #15
section 9.1 for normalizing the concatenation of normalized
strings. We detect ranges of characters in the input sequence
enclosed between two characters for which NFX_quick_check=YES *and*
ccc = 0 (6.1.0 wrongly claims that quick_check=YES implies ccc = 0,
we therefore call this property nfx_boundary). Only these ranges
(including the left boundary) need to be bufferized to perform the
normalization process. *)(* Characters *)letux_none=max_int(* no char, outside unicode range. *)letu_dumb=(* placeholder, overwritten. *)`Uchar(Uchar.of_int0x0000)(* Normalization properties. *)letunicode_version="14.0.0"letnfc_boundaryu=Uunf_tmapbool.getUunf_data.nfc_boundary_mapuletnfd_boundaryu=Uunf_tmapbool.getUunf_data.nfd_boundary_mapuletnfkc_boundaryu=Uunf_tmapbool.getUunf_data.nfkc_boundary_mapuletnfkd_boundaryu=Uunf_tmapbool.getUunf_data.nfkd_boundary_mapulet_cccu=Uunf_tmapbyte.getUunf_data.ccc_mapuletcccu=_ccc(Uchar.to_intu)letdecomp_propu=Uunf_tmap.getUunf_data.decomp_mapuletcompose_propu=Uunf_tmap.getUunf_data.compose_mapumoduleH=struct(* Hangul arithmetic constants. *)letsbase=0xAC00letlbase=0x1100letvbase=0x1161lettbase=0x11A7letscount=11172letlcount=19letvcount=21lettcount=28letncount=588letscount=11172endletdecompu=letu=Uchar.to_intuinifu<0xAC00||0xD7A3<uthendecomp_propuelsebegin(* LV or LVT hangul composite *)letsindex=u-H.sbaseinletl=H.lbase+(sindex/H.ncount)inletv=H.vbase+(sindexmodH.ncount)/H.tcountinlett=H.tbase+(sindexmodH.tcount)inift=H.tbasethen[|l;v|]else[|l;v;t|]end(* N.B. to help stream-safe text implementers we *could* use the bits
25-27 of [(decomp u).(0)] to indicate the number of initial non
starters in the NFKD decomposition of [u] and bits and 28-30 to
indicate the non starter count increment. *)letd_compatibilityi=iland(1lsl24)>0let_d_uchari=iland0x1FFFFFletd_uchari=Uchar.unsafe_of_int(_d_uchari)let_compositeu1u2=if0x1100<=u1&&u1<=0x1112thenbeginifu2<0x1161||0x1175<u2thenux_noneelseletl=u1-H.lbasein(* LV hangul composite *)letv=u2-H.vbaseinH.sbase+l*H.ncount+v*H.tcountendelseif0xAC00<=u1&&u1<=0xD788&&(u1-0x0AC00)modH.tcount=0thenbeginifu2<0x11A8||u2>0x11C3thenux_noneelse(u1+u2-H.tbase)(* LVT hangul composite *)endelsematchcompose_propu1with|[||]->ux_none|a(* [u2; c; u2'; c'; ...] sorted *)->letlen=Array.lengtha/2inleti=ref0intrywhile(!i<len)doifa.(!i*2)=u2thenraiseExitelseincri;done;ux_nonewithExit->(a.(!i*2+1))letcompositeu1u2=letu=_composite(Uchar.to_intu1)(Uchar.to_intu2)inifu=ux_nonethenNoneelseSome(Uchar.unsafe_of_intu)(* Normalize *)typeform=[`NFC|`NFD|`NFKC|`NFKD]typestate=(* normalizer state. *)|Start(* no cp seen yet. *)|Boundary(* cp with boundary = true found in n.uc, no accumulation yet. *)|Acc(* accumulate until next cp with boundary = true. *)|Flush(* next cp with boundary = true found, flush previous data. *)|End(* end of normalization sequence. *)typet={form:form;(* normalization form. *)compat:bool;(* true if compatibility decomposition needed. *)compose:bool;(* true if composition needed. *)boundary:int->bool;(* nfx_boundary. *)mutablestate:state;(* normalizer state. *)mutableuc:[`UcharofUchar.t];(* last cp with boundary = true. *)mutableacc:intarray;(* code point accumulator. *)mutablefirst:int;(* index of first code point in acc. *)mutablelast:int;(* index of last code point in acc. *)mutableis_end:bool;}(* [true] if `End was seen. *)letcreate_acc()=Array.make35ux_noneletcreateform=letboundary,compat,compose=matchformwith|`NFC->nfc_boundary,false,true|`NFD->nfd_boundary,false,false|`NFKC->nfkc_boundary,true,true|`NFKD->nfkd_boundary,true,falsein{form=(form:>form);compat;compose;boundary;state=Start;uc=u_dumb;acc=create_acc();first=0;last=-1;is_end=false}letget_un=let`Ucharu=n.ucinUchar.to_intuletacc_emptyn=n.first>n.lastletformn=n.formletcopyn={nwithacc=Array.copyn.acc}letresetn=n.state<-Start;n.uc<-u_dumb;n.acc<-create_acc();n.first<-0;n.last<--1;n.is_end<-falseletgrow_accn=letlen=Array.lengthn.accinletacc'=Array.make(2*len)ux_noneinArray.blitn.acc0acc'0len;n.acc<-acc'letordered_addnu=(* canonical ordering algorithm via insertion sort. *)n.last<-n.last+1;ifn.last=Array.lengthn.accthengrow_accn;letc=_cccuinifc=0thenn.acc.(n.last)<-uelsebeginleti=ref(n.last-1)inwhile(!i>=0&&_ccc(n.acc.(!i))>c)don.acc.(!i+1)<-n.acc.(!i);decri;(* shift right. *)done;n.acc.(!i+1)<-uendletrecaddnu=if0xAC00<=u&&u<=0xD7A3thenbegin(* LV or LVT hangul composite, copied from decomp to avoid alloc. *)letsindex=u-H.sbaseinletl=H.lbase+(sindex/H.ncount)inletv=H.vbase+(sindexmodH.ncount)/H.tcountinlett=H.tbase+(sindexmodH.tcount)inift=H.tbasethen(ordered_addnl;ordered_addnv)else(ordered_addnl;ordered_addnv;ordered_addnt)endelsebeginmatchdecomp_propuwith|[||]->ordered_addnu|d->ifd_compatibilityd.(0)&¬n.compatthenordered_addnuelsebeginaddn(_d_uchard.(0));fori=1toArray.lengthd-1doaddnd.(i)doneendendletcomposen=(* canonical composition algorithm. *)letrecloop~last_starter~prev_ccci=ifi>n.lastthen()elseletccc_i=_cccn.acc.(i)inletu_comp=_compositen.acc.(last_starter)n.acc.(i)inmatch(u_comp=ux_none||(ccc_i=0&&last_starter<>i-1))with|true->letlast_starter=ifccc_i=0thenielselast_starterinloop~last_starter~prev_ccc:ccc_i(i+1)|false->matchprev_ccc<>0&&prev_ccc>=ccc_iwith|true->loop~last_starter~prev_ccc:ccc_i(i+1)|false->n.acc.(last_starter)<-u_comp;Array.blitn.acc(i+1)n.acci(n.last-i);n.last<-n.last-1;letprev_ccc=_cccn.acc.(last_starter)inloop~last_starter~prev_ccc(last_starter+1)inletlast_starter=n.firstinletprev_ccc=_cccn.acc.(last_starter)inloop~last_starter~prev_ccc(last_starter+1)letflush_nextn=letret=`Uchar(Uchar.unsafe_of_intn.acc.(n.first))inifn.first=n.lastthen(n.first<-0;n.last<--1)else(n.first<-n.first+1);retletflush_startn=ifn.composethencomposen;flush_nextnletaddn=function|`Ucharuasuc->letu=Uchar.to_intuinbeginmatchn.statewith|Boundary->ifn.boundaryuthen(letprev=n.ucinn.uc<-uc;(prev:>ret))else(n.state<-Acc;addn(get_un);addnu;`Await)|Acc->ifn.boundaryuthen(n.state<-Flush;n.uc<-uc;flush_startn)else(addnu;`Await)|Start->ifn.boundaryuthen(n.state<-Boundary;n.uc<-uc;`Await)else(n.state<-Acc;addnu;`Await)|Flush->err_exp_awaituc|End->err_endeducend|`Await->beginmatchn.statewith|Flush->ifnot(acc_emptyn)thenflush_nextnelseifn.is_endthen(n.state<-End;`End)else(n.state<-Boundary;`Await)|Start|Boundary|Acc->`Await|End->`Endend|`End->n.is_end<-true;beginmatchn.statewith|Boundary->n.state<-End;(n.uc:>ret)|Acc->n.state<-Flush;flush_startn|Start->n.state<-End;`End|Flush->err_exp_await`End|End->err_ended`Endend(*---------------------------------------------------------------------------
Copyright (c) 2012 The uunf programmers
Permission to use, copy, modify, and/or distribute this software for any
purpose with or without fee is hereby granted, provided that the above
copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
---------------------------------------------------------------------------*)