123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518# 1 "Camomile/public/uNF.ml"(** Unicode normal form (NFD, NFKD, NFC, NFKC) as described in UTR #15 *)(* Copyright (C) 2002 Yamagata Yoriyuki. *)(* This library is free software; you can redistribute it and/or *)(* modify it under the terms of the GNU Lesser General Public License *)(* as published by the Free Software Foundation; either version 2 of *)(* the License, or (at your option) any later version. *)(* As a special exception to the GNU Library General Public License, you *)(* may link, statically or dynamically, a "work that uses this library" *)(* with a publicly distributed version of this library to produce an *)(* executable file containing portions of this library, and distribute *)(* that executable file under terms of your choice, without any of the *)(* additional requirements listed in clause 6 of the GNU Library General *)(* Public License. By "a publicly distributed version of this library", *)(* we mean either the unmodified Library as distributed by the authors, *)(* or a modified version of this library that is distributed under the *)(* conditions defined in clause 3 of the GNU Library General Public *)(* License. This exception does not however invalidate any other reasons *)(* why the executable file might be covered by the GNU Library General *)(* Public License . *)(* This library is distributed in the hope that it will be useful, *)(* but WITHOUT ANY WARRANTY; without even the implied warranty of *)(* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU *)(* Lesser General Public License for more details. *)(* You should have received a copy of the GNU Lesser General Public *)(* License along with this library; if not, write to the Free Software *)(* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 *)(* USA *)(* You can contact the authour by sending email to *)(* yoriyuki.y@gmail.com *)moduletypeType=sigtypetextopenOOChannelclassnfd:UChar.t#obj_output_channel->[UChar.t]obj_output_channelclassnfc:UChar.t#obj_output_channel->[UChar.t]obj_output_channelclassnfkd:UChar.t#obj_output_channel->[UChar.t]obj_output_channelclassnfkc:UChar.t#obj_output_channel->[UChar.t]obj_output_channel(** Conversion to NFD, NFKD, NFC, NFKC forms. *)valnfd:text->textvalnfkd:text->textvalnfc:text->textvalnfkc:text->textmoduleNFCBuf:sigtypebufvalcreate:int->bufvalcontents:buf->textvalclear:buf->unitvalreset:buf->unitvaladd_char:buf->UChar.t->unitvaladd_string:buf->text->unitvaladd_buffer:buf->buf->unitendvalnfc_append:text->text->text(** [put_nfd b t], [put_nfkd b t], [put_nfc b t], [put_nfkc b t]
clear the contents of [b] and put the NFD, NFKD, NFC, NFKC
forms of [t] into [b] respectively. *)valput_nfd:XString.t->text->unitvalput_nfkd:XString.t->text->unitvalput_nfc:XString.t->text->unitvalput_nfkc:XString.t->text->unittypeindexvalnfd_inc:text->index->([`IncofUChar.tlist*index*'alazy_t]as'a)valcanon_compare:text->text->intvalnfd_decompose:UChar.t->UChar.tlistvalnfkd_decompose:UChar.t->UChar.tlistendmoduleMake(Config:ConfigInt.Type)(Text:UnicodeString.Type)=structmoduleUInfo=UCharInfo.Make(Config)letnull=UChar.chr_of_uint0letdecomposition_tbl=UInfo.load_decomposition_tbl()letdecompositionu=UCharTbl.getdecomposition_tbluletcomposition_exclusion_tbl=UInfo.load_composition_exclusion_tbl()letcomposition_exclusionu=UCharTbl.Bool.getcomposition_exclusion_tbluletcomposition_tbl=UInfo.load_composition_tbl()letcompositionu=UCharTbl.getcomposition_tbluletrecadd_listx=function[]->()|u::r->XString.add_charxu;add_listxrletshiftrightxij=fork=jdowntoidoXString.setx(k+1)(XString.getxk)doneletrotatexij=letu=XString.getxjinshiftrightxi(j-1);XString.setxiuletblitxix'i'len=fork=0tolen-1doXString.setx'(i'+k)(XString.getx(i+k));doneletnfdu=matchdecompositionuwith`HangulSyllable->Hangul.decomposeu|`Composite(`Canon,d)->d|_->[u]letnfd_decompose=nfdletrecnfkdu=matchdecompositionuwith`HangulSyllable->Hangul.decomposeu|`Composite(_,d)->List.fold_right(funua->(nfkdu)@a)d[]|`Canonform->[u]letnfkd_decompose=nfkdletcanon_decompose_ucharxu=matchdecompositionuwith`HangulSyllable->Hangul.add_decompositionxu|`Composite(`Canon,d)->add_listxd|_->XString.add_charxuclasscanon_decompose(c_out:UChar.tOOChannel.obj_output_channel)=objectmethodputu=List.iterc_out#put(nfdu)methodclose_out=c_out#close_outmethodflush()=c_out#flush()endletreckompat_decompose_ucharxu=matchdecompositionuwith`Composite(_,d)->List.iter(kompat_decompose_ucharx)d|_->Hangul.add_decompositionxuclasskompat_decomposec_out:[UChar.t]OOChannel.obj_output_channel=objectmethodputu=List.iterc_out#put(nfkdu)methodclose_out=c_out#close_outmethodflush()=c_out#flush()endletcanon_reorderx=letchead=ref0inletpos=ref0infori=0toXString.lengthx-1doletu=XString.getxiinletc=UInfo.combined_classuinifc=0thenchead:=ielsebeginpos:=i-1;while!pos>=!chead&&UInfo.combined_class(XString.getx!pos)>cdodecrposdone;rotatex(!pos+1)ienddoneclasscanon_reorderc_out:[UChar.t]OOChannel.obj_output_channel=object(self)valmutablesq=[]methodprivateout_buf=letsq'=List.stable_sort(fun(c1,_)(c2,_)->c1-c2)sqinList.iter(fun(_,u)->c_out#putu)sq';sq<-[]methodputu=letc=UInfo.combined_classuinifc=0then(ifsq<>[]thenself#out_buf;c_out#putu)elsesq<-(c,u)::sqmethodclose_out()=self#out_buf;c_out#close_out()methodflush()=ifsq<>[]thenfailwith"uNF.canon_reorder#flush: \
Cannot flush the entire buffer";c_out#flush()endletreclook_compositionu1=function[]->None|(u,u')::_whenu=u1->ifcomposition_exclusionu'||UInfo.combined_classu'<>0thenNoneelseSomeu'|_::rest->look_compositionu1restletreccanon_compose_loopxijx'kc'=ifj>=XString.lengthxthenbeginblitxix'(k+1)(XString.lengthx-i);k+max(XString.lengthx-i)0endelseletu=XString.getxjinletc=UInfo.combined_classuinletb=ifj=i||c'<>cthen(*not blocked!*)matchlook_compositionu(composition(XString.getx'k))withNone->true|Someu'->XString.setx'ku';shiftrightxi(j-1);falseelsetrueinifb&&c=0thenbeginblitxix'(k+1)(j-i+1);canon_compose_loopx(j+1)(j+1)x'(k+1+j-i)0endelseleti'=ifbthenielsei+1inletc'=ifbthencelsec'incanon_compose_loopxi'(j+1)x'kc'letcanon_composex'x=ifXString.lengthx=0then()elseletpos=ref0inwhile!pos<XString.lengthx&&UInfo.combined_class(XString.getx!pos)<>0doincrposdone;blitx0x'0!pos;if!pos<XString.lengthxthenbeginXString.setx'!pos(XString.getx!pos);pos:=canon_compose_loopx(!pos+1)(!pos+1)x'!pos0endelse();XString.shrinkx'(!pos+1)classcanon_composec_out:[UChar.t]OOChannel.obj_output_channel=object(self)valmutablehas_strt=falsevalmutablestrt=nullvalmutablecmp_cnd=[]valmutablelst_cc=-1valsq=Queue.create()methodprivateset_strtu=ifhas_strtthenc_out#putstrt;Queue.iterc_out#putsq;strt<-u;has_strt<-true;cmp_cnd<-compositionu;lst_cc<--1;Queue.clearsqmethodprivateoutput_buffer()=ifhas_strtthenc_out#putstrt;Queue.iterc_out#putsq;methodputu=letc=UInfo.combined_classuinifnothas_strtthenifc=0thenself#set_strtuelsec_out#putuelseifc=lst_ccthenQueue.addusqelsematchlook_compositionucmp_cndwithSomeu'->strt<-u'|None->ifc=0thenself#set_strtuelsebeginQueue.addusq;lst_cc<-cendmethodclose_out()=self#output_buffer();c_out#close_out()methodflush()=self#output_buffer();c_out#flush()endclassnfdc_out=letc=newcanon_reorderc_outinobjectinheritcanon_decomposecendclassnfcc_out=letc=newcanon_composec_outinletc=newcanon_reordercinobjectinheritcanon_decomposecmethod!flush=c_out#flushendclassnfkdc_out=letc=newcanon_reorderc_outinobjectinheritkompat_decomposecmethod!flush=c_out#flushendclassnfkcc_out=letc=newcanon_composec_outinletc=newcanon_reordercinobjectinheritkompat_decomposecmethod!flush=c_out#flushendtypeinc=[`IncofUChar.tlist*Text.index*'alazy_t]as'aletrecinc_endi=`Inc([],i,lazy(inc_endi))letrecinc_canon_decomposeti:inc=ifText.out_of_rangetitheninc_endielseleti'=Text.nexttiin`Inc(nfd(Text.lookti),i',lazy(inc_canon_decomposeti'))letreccanon_insert_list((_,c)asx)a=matchawith[]->[x]|(_,c')asy::rest->ifc'<=ctheny::canon_insert_listxrestelsex::aletrecsplit1_list=function[]->[]|(x,_)::rest->x::split1_listrestletcanon_sort_listsq=letrecloopa=function[]->split1_lista|(u,0)::rest->(split1_lista)@(u::(loop[]rest))|(_,_)asx::rest->loop(canon_insert_listxa)restinloop[]sqletrecread_combined_class=function[]->[]|u::rest->(u,UInfo.combined_classu)::read_combined_classrestletinc_reorderfti=let`Inc(us,i',f)=ftiinletrecloop(f:incLazy.t)ai=let`Inc(us,i',f)=Lazy.forcefinleta'=read_combined_classusinmatcha'with[]->`Inc(canon_sort_lista,i,lazy(inc_endi))|(_,0)::_->`Inc(canon_sort_lista,i,lazy(loopfa'i'))|_->loopf(a@a')i'inloopf(read_combined_classus)i'letnfd_incti=inc_reorderinc_canon_decomposetiletcanon_comparet1t2=let`Inc(us1,_,f1)=nfd_inct1(Text.ntht10)inlet`Inc(us2,_,f2)=nfd_inct2(Text.ntht20)inletrecloopus1us2f1f2=matchus1,us2with[],[]->0|[],_->~-1|_,[]->1|u1::r1,u2::r2->letsgn=UChar.compareu1u2inifsgn<>0thensgnelseletus1,f1=ifr1=[]thenlet`Inc(us1,_,f1)=Lazy.forcef1in(us1,f1)else(r1,f1)inletus2,f2=ifr2=[]thenlet`Inc(us2,_,f2)=Lazy.forcef2in(us2,f2)else(r2,f2)inloopus1us2f1f2inloopus1us2f1f2typetext=Text.ttypeindex=Text.indexletcanon_decomposext=Text.iter(canon_decompose_ucharx)tletkompat_decomposext=Text.iter(kompat_decompose_ucharx)tlettext_of_xstringx=Text.init(XString.lengthx)(XString.getx)letnfdt=letx=XString.make0(UChar.chr_of_uint0)incanon_decomposext;canon_reorderx;text_of_xstringxletnfkdt=letx=XString.make0(UChar.chr_of_uint0)inkompat_decomposext;canon_reorderx;text_of_xstringxletnfct=letx=XString.make0(UChar.chr_of_uint0)incanon_decomposext;canon_reorderx;canon_composexx;Hangul.composexx;text_of_xstringxletnfkct=letx=XString.make0(UChar.chr_of_uint0)inkompat_decomposext;canon_reorderx;canon_composexx;Hangul.composexx;text_of_xstringxletput_nfdxt=XString.clearx;canon_decomposext;canon_reorderxletput_nfkdxt=XString.clearx;kompat_decomposext;canon_reorderxletput_nfcxt=XString.clearx;canon_decomposext;canon_reorderx;canon_composexx;Hangul.composexxletput_nfkcxt=XString.clearx;kompat_decomposext;canon_reorderx;canon_composexx;Hangul.composexxmoduleNFCBuf=structtypebuf={mutablenormalized:bool;mutablebuf:XString.t}letcreatebufsize={normalized=true;buf=XString.make~bufsize0null}letcontentsb=(ifnotb.normalizedthenletbuf=XString.make0nullinXString.iter(canon_decompose_ucharbuf)b.buf;canon_reorderbuf;canon_composebufbuf;b.buf<-buf;b.normalized<-true);text_of_xstringb.bufletclearb=b.normalized<-true;XString.clearb.bufletresetb=b.normalized<-true;XString.resetb.bufletadd_charbu=b.normalized<-false;XString.add_charb.bufuletadd_stringbt=b.normalized<-false;Text.iter(XString.add_charb.buf)tletadd_bufferb1b2=b1.normalized<-false;XString.add_xstringb1.bufb2.bufendletnfc_appendt1t2=letb=XString.make0nullincanon_decomposebt1;canon_decomposebt2;canon_reorderb;canon_composebb;text_of_xstringbend