# 1 "Camomile/internal/unidata.ml"
(** Unicode data *)
(* Copyright (C) 2002, 2003, 2011 Yamagata Yoriyuki *)

(* This library is free software; you can redistribute it and/or *)
(* modify it under the terms of the GNU Lesser General Public License *)
(* as published by the Free Software Foundation; either version 2 of *)
(* the License, or (at your option) any later version. *)

(* As a special exception to the GNU Library General Public License, you *)
(* may link, statically or dynamically, a "work that uses this library" *)
(* with a publicly distributed version of this library to produce an *)
(* executable file containing portions of this library, and distribute *)
(* that executable file under terms of your choice, without any of the *)
(* additional requirements listed in clause 6 of the GNU Library General *)
(* Public License. By "a publicly distributed version of this library", *)
(* we mean either the unmodified Library as distributed by the authors, *)
(* or a modified version of this library that is distributed under the *)
(* conditions defined in clause 3 of the GNU Library General Public *)
(* License. This exception does not however invalidate any other reasons *)
(* why the executable file might be covered by the GNU Library General *)
(* Public License . *)

(* This library is distributed in the hope that it will be useful, *)
(* but WITHOUT ANY WARRANTY; without even the implied warranty of *)
(* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU *)
(* Lesser General Public License for more details. *)

(* You should have received a copy of the GNU Lesser General Public *)
(* License along with this library; if not, write to the Free Software *)
(* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 *)
(* USA *)

(* You can contact the authour by sending email to *)
(* yoriyuki.y@gmail.com *)

(* Interface produced by [Make]. *)
module type Type = sig

  (* [read_data ?datadir name] loads the data item [name]; the
     implementation below falls back to the configured data directory
     when [datadir] is omitted.  The result type is chosen by the
     caller. *)
  val read_data : ?datadir:string -> string -> 'a

  (* General categories. *)
  type general_category_type =
    [ `Lu         (* Letter, Uppercase *)
    | `Ll         (* Letter, Lowercase *)
    | `Lt         (* Letter, Titlecase *)
    | `Mn         (* Mark, Non-Spacing *)
    | `Mc         (* Mark, Spacing Combining *)
    | `Me         (* Mark, Enclosing *)
    | `Nd         (* Number, Decimal Digit *)
    | `Nl         (* Number, Letter *)
    | `No         (* Number, Other *)
    | `Zs         (* Separator, Space *)
    | `Zl         (* Separator, Line *)
    | `Zp         (* Separator, Paragraph *)
    | `Cc         (* Other, Control *)
    | `Cf         (* Other, Format *)
    | `Cs         (* Other, Surrogate *)
    | `Co         (* Other, Private Use *)
    | `Cn         (* Other, Not Assigned *)
    | `Lm         (* Letter, Modifier *)
    | `Lo         (* Letter, Other *)
    | `Pc         (* Punctuation, Connector *)
    | `Pd         (* Punctuation, Dash *)
    | `Ps         (* Punctuation, Open *)
    | `Pe         (* Punctuation, Close *)
    | `Pi         (* Punctuation, Initial quote *)
    | `Pf         (* Punctuation, Final quote *)
    | `Po         (* Punctuation, Other *)
    | `Sm         (* Symbol, Math *)
    | `Sc         (* Symbol, Currency *)
    | `Sk         (* Symbol, Modifier *)
    | `So ]       (* Symbol, Other *)

  (* Conversions between a category, its two-letter name and its
     integer code. *)
  val cat_of_name : string -> general_category_type
  val num_of_cat : general_category_type -> int
  val cat_of_num : int -> general_category_type

  (* Scripts. *)
  type script_type =
    [ `Common
    | `Inherited
    | `Latin
    | `Greek
    | `Cyrillic
    | `Armenian
    | `Hebrew
    | `Arabic
    | `Syriac
    | `Thaana
    | `Devanagari
    | `Bengali
    | `Gurmukhi
    | `Gujarati
    | `Oriya
    | `Tamil
    | `Telugu
    | `Kannada
    | `Malayalam
    | `Sinhala
    | `Thai
    | `Lao
    | `Tibetan
    | `Myanmar
    | `Georgian
    | `Hangul
    | `Ethiopic
    | `Cherokee
    | `Canadian_Aboriginal
    | `Ogham
    | `Runic
    | `Khmer
    | `Mongolian
    | `Hiragana
    | `Katakana
    | `Bopomofo
    | `Han
    | `Yi
    | `Old_Italic
    | `Gothic
    | `Deseret
    | `Tagalog
    | `Hanunoo
    | `Buhid
    | `Tagbanwa ]

  (* Conversions between a script, its name and its integer code. *)
  val script_of_name : string -> script_type
  val script_of_num : int -> script_type
  val num_of_script : script_type -> int

  type decomposition_type =
    [ `Canon | `Font | `NoBreak | `Initial | `Medial | `Final
    | `Isolated | `Circle | `Super | `Sub | `Vertical | `Wide | `Narrow
    | `Small | `Square | `Fraction | `Compat ]

  type decomposition_info =
    (* Already in the canonical form *)
    [ `Canonform
    (* `Composite (dtype, text) :
     * means the given character is decomposed into text by dtype
     * decomposition. *)
    | `HangulSyllable
    | `Composite of decomposition_type * UChar.t list ]

  (* Collation *)

  type ce_type = int (* collation element *)

  (* Accessors for the three weights packed into a collation element,
     and the matching constructor. *)
  val primary : ce_type -> int
  val secondary : ce_type -> int
  val tertiary : ce_type -> int
  val compose_ce : int -> int -> int -> ce_type
  val complete_ignorable : ce_type

  type ce_tbl = (UChar.t list * ce_type list) list UCharTbl.t

  type variable_option =
    [ `Blanked
    | `Non_ignorable
    | `Shifted
    | `Shift_Trimmed ]

  type col_info =
    { variable_top : int;
      variable_option : variable_option;
      french_accent : bool;
      hiraganaQ : bool;
      hiraganaQ_weight : int;
      tbl : ce_tbl }

  val get_col_info : ?locale:string -> unit -> col_info

  (* If the returned list contains ([u1; u2; ... ;un], [ce1; ce2; ... ;cem]),
     then, for the given character u, the sequence u u1 u2 ... un
     corresponds to the sequence of collation elements ce1 ce2 ... cem.
     The list is in decreasing order with respect to n. *)
  val ce : ce_tbl -> UChar.t -> (UChar.t list * ce_type list) list

  type localedata = { col_info : col_info option }
end

module Make (Config : ConfigInt.Type) = struct

  (* Read the marshalled item [name] through [Database.read], using
     [Config.datadir] unless an explicit [datadir] is supplied. *)
  let read_data ?datadir name =
    let dir =
      match datadir with
      | None -> Config.datadir
      | Some explicit -> explicit
    in
    Database.read dir "mar" input_value name

  type general_category_type =
    [ `Lu         (* Letter, Uppercase *)
    | `Ll         (* Letter, Lowercase *)
    | `Lt         (* Letter, Titlecase *)
    | `Mn         (* Mark, Non-Spacing *)
    | `Mc         (* Mark, Spacing Combining *)
    | `Me         (* Mark, Enclosing *)
    | `Nd         (* Number, Decimal Digit *)
    | `Nl         (* Number, Letter *)
    | `No         (* Number, Other *)
    | `Zs         (* Separator, Space *)
    | `Zl         (* Separator, Line *)
    | `Zp         (* Separator, Paragraph *)
    | `Cc         (* Other, Control *)
    | `Cf         (* Other, Format *)
    | `Cs         (* Other, Surrogate *)
    | `Co         (* Other, Private Use *)
    | `Cn         (* Other, Not Assigned *)
    | `Lm         (* Letter, Modifier *)
    | `Lo         (* Letter, Other *)
    | `Pc         (* Punctuation, Connector *)
    | `Pd         (* Punctuation, Dash *)
    | `Ps         (* Punctuation, Open *)
    | `Pe         (* Punctuation, Close *)
    | `Pi         (* Punctuation, Initial quote *)
    | `Pf         (* Punctuation, Final quote *)
    | `Po         (* Punctuation, Other *)
    | `Sm         (* Symbol, Math *)
    | `Sc         (* Symbol, Currency *)
    | `Sk         (* Symbol, Modifier *)
    | `So ]       (* Symbol, Other *)

  (* Map a two-letter category name to its tag.  Any other string
     raises [Not_found]. *)
  let cat_of_name = function
    | "Lu" -> `Lu
    | "Ll" -> `Ll
    | "Lt" -> `Lt
    | "Mn" -> `Mn
    | "Mc" -> `Mc
    | "Me" -> `Me
    | "Nd" -> `Nd
    | "Nl" -> `Nl
    | "No" -> `No
    | "Zs" -> `Zs
    | "Zl" -> `Zl
    | "Zp" -> `Zp
    | "Cc" -> `Cc
    | "Cf" -> `Cf
    | "Cs" -> `Cs
    | "Co" -> `Co
    | "Cn" -> `Cn
    | "Lm" -> `Lm
    | "Lo" -> `Lo
    | "Pc" -> `Pc
    | "Pd" -> `Pd
    | "Ps" -> `Ps
    | "Pe" -> `Pe
    | "Pi" -> `Pi
    | "Pf" -> `Pf
    | "Po" -> `Po
    | "Sm" -> `Sm
    | "Sc" -> `Sc
    | "Sk" -> `Sk
    | "So" -> `So
    | _ -> raise Not_found

  (* Integer code of a category; the codes are the indices used by
     [cat_of_num_tbl] below. *)
  let num_of_cat = function
    | `Cn -> 0
    | `Lu -> 1
    | `Ll -> 2
    | `Lt -> 3
    | `Mn -> 4
    | `Mc -> 5
    | `Me -> 6
    | `Nd -> 7
    | `Nl -> 8
    | `No -> 9
    | `Zs -> 10
    | `Zl -> 11
    | `Zp -> 12
    | `Cc -> 13
    | `Cf -> 14
    | `Cs -> 15
    | `Co -> 16
    | `Lm -> 17
    | `Lo -> 18
    | `Pc -> 19
    | `Pd -> 20
    | `Ps -> 21
    | `Pe -> 22
    | `Pi -> 23
    | `Pf -> 24
    | `Po -> 25
    | `Sm -> 26
    | `Sc -> 27
    | `Sk -> 28
    | `So -> 29

  (* Categories laid out so that position [num_of_cat c] holds [c]. *)
  let cat_of_num_tbl : general_category_type array =
    [| `Cn; `Lu; `Ll; `Lt; `Mn; `Mc; `Me; `Nd; `Nl; `No;
       `Zs; `Zl; `Zp; `Cc; `Cf; `Cs; `Co; `Lm; `Lo; `Pc;
       `Pd; `Ps; `Pe; `Pi; `Pf; `Po; `Sm; `Sc; `Sk; `So |]

  (* Inverse of [num_of_cat]: a plain array lookup, so the usual array
     bounds rules apply to [n]. *)
  let cat_of_num n = cat_of_num_tbl.(n)

  type script_type =
    [ `Common
    | `Inherited
    | `Latin
    | `Greek
    | `Cyrillic
    | `Armenian
    | `Hebrew
    | `Arabic
    | `Syriac
    | `Thaana
    | `Devanagari
    | `Bengali
    | `Gurmukhi
    | `Gujarati
    | `Oriya
    | `Tamil
    | `Telugu
    | `Kannada
    | `Malayalam
    | `Sinhala
    | `Thai
    | `Lao
    | `Tibetan
    | `Myanmar
    | `Georgian
    | `Hangul
    | `Ethiopic
    | `Cherokee
    | `Canadian_Aboriginal
    | `Ogham
    | `Runic
    | `Khmer
    | `Mongolian
    | `Hiragana
    | `Katakana
    | `Bopomofo
    | `Han
    | `Yi
    | `Old_Italic
    | `Gothic
    | `Deseret
    | `Tagalog
    | `Hanunoo
    | `Buhid
    | `Tagbanwa ]

  (* little hack to maintain 4.02.3 compat with warnings *)
  module String = struct
    [@@@ocaml.warning "-3-32"]
    let lowercase_ascii = StringLabels.lowercase
    include String
  end

  (* Map a script name to its tag.  The name is lowercased before it is
     compared, and an unrecognised name raises [Not_found]. *)
  let script_of_name name =
    let lowered = String.lowercase_ascii name in
    match lowered with
    | "common" -> `Common
    | "inherited" -> `Inherited
    | "latin" -> `Latin
    | "greek" -> `Greek
    | "cyrillic" -> `Cyrillic
    | "armenian" -> `Armenian
    | "hebrew" -> `Hebrew
    | "arabic" -> `Arabic
    | "syriac" -> `Syriac
    | "thaana" -> `Thaana
    | "devanagari" -> `Devanagari
    | "bengali" -> `Bengali
    | "gurmukhi" -> `Gurmukhi
    | "gujarati" -> `Gujarati
    | "oriya" -> `Oriya
    | "tamil" -> `Tamil
    | "telugu" -> `Telugu
    | "kannada" -> `Kannada
    | "malayalam" -> `Malayalam
    | "sinhala" -> `Sinhala
    | "thai" -> `Thai
    | "lao" -> `Lao
    | "tibetan" -> `Tibetan
    | "myanmar" -> `Myanmar
    | "georgian" -> `Georgian
    | "hangul" -> `Hangul
    | "ethiopic" -> `Ethiopic
    | "cherokee" -> `Cherokee
    | "canadian_aboriginal" -> `Canadian_Aboriginal
    | "ogham" -> `Ogham
    | "runic" -> `Runic
    | "khmer" -> `Khmer
    | "mongolian" -> `Mongolian
    | "hiragana" -> `Hiragana
    | "katakana" -> `Katakana
    | "bopomofo" -> `Bopomofo
    | "han" -> `Han
    | "yi" -> `Yi
    | "old_italic" -> `Old_Italic
    | "gothic" -> `Gothic
    | "deseret" -> `Deseret
    | "tagalog" -> `Tagalog
    | "hanunoo" -> `Hanunoo
    | "buhid" -> `Buhid
    | "tagbanwa" -> `Tagbanwa
    | _ -> raise Not_found

  (* Integer code of a script; the codes are the indices used by
     [script_tbl] below. *)
  let num_of_script script =
    match script with
    | `Common -> 0
    | `Inherited -> 1
    | `Latin -> 2
    | `Greek -> 3
    | `Cyrillic -> 4
    | `Armenian -> 5
    | `Hebrew -> 6
    | `Arabic -> 7
    | `Syriac -> 8
    | `Thaana -> 9
    | `Devanagari -> 10
    | `Bengali -> 11
    | `Gurmukhi -> 12
    | `Gujarati -> 13
    | `Oriya -> 14
    | `Tamil -> 15
    | `Telugu -> 16
    | `Kannada -> 17
    | `Malayalam -> 18
    | `Sinhala -> 19
    | `Thai -> 20
    | `Lao -> 21
    | `Tibetan -> 22
    | `Myanmar -> 23
    | `Georgian -> 24
    | `Hangul -> 25
    | `Ethiopic -> 26
    | `Cherokee -> 27
    | `Canadian_Aboriginal -> 28
    | `Ogham -> 29
    | `Runic -> 30
    | `Khmer -> 31
    | `Mongolian -> 32
    | `Hiragana -> 33
    | `Katakana -> 34
    | `Bopomofo -> 35
    | `Han -> 36
    | `Yi -> 37
    | `Old_Italic -> 38
    | `Gothic -> 39
    | `Deseret -> 40
    | `Tagalog -> 41
    | `Hanunoo -> 42
    | `Buhid -> 43
    | `Tagbanwa -> 44

  (* Scripts laid out so that position [num_of_script s] holds [s]. *)
  let script_tbl =
    [| `Common; `Inherited; `Latin; `Greek; `Cyrillic;
       `Armenian; `Hebrew; `Arabic; `Syriac; `Thaana;
       `Devanagari; `Bengali; `Gurmukhi; `Gujarati; `Oriya;
       `Tamil; `Telugu; `Kannada; `Malayalam; `Sinhala;
       `Thai; `Lao; `Tibetan; `Myanmar; `Georgian;
       `Hangul; `Ethiopic; `Cherokee; `Canadian_Aboriginal; `Ogham;
       `Runic; `Khmer; `Mongolian; `Hiragana; `Katakana;
       `Bopomofo; `Han; `Yi; `Old_Italic; `Gothic;
       `Deseret; `Tagalog; `Hanunoo; `Buhid; `Tagbanwa |]

  (* Inverse of [num_of_script]: a plain array lookup, so the usual
     array bounds rules apply to [n]. *)
  let script_of_num n = script_tbl.(n)

  type decomposition_type =
    [ `Canon | `Font | `NoBreak | `Initial | `Medial | `Final
    | `Isolated | `Circle | `Super | `Sub | `Vertical | `Wide | `Narrow
    | `Small | `Square | `Fraction | `Compat ]

  type decomposition_info =
    (* Already in the canonical form *)
    [ `Canonform
    (* `Composite (dtype, text) :
     * means the given character is decomposed into text by dtype
     * decomposition. *)
    | `HangulSyllable
    | `Composite of decomposition_type * UChar.t list ]

  (* A collation element packs three weights into one int:
     bits 15-30 primary, bits 7-14 secondary, bits 0-6 tertiary. *)
  type ce_type = int

  let primary_mask = 0xffff lsl 15
  let secondary_mask = 0xff lsl 7
  let tertiary_mask = 0x7f

  (* Extract the primary weight. *)
  let primary ce =
    let field = ce land primary_mask in
    field lsr 15

  (* Extract the secondary weight. *)
  let secondary ce =
    let field = ce land secondary_mask in
    field lsr 7

  (* Extract the tertiary weight. *)
  let tertiary ce = tertiary_mask land ce

  (* Pack three weights into one collation element.  The arguments are
     shifted into place without being masked. *)
  let compose_ce w1 w2 w3 =
    let high = w1 lsl 15
    and middle = w2 lsl 7 in
    high lor middle lor w3

  (* The element whose three weights are all zero. *)
  let complete_ignorable = 0

  type ce_tbl = (UChar.t list * ce_type list) list UCharTbl.t

  type variable_option =
    [ `Blanked
    | `Non_ignorable
    | `Shifted
    | `Shift_Trimmed ]

  type col_info =
    { variable_top : int;
      variable_option : variable_option;
      french_accent : bool;
      hiraganaQ : bool;
      hiraganaQ_weight : int;
      tbl : ce_tbl }

  (* Default collation data, read from the "allkeys" item the first time
     it is forced. *)
  let default_col_data = lazy (read_data "allkeys" : col_info)

  type localedata = { col_info : col_info option }

  (* Unmarshal a [localedata] record from channel [c] and return its
     collation info; raises [Not_found] when the record carries none. *)
  let read_localedata c =
    let (loaded : localedata) = input_value c in
    match loaded.col_info with
    | Some info -> info
    | None -> raise Not_found

  (* Per-locale cache.  Each entry is a one-slot weak array, so cached
     collation info can still be reclaimed by the garbage collector. *)
  let col_tbl = Hashtbl.create 0

  (* Collation info for [locale].  Without a locale the default data is
     returned.  Otherwise the cache is consulted first, then the locale
     data is read through [Locale.read] and cached; if that read raises
     [Not_found], the default data is returned instead. *)
  let get_col_info ?locale () =
    let default () = Lazy.force default_col_data in
    (* Cache lookup; an entry whose weak slot was emptied is removed. *)
    let cached key =
      match Hashtbl.find col_tbl key with
      | exception Not_found -> None
      | cell ->
        (match Weak.get cell 0 with
         | Some _ as hit -> hit
         | None -> Hashtbl.remove col_tbl key; None)
    in
    (* Read the locale data and remember it behind a weak pointer. *)
    let load key =
      match Locale.read Config.localedir "mar" read_localedata key with
      | exception Not_found -> default ()
      | info ->
        let cell = Weak.create 1 in
        Weak.set cell 0 (Some info);
        Hashtbl.add col_tbl key cell;
        info
    in
    match locale with
    | None -> default ()
    | Some key ->
      (match cached key with
       | Some info -> info
       | None -> load key)

  (* Entries of [table] for the character [uchar]. *)
  let ce table uchar = UCharTbl.get table uchar
end