# 1 "Camomile/public/uTF8.ml"
(** UTF-8 encoded Unicode strings. The type is normal string. *)

(* Copyright (C) 2002, 2003 Yamagata Yoriyuki. *)

(* This library is free software; you can redistribute it and/or *)
(* modify it under the terms of the GNU Lesser General Public License *)
(* as published by the Free Software Foundation; either version 2 of *)
(* the License, or (at your option) any later version. *)

(* As a special exception to the GNU Library General Public License, you *)
(* may link, statically or dynamically, a "work that uses this library" *)
(* with a publicly distributed version of this library to produce an *)
(* executable file containing portions of this library, and distribute *)
(* that executable file under terms of your choice, without any of the *)
(* additional requirements listed in clause 6 of the GNU Library General *)
(* Public License. By "a publicly distributed version of this library", *)
(* we mean either the unmodified Library as distributed by the authors, *)
(* or a modified version of this library that is distributed under the *)
(* conditions defined in clause 3 of the GNU Library General Public *)
(* License. This exception does not however invalidate any other reasons *)
(* why the executable file might be covered by the GNU Library General *)
(* Public License . *)

(* This library is distributed in the hope that it will be useful, *)
(* but WITHOUT ANY WARRANTY; without even the implied warranty of *)
(* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU *)
(* Lesser General Public License for more details. *)

(* You should have received a copy of the GNU Lesser General Public *)
(* License along with this library; if not, write to the Free Software *)
(* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 *)
(* USA *)

(* You can contact the author by sending email to *)
(* yoriyuki.y@gmail.com *)

(** A UTF-8 string is represented as an ordinary byte string. *)
type t = string

(** A position inside a UTF-8 string, expressed as a byte offset. *)
type index = int

(** [look s i] decodes the character whose lead byte is at byte offset [i].

    The length of the sequence (1 to 6 bytes) is chosen from the lead byte
    alone.  The continuation bytes are not checked for well-formedness;
    they are simply masked with [0x7f] and merged in.  The last byte of the
    sequence is read with a bounds-checked access before the intermediate
    bytes are read with [String.unsafe_get], so a sequence truncated by the
    end of the string raises [Invalid_argument] instead of reading out of
    bounds.

    Raises [Invalid_argument "UTF8.look"] when the lead byte is greater
    than [0xfd]. *)
let look s i =
  let code =
    let n = Char.code s.[i] in
    if n < 0x80 then n
    else if n <= 0xdf then
      ((n - 0xc0) lsl 6) lor (0x7f land (Char.code s.[i + 1]))
    else if n <= 0xef then
      let n' = n - 0xe0 in
      (* Checked read of the final byte first: guards the unsafe reads. *)
      let m0 = Char.code s.[i + 2] in
      let m = Char.code (String.unsafe_get s (i + 1)) in
      let n' = (n' lsl 6) lor (0x7f land m) in
      (n' lsl 6) lor (0x7f land m0)
    else if n <= 0xf7 then
      let n' = n - 0xf0 in
      let m0 = Char.code s.[i + 3] in
      let m = Char.code (String.unsafe_get s (i + 1)) in
      let n' = (n' lsl 6) lor (0x7f land m) in
      let m = Char.code (String.unsafe_get s (i + 2)) in
      let n' = (n' lsl 6) lor (0x7f land m) in
      (n' lsl 6) lor (0x7f land m0)
    else if n <= 0xfb then
      let n' = n - 0xf8 in
      let m0 = Char.code s.[i + 4] in
      let m = Char.code (String.unsafe_get s (i + 1)) in
      let n' = (n' lsl 6) lor (0x7f land m) in
      let m = Char.code (String.unsafe_get s (i + 2)) in
      let n' = (n' lsl 6) lor (0x7f land m) in
      let m = Char.code (String.unsafe_get s (i + 3)) in
      let n' = (n' lsl 6) lor (0x7f land m) in
      (n' lsl 6) lor (0x7f land m0)
    else if n <= 0xfd then
      let n' = n - 0xfc in
      let m0 = Char.code s.[i + 5] in
      let m = Char.code (String.unsafe_get s (i + 1)) in
      let n' = (n' lsl 6) lor (0x7f land m) in
      let m = Char.code (String.unsafe_get s (i + 2)) in
      let n' = (n' lsl 6) lor (0x7f land m) in
      let m = Char.code (String.unsafe_get s (i + 3)) in
      let n' = (n' lsl 6) lor (0x7f land m) in
      let m = Char.code (String.unsafe_get s (i + 4)) in
      let n' = (n' lsl 6) lor (0x7f land m) in
      (n' lsl 6) lor (0x7f land m0)
    else invalid_arg "UTF8.look"
  in
  UChar.of_int code

(** [search_head s i] returns the first offset [j >= i] whose byte is
    either below [0x80] or at least [0xc2], or an offset past the end of
    [s] when no such byte exists. *)
let rec search_head s i =
  if i >= String.length s then i
  else
    let n = Char.code (String.unsafe_get s i) in
    if n < 0x80 || n >= 0xc2 then i
    else search_head s (i + 1)

(** [next s i] returns the offset of the character following the one at
    offset [i].

    When the byte at [i] is in the range [0x80 .. 0xbf] (not a lead byte),
    the result is found by scanning forward with [search_head].  Otherwise
    the step is derived from the lead byte alone.

    Raises [Invalid_argument "UTF8.next"] when the byte at [i] is greater
    than [0xfd]; the access to [s.[i]] itself is bounds-checked. *)
let next s i =
  let n = Char.code s.[i] in
  if n < 0x80 then i + 1
  else if n < 0xc0 then search_head s (i + 1)
  else if n <= 0xdf then i + 2
  else if n <= 0xef then i + 3
  else if n <= 0xf7 then i + 4
  else if n <= 0xfb then i + 5
  else if n <= 0xfd then i + 6
  else invalid_arg "UTF8.next"

(** [search_head_backward s i] returns the greatest offset [j <= i] whose
    byte is either below [0x80] or at least [0xc2], or [-1] when the scan
    runs off the beginning of [s]. *)
let rec search_head_backward s i =
  if i < 0 then -1
  else
    let n = Char.code s.[i] in
    if n < 0x80 || n >= 0xc2 then i
    else search_head_backward s (i - 1)

(** [prev s i] returns the offset of the character preceding offset [i],
    or [-1] when there is none. *)
let prev s i = search_head_backward s (i - 1)

(** [move s i n] moves the offset [i] by [n] characters: forward through
    [next] when [n] is non-negative, backward through [prev] otherwise. *)
let move s i n =
  if n >= 0 then
    let rec forward pos remaining =
      if remaining <= 0 then pos
      else forward (next s pos) (remaining - 1)
    in
    forward i n
  else
    let rec backward pos remaining =
      if remaining >= 0 then pos
      else backward (prev s pos) (remaining + 1)
    in
    backward i n

(** [nth_aux s i n] applies [next] to offset [i] repeatedly, [n] times.
    The recursion stops only when the counter reaches exactly zero. *)
let rec nth_aux s i n =
  if n = 0 then i
  else nth_aux s (next s i) (n - 1)

(** [nth s n] returns the byte offset of the [n]-th character of [s],
    counting from zero. *)
let nth s n = nth_aux s 0 n

(** [first s] is the offset of the first character, which is always [0]. *)
let first _ = 0

(** [last s] is the offset of the last character of [s], or [-1] when no
    lead byte is found (in particular when [s] is empty). *)
let last s = search_head_backward s (String.length s - 1)

(** [out_of_range s i] tells whether [i] is outside the byte range of
    [s]. *)
let out_of_range s i = i < 0 || i >= String.length s

(** [compare_index s i j] orders two offsets of the same string: the
    result is negative, zero or positive according to [i - j]. *)
let compare_index _ i j = i - j

(** [get s n] returns the [n]-th character of [s], counting from zero. *)
let get s n = look s (nth s n)

(** [add_uchar buf u] appends the UTF-8 encoding of [u] to [buf].

    The number of bytes emitted (1 to 6) depends on the integer code
    returned by [UChar.uint_code].  Codes that are negative or at least
    [0x4000000] take the 6-byte form, whose lead byte is built with the
    checked [Char.chr]; that call raises [Invalid_argument] if the value
    does not fit in a byte. *)
let add_uchar buf u =
  let masq = 0b111111 in
  let k = UChar.uint_code u in
  if k < 0 || k >= 0x4000000 then begin
    Buffer.add_char buf (Char.chr (0xfc + (k lsr 30)));
    Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((k lsr 24) land masq)));
    Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((k lsr 18) land masq)));
    Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((k lsr 12) land masq)));
    Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((k lsr 6) land masq)));
    Buffer.add_char buf (Char.unsafe_chr (0x80 lor (k land masq)))
  end else if k <= 0x7f then
    Buffer.add_char buf (Char.unsafe_chr k)
  else if k <= 0x7ff then begin
    Buffer.add_char buf (Char.unsafe_chr (0xc0 lor (k lsr 6)));
    Buffer.add_char buf (Char.unsafe_chr (0x80 lor (k land masq)))
  end else if k <= 0xffff then begin
    Buffer.add_char buf (Char.unsafe_chr (0xe0 lor (k lsr 12)));
    Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((k lsr 6) land masq)));
    Buffer.add_char buf (Char.unsafe_chr (0x80 lor (k land masq)))
  end else if k <= 0x1fffff then begin
    Buffer.add_char buf (Char.unsafe_chr (0xf0 + (k lsr 18)));
    Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((k lsr 12) land masq)));
    Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((k lsr 6) land masq)));
    Buffer.add_char buf (Char.unsafe_chr (0x80 lor (k land masq)))
  end else begin
    Buffer.add_char buf (Char.unsafe_chr (0xf8 + (k lsr 24)));
    Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((k lsr 18) land masq)));
    Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((k lsr 12) land masq)));
    Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((k lsr 6) land masq)));
    Buffer.add_char buf (Char.unsafe_chr (0x80 lor (k land masq)))
  end

(** [init len f] builds the string made of the characters
    [f 0], [f 1], ..., [f (len - 1)], in that order.  [len] is also used
    as the initial capacity of the underlying buffer. *)
let init len f =
  let buf = Buffer.create len in
  for c = 0 to len - 1 do
    add_uchar buf (f c)
  done;
  Buffer.contents buf

(** [length_aux s c i] counts the characters of [s] from byte offset [i],
    starting from the running count [c].  Only lead bytes are inspected.

    Raises [Invalid_argument "UTF8.length"] when a byte in lead position
    is in [0x80 .. 0xbf] or is at least [0xfe]. *)
let rec length_aux s c i =
  if i >= String.length s then c
  else
    let n = Char.code (String.unsafe_get s i) in
    let k =
      if n < 0x80 then 1
      else if n < 0xc0 then invalid_arg "UTF8.length"
      else if n < 0xe0 then 2
      else if n < 0xf0 then 3
      else if n < 0xf8 then 4
      else if n < 0xfc then 5
      else if n < 0xfe then 6
      else invalid_arg "UTF8.length"
    in
    length_aux s (c + 1) (i + k)

(** [length s] returns the number of characters (not bytes) in [s]. *)
let length s = length_aux s 0 0

(** [iter_aux proc s i] applies [proc] to each character of [s] from byte
    offset [i] to the end, in order. *)
let rec iter_aux proc s i =
  if i >= String.length s then ()
  else begin
    let u = look s i in
    proc u;
    iter_aux proc s (next s i)
  end

(** [iter proc s] applies [proc] to every character of [s], in order. *)
let iter proc s = iter_aux proc s 0

(** [compare s1 s2] compares two UTF-8 strings bytewise.

    [String.compare] is used rather than [Pervasives.compare]: the
    [Pervasives] module no longer exists in recent OCaml releases, while
    [String.compare] is available in all of them and has the same
    specification on strings. *)
let compare s1 s2 = String.compare s1 s2

(** Raised by [validate] when the string is not well-formed. *)
exception Malformed_code

(** [validate s] returns [()] when [s] passes the checks below and raises
    [Malformed_code] otherwise.

    A sequence is rejected when its lead byte is in [0x80 .. 0xc1] or above
    [0xfd], when a continuation byte is missing or outside [0x80 .. 0xbf],
    or when the decoded value is smaller than the minimum that requires a
    sequence of that length (overlong form). *)
let validate s =
  (* [trail c i a] consumes [c] continuation bytes starting at offset [i],
     accumulating their payload bits into [a]. *)
  let rec trail c i a =
    if c = 0 then a
    else if i >= String.length s then raise Malformed_code
    else
      let n = Char.code (String.unsafe_get s i) in
      if n < 0x80 || n >= 0xc0 then raise Malformed_code
      else trail (c - 1) (i + 1) ((a lsl 6) lor (n - 0x80))
  in
  let rec main i =
    if i >= String.length s then ()
    else
      let n = Char.code (String.unsafe_get s i) in
      if n < 0x80 then main (i + 1)
      else if n < 0xc2 then raise Malformed_code
      else if n <= 0xdf then
        (if trail 1 (i + 1) (n - 0xc0) < 0x80 then raise Malformed_code
         else main (i + 2))
      else if n <= 0xef then
        (if trail 2 (i + 1) (n - 0xe0) < 0x800 then raise Malformed_code
         else main (i + 3))
      else if n <= 0xf7 then
        (if trail 3 (i + 1) (n - 0xf0) < 0x10000 then raise Malformed_code
         else main (i + 4))
      else if n <= 0xfb then
        (if trail 4 (i + 1) (n - 0xf8) < 0x200000 then raise Malformed_code
         else main (i + 5))
      else if n <= 0xfd then
        let n = trail 5 (i + 1) (n - 0xfc) in
        (* Same test as [n < 0x4000000], written with a shift. *)
        if (n lsr 16) < 0x400 then raise Malformed_code
        else main (i + 6)
      else raise Malformed_code
  in
  main 0

(** Buffers of UTF-8 text: the standard [Buffer] module with [add_char]
    redefined to append a Unicode character through [add_uchar]. *)
module Buf =
  struct
    include Buffer
    type buf = t
    let add_char = add_uchar
  end