123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321(** UTF-8 encoded Unicode strings. The type is normal string. *)(* Copyright (C) 2002, 2003 Yamagata Yoriyuki. *)(* This library is free software; you can redistribute it and/or *)(* modify it under the terms of the GNU Lesser General Public License *)(* as published by the Free Software Foundation; either version 2 of *)(* the License, or (at your option) any later version. *)(* As a special exception to the GNU Library General Public License, you *)(* may link, statically or dynamically, a "work that uses this library" *)(* with a publicly distributed version of this library to produce an *)(* executable file containing portions of this library, and distribute *)(* that executable file under terms of your choice, without any of the *)(* additional requirements listed in clause 6 of the GNU Library General *)(* Public License. By "a publicly distributed version of this library", *)(* we mean either the unmodified Library as distributed by the authors, *)(* or a modified version of this library that is distributed under the *)(* conditions defined in clause 3 of the GNU Library General Public *)(* License. This exception does not however invalidate any other reasons *)(* why the executable file might be covered by the GNU Library General *)(* Public License . *)(* This library is distributed in the hope that it will be useful, *)(* but WITHOUT ANY WARRANTY; without even the implied warranty of *)(* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU *)(* Lesser General Public License for more details. *)(* You should have received a copy of the GNU Lesser General Public *)(* License along with this library; if not, write to the Free Software *)(* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 *)(* USA *)(* You can contact the authour by sending email to *)(* yoriyuki.y@gmail.com *)typet=stringletempty=""type index=intletlength0n=if n<0x80then1elseifn<0xe0then2elseifn<0xf0then3else4letlooksi=letn'=letn=Char.code (String.unsafe_getsi)inifn<0x80thennelseifn<=0xdfthen(n-0xc0)lsl6lor(0x7fland(Char.code(String.unsafe_gets(i+1))))elseifn<=0xefthenletn'=n-0xe0inletm=Char.code(String.unsafe_gets(i+1))inletn'=n'lsl6lor(0x7flandm)inletm=Char.code(String.unsafe_gets(i+2))inn'lsl6lor(0x7flandm)elseletn'=n-0xf0inletm=Char.code(String.unsafe_gets(i+1))inletn'=n'lsl6lor(0x7flandm)inletm=Char.code(String.unsafe_gets(i+2))inletn'=n'lsl6lor(0x7flandm)inletm=Char.code(String.unsafe_gets(i+3))inn'lsl6lor(0x7flandm)inBatUChar.unsafe_chrn'letnextsi=letn=Char.codes.[i]inifn<0x80theni+1elseifn<=0xdftheni+2elseifn<=0xeftheni+3elsei+4letrecsearch_head_backwardsi=ifi<0then-1elseletn=Char.codes.[i]inifn<0x80||n>=0xc2thenielsesearch_head_backwards(i-1)letprevsi=search_head_backwards(i-1)letmovesin=ifn>=0thenletrecloop in=ifn<=0thenielseloop(nextsi)(n-1)inloopinelseletrecloopin=ifn>=0thenielseloop(prevsi)(n+1)inloopinletrecnth_auxsin=ifn=0thenielsenth_auxs(nextsi)(n-1)letnthsn=nth_auxs0nletfirst_=0letlasts=search_head_backwards(String.lengths-1)letout_of_rangesi=i<0||i>=String.lengthsletcompare_index_ij=i-jletget sn=looks(nthsn)letadd_ucharbufu=letmasq=0b111111inletk=BatUChar.codeuinifk<=0x7fthenBuffer.add_charbuf(Char.unsafe_chrk)elseifk<=0x7ff thenbeginBuffer.add_charbuf(Char.unsafe_chr(0xc0lor(klsr6)));Buffer.add_charbuf(Char.unsafe_chr(0x80lor(klandmasq)))end elseifk<= 0xffffthenbeginBuffer.add_charbuf(Char.unsafe_chr(0xe0lor(klsr12)));Buffer.add_char buf(Char.unsafe_chr(0x80lor((klsr6)landmasq)));Buffer.add_charbuf(Char.unsafe_chr(0x80lor(klandmasq)));endelsebeginBuffer.add_charbuf(Char.unsafe_chr(0xf0+(klsr18)));Buffer.add_char buf(Char.unsafe_chr(0x80lor((klsr12)landmasq)));Buffer.add_charbuf(Char.unsafe_chr(0x80lor((klsr6)landmasq)));Buffer.add_charbuf(Char.unsafe_chr(0x80lor(klandmasq)));endletinitlenf=letbuf=Buffer.createleninforc=0tolen -1doadd_ucharbuf(fc)done;Buffer.contentsbufletmakelenu=init len(fun_->u)letof_char u=make1uletof_string_unsafe s=sletto_string_unsafes=sletreclength_auxsci=ifi>=String.lengthsthencelseletn=Char.code(String.unsafe_getsi)inletk=ifn<0x80then1elseifn<0xe0then2elseifn<0xf0then3else4inlength_auxs(c+1)(i+k)letlengths=length_auxs00letreciter_auxprocsi=ifi>=String.lengthsthen()elseletu=looksiinprocu;iter_auxprocs(nextsi)letiterprocs=iter_auxprocs0letreciteri_auxfsicount=ifi>=String.lengthsthen()elseletu=looksiinfucount;iteri_auxfs(nextsi)(count+1)letiterifs=iteri_auxfs00letcompares1s2=String.compares1s2letsubsnlen =letipos=moves(firsts)ninletjpos=movesiposleninString.subsipos (jpos-ipos)exceptionMalformed_codeletvalidates=letrectrailcia=ifc=0thenaelseifi>=String.lengthsthenraiseMalformed_code elseletn=Char.code(String.unsafe_getsi)inifn<0x80||n>=0xc0thenraiseMalformed_codeelsetrail(c-1)(i+1)(alsl6lor(0x7flandn))inletrecmaini=ifi>=String.lengthsthen()elseletn=Char.code(String.unsafe_getsi)inifn<0x80thenmain(i+1)elseif n<0xc2thenraiseMalformed_codeelseifn<=0xdftheniftrail1(i+1)(n-0xc0)<0x80thenraiseMalformed_codeelsemain(i+2)elseifn<=0xefthenletn'=trail2(i+1)(n-0xe0)inifn'<0x800thenraiseMalformed_code elseifn'>=0xd800&&n'<=0xdfff thenraiseMalformed_codeelsemain(i+3)elseifn<=0xf4thenletn=trail3(i+1)(n-0xf0)inifn<0x10000||n>0x10FFFFthenraiseMalformed_codeelsemain(i+4)elseraiseMalformed_codeinmain0letof_asciis=fori=0toString.lengths-1doifChar.codes.[i]>=0x80thenraiseMalformed_code;done;##V<5##String.copys##V>=5##sletof_latin1s=init(String.lengths)(funi->BatUChar.of_chars.[i])moduleBuf=structincludeBuffertypebuf=tletadd_char =add_ucharendletmapfus=letb=Buf.create(lengthus)initer(fun c->Buf.add_charb(fc))us;Buf.contentsbletfilter_mapfus=letb=Buf.create(lengthus)initer(fun c->matchfcwithNone->()|Somec->Buf.add_charbc)us;Buf.contentsbletfilterpus=letb=Buf.create(lengthus)initer(fun c->ifpcthenBuf.add_charbc)us;Buf.contentsbletfoldfas=letrecloopai=ifout_of_rangesithenaelseleta'=fa(looksi)inloopa'(nextsi)inloopa0letenums=letsl=String.lengthsinleti=(ref(firsts))inBatEnum.from(fun()->if!i=slthenraiseBatEnum.No_more_elementselsebeginletc=looks!iini:=nexts!i;cend)(*$T enum
"" |> of_latin1 |> enum |> BatList.of_enum = []
"foo" |> of_latin1 |> enum |> BatList.of_enum = List.map BatUChar.of_char ['f'; 'o'; 'o']
let e = of_latin1 "abcdef" |> enum in \
for _i = 0 to 2 do BatEnum.junk e done; \
let e2 = BatEnum.clone e in \
let to_string en = BatList.of_enum en |> List.map BatUChar.char_of |> BatString.implode in \
to_string e = "def" && to_string e2 = "def"
init 3 (fun i -> BatUChar.of_int (1211+i)) |> enum |> BatList.of_enum = List.map BatUChar.of_int [1211;1212;1213]
*)(* The last test checks that we can make a round-trip of non-ASCII strings like "һҼҽ" *)letescaped=String.escapedmoduleByteIndex :sigtypet=stringtypeb_idx(* = private int*)typechar_idx=intvalof_int_unsafe :int->b_idxval to_int:b_idx->intvalnext:t->b_idx->b_idxval prev:t->b_idx->b_idxval of_char_idx:t->char_idx->b_idxvalat_end:t->b_idx->boolvalout_of_range :t->b_idx->boolvalfirst:b_idxvallast:t->b_idxvalmove:t->b_idx->int->b_idxval look:t->b_idx->BatUChar.tend=structtypet=stringtypeb_idx=inttypechar_idx=intexternalof_int_unsafe :int->b_idx="%identity"externalto_int :b_idx->int="%identity"letlook=lookletnext=nextletprev=prevletfirst=0letlastus=prevus(String.lengthus)letat_endusbi=bi=String.lengthusletout_of_range usbi=bi<0||bi >=String.lengthusletmoveusbin=(* fastermoving positive than negative n *)letbi=refbiinletstep=ifn>0then nextelseprevinfor_j=1toabsndobi:=stepus!bidone;!biletof_char_idxusci=moveusfirstciend(* Could be improved. *)letrindexusch=letrecaux cibi=ifByteIndex.out_of_rangeusbithenraiseNot_found;ifByteIndex.lookusbi=chthencielse aux(ci-1)(ByteIndex.prevusbi)inaux0(ByteIndex.lastus)letreccontains_auxstepbiusch=ifByteIndex.out_of_rangeusbithenfalseelseifByteIndex.lookusbi=chthentrueelse contains_auxstep(stepusbi)uschletcontainsusch=contains_aux ByteIndex.nextByteIndex.firstusch