1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087(*
* BatText - Unicode text library
*
* Copyright (C) 2012 The Batteries Included Team
* Copyright (C) 2007 Mauricio Fernandez <mfp@acm.org>
* Copyright (C) 2008 Edgar Friendly <thelema314@gmail.com>
* Copyright (C) 2008 David Teller, LIFO, Universite d'Orleans
*
* Rope: Rope: an implementation of the data structure described in
*
* Boehm, H., Atkinson, R., and Plass, M. 1995. Ropes: an alternative to
* strings. Softw. Pract. Exper. 25, 12 (Dec. 1995), 1315-1330.
*
* Motivated by Luca de Alfaro's extensible array implementation Vec.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version,
* with the special exception on linking described in file LICENSE.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*)moduleUTF8=BatUTF8moduleUChar=BatUChar(**Low-level optimization*)letint_max(x:int)(y:int)=ifx<ythenyelse xletint_min(x:int)(y:int)=ifx<ythenxelse yletsplices1offlens2=letlen1 =String.lengths1and len2=String.lengths2inletoff =ifoff<0thenlen1+off-1elseoffinletlen=int_min(len1-off)leninletout_len=len1-len+len2inlets=Bytes.createout_leninBytes.blit_string s10s0off;(* s1 beforesplice point *)Bytes.blit_strings20sofflen2;(* s2 at splice point *)Bytes.blit_string(* s1 after off+len *)s1(off+len)s(off+len2)(len1 -(off+len));Bytes.unsafe_to_stringstypet=Empty(**An empty rope*)|Concatoft*int*t*int*int(**[Concat l ls r rs h] is the concatenation of
ropes [l] and [r], where [ls] is the total
length of [l], [rs] is the length of [r]
and [h] is the height of the node in the
tree, used for rebalancing. *)|Leafofint*UTF8.t(**[Leaf l t] is string [t] with length [l],
measured in number of Unicode characters.*)typeforest_element={mutablec:t;mutablelen:int}letstr_append=(^)letempty_str=""letstring_of_string_listl=String.concatempty_str l(* 48 limits maxrope size to 220GB on 64 bit,
* ~ 700MB on 32bit (length fields overflow after that) *)letmax_height=48(* actual size will be that plus 1 word header;
* the code assumes it's an even num.
* 256 gives up to a 50% overhead in the worst case (all leaf nodes near
* half-filled *)letleaf_size=256(* utf-8 characters, not bytes *)(* MAIN CODE STARTS HERE *)exceptionOut_of_boundsletempty=Empty(* by construction, there cannot be Empty or Leaf "" leaves *)letis_empty=functionEmpty->true|_->falseletheight=function|Empty|Leaf_->0|Concat(_,_,_,_,h)->hletlength=function|Empty->0|Leaf(l,_)->l|Concat(_,cl,_,cr,_)->cl+crletmake_concatlr=lethl=heightlandhr=heightrinletcl=lengthlandcr=lengthrinConcat(l,cl,r,cr,ifhl>=hrthenhl+1elsehr+1)letmin_len=letfib_tbl=Array.makemax_height0inlet recfibn=matchfib_tbl.(n)with|0->letlast=fib(n-1)andprev=fib(n-2)inletr=last+previnletr=ifr>lastthenrelselastin(* check overflow *)fib_tbl.(n)<-r;r|n->ninfib_tbl.(0)<-leaf_size+1;fib_tbl.(1)<-3*leaf_size/2+1;Array.initmax_height(funi->ifi=0then1elsefib(i-1))letmax_length=min_len.(Array.length min_len-1)letconcat_fastlr=matchlwith|Empty->r|Leaf_|Concat(_,_,_,_,_)->matchrwith|Empty->l|Leaf_|Concat(_,_,_,_,_)->make_concatlr(* based on Hans-J. Boehm's *)letadd_forestforestropelen=leti=ref0inletsum=refemptyinwhilelen>min_len.(!i+1)doifforest.(!i).c<>Emptythenbeginsum:=concat_fastforest.(!i).c!sum;forest.(!i).c<-Emptyend;incridone;sum:=concat_fast !sumrope;letsum_len=ref(length!sum)inwhile !sum_len>=min_len.(!i)doifforest.(!i).c<>Emptythenbeginsum:=concat_fastforest.(!i).c!sum;sum_len :=!sum_len +forest.(!i).len;forest.(!i).c<-Empty;end;incridone;decri;forest.(!i).c<-!sum;forest.(!i).len<-!sum_lenletconcat_forestforest=Array.fold_left (funsx->concat_fast x.cs)Emptyforestletrecbalance_insertropelenforest=matchropewith|Empty->()|Leaf_->add_forestforestropelen|Concat(l,cl,r,cr,h)whenh>=max_height||len<min_len.(h)->balance_insertlclforest;balance_insertrcrforest|x->add_forestforestxlen(* function orbalanced *)letbalancer=matchrwith|Empty|Leaf_->r|_->letforest=Array.initmax_height(fun_->{c=Empty;len=0})inbalance_insertr(lengthr)forest;concat_forestforestletbal_if_neededlr=letr=make_concat lrinifheightr<max_heightthenrelsebalancerletconcat_strl=function|Empty|Concat(_,_,_,_,_)->invalid_arg"Text.concat_str"|Leaf(lenr,rs)asr->matchlwith|Empty->r|Leaf(lenl,ls)->letslen=lenr+lenlinifslen<=leaf_sizethenLeaf((lenl+lenr),(str_appendlsrs))elsemake_concatlr(* height = 1 *)|Concat(ll,cll,Leaf(lenlr,lrs),clr,h)->let slen=clr+lenrinifclr+lenr <=leaf_sizethenConcat(ll,cll,Leaf((lenlr+lenr),(str_appendlrsrs)),slen,h)elsebal_if_neededlr|_->bal_if_neededlrletappend_charcr=concat_strr(Leaf(1,(UTF8.make1c)))letappendl=function|Empty->l|Leaf_asr->concat_strlr|Concat(Leaf(lenrl,rls),rlc,rr,rc,h)asr->(matchlwithEmpty->r|Concat(_,_,_,_,_)->bal_if_neededlr|Leaf(lenl,ls)->letslen=rlc+lenlinifslen<=leaf_sizethenConcat(Leaf((lenrl+lenl),(str_appendlsrls)),slen,rr,rc,h)elsebal_if_neededlr)|r->(matchlwithEmpty->r|_->bal_if_neededlr)let(^^^)=appendletprepend_char cr=append(Leaf(1,(UTF8.make1c)))rletgetri=letrecauxi=functionEmpty ->raiseOut_of_bounds|Leaf(lens,s)->ifi>=0&&i<lensthenUTF8.getsielseraiseOut_of_bounds|Concat(l,cl,r,_cr,_)->ifi<clthenauxilelse aux(i-cl)rinaux irletcopy_setuscposc=letipos =UTF8.ByteIndex.of_char_idx uscposinletjpos=UTF8.ByteIndex.next usiposinleti=UTF8.ByteIndex.to_intiposandj=UTF8.ByteIndex.to_intjposinspliceusi(j-i)(UTF8.of_charc)letsetriv=letrecauxi=functionEmpty ->raiseOut_of_bounds|Leaf(lens,s)->ifi>=0&&i<lensthenlets=copy_setsivinLeaf(lens,s)elseraiseOut_of_bounds|Concat(l,cl,r,_cr,_)->ifi<clthenappend(auxil)relseappendl(aux(i-cl)r)inauxirmoduleIter=struct(* Iterators are used for iterating efficiently over multiple ropes
at the same time *)typeiterator={(* Current leaf in which the iterator is *)mutableleaf:UTF8.t;(* Current byte position of the iterator *)mutableidx:UTF8.ByteIndex.b_idx;(* Ropes not yet visited *)mutablerest:tlist;}letcopyi={iwith idx=i.idx;}(* Initial iterator state: *)letmakerope={leaf=UTF8.empty;idx=UTF8.ByteIndex.first;rest=ifrope=Emptythen[]else[rope]}letrecnext_leaf=function|Empty ::l->next_leafl|Leaf(_len,str)::l->Some(str,l)|Concat(left,_left_len,right,_right_len,_height)::l->next_leaf (left::right::l)|[]->None(* Advance the iterator to the next position, and return current
character: *)letnextiter=ifUTF8.ByteIndex.at_enditer.leafiter.idxthen(* We are at the end of the current leaf, find another one: *)matchnext_leafiter.restwith|None ->None|Some(leaf,rest)->iter.leaf <-leaf;iter.idx<-UTF8.ByteIndex.nextleafUTF8.ByteIndex.first;iter.rest<-rest;Some(UTF8.ByteIndex.lookleafUTF8.ByteIndex.first)elsebegin(* Just advance in the current leaf: *)letch=UTF8.ByteIndex.lookiter.leafiter.idxiniter.idx<-UTF8.ByteIndex.nextiter.leafiter.idx;Somechend(* Same thing but map leafs: *)letnext_mapfiter=ifUTF8.ByteIndex.at_enditer.leafiter.idxthenmatchnext_leafiter.restwith|None ->None|Some(leaf,rest)->let leaf=fleafiniter.leaf<-leaf;iter.idx<-UTF8.ByteIndex.nextleafUTF8.ByteIndex.first;iter.rest<-rest;Some(UTF8.ByteIndex.lookleafUTF8.ByteIndex.first)elsebeginletch=UTF8.ByteIndex.lookiter.leafiter.idxiniter.idx<-UTF8.ByteIndex.nextiter.leafiter.idx;Somechend(* Same thing but in reverse order: *)letrecprev_leaf=function|Empty ::l->prev_leafl|Leaf(_len,str)::l->Some(str,l)|Concat(left,_left_len,right,_right_len,_height)::l->prev_leaf (right::left::l)|[]->Noneletpreviter=ifiter.idx=UTF8.ByteIndex.firstthenmatchprev_leafiter.restwith|None ->None|Some(leaf,rest)->iter.leaf <-leaf;iter.idx<-UTF8.ByteIndex.lastleaf;iter.rest<-rest;Some(UTF8.ByteIndex.lookleafiter.idx)elsebeginiter.idx<-UTF8.ByteIndex.previter.leafiter.idx;Some(UTF8.ByteIndex.lookiter.leafiter.idx)endend(* Can be improved? *)letcompareab=letia=Iter.makeaandib=Iter.makebinlet recloop_=matchIter.nextia,Iter.nextibwith|None,None->0|None,_->-1|_,None->1|Someca,Somecb->matchUChar.comparecacbwith|0->loop()|n->ninloop()letof_ustringustr=(* We need fast access to raw bytes: *)letbytes=ustrinletbyte_length=String.lengthbytes in(* - [rope] is the accumulator
- [start_byte_idx] is the byte position of the current slice
- [current_byte_idx] is the current byte position
- [slice_size] is the number of unicode characters contained
between [start_byte_idx] and [current_byte_idx] *)letrecloopropestart_byte_idxcurrent_byte_idxslice_size=ifcurrent_byte_idx=byte_lengththenbeginifslice_size=0thenropeelseadd_slice ropestart_byte_idxcurrent_byte_idxslice_sizeendelsebeginifslice_size=leaf_sizethen(* We have enough unicode characters for this slice, extract
it and add a leaf to the rope: *)loop(add_sliceropestart_byte_idxcurrent_byte_idxslice_size)current_byte_idx current_byte_idx0elseletnext_byte_idx=UTF8.nextustrcurrent_byte_idxinloopropestart_byte_idxnext_byte_idx(slice_size+1)endandadd_sliceropestart_byte_idxend_byte_idxslice_size=appendrope(Leaf(slice_size,(* This is correct, we are just extracting a
sequence of well-formed UTF-8 encoded unicode
characters: *)UTF8.of_string_unsafe(String.subbytesstart_byte_idx(end_byte_idx -start_byte_idx))))inloopEmpty 000letof_string s=(* Validate + unsafe to avoid an extra copy (it is OK because
of_ustring do not reuse its argument in the resulting rope): *)UTF8.validates;of_ustring(UTF8.of_string_unsafes)letappend_usrus=appendr(of_ustringus)letrecmakelenc=letrecconcatlooplenir=ifi<=lenthen(*TODO: test for sharing among substrings *)concatlooplen(i*2)(appendrr)elseriniflen=0thenEmptyelseiflen<=leaf_sizethenLeaf (len,(UTF8.makelenc))elseletrope=concatlooplen2(of_ustring(UTF8.make1c))inappendrope(make(len-lengthrope)c)letof_ucharc=make1cletof_char c=of_uchar(UChar.of_charc)letsubrstartlen=letrecauxstartlen=functionEmpty->ifstart<>0||len<>0thenraise Out_of_boundselseEmpty|Leaf(lens,s)->iflen<0||start<0||start+len>lensthenraiseOut_of_boundselseiflen>0then(* Leaf "" cannot happen *)(tryLeaf(len,(UTF8.subsstartlen))with_->raiseOut_of_bounds)elseEmpty|Concat(l,cl,r,cr,_)->ifstart<0||len<0||start +len >cl+crthenraiseOut_of_bounds;letleft=ifstart=0theniflen>=clthenlelseaux0lenlelseifstart >clthenEmptyelse ifstart+len>=clthenauxstart (cl-start)lelseauxstartlenlinletright=ifstart<=clthenletupto =start+leninifupto=cl+crthenrelseifupto<clthenEmptyelseaux0(upto-cl)relseaux(start-cl)lenrinappendleftrightinauxstartlenrletinsertstartroper=append (append(subr0start)rope)(subrstart(lengthr-start))letremovestartlenr=append (subr0start)(subr(start+len)(lengthr-start -len))letto_ustringr=letrecstringsl=function|Empty ->l|Leaf(_,s)->s::l|Concat(left,_,right,_,_)->strings(strings lright)leftinstring_of_string_list(strings[]r)letrecbulk_iterf=function|Empty->()|Leaf(_,s)->fs|Concat(l,_,r,_,_)->bulk_iterfl;bulk_iterfrletrecbulk_iteri?(base=0)f=function|Empty->()|Leaf(_,s)->fbases|Concat(l,cl,r,_,_)->bulk_iteri~basefl;bulk_iteri~base:(base+cl)frletreciterf=function|Empty->()|Leaf(_,s)->UTF8.iterfs|Concat(l,_,r,_,_)->iterfl;iterfrletreciteri?(base=0)f=function|Empty->()|Leaf(_,s)->UTF8.iteri(funcj->f(base+j)c)s|Concat(l,cl,r,_,_)->iteri~basefl;iteri~base:(base+cl)frletrecbulk_iteri_backwards~topf=function|Empty->()|Leaf(_lens,s)->ftops|Concat(l,_,r,cr,_)->bulk_iteri_backwards~topfr;bulk_iteri_backwards~top:(top-cr)flletrecrange_iterfstartlen=function|Empty->ifstart<>0||len<>0thenraise Out_of_bounds|Leaf (lens,s)->letn=start +leninifstart>=0&&len>=0&&n<= lensthenfori=start ton-1dof(UTF8.looks(UTF8.nthsi))(*TODO: use enum to iterate efficiently*)doneelseraiseOut_of_bounds|Concat(l,cl,r,cr,_)->ifstart<0||len<0||start +len >cl+crthenraiseOut_of_bounds;if start<clthenbeginletupto=start+leninifupto<=clthenrange_iter fstartlenlelsebeginrange_iterfstart(cl-start)l;range_iterf0(upto-cl)rendendelsebeginrange_iterf(start-cl)lenrendletrecrange_iterif?(base=0)startlen=function|Empty->ifstart<>0||len<>0thenraise Out_of_bounds|Leaf (lens,s)->letn=start +leninifstart>=0&&len>=0&&n<= lensthenfori=start ton-1dof(base+i)(UTF8.looks(UTF8.nthsi))(*TODO:use enum to iterate efficiently*)doneelseraiseOut_of_bounds|Concat(l,cl,r,cr,_)->ifstart<0||len<0||start +len >cl+crthenraiseOut_of_bounds;if start<clthenbeginletupto=start+leninifupto<=clthenrange_iteri f~basestartlenlelse beginrange_iterif~basestart(cl-start)l;range_iterif~base:(base+cl-start)0(upto-cl)rendendelsebeginrange_iterif~base(start-cl)lenrendletrecfoldfa=function|Empty->a|Leaf(_,s)->UTF8.fold(funac->fac)as|Concat(l,_,r,_,_)->foldf(foldfal)rletrecbulk_foldfa=function|Empty->a|Leaf(_,s)->fas|Concat(l,_,r,_,_)->bulk_foldf(bulk_foldfal)rletto_stringt=(* We use unsafe version to avoid the copy of the non-reachable
temporary string: *)UTF8.to_string_unsafe(to_ustringt)letinitlenf=Leaf(len,UTF8.initlenf)let of_string_unsafes=of_ustring(UTF8.of_string_unsafes)letof_inti=of_string_unsafe(string_of_inti)letof_floatf=of_string_unsafe(string_of_floatf)letto_intr=int_of_string(UTF8.to_string_unsafe(to_ustringr))letto_floatr=float_of_string(UTF8.to_string_unsafe (to_ustringr))letbulk_mapfr=bulk_fold(funaccs->append_usacc(fs))Emptyrletmapfr=bulk_map(funs->UTF8.mapfs)rletbulk_filter_mapfr=bulk_fold(funaccs->matchfswithNone->acc|Somer->append_usaccr)Emptyrletfilter_mapfr=bulk_map(UTF8.filter_mapf)rletfilterfr=bulk_map(UTF8.filterf)rletleftrlen=subr0lenletrightrlen=let rlen=lengthrinsubr(rlen-len)lenlethead=leftlettailrpos =subrpos(lengthr-pos)let indexru=leti=Iter.makerinletrecloopn=matchIter.nextiwith|None->raiseNot_found|Someu'->ifUChar.equu'thennelseloop(n+1)inloop0letenumr=let nextiter()=matchIter.nextiterwith|None->raise BatEnum.No_more_elements|Somex->xandcountiter()=letn=ref0inletiter'=Iter.copyiterinbegintrywhiletruedomatchIter.nextiter'withNone->raiseExit|Some_->incrndonewithExit->()end;!ninletrecmakeiter=BatEnum.make~next:(nextiter)~clone:(cloneiter)~count:(countiter)andcloneiter()=make(Iter.copyiter)inmake(Iter.maker)letbackwardsr=letnextiter()=matchIter.previterwith|None->raise BatEnum.No_more_elements|Somex->xandcountiter()=letn=ref0inletiter'=Iter.copyiterinbegintrywhiletruedomatchIter.previter'withNone->raiseExit|Some_->incrndonewithExit->()end;!ninletrecmakeiter=BatEnum.make~next:(nextiter)~clone:(cloneiter)~count:(countiter)andcloneiter()=make(Iter.copyiter)inmake(Iter.maker)letof_enume=letsize=BatEnum.counteininitsize(fun_i->tryBatEnum.get_exnewithBatEnum.No_more_elements->assertfalse)(*$Q enum; of_enum
(Q.array Q.small_int) (fun a -> \
let s = BatUTF8.init (Array.length a) (fun i -> BatUChar.chr (Array.get a i)) in \
s = (of_string s |> enum |> of_enum |> to_string))
*)moduleReturn=BatReturnletindex_fromrbaseitem=Return.with_label(funlabel->letindex_auxic=ifc=itemthenReturn.returnlabeliinrange_iteriindex_auxbase(lengthr-base)r;raiseNot_found)(*$T index_from
index_from (of_string "batteries") 0 (BatUChar.of_char 't') = 2
index_from (of_string "batteries") 3 (BatUChar.of_char 't') = 3
Result.(catch (index_from (of_string "batteries") 4) (BatUChar.of_char 't') \
|> is_exn Not_found)
Result.(catch (index_from (of_string "batteries") 20) (BatUChar.of_char 't') \
|> is_exn Out_of_bounds)
*)letrindexrchar=Return.with_label(funlabel->letindex_auxius=tryletp=UTF8.rindexuscharinReturn.returnlabel(p+i)withNot_found->()inbulk_iteri_backwards~top:(lengthr-1)index_auxr;raiseNot_found)(*$T rindex
rindex (of_string "batteries") (BatUChar.of_char 't') = 3
rindex (of_string "batt") (BatUChar.of_char 't') = 3
try ignore (rindex (of_string "batteries") (BatUChar.of_char 'y')); false with Not_found -> true
*)letrindex_fromrstartchar=letrsub=leftr(start+1)in(rindexrsubchar)(*$T rindex_from
let s = "batteries" in rindex_from (of_string s) (String.length s - 1) (BatUChar.of_char 't') = 3
let s = "batteries" in rindex_from (of_string s) 2 (BatUChar.of_char 't') = 2
try ignore (rindex_from (of_string "batteries") 4 (BatUChar.of_char 'y')); false with Not_found -> true
try ignore (rindex_from (of_string "batteries") 20 (BatUChar.of_char 'y')); false with Out_of_bounds -> true
*)letcontainsrchar=Return.with_label(funlabel->letcontains_auxus=ifUTF8.contains uscharthenReturn.returnlabeltrueinbulk_iter contains_auxr;false)(*$T contains
contains empty (BatUChar.of_char 't') = false
contains (of_string "") (BatUChar.of_char 't') = false
contains (of_string "batteries") (BatUChar.of_char 't') = true
contains (of_string "batteries") (BatUChar.of_char 'y') = false
*)letcontains_fromrstartchar=Return.with_label (funlabel->letcontains_auxc=ifc=charthenReturn.returnlabeltrueinrange_itercontains_auxstart(lengthr-start)r;false)(*$T contains_from
try ignore (contains_from empty 4 (BatUChar.of_char 't')); false with Out_of_bounds -> true
try ignore (contains_from (of_string "") 4 (BatUChar.of_char 't')); false with Out_of_bounds -> true
contains_from (of_string "batteries") 4 (BatUChar.of_char 't') = false
contains_from (of_string "batteries") 3 (BatUChar.of_char 't') = true
contains_from (of_string "batteries") 2 (BatUChar.of_char 't') = true
contains_from (of_string "batteries") 1 (BatUChar.of_char 't') = true
contains_from (of_string "batteries") 4 (BatUChar.of_char 'y') = false
*)letrcontains_fromrstopchar=Return.with_label(funlabel->letcontains_auxc=ifc=charthenReturn.returnlabeltrueinrange_itercontains_aux0(stop+1)r;false)(*$T rcontains_from
try ignore (rcontains_from empty 4 (BatUChar.of_char 't')); false with Out_of_bounds -> true
try ignore (rcontains_from (of_string "") 4 (BatUChar.of_char 't')); false with Out_of_bounds -> true
rcontains_from (of_string "batteries") 4 (BatUChar.of_char 't') = true
rcontains_from (of_string "batteries") 3 (BatUChar.of_char 't') = true
rcontains_from (of_string "batteries") 2 (BatUChar.of_char 't') = true
rcontains_from (of_string "batteries") 1 (BatUChar.of_char 't') = false
rcontains_from (of_string "batteries") 4 (BatUChar.of_char 'y') = false
*)letequalr1r2=comparer1r2=0letstarts_withrprefix=letir=Iter.makerandiprefix=Iter.makeprefixinletrecloop_=matchIter.nextiprefixwith|None->true|Somech1->matchIter.nextirwith|None->false|Somech2->UChar.comparech1ch2=0&&loop()inloop()let ends_withrsuffix=letir=Iter.makerandisuffix=Iter.makesuffixinletrecloop_=matchIter.previsuffixwith|None->true|Somech1->matchIter.previrwith|None->false|Somech2->UChar.comparech1ch2=0&&loop()inloop()(** find [sub] within [rop] or raises Not_found *)letfind_fromropofssub_rop=letlen=lengthropinifofs<0||ofs>lenthenraiseOut_of_bounds;letmatchlen=lengthsub_ropinletsub_rop=to_ustringsub_ropinletcheck_atpos=sub_rop=(to_ustring(subropposmatchlen))in(* TODO: inefficient *)Return.with_label(funlabel->fori=ofsto len-matchlendoifcheck_atithenReturn.returnlabelidone;raise Not_found)(*$T find_from
find_from (of_string "foobarbaz") 4 (of_string "ba") = 6
find_from (of_string "foobarbaz") 7 (of_string "") = 7
Result.(catch (find_from (of_string "") 0) (of_string "a") |> is_exn Not_found)
let foo = of_string "foo" in Result.(catch (find_from foo 2) foo |> is_exn Not_found)
let foo = of_string "foo" in Result.(catch (find_from foo 3) foo |> is_exn Not_found)
let foo = of_string "foo" in Result.(catch (find_from foo 4) foo |> is_exn Out_of_bounds)
let foo = of_string "foo" in Result.(catch (find_from foo (-1)) foo |> is_exn Out_of_bounds)
*)letfindropsub=find_fromrop0subletrfind_fromropsufsub_rop=ifsuf+1<0|| suf+1>lengthropthenraiseOut_of_bounds;letmatchlen=lengthsub_ropinletsub_rop=to_ustringsub_ropinletcheck_atpos=sub_rop=(to_ustring(subropposmatchlen))in(* TODO: inefficient *)Return.with_label(funlabel->fori=suf-matchlen+1downto0doifcheck_atithenReturn.returnlabelidone;raise Not_found)(*$T rfind_from
rfind_from (of_string "foobarbaz") 5 (of_string "ba") = 3
rfind_from (of_string "foobarbaz") 7 (of_string "ba") = 6
rfind_from (of_string "foobarbaz") 6 (of_string "ba") = 3
rfind_from (of_string "foobarbaz") 7 (of_string "") = 8
Result.(catch (rfind_from (of_string "") 3) empty |> is_exn Out_of_bounds)
Result.(catch (rfind_from (of_string "") (-1)) (of_string "a") |> is_exn Not_found)
Result.(catch (rfind_from (of_string "foobarbaz") 2) (of_string "ba") |> is_exn Not_found)
Result.(catch (rfind_from (of_string "foo") 3) (of_string "foo") |> is_exn Out_of_bounds)
Result.(catch (rfind_from (of_string "foo") (-2)) (of_string "foo") |> is_exn Out_of_bounds)
*)letrfindropsub=rfind_fromrop(length rop-1)subletexistsr_strr_sub =tryignore(find r_str r_sub);truewithNot_found->falseletstrip_default_chars=List.mapUChar.of_char [' ';'\t';'\r';'\n']letstrip?(chars=strip_default_chars)rope=letrecstrip_leftniter=matchIter.next iterwith|None->Empty|SomechwhenList.memchchars->strip_left (n+1)iter|_->subropen(strip_right(lengthrope-n)(Iter.makerope))andstrip_rightniter=matchIter.prev iterwith|None->assertfalse|SomechwhenList.memchchars->strip_right(n-1)iter|_->ninstrip_left0(Iter.makerope)letlchop=function|Empty->Empty|str->substr1(lengthstr-1)let rchop=function|Empty->Empty|str->substr0(lengthstr-1)letof_listl=lete=reflinletget_leaf()=Return.label(funlabel->letb=Buffer.create256infor_i =1to256domatch!ewith[]->Return.returnlabel(false,UTF8.of_string_unsafe(Buffer.contentsb))|c::rest->Buffer.add_stringb(UTF8.to_string_unsafe (UTF8.of_charc));e:=restdone;(true,UTF8.of_string_unsafe(Buffer.contentsb)))inletrecloopr=(* concat 256 characters at a time *)matchget_leaf()with(true,us)->loop(appendr(of_ustringus))|(false,us)->appendr(of_ustringus)inloopEmptyletsplicerstartlennew_sub=letstart=ifstart >=0thenstartelse(length r)+start inappend(leftrstart)(appendnew_sub(tailr(start+len)))letfillrstartlenchar=splicerstart len (makelenchar)letblitrsrcoffsrcrdstoffdst len =splicerdstoffdstlen(subrsrcoffsrclen)letconcat sepr_list=matchr_listwith|[]->empty|h::t->List.fold_left(funr1r2->appendr1(appendsepr2))ht(**T concat Text.concat (Text.of_string "xyz") [] = Text.empty
**)letescapedr=bulk_mapUTF8.escaped rletreplace_charsfr=fold(funaccs->append_usacc(fs))Emptyrletsplitrsep=leti=findrsep inheadri,tailr(i+lengthsep)(*$Tsplit
split (of_string "OCaml, the coolest FP language.") (of_char ' ') = \
(of_string "OCaml,", of_string "the coolest FP language.")
split (of_string "OCaml, the coolest FP language.") (of_char '.') = \
(of_string "OCaml, the coolest FP language", empty)
Result.(catch (split (of_string "OCaml, the coolest FP language.")) \
(of_char '!') |> is_exn Not_found)
*)letrsplit(r:t)sep=leti=rfindrsepinheadri,tailr(i+lengthsep)(*$Trsplit
rsplit (of_string "OCaml, the coolest FP language.") (of_char ' ') = \
(of_string "OCaml, the coolest FP", of_string "language.")
rsplit (of_string "OCaml, the coolest FP language.") (of_char 'O') = \
(empty, of_string "Caml, the coolest FP language.")
Result.(catch (rsplit (of_string "OCaml, the coolest FP language.")) \
(of_char '!') |> is_exn Not_found)
*)(** An implementation of [nsplit] in one pass.
This implementation traverses the string backwards, hence building
the list of substrings from the end to the beginning, so as to
avoid a call to [List.rev]. *)letnsplitstrsep=ifis_empty strthen[]elseifis_empty septheninvalid_arg"Text.nsplit: empty sep not allowed"else(* str is not empty *)letseplen=lengthsepinletrecauxaccofs=ifofs>=0then(matchtrySome(rfind_fromstrofssep)with Not_found->Nonewith|Someidx->(* sep found *)letend_of_sep=idx+seplen-1inifend_of_sep=ofs(* sep at end ofstr *)thenaux(empty::acc)(idx-1)elselettoken=substr(end_of_sep+1)(ofs -end_of_sep)inaux(token::acc)(idx-1)|None->(* sep NOT found *)(substr0(ofs+1))::acc)else(* Negative ofs: the last sep started at the beginning of str *)empty::accinaux[](length str-1)(*$T nsplit
nsplit (of_string "OCaml, the coolest FP language.") (of_char 'o') \
|> List.map to_string = ["OCaml, the c"; ""; "lest FP language."]
nsplit (of_string "OCaml, the coolest FP language.") (of_char '!') \
|> List.map to_string = ["OCaml, the coolest FP language."]
nsplit (of_string "1,2,3") (of_string ",") \
|> List.map to_string = ["1"; "2"; "3"]
nsplit (of_string "a;b;c") (of_string ";") \
|> List.map to_string = ["a"; "b"; "c"]
nsplit (of_string "") (of_string "x") = []
try ignore (nsplit (of_string "abc") (of_string "")); false \
with Invalid_argument _ -> true
nsplit (of_string "a/b/c") (of_string "/") |> List.map to_string \
= ["a"; "b"; "c"]
nsplit (of_string "/a/b/c//") (of_string "/") |> List.map to_string \
= [""; "a"; "b"; "c"; ""; ""]
nsplit (of_string "FOOaFOObFOOcFOOFOO") (of_string "FOO") |> List.map to_string \
= [""; "a"; "b"; "c"; ""; ""]
*)letjoin=concatletslice?(first=0)?(last=max_int)s=letclip_min_maxx=int_max_min(int_min _maxx)inleti=clip0(lengths)(if(first<0)then(lengths)+firstelsefirst)andj=clip0(length s)(if(last<0)then(lengths)+lastelselast)inifi>=j||i=lengthsthenEmptyelsesubsi(j-i)letreplace~str~sub~by=tryleti=findstrsubin(true,append(slice~last:istr)(appendby(slice~first:(i+(lengthsub))str)))withNot_found->(false,str)letexploder=List.rev(fold(funau->u::a)[]r)(*$T explode explode (of_string "foo") = List.map UChar.of_char ['f'; 'o'; 'o']
explode (of_string "ếẶ") = List.map UChar.chr [0x1ebf; 0x1eb6]
explode (of_string "") = []
*)letimplodel=of_listl(*$T implode
implode (List.map UChar.of_char ['f'; 'o'; 'o']) = of_string "foo"
implode (List.map UChar.chr [0x1ebf; 0x1eb6]) = of_string "ếẶ"
implode [] = of_string ""
*)letof_latin1s=of_ustring(UTF8.of_latin1s)letprintoutt=bulk_iter(BatIO.nwriteout)topenBatIO(** {6 Unicode}*)(** {7 Reading unicode}
All these functions assume that the input is UTF-8 encoded.
*)(*val read_uchar: input -> UChar.t*)(** read one UChar from a UTF-8 encoded input*)letread_chari=letn0=readiinletlen =UTF8.length0(Char.coden0)iniflen=1thenUChar.of_charn0elselets=Bytes.createleninBytes.sets0n0;letn=really_inputis1(len-1)inassert(n=len-1);lets=Bytes.unsafe_to_stringsinUTF8.gets0(*val uchars_of: input -> UChar.t BatEnum.t*)(** offer the characters of an UTF-8 encoded input as an enumeration*)letchars_ofi=make_enumread_chari(*val read_rope: input -> int -> Rope.t*)(** read up to n uchars from a UTF-8 encoded input*)letread_textin=letreclooprj=ifj=0thenrelseloop(append_char(read_chari)r)(j-1)(* TODO: make more efficient by appending a string of Rope.leaf_size (256) chars at a time *)inifn<=0thenemptyelseloopemptyn(** read the wholecontents of a UTF-8 encoded input*)letread_alli=of_string(BatIO.read_alli)(* TODO: make efficient - possibly similar to above - buffering leaf_size chars at a time *)(** read a line of UTF-8*)letread_linei=letline=read_lineiinUTF8.validateline;of_stringline(** offer the lines of a UTF-8 encoded input as an enumeration*)letlines_ofi=BatIO.make_enumread_linei(** {7 Writing unicode}
All these functions assume that the output is UTF-8 encoded.*)letwrite_stringoc=write_stringoc(*val write_uchar:_ output -> UChar.t -> unit*)letwrite_charoc=write_stringo(UTF8.init1(fun_->c))(*val write_rope: _ output -> Rope.t -> unit*)letwrite_text=print(*val write_uline: _ output -> Rope.t -> unit*)letwrite_lineor=write_textor;writeo'\n'(*val write_ulines : _ output -> Rope.t BatEnum.t -> unit*)letwrite_linesore=BatEnum.iter(write_lineo)re(*val write_ropes : _ output -> Rope.t BatEnum.t -> unit*)letwrite_textsore=BatEnum.iter(write_texto)re(*val write_uchars : _ output -> UChar.t BatEnum.t -> unit*)letwrite_charsouce=BatEnum.iter(write_charo)uceletsprintffmt=BatPrintf.ksprintfof_stringfmtletksprintfkfmt=BatPrintf.ksprintf(funs->k(of_strings))fmtletoutput_text=print