123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312(** {1 UTF8 strings} *)(** Ref {{: https://en.wikipedia.org/wiki/UTF-8} Wikipedia}
We only deal with UTF8 strings as they naturally map to OCaml bytestrings *)typeuchar=Uchar.ttype'agen=unit->'aoptiontype'aiter=('a->unit)->unit(* compat shim *)[@@@ocaml.warning"-32"]letequal(a:string)b=Stdlib.(=)ablethash:string->int=Hashtbl.hash[@@@ocaml.warning"+32"](* end compat shim *)letpp=Format.pp_print_stringincludeStringletempty=""letto_stringx=x(** State for decoding *)moduleDec=structtypet={s:string;len:int;(* max offset *)mutablei:int;(* offset *)}letmake?(idx=0)(s:string):t={s;i=idx;len=String.lengths}endletn_bytes=lengthexceptionMalformedofstring*int(** Malformed string at given offset *)(* decode next char. Mutate state, calls [yield c] if a char [c] is
read, [stop ()] otherwise.
@raise Malformed if an invalid substring is met *)letnext_(typea)(st:Dec.t)~(yield:uchar->a)~(stop:unit->a)():a=letopenDecinletmalformedst=raise(Malformed(st.s,st.i))in(* read a multi-byte character.
@param acc the accumulator (containing the first byte of the char)
@param n_bytes number of bytes to read (i.e. [width char - 1])
@param overlong minimal bound on second byte (to detect overlong encoding)
*)letread_multi?(overlong=0)n_bytesacc=(* inner loop j = 1..jmax *)letrecauxjacc=letc=Char.codest.s.[st.i+j]in(* check that c is in 0b10xxxxxx *)ifclsr6<>0b10thenmalformedst;(* overlong encoding? *)ifj=1&&overlong<>0&&cland0b111111<overlongthenmalformedst;(* except for first, each char gives 6 bits *)letnext=(acclsl6)lor(cland0b111111)inifj=n_bytesthenif(* done reading the codepoint *)Uchar.is_validnextthen(st.i<-st.i+j+1;(* +1 for first char *)yield(Uchar.unsafe_of_intnext))elsemalformedstelseaux(j+1)nextinassert(n_bytes>=1);(* is the string long enough to contain the whole codepoint? *)ifst.i+n_bytes<st.lenthenaux1acc(* start with j=1, first char is already processed! *)else(* char is truncated *)malformedstinifst.i>=st.lenthenstop()else(letc=st.s.[st.i]in(* find leading byte, and detect some impossible cases
according to https://en.wikipedia.org/wiki/Utf8#Codepage_layout *)matchcwith|'\000'..'\127'->st.i<-1+st.i;yield(Uchar.of_int@@Char.codec)(* 0xxxxxxx *)|'\194'..'\223'->read_multi1(Char.codecland0b11111)(* 110yyyyy *)|'\225'..'\239'->read_multi2(Char.codecland0b1111)(* 1110zzzz *)|'\241'..'\244'->read_multi3(Char.codecland0b111)(* 11110uuu *)|'\224'->(* overlong: if next byte is < than [0b001000000] then the char
would fit in 1 byte *)read_multi~overlong:0b001000002(Char.codecland0b1111)(* 1110zzzz *)|'\240'->(* overlong: if next byte is < than [0b000100000] then the char
would fit in 2 bytes *)read_multi~overlong:0b000100003(Char.codecland0b111)(* 11110uuu *)|'\128'..'\193'(* 192,193 are forbidden *)|'\245'..'\255'->malformedst)letto_gen?(idx=0)str:uchargen=letst=Dec.make~idxstrinfun()->next_st~yield:(func->Somec)~stop:(fun()->None)()exceptionStopletto_iter?(idx=0)s:uchariter=funyield->letst=Dec.make~idxsintrywhiletruedonext_st~yield~stop:(fun()->raiseStop)()donewithStop->()letto_seq?(idx=0)s:ucharSeq.t=letrecloopst=letr=refNoneinfun()->match!rwith|Somec->c|None->letc=next_st~yield:(funx->Seq.Cons(x,loopst))~stop:(fun()->Seq.Nil)()inr:=Somec;cinletst=Dec.make~idxsinloopstletiter?idxfs=to_iter?idxsfletfold?idxfaccs=letst=Dec.make?idxsinletrecauxacc=next_st~yield:(funx->letacc=faccxinauxacc)~stop:(fun()->acc)()inauxaccletn_chars=fold(funx_->x+1)0letto_list?(idx=0)s:ucharlist=fold~idx(funaccx->x::acc)[]s|>List.rev(* Convert a code point (int) into a string;
There are various equally trivial versions of this around.
*)let[@inline]uchar_to_bytes(c:uchar)(f:char->unit):unit=letc=Uchar.to_intcinletmask=0b111111inassert(Uchar.is_validc);ifc<=0x7fthenf(Char.unsafe_chrc)elseifc<=0x7ffthen(f(Char.unsafe_chr(0xc0lor(clsr6)));f(Char.unsafe_chr(0x80lor(clandmask))))elseifc<=0xffffthen(f(Char.unsafe_chr(0xe0lor(clsr12)));f(Char.unsafe_chr(0x80lor((clsr6)landmask)));f(Char.unsafe_chr(0x80lor(clandmask))))elseifc<=0x1fffffthen(f(Char.unsafe_chr(0xf0lor(clsr18)));f(Char.unsafe_chr(0x80lor((clsr12)landmask)));f(Char.unsafe_chr(0x80lor((clsr6)landmask)));f(Char.unsafe_chr(0x80lor(clandmask))))else(f(Char.unsafe_chr(0xf8lor(clsr24)));f(Char.unsafe_chr(0x80lor((clsr18)landmask)));f(Char.unsafe_chr(0x80lor((clsr12)landmask)));f(Char.unsafe_chr(0x80lor((clsr6)landmask)));f(Char.unsafe_chr(0x80lor(clandmask))))(* number of bytes required to encode this codepoint. A skeleton version
of {!uchar_to_bytes}. *)let[@inline]uchar_num_bytes(c:uchar):int=letc=Uchar.to_intcinifc<=0x7fthen1elseifc<=0x7ffthen2elseifc<=0xffffthen3elseifc<=0x1fffffthen4else5letof_geng:t=letbuf=Buffer.create32inletrecaux()=matchg()with|None->Buffer.contentsbuf|Somec->uchar_to_bytesc(Buffer.add_charbuf);aux()inaux()letof_seqseq:t=letbuf=Buffer.create32inSeq.iter(func->uchar_to_bytesc(Buffer.add_charbuf))seq;Buffer.contentsbufletof_iteri:t=letbuf=Buffer.create32ini(func->uchar_to_bytesc(Buffer.add_charbuf));Buffer.contentsbufletmakenc=ifn=0thenemptyelse(letn_bytes=uchar_num_bytescinletbuf=Bytes.create(n*n_bytes)in(* copy [c] at the beginning of the buffer *)leti=ref0inuchar_to_bytesc(funb->Bytes.setbuf!ib;incri);(* now repeat the prefix n-1 times *)forj=1ton-1doBytes.blitbuf0buf(n_bytes*j)n_bytesdone;Bytes.unsafe_to_stringbuf)let[@inline]of_ucharc:t=make1cletof_listl:t=letlen=List.fold_left(funnc->n+uchar_num_bytesc)0liniflen>Sys.max_string_lengththeninvalid_arg"CCUtf8_string.of_list: string size limit exceeded";letbuf=Bytes.makelen'\000'inleti=ref0inList.iter(func->uchar_to_bytesc(funbyte->Bytes.unsafe_setbuf!ibyte;incri))l;assert(!i=len);Bytes.unsafe_to_stringbufletmapfs:t=letbuf=Buffer.create(n_bytess)initer(func->uchar_to_bytes(fc)(Buffer.add_charbuf))s;Buffer.contentsbufletfilter_mapfs:t=letbuf=Buffer.create(n_bytess)initer(func->matchfcwith|None->()|Somec->uchar_to_bytesc(Buffer.add_charbuf))s;Buffer.contentsbufletflat_mapfs:t=letbuf=Buffer.create(n_bytess)initer(func->iter(func->uchar_to_bytesc(Buffer.add_charbuf))(fc))s;Buffer.contentsbufletappend=Stdlib.(^)letunsafe_of_strings=sletis_valid(s:string):bool=tryletst=Dec.makesinwhiletruedonext_st~yield:(fun_->())~stop:(fun()->raiseStop)()done;assertfalsewith|Malformed_->false|Stop->trueletof_string_exns=ifis_validsthenselseinvalid_arg"CCUtf8_string.of_string_exn"letof_strings=ifis_validsthenSomeselseNone