123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485(** {1 UTF8 strings} *)(** Ref {{: https://en.wikipedia.org/wiki/UTF-8} Wikipedia}
We only deal with UTF8 strings as they naturally map to OCaml bytestrings *)openCCShims_typeuchar=Uchar.ttype'agen=unit->'aoptiontype'aiter=('a->unit)->unitletequal(a:string)b=Stdlib.(=)ablethash:string->int=Hashtbl.hashletpp=Format.pp_print_stringincludeStringletempty=""letto_stringx=x(** State for decoding *)moduleDec=structtypet={s:string;len:int;(* max offset *)mutablei:int;(* offset *)}letmake?(idx=0)(s:string):t={s=s;i=idx;len=String.lengths;}endletn_bytes=lengthexceptionMalformedofstring*int(** Malformed string at given offset *)(* decode next char. Mutate state, calls [yield c] if a char [c] is
read, [stop ()] otherwise.
@raise Malformed if an invalid substring is met *)letnext_(typea)(st:Dec.t)~(yield:uchar->a)~(stop:unit->a)():a=letopenDecinletmalformedst=raise(Malformed(st.s,st.i))in(* read a multi-byte character.
@param acc the accumulator (containing the first byte of the char)
@param n_bytes number of bytes to read (i.e. [width char - 1])
@param overlong minimal bound on second byte (to detect overlong encoding)
*)letread_multi?(overlong=0)n_bytesacc=(* inner loop j = 1..jmax *)letrecauxjacc=letc=Char.codest.s.[st.i+j]in(* check that c is in 0b10xxxxxx *)ifclsr6<>0b10thenmalformedst;(* overlong encoding? *)ifj=1&&overlong<>0&&(cland0b111111)<overlongthenmalformedst;(* except for first, each char gives 6 bits *)letnext=(acclsl6)lor(cland0b111111)inifj=n_bytesthen((* done reading the codepoint *)ifUchar.is_validnextthen(st.i<-st.i+j+1;(* +1 for first char *)yield(Uchar.unsafe_of_intnext))else(malformedst;))else(aux(j+1)next)inassert(n_bytes>=1);(* is the string long enough to contain the whole codepoint? *)ifst.i+n_bytes<st.lenthen(aux1acc(* start with j=1, first char is already processed! *))else((* char is truncated *)malformedst;)inifst.i>=st.lenthen(stop())else(letc=st.s.[st.i]in(* find leading byte, and detect some impossible cases
according to https://en.wikipedia.org/wiki/Utf8#Codepage_layout *)matchcwith|'\000'..'\127'->st.i<-1+st.i;yield(Uchar.of_int@@Char.codec)(* 0xxxxxxx *)|'\194'..'\223'->read_multi1((Char.codec)land0b11111)(* 110yyyyy *)|'\225'..'\239'->read_multi2((Char.codec)land0b1111)(* 1110zzzz *)|'\241'..'\244'->read_multi3((Char.codec)land0b111)(* 11110uuu *)|'\224'->(* overlong: if next byte is < than [0b001000000] then the char
would fit in 1 byte *)read_multi~overlong:0b001000002((Char.codec)land0b1111)(* 1110zzzz *)|'\240'->(* overlong: if next byte is < than [0b000100000] then the char
would fit in 2 bytes *)read_multi~overlong:0b000100003((Char.codec)land0b111)(* 11110uuu *)|'\128'..'\193'(* 192,193 are forbidden *)|'\245'..'\255'->malformedst;)letto_gen?(idx=0)str:uchargen=letst=Dec.make~idxstrinfun()->next_st~yield:(func->Somec)~stop:(fun()->None)()exceptionStopletto_iter?(idx=0)s:uchariter=funyield->letst=Dec.make~idxsintrywhiletruedonext_st~yield~stop:(fun()->raiseStop)()donewithStop->()letto_seq?(idx=0)s:ucharSeq.t=letrecloopst=letr=refNoneinfun()->match!rwith|Somec->c|None->letc=next_st~yield:(funx->Seq.Cons(x,loopst))~stop:(fun()->Seq.Nil)()inr:=Somec;cinletst=Dec.make~idxsinloopst(*$= & ~cmp:(=) ~printer:Q.Print.(list (fun c -> string_of_int@@ Uchar.to_int c))
(to_list (of_string_exn "aébõ😀")) (to_seq (of_string_exn "aébõ😀") |> CCList.of_seq)
*)(* make sure it's persisted correctly *)(*$R
let s = (of_string_exn "aébõ😀") in
let seq = to_seq s in
let l = to_list s in
let testeq seq = assert_equal ~cmp:(=) l (CCList.of_seq seq) in
testeq seq;
testeq seq;
testeq seq;
*)letiter?idxfs=to_iter?idxsfletfold?idxfaccs=letst=Dec.make?idxsinletrecauxacc=next_st~yield:(funx->letacc=faccxinauxacc)~stop:(fun()->acc)()inauxaccletn_chars=fold(funx_->x+1)0letto_list?(idx=0)s:ucharlist=fold~idx(funaccx->x::acc)[]s|>List.rev(* Convert a code point (int) into a string;
There are various equally trivial versions of this around.
*)let[@inline]uchar_to_bytes(c:uchar)(f:char->unit):unit=letc=Uchar.to_intcinletmask=0b111111inassert(Uchar.is_validc);ifc<=0x7fthen(f(Char.unsafe_chrc))elseifc<=0x7ffthen(f(Char.unsafe_chr(0xc0lor(clsr6)));f(Char.unsafe_chr(0x80lor(clandmask)));)elseifc<=0xffffthen(f(Char.unsafe_chr(0xe0lor(clsr12)));f(Char.unsafe_chr(0x80lor((clsr6)landmask)));f(Char.unsafe_chr(0x80lor(clandmask)));)elseifc<=0x1fffffthen(f(Char.unsafe_chr(0xf0lor(clsr18)));f(Char.unsafe_chr(0x80lor((clsr12)landmask)));f(Char.unsafe_chr(0x80lor((clsr6)landmask)));f(Char.unsafe_chr(0x80lor(clandmask)));)else(f(Char.unsafe_chr(0xf8lor(clsr24)));f(Char.unsafe_chr(0x80lor((clsr18)landmask)));f(Char.unsafe_chr(0x80lor((clsr12)landmask)));f(Char.unsafe_chr(0x80lor((clsr6)landmask)));f(Char.unsafe_chr(0x80lor(clandmask)));)(* number of bytes required to encode this codepoint. A skeleton version
of {!uchar_to_bytes}. *)let[@inline]uchar_num_bytes(c:uchar):int=letc=Uchar.to_intcinifc<=0x7fthen(1)elseifc<=0x7ffthen(2)elseifc<=0xffffthen(3)elseifc<=0x1fffffthen(4)else(5)letof_geng:t=letbuf=Buffer.create32inletrecaux()=matchg()with|None->Buffer.contentsbuf|Somec->uchar_to_bytesc(Buffer.add_charbuf);aux()inaux()letof_seqseq:t=letbuf=Buffer.create32inSeq.iter(func->uchar_to_bytesc(Buffer.add_charbuf))seq;Buffer.contentsbufletof_iteri:t=letbuf=Buffer.create32ini(func->uchar_to_bytesc(Buffer.add_charbuf));Buffer.contentsbufletmakenc=ifn=0thenemptyelse(letn_bytes=uchar_num_bytescinletbuf=Bytes.create(n*n_bytes)in(* copy [c] at the beginning of the buffer *)leti=ref0inuchar_to_bytesc(funb->Bytes.setbuf!ib;incri);(* now repeat the prefix n-1 times *)forj=1ton-1doBytes.blitbuf0buf(n_bytes*j)n_bytes;done;Bytes.unsafe_to_stringbuf)let[@inline]of_ucharc:t=make1cletof_listl:t=letlen=List.fold_left(funnc->n+uchar_num_bytesc)0liniflen>Sys.max_string_lengththen(invalid_arg"CCUtf8_string.of_list: string size limit exceeded";);letbuf=Bytes.makelen'\000'inleti=ref0inList.iter(func->uchar_to_bytesc(funbyte->Bytes.unsafe_setbuf!ibyte;incri))l;assert(!i=len);Bytes.unsafe_to_stringbufletmapfs:t=letbuf=Buffer.create(n_bytess)initer(func->uchar_to_bytes(fc)(Buffer.add_charbuf))s;Buffer.contentsbufletfilter_mapfs:t=letbuf=Buffer.create(n_bytess)initer(func->matchfcwith|None->()|Somec->uchar_to_bytesc(Buffer.add_charbuf))s;Buffer.contentsbufletflat_mapfs:t=letbuf=Buffer.create(n_bytess)initer(func->iter(func->uchar_to_bytesc(Buffer.add_charbuf))(fc))s;Buffer.contentsbufletappend=Stdlib.(^)letunsafe_of_strings=sletis_valid(s:string):bool=tryletst=Dec.makesinwhiletruedonext_st~yield:(fun_->())~stop:(fun()->raiseStop)()done;assertfalsewith|Malformed_->false|Stop->trueletof_string_exns=ifis_validsthenselseinvalid_arg"CCUtf8_string.of_string_exn"letof_strings=ifis_validsthenSomeselseNone(*$inject
let printer s = String.escaped (to_string s)
let pp_uchar (c:Uchar.t) = Printf.sprintf "0x%x" (Uchar.to_int c)
let arb_uchar =
let rec gen = lazy (
let open Q.Gen in
Q.Gen.int_range Uchar.(to_int min) Uchar.(to_int max) >>= fun n ->
try return (Uchar.of_int n)
with _ -> Lazy.force gen
) in
Q.make
~print:(fun c -> Printf.sprintf "<uchar '%d'>" (Uchar.to_int c))
(Lazy.force gen)
let uutf_is_valid s =
try
Uutf.String.fold_utf_8
(fun () _ -> function
| `Malformed _ -> raise Exit
| `Uchar _ -> ())
() s;
true
with Exit ->
false
let uutf_to_iter s f =
Uutf.String.fold_utf_8
(fun () _ -> function
| `Malformed _ -> f (Uchar.of_int 0xfffd)
| `Uchar c -> f c)
() s
let uutf_of_l l =
let buf = Buffer.create 32 in
List.iter (Uutf.Buffer.add_utf_8 buf) l;
Buffer.contents buf
*)(*$R
let s = of_string_exn "このため、" in
let s' = to_iter s |> of_iter in
assert_equal ~cmp:equal ~printer s s'
*)(*$QR
Q.small_string (fun s ->
Q.assume (CCString.for_all (fun c -> Char.code c < 128) s);
is_valid s)
*)(*$QR & ~long_factor:10
Q.small_string (fun s ->
Q.assume (CCString.for_all (fun c -> Char.code c < 128) s);
s = (of_string_exn s |> to_iter|> of_iter|> to_string)
)
*)(*$QR & ~long_factor:10
Q.string (fun s ->
Q.assume (CCString.for_all (fun c -> Char.code c < 128) s);
String.length s = List.length (of_string_exn s |> to_list)
)
*)(*$QR & ~long_factor:10 ~count:20_000
Q.(small_list arb_uchar) (fun l ->
let s = of_list l in
l = to_list s)
*)(*$QR & ~long_factor:10
Q.(small_list arb_uchar) (fun l ->
let s = of_list l in
l = (to_list @@ of_gen @@ to_gen s)
)
*)(*$QR & ~long_factor:10
Q.(small_list arb_uchar) (fun l ->
let s = of_list l in
l = (to_list @@ of_iter @@ to_iter s)
)
*)(*$T
not (is_valid "\192\181")
not (is_valid "\193\143")
not (is_valid "\224\151\167")
not (is_valid "\224\137\165")
is_valid "\240\151\189\163"
*)(*$QR & ~long_factor:40
Q.string (fun s ->
Q.assume (is_valid s);
let s = of_string_exn s in
let s2 = s |> to_iter|> of_iter in
if s=s2 then true
else Q.Test.fail_reportf "s=%S, s2=%S" (to_string s)(to_string s2)
)
*)(*$QR & ~long_factor:40
Q.string (fun s ->
Q.assume (is_valid s);
let s = of_string_exn s in
let s2 = s |> to_gen |> of_gen in
if s=s2 then true
else Q.Test.fail_reportf "s=%S, s2=%S" (to_string s)(to_string s2)
)
*)(* compare with uutf *)(*$QR & ~long_factor:40 ~count:50_000
Q.small_string (fun s ->
let v1 = is_valid s in
let v2 = uutf_is_valid s in
if v1=v2 then true
else Q.Test.fail_reportf "s:%S, valid: %B, uutf_valid: %B" s v1 v2
)
*)(*$QR & ~long_factor:40 ~count:50_000
Q.(small_list arb_uchar) (fun l ->
let pp s = Q.Print.(list pp_uchar) s in
let uutf = uutf_of_l l in
let s = (of_list l:>string) in
if uutf = s then true
else Q.Test.fail_reportf "l: '%s', uutf: '%s', containers: '%s'"
(pp l) uutf s
)
*)(*$QR & ~long_factor:40 ~count:50_000
Q.small_string (fun s ->
Q.assume (is_valid s && uutf_is_valid s);
let pp s = Q.Print.(list pp_uchar) s in
let l_uutf = uutf_to_iter s |> Iter.to_list in
let l_co = of_string_exn s |> to_iter |> Iter.to_list in
if l_uutf = l_co then true
else Q.Test.fail_reportf "uutf: '%s', containers: '%s', is_valid %B, uutf_is_valid %B"
(pp l_uutf) (pp l_co) (is_valid s) (uutf_is_valid s)
)
*)(*$R
for i = 0 to 127 do
let c = Uchar.of_int i in
assert_equal 1 (n_bytes (of_list [c]))
done
*)(*$QR
Q.(small_list arb_uchar) (fun l ->
of_list l = concat empty (List.map of_uchar l))
*)(*$QR
Q.(pair small_nat arb_uchar) (fun (i,c) ->
make i c = concat empty (CCList.init i (fun _ -> of_uchar c)))
*)