123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378(** {1 UTF8 strings} *)(** Ref {{: https://en.wikipedia.org/wiki/UTF-8} Wikipedia}
We only deal with UTF8 strings as they naturally map to OCaml bytestrings *)openCCShims_typeuchar=Uchar.ttype'agen=unit->'aoptiontype'aiter=('a->unit)->unittype'asequence=('a->unit)->unitletequal(a:string)b=Stdlib.(=)ablethash:string->int=Hashtbl.hashletpp=Format.pp_print_stringincludeStringletto_stringx=x(** State for decoding *)moduleDec=structtypet={s:string;len:int;(* max offset *)mutablei:int;(* offset *)}letmake?(idx=0)(s:string):t={s=s;i=idx;len=String.lengths;}endletn_bytes=lengthexceptionMalformedofstring*int(** Malformed string at given offset *)(* decode next char. Mutate state, calls [yield c] if a char [c] is
read, [stop ()] otherwise.
@raise Malformed if an invalid substring is met *)letnext_(typea)(st:Dec.t)~(yield:uchar->a)~(stop:unit->a)():a=letopenDecinletmalformedst=raise(Malformed(st.s,st.i))in(* read a multi-byte character.
@param acc the accumulator (containing the first byte of the char)
@param n_bytes number of bytes to read (i.e. [width char - 1])
@param overlong minimal bound on second byte (to detect overlong encoding)
*)letread_multi?(overlong=0)n_bytesacc=(* inner loop j = 1..jmax *)letrecauxjacc=letc=Char.codest.s.[st.i+j]in(* check that c is in 0b10xxxxxx *)ifclsr6<>0b10thenmalformedst;(* overlong encoding? *)ifj=1&&overlong<>0&&(cland0b111111)<overlongthenmalformedst;(* except for first, each char gives 6 bits *)letnext=(acclsl6)lor(cland0b111111)inifj=n_bytesthen((* done reading the codepoint *)ifUchar.is_validnextthen(st.i<-st.i+j+1;(* +1 for first char *)yield(Uchar.unsafe_of_intnext))else(malformedst;))else(aux(j+1)next)inassert(n_bytes>=1);(* is the string long enough to contain the whole codepoint? *)ifst.i+n_bytes<st.lenthen(aux1acc(* start with j=1, first char is already processed! *))else((* char is truncated *)malformedst;)inifst.i>=st.lenthen(stop())else(letc=st.s.[st.i]in(* find leading byte, and detect some impossible cases
according to https://en.wikipedia.org/wiki/Utf8#Codepage_layout *)matchcwith|'\000'..'\127'->st.i<-1+st.i;yield(Uchar.of_int@@Char.codec)(* 0xxxxxxx *)|'\194'..'\223'->read_multi1((Char.codec)land0b11111)(* 110yyyyy *)|'\225'..'\239'->read_multi2((Char.codec)land0b1111)(* 1110zzzz *)|'\241'..'\244'->read_multi3((Char.codec)land0b111)(* 11110uuu *)|'\224'->(* overlong: if next byte is < than [0b001000000] then the char
would fit in 1 byte *)read_multi~overlong:0b001000002((Char.codec)land0b1111)(* 1110zzzz *)|'\240'->(* overlong: if next byte is < than [0b000100000] then the char
would fit in 2 bytes *)read_multi~overlong:0b000100003((Char.codec)land0b111)(* 11110uuu *)|'\128'..'\193'(* 192,193 are forbidden *)|'\245'..'\255'->malformedst;)letto_gen?(idx=0)str:uchargen=letst=Dec.make~idxstrinfun()->next_st~yield:(func->Somec)~stop:(fun()->None)()exceptionStopletto_iter?(idx=0)s:uchariter=funyield->letst=Dec.make~idxsintrywhiletruedonext_st~yield~stop:(fun()->raiseStop)()donewithStop->()letto_seq=to_iterletto_std_seq?(idx=0)s:ucharSeq.t=letrecloopst=letr=refNoneinfun()->match!rwith|Somec->c|None->letc=next_st~yield:(funx->Seq.Cons(x,loopst))~stop:(fun()->Seq.Nil)()inr:=Somec;cinletst=Dec.make~idxsinloopst(*$= & ~cmp:(=) ~printer:Q.Print.(list (fun c -> string_of_int@@ Uchar.to_int c))
(to_list (of_string_exn "aébõ😀")) (to_std_seq (of_string_exn "aébõ😀") |> CCList.of_std_seq)
*)(* make sure it's persisted correctly *)(*$R
let s = (of_string_exn "aébõ😀") in
let seq = to_std_seq s in
let l = to_list s in
let testeq seq = assert_equal ~cmp:(=) l (CCList.of_std_seq seq) in
testeq seq;
testeq seq;
testeq seq;
*)letiter?idxfs=to_iter?idxsfletfold?idxfaccs=letst=Dec.make?idxsinletrecauxacc=next_st~yield:(funx->letacc=faccxinauxacc)~stop:(fun()->acc)()inauxaccletn_chars=fold(funx_->x+1)0letto_list?(idx=0)s:ucharlist=fold~idx(funaccx->x::acc)[]s|>List.rev(* Convert a code point (int) into a string;
There are various equally trivial versions of this around.
*)letcode_to_stringbuf(c:uchar):unit=letc=Uchar.to_intcinletmask=0b111111inassert(Uchar.is_validc);ifc<=0x7fthen(Buffer.add_charbuf(Char.unsafe_chrc))elseifc<=0x7ffthen(Buffer.add_charbuf(Char.unsafe_chr(0xc0lor(clsr6)));Buffer.add_charbuf(Char.unsafe_chr(0x80lor(clandmask)));)elseifc<=0xffffthen(Buffer.add_charbuf(Char.unsafe_chr(0xe0lor(clsr12)));Buffer.add_charbuf(Char.unsafe_chr(0x80lor((clsr6)landmask)));Buffer.add_charbuf(Char.unsafe_chr(0x80lor(clandmask)));)elseifc<=0x1fffffthen(Buffer.add_charbuf(Char.unsafe_chr(0xf0lor(clsr18)));Buffer.add_charbuf(Char.unsafe_chr(0x80lor((clsr12)landmask)));Buffer.add_charbuf(Char.unsafe_chr(0x80lor((clsr6)landmask)));Buffer.add_charbuf(Char.unsafe_chr(0x80lor(clandmask)));)else(Buffer.add_charbuf(Char.unsafe_chr(0xf8lor(clsr24)));Buffer.add_charbuf(Char.unsafe_chr(0x80lor((clsr18)landmask)));Buffer.add_charbuf(Char.unsafe_chr(0x80lor((clsr12)landmask)));Buffer.add_charbuf(Char.unsafe_chr(0x80lor((clsr6)landmask)));Buffer.add_charbuf(Char.unsafe_chr(0x80lor(clandmask)));)letof_geng:t=letbuf=Buffer.create32inletrecaux()=matchg()with|None->Buffer.contentsbuf|Somec->code_to_stringbufc;aux()inaux()letof_std_seqseq:t=letbuf=Buffer.create32inSeq.iter(code_to_stringbuf)seq;Buffer.contentsbufletof_iteri:t=letbuf=Buffer.create32ini(code_to_stringbuf);Buffer.contentsbufletof_seq=of_iterletof_listl:t=letbuf=Buffer.create32inList.iter(code_to_stringbuf)l;Buffer.contentsbufletmapfs:t=letbuf=Buffer.create(n_bytess)initer(func->code_to_stringbuf(fc))s;Buffer.contentsbufletfilter_mapfs:t=letbuf=Buffer.create(n_bytess)initer(func->matchfcwith|None->()|Somec->code_to_stringbufc)s;Buffer.contentsbufletflat_mapfs:t=letbuf=Buffer.create(n_bytess)initer(func->iter(code_to_stringbuf)(fc))s;Buffer.contentsbufletappend=Stdlib.(^)letunsafe_of_strings=sletis_valid(s:string):bool=tryletst=Dec.makesinwhiletruedonext_st~yield:(fun_->())~stop:(fun()->raiseStop)()done;assertfalsewith|Malformed_->false|Stop->trueletof_string_exns=ifis_validsthenselseinvalid_arg"CCUtf8_string.of_string_exn"letof_strings=ifis_validsthenSomeselseNone(*$inject
let printer s = String.escaped (to_string s)
let pp_uchar (c:Uchar.t) = Printf.sprintf "0x%x" (Uchar.to_int c)
let uutf_is_valid s =
try
Uutf.String.fold_utf_8
(fun () _ -> function
| `Malformed _ -> raise Exit
| `Uchar _ -> ())
() s;
true
with Exit ->
false
let uutf_to_seq s f =
Uutf.String.fold_utf_8
(fun () _ -> function
| `Malformed _ -> f (Uchar.of_int 0xfffd)
| `Uchar c -> f c)
() s
*)(*$R
let s = of_string_exn "このため、" in
let s' = to_seq s |> of_seq in
assert_equal ~cmp:equal ~printer s s'
*)(*$QR
Q.small_string (fun s ->
Q.assume (CCString.for_all (fun c -> Char.code c < 128) s);
is_valid s)
*)(*$QR & ~long_factor:10
Q.small_string (fun s ->
Q.assume (CCString.for_all (fun c -> Char.code c < 128) s);
s = (of_string_exn s |> to_seq |> of_seq |> to_string)
)
*)(*$QR & ~long_factor:10
Q.string (fun s ->
Q.assume (CCString.for_all (fun c -> Char.code c < 128) s);
String.length s = List.length (of_string_exn s |> to_list)
)
*)(*$T
not (is_valid "\192\181")
not (is_valid "\193\143")
not (is_valid "\224\151\167")
not (is_valid "\224\137\165")
is_valid "\240\151\189\163"
*)(*$QR & ~long_factor:40
Q.string (fun s ->
Q.assume (is_valid s);
let s = of_string_exn s in
let s2 = s |> to_seq |> of_seq in
if s=s2 then true
else Q.Test.fail_reportf "s=%S, s2=%S" (to_string s)(to_string s2)
)
*)(*$QR & ~long_factor:40
Q.string (fun s ->
Q.assume (is_valid s);
let s = of_string_exn s in
let s2 = s |> to_gen |> of_gen in
if s=s2 then true
else Q.Test.fail_reportf "s=%S, s2=%S" (to_string s)(to_string s2)
)
*)(* compare with uutf *)(*$QR & ~long_factor:40 ~count:50_000
Q.small_string (fun s ->
let v1 = is_valid s in
let v2 = uutf_is_valid s in
if v1=v2 then true
else Q.Test.fail_reportf "s:%S, valid: %B, uutf_valid: %B" s v1 v2
)
*)(*$QR & ~long_factor:40 ~count:50_000
Q.small_string (fun s ->
Q.assume (is_valid s && uutf_is_valid s);
let pp s = Q.Print.(list pp_uchar) s in
let l_uutf = uutf_to_seq s |> Iter.to_list in
let l_co = of_string_exn s |> to_seq |> Iter.to_list in
if l_uutf = l_co then true
else Q.Test.fail_reportf "uutf: '%s', containers: '%s', is_valid %B, uutf_is_valid %B"
(pp l_uutf) (pp l_co) (is_valid s) (uutf_is_valid s)
)
*)