123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478# 1 "ocaml_util.cppo.ml"letprint_locppfloc=Location.print_locppflocletprint_errorlocfppfx=# 17 "ocaml_util.cppo.ml"leterror=Location.error_of_printer~locfxinLocation.print_reportppferror# 25 "ocaml_util.cppo.ml"moduleUchar=structincludeUcharletvalid_bit=27letdecode_bits=24let[@inline]utf_decode_is_validd=(dlsrvalid_bit)=1let[@inline]utf_decode_lengthd=(dlsrdecode_bits)land0b111let[@inline]utf_decode_uchard=unsafe_of_int(dland0xFFFFFF)let[@inline]utf_decodenu=((8lorn)lsldecode_bits)lor(to_intu)letrep=0xFFFDlet[@inline]utf_decode_invalidn=(nlsldecode_bits)lorrepletrep=Uchar.rependmoduleBytes=structincludeBytesexternalunsafe_get_uint8:bytes->int->int="%bytes_unsafe_get"externalget_uint8:bytes->int->int="%bytes_safe_get"externalunsafe_set_uint8:bytes->int->int->unit="%bytes_unsafe_set"externalset_int8:bytes->int->int->unit="%bytes_safe_set"letget_int8bi=((get_uint8bi)lsl(Sys.int_size-8))asr(Sys.int_size-8)letset_uint8=set_int8(* UTF codecs and validations *)letdec_invalid=Uchar.utf_decode_invalidlet[@inline]dec_retnu=Uchar.utf_decoden(Uchar.unsafe_of_intu)let[@inline]not_in_x80_to_xBFb=blsr6<>0b10let[@inline]not_in_xA0_to_xBFb=blsr5<>0b101let[@inline]not_in_x80_to_x9Fb=blsr5<>0b100let[@inline]not_in_x90_to_xBFb=b<0x90||0xBF<blet[@inline]not_in_x80_to_x8Fb=blsr4<>0x8let[@inline]utf_8_uchar_2b0b1=((b0land0x1F)lsl6)lor((b1land0x3F))let[@inline]utf_8_uchar_3b0b1b2=((b0land0x0F)lsl12)lor((b1land0x3F)lsl6)lor((b2land0x3F))let[@inline]utf_8_uchar_4b0b1b2b3=((b0land0x07)lsl18)lor((b1land0x3F)lsl12)lor((b2land0x3F)lsl6)lor((b3land0x3F))letget_utf_8_ucharbi=letb0=get_uint8biin(* raises if [i] is not a valid index. *)letget=unsafe_get_uint8inletmax=lengthb-1inmatchChar.unsafe_chrb0with(* See The Unicode Standard, Table 3.7 *)|'\x00'..'\x7F'->dec_ret1b0|'\xC2'..'\xDF'->leti=i+1inifi>maxthendec_invalid1elseletb1=getbiinifnot_in_x80_to_xBFb1thendec_invalid1elsedec_ret2(utf_8_uchar_2b0b1)|'\xE0'->leti=i+1inifi>maxthendec_invalid1elseletb1=getbiinifnot_in_xA0_to_xBFb1thendec_invalid1elseleti=i+1inifi>maxthendec_invalid2elseletb2=getbiinifnot_in_x80_to_xBFb2thendec_invalid2elsedec_ret3(utf_8_uchar_3b0b1b2)|'\xE1'..'\xEC'|'\xEE'..'\xEF'->leti=i+1inifi>maxthendec_invalid1elseletb1=getbiinifnot_in_x80_to_xBFb1thendec_invalid1elseleti=i+1inifi>maxthendec_invalid2elseletb2=getbiinifnot_in_x80_to_xBFb2thendec_invalid2elsedec_ret3(utf_8_uchar_3b0b1b2)|'\xED'->leti=i+1inifi>maxthendec_invalid1elseletb1=getbiinifnot_in_x80_to_x9Fb1thendec_invalid1elseleti=i+1inifi>maxthendec_invalid2elseletb2=getbiinifnot_in_x80_to_xBFb2thendec_invalid2elsedec_ret3(utf_8_uchar_3b0b1b2)|'\xF0'->leti=i+1inifi>maxthendec_invalid1elseletb1=getbiinifnot_in_x90_to_xBFb1thendec_invalid1elseleti=i+1inifi>maxthendec_invalid2elseletb2=getbiinifnot_in_x80_to_xBFb2thendec_invalid2elseleti=i+1inifi>maxthendec_invalid3elseletb3=getbiinifnot_in_x80_to_xBFb3thendec_invalid3elsedec_ret4(utf_8_uchar_4b0b1b2b3)|'\xF1'..'\xF3'->leti=i+1inifi>maxthendec_invalid1elseletb1=getbiinifnot_in_x80_to_xBFb1thendec_invalid1elseleti=i+1inifi>maxthendec_invalid2elseletb2=getbiinifnot_in_x80_to_xBFb2thendec_invalid2elseleti=i+1inifi>maxthendec_invalid3elseletb3=getbiinifnot_in_x80_to_xBFb3thendec_invalid3elsedec_ret4(utf_8_uchar_4b0b1b2b3)|'\xF4'->leti=i+1inifi>maxthendec_invalid1elseletb1=getbiinifnot_in_x80_to_x8Fb1thendec_invalid1elseleti=i+1inifi>maxthendec_invalid2elseletb2=getbiinifnot_in_x80_to_xBFb2thendec_invalid2elseleti=i+1inifi>maxthendec_invalid3elseletb3=getbiinifnot_in_x80_to_xBFb3thendec_invalid3elsedec_ret4(utf_8_uchar_4b0b1b2b3)|_->dec_invalid1letset_utf_8_ucharbiu=letset=unsafe_set_uint8inletmax=lengthb-1inmatchUchar.to_intuwith|uwhenu<0->assertfalse|uwhenu<=0x007F->set_uint8biu;1|uwhenu<=0x07FF->letlast=i+1iniflast>maxthen0else(set_uint8bi(0xC0lor(ulsr6));setblast(0x80lor(uland0x3F));2)|uwhenu<=0xFFFF->letlast=i+2iniflast>maxthen0else(set_uint8bi(0xE0lor(ulsr12));setb(i+1)(0x80lor((ulsr6)land0x3F));setblast(0x80lor(uland0x3F));3)|uwhenu<=0x10FFFF->letlast=i+3iniflast>maxthen0else(set_uint8bi(0xF0lor(ulsr18));setb(i+1)(0x80lor((ulsr12)land0x3F));setb(i+2)(0x80lor((ulsr6)land0x3F));setblast(0x80lor(uland0x3F));4)|_->assertfalseletis_valid_utf_8b=letrecloopmaxbi=ifi>maxthentrueelseletget=unsafe_get_uint8inmatchChar.unsafe_chr(getbi)with|'\x00'..'\x7F'->loopmaxb(i+1)|'\xC2'..'\xDF'->letlast=i+1iniflast>max||not_in_x80_to_xBF(getblast)thenfalseelseloopmaxb(last+1)|'\xE0'->letlast=i+2iniflast>max||not_in_xA0_to_xBF(getb(i+1))||not_in_x80_to_xBF(getblast)thenfalseelseloopmaxb(last+1)|'\xE1'..'\xEC'|'\xEE'..'\xEF'->letlast=i+2iniflast>max||not_in_x80_to_xBF(getb(i+1))||not_in_x80_to_xBF(getblast)thenfalseelseloopmaxb(last+1)|'\xED'->letlast=i+2iniflast>max||not_in_x80_to_x9F(getb(i+1))||not_in_x80_to_xBF(getblast)thenfalseelseloopmaxb(last+1)|'\xF0'->letlast=i+3iniflast>max||not_in_x90_to_xBF(getb(i+1))||not_in_x80_to_xBF(getb(i+2))||not_in_x80_to_xBF(getblast)thenfalseelseloopmaxb(last+1)|'\xF1'..'\xF3'->letlast=i+3iniflast>max||not_in_x80_to_xBF(getb(i+1))||not_in_x80_to_xBF(getb(i+2))||not_in_x80_to_xBF(getblast)thenfalseelseloopmaxb(last+1)|'\xF4'->letlast=i+3iniflast>max||not_in_x80_to_x8F(getb(i+1))||not_in_x80_to_xBF(getb(i+2))||not_in_x80_to_xBF(getblast)thenfalseelseloopmaxb(last+1)|_->falseinloop(lengthb-1)b0endmoduleString=structincludeStringmoduleB=structincludeBytesletfor_allps=letn=lengthsinletrecloopi=ifi=nthentrueelseifp(unsafe_getsi)thenloop(succi)elsefalseinloop0endletbos=B.unsafe_of_stringletfor_allfs=B.for_allf(boss)letget_utf_8_ucharsi=B.get_utf_8_uchar(boss)iletis_valid_utf_8s=B.is_valid_utf_8(boss)(** {6 Binary encoding/decoding of integers} *)externalget_uint8:string->int->int="%string_safe_get"externalget_uint16_ne:string->int->int="%caml_string_get16"externalget_int32_ne:string->int->int32="%caml_string_get32"externalget_int64_ne:string->int->int64="%caml_string_get64"letget_int8si=B.get_int8(boss)iend# 265 "ocaml_util.cppo.ml"(** {1 Minimal support for Unicode characters in identifiers} *)moduleUtf8_lexeme=structtypet=string(* Non-ASCII letters that are allowed in identifiers (currently: Latin-9) *)typecase=UpperofUchar.t|LowerofUchar.tletknown_chars:(Uchar.t,case)Hashtbl.t=Hashtbl.create32let_=List.iter(fun(upper,lower)->letupper=Uchar.of_intupperandlower=Uchar.of_intlowerinHashtbl.addknown_charsupper(Upperlower);Hashtbl.addknown_charslower(Lowerupper))[(0xc0,0xe0);(* À, à *)(0xc1,0xe1);(* Á, á *)(0xc2,0xe2);(* Â, â *)(0xc3,0xe3);(* Ã, ã *)(0xc4,0xe4);(* Ä, ä *)(0xc5,0xe5);(* Å, å *)(0xc6,0xe6);(* Æ, æ *)(0xc7,0xe7);(* Ç, ç *)(0xc8,0xe8);(* È, è *)(0xc9,0xe9);(* É, é *)(0xca,0xea);(* Ê, ê *)(0xcb,0xeb);(* Ë, ë *)(0xcc,0xec);(* Ì, ì *)(0xcd,0xed);(* Í, í *)(0xce,0xee);(* Î, î *)(0xcf,0xef);(* Ï, ï *)(0xd0,0xf0);(* Ð, ð *)(0xd1,0xf1);(* Ñ, ñ *)(0xd2,0xf2);(* Ò, ò *)(0xd3,0xf3);(* Ó, ó *)(0xd4,0xf4);(* Ô, ô *)(0xd5,0xf5);(* Õ, õ *)(0xd6,0xf6);(* Ö, ö *)(0xd8,0xf8);(* Ø, ø *)(0xd9,0xf9);(* Ù, ù *)(0xda,0xfa);(* Ú, ú *)(0xdb,0xfb);(* Û, û *)(0xdc,0xfc);(* Ü, ü *)(0xdd,0xfd);(* Ý, ý *)(0xde,0xfe);(* Þ, þ *)(0x160,0x161);(* Š, š *)(0x17d,0x17e);(* Ž, ž *)(0x152,0x153);(* Œ, œ *)(0x178,0xff);(* Ÿ, ÿ *)(0x1e9e,0xdf);(* ẞ, ß *)](* NFD to NFC conversion table for the letters above *)letknown_pairs:(Uchar.t*Uchar.t,Uchar.t)Hashtbl.t=Hashtbl.create32let_=List.iter(fun(c1,n2,n)->Hashtbl.addknown_pairs(Uchar.of_charc1,Uchar.of_intn2)(Uchar.of_intn))[('A',0x300,0xc0);(* À *)('A',0x301,0xc1);(* Á *)('A',0x302,0xc2);(*  *)('A',0x303,0xc3);(* à *)('A',0x308,0xc4);(* Ä *)('A',0x30a,0xc5);(* Å *)('C',0x327,0xc7);(* Ç *)('E',0x300,0xc8);(* È *)('E',0x301,0xc9);(* É *)('E',0x302,0xca);(* Ê *)('E',0x308,0xcb);(* Ë *)('I',0x300,0xcc);(* Ì *)('I',0x301,0xcd);(* Í *)('I',0x302,0xce);(* Î *)('I',0x308,0xcf);(* Ï *)('N',0x303,0xd1);(* Ñ *)('O',0x300,0xd2);(* Ò *)('O',0x301,0xd3);(* Ó *)('O',0x302,0xd4);(* Ô *)('O',0x303,0xd5);(* Õ *)('O',0x308,0xd6);(* Ö *)('U',0x300,0xd9);(* Ù *)('U',0x301,0xda);(* Ú *)('U',0x302,0xdb);(* Û *)('U',0x308,0xdc);(* Ü *)('Y',0x301,0xdd);(* Ý *)('Y',0x308,0x178);(* Ÿ *)('S',0x30c,0x160);(* Š *)('Z',0x30c,0x17d);(* Ž *)('a',0x300,0xe0);(* à *)('a',0x301,0xe1);(* á *)('a',0x302,0xe2);(* â *)('a',0x303,0xe3);(* ã *)('a',0x308,0xe4);(* ä *)('a',0x30a,0xe5);(* å *)('c',0x327,0xe7);(* ç *)('e',0x300,0xe8);(* è *)('e',0x301,0xe9);(* é *)('e',0x302,0xea);(* ê *)('e',0x308,0xeb);(* ë *)('i',0x300,0xec);(* ì *)('i',0x301,0xed);(* í *)('i',0x302,0xee);(* î *)('i',0x308,0xef);(* ï *)('n',0x303,0xf1);(* ñ *)('o',0x300,0xf2);(* ò *)('o',0x301,0xf3);(* ó *)('o',0x302,0xf4);(* ô *)('o',0x303,0xf5);(* õ *)('o',0x308,0xf6);(* ö *)('u',0x300,0xf9);(* ù *)('u',0x301,0xfa);(* ú *)('u',0x302,0xfb);(* û *)('u',0x308,0xfc);(* ü *)('y',0x301,0xfd);(* ý *)('y',0x308,0xff);(* ÿ *)('s',0x30c,0x161);(* š *)('z',0x30c,0x17e);(* ž *)]letnormalize_generic~keep_asciitransforms=letrecnormcheckbufprevi=ifi>=String.lengthsthenbeginBuffer.add_utf_8_ucharbuf(transformprev)endelsebeginletd=String.get_utf_8_ucharsiinletu=Uchar.utf_decode_uchardincheckdu;leti'=i+Uchar.utf_decode_lengthdinmatchHashtbl.find_optknown_pairs(prev,u)with|Someu'->normcheckbufu'i'|None->Buffer.add_utf_8_ucharbuf(transformprev);normcheckbufui'endinletascii_limit=128inifs=""||keep_ascii&&String.for_all(funx->Char.codex<ascii_limit)sthenOkselseletbuf=Buffer.create(String.lengths)inletvalid=reftrueinletcheckdu=valid:=!valid&&Uchar.utf_decode_is_validd&&u<>Uchar.repinletd=String.get_utf_8_uchars0inletu=Uchar.utf_decode_uchardincheckdu;normcheckbufu(Uchar.utf_decode_lengthd);letcontents=Buffer.contentsbufinif!validthenOkcontentselseErrorcontentsletnormalizes=normalize_generic~keep_ascii:true(funu->u)s(* Capitalization *)letuchar_is_uppercaseu=letc=Uchar.to_intuinifc<0x80thenc>=65&&c<=90elsematchHashtbl.find_optknown_charsuwith|Some(Upper_)->true|_->falseletuchar_lowercaseu=letc=Uchar.to_intuinifc<0x80thenifc>=65&&c<=90thenUchar.of_int(c+32)elseuelsematchHashtbl.find_optknown_charsuwith|Some(Upperu')->u'|_->uletuchar_uppercaseu=letc=Uchar.to_intuinifc<0x80thenifc>=97&&c<=122thenUchar.of_int(c-32)elseuelsematchHashtbl.find_optknown_charsuwith|Some(Loweru')->u'|_->uletcapitalizes=letfirst=reftrueinnormalize_generic~keep_ascii:false(funu->if!firstthen(first:=false;uchar_uppercaseu)elseu)sletuncapitalizes=letfirst=reftrueinnormalize_generic~keep_ascii:false(funu->if!firstthen(first:=false;uchar_lowercaseu)elseu)sletis_capitalizeds=s<>""&&uchar_is_uppercase(Uchar.utf_decode_uchar(String.get_utf_8_uchars0))(* Characters allowed in identifiers after normalization is applied.
Currently:
- ASCII letters, underscore
- Latin-9 letters, represented in NFC
- ASCII digits, single quote (but not as first character)
- dot if [with_dot] = true
*)letuchar_valid_in_identifier~with_dotu=letc=Uchar.to_intuinifc<0x80thenc>=97(* a *)&&c<=122(* z *)||c>=65(* A *)&&c<=90(* Z *)||c>=48(* 0 *)&&c<=57(* 9 *)||c=95(* underscore *)||c=39(* single quote *)||(with_dot&&c=46)(* dot *)elseHashtbl.memknown_charsuletuchar_not_identifier_startu=letc=Uchar.to_intuinc>=48(* 0 *)&&c<=57(* 9 *)||c=39(* single quote *)(* Check whether a normalized string is a valid OCaml identifier. *)typevalidation_result=|Valid|Invalid_characterofUchar.t(** Character not allowed *)|Invalid_beginningofUchar.t(** Character not allowed as first char *)letvalidate_identifier?(with_dot=false)s=letrecchecki=ifi>=String.lengthsthenValidelsebeginletd=String.get_utf_8_ucharsiinletu=Uchar.utf_decode_uchardinleti'=i+Uchar.utf_decode_lengthdinifnot(uchar_valid_in_identifier~with_dotu)thenInvalid_characteruelseifi=0&&uchar_not_identifier_startuthenInvalid_beginninguelsechecki'endincheck0letis_valid_identifiers=validate_identifiers=Validletstarts_like_a_valid_identifiers=s<>""&&(letu=Uchar.utf_decode_uchar(String.get_utf_8_uchars0)inuchar_valid_in_identifier~with_dot:falseu&¬(uchar_not_identifier_startu))letis_lowercases=letrecis_lowercase_atlensn=ifn>=lenthentrueelseletd=String.get_utf_8_ucharsninletu=Uchar.utf_decode_uchardin(uchar_valid_in_identifier~with_dot:falseu)&¬(uchar_is_uppercaseu)&&is_lowercase_atlens(n+Uchar.utf_decode_lengthd)inis_lowercase_at(String.lengths)s0end