123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497(*
Copyright 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2017 Anton Lavrik
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*)moduleC=Piqi_commonopenC.StdmoduleUtf8=Piqi_utf8exceptionError0ofstring(* internally used error exception *)leterrors=raise(Error0s)letint_of_xstrings=int_of_string("0x"^s)letint_of_ostrings=(* TODO: check the range *)int_of_string("0o"^s)(* String_a is a subtype of both String_b and String_u *)typestring_type=|String_a(* string containing only 7-bit ascii characters *)|String_b(* string containing 8-bit binary data *)|String_u(* utf-8 encoded unicode string *)(* find out whether the string is a utf8, ascii, or a binary string *)letclassify_strings=letres=refString_ain(* assuming that it is an ascii string *)letlen=String.lengthsinletrecauxi=ifi=lenthen!reselsebeginletw=Utf8.width.(Char.codes.[i])inifw>0&&i+w<=lenthen(ifw>1thenres:=String_u;(* width is > 1 => Utf8 *)(* check if the next unicode char is correctly encoded in utf8 *)ignore(Utf8.nextsi);aux(i+w))elseraiseUtf8.MalFormedendintryaux0withUtf8.MalFormed->String_bletis_utf8_strings=classify_strings<>String_bletis_ascii_strings=letlen=String.lengthsinletrecauxi=ifi>=lenthentrueelseifChar.codes.[i]<=127thenaux(i+1)elsefalseinaux0lettype_of_charc=ifc<=127thenString_aelseString_bletdigit=[%sedlex.regexp?'0'..'9']letodigit=[%sedlex.regexp?'0'..'7']letxdigit=[%sedlex.regexp?'0'..'9'|'a'..'f'|'A'..'F']letmake_charc=String_a,(Char.codec)letescaped_lexemelexbuf=(* strip the first symbol *)letlen=Sedlexing.lexeme_lengthlexbufinSedlexing.Utf8.sub_lexemelexbuf1(len-1)(* XXX: add support for a b f v escapes? *)letparse_string_escapelexbuf=[%sedlexmatchlexbufwith|'\\'->make_char'\\'|'"'->make_char'"'|'t'->make_char'\t'|'n'->make_char'\n'|'r'->make_char'\r'(* XXX: disable it for now, since specifying decimals this way may make more
* sense:
| odigit odigit odigit ->
let c = int_of_ostring (Sedlexing.Utf8.lexeme lexbuf) in
(type_of_char c),c
*)|'x',xdigit,xdigit->letc=int_of_xstring(escaped_lexemelexbuf)in(type_of_charc),c|'u',xdigit,xdigit,xdigit,xdigit->letc=int_of_xstring(escaped_lexemelexbuf)inString_u,c|'U',xdigit,xdigit,xdigit,xdigit,xdigit,xdigit,xdigit,xdigit->(* XXX: check code validity so that it doesn't exeed allocated limit *)letc=int_of_xstring(escaped_lexemelexbuf)inString_u,c|_->lets=Sedlexing.Utf8.lexemelexbufinerror("invalid string escape literal "^s)](* returns the list of integers representing codepoints *)(* XXX: allow only printable characters in strings? *)(* XXX: provide a method for wraping a string to several lines? *)letrecparse_string_literalltypellexbuf=[%sedlexmatchlexbufwith|'\\'->letctype,c=parse_string_escapelexbufinletltype=matchltype,ctypewithString_a,_->(* set up string type to whatever character type is *)ctype(* TODO: print more meaningful error messages *)|String_b,String_u->error"invalid string literal"|String_u,String_b->error"invalid string literal"|_,_->ltype(* leave the previous type *)inparse_string_literalltype(c::l)lexbuf|(0..0x1F)|127->(* XXX: what about unicode non-printable chars? *)(* do not allow non-printables to appear in string literals -- one
* should use correspondent escaped specifications instead *)error"invalid string literal"|eof->ltype,(List.revl)|any->letc=Sedlexing.lexeme_charlexbuf0inletc=Uchar.to_intcinletltype=matchltypewithString_bwhenc>127->error"invalid string literal"|_whenc>127->String_u(* upgrage string type to unicode *)|_->ltypeinparse_string_literalltype(c::l)lexbuf|_->letc=Sedlexing.lexeme_charlexbuf0inletc=Uchar.to_intcinerror("invalid string codepoint "^string_of_intc)]letutf8_of_listl=leta=Array.of_listlinUtf8.from_int_arraya0(Array.lengtha)letstring_of_listl=lets=Bytes.create(List.lengthl)inletrecauxi=function|[]->()|h::t->Bytes.setsi(Char.chrh);aux(i+1)tinaux0l;Bytes.unsafe_to_stringsletparse_string_literals=letlexbuf=Sedlexing.Utf8.from_stringsinletstr_type,l=parse_string_literalString_a[]lexbufinletparsed_str=matchstr_typewith|String_u->utf8_of_listl|String_a|String_b->string_of_listlin(str_type,parsed_str)letadd_ascii_charbufi=letaddc=Buffer.add_charbufcinletadd_escapedc=add'\\';addcinletc=Char.chriinmatchcwith|'\n'->add_escaped'n'|'\r'->add_escaped'r'|'\t'->add_escaped't'|'"'|'\\'->add_escapedc|_wheni>=20&&i<127->addc(* printable *)|_->add'\\';(* TODO: optimize *)Buffer.add_stringbuf(Printf.sprintf"x%02x"i)(* escape utf8 string according to piq lexical conventions *)letescape_strings=letlen=String.lengthsinleta=Utf8.to_int_arrays0leninletbuf=Buffer.create(len+len/10)infori=0toArray.lengtha-1doletc=a.(i)inifc<=127thenadd_ascii_charbufcelse(* XXX: check if unicode codepoint/sequence is printable and escape it
* if it isn't *)Utf8.storebufcdone;Buffer.contentsbuf(* escape binary string according to piq lexical conventions *)(* NOTE: escaping each byte as hex value unless the character is
* ASCII-printable *)letescape_binarys=letlen=String.lengthsinletbuf=Buffer.create(len+len/10)infori=0tolen-1doletc=Char.codes.[i]inadd_ascii_charbufcdone;Buffer.contentsbuftypetoken=|Lpar(* ( *)|Rpar(* ) *)|Lbr(* [ *)|Rbr(* ] *)|Star(* * *)|Comma(* , *)|Stringofstring_type*string*string(* ascii | utf8 | binary, parsed literal, original literal *)|Wordofstring(* ASCII alphanumeric, plus a couple of other characters *)|Nameofstring(* identifier starting with '.' or ':' *)|Textofstring|EOF(* Raw string -- just a sequence of bytes: may be parsed as either binary or
* utf8 string
*
* NOTE: this is used only in several special cases, and can't be represented
* in Piq text format directly *)|Raw_stringofstringletnewline=[%sedlex.regexp?'\n'|"\r\n"]letws=[%sedlex.regexp?Plus(' '|'\t')]letname=[%sedlex.regexp?(':'|'.'),Plus('a'..'z'|'A'..'Z'|'0'..'9'|'-'|'_'|'/'|'.'|':')]letstring_literal=[%sedlex.regexp?'"',Star(Compl'"'|"\\\""),'"'](* ASCII alphanumeric, '-', '_', '.', '/' for representing numbers and unquoted
* strings (useful e.g. as DSL identifiers)
*
* XXX: include all alphanumeric Unicode? *)letfirst_word_char=[%sedlex.regexp?('a'..'z'|'A'..'Z'|'0'..'9'|'-'|'_')]letword_char=[%sedlex.regexp?(first_word_char|'.'|'/')]letword=[%sedlex.regexp?first_word_char,Star(word_char)]letfloat_literal=[%sedlex.regexp?('0'..'9'),Star('0'..'9'|'_'),Opt('.',Star('0'..'9'|'_')),Opt(('e'|'E'),Opt('+'|'-'),'0'..'9',Star('0'..'9'|'_'))]letis_valid_first_word_char=function|'a'..'z'|'A'..'Z'|'0'..'9'|'-'|'_'->true|_->falseletis_valid_word_char=function|'.'|'/'->true|x->is_valid_first_word_charx(* accepts the same language as the word regexp above *)letis_valid_words=letlen=String.lengthsin(* NOTE: it works transparently on utf8 strings *)letreccheck_charsi=ifi>=lenthentrueelseifis_valid_word_chars.[i]thencheck_chars(i+1)elsefalseiniflen=0thenfalseelseifnot(is_valid_first_word_chars.[0])thenfalseelsecheck_chars1typebuf={lexbuf:Sedlexing.lexbuf;mutablelcount:int;(* line counter *)mutablelstart:int;(* buffer position of the latest line *)mutablecol:int;(* column number of the last returned token *)mutablenext_token:tokenoption;(* rollback token *)}letmake_buflexbuf={lexbuf=lexbuf;lcount=1;lstart=0;col=1;next_token=None;}letupdate_line_counterbuf=buf.lcount<-buf.lcount+1;buf.lstart<-Sedlexing.lexeme_endbuf.lexbufletget_columnbuf=(* NOTE: ennumerating columns from 1 *)(Sedlexing.lexeme_startbuf.lexbuf)-buf.lstart+1letupdate_columnbuf=buf.col<-get_columnbuf(* location before we returned a token *)leterror_locationbuf=buf.lcount,get_columnbuf(* valid location after we returned a token *)letlocationbuf=buf.lcount,buf.colletrectoken0buflexbuf=[%sedlexmatchlexbufwith|newline->(* update line counter, drop column counter and move on *)update_line_counterbuf;token0buflexbuf|ws->token0buflexbuf(* skip whitespace *)|'\r'->(* standalone '\r', i.e. without following '\n', is invalid *)error"invalid character"|"%%"->error"'%%' literal is reserved for future versions"|'%',Opt(Compl('%'|'\n'),Star(Compl'\n')),Optnewline->(* skip single line comment *)update_line_counterbuf;token0buflexbuf|'#',Optnewline->(* verbatim empty text *)Text""|'#',Compl' '->error"space is expected after '#'"|'#',' ',Star(Compl'\n'),Optnewline->(* verbatim text *)(* TODO: restrict string literal to contain only printable characters *)lets=Sedlexing.Utf8.lexemelexbufinletlen=String.lengthsiniflen=0thenText""elseletchomp=ifs.[len-1]='\n'then1elseiflen>1&&s.[len-2]='\n'&&s.[len-1]='\r'then2else0inlets=String.subs2(len-2-chomp)in(* cut #' ' and newline *)Texts|'('->Lpar|')'->Rpar|'['->Lbr|']'->Rbr|'*'->Star|','->Comma|string_literal->lets=Sedlexing.Utf8.lexemelexbufinlets=String.subs1(String.lengths-2)in(* cut double-quotes *)let(str_type,parsed_str)=parse_string_literalsinString(str_type,parsed_str,s)|'"'->error"string literal overrun"|name->lets=Sedlexing.Utf8.lexemelexbufinNames|word->lets=Sedlexing.Utf8.lexemelexbufinWords(* TODO: this is inconsistent - here, integers qualify as words automatically,
* but we have to be specific about floats. Without this clause, floats having
* a '+' sign after E (e.g. 2e+08) will lead to "invalid character"
* lexing error
*)|float_literal->lets=Sedlexing.Utf8.lexemelexbufinWords|eof->EOF|_->error"invalid character"](* error reporter *)typeloc=int*int(* line, column *)exceptionErrorofstring*locleterrorbufs=raise(Error(s,error_locationbuf))lettoken1buf=trylettok=token0bufbuf.lexbufinupdate_columnbuf;(matchtokwithText_->update_line_counterbuf|_->());tokwith|Error0s->errorbufs|Sedlexing.InvalidCodepointi->errorbuf("invalid unicode code point "^string_of_inti)|Sedlexing.MalFormed->errorbuf"malformed utf-8"letrollbackbuftok=buf.next_token<-Sometok(* return next token *)lettokenbuf=matchbuf.next_tokenwith|None->token1buf|Sometok->buf.next_token<-None;tokletinit_from_strings=letlexbuf=Sedlexing.Utf8.from_stringsinmake_buflexbufletinit_from_streams=letlexbuf=Sedlexing.Utf8.from_streamsinmake_buflexbufletinit_from_channelch=letlexbuf=Sedlexing.Utf8.from_channelchinmake_buflexbuflettokenize_strings=letbuf=init_from_stringsinletrecauxaccu=matchtokenbufwith|EOF->List.revaccu|t->aux(t::accu)inaux[]