123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571(* This is an example of a parser recognizing a simplified json grammar by
* separating the lexer and the parser.
*)moduleToken=structtypetp=|End|String|Number|Bool|Colon|Comma|Lbrace|Rbrace|Lbrack|Rbracktypet=tp*stringendmoduleToken_plus=structtypet=Position.range*Token.tendmoduleLexer=structmoduleCP=Character.Make(Unit)(Token_plus)(Fmlib_std.Void)openCP(* Whitespace
* ==========
* - blanks
* - newlines
* - comments of the form "// xxxxx" until the end of line or end of
* input
* - multline comments of the form
* /* xxxx
* xxxxxx */
*)letblank_or_newline:unitt=let*_=char' '</>char'\n'</>char'\r'inreturn()letline_comment:unitt=let*_=backtrack(string"//"){|"//"|}inlet*_=skip_zero_or_more(charp(func->c<>'\n')"any char except newline")inreturn()letmulti_line_comment:unitt=letrecreststar=(* parse the remaining part of a multiline comment after the initial
* "/*". The flag [star] indicates that the previous character of
* the rest has been a '*'.
*)let*c=charp(fun_->true)"any character in a comment"inifnotstar&&c='*'thenresttrueelseifstar&&c='/'thenreturn()elserestfalseinlet*_=backtrack(string"/*"){|"/*"|}inrestfalseletwhitespace:intt=skip_zero_or_more(blank_or_newline</>line_comment</>multi_line_comment)|>no_expectations(* Specific tokens
* ===============
*)letcolon:Token.tt=let*_=char':'inreturn(Token.Colon,":")letcomma:Token.tt=let*_=char','inreturn(Token.Comma,",")letlbrace:Token.tt=let*_=char'{'inreturn(Token.Lbrace,"{")letrbrace:Token.tt=let*_=char'}'inreturn(Token.Rbrace,"}")letlbrack:Token.tt=let*_=char'['inreturn(Token.Lbrack,"[")letrbrack:Token.tt=let*_=char']'inreturn(Token.Rbrack,"]")letstring:Token.tt=let*_=char'"'<?>"string"inlet*lst=zero_or_more(map(func->String.make1c)(charp(func->' '<=c&&c<='~'&&c<>'"')"printable character"))inlet*_=char'"'inreturn(Token.String,String.concat""lst)letnumber:Token.tt=letis_digitc='0'<=c&&c<='9'inmap(funstr->Token.Number,str)(wordis_digitis_digit"number")letbool:Token.tt=map(funstr->Token.Bool,str)(CP.string"true"</>CP.string"false"<?>"bool")(* Combinator recognizing an arbitary token
* ========================================
*
* Preceeding whitespace is stripped off and the token is equipped with its
* start position and its end position.
*)lettoken:Token_plus.tt=lexerwhitespace(Token.End,"")((* None of the tokens needs any backtracking, because all can be
* recognized by looking at the first character. *)number</>string</>bool</>lbrace</>rbrace</>lbrack</>rbrack</>comma</>colon)(* The final lexer
* ===============
*)moduleParser=structincludeCP.Parserletinit:t=(* Lexer starting at the start of the input. *)make_partial()tokenletrestart(lex:t):t=(* Restart the lexer at the current position and replay the not yet
* consumed input on the restarted parser.
*)restart_partialtokenlexendend(* Internal representation of a json construct
* ===========================================
*
* It is a simplified json construct having as elementary values only strings,
* integer numbers and booleans.
*)moduleJson=structtypet=|Numberofint(* make the tests simpler *)|Stringofstring|Boolofbool|Listoftlist|Recordof(string*t)listletnumberi=Numberiletstrings=Stringsletboolb=Boolbletlistlst=Listlstletrecordlst=Recordlstletrecto_string:t->string=(* Compact string representation of a json value *)letopenPrintfinfunction|Numberi->sprintf"%d"i|Boolb->sprintf"%b"b|Strings->letdquote="\""indquote^s^dquote|Listlst->"["^String.concat", "(List.mapto_stringlst)^"]"|Recordlst->"{"^String.concat", "(List.map(fun(key,y)->"\""^key^"\": "^to_stringy)lst)^"}"end(* The parser receiving lexical tokens and parsing a json construct
* ================================================================
*
* Implemented as a [Token_parser] which can be used by the module
* [Parse_with_lexer] to generate the final parser.
*)moduleCombinator=structmoduleTP=Token_parser.Make(Unit)(Token)(Json)(Fmlib_std.Void)moduleParser=TP.ParseropenTPletconst(a:'a)(_:'b):'a=aletstep(expect:string)(etp:Token.tp)(f:string->'a):'at=TP.stepexpect(funstate_(tp,str)->iftp=etpthenSome(fstr,state)elseNone)letzero_or_more_separated(p:'at)(sep:'bt):'alistt=mapList.rev(one_or_more_separated(funx->return[x])(funlst_x->return(x::lst))psep)</>return[]letstring:stringt=step"string"Token.StringFun.idletcolon:_t=step{|":"|}Token.Colon(const"")letcomma:_t=step{|","|}Token.Comma(const"")letlbrace:_t=step{|"{"|}Token.Lbrace(const"")letrbrace:_t=step{|"}"|}Token.Rbrace(const"")letlbrack:_t=step{|"["|}Token.Lbrack(const"")letrbrack:_t=step{|"]"|}Token.Rbrack(const"")letnumber:Json.tt=step"number"Token.Number(funs->Json.number(int_of_strings))letbool:Json.tt=step"bool"Token.Bool(funs->Json.bool(bool_of_strings))letrecjson():Json.tt=mapJson.stringstring</>number</>bool</>(record()<?>"{ <key>: <value>, ... }")</>(list()<?>"[ <value>, ... ]")andrecord():Json.tt=let*_=lbraceinlet*pairs=zero_or_more_separated(key_value_pair()<?>"<key>: <value>")commainlet*_=rbraceinreturnJson.(Recordpairs)andkey_value_pair():(string*Json.t)t=let*key=stringinlet*_=coloninlet*value=json()inreturn(key,value)andlist():Json.tt=let*_=lbrackinlet*lst=zero_or_more_separated(json())commainlet*_=rbrackinreturn(Json.Listlst)letparse:Parser.t=make()(json())end(* The complete parser
* ===================
*)moduleLex=Lexer.ParsermoduleParse=Combinator.ParsermoduleVoid=Fmlib_std.VoidmodulePL=structincludeParse_with_lexer.Make(Unit)(Token)(Json)(Void)(Lex)(Parse)letstart:t=makeLex.initCombinator.parseend(* Helper functions for unit tests and error reporting
* ===================================================
*)modulePretty=Fmlib_pretty.Printletwrite_error(str:string)(p:PL.t):unit=letmoduleReporter=Error_reporter.Make(PL)inifnot(PL.has_succeededp)thenReporter.(make_syntaxp|>run_on_stringstr|>Pretty.layout50|>Pretty.write_to_channelstdout)letcheck_successes(arr:(string*string*string)array):bool=letcheck_success(tag,input,expected)=letopenPLinletp=run_on_stringinputstartinifnot(has_succeededp)thenPrintf.printf"unexpected failure of: %s\n"tag;write_errorinputp;has_succeededp&&letres=finalp|>Json.to_stringinifres<>expectedthenPrintf.printf"%s: expected %s, actual %s\n"tagexpectedres;res=expectedinArray.for_allcheck_successarrletcheck_failures(arr:(string*string*int*int*bool)array):bool=letcheck_failure(tag,input,row,col,flag)=letopenPLinletp=run_on_stringinputstartinletpos=positionpinifhas_succeededpthenPrintf.printf"unexpected success of test: %s\n"tagelseifflagthenwrite_errorinputp;has_failed_syntaxp&&Position.linepos=row&&Position.columnpos=colinArray.for_allcheck_failurearr(* Test cases
* ==========
*)letsuccess_cases:(string*string*string)array=[|"number","100","100";"bool"," true","true";"string",{|"hello"|},{|"hello"|};"number list","[100, 2, 1]","[100, 2, 1]";"arbitrary list",{|[0, true , /**/ "hello", [ ], { }]|},{|[0, true, "hello", [], {}]|};"record",{|{ "a" : 1, "b" : true , "c": "hello" }|},{|{"a": 1, "b": true, "c": "hello"}|};"empty list","/**/ [ ] //","[]";"complex",{|[ {}, [ 1], [ ], {"a": 0, "b": { } }]|},{|[{}, [1], [], {"a": 0, "b": {}}]|};|]letfailure_cases:(string*string*int*int*bool)array=[|"nothing","",0,0,false;"nothing with comment","// comment",0,10,false;"unterminated multiline comment","/*",0,2,false;"missing comma","[1 2]",0,3,false;"unterminated string",{| "|},0,2,false;"unexpected additional json","1 1",0,2,false;|]let%test_=check_successessuccess_caseslet%test_=check_failuresfailure_cases