(* Yoann Padioleau
*
* Copyright (C) 2013 Facebook
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License (GPL)
* version 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* file license.txt for more details.
*)openCommonopenParser_phpmodulePI=Parse_infomoduleTH=Token_helpers_php(*****************************************************************************)(* Prelude *)(*****************************************************************************)(*
* This module transforms certain tokens like '>>', normally a T_SR
* into two TGREATER tokens which helps avoid using ugly tricks in the grammar
* regarding generics.
*
* This is similar to what we do for C/C++.
* See pfff/lang_cpp/parsing/parsing_hacks.ml for more information.
*
* In Hack they maintain those different states (InToplevel, InFunction,
* InBlock, ...) in the lexer itself, I prefer for now to separate
* concerns and do that entirely post-lexing (which introduces some performance
* degradation, from 195s to parse www to 209s).
*)

(*****************************************************************************)
(* Types *)
(*****************************************************************************)

(* State carried along while walking the token stream in [fix_tokens2]. *)
type env = {
  (* Enclosing syntactic contexts, innermost first: new contexts are consed
   * onto the head, and the head is what the token pass inspects. *)
  stack: ctx list;
  (* Placeholder of type unit; it is never read in this file. *)
  misc: unit;
}

(* The kinds of syntactic context that can appear on [env.stack]. *)
and ctx =
  | Toplevel
  | ClassHeader
  | ClassBody
  | FunctionHeader
  | TypeHeader
  | UserAttribute
  | Block

(*****************************************************************************)
(* generics *)
(*****************************************************************************)

(* Split a single (assumed to be 2-chars wide) info and turn it
into a (1-char) lhs and rhs. Used to convert `>>` into two `>`
*)
(* [split_two_char pi] returns [(first, second)]: two copies of the location
 * [pi], each carrying one character of [pi]'s string. The second copy has
 * its [charpos] and [column] advanced by one; every other field is inherited
 * from [pi] unchanged.
 * [String.sub] raises [Invalid_argument] if the string is shorter than two
 * characters.
 *)
let split_two_char pi =
  let text = pi.Parse_info.str in
  let first_half = { pi with Parse_info.str = String.sub text 0 1 } in
  let second_half =
    { pi with
      Parse_info.str = String.sub text 1 1;
      Parse_info.charpos = pi.Parse_info.charpos + 1;
      Parse_info.column = pi.Parse_info.column + 1;
    }
  in
  (first_half, second_half)

(* Same as [split_two_char] but working on a whole info record.
 * Only infos whose token is an [OriginTok] can be split; anything else
 * fails with [Failure "Parse error..."]. Both resulting infos are fresh
 * records with [NoTransfo].
 *)
let split_two_char_info i =
  let origin =
    match i.Parse_info.token with
    | Parse_info.OriginTok loc -> loc
    | _ -> failwith "Parse error..."
  in
  (* wrap a raw location back into an untransformed info *)
  let as_info loc =
    { Parse_info.token = Parse_info.OriginTok loc;
      Parse_info.transfo = Parse_info.NoTransfo;
    }
  in
  let (left_loc, right_loc) = split_two_char origin in
  (as_info left_loc, as_info right_loc)

(*
* Utilities for lambda parsing
*)(*
* Checks if the given tokens are compatible with a set of lambda params.
* It must either be empty, as in () ==> ... or contain one variable/variadic.
*
* Both of these cases are not compatible with typehints, so we can safely
* determine if a (...) expression is part of lambda's params or its typehint.
*)
let is_params toks =
  (* a (lambda or plain) parenthesis, or a comment token *)
  let paren_or_comment = function
    | T_LAMBDA_OPAR _ | T_LAMBDA_CPAR _ | TOPAR _ | TCPAR _ -> true
    | other -> TH.is_comment other
  in
  (* a token that can only show up in a parameter list *)
  let variable_or_variadic = function
    | T_VARIABLE _ | T_ELLIPSIS _ -> true
    | _ -> false
  in
  match toks with
  | [] -> false
  | _ ->
    List.for_all paren_or_comment toks
    || List.exists variable_or_variadic toks

(* Looks to see if the next token is a variable (ignoring comments) *)
let rec is_variable toks =
  match toks with
  | T_VARIABLE _ :: _ -> true
  | tok :: rest when TH.is_comment tok -> is_variable rest
  | _ -> false

(*
* Find the next group of parenthesized tokens, being sure to balance parens.
* Returns an empty list if the parens were imbalanced or the first non-comment
* token was anything except a close paren.
*
* Replaces the opening/closing parens with lambda parens if `replace` is true.
*)
(* Orientation: [toks] is scanned from its head. The caller in this file
 * ([fix_tokens2]) passes its accumulator, which holds the already-seen
 * tokens in reverse source order, so the closing paren is met first and the
 * matching opening paren last.
 *
 * Returns [(group, remaining)]:
 *  - [group] is the parenthesized run, in the same order as [toks] (closing
 *    paren first, opening paren last), with '>>' tokens split into two '>'
 *    tokens when [replace] is true;
 *  - [remaining] is what follows the opening paren in [toks].
 * On failure the result is [([], [])].
 *
 * May raise [Failure] through [split_two_char_info] when a '>>' token does
 * not carry an original-token location.
 *)
let find_paren_tokens toks replace =
  (* [acc] collects the visited tokens; since [toks] is walked head-first,
   * [acc] ends up in the opposite order and is reversed again on success.
   * [depth] is the number of closing parens still waiting for their
   * opening paren. *)
  let rec aux toks acc depth =
    match toks with
    | [] -> ([], []) (* failure *)
    | x :: xs ->
      (match x with
       | TCPAR t ->
         (* only the outermost closing paren becomes a lambda paren *)
         let x' = if depth = 0 && replace then T_LAMBDA_CPAR t else x in
         aux xs (x' :: acc) (depth + 1)
       | TOPAR t ->
         if depth <= 0 then
           (* an opening paren before any closing paren: this is not a
            * parenthesized group, so fail instead of letting [depth] go
            * negative *)
           ([], [])
         else if depth = 1 then
           let x' = if replace then T_LAMBDA_OPAR t else x in
           (List.rev (x' :: acc), xs)
         else
           aux xs (x :: acc) (depth - 1)
       | T_SR t ->
         if depth > 0 then
           if replace then
             (* In the context of lambda parens, >> only makes sense
              * if we split it into two > tokens.
              * [acc] is in the opposite order of [toks], so the left '>'
              * must end up in front of the right one here; the final
              * [List.rev] then restores the order of [toks]. *)
             let (lhs, rhs) = split_two_char_info t in
             aux xs (TGREATER lhs :: TGREATER rhs :: acc) depth
           else
             aux xs (x :: acc) depth
         else
           ([], [])
       | _ ->
         if TH.is_comment x || depth > 0 then
           aux xs (x :: acc) depth
         else
           (* couldn't find the first closing paren *)
           ([], []))
  in
  aux toks [] 0

(*
* Try to (roughly) match a lambda typehint - may have false positives.
* On the other hand, it's guaranteed that any valid typehint will be matched.
* False positives will most likely lead to an invalid set of lambda
* parens, though.
*)
(* Orientation: [toks] is scanned from its head; [fix_tokens2] passes its
 * accumulator (tokens in reverse source order), so closing delimiters are
 * met before the opening ones and the ':' introducing the typehint comes
 * last.
 *
 * Returns [(typehint, remaining)] where [typehint] runs from the head of
 * [toks] up to and including the first ':' found at nesting depth 0 (in the
 * same order as [toks], with '>>' split into two '>'), and [remaining] is
 * what follows that ':' in [toks]. Returns [([], [])] when the input is
 * exhausted or a '==>', '{' or '}' is met first.
 *
 * May raise [Failure] through [split_two_char_info].
 *)
let find_typehint toks =
  (* [acc] holds the visited tokens in the opposite order of [toks];
   * [depth] counts closers ( ')' and '>' ) not yet matched by an opener. *)
  let rec aux toks acc depth =
    match toks with
    | [] -> ([], []) (* failure *)
    (* assume parens/brackets are balanced correctly *)
    | x :: xs ->
      (match x with
       | T_LAMBDA_CPAR _ | TCPAR _ | TGREATER _ ->
         aux xs (x :: acc) (depth + 1)
       | T_LAMBDA_OPAR _ | TOPAR _ | TSMALLER _ ->
         aux xs (x :: acc) (depth - 1)
       | T_SR t ->
         (* >> when we're looking for a typehint is only valid in the
          * context of closing a template, so split it up.
          * [acc] is in the opposite order of [toks], hence the left '>'
          * goes in front of the right one; the final [List.rev] puts them
          * back in the order of [toks]. *)
         let (lhs, rhs) = split_two_char_info t in
         aux xs (TGREATER lhs :: TGREATER rhs :: acc) (depth + 2)
       | T_DOUBLE_ARROW _ | TOBRACE _ | TCBRACE _ ->
         ([], []) (* absolutely will not be in a typehint *)
       | TCOLON _ ->
         if depth = 0 then (List.rev (x :: acc), xs)
         else aux xs (x :: acc) depth
       | _ -> aux xs (x :: acc) depth)
  in
  aux toks [] 0

(*****************************************************************************)
(* Fix tokens *)
(*****************************************************************************)

(* Post-lexing pass over the whole token list [xs]. It returns a new token
 * list in which:
 *  - '>>' is replaced by two '>' tokens when the innermost tracked context
 *    is a class header/body, a type header or a function header;
 *  - the parentheses delimiting the parameters of a '==>' lambda are
 *    replaced by the dedicated lambda paren tokens, when they can be found.
 *
 * Raises [Failure] on a closing brace met with an empty context stack, and
 * through [split_two_char_info].
 *)
let fix_tokens2 xs =
  (* [acc] holds the tokens already emitted, most recent first. *)
  let rec aux env acc xs =
    match xs with
    (* need an acc, to be tail recursive, otherwise get some stack overflow *)
    | [] -> List.rev acc

    (* '>>', maybe should be split in two tokens '>' '>' when in generic
     * context
     *)
    | T_SR ii :: xs ->
      (match env.stack with
       (* type context, those are the only places where types allowed for
        * now, which makes the job easier than in parsing_hacks_java.ml
        *)
       | (ClassHeader | ClassBody | TypeHeader | FunctionHeader) :: _ ->
         let (lhs, rhs) = split_two_char_info ii in
         (* [acc] is reversed, so the right '>' is pushed on top *)
         aux env (TGREATER rhs :: TGREATER lhs :: acc) xs
       | UserAttribute :: rest ->
         (* this '>>' closes the attribute opened by '<<': pop the context *)
         aux { env with stack = rest } (T_SR ii :: acc) xs
       | _ -> aux env (T_SR ii :: acc) xs)

    (* This must be part of a lambda expression.
     * The parameters of a lambda expression are extremely difficult to parse
     * due to their similarity to standard expressions.
     * To get around this, we'll try to mark the opening and closing parens
     * of the lambda's parameters with special lambda paren tokens.
     *)
    | T_DOUBLE_ARROW arrow :: xs ->
      (* Fallback shared by both failure modes below: treat what precedes
       * '==>' as "params : typehint" and mark the parens found before the
       * typehint. Yields [([], acc)], i.e. leaves the tokens untouched and
       * lets the parser deal with it, when that does not work out. *)
      let via_typehint () =
        let (typehint, before_hint) = find_typehint acc in
        match typehint with
        | [] -> ([], acc)
        | _ ->
          let (params, rest2) = find_paren_tokens before_hint true in
          if is_params params then (typehint @ params, rest2)
          else ([], acc)
      in
      let (replaced, rest) =
        (* Nothing needs to be done for $x ==> ... *)
        if is_variable acc then ([], acc)
        else
          (* The majority of the time, lambdas aren't typehinted so let's
           * just eagerly replace the parens assuming these are the params. *)
          let (toks, rest) = find_paren_tokens acc true in
          (match toks with
           (* A paren group was found. There are two possibilities:
            *
            * 1) The typehint is a tuple or function, in which case this
            *    closing paren is part of a typehint.
            * 2) The closing paren is part of the parameters.
            *
            * is_params will be able to distinguish between the two cases.
            *)
           | _ :: _ when is_params toks -> (toks, rest)
           (* Either not a set of parens (this is probably a typehint), or
            * parens that are not parameters: try finding a typehint. *)
           | _ -> via_typehint ())
      in
      aux env (T_DOUBLE_ARROW arrow :: (replaced @ rest)) xs

    | x :: xs ->
      let stack =
        (* quite similar to hack/lexing_modes.ml *)
        match x, env.stack with
        (* ugly: we check we are at toplevel because the keyword 'class'
         * could be used in a different context as part of an XHP attribute
         * name, see ident_xhp_attr_name_atom rule in parser_php.mly
         *)
        | (T_CLASS _ | T_TRAIT _ | T_INTERFACE _), Toplevel :: _ ->
          ClassHeader :: env.stack
        | (T_TYPE _ | T_NEWTYPE _), Toplevel :: _ ->
          TypeHeader :: env.stack
        (* NOTE(review): inside a class body the top of the stack is
         * ClassBody, so this arm does not fire for methods; confirm
         * whether ClassBody was intended instead of ClassHeader. *)
        | T_FUNCTION _, (Toplevel | ClassHeader) :: _ ->
          FunctionHeader :: env.stack
        | T_FUNCTION _, Block :: _ ->
          FunctionHeader :: env.stack
        (* also FunctionHeader because we can have attributes on parameters *)
        | T_SL _, (Toplevel | ClassBody | FunctionHeader) :: _ ->
          UserAttribute :: env.stack
        | TOBRACE _, ClassHeader :: rest ->
          ClassBody :: rest
        (* subtle: do not do Block::env.stack here otherwise we will
         * not pop up enough to get back to a Toplevel context
         *)
        | TOBRACE _, FunctionHeader :: rest ->
          Block :: rest
        | TOBRACE _, _ ->
          Block :: env.stack
        | (T_CURLY_OPEN _ | T_DOLLAR_OPEN_CURLY_BRACES _), _ ->
          Block :: env.stack
        | TCBRACE _, _ :: rest ->
          rest
        | TCBRACE ii, [] ->
          failwith (spf "unmatching closing brace at %s"
                      (PI.string_of_info ii))
        (* a ';' ends a body-less function or a type declaration *)
        | TSEMICOLON _, (FunctionHeader | TypeHeader) :: rest ->
          rest
        (* default case *)
        | _, st -> st
      in
      aux { env with stack } (x :: acc) xs
  in
  aux { stack = [Toplevel]; misc = () } [] xs

(* Entry point: runs [fix_tokens2] under the "Parse_php.fix_tokens"
 * profiling label. *)
let fix_tokens a =
  Common.profile_code "Parse_php.fix_tokens" (fun () -> fix_tokens2 a)