(* Yoann Padioleau
*
* Copyright (C) 2009-2010 Facebook
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
* version 2.1 as published by the Free Software Foundation, with the
* special exception on linking described in file license.txt.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the file
* license.txt for more details.
*)openCommonmoduleAst=Cst_phpmoduleFlag=Flag_parsing_phpmoduleTH=Token_helpers_phpmodulePI=Parse_info(*****************************************************************************)(* Prelude *)(*****************************************************************************)(* Preprocessor (e.g. XHP) helper functions.
*
* update: now that pfff supports XHP constructs directly, there is far
* less need for preprocessor support and merging tokens.
*
*
* Pfff allows the user to pass a preprocessor command on the command line,
* which makes it possible to parse for instance XHP code.
*
* The preprocessor will generate a file in /tmp which means
* errors or further analysis will report position information
* on this tmp file. This can be inconvenient. If the
* preprocessor maintains line positions (which is mostly the case, for instance
* with XHP), we can slightly improve the situation by
* changing the .file field in parse_info.
*
* Preprocessors such as XHP also remove comments, spaces, and indentations.
* It can be useful to merge the original comments/spaces/indents in the
* original file with the tokens in the expanded file. This makes it easier
* to support refactoring on XHP code for most of the tokens. Our solution
* to merge is:
* - transform certain tokens in the original file as
* TCommentPP tokens when they were transformed or passed by
* the preprocessor,
* - mark the new tokens from the preprocessed code with a ExpandedTok
* - adjust the file and position information from similar tokens with
* the information from the original file.
*
* Merging is not always possible if the preprocessor introduces really new
* tokens that the original PHP parser does not understand. In the case
* of XHP, which mainly uses '<', ':', '>' and strings, this is mostly ok.
* Nevertheless the original code can also contain
* quotes inside HTML snippets, which completely confuse the
* original lexer, which then produces tokens that are too different
* from the tokens after preprocessing (see tests/xhp/xhp_line_bug4.php).
* It is then too hard to synchronize them. What is important
* when we have problems synchronizing, is to fall back to a default mode
* where everything is marked as ExpandedTok and TCommentPP.
*)

let (<=>) = Common2.(<=>)

(* a few helpers *)

(* Splits [toks] into consecutive (line, tokens) groups, using the line
 * given by PI.line_of_info for each token. Tokens keep their input order
 * inside a group.
 *
 * Grouping starts at line 1: the first group is always the one for line 1,
 * and its token list is empty when the first token is on a later line (or
 * when [toks] itself is empty). Lines that carry no token after that do
 * not produce a group.
 *
 * Raises Failure when a token is on an earlier line than its predecessor.
 *)
let group_tokens_by_line toks =
  (* [line_no] is the line being collected; [pending] holds its tokens
   * in reverse order. *)
  let rec collect line_no pending remaining =
    let flush () = (line_no, List.rev pending) in
    match remaining with
    | [] -> [flush ()]
    | tok :: rest ->
        let tok_info = TH.info_of_tok tok in
        let tok_line = PI.line_of_info tok_info in
        begin match tok_line <=> line_no with
        | Common2.Sup ->
            (* the token opens a new line: close the current group first *)
            let finished = flush () in
            finished :: collect tok_line [tok] rest
        | Common2.Equal ->
            collect line_no (tok :: pending) rest
        | Common2.Inf ->
            failwith
              ("Impossible: wrong order of tokens: " ^ PI.str_of_info tok_info)
        end
  in
  collect 1 [] toks

(* Replaces every token by a TCommentPP token carrying the same info. *)
let comment_pp_ize toks =
  let as_pp_comment tok = Parser_php.TCommentPP (TH.info_of_tok tok) in
  List.map as_pp_comment toks

(* Rewrites the info of every token in [toks] as an ExpandedTok built from:
 *  - the location of the token itself (in the preprocessed file),
 *  - [last_orig_parse_info], the location of an original-file token,
 *  - a running offset.
 *
 * The offset starts at the string length of [last_orig_parse_info] and
 * grows by the string length of each token; a token is stamped with the
 * value obtained after its own length has been added.
 *)
let mark_as_expanded last_orig_parse_info toks =
  let offset = ref (String.length last_orig_parse_info.Parse_info.str) in
  let expand tok =
    let tok_len = String.length (PI.str_of_info (TH.info_of_tok tok)) in
    offset := !offset + tok_len;
    let retag info =
      let parse_info_in_pp = Parse_info.token_location_of_info info in
      { info with
        Parse_info.token =
          Parse_info.ExpandedTok
            (parse_info_in_pp, last_orig_parse_info, !offset);
      }
    in
    TH.visitor_info_of_tok retag tok
  in
  (* the tokens must be visited in order since [offset] is accumulated *)
  List.map expand toks

(* Merging tokens on a single line.
*
* As a first cut, we just want to know if this line has the same non-comment
* tokens in the original and preprocessed code. In that case the line
* didn't really need XHP so we can return the original
* set of tokens which will have better indentation, comments, etc.
* Otherwise we return the tokens in the preprocessed code, but with
* a special mark, ExpandedTok. We also add the original tokens
* as TCommentPP so that the unparser can still print back those
* original tokens and pass the ExpandedTok one.
*
* todo: could try to be more precise and merge more tokens inside
* a line modified by XHP. Could do a diff at token level
* and adjust accordingly.
*
*)
let merge_tokens_line ~orig_toks ~pp_toks =
  (* Non-comment tokens, each passed through Ast.al_info so that the two
   * sides can be compared structurally. *)
  let al_tok tok = TH.visitor_info_of_tok (fun info -> Ast.al_info info) tok in
  let significant toks =
    List.map al_tok (Common.exclude TH.is_comment toks)
  in
  let orig_sig = significant orig_toks in
  let pp_sig = significant pp_toks in
  if orig_sig =*= pp_sig then
    (* same code on both sides: keep the original tokens untouched *)
    orig_toks
  else begin
    (* todo: could do finer grained things here *)
    let commented_orig = comment_pp_ize orig_toks in
    if null commented_orig then
      failwith "WEIRD: a XHP line has tokens but not the original one";
    let last_orig_parse_info =
      Common2.list_last commented_orig
      |> TH.info_of_tok
      |> Parse_info.token_location_of_info
    in
    (* original tokens (as TCommentPP) first, then the expanded ones *)
    commented_orig @ mark_as_expanded last_orig_parse_info pp_toks
  end

(* Walks the per-line groups of the original file and of the preprocessed
 * file in parallel and returns a single flat token list.
 *
 * Raises Failure when [toks_orig_lines] is empty.
 *
 * NOTE(review): group_tokens_by_line always emits a group for line 1, with
 * an empty token list when the first token is on a later line. List.hd and
 * Common2.list_last below would then be applied to an empty list; confirm
 * that the tokenizer always yields a token on line 1.
 *)
let zip_and_sync ~toks_orig_lines ~toks_pp_lines =
  (* old: List.map snd toks_pp_lines +> List.flatten *)

  (* This is used below just in one very ugly situation *)
  let last_orig_tok =
    match toks_orig_lines with
    | (_, first_toks) :: _ -> ref (List.hd first_toks)
    | [] ->
        failwith
          "Impossible: if the file is empty then we should not be called at all"
  in
  let rec sync orig_lines pp_lines =
    match orig_lines, pp_lines with
    | [], [] -> []

    | _ :: _, [] ->
        (* The original can have some comments at the end of the file,
         * which would be removed by XHP. Keep those lines as they are.
         *
         * TODO: assert only space or eof
         *)
        List.map snd orig_lines

    | [], _ :: _ ->
        (* XHP usually cuts the space and comments at the end of the file
         * so the original file should always be longer.
         *
         * Nevertheless, in certain situations where the tokens
         * are very different with very different line positions,
         * sync can get confused. See xhp_line_bug4.php. So the best
         * we can do here is to fall back to our invariant and
         * mark the tokens as ExpandedTok.
         *)
        let leftover = List.flatten (List.map snd pp_lines) in
        let last_orig_parse_info =
          Parse_info.token_location_of_info (TH.info_of_tok !last_orig_tok)
        in
        [mark_as_expanded last_orig_parse_info leftover]

    | (orig_line, orig_toks) :: orig_rest, (pp_line, pp_toks) :: pp_rest ->
        last_orig_tok := Common2.list_last orig_toks;
        begin match orig_line <=> pp_line with
        | Common2.Equal ->
            let merged = merge_tokens_line ~orig_toks ~pp_toks in
            merged :: sync orig_rest pp_rest

        | Common2.Inf ->
            (* Sometimes XHP just removes certain tokens, like
             * lines with attribute x y; in which case we must
             * remove them also from the original file.
             *
             * It's also usually because comments are passed by XHP.
             * We could maybe assert to have only space here or
             * tokens known to be passed by XHP like attribute
             *)
            let hidden = comment_pp_ize orig_toks in
            hidden :: sync orig_rest pp_lines

        | Common2.Sup ->
            (* sometimes XHP removes some lines ... so have to adjust
             * things: shift every remaining preprocessed line by one
             * and try again.
             *)
            pr2 (spf "WEIRD, wrong line numbers in preprocessed file %d > %d"
                   orig_line pp_line);
            let shifted =
              List.map (fun (line, toks) -> (line + 1, toks)) pp_lines
            in
            sync orig_lines shifted
        end
  in
  List.flatten (sync toks_orig_lines toks_pp_lines)

(* Trying to merge tokens from the original file with the preprocessed file,
* for instance to put back space, indentation and comments information
* in the preprocessed file.
*)
let adapt_tokens_pp2 ~tokenizer ~orig_filename toks_pp =
  if !Flag.obsolete_merge_tokens_xhp then begin
    (* algo:
     *  - split by line the tokens
     *  - for each line try to synchronize,
     *    by marking as TCommentPP the relevant tokens from toks_orig
     *    and adding one from toks as ExpandedTok
     *)
    let toks_orig = tokenizer orig_filename in
    let toks_pp_lines = group_tokens_by_line toks_pp in
    let toks_orig_lines = group_tokens_by_line toks_orig in
    zip_and_sync ~toks_orig_lines ~toks_pp_lines
  end
  else begin
    (* The old algorithm was just to adjust the .file field so that error
     * reporting was slightly improved.
     *
     * OriginTok gets [orig_filename] as its file, FakeTokStr and Ab are
     * kept as they are, and an ExpandedTok is not expected at this point.
     *)
    let relocate ii =
      let new_token =
        match ii.PI.token with
        | Parse_info.OriginTok pi ->
            Parse_info.OriginTok { pi with Parse_info.file = orig_filename }
        | (Parse_info.FakeTokStr _ | Parse_info.Ab) as unchanged -> unchanged
        | Parse_info.ExpandedTok _ -> raise Impossible
      in
      { ii with Parse_info.token = new_token }
    in
    (* ugly, but need tail-call rev_map and so this rev *)
    toks_pp
    |> List.rev_map (fun tok -> TH.visitor_info_of_tok relocate tok)
    |> List.rev
  end

(* Same as adapt_tokens_pp2, run under Common.profile_code. *)
let adapt_tokens_pp ~tokenizer ~orig_filename toks_pp =
  let run () = adapt_tokens_pp2 ~tokenizer ~orig_filename toks_pp in
  Common.profile_code "Parse_php.merge tokens" run