123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666(* Yoann Padioleau
*
* Copyright (C) 2011, 2014 Facebook
* Copyright (C) 2007, 2008 Ecole des Mines de Nantes
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License (GPL)
* version 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* file license.txt for more details.
*)openCommonmoduleFlag=Flag_parsingmoduleFlag_cpp=Flag_parsing_cppmodulePI=Parse_infomoduleTH=Token_helpers_cppopenParser_cpp(*****************************************************************************)(* Prelude *)(*****************************************************************************)(*
* This module makes it easier to write some fuzzy parsing heuristics
* by offering different "views" over the same set of tokens.
*
* Normally I should not use ref/mutable in the token_extended type below
* and instead have a set of functions taking a list of tokens and
* returning a list of tokens. The problem is that to make easier some
* functions, it is better to work on better representation, on "views"
* over this list of tokens. But then modifying those views and get
* back from those views to the original simple list of tokens is
* tedious. One way is to maintain next to the view a list of "actions"
* (I was using a hash storing the charpos of the token and associating
* the action) but it is tedious too. Simpler to use mutable/ref. We
* use the same idea that we use when working on the Ast.
*
* old: when I was using the list of "actions" next to the views, the hash
* indexed by the charpos, there could have been some problems:
* how my fake_pos interact with the way I tag and adjust token ?
* because I base my tagging on the position of the token ! so sometimes
* could tag another fakeInfo that should not be tagged ?
* fortunately I don't use anymore this technique.
*)(*****************************************************************************)(* Some debugging functions *)(*****************************************************************************)letpr2,_pr2_once=Common2.mk_pr2_wrappersFlag.verbose_parsing(*****************************************************************************)(* Types *)(*****************************************************************************)typetoken_extended={(* chose 't' and not 'tok' to have a short name because we will write
* lots of ocaml patterns around this ... so better to be short
*)mutablet:Parser_cpp.token;(* In C++ we have functions inside classes, so need a stack of context *)mutablewhere:contextlist;(* less: need also a after ? *)mutablenew_tokens_before:Parser_cpp.tokenlist;(* line x col cache (more easily accessible) of the info in the token *)line:int;col:int;}(* The strategy to tag is mostly to look at the token(s) before the '{' *)andcontext=|InTopLevel|InClassStructofstring(* can be __anon__ *)|InEnum|InInitializer|InAssign|InParameter|InArgument(* TODO actually commented in token_view_context because of c++ *)|InFunction(*
| InTemplateParam (* TODO *)
*)(* InCondition ? InParenExpr ? *)(* x list list, because x list separated by ',' *)typeparen_grouped=|Parenthisedofparen_groupedlistlist*token_extendedlist|PTokenoftoken_extendedtypebrace_grouped=|Braceisedofbrace_groupedlistlist*token_extended*token_extendedoption|BTokenoftoken_extended(* Far better data structure than doing hacks in the lexer or parser
* because in lexer we don't know to which ifdef a endif is related
* and so when we want to comment a ifdef, we don't know which endif
* we must also comment. Especially true for the #if 0 which sometimes
* have a #else part.
*
* x list list, because x list separated by #else or #elif
*)typeifdef_grouped=|Ifdefofifdef_groupedlistlist*token_extendedlist|Ifdefboolofbool*ifdef_groupedlistlist*token_extendedlist|NotIfdefLineoftoken_extendedlisttype'aline_grouped=Lineof'alisttypebody_function_grouped=|BodyFunctionoftoken_extendedlist|NotBodyLineoftoken_extendedlist(* quite similar to ast_fuzzy.ml but with extended token *)typemulti_grouped=|Bracesoftoken_extended*multi_groupedlist*token_extendedoption|Parensoftoken_extended*multi_groupedlist*token_extendedoption|Angleoftoken_extended*multi_groupedlist*token_extendedoption|Tokoftoken_extended(* with tarzan *)exceptionUnclosedSymbolofstring(*****************************************************************************)(* Helpers *)(*****************************************************************************)letmk_token_extendedx=letinfo=TH.info_of_tokxinlet(line,col)=PI.line_of_infoinfo,PI.col_of_infoinfoin{t=x;line=line;col=col;(* we use List.hd at a few places, so convenient to have a sentinel *)where=[InTopLevel];new_tokens_before=[];}letmk_token_fakex={t=x;line=-1;col=-1;where=[InTopLevel];new_tokens_before=[];}letrebuild_tokens_extentedtoks_ext=let_tokens=ref[]intoks_ext|>List.iter(funtok->tok.new_tokens_before|>List.iter(funx->pushx_tokens);pushtok.t_tokens);lettokens=List.rev!_tokensin(tokens|>Common2.acc_mapmk_token_extended)(*****************************************************************************)(* View builders *)(*****************************************************************************)(* ------------------------------------------------------------------------- *)(* Parens *)(* ------------------------------------------------------------------------- *)(* todo: synchro ! use more indentation
* if paren not closed and same indentation level, certainly because
* part of a mid-ifdef-expression.
*
* c++ext: TODO: need to handle templates here.
* The parenthized view must not consider the ',' in expressions
* like foo(lexical cast<string,int>, ...) as a separator for the arguments
* of foo(), otherwise we will get [lexical_cast<string; ...] which
* could confuse some heuristics.
*
* pre: have done the TInf->TInf_Template translation.
*)letrecmk_parenthisedxs=matchxswith|[]->[]|x::xs->(matchx.twith|xxwhenTH.is_oparxx->letbody,extras,xs=mk_parameters[x][]xsinParenthised(body,extras)::mk_parenthisedxs|_->PTokenx::mk_parenthisedxs)(* return the body of the parenthised expression and the rest of the tokens *)andmk_parametersextrasacc_before_sepxs=matchxswith|[]->(* maybe because of #ifdef which "opens" '(' in 2 branches *)pr2"PB: not found closing paren in fuzzy parsing";[List.revacc_before_sep],List.revextras,[]|x::xs->(matchx.twith(* synchro *)|xxwhenTH.is_obracexx&&x.col=0->pr2"PB: found synchro point } in paren";[List.revacc_before_sep],List.rev(extras),(x::xs)|xxwhenTH.is_cparxx->[List.revacc_before_sep],List.rev(x::extras),xs|xxwhenTH.is_oparxx->letbody,extrasnest,xs=mk_parameters[x][]xsinmk_parametersextras(Parenthised(body,extrasnest)::acc_before_sep)xs|TComma_->letbody,extras,xs=mk_parameters(x::extras)[]xsin(List.revacc_before_sep)::body,extras,xs|_->mk_parametersextras(PTokenx::acc_before_sep)xs)(* ------------------------------------------------------------------------- *)(* Brace *)(* ------------------------------------------------------------------------- *)letrecmk_braceisedxs=matchxswith|[]->[]|x::xs->(matchx.twith|xxwhenTH.is_obracexx->letbody,endbrace,xs=mk_braceised_aux[]xsinBraceised(body,x,endbrace)::mk_braceisedxs|xxwhenTH.is_cbracexx->pr2"PB: found closing brace alone in fuzzy parsing";BTokenx::mk_braceisedxs|_->BTokenx::mk_braceisedxs)(* return the body of the parenthised expression and the rest of the tokens *)andmk_braceised_auxaccxs=matchxswith|[]->(* maybe because of #ifdef which "opens" '(' in 2 branches *)pr2"PB: not found closing brace in fuzzy parsing";[List.revacc],None,[]|x::xs->(matchx.twith|xxwhenTH.is_cbracexx->[List.revacc],Somex,xs|xxwhenTH.is_obracexx->letbody,endbrace,xs=mk_braceised_aux[]xsinmk_braceised_aux(Braceised(body,x,endbrace)::acc)xs|_->mk_braceised_aux(BTokenx::acc)xs)(* ------------------------------------------------------------------------- *)(* Ifdefs *)(* ------------------------------------------------------------------------- *)letrecmk_ifdefxs=matchxswith|[]->[]|x::xs->(matchx.twith|TIfdef_->letbody,extra,xs=mk_ifdef_parameters[x][]xsinIfdef(body,extra)::mk_ifdefxs|TIfdefBool(b,_)->letbody,extra,xs=mk_ifdef_parameters[x][]xsin(* if not passing, then consider a #if 0 as an ordinary #ifdef *)if!Flag_cpp.if0_passingthenIfdefbool(b,body,extra)::mk_ifdefxselseIfdef(body,extra)::mk_ifdefxs|TIfdefMisc(b,_)|TIfdefVersion(b,_)->letbody,extra,xs=mk_ifdef_parameters[x][]xsinIfdefbool(b,body,extra)::mk_ifdefxs|_->(* todo? can have some Ifdef in the line ? *)letline,xs=Common.span(funy->y.line=x.line)(x::xs)inNotIfdefLineline::mk_ifdefxs)andmk_ifdef_parametersextrasacc_before_sepxs=matchxswith|[]->(* Note that mk_ifdef is assuming that CPP instruction are alone
* on their line. Because I do a span (fun x -> is_same_line ...)
* I might take with me a #endif if this one is mixed on a line
* with some "normal" tokens.
*)pr2"PB: not found closing ifdef in fuzzy parsing";[List.revacc_before_sep],List.revextras,[]|x::xs->(matchx.twith|TEndif_->[List.revacc_before_sep],List.rev(x::extras),xs|TIfdef_->letbody,extrasnest,xs=mk_ifdef_parameters[x][]xsinmk_ifdef_parametersextras(Ifdef(body,extrasnest)::acc_before_sep)xs|TIfdefBool(b,_)->letbody,extrasnest,xs=mk_ifdef_parameters[x][]xsinif!Flag_cpp.if0_passingthenmk_ifdef_parametersextras(Ifdefbool(b,body,extrasnest)::acc_before_sep)xselsemk_ifdef_parametersextras(Ifdef(body,extrasnest)::acc_before_sep)xs|TIfdefMisc(b,_)|TIfdefVersion(b,_)->letbody,extrasnest,xs=mk_ifdef_parameters[x][]xsinmk_ifdef_parametersextras(Ifdefbool(b,body,extrasnest)::acc_before_sep)xs|TIfdefelse_|TIfdefelif_->letbody,extras,xs=mk_ifdef_parameters(x::extras)[]xsin(List.revacc_before_sep)::body,extras,xs|_->letline,xs=Common.span(funy->y.line=x.line)(x::xs)inmk_ifdef_parametersextras(NotIfdefLineline::acc_before_sep)xs)(* ------------------------------------------------------------------------- *)(* Lines (of parens) *)(* ------------------------------------------------------------------------- *)letline_of_paren=function|PTokenx->x.line|Parenthised(_xxs,info_parens)->(matchinfo_parenswith|[]->raiseImpossible|x::_xs->x.line)(* old
let rec span_line_paren line = function
| [] -> [],[]
| x::xs ->
(match x with
| PToken tok when TH.is_eof tok.t ->
[], x::xs
| _ ->
if line_of_paren x = line
then
let (l1, l2) = span_line_paren line xs in
(x::l1, l2)
else ([], x::xs)
)
let rec mk_line_parenthised xs =
match xs with
| [] -> []
| x::xs ->
let line_no = line_of_paren x in
let line, xs = span_line_paren line_no xs in
Line (x::line)::mk_line_parenthised xs
*)letline_range_of_paren=function|PTokenx->x.line,x.line|Parenthised(_xxs,info_parens)->(matchinfo_parenswith|[]->raiseImpossible|x::xs->letlines_no=(x::xs)|>List.map(funx->x.line)inCommon2.minimumlines_no,Common2.maximumlines_no)letrecspan_line_paren_range(imin,imax)=function|[]->[],[]|x::xs->(matchxwith|PTokentokwhenTH.is_eoftok.t->[],x::xs|_->ifline_of_parenx>=imin&&line_of_parenx<=imaxthen(* may need to extend *)let(_imin',imax')=line_range_of_parenxinlet(l1,l2)=span_line_paren_range(imin,maximaximax')xsin(x::l1,l2)else([],x::xs))letrecmk_line_parenthisedxs=matchxswith|[]->[]|x::xs->letline_range=line_range_of_parenxinletline,xs=span_line_paren_rangeline_rangexsinLine(x::line)::mk_line_parenthisedxs(* ------------------------------------------------------------------------- *)(* Function body *)(* ------------------------------------------------------------------------- *)letrecmk_body_function_groupedxs=matchxswith|[]->[]|x::xs->(matchxwith|{t=TOBrace_;col=0;_}->letis_closing_brace=function|{t=TCBrace_;col=0;_}->true|_->falseinletbody,xs=Common.span(funx->not(is_closing_bracex))xsin(matchxswith|({t=TCBrace_;col=0;_})::xs->BodyFunctionbody::mk_body_function_groupedxs|[]->pr2"PB:not found closing brace in fuzzy parsing";[NotBodyLinebody]|_->raiseImpossible)|_->letline,xs=Common.span(funy->y.line=x.line)(x::xs)inNotBodyLineline::mk_body_function_groupedxs)(* ------------------------------------------------------------------------- *)(* Multi ('{', '(', '<') (could also do '[' ?) *)(* ------------------------------------------------------------------------- *)(* Assumes work on a list of tokens without comments, without ifdefs
* (todo? and without #define?).
* Used for typedef inference. Now also used for fuzzy parsing!
*
* todo? more fault tolerance, if col == 0 and { the reset!
* less: could check that it's consistent with the indentation
*
*)letmk_multixs=letrecconsumexxs=matchxwith|{t=(*TOBrace ii*)tok;_}whenTH.is_obracetok->letbody,closing,rest=look_close_bracex[]xsinBraces(x,body,closing),rest|{t=(*TOPar ii*)tok;_}whenTH.is_opartok->letbody,closing,rest=look_close_parenx[]xsinParens(x,body,closing),rest|{t=TInf_Template_ii;_}->letbody,closing,rest=look_close_templatex[]xsinAngle(x,body,closing),rest|x->Tokx,xsandauxxs=matchxswith|[]->[]|x::xs->letx',xs'=consumexxsinx'::auxxs'andlook_close_bracetok_startaccbodyxs=matchxswith|[]->raise(UnclosedSymbol(spf"PB look_close_brace (started at %d)"(TH.line_of_toktok_start.t)))|x::xs->(matchxwith|{t=TCBrace_ii;_}->List.revaccbody,Somex,xs(* Many macros have unclosed '{'. An alternative
* would be to work on a view where define has been filtered
*)|{t=TCommentNewline_DefineEndOfMacro_ii;_}->List.revaccbody,None,x::xs|_->let(x',xs')=consumexxsinlook_close_bracetok_start(x'::accbody)xs')andlook_close_parentok_startaccbodyxs=matchxswith|[]->raise(UnclosedSymbol(spf"PB look_close_paren (started at %d)"(TH.line_of_toktok_start.t)))|x::xs->(matchxwith|{t=(*TCPar ii*)tok;_}whenTH.is_cpartok->List.revaccbody,Somex,xs|_->let(x',xs')=consumexxsinlook_close_parentok_start(x'::accbody)xs')andlook_close_templatetok_startaccbodyxs=matchxswith|[]->raise(UnclosedSymbol(spf"PB look_close_template (started at %d)"(TH.line_of_toktok_start.t)))|x::xs->(matchxwith|{t=TSup_Template_ii;_}->List.revaccbody,Somex,xs|_->let(x',xs')=consumexxsinlook_close_templatetok_start(x'::accbody)xs')inauxxsletsplit_commaxs=xs|>Common2.split_gen_when(function|Tok{t=TComma_;_}::xs->Somexs|_->None)(*****************************************************************************)(* View iterators *)(*****************************************************************************)letreciter_token_parenfxs=xs|>List.iter(function|PTokentok->ftok;|Parenthised(xxs,info_parens)->info_parens|>List.iterf;xxs|>List.iter(funxs->iter_token_parenfxs))letreciter_token_bracefxs=xs|>List.iter(function|BTokentok->ftok;|Braceised(xxs,tok1,tok2opt)->ftok1;do_optionftok2opt;xxs|>List.iter(funxs->iter_token_bracefxs))letreciter_token_ifdeffxs=xs|>List.iter(function|NotIfdefLinexs->xs|>List.iterf;|Ifdefbool(_,xxs,info_ifdef)|Ifdef(xxs,info_ifdef)->info_ifdef|>List.iterf;xxs|>List.iter(iter_token_ifdeff))letreciter_token_multifxs=xs|>List.iter(function|Tokt->ft|Braces(t1,xs,t2)|Parens(t1,xs,t2)|Angle(t1,xs,t2)->ft1;iter_token_multifxs;Common.do_optionft2)lettokens_of_parenxs=letg=ref[]inxs|>iter_token_paren(funtok->pushtokg);List.rev!glettokens_of_paren_orderedxs=letg=ref[]inletrecaux_tokens_ordered=function|PTokentok->pushtokg;|Parenthised(xxs,info_parens)->let(opar,cpar,commas)=matchinfo_parenswith|opar::xs->(matchList.revxswith|cpar::xs->opar,cpar,List.revxs|_->raiseImpossible)|_->raiseImpossibleinpushoparg;aux_args(xxs,commas);pushcparg;andaux_args(xxs,commas)=matchxxs,commaswith|[],[]->()|[xs],[]->xs|>List.iteraux_tokens_ordered|xs::ys::xxs,comma::commas->xs|>List.iteraux_tokens_ordered;pushcommag;aux_args(ys::xxs,commas)|_->raiseImpossibleinxs|>List.iteraux_tokens_ordered;List.rev!glettokens_of_multi_groupedxs=letres=ref[]inletaddx=Common.pushxresinletrecauxxs=xs|>List.iter(function|Tokt1->addt1|Braces(t1,xs,t2)|Parens(t1,xs,t2)|Angle(t1,xs,t2)->addt1;auxxs;Common.do_optionaddt2)inauxxs;List.rev!res(*****************************************************************************)(* vof *)(*****************************************************************************)letvof_context=function|InTopLevel->Ocaml.VSum("T",[])|InClassStruct_s->Ocaml.VSum("C",[])|InEnum->Ocaml.VSum("E",[])|InInitializer->Ocaml.VSum("I",[])|InAssign->Ocaml.VSum("=",[])|InParameter->Ocaml.VSum("P",[])|InArgument->Ocaml.VSum("A",[])|InFunction->Ocaml.VSum("F",[])(*
| InTemplateParam -> Ocaml.VSum ("<>", [])
*)letvof_token_extendedt=letinfo=TH.info_of_tokt.tinletstr=PI.str_of_infoinfoinletxs=List.mapvof_contextt.whereinOcaml.VTuple[Ocaml.VStringstr;Ocaml.VListxs]letrecvof_multi_grouped=function|Braces((v1,v2,v3))->letv1=vof_token_extendedv1andv2=Ocaml.vof_listvof_multi_groupedv2andv3=Ocaml.vof_optionvof_token_extendedv3inOcaml.VSum(("Braces",[v1;v2;v3]))|Parens((v1,v2,v3))->letv1=vof_token_extendedv1andv2=Ocaml.vof_listvof_multi_groupedv2andv3=Ocaml.vof_optionvof_token_extendedv3inOcaml.VSum(("Parens",[v1;v2;v3]))|Angle((v1,v2,v3))->letv1=vof_token_extendedv1andv2=Ocaml.vof_listvof_multi_groupedv2andv3=Ocaml.vof_optionvof_token_extendedv3inOcaml.VSum(("Angle",[v1;v2;v3]))|Tokv1->letv1=vof_token_extendedv1inOcaml.VSum(("Tok",[v1]))letvof_multi_grouped_listxs=letv=Ocaml.VList(xs|>List.mapvof_multi_grouped)inv