123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678(* Yoann Padioleau
*
* Copyright (C) 2012 Facebook
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
* version 2.1 as published by the Free Software Foundation, with the
* special exception on linking described in file license.txt.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the file
* license.txt for more details.
*)openCommonmoduleE=Entity_codemoduleG=Graphe(*****************************************************************************)(* Prelude *)(*****************************************************************************)(*
* A program can be seen as a hierarchy of entities
* (directory/package/module/file/class/function/method/field/...)
* linked to each other through different mechanisms
* (import/reference/extend/implement/instantiate/call/access/...).
* This module is the basis for 'codegraph', a tool to help
* visualize code dependencies or code relationships.
* It provides one of the core data structure of codegraph
* an (hyper)graph of all the entities in a program linked
* either via a 'has-a' relation, which represent the
* hierarchies (in the sense of containment, not inheritance), or
* 'use-a', which represent the dependencies
* (the other core data structure of codegraph is in
* dependencies_matrix_code.ml).
*
* Is this yet another code database? For PHP we already have
* database_php.ml, tags_php.ml, database_light_php.ml,
* and now even a Prolog database, ... that's a lot of code database.
* They all have things in common, but by focusing here on one thing,
* by just having a single graph, it's then
* easier to reason and implement certain features.
* I could have probably done the DSM using database_php.ml
* but it was not made for that. Here the graph is
* the core and simplest data structure that is needed.
*
* This graph also unifies many things. For instance there is no
* special code to handle directories or files, they are
* just considered regular entities like module or classes
* and can have sub-entities. Moreover like database_light.ml,
* this file is language independent so one can have one tool
* that can handle ML, PHP, C++, etc.
*
* todo:
* - how to handle duplicate entities (e.g. we can have two different
* files with the same module name, or two functions with the same
* name but one in a library and the other in a script).
* prepend a ___number suffix?
* Or just have one node with multiple parents :) But having
* multiple parents would not solve the problem because then
* an edge will increment unrelated cells in the DSM.
*
* - change API to allow by default to automatically create nodes
* when create edges with unexisting nodes? After all graphviz
* allow to specify graphs like this, which shorten graph
* description significantly. Can still have a
* add_edge_throw_exn_if_not_present for the cases where we
* want extra security.
*
* - maybe I can generate the light database from this graph_code.ml
* (I already do a bit for prolog with graph_code_prolog.ml)
*
* - opti: faster implem of parent? have a lock_graph() that forbid any
* further modifications on Has but then provide optimized operations
* like parent the precompute or memoize the parent relation
*
* related work:
* - grok: by steve yegge http://www.youtube.com/watch?v=KTJs-0EInW8
*)(*****************************************************************************)(* Types *)(*****************************************************************************)typenode=string*E.entity_kindtypeedge=(* a package Has subpackages, a subpackage Has classes, a class Has members,
* etc *)|Has(* A class Use(extends) another class, a method Use(calls) another method,
* etc.
* todo? refine by having different cases? Use of `Call|`Extend|...?
* I didn't feel the need yet, because it's easy to know if it's
* a Call or Extend by looking at the src and dst of the edge.
* But it could be useful for instance for field access to know
* weather it's a read or write access! Instead of having a variant
* here one could also have an edgeinfo.
*)|Usetypenodeinfo={pos:Parse_info.token_location;props:E.propertylist;(* would be better to have a more structured form than string at some point *)typ:stringoption;}(* could also have a pos: and props: here *)typeedgeinfo={write:bool;read:bool;}(*
* We use an imperative, directed, without intermediate node-index, graph.
*
* We use two different graphs because we need an efficient way to
* go up in the hierarchy to increment cells in the dependency matrix
* so it's better to separate the two usages.
*
* note: file information are in readable path format in Dir and File
* nodes (and should also be in readable format in the nodeinfo).
*)typegraph={(* Actually the Has graph should really be a tree, but we need convenient
* access to the children or parent of a node, which are provided
* by the graph API so let's reuse that.
*)has:nodeG.graph;(* The source and target should be enough information to understand
* the kind of Use. For instance a class referencing another class
* has to be an 'extends'. A class referencing an Interface has to
* be an 'implements'.
*)use:nodeG.graph;nodeinfo:(node,nodeinfo)Hashtbl.t;edgeinfo:((node*node*edge),edgeinfo)Hashtbl.t;}typeerror=|NodeAlreadyPresentofnodeexceptionErroroferror(* coupling: see print_statistics below *)typestatistics={parse_errors:Common.filenamelistref;(* could be Parse_info.token_location*)lookup_fail:(Parse_info.t*node)listref;method_calls:(Parse_info.t*resolved)listref;field_access:(Parse_info.t*resolved)listref;unresolved_class_access:Parse_info.tlistref;unresolved_calls:Parse_info.tlistref;}andresolved=boolletempty_statistics()={parse_errors=ref[];lookup_fail=ref[];method_calls=ref[];unresolved_calls=ref[];unresolved_class_access=ref[];field_access=ref[];}(* we sometimes want to collapse unimportant directories under a "..."
* fake intermediate directory. So one can create an adjust file with
* for instance:
* api -> extra/
* and we will delete the current parent of 'api' and relink it to the
* extra/ entity (possibly newly created)
*)typeadjust=(string*string)(* skip certain edges that are marked as ok regarding backward dependencies *)typedependency=(node*node)typewhitelist=dependencylist(*****************************************************************************)(* Constants *)(*****************************************************************************)letroot=".",E.Dirletpb="PB",E.Dirletnot_found="NOT_FOUND",E.Dirletdupe="DUPE",E.Dirlet_stdlib="STDLIB",E.Dir(*****************************************************************************)(* Debugging *)(*****************************************************************************)letstring_of_node(s,kind)=E.string_of_entity_kindkind^": "^sletstring_of_error=function|NodeAlreadyPresentn->("Node already present: "^string_of_noden)letnode_of_strings=ifs=~"\\([^:]*\\):\\(.*\\)"thenlet(s1,s2)=Common.matched2sins2,E.entity_kind_of_strings1elsefailwith(spf"node_of_string: wrong format '%s'"s)letdisplay_with_gvg=(* TODO? use different colors for the different kind of edges? *)G.display_with_gvg.has(*****************************************************************************)(* Graph construction *)(*****************************************************************************)letcreate()={has=G.create();use=G.create();nodeinfo=Hashtbl.create101;edgeinfo=Hashtbl.create101;}letadd_nodeng=Common.profile_code"Graph_code.add_node"(fun()->ifG.has_nodeng.hasthenbeginpr2_genn;raise(Error(NodeAlreadyPresentn))end;ifG.has_nodeng.usethenbeginpr2_genn;raise(Error(NodeAlreadyPresentn))end;G.add_vertex_if_not_presentng.has;G.add_vertex_if_not_presentng.use;())letadd_edge(n1,n2)eg=Common.profile_code"Graph_code.add_edge"(fun()->matchewith|Has->G.add_edgen1n2g.has|Use->G.add_edgen1n2g.use)letremove_edge(n1,n2)eg=matchewith|Has->G.remove_edgen1n2g.has|Use->G.remove_edgen1n2g.useletadd_nodeinfoninfog=ifnot(G.has_nodeng.has)thenfailwith"unknown node";Hashtbl.replaceg.nodeinfoninfoletadd_edgeinfo(n1,n2)einfog=Hashtbl.replaceg.edgeinfo(n1,n2,e)info(*****************************************************************************)(* IO *)(*****************************************************************************)(* todo: what when have a .opti? cache_computation will shortcut us *)letversion=5letsavegfile=(* see ocamlgraph FAQ *)Common2.write_value(g,!Graph.Blocks.cpt_vertex,version)fileletloadfile=let(g,serialized_cpt_vertex,version2)=Common2.get_valuefileinifversion!=version2thenfailwith(spf"your marshalled file has an old version, delete it");Graph.Blocks.after_unserializationserialized_cpt_vertex;gletdefault_filename="graph_code.marshall"(*****************************************************************************)(* Iteration *)(*****************************************************************************)letiter_use_edgesfg=G.iter_edgesfg.useletiter_nodesfg=G.iter_nodesfg.hasletall_use_edgesg=letres=ref[]inG.iter_edges(funn1n2->Common.push(n1,n2)res)g.use;!resletall_nodesg=letres=ref[]inG.iter_nodes(funn->Common.pushnres)g.has;!res(*****************************************************************************)(* Graph access *)(*****************************************************************************)lethas_nodeng=G.has_nodeng.hasletpredneg=Common.profile_code"Graph_code.pred"(fun()->matchewith|Has->G.predng.has|Use->G.predng.use)letsuccneg=matchewith|Has->G.succng.has|Use->G.succng.use(* the default implementation of a graph in ocamlgraph is good to
* get the successor but not good at all for the predecessors
* so if you need to use pred many times, use this precomputation
* function.
*)letmk_eff_use_predg=(* we use its find_all property *)leth=Hashtbl.create101ing|>iter_nodes(funn1->letuses=succn1Useginuses|>List.iter(funn2->Hashtbl.addhn2n1));(funn->Hashtbl.find_allhn)letparentng=Common.profile_code"Graph_code.parent"(fun()->letxs=G.predng.hasinCommon2.list_to_single_or_exnxs)letparentsng=Common.profile_code"Graph_code.parents"(fun()->G.predng.has)letchildrenng=G.succng.hasletrecnode_and_all_childrenng=letxs=G.succng.hasinifnullxsthen[n]elsen::(xs|>List.map(funn->node_and_all_childrenng)|>List.flatten)letnb_nodesg=G.nb_nodesg.hasletnb_use_edgesg=G.nb_edgesg.useletnodeinfong=Hashtbl.findg.nodeinfonletnodeinfo_optng=trySome(nodeinfong)withNot_found->Noneletedgeinfo_opt(n1,n2)eg=trySome(Hashtbl.findg.edgeinfo(n1,n2,e))withNot_found->None(* todo? assert it's a readable path? graph_code_php.ml is using readable
* path now but the other might not yet or it can be sometimes convenient
* also to have absolute path here, so not sure if can assert anything.
*)letfile_of_nodeng=tryletinfo=nodeinfongininfo.pos.Parse_info.filewithNot_found->(matchnwith|str,E.File->str|_->raiseNot_found(* todo: BAD no? *)(* spf "NOT_FOUND_FILE (for node %s)" (string_of_node n) *))letprivacy_of_nodeng=letinfo=nodeinfonginletprops=info.propsinprops|>Common.find_some(function|E.Privacyx->Somex|_->None)(* see also Graph_code_class_analysis.class_method_of_string *)letshortname_of_node(s,_kind)=letxs=Common.split"[.]"sinlets=Common2.list_lastxsin(* undo what was in gensym, otherwise codemap for instance will not
* recognize the entity as one hovers on its name in a file. *)lets=ifs=~"\\(.*\\)__[0-9]+"thenCommon.matched1selsesinlets=(* see graph_code_clang.ml handling of struct/typedef/unions *)ifs=~"^[STU]__\\(.*\\)"thenbegin(* assert (kind =*= E.Type);, hmm have constructor like T__AT *)Common.matched1sendelsesinsletcnt=ref0(* when we have static entities, or main(), we rename them locally
* and add a unique __xxx suffix, to avoid DUPES.
*)letgensyms=incrcnt;spf"%s__%d"s!cnt(*****************************************************************************)(* Helpers *)(*****************************************************************************)letcreate_intermediate_directories_if_not_presentgdir=letdirs=Common2.inits_of_relative_dirdirinletrecauxcurrentxs=matchxswith|[]->()|x::xs->letentity=x,E.Dirinifhas_nodeentitygthenauxentityxselsebeging|>add_nodeentity;g|>add_edge(current,entity)Has;auxentityxsendinauxrootdirsletcreate_initial_hierarchyg=g|>add_noderoot;g|>add_nodepb;g|>add_nodenot_found;g|>add_nodedupe;(* g +> add_node stdlib;*)g|>add_edge(root,pb)Has;g|>add_edge(pb,dupe)Has;g|>add_edge(pb,not_found)Has;(* g +> add_edge (root, stdlib) Has;*)()letremove_empty_nodesgxs=letuse_pred=mk_eff_use_predginxs|>List.iter(funn->ifsuccnUseg=[]&&use_predn=[]thenbegin(* less: could also remove the node? but slow? removing the edge
* should be enough for what we want (avoid clutter in codegraph)
*)remove_edge(parentng,n)Hasg;end)letbasename_to_readable_disambiguatorxs~root=letxs=xs|>List.map(Common.readable~root)in(* use the Hashtbl.find_all property of this hash *)leth=Hashtbl.create101inxs|>List.iter(funfile->Hashtbl.addh(Filename.basenamefile)file);(funfile->Hashtbl.find_allhfile)(*****************************************************************************)(* Misc *)(*****************************************************************************)letgroup_edges_by_files_edgesxsg=xs|>Common2.group_by_mapped_key(fun(n1,n2)->(file_of_noden1g,file_of_noden2g))|>List.map(fun(x,deps)->List.lengthdeps,(x,deps))|>Common.sort_by_key_highfirst|>List.mapsnd(*****************************************************************************)(* Graph algorithms *)(*****************************************************************************)letstrongly_connected_components_use_graphg=let(scc,hscc)=G.strongly_connected_componentsg.useinscc,hscclettop_down_numberingg=let(scc,hscc)=G.strongly_connected_componentsg.useinletg2=G.strongly_connected_components_condensationg.use(scc,hscc)inlethdepth=G.depth_nodesg2inlethres=Hashtbl.create101inhdepth|>Hashtbl.iter(funkv->letnodes_at_k=scc.(k)innodes_at_k|>List.iter(funn->Hashtbl.addhresnv));hresletbottom_up_numberingg=let(scc,hscc)=G.strongly_connected_componentsg.useinletg2=G.strongly_connected_components_condensationg.use(scc,hscc)inletg3=G.mirrorg2inlethdepth=G.depth_nodesg3inlethres=Hashtbl.create101inhdepth|>Hashtbl.iter(funkv->letnodes_at_k=scc.(k)innodes_at_k|>List.iter(funn->Hashtbl.addhresnv));hres(*****************************************************************************)(* Graph adjustments *)(*****************************************************************************)letload_adjustfile=Common.catfile|>Common.exclude(funs->s=~"#.*"||s=~"^[ \t]*$")|>List.map(funs->matchswith|_whens=~"\\([^ ]+\\)[ ]+->[ ]*\\([^ ]+\\)"->Common.matched2s|_->failwith("wrong line format in adjust file: "^s))letload_whitelistfile=Common.catfile|>List.map(funs->ifs=~"\\(.*\\) --> \\(.*\\) "thenlet(s1,s2)=Common.matched2sinnode_of_strings1,node_of_strings2elsefailwith(spf"load_whitelist: wrong line: %s"s))letsave_whitelistxsfileg=Common.with_open_outfilefile(fun(pr_no_nl,_chan)->xs|>List.iter(fun(n1,n2)->letfile=file_of_noden2ginpr_no_nl(spf"%s --> %s (%s)\n"(string_of_noden1)(string_of_noden2)file);))(* Used mainly to collapse many entries under a "..." intermediate fake
* parent. Maybe this could be done automatically in codegraph at some point,
* like ndepend does I think.
*)letadjust_graphgxswhitelist=letmapping=Hashtbl.create101ing|>iter_nodes(fun(s,kind)->Hashtbl.addmappings(s,kind));xs|>List.iter(fun(s1,s2)->letnodes=Hashtbl.find_allmappings1inletnew_parent=(s2,E.Dir)increate_intermediate_directories_if_not_presentgs2;(matchnodeswith|[n]->letold_parent=parentnginremove_edge(old_parent,n)Hasg;add_edge(new_parent,n)Hasg;|[]->failwith(spf"could not find entity %s"s1)|_->failwith(spf"multiple entities with %s as a name"s1)));whitelist|>Console.progress~show:true(funk->List.iter(fun(n1,n2)->k();remove_edge(n1,n2)Useg;))(*****************************************************************************)(* Example *)(*****************************************************************************)(* assumes a "path/to/file.x" -> "path/to/file2.x" format *)letgraph_of_dotfiledotfile=letxs=Common.catdotfileinletdeps=xs|>Common.map_filter(funs->ifs=~"^\"\\(.*\\)\" -> \"\\(.*\\)\"$"thenlet(src,dst)=Common.matched2sinSome(src,dst)elsebeginpr2(spf"ignoring line: %s"s);Noneend)inletg=create()increate_initial_hierarchyg;(* step1: defs *)deps|>List.iter(fun(src,dst)->letsrcdir=Filename.dirnamesrcinletdstdir=Filename.dirnamedstintrycreate_intermediate_directories_if_not_presentgsrcdir;create_intermediate_directories_if_not_presentgdstdir;ifnot(has_node(src,E.File)g)thenbeging|>add_node(src,E.File);g|>add_edge((srcdir,E.Dir),(src,E.File))Has;end;ifnot(has_node(dst,E.File)g)thenbeging|>add_node(dst,E.File);g|>add_edge((dstdir,E.Dir),(dst,E.File))Has;end;withAssert_failure_->pr2_gen(src,dst););(* step2: use *)deps|>List.iter(fun(src,dst)->letsrc_node=(src,E.File)inletdst_node=(dst,E.File)ing|>add_edge(src_node,dst_node)Use;);g(*****************************************************************************)(* Statistics *)(*****************************************************************************)letprint_statisticsstatsg=pr(spf"nb nodes = %d, nb edges = %d"(nb_nodesg)(nb_use_edgesg));pr(spf"parse errors = %d"(!(stats.parse_errors)|>List.length));pr(spf"lookup fail = %d"(!(stats.lookup_fail)|>List.length));pr(spf"unresolved method calls = %d"(!(stats.method_calls)|>List.filter(fun(_,x)->notx)|>List.length));pr(spf"(resolved method calls = %d)"(!(stats.method_calls)|>List.filter(fun(_,x)->x)|>List.length));pr(spf"unresolved field access = %d"(!(stats.field_access)|>List.filter(fun(_,x)->notx)|>List.length));pr(spf"(resolved field access) = %d)"(!(stats.field_access)|>List.filter(fun(_,x)->x)|>List.length));pr(spf"unresolved class access = %d"(!(stats.unresolved_class_access)|>List.length));pr(spf"unresolved calls = %d"(!(stats.unresolved_calls)|>List.length));()