123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194(* Yoann Padioleau
*
* Copyright (C) 2010 Facebook
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
* version 2.1 as published by the Free Software Foundation, with the
* special exception on linking described in file license.txt.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the file
* license.txt for more details.
*)openCommonmoduleDb=Database_code(*****************************************************************************)(* Prelude *)(*****************************************************************************)(*
* Inspired by 'tbgs' and big_grep at facebook.
* The trick is to build a giant string and run compiled-regexps
* on it. For each match have to go back to find the start and
* end of entity, or the entity number so can display
* the information associated with it. So need markers
* in the string.
*
* One-liner in perl by Erling:
* perl -e '$|++; open F,"/usr/share/dict/words"; { local $/; $all=<F>;
* } while(<STDIN>) { chomp; $w=$_; $n = 0; while($all =~ /$w.*/g) {
* print "$&\n"; last if ++$n>10; } print "[$w]\n"; }'
*
*)(*****************************************************************************)(* Types *)(*****************************************************************************)typeindex={big_string:string;pos_to_entity:(int,Db.entity)Hashtbl.t;case_sensitive:bool;}(* using \n is convenient so can allow regexp queries like
* employee.* without having the regexp engine to try to match
* the whole string; it will stop at the first \n.
*)letseparation_marker_char='\n'letempty_index()={big_string="";pos_to_entity=Hashtbl.create1;case_sensitive=false;}(*****************************************************************************)(* Helpers *)(*****************************************************************************)let(==~)=Common2.(==~)(*****************************************************************************)(* Naive version *)(*****************************************************************************)(* This is the naive version, just to have a baseline for benchmarks *)letnaive_top_n_search2~top_n~queryxs=letre=Str.regexp(".*"^query)inletrecaux~nxs=ifn=top_nthen[]else(matchxswith|[]->[]|e::xs->ife.Db.e_name==~rethene::aux~n:(n+1)xselseaux~nxs)inaux~n:0xsletnaive_top_n_search~top_n~queryidx=Common.profile_code"Big_grep.naive_top_n"(fun()->naive_top_n_search2~top_n~queryidx)(*****************************************************************************)(* Main entry point *)(*****************************************************************************)letbuild_index2?(case_sensitive=false)entities=letbuf=Buffer.create20_000_000inleth=Hashtbl.create1001inletcurrent=ref0inentities|>List.iter(fune->(* Use fullname ? The caller, that is for instance
* files_and_dirs_and_sorted_entities_for_completion
* should have done the job of putting the fullename in e_name.
*)lets=Common2.string_of_charseparation_marker_char^e.Db.e_nameinlets=ifcase_sensitivethenselseString.lowercase_asciisinBuffer.add_stringbufs;Hashtbl.addh!currente;current:=!current+String.lengths;);(* just to make it easier to code certain algorithms such as
* find_position_marker_after
*)Buffer.add_stringbuf(Common2.string_of_charseparation_marker_char);{big_string=Buffer.contentsbuf;pos_to_entity=h;case_sensitive=case_sensitive;}letbuild_index?case_sensitivea=Common.profile_code"Big_grep.build_idx"(fun()->build_index2?case_sensitivea)letfind_position_marker_beforestart_posstr=letpos=ref(start_pos-1)inwhileString.getstr!pos<>separation_marker_chardopos:=!pos-1done;!posletfind_position_marker_afterstart_posstr=letpos=ref(start_pos+1)inwhileString.getstr!pos<>separation_marker_chardopos:=!pos+1done;!pos(* the query can now contain multipe words *)lettop_n_search2~top_n~queryidx=letquery=ifidx.case_sensitivethenqueryelseString.lowercase_asciiqueryinletwords=Str.split(Str.regexp"[ \t]+")queryinletre=matchwordswith|[_]->Str.regexp(".*"^query)|[a;b]->Str.regexp(spf".*\\(%s.*%s\\)\\|\\(%s.*%s\\)"abba)|_->failwith"more-than-2-words query is not supported; give money to pad"inletrecaux~n~pos=ifn=top_nthen[]elsetryletnew_pos=Str.search_forwardreidx.big_stringposin(* let's found the marker *)letpos_mark=find_position_marker_beforenew_posidx.big_stringinletpos_next_mark=find_position_marker_afternew_posidx.big_stringinlete=Hashtbl.findidx.pos_to_entitypos_markine::aux~n:(n+1)~pos:pos_next_markwithNot_found->[]inaux~n:0~pos:0lettop_n_search~top_n~queryidx=Common.profile_code"Big_grep.top_n"(fun()->top_n_search2~top_n~queryidx)