Source file `big_grep.ml`

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
(* Yoann Padioleau
 *
 * Copyright (C) 2010 Facebook
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * version 2.1 as published by the Free Software Foundation, with the
 * special exception on linking described in file license.txt.
 * 
 * This library is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the file
 * license.txt for more details.
 *)
open Common

module Db = Database_code

(*****************************************************************************)
(* Prelude *)
(*****************************************************************************)
(*
 * Inspired by 'tbgs' and big_grep at facebook.
 * The trick is to build one giant string and run compiled regexps
 * on it. For each match we have to go back to find the start and
 * end of the entity (or the entity number) so that we can display
 * the information associated with it. Hence the need for markers
 * in the string.
 * 
 * One-liner in perl by Erling:
 * perl -e '$|++; open F,"/usr/share/dict/words"; { local $/; $all=<F>;
 * } while(<STDIN>) { chomp; $w=$_; $n = 0; while($all =~ /$w.*/g) {
 * print "$&\n"; last if ++$n>10; } print "[$w]\n"; }'
 * 
 *)

(*****************************************************************************)
(* Types *)
(*****************************************************************************)

type index = {
  (* when produced by build_index: every entity name, each one
   * preceded by the separation marker, concatenated into a single
   * string and terminated by one extra marker
   *)
  big_string: string;
  (* offset in big_string of the marker preceding an entity name ->
   * that entity
   *)
  pos_to_entity: (int, Db.entity) Hashtbl.t;
  (* when false, build_index lowercases the names and the indexed
   * search lowercases the query
   *)
  case_sensitive: bool;
}

(* Character placed in front of every entity name in the big string.
 * Using \n is convenient: it allows regexp queries like
 * employee.* without having the regexp engine try to match the
 * rest of the whole string, because a dot never matches \n and so
 * the match stops at the first \n.
 *)
let separation_marker_char = '\n'

(* A fresh index holding no entity at all: empty string, empty
 * position table, case insensitive. A new table is allocated on
 * every call.
 *)
let empty_index () =
  let pos_to_entity = Hashtbl.create 1 in
  { big_string = ""; pos_to_entity; case_sensitive = false }

(*****************************************************************************)
(* Helpers *)
(*****************************************************************************)

(* Local alias so the operator can be written infix in this file.
 * It is applied below to a string and a compiled Str regexp and
 * yields a bool; presumably a regexp match test -- confirm in Common2.
 *)
let ( ==~ ) s re = Common2.( ==~ ) s re

(*****************************************************************************)
(* Naive version *)
(*****************************************************************************)

(* Baseline implementation, kept only as a reference point for
 * benchmarks. It walks the entity list xs in order and keeps the
 * first top_n entities whose e_name is accepted by ==~ against the
 * regexp made of an any-characters prefix followed by query.
 * The query is used as is: no lowercasing, no splitting into words.
 *)
let naive_top_n_search2 ~top_n ~query xs =
  let re = Str.regexp (".*" ^ query) in
  (* found = number of entities already kept *)
  let rec collect found = function
    | _ when found = top_n -> []
    | [] -> []
    | e :: rest when e.Db.e_name ==~ re -> e :: collect (found + 1) rest
    | _ :: rest -> collect found rest
  in
  collect 0 xs


(* Same as naive_top_n_search2, run through Common.profile_code under
 * the label Big_grep.naive_top_n. Despite its name, idx is handed
 * over as the plain entity list expected by naive_top_n_search2.
 *)
let naive_top_n_search ~top_n ~query idx =
  let run () = naive_top_n_search2 ~top_n ~query idx in
  Common.profile_code "Big_grep.naive_top_n" run

(*****************************************************************************)
(* Main entry point *)
(*****************************************************************************)

(* Builds the search index for the given entities.
 *
 * Every entity contributes one chunk, marker followed by e_name, to
 * the big string (lowercased unless case_sensitive is true), and the
 * offset of that chunk, i.e. the offset of its marker, is recorded
 * in the position table. One last marker closes the string.
 *)
let build_index2 ?(case_sensitive=false) entities =
  let marker = Common2.string_of_char separation_marker_char in
  let normalize =
    if case_sensitive then (fun s -> s) else String.lowercase_ascii
  in

  (* generous initial capacity hints *)
  let buf = Buffer.create 20_000_000 in
  let pos_to_entity = Hashtbl.create 1001 in

  entities |> List.iter (fun e ->
    (* Use fullname ? The caller, that is for instance
     * files_and_dirs_and_sorted_entities_for_completion,
     * should have done the job of putting the full name in e_name.
     *)
    let chunk = normalize (marker ^ e.Db.e_name) in
    (* the buffer length before appending is where this chunk starts *)
    Hashtbl.add pos_to_entity (Buffer.length buf) e;
    Buffer.add_string buf chunk
  );
  (* a closing marker, just to make it easier to code certain
   * algorithms such as find_position_marker_after
   *)
  Buffer.add_string buf marker;

  { big_string = Buffer.contents buf; pos_to_entity; case_sensitive }

(* Same as build_index2, run through Common.profile_code under the
 * label Big_grep.build_idx. The optional case_sensitive argument is
 * forwarded untouched.
 *)
let build_index ?case_sensitive a =
  let run () = build_index2 ?case_sensitive a in
  Common.profile_code "Big_grep.build_idx" run


(* Offset of the closest separation marker located strictly before
 * start_pos in str. The scan walks backward from start_pos - 1;
 * String.get raises Invalid_argument if it runs off the string
 * without meeting a marker.
 *)
let find_position_marker_before start_pos str =
  let rec scan i =
    if String.get str i = separation_marker_char
    then i
    else scan (i - 1)
  in
  scan (start_pos - 1)

(* Offset of the closest separation marker located strictly after
 * start_pos in str. The scan walks forward from start_pos + 1;
 * String.get raises Invalid_argument if it runs off the string
 * without meeting a marker.
 *)
let find_position_marker_after start_pos str =
  let rec scan i =
    if String.get str i = separation_marker_char
    then i
    else scan (i + 1)
  in
  scan (start_pos + 1)

(* Indexed search: returns, in index order, at most top_n entities
 * whose name matches query. The query can contain multiple words.
 *
 * - query is lowercased first when the index is case insensitive.
 * - query is split on spaces and tabs. A single word is used as a
 *   regexp; two words a and b select names containing a then b, or
 *   b then a. A blank query yields the empty list.
 * - raises Failure when the query has more than two words.
 *
 * A lookup miss in the position table ends the search with the
 * results collected so far, as does the absence of further matches.
 *)
let top_n_search2 ~top_n ~query idx =

  let query = 
    if idx.case_sensitive then query else String.lowercase_ascii query
  in
  let text = idx.big_string in
  (* build_index ends the string with a marker: no entity name can
   * start at or after that offset
   *)
  let last_pos = String.length text - 1 in

  let re_opt =
    match Str.split (Str.regexp "[ \t]+") query with
    | [] -> None
    (* use the split word, not the raw query, so that leading or
     * trailing blanks do not end up inside the regexp
     *)
    | [w] -> Some (Str.regexp (".*" ^ w))
    | [a;b] -> 
        Some (Str.regexp (spf 
                             ".*\\(%s.*%s\\)\\|\\(%s.*%s\\)"
                             a b b a))
    | _ -> 
        failwith "more-than-2-words query is not supported; give money to pad"
  in

  match re_opt with
  | None -> []
  | Some re ->
      let search_from pos =
        try Some (Str.search_forward re text pos)
        with Not_found -> None
      in
      let entity_at pos_mark =
        try Some (Hashtbl.find idx.pos_to_entity pos_mark)
        with Not_found -> None
      in
      let rec aux ~n ~pos =
        if n = top_n
        then []
        else
          match search_from pos with
          | None -> []
          (* a match on or past the closing marker belongs to no entity *)
          | Some new_pos when new_pos >= last_pos -> []
          (* the regexp matched the empty string or a newline right on
           * a marker: looking for the marker before it would pick the
           * wrong entity or run off the string, so resume just after
           *)
          | Some new_pos 
            when String.get text new_pos = separation_marker_char ->
              aux ~n ~pos:(new_pos + 1)
          | Some new_pos ->
              (* let's find the markers surrounding the match *)
              let pos_mark = find_position_marker_before new_pos text in
              let pos_next_mark = find_position_marker_after new_pos text in
              (match entity_at pos_mark with
              | None -> []
              | Some e -> e::aux ~n:(n+1) ~pos:pos_next_mark
              )
      in
      aux ~n:0 ~pos:0


(* Same as top_n_search2, run through Common.profile_code under the
 * label Big_grep.top_n.
 *)
let top_n_search ~top_n ~query idx =
  let run () = top_n_search2 ~top_n ~query idx in
  Common.profile_code "Big_grep.top_n" run