1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
open Forester_core
open Spelll
open struct module T = Forester_core.Types end
(* Set of (positions, uri) pairs recorded for a single token.
   Equality is keyed on the URI alone, so each document appears at most
   once per token; the [int list list] payload (presumably token
   positions within the document — TODO confirm against Tokenizer) is
   ignored by [compare]. *)
module Ocurrences = Set.Make(struct
type t = int list list * URI.t
let compare (_i, x) (_j, y) = URI.compare x y
end)
(* Full-text search index.
   - [index]: Spelll lookup index mapping each token to its occurrence set.
   - [number_of_tokens]: count of *distinct* tokens ever inserted into
     [index] — [add_one] increments it only when a token is seen for the
     first time, not per occurrence.
   - [number_of_docs]: number of articles folded into the index. *)
type t = {
index: Ocurrences.t Index.t;
number_of_tokens: int;
number_of_docs: int;
}
(* Mean number of tokens per indexed document.  Note that
   [number_of_tokens] counts distinct tokens (see [add_one]), so this is
   average vocabulary contribution rather than average raw length.
   NOTE(review): yields nan (0/0) on an empty index — callers should not
   consult this before any document has been added. *)
let average_doc_length : t -> float =
  fun t ->
    Float.of_int t.number_of_tokens /. Float.of_int t.number_of_docs
(* Fold a single article into the index.  Articles without a URI are
   returned unchanged.  Every token of the article is inserted together
   with its position payload and the article's URI; [number_of_tokens]
   grows only when a token is seen for the first time. *)
let add_one : T.content T.article -> t -> t =
  fun article t ->
    match T.(article.frontmatter.uri) with
    | None -> t
    | Some uri ->
      (* [fresh] counts tokens newly added to the index by this article. *)
      let insert (index, fresh) (positions, token) =
        (* ~limit:0 asks Spelll for exact matches only, so at most one
           occurrence set can come back for a token. *)
        match Index.retrieve_l ~limit: 0 index token with
        | [] ->
          let initial = Ocurrences.singleton ([positions], uri) in
          (Index.add index token initial, fresh + 1)
        | [existing] ->
          (Index.add index token (Ocurrences.add ([positions], uri) existing), fresh)
        | _ ->
          assert false
      in
      let new_index, fresh_tokens =
        List.fold_left insert (t.index, 0) (Tokenizer.tokenize_article article)
      in
      {
        index = new_index;
        number_of_docs = t.number_of_docs + 1;
        number_of_tokens = t.number_of_tokens + fresh_tokens;
      }
(* Index a batch of articles.  The list is folded right-to-left, so the
   last article of the list is inserted into the index first. *)
let add : T.content T.article list -> t -> t =
  fun articles t -> List.fold_right add_one articles t
(* Look up [term] in the index.  [term] is tokenized and each token is
   searched independently; the result is the concatenation of the
   occurrence entries of every matching token.  [fuzz] is forwarded to
   Spelll as the maximum edit distance (0 = exact match). *)
let search : ?fuzz: int -> t -> string -> (int list list * URI.t) list =
  fun ?(fuzz = 0) t term ->
    let lookup token =
      Index.retrieve_l ~limit: fuzz t.index token
      |> List.concat_map Ocurrences.to_list
    in
    List.concat_map lookup (Tokenizer.tokenize term)
module BM_25 = struct
  (* Okapi BM25+ ranking (Robertson & Zaragoza, "The Probabilistic
     Relevance Framework: BM25 and Beyond"). *)

  let sum = List.fold_left (+.) 0.

  (* Inverse document frequency of query term [q]:
       idf(q) = ln (1 + (N - n + 0.5) / (n + 0.5))
     where N is the corpus size and n the number of index entries
     matching [q].  The 0.5 smoothing in the denominator also avoids a
     division by zero for unseen terms.
     Fix: the previous version divided by [n] alone and added the 0.5
     outside the division (operator-precedence slip). *)
  let idf q (index : t) =
    let n = Float.of_int @@ List.length @@ search ~fuzz: 0 index q in
    let big_n = Float.of_int index.number_of_docs in
    log (1. +. ((big_n -. n +. 0.5) /. (n +. 0.5)))

  (* Length of document [d] in tokens, as produced by the tokenizer. *)
  let doc_length d =
    Float.of_int @@
    List.length @@
    Tokenizer.tokenize_article d

  (* BM25+ score of document [d] for query string [q] against [index].
     k_1 and b are the usual free parameters; the trailing [+. 1.] is the
     BM25+ lower-bounding delta.
     Fixes: the term numerator is now [tf *. (k_1 +. 1.)] (previously
     [tf *. k_1 +. 1.], a precedence slip), and idf is computed per query
     token [q_i] as BM25 prescribes (previously the whole query [q]).
     NOTE(review): [num_occurrences] comes from a corpus-wide [search],
     i.e. it counts index entries matching the term, not the term
     frequency within [d] as BM25 prescribes — confirm this is intended.
     @raise Assert_failure if [q] tokenizes to the empty list. *)
  let score : T.content T.article -> string -> t -> float =
    fun d q index ->
      let tokens = Tokenizer.tokenize q in
      assert (List.length tokens > 0);
      let avg_len = average_doc_length index in
      let k_1 = 1.5 in
      let b = 0.75 in
      sum @@
      List.map
        (fun q_i ->
           let num_occurrences =
             Float.of_int @@
             List.length @@ search index q_i
           in
           (* idf(q_i) * (tf*(k1+1) / (tf + k1*(1 - b + b*|d|/avgdl)) + 1) *)
           idf q_i index *.
           ((num_occurrences *. (k_1 +. 1.)) /.
            (num_occurrences +.
             k_1 *. (1. -. b +. (b *. doc_length d /. avg_len))) +.
            1.))
        tokens
end
(* Build a fresh index from a list of articles, starting from the empty
   Spelll index with zeroed counters. *)
let create articles =
  add articles { index = Index.empty; number_of_docs = 0; number_of_tokens = 0 }
(* Serialize the index [v] to [filename] in the stdlib Marshal format.
   The output channel is closed even if marshalling raises. *)
let marshal (v : t) filename =
  let oc = open_out_bin filename in
  let write () = Marshal.to_channel oc v [] in
  Fun.protect ~finally: (fun () -> close_out oc) write
(* Read an index back from [filename].  The input channel is closed even
   if reading raises.
   NOTE(review): [Marshal.from_channel] is unsafe on untrusted input — a
   corrupted or hostile file can crash the process or produce arbitrary
   values.  Only unmarshal files produced by [marshal]. *)
let unmarshal filename : t =
  let ic = open_in_bin filename in
  let read () : t = Marshal.from_channel ic in
  Fun.protect ~finally: (fun () -> close_in ic) read