1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
open Core
exception Not_enough_rows
type row_processor =
Csv_common.Or_file.t
-> skip_header:bool
-> sep:char
-> f:(string array -> unit)
-> [ `Limit_to of string list | `All_but of string list ]
-> unit
let load_rows ?on_invalid_row file ~sep ~f =
let process ic =
protectx
ic
~f:(fun ic ->
Delimited_kernel.Read.fold_lines
?on_invalid_row
~sep
~header:`No
Delimited_kernel.Read.Row.builder
ic
~init:()
~f:(fun () row -> f (Delimited_kernel.Read.Row.to_array row)))
~finally:In_channel.close
in
match (file : Csv_common.Or_file.t) with
| Csv { ; lines } ->
List.iter (header :: lines) ~f:(fun row -> Array.of_list row |> f)
| Stdin | File "-" -> process In_channel.stdin
| File x -> process (In_channel.create x)
;;
let get_positions =
match headers_wanted with
| `Limit_to ->
let hmap =
Map.of_iteri_exn
(module String)
~iteri:(fun ~f ->
Array.iteri header_row ~f:(fun i h -> f ~key:h ~data:i) [@nontail])
in
Array.of_list headers
|> Array.map ~f:(fun ->
match Map.find hmap header with
| Some position -> position
| None ->
(try Int.of_string header with
| _ -> raise_s [%message "Unknown header" ~_:(header : string)]))
| `All_but ->
let = String.Set.of_list headers in
Array.filter_mapi header_row ~f:(fun pos ->
if Set.mem headers header then None else Some pos)
;;
let index_positions =
match headers_wanted with
| `Limit_to -> Array.of_list_map headers ~f:Int.of_string
| `All_but ->
let = List.map headers ~f:Int.of_string |> Int.Set.of_list in
Array.filter_mapi header_row ~f:(fun pos _ ->
if Set.mem headers pos then None else Some pos)
;;
let cut_by_fields file ~ ~ ~sep ~f =
let handle_row = ref (fun _ -> assert false) in
(handle_row
:= fun row ->
let positions =
match consume_header_names with
| true -> get_positions row headers_wanted
| false -> index_positions row headers_wanted
in
let grab row =
Array.map positions ~f:(fun i -> if Array.length row < i then "" else row.(i))
in
if not skip_header then f (grab row);
handle_row := fun row -> f (grab row));
load_rows file ~sep ~f:(fun row -> !handle_row row)
;;
let cut_by_field_names = cut_by_fields ~consume_header_names:true
let cut_by_field_indices = cut_by_fields ~consume_header_names:false
exception First_row of string list
let field_names ~sep file =
let throw_row row = raise (First_row (Array.to_list row)) in
try
load_rows
~on_invalid_row:
(Delimited_kernel.Read.On_invalid_row.create (fun ~line_number:_ _ _ exn ->
match exn with
| First_row _ -> `Raise exn
| _ -> `Fallback Delimited_kernel.Read.On_invalid_row.raise))
file
~sep
~f:throw_row;
raise Not_enough_rows
with
| First_row row -> row
;;
let regex_match file ~sep ~f ~regexp =
let handle_row rows =
let grab rows =
Array.map rows ~f:(fun row -> if Pcre.pmatch ~rex:regexp row then row else "")
in
f (grab rows)
in
load_rows file ~sep ~f:handle_row
;;
let split_populated_rows file ~ ~sep ~f_populated ~f_unpopulated
=
let handle_row = ref (fun _ -> assert false) in
(handle_row
:= fun row ->
let positions = get_positions row headers_wanted in
let populated row =
let n = Array.length row in
Array.for_all positions ~f:(fun i -> i < n && String.length row.(i) > 0)
in
if not skip_header
then (
f_populated row;
f_unpopulated row);
handle_row
:= fun row -> if populated row then f_populated row else f_unpopulated row);
load_rows file ~sep ~f:(fun row -> !handle_row row)
;;
let fully_populated_rows file ~ ~sep ~f =
split_populated_rows
file
~skip_header
~sep
~f_populated:f
~f_unpopulated:ignore
headers_wanted
;;
let not_fully_populated_rows file ~ ~sep ~f =
split_populated_rows
file
~skip_header
~sep
~f_populated:ignore
~f_unpopulated:f
headers_wanted
;;