Source file dataset_utils.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
(* Runs once when this module is initialised: sets up libcurl's global state
   with the CURLINIT_GLOBALALL flag. *)
let () = Curl.(global_init CURLINIT_GLOBALALL)
(** [mkdir_p path perm] creates directory [path] together with any missing
    ancestors, passing [perm] to [Unix.mkdir] for every directory it creates.

    Nothing is done when [path] is [""], ["."] or [Filename.dir_sep]. Relative
    paths are resolved below ["."]; on Windows/Cygwin a leading drive prefix
    (["X:"]) is kept as the starting point.

    @raise Failure
      if a path component exists but is not a directory, or if [Unix.mkdir]
      fails for a reason other than the directory already existing. *)
let mkdir_p path perm =
  (* Make sure a single directory level exists, tolerating the case where
     something else creates it between the existence check and the mkdir. *)
  let ensure_dir dir =
    if Sys.file_exists dir then begin
      if not (Sys.is_directory dir) then
        failwith (Printf.sprintf "mkdir_p: '%s' exists but is not a directory" dir)
    end
    else
      match Unix.mkdir dir perm with
      | () -> ()
      | exception Unix.Unix_error (Unix.EEXIST, _, _) ->
          if not (Sys.is_directory dir) then
            failwith
              (Printf.sprintf "mkdir_p: '%s' appeared as non-directory file after EEXIST" dir)
      | exception Unix.Unix_error (err, fn, arg) ->
          failwith
            (Printf.sprintf "mkdir_p: Cannot create directory '%s': %s (%s %s)" dir
               (Unix.error_message err) fn arg)
      | exception other ->
          failwith
            (Printf.sprintf "mkdir_p: Unexpected error creating directory '%s': %s" dir
               (Printexc.to_string other))
  in
  (* Walk the components left to right, extending the prefix one level at a
     time and creating each level as we go. *)
  let rec descend prefix = function
    | [] -> ()
    | name :: rest ->
        let dir =
          if prefix = Filename.dir_sep then Filename.dir_sep ^ name
          else Filename.concat prefix name
        in
        ensure_dir dir;
        descend dir rest
  in
  if path = "" || path = "." || path = Filename.dir_sep then ()
  else
    let sep_char = Filename.dir_sep.[0] in
    let len = String.length path in
    let prefix, remainder =
      if (Sys.win32 || Sys.cygwin) && len >= 2 && path.[1] = ':' then
        (* Drive-qualified path: start from "X:" followed by the separator. *)
        (String.sub path 0 2 ^ Filename.dir_sep, if len > 3 then String.sub path 3 (len - 3) else "")
      else if path.[0] = sep_char then
        (Filename.dir_sep, if len > 1 then String.sub path 1 (len - 1) else "")
      else (".", path)
    in
    remainder
    |> String.split_on_char sep_char
    |> List.filter (fun component -> component <> "")
    |> descend prefix
(** Per-user filesystem locations used for the dataset cache. Both values are
    computed once, when the module is initialised. *)
module Xdg = struct
  (** The user's home directory, taken from the environment: [USERPROFILE] and
      then [HOMEPATH] on Windows/Cygwin, [HOME] elsewhere.

      @raise Failure at module initialisation if none of them is set. *)
  let home =
    let candidates, missing_msg =
      if Sys.win32 || Sys.cygwin then
        ( [ "USERPROFILE"; "HOMEPATH" ],
          "Neither USERPROFILE nor HOMEPATH environment variables are set." )
      else ([ "HOME" ], "HOME environment variable not set.")
    in
    (* Return the value of the first variable that is set, in list order. *)
    let rec first_set = function
      | [] -> failwith missing_msg
      | name :: rest -> (
          match Sys.getenv name with
          | value -> value
          | exception Not_found -> first_set rest)
    in
    first_set candidates

  (** Root directory of the dataset cache, always ending with
      [Filename.dir_sep]: [<home>/AppData/Local/ocannl/datasets/] on
      Windows/Cygwin and [<home>/.cache/ocannl/datasets/] elsewhere. *)
  let cache_base =
    let platform_dirs =
      if Sys.win32 || Sys.cygwin then [ "AppData"; "Local" ] else [ ".cache" ]
    in
    (* The trailing empty component yields the final separator. *)
    String.concat Filename.dir_sep ((home :: platform_dirs) @ [ "ocannl"; "datasets"; "" ])
end
(** [get_cache_dir dataset_name] is the cache directory for [dataset_name]:
    [Xdg.cache_base] followed by the name and a trailing [Filename.dir_sep].
    The directory is not created here. *)
let get_cache_dir dataset_name =
  String.concat "" [ Xdg.cache_base; dataset_name; Filename.dir_sep ]
(** [mkdir_p dir] shadows the two-argument [mkdir_p] defined earlier in this
    file, fixing the permissions to [0o755] and ignoring an [EEXIST] error.

    NOTE(review): the two-argument version appears to turn [Unix.Unix_error]
    into [Failure], so this handler may never fire — confirm before relying
    on it. *)
let mkdir_p dir =
  match mkdir_p dir 0o755 with
  | () -> ()
  | exception Unix.Unix_error (Unix.EEXIST, _, _) -> ()
(** [download_file url dest_path] downloads [url] with libcurl and writes the
    response body to [dest_path], creating the parent directory of
    [dest_path] first. Redirects are followed and the transfer is given a
    300-second timeout.

    The curl handle and the output channel are released on every exit path,
    including when an exception escapes. If the transfer fails, the final
    HTTP status is outside 200-299, or the data cannot be flushed to disk,
    the partially written file is removed.

    @raise Failure if the download did not complete successfully.
    @raise Sys_error if [dest_path] cannot be opened for writing. *)
let download_file url dest_path =
  mkdir_p (Filename.dirname dest_path);
  Printf.printf "Attempting to download %s to %s\n%!" (Filename.basename url) dest_path;
  let h = new Curl.handle in
  let result =
    (* Outer guard: the handle is cleaned up even if configuring it or opening
       the destination file raises. *)
    Fun.protect
      ~finally:(fun () -> h#cleanup)
      (fun () ->
        h#set_url url;
        h#set_followlocation true;
        h#set_timeout 300;
        h#set_useragent "ocannl-datasets/0.6.1";
        let oc = open_out_bin dest_path in
        (* Inner guard: the channel is always closed. [close_out_noerr] is used
           so the finaliser itself cannot raise; the success path flushes
           explicitly below so write errors are still reported. *)
        Fun.protect
          ~finally:(fun () -> close_out_noerr oc)
          (fun () ->
            try
              h#set_writefunction (fun s ->
                  output_string oc s;
                  String.length s);
              h#perform;
              let code = h#get_responsecode in
              if code >= 200 && code < 300 then (
                (* Flush inside the [try] so a failed flush is turned into an
                   [Error] and the truncated file is removed below. *)
                close_out oc;
                Ok ())
              else Error (Printf.sprintf "HTTP Error: %d" code)
            with
            | Curl.CurlException (_code, _, msg) -> Error (Printf.sprintf "Curl error: %s" msg)
            | exn -> Error (Printf.sprintf "Download exception: %s" (Printexc.to_string exn))))
  in
  match result with
  | Ok () -> Printf.printf "Downloaded %s successfully.\n%!" (Filename.basename dest_path)
  | Error msg ->
      (* Best effort: never leave a partial download behind. *)
      (try Sys.remove dest_path with Sys_error _ -> ());
      failwith (Printf.sprintf "Failed to download %s: %s" url msg)
(** [ensure_file url dest_path] makes sure [dest_path] exists: if it is already
    present a message is printed, otherwise it is fetched with
    [download_file url dest_path]. Only existence is checked, not content. *)
let ensure_file url dest_path =
  match Sys.file_exists dest_path with
  | true -> Printf.printf "Found file %s.\n%!" dest_path
  | false -> download_file url dest_path
(** Makes sure the file [check_file] exists inside [extract_dir], downloading
    and unpacking a [.tar.gz] archive when it does not.

    If [extract_dir/check_file] is missing, the archive is obtained with
    [ensure_file url archive_path], [extract_dir] is created, and the archive
    is unpacked there by running [tar] ([tar.exe] on Windows/Cygwin) through
    [Unix.system]. After extraction the presence of [check_file] is verified
    again.

    NOTE(review): in the reviewed copy of this file the function name, the
    third labelled parameter and the binder of the extraction result were
    missing. [extract_dir] and [extract_success] are restored from their uses
    in the body; the function name is a reconstruction — confirm it against
    the callers.

    @raise Failure
      if the archive does not end in [.tar.gz], if extraction fails, or if
      [check_file] is still missing afterwards. *)
let ensure_extracted_archive ~url ~archive_path ~extract_dir ~check_file =
  let check_file_full_path = Filename.concat extract_dir check_file in
  if not (Sys.file_exists check_file_full_path) then (
    Printf.printf "Extracted file %s not found.\n%!" check_file_full_path;
    ensure_file url archive_path;
    mkdir_p extract_dir;
    Printf.printf "Extracting %s to %s ...\n%!" archive_path extract_dir;
    if Filename.check_suffix archive_path ".tar.gz" then (
      let extract_success =
        if Sys.win32 || Sys.cygwin then (
          let command =
            Printf.sprintf "tar.exe -xzf %s -C %s" (Filename.quote archive_path)
              (Filename.quote extract_dir)
          in
          Printf.printf "Executing: %s\n%!" command;
          try
            let exit_code = Unix.system command in
            if exit_code = Unix.WEXITED 0 then (
              Printf.printf "Extracted archive successfully using tar.exe.\n%!";
              true)
            else (
              Printf.printf "tar.exe failed, trying alternative methods...\n%!";
              false)
          with Unix.Unix_error _ ->
            (* Only a failure to launch the command is treated as "tar.exe
               missing"; unrelated exceptions (e.g. an interrupt) propagate. *)
            Printf.printf "tar.exe not available on this Windows system.\n%!";
            false)
        else
          let command =
            Printf.sprintf "tar xzf %s -C %s" (Filename.quote archive_path)
              (Filename.quote extract_dir)
          in
          Printf.printf "Executing: %s\n%!" command;
          let exit_code = Unix.system command in
          exit_code = Unix.WEXITED 0
      in
      if not extract_success then
        failwith
          (Printf.sprintf
             "Archive extraction failed for %s. On Windows, ensure tar.exe is available (Windows 10+) or extract manually."
             archive_path)
      else Printf.printf "Archive extracted successfully.\n%!")
    else failwith (Printf.sprintf "Unsupported archive type for %s (only .tar.gz)" archive_path);
    (* tar can exit with 0 and still not have produced the expected file. *)
    if not (Sys.file_exists check_file_full_path) then
      failwith
        (Printf.sprintf "Extraction failed, %s not found after extraction." check_file_full_path))
  else Printf.printf "Found extracted file %s.\n%!" check_file_full_path
(** [ensure_decompressed_gz ~gz_path ~target_path] makes sure the decompressed
    file [target_path] exists.

    - If [target_path] already exists, nothing is done and [true] is returned.
    - Otherwise, if [gz_path] exists, it is decompressed into [target_path]
      and [true] is returned.
    - If neither exists, [false] is returned.

    Both channels are closed on every exit path. If decompression fails part
    way through, the incomplete [target_path] is removed, so that a later call
    does not mistake a truncated file for a finished one.

    @raise Failure if the gzip stream is invalid ([Gzip.Error]).
    @raise Sys_error if reading or writing the files fails. *)
let ensure_decompressed_gz ~gz_path ~target_path =
  if Sys.file_exists target_path then (
    Printf.printf "Found decompressed file %s.\n%!" target_path;
    true)
  else if Sys.file_exists gz_path then (
    Printf.printf "Decompressing %s ...\n%!" gz_path;
    (* [target_path] did not exist when we started, so anything found there
       after a failure is our own partial output. *)
    let remove_partial () = try Sys.remove target_path with Sys_error _ -> () in
    (try
       let ic = Gzip.open_in gz_path in
       Fun.protect
         ~finally:(fun () ->
           (* Errors while closing the input cannot affect the output already
              written, and a raising finaliser would mask the real error. *)
           try Gzip.close_in ic with Gzip.Error _ | Sys_error _ -> ())
         (fun () ->
           let oc = open_out_bin target_path in
           Fun.protect
             ~finally:(fun () -> close_out_noerr oc)
             (fun () ->
               let buf = Bytes.create 4096 in
               let rec loop () =
                 let n = Gzip.input ic buf 0 4096 in
                 if n > 0 then (
                   output oc buf 0 n;
                   loop ())
               in
               loop ();
               (* Flush explicitly so that a write error is reported instead
                  of being hidden by [close_out_noerr]. *)
               close_out oc))
     with
     | Gzip.Error msg ->
         remove_partial ();
         failwith (Printf.sprintf "Gzip error for %s: %s" gz_path msg)
     | exn ->
         remove_partial ();
         raise exn);
    Printf.printf "Decompressed to %s.\n%!" target_path;
    true)
  else (
    Printf.printf "Compressed file %s not found.\n%!" gz_path;
    false)
(** [parse_float_cell ~context s] converts [s] with [float_of_string].

    [context] is only called when parsing fails; its result is appended to the
    error message to say where the bad cell came from.

    @raise Failure if [s] is not a valid float literal. *)
let parse_float_cell ~context s =
  match float_of_string s with
  | value -> value
  | exception (Failure _ | Invalid_argument _) ->
      failwith (Printf.sprintf "Failed to parse float '%s' (%s)" s (context ()))
(** [parse_int_cell ~context s] converts [s] with [int_of_string].

    [context] is only called when parsing fails; its result is appended to the
    error message to say where the bad cell came from.

    @raise Failure if [s] is not a valid integer literal. *)
let parse_int_cell ~context s =
  match int_of_string s with
  | value -> value
  | exception (Failure _ | Invalid_argument _) ->
      failwith (Printf.sprintf "Failed to parse int '%s' (%s)" s (context ()))