# 1 "src/base/misc/owl_io.ml"
(*
* OWL - OCaml Scientific and Engineering Computing
* Copyright (c) 2016-2020 Liang Wang <liang.wang@cl.cam.ac.uk>
*)

open Owl_utils

(* Read the text file at path [f] line by line and return the lines as an
   array (built with [Stack.to_array]). When [trim] is true (the default)
   every line goes through [String.trim] before it is stored. The channel is
   closed on every exit path. *)
let read_file ?(trim = true) f =
  let h = open_in f in
  Fun.protect
    (fun () ->
      let s = Stack.make () in
      (try
         while true do
           let l = input_line h in
           Stack.push s (if trim then String.trim l else l)
         done
       with
      (* [input_line] signals the end of the file with this exception *)
      | End_of_file -> ());
      Stack.to_array s)
    ~finally:(fun () -> close_in h)


(* Return the whole content of the file at path [f] as a single string.
   The file is opened in binary mode: the number of bytes to read comes from
   [in_channel_length], which only matches what the channel delivers when no
   end-of-line translation takes place. *)
let read_file_string f =
  let ic = open_in_bin f in
  Fun.protect
    (fun () -> really_input_string ic (in_channel_length ic))
    ~finally:(fun () -> close_in ic)


(* Write the string [s] to the file at path [f] (opened with [open_out]).
   The optional [_flag] argument is accepted but not used by this function. *)
let write_file ?(_flag = Open_creat) f s =
  let h = open_out f in
  Fun.protect (fun () -> output_string h s) ~finally:(fun () -> close_out h)


(* Iterate over every doc in the corpus without loading the whole corpus in
   memory, applying the passed-in function [f] to each one. Each line of the
   file [fname] is a doc; [f] receives the zero-based line index and the raw
   line. When [verbose] is true (the default), a progress message is logged
   at most once every 5 seconds, reporting the average rate since the start.
   Exceptions raised by [f] propagate to the caller after the channel has
   been closed. *)
let iteri_lines_of_file ?(verbose = true) f fname =
  let i = ref 0 in
  let h = open_in fname in
  Fun.protect
    (fun () ->
      let t0 = Unix.gettimeofday () in
      let t1 = ref (Unix.gettimeofday ()) in
      try
        while true do
          f !i (input_line h);
          i := !i + 1;
          (* output summary if in verbose mode *)
          if verbose
          then (
            let t2 = Unix.gettimeofday () in
            if t2 -. !t1 > 5.
            then (
              t1 := t2;
              let speed = float_of_int !i /. (t2 -. t0) |> int_of_float in
              Owl_log.info "processed %i, avg. %i docs/s" !i speed))
        done
      with
      | End_of_file -> ())
    ~finally:(fun () -> close_in h)


(* Map every doc (line) of the file [fname] into another type: [f] is called
   with the line index and the line, and the results are collected into an
   array. *)
let mapi_lines_of_file f fname =
  let stack = Owl_utils.Stack.make () in
  iteri_lines_of_file (fun i s -> Owl_utils.Stack.push stack (f i s)) fname;
  Owl_utils.Stack.to_array stack


(* Similar to [iteri_lines_of_file] but for a marshaled file: values are read
   one after another with [Marshal.from_channel] until [End_of_file], and [f]
   receives the zero-based index and the value. In verbose mode the logged
   rate covers only the interval since the previous progress message.

   Unmarshalling is not type-safe; only use this on files from a trusted
   source whose content matches the type expected by [f]. *)
let iteri_lines_of_marshal ?(verbose = true) f fname =
  let i = ref 0 in
  let h = open_in_bin fname in
  Fun.protect
    (fun () ->
      let t1 = ref (Unix.gettimeofday ()) in
      (* value of [i] at the time of the previous progress message *)
      let i1 = ref 0 in
      try
        while true do
          f !i (Marshal.from_channel h);
          i := !i + 1;
          (* output summary if in verbose mode *)
          if verbose
          then (
            let t2 = Unix.gettimeofday () in
            if t2 -. !t1 > 5.
            then (
              let speed = float_of_int (!i - !i1) /. (t2 -. !t1) |> int_of_float in
              i1 := !i;
              t1 := t2;
              (* NOTE(review): this message contains a line break, unlike the
                 one in iteri_lines_of_file -- confirm this is intended *)
              Owl_log.info "processed %i, avg. \n%i docs/s" !i speed))
        done
      with
      | End_of_file -> ())
    ~finally:(fun () -> close_in h)


(* Similar to [mapi_lines_of_file] but for a marshaled file. *)
let mapi_lines_of_marshal f fname =
  let stack = Owl_utils.Stack.make () in
  iteri_lines_of_marshal (fun i s -> Owl_utils.Stack.push stack (f i s)) fname;
  Owl_utils.Stack.to_array stack


(* Save the marshalled object [x] to the file [f]; [flags] (default: none)
   are handed to [Marshal.to_channel]. *)
let marshal_to_file ?(flags = []) x f =
  let h = open_out_bin f in
  Fun.protect
    (fun () -> Marshal.to_channel h x flags)
    ~finally:(fun () -> close_out h)


(* Load a marshalled object from the file [f]. The whole file is read into a
   string first and then decoded with [Marshal.from_string].

   Unmarshalling is not type-safe; only use this on files from a trusted
   source, and constrain the result to the type that was written. *)
let marshal_from_file f =
  let h = open_in_bin f in
  Fun.protect
    (fun () ->
      let s = really_input_string h (in_channel_length h) in
      Marshal.from_string s 0)
    ~finally:(fun () -> close_in h)


(* Return at most the first [n] lines of the file [fname] as an array.

   Reading stops as soon as line index [n] is reached. The stop is signalled
   with [Exit] rather than an [assert], so it keeps working when assertions
   are compiled out and it does not log a warning for a normal early stop.
   Any other exception (e.g. the file cannot be opened) is logged as a
   warning and the lines collected so far are returned. *)
let head n fname =
  let lines = Owl_utils.Stack.make () in
  (try
     iteri_lines_of_file
       (fun i s ->
         if i >= n then raise Exit;
         Owl_utils.Stack.push lines s)
       fname
   with
  | Exit -> ()
  | exn -> Owl_log.warn "Owl_io.head: ignored exception %s" (Printexc.to_string exn));
  Owl_utils.Stack.to_array lines


(* TODO: not implemented yet, always raises [Owl_exception.NOT_IMPLEMENTED] *)
let _tail _n _fname = raise (Owl_exception.NOT_IMPLEMENTED "owl_io._tail")


(* Join the fields of one row with [sep] between consecutive fields; no
   separator is emitted after the last field. *)
let _join_row sep row = String.concat (String.make 1 sep) (Array.to_list row)


(* Read the file [fname] as delimited text: each line is trimmed with
   [String.trim] and split on [sep] (default: tab). Returns one array of
   fields per line. *)
let read_csv ?(sep = '\t') fname =
  let lines = Owl_utils.Stack.make () in
  iteri_lines_of_file
    (fun _i s ->
      String.trim s
      |> String.split_on_char sep
      |> Array.of_list
      |> Owl_utils.Stack.push lines)
    fname;
  Owl_utils.Stack.to_array lines


(* Write the rows of [x] to the file [fname], one row per line, with fields
   separated by [sep] (default: tab). Fields are written verbatim (no
   quoting or escaping) and no separator follows the last field of a row. *)
let write_csv ?(sep = '\t') x fname =
  let h = open_out fname in
  Fun.protect
    (fun () ->
      Array.iter
        (fun row ->
          output_string h (_join_row sep row);
          output_char h '\n')
        x)
    ~finally:(fun () -> close_out h)


(* Streaming variant of [read_csv]: each line of [fname] is trimmed, split on
   [sep] (default: tab) and passed, together with its zero-based line index,
   to [proc]. Nothing is accumulated. *)
let read_csv_proc ?(sep = '\t') proc fname =
  iteri_lines_of_file
    (fun i s -> String.trim s |> String.split_on_char sep |> Array.of_list |> proc i)
    fname


(* Variant of [write_csv] for non-string elements: every element of every row
   of [x] is converted to a string with [proc] (applied left to right) before
   being written to [fname]. No separator follows the last field of a row. *)
let write_csv_proc ?(sep = '\t') x proc fname =
  let h = open_out fname in
  Fun.protect
    (fun () ->
      Array.iter
        (fun row ->
          output_string h (_join_row sep (Array.map proc row));
          output_char h '\n')
        x)
    ~finally:(fun () -> close_out h)


(* Return the fields of one line of the file [fname]: the first [idx - 1]
   lines are skipped, then the next line is trimmed and split on [sep]
   (default: tab). For [idx <= 1] nothing is skipped, so the first line is
   returned. [End_of_file] is raised if the file has too few lines. *)
let csv_head ?(sep = '\t') idx fname =
  let h = open_in fname in
  Fun.protect
    (fun () ->
      for _i = 1 to idx - 1 do
        input_line h |> ignore
      done;
      input_line h |> String.trim |> String.split_on_char sep |> Array.of_list)
    ~finally:(fun () -> close_in h)