# 1 "src/base/misc/owl_io.ml"
(*
 * OWL - OCaml Scientific and Engineering Computing
 * Copyright (c) 2016-2020 Liang Wang <liang.wang@cl.cam.ac.uk>
 *)

open Owl_utils

(* Apply [f] to the input channel [h] and close [h] afterwards, whether [f]
   returns normally or raises. On failure the channel is closed quietly and
   the original exception is re-raised.
 *)
let _with_in h f =
  match f h with
  | result ->
    close_in h;
    result
  | exception exn ->
    close_in_noerr h;
    raise exn


(* Same as [_with_in] but for output channels. On the success path
   [close_out] flushes the channel and may itself raise [Sys_error].
 *)
let _with_out h f =
  match f h with
  | result ->
    close_out h;
    result
  | exception exn ->
    close_out_noerr h;
    raise exn


(* Read the file at path [f] and return its lines as an array, in file order.
   When [trim] is [true] (the default) every line is passed through
   [String.trim] first.
 *)
let read_file ?(trim = true) f =
  let s = Stack.make () in
  _with_in (open_in f) (fun h ->
      try
        while true do
          let l = input_line h in
          Stack.push s (if trim then String.trim l else l)
        done
      with
      | End_of_file -> ());
  Stack.to_array s


(* Read the whole content of the file at path [f] into a single string. *)
let read_file_string f =
  _with_in (open_in f) (fun ic -> really_input_string ic (in_channel_length ic))


(* Write the string [s] to the file at path [f] (opened with [open_out]).
   The optional [_flag] argument is accepted for compatibility but not used.
 *)
let write_file ?(_flag = Open_creat) f s =
  _with_out (open_out f) (fun h -> output_string h s)


(* Iterate over the lines of file [fname] without loading the whole file in
   memory, calling [f i line] where [i] is the zero-based line index. Each line
   is one doc. When [verbose] is [true] (the default), a progress message with
   the average throughput since the start is logged at most every 5 seconds.

   The channel is closed when the iteration ends, including when [f] raises.
   Note that the [End_of_file] handler also encloses the call to [f], so an
   [End_of_file] raised by [f] ends the iteration silently.
 *)
let iteri_lines_of_file ?(verbose = true) f fname =
  let i = ref 0 in
  _with_in (open_in fname) (fun h ->
      let t0 = Unix.gettimeofday () in
      let t1 = ref (Unix.gettimeofday ()) in
      try
        while true do
          let line = input_line h in
          f !i line;
          incr i;
          (* output summary if in verbose mode *)
          if verbose
          then (
            let t2 = Unix.gettimeofday () in
            if t2 -. !t1 > 5.
            then (
              t1 := t2;
              let speed = float_of_int !i /. (t2 -. t0) |> int_of_float in
              Owl_log.info "processed %i, avg. %i docs/s" !i speed))
        done
      with
      | End_of_file -> ())


(* Map every line of file [fname] through [f i line] and collect the results
   in an array, in file order.
 *)
let mapi_lines_of_file f fname =
  let stack = Owl_utils.Stack.make () in
  iteri_lines_of_file (fun i s -> Owl_utils.Stack.push stack (f i s)) fname;
  Owl_utils.Stack.to_array stack


(* Similar to [iteri_lines_of_file] but for a file holding a sequence of
   marshalled values: [f i v] is called for each value read with
   [Marshal.from_channel]. The throughput reported in verbose mode is measured
   over the interval since the previous progress message.

   NOTE: unmarshalling is not type-safe; only use this on trusted files.
 *)
let iteri_lines_of_marshal ?(verbose = true) f fname =
  let i = ref 0 in
  _with_in (open_in fname) (fun h ->
      let t1 = ref (Unix.gettimeofday ()) in
      let i1 = ref 0 in
      try
        while true do
          let v = Marshal.from_channel h in
          f !i v;
          incr i;
          (* output summary if in verbose mode *)
          if verbose
          then (
            let t2 = Unix.gettimeofday () in
            if t2 -. !t1 > 5.
            then (
              let speed = float_of_int (!i - !i1) /. (t2 -. !t1) |> int_of_float in
              i1 := !i;
              t1 := t2;
              Owl_log.info "processed %i, avg. %i docs/s" !i speed))
        done
      with
      | End_of_file -> ())


(* Similar to [mapi_lines_of_file] but for a marshalled file. *)
let mapi_lines_of_marshal f fname =
  let stack = Owl_utils.Stack.make () in
  iteri_lines_of_marshal (fun i s -> Owl_utils.Stack.push stack (f i s)) fname;
  Owl_utils.Stack.to_array stack


(* Save the value [x] to the file at path [f] using [Marshal.to_channel] with
   the given [flags] (none by default).
 *)
let marshal_to_file ?(flags = []) x f =
  _with_out (open_out f) (fun h -> Marshal.to_channel h x flags)


(* Load a marshalled value from the file at path [f]. The file is read fully
   and the channel closed before the value is decoded.

   NOTE: unmarshalling is not type-safe; only use this on trusted files.
 *)
let marshal_from_file f =
  let s = _with_in (open_in f) (fun h -> really_input_string h (in_channel_length h)) in
  Marshal.from_string s 0


(* Return at most the first [n] lines of file [fname] as an array.

   The iteration is stopped early by raising [Exit] from the callback rather
   than by a failing [assert], so the result does not depend on assertions
   being enabled at compile time.
 *)
let head n fname =
  let lines = Owl_utils.Stack.make () in
  (try
     iteri_lines_of_file
       (fun i s ->
         if i >= n then raise Exit;
         Owl_utils.Stack.push lines s)
       fname
   with
   | Exit -> () (* collected the requested number of lines *)
   | _exn ->
     (* best effort, as before: any other failure yields the lines collected
        so far instead of an exception *)
     ());
  Owl_utils.Stack.to_array lines


(* TODO *)
let _tail _n _fname = raise (Owl_exception.NOT_IMPLEMENTED "owl_io._tail")

(* Read file [fname] as separated values and return one string array per line.
   Fields are split on [sep] (tab by default). Each line is passed through
   [String.trim] before splitting, so leading and trailing whitespace of the
   line, including tab characters, is dropped.
 *)
let read_csv ?(sep = '\t') fname =
  let lines = Owl_utils.Stack.make () in
  iteri_lines_of_file
    (fun _i s ->
      String.trim s
      |> String.split_on_char sep
      |> Array.of_list
      |> Owl_utils.Stack.push lines)
    fname;
  Owl_utils.Stack.to_array lines


(* Write the rows of [x] to file [fname], one row per line, with the fields of
   a row joined by [sep] (tab by default). No separator is emitted after the
   last field, so that reading the file back with the same [sep] does not
   produce a spurious empty trailing column.
 *)
let write_csv ?(sep = '\t') x fname =
  let delim = String.make 1 sep in
  _with_out (open_out fname) (fun h ->
      Array.iter
        (fun row ->
          output_string h (String.concat delim (Array.to_list row));
          output_char h '\n')
        x)


(* Stream file [fname] as separated values: for each line, call [proc i fields]
   where [i] is the zero-based line index and [fields] the trimmed line split
   on [sep] (tab by default).
 *)
let read_csv_proc ?(sep = '\t') proc fname =
  iteri_lines_of_file
    (fun i s -> String.trim s |> String.split_on_char sep |> Array.of_list |> proc i)
    fname


(* Same as [write_csv], but every element is first converted to a string with
   [proc]. Elements of a row are converted from left to right.
 *)
let write_csv_proc ?(sep = '\t') x proc fname =
  let delim = String.make 1 sep in
  _with_out (open_out fname) (fun h ->
      Array.iter
        (fun row ->
          let fields = Array.map proc row |> Array.to_list in
          output_string h (String.concat delim fields);
          output_char h '\n')
        x)


(* Return the fields of one line of file [fname]: [idx - 1] lines are skipped
   and the following line is trimmed and split on [sep] (tab by default). Any
   [idx] not greater than 1 therefore selects the first line. Raises
   [End_of_file] if the file is too short; the channel is closed in all cases.
 *)
let csv_head ?(sep = '\t') idx fname =
  _with_in (open_in fname) (fun h ->
      for _i = 1 to idx - 1 do
        input_line h |> ignore
      done;
      input_line h |> String.trim |> String.split_on_char sep |> Array.of_list)