(*
* OWL - an OCaml numerical library for scientific computing
* Copyright (c) 2016-2018 Liang Wang <liang.wang@cl.cam.ac.uk>
 *)

(** NLP: Corpus module *)

open Owl_nlp_utils

type t = {
  mutable uri     : string;                      (* path of the binary corpus *)
  mutable bin_ofs : int array;                   (* index of the string corpus *)
  mutable tok_ofs : int array;                   (* index of the tokenised corpus *)
  mutable bin_fh  : in_channel option;           (* file descriptor of the binary corpus *)
  mutable tok_fh  : in_channel option;           (* file descriptor of the tokenised corpus *)
  mutable vocab   : Owl_nlp_vocabulary.t option; (* vocabulary of the corpus *)
  mutable minlen  : int;                         (* minimum length of document to save *)
  mutable docid   : int array;                   (* document id, can refer to original data *)
}

let _close_if_open = function
  | Some h -> close_in h
  | None   -> ()

let _open_if_exists f =
  match Sys.file_exists f with
  | true  -> Some (open_in f)
  | false -> None

let cleanup x =
  _close_if_open x.bin_fh;
  _close_if_open x.tok_fh

let create uri bin_ofs tok_ofs bin_fh tok_fh vocab minlen docid =
  let x = { uri; bin_ofs; tok_ofs; bin_fh; tok_fh; vocab; minlen; docid } in
  Gc.finalise cleanup x;
  x

let get_uri corpus = corpus.uri

let get_bin_uri corpus = corpus.uri ^ ".bin"

let get_bin_fh corpus =
  match corpus.bin_fh with
  | Some x -> x
  | None   ->
    let h = corpus |> get_bin_uri |> open_in in
    corpus.bin_fh <- Some h;
    h

let get_tok_uri corpus = corpus.uri ^ ".tok"

let get_tok_fh corpus =
  match corpus.tok_fh with
  | Some x -> x
  | None   ->
    let h = corpus |> get_tok_uri |> open_in in
    corpus.tok_fh <- Some h;
    h

let get_vocab_uri corpus = corpus.uri ^ ".voc"

let get_vocab corpus =
  match corpus.vocab with
  | Some x -> x
  | None   ->
    let h = corpus |> get_vocab_uri |> Owl_nlp_vocabulary.load in
    corpus.vocab <- Some h;
    h

let get_docid corpus = corpus.docid

(* the offset array carries one extra leading entry, hence the minus one *)
let length corpus = Array.length corpus.bin_ofs - 1

(* iterate over docs, tokenised docs, etc. *)

let next corpus : string = corpus |> get_bin_fh |> Marshal.from_channel

let next_tok corpus : int array = corpus |> get_tok_fh |> Marshal.from_channel

let iteri f corpus = iteri_lines_of_marshal f (get_bin_uri corpus)

let iteri_tok f corpus = iteri_lines_of_marshal f (get_tok_uri corpus)

let mapi f corpus = mapi_lines_of_marshal f (get_bin_uri corpus)

let mapi_tok f corpus = mapi_lines_of_marshal f (get_tok_uri corpus)

(* random access to the i-th document; the file position is restored afterwards *)
let get corpus i : string =
  let fh = get_bin_fh corpus in
  let old_pos = pos_in fh in
  seek_in fh corpus.bin_ofs.(i);
  let doc = Marshal.from_channel fh in
  seek_in fh old_pos;
  doc

let get_tok corpus i : int array =
  let fh = get_tok_fh corpus in
  let old_pos = pos_in fh in
  seek_in fh corpus.tok_ofs.(i);
  let doc = Marshal.from_channel fh in
  seek_in fh old_pos;
  doc

(* reset all the file pointers to offset 0 *)
let reset_iterators corpus =
  let _reset_offset = function
    | Some h -> seek_in h 0
    | None   -> ()
  in
  _reset_offset corpus.bin_fh;
  _reset_offset corpus.tok_fh

(* return a batch of documents *)
let next_batch ?(size=100) corpus =
  let batch = Owl_utils.Stack.make () in
  (try
    for _i = 0 to size - 1 do
      corpus |> next |> Owl_utils.Stack.push batch
    done
  with _exn -> ());
  Owl_utils.Stack.to_array batch

(* return a batch of tokenised documents *)
let next_batch_tok ?(size=100) corpus =
  let batch = Owl_utils.Stack.make () in
  (try
    for _i = 0 to size - 1 do
      corpus |> next_tok |> Owl_utils.Stack.push batch
    done
  with _exn -> ());
  Owl_utils.Stack.to_array batch

(* tokenise a string against the corpus vocabulary; note that [exits_w] is
   the spelling used by Owl_nlp_vocabulary *)
let tokenise corpus s =
  let dict = get_vocab corpus in
  Str.split (Str.regexp " ") s
  |> List.filter (Owl_nlp_vocabulary.exits_w dict)
  |> List.map (Owl_nlp_vocabulary.word2index dict)
  |> Array.of_list
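(* A usage sketch of the reading API (the file name "news.txt" is hypothetical;
   it assumes a corpus already built from that file with [build] below, so the
   ".bin", ".tok", ".voc", and ".mdl" companion files exist):

     let corpus = Owl_nlp_corpus.load "news.txt.mdl" in
     let doc    = Owl_nlp_corpus.get corpus 0 in        (* first document as text *)
     let tok    = Owl_nlp_corpus.get_tok corpus 0 in    (* same document as word indices *)
     let batch  = Owl_nlp_corpus.next_batch ~size:32 corpus in
     Printf.printf "%s: %i tokens, batch of %i\n"
       doc (Array.length tok) (Array.length batch)
*)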
(* convert the corpus into binary format, build the dictionary, and tokenise;
   [lo] and [hi] will be ignored if a vocab is passed in. The passed-in [docid]
   can be used for tracking back to the original corpus, but this is not
   compulsory. *)
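(* A sketch of typical usage ("news.txt", one document per line, is a
   hypothetical input file):

     let corpus = Owl_nlp_corpus.build ~minlen:5 "news.txt" in
     Owl_nlp_corpus.print corpus

   This writes "news.txt.voc", "news.txt.voc.txt", "news.txt.bin",
   "news.txt.tok", and "news.txt.mdl" next to the input file. *)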
let build ?docid ?stopwords ?lo ?hi ?vocab ?(minlen=10) fname =
  (* build and save the vocabulary if necessary *)
  let vocab = match vocab with
    | Some vocab -> vocab
    | None -> (
        Owl_log.info "build up vocabulary ...";
        Owl_nlp_vocabulary.build ?lo ?hi ?stopwords fname
      )
  in
  Owl_nlp_vocabulary.save vocab (fname ^ ".voc");
  Owl_nlp_vocabulary.save_txt vocab (fname ^ ".voc.txt");

  (* prepare the output files *)
  let bin_f = fname ^ ".bin" |> open_out in
  let tok_f = fname ^ ".tok" |> open_out in
  set_binary_mode_out bin_f true;
  set_binary_mode_out tok_f true;

  (* initialise the offset arrays *)
  let b_ofs = Owl_utils.Stack.make () in
  let t_ofs = Owl_utils.Stack.make () in
  Owl_utils.Stack.push b_ofs 0;
  Owl_utils.Stack.push t_ofs 0;

  (* initialise the doc_id stack *)
  let doc_s = Owl_utils.Stack.make () in

  (* binarise and tokenise at the same time *)
  Owl_log.info "convert to binary and tokenise ...";
  iteri_lines_of_file (fun i s ->
    let t =
      Str.split Owl_nlp_utils.regexp_split s
      |> List.filter (Owl_nlp_vocabulary.exits_w vocab)
      |> List.map (Owl_nlp_vocabulary.word2index vocab)
      |> Array.of_list
    in
    (* only save documents having at least minlen words *)
    if Array.length t >= minlen then (
      Marshal.to_channel bin_f s [];
      Marshal.to_channel tok_f t [];
      (* keep track of the doc id *)
      let id = match docid with Some d -> d.(i) | None -> i in
      Owl_utils.Stack.push doc_s id;
      (* keep track of the doc offsets *)
      Owl_utils.Stack.push b_ofs (LargeFile.pos_out bin_f |> Int64.to_int);
      Owl_utils.Stack.push t_ofs (LargeFile.pos_out tok_f |> Int64.to_int)
    )
  ) fname;

  (* save the corpus model file *)
  let mdl_f = fname ^ ".mdl" |> open_out in
  let b_ofs = Owl_utils.Stack.to_array b_ofs in
  let t_ofs = Owl_utils.Stack.to_array t_ofs in
  let doc_s = Owl_utils.Stack.to_array doc_s in
  let corpus = create fname b_ofs t_ofs None None None minlen doc_s in
  Marshal.to_channel mdl_f corpus [];

  (* done, close the files *)
  close_out bin_f;
  close_out tok_f;
  close_out mdl_f;

  (* return the finished corpus *)
  get_bin_fh corpus |> ignore;
  get_tok_fh corpus |> ignore;
  get_vocab corpus |> ignore;
  corpus

(* remove duplicates in a text corpus; the ids of the removed lines are returned *)
let unique fi_name fo_name =
  let h = Hashtbl.create 1024 in
  let rm = Owl_utils.Stack.make () in
  let fo = open_out fo_name in
  Owl_nlp_utils.iteri_lines_of_file (fun i s ->
    match Hashtbl.mem h s with
    | true  -> Owl_utils.Stack.push rm i
    | false -> (
        output_string fo s;
        output_char fo '\n';
        Hashtbl.add h s None
      )
  ) fi_name;
  close_out fo;
  Owl_utils.Stack.to_array rm

(* a simple function for pre-processing a given string *)
let simple_process s =
  Str.split Owl_nlp_utils.regexp_split s
  |> List.filter (fun x -> String.length x > 1)
  |> String.concat " "
  |> String.lowercase_ascii
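(* A sketch of what [simple_process] does: it drops single-character tokens and
   lowercases the rest, so a string like "A Tale of Two Cities" becomes
   "tale of two cities" (the exact token boundaries depend on
   [Owl_nlp_utils.regexp_split]). *)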
(* pre-process a given file with the passed-in function; e.g., you can plug in
   the [simple_process] function to clean up the text. Note this function will
   not change the number of lines in a corpus. *)
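(* Example (a sketch; "news.txt" and "news_clean.txt" are hypothetical paths):

     preprocess simple_process "news.txt" "news_clean.txt"

   This cleans each line while keeping the line count unchanged. *)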
let preprocess f fi_name fo_name =
  let fo = open_out fo_name in
  Owl_nlp_utils.iteri_lines_of_file (fun _i s ->
    output_string fo (f s);
    output_char fo '\n'
  ) fi_name;
  close_out fo

(* i/o: save and load corpus *)

(* set some fields to None so the corpus can be safely marshalled *)
let reduce_model corpus = {
  uri     = corpus.uri;
  bin_ofs = corpus.bin_ofs;
  tok_ofs = corpus.tok_ofs;
  bin_fh  = None;
  tok_fh  = None;
  vocab   = None;
  minlen  = corpus.minlen;
  docid   = corpus.docid;
}

let save corpus f =
  let x = reduce_model corpus in
  Owl_utils.marshal_to_file x f

let load f : t =
  let corpus = Owl_utils.marshal_from_file f in
  get_bin_fh corpus |> ignore;
  get_tok_fh corpus |> ignore;
  get_vocab corpus |> ignore;
  corpus

(* convert tokenised corpus back to text file *)
let save_txt corpus f =
  let fh = open_out f in
  let vocab = get_vocab corpus in
  let i2w_f = Owl_nlp_vocabulary.index2word vocab in
  iteri_tok (fun _i t ->
    let s = t
      |> Array.map i2w_f
      |> Array.to_list
      |> String.concat " "
    in
    output_string fh s;
    output_char fh '\n'
  ) corpus;
  close_out fh

let to_string corpus =
  Printf.sprintf "corpus info\n"
  ^ Printf.sprintf "  file path  : %s\n" (corpus |> get_uri)
  ^ Printf.sprintf "  # of docs  : %i\n" (corpus |> length)
  ^ Printf.sprintf "  doc minlen : %i" corpus.minlen

let print corpus = corpus |> to_string |> print_endline

(* ends here *)