# 1 "src/owl/nlp/owl_nlp_vocabulary.ml"
(*
 * OWL - OCaml Scientific and Engineering Computing
 * Copyright (c) 2016-2020 Liang Wang <liang.wang@cl.cam.ac.uk>
 *)

(** NLP: Vocabulary module *)

(* A vocabulary is three hash tables kept in step with one another. *)
type t =
  { mutable w2i : (string, int) Hashtbl.t (* word -> index *)
  ; mutable i2w : (int, string) Hashtbl.t (* index -> word *)
  ; mutable i2f : (int, int) Hashtbl.t (* index -> freq *)
  }

(* return the word -> index table itself (not a copy) *)
let get_w2i d = d.w2i

(* return the index -> word table itself (not a copy) *)
let get_i2w d = d.i2w

(* [true] iff word [w] is in the vocabulary *)
let exits_w d w = Hashtbl.mem d.w2i w

(* [true] iff index [i] is in the vocabulary *)
let exits_i d i = Hashtbl.mem d.i2w i

(* index of word [w]; raises [Not_found] if [w] is unknown *)
let word2index d w = Hashtbl.find d.w2i w

(* word stored at index [i]; raises [Not_found] if [i] is unknown *)
let index2word d i = Hashtbl.find d.i2w i

(* number of bindings in the word -> index table *)
let length d = Hashtbl.length d.w2i

(* frequency recorded for index [i]; raises [Not_found] if absent *)
let freq_i d i = Hashtbl.find d.i2f i

(* frequency recorded for word [w]; raises [Not_found] if absent *)
let freq_w d w = w |> word2index d |> freq_i d

(* make a copy of passed in vocabulary; the three tables are copied, so
   mutating the result does not affect [d] *)
let copy d =
  { w2i = Hashtbl.copy d.w2i; i2w = Hashtbl.copy d.i2w; i2f = Hashtbl.copy d.i2f }


(* re-index the indices in the vocabulary

   Returns a fresh vocabulary whose indices are 0 .. length d - 1, assigned
   in the iteration order of [d.w2i]. [d] itself is not modified.

   The frequency of each word is looked up in [d.i2f] through the word's old
   index. (Iterating [d.w2i] yields (word, old index) pairs, so the old index
   must not be stored as the frequency.) A word that has no entry in [d.i2f]
   gets no entry in the new [i2f] either.
 *)
let re_index d =
  let n = length d in
  let w2i = Hashtbl.create n in
  let i2w = Hashtbl.create n in
  let i2f = Hashtbl.create n in
  let next = ref 0 in
  Hashtbl.iter
    (fun w old_idx ->
      Hashtbl.add w2i w !next;
      Hashtbl.add i2w !next w;
      (match Hashtbl.find_opt d.i2f old_idx with
      | Some freq -> Hashtbl.add i2f !next freq
      | None -> ());
      next := !next + 1)
    d.w2i;
  { w2i; i2w; i2f }


(* remove extremely low and high frequency words based on percentage
   lo: the percentage of lower bound
   hi: the percentage of higher bound
   h: the hashtbl of the vocabulary (key, freq)

   The frequencies are sorted in increasing order; the values found at
   positions [n * lo] and [n * hi] of the sorted array become the inclusive
   lower and upper frequency thresholds. [h] is filtered in place.

   Both positions are clamped to [0, n - 1], so [hi = 1.] selects the largest
   frequency instead of indexing one past the end of the array, and an empty
   table is left untouched.
 *)
let _trim_percent_w2f lo hi h =
  let n = Hashtbl.length h in
  if n > 0
  then (
    let all_freq = Array.make n 0 in
    let i = ref 0 in
    Hashtbl.iter
      (fun _ freq ->
        all_freq.(!i) <- freq;
        i := !i + 1)
      h;
    Array.sort Stdlib.compare all_freq;
    let clamp k = max 0 (min (n - 1) k) in
    let lo_freq = all_freq.(clamp (int_of_float (float_of_int n *. lo))) in
    let hi_freq = all_freq.(clamp (int_of_float (float_of_int n *. hi))) in
    Hashtbl.filter_map_inplace
      (fun _ freq -> if freq >= lo_freq && freq <= hi_freq then Some freq else None)
      h)


(* [d.i2f] has just been trimmed: drop from [d.i2w] and [d.w2i] every entry
   whose index no longer has a frequency, then return a re-indexed vocabulary.
   [d] is modified in place, so callers pass a private copy. *)
let _drop_untracked_and_re_index d =
  Hashtbl.filter_map_inplace
    (fun i w ->
      if Hashtbl.mem d.i2f i
      then Some w
      else (
        Hashtbl.remove d.w2i w;
        None))
    d.i2w;
  re_index d


(* similar to _trim_percent_w2f, but trim all three hashtbls in the
   vocabulary; works on a copy and returns a re-indexed vocabulary *)
let trim_percent ~lo ~hi vocab =
  let d = copy vocab in
  _trim_percent_w2f lo hi d.i2f;
  _drop_untracked_and_re_index d


(* similar to _trim_percent_w2f, but according to the absolute word count:
   keep only the entries whose frequency lies in [lo, hi] (inclusive) *)
let _trim_count_w2f lo hi h =
  Hashtbl.filter_map_inplace
    (fun _ freq -> if freq >= lo && freq <= hi then Some freq else None)
    h


(* similar to _trim_count_w2f but trim all three hashtbls; works on a copy
   and returns a re-indexed vocabulary *)
let trim_count ~lo ~hi vocab =
  let d = copy vocab in
  _trim_count_w2f lo hi d.i2f;
  _drop_untracked_and_re_index d


(* remove stopwords from vocabulary
   sw: hashtbl whose keys are the stopwords
   h: hashtbl keyed by word; filtered in place
 *)
let remove_stopwords sw h =
  Hashtbl.filter_map_inplace (fun w v -> if Hashtbl.mem sw w then None else Some v) h


(* add one occurrence of [w] to the (word, freq) table [w2f] *)
let _incr_freq w2f w =
  match Hashtbl.find_opt w2f w with
  | Some freq -> Hashtbl.replace w2f w (freq + 1)
  | None -> Hashtbl.add w2f w 1


(* split [s] with [Owl_nlp_utils.regexp_split] and count every resulting
   token in [w2f] *)
let _build_word_proc w2f s =
  Str.split Owl_nlp_utils.regexp_split s |> List.iter (_incr_freq w2f)


(* count every character of [s], as a one-character string, in [w2f] *)
let _build_alphabet_proc w2f s = String.iter (fun c -> _incr_freq w2f (String.make 1 c)) s

(* build the three vocabulary tables from a (word, freq) table; indices are
   assigned from 0 in the iteration order of [w2f] *)
let _vocab_of_w2f w2f =
  let n = Hashtbl.length w2f in
  let w2i = Hashtbl.create n in
  let i2w = Hashtbl.create n in
  let i2f = Hashtbl.create n in
  let i = ref 0 in
  Hashtbl.iter
    (fun w f ->
      Hashtbl.add w2i w !i;
      Hashtbl.add i2w !i w;
      Hashtbl.add i2f !i f;
      i := !i + 1)
    w2f;
  { w2i; i2w; i2f }


(* shared tail of [build] and [build_from_string]: trim by frequency
   percentage (only when [lo] or [hi] differ from their defaults), then drop
   stopwords if any were given, then index what is left *)
let _finalise_w2f lo hi stopwords w2f =
  if lo <> 0. || hi <> 1. then _trim_percent_w2f lo hi w2f;
  (match stopwords with
  | Some sw -> remove_stopwords sw w2f
  | None -> ());
  _vocab_of_w2f w2f


(* build a vocabulary from a file
   lo: percentage of lower bound of word frequency (default 0.)
   hi: percentage of higher bound of word frequency (default 1.)
   alphabet: count single characters instead of words (default false)
   stopwords: hashtbl whose keys are removed from the vocabulary
   fname: file name of the vocabulary, each line contains a doc
 *)
let build ?(lo = 0.) ?(hi = 1.) ?(alphabet = false) ?stopwords fname =
  let w2f = Hashtbl.create (64 * 1024) in
  let f = if alphabet then _build_alphabet_proc w2f else _build_word_proc w2f in
  Owl_io.iteri_lines_of_file (fun _ s -> f s) fname;
  _finalise_w2f lo hi stopwords w2f


(* similar to build but build from string rather than file *)
let build_from_string ?(lo = 0.) ?(hi = 1.) ?(alphabet = false) ?stopwords s =
  let w2f = Hashtbl.create (64 * 1024) in
  if alphabet then _build_alphabet_proc w2f s else _build_word_proc w2f s;
  _finalise_w2f lo hi stopwords w2f


(* return (index, freq) array in increasing or decreasing freq
   inc: sort in increasing order when true (default), decreasing otherwise
 *)
let sort_freq ?(inc = true) d =
  let all_freq = Array.make (length d) (0, 0) in
  let i = ref 0 in
  Hashtbl.iter
    (fun j freq ->
      all_freq.(!i) <- j, freq;
      i := !i + 1)
    d.i2f;
  let f =
    if inc
    then fun a b -> Stdlib.compare (snd a) (snd b)
    else fun a b -> Stdlib.compare (snd b) (snd a)
  in
  Array.sort f all_freq;
  all_freq


(* return k most popular words as (word, freq) pairs; asserts that the
   vocabulary holds at least k words *)
let top d k =
  let all_freq = sort_freq ~inc:false d in
  assert (length d >= k);
  Array.sub all_freq 0 k |> Array.map (fun (i, freq) -> index2word d i, freq)


(* return k least popular words as (word, freq) pairs *)
let bottom d k =
  let all_freq = sort_freq ~inc:true d in
  Array.sub all_freq 0 k |> Array.map (fun (i, freq) -> index2word d i, freq)


(* convert w2i to a list of (word, index) tuples *)
let w2i_to_tuples d = Hashtbl.fold (fun w i a -> (w, i) :: a) d.w2i []

(* tokenise a string according to the passed in vocabulary: every character
   of [s] is looked up as a one-character word, so [Not_found] is raised for
   a character that is not in the vocabulary *)
let tokenise v s =
  Array.init (String.length s) (fun i -> word2index v (String.make 1 s.[i]))


(* I/O functions *)

(* write the vocabulary to [fname] through [Owl_io.marshal_to_file] *)
let save d fname = Owl_io.marshal_to_file d fname

(* read a vocabulary back from [fname] through [Owl_io.marshal_from_file] *)
let load fname : t = Owl_io.marshal_from_file fname

(* write one "word index" line per entry to [fname], sorted by word; the
   channel is closed on the error path too, then the exception is re-raised *)
let save_txt d fname =
  let fh = open_out fname in
  try
    w2i_to_tuples d
    |> List.fast_sort (fun x y -> String.compare (fst x) (fst y))
    |> List.iter (fun (w, i) -> output_string fh (Printf.sprintf "%s %i\n" w i));
    close_out fh
  with
  | exn ->
    close_out_noerr fh;
    raise exn


(* short summary: the vocabulary length followed by its most frequent words
   (at most 5, fewer when the vocabulary is smaller) *)
let to_string x =
  let k = min 5 (length x) in
  let topk =
    top x k
    |> Array.fold_left (fun acc (s, i) -> Printf.sprintf "%s(%s,%i) " acc s i) ""
  in
  Printf.sprintf "length: %i; tokens:%s..." (length x) topk


(* pretty printer; the box is opened and closed on [formatter] itself *)
let pp_vocab formatter x =
  Format.pp_open_box formatter 0;
  Format.fprintf formatter "%s" (to_string x);
  Format.pp_close_box formatter ()


(* return the (index, word) pairs of the vocabulary as an array *)
let to_array x =
  let s = Owl_utils.Stack.make () in
  Hashtbl.iter (fun i w -> Owl_utils.Stack.push s (i, w)) x.i2w;
  Owl_utils.Stack.to_array s


(* build a vocabulary from an array of (index, word) pairs; every word is
   given frequency 1 *)
let of_array x =
  let n = Array.length x in
  let w2i = Hashtbl.create n in
  let i2w = Hashtbl.create n in
  let i2f = Hashtbl.create n in
  Array.iter
    (fun (i, w) ->
      Hashtbl.add w2i w i;
      Hashtbl.add i2w i w;
      Hashtbl.add i2f i 1)
    x;
  { w2i; i2w; i2f }

(* ends here *)