123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141(* Weighted Minwise Hashing in (amortized) Constant Time
Shrivastava, A. (2016).
Simple and efficient weighted minwise hashing.
In Advances in Neural Information Processing Systems (pp. 1498-1506). *)openPrintfmoduleA=BatArraymoduleBA=BigarraymoduleBA1=BA.Array1moduleFp=FingerprintmoduleL=BatListtypedense=(int,BA.int8_unsigned_elt,BA.c_layout)BA1.ttypehashed=intarrayletget_seedsk=letglobal_seed=314159265inletrng=Random.State.make[|global_seed|]inletbound=(BatInt.pow230)-1inArray.initk(fun_->Random.State.intrngbound)(* convert the sparse Fp.t type into a dense array of small positive ints *)letto_dense(feat_id_bound:int)(fp:Fp.t):dense=letres=BA1.createBA.int8_unsignedBA.C_layoutfeat_id_boundinBA1.fillres0;letn=BA1.dimfpinleti=ref0inwhile!i<ndoletk=BA1.getfp!iinletv=BA1.getfp(!i+1)inassert(k<feat_id_bound&&v<256);BA1.setreskv;i:=!i+2done;resletstring_of_densex=letn=BA1.dimxinletbuff=Buffer.create80infori=0ton-1doletj=BA1.getxiinbprintfbuff" %d:%d"ijdone;Buffer.contentsbuff(* read the sparse fingerprints, update feat. val. bounds
* if necessary *)letupdate_bounds(bounds:intarray)(fp:Fp.t):unit=letn=BA1.dimfpinleti=ref0inwhile!i<ndoletk=BA1.getfp!iinletv=BA1.getfp(!i+1)inbounds.(k)<-max(bounds.(k))v;i:=!i+2done(* compute the max value for each feature. *)(* I.e. the columns' maximum if we put observations as rows
* and features as columns in a data matrix *)letbounds(max_feat_id:int)(train:Fp.tarray):intarray=letbounds=A.makemax_feat_id0inA.iter(update_boundsbounds)train;bounds(* create a lookup table from the bounds (max feature values) so that we
can draw a single rand but still know which feature id. it corresponds to *)letlookup_table(bounds:intarray):intarray=lettotal=A.sumboundsinletres=A.createtotal0inletj=ref0inA.iteri(funibound->for_=1tobounddores.(!j)<-i;incrjdone)bounds;resletacc_bounds_table(bounds:intarray):intarray=letn=A.lengthboundsinletres=A.createn0inletacc=ref0inA.iteri(funibound->res.(i)<-!acc;acc:=!acc+bound)bounds;res(* (\* in the paper, he defines is_green; but he samples until is_green becomes
* * true. It is more natural to sample while is_red *\)
* let is_red (arr: dense) (test_feat_id: int) (test_feat_val: int): bool =
* test_feat_val >= (BA1.unsafe_get arr test_feat_id) *)(* pre-generate non-repeating random number sequences for later *)letgen_randsseedsrand_bound=A.map(funseed->letrng=Random.State.make[|seed|]in(* all ints we are interested into *)letints=A.initrand_bound(funi->i)inA.shuffle~state:rngints;(* random permute them *)ints)seeds(* compute k hashes *)lethashpregen_randsidx2featfeat2acc_bound(dense_fp:dense):hashed=letk=A.lengthpregen_randsinletres=A.makek0infori=0tok-1doletmisses=ref0inletj=ref0inletrands=pregen_rands.(i)inletrand'=rands.(!j)inlettest_feat_id=ref(idx2feat.(rand'))inlettest_feat_val=ref(rand'-feat2acc_bound.(!test_feat_id))in(* while is_red ... *)while!test_feat_val>=(BA1.unsafe_getdense_fp!test_feat_id)doincrmisses;(* in the paper: Hashes[i]++ *)incrj;letrand=rands.(!j)intest_feat_id:=idx2feat.(rand);test_feat_val:=rand-feat2acc_bound.(!test_feat_id)done;A.unsafe_setresi!missesdone;resletestimate_jaccard(hash1:hashed)(hash2:hashed):float=letres=ref0inletk=A.lengthhash1infori=0tok-1doif(A.unsafe_gethash1i)=(A.unsafe_gethash2i)thenincrresdone;(float!res)/.(floatk)letestimate_distance(h1:hashed)(h2:hashed):float=1.0-.(estimate_jaccardh1h2)