123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185letsqrx=x*.xletfirst_setbitn=(* Reverse mapping of B(2,6)
See http://chessprogramming.wikispaces.com/De+Bruijn+sequence *)letdb26="\x00\x01\x02\x35\x03\x07\x36\x1b\x04\x26\x29\x08\x22\x37\x30\x1c\x3e\x05\x27\x2e\x2c\x2a\x16\x09\x18\x23\x3b\x38\x31\x12\x1d\x0b\x3f\x34\x06\x1a\x25\x28\x21\x2f\x3d\x2d\x2b\x15\x17\x3a\x11\x0a\x33\x19\x24\x20\x3c\x14\x39\x10\x32\x1f\x13\x0f\x1e\x0e\x0d\x0c"in(* Isolate lsb
See http://aggregate.org/MAGIC/#Least%20Significant%201%20Bit *)letn=Int64.logandn(Int64.negn)in(* Get index in B(2,6) *)letn=Int64.muln0x022fdd63cc95386dLinletn=Int64.shift_right_logicaln58inChar.codedb26.[Int64.to_intn]typet=bytes(** Building a new hll *)letvalidatet=(1lslChar.code(Bytes.gett0)+1=Bytes.lengtht)letestimate_memory~error=letp=int_of_float(ceil(log(sqr(1.04/.error))))in(1lslp)letmake~error=assert(0.<error&&error<1.);letp=int_of_float(ceil(log(sqr(1.04/.error))))inlett=Bytes.make(1lslp+1)'\000'inBytes.sett0(Char.chrp);assert(validatet);tletcleart=Bytes.fillt1(Bytes.lengtht-1)'\000';assert(validatet)(** Adding an element to the hll *)letget_rhow=ifw=0Lthen64else1+first_setbitwletaddtx=letp=Char.code(Bytes.gett0)inletm=1lslpinletj=Int64.to_intxland(m-1)+1inletw=Int64.shift_right_logicalxpinBytes.settj(Char.chr(max(Char.code(Bytes.gettj))(get_rhow)))(* assert (validate t): micro benchmark shows that validating in an add loop
has a 10% overhead, not necessary. *)(** Merging and copying hlls *)letcopyt=Bytes.copytletmerge~into:tt'=letlength=Bytes.lengthtiniflength<>Bytes.lengtht'theninvalid_arg"update: counters precision should be equal";fori=1tolength-1doBytes.setti(max(Bytes.getti)(Bytes.gett'i))done;assert(validatet)(** Estimating cardinality, HyperLogLog *)letcount_nullst=letnulls=ref0infori=1toBytes.lengtht-1doifBytes.getti='\000'thenincrnullsdone;!nullsletget_alpha=function|pwhennot(4<=p&&p<=16)->assertfalse|4->0.673|5->0.697|6->0.709|p->0.7213/.(1.0+.1.079/.float(1lslp))lethll_estimationprecisiont=letp=Char.code(Bytes.gett0)inletm=1lslpinletsum=ref0.infori=1tomdosum:=!sum+.2.**float(-min(precision-p)(Char.code(Bytes.getti)))done;get_alphap*.sqr(floatm)/.!sumletlinear_countingmnulls=letm=floatmandnulls=floatnullsin(m*.log(m/.nulls))letcard_hllt=lete=hll_estimation32tinletp=Char.code(Bytes.gett0)inletm=1lslpinife<=(5.0/.2.0)*.floatmthen((* Small range *)matchcount_nullstwith|0->e|nulls->linear_countingmnulls)elseife<=(2.0**32.0)/.30.0then((* Normal range *)e)else((* Large range *)(-.(2.0**32.0)*.log(1.0-.e/.(2.0**32.0))))(** Estimating cardinality, HyperLogLog++ *)letget_thresholdp=Hll_consts.threshold.(p-4)letget_nearest_neighborsevec=letdistance=Array.mapi(funidxv->sqr(e-.v),idx)vecinArray.sort(fun((a:float),_)(b,_)->compareab)distance;Array.init6(funi->let_,idx=distance.(i)inidx)letestimate_biasep=letbias_vector=Hll_consts.bias_data.(p-4)inletnearest_neighbors=get_nearest_neighborseHll_consts.raw_estimated_data.(p-4)inletsum=ref0.infori=0toArray.lengthnearest_neighbors-1dosum:=!sum+.bias_vector.(nearest_neighbors.(i))done;!sum/.float(Array.lengthnearest_neighbors)letept=letp=Char.code(Bytes.gett0)inletm=float(1lslp)inlete=hll_estimation64tinife<=5.*.mthene-.estimate_biasepelseeletcard_hllppt=assert(validatet);letp=Char.code(Bytes.gett0)inletm=(1lslp)inmatchcount_nullstwith|0->ept|nulls->leth=linear_countingmnullsinifh<=get_thresholdpthenhelseeptletcard=card_hllpp(* Thomas Wang 64-bit integer hashing *)lethash_int64key=letopenInt64inlet(lsr)=shift_right_logicalinlet(lsl)=shift_leftinletnot=lognotinletxor=logxorinletkey=add(notkey)(keylsl21)inletkey=xorkey(keylsr24)inletkey=add(addkey(keylsl3))(keylsl8)inletkey=xorkey(keylsr14)inletkey=add(addkey(keylsl2))(keylsl4)inletkey=xorkey(keylsr28)inletkey=addkey(keylsl31)inkeyletto_stringt=assert(1lslChar.code(Bytes.gett0)+1=Bytes.lengtht);Bytes.to_stringtletof_strings=lett=Bytes.of_stringsin(* t.[0] = 1 lsl length s + 1.
Also, it as to be small, so higher bits must be null and could be used to
store versioning information in the future. *)ifnot(validatet)thenraise(Invalid_argument"Hll.of_string");t