123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809# 1 "src/owl/stats/owl_stats.ml"(*
* OWL - OCaml Scientific Computing
* Copyright (c) 2016-2022 Liang Wang <liang@ocaml.xyz>
*)[@@@warning"-32"](** Random numbers and distributions *)includeOwl_stats_dist(* Randomisation function *)letshufflex=lety=Array.copyxinOwl_stats_extend.shuffley;yletchoosexk=assert(Array.lengthx>=k);lety=Array.makekx.(0)inOwl_stats_extend.choose~src:x~dst:y;yletsamplexk=lety=Array.makekx.(0)inOwl_stats_extend.sample~src:x~dst:y;y(* Basic statistical functions *)letsumx=Owl_stats_extend.sumxletmeanx=Owl_stats_extend.meanxlet_get_meanmx=matchmwith|Somea->a|None->meanxletvar?meanx=Owl_stats_extend.varx(_get_meanmeanx)letstd?meanx=Owl_stats_extend.stdx(_get_meanmeanx)letsem?meanx=lets=std?meanxinletn=float_of_int(Array.lengthx)ins/.sqrtnletabsdev?meanx=Owl_stats_extend.absdevx(_get_meanmeanx)letskew?mean?sdx=letm=_get_meanmeanxinlets=matchsdwith|Somea->a|None->std~mean:mxinOwl_stats_extend.skewxmsletkurtosis?mean?sdx=letm=_get_meanmeanxinlets=matchsdwith|Somea->a|None->std~mean:mxinOwl_stats_extend.kurtosisxms(* TODO: move to C code *)letcentral_moment=Owl_base_stats.central_momentletcorrcoefx0x1=assert(Array.(lengthx0=lengthx1));Owl_stats_extend.corrcoefx0x1(* TODO: optimise *)letsort=Owl_base_stats.sortletargsort=Owl_base_stats.argsortlet_resolve_tiesnextd=function|`Average->float_of_intnext-.(float_of_intd/.2.)|`Min->float_of_int(next-d)|`Max->float_of_intnextletrank?(ties_strategy=`Average)vs=letn=Array.lengthvsinletorder=argsortvsinletranks=Array.maken0.inletd=ref0infori=0ton-1doifi==n-1||comparevs.(order.(i))vs.(order.(i+1))<>0then(lettie_rank=_resolve_ties(i+1)!dties_strategyinforj=i-!dtoidoranks.(order.(j))<-tie_rankdone;d:=0)elseincrd(* Found a duplicate! *)done;ranksletautocorrelation?(lag=1)x=letn=Array.lengthxinlety=meanxinleta=ref0.infori=0ton-lag-1doa:=!a+.((x.(i)-.y)*.(x.(i+lag)-.y))done;letb=ref0.infori=0ton-1dob:=!b+.((x.(i)-.y)**2.)done;!a/.!bletcov?m0?m1x0x1=assert(Array.(lengthx0=lengthx1));letm0=_get_meanm0x0inletm1=_get_meanm1x1inOwl_stats_extend.covx0x1m0m1letconcordant=Owl_base_stats.concordantletdiscordant=Owl_base_stats.discordantletkendall_tau=Owl_base_stats.kendall_tauletspearman_rhox0x1=letr0=rankx0inletr1=rankx1inleta=covr0r1inletb=stdr0*.stdr1ina/.bletminmax_i=Owl_base_stats.minmax_iletmin_i=Owl_base_stats.min_iletmax_i=Owl_base_stats.max_iletmin=Owl_base_stats.minletmax=Owl_base_stats.maxletminmax=Owl_base_stats.minmaxtypehistogram=Owl_base_stats.histogramlethistogram=Owl_base_stats.histogramlethistogram_sorted=Owl_base_stats.histogram_sortedletnormalise=Owl_base_stats.normaliseletnormalise_density=Owl_base_stats.normalise_densityletpp_hist=Owl_base_stats.pp_histletecdfx=ifArray.existsFloat.is_nanxthenraise(Invalid_argument"Owl_stats.ecdf: nan values in the array x are not supported.");letx=sort~inc:truexinletn=Array.lengthxinletm=float_of_intninlety=ref[||]inletf=ref[||]inleti=ref0inletc=ref0.inwhile!i<ndoletj=ref!iinwhile!j<n&&x.(!i)=x.(!j)doc:=!c+.1.;j:=!j+1done;y:=Array.append!y[|x.(!i)|];f:=Array.append!f[|!c/.m|];i:=!jdone;!y,!fletquantilex=letx=sort~inc:truexinfunp->ifp<0.||p>1.thenraise(Invalid_argument"Owl_stats.quantile: expected float between 0 and 1")elseOwl_stats_extend.quantilexpletpercentilexp=quantilex(p/.100.)letmedianx=percentilex50.letfirst_quartilex=percentilex25.letthird_quartilex=percentilex75.letinterquartile=Owl_base_stats.interquartileletz_score~mu~sigmax=Array.map(funy->(y-.mu)/.sigma)xlett_scorex=letmu=meanxinletsigma=stdxinz_score~mu~sigmaxletnormalise_pdfx=letc=Owl_stats_extend.sumxinArray.map(funx->x/.c)x(* TODO *)letcenterise_x=Noneletstandarderise_x=Noneletksdensity_x=None(* Hypothesis tests *)typetail=|BothSide|RightSide|LeftSidetypehypothesis={reject:bool;p_value:float;score:float}letmake_hypothesisrejectp_valuescore={reject;p_value;score}letpp_hypothesisformatterhypothesis=lets=Printf.sprintf{| reject : %b
p value : %f
score : %f|}hypothesis.rejecthypothesis.p_valuehypothesis.scoreinFormat.open_box0;Format.fprintfformatter"%s"s;Format.close_box()letz_test~mu~sigma?(alpha=0.05)?(side=BothSide)x=letn=float_of_int(Array.lengthx)inletz=(meanx-.mu)*.sqrtn/.sigmainletpl=gaussian_cdf~mu:0.~sigma:1.zinletpr=gaussian_sf~mu:0.~sigma:1.zinletp=matchsidewith|LeftSide->pl|RightSide->pr|BothSide->min[|pl;pr|]*.2.inleth=alpha>pinmake_hypothesishpzlett_test~mu?(alpha=0.05)?(side=BothSide)x=letn=float_of_int(Array.lengthx)inletm=meanxinlets=std~mean:mxinlett=(m-.mu)*.sqrtn/.sinletpl=t_cdf~df:(n-.1.)~loc:0.~scale:1.tinletpr=t_sf~df:(n-.1.)~loc:0.~scale:1.tinletp=matchsidewith|LeftSide->pl|RightSide->pr|BothSide->min[|pl;pr|]*.2.inleth=alpha>pinmake_hypothesishptlett_test_paired?(alpha=0.05)?(side=BothSide)xy=letnx=float_of_int(Array.lengthx)inletny=float_of_int(Array.lengthy)inlet_=ifnx<>nythenfailwith"the sizes of two samples does not equal."inletd=Owl_utils.Array.map2i(fun_ab->a-.b)xyinletm=Owl_stats_extend.sumd/.nxinlett=m/.sem~mean:mdinletpl=t_cdf~df:(nx-.1.)~loc:0.~scale:1.tinletpr=t_sf~df:(nx-.1.)~loc:0.~scale:1.tinletp=matchsidewith|LeftSide->pl|RightSide->pr|BothSide->min[|pl;pr|]*.2.inleth=alpha>pinmake_hypothesishptlet_t_test2_equal_var~alpha~sidexy=letnx=float_of_int(Array.lengthx)inletny=float_of_int(Array.lengthy)inletxm=meanxinletym=meanyinletxs=stdxinletys=stdyinletv=nx+.ny-.2.inlett=(xm-.ym)/.sqrt(((xs**2.)/.nx)+.((ys**2.)/.ny))inletpl=t_cdf~df:v~loc:0.~scale:1.tinletpr=t_sf~df:v~loc:0.~scale:1.tinletp=matchsidewith|LeftSide->pl|RightSide->pr|BothSide->min[|pl;pr|]*.2.inleth=alpha>pinmake_hypothesishptlet_t_test2_welche~alpha~sidexy=letnx=float_of_int(Array.lengthx)inletny=float_of_int(Array.lengthy)inletxm=meanxinletym=meanyinletxs=stdxinletys=stdyinletvx=nx-.1.inletvy=ny-.1.inletv=((((xs**2.)/.nx)+.((ys**2.)/.ny))**2.)/.(((xs**4.)/.(vx*.(nx**2.)))+.((ys**4.)/.(vy*.(ny**2.))))inlett=(xm-.ym)/.sqrt(((xs**2.)/.nx)+.((ys**2.)/.ny))inletpl=t_cdf~df:v~loc:0.~scale:1.tinletpr=t_sf~df:v~loc:0.~scale:1.tinletp=matchsidewith|LeftSide->pl|RightSide->pr|BothSide->min[|pl;pr|]*.2.inleth=alpha>pinmake_hypothesishptlett_test_unpaired?(alpha=0.05)?(side=BothSide)?(equal_var=true)xy=matchequal_varwith|true->_t_test2_equal_var~alpha~sidexy|false->_t_test2_welche~alpha~sidexyletsmirnovne=letnn=int_of_float(floor(float_of_intn*.(1.-.e)))inletrechelpersumvc=letevn=e+.(float_of_intv/.float_of_intn)inletsum'=sum+.(c*.(evn**float_of_int(v-1))*.((1.-.evn)**float_of_int(n-v)))inletc'=c*.float_of_int(n-v)/.float_of_int(v+1)inifv<=nnthenhelpersum'(v+1)c'elsesuminlethelper2()=letmaxlog=logmax_floatinletlngamma=Owl_maths_special.loggammainletlgamnp1=lngamma(1.+.float_of_intn)inletrechelper3sumv=letevn=e+.(float_of_intv/.float_of_intn)inletomevn=1.-.evninlett=lgamnp1-.lngamma(1.+.float_of_intv)-.lngamma(1.+.float_of_int(n+v))inletsum'=sum+.exptinifv<=nnthenifabs_floatomevn>0.&&t>~-.maxlogthenhelper3sum'(v+1)elsehelper3sum(v+1)elsesuminhelper30.0inifnot(n>0&&e>=0.&&e<=1.)thennanelseife=0.0then1.0elseifn<1013thene*.helper0.01.elsee*.helper2()letkolmogorovy=letx=-2.*.y*.yinletrechelpersignsumr=lett=exp(x*.r*.r)inletsum'=sum+.(sign*.t)inletr'=r+.1.inletsign'=~-.signinift=0.0||t/.sum'<=1.1e-16thensum'elsehelpersign'sum'r'inify<1.1e-16then1.0else2.*.helper1.0.1.letks_test?(alpha=0.05)xf=letx'=sortxinletmaxpq=ifp>qthenpelseqinletn=Array.lengthx'inletnn=float_of_intninletfvals=Array.mapfx'inletg1iv=v-.(float_of_inti/.nn)inletg2iv=(float_of_int(i+1)/.nn)-.vinletd1=Array.fold_leftmax0.(Array.mapig1fvals)inletd2=Array.fold_leftmax0.(Array.mapig2fvals)inletd=maxd1d2inletpval=2.*.smirnovndinletpval2=kolmogorov(d*.sqrtnn)inifn=0thenraiseOwl_exception.EMPTY_ARRAYelseifn>2666||pval2>0.8-.(nn*.0.003)thenmake_hypothesis(pval2<alpha)pval2delsemake_hypothesis(pval<alpha)pvaldletrecuniquesl=matchlwith|[]->[]|[x]->[x]|x1::x2::xs->ifx1=x2thenuniques(x2::xs)elsex1::uniques(x2::xs)(* Compute the empirical CDF of a list of samples from the input
domain (sorted list of floats). The output is a list of length
equal to domain. Both inputs are assumed to be sorted. *)letempCdfdomainsamples=letreccountxsamples=matchsampleswith|[]->0,samples|y::ys->ifx=ythen(letn,rest=countxysinn+1,rest)else0,samplesinletrecaggregateaccumdomainsamples=matchdomainwith|[]->[]|x::xs->letp,rest=countxsamplesinletaccum'=accum+pinaccum'::aggregateaccum'xsrestinletn=float_of_int(List.lengthsamples)inleta=aggregate0domainsamplesinList.map(funx->float_of_intx/.n)aletks2_test?(alpha=0.05)xy=letn1=Array.lengthxinletn2=Array.lengthyinifn1=0||n2=0thenraiseOwl_exception.EMPTY_ARRAYelse(letnn1=float_of_intn1inletnn2=float_of_intn2inletx'=Array.to_list(sortx)inlety'=Array.to_list(sorty)inletdomain=uniques(Array.to_list(sort(Array.concat[x;y])))inletxCdf=empCdfdomainx'inletyCdf=empCdfdomainy'inletdiffs=List.map2(funpq->abs_float(p-.q))xCdfyCdfinletmaxpq=ifp>qthenpelseqinletd=List.fold_leftmax0.diffsinleten=sqrt(nn1*.nn2/.(nn1+.nn2))inletpval=kolmogorov((en+.0.12+.(0.11/.en))*.d)inmake_hypothesis(pval<alpha)pvald)letad_test_x=None(* Anderson-Darling test *)letdw_test_x=None(* Durbin-Watson test *)letjb_test?(alpha=0.05)x=(* Jarque-Bera test *)letn=float_of_int(Array.lengthx)inlets=skewxinletk=kurtosisxinletj=n/.6.*.((s**2.)+.(((k-.3.)**2.)/.4.))inletp=chi2_sf~df:2.jinleth=alpha>pinmake_hypothesishpjletvar_test?(alpha=0.05)?(side=BothSide)~variancex=letn=float_of_int(Array.lengthx)inletv=n-.1.inletk=v*.varx/.varianceinletpl=chi2_cdf~df:vkinletpr=chi2_sf~df:vkinletp=matchsidewith|LeftSide->pl|RightSide->pr|BothSide->min[|pl;pr|]*.2.inleth=alpha>pinmake_hypothesishpkletfisher_test?(alpha=0.05)?(side=BothSide)abcd=letcdf?(max_prob=1.)kn1n2t=letleft=Stdlib.max0(t-n2)inletright=matchmax_probwith|1.->k|_->Stdlib.minn1tinleteps=0.000000001inOwl_utils.range_foldleftright~f:(funaccx->letp=hypergeometric_pdf~good:n1~bad:n2~sample:txinifp<max_prob||abs_float(p-.max_prob)<epsthenacc+.pelseacc)~init:0.0in(* let n = a + b + c + d in *)letprob=hypergeometric_pdf~good:(a+b)~bad:(c+d)~sample:(a+c)ainletoddsratio=float_of_inta*.float_of_intd/.(float_of_intb*.float_of_intc)inletp=matchsidewith|BothSide->cdfa(a+b)(c+d)(a+c)~max_prob:prob|RightSide->cdfb(b+a)(c+d)(b+d)|LeftSide->cdfa(a+b)(c+d)(a+c)inleth=alpha>pinmake_hypothesishpoddsratioletlillie_test_x=None(* Lilliefors test *)lettiecorrectrankvals=letranks_sort=sortrankvalsinletcounts=Owl_utils.count_dup(Array.to_listranks_sort)inletsize=float_of_int(Array.lengthrankvals)inletnumerator=Array.fold_left(+)0(Array.of_list(List.map(fun(_x,y)->(y*y*y)-y)counts))inmatchsizewith|0.0->1.0|1.0->1.0|_->1.0-.(float_of_intnumerator/.((size*.size*.size)-.size))(* Mann–Whitney U test *)letmannwhitneyu?(alpha=0.05)?(side=BothSide)xy=letrecexact_aunm=ifu<0.then0.elseifu>=m*.(n-.m)thenfloat_of_int(Owl_maths.combination(int_of_floatn)(int_of_floatm))elseifm=1.||n-.m=1.thenu+.1.elseexact_au(n-.1.)m+.exact_a(u-.(n-.m))(n-.1.)(m-.1.)inletn1=float_of_int(Array.lengthx)inletn2=float_of_int(Array.lengthy)inletranked=rank(Array.appendxy)inletrankx=Array.fold_left(+.)0.0(Array.subranked0(int_of_floatn1))inletu1=(n1*.n2)+.(n1*.(n1+.1.0)/.2.0)-.rankxinletu2=(n1*.n2)-.u1inletasymptotic_v=lett=tiecorrectrankedinletsd=sqrt(t*.n1*.n2*.(n1+.n2+.1.0)/.12.0)inletmean=n1*.n2/.2.0inletbigu=matchsidewith|BothSide->Stdlib.maxu1u2|RightSide->u2|LeftSide->u1inletz=(bigu-.mean)/.sdinletp=matchsidewith|BothSide->2.0*.gaussian_sf~mu:0.~sigma:1.(abs_floatz)|_->gaussian_sf~mu:0.~sigma:1.zinleth=alpha>pinmake_hypothesishpu2inletexact_v=letbigu=matchsidewith|BothSide->Stdlib.minu1u2|RightSide->u1|LeftSide->u2inletp=letn=n1+.n2inletk=ifn1<n2thenn2elsen1inletz=exact_abigu(n1+.n2)k/.float_of_int(Owl_maths.combination(int_of_floatn)(int_of_floatk))inmatchsidewith|BothSide->2.*.z|_->zinleth=alpha>pinmake_hypothesishpu2inifmaxranked=n1+.n2&&max[|n1;n2|]<10.thenexact1elseasymptotic1(* wilcoxon paired *)letwilcoxon?(alpha=0.05)?(side=BothSide)xy=letd=Array.map2(funab->a-.b)xyinletd=Owl_utils.Array.filter(funa->a<>0.)dinletn=float_of_int(Array.lengthd)inletrankval=rank(Array.mapabs_floatd)inletrp=Array.map2(funab->(ifa>0.0then1.else0.)*.b)drankvalinletrm=Array.map2(funab->(ifa<0.0then1.else0.)*.b)drankvalinletrp=Array.fold_left(+.)0.rpinletrm=Array.fold_left(+.)0.rminlett=Stdlib.minrprminletasymptotic_v=letmn=n*.(n+.1.)*.0.25inletse=n*.(n+.1.)*.((2.*.n)+.1.)inlett_correctionrankvals=letranks_sort=sortrankvalsinletcounts=Owl_utils.count_dup(Array.to_listranks_sort)in(* let size = (float_of_int (Array.length rankvals)) in *)Array.fold_left(+)0(Array.of_list(List.map(fun(_x,y)->(y*y*y)-y)counts))inletcorr=float_of_int(t_correctionrankval)inletse=sqrt((se-.(0.5*.corr))/.24.)inletz=(t-.mn)/.seinletp=2.0*.gaussian_sf~mu:0.~sigma:1.(abs_floatz)inmatchsidewith|BothSide->p|RightSide->1.-.(p/.2.)|LeftSide->p/.2.inletexactv=letrecfwn=ifw=n*.(n+.1.)/.2.||(w=0.&&n>=0.)then1.elseif(w<0.&&n>0.)||(w>0.&&n=0.)||n<0.then0.elsefw(n-.1.)+.f(w-.n)(n-.1.)inletn1=float_of_int(Array.lengthx)inletv=matchsidewith|RightSide->v-.1.|_->vinletp=ifv<0.then0.elseArray.fold_left(+.)0.(Owl_utils.Array.map(funi->f(float_of_inti)n1)(Owl_utils.Array.range0(int_of_floatv)))inmatchsidewith|BothSide->2.*.p/.(2.**n1)|RightSide->1.-.(p/.(2.**n1))|LeftSide->p/.(2.**n1)inletp=ifArray.lengthd=Array.lengthx&&n<10.thenexacttelseasymptotic1inleth=alpha>pinmake_hypothesishptletruns_test?(alpha=0.05)?(side=BothSide)?vx=(* Run test for randomness *)letv=matchvwith|Somev->v|None->medianxinletn1,n2=ref0.,ref0.inletz=ref[||]inlet_=Array.iter(funy->ify>vthen(n1:=!n1+.1.;z:=Array.append!z[|1|])elseify<vthen(n2:=!n2+.1.;z:=Array.append!z[|-1|]))xinletr0=ref1.inlet_=fori=0toArray.length!z-2domatch!z.(i)*!z.(i+1)<0with|true->r0:=!r0+.1.|false->()doneinletaa=2.*.!n1*.!n2inletbb=!n1+.!n2inletr1=(aa/.bb)+.1.inletsr=aa*.(aa-.bb)/.(bb*.bb*.(bb-.1.))inletz=(!r0-.r1)/.sqrtsrinletpl=gaussian_cdf~mu:0.~sigma:1.zinletpr=gaussian_sf~mu:0.~sigma:1.zinletp=matchsidewith|LeftSide->pl|RightSide->pr|BothSide->min[|pl;pr|]*.2.inleth=alpha>pinmake_hypothesishpzletcrosstab_x=None(* Cross-tabulation *)(* MCMC: Metropolis and Gibbs sampling *)letmetropolis_hastingsfpn=letstepsize=0.1in(* be careful about step size, try 0.01 *)leta,b=1000,10inlets=Array.makenpinfori=0toa+(b*n)-1doletp'=Array.map(funx->gaussian_rvs~mu:0.~sigma:stepsize+.x)pinlety,y'=fp,fp'inletp'=ify'>=ythenp'elseifstd_uniform_rvs()<y'/.ythenp'elseArray.copypinArray.iteri(funix->p.(i)<-x)p';ifi>=a&&imodb=0thens.((i-a)/b)<-Array.copypdone;sletgibbs_samplingfpn=leta,b=1000,10inletm=a+(b*n)inlets=Array.makenpinletc=Array.lengthpinfori=1tom-1doforj=0toc-1dop.(j)<-fpjdone;ifi>=a&&imodb=0thens.((i-a)/b)<-Array.copypdone;slettukey_fences?(k=1.5)arr=letfirst_quartile=first_quartilearrinletthird_quartile=third_quartilearrinletoffset=k*.(third_quartile-.first_quartile)infirst_quartile-.offset,third_quartile+.offsetletgaussian_kde=Owl_base_stats.gaussian_kdelet_=(* init the internal state of PRNG *)Owl_stats_prng.self_init()(* ends here *)