123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641# 1 "src/base/compute/owl_computation_optimiser.ml"(*
* OWL - OCaml Scientific and Engineering Computing
* Copyright (c) 2016-2020 Liang Wang <liang.wang@cl.cam.ac.uk>
*)openOwl_graph(* Functor of making a Lazy engine to execute a computation graph. *)moduleMake(Operator:Owl_computation_operator_sig.Sig)=structmoduleOperator=OperatoropenOperator.SymbolopenOperator.Symbol.Shape.TypeopenOperator.Symbol.Shape.Type.Deviceletrec_optimise_termx=Owl_log.debug"optimise %s ..."(node_to_strx);ifis_validx=falsethen((matchget_operatorxwith|Noop->pattern_003x|Var->()|Const->pattern_000x|Empty_shape->pattern_000x|Zeros_shape->pattern_000x|Ones_shape->pattern_000x|Create_shape->pattern_000x|Sequential_shape->pattern_000x|Uniform_shape->pattern_000x|Gaussian_shape->pattern_000x|Bernoulli_shape->pattern_000x|Init(_shape,_f)->pattern_000x|Get_i->pattern_000x|Set_i->pattern_000x|GetSlice_slice->pattern_000x|SetSlice_slice->pattern_000x|Copy->pattern_018x|Reset->pattern_000x|Reshape_shape->pattern_022x|Reverse->pattern_000x|Tile_repeats->pattern_000x|Repeat_repeats->pattern_023x|Pad(_v,_padding)->pattern_000x|Concatenate_axis->pattern_000x|Split(_axis,_parts)->pattern_000x|Draw(_axis,_n)->pattern_000x|Map_f->pattern_000x|Fold(_axis,_f)->pattern_000x|Scan(_axis,_f)->pattern_000x|OneHot_depth->pattern_000x|Delay_f->pattern_000x|DelayArray(_shape,_f)->pattern_000x|LazyPrint(_max_row,_max_col,_header,_fmt)->pattern_000x|Abs->pattern_000x|Neg->pattern_000x|Floor->pattern_000x|Ceil->pattern_000x|Round->pattern_000x|Sqr->pattern_000x|Sqrt->pattern_000x|Log->pattern_000x|Log2->pattern_000x|Log10->pattern_000x|Exp->pattern_000x|Sin->pattern_000x|Cos->pattern_000x|Tan->pattern_000x|Sinh->pattern_000x|Cosh->pattern_000x|Tanh->pattern_000x|Asin->pattern_000x|Acos->pattern_000x|Atan->pattern_000x|Asinh->pattern_000x|Acosh->pattern_000x|Atanh->pattern_000x|Min_axis->pattern_000x|Max_axis->pattern_000x|Sum_axis->pattern_000x|SumReduce_axis->pattern_024x|Signum->pattern_000x|Sigmoid->pattern_000x|Relu->pattern_000x|Min'->pattern_000x|Max'->pattern_000x|Sum'->pattern_000x|L1norm'->pattern_000x|L2norm'->pattern_000x|L2NormSqr'->pattern_000x|ClipByValue->pattern_000x|ClipByL2norm->pattern_000x|Pow->pattern_000x|ScalarPow->pattern_000x|PowScalar->pattern_000x|Atan2->pattern_000x|ScalarAtan2->pattern_000x|Atan2Scalar->pattern_000x|Add->pattern_001x|Sub->pattern_000x|Mul->pattern_019x|Div->pattern_007x|AddScalar->pattern_015x|SubScalar->pattern_000x|MulScalar->pattern_000x|DivScalar->pattern_000x|ScalarAdd->pattern_000x|ScalarSub->pattern_000x|ScalarMul->pattern_014x|ScalarDiv->pattern_017x|EltEqual->pattern_000x|EltNotEqual->pattern_000x|EltLess->pattern_000x|EltGreater->pattern_000x|EltLessEqual->pattern_000x|EltGreaterEqual->pattern_000x|EltEqualScalar->pattern_000x|EltNotEqualScalar->pattern_000x|EltLessScalar->pattern_000x|EltGreaterScalar->pattern_000x|EltLessEqualScalar->pattern_000x|EltGreaterEqualScalar->pattern_000x|Conv1d(_padding,_stride)->pattern_000x|Conv2d(_padding,_stride)->pattern_000x|Conv3d(_padding,_stride)->pattern_000x|TransposeConv1d(_padding,_stride)->pattern_000x|TransposeConv2d(_padding,_stride)->pattern_000x|TransposeConv3d(_padding,_stride)->pattern_000x|DilatedConv1d(_padding,_stride,_rate)->pattern_000x|DilatedConv2d(_padding,_stride,_rate)->pattern_000x|DilatedConv3d(_padding,_stride,_rate)->pattern_000x|MaxPool1d(_padding,_kernel,_stride)->pattern_000x|MaxPool2d(_padding,_kernel,_stride)->pattern_000x|MaxPool3d(_padding,_kernel,_stride)->pattern_000x|AvgPool1d(_padding,_kernel,_stride)->pattern_000x|AvgPool2d(_padding,_kernel,_stride)->pattern_000x|AvgPool3d(_padding,_kernel,_stride)->pattern_000x|UpSampling2d_size->pattern_000x|Conv1dBackwardInput_stride->pattern_000x|Conv1dBackwardKernel_stride->pattern_000x|Conv2dBackwardInput_stride->pattern_000x|Conv2dBackwardKernel_stride->pattern_000x|Conv3dBackwardInput_stride->pattern_000x|Conv3dBackwardKernel_stride->pattern_000x|TransposeConv1dBackwardInput_stride->pattern_000x|TransposeConv1dBackwardKernel_stride->pattern_000x|TransposeConv2dBackwardInput_stride->pattern_000x|TransposeConv2dBackwardKernel_stride->pattern_000x|TransposeConv3dBackwardInput_stride->pattern_000x|TransposeConv3dBackwardKernel_stride->pattern_000x|DilatedConv1dBackwardInput(_stride,_rate)->pattern_000x|DilatedConv1dBackwardKernel(_stride,_rate)->pattern_000x|DilatedConv2dBackwardInput(_stride,_rate)->pattern_000x|DilatedConv2dBackwardKernel(_stride,_rate)->pattern_000x|DilatedConv3dBackwardInput(_stride,_rate)->pattern_000x|DilatedConv3dBackwardKernel(_stride,_rate)->pattern_000x|MaxPool1dBackward(_padding,_kernel,_stride)->pattern_000x|MaxPool2dBackward(_padding,_kernel,_stride)->pattern_000x|MaxPool3dBackward(_padding,_kernel,_stride)->pattern_000x|AvgPool1dBackward(_padding,_kernel,_stride)->pattern_000x|AvgPool2dBackward(_padding,_kernel,_stride)->pattern_000x|AvgPool3dBackward(_padding,_kernel,_stride)->pattern_000x|UpSampling2dBackward_size->pattern_000x|Row->pattern_000x|Rows_i->pattern_000x|CopyRowTo->pattern_000x|CopyColTo->pattern_000x|Dot(_transa,_transb,_alpha,_beta)->pattern_005x|Inv->pattern_000x|Trace->pattern_000x|Transpose_axis->pattern_000x|ToRows->pattern_000x|OfRows->pattern_000x|Scalar_Add->pattern_010x|Scalar_Sub->pattern_010x|Scalar_Mul->pattern_010x|Scalar_Div->pattern_010x|Scalar_Pow->pattern_010x|Scalar_Atan2->pattern_010x|Scalar_Abs->pattern_012x|Scalar_Neg->pattern_012x|Scalar_Sqr->pattern_012x|Scalar_Sqrt->pattern_012x|Scalar_Exp->pattern_012x|Scalar_Log->pattern_012x|Scalar_Log2->pattern_012x|Scalar_Log10->pattern_012x|Scalar_Signum->pattern_012x|Scalar_Floor->pattern_012x|Scalar_Ceil->pattern_012x|Scalar_Round->pattern_012x|Scalar_Sin->pattern_012x|Scalar_Cos->pattern_012x|Scalar_Tan->pattern_012x|Scalar_Sinh->pattern_012x|Scalar_Cosh->pattern_012x|Scalar_Tanh->pattern_012x|Scalar_Asin->pattern_012x|Scalar_Acos->pattern_012x|Scalar_Atan->pattern_012x|Scalar_Asinh->pattern_012x|Scalar_Acosh->pattern_012x|Scalar_Atanh->pattern_012x|Scalar_Relu->pattern_012x|Scalar_Sigmoid->pattern_012x|Fused_Adagrad(_rate,_eps)->pattern_000x|_->failwith"Owl_computation_optimiser:_optimise_term");validatex)(* dummy pattern *)andpattern_000x=Array.iter_optimise_term(parentsx)(* Add ndarray pattern *)andpattern_001x=letparents=parentsxinleta=parents.(0)inletb=parents.(1)in_optimise_terma;_optimise_termb;pattern_002x;pattern_004x(* Add ndarray pattern: x + 0 or 0 + x *)andpattern_002x=letx_parents=parentsxinleta=x_parents.(0)inletb=x_parents.(1)inifget_operatorx=Addthen(matchget_operatora,get_operatorbwith|Zeros_,_->set_operatorxNoop;remove_edgeax;_optimise_termx|_,Zeros_->set_operatorxNoop;remove_edgebx;_optimise_termx|_,_->())(* Noop pattern *)andpattern_003x=letparent=(parentsx).(0)in_optimise_termparent;letop=get_operatorxinletreusable=get_reusexinifop=Noop&&reusablethen(letx_children=childrenxinletparent_children=childrenparentinletmerged_children=Owl_utils_array.mergex_childrenparent_childreninset_childrenparentmerged_children;replace_parentxparent;remove_nodex)(* Add ndarray pattern: FMA x * y + z *)andpattern_004x=ifget_operatorx=Addthen(letx_parents=parentsxinleta=x_parents.(0)inletb=x_parents.(1)inifget_operatora=Mul&&refnuma=1then(letnew_parents=Owl_utils_array.(parentsa@[|b|])inset_parentsxnew_parents;replace_childax;set_operatorxFMA;remove_nodea)elseifget_operatorb=Mul&&refnumb=1then(letnew_parents=Owl_utils_array.(parentsb@[|a|])inset_parentsxnew_parents;replace_childbx;set_operatorxFMA;remove_nodeb))(* Gemm pattern : alpha * x *@ y + beta * z *)andpattern_005x=letx_parents=parentsxinleta=x_parents.(0)inletb=x_parents.(1)in_optimise_terma;_optimise_termb;pattern_006x(* Gemm pattern: transpose *)andpattern_006x=matchget_operatorxwith|Dot(transa,transb,alpha,beta)->letx_parents=parentsxinleta=x_parents.(0)inletb=x_parents.(1)in(matchget_operatorawith|Transpose_i->ifrefnuma=1then(letop=Dot(nottransa,transb,alpha,beta)inset_operatorxop;leta_parent=(parentsa).(0)inreplace_childax;replace_parentaa_parent)|_->());(matchget_operatorbwith|Transpose_i->ifrefnumb=1then(letop=Dot(transa,nottransb,alpha,beta)inset_operatorxop;letb_parent=(parentsb).(0)inreplace_childbx;replace_parentbb_parent)|_->())|_->()(* Div pattern *)andpattern_007x=letx_parents=parentsxinleta=x_parents.(0)inletb=x_parents.(1)in_optimise_terma;_optimise_termb;pattern_008x(* Div pattern: 0 / x *)andpattern_008x=ifget_operatorx=Divthen(letx_parents=parentsxinleta=x_parents.(0)inletb=x_parents.(1)inletx_shp=node_shapexinmatchget_operatorawith|Zeros_->remove_edgeax;remove_edgebx;set_operatorx(Zerosx_shp)|_->())(* div pattern: x / 1 *)andpattern_009x=ifget_operatorx=Divthen(letx_parents=parentsxinletb=x_parents.(1)inletx_shp=node_shapexinmatchget_operatorbwith|Onesb_shp->ifx_shp=b_shpthen(remove_edgebx;set_operatorxNoop;_optimise_termx)|_->())(* Binary operator pattern for const scalar *)andpattern_010x=letparents=parentsxinleta=parents.(0)inletb=parents.(1)in_optimise_terma;_optimise_termb;matchget_operatora,get_operatorbwith|Const,Const->leta_val=node_to_elta|>elt_to_floatinletb_val=node_to_eltb|>elt_to_floatinletc_val=pattern_011(get_operatorx)a_valb_valinset_parentsx[||];set_reusexfalse;set_operatorxConst;freezex;(* FIXME: OMG, I need to fix this ... to many conversions *)set_valuex[|float_to_eltc_val|>unpack_elt|>elt_to_value|]|_->()(* Binary operator pattern: evaluation function for pattern_010 *)andpattern_011opab=matchopwith|Scalar_Add->a+.b|Scalar_Sub->a-.b|Scalar_Mul->a*.b|Scalar_Div->a/.b|Scalar_Pow->a**b|Scalar_Atan2->Stdlib.atan2ab|_->failwith"pattern_011: not supported"(* Unary operator pattern for const scalar *)andpattern_012x=letparents=parentsxinleta=parents.(0)in_optimise_terma;matchget_operatorawith|Const->leta_val=node_to_elta|>elt_to_floatinletb_val=pattern_013(get_operatorx)a_valinset_parentsx[||];set_reusexfalse;set_operatorxConst;freezex;(* FIXME: OMG, I need to fix this ... to many conversions *)set_valuex[|float_to_eltb_val|>unpack_elt|>elt_to_value|]|_->()(* Unary operator pattern: evaluation function for pattern_012 *)andpattern_013opx=letopenOwl_base_mathsinmatchopwith|Scalar_Abs->absx|Scalar_Neg->negx|Scalar_Sqr->x*.x|Scalar_Sqrt->sqrtx|Scalar_Exp->expx|Scalar_Log->logx|Scalar_Log2->log2x|Scalar_Log10->log10x|Scalar_Signum->signumx|Scalar_Floor->floorx|Scalar_Ceil->ceilx|Scalar_Round->roundx|Scalar_Sin->sinx|Scalar_Cos->cosx|Scalar_Tan->tanx|Scalar_Sinh->sinhx|Scalar_Cosh->coshx|Scalar_Tanh->tanhx|Scalar_Asin->asinx|Scalar_Acos->acosx|Scalar_Atan->atanx|Scalar_Asinh->asinhx|Scalar_Acosh->acoshx|Scalar_Atanh->atanhx|Scalar_Relu->relux|Scalar_Sigmoid->sigmoidx|_->failwith"pattern_013: not supported"(* ScalarMul pattern : a $* 0, a $* 1 *)andpattern_014x=letx_parents=parentsxinleta=x_parents.(0)inletb=x_parents.(1)in_optimise_terma;_optimise_termb;matchget_operatora,get_operatorbwith|_,Zerosshp->set_operatorx(Zerosshp);set_parentsx[||];remove_edgeax;remove_edgebx|_,Onesshp->set_parentsx[|a|];set_operatorx(Createshp);remove_edgebx|_->()(* ScalarDiv pattern *)andpattern_016x=letx_parents=parentsxinleta=x_parents.(0)inletb=x_parents.(1)in_optimise_terma;_optimise_termb;pattern_017x(* ScalarDiv pattern: Adagrad pattern *)andpattern_017x=ifget_operatorx=ScalarDivthen(letx_parents=parentsxinleta=x_parents.(0)inletb=x_parents.(1)inifget_operatora=Const&&get_operatorb=Sqrt&&refnumb=1then(letb_parents=parentsbinletb_a=b_parents.(0)inifget_operatorb_a=AddScalar&&refnumb_a=1then(letb_a_parents=parentsb_ainletb_a_a=b_a_parents.(0)inletb_a_b=b_a_parents.(1)inifget_operatorb_a_b=Constthen(letb_a_b_val=node_to_eltb_a_b|>elt_to_floatinifb_a_b_val=1e-32then(leta_val=node_to_elta|>elt_to_floatinset_parentsx[|b_a_a|];replace_childb_ax;set_operatorx(Fused_Adagrad(a_val,b_a_b_val)))))))(* AddScalar pattern : a +$ 0 *)andpattern_015x=letx_parents=parentsxinleta=x_parents.(0)inletb=x_parents.(1)in_optimise_terma;_optimise_termb;matchget_operatora,get_operatorbwith|Zerosshp,_->set_parentsx[|b|];set_operatorx(Createshp);remove_edgeax|_->()(* Copy pattern *)andpattern_018x=leta=(parentsx).(0)in_optimise_terma;set_operatorxNoop;pattern_003x(* Mul pattern *)andpattern_019x=letx_parents=parentsxinleta=x_parents.(0)inletb=x_parents.(1)in_optimise_terma;_optimise_termb;pattern_020x(* Mul pattern : a * 0 or 0 * a *)andpattern_020x=ifget_operatorx=Multhen(letx_parents=parentsxinleta=x_parents.(0)inletb=x_parents.(1)inletx_shp=node_shapexinmatchget_operatora,get_operatorbwith|Zeros_,_|_,Zeros_->set_operatorx(Zerosx_shp);remove_edgeax;remove_edgebx|_,_->())(* Mul pattern : a * 1 or 1 * a *)andpattern_021_x=failwith"pattern_021: not implemented"(* Reshape pattern *)andpattern_022x=leta=(parentsx).(0)in_optimise_terma;ifrefnuma=1then(letx_shp=node_shapexinmatchget_operatorawith|Zeros_->set_operatorx(Zerosx_shp);remove_edgeax|Ones_->set_operatorx(Onesx_shp);remove_edgeax|_->())(* Repeat pattern *)andpattern_023x=leta=(parentsx).(0)in_optimise_terma;ifrefnumx=1then(letx_parent=(parentsx).(0)inletx_children=childrenxinmatchget_operatorx_children.(0)with|Add|Sub|Mul|Div|Pow|Min2|Max2|Hypot|Atan2->letreps=matchget_operatorxwith|Repeatreps->reps|_->failwith"optimiser:pattern_023"inletoptimisable=reftrueinArray.iter2(fundr->ifr<>1&&d<>1thenoptimisable:=false)(node_shapex_parent)reps;if!optimisable=truethen(letparent_children=childrenx_parentinletmerged_children=Owl_utils_array.mergex_childrenparent_childreninset_childrenx_parentmerged_children;replace_parentxx_parent;remove_nodex)|_->())(* SumReduce pattern *)andpattern_024x=letx_parents=parentsxinleta=x_parents.(0)in_optimise_terma;(* if only reduce along one dimension, change to sum *)matchget_operatorxwith|SumReduceaxis->ifArray.lengthaxis=1then(set_operatorx(Sumaxis.(0));_optimise_termx)|_->()(* core optimise functions *)letestimate_complexitygraph=letnodes=ref0inOwl_graph.iter_ancestors(fun_->nodes:=!nodes+1)graph;letedges=ref0inOwl_graph.iter_in_edges(fun__->edges:=!edges+1)graph;!nodes,!edgesletoptimise_nodesxs=letnodes,edges=estimate_complexityxsinOwl_log.info"unoptimised graph: %i nodes, %i edges ..."nodesedges;Array.iter_optimise_termxs;(* NOTE: invalidate ancestors *)iter_ancestors(funv->invalidatev)xs;letnodes,edges=estimate_complexityxsinOwl_log.info"optimised graph: %i nodes, %i edges ..."nodesedgesend(* Make functor ends *)