(* ml - Complete IR with all tinygrad operations *)

(* ───── Scalars & element types ───── *)
module Dtype = struct
  (* Element types as a GADT: the phantom parameter ties each runtime
     element kind to the OCaml type used to represent one element, so a
     ['a node_t] carrying a [Const_Scalar] can only hold a matching value. *)
  type _ t =
    | Float32 : float t
    | Int32 : int32 t
    | Bool : bool t
    | Uint8 : int t
    | Unit : unit t

  (* Existential wrapper for positions where the element type is not
     statically known (e.g. metadata tables keyed by Var.t). *)
  type any = Any_Dtype : 'a Dtype.t -> any [@@unboxed]

  (* Canonical lowercase name of an element type. *)
  let to_string : type a. a t -> string = function
    | Float32 -> "float32"
    | Int32 -> "int32"
    | Bool -> "bool"
    | Uint8 -> "uint8"
    | Unit -> "unit"

  let any_to_string (Any_Dtype d) = to_string d

  (* Size of ONE element in bytes ([Unit] occupies no storage). *)
  let sizeof_elt : type a. a t -> int = function
    | Float32 | Int32 -> 4
    | Bool | Uint8 -> 1
    | Unit -> 0
end

(* ───── SSA variables & symbolic variables ───── *)
module Var = struct
  (* SSA variable: a globally unique integer id. *)
  type t = int

  (* Global fresh-name supply; [fresh ()] yields 1, 2, 3, ...
     NOTE(review): a single mutable counter — not thread-safe; confirm
     single-threaded graph construction before sharing across domains. *)
  let counter = ref 0

  let fresh () =
    incr counter;
    !counter

  let compare = Int.compare
  let equal = Int.equal
  let hash = Hashtbl.hash

  (* Prints as "v<N>", e.g. [v42]. *)
  let pp fmt v = Format.fprintf fmt "v%d" v
  let to_string = Format.asprintf "%a" pp

  module Set = struct
    include Set.Make (struct
      type nonrec t = t

      let compare = compare
    end)

    (* Prints as "{v1, v2, ...}" using [Var.pp] per element. *)
    let pp fmt s =
      Format.fprintf fmt "{%a}"
        (Format.pp_print_list
           ~pp_sep:(fun f () -> Format.pp_print_string f ", ")
           pp)
        (elements s)
  end
end

(* Symbolic variables for dynamic shapes *)
module SymVar = struct
  (* A named integer whose value is only known at run time, bounded by
     [min_val, max_val] inclusive. *)
  type t = { name : string; min_val : int; max_val : int }
end

(* ───── Misc enums & types ───── *)
module Special_index_kind = struct
  (* Hardware index sources; the int selects the dimension (0=x, 1=y, 2=z). *)
  type t =
    | Global_task_idx of int (* 0=x,1=y,2=z *)
    | Local_thread_idx of int
    | Workgroup_idx of int
end

(* Per-variable bookkeeping kept outside the node payloads. *)
type var_metadata = {
  dtype : Dtype.any;
  shape : int array;
  shape_expr : Shape_expr.shape option;
  device : string option;
}

type kernel_metadata = {
  name : string;
  local_dims : int;
  upcasted : int;
  dont_use_locals : bool;
}

(* Attribute payloads for [Custom] ops. *)
type custom_attr =
  | Attr_Int of int
  | Attr_Float of float
  | Attr_String of string
  | Attr_Shape of int array

(* Shape tracker for VIEW operations *)
type shape_tracker = { views : view list; shape : Shape_expr.shape }

and view = {
  shape : Shape_expr.shape;
  strides : int array;
  offset : int;
  mask : (int * int) array option; (* for masked/valid regions *)
}

(* ───── Operation kinds ───── *)
type binop_kind =
  | Add
  | Mul
  | Sub
  | Div
  | Idiv
  | Fdiv
  | Mod
  | Pow
  | Max
  | Min
  | Cmplt
  | Cmpne
  | Xor
  | Or
  | And
  | Shl
  | Shr (* bitwise shifts *)

type unary_op_kind = Neg | Log2 | Exp2 | Sin | Sqrt | Recip
type ternary_op_kind = Where | Mulacc (* multiply-accumulate *)
type reduce_op_kind = Reduce_Sum | Reduce_Max | Reduce_Prod

(* ───── High-level graph IR ─────
   Every node names its result as [out_var] (SSA) and records the result's
   element type in [dtype]; inputs are referenced by Var.t only. *)
type _ node_t =
  (* ──── Buffer/Memory Operations ──── *)
  | Buffer : {
      dtype : 'a Dtype.t;
      size_in_elements : int;
      device : string;
      out_var : Var.t;
    }
      -> 'a node_t
  | Buffer_View : {
      (* view into existing buffer *)
      buffer_var : Var.t;
      size : int;
      offset : int;
      dtype : 'a Dtype.t;
      out_var : Var.t;
    }
      -> 'a node_t
  | Placeholder : {
      out_var : Var.t;
      dtype : 'a Dtype.t;
      shape : Shape_expr.shape;
    }
      -> 'a node_t
  | Const_Scalar : { value : 'a; dtype : 'a Dtype.t; out_var : Var.t } -> 'a node_t
  | Vconst : {
      (* vector constant *)
      values : 'a array;
      dtype : 'a Dtype.t;
      out_var : Var.t;
    }
      -> 'a node_t
  (* ──── Compute Operations ──── *)
  | Binop : {
      op : binop_kind;
      a_var : Var.t;
      b_var : Var.t;
      out_var : Var.t;
      dtype : 'a Dtype.t;
    }
      -> 'a node_t
  | Unary : {
      op : unary_op_kind;
      in_var : Var.t;
      out_var : Var.t;
      dtype : 'a Dtype.t;
    }
      -> 'a node_t
  | Ternary : {
      op : ternary_op_kind;
      a_var : Var.t;
      b_var : Var.t;
      c_var : Var.t;
      out_var : Var.t;
      dtype : 'a Dtype.t;
    }
      -> 'a node_t
  (* ──── Movement/Shape Operations ──── *)
  | View : {
      (* zero-copy shape operations *)
      in_var : Var.t;
      shape_tracker : shape_tracker;
      out_var : Var.t;
      dtype : 'a Dtype.t;
    }
      -> 'a node_t
  | Reshape : {
      in_var : Var.t;
      new_shape : Shape_expr.shape;
      out_var : Var.t;
      dtype : 'a Dtype.t;
    }
      -> 'a node_t
  | Permute : {
      in_var : Var.t;
      axes_permutation : int array;
      out_var : Var.t;
      dtype : 'a Dtype.t;
    }
      -> 'a node_t
  | Expand : {
      in_var : Var.t;
      new_target_shape : Shape_expr.shape;
      out_var : Var.t;
      dtype : 'a Dtype.t;
    }
      -> 'a node_t
  | Pad : {
      in_var : Var.t;
      pad_width : (int * int) array;
      out_var : Var.t;
      dtype : 'a Dtype.t;
    }
      -> 'a node_t
  | Shrink : {
      in_var : Var.t;
      limits : (int * int) array;
      out_var : Var.t;
      dtype : 'a Dtype.t;
    }
      -> 'a node_t
  | Flip : {
      in_var : Var.t;
      axes : int array;
      out_var : Var.t;
      dtype : 'a Dtype.t;
    }
      -> 'a node_t
  (* ──── Reduction Operations ──── *)
  | Reduce_Axis : {
      in_var : Var.t;
      reduce_op_kind : reduce_op_kind;
      axes : int array;
      out_var : Var.t;
      dtype : 'a Dtype.t;
    }
      -> 'a node_t
  (* ──── Advanced Operations ──── *)
  | Valid : {
      (* masked valid regions *)
      in_var : Var.t;
      shape_tracker : shape_tracker;
      out_var : Var.t;
      dtype : 'a Dtype.t;
    }
      -> 'a node_t
  | Index : {
      (* explicit indexing *)
      in_var : Var.t;
      idx_var : Var.t;
      valid_var : Var.t option;
      out_var : Var.t;
      dtype : 'a Dtype.t;
    }
      -> 'a node_t
  | Gep : {
      (* get element pointer for vectors *)
      in_var : Var.t;
      indices : int array;
      out_var : Var.t;
      dtype : 'a Dtype.t;
    }
      -> 'a node_t
  | Vectorize : {
      (* create vector from scalars *)
      in_vars : Var.t array;
      out_var : Var.t;
      dtype : 'a Dtype.t;
    }
      -> 'a node_t
  | Wmma : {
      (* tensor core operations *)
      a_var : Var.t;
      b_var : Var.t;
      c_var : Var.t;
      m : int;
      n : int;
      k : int;
      out_var : Var.t;
      dtype : 'a Dtype.t;
    }
      -> 'a node_t
  (* ──── Type Operations ──── *)
  | Cast : {
      in_var : Var.t;
      target_dtype : Dtype.any;
      out_var : Var.t;
      dtype : 'a Dtype.t;
    }
      -> 'a node_t
  | Bitcast : {
      (* reinterpret bits *)
      in_var : Var.t;
      target_dtype : Dtype.any;
      out_var : Var.t;
      dtype : 'a Dtype.t;
    }
      -> 'a node_t
  (* ──── Memory Operations ──── *)
  | Contiguous : {
      in_var : Var.t;
      out_var : Var.t;
      dtype : 'a Dtype.t;
    }
      -> 'a node_t
  | Copy : {
      in_var : Var.t;
      target_device : string;
      clone : bool; (* if true, force copy even on same device *)
      out_var : Var.t;
      dtype : 'a Dtype.t;
    }
      -> 'a node_t
  | Assign : {
      target_var : Var.t;
      updates : (Var.t * Var.t * (int * int) option) array;
      out_var : Var.t;
      dtype : 'a Dtype.t;
    }
      -> 'a node_t
  (* ──── Symbolic/Dynamic Shapes ──── *)
  | Define_Var : {
      (* symbolic variables *)
      sym_var : SymVar.t;
      out_var : Var.t;
      dtype : 'a Dtype.t;
    }
      -> 'a node_t
  | Bind : {
      (* bind symbolic var to value *)
      sym_var : Var.t;
      value : int;
      out_var : Var.t;
      dtype : 'a Dtype.t;
    }
      -> 'a node_t
  (* ──── AutoGrad Support ──── *)
  | Detach : {
      (* stop gradient *)
      in_var : Var.t;
      out_var : Var.t;
      dtype : 'a Dtype.t;
    }
      -> 'a node_t
  | Contiguous_Backward : {
      (* backward pass marker *)
      in_var : Var.t;
      out_var : Var.t;
      dtype : 'a Dtype.t;
    }
      -> 'a node_t
  (* ──── Kernel/Graph Management ──── *)
  | Sink : {
      (* dependency synchronization *)
      deps : Var.t array;
      dtype : 'a Dtype.t;
    }
      -> 'a node_t
  | Kernel : {
      (* kernel wrapper *)
      ast : any_node;
      input_vars : Var.t array;
      output_vars : Var.t array;
      metadata : kernel_metadata;
      out_var : Var.t;
      dtype : 'a Dtype.t;
    }
      -> 'a node_t
  | Unique : {
      (* unique identifier generation *)
      id : int;
      out_var : Var.t;
      dtype : 'a Dtype.t;
    }
      -> 'a node_t
  (* ──── Device Management ──── *)
  | Device : {
      (* device marker *)
      device_name : string;
      out_var : Var.t;
      dtype : 'a Dtype.t;
    }
      -> 'a node_t
  | Multi : {
      (* multi-device tensor *)
      device_vars : Var.t array;
      axis : int option;
      real_mask : bool array;
      out_var : Var.t;
      dtype : 'a Dtype.t;
    }
      -> 'a node_t
  (* ──── Optimization Directives ──── *)
  | Fuse : {
      (* fusion marker *)
      in_var : Var.t;
      out_var : Var.t;
      dtype : 'a Dtype.t;
    }
      -> 'a node_t
  | Unroll : {
      (* loop unroll directive *)
      loop_var : Var.t;
      unroll_factor : int;
      out_var : Var.t;
      dtype : 'a Dtype.t;
    }
      -> 'a node_t
  | Contract : {
      (* tensor contraction *)
      in_vars : Var.t array;
      contraction_axes : (int * int) array;
      out_var : Var.t;
      dtype : 'a Dtype.t;
    }
      -> 'a node_t
  (* ──── Miscellaneous Operations ──── *)
  | Cat : {
      in_vars : Var.t array;
      axis : int;
      out_var : Var.t;
      dtype : 'a Dtype.t;
    }
      -> 'a node_t
  | Threefry : {
      ctr_var : Var.t;
      key_var : Var.t;
      out_var : Var.t;
      dtype : 'a Dtype.t;
    }
      -> 'a node_t
  | Gather : {
      src_var : Var.t;
      indices_var : Var.t;
      axis : int;
      out_var : Var.t;
      dtype : 'a Dtype.t;
    }
      -> 'a node_t
  | Scatter : {
      indices_var : Var.t;
      updates_var : Var.t;
      axis : int;
      shape : int array;
      out_var : Var.t;
      dtype : 'a Dtype.t;
    }
      -> 'a node_t
  | Custom : {
      (* custom operation *)
      op_name : string;
      in_vars : Var.t array;
      attributes : (string * custom_attr) list;
      out_var : Var.t;
      dtype : 'a Dtype.t;
    }
      -> 'a node_t
  | Noop : {
      (* no operation *)
      in_var : Var.t option;
      out_var : Var.t;
      dtype : 'a Dtype.t;
    }
      -> 'a node_t

(* Existential wrapper so heterogeneously-typed nodes can live in one list. *)
and any_node = Any_Node : 'a node_t -> any_node [@@unboxed]

(* A whole high-level graph plus its side tables. *)
type graph_t = {
  nodes : any_node list;
  vars_metadata : (Var.t, var_metadata) Hashtbl.t;
  input_vars : Var.t list;
  output_vars : Var.t list;
  symbolic_vars : SymVar.t list;
}

(* ──── Smart constructors (thin wrappers over the node constructors) ──── *)

let buffer ~dtype ~size ~device ~out_var =
  Buffer { dtype; size_in_elements = size; device; out_var }

let unary ~op ~in_var ~out_var ~dtype = Unary { op; in_var; out_var; dtype }

let binary ~op ~a_var ~b_var ~out_var ~dtype =
  Binop { op; a_var; b_var; out_var; dtype }

let ternary ~op ~a_var ~b_var ~c_var ~out_var ~dtype =
  Ternary { op; a_var; b_var; c_var; out_var; dtype }

let const_scalar ~value ~out_var ~dtype = Const_Scalar { value; out_var; dtype }
let vconst ~values ~out_var ~dtype = Vconst { values; out_var; dtype }

let reshape ~in_var ~new_shape ~out_var ~dtype =
  Reshape { in_var; new_shape; out_var; dtype }

let permute ~in_var ~axes_permutation ~out_var ~dtype =
  Permute { in_var; axes_permutation; out_var; dtype }

let expand ~in_var ~new_target_shape ~out_var ~dtype =
  Expand { in_var; new_target_shape; out_var; dtype }

let pad ~in_var ~pad_width ~out_var ~dtype =
  Pad { in_var; pad_width; out_var; dtype }

let shrink ~in_var ~limits ~out_var ~dtype =
  Shrink { in_var; limits; out_var; dtype }

let reduce_axis ~in_var ~reduce_op_kind ~axes ~out_var ~dtype =
  Reduce_Axis { in_var; reduce_op_kind; axes; out_var; dtype }

let cast ~in_var ~target_dtype ~out_var ~dtype =
  Cast { in_var; target_dtype; out_var; dtype }

let bitcast ~in_var ~target_dtype ~out_var ~dtype =
  Bitcast { in_var; target_dtype; out_var; dtype }

let view ~in_var ~shape_tracker ~out_var ~dtype =
  View { in_var; shape_tracker; out_var; dtype }
let copy ~in_var ~target_device ~clone ~out_var ~dtype =
  Copy { in_var; target_device; clone; out_var; dtype }

let cat ~in_vars ~axis ~out_var ~dtype = Cat { in_vars; axis; out_var; dtype }

let gather ~src_var ~indices_var ~axis ~out_var ~dtype =
  Gather { src_var; indices_var; axis; out_var; dtype }

let scatter ~indices_var ~updates_var ~axis ~shape ~out_var ~dtype =
  Scatter { indices_var; updates_var; axis; shape; out_var; dtype }

let fresh_var () = Var.fresh ()

(* ───── Scheduled IR (single module, structured loops/tiles/mapping) ───── *)
module Scheduled = struct
  (* Utilities *)

  (* Product of all extents; empty array yields 1. *)
  let[@inline] prod (arr : int array) = Array.fold_left ( * ) 1 arr

  (* Pad or truncate a dims array to exactly 3 entries (missing dims = 1). *)
  let[@inline] ensure3 (a : int array) : int array =
    match Array.length a with
    | 3 -> a
    | 0 -> [| 1; 1; 1 |]
    | 1 -> [| a.(0); 1; 1 |]
    | 2 -> [| a.(0); a.(1); 1 |]
    | _ -> [| a.(0); a.(1); a.(2) |]

  (* Row-major (C-contiguous) strides in ELEMENTS; zero-sized dims are
     treated as extent 1 so strides stay non-zero. *)
  let[@inline] contiguous_strides_elems (shape : int array) : int array =
    let n = Array.length shape in
    if n = 0 then [||]
    else begin
      let s = Array.make n 0 in
      let stride = ref 1 in
      for i = n - 1 downto 0 do
        s.(i) <- !stride;
        stride := !stride * (if shape.(i) = 0 then 1 else shape.(i))
      done;
      s
    end

  (* Core scheduling types *)
  type axis_role = [ `Normal | `Reduction ]

  type axis = {
    name : string;
    size : int option; (* known static extent or None (symbolic) *)
    sym : SymVar.t option; (* the symbolic var that bounds the axis *)
    role : axis_role;
  }

  type mapping = {
    block : int list; (* threadblock / grid dims on GPU; core on CPU *)
    thread : int list; (* thread / lane *)
    vec : int list; (* vector lanes (SIMD) *)
    serial : int list; (* remaining serial loops *)
  }

  type iter_space = {
    axes : axis array; (* logical iteration axes *)
    (* mapping selects WHICH axis indices (into [axes]) map to each machine
       level *)
    mapping : mapping;
    (* tiling: for each axis i, a list of tile sizes (outer→inner) *)
    tiles : int list array;
  }

  type memory_scope = Global | Shared | Register

  (* Layout strides are measured in ELEMENTS (not bytes); dtype tells byte
     width *)
  type layout = {
    shape : int array; (* logical shape in elements *)
    strides : int array; (* strides in elements (row-major typical) *)
    alignment : int; (* bytes *)
    vector_width : int; (* elements per vector lane *)
    contiguous_axes : int list; (* for coalescing; usually [last;...] *)
  }

  type allocation = {
    scope : memory_scope;
    size_bytes : int; (* final allocated size (post-tiling/packing) *)
    lifetime : int * int; (* inclusive item-id range for reuse *)
    alias_group : int option; (* optional alias set id for in-place plans *)
  }

  type buffer_info = {
    buf_var : Var.t;
    dtype : Dtype.any;
    layout : layout;
    alloc : allocation;
    is_input : bool;
    is_output : bool;
  }

  type loop_hint =
    | Vectorize of { axis : int; width : int }
    | Unroll of { axis : int; factor : int }
    | Prefetch of { var : Var.t; into : memory_scope; distance : int }
    | Pipeline of { axis : int; stages : int; overlap : bool }

  type reduction_plan = {
    axes : int list; (* indices (into iter_space.axes) tagged as reductions *)
    intra_thread : [ `Tree | `Welford | `Shfl | `None ];
    inter_thread : [ `SharedTree | `Atomic | `GridReduce ];
  }

  type schedule_context = {
    global_dims : int array; (* [|gx;gy;gz|] *)
    local_dims : int array; (* [|lx;ly;lz|] *)
    upcasted : int;
    device : string;
    stream : int option;
  }

  type scheduled_op =
    | S_Kernel of {
        kernel_id : int;
        kernel_name : string;
        ops : any_node list; (* HL ops fused into this kernel *)
        inputs : buffer_info list;
        outputs : buffer_info list;
        iter : iter_space; (* explicit loops/tiling/mapping *)
        reduce : reduction_plan option;
        hints : loop_hint list;
        context : schedule_context;
      }
    | S_Memory_Transfer of {
        transfer_id : int;
        src_var : Var.t;
        dst_var : Var.t;
        src_device : string;
        dst_device : string;
        dims : int array; (* ND copy extents in elements *)
        src_strides : int array option; (* elements; pitched if provided *)
        dst_strides : int array option; (* elements *)
        size_bytes : int; (* optional precomputed flat size *)
        is_async : bool;
        stream : int option;
      }
    | S_Synchronization of {
        sync_id : int;
        sync_type : [ `Barrier | `Fence | `Event of int ];
        scope : [ `Threadgroup | `Device | `System ];
        devices : string list;
        stream : int option;
      }
    | S_Host_Callback of {
        callback_id : int;
        callback_name : string;
        input_vars : Var.t list;
        output_vars : Var.t list;
      }

  type dependency = {
    dep_from : int; (* schedule_item id *)
    dep_to : int; (* schedule_item id *)
    dep_vars : Var.t list; (* values creating the edge *)
    kind : [ `Data | `Control ];
  }

  type schedule_item = {
    item_id : int;
    operation : scheduled_op;
    depends_on : int list; (* item ids *)
    dependents : int list; (* filled by validation/toposort *)
  }

  type fusion_opportunity = {
    kernel_a : int; (* item id *)
    kernel_b : int; (* item id *)
    fusion_type : [ `Elementwise | `Reduction | `Mixed ];
    benefit_score : float;
    memory_saved : int; (* bytes *)
  }

  (* Lightweight analysis product kept outside the core op shape *)
  type item_analysis = {
    item_id : int;
    flops : int;
    bytes_read : int;
    bytes_written : int;
    regs_per_thread : int;
    smem_bytes : int;
    occupancy : float; (* 0–1 estimate *)
    est_ns : int; (* estimated latency in ns *)
  }

  type graph_t = {
    schedule_items : schedule_item array;
    dependencies : dependency list;
    fusion_opportunities : fusion_opportunity list;
    analysis : item_analysis array; (* same order as items; may be empty *)
    critical_path : int list; (* item ids *)
    total_memory_usage : int; (* approximate peak, bytes *)
    estimated_runtime_ns : int; (* critical path sum *)
    vars_metadata : (Var.t, var_metadata) Hashtbl.t;
    symbolic_vars : SymVar.t list;
  }

  (* Validation & helpers *)

  (* Raises [Invalid_argument] unless [a] has exactly 3 entries. *)
  let validate_dims3 (a : int array) (label : string) : unit =
    if Array.length a <> 3 then
      invalid_arg (Printf.sprintf "Scheduled.%s must be length-3" label)

  (* Checks that every mapping index points at an existing axis and that
     [tiles] has one entry per axis. Raises [Invalid_argument] otherwise. *)
  let validate_iter_space (it : iter_space) : unit =
    let n = Array.length it.axes in
    let in_range i =
      if i < 0 || i >= n then
        invalid_arg
          (Printf.sprintf
             "Scheduled.iter_space: axis index %d out of range 0..%d" i (n - 1))
    in
    List.iter in_range it.mapping.block;
    List.iter in_range it.mapping.thread;
    List.iter in_range it.mapping.vec;
    List.iter in_range it.mapping.serial;
    if Array.length it.tiles <> n then
      invalid_arg "Scheduled.iter_space: tiles length must match axes length"

  (* Total byte size of a layout. Delegates to [Dtype.sizeof_elt] instead of
     duplicating the per-type byte table (keeps the two in sync). *)
  let size_bytes_of_layout (dt : Dtype.any) (ly : layout) : int =
    let elt = match dt with Dtype.Any_Dtype d -> Dtype.sizeof_elt d in
    prod ly.shape * elt

  (* Row-major layout with all axes marked contiguous. *)
  let default_layout ?(vector_width = 1) ?(alignment = 16) (shape : int array)
      : layout =
    {
      shape;
      strides = contiguous_strides_elems shape;
      alignment;
      vector_width;
      (* [0; 1; ...; n-1] — every axis, outermost first. *)
      contiguous_axes = List.init (Array.length shape) Fun.id;
    }

  let default_alloc ~scope ~dtype ~layout ~lifetime : allocation =
    let sz = size_bytes_of_layout dtype layout in
    { scope; size_bytes = sz; lifetime; alias_group = None }

  (* Build dependents lists from depends_on.
     NOTE(review): like the traversals below, this assumes the invariant
     [items.(i).item_id = i]; confirm at construction time. *)
  let compute_dependents (items : schedule_item array) : unit =
    let n = Array.length items in
    let deps_rev : int list array = Array.make n [] in
    Array.iter
      (fun (it : schedule_item) ->
        List.iter
          (fun (p : int) ->
            if p >= 0 && p < n then deps_rev.(p) <- it.item_id :: deps_rev.(p))
          it.depends_on)
      items;
    Array.iteri
      (fun i (it : schedule_item) ->
        items.(i) <- { it with dependents = List.rev deps_rev.(i) })
      items

  (* Topological order (Kahn). Returns item ids in topo sequence.
     Items stuck on a cycle never reach indegree 0 and are omitted. *)
  let topological_order (items : schedule_item array) : int list =
    let n = Array.length items in
    let indeg = Array.make n 0 in
    Array.iter
      (fun (it : schedule_item) ->
        (* indegree of a node is number of its dependencies *)
        indeg.(it.item_id) <- List.length it.depends_on)
      items;
    let q = Queue.create () in
    for i = 0 to n - 1 do
      if indeg.(i) = 0 then Queue.add i q
    done;
    let order = ref [] in
    while not (Queue.is_empty q) do
      let u = Queue.pop q in
      order := u :: !order;
      List.iter
        (fun v ->
          indeg.(v) <- indeg.(v) - 1;
          if indeg.(v) = 0 then Queue.add v q)
        items.(u).dependents
    done;
    List.rev !order

  (* Longest path (by per-item cost from [analysis], default 1) through the
     dependency DAG, returned as a list of item ids from source to sink. *)
  let find_critical_path (g : graph_t) : int list =
    let n = Array.length g.schedule_items in
    if n = 0 then []
    else begin
      let cost = Array.make n 1 in
      Array.iter
        (fun a ->
          (* Guard both bounds: a stale analysis row must not crash us. *)
          if a.item_id >= 0 && a.item_id < n then
            cost.(a.item_id) <- max 1 a.est_ns)
        g.analysis;
      let dist = Array.make n min_int in
      let prev = Array.make n (-1) in
      let indeg = Array.make n 0 in
      Array.iter
        (fun (it : schedule_item) ->
          indeg.(it.item_id) <- List.length it.depends_on)
        g.schedule_items;
      let q = Queue.create () in
      for i = 0 to n - 1 do
        if indeg.(i) = 0 then begin
          dist.(i) <- cost.(i);
          Queue.add i q
        end
      done;
      while not (Queue.is_empty q) do
        let u = Queue.pop q in
        List.iter
          (fun v ->
            (if dist.(u) <> min_int then begin
               let cand = dist.(u) + cost.(v) in
               if cand > dist.(v) then begin
                 dist.(v) <- cand;
                 prev.(v) <- u
               end
             end);
            indeg.(v) <- indeg.(v) - 1;
            if indeg.(v) = 0 then Queue.add v q)
          g.schedule_items.(u).dependents
      done;
      let end_id = ref 0 in
      for i = 1 to n - 1 do
        if dist.(i) > dist.(!end_id) then end_id := i
      done;
      let rec build acc u = if u = -1 then acc else build (u :: acc) prev.(u) in
      build [] !end_id
    end

  (* Sum of estimated costs along the stored critical path; items without an
     analysis row count as 1 ns. *)
  let sum_estimated_runtime_ns (g : graph_t) : int =
    List.fold_left
      (fun acc id ->
        match Array.find_opt (fun a -> a.item_id = id) g.analysis with
        | Some a -> acc + max 1 a.est_ns
        | None -> acc + 1)
      0 g.critical_path

  (* Very rough peak memory estimate: max over items of that item's own
     footprint (kernel buffers or transfer size). *)
  let estimate_peak_memory (g : graph_t) : int =
    let mem_at_item (it : schedule_item) : int =
      match it.operation with
      | S_Kernel { inputs; outputs; _ } ->
          let sum l =
            List.fold_left (fun acc b -> acc + b.alloc.size_bytes) 0 l
          in
          (* NOTE: This double counts shared/register; acceptable as an upper
             bound. *)
          sum inputs + sum outputs
      | S_Memory_Transfer { size_bytes; _ } -> size_bytes
      | _ -> 0
    in
    Array.fold_left (fun acc it -> max acc (mem_at_item it)) 0 g.schedule_items

  (* Constructors *)

  (* Builds and validates an iteration space (see [validate_iter_space]). *)
  let make_iter_space ~axes ~mapping ~tiles : iter_space =
    let s = { axes; mapping; tiles } in
    validate_iter_space s;
    s

  (* Buffer info with a default row-major layout and matching allocation. *)
  let make_buffer_info ~(buf_var : Var.t) ~(dtype : Dtype.any)
      ~(shape : int array) ~(scope : memory_scope) ~(is_input : bool)
      ~(is_output : bool) ~(lifetime : int * int) : buffer_info =
    let layout = default_layout shape in
    let alloc = default_alloc ~scope ~dtype ~layout ~lifetime in
    { buf_var; dtype; layout; alloc; is_input; is_output }

  (* Validates the context dims and iteration space before wrapping. *)
  let create_kernel ~kernel_id ~kernel_name ~ops ~inputs ~outputs ~iter ~reduce
      ~hints ~context : scheduled_op =
    validate_dims3 context.global_dims "context.global_dims";
    validate_dims3 context.local_dims "context.local_dims";
    validate_iter_space iter;
    S_Kernel
      {
        kernel_id;
        kernel_name;
        ops;
        inputs;
        outputs;
        iter;
        reduce;
        hints;
        context;
      }

  let create_memory_transfer ~transfer_id ~src_var ~dst_var ~src_device
      ~dst_device ~dims ?src_strides ?dst_strides ~size_bytes ~is_async ~stream
      () : scheduled_op =
    S_Memory_Transfer
      {
        transfer_id;
        src_var;
        dst_var;
        src_device;
        dst_device;
        dims;
        src_strides;
        dst_strides;
        size_bytes;
        is_async;
        stream;
      }

  let create_synchronization ~sync_id ~sync_type ~scope ~devices ~stream :
      scheduled_op =
    S_Synchronization { sync_id; sync_type; scope; devices; stream }

  let create_host_callback ~callback_id ~callback_name ~input_vars ~output_vars
      : scheduled_op =
    S_Host_Callback { callback_id; callback_name; input_vars; output_vars }

  (* [dependents] starts empty; fill it with [compute_dependents]. *)
  let create_schedule_item ~item_id ~operation ~depends_on : schedule_item =
    { item_id; operation; depends_on; dependents = [] }
end

(* ───── Low-level / lowered IR ───── *)
module Lowered = struct
  type alu_op =
    | Binary of binop_kind
    | Unary of unary_op_kind
    | Ternary of ternary_op_kind

  type instruction =
    (* Memory allocation *)
    | L_Buffer of { dtype : Dtype.any; size : int; out : Var.t }
    | L_Local of { dtype : Dtype.any; size : int; out : Var.t }
    | L_Acc of { dtype : Dtype.any; out : Var.t }
    (* Memory definitions *)
    | L_Define_Global of {
        (* global memory definition *)
        ptr : Var.t;
        dtype : Dtype.any;
        size : int;
      }
    (* Constants and indices *)
    | L_Const of { dtype : Dtype.any; value : string; out : Var.t }
    | L_Vconst of {
        (* vector constant *)
        dst : Var.t;
        values : string array;
        dtype : Dtype.any;
      }
    | L_Special of { dst : Var.t; kind : Special_index_kind.t }
    | L_Define_Var of { sym_var : SymVar.t; out : Var.t }
    (* Control flow *)
    | L_Range of { idx : Var.t; bound : Var.t }
    | L_EndRange
    | L_If of { cond : Var.t }
    | L_EndIf
    | L_Barrier
    (* Block operations *)
    | L_Block of {
        (* block marker *)
        block_id : int;
        start : bool; (* true for BLOCKSTART, false for BLOCKEND *)
      }
    (* Unrolling *)
    | L_Unroll of {
        (* unrolled loop *)
        idx : Var.t;
        iterations : int;
      }
    (* Memory access *)
    | L_Load of {
        dst : Var.t;
        buf : Var.t;
        idx : Var.t;
        dtype : Dtype.any;
        valid : Var.t option; (* masked loads *)
      }
    | L_Store of {
        buf : Var.t;
        idx : Var.t;
        src : Var.t;
        valid : Var.t option; (* masked stores *)
      }
    (* Compute *)
    | L_ALU of { dst : Var.t; op : alu_op; args : Var.t list; dtype : Dtype.any }
    (* Vector operations *)
    | L_Gep of {
        (* get element from vector *)
        dst : Var.t;
        src : Var.t;
        indices : int array;
        dtype : Dtype.any;
      }
    | L_Vectorize of {
        (* build vector *)
        dst : Var.t;
        srcs : Var.t array;
        dtype : Dtype.any;
      }
    (* Pointer operations *)
    | L_Ptrcat of {
        (* pointer concatenation *)
        dst : Var.t;
        ptrs : Var.t array;
        dtype : Dtype.any;
      }
    (* Tensor core operations *)
    | L_Wmma of {
        dst : Var.t;
        a : Var.t;
        b : Var.t;
        c : Var.t;
        m : int;
        n : int;
        k : int;
        dtype : Dtype.any;
      }
    (* Data movement *)
    | L_Cast of { dst : Var.t; src : Var.t; dtype : Dtype.any }
    | L_Bitcast of { dst : Var.t; src : Var.t; dtype : Dtype.any }
    | L_Assign of { dst : Var.t; src : Var.t }
    (* Custom operations *)
    | L_Custom of {
        dst : Var.t option;
        op_name : string;
        args : Var.t array;
        attributes : (string * custom_attr) list;
        inline : bool; (* CUSTOMI vs CUSTOM *)
      }
    (* No-op *)
    | L_Noop

  (* A lowered kernel: a flat instruction stream plus its side tables. *)
  type graph_t = {
    instructions : instruction list;
    vars_metadata : (Var.t, var_metadata) Hashtbl.t;
    kernel_input_vars : Var.t list;
    kernel_output_vars : Var.t list;
    symbolic_vars : SymVar.t list;
  }
end