Backend_intf (arrayjit.Ir.Backend_intf)

The interface types for backends

The shared backend-interface types: the user-facing API (Backend, routine, buffer_loc) together with the interface pieces the implementation layers assemble from (marked implementation-facing where applicable). Implementation-only components live in Backend_impl.

type buffer_loc = {

pool_id : Base.int;
offset : Base.int;

}

A backend-agnostic, deterministic per-device buffer location: a pool_id into the device's backend-private pool_id -> 'base pool table, plus a byte offset within that pool. The concrete backend pointer (Metal.Buffer.t / CUdeviceptr / void*) lives only in that private table -- it never appears in any type of this shared interface -- so buffer_loc (pure integers) is stable across runs, diffable, and meaningful in logs and .expected files. Phase-1 policy is one pool per tnode at offset = 0, byte-for-byte equivalent to per-tnode allocation. An alias (future work) is the parent's { pool_id; offset = offset + delta }.

val buffer_loc_of_sexp : Sexplib0.Sexp.t -> buffer_loc

val sexp_of_buffer_loc : buffer_loc -> Sexplib0.Sexp.t

val compare_buffer_loc : buffer_loc -> buffer_loc -> Base.int

val equal_buffer_loc : buffer_loc -> buffer_loc -> Base.bool

type ctx_buffers = buffer_loc Base.Map.M(Ir.Tnode).t

val sexp_of_ctx_buffers : ctx_buffers -> Sexplib0.Sexp.t

type mma_capability = {

mma_simd_width : Base.int;
(*
Threads cooperating in one tile-MMA instruction (CUDA warp / Metal simdgroup width).
*)
mma_tile : Base.int * Base.int * Base.int;
(*
The intrinsic tile shape (m, n, k) (8×8×8 for Metal simdgroup_matrix, 16×16×16 for CUDA wmma); a Low_level.t.Tile_mma's block extents must be multiples of it.
*)

}

Tensor-core capability descriptor (docs/proposals/tensorize-mma.md §6). Which operand precisions are supported is decided per call by the backend's mma_syntax hook (the emission is the source of truth); this record carries what schedule construction needs.

val mma_capability_of_sexp : Sexplib0.Sexp.t -> mma_capability

val sexp_of_mma_capability : mma_capability -> Sexplib0.Sexp.t

val compare_mma_capability : mma_capability -> mma_capability -> Base.int

val equal_mma_capability : mma_capability -> mma_capability -> Base.bool

type hardware_limits = {

max_threads_per_workgroup : Base.int Base.option;
(*
Upper bound on the number of threads in one workgroup (CUDA thread block / Metal threadgroup); None when the backend imposes no limit (the C backends render annotated loops serially).
*)
max_workgroup_memory_bytes : Base.int Base.option;
(*
Capacity in bytes of the workgroup-shared memory (CUDA __shared__ / Metal threadgroup); None when the backend imposes no limit.
*)
mma : mma_capability Base.option;
(*
Tile-MMA units (simdgroup_matrix / tensor cores); None when the backend has none wired — Tile_mma statements then render their scalar fallback.
*)

}

val hardware_limits_of_sexp : Sexplib0.Sexp.t -> hardware_limits

val sexp_of_hardware_limits : hardware_limits -> Sexplib0.Sexp.t

val compare_hardware_limits : hardware_limits -> hardware_limits -> Base.int

val equal_hardware_limits : hardware_limits -> hardware_limits -> Base.bool

val no_hardware_limits : hardware_limits

module type Slab_alloc = sig ... end

The backend slab allocator, replacing the per-tnode Alloc_buffer interface. The shared allocator seam (see Backends) mints deterministic per-device pool_ids and calls these int-in / int-out primitives; the backend keeps the pool_id -> 'base table private. The pool_id -> 'base resolution (then base + offset) stays inside the backend.

type merge_buffer_use =

| No
| Copy

val sexp_of_merge_buffer_use : merge_buffer_use -> Sexplib0.Sexp.t

type kparam_source =

| Log_file_name
| Merge_buffer
| Kparam_ptr of Tnode.t
| Kparam_pool_slab of Base.int
(*
gh-ocannl-344: the i-th pool base-pointer parameter of a pooled kernel (Metal). A fixed number of these is emitted; at link time the backend binds slab i to the pool assigned index i (or to a duplicate of an in-use pool for the unused tail). This lets a kernel reach hundreds of tensor nodes through a handful of bound pools, staying under Metal's ~31 binding limit.
*)
| Kparam_pool_slots of Tnode.t Base.list
(*
gh-ocannl-344: the per-routine slot table accompanying Kparam_pool_slab. For the k-th tnode in this list the backend writes (pool_index, byte_offset); the shader reads it to form the typed pointer by casting (pools at pool_index) + byte_offset. Emitted only by pooled (Metal) codegen; per-tnode pointer backends (C, CUDA) never produce it.
*)
| Static_idx of Indexing.static_symbol

Kernel-parameter sources: the codegen <-> backend contract for a compiled routine's parameters. Implementation-facing (consumed by C_syntax and the backends' link steps); it lives in this file because the shared Backend_impl.Lowered_no_device_backend signature mentions it.

val sexp_of_kparam_source : kparam_source -> Sexplib0.Sexp.t

type 'context routine = {

context : 'context;
schedule : Task.t;
bindings : Indexing.lowered_bindings;
name : Base.string;
inputs : Base.Set.M(Ir.Tnode).t;
(*
The materialized read-only and read-before-write (within the routine) non-constant nodes. They are inputs in a broad sense, as they could be recurrent nodes or parameters.
*)
merge_buffer_input : Tnode.t Base.option;
(*
Similar to inputs, for the merge buffer.
*)
outputs : Base.Set.M(Ir.Tnode).t;
(*
All the materialized nodes written-to by the routine.
*)

}

val sexp_of_routine : 
  'context. ('context -> Sexplib0.Sexp.t) ->
  'context routine ->
  Sexplib0.Sexp.t

module type Device_config_common = sig ... end

type ('dev, 'runner, 'event) device = {

dev : 'dev;
ordinal : Base.int;
(*
The index of this device among the devices of its backend, in the range from 0 to (the number of the backend's devices - 1).
*)
device_id : Base.int;
(*
A unique identifier among all device instances of all backends. Note that multiple device_id values (distinct device instances) might refer to the same physical device.
*)
runner : 'runner;
merge_buffer : buffer_loc Base.option Base.ref;
(*
The merge buffer's reserved single-tenant pool location, or None if not yet allocated. The slab can be reused (grown in place) for nodes that fit.
*)
mutable merge_buffer_capacity : Base.int;
(*
Byte capacity of the reserved merge-buffer pool; drives the grow decision.
*)
updating_for : 'event Base.Hashtbl.M(Ir.Tnode).t;
(*
The completion event for the most recent update of (i.e. write to) a node via this device.
*)
mutable updating_for_merge_buffer : (Tnode.t * 'event Base.option) Base.option;
(*
The tensor node that was most recently scheduled to be in the device's merge buffer. See also updating_for.
*)
constant_buffer_cache : buffer_loc Base.Hashtbl.M(Ir.Tnode).t;
(*
Per-device cache for read-only/constant buffer allocations.
*)
mutable next_pool_id : Base.int;
(*
Deterministic per-device pool-id counter, advanced by the shared allocator seam in tnode iteration order. Pool id 0 is reserved for the merge buffer; tnode pools start at 1.
*)

}

A device bundles its single compute runner with the associated buffer and event tracking: the merge_buffer, the updating_for writer events (used for cross-device coherence by Backend.device_to_device), and the deterministic pool-id counter. The design is forward-compatible with a future fixed-role prefetch/transfer runner.

val sexp_of_device : 'a -> 'b -> 'c -> ('d, 'e, 'f) device -> Sexplib0.Sexp.t

val equal_device : ('a, 'b, 'c) device -> ('d, 'e, 'f) device -> bool

val merge_buffer_pool_id : int

Pool id 0 on every device is reserved for the (single-tenant) merge buffer.

type ('dev, 'runner, 'event) context = {

device : ('dev, 'runner, 'event) device;
parent : ('dev, 'runner, 'event) context Base.option;
ctx_buffers : ctx_buffers;
(*
This map contains the deterministic buffer locations used in this context or an ancestor context.
*)
finalized : Utils.atomic_bool;
optimize_ctx : Low_level.optimize_ctx;
(*
The optimization context threaded through compilation: all OCANNL backends compile through the Low_level IR, so this is concretely Low_level.optimize_ctx (the abstraction for hypothetical assignments-level backends was retired; the Assignments.comp -> code seam can be reintroduced if such a backend ever materializes).
*)
merge_buffer_node : Tnode.t Base.option;
(*
The tensor node that a Backend.device_to_device transfer with into_merge_buffer:Copy placed (or will place) into this context's device's merge buffer. It is a static, immutably-chained fact carried producer -> consumer: linking a consumer whose code expects a merge-buffer node verifies it against this field at link time. A transfer with into_merge_buffer:No does not touch the merge buffer and inherits the parent's value.
*)

}

val sexp_of_context : 
  'dev 'runner 'event. ('dev -> Sexplib0.Sexp.t) ->
  ('runner -> Sexplib0.Sexp.t) ->
  ('event -> Sexplib0.Sexp.t) ->
  ('dev, 'runner, 'event) context ->
  Sexplib0.Sexp.t

module type Device_types = sig ... end

module type Device = sig ... end

module type Backend_device_common = sig ... end

The device, event and synchronization part of the backend interface, shared by the user-facing Backend and the implementation-facing Backend_impl.Lowered_backend. Does not include: compilation and linking (they differ between the user-facing and lowered interfaces); copying and tensor-node-level synchronization (copying is different for user-facing and implementation-facing APIs, synchronization is provided by a component outside of backend implementations).

module type With_buffer_retrieval_and_syncing = sig ... end

module type Backend = sig ... end