Module type Backend_intf.Backend

include Backend_common
type code
val sexp_of_code : code -> Sexplib0.Sexp.t
type code_batch
val sexp_of_code_batch : code_batch -> Sexplib0.Sexp.t
type optimize_ctx
val get_optimize_ctx : code -> optimize_ctx
val get_optimize_ctx_batch : code_batch -> optimize_ctx
val compile : optimize_ctx -> ?name:Base.string -> Indexing.unit_bindings -> Assignments.comp -> code

name is used to derive names for compilation artifacts. If omitted, it's derived via Assignments.get_name_exn.

val compile_batch : optimize_ctx -> ?names:Base.string Base.array -> ?occupancy:(name:Base.string -> src_n:Base.int -> Base.bool) -> Indexing.unit_bindings -> Assignments.comp Base.array -> code_batch

Compared to compile, compile_batch is mostly about improving compile time and debugging convenience by generating fewer files. Ideally it does not affect execution, but there can be backend-specific differences. Only array entries for which occupancy returns true are included. names are used to derive names for compilation artifacts. If omitted, they're derived via Assignments.get_name_exn.

include Backend_device_common with type optimize_ctx := optimize_ctx
include Device with type optimize_ctx := optimize_ctx
include Device_types with type optimize_ctx := optimize_ctx
include Device_config with type optimize_ctx := optimize_ctx
include Device_config_common
type dev

Interface to a device driver.

val sexp_of_dev : dev -> Sexplib0.Sexp.t
type runner

Interface to a stream driver.

val sexp_of_runner : runner -> Sexplib0.Sexp.t
type event

An event tracks if a device's runner finished computing past a particular point in its schedule. These values are used internally for scheduling across devices/queues of the backend, and can be used for explicit scheduling.

val sexp_of_event : event -> Sexplib0.Sexp.t
val name : Base.string
val sexp_of_optimize_ctx : optimize_ctx -> Sexplib0.Sexp.t
val empty_optimize_ctx : Base.unit -> optimize_ctx
type nonrec device = (dev, runner, event) device
val sexp_of_device : device -> Sexplib0.Sexp.t
type nonrec context = (dev, runner, event, optimize_ctx) context
val sexp_of_context : context -> Sexplib0.Sexp.t
include Slab_alloc with type device := device
val alloc_pool : ?mode:Tnode.memory_mode -> device -> pool_id:Base.int -> size_in_bytes:Base.int -> alignment:Base.int -> Base.unit

Allocates the slab for pool_id on device. The optional ?mode carries the tnode's memory mode so backends can pick a storage mode (Metal private vs. shared); backends that do not care ignore it.

val free_pool : (device -> pool_id:Base.int -> Base.unit) Base.option

Frees the slab for pool_id and drops its table entry. None for backends that rely on GC.

val memset_zero : device -> pool_id:Base.int -> offset:Base.int -> size_in_bytes:Base.int -> Base.unit

Zero-initializes size_in_bytes bytes starting at base_of pool_id + offset.

val make_device : dev -> runner -> ordinal:Base.int -> device
val make_context : ?ctx_buffers:ctx_buffers -> ?optimize_ctx:optimize_ctx -> device -> context

Returns a context without a parent.

val make_child : ?ctx_buffers:ctx_buffers -> ?optimize_ctx:optimize_ctx -> ?merge_buffer_node:Tnode.t Base.option -> context -> context
val get_name : device -> Base.string
val sync : event -> Base.unit

Blocks till the event completes, if it's not done already.

It is rarely needed to call sync explicitly, because it should always be called internally when necessary, in particular before extracting values from host.

val is_done : event -> Base.bool

Whether the event completed.

val will_wait_for : context -> event -> Base.unit

Schedules waiting for the given event on the context's device.

NOTE: it should rarely be needed to call will_wait_for explicitly, because it should always be called internally when necessary.

val static_properties : Base.Sexp.t

Returns a sexp description of the properties of all devices.

val get_used_memory : device -> Base.int

Returns (an upper bound of) the memory used for arrays, in bytes.

val get_global_debug_info : Base.unit -> Base.Sexp.t

Global debug information; backend-specific, and might evolve independently across backends.

val get_debug_info : device -> Base.Sexp.t

Per-device debug information; backend-specific, and might evolve independently across backends.

val await : device -> Base.unit

Blocks till the device becomes idle, i.e. synchronizes the device's runner.

val all_work : device -> event

Returns the event indicating whether all currently running or scheduled computations on the device have completed.

val is_idle : device -> Base.bool

Whether the device's runner is currently waiting for work.

val get_device : ordinal:Base.int -> device
val num_devices : Base.unit -> Base.int
val new_stream : device -> device

After the stream-into-device fold there is one compute stream per device, so the device is its own single stream; new_stream returns the device unchanged. Retained for call-site compatibility (callers create a fresh context per logical stream via make_context).

Returns the routine for the code's procedure, in a new context derived from the given context.

Returns the routines for the procedures included in the code batch. The returned context is downstream of all the returned routines.

include With_buffer_retrieval_and_syncing with type device := device and type context := context and type event := event
val from_host : context -> Tnode.t -> Ndarray.t -> Base.bool

from_host ctx tn src schedules a copy of the explicit host buffer src into tn's in-context device buffer and returns true, or returns false if the node is not in context. After gh-ocannl-333 the host buffer is supplied by the caller (e.g. Context.set_values); it is no longer read from the tensor node.

val init_from_host : context -> Tnode.t -> Ndarray.t -> context

Schedules a copy from the explicit host buffer to the context: a variant of from_host that requires the input context to not contain the tensor node, and outputs the context with the tensor node.

val to_host : context -> Tnode.t -> Ndarray.t -> Base.bool

to_host ctx tn dst schedules a copy of tn's in-context device buffer into the explicit host buffer dst and returns true, or returns false if the node is not in context. After gh-ocannl-333 the destination buffer is supplied by the caller (e.g. Context.to_host); it is no longer the tensor node's own array.

val device_to_device : Tnode.t -> into_merge_buffer:merge_buffer_use -> dst:context -> src:context -> context routine Base.option

device_to_device tn ~into_merge_buffer ~dst ~src builds a transfer routine instead of scheduling the copy directly. The caller schedules it (e.g. via Task.run r.schedule) or links a consumer against r.context. It returns:

  • None if there is nothing to transfer: the node is absent from src; or, for into_merge_buffer=No, the node is absent from dst or the source and destination buffers are physically the same.
  • Some r otherwise. Running r.schedule waits for writing into the tensor node on src to finish, then performs the copy and updates the writer event.
  • For into_merge_buffer=No, the copy goes from src to dst; r.context is a child of dst inheriting its Backend_intf.context.merge_buffer_node.
  • For into_merge_buffer=Copy, the copy goes from src to the merge buffer of dst's stream; r.context is a child of dst with merge_buffer_node = Some tn, so that linking a consumer of the merge buffer against r.context statically verifies the node.
val init_from_device : Tnode.t -> dst:context -> src:context -> context

Schedules a copy from src to dst: a variant of device_to_device with into_merge_buffer=No that requires the input dst context to not contain the tensor node, and outputs the dst context with the tensor node.

val sync_device : device -> Base.unit

Synchronizes all the streams on a device, and cleans up (removes) all associated events.