Namespaces
namespace	Detail
namespace	Gelu
namespace	Gqa
namespace	Kernels
namespace	LayerNorm
namespace	Linear
namespace	Lpe
namespace	MatMulBiasGelu
namespace	MultiHeadAttention
namespace	Residual
namespace	RmsNorm
namespace	Rope
namespace	Softmax
namespace	SoftmaxCrossEntropy
namespace	Swiglu
namespace	TokenEmbedding

Classes
struct	CublasLtLinearPlan
	RAII wrapper owning cuBLASLt descriptors for a Linear matmul. More...
struct	CublasLtMatMulPlan
	RAII wrapper owning cuBLASLt descriptors and the selected heuristic algorithm. More...
class	CublasLtPlanCache
	Generic plan cache keyed on batch size bucket. More...
struct	CudaDataTypeTraits
	Compile-time mapping from TensorDataType -> cudaDataType_t. More...
struct	CudaDataTypeTraits< TensorDataType::BF16 >
struct	CudaDataTypeTraits< TensorDataType::FP16 >
struct	CudaDataTypeTraits< TensorDataType::FP32 >
struct	CudaDataTypeTraits< TensorDataType::FP8_E4M3 >
struct	CudaDataTypeTraits< TensorDataType::FP8_E5M2 >
struct	CudaDataTypeTraits< TensorDataType::INT32 >
struct	CudaDataTypeTraits< TensorDataType::INT8 >
struct	FillOps
	CUDA specialization of TensorOps for initialization operations. More...
struct	MathOps
	CUDA specialization of TensorOps for mathematical operations. More...
struct	RandomOps
struct	StructuralOps
struct	TensorDataTypeMap
	Compile-time mapping from abstract TensorDataType -> CUDA native device type. More...
struct	TensorDataTypeMap< TensorDataType::BF16 >
	Maps TensorDataType::BF16 to CUDA __nv_bfloat16. More...
struct	TensorDataTypeMap< TensorDataType::FP16 >
	Maps TensorDataType::FP16 to CUDA __half. More...
struct	TensorDataTypeMap< TensorDataType::FP32 >
	Maps TensorDataType::FP32 to CUDA float. More...
struct	TensorDataTypeMap< TensorDataType::FP4_E2M1 >
	Maps TensorDataType::FP4_E2M1 to std::uint8_t. More...
struct	TensorDataTypeMap< TensorDataType::FP4_E3M0 >
	Maps TensorDataType::FP4_E3M0 to std::uint8_t. More...
struct	TensorDataTypeMap< TensorDataType::FP8_E4M3 >
	Maps TensorDataType::FP8_E4M3 to CUDA __nv_fp8_e4m3. More...
struct	TensorDataTypeMap< TensorDataType::FP8_E5M2 >
	Maps TensorDataType::FP8_E5M2 to CUDA __nv_fp8_e5m2. More...
struct	TensorDataTypeMap< TensorDataType::INT16 >
	Maps TensorDataType::INT16 to std::int16_t. More...
struct	TensorDataTypeMap< TensorDataType::INT32 >
	Maps TensorDataType::INT32 to std::int32_t. More...
struct	TensorDataTypeMap< TensorDataType::INT8 >
	Maps TensorDataType::INT8 to std::int8_t. More...
struct	TensorDataTypeMap< TensorDataType::UINT16 >
	Maps TensorDataType::UINT16 to std::uint16_t. More...
struct	TensorDataTypeMap< TensorDataType::UINT32 >
	Maps TensorDataType::UINT32 to std::uint32_t. More...
struct	TensorDataTypeMap< TensorDataType::UINT8 >
	Maps TensorDataType::UINT8 to std::uint8_t. More...
struct	TransferOps
	CUDA specialization of TensorOps for tensor transfer operations. More...
struct	ZeroOps

Typedefs
template<TensorDataType TDataType>
using	Mila::Dnn::Compute::Cuda::device_type_t = typename TensorDataTypeMap<TDataType>::device_type
	Convenience alias for accessing CUDA native type.

Functions
template<TensorDataType TComputePrecision, TensorDataType TParameterPrecision = TComputePrecision>
CublasLtLinearPlan< TComputePrecision, TParameterPrecision >	Mila::Dnn::Compute::Cuda::build_linear_plan (cublasLtHandle_t handle, int outer_size, int in_features, int out_features, bool has_bias, cublasComputeType_t compute_type, cudaDataType_t scale_type, const float *weight_scale=nullptr)
	Build a cuBLASLt plan for a Linear matmul.
template<typename TNative>
CublasLtMatMulPlan< TNative >	Mila::Dnn::Compute::Cuda::build_plan (cublasLtHandle_t handle, int outer_size, int in_features, int out_features, bool has_bias, cudaDataType_t data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type)
	Build a cuBLASLt plan for a standard (non-strided) matmul.
template<typename TComputePrecision>
CublasLtMatMulPlan< TComputePrecision >	Mila::Dnn::Compute::Cuda::build_strided_plan (cublasLtHandle_t handle, int A_rows, int A_cols, int ldA, long long strideA_elems, int B_rows, int B_cols, int ldB, long long strideB_elems, int C_rows, int C_cols, int ldC, long long strideC_elems, cublasOperation_t opA, cublasOperation_t opB, int strided_batch_count, bool has_bias=false, cublasComputeType_t compute_type=CUBLAS_COMPUTE_32F, cudaDataType_t cuda_data_type=CUDA_R_32F, cudaDataType_t scale_type=CUDA_R_32F, cublasLtOrder_t order=CUBLASLT_ORDER_ROW)
	Build a cuBLASLt matmul plan for strided-batched matmuls.
constexpr int	Mila::Dnn::Compute::Cuda::ceil_div (int M, int N)
	Calculates ceiling division for kernel grid/block dimensions.
int	Mila::Dnn::Compute::Cuda::checkDevice (int deviceId)
	Validates that a device ID is valid and available.
std::vector< int >	Mila::Dnn::Compute::Cuda::computeArchitectureBuckets (int max_batch_size)
	Computes optimal bucket boundaries for cuBLASLt plan caching based on CUDA device architecture.
template<typename NativeType>
void	Mila::Dnn::Compute::Cuda::cublaslt_compute_types (cublasComputeType_t &compute_type, cudaDataType_t &scale_type)
template<typename NativeType>
cudaDataType_t	Mila::Dnn::Compute::Cuda::cublaslt_cuda_data_type ()
void	cuda_split3_bf16 (const __nv_bfloat16 __restrict__ src, __nv_bfloat16 __restrict__ out0, __nv_bfloat16 __restrict__ out1, __nv_bfloat16 __restrict__ out2, int rows, int D0, int D1, int D2, cudaStream_t stream)
void	cuda_split3_fp32 (const float __restrict__ src, float __restrict__ out_a, float __restrict__ out_b, float __restrict__ out_c, int src_rows, int dim_a, int dim_b, int dim_c, cudaStream_t stream)
	Vectorized 3-way last-dimension split, float32.
template<typename T>
void	Mila::Dnn::Compute::Cuda::dump_2d_rowmajor_host (std::ostringstream &oss, const T *host_data, int rows, int cols, int max_display, int indent=0)
	Helper to dump a single 2D row-major matrix (host memory).
template<typename T = float>
std::string	Mila::Dnn::Compute::Cuda::dump_tensor (const T *device_data, const shape_t &shape, const std::string &name="tensor", int max_display_size=16, cudaStream_t stream=nullptr)
	Debug utility to dump row-major tensor from device memory.
template<TensorDataType TComputePrecision, TensorDataType TParameterPrecision = TComputePrecision>
void	Mila::Dnn::Compute::Cuda::execute_linear_plan (cublasLtHandle_t handle, const CublasLtLinearPlan< TComputePrecision, TParameterPrecision > &plan, const float alpha, const void A, const void B, const float beta, typename CublasLtLinearPlan< TComputePrecision, TParameterPrecision >::ActivationType C, const typename CublasLtLinearPlan< TComputePrecision, TParameterPrecision >::ActivationType bias, const float weight_scale, cudaStream_t stream, void workspace=nullptr, size_t workspace_size=0)
	Execute a previously-built CublasLtLinearPlan.
template<typename TComputePrecision>
void	Mila::Dnn::Compute::Cuda::execute_plan (cublasLtHandle_t handle, const CublasLtMatMulPlan< TComputePrecision > &plan, const void alpha, const TComputePrecision A, const TComputePrecision B, const void beta, TComputePrecision C, const TComputePrecision bias, cudaStream_t stream, void *workspace=nullptr, size_t workspaceSize=0)
	Execute a previously-built cuBLASLt plan.
int	Mila::Dnn::Compute::Cuda::findCudaDevice (int deviceId=-1, bool preferMemory=false)
	Finds the most appropriate CUDA device for computation.
int	Mila::Dnn::Compute::Cuda::getBestDeviceId (bool preferMemory=false)
	Identifies the best CUDA device based on performance characteristics.
int	Mila::Dnn::Compute::Cuda::getBucket (const std::vector< int > &buckets, int batch_size)
	Fast O(log N) bucket lookup.
int	Mila::Dnn::Compute::Cuda::getDeviceCount ()
	Gets the number of available CUDA devices.
int	Mila::Dnn::Compute::Cuda::getDriverVersion ()
	Gets the installed CUDA driver version.
int	Mila::Dnn::Compute::Cuda::getRuntimeVersion ()
	Gets the installed CUDA runtime version.
template<typename TargetType, typename HostType>
void	launch_array_fill_typed (void dst, const HostType host_values, size_t count, cudaStream_t stream)
template<typename TargetType, typename HostType>
void	launch_constant_fill_typed (void *dst, size_t count, HostType host_value, cudaStream_t stream)
	Templated fill operations with compile-time type dispatch.
template<typename SrcT, typename DstT>
void	launch_convert_copy_kernel (const SrcT d_src, DstT d_or_h_dst, size_t n, cudaStream_t stream)
	Launch type-converting copy kernel between tensors.
template<typename T>
void	launch_fast_copy_kernel (const T d_src, T d_dst, size_t n, cudaStream_t stream)
	Launch optimized same-type copy kernel.
void	launch_scale_shift (float *data, size_t n, float min_val, float max_val, cudaStream_t stream)
template<typename T>
void	Mila::Dnn::Compute::Cuda::print_stats (const std::string &name, const T *data, const shape_t &shape, int max_display=8, cudaStream_t stream=nullptr)
	Copies a device tensor to host, formats it via dump_tensor, and emits the result through Logging::Logger::info.
void	Mila::Dnn::Compute::Cuda::setCurrentDevice (int device_id)
	Sets the current CUDA device with thread-local caching.

Variables
template<TensorDataType TDataType>
constexpr cudaDataType_t	Mila::Dnn::Compute::Cuda::cuda_data_type_v = CudaDataTypeTraits<TDataType>::cuda_data_type
	Convenience alias for accessing the cudaDataType_t mapping directly.

Typedef Documentation

◆ device_type_t

template<TensorDataType TDataType>

using Mila::Dnn::Compute::Cuda::device_type_t = typename TensorDataTypeMap<TDataType>::device_type

export

Convenience alias for accessing CUDA native type.

Template Parameters

TDataType Abstract tensor data type

Function Documentation

◆ build_linear_plan()

template<TensorDataType TComputePrecision, TensorDataType TParameterPrecision = TComputePrecision>

CublasLtLinearPlan< TComputePrecision, TParameterPrecision > Mila::Dnn::Compute::Cuda::build_linear_plan	(	cublasLtHandle_t	handle,
		int	outer_size,
		int	in_features,
		int	out_features,
		bool	has_bias,
		cublasComputeType_t	compute_type,
		cudaDataType_t	scale_type,
		const float *	weight_scale = nullptr )

export

Build a cuBLASLt plan for a Linear matmul.

Layout is selected at compile time based on kIsQuantized:

Non-quantized (TComputePrecision == TParameterPrecision): NT row-major — A = activations, B = weights, opA=N, opB=T C[outer_size, out_features] = A[outer_size, in_features] × B^T[in_features, out_features]

Quantized (Ada SM 8.9+, TParameterPrecision = FP8_E4M3): TN column-major — A = weights (FP8), B = activations (BF16), opA=T, opB=N Exploits the row-major / column-major duality: row-major W[N, K] ≡ col-major W^T[K, N] (same bytes, lda = K) row-major X[M, K] ≡ col-major X^T[K, M] (same bytes, ldb = K) op(A) = (W^T)^T = W[N, K], op(B) = X^T[K, M] D = W × X^T = Y^T[N, M] col-major ≡ row-major Y[M, N] (ldc = N) A_SCALE_POINTER = per-tensor weight scale (weight_scales_[0]).

Parameters

outer_size	Token count (M = B * T for transformers).
in_features	Inner dimension K.
out_features	Output channels N.
has_bias	Activates CUBLASLT_EPILOGUE_BIAS (non-quantized path only).
compute_type	Supplied by CudaLinearOp::getComputeTypes().
scale_type	Always CUDA_R_32F.
weight_scale	Quantized path only: device pointer to the per-tensor weight scale (weight_scales_[0]). Must be non-null when kIsQuantized because CUBLASLT_MATMUL_DESC_A_SCALE_POINTER must be set in the descriptor before cublasLtMatmulAlgoGetHeuristic for FP8 algorithms to be returned. Ignored on the non-quantized path (default nullptr).

Here is the call graph for this function:

Here is the caller graph for this function:

◆ build_plan()

template<typename TNative>

CublasLtMatMulPlan< TNative > Mila::Dnn::Compute::Cuda::build_plan	(	cublasLtHandle_t	handle,
		int	outer_size,
		int	in_features,
		int	out_features,
		bool	has_bias,
		cudaDataType_t	data_type,
		cublasComputeType_t	compute_type,
		cudaDataType_t	scale_type )

export

Build a cuBLASLt plan for a standard (non-strided) matmul.

Computes C[outer_size, out_features] = A[outer_size, in_features] @ B^T[in_features, out_features] Row-major layout, opA=N, opB=T, strided_batch_count=1.

Parameters

outer_size Bucket-aligned row count for A and C (C=B*T for transformers).

Here is the call graph for this function:

Here is the caller graph for this function:

◆ build_strided_plan()

template<typename TComputePrecision>

CublasLtMatMulPlan< TComputePrecision > Mila::Dnn::Compute::Cuda::build_strided_plan	(	cublasLtHandle_t	handle,
		int	A_rows,
		int	A_cols,
		int	ldA,
		long long	strideA_elems,
		int	B_rows,
		int	B_cols,
		int	ldB,
		long long	strideB_elems,
		int	C_rows,
		int	C_cols,
		int	ldC,
		long long	strideC_elems,
		cublasOperation_t	opA,
		cublasOperation_t	opB,
		int	strided_batch_count,
		bool	has_bias = false,
		cublasComputeType_t	compute_type = CUBLAS_COMPUTE_32F,
		cudaDataType_t	cuda_data_type = CUDA_R_32F,
		cudaDataType_t	scale_type = CUDA_R_32F,
		cublasLtOrder_t	order = CUBLASLT_ORDER_ROW )

export

Build a cuBLASLt matmul plan for strided-batched matmuls.

Accepts explicit layout dimensions (rows/cols/ld) and per-element strides for A, B, C. The heuristic algorithm is selected at build time for the given dimensions and reused at execution time. When actual_rows is passed to execute_plan, only the row count embedded in layoutA and layoutC is updated — the selected algorithm remains valid for any row count up to the bucket it was built for.

Parameters

strided_batch_count Number of independent matmul instances (1 for Linear, B*NH for Attention).

Note: CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET expects stride in elements, not bytes.

Here is the call graph for this function:

Here is the caller graph for this function:

◆ ceil_div()

int Mila::Dnn::Compute::Cuda::ceil_div	(	int	M,
		int	N )

constexprexport

Calculates ceiling division for kernel grid/block dimensions.

Parameters

M	Dividend value
N	Divisor value

Returns: Ceiling of M/N as an integer

◆ checkDevice()

int Mila::Dnn::Compute::Cuda::checkDevice ( int deviceId )

export

Validates that a device ID is valid and available.

Parameters

deviceId CUDA device ID to check

Returns: The same device ID if valid

Exceptions

std::invalid_argument	If device ID is negative
std::runtime_error	If no CUDA devices are available
std::out_of_range	If device ID exceeds available device count
std::runtime_error	If device is in prohibited compute mode

Here is the call graph for this function:

Here is the caller graph for this function:

◆ computeArchitectureBuckets()

std::vector< int > Mila::Dnn::Compute::Cuda::computeArchitectureBuckets ( int max_batch_size )

inlineexport

Computes optimal bucket boundaries for cuBLASLt plan caching based on CUDA device architecture.

Bucket sizes are aligned to tensor core tile granularity: Pre-Volta (SM<70): grain=32 (warp-aligned, no tensor cores) Volta (SM70): grain=8 (FP16 tensor cores, 8x8x4 MMA) Turing (SM75): grain=16 Ampere+ (SM80+): grain=16 (TF32/BF16/FP16, 16x8x16 MMA) Hopper (SM90+): grain=16 (FP8 wgmma)

Here is the caller graph for this function:

◆ cublaslt_compute_types()

template<typename NativeType>

void Mila::Dnn::Compute::Cuda::cublaslt_compute_types	(	cublasComputeType_t &	compute_type,
		cudaDataType_t &	scale_type )

export

◆ cublaslt_cuda_data_type()

template<typename NativeType>

cudaDataType_t Mila::Dnn::Compute::Cuda::cublaslt_cuda_data_type ( )

export

◆ cuda_split3_bf16()

void Mila::Dnn::Compute::Cuda::cuda_split3_bf16	(	const __nv_bfloat16 *__restrict__	src,
		__nv_bfloat16 *__restrict__	out0,
		__nv_bfloat16 *__restrict__	out1,
		__nv_bfloat16 *__restrict__	out2,
		int	rows,
		int	D0,
		int	D1,
		int	D2,
		cudaStream_t	stream )

Here is the caller graph for this function:

◆ cuda_split3_fp32()

void Mila::Dnn::Compute::Cuda::cuda_split3_fp32	(	const float *__restrict__	src,
		float *__restrict__	out_a,
		float *__restrict__	out_b,
		float *__restrict__	out_c,
		int	src_rows,
		int	dim_a,
		int	dim_b,
		int	dim_c,
		cudaStream_t	stream )

Vectorized 3-way last-dimension split, float32.

Splits input [B, T, D0+D1+D2] into three output tensors [B, T, D0], [B, T, D1], [B, T, D2] along the last dimension.

Parameters

src	Input device buffer [B * T * (D0+D1+D2) floats].
out0	Output device buffer [B * T * D0 floats].
out1	Output device buffer [B * T * D1 floats].
out2	Output device buffer [B * T * D2 floats].
B	Batch size.
T	Sequence length.
D0	Last-dim size of output 0. Must be a multiple of 4.
D1	Last-dim size of output 1. Must be a multiple of 4.
D2	Last-dim size of output 2. Must be a multiple of 4.
stream	CUDA stream for kernel scheduling.

Here is the caller graph for this function:

◆ dump_2d_rowmajor_host()

template<typename T>

void Mila::Dnn::Compute::Cuda::dump_2d_rowmajor_host	(	std::ostringstream &	oss,
		const T *	host_data,
		int	rows,
		int	cols,
		int	max_display,
		int	indent = 0 )

export

Helper to dump a single 2D row-major matrix (host memory).

Indexing: element (row=r, col=c) -> host_data[r * cols + c]

Here is the caller graph for this function:

◆ dump_tensor()

template<typename T = float>

std::string Mila::Dnn::Compute::Cuda::dump_tensor	(	const T *	device_data,
		const shape_t &	shape,
		const std::string &	name = "tensor",
		int	max_display_size = 16,
		cudaStream_t	stream = nullptr )

export

Debug utility to dump row-major tensor from device memory.

This utility copies data from device to host, then properly interprets the row-major layout for display.

Template Parameters

T	Data type (float, __half, etc.)

Parameters

device_data	Device pointer to row-major data
shape	Shape vector (shape_t)
name	Tensor name for display
max_display_size	Maximum elements to display per dimension
stream	CUDA stream for async copy (nullptr for default stream)

Returns: String representation of the tensor

Here is the call graph for this function:

Here is the caller graph for this function:

◆ execute_linear_plan()

template<TensorDataType TComputePrecision, TensorDataType TParameterPrecision = TComputePrecision>

void Mila::Dnn::Compute::Cuda::execute_linear_plan	(	cublasLtHandle_t	handle,
		const CublasLtLinearPlan< TComputePrecision, TParameterPrecision > &	plan,
		const float *	alpha,
		const void *	A,
		const void *	B,
		const float *	beta,
		typename CublasLtLinearPlan< TComputePrecision, TParameterPrecision >::ActivationType *	C,
		const typename CublasLtLinearPlan< TComputePrecision, TParameterPrecision >::ActivationType *	bias,
		const float *	weight_scale,
		cudaStream_t	stream,
		void *	workspace = nullptr,
		size_t	workspace_size = 0 )

export

Execute a previously-built CublasLtLinearPlan.

Computes: D = alpha * op(A) * op(B) + beta * C with optional bias epilogue (non-quantized path) and optional weight scale (FP8 path).

Pointer semantics match the layout built by build_linear_plan: Non-quantized (NT row-major): A = activations, B = weights Quantized (TN col-major): A = weights (FP8), B = activations (BF16)

Both A and B are passed as const void* to accommodate the pointer-swap between the two layouts without template type conflicts. Type safety is enforced at the plan-build level via the layout descriptors.

Parameters

A	Device pointer to matrix A (see layout note above).
B	Device pointer to matrix B (see layout note above).
C	Device pointer to output (TComputePrecision).
bias	Device pointer to bias vector; used only when has_bias_epilogue.
weight_scale	Device pointer to per-tensor weight scale (float); applied as A_SCALE_POINTER when has_weight_scale is true. For the TN path weight_scales_[0] is the global per-tensor scale.
workspace	Optional device scratch buffer.
workspace_size	Size of workspace in bytes.

Here is the call graph for this function:

Here is the caller graph for this function:

◆ execute_plan()

template<typename TComputePrecision>

void Mila::Dnn::Compute::Cuda::execute_plan	(	cublasLtHandle_t	handle,
		const CublasLtMatMulPlan< TComputePrecision > &	plan,
		const void *	alpha,
		const TComputePrecision *	A,
		const TComputePrecision *	B,
		const void *	beta,
		TComputePrecision *	C,
		const TComputePrecision *	bias,
		cudaStream_t	stream,
		void *	workspace = nullptr,
		size_t	workspaceSize = 0 )

export

Execute a previously-built cuBLASLt plan.

Computes: C = alpha * op(A) * op(B) + beta * C, with optional bias epilogue.

Parameters

bias	Device pointer to bias vector. Applied only when plan.has_bias_epilogue is true.
workspace	Optional device workspace buffer. Must be non-null when the selected algorithm requires scratch memory (workspaceSize > 0 in heuristic result).
workspaceSize	Size of the workspace buffer in bytes.

Here is the call graph for this function:

Here is the caller graph for this function:

◆ findCudaDevice()

int Mila::Dnn::Compute::Cuda::findCudaDevice	(	int	deviceId = -1,
		bool	preferMemory = false )

inlineexport

Finds the most appropriate CUDA device for computation.

Either validates a specific device ID if provided or finds the best available device when no preference is specified.

Parameters

deviceId Preferred device ID, or -1 to select the best device

Returns: Valid CUDA device ID

Exceptions

std::runtime_error If no CUDA devices are found

Here is the call graph for this function:

◆ getBestDeviceId()

int Mila::Dnn::Compute::Cuda::getBestDeviceId ( bool preferMemory = false )

inlineexport

Identifies the best CUDA device based on performance characteristics.

Evaluates available CUDA devices and selects the one with highest performance potential. Selection criteria vary based on the intended workload type.

Parameters

preferMemory When true, prioritizes memory bandwidth over compute

Returns: Device ID of the best available CUDA device

Exceptions

CudaError If device properties cannot be accessed

Here is the call graph for this function:

Here is the caller graph for this function:

◆ getBucket()

int Mila::Dnn::Compute::Cuda::getBucket	(	const std::vector< int > &	buckets,
		int	batch_size )

inlineexport

Fast O(log N) bucket lookup.

Returns smallest bucket >= batch_size.

Here is the caller graph for this function:

◆ getDeviceCount()

int Mila::Dnn::Compute::Cuda::getDeviceCount ( )

inlineexport

Gets the number of available CUDA devices.

Returns: Number of CUDA devices available to the application

Exceptions

CudaError If device enumeration fails

Here is the call graph for this function:

Here is the caller graph for this function:

◆ getDriverVersion()

int Mila::Dnn::Compute::Cuda::getDriverVersion ( )

export

Gets the installed CUDA driver version.

Returns: Integer representation of the CUDA driver version

Exceptions

CudaError If driver version cannot be determined

Here is the call graph for this function:

◆ getRuntimeVersion()

int Mila::Dnn::Compute::Cuda::getRuntimeVersion ( )

export

Gets the installed CUDA runtime version.

Returns: Integer representation of the CUDA runtime version

Exceptions

CudaError If runtime version cannot be determined

Here is the call graph for this function:

◆ launch_array_fill_typed()

template<typename TargetType, typename HostType>

void Mila::Dnn::Compute::Cuda::launch_array_fill_typed	(	void *	dst,
		const HostType *	host_values,
		size_t	count,
		cudaStream_t	stream )

Here is the caller graph for this function:

◆ launch_constant_fill_typed()

template<typename TargetType, typename HostType>

void Mila::Dnn::Compute::Cuda::launch_constant_fill_typed	(	void *	dst,
		size_t	count,
		HostType	host_value,
		cudaStream_t	stream )

Templated fill operations with compile-time type dispatch.

Here is the caller graph for this function:

◆ launch_convert_copy_kernel()

template<typename SrcT, typename DstT>

void Mila::Dnn::Compute::Cuda::launch_convert_copy_kernel	(	const SrcT *	d_src,
		DstT *	d_or_h_dst,
		size_t	n,
		cudaStream_t	stream )

Launch type-converting copy kernel between tensors.

Copies data between tensors with automatic type conversion using specialized device conversion functions for optimal performance.

Template Parameters

SrcT	Source tensor element type
DstT	Destination tensor element type

Parameters

d_src	Source device memory pointer
d_or_h_dst	Destination memory pointer (device or mapped host)
n	Number of elements to copy
stream	CUDA stream for asynchronous execution (0 for default stream)

Here is the caller graph for this function:

◆ launch_fast_copy_kernel()

template<typename T>

void Mila::Dnn::Compute::Cuda::launch_fast_copy_kernel	(	const T *	d_src,
		T *	d_dst,
		size_t	n,
		cudaStream_t	stream )

Launch optimized same-type copy kernel.

Performs optimized copy between tensors of the same type using vectorized memory operations when possible.

Template Parameters

T	Tensor element type (same for source and destination)

Parameters

d_src	Source device memory pointer
d_dst	Destination device memory pointer
n	Number of elements to copy
stream	CUDA stream for asynchronous execution (0 for default stream)

Here is the caller graph for this function:

◆ launch_scale_shift()

void Mila::Dnn::Compute::Cuda::launch_scale_shift	(	float *	data,
		size_t	n,
		float	min_val,
		float	max_val,
		cudaStream_t	stream )

Here is the caller graph for this function:

◆ print_stats()

template<typename T>

void Mila::Dnn::Compute::Cuda::print_stats	(	const std::string &	name,
		const T *	data,
		const shape_t &	shape,
		int	max_display = 8,
		cudaStream_t	stream = nullptr )

export

Copies a device tensor to host, formats it via dump_tensor, and emits the result through Logging::Logger::info.

The 3-argument form (name, ptr, shape) is the common call site; max_display and stream have defaults so callers that own a stream can opt in for async copy.

Parameters

name	Label used as the tensor name in the dump output.
data	Non-owning device pointer to the tensor data.
shape	Logical shape passed to dump_tensor for layout interpretation.
max_display	Maximum elements per dimension to display (default 8).
stream	CUDA stream for the device-to-host copy (default nullptr = sync).

Here is the call graph for this function:

◆ setCurrentDevice()

void Mila::Dnn::Compute::Cuda::setCurrentDevice ( int device_id )

inlineexport

Sets the current CUDA device with thread-local caching.

Uses thread-local storage to avoid redundant cudaSetDevice calls. This is critical for performance when multiple memory operations occur in sequence on the same device.

Parameters

device_id CUDA device ID to activate

Exceptions

std::runtime_error If cudaSetDevice fails

Here is the caller graph for this function:

Variable Documentation

◆ cuda_data_type_v

template<TensorDataType TDataType>

cudaDataType_t Mila::Dnn::Compute::Cuda::cuda_data_type_v = CudaDataTypeTraits<TDataType>::cuda_data_type

constexprexport

Convenience alias for accessing the cudaDataType_t mapping directly.

Usage: constexpr cudaDataType_t dt = cuda_data_type_v<TensorDataType::BF16>; // dt == CUDA_R_BF16

Namespaces

Classes

Typedefs

Functions

Variables

Typedef Documentation

◆ device_type_t

Function Documentation

◆ build_linear_plan()

◆ build_plan()

◆ build_strided_plan()

◆ ceil_div()

◆ checkDevice()

◆ computeArchitectureBuckets()

◆ cublaslt_compute_types()

◆ cublaslt_cuda_data_type()

◆ cuda_split3_bf16()

◆ cuda_split3_fp32()

◆ dump_2d_rowmajor_host()

◆ dump_tensor()

◆ execute_linear_plan()

◆ execute_plan()

◆ findCudaDevice()

◆ getBestDeviceId()

◆ getBucket()

◆ getDeviceCount()

◆ getDriverVersion()

◆ getRuntimeVersion()

◆ launch_array_fill_typed()

◆ launch_constant_fill_typed()

◆ launch_convert_copy_kernel()

◆ launch_fast_copy_kernel()

◆ launch_scale_shift()

◆ print_stats()

◆ setCurrentDevice()

Variable Documentation

◆ cuda_data_type_v