Mila 0.13.48
Deep Neural Network Library
Loading...
Searching...
No Matches
Mila::Dnn::Compute::Cuda Namespace Reference

Namespaces

namespace  Detail
namespace  Gelu
namespace  Gqa
namespace  Kernels
namespace  LayerNorm
namespace  Linear
namespace  Lpe
namespace  MatMulBiasGelu
namespace  MultiHeadAttention
namespace  Residual
namespace  RmsNorm
namespace  Rope
namespace  Softmax
namespace  SoftmaxCrossEntropy
namespace  Swiglu
namespace  TokenEmbedding

Classes

struct  CublasLtLinearPlan
 RAII wrapper owning cuBLASLt descriptors for a Linear matmul. More...
struct  CublasLtMatMulPlan
 RAII wrapper owning cuBLASLt descriptors and the selected heuristic algorithm. More...
class  CublasLtPlanCache
 Generic plan cache keyed on batch size bucket. More...
struct  CudaDataTypeTraits
 Compile-time mapping from TensorDataType -> cudaDataType_t. More...
struct  CudaDataTypeTraits< TensorDataType::BF16 >
struct  CudaDataTypeTraits< TensorDataType::FP16 >
struct  CudaDataTypeTraits< TensorDataType::FP32 >
struct  CudaDataTypeTraits< TensorDataType::FP8_E4M3 >
struct  CudaDataTypeTraits< TensorDataType::FP8_E5M2 >
struct  CudaDataTypeTraits< TensorDataType::INT32 >
struct  CudaDataTypeTraits< TensorDataType::INT8 >
struct  FillOps
 CUDA specialization of TensorOps for initialization operations. More...
struct  MathOps
 CUDA specialization of TensorOps for mathematical operations. More...
struct  RandomOps
struct  StructuralOps
struct  TensorDataTypeMap
 Compile-time mapping from abstract TensorDataType -> CUDA native device type. More...
struct  TensorDataTypeMap< TensorDataType::BF16 >
 Maps TensorDataType::BF16 to CUDA __nv_bfloat16. More...
struct  TensorDataTypeMap< TensorDataType::FP16 >
 Maps TensorDataType::FP16 to CUDA __half. More...
struct  TensorDataTypeMap< TensorDataType::FP32 >
 Maps TensorDataType::FP32 to CUDA float. More...
struct  TensorDataTypeMap< TensorDataType::FP4_E2M1 >
 Maps TensorDataType::FP4_E2M1 to std::uint8_t. More...
struct  TensorDataTypeMap< TensorDataType::FP4_E3M0 >
 Maps TensorDataType::FP4_E3M0 to std::uint8_t. More...
struct  TensorDataTypeMap< TensorDataType::FP8_E4M3 >
 Maps TensorDataType::FP8_E4M3 to CUDA __nv_fp8_e4m3. More...
struct  TensorDataTypeMap< TensorDataType::FP8_E5M2 >
 Maps TensorDataType::FP8_E5M2 to CUDA __nv_fp8_e5m2. More...
struct  TensorDataTypeMap< TensorDataType::INT16 >
 Maps TensorDataType::INT16 to std::int16_t. More...
struct  TensorDataTypeMap< TensorDataType::INT32 >
 Maps TensorDataType::INT32 to std::int32_t. More...
struct  TensorDataTypeMap< TensorDataType::INT8 >
 Maps TensorDataType::INT8 to std::int8_t. More...
struct  TensorDataTypeMap< TensorDataType::UINT16 >
 Maps TensorDataType::UINT16 to std::uint16_t. More...
struct  TensorDataTypeMap< TensorDataType::UINT32 >
 Maps TensorDataType::UINT32 to std::uint32_t. More...
struct  TensorDataTypeMap< TensorDataType::UINT8 >
 Maps TensorDataType::UINT8 to std::uint8_t. More...
struct  TransferOps
 CUDA specialization of TensorOps for tensor transfer operations. More...
struct  ZeroOps

Typedefs

template<TensorDataType TDataType>
using Mila::Dnn::Compute::Cuda::device_type_t = typename TensorDataTypeMap<TDataType>::device_type
 Convenience alias for accessing CUDA native type.

Functions

template<TensorDataType TComputePrecision, TensorDataType TParameterPrecision = TComputePrecision>
CublasLtLinearPlan< TComputePrecision, TParameterPrecision > Mila::Dnn::Compute::Cuda::build_linear_plan (cublasLtHandle_t handle, int outer_size, int in_features, int out_features, bool has_bias, cublasComputeType_t compute_type, cudaDataType_t scale_type, const float *weight_scale=nullptr)
 Build a cuBLASLt plan for a Linear matmul.
template<typename TNative>
CublasLtMatMulPlan< TNative > Mila::Dnn::Compute::Cuda::build_plan (cublasLtHandle_t handle, int outer_size, int in_features, int out_features, bool has_bias, cudaDataType_t data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type)
 Build a cuBLASLt plan for a standard (non-strided) matmul.
template<typename TComputePrecision>
CublasLtMatMulPlan< TComputePrecision > Mila::Dnn::Compute::Cuda::build_strided_plan (cublasLtHandle_t handle, int A_rows, int A_cols, int ldA, long long strideA_elems, int B_rows, int B_cols, int ldB, long long strideB_elems, int C_rows, int C_cols, int ldC, long long strideC_elems, cublasOperation_t opA, cublasOperation_t opB, int strided_batch_count, bool has_bias=false, cublasComputeType_t compute_type=CUBLAS_COMPUTE_32F, cudaDataType_t cuda_data_type=CUDA_R_32F, cudaDataType_t scale_type=CUDA_R_32F, cublasLtOrder_t order=CUBLASLT_ORDER_ROW)
 Build a cuBLASLt matmul plan for strided-batched matmuls.
constexpr int Mila::Dnn::Compute::Cuda::ceil_div (int M, int N)
 Calculates ceiling division for kernel grid/block dimensions.
int Mila::Dnn::Compute::Cuda::checkDevice (int deviceId)
 Validates that a device ID is valid and available.
std::vector< int > Mila::Dnn::Compute::Cuda::computeArchitectureBuckets (int max_batch_size)
 Computes optimal bucket boundaries for cuBLASLt plan caching based on CUDA device architecture.
template<typename NativeType>
void Mila::Dnn::Compute::Cuda::cublaslt_compute_types (cublasComputeType_t &compute_type, cudaDataType_t &scale_type)
template<typename NativeType>
cudaDataType_t Mila::Dnn::Compute::Cuda::cublaslt_cuda_data_type ()
void cuda_split3_bf16 (const __nv_bfloat16 *__restrict__ src, __nv_bfloat16 *__restrict__ out0, __nv_bfloat16 *__restrict__ out1, __nv_bfloat16 *__restrict__ out2, int rows, int D0, int D1, int D2, cudaStream_t stream)
void cuda_split3_fp32 (const float *__restrict__ src, float *__restrict__ out_a, float *__restrict__ out_b, float *__restrict__ out_c, int src_rows, int dim_a, int dim_b, int dim_c, cudaStream_t stream)
 Vectorized 3-way last-dimension split, float32.
template<typename T>
void Mila::Dnn::Compute::Cuda::dump_2d_rowmajor_host (std::ostringstream &oss, const T *host_data, int rows, int cols, int max_display, int indent=0)
 Helper to dump a single 2D row-major matrix (host memory).
template<typename T = float>
std::string Mila::Dnn::Compute::Cuda::dump_tensor (const T *device_data, const shape_t &shape, const std::string &name="tensor", int max_display_size=16, cudaStream_t stream=nullptr)
 Debug utility to dump row-major tensor from device memory.
template<TensorDataType TComputePrecision, TensorDataType TParameterPrecision = TComputePrecision>
void Mila::Dnn::Compute::Cuda::execute_linear_plan (cublasLtHandle_t handle, const CublasLtLinearPlan< TComputePrecision, TParameterPrecision > &plan, const float *alpha, const void *A, const void *B, const float *beta, typename CublasLtLinearPlan< TComputePrecision, TParameterPrecision >::ActivationType *C, const typename CublasLtLinearPlan< TComputePrecision, TParameterPrecision >::ActivationType *bias, const float *weight_scale, cudaStream_t stream, void *workspace=nullptr, size_t workspace_size=0)
 Execute a previously-built CublasLtLinearPlan.
template<typename TComputePrecision>
void Mila::Dnn::Compute::Cuda::execute_plan (cublasLtHandle_t handle, const CublasLtMatMulPlan< TComputePrecision > &plan, const void *alpha, const TComputePrecision *A, const TComputePrecision *B, const void *beta, TComputePrecision *C, const TComputePrecision *bias, cudaStream_t stream, void *workspace=nullptr, size_t workspaceSize=0)
 Execute a previously-built cuBLASLt plan.
int Mila::Dnn::Compute::Cuda::findCudaDevice (int deviceId=-1, bool preferMemory=false)
 Finds the most appropriate CUDA device for computation.
int Mila::Dnn::Compute::Cuda::getBestDeviceId (bool preferMemory=false)
 Identifies the best CUDA device based on performance characteristics.
int Mila::Dnn::Compute::Cuda::getBucket (const std::vector< int > &buckets, int batch_size)
 Fast O(log N) bucket lookup.
int Mila::Dnn::Compute::Cuda::getDeviceCount ()
 Gets the number of available CUDA devices.
int Mila::Dnn::Compute::Cuda::getDriverVersion ()
 Gets the installed CUDA driver version.
int Mila::Dnn::Compute::Cuda::getRuntimeVersion ()
 Gets the installed CUDA runtime version.
template<typename TargetType, typename HostType>
void launch_array_fill_typed (void *dst, const HostType *host_values, size_t count, cudaStream_t stream)
template<typename TargetType, typename HostType>
void launch_constant_fill_typed (void *dst, size_t count, HostType host_value, cudaStream_t stream)
 Templated fill operations with compile-time type dispatch.
template<typename SrcT, typename DstT>
void launch_convert_copy_kernel (const SrcT *d_src, DstT *d_or_h_dst, size_t n, cudaStream_t stream)
 Launch type-converting copy kernel between tensors.
template<typename T>
void launch_fast_copy_kernel (const T *d_src, T *d_dst, size_t n, cudaStream_t stream)
 Launch optimized same-type copy kernel.
void launch_scale_shift (float *data, size_t n, float min_val, float max_val, cudaStream_t stream)
template<typename T>
void Mila::Dnn::Compute::Cuda::print_stats (const std::string &name, const T *data, const shape_t &shape, int max_display=8, cudaStream_t stream=nullptr)
 Copies a device tensor to host, formats it via dump_tensor, and emits the result through Logging::Logger::info.
void Mila::Dnn::Compute::Cuda::setCurrentDevice (int device_id)
 Sets the current CUDA device with thread-local caching.

Variables

template<TensorDataType TDataType>
constexpr cudaDataType_t Mila::Dnn::Compute::Cuda::cuda_data_type_v = CudaDataTypeTraits<TDataType>::cuda_data_type
 Convenience alias for accessing the cudaDataType_t mapping directly.

Typedef Documentation

◆ device_type_t

template<TensorDataType TDataType>
using Mila::Dnn::Compute::Cuda::device_type_t = typename TensorDataTypeMap<TDataType>::device_type
export

Convenience alias for accessing CUDA native type.

Template Parameters
TDataTypeAbstract tensor data type

Function Documentation

◆ build_linear_plan()

template<TensorDataType TComputePrecision, TensorDataType TParameterPrecision = TComputePrecision>
CublasLtLinearPlan< TComputePrecision, TParameterPrecision > Mila::Dnn::Compute::Cuda::build_linear_plan ( cublasLtHandle_t handle,
int outer_size,
int in_features,
int out_features,
bool has_bias,
cublasComputeType_t compute_type,
cudaDataType_t scale_type,
const float * weight_scale = nullptr )
export

Build a cuBLASLt plan for a Linear matmul.

Layout is selected at compile time based on kIsQuantized:

Non-quantized (TComputePrecision == TParameterPrecision): NT row-major — A = activations, B = weights, opA=N, opB=T C[outer_size, out_features] = A[outer_size, in_features] × B^T[in_features, out_features]

Quantized (Ada SM 8.9+, TParameterPrecision = FP8_E4M3): TN column-major — A = weights (FP8), B = activations (BF16), opA=T, opB=N Exploits the row-major / column-major duality: row-major W[N, K] ≡ col-major W^T[K, N] (same bytes, lda = K) row-major X[M, K] ≡ col-major X^T[K, M] (same bytes, ldb = K) op(A) = (W^T)^T = W[N, K], op(B) = X^T[K, M] D = W × X^T = Y^T[N, M] col-major ≡ row-major Y[M, N] (ldc = N) A_SCALE_POINTER = per-tensor weight scale (weight_scales_[0]).

Parameters
outer_sizeToken count (M = B * T for transformers).
in_featuresInner dimension K.
out_featuresOutput channels N.
has_biasActivates CUBLASLT_EPILOGUE_BIAS (non-quantized path only).
compute_typeSupplied by CudaLinearOp::getComputeTypes().
scale_typeAlways CUDA_R_32F.
weight_scaleQuantized path only: device pointer to the per-tensor weight scale (weight_scales_[0]). Must be non-null when kIsQuantized because CUBLASLT_MATMUL_DESC_A_SCALE_POINTER must be set in the descriptor before cublasLtMatmulAlgoGetHeuristic for FP8 algorithms to be returned. Ignored on the non-quantized path (default nullptr).
Here is the call graph for this function:
Here is the caller graph for this function:

◆ build_plan()

template<typename TNative>
CublasLtMatMulPlan< TNative > Mila::Dnn::Compute::Cuda::build_plan ( cublasLtHandle_t handle,
int outer_size,
int in_features,
int out_features,
bool has_bias,
cudaDataType_t data_type,
cublasComputeType_t compute_type,
cudaDataType_t scale_type )
export

Build a cuBLASLt plan for a standard (non-strided) matmul.

Computes C[outer_size, out_features] = A[outer_size, in_features] @ B^T[in_features, out_features] Row-major layout, opA=N, opB=T, strided_batch_count=1.

Parameters
outer_sizeBucket-aligned row count for A and C (C=B*T for transformers).
Here is the call graph for this function:
Here is the caller graph for this function:

◆ build_strided_plan()

template<typename TComputePrecision>
CublasLtMatMulPlan< TComputePrecision > Mila::Dnn::Compute::Cuda::build_strided_plan ( cublasLtHandle_t handle,
int A_rows,
int A_cols,
int ldA,
long long strideA_elems,
int B_rows,
int B_cols,
int ldB,
long long strideB_elems,
int C_rows,
int C_cols,
int ldC,
long long strideC_elems,
cublasOperation_t opA,
cublasOperation_t opB,
int strided_batch_count,
bool has_bias = false,
cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F,
cudaDataType_t cuda_data_type = CUDA_R_32F,
cudaDataType_t scale_type = CUDA_R_32F,
cublasLtOrder_t order = CUBLASLT_ORDER_ROW )
export

Build a cuBLASLt matmul plan for strided-batched matmuls.

Accepts explicit layout dimensions (rows/cols/ld) and per-element strides for A, B, C. The heuristic algorithm is selected at build time for the given dimensions and reused at execution time. When actual_rows is passed to execute_plan, only the row count embedded in layoutA and layoutC is updated — the selected algorithm remains valid for any row count up to the bucket it was built for.

Parameters
strided_batch_countNumber of independent matmul instances (1 for Linear, B*NH for Attention).
Note
CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET expects stride in elements, not bytes.
Here is the call graph for this function:
Here is the caller graph for this function:

◆ ceil_div()

int Mila::Dnn::Compute::Cuda::ceil_div ( int M,
int N )
constexprexport

Calculates ceiling division for kernel grid/block dimensions.

Parameters
MDividend value
NDivisor value
Returns
Ceiling of M/N as an integer

◆ checkDevice()

int Mila::Dnn::Compute::Cuda::checkDevice ( int deviceId)
export

Validates that a device ID is valid and available.

Parameters
deviceIdCUDA device ID to check
Returns
The same device ID if valid
Exceptions
std::invalid_argumentIf device ID is negative
std::runtime_errorIf no CUDA devices are available
std::out_of_rangeIf device ID exceeds available device count
std::runtime_errorIf device is in prohibited compute mode
Here is the call graph for this function:
Here is the caller graph for this function:

◆ computeArchitectureBuckets()

std::vector< int > Mila::Dnn::Compute::Cuda::computeArchitectureBuckets ( int max_batch_size)
inlineexport

Computes optimal bucket boundaries for cuBLASLt plan caching based on CUDA device architecture.

Bucket sizes are aligned to tensor core tile granularity: Pre-Volta (SM<70): grain=32 (warp-aligned, no tensor cores) Volta (SM70): grain=8 (FP16 tensor cores, 8x8x4 MMA) Turing (SM75): grain=16 Ampere+ (SM80+): grain=16 (TF32/BF16/FP16, 16x8x16 MMA) Hopper (SM90+): grain=16 (FP8 wgmma)

Here is the caller graph for this function:

◆ cublaslt_compute_types()

template<typename NativeType>
void Mila::Dnn::Compute::Cuda::cublaslt_compute_types ( cublasComputeType_t & compute_type,
cudaDataType_t & scale_type )
export

◆ cublaslt_cuda_data_type()

template<typename NativeType>
cudaDataType_t Mila::Dnn::Compute::Cuda::cublaslt_cuda_data_type ( )
export

◆ cuda_split3_bf16()

void Mila::Dnn::Compute::Cuda::cuda_split3_bf16 ( const __nv_bfloat16 *__restrict__ src,
__nv_bfloat16 *__restrict__ out0,
__nv_bfloat16 *__restrict__ out1,
__nv_bfloat16 *__restrict__ out2,
int rows,
int D0,
int D1,
int D2,
cudaStream_t stream )
Here is the caller graph for this function:

◆ cuda_split3_fp32()

void Mila::Dnn::Compute::Cuda::cuda_split3_fp32 ( const float *__restrict__ src,
float *__restrict__ out_a,
float *__restrict__ out_b,
float *__restrict__ out_c,
int src_rows,
int dim_a,
int dim_b,
int dim_c,
cudaStream_t stream )

Vectorized 3-way last-dimension split, float32.

Splits input [B, T, D0+D1+D2] into three output tensors [B, T, D0], [B, T, D1], [B, T, D2] along the last dimension.

Parameters
srcInput device buffer [B * T * (D0+D1+D2) floats].
out0Output device buffer [B * T * D0 floats].
out1Output device buffer [B * T * D1 floats].
out2Output device buffer [B * T * D2 floats].
BBatch size.
TSequence length.
D0Last-dim size of output 0. Must be a multiple of 4.
D1Last-dim size of output 1. Must be a multiple of 4.
D2Last-dim size of output 2. Must be a multiple of 4.
streamCUDA stream for kernel scheduling.
Here is the caller graph for this function:

◆ dump_2d_rowmajor_host()

template<typename T>
void Mila::Dnn::Compute::Cuda::dump_2d_rowmajor_host ( std::ostringstream & oss,
const T * host_data,
int rows,
int cols,
int max_display,
int indent = 0 )
export

Helper to dump a single 2D row-major matrix (host memory).

Indexing: element (row=r, col=c) -> host_data[r * cols + c]

Here is the caller graph for this function:

◆ dump_tensor()

template<typename T = float>
std::string Mila::Dnn::Compute::Cuda::dump_tensor ( const T * device_data,
const shape_t & shape,
const std::string & name = "tensor",
int max_display_size = 16,
cudaStream_t stream = nullptr )
export

Debug utility to dump row-major tensor from device memory.

This utility copies data from device to host, then properly interprets the row-major layout for display.

Template Parameters
TData type (float, __half, etc.)
Parameters
device_dataDevice pointer to row-major data
shapeShape vector (shape_t)
nameTensor name for display
max_display_sizeMaximum elements to display per dimension
streamCUDA stream for async copy (nullptr for default stream)
Returns
String representation of the tensor
Here is the call graph for this function:
Here is the caller graph for this function:

◆ execute_linear_plan()

template<TensorDataType TComputePrecision, TensorDataType TParameterPrecision = TComputePrecision>
void Mila::Dnn::Compute::Cuda::execute_linear_plan ( cublasLtHandle_t handle,
const CublasLtLinearPlan< TComputePrecision, TParameterPrecision > & plan,
const float * alpha,
const void * A,
const void * B,
const float * beta,
typename CublasLtLinearPlan< TComputePrecision, TParameterPrecision >::ActivationType * C,
const typename CublasLtLinearPlan< TComputePrecision, TParameterPrecision >::ActivationType * bias,
const float * weight_scale,
cudaStream_t stream,
void * workspace = nullptr,
size_t workspace_size = 0 )
export

Execute a previously-built CublasLtLinearPlan.

Computes: D = alpha * op(A) * op(B) + beta * C with optional bias epilogue (non-quantized path) and optional weight scale (FP8 path).

Pointer semantics match the layout built by build_linear_plan: Non-quantized (NT row-major): A = activations, B = weights Quantized (TN col-major): A = weights (FP8), B = activations (BF16)

Both A and B are passed as const void* to accommodate the pointer-swap between the two layouts without template type conflicts. Type safety is enforced at the plan-build level via the layout descriptors.

Parameters
ADevice pointer to matrix A (see layout note above).
BDevice pointer to matrix B (see layout note above).
CDevice pointer to output (TComputePrecision).
biasDevice pointer to bias vector; used only when has_bias_epilogue.
weight_scaleDevice pointer to per-tensor weight scale (float); applied as A_SCALE_POINTER when has_weight_scale is true. For the TN path weight_scales_[0] is the global per-tensor scale.
workspaceOptional device scratch buffer.
workspace_sizeSize of workspace in bytes.
Here is the call graph for this function:
Here is the caller graph for this function:

◆ execute_plan()

template<typename TComputePrecision>
void Mila::Dnn::Compute::Cuda::execute_plan ( cublasLtHandle_t handle,
const CublasLtMatMulPlan< TComputePrecision > & plan,
const void * alpha,
const TComputePrecision * A,
const TComputePrecision * B,
const void * beta,
TComputePrecision * C,
const TComputePrecision * bias,
cudaStream_t stream,
void * workspace = nullptr,
size_t workspaceSize = 0 )
export

Execute a previously-built cuBLASLt plan.

Computes: C = alpha * op(A) * op(B) + beta * C, with optional bias epilogue.

Parameters
biasDevice pointer to bias vector. Applied only when plan.has_bias_epilogue is true.
workspaceOptional device workspace buffer. Must be non-null when the selected algorithm requires scratch memory (workspaceSize > 0 in heuristic result).
workspaceSizeSize of the workspace buffer in bytes.
Here is the call graph for this function:
Here is the caller graph for this function:

◆ findCudaDevice()

int Mila::Dnn::Compute::Cuda::findCudaDevice ( int deviceId = -1,
bool preferMemory = false )
inlineexport

Finds the most appropriate CUDA device for computation.

Either validates a specific device ID if provided or finds the best available device when no preference is specified.

Parameters
deviceIdPreferred device ID, or -1 to select the best device
Returns
Valid CUDA device ID
Exceptions
std::runtime_errorIf no CUDA devices are found
Here is the call graph for this function:

◆ getBestDeviceId()

int Mila::Dnn::Compute::Cuda::getBestDeviceId ( bool preferMemory = false)
inlineexport

Identifies the best CUDA device based on performance characteristics.

Evaluates available CUDA devices and selects the one with highest performance potential. Selection criteria vary based on the intended workload type.

Parameters
preferMemoryWhen true, prioritizes memory bandwidth over compute
Returns
Device ID of the best available CUDA device
Exceptions
CudaErrorIf device properties cannot be accessed
Here is the call graph for this function:
Here is the caller graph for this function:

◆ getBucket()

int Mila::Dnn::Compute::Cuda::getBucket ( const std::vector< int > & buckets,
int batch_size )
inlineexport

Fast O(log N) bucket lookup.

Returns smallest bucket >= batch_size.

Here is the caller graph for this function:

◆ getDeviceCount()

int Mila::Dnn::Compute::Cuda::getDeviceCount ( )
inlineexport

Gets the number of available CUDA devices.

Returns
Number of CUDA devices available to the application
Exceptions
CudaErrorIf device enumeration fails
Here is the call graph for this function:
Here is the caller graph for this function:

◆ getDriverVersion()

int Mila::Dnn::Compute::Cuda::getDriverVersion ( )
export

Gets the installed CUDA driver version.

Returns
Integer representation of the CUDA driver version
Exceptions
CudaErrorIf driver version cannot be determined
Here is the call graph for this function:

◆ getRuntimeVersion()

int Mila::Dnn::Compute::Cuda::getRuntimeVersion ( )
export

Gets the installed CUDA runtime version.

Returns
Integer representation of the CUDA runtime version
Exceptions
CudaErrorIf runtime version cannot be determined
Here is the call graph for this function:

◆ launch_array_fill_typed()

template<typename TargetType, typename HostType>
void Mila::Dnn::Compute::Cuda::launch_array_fill_typed ( void * dst,
const HostType * host_values,
size_t count,
cudaStream_t stream )
Here is the caller graph for this function:

◆ launch_constant_fill_typed()

template<typename TargetType, typename HostType>
void Mila::Dnn::Compute::Cuda::launch_constant_fill_typed ( void * dst,
size_t count,
HostType host_value,
cudaStream_t stream )

Templated fill operations with compile-time type dispatch.

Here is the caller graph for this function:

◆ launch_convert_copy_kernel()

template<typename SrcT, typename DstT>
void Mila::Dnn::Compute::Cuda::launch_convert_copy_kernel ( const SrcT * d_src,
DstT * d_or_h_dst,
size_t n,
cudaStream_t stream )

Launch type-converting copy kernel between tensors.

Copies data between tensors with automatic type conversion using specialized device conversion functions for optimal performance.

Template Parameters
SrcTSource tensor element type
DstTDestination tensor element type
Parameters
d_srcSource device memory pointer
d_or_h_dstDestination memory pointer (device or mapped host)
nNumber of elements to copy
streamCUDA stream for asynchronous execution (0 for default stream)
Here is the caller graph for this function:

◆ launch_fast_copy_kernel()

template<typename T>
void Mila::Dnn::Compute::Cuda::launch_fast_copy_kernel ( const T * d_src,
T * d_dst,
size_t n,
cudaStream_t stream )

Launch optimized same-type copy kernel.

Performs optimized copy between tensors of the same type using vectorized memory operations when possible.

Template Parameters
TTensor element type (same for source and destination)
Parameters
d_srcSource device memory pointer
d_dstDestination device memory pointer
nNumber of elements to copy
streamCUDA stream for asynchronous execution (0 for default stream)
Here is the caller graph for this function:

◆ launch_scale_shift()

void Mila::Dnn::Compute::Cuda::launch_scale_shift ( float * data,
size_t n,
float min_val,
float max_val,
cudaStream_t stream )
Here is the caller graph for this function:

◆ print_stats()

template<typename T>
void Mila::Dnn::Compute::Cuda::print_stats ( const std::string & name,
const T * data,
const shape_t & shape,
int max_display = 8,
cudaStream_t stream = nullptr )
export

Copies a device tensor to host, formats it via dump_tensor, and emits the result through Logging::Logger::info.

The 3-argument form (name, ptr, shape) is the common call site; max_display and stream have defaults so callers that own a stream can opt in for async copy.

Parameters
nameLabel used as the tensor name in the dump output.
dataNon-owning device pointer to the tensor data.
shapeLogical shape passed to dump_tensor for layout interpretation.
max_displayMaximum elements per dimension to display (default 8).
streamCUDA stream for the device-to-host copy (default nullptr = sync).
Here is the call graph for this function:

◆ setCurrentDevice()

void Mila::Dnn::Compute::Cuda::setCurrentDevice ( int device_id)
inlineexport

Sets the current CUDA device with thread-local caching.

Uses thread-local storage to avoid redundant cudaSetDevice calls. This is critical for performance when multiple memory operations occur in sequence on the same device.

Parameters
device_idCUDA device ID to activate
Exceptions
std::runtime_errorIf cudaSetDevice fails
Here is the caller graph for this function:

Variable Documentation

◆ cuda_data_type_v

template<TensorDataType TDataType>
cudaDataType_t Mila::Dnn::Compute::Cuda::cuda_data_type_v = CudaDataTypeTraits<TDataType>::cuda_data_type
constexprexport

Convenience alias for accessing the cudaDataType_t mapping directly.

Usage: constexpr cudaDataType_t dt = cuda_data_type_v<TensorDataType::BF16>; // dt == CUDA_R_BF16