|
Mila 0.13.48
Deep Neural Network Library
|
Namespaces | |
| namespace | Detail |
| namespace | Gelu |
| namespace | Gqa |
| namespace | Kernels |
| namespace | LayerNorm |
| namespace | Linear |
| namespace | Lpe |
| namespace | MatMulBiasGelu |
| namespace | MultiHeadAttention |
| namespace | Residual |
| namespace | RmsNorm |
| namespace | Rope |
| namespace | Softmax |
| namespace | SoftmaxCrossEntropy |
| namespace | Swiglu |
| namespace | TokenEmbedding |
Typedefs | |
| template<TensorDataType TDataType> | |
| using | Mila::Dnn::Compute::Cuda::device_type_t = typename TensorDataTypeMap<TDataType>::device_type |
| Convenience alias for accessing CUDA native type. | |
Functions | |
| template<TensorDataType TComputePrecision, TensorDataType TParameterPrecision = TComputePrecision> | |
| CublasLtLinearPlan< TComputePrecision, TParameterPrecision > | Mila::Dnn::Compute::Cuda::build_linear_plan (cublasLtHandle_t handle, int outer_size, int in_features, int out_features, bool has_bias, cublasComputeType_t compute_type, cudaDataType_t scale_type, const float *weight_scale=nullptr) |
| Build a cuBLASLt plan for a Linear matmul. | |
| template<typename TNative> | |
| CublasLtMatMulPlan< TNative > | Mila::Dnn::Compute::Cuda::build_plan (cublasLtHandle_t handle, int outer_size, int in_features, int out_features, bool has_bias, cudaDataType_t data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type) |
| Build a cuBLASLt plan for a standard (non-strided) matmul. | |
| template<typename TComputePrecision> | |
| CublasLtMatMulPlan< TComputePrecision > | Mila::Dnn::Compute::Cuda::build_strided_plan (cublasLtHandle_t handle, int A_rows, int A_cols, int ldA, long long strideA_elems, int B_rows, int B_cols, int ldB, long long strideB_elems, int C_rows, int C_cols, int ldC, long long strideC_elems, cublasOperation_t opA, cublasOperation_t opB, int strided_batch_count, bool has_bias=false, cublasComputeType_t compute_type=CUBLAS_COMPUTE_32F, cudaDataType_t cuda_data_type=CUDA_R_32F, cudaDataType_t scale_type=CUDA_R_32F, cublasLtOrder_t order=CUBLASLT_ORDER_ROW) |
| Build a cuBLASLt matmul plan for strided-batched matmuls. | |
| constexpr int | Mila::Dnn::Compute::Cuda::ceil_div (int M, int N) |
| Calculates ceiling division for kernel grid/block dimensions. | |
| int | Mila::Dnn::Compute::Cuda::checkDevice (int deviceId) |
| Validates that a device ID is valid and available. | |
| std::vector< int > | Mila::Dnn::Compute::Cuda::computeArchitectureBuckets (int max_batch_size) |
| Computes optimal bucket boundaries for cuBLASLt plan caching based on CUDA device architecture. | |
| template<typename NativeType> | |
| void | Mila::Dnn::Compute::Cuda::cublaslt_compute_types (cublasComputeType_t &compute_type, cudaDataType_t &scale_type) |
| template<typename NativeType> | |
| cudaDataType_t | Mila::Dnn::Compute::Cuda::cublaslt_cuda_data_type () |
| void | cuda_split3_bf16 (const __nv_bfloat16 *__restrict__ src, __nv_bfloat16 *__restrict__ out0, __nv_bfloat16 *__restrict__ out1, __nv_bfloat16 *__restrict__ out2, int rows, int D0, int D1, int D2, cudaStream_t stream) |
| void | cuda_split3_fp32 (const float *__restrict__ src, float *__restrict__ out_a, float *__restrict__ out_b, float *__restrict__ out_c, int src_rows, int dim_a, int dim_b, int dim_c, cudaStream_t stream) |
| Vectorized 3-way last-dimension split, float32. | |
| template<typename T> | |
| void | Mila::Dnn::Compute::Cuda::dump_2d_rowmajor_host (std::ostringstream &oss, const T *host_data, int rows, int cols, int max_display, int indent=0) |
| Helper to dump a single 2D row-major matrix (host memory). | |
| template<typename T = float> | |
| std::string | Mila::Dnn::Compute::Cuda::dump_tensor (const T *device_data, const shape_t &shape, const std::string &name="tensor", int max_display_size=16, cudaStream_t stream=nullptr) |
| Debug utility to dump row-major tensor from device memory. | |
| template<TensorDataType TComputePrecision, TensorDataType TParameterPrecision = TComputePrecision> | |
| void | Mila::Dnn::Compute::Cuda::execute_linear_plan (cublasLtHandle_t handle, const CublasLtLinearPlan< TComputePrecision, TParameterPrecision > &plan, const float *alpha, const void *A, const void *B, const float *beta, typename CublasLtLinearPlan< TComputePrecision, TParameterPrecision >::ActivationType *C, const typename CublasLtLinearPlan< TComputePrecision, TParameterPrecision >::ActivationType *bias, const float *weight_scale, cudaStream_t stream, void *workspace=nullptr, size_t workspace_size=0) |
| Execute a previously-built CublasLtLinearPlan. | |
| template<typename TComputePrecision> | |
| void | Mila::Dnn::Compute::Cuda::execute_plan (cublasLtHandle_t handle, const CublasLtMatMulPlan< TComputePrecision > &plan, const void *alpha, const TComputePrecision *A, const TComputePrecision *B, const void *beta, TComputePrecision *C, const TComputePrecision *bias, cudaStream_t stream, void *workspace=nullptr, size_t workspaceSize=0) |
| Execute a previously-built cuBLASLt plan. | |
| int | Mila::Dnn::Compute::Cuda::findCudaDevice (int deviceId=-1, bool preferMemory=false) |
| Finds the most appropriate CUDA device for computation. | |
| int | Mila::Dnn::Compute::Cuda::getBestDeviceId (bool preferMemory=false) |
| Identifies the best CUDA device based on performance characteristics. | |
| int | Mila::Dnn::Compute::Cuda::getBucket (const std::vector< int > &buckets, int batch_size) |
| Fast O(log N) bucket lookup. | |
| int | Mila::Dnn::Compute::Cuda::getDeviceCount () |
| Gets the number of available CUDA devices. | |
| int | Mila::Dnn::Compute::Cuda::getDriverVersion () |
| Gets the installed CUDA driver version. | |
| int | Mila::Dnn::Compute::Cuda::getRuntimeVersion () |
| Gets the installed CUDA runtime version. | |
| template<typename TargetType, typename HostType> | |
| void | launch_array_fill_typed (void *dst, const HostType *host_values, size_t count, cudaStream_t stream) |
| template<typename TargetType, typename HostType> | |
| void | launch_constant_fill_typed (void *dst, size_t count, HostType host_value, cudaStream_t stream) |
| Templated fill operations with compile-time type dispatch. | |
| template<typename SrcT, typename DstT> | |
| void | launch_convert_copy_kernel (const SrcT *d_src, DstT *d_or_h_dst, size_t n, cudaStream_t stream) |
| Launch type-converting copy kernel between tensors. | |
| template<typename T> | |
| void | launch_fast_copy_kernel (const T *d_src, T *d_dst, size_t n, cudaStream_t stream) |
| Launch optimized same-type copy kernel. | |
| void | launch_scale_shift (float *data, size_t n, float min_val, float max_val, cudaStream_t stream) |
| template<typename T> | |
| void | Mila::Dnn::Compute::Cuda::print_stats (const std::string &name, const T *data, const shape_t &shape, int max_display=8, cudaStream_t stream=nullptr) |
| Copies a device tensor to host, formats it via dump_tensor, and emits the result through Logging::Logger::info. | |
| void | Mila::Dnn::Compute::Cuda::setCurrentDevice (int device_id) |
| Sets the current CUDA device with thread-local caching. | |
Variables | |
| template<TensorDataType TDataType> | |
| constexpr cudaDataType_t | Mila::Dnn::Compute::Cuda::cuda_data_type_v = CudaDataTypeTraits<TDataType>::cuda_data_type |
| Convenience alias for accessing the cudaDataType_t mapping directly. | |
|
export |
Convenience alias for accessing CUDA native type.
| TDataType | Abstract tensor data type |
|
export |
Build a cuBLASLt plan for a Linear matmul.
Layout is selected at compile time based on kIsQuantized:
Non-quantized (TComputePrecision == TParameterPrecision): NT row-major — A = activations, B = weights, opA=N, opB=T C[outer_size, out_features] = A[outer_size, in_features] × B^T[in_features, out_features]
Quantized (Ada SM 8.9+, TParameterPrecision = FP8_E4M3): TN column-major — A = weights (FP8), B = activations (BF16), opA=T, opB=N Exploits the row-major / column-major duality: row-major W[N, K] ≡ col-major W^T[K, N] (same bytes, lda = K) row-major X[M, K] ≡ col-major X^T[K, M] (same bytes, ldb = K) op(A) = (W^T)^T = W[N, K], op(B) = X^T[K, M] D = W × X^T = Y^T[N, M] col-major ≡ row-major Y[M, N] (ldc = N) A_SCALE_POINTER = per-tensor weight scale (weight_scales_[0]).
| outer_size | Token count (M = B * T for transformers). |
| in_features | Inner dimension K. |
| out_features | Output channels N. |
| has_bias | Activates CUBLASLT_EPILOGUE_BIAS (non-quantized path only). |
| compute_type | Supplied by CudaLinearOp::getComputeTypes(). |
| scale_type | Always CUDA_R_32F. |
| weight_scale | Quantized path only: device pointer to the per-tensor weight scale (weight_scales_[0]). Must be non-null when kIsQuantized because CUBLASLT_MATMUL_DESC_A_SCALE_POINTER must be set in the descriptor before cublasLtMatmulAlgoGetHeuristic for FP8 algorithms to be returned. Ignored on the non-quantized path (default nullptr). |


|
export |
Build a cuBLASLt plan for a standard (non-strided) matmul.
Computes C[outer_size, out_features] = A[outer_size, in_features] @ B^T[in_features, out_features] Row-major layout, opA=N, opB=T, strided_batch_count=1.
| outer_size | Bucket-aligned row count for A and C (C=B*T for transformers). |


|
export |
Build a cuBLASLt matmul plan for strided-batched matmuls.
Accepts explicit layout dimensions (rows/cols/ld) and per-element strides for A, B, C. The heuristic algorithm is selected at build time for the given dimensions and reused at execution time. When actual_rows is passed to execute_plan, only the row count embedded in layoutA and layoutC is updated — the selected algorithm remains valid for any row count up to the bucket it was built for.
| strided_batch_count | Number of independent matmul instances (1 for Linear, B*NH for Attention). |


|
constexprexport |
Calculates ceiling division for kernel grid/block dimensions.
| M | Dividend value |
| N | Divisor value |
|
export |
Validates that a device ID is valid and available.
| deviceId | CUDA device ID to check |
| std::invalid_argument | If device ID is negative |
| std::runtime_error | If no CUDA devices are available |
| std::out_of_range | If device ID exceeds available device count |
| std::runtime_error | If device is in prohibited compute mode |


|
inlineexport |
Computes optimal bucket boundaries for cuBLASLt plan caching based on CUDA device architecture.
Bucket sizes are aligned to tensor core tile granularity: Pre-Volta (SM<70): grain=32 (warp-aligned, no tensor cores) Volta (SM70): grain=8 (FP16 tensor cores, 8x8x4 MMA) Turing (SM75): grain=16 Ampere+ (SM80+): grain=16 (TF32/BF16/FP16, 16x8x16 MMA) Hopper (SM90+): grain=16 (FP8 wgmma)

|
export |
|
export |
| void Mila::Dnn::Compute::Cuda::cuda_split3_bf16 | ( | const __nv_bfloat16 *__restrict__ | src, |
| __nv_bfloat16 *__restrict__ | out0, | ||
| __nv_bfloat16 *__restrict__ | out1, | ||
| __nv_bfloat16 *__restrict__ | out2, | ||
| int | rows, | ||
| int | D0, | ||
| int | D1, | ||
| int | D2, | ||
| cudaStream_t | stream ) |

| void Mila::Dnn::Compute::Cuda::cuda_split3_fp32 | ( | const float *__restrict__ | src, |
| float *__restrict__ | out_a, | ||
| float *__restrict__ | out_b, | ||
| float *__restrict__ | out_c, | ||
| int | src_rows, | ||
| int | dim_a, | ||
| int | dim_b, | ||
| int | dim_c, | ||
| cudaStream_t | stream ) |
Vectorized 3-way last-dimension split, float32.
Splits input [B, T, D0+D1+D2] into three output tensors [B, T, D0], [B, T, D1], [B, T, D2] along the last dimension.
| src | Input device buffer [B * T * (D0+D1+D2) floats]. |
| out0 | Output device buffer [B * T * D0 floats]. |
| out1 | Output device buffer [B * T * D1 floats]. |
| out2 | Output device buffer [B * T * D2 floats]. |
| B | Batch size. |
| T | Sequence length. |
| D0 | Last-dim size of output 0. Must be a multiple of 4. |
| D1 | Last-dim size of output 1. Must be a multiple of 4. |
| D2 | Last-dim size of output 2. Must be a multiple of 4. |
| stream | CUDA stream for kernel scheduling. |

|
export |
Helper to dump a single 2D row-major matrix (host memory).
Indexing: element (row=r, col=c) -> host_data[r * cols + c]

|
export |
Debug utility to dump row-major tensor from device memory.
This utility copies data from device to host, then properly interprets the row-major layout for display.
| T | Data type (float, __half, etc.) |
| device_data | Device pointer to row-major data |
| shape | Shape vector (shape_t) |
| name | Tensor name for display |
| max_display_size | Maximum elements to display per dimension |
| stream | CUDA stream for async copy (nullptr for default stream) |


|
export |
Execute a previously-built CublasLtLinearPlan.
Computes: D = alpha * op(A) * op(B) + beta * C with optional bias epilogue (non-quantized path) and optional weight scale (FP8 path).
Pointer semantics match the layout built by build_linear_plan: Non-quantized (NT row-major): A = activations, B = weights Quantized (TN col-major): A = weights (FP8), B = activations (BF16)
Both A and B are passed as const void* to accommodate the pointer-swap between the two layouts without template type conflicts. Type safety is enforced at the plan-build level via the layout descriptors.
| A | Device pointer to matrix A (see layout note above). |
| B | Device pointer to matrix B (see layout note above). |
| C | Device pointer to output (TComputePrecision). |
| bias | Device pointer to bias vector; used only when has_bias_epilogue. |
| weight_scale | Device pointer to per-tensor weight scale (float); applied as A_SCALE_POINTER when has_weight_scale is true. For the TN path weight_scales_[0] is the global per-tensor scale. |
| workspace | Optional device scratch buffer. |
| workspace_size | Size of workspace in bytes. |


|
export |
Execute a previously-built cuBLASLt plan.
Computes: C = alpha * op(A) * op(B) + beta * C, with optional bias epilogue.
| bias | Device pointer to bias vector. Applied only when plan.has_bias_epilogue is true. |
| workspace | Optional device workspace buffer. Must be non-null when the selected algorithm requires scratch memory (workspaceSize > 0 in heuristic result). |
| workspaceSize | Size of the workspace buffer in bytes. |


|
inlineexport |
Finds the most appropriate CUDA device for computation.
Either validates a specific device ID if provided or finds the best available device when no preference is specified.
| deviceId | Preferred device ID, or -1 to select the best device |
| std::runtime_error | If no CUDA devices are found |

|
inlineexport |
Identifies the best CUDA device based on performance characteristics.
Evaluates available CUDA devices and selects the one with highest performance potential. Selection criteria vary based on the intended workload type.
| preferMemory | When true, prioritizes memory bandwidth over compute |
| CudaError | If device properties cannot be accessed |


|
inlineexport |
Fast O(log N) bucket lookup.
Returns smallest bucket >= batch_size.

|
inlineexport |
Gets the number of available CUDA devices.
| CudaError | If device enumeration fails |


|
export |
Gets the installed CUDA driver version.
| CudaError | If driver version cannot be determined |

|
export |
Gets the installed CUDA runtime version.
| CudaError | If runtime version cannot be determined |

| void Mila::Dnn::Compute::Cuda::launch_array_fill_typed | ( | void * | dst, |
| const HostType * | host_values, | ||
| size_t | count, | ||
| cudaStream_t | stream ) |

| void Mila::Dnn::Compute::Cuda::launch_constant_fill_typed | ( | void * | dst, |
| size_t | count, | ||
| HostType | host_value, | ||
| cudaStream_t | stream ) |
Templated fill operations with compile-time type dispatch.

| void Mila::Dnn::Compute::Cuda::launch_convert_copy_kernel | ( | const SrcT * | d_src, |
| DstT * | d_or_h_dst, | ||
| size_t | n, | ||
| cudaStream_t | stream ) |
Launch type-converting copy kernel between tensors.
Copies data between tensors with automatic type conversion using specialized device conversion functions for optimal performance.
| SrcT | Source tensor element type |
| DstT | Destination tensor element type |
| d_src | Source device memory pointer |
| d_or_h_dst | Destination memory pointer (device or mapped host) |
| n | Number of elements to copy |
| stream | CUDA stream for asynchronous execution (0 for default stream) |

| void Mila::Dnn::Compute::Cuda::launch_fast_copy_kernel | ( | const T * | d_src, |
| T * | d_dst, | ||
| size_t | n, | ||
| cudaStream_t | stream ) |
Launch optimized same-type copy kernel.
Performs optimized copy between tensors of the same type using vectorized memory operations when possible.
| T | Tensor element type (same for source and destination) |
| d_src | Source device memory pointer |
| d_dst | Destination device memory pointer |
| n | Number of elements to copy |
| stream | CUDA stream for asynchronous execution (0 for default stream) |

| void Mila::Dnn::Compute::Cuda::launch_scale_shift | ( | float * | data, |
| size_t | n, | ||
| float | min_val, | ||
| float | max_val, | ||
| cudaStream_t | stream ) |

|
export |
Copies a device tensor to host, formats it via dump_tensor, and emits the result through Logging::Logger::info.
The 3-argument form (name, ptr, shape) is the common call site; max_display and stream have defaults so callers that own a stream can opt in for async copy.
| name | Label used as the tensor name in the dump output. |
| data | Non-owning device pointer to the tensor data. |
| shape | Logical shape passed to dump_tensor for layout interpretation. |
| max_display | Maximum elements per dimension to display (default 8). |
| stream | CUDA stream for the device-to-host copy (default nullptr = sync). |

|
inlineexport |
Sets the current CUDA device with thread-local caching.
Uses thread-local storage to avoid redundant cudaSetDevice calls. This is critical for performance when multiple memory operations occur in sequence on the same device.
| device_id | CUDA device ID to activate |
| std::runtime_error | If cudaSetDevice fails |

|
constexprexport |
Convenience alias for accessing the cudaDataType_t mapping directly.
Usage: constexpr cudaDataType_t dt = cuda_data_type_v<TensorDataType::BF16>; // dt == CUDA_R_BF16