|
Mila 0.13.48
Deep Neural Network Library
|
Shared cuBLASLt utilities for building and executing matmul plans (RAII + builders). More...
#include <cublasLt.h>
#include <cuda_runtime.h>
#include <cstdint>
#include <stdexcept>
#include <utility>
#include <string>
#include <vector>
#include <iostream>
#include <sstream>
#include <iomanip>
import Logging.Logger;
import CublasLt.Error;
import Dnn.TensorTypes;
Classes | |
| struct | Mila::Dnn::Compute::Cuda::CublasLtMatMulPlan< TComputePrecision > |
| RAII wrapper owning cuBLASLt descriptors and the selected heuristic algorithm. More... | |
Namespaces | |
| namespace | Mila |
| Mila main API namespace. | |
| namespace | Mila::Dnn |
| namespace | Mila::Dnn::Compute |
| namespace | Mila::Dnn::Compute::Cuda |
Functions | |
| template<typename TNative> | |
| CublasLtMatMulPlan< TNative > | Mila::Dnn::Compute::Cuda::build_plan (cublasLtHandle_t handle, int outer_size, int in_features, int out_features, bool has_bias, cudaDataType_t data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type) |
| Build a cuBLASLt plan for a standard (non-strided) matmul. | |
| template<typename TComputePrecision> | |
| CublasLtMatMulPlan< TComputePrecision > | Mila::Dnn::Compute::Cuda::build_strided_plan (cublasLtHandle_t handle, int A_rows, int A_cols, int ldA, long long strideA_elems, int B_rows, int B_cols, int ldB, long long strideB_elems, int C_rows, int C_cols, int ldC, long long strideC_elems, cublasOperation_t opA, cublasOperation_t opB, int strided_batch_count, bool has_bias=false, cublasComputeType_t compute_type=CUBLAS_COMPUTE_32F, cudaDataType_t cuda_data_type=CUDA_R_32F, cudaDataType_t scale_type=CUDA_R_32F, cublasLtOrder_t order=CUBLASLT_ORDER_ROW) |
| Build a cuBLASLt matmul plan for strided-batched matmuls. | |
| template<typename TComputePrecision> | |
| void | Mila::Dnn::Compute::Cuda::execute_plan (cublasLtHandle_t handle, const CublasLtMatMulPlan< TComputePrecision > &plan, const void *alpha, const TComputePrecision *A, const TComputePrecision *B, const void *beta, TComputePrecision *C, const TComputePrecision *bias, cudaStream_t stream, void *workspace=nullptr, size_t workspaceSize=0) |
| Execute a previously-built cuBLASLt plan. | |
Shared cuBLASLt utilities for building and executing matmul plans (RAII + builders).
Provides templated utilities to build cuBLASLt matmul plans (including strided-batched) and execute them. Designed to be reused by CUDA Linear and Attention operations.