Shared cuBLASLt utilities for building and executing matmul plans (RAII wrapper + builders). More...

#include <cublasLt.h>
#include <cuda_runtime.h>
#include <cstdint>
#include <stdexcept>
#include <utility>
#include <string>
#include <vector>
#include <iostream>
#include <sstream>
#include <iomanip>
import Logging.Logger;
import CublasLt.Error;
import Dnn.TensorTypes;

Classes
struct	Mila::Dnn::Compute::Cuda::CublasLtMatMulPlan< TComputePrecision >
	RAII wrapper owning cuBLASLt descriptors and the selected heuristic algorithm. More...

Namespaces
namespace	Mila
	Mila main API namespace.
namespace	Mila::Dnn
namespace	Mila::Dnn::Compute
namespace	Mila::Dnn::Compute::Cuda

Functions
template<typename TNative>
CublasLtMatMulPlan< TNative >	Mila::Dnn::Compute::Cuda::build_plan (cublasLtHandle_t handle, int outer_size, int in_features, int out_features, bool has_bias, cudaDataType_t data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type)
	Build a cuBLASLt plan for a standard (non-strided) matmul.
template<typename TComputePrecision>
CublasLtMatMulPlan< TComputePrecision >	Mila::Dnn::Compute::Cuda::build_strided_plan (cublasLtHandle_t handle, int A_rows, int A_cols, int ldA, long long strideA_elems, int B_rows, int B_cols, int ldB, long long strideB_elems, int C_rows, int C_cols, int ldC, long long strideC_elems, cublasOperation_t opA, cublasOperation_t opB, int strided_batch_count, bool has_bias=false, cublasComputeType_t compute_type=CUBLAS_COMPUTE_32F, cudaDataType_t cuda_data_type=CUDA_R_32F, cudaDataType_t scale_type=CUDA_R_32F, cublasLtOrder_t order=CUBLASLT_ORDER_ROW)
	Build a cuBLASLt matmul plan for strided-batched matmuls.
template<typename TComputePrecision>
void	Mila::Dnn::Compute::Cuda::execute_plan (cublasLtHandle_t handle, const CublasLtMatMulPlan< TComputePrecision > &plan, const void *alpha, const TComputePrecision *A, const TComputePrecision *B, const void *beta, TComputePrecision *C, const TComputePrecision *bias, cudaStream_t stream, void *workspace=nullptr, size_t workspaceSize=0)
	Execute a previously-built cuBLASLt plan.

Detailed Description

Shared cuBLASLt utilities for building and executing matmul plans (RAII wrapper + builders).

Provides templated utilities to build cuBLASLt matmul plans (including strided-batched) and execute them. Designed to be reused by CUDA Linear and Attention operations.

Classes

Namespaces

Functions

Detailed Description