Classes
struct	cuda_mha_kernels
	CUDA kernel dispatcher for attention non-matmul operations. More...
struct	cuda_mha_kernels< float >
struct	cuda_mha_kernels< half >

Typedefs
template<typename TNative>
using	CublasLtMatMulPlan = CublasLtMatMulPlan<TNative>
	cuBLASLt matmul execution plan for attention operations.

Functions
template<typename TNative>
CublasLtMatMulPlan< TNative >	build_att_value_decode_plan (cublasLtHandle_t handle, int batch_size, int num_heads, int max_seq_length, int head_size, cudaDataType_t cuda_data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type)
template<typename TNative>
CublasLtMatMulPlan< TNative >	build_att_value_plan (cublasLtHandle_t handle, int batch_size, int num_heads, int seq_length, int head_size, cudaDataType_t cuda_data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type)
template<typename TNative>
CublasLtMatMulPlan< TNative >	build_backward_att_plan (cublasLtHandle_t handle, int batch_size, int num_heads, int seq_length, int head_size, cudaDataType_t cuda_data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type)
template<typename TNative>
CublasLtMatMulPlan< TNative >	build_backward_k_plan (cublasLtHandle_t handle, int batch_size, int num_heads, int seq_length, int head_size, cudaDataType_t cuda_data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type)
template<typename TNative>
CublasLtMatMulPlan< TNative >	build_backward_q_plan (cublasLtHandle_t handle, int batch_size, int num_heads, int seq_length, int head_size, cudaDataType_t cuda_data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type)
template<typename TNative>
CublasLtMatMulPlan< TNative >	build_backward_v_plan (cublasLtHandle_t handle, int batch_size, int num_heads, int seq_length, int head_size, cudaDataType_t cuda_data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type)
template<typename TNative>
CublasLtMatMulPlan< TNative >	build_qk_decode_plan (cublasLtHandle_t handle, int batch_size, int num_heads, int max_seq_length, int head_size, cudaDataType_t cuda_data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type)
template<typename TNative>
CublasLtMatMulPlan< TNative >	build_qk_score_plan (cublasLtHandle_t handle, int batch_size, int num_heads, int seq_length, int head_size, cudaDataType_t cuda_data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type)
	Build cuBLASLt plan for Q�K^T attention score computation (row-major).

Typedef Documentation

◆ CublasLtMatMulPlan

template<typename TNative>

using Mila::Dnn::Compute::Cuda::MultiHeadAttention::Detail::CublasLtMatMulPlan = CublasLtMatMulPlan<TNative>

cuBLASLt matmul execution plan for attention operations.

Function Documentation

◆ build_att_value_decode_plan()

template<typename TNative>

CublasLtMatMulPlan< TNative > Mila::Dnn::Compute::Cuda::MultiHeadAttention::Detail::build_att_value_decode_plan	(	cublasLtHandle_t	handle,
		int	batch_size,
		int	num_heads,
		int	max_seq_length,
		int	head_size,
		cudaDataType_t	cuda_data_type,
		cublasComputeType_t	compute_type,
		cudaDataType_t	scale_type )

Here is the call graph for this function:

Here is the caller graph for this function:

◆ build_att_value_plan()

template<typename TNative>

CublasLtMatMulPlan< TNative > Mila::Dnn::Compute::Cuda::MultiHeadAttention::Detail::build_att_value_plan	(	cublasLtHandle_t	handle,
		int	batch_size,
		int	num_heads,
		int	seq_length,
		int	head_size,
		cudaDataType_t	cuda_data_type,
		cublasComputeType_t	compute_type,
		cudaDataType_t	scale_type )

Here is the call graph for this function:

Here is the caller graph for this function:

◆ build_backward_att_plan()

template<typename TNative>

CublasLtMatMulPlan< TNative > Mila::Dnn::Compute::Cuda::MultiHeadAttention::Detail::build_backward_att_plan	(	cublasLtHandle_t	handle,
		int	batch_size,
		int	num_heads,
		int	seq_length,
		int	head_size,
		cudaDataType_t	cuda_data_type,
		cublasComputeType_t	compute_type,
		cudaDataType_t	scale_type )

Here is the call graph for this function:

Here is the caller graph for this function:

◆ build_backward_k_plan()

template<typename TNative>

CublasLtMatMulPlan< TNative > Mila::Dnn::Compute::Cuda::MultiHeadAttention::Detail::build_backward_k_plan	(	cublasLtHandle_t	handle,
		int	batch_size,
		int	num_heads,
		int	seq_length,
		int	head_size,
		cudaDataType_t	cuda_data_type,
		cublasComputeType_t	compute_type,
		cudaDataType_t	scale_type )

Here is the call graph for this function:

Here is the caller graph for this function:

◆ build_backward_q_plan()

template<typename TNative>

CublasLtMatMulPlan< TNative > Mila::Dnn::Compute::Cuda::MultiHeadAttention::Detail::build_backward_q_plan	(	cublasLtHandle_t	handle,
		int	batch_size,
		int	num_heads,
		int	seq_length,
		int	head_size,
		cudaDataType_t	cuda_data_type,
		cublasComputeType_t	compute_type,
		cudaDataType_t	scale_type )

Here is the call graph for this function:

Here is the caller graph for this function:

◆ build_backward_v_plan()

template<typename TNative>

CublasLtMatMulPlan< TNative > Mila::Dnn::Compute::Cuda::MultiHeadAttention::Detail::build_backward_v_plan	(	cublasLtHandle_t	handle,
		int	batch_size,
		int	num_heads,
		int	seq_length,
		int	head_size,
		cudaDataType_t	cuda_data_type,
		cublasComputeType_t	compute_type,
		cudaDataType_t	scale_type )

Here is the call graph for this function:

Here is the caller graph for this function:

◆ build_qk_decode_plan()

template<typename TNative>

CublasLtMatMulPlan< TNative > Mila::Dnn::Compute::Cuda::MultiHeadAttention::Detail::build_qk_decode_plan	(	cublasLtHandle_t	handle,
		int	batch_size,
		int	num_heads,
		int	max_seq_length,
		int	head_size,
		cudaDataType_t	cuda_data_type,
		cublasComputeType_t	compute_type,
		cudaDataType_t	scale_type )

Here is the call graph for this function:

Here is the caller graph for this function:

◆ build_qk_score_plan()

template<typename TNative>

CublasLtMatMulPlan< TNative > Mila::Dnn::Compute::Cuda::MultiHeadAttention::Detail::build_qk_score_plan	(	cublasLtHandle_t	handle,
		int	batch_size,
		int	num_heads,
		int	seq_length,
		int	head_size,
		cudaDataType_t	cuda_data_type,
		cublasComputeType_t	compute_type,
		cudaDataType_t	scale_type )

Build cuBLASLt plan for Q�K^T attention score computation (row-major).

Row-major storage: Q[K] and K[K] are stored as [T, HS] (rows = sequence length, cols = head size). Mathematical operation: preatt[T, T] = Q[T, HS] @ K^T[HS, T]

Here is the call graph for this function:

Here is the caller graph for this function:

Classes

Typedefs

Functions

Typedef Documentation

◆ CublasLtMatMulPlan

Function Documentation

◆ build_att_value_decode_plan()

◆ build_att_value_plan()

◆ build_backward_att_plan()

◆ build_backward_k_plan()

◆ build_backward_q_plan()

◆ build_backward_v_plan()

◆ build_qk_decode_plan()

◆ build_qk_score_plan()