Mila 0.13.48
Deep Neural Network Library
Loading...
Searching...
No Matches
CudaMhaOp.Plans.ixx File Reference
#include <cublasLt.h>
#include <cuda_fp16.h>
#include <vector>
#include <memory>
#include <string>
#include <stdexcept>
#include <cstdint>
#include <type_traits>
#include <sstream>
#include <cassert>
#include "Kernels/CudaMha.cuh"
import Logging.Logger;
import Compute.CublasLtPlan;

Namespaces

namespace  Mila
 Mila main API namespace.
namespace  Mila::Dnn
namespace  Mila::Dnn::Compute
namespace  Mila::Dnn::Compute::Cuda
namespace  Mila::Dnn::Compute::Cuda::MultiHeadAttention
namespace  Mila::Dnn::Compute::Cuda::MultiHeadAttention::Detail

Typedefs

template<typename TNative>
using Mila::Dnn::Compute::Cuda::MultiHeadAttention::Detail::CublasLtMatMulPlan = CublasLtMatMulPlan<TNative>
 Alias for the CublasLtMatMulPlan<TNative> cuBLASLt matmul execution plan type, used by the attention operations.

Functions

template<typename TNative>
CublasLtMatMulPlan< TNative > Mila::Dnn::Compute::Cuda::MultiHeadAttention::Detail::build_att_value_decode_plan (cublasLtHandle_t handle, int batch_size, int num_heads, int max_seq_length, int head_size, cudaDataType_t cuda_data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type)
template<typename TNative>
CublasLtMatMulPlan< TNative > Mila::Dnn::Compute::Cuda::MultiHeadAttention::Detail::build_att_value_plan (cublasLtHandle_t handle, int batch_size, int num_heads, int seq_length, int head_size, cudaDataType_t cuda_data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type)
template<typename TNative>
CublasLtMatMulPlan< TNative > Mila::Dnn::Compute::Cuda::MultiHeadAttention::Detail::build_backward_att_plan (cublasLtHandle_t handle, int batch_size, int num_heads, int seq_length, int head_size, cudaDataType_t cuda_data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type)
template<typename TNative>
CublasLtMatMulPlan< TNative > Mila::Dnn::Compute::Cuda::MultiHeadAttention::Detail::build_backward_k_plan (cublasLtHandle_t handle, int batch_size, int num_heads, int seq_length, int head_size, cudaDataType_t cuda_data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type)
template<typename TNative>
CublasLtMatMulPlan< TNative > Mila::Dnn::Compute::Cuda::MultiHeadAttention::Detail::build_backward_q_plan (cublasLtHandle_t handle, int batch_size, int num_heads, int seq_length, int head_size, cudaDataType_t cuda_data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type)
template<typename TNative>
CublasLtMatMulPlan< TNative > Mila::Dnn::Compute::Cuda::MultiHeadAttention::Detail::build_backward_v_plan (cublasLtHandle_t handle, int batch_size, int num_heads, int seq_length, int head_size, cudaDataType_t cuda_data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type)
template<typename TNative>
CublasLtMatMulPlan< TNative > Mila::Dnn::Compute::Cuda::MultiHeadAttention::Detail::build_qk_decode_plan (cublasLtHandle_t handle, int batch_size, int num_heads, int max_seq_length, int head_size, cudaDataType_t cuda_data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type)
template<typename TNative>
CublasLtMatMulPlan< TNative > Mila::Dnn::Compute::Cuda::MultiHeadAttention::Detail::build_qk_score_plan (cublasLtHandle_t handle, int batch_size, int num_heads, int seq_length, int head_size, cudaDataType_t cuda_data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type)
 Builds the cuBLASLt plan for the Q·K^T attention-score computation (row-major).