|
| template<typename TNative> |
| CublasLtMatMulPlan< TNative > | build_att_value_decode_plan (cublasLtHandle_t handle, int batch_size, int num_heads, int max_seq_length, int head_size, cudaDataType_t cuda_data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type) |
| template<typename TNative> |
| CublasLtMatMulPlan< TNative > | build_att_value_plan (cublasLtHandle_t handle, int batch_size, int num_heads, int seq_length, int head_size, cudaDataType_t cuda_data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type) |
| template<typename TNative> |
| CublasLtMatMulPlan< TNative > | build_backward_att_plan (cublasLtHandle_t handle, int batch_size, int num_heads, int seq_length, int head_size, cudaDataType_t cuda_data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type) |
| template<typename TNative> |
| CublasLtMatMulPlan< TNative > | build_backward_k_plan (cublasLtHandle_t handle, int batch_size, int num_heads, int seq_length, int head_size, cudaDataType_t cuda_data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type) |
| template<typename TNative> |
| CublasLtMatMulPlan< TNative > | build_backward_q_plan (cublasLtHandle_t handle, int batch_size, int num_heads, int seq_length, int head_size, cudaDataType_t cuda_data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type) |
| template<typename TNative> |
| CublasLtMatMulPlan< TNative > | build_backward_v_plan (cublasLtHandle_t handle, int batch_size, int num_heads, int seq_length, int head_size, cudaDataType_t cuda_data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type) |
| template<typename TNative> |
| CublasLtMatMulPlan< TNative > | build_qk_decode_plan (cublasLtHandle_t handle, int batch_size, int num_heads, int max_seq_length, int head_size, cudaDataType_t cuda_data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type) |
| template<typename TNative> |
| CublasLtMatMulPlan< TNative > | build_qk_score_plan (cublasLtHandle_t handle, int batch_size, int num_heads, int seq_length, int head_size, cudaDataType_t cuda_data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type) |
| | Build cuBLASLt plan for Q·K^T attention score computation (row-major).
|
template<typename TNative>
| CublasLtMatMulPlan< TNative > Mila::Dnn::Compute::Cuda::MultiHeadAttention::Detail::build_qk_score_plan |
( |
cublasLtHandle_t | handle, |
|
|
int | batch_size, |
|
|
int | num_heads, |
|
|
int | seq_length, |
|
|
int | head_size, |
|
|
cudaDataType_t | cuda_data_type, |
|
|
cublasComputeType_t | compute_type, |
|
|
cudaDataType_t | scale_type ) |
Build cuBLASLt plan for Q·K^T attention score computation (row-major).
Row-major storage: Q and K are each stored as [T, HS] (rows = sequence length T, cols = head size HS). Mathematical operation: preatt[T, T] = Q[T, HS] @ K^T[HS, T].