|
| template<typename TNative> |
| CublasLtMatMulPlan< TNative > | Mila::Dnn::Compute::Cuda::Gqa::Detail::build_att_value_decode_plan (cublasLtHandle_t handle, int batch_size, int num_heads, int max_seq_length, int head_size, cudaDataType_t cuda_data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type) |
| | Single-token Att @ V decode plan.
|
| template<typename TNative> |
| CublasLtMatMulPlan< TNative > | Mila::Dnn::Compute::Cuda::Gqa::Detail::build_att_value_decode_plan_optimized (cublasLtHandle_t handle, int batch_size, int num_kv_heads, int group_size, int max_seq_length, int head_size, cudaDataType_t cuda_data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type) |
| | Single-token Att @ V decode plan reading V from [B, NKV, T, HS] cache.
|
| template<typename TNative> |
| CublasLtMatMulPlan< TNative > | Mila::Dnn::Compute::Cuda::Gqa::Detail::build_att_value_plan (cublasLtHandle_t handle, int batch_size, int num_heads, int seq_length, int head_size, cudaDataType_t cuda_data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type) |
| | Att @ V weighted-sum plan (training, full sequence length).
|
| template<typename TNative> |
| CublasLtMatMulPlan< TNative > | Mila::Dnn::Compute::Cuda::Gqa::Detail::build_att_value_prefill_plan (cublasLtHandle_t handle, int batch_size, int num_heads, int chunk_rows, int max_seq_length, int head_size, int prefill_window_size, cudaDataType_t cuda_data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type) |
| | Att @ V weighted-sum plan for chunked prefill (inference).
|
| template<typename TNative> |
| CublasLtMatMulPlan< TNative > | Mila::Dnn::Compute::Cuda::Gqa::Detail::build_att_value_prefill_plan_optimized (cublasLtHandle_t handle, int batch_size, int num_kv_heads, int group_size, int chunk_rows, int max_seq_length, int head_size, cudaDataType_t cuda_data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type) |
| | Att @ V prefill plan reading V directly from [B, NKV, T, HS] cache.
|
| template<typename TNative> |
| CublasLtMatMulPlan< TNative > | Mila::Dnn::Compute::Cuda::Gqa::Detail::build_backward_att_plan (cublasLtHandle_t handle, int batch_size, int num_heads, int seq_length, int head_size, cudaDataType_t cuda_data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type) |
| | dAtt = dVout @ V^T.
|
| template<typename TNative> |
| CublasLtMatMulPlan< TNative > | Mila::Dnn::Compute::Cuda::Gqa::Detail::build_backward_k_plan (cublasLtHandle_t handle, int batch_size, int num_heads, int seq_length, int head_size, cudaDataType_t cuda_data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type) |
| | dK = dPreatt^T @ Q.
|
| template<typename TNative> |
| CublasLtMatMulPlan< TNative > | Mila::Dnn::Compute::Cuda::Gqa::Detail::build_backward_q_plan (cublasLtHandle_t handle, int batch_size, int num_heads, int seq_length, int head_size, cudaDataType_t cuda_data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type) |
| | dQ = dPreatt @ K.
|
| template<typename TNative> |
| CublasLtMatMulPlan< TNative > | Mila::Dnn::Compute::Cuda::Gqa::Detail::build_backward_v_plan (cublasLtHandle_t handle, int batch_size, int num_heads, int seq_length, int head_size, cudaDataType_t cuda_data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type) |
| | dV = Att^T @ dVout.
|
| template<typename TNative> |
| CublasLtMatMulPlan< TNative > | Mila::Dnn::Compute::Cuda::Gqa::Detail::build_qk_decode_plan (cublasLtHandle_t handle, int batch_size, int num_heads, int max_seq_length, int head_size, cudaDataType_t cuda_data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type) |
| | Single-token Q @ K^T decode plan.
|
| template<typename TNative> |
| CublasLtMatMulPlan< TNative > | Mila::Dnn::Compute::Cuda::Gqa::Detail::build_qk_decode_plan_optimized (cublasLtHandle_t handle, int batch_size, int num_kv_heads, int group_size, int max_seq_length, int head_size, cudaDataType_t cuda_data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type) |
| | Single-token Q @ K^T decode plan reading K from [B, NKV, T, HS] cache.
|
| template<typename TNative> |
| CublasLtMatMulPlan< TNative > | Mila::Dnn::Compute::Cuda::Gqa::Detail::build_qk_prefill_plan (cublasLtHandle_t handle, int batch_size, int num_heads, int chunk_rows, int max_seq_length, int head_size, int prefill_window_size, cudaDataType_t cuda_data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type) |
| | Q @ K^T attention score plan for chunked prefill (inference).
|
| template<typename TNative> |
| CublasLtMatMulPlan< TNative > | Mila::Dnn::Compute::Cuda::Gqa::Detail::build_qk_prefill_plan_optimized (cublasLtHandle_t handle, int batch_size, int num_kv_heads, int group_size, int chunk_rows, int max_seq_length, int head_size, cudaDataType_t cuda_data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type) |
| | Q @ K^T prefill plan reading K directly from [B, NKV, T, HS] cache.
|
| template<typename TNative> |
| CublasLtMatMulPlan< TNative > | Mila::Dnn::Compute::Cuda::Gqa::Detail::build_qk_score_plan (cublasLtHandle_t handle, int batch_size, int num_heads, int seq_length, int head_size, cudaDataType_t cuda_data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type) |
| | Q @ K^T attention score plan (training, full sequence length).
|