|
Mila 0.13.48
Deep Neural Network Library
|
CUDA Grouped-Query Attention (GQA) operation using cuBLASLt. More...
#include <cublasLt.h>
#include <cuda_fp16.h>
#include <vector>
#include <memory>
#include <string>
#include <format>
#include <stdexcept>
#include <cstdint>
#include <type_traits>
#include <sstream>
#include <cassert>
#include <unordered_map>
#include "Kernels/CudaGqa.cuh"
#include <iostream>
import Cuda.Debug;
import Compute.OperationBase;
import Dnn.TensorDataTypeTraits;
import Compute.CudaDevice;
import Dnn.TensorTypes;
import Dnn.Component;
import Dnn.ITensor;
import Compute.UnaryOperation;
import Compute.ExecutionContext;
import Dnn.ComponentConfig;
import Compute.CudaGqaOp:Plans;
import Dnn.Components.GqaConfig;
import Dnn.TensorOps;
import Compute.Device;
import Compute.MemoryResource;
import Compute.DeviceType;
import Compute.OperationRegistry;
import Compute.IExecutionContext;
import Compute.OperationType;
import Compute.CpuMemoryResource;
import Dnn.Tensor;
import Compute.CudaDeviceMemoryResource;
import Compute.IKvInference;
import Compute.CudaTensorDataType;
import Compute.GqaState;
import Compute.CublasLtPlan;
import CublasLt.Error;
import Dnn.TensorDataType;
import Logging.Logger;
Classes | |
| class | Mila::Dnn::Compute::Cuda::Gqa::CudaGqaOp< TPrecision > |
| CUDA Grouped-Query Attention operation. More... | |
| class | Mila::Dnn::Compute::Cuda::Gqa::CudaGroupedQueryAttentionOpRegistrar |
Namespaces | |
| namespace | Mila |
| Mila main API namespace. | |
| namespace | Mila::Dnn |
| namespace | Mila::Dnn::Compute |
| namespace | Mila::Dnn::Compute::Cuda |
| namespace | Mila::Dnn::Compute::Cuda::Gqa |
Variables | |
| static constexpr bool | Mila::Dnn::Compute::Cuda::Gqa::kUseOptimizedPath = true |
CUDA Grouped-Query Attention (GQA) operation using cuBLASLt.