Mila 0.13.48
Deep Neural Network Library
Loading...
Searching...
No Matches
CudaGqaOp.ixx File Reference

CUDA Grouped-Query Attention (GQA) operation using cuBLASLt.

#include <cublasLt.h>
#include <cuda_fp16.h>
#include <vector>
#include <memory>
#include <string>
#include <format>
#include <stdexcept>
#include <cstdint>
#include <type_traits>
#include <sstream>
#include <cassert>
#include <unordered_map>
#include "Kernels/CudaGqa.cuh"
#include <iostream>
import Cuda.Debug;
import Compute.OperationBase;
import Dnn.TensorDataTypeTraits;
import Compute.CudaDevice;
import Dnn.TensorTypes;
import Dnn.Component;
import Dnn.ITensor;
import Compute.UnaryOperation;
import Compute.ExecutionContext;
import Dnn.ComponentConfig;
import Compute.CudaGqaOp:Plans;
import Dnn.Components.GqaConfig;
import Dnn.TensorOps;
import Compute.Device;
import Compute.MemoryResource;
import Compute.DeviceType;
import Compute.OperationRegistry;
import Compute.IExecutionContext;
import Compute.OperationType;
import Compute.CpuMemoryResource;
import Dnn.Tensor;
import Compute.CudaDeviceMemoryResource;
import Compute.IKvInference;
import Compute.CudaTensorDataType;
import Compute.GqaState;
import Compute.CublasLtPlan;
import CublasLt.Error;
import Dnn.TensorDataType;
import Logging.Logger;

Classes

class  Mila::Dnn::Compute::Cuda::Gqa::CudaGqaOp< TPrecision >
 CUDA Grouped-Query Attention operation.
class  Mila::Dnn::Compute::Cuda::Gqa::CudaGroupedQueryAttentionOpRegistrar

Namespaces

namespace  Mila
 Mila main API namespace.
namespace  Mila::Dnn
namespace  Mila::Dnn::Compute
namespace  Mila::Dnn::Compute::Cuda
namespace  Mila::Dnn::Compute::Cuda::Gqa

Variables

static constexpr bool Mila::Dnn::Compute::Cuda::Gqa::kUseOptimizedPath = true

Detailed Description

CUDA Grouped-Query Attention (GQA) operation using cuBLASLt.