|
Mila 0.13.48
Deep Neural Network Library
|
CUDA implementation of Linear operation with two-phase cuBLASLt optimization. More...
#include <cublasLt.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cuda_bf16.h>
#include <cuda_fp8.h>
#include <vector>
#include <memory>
#include <string>
#include <format>
#include <stdexcept>
#include <exception>
#include <cstdint>
#include <type_traits>
#include <sstream>
#include <cassert>
#include <algorithm>
#include <cmath>
#include "Kernels/Linear.cuh"
#include "Kernels/Fp8Prefill/CudaFp8Prefill.cuh"
#include "Kernels/W8A16Gemm/CudaW8A16Gemm.cuh"
#include "Kernels/W4A16Gemm/CudaW4A16Gemm.cuh"
#include "Kernels/W4A16Gemm/CudaW4A16Gemm.Wmma.cuh"
import Dnn.TensorHelpers;
import Dnn.ComponentConfig;
import Dnn.Components.LinearConfig;
import Compute.OperationBase;
import Compute.ExecutionContextTemplate;
import Compute.CudaDeviceMemoryResource;
import Dnn.TensorDataTypeTraits;
import Compute.CublasLtPlanCache;
import Dnn.ITensor;
import Dnn.Tensor;
import Compute.CudaDevice;
import Compute.CudaLinearOp:Dispatch;
import Dnn.TensorDataType;
import Dnn.Component;
import Compute.CublasLtPlan;
import Compute.DeviceType;
import Compute.IExecutionContext;
import Compute.OperationType;
import Compute.MemoryResource;
import Compute.CudaTensorDataType;
import Compute.Cuda.CublasLtLinearPlan;
import Serialization.Tensor;
import CublasLt.Error;
import Dnn.Quantization.Weight.Policies;
import Dnn.TensorTypes;
import Compute.ExecutionContext;
import Logging.Logger;
import Dnn.TensorOps;
Classes | |
| class | Mila::Dnn::Compute::Cuda::Linear::CudaLinearOp< TComputePrecision, TWeightQuant > |
| CUDA Linear operation with compile-time weight quantization policy dispatch. More... | |
| class | Mila::Dnn::Compute::Cuda::Linear::CudaLinearOpRegistrar |
Namespaces | |
| namespace | Mila |
| Mila main API namespace. | |
| namespace | Mila::Dnn |
| namespace | Mila::Dnn::Compute |
| namespace | Mila::Dnn::Compute::Cuda |
| namespace | Mila::Dnn::Compute::Cuda::Linear |
CUDA implementation of Linear operation with two-phase cuBLASLt optimization.
CUDA Linear operation with compile-time weight quantization policy dispatch.
TWeightQuant = NoWeightQuant selects the standard BF16/FP32 cuBLASLt path. TWeightQuant = PerChannelFp8<> selects the mixed-precision path with FP8 weights and BF16 activations. quantize() and setWeightScales() are callable only on the PerChannelFp8<> instantiation (enforced via a requires clause). All other operations are unaware that these two methods exist.