Mila 0.13.48
Deep Neural Network Library
Loading...
Searching...
No Matches
CudaLinearOp.ixx File Reference

CUDA implementation of Linear operation with two-phase cuBLASLt optimization. More...

#include <cublasLt.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cuda_bf16.h>
#include <cuda_fp8.h>
#include <vector>
#include <memory>
#include <string>
#include <format>
#include <stdexcept>
#include <exception>
#include <cstdint>
#include <type_traits>
#include <sstream>
#include <cassert>
#include <algorithm>
#include <cmath>
#include "Kernels/Linear.cuh"
#include "Kernels/Fp8Prefill/CudaFp8Prefill.cuh"
#include "Kernels/W8A16Gemm/CudaW8A16Gemm.cuh"
#include "Kernels/W4A16Gemm/CudaW4A16Gemm.cuh"
#include "Kernels/W4A16Gemm/CudaW4A16Gemm.Wmma.cuh"
import Dnn.TensorHelpers;
import Dnn.ComponentConfig;
import Dnn.Components.LinearConfig;
import Compute.OperationBase;
import Compute.ExecutionContextTemplate;
import Compute.CudaDeviceMemoryResource;
import Dnn.TensorDataTypeTraits;
import Compute.CublasLtPlanCache;
import Dnn.ITensor;
import Dnn.Tensor;
import Compute.CudaDevice;
import Compute.CudaLinearOp:Dispatch;
import Dnn.TensorDataType;
import Dnn.Component;
import Compute.CublasLtPlan;
import Compute.DeviceType;
import Compute.IExecutionContext;
import Compute.OperationType;
import Compute.MemoryResource;
import Compute.CudaTensorDataType;
import Compute.Cuda.CublasLtLinearPlan;
import Serialization.Tensor;
import CublasLt.Error;
import Dnn.Quantization.Weight.Policies;
import Dnn.TensorTypes;
import Compute.ExecutionContext;
import Logging.Logger;
import Dnn.TensorOps;

Classes

class  Mila::Dnn::Compute::Cuda::Linear::CudaLinearOp< TComputePrecision, TWeightQuant >
 CUDA Linear operation with compile-time weight quantization policy dispatch. More...
class  Mila::Dnn::Compute::Cuda::Linear::CudaLinearOpRegistrar

Namespaces

namespace  Mila
 Mila main API namespace.
namespace  Mila::Dnn
namespace  Mila::Dnn::Compute
namespace  Mila::Dnn::Compute::Cuda
namespace  Mila::Dnn::Compute::Cuda::Linear

Detailed Description

CUDA implementation of Linear operation with two-phase cuBLASLt optimization.

CUDA Linear operation with compile-time weight quantization policy dispatch.

TWeightQuant = NoWeightQuant selects the standard BF16/FP32 cuBLASLt path. TWeightQuant = PerChannelFp8<> selects the mixed-precision path that combines FP8 weights with BF16 activations. quantize() and setWeightScales() are callable only on the PerChannelFp8<> instantiation (enforced via a requires clause). All other operations are unaware that these two methods exist.