Classes
struct	cuda_matmul_impl
	CUDA kernel dispatcher for Linear operations. More...
struct	cuda_matmul_impl< float >
struct	cuda_matvec_impl
	CUDA kernel dispatcher for matrix-vector multiply (M=1 decode path). More...
struct	cuda_matvec_impl< float, float >
struct	cuda_matvec_impl< nv_bfloat16, __nv_fp8_e4m3 >
struct	cuda_matvec_impl< nv_bfloat16, nv_bfloat16 >

Typedefs
template<typename TComputePrecision>
using	CublasLtMatMulPlan = Mila::Dnn::Compute::Cuda::CublasLtMatMulPlan<TComputePrecision>

Functions
template<typename TComputePrecision>
CublasLtMatMulPlan< TComputePrecision >	build_backward_input_plan (cublasLtHandle_t handle, int batch_size, int in_features, int out_features, cudaDataType_t cuda_data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type)
	Build cuBLASLt plan for backward input gradient computation.
template<typename TComputePrecision>
CublasLtMatMulPlan< TComputePrecision >	build_backward_weight_plan (cublasLtHandle_t handle, int batch_size, int in_features, int out_features, cudaDataType_t cuda_data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type)
	Build cuBLASLt plan for backward weight gradient computation.
template<typename TComputePrecision>
CublasLtMatMulPlan< TComputePrecision >	build_forward_plan (cublasLtHandle_t handle, int batch_size, int in_features, int out_features, bool has_bias, cudaDataType_t cuda_data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type)
template<typename TComputePrecision>
void	compute_bias_gradient (TComputePrecision *bias_grad, const TComputePrecision *output_grad, int batch_size, int out_features, cudaStream_t stream)
	Compute bias gradient via reduction sum across batch dimension.
void	Mila::Dnn::Compute::Cuda::Linear::Detail::quantize_fp4_per_group (const Mila::Dnn::Serialization::ITensorBlob &blob, Mila::Dnn::ITensor &weight_out, Mila::Dnn::ITensor &scales_out, const Mila::Dnn::shape_t &expected_shape, int group_size, void *dev_staging, cudaStream_t stream)
	Validate, quantize and upload a BF16 weight blob to packed FP4_E2M1 with per-group float32 scales.
void	Mila::Dnn::Compute::Cuda::Linear::Detail::quantize_fp8_per_channel (const Mila::Dnn::Serialization::ITensorBlob &blob, Mila::Dnn::ITensor &weight_out, Mila::Dnn::ITensor &scales_out, const Mila::Dnn::shape_t &expected_shape, void *dev_staging, cudaStream_t stream)
	Validate, quantize and upload a BF16 weight blob to FP8_E4M3 on device.
void	Mila::Dnn::Compute::Cuda::Linear::Detail::quantize_fp8_per_tensor (const Mila::Dnn::Serialization::ITensorBlob &blob, Mila::Dnn::ITensor &weight_out, Mila::Dnn::ITensor &scales_out, const Mila::Dnn::shape_t &expected_shape, void *dev_staging, cudaStream_t stream)
	Validate, quantize and upload a BF16 weight blob to FP8_E4M3 with a single per-tensor scale — for the Ada (SM 8.9+) cuBLASLt TN path.

Typedef Documentation

◆ CublasLtMatMulPlan

template<typename TComputePrecision>

using Mila::Dnn::Compute::Cuda::Linear::Detail::CublasLtMatMulPlan = Mila::Dnn::Compute::Cuda::CublasLtMatMulPlan<TComputePrecision>

Function Documentation

◆ build_backward_input_plan()

template<typename TComputePrecision>

CublasLtMatMulPlan< TComputePrecision > Mila::Dnn::Compute::Cuda::Linear::Detail::build_backward_input_plan	(	cublasLtHandle_t	handle,
		int	batch_size,
		int	in_features,
		int	out_features,
		cudaDataType_t	cuda_data_type,
		cublasComputeType_t	compute_type,
		cudaDataType_t	scale_type )

Build cuBLASLt plan for backward input gradient computation.

Computes dX[batch, in] = dY[batch, out] @ weight[out, in]. Row-major layout, opA=N, opB=N, batch_count=1.

Here is the call graph for this function:

Here is the caller graph for this function:

◆ build_backward_weight_plan()

template<typename TComputePrecision>

CublasLtMatMulPlan< TComputePrecision > Mila::Dnn::Compute::Cuda::Linear::Detail::build_backward_weight_plan	(	cublasLtHandle_t	handle,
		int	batch_size,
		int	in_features,
		int	out_features,
		cudaDataType_t	cuda_data_type,
		cublasComputeType_t	compute_type,
		cudaDataType_t	scale_type )

Build cuBLASLt plan for backward weight gradient computation.

Computes dW[out, in] = dY^T[out, batch] @ X[batch, in]. Row-major layout, opA=T, opB=N, batch_count=1. Note: the plan is always built at the maximum batch_size, because the weight gradient accumulates over the full batch.

Here is the call graph for this function:

Here is the caller graph for this function:

◆ build_forward_plan()

template<typename TComputePrecision>

CublasLtMatMulPlan< TComputePrecision > Mila::Dnn::Compute::Cuda::Linear::Detail::build_forward_plan	(	cublasLtHandle_t	handle,
		int	batch_size,
		int	in_features,
		int	out_features,
		bool	has_bias,
		cudaDataType_t	cuda_data_type,
		cublasComputeType_t	compute_type,
		cudaDataType_t	scale_type )

Here is the call graph for this function:

Here is the caller graph for this function:

◆ compute_bias_gradient()

template<typename TComputePrecision>

void Mila::Dnn::Compute::Cuda::Linear::Detail::compute_bias_gradient	(	TComputePrecision *	bias_grad,
		const TComputePrecision *	output_grad,
		int	batch_size,
		int	out_features,
		cudaStream_t	stream )

Compute bias gradient via reduction sum across batch dimension.

dB[out] = sum(dY[batch, out], dim=0)

Here is the call graph for this function:

Here is the caller graph for this function:

◆ quantize_fp4_per_group()

void Mila::Dnn::Compute::Cuda::Linear::Detail::quantize_fp4_per_group	(	const Mila::Dnn::Serialization::ITensorBlob &	blob,
		Mila::Dnn::ITensor &	weight_out,
		Mila::Dnn::ITensor &	scales_out,
		const Mila::Dnn::shape_t &	expected_shape,
		int	group_size,
		void *	dev_staging,
		cudaStream_t	stream )

export

Validate, quantize and upload a BF16 weight blob to packed FP4_E2M1 with per-group float32 scales.

For each output channel n and each group g of group_size (gs) input channels:

    scale[n, g]    = max(|W[n, g*gs .. (g+1)*gs)|) / 6.0f
    packed[n, k/2] = fp4_e2m1(W[n, k] / scale[n, g])

Values are nibble-packed, two per byte (low nibble = even k, high nibble = odd k).

This function is the non-template CL.EXE/NVCC boundary crossing point for the FP4 quantize-on-load path. CudaLinearOp::quantize() (template body, CL.EXE) passes the compile-time group_size as a runtime int; this function dispatches to the correct NVCC-compiled kernel instantiation.

Parameters

blob	Host BF16 weight blob [out_features, in_features].
weight_out	Device UINT8 tensor [out_features, in_features/2].
scales_out	Device FP32 tensor [out_features, in_features/group_size].
expected_shape	Expected weight shape [out_features, in_features].
group_size	Quantization group size (64 or 128).

Here is the call graph for this function:

Here is the caller graph for this function:

◆ quantize_fp8_per_channel()

void Mila::Dnn::Compute::Cuda::Linear::Detail::quantize_fp8_per_channel	(	const Mila::Dnn::Serialization::ITensorBlob &	blob,
		Mila::Dnn::ITensor &	weight_out,
		Mila::Dnn::ITensor &	scales_out,
		const Mila::Dnn::shape_t &	expected_shape,
		void *	dev_staging,
		cudaStream_t	stream )

export

Validate, quantize and upload a BF16 weight blob to FP8_E4M3 on device.

Validates the incoming blob shape against expected_shape, then delegates to cuda_quantize_fp8_per_channel() for per-channel absmax quantization and device upload. See cuda_quantize_fp8_per_channel() in CudaFp8WeightQuantization.cu for the quantization algorithm.

This function is the non-template bridge that keeps all CUDA host code inside NVCC-compiled TUs. CudaLinearOp::quantize() (a template member body compiled by NVCC) is the sole caller.

Parameters

blob	Host BF16 weight blob from the model archive.
weight_out	Device FP8_E4M3 tensor of shape [out_features, in_features].
scales_out	Device float32 tensor of shape [out_features].
expected_shape	Expected weight shape for validation.

Exceptions

std::invalid_argument	if the blob shape does not match expected_shape.
std::runtime_error	if a cudaMemcpy device upload fails.

Here is the call graph for this function:

Here is the caller graph for this function:

◆ quantize_fp8_per_tensor()

void Mila::Dnn::Compute::Cuda::Linear::Detail::quantize_fp8_per_tensor	(	const Mila::Dnn::Serialization::ITensorBlob &	blob,
		Mila::Dnn::ITensor &	weight_out,
		Mila::Dnn::ITensor &	scales_out,
		const Mila::Dnn::shape_t &	expected_shape,
		void *	dev_staging,
		cudaStream_t	stream )

export

Validate, quantize and upload a BF16 weight blob to FP8_E4M3 with a single per-tensor scale — for the Ada (SM 8.9+) cuBLASLt TN path.

Folds all per-channel scales into one global scale:

    global_scale = max(|W[o, i]| over all o, i) / 448.0f

Every slot of scales_out is filled with global_scale so that scales_out[0] can be passed directly as CUBLASLT_MATMUL_DESC_A_SCALE_POINTER.

Parameters

blob	Host BF16 weight blob from the model archive.
weight_out	Device FP8_E4M3 tensor of shape [out_features, in_features].
scales_out	Device float32 tensor of shape [out_features].
expected_shape	Expected weight shape for validation.

Exceptions

std::invalid_argument	if the blob shape does not match expected_shape.
std::runtime_error	if any CUDA call fails.

Here is the call graph for this function:

Here is the caller graph for this function:

Classes

Typedefs

Functions

Typedef Documentation

◆ CublasLtMatMulPlan

Function Documentation

◆ build_backward_input_plan()

◆ build_backward_weight_plan()

◆ build_forward_plan()

◆ compute_bias_gradient()

◆ quantize_fp4_per_group()

◆ quantize_fp8_per_channel()

◆ quantize_fp8_per_tensor()