Mila 0.13.48
Deep Neural Network Library
Loading...
Searching...
No Matches
Mila::Dnn::Compute::Cuda::Linear::Detail Namespace Reference

Classes

struct  cuda_matmul_impl
 CUDA kernel dispatcher for Linear operations. More...
struct  cuda_matmul_impl< float >
struct  cuda_matvec_impl
 CUDA kernel dispatcher for matrix-vector multiply (M=1 decode path). More...
struct  cuda_matvec_impl< float, float >
struct  cuda_matvec_impl< nv_bfloat16, __nv_fp8_e4m3 >
struct  cuda_matvec_impl< nv_bfloat16, nv_bfloat16 >

Typedefs

template<typename TComputePrecision>
using CublasLtMatMulPlan = Mila::Dnn::Compute::Cuda::CublasLtMatMulPlan<TComputePrecision>

Functions

template<typename TComputePrecision>
CublasLtMatMulPlan< TComputePrecision > build_backward_input_plan (cublasLtHandle_t handle, int batch_size, int in_features, int out_features, cudaDataType_t cuda_data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type)
 Build cuBLASLt plan for backward input gradient computation.
template<typename TComputePrecision>
CublasLtMatMulPlan< TComputePrecision > build_backward_weight_plan (cublasLtHandle_t handle, int batch_size, int in_features, int out_features, cudaDataType_t cuda_data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type)
 Build cuBLASLt plan for backward weight gradient computation.
template<typename TComputePrecision>
CublasLtMatMulPlan< TComputePrecision > build_forward_plan (cublasLtHandle_t handle, int batch_size, int in_features, int out_features, bool has_bias, cudaDataType_t cuda_data_type, cublasComputeType_t compute_type, cudaDataType_t scale_type)
template<typename TComputePrecision>
void compute_bias_gradient (TComputePrecision *bias_grad, const TComputePrecision *output_grad, int batch_size, int out_features, cudaStream_t stream)
 Compute bias gradient via reduction sum across batch dimension.
void Mila::Dnn::Compute::Cuda::Linear::Detail::quantize_fp4_per_group (const Mila::Dnn::Serialization::ITensorBlob &blob, Mila::Dnn::ITensor &weight_out, Mila::Dnn::ITensor &scales_out, const Mila::Dnn::shape_t &expected_shape, int group_size, void *dev_staging, cudaStream_t stream)
 Validate, quantize and upload a BF16 weight blob to packed FP4_E2M1 with per-group float32 scales.
void Mila::Dnn::Compute::Cuda::Linear::Detail::quantize_fp8_per_channel (const Mila::Dnn::Serialization::ITensorBlob &blob, Mila::Dnn::ITensor &weight_out, Mila::Dnn::ITensor &scales_out, const Mila::Dnn::shape_t &expected_shape, void *dev_staging, cudaStream_t stream)
 Validate, quantize and upload a BF16 weight blob to FP8_E4M3 on device.
void Mila::Dnn::Compute::Cuda::Linear::Detail::quantize_fp8_per_tensor (const Mila::Dnn::Serialization::ITensorBlob &blob, Mila::Dnn::ITensor &weight_out, Mila::Dnn::ITensor &scales_out, const Mila::Dnn::shape_t &expected_shape, void *dev_staging, cudaStream_t stream)
 Validate, quantize and upload a BF16 weight blob to FP8_E4M3 with a single per-tensor scale — for the Ada (SM 8.9+) cuBLASLt TN path.

Typedef Documentation

◆ CublasLtMatMulPlan

Function Documentation

◆ build_backward_input_plan()

template<typename TComputePrecision>
CublasLtMatMulPlan< TComputePrecision > Mila::Dnn::Compute::Cuda::Linear::Detail::build_backward_input_plan ( cublasLtHandle_t handle,
int batch_size,
int in_features,
int out_features,
cudaDataType_t cuda_data_type,
cublasComputeType_t compute_type,
cudaDataType_t scale_type )

Build cuBLASLt plan for backward input gradient computation.

Computes dX[batch, in] = dY[batch, out] @ weight[out, in]. Row-major layout, opA=N, opB=N, batch_count=1.

Here is the call graph for this function:
Here is the caller graph for this function:

◆ build_backward_weight_plan()

template<typename TComputePrecision>
CublasLtMatMulPlan< TComputePrecision > Mila::Dnn::Compute::Cuda::Linear::Detail::build_backward_weight_plan ( cublasLtHandle_t handle,
int batch_size,
int in_features,
int out_features,
cudaDataType_t cuda_data_type,
cublasComputeType_t compute_type,
cudaDataType_t scale_type )

Build cuBLASLt plan for backward weight gradient computation.

Computes dW[out, in] = dY^T[out, batch] @ X[batch, in]. Row-major layout, opA=T, opB=N, batch_count=1. Note: always built at max batch_size — the weight gradient accumulates over the full batch.

Here is the call graph for this function:
Here is the caller graph for this function:

◆ build_forward_plan()

template<typename TComputePrecision>
CublasLtMatMulPlan< TComputePrecision > Mila::Dnn::Compute::Cuda::Linear::Detail::build_forward_plan ( cublasLtHandle_t handle,
int batch_size,
int in_features,
int out_features,
bool has_bias,
cudaDataType_t cuda_data_type,
cublasComputeType_t compute_type,
cudaDataType_t scale_type )
Here is the call graph for this function:
Here is the caller graph for this function:

◆ compute_bias_gradient()

template<typename TComputePrecision>
void Mila::Dnn::Compute::Cuda::Linear::Detail::compute_bias_gradient ( TComputePrecision * bias_grad,
const TComputePrecision * output_grad,
int batch_size,
int out_features,
cudaStream_t stream )

Compute bias gradient via reduction sum across batch dimension.

dB[out] = sum(dY[batch, out], dim=0)

Here is the call graph for this function:
Here is the caller graph for this function:

◆ quantize_fp4_per_group()

void Mila::Dnn::Compute::Cuda::Linear::Detail::quantize_fp4_per_group ( const Mila::Dnn::Serialization::ITensorBlob & blob,
Mila::Dnn::ITensor & weight_out,
Mila::Dnn::ITensor & scales_out,
const Mila::Dnn::shape_t & expected_shape,
int group_size,
void * dev_staging,
cudaStream_t stream )
export

Validate, quantize and upload a BF16 weight blob to packed FP4_E2M1 with per-group float32 scales.

For each output channel n and each group g of group_size (gs) input channels:
    scale[n, g] = max(|W[n, g*gs .. (g+1)*gs)|) / 6.0f
    packed[n, k/2] = fp4_e2m1(W[n, k] / scale[n, g]), nibble-packed (low nibble = even k, high nibble = odd k)

This function is the non-template CL.EXE/NVCC boundary crossing point for the FP4 quantize-on-load path. CudaLinearOp::quantize() (template body, CL.EXE) passes the compile-time group_size as a runtime int; this function dispatches to the correct NVCC-compiled kernel instantiation.

Parameters
blob: Host BF16 weight blob [out_features, in_features].
weight_out: Device UINT8 tensor [out_features, in_features/2].
scales_out: Device FP32 tensor [out_features, in_features/group_size].
expected_shape: Expected weight shape [out_features, in_features].
group_size: Quantization group size (64 or 128).
Here is the call graph for this function:
Here is the caller graph for this function:

◆ quantize_fp8_per_channel()

void Mila::Dnn::Compute::Cuda::Linear::Detail::quantize_fp8_per_channel ( const Mila::Dnn::Serialization::ITensorBlob & blob,
Mila::Dnn::ITensor & weight_out,
Mila::Dnn::ITensor & scales_out,
const Mila::Dnn::shape_t & expected_shape,
void * dev_staging,
cudaStream_t stream )
export

Validate, quantize and upload a BF16 weight blob to FP8_E4M3 on device.

Validates the incoming blob shape against expected_shape, then delegates to cuda_quantize_fp8_per_channel() for per-channel absmax quantization and device upload. See cuda_quantize_fp8_per_channel() in CudaFp8WeightQuantization.cu for the quantization algorithm.

This function is the non-template bridge that keeps all CUDA host code inside NVCC-compiled TUs. CudaLinearOp::quantize() (a template member body compiled by CL.EXE) is the sole caller.

Parameters
blob: Host BF16 weight blob from the model archive.
weight_out: Device FP8_E4M3 tensor of shape [out_features, in_features].
scales_out: Device float32 tensor of shape [out_features].
expected_shape: Expected weight shape for validation.
Exceptions
std::invalid_argument: if the blob shape does not match expected_shape.
std::runtime_error: if a cudaMemcpy device upload fails.
Here is the call graph for this function:
Here is the caller graph for this function:

◆ quantize_fp8_per_tensor()

void Mila::Dnn::Compute::Cuda::Linear::Detail::quantize_fp8_per_tensor ( const Mila::Dnn::Serialization::ITensorBlob & blob,
Mila::Dnn::ITensor & weight_out,
Mila::Dnn::ITensor & scales_out,
const Mila::Dnn::shape_t & expected_shape,
void * dev_staging,
cudaStream_t stream )
export

Validate, quantize and upload a BF16 weight blob to FP8_E4M3 with a single per-tensor scale — for the Ada (SM 8.9+) cuBLASLt TN path.

Folds all per-channel scales into one global scale:
    global_scale = max(|W[o, i]| over all o, i) / 448.0f

Every slot of scales_out is filled with global_scale, so that the address of scales_out[0] can be passed directly as CUBLASLT_MATMUL_DESC_A_SCALE_POINTER.

Parameters
blob: Host BF16 weight blob from the model archive.
weight_out: Device FP8_E4M3 tensor of shape [out_features, in_features].
scales_out: Device float32 tensor of shape [out_features].
expected_shape: Expected weight shape for validation.
Exceptions
std::invalid_argument: if the blob shape does not match expected_shape.
std::runtime_error: if any CUDA call fails.
Here is the call graph for this function:
Here is the caller graph for this function: