Mila 0.13.48
Deep Neural Network Library
Loading...
Searching...
No Matches
Mila::Dnn::Compute::Cuda::LayerNorm::CudaLayerNormOp< TPrecision > Class Template Reference export

CUDA implementation of Layer Normalization. More...

Inheritance diagram for Mila::Dnn::Compute::Cuda::LayerNorm::CudaLayerNormOp< TPrecision >:
Collaboration diagram for Mila::Dnn::Compute::Cuda::LayerNorm::CudaLayerNormOp< TPrecision >:

Public Types

using CudaExecutionContext = ExecutionContext<DeviceType::Cuda>
using MR = CudaDeviceMemoryResource
using NativeType = typename Mila::Dnn::Compute::Cuda::TensorDataTypeMap<TPrecision>::device_type
using TensorType = Tensor<TPrecision, MR>
using UnaryOperationBase = UnaryOperation<DeviceType::Cuda, TPrecision>
Public Types inherited from Mila::Dnn::Compute::UnaryOperation< DeviceType::Cuda, TPrecision >
using MR
using TensorInputType
using TensorOutputType
Public Types inherited from Mila::Dnn::Compute::Operation< TDeviceType, TInput >
using DataTypeTraits

Public Member Functions

 CudaLayerNormOp (IExecutionContext *context, const LayerNormConfig &config)
void backward (const ITensor &input, const ITensor &output_grad, ITensor &input_grad) const override
 Execute backward pass (hot path).
void build (const BuildContext &config) override
 Prepare operation for execution with concrete input shape.
void forward (const ITensor &input, ITensor &output) const override
 Execute forward pass (hot path).
const LayerNormConfig & getConfig () const
std::string getName () const override
 Human-readable operation name.
OperationType getOperationType () const override
 Operation type identifier.
void setGradients (ITensor *weight_grad, ITensor *bias_grad) override
 Bind component-owned parameter gradient tensors for training.
void setParameters (ITensor *weight, ITensor *bias) override
 Bind component-owned parameter tensors.
Public Member Functions inherited from Mila::Dnn::Compute::UnaryOperation< DeviceType::Cuda, TPrecision >
virtual ~UnaryOperation ()=default
Public Member Functions inherited from Mila::Dnn::Compute::Operation< TDeviceType, TInput >
virtual ~Operation ()=default
virtual void clearGradients () noexcept
 Clear any cached gradient pointers held by the operation.
virtual TensorDataType getDataType () const
 Tensor data type for this operation.
virtual DeviceType getDeviceType () const
 Device type for this operation.
virtual std::size_t getStateMemorySize () const
 Returns the number of bytes of state memory allocated by this operation.
virtual bool isBuilt () const
 Whether build() completed successfully for a concrete input shape.
virtual bool isEvalMode () const
 Query whether operation is configured for training.
virtual void setTrainingMode (TrainingMode training_mode)
 Configure operation training-mode behavior.

Private Member Functions

void computeRuntimePartition_ (const shape_t &input_shape, int64_t &norm_axis, int &outer_size, int &inner_size, int &norm_dim, int64_t &num_slices, int64_t &normalized_features) const
void validateNormalizedShape_ (const shape_t &input_shape) const
void validateRuntimeShape_ (const shape_t &input_shape) const

Private Attributes

NativeType * bias_ { nullptr }
NativeType * bias_grad_ { nullptr }
LayerNormConfig config_
CudaExecutionContext * context_
Detail::cuda_layernorm_impl< NativeType > impl_
int max_inner_size_ { 0 }
shape_t max_input_shape_
int max_norm_dim_ { 0 }
dim_t max_num_slices_ { 0 }
int max_outer_size_ { 0 }
NativeType * mean_ { nullptr }
std::shared_ptr< TensorType > mean_tensor_
int64_t norm_axis_ { -1 }
NativeType * rstd_ { nullptr }
std::shared_ptr< TensorType > rstd_tensor_
NativeType * weight_ { nullptr }
NativeType * weight_grad_ { nullptr }
int64_t weight_size_ { 0 }

Additional Inherited Members

Static Public Attributes inherited from Mila::Dnn::Compute::Operation< TDeviceType, TInput >
static constexpr TensorDataType data_type
static constexpr DeviceType device_type
Static Protected Member Functions inherited from Mila::Dnn::Compute::UnaryOperation< DeviceType::Cuda, TPrecision >
static const TensorInputTypeasInputTensor (const ITensor &t)
static TensorOutputTypeasOutputTensor (ITensor &t)
Protected Attributes inherited from Mila::Dnn::Compute::Operation< TDeviceType, TInput >
bool is_built_
TrainingMode training_mode_

Detailed Description

template<TensorDataType TPrecision>
requires PrecisionSupportedOnDevice<TPrecision, DeviceType::Cuda>
class Mila::Dnn::Compute::Cuda::LayerNorm::CudaLayerNormOp< TPrecision >

CUDA implementation of Layer Normalization.

Normalizes activations along a specified axis by computing mean and variance, then applying an affine transformation with learnable weight and bias parameters.

Design philosophy:

  • Two-phase initialization: build() allocates resources, forward()/backward() dispatch to kernels
  • Component owns weight/bias parameters, operation caches device pointers
  • Operation owns ephemeral forward-pass statistics (mean/rstd) required for backward
  • All dimension computation happens once in build() for zero-overhead hot-path execution
Template Parameters
TPrecision — Abstract tensor precision (FP32, FP16, etc.)

Member Typedef Documentation

◆ CudaExecutionContext

template<TensorDataType TPrecision>
using Mila::Dnn::Compute::Cuda::LayerNorm::CudaLayerNormOp< TPrecision >::CudaExecutionContext = ExecutionContext<DeviceType::Cuda>

◆ MR

◆ NativeType

◆ TensorType

template<TensorDataType TPrecision>
using Mila::Dnn::Compute::Cuda::LayerNorm::CudaLayerNormOp< TPrecision >::TensorType = Tensor<TPrecision, MR>

◆ UnaryOperationBase

template<TensorDataType TPrecision>
using Mila::Dnn::Compute::Cuda::LayerNorm::CudaLayerNormOp< TPrecision >::UnaryOperationBase = UnaryOperation<DeviceType::Cuda, TPrecision>

Constructor & Destructor Documentation

◆ CudaLayerNormOp()

template<TensorDataType TPrecision>
Mila::Dnn::Compute::Cuda::LayerNorm::CudaLayerNormOp< TPrecision >::CudaLayerNormOp ( IExecutionContext * context,
const LayerNormConfig & config )
inline
Here is the call graph for this function:

Member Function Documentation

◆ backward()

template<TensorDataType TPrecision>
void Mila::Dnn::Compute::Cuda::LayerNorm::CudaLayerNormOp< TPrecision >::backward ( const ITensor & input,
const ITensor & output_grad,
ITensor & input_grad ) const
inline override virtual

Execute backward pass (hot path).

Computes input gradient and accumulates parameter gradients using forward-pass statistics cached during forward().

Parameters
input — Original forward-pass input (required for gradient computation)
output_grad — Gradient of loss with respect to output
input_grad — Gradient of loss with respect to input (computed)

Implements Mila::Dnn::Compute::UnaryOperation< DeviceType::Cuda, TPrecision >.

Here is the call graph for this function:

◆ build()

template<TensorDataType TPrecision>
void Mila::Dnn::Compute::Cuda::LayerNorm::CudaLayerNormOp< TPrecision >::build ( const BuildContext & config)
inline override virtual

Prepare operation for execution with concrete input shape.

Cold-path initialization: computes normalization axis, partitions tensor dimensions, and allocates forward-pass statistics storage.

Dimension partitioning:

  • norm_axis: The axis along which normalization is applied
  • outer_size: Product of all dimensions before norm_axis
  • inner_size: Product of all dimensions after norm_axis
  • norm_dim: Size of the dimension at norm_axis

Example: For shape [2, 3, 4, 5] with axis=2:

  • norm_axis = 2
  • outer_size = 2 * 3 = 6
  • inner_size = 5
  • norm_dim = 4

Forward-pass statistics (mean, rstd) are allocated with size outer_size * inner_size to store one mean/rstd value per normalized slice.

Parameters
input_shape — Shape of input tensor to be normalized (note: the signature above names this parameter config, of type BuildContext)
Exceptions
std::runtime_error — If parameters not bound via setParameters()
std::invalid_argument — If input shape incompatible with configuration
std::invalid_argument — If computed normalization axis is out of range
Note
After build(), forward() and backward() become pure dispatch with zero overhead

Reimplemented from Mila::Dnn::Compute::Operation< TDeviceType, TInput >.

Here is the call graph for this function:

◆ computeRuntimePartition_()

template<TensorDataType TPrecision>
void Mila::Dnn::Compute::Cuda::LayerNorm::CudaLayerNormOp< TPrecision >::computeRuntimePartition_ ( const shape_t & input_shape,
int64_t & norm_axis,
int & outer_size,
int & inner_size,
int & norm_dim,
int64_t & num_slices,
int64_t & normalized_features ) const
inline private
Here is the call graph for this function:
Here is the caller graph for this function:

◆ forward()

template<TensorDataType TPrecision>
void Mila::Dnn::Compute::Cuda::LayerNorm::CudaLayerNormOp< TPrecision >::forward ( const ITensor & input,
ITensor & output ) const
inline override virtual

Execute forward pass (hot path).

Computes normalized output and caches forward-pass statistics (mean, rstd) required for backward gradient computation.

Parameters
input — Input tensor to normalize
output — Normalized output tensor (same shape as input)

Implements Mila::Dnn::Compute::UnaryOperation< DeviceType::Cuda, TPrecision >.

Here is the call graph for this function:

◆ getConfig()

template<TensorDataType TPrecision>
const LayerNormConfig & Mila::Dnn::Compute::Cuda::LayerNorm::CudaLayerNormOp< TPrecision >::getConfig ( ) const
inline

◆ getName()

template<TensorDataType TPrecision>
std::string Mila::Dnn::Compute::Cuda::LayerNorm::CudaLayerNormOp< TPrecision >::getName ( ) const
inline override virtual

Human-readable operation name.

Implements Mila::Dnn::Compute::Operation< TDeviceType, TInput >.

◆ getOperationType()

template<TensorDataType TPrecision>
OperationType Mila::Dnn::Compute::Cuda::LayerNorm::CudaLayerNormOp< TPrecision >::getOperationType ( ) const
inline override virtual

◆ setGradients()

template<TensorDataType TPrecision>
void Mila::Dnn::Compute::Cuda::LayerNorm::CudaLayerNormOp< TPrecision >::setGradients ( ITensor * weight_grad,
ITensor * bias_grad )
inline override virtual

Bind component-owned parameter gradient tensors for training.

Caches native device gradient pointers for backward pass writes. Weight gradient is required; bias gradient is optional based on configuration.

Parameters
weight_grad — Gradient accumulator for weight parameter (required)
bias_grad — Gradient accumulator for bias parameter (optional)
Exceptions
std::invalid_argument — If weight_grad is null or not a CUDA tensor
std::invalid_argument — If bias_grad is required by config but null or not a CUDA tensor
Note
Must be called before training (backward pass)

Reimplemented from Mila::Dnn::Compute::Operation< TDeviceType, TInput >.

Here is the call graph for this function:

◆ setParameters()

template<TensorDataType TPrecision>
void Mila::Dnn::Compute::Cuda::LayerNorm::CudaLayerNormOp< TPrecision >::setParameters ( ITensor * weight,
ITensor * bias )
inline override virtual

Bind component-owned parameter tensors.

Caches native device pointers for zero-overhead hot-path access. Weight is required; bias is optional based on configuration.

Parameters
weight — Scaling parameter applied after normalization (required)
bias — Shift parameter applied after normalization (optional)
Exceptions
std::invalid_argument — If weight is null or not a CUDA tensor
std::invalid_argument — If bias is required by config but null or not a CUDA tensor
Note
Must be called before build()

Reimplemented from Mila::Dnn::Compute::Operation< TDeviceType, TInput >.

Here is the call graph for this function:

◆ validateNormalizedShape_()

template<TensorDataType TPrecision>
void Mila::Dnn::Compute::Cuda::LayerNorm::CudaLayerNormOp< TPrecision >::validateNormalizedShape_ ( const shape_t & input_shape) const
inline private
Here is the call graph for this function:
Here is the caller graph for this function:

◆ validateRuntimeShape_()

template<TensorDataType TPrecision>
void Mila::Dnn::Compute::Cuda::LayerNorm::CudaLayerNormOp< TPrecision >::validateRuntimeShape_ ( const shape_t & input_shape) const
inline private
Here is the call graph for this function:
Here is the caller graph for this function:

Member Data Documentation

◆ bias_

template<TensorDataType TPrecision>
NativeType* Mila::Dnn::Compute::Cuda::LayerNorm::CudaLayerNormOp< TPrecision >::bias_ { nullptr }
private

◆ bias_grad_

template<TensorDataType TPrecision>
NativeType* Mila::Dnn::Compute::Cuda::LayerNorm::CudaLayerNormOp< TPrecision >::bias_grad_ { nullptr }
private

◆ config_

template<TensorDataType TPrecision>
LayerNormConfig Mila::Dnn::Compute::Cuda::LayerNorm::CudaLayerNormOp< TPrecision >::config_
private

◆ context_

template<TensorDataType TPrecision>
CudaExecutionContext* Mila::Dnn::Compute::Cuda::LayerNorm::CudaLayerNormOp< TPrecision >::context_
private

◆ impl_

◆ max_inner_size_

template<TensorDataType TPrecision>
int Mila::Dnn::Compute::Cuda::LayerNorm::CudaLayerNormOp< TPrecision >::max_inner_size_ { 0 }
private

◆ max_input_shape_

template<TensorDataType TPrecision>
shape_t Mila::Dnn::Compute::Cuda::LayerNorm::CudaLayerNormOp< TPrecision >::max_input_shape_
private

◆ max_norm_dim_

template<TensorDataType TPrecision>
int Mila::Dnn::Compute::Cuda::LayerNorm::CudaLayerNormOp< TPrecision >::max_norm_dim_ { 0 }
private

◆ max_num_slices_

template<TensorDataType TPrecision>
dim_t Mila::Dnn::Compute::Cuda::LayerNorm::CudaLayerNormOp< TPrecision >::max_num_slices_ { 0 }
private

◆ max_outer_size_

template<TensorDataType TPrecision>
int Mila::Dnn::Compute::Cuda::LayerNorm::CudaLayerNormOp< TPrecision >::max_outer_size_ { 0 }
private

◆ mean_

template<TensorDataType TPrecision>
NativeType* Mila::Dnn::Compute::Cuda::LayerNorm::CudaLayerNormOp< TPrecision >::mean_ { nullptr }
private

◆ mean_tensor_

template<TensorDataType TPrecision>
std::shared_ptr<TensorType> Mila::Dnn::Compute::Cuda::LayerNorm::CudaLayerNormOp< TPrecision >::mean_tensor_
private

◆ norm_axis_

template<TensorDataType TPrecision>
int64_t Mila::Dnn::Compute::Cuda::LayerNorm::CudaLayerNormOp< TPrecision >::norm_axis_ { -1 }
private

◆ rstd_

template<TensorDataType TPrecision>
NativeType* Mila::Dnn::Compute::Cuda::LayerNorm::CudaLayerNormOp< TPrecision >::rstd_ { nullptr }
private

◆ rstd_tensor_

template<TensorDataType TPrecision>
std::shared_ptr<TensorType> Mila::Dnn::Compute::Cuda::LayerNorm::CudaLayerNormOp< TPrecision >::rstd_tensor_
private

◆ weight_

template<TensorDataType TPrecision>
NativeType* Mila::Dnn::Compute::Cuda::LayerNorm::CudaLayerNormOp< TPrecision >::weight_ { nullptr }
private

◆ weight_grad_

template<TensorDataType TPrecision>
NativeType* Mila::Dnn::Compute::Cuda::LayerNorm::CudaLayerNormOp< TPrecision >::weight_grad_ { nullptr }
private

◆ weight_size_

template<TensorDataType TPrecision>
int64_t Mila::Dnn::Compute::Cuda::LayerNorm::CudaLayerNormOp< TPrecision >::weight_size_ { 0 }
private

The documentation for this class was generated from the following file:
  • /__w/Mila/Mila/Mila/Src/Dnn/Compute/Devices/Cuda/Operations/Normalizations/LayerNorm/LayerNormOp.ixx