Mila 0.13.48
Deep Neural Network Library
Loading...
Searching...
No Matches
Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy > Class Template Reference [export]
Inheritance diagram for Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >:
Collaboration diagram for Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >:

Public Types

using AttentionType = GroupedQueryAttention<TDeviceType, TPrecision, TKvPolicy>
using CompositeComponentBase = CompositeComponent<TDeviceType, TPrecision>
using LinearType = Linear<TDeviceType, TPrecision, TWeightQuant>
using MR = typename DeviceTypeTraits<TDeviceType>::memory_resource
using ResidualType = Residual<TDeviceType, TPrecision>
using RmsNormType = RmsNorm<TDeviceType, TPrecision>
using RopeType = Rope<TDeviceType, TPrecision>
using SwiGLUType = Swiglu<TDeviceType, TPrecision>
using TensorType = Tensor<TPrecision, MR>
Public Types inherited from Mila::Dnn::CompositeComponent< TDeviceType, TPrecision >
using ComponentBase = Component<TDeviceType, TPrecision>
using ComponentPtr = std::shared_ptr<Component<TDeviceType, TPrecision>>

Public Member Functions

 LlamaBlock (const std::string &name, const LlamaConfig &config, std::optional< DeviceId > device_id=std::nullopt)
 ~LlamaBlock () override=default
TensorType & backward (const TensorType &input, const TensorType &output_grad)
TensorType & decode (const TensorType &input, int position)
TensorType & forward (const TensorType &input)
MemoryStats getMemoryStats () const override
 Return the current memory allocation breakdown for this component.
const ComponentType getType () const override
 Get the component type identifier.
void load_ (ModelArchive &archive, SerializationMode mode)
TensorType & prefill (const TensorType &input, int position_offset)
void resetKVCache ()
void save_ (ModelArchive &archive, SerializationMode mode) const override
 Save all child components recursively.
void setState (const GqaState &state)
 Forward the shared GQA transient workspace to this block's attention layer.
bool supportsKVCache () const noexcept
void zeroGradients () override
 Clear all model-owned gradients for this component.
Public Member Functions inherited from Mila::Dnn::CompositeComponent< TDeviceType, TPrecision >
 CompositeComponent (CompositeComponent &&) noexcept=default
 CompositeComponent (const CompositeComponent &)=delete
 CompositeComponent (const std::string &name)
 Construct composite component with name.
virtual ~CompositeComponent ()=default
CompositeComponent & addComponent (ComponentPtr component)
 Add a pre-constructed child component (chainable).
size_t childCount () const noexcept
 Get the number of direct children.
void clearComponents ()
 Clear all child components.
ComponentPtr findComponent (const std::string &path) const
 Resolve a dot-separated component path within this composite.
ComponentPtr getComponent (const std::string &name) const
 Retrieve a direct child component by name.
const std::vector< ComponentPtr > & getComponents () const
 Get all child components in insertion order.
DeviceId getDeviceId () const override
 Get the compute device for this composite.
std::vector< ITensor * > getGradients () const override
 Get all parameter gradients from all children.
std::vector< ITensor * > getParameters () const override
 Get all parameters from all children.
bool hasChildren () const noexcept
 Check if this composite has any children.
bool hasComponent (const std::string &name) const
 Check if a named child component exists.
CompositeComponent & operator= (CompositeComponent &&) noexcept=default
CompositeComponent & operator= (const CompositeComponent &)=delete
size_t parameterCount () const override
 Count parameters across all children.
bool removeComponent (const std::string &name)
 Remove a direct child component by name.
void synchronize () override
 Synchronize all child components.
std::string toString () const override
 Generate a human-readable description.
ComponentPtr tryFindComponent (const std::string &path) const
 Try to resolve a dot-separated component path within this composite.
Public Member Functions inherited from Mila::Dnn::Component< TDeviceType, TPrecision >
 Component (const std::string &name)
 Construct component with required name identifier.
virtual ~Component ()=default
virtual void build (const BuildContext &context) final
 Build the component with the provided BuildContext (canonical overload).
const std::string getName () const
 Get the component's name identifier.
virtual std::vector< std::string > getParameterNames () const
 List all available parameter names for this component.
RuntimeMode getRuntimeMode () const noexcept
 Get the current runtime mode (RuntimeMode) of this Component.
TrainingMode getTrainingMode () const noexcept
 The current runtime behavioral mode of this Component.
virtual bool isBuilt () const final
 Returns true if build() has completed successfully.
bool isInferenceMode () const noexcept
bool isTrainingMode () const noexcept
virtual void loadParameter (const std::string &name, const Serialization::ITensorBlob &blob)
 Load a parameter from serialized tensor data.
void setTrainingMode (TrainingMode mode)
 Set the runtime behavioral mode for this Component.

Protected Member Functions

void onBuilding (const BuildContext &context) override
 Hook invoked by build() to allocate component buffers.
void onTrainingModeChanging (TrainingMode training_mode) override
 Hook invoked when training mode is about to change.
Protected Member Functions inherited from Mila::Dnn::CompositeComponent< TDeviceType, TPrecision >
template<typename TComponent>
std::shared_ptr< TComponent > getComponentAs (const std::string &name) const
 Retrieve a typed child component by name.
void onExecutionContextSet () override
 Hook invoked after ExecutionContext is set.
virtual void optimize ()
 Virtual hook for graph optimization after construction.
Protected Member Functions inherited from Mila::Dnn::Component< TDeviceType, TPrecision >
IExecutionContextgetExecutionContext () const
 Get the shared execution context.
bool hasExecutionContext () const noexcept
 Check if execution context has been set.
template<TensorDataType TParameterPrecision, typename TMemoryResource>
void loadParameterFromBlob (const std::string &param_name, const Serialization::ITensorBlob &blob, Tensor< TParameterPrecision, TMemoryResource > &target, const shape_t &expected_shape)
 Load a tensor blob into a parameter tensor with validation.
void setExecutionContext (IExecutionContext *context)
 Set the execution context for this component.

Private Member Functions

void createGraph ()
void validateBuildContext (const BuildContext &context) const
void validateInputShape (const shape_t &input_shape) const

Private Attributes

std::shared_ptr< AttentionType > attn_ { nullptr }
shape_t cached_input_shape_
LlamaConfig config_
std::unique_ptr< TensorType > d_input_ { nullptr }
std::unique_ptr< TensorType > d_res1_accum_ { nullptr }
std::shared_ptr< LinearType > fc_down_ { nullptr }
std::shared_ptr< LinearType > fc_gate_up_ { nullptr }
bool forward_executed_ { false }
std::unique_ptr< TensorType > k_ { nullptr }
shape_t k_prefill_shape_
shape_t k_shape_
TensorType * last_attn_out_ { nullptr }
TensorType * last_ffn_out_ { nullptr }
TensorType * last_gate_up_out_ { nullptr }
TensorType * last_out_proj_out_ { nullptr }
TensorType * last_qkv_out_ { nullptr }
TensorType * last_res1_out_ { nullptr }
TensorType * last_res2_out_ { nullptr }
TensorType * last_rms1_out_ { nullptr }
TensorType * last_rms2_out_ { nullptr }
TensorType * last_swiglu_out_ { nullptr }
std::shared_ptr< LinearType > out_proj_ { nullptr }
std::unique_ptr< IExecutionContext > owned_exec_context_ { nullptr }
std::unique_ptr< TensorType > q_ { nullptr }
size_t q_offset_ { 0 }
size_t q_prefill_offset_
shape_t q_prefill_shape_
shape_t q_shape_
std::shared_ptr< LinearType > qkv_proj_ { nullptr }
std::shared_ptr< ResidualType > res1_ { nullptr }
std::unique_ptr< TensorType > res1_prefill_ { nullptr }
std::shared_ptr< ResidualType > res2_ { nullptr }
std::shared_ptr< RmsNormType > rms1_ { nullptr }
std::shared_ptr< RmsNormType > rms2_ { nullptr }
std::shared_ptr< RopeType > rope_ { nullptr }
std::shared_ptr< SwiGLUType > swiglu_ { nullptr }
std::unique_ptr< TensorType > v_ { nullptr }

Additional Inherited Members

Static Public Member Functions inherited from Mila::Dnn::Component< TDeviceType, TPrecision >
static constexpr DeviceType getDeviceType ()
 Compile-time device type for this component instance.
static constexpr TensorDataType getPrecision () noexcept
 Compile-time tensor precision for this component instance.
Protected Attributes inherited from Mila::Dnn::Component< TDeviceType, TPrecision >
BuildContext build_context_ { shape_t{ 1 }, RuntimeMode::Training }
 The BuildContext stored at build time.

Member Typedef Documentation

◆ AttentionType

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
using Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::AttentionType = GroupedQueryAttention<TDeviceType, TPrecision, TKvPolicy>

◆ CompositeComponentBase

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
using Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::CompositeComponentBase = CompositeComponent<TDeviceType, TPrecision>

◆ LinearType

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
using Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::LinearType = Linear<TDeviceType, TPrecision, TWeightQuant>

◆ MR

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
using Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::MR = typename DeviceTypeTraits<TDeviceType>::memory_resource

◆ ResidualType

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
using Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::ResidualType = Residual<TDeviceType, TPrecision>

◆ RmsNormType

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
using Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::RmsNormType = RmsNorm<TDeviceType, TPrecision>

◆ RopeType

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
using Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::RopeType = Rope<TDeviceType, TPrecision>

◆ SwiGLUType

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
using Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::SwiGLUType = Swiglu<TDeviceType, TPrecision>

◆ TensorType

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
using Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::TensorType = Tensor<TPrecision, MR>

Constructor & Destructor Documentation

◆ LlamaBlock()

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::LlamaBlock ( const std::string & name,
const LlamaConfig & config,
std::optional< DeviceId > device_id = std::nullopt )
inline explicit

◆ ~LlamaBlock()

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::~LlamaBlock ( )
override default

Member Function Documentation

◆ backward()

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
TensorType & Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::backward ( const TensorType & input,
const TensorType & output_grad )
inline

◆ createGraph()

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
void Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::createGraph ( )
inline private
Here is the caller graph for this function:

◆ decode()

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
TensorType & Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::decode ( const TensorType & input,
int position )
inline

◆ forward()

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
TensorType & Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::forward ( const TensorType & input)
inline

◆ getMemoryStats()

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
MemoryStats Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::getMemoryStats ( ) const
inline override virtual

Return the current memory allocation breakdown for this component.

Reflects allocations at the moment of the call. The returned stats naturally track the component lifecycle:

  • After construction — parameters only
  • After build( Inference ) — parameters + T=1 state buffers
  • After build( Training ) — parameters + T=full state buffers
  • After setEvaluation( false ) — parameters + state + gradients

For CompositeComponent and Network, the returned stats are the recursive aggregate of all child components.

May be called at any time — no lifecycle preconditions.

Returns
MemoryStats reflecting current allocations.

Implements Mila::Dnn::Component< TDeviceType, TPrecision >.

◆ getType()

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
const ComponentType Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::getType ( ) const
inline override virtual

Get the component type identifier.

Used for serialization and runtime type identification.

Returns
Component type enum value.

Implements Mila::Dnn::Component< TDeviceType, TPrecision >.

◆ load_()

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
void Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::load_ ( ModelArchive & archive,
SerializationMode mode )
inline

◆ onBuilding()

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
void Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::onBuilding ( const BuildContext & config)
inline override protected virtual

Hook invoked by build() to allocate component buffers.

Receives the stored BuildContext. Implementations must use config.allocationSeqLen() when sizing output buffers — this is the single call that makes Inference and Training allocate the correct buffer sizes automatically without per-component logic.

// Example — Linear component:
shape_t out_shape =
{
config.batchSize(),
config.allocationSeqLen(), // 1 for Inference, T for Training
config_.getOutputFeatures()
};
output_ = std::make_unique<TensorType>( device, out_shape,
this->getName() + ".output" );
const std::string getName() const
Get the component's name identifier.
Definition Component.ixx:410
LlamaConfig config_
Definition Llama.Block.ixx:719
TensorShape shape_t
Row-major shape descriptor for tensor dimensional sizes.
Definition Tensor.Types.ixx:143

The default implementation forwards to the legacy onBuilding( const shape_t& ) overload for backwards compatibility. New components should override this overload directly.

Note
Do not call build() or onBuilding() from within this hook.
Implementations should either succeed fully or leave no partial state, as a failed build() may be retried.
Parameters
config — Build-time configuration. Use config.allocationSeqLen() to obtain the correct output buffer sequence dimension.

Reimplemented from Mila::Dnn::Component< TDeviceType, TPrecision >.

◆ onTrainingModeChanging()

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
void Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::onTrainingModeChanging ( TrainingMode training_mode)
inline override protected virtual

Hook invoked when training mode is about to change.

Propagates the new mode to all child components. The hook runs with the Component's training mutex held; it MUST NOT call setTrainingMode().

Parameters
training_mode — New training mode (training or eval).

Reimplemented from Mila::Dnn::CompositeComponent< TDeviceType, TPrecision >.

◆ prefill()

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
TensorType & Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::prefill ( const TensorType & input,
int position_offset )
inline

◆ resetKVCache()

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
void Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::resetKVCache ( )
inline

◆ save_()

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
void Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::save_ ( ModelArchive & archive,
SerializationMode mode ) const
inline override virtual

Save all child components recursively.

Follows the component serialization contract:

  • Writes type, version, and configuration metadata
  • Recursively saves all children with scoped namespaces
  • Each child's save_() handles its own state
Parameters
archive — Archive to write to
mode — What to save (Checkpoint, WeightsOnly, Architecture)

Reimplemented from Mila::Dnn::CompositeComponent< TDeviceType, TPrecision >.

◆ setState()

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
void Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::setState ( const GqaState & state)
inline

Forward the shared GQA transient workspace to this block's attention layer.

Must be called after build() and before prefill() or decode().

Parameters
state — Non-owning pointers to workspace tensors owned by LlamaTransformer.

◆ supportsKVCache()

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
bool Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::supportsKVCache ( ) const
inline noexcept

◆ validateBuildContext()

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
void Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::validateBuildContext ( const BuildContext & context) const
inline private
Here is the caller graph for this function:

◆ validateInputShape()

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
void Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::validateInputShape ( const shape_t & input_shape) const
inline private

◆ zeroGradients()

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
void Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::zeroGradients ( )
inline override virtual

Clear all model-owned gradients for this component.

Default implementation is a no-op. Composite components should override to recurse to children. Leaf components should override to zero their parameter and activation gradients using device-aware helpers.

Reimplemented from Mila::Dnn::Component< TDeviceType, TPrecision >.

Member Data Documentation

◆ attn_

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
std::shared_ptr<AttentionType> Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::attn_ { nullptr }
private

◆ cached_input_shape_

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
shape_t Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::cached_input_shape_
private

◆ config_

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
LlamaConfig Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::config_
private

◆ d_input_

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
std::unique_ptr<TensorType> Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::d_input_ { nullptr }
private

◆ d_res1_accum_

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
std::unique_ptr<TensorType> Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::d_res1_accum_ { nullptr }
private

◆ fc_down_

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
std::shared_ptr<LinearType> Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::fc_down_ { nullptr }
private

◆ fc_gate_up_

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
std::shared_ptr<LinearType> Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::fc_gate_up_ { nullptr }
private

◆ forward_executed_

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
bool Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::forward_executed_ { false }
private

◆ k_

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
std::unique_ptr<TensorType> Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::k_ { nullptr }
private

◆ k_prefill_shape_

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
shape_t Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::k_prefill_shape_
private

◆ k_shape_

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
shape_t Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::k_shape_
private

◆ last_attn_out_

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
TensorType* Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::last_attn_out_ { nullptr }
private

◆ last_ffn_out_

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
TensorType* Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::last_ffn_out_ { nullptr }
private

◆ last_gate_up_out_

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
TensorType* Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::last_gate_up_out_ { nullptr }
private

◆ last_out_proj_out_

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
TensorType* Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::last_out_proj_out_ { nullptr }
private

◆ last_qkv_out_

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
TensorType* Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::last_qkv_out_ { nullptr }
private

◆ last_res1_out_

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
TensorType* Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::last_res1_out_ { nullptr }
private

◆ last_res2_out_

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
TensorType* Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::last_res2_out_ { nullptr }
private

◆ last_rms1_out_

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
TensorType* Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::last_rms1_out_ { nullptr }
private

◆ last_rms2_out_

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
TensorType* Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::last_rms2_out_ { nullptr }
private

◆ last_swiglu_out_

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
TensorType* Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::last_swiglu_out_ { nullptr }
private

◆ out_proj_

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
std::shared_ptr<LinearType> Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::out_proj_ { nullptr }
private

◆ owned_exec_context_

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
std::unique_ptr<IExecutionContext> Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::owned_exec_context_ { nullptr }
private

◆ q_

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
std::unique_ptr<TensorType> Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::q_ { nullptr }
private

◆ q_offset_

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
size_t Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::q_offset_ { 0 }
private

◆ q_prefill_offset_

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
size_t Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::q_prefill_offset_
private

◆ q_prefill_shape_

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
shape_t Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::q_prefill_shape_
private

◆ q_shape_

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
shape_t Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::q_shape_
private

◆ qkv_proj_

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
std::shared_ptr<LinearType> Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::qkv_proj_ { nullptr }
private

◆ res1_

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
std::shared_ptr<ResidualType> Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::res1_ { nullptr }
private

◆ res1_prefill_

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
std::unique_ptr<TensorType> Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::res1_prefill_ { nullptr }
private

◆ res2_

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
std::shared_ptr<ResidualType> Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::res2_ { nullptr }
private

◆ rms1_

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
std::shared_ptr<RmsNormType> Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::rms1_ { nullptr }
private

◆ rms2_

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
std::shared_ptr<RmsNormType> Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::rms2_ { nullptr }
private

◆ rope_

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
std::shared_ptr<RopeType> Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::rope_ { nullptr }
private

◆ swiglu_

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
std::shared_ptr<SwiGLUType> Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::swiglu_ { nullptr }
private

◆ v_

template<DeviceType TDeviceType, TensorDataType TPrecision, WeightQuantPolicy TWeightQuant = NoWeightQuant, KvCachePolicy TKvPolicy = NoKvCompression>
std::unique_ptr<TensorType> Mila::Dnn::LlamaBlock< TDeviceType, TPrecision, TWeightQuant, TKvPolicy >::v_ { nullptr }
private

The documentation for this class was generated from the following file:
  • /__w/Mila/Mila/Mila/Src/Dnn/Components/Transformers/LlaMa/Llama.Block.ixx