| active_max_seq_len_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| allocateStateTensors() | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | inline, private |
| asInputTensor(const ITensor &t) | Mila::Dnn::Compute::UnaryOperation< DeviceType::Cuda, TPrecision > | inline, protected, static |
| asOutputTensor(ITensor &t) | Mila::Dnn::Compute::UnaryOperation< DeviceType::Cuda, TPrecision > | inline, protected, static |
| att_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| att_decode_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| att_decode_tensor_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| att_tensor_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| att_value_decode_plan_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| att_value_plan_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| B_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| backward(const ITensor &input, const ITensor &output_grad, ITensor &input_grad) const override | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | inline, virtual |
| backward_att_plan_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| backward_k_plan_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| backward_q_plan_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| backward_v_plan_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| build(const BuildContext &config) override | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | inline, virtual |
| buildCublasLtPlans() | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | inline, private |
| cached_seq_len_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| clearGradients() noexcept | Mila::Dnn::Compute::Operation< TDeviceType, TInput > | inline, virtual |
| config_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| ConfigType typedef | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | |
| context_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| cublaslt_handle_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| CudaExecutionContext typedef | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | |
| CudaMultiHeadAttentionOp(IExecutionContext *context, const MultiHeadAttentionConfig &config) | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | inline |
| data_type | Mila::Dnn::Compute::Operation< TDeviceType, TInput > | static |
| DataTypeTraits typedef | Mila::Dnn::Compute::Operation< TDeviceType, TInput > | |
| datt_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| datt_tensor_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| decode(const ITensor &input, ITensor &output, int position) override | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | inline, virtual |
| device_type | Mila::Dnn::Compute::Operation< TDeviceType, TInput > | static |
| dk_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| dk_tensor_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| dpreatt_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| dpreatt_tensor_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| dq_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| dq_tensor_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| dV_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| dV_tensor_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| dVout_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| dVout_tensor_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| embedding_dim_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| ensureKVCacheEnabled() const | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | inline, private |
| forward(const ITensor &input, ITensor &output) const override | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | inline, virtual |
| getComputeTypes(cublasComputeType_t &compute_type, cudaDataType_t &scale_type) const | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | inline, private |
| getConfig() const | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | inline |
| getCudaDataType() const | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | inline, private |
| getDataType() const | Mila::Dnn::Compute::Operation< TDeviceType, TInput > | inline, virtual |
| getDeviceType() const | Mila::Dnn::Compute::Operation< TDeviceType, TInput > | inline, virtual |
| getName() const override | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | inline, virtual |
| getOperationType() const override | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | inline, virtual |
| getStateMemorySize() const | Mila::Dnn::Compute::Operation< TDeviceType, TInput > | inline, virtual |
| HS_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| initializeKvCache(int batch_size, int max_seq_length) override | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | inline, virtual |
| is_built_ | Mila::Dnn::Compute::Operation< TDeviceType, TInput > | protected |
| isBuilt() const | Mila::Dnn::Compute::Operation< TDeviceType, TInput > | inline, virtual |
| isEvalMode() const | Mila::Dnn::Compute::Operation< TDeviceType, TInput > | inline, virtual |
| k_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| k_tensor_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| kv_cache_enabled_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| MR typedef | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | |
| NativeType typedef | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | |
| NH_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| preatt_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| preatt_decode_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| preatt_decode_tensor_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| preatt_tensor_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| prefill(const ITensor &input, ITensor &output) override | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | inline, virtual |
| q_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| q_tensor_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| qk_decode_plan_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| qk_score_plan_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| qkv_dim_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| resetKvCache() override | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | inline, virtual |
| setGradients(ITensor *, ITensor *) override | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | inline, virtual |
| setParameters(ITensor *, ITensor *) override | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | inline, virtual |
| setTrainingMode(TrainingMode training_mode) | Mila::Dnn::Compute::Operation< TDeviceType, TInput > | inline, virtual |
| T_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| TensorInputType typedef | Mila::Dnn::Compute::UnaryOperation< DeviceType::Cuda, TPrecision > | |
| TensorOutputType typedef | Mila::Dnn::Compute::UnaryOperation< DeviceType::Cuda, TPrecision > | |
| TensorType typedef | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | |
| training_mode_ | Mila::Dnn::Compute::Operation< TDeviceType, TInput > | protected |
| UnaryOperationBase typedef | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | |
| v_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| v_out_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| v_out_decode_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| v_out_decode_tensor_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| v_out_tensor_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| v_tensor_ | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | private |
| validateDecodeInputShape(const shape_t &input_shape) const | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | inline, private |
| validateInputShape(const shape_t &input_shape) const | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | inline, private |
| validatePrefillInputShape(const shape_t &input_shape) const | Mila::Dnn::Compute::Cuda::MultiHeadAttention::CudaMultiHeadAttentionOp< TPrecision > | inline, private |
| ~IKvCacheLifecycle()=default | Mila::Dnn::Compute::IKvCacheLifecycle | virtual |
| ~IPackedKvInference() override=default | Mila::Dnn::Compute::IPackedKvInference | |
| ~Operation()=default | Mila::Dnn::Compute::Operation< TDeviceType, TInput > | virtual |
| ~UnaryOperation()=default | Mila::Dnn::Compute::UnaryOperation< DeviceType::Cuda, TPrecision > | virtual |