Token sequence loader for autoregressive language models. More...

Inheritance diagram for Mila::Data::TokenSequenceLoader< TMemoryResource >:

Collaboration diagram for Mila::Data::TokenSequenceLoader< TMemoryResource >:

[legend]

Public Types
using	BaseLoader = DataLoader<TensorDataType::INT32, TensorDataType::INT32, TMemoryResource>
using	HostType = typename TensorHostTypeMap<TensorDataType::INT32>::host_type
using	TensorType = Tensor<TensorDataType::INT32, TMemoryResource>
Public Types inherited from Mila::Data::DataLoader< TensorDataType::INT32, TensorDataType::INT32, TMemoryResource >
using	InputDataType
	Input tensor abstract data type.
using	InputTensor
	Input tensor type alias.
using	MemoryResource
	Memory resource type for tensor allocation.
using	TargetDataType
	Target tensor abstract data type.
using	TargetTensor
	Target tensor type alias.

Public Member Functions
	TokenSequenceLoader (const std::filesystem::path &tokens_file, int64_t batch_size, int64_t seq_length, bool is_training, DeviceId device, const TokenSequenceLoaderConfig &config=TokenSequenceLoaderConfig())
	Constructs streaming autoregressive sequence loader.
	TokenSequenceLoader (const TokenSequenceLoader &)=delete
	TokenSequenceLoader (TokenSequenceLoader &&)=delete
	~TokenSequenceLoader () noexcept
const TensorType &	inputs () const override
	Provides immutable access to input tensor for current batch.
TensorType &	inputs () override
	Provides mutable access to input tensor for current batch.
void	nextBatch () override
	Loads the next batch of data from the dataset.
int64_t	numBatches () const override
	Returns the total number of batches in the dataset.
size_t	numTokens () const
size_t	numWindows () const
TokenSequenceLoader &	operator= (const TokenSequenceLoader &)=delete
TokenSequenceLoader &	operator= (TokenSequenceLoader &&)=delete
void	reset () override
	Resets the loader to the beginning of the dataset.
int64_t	sequenceLength () const
const TensorType &	targets () const override
	Provides immutable access to target tensor for current batch.
TensorType &	targets () override
	Provides mutable access to target tensor for current batch.
size_t	windowSizeTokens () const
Public Member Functions inherited from Mila::Data::DataLoader< TensorDataType::INT32, TensorDataType::INT32, TMemoryResource >
	DataLoader (const DataLoader &)=delete
	Copy operations explicitly deleted for performance safety.
	DataLoader (DataLoader &&)=default
	Move operations for efficient ownership transfer.
	DataLoader (int64_t batch_size)
	Constructs data loader with specified batch configuration.
virtual	~DataLoader ()=default
	Virtual destructor ensuring proper cleanup in derived classes.
int64_t	batchSize () const noexcept
	Returns the configured batch size.
int64_t	currentBatch () const noexcept
	Returns the current batch index.
virtual std::string	getDatasetInfo () const
	Returns dataset statistics for optimization and analysis.
virtual bool	hasNext () const
	Checks if more batches are available.
DataLoader &	operator= (const DataLoader &)=delete
DataLoader &	operator= (DataLoader &&)=default
virtual bool	validateCurrentBatch () const
	Validates current batch data integrity.

Private Member Functions
void	allocateBuffers ()
void	cleanupBuffers () noexcept
void	fillBatch (const TokenId *window_buffer, size_t batch_idx, HostType *input_dest, HostType *target_dest)
	Fills a batch from the current window buffer.
void	initializeDataset ()
void	loadWindowFromFile (std::ifstream &file, TokenId *buffer, size_t window_idx)
	Loads a window from the token file.
void	prepareSequenceIndices ()
void	producerThreadFunc () noexcept
	Producer thread: streams windows from disk and fills batches.
void	shuffleSequenceIndices ()
void	swapBuffers () noexcept

Static Private Member Functions
static DeviceId	validateDeviceId (DeviceId device)

Private Attributes
std::atomic< bool >	back_buffer_ready_
std::shared_ptr< TensorType >	back_input_tensor_
std::shared_ptr< TensorType >	back_target_tensor_
size_t	batches_per_window_
TokenSequenceLoaderConfig	config_
std::atomic< size_t >	current_batch_in_window_
std::atomic< size_t >	current_window_idx_
std::condition_variable	cv_consumer_
std::condition_variable	cv_producer_
DeviceId	device_
size_t	file_size_
std::atomic< bool >	front_buffer_ready_
std::shared_ptr< TensorType >	front_input_tensor_
std::shared_ptr< TensorType >	front_target_tensor_
bool	is_training_
std::mutex	mutex_
int64_t	num_batches_
size_t	num_tokens_
size_t	num_windows_
std::exception_ptr	producer_exception_
std::thread	producer_thread_
int64_t	seq_length_
std::vector< size_t >	sequence_indices_
size_t	sequences_per_window_
std::atomic< bool >	stop_
std::filesystem::path	tokens_file_path_
size_t	window_size_tokens_

Additional Inherited Members
Static Public Member Functions inherited from Mila::Data::DataLoader< TensorDataType::INT32, TensorDataType::INT32, TMemoryResource >
static constexpr bool	supportsMixedPrecision () noexcept
	Checks if data loader supports mixed-precision workflows.
static constexpr bool	usesPinnedMemory () noexcept
	Checks if data loader uses pinned memory for GPU optimization.
Static Public Attributes inherited from Mila::Data::DataLoader< TensorDataType::INT32, TensorDataType::INT32, TMemoryResource >
static constexpr TensorDataType	input_data_type
	Compile-time input data type constant.
static constexpr bool	is_mixed_precision
	Mixed-precision workflow detection.
static constexpr TensorDataType	target_data_type
	Compile-time target data type constant.
static constexpr bool	uses_pinned_memory
	Pinned memory optimization (CUDA-only; false on CPU-only builds).
Protected Member Functions inherited from Mila::Data::DataLoader< TensorDataType::INT32, TensorDataType::INT32, TMemoryResource >
void	incrementBatch () noexcept
	Increments current batch counter.
void	setCurrentBatch (int64_t batch_index) noexcept
	Updates current batch counter.

Detailed Description

template<typename TMemoryResource>
requires (std::is_same_v<TMemoryResource, CpuMemoryResource>)
class Mila::Data::TokenSequenceLoader< TMemoryResource >

Token sequence loader for autoregressive language models.

Loads tokenized text data for causal language modeling tasks such as GPT, LLaMA, and other transformer-based models. Reads from pre-tokenized binary .tokens files and produces batches of (input, target) sequence pairs where target[i] = input[i+1] (next-token prediction).

The implementation uses efficient disk streaming with a double-buffered producer-consumer pattern for high-throughput training on large corpora.

Template Parameters

TMemoryResource	CpuMemoryResource or CudaPinnedMemoryResource (note: the requires-clause shown above currently accepts only CpuMemoryResource)

Constructor & Destructor Documentation

◆ TokenSequenceLoader() [1/3]

template<typename TMemoryResource>

Mila::Data::TokenSequenceLoader< TMemoryResource >::TokenSequenceLoader	(	const std::filesystem::path &	tokens_file,
		int64_t	batch_size,
		int64_t	seq_length,
		bool	is_training,
		DeviceId	device,
		const TokenSequenceLoaderConfig &	config = TokenSequenceLoaderConfig() )

inline export

Constructs streaming autoregressive sequence loader.

Parameters

tokens_file	Path to binary .tokens file (uint32_t format)
batch_size	Number of sequences per batch
seq_length	Context window length (tokens per sequence)
is_training	Enable shuffling and continuous epochs
device	Compute device for tensor allocation
config	Performance and streaming configuration

Exceptions

std::invalid_argument	If batch_size or seq_length is zero
std::runtime_error	If file operations or initialization fails

Here is the call graph for this function:

Here is the caller graph for this function:

◆ ~TokenSequenceLoader()

template<typename TMemoryResource>

Mila::Data::TokenSequenceLoader< TMemoryResource >::~TokenSequenceLoader ( )

inline export noexcept

Here is the call graph for this function:

◆ TokenSequenceLoader() [2/3]

template<typename TMemoryResource>

Mila::Data::TokenSequenceLoader< TMemoryResource >::TokenSequenceLoader ( const TokenSequenceLoader< TMemoryResource > & )

export delete

Here is the call graph for this function:

◆ TokenSequenceLoader() [3/3]

template<typename TMemoryResource>

Mila::Data::TokenSequenceLoader< TMemoryResource >::TokenSequenceLoader ( TokenSequenceLoader< TMemoryResource > && )

export delete

Here is the call graph for this function:

Member Function Documentation

◆ allocateBuffers()

template<typename TMemoryResource>

void Mila::Data::TokenSequenceLoader< TMemoryResource >::allocateBuffers ( )

inline export private

Here is the call graph for this function:

Here is the caller graph for this function:

◆ cleanupBuffers()

template<typename TMemoryResource>

void Mila::Data::TokenSequenceLoader< TMemoryResource >::cleanupBuffers ( )

inline export private noexcept

Here is the caller graph for this function:

◆ fillBatch()

template<typename TMemoryResource>

void Mila::Data::TokenSequenceLoader< TMemoryResource >::fillBatch	(	const TokenId *	window_buffer,
		size_t	batch_idx,
		HostType *	input_dest,
		HostType *	target_dest )

inline export private

Fills a batch from the current window buffer.

Creates non-overlapping sequences where target[i] = input[i+1].

Parameters

window_buffer	Source tokens for current window
batch_idx	Batch index within current window
input_dest	Destination for input sequences
target_dest	Destination for target sequences

Here is the call graph for this function:

Here is the caller graph for this function:

◆ initializeDataset()

template<typename TMemoryResource>

void Mila::Data::TokenSequenceLoader< TMemoryResource >::initializeDataset ( )

inline export private

Here is the call graph for this function:

Here is the caller graph for this function:

◆ inputs() [1/2]

template<typename TMemoryResource>

const TensorType & Mila::Data::TokenSequenceLoader< TMemoryResource >::inputs ( ) const

inline override export virtual

Provides immutable access to input tensor for current batch.

Derived classes must implement this method to provide read-only access to the tensor containing input data for the currently loaded batch.

Returns: Const reference to input tensor containing current batch data

Note: Enables safe access for analysis and debugging without modification risk; Should return same data as mutable version

Implements Mila::Data::DataLoader< TensorDataType::INT32, TensorDataType::INT32, TMemoryResource >.

◆ inputs() [2/2]

template<typename TMemoryResource>

TensorType & Mila::Data::TokenSequenceLoader< TMemoryResource >::inputs ( )

inline override export virtual

Provides mutable access to input tensor for current batch.

Derived classes must implement this method to provide access to the tensor containing input data for the currently loaded batch. The tensor should be properly shaped and contain valid data after nextBatch() call.

Returns: Mutable reference to input tensor containing current batch data

Note: Tensor shape should match expected input dimensions for the model; Data should be preprocessed and ready for model consumption; Memory layout should be optimized for target compute device

Implements Mila::Data::DataLoader< TensorDataType::INT32, TensorDataType::INT32, TMemoryResource >.

◆ loadWindowFromFile()

template<typename TMemoryResource>

void Mila::Data::TokenSequenceLoader< TMemoryResource >::loadWindowFromFile	(	std::ifstream &	file,
		TokenId *	buffer,
		size_t	window_idx )

inline export private

Loads a window from the token file.

Parameters

file	Input file stream
buffer	Destination buffer (must have space for window_size_tokens_)
window_idx	Which window to load

Here is the caller graph for this function:

◆ nextBatch()

template<typename TMemoryResource>

void Mila::Data::TokenSequenceLoader< TMemoryResource >::nextBatch ( )

inline override export virtual

Loads the next batch of data from the dataset.

Derived classes must implement this method to load the next batch of data into the input and target tensors. Implementation should handle data preprocessing, memory allocation, and batch composition according to the specific dataset requirements.

Exceptions

std::runtime_error	If no more batches are available
std::runtime_error	If data loading fails

Note: Implementation must update current_batch_ counter after successful load; Should handle end-of-dataset conditions appropriately; May involve complex preprocessing pipelines and data augmentation

Implements Mila::Data::DataLoader< TensorDataType::INT32, TensorDataType::INT32, TMemoryResource >.

Here is the call graph for this function:

◆ numBatches()

template<typename TMemoryResource>

int64_t Mila::Data::TokenSequenceLoader< TMemoryResource >::numBatches ( ) const

inline override export virtual

Returns the total number of batches in the dataset.

Derived classes must implement this method to report the total number of batches available in their specific dataset. This information is essential for training loop progress tracking and epoch management.

Returns: Total number of batches available in the dataset

Note: Implementation should account for partial batches at dataset end; Value may change if dataset is modified or resampled; Used for training progress reporting and epoch boundary detection

Implements Mila::Data::DataLoader< TensorDataType::INT32, TensorDataType::INT32, TMemoryResource >.

◆ numTokens()

template<typename TMemoryResource>

size_t Mila::Data::TokenSequenceLoader< TMemoryResource >::numTokens ( ) const

inline export

◆ numWindows()

template<typename TMemoryResource>

size_t Mila::Data::TokenSequenceLoader< TMemoryResource >::numWindows ( ) const

inline export

◆ operator=() [1/2]

template<typename TMemoryResource>

TokenSequenceLoader & Mila::Data::TokenSequenceLoader< TMemoryResource >::operator= ( const TokenSequenceLoader< TMemoryResource > & )

export delete

Here is the call graph for this function:

◆ operator=() [2/2]

template<typename TMemoryResource>

TokenSequenceLoader & Mila::Data::TokenSequenceLoader< TMemoryResource >::operator= ( TokenSequenceLoader< TMemoryResource > && )

export delete

Here is the call graph for this function:

◆ prepareSequenceIndices()

template<typename TMemoryResource>

void Mila::Data::TokenSequenceLoader< TMemoryResource >::prepareSequenceIndices ( )

inline export private

Here is the call graph for this function:

Here is the caller graph for this function:

◆ producerThreadFunc()

template<typename TMemoryResource>

void Mila::Data::TokenSequenceLoader< TMemoryResource >::producerThreadFunc ( )

inline export private noexcept

Producer thread: streams windows from disk and fills batches.

Workflow:

1. Load window from disk
2. Fill back buffer with batch
3. Mark back buffer ready
4. Wait for consumer to swap buffers
5. Repeat

Exception safety: Catches all exceptions and stores them for consumer.

Here is the call graph for this function:

Here is the caller graph for this function:

◆ reset()

template<typename TMemoryResource>

void Mila::Data::TokenSequenceLoader< TMemoryResource >::reset ( )

inline override export virtual

Resets the loader to the beginning of the dataset.

Resets the internal state to start iteration from the first batch. Derived classes may override this method to implement additional reset functionality such as dataset reshuffling or preprocessing pipeline reinitialization.

Note: Base implementation resets batch counter to zero; Called automatically at epoch boundaries in training loops; Override to implement custom reset behavior (shuffling, etc.)

Reimplemented from Mila::Data::DataLoader< TensorDataType::INT32, TensorDataType::INT32, TMemoryResource >.

Here is the call graph for this function:

◆ sequenceLength()

template<typename TMemoryResource>

int64_t Mila::Data::TokenSequenceLoader< TMemoryResource >::sequenceLength ( ) const

inline export

◆ shuffleSequenceIndices()

template<typename TMemoryResource>

void Mila::Data::TokenSequenceLoader< TMemoryResource >::shuffleSequenceIndices ( )

inline export private

Here is the caller graph for this function:

◆ swapBuffers()

template<typename TMemoryResource>

void Mila::Data::TokenSequenceLoader< TMemoryResource >::swapBuffers ( )

inline export private noexcept

Here is the caller graph for this function:

◆ targets() [1/2]

template<typename TMemoryResource>

const TensorType & Mila::Data::TokenSequenceLoader< TMemoryResource >::targets ( ) const

inline override export virtual

Provides immutable access to target tensor for current batch.

Derived classes must implement this method to provide read-only access to the tensor containing target/label data for the currently loaded batch.

Returns: Const reference to target tensor containing current batch labels

Note: Enables safe access for analysis and debugging without modification risk; Should return same data as mutable version

Implements Mila::Data::DataLoader< TensorDataType::INT32, TensorDataType::INT32, TMemoryResource >.

◆ targets() [2/2]

template<typename TMemoryResource>

TensorType & Mila::Data::TokenSequenceLoader< TMemoryResource >::targets ( )

inline override export virtual

Provides mutable access to target tensor for current batch.

Derived classes must implement this method to provide access to the tensor containing target/label data for the currently loaded batch. The tensor should contain ground truth data corresponding to the inputs.

Returns: Mutable reference to target tensor containing current batch labels

Note: Target data should align with input batch ordering; Data format should match model's expected output structure; For mixed-precision workflows, may use different data type than inputs

Implements Mila::Data::DataLoader< TensorDataType::INT32, TensorDataType::INT32, TMemoryResource >.

◆ validateDeviceId()

template<typename TMemoryResource>

DeviceId Mila::Data::TokenSequenceLoader< TMemoryResource >::validateDeviceId ( DeviceId device )

inline static export private

Here is the call graph for this function:

Here is the caller graph for this function:

◆ windowSizeTokens()

template<typename TMemoryResource>

size_t Mila::Data::TokenSequenceLoader< TMemoryResource >::windowSizeTokens ( ) const

inline export

The documentation for this class was generated from the following file:

/__w/Mila/Mila/Mila/Src/Data/Loaders/TokenSequenceLoader.ixx

Public Types

Public Member Functions

Private Member Functions

Static Private Member Functions

Private Attributes

Additional Inherited Members

Detailed Description

Constructor & Destructor Documentation

◆ TokenSequenceLoader() [1/3]

◆ ~TokenSequenceLoader()

◆ TokenSequenceLoader() [2/3]

◆ TokenSequenceLoader() [3/3]

Member Function Documentation

◆ allocateBuffers()

◆ cleanupBuffers()

◆ fillBatch()

◆ initializeDataset()

◆ inputs() [1/2]

◆ inputs() [2/2]

◆ loadWindowFromFile()

◆ nextBatch()

◆ numBatches()

◆ numTokens()

◆ numWindows()

◆ operator=() [1/2]

◆ operator=() [2/2]

◆ prepareSequenceIndices()

◆ producerThreadFunc()

◆ reset()

◆ sequenceLength()

◆ shuffleSequenceIndices()

◆ swapBuffers()

◆ targets() [1/2]

◆ targets() [2/2]

◆ validateDeviceId()

◆ windowSizeTokens()