Mila 0.13.48
Deep Neural Network Library
Loading...
Searching...
No Matches
Data.TokenSequenceLoader Module Reference

Exported Modules

module  Dnn.Tensor
module  Dnn.TensorDataType
module  Data.DataLoader
module  Dnn.TensorHostTypeMap
module  Compute.CpuMemoryResource
module  Data.Tokenizer
module  Compute.DeviceType
module  Dnn.TensorTypes
module  Compute.DeviceId

Classes

class  Mila::Data::TokenSequenceLoader< TMemoryResource >
 Token sequence loader for autoregressive language models. More...
struct  Mila::Data::TokenSequenceLoaderConfig
 Configuration for TokenSequenceLoader behavior. More...

Typedefs

using BaseLoader = DataLoader<TensorDataType::INT32, TensorDataType::INT32, TMemoryResource>
using HostType = typename TensorHostTypeMap<TensorDataType::INT32>::host_type
using TensorType = Tensor<TensorDataType::INT32, TMemoryResource>

Functions

 TokenSequenceLoader (const std::filesystem::path &tokens_file, int64_t batch_size, int64_t seq_length, bool is_training, DeviceId device, const TokenSequenceLoaderConfig &config=TokenSequenceLoaderConfig())
 Constructs a streaming autoregressive sequence loader.
 TokenSequenceLoader (const TokenSequenceLoader &)=delete
 TokenSequenceLoader (TokenSequenceLoader &&)=delete
 ~TokenSequenceLoader () noexcept
void allocateBuffers ()
void cleanupBuffers () noexcept
void fillBatch (const TokenId *window_buffer, size_t batch_idx, HostType *input_dest, HostType *target_dest)
 Fills a batch from the current window buffer.
void initializeDataset ()
const TensorType & inputs () const override
 Provides immutable access to input tensor for current batch.
TensorType & inputs () override
 Provides mutable access to input tensor for current batch.
void loadWindowFromFile (std::ifstream &file, TokenId *buffer, size_t window_idx)
 Loads a window from the token file.
void nextBatch () override
 Loads the next batch of data from the dataset.
int64_t numBatches () const override
 Returns the total number of batches in the dataset.
size_t numTokens () const
size_t numWindows () const
TokenSequenceLoader & operator= (const TokenSequenceLoader &)=delete
TokenSequenceLoader & operator= (TokenSequenceLoader &&)=delete
void prepareSequenceIndices ()
void producerThreadFunc () noexcept
 Producer thread: streams windows from disk and fills batches.
void reset () override
 Resets the loader to the beginning of the dataset.
int64_t sequenceLength () const
void shuffleSequenceIndices ()
void swapBuffers () noexcept
const TensorType & targets () const override
 Provides immutable access to target tensor for current batch.
TensorType & targets () override
 Provides mutable access to target tensor for current batch.
static DeviceId validateDeviceId (DeviceId device)
size_t windowSizeTokens () const

Variables

std::atomic< bool > back_buffer_ready_
std::shared_ptr< TensorType > back_input_tensor_
std::shared_ptr< TensorType > back_target_tensor_
size_t batches_per_window_
TokenSequenceLoaderConfig config_
std::atomic< size_t > current_batch_in_window_
std::atomic< size_t > current_window_idx_
std::condition_variable cv_consumer_
std::condition_variable cv_producer_
DeviceId device_
size_t file_size_
std::atomic< bool > front_buffer_ready_
std::shared_ptr< TensorType > front_input_tensor_
std::shared_ptr< TensorType > front_target_tensor_
bool is_training_
std::mutex mutex_
int64_t num_batches_
size_t num_tokens_
size_t num_windows_
std::exception_ptr producer_exception_
std::thread producer_thread_
int64_t seq_length_
std::vector< size_t > sequence_indices_
size_t sequences_per_window_
std::atomic< bool > stop_
std::filesystem::path tokens_file_path_
size_t window_size_tokens_

Files

file  /__w/Mila/Mila/Mila/Src/Data/Loaders/TokenSequenceLoader.ixx
file  /__w/Mila/Mila/Mila/Src/Data/Loaders/TokenSequenceLoader.Config.ixx