Unified Byte Pair Encoding (BPE) vocabulary. More...

Inheritance diagram for Mila::Data::BpeVocabulary:

Collaboration diagram for Mila::Data::BpeVocabulary:

[legend]

Classes
struct	PairHash
struct	PairViewHash

Public Member Functions
	BpeVocabulary ()=delete
const BpeVocabularyConfig &	getConfig () const
std::optional< size_t >	getMergePriority (const std::string &left, const std::string &right) const
const std::vector< std::pair< std::string, std::string > > &	getMergeRules () const
size_t	getSize () const override
	Get the number of tokens in the vocabulary.
std::optional< TokenId >	getSpecialTokenId (const std::string &token_str) const
	Look up a special token ID by its string representation.
const std::vector< std::pair< std::string, TokenId > > &	getSpecialTokenList () const
	Return the special token list sorted longest-first.
std::optional< std::string >	idToToken (TokenId id) const override
	Map a numeric id back to its token string.
bool	isByteLevel () const
void	save (const fs::path &path) const override
	Serialize the vocabulary to Mila binary format (content version 2).
std::optional< TokenId >	tokenToId (const std::string &token) const override
	Convert a token string to its ID.
Public Member Functions inherited from Mila::Data::TokenizerVocabulary
virtual	~TokenizerVocabulary ()=default
	Virtual destructor.
virtual void	save (const std::filesystem::path &path) const =0
	Serialize the vocabulary to disk at the given path.

Static Public Member Functions
static const std::unordered_map< std::string, unsigned char > &	getByteDecoder ()
static const std::unordered_map< unsigned char, std::string > &	getByteEncoder ()
static BpeVocabulary	load (const fs::path &path)
	Load a vocabulary from Mila binary format (content version 2).
static BpeVocabulary	loadGpt2 (const fs::path &tokenizer_path)
	Load a pretrained GPT-2 vocabulary.
static BpeVocabulary	loadLlama32 (const fs::path &path)
	Load a pretrained Llama 3.2 vocabulary.
static BpeVocabulary	loadMistral (const fs::path &vocab_path, const fs::path &merges_path)
	Load a pretrained Mistral vocabulary.
static BpeVocabulary	train (const std::string &corpus, const BpeVocabularyConfig &config)
	Train a BPE vocabulary from a text corpus.
static BpeVocabulary	trainFromFile (const fs::path &corpus_path, const BpeVocabularyConfig &config)
	Train a BPE vocabulary from a corpus file.

Private Member Functions
	BpeVocabulary (const BpeVocabularyConfig &config)
void	addSpecialToken (const std::string &token, TokenId id)
void	addSpecialTokensFromConfig ()
void	applyMergeAndUpdateCounts (std::vector< std::vector< std::string > > &corpus, const std::string &left, const std::string &right, const std::string &merged, std::unordered_map< std::pair< std::string, std::string >, size_t, PairHash > &pair_counts)
void	buildFromText (const std::string &corpus)
void	buildMergeMap ()
void	buildSpecialTokenList ()
std::vector< std::vector< std::string > >	convertToTokenSequences (const std::vector< std::string > &words)
std::unordered_map< std::pair< std::string, std::string >, size_t, PairHash >	countPairs (const std::vector< std::vector< std::string > > &corpus) const
std::pair< std::pair< std::string, std::string >, size_t >	getMostFrequentPair (const std::unordered_map< std::pair< std::string, std::string >, size_t, PairHash > &counts) const
void	initializeBaseVocabulary ()
void	loadContent (std::istream &file)
void	logTrainingComplete (std::chrono::steady_clock::time_point start_time)
std::vector< std::string >	preTokenize (const std::string &text) const
std::vector< std::string >	preTokenizeCorpus (const std::string &text)
void	runBpeMergeLoop (std::vector< std::vector< std::string > > &corpus_tokens, std::chrono::steady_clock::time_point start_time)
void	saveContent (std::ostream &file) const

Private Attributes
BpeVocabularyConfig	config_
TokenId	current_id_
std::vector< std::string >	id_to_token_
std::unordered_map< std::pair< std::string, std::string >, size_t, PairHash >	merge_map_
std::vector< std::pair< std::string, std::string > >	merges_
std::unordered_map< std::string, TokenId >	special_token_ids_
std::vector< std::pair< std::string, TokenId > >	special_token_list_
std::unordered_map< std::string, TokenId >	token_to_id_

Detailed Description

Unified Byte Pair Encoding (BPE) vocabulary.

Immutable after construction; safe for concurrent reads. Supports training from scratch via BpeTrainer (see also train() and trainFromFile()), or loading pretrained vocabularies from:

- Mila binary format produced by save(): use load()
- GPT-2 binary produced by convert_gpt2_tokenizer.py: use loadGpt2()
- Llama 3.2 binary produced by convert_llama_tokenizer.py: use loadLlama32()

Special tokens are keyed on their string representation (e.g., "<|endoftext|>", "<|begin_of_text|>") and exposed via getSpecialTokenList() for O(n) pre-pass scanning in BpeTokenizer. Extended special tokens from SpecialTokens are registered automatically.

Constructor & Destructor Documentation

◆ BpeVocabulary() [1/2]

Mila::Data::BpeVocabulary::BpeVocabulary ( )

delete

Here is the caller graph for this function:

◆ BpeVocabulary() [2/2]

Mila::Data::BpeVocabulary::BpeVocabulary ( const BpeVocabularyConfig & config )

inline explicit private

Member Function Documentation

◆ addSpecialToken()

void Mila::Data::BpeVocabulary::addSpecialToken	(	const std::string &	token,
		TokenId	id )

private

Here is the caller graph for this function:

◆ addSpecialTokensFromConfig()

void Mila::Data::BpeVocabulary::addSpecialTokensFromConfig ( )

private

Here is the call graph for this function:

Here is the caller graph for this function:

◆ applyMergeAndUpdateCounts()

void Mila::Data::BpeVocabulary::applyMergeAndUpdateCounts	(	std::vector< std::vector< std::string > > &	corpus,
		const std::string &	left,
		const std::string &	right,
		const std::string &	merged,
		std::unordered_map< std::pair< std::string, std::string >, size_t, PairHash > &	pair_counts )

private

Here is the caller graph for this function:

◆ buildFromText()

void Mila::Data::BpeVocabulary::buildFromText ( const std::string & corpus )

private

Here is the call graph for this function:

Here is the caller graph for this function:

◆ buildMergeMap()

void Mila::Data::BpeVocabulary::buildMergeMap ( )

private

Here is the caller graph for this function:

◆ buildSpecialTokenList()

void Mila::Data::BpeVocabulary::buildSpecialTokenList ( )

private

Here is the caller graph for this function:

◆ convertToTokenSequences()

std::vector< std::vector< std::string > > Mila::Data::BpeVocabulary::convertToTokenSequences ( const std::vector< std::string > & words )

private

Here is the caller graph for this function:

◆ countPairs()

std::unordered_map< std::pair< std::string, std::string >, size_t, BpeVocabulary::PairHash > Mila::Data::BpeVocabulary::countPairs ( const std::vector< std::vector< std::string > > & corpus ) const

private

Here is the caller graph for this function:

◆ getByteDecoder()

const std::unordered_map< std::string, unsigned char > & Mila::Data::BpeVocabulary::getByteDecoder ( )

static

Here is the call graph for this function:

Here is the caller graph for this function:

◆ getByteEncoder()

const std::unordered_map< unsigned char, std::string > & Mila::Data::BpeVocabulary::getByteEncoder ( )

static

Here is the caller graph for this function:

◆ getConfig()

const BpeVocabularyConfig & Mila::Data::BpeVocabulary::getConfig ( ) const

inline

◆ getMergePriority()

std::optional< size_t > Mila::Data::BpeVocabulary::getMergePriority	(	const std::string &	left,
		const std::string &	right ) const

inline

◆ getMergeRules()

const std::vector< std::pair< std::string, std::string > > & Mila::Data::BpeVocabulary::getMergeRules ( ) const

inline

◆ getMostFrequentPair()

std::pair< std::pair< std::string, std::string >, size_t > Mila::Data::BpeVocabulary::getMostFrequentPair ( const std::unordered_map< std::pair< std::string, std::string >, size_t, PairHash > & counts ) const

private

Here is the caller graph for this function:

◆ getSize()

size_t Mila::Data::BpeVocabulary::getSize ( ) const

inline override virtual

Get the number of tokens in the vocabulary.

Returns: size_t Number of entries (tokens) present in the vocabulary.

Implements Mila::Data::TokenizerVocabulary.

◆ getSpecialTokenId()

std::optional< TokenId > Mila::Data::BpeVocabulary::getSpecialTokenId ( const std::string & token_str ) const

inline

Look up a special token ID by its string representation.

Used by BpeTokenizer's encode pre-pass to resolve tokens such as "<|endoftext|>" or "<|begin_of_text|>" directly to IDs before BPE runs.

Parameters

token_str Token string to look up.

Returns: Token ID if registered as special, nullopt otherwise.

◆ getSpecialTokenList()

const std::vector< std::pair< std::string, TokenId > > & Mila::Data::BpeVocabulary::getSpecialTokenList ( ) const

inline

Return the special token list sorted longest-first.

Ordered longest-first so BpeTokenizer's linear scan matches longer tokens before any of their prefixes (e.g., "<|begin_of_text|>" before "<|").

Returns: Vector of (token_string, token_id) pairs.

◆ idToToken()

std::optional< std::string > Mila::Data::BpeVocabulary::idToToken ( TokenId id ) const

inline override virtual

Map a numeric id back to its token string.

Returns an empty optional if the id is out of range or not defined.

Parameters

id	Token id to convert.

Returns: std::optional<std::string> The token string if present, otherwise empty.

Implements Mila::Data::TokenizerVocabulary.

◆ initializeBaseVocabulary()

void Mila::Data::BpeVocabulary::initializeBaseVocabulary ( )

private

Here is the caller graph for this function:

◆ isByteLevel()

bool Mila::Data::BpeVocabulary::isByteLevel ( ) const

inline

◆ load()

BpeVocabulary Mila::Data::BpeVocabulary::load ( const fs::path & path )

inline static

Load a vocabulary from Mila binary format (content version 2).

Reads a file written by save(). Special tokens are restored from the serialized (string, id) pairs and the special token list is rebuilt automatically.

Parameters

path	Input file path.

Returns: Loaded BpeVocabulary instance.

Exceptions

std::runtime_error on I/O errors or format mismatch.

Here is the call graph for this function:

Here is the caller graph for this function:

◆ loadContent()

void Mila::Data::BpeVocabulary::loadContent ( std::istream & file )

private

Here is the call graph for this function:

Here is the caller graph for this function:

◆ loadGpt2()

BpeVocabulary Mila::Data::BpeVocabulary::loadGpt2 ( const fs::path & tokenizer_path )

static

Load a pretrained GPT-2 vocabulary.

Reads the binary format produced by convert_gpt2_tokenizer.py:

vocab_size   (uint32)
num_merges   (uint32)
For each token: token_length (uint32), token_bytes (utf-8), token_id (uint32)
For each merge: left_length (uint32), left, right_length (uint32), right
has_eos (uint32), eos_id (uint32, conditional)
has_bos (uint32), bos_id (uint32, conditional)
has_pad (uint32), pad_id (uint32, conditional)

Parameters

tokenizer_path Path to the converted GPT-2 tokenizer binary.

Returns: Loaded BpeVocabulary instance.

Exceptions

std::runtime_error on I/O or format errors.

Here is the call graph for this function:

Here is the caller graph for this function:

◆ loadLlama32()

BpeVocabulary Mila::Data::BpeVocabulary::loadLlama32 ( const fs::path & path )

static

Load a pretrained Llama 3.2 vocabulary.

Reads the binary format produced by convert_llama_tokenizer.py:

Header: vocab_size (uint32), use_byte_fallback (uint8)
For each token: token_length (uint32), token_bytes, score (float32), token_id (uint32)
has_bos (uint32), bos_id (uint32, conditional)  -- 128000
has_eos (uint32), eos_id (uint32, conditional)  -- 128001
has_pad (uint32), pad_id (uint32, conditional)
has_unk (uint32), unk_id (uint32, conditional)  -- absent for Llama 3.2

Llama 3.x vocabularies carry no explicit BPE merges; the merge order is encoded implicitly in the token ID assignment.

Parameters

path	Path to the converted Llama 3.2 tokenizer binary.

Returns: Loaded BpeVocabulary instance.

Exceptions

std::runtime_error on I/O or format errors.

Here is the call graph for this function:

Here is the caller graph for this function:

◆ loadMistral()

BpeVocabulary Mila::Data::BpeVocabulary::loadMistral	(	const fs::path &	vocab_path,
		const fs::path &	merges_path )

static

Load a pretrained Mistral vocabulary.

Note: Not yet implemented for external Mistral formats; this function always throws. As a workaround, load a Mila binary produced by save() using load() instead.

Exceptions

std::runtime_error always.

Here is the call graph for this function:

Here is the caller graph for this function:

◆ logTrainingComplete()

void Mila::Data::BpeVocabulary::logTrainingComplete ( std::chrono::steady_clock::time_point start_time )

private

Here is the caller graph for this function:

◆ preTokenize()

std::vector< std::string > Mila::Data::BpeVocabulary::preTokenize ( const std::string & text ) const

private

Here is the caller graph for this function:

◆ preTokenizeCorpus()

std::vector< std::string > Mila::Data::BpeVocabulary::preTokenizeCorpus ( const std::string & text )

private

Here is the call graph for this function:

Here is the caller graph for this function:

◆ runBpeMergeLoop()

void Mila::Data::BpeVocabulary::runBpeMergeLoop	(	std::vector< std::vector< std::string > > &	corpus_tokens,
		std::chrono::steady_clock::time_point	start_time )

private

Here is the call graph for this function:

Here is the caller graph for this function:

◆ save()

void Mila::Data::BpeVocabulary::save ( const fs::path & path ) const

inline override

Serialize the vocabulary to Mila binary format (content version 2).

Writes a MilaFileHeader followed by the vocabulary content. Special tokens are stored as (string_length, string, token_id) triples, eliminating the char-key indirection used in the former GPT-2-only format.

Parameters

path	Output file path. Parent directory must exist.

Exceptions

std::runtime_error on I/O errors.

Here is the call graph for this function:

Here is the caller graph for this function:

◆ saveContent()

void Mila::Data::BpeVocabulary::saveContent ( std::ostream & file ) const

private

Here is the caller graph for this function:

◆ tokenToId()

std::optional< TokenId > Mila::Data::BpeVocabulary::tokenToId ( const std::string & token ) const

inline override virtual

Convert a token string to its ID.

Falls back to the UNK token ID when the token is not found and use_unk is enabled (GPT-2 style). Llama 3.x vocabularies return nullopt on a miss because they rely on byte-level fallback rather than an UNK token.

Parameters

token UTF-8 encoded token string.

Returns: Token ID, UNK ID (if enabled), or nullopt on miss.

Implements Mila::Data::TokenizerVocabulary.

◆ train()

BpeVocabulary Mila::Data::BpeVocabulary::train	(	const std::string &	corpus,
		const BpeVocabularyConfig &	config )

inline static

Train a BPE vocabulary from a text corpus.

Parameters

corpus	Training text.
config	Vocabulary configuration; config.validate() is called internally.

Returns: Trained BpeVocabulary instance.

Exceptions

std::invalid_argument if config fails validation.

Here is the call graph for this function:

Here is the caller graph for this function:

◆ trainFromFile()

BpeVocabulary Mila::Data::BpeVocabulary::trainFromFile	(	const fs::path &	corpus_path,
		const BpeVocabularyConfig &	config )

inline static

Train a BPE vocabulary from a corpus file.

Parameters

corpus_path	Path to a UTF-8 text corpus file.
config	Vocabulary configuration.

Returns: Trained BpeVocabulary instance.

Exceptions

std::runtime_error	if the file cannot be opened.
std::invalid_argument	if config fails validation.

Here is the call graph for this function:

Member Data Documentation

◆ config_

BpeVocabularyConfig Mila::Data::BpeVocabulary::config_

private

◆ current_id_

TokenId Mila::Data::BpeVocabulary::current_id_

private

◆ id_to_token_

std::vector<std::string> Mila::Data::BpeVocabulary::id_to_token_

private

◆ merge_map_

std::unordered_map<std::pair<std::string, std::string>, size_t, PairHash> Mila::Data::BpeVocabulary::merge_map_

private

◆ merges_

std::vector<std::pair<std::string, std::string> > Mila::Data::BpeVocabulary::merges_

private

◆ special_token_ids_

std::unordered_map<std::string, TokenId> Mila::Data::BpeVocabulary::special_token_ids_

private

◆ special_token_list_

std::vector<std::pair<std::string, TokenId> > Mila::Data::BpeVocabulary::special_token_list_

private

◆ token_to_id_

std::unordered_map<std::string, TokenId> Mila::Data::BpeVocabulary::token_to_id_

private

The documentation for this class was generated from the following file:

/__w/Mila/Mila/Mila/Src/Data/Tokenizers/Bpe/BpeVocabulary.ixx

Classes

Public Member Functions

Static Public Member Functions

Private Member Functions

Private Attributes

Detailed Description

Constructor & Destructor Documentation

◆ BpeVocabulary() [1/2]

◆ BpeVocabulary() [2/2]

Member Function Documentation

◆ addSpecialToken()

◆ addSpecialTokensFromConfig()

◆ applyMergeAndUpdateCounts()

◆ buildFromText()

◆ buildMergeMap()

◆ buildSpecialTokenList()

◆ convertToTokenSequences()

◆ countPairs()

◆ getByteDecoder()

◆ getByteEncoder()

◆ getConfig()

◆ getMergePriority()

◆ getMergeRules()

◆ getMostFrequentPair()

◆ getSize()

◆ getSpecialTokenId()

◆ getSpecialTokenList()

◆ idToToken()

◆ initializeBaseVocabulary()

◆ isByteLevel()

◆ load()

◆ loadContent()

◆ loadGpt2()

◆ loadLlama32()

◆ loadMistral()

◆ logTrainingComplete()

◆ preTokenize()

◆ preTokenizeCorpus()

◆ runBpeMergeLoop()

◆ save()

◆ saveContent()

◆ tokenToId()

◆ train()

◆ trainFromFile()

Member Data Documentation

◆ config_

◆ current_id_

◆ id_to_token_

◆ merge_map_

◆ merges_

◆ special_token_ids_

◆ special_token_list_

◆ token_to_id_