| addSpecialToken(const std::string &token, TokenId id) | Mila::Data::BpeVocabulary | private |
| addSpecialTokensFromConfig() | Mila::Data::BpeVocabulary | private |
| applyMergeAndUpdateCounts(std::vector< std::vector< std::string > > &corpus, const std::string &left, const std::string &right, const std::string &merged, std::unordered_map< std::pair< std::string, std::string >, size_t, PairHash > &pair_counts) | Mila::Data::BpeVocabulary | private |
| BpeVocabulary()=delete | Mila::Data::BpeVocabulary | |
| BpeVocabulary(const BpeVocabularyConfig &config) | Mila::Data::BpeVocabulary | inline explicit private |
| buildFromText(const std::string &corpus) | Mila::Data::BpeVocabulary | private |
| buildMergeMap() | Mila::Data::BpeVocabulary | private |
| buildSpecialTokenList() | Mila::Data::BpeVocabulary | private |
| config_ | Mila::Data::BpeVocabulary | private |
| convertToTokenSequences(const std::vector< std::string > &words) | Mila::Data::BpeVocabulary | private |
| countPairs(const std::vector< std::vector< std::string > > &corpus) const | Mila::Data::BpeVocabulary | private |
| current_id_ | Mila::Data::BpeVocabulary | private |
| getByteDecoder() | Mila::Data::BpeVocabulary | static |
| getByteEncoder() | Mila::Data::BpeVocabulary | static |
| getConfig() const | Mila::Data::BpeVocabulary | inline |
| getMergePriority(const std::string &left, const std::string &right) const | Mila::Data::BpeVocabulary | inline |
| getMergeRules() const | Mila::Data::BpeVocabulary | inline |
| getMostFrequentPair(const std::unordered_map< std::pair< std::string, std::string >, size_t, PairHash > &counts) const | Mila::Data::BpeVocabulary | private |
| getSize() const override | Mila::Data::BpeVocabulary | inline virtual |
| getSpecialTokenId(const std::string &token_str) const | Mila::Data::BpeVocabulary | inline |
| getSpecialTokenList() const | Mila::Data::BpeVocabulary | inline |
| id_to_token_ | Mila::Data::BpeVocabulary | private |
| idToToken(TokenId id) const override | Mila::Data::BpeVocabulary | inline virtual |
| initializeBaseVocabulary() | Mila::Data::BpeVocabulary | private |
| isByteLevel() const | Mila::Data::BpeVocabulary | inline |
| load(const fs::path &path) | Mila::Data::BpeVocabulary | inline static |
| loadContent(std::istream &file) | Mila::Data::BpeVocabulary | private |
| loadGpt2(const fs::path &tokenizer_path) | Mila::Data::BpeVocabulary | static |
| loadLlama32(const fs::path &path) | Mila::Data::BpeVocabulary | static |
| loadMistral(const fs::path &vocab_path, const fs::path &merges_path) | Mila::Data::BpeVocabulary | static |
| logTrainingComplete(std::chrono::steady_clock::time_point start_time) | Mila::Data::BpeVocabulary | private |
| merge_map_ | Mila::Data::BpeVocabulary | private |
| merges_ | Mila::Data::BpeVocabulary | private |
| preTokenize(const std::string &text) const | Mila::Data::BpeVocabulary | private |
| preTokenizeCorpus(const std::string &text) | Mila::Data::BpeVocabulary | private |
| runBpeMergeLoop(std::vector< std::vector< std::string > > &corpus_tokens, std::chrono::steady_clock::time_point start_time) | Mila::Data::BpeVocabulary | private |
| save(const fs::path &path) const override | Mila::Data::BpeVocabulary | inline |
| Mila::Data::TokenizerVocabulary::save(const std::filesystem::path &path) const =0 | Mila::Data::TokenizerVocabulary | pure virtual |
| saveContent(std::ostream &file) const | Mila::Data::BpeVocabulary | private |
| special_token_ids_ | Mila::Data::BpeVocabulary | private |
| special_token_list_ | Mila::Data::BpeVocabulary | private |
| token_to_id_ | Mila::Data::BpeVocabulary | private |
| tokenToId(const std::string &token) const override | Mila::Data::BpeVocabulary | inline virtual |
| train(const std::string &corpus, const BpeVocabularyConfig &config) | Mila::Data::BpeVocabulary | inline static |
| trainFromFile(const fs::path &corpus_path, const BpeVocabularyConfig &config) | Mila::Data::BpeVocabulary | inline static |
| ~TokenizerVocabulary()=default | Mila::Data::TokenizerVocabulary | virtual |