from pathlib import Path
from typing import Any

from transformers.convert_slow_tokenizer import TikTokenConverter
from transformers.tokenization_utils_fast import TIKTOKEN_VOCAB_FILE, TOKENIZER_FILE


def convert_tiktoken_to_fast(encoding: Any, output_dir: str):
    """
    Converts given `tiktoken` encoding to `PreTrainedTokenizerFast` and saves the configuration of converted tokenizer
    on disk.

    Two files are written: the `tiktoken` BPE vocabulary at `<output_dir>/tiktoken/<TIKTOKEN_VOCAB_FILE>` and the
    converted tokenizer at `<output_dir>/<TOKENIZER_FILE>`.

    Args:
        encoding (`str` or `tiktoken.Encoding`):
            Tokenizer from `tiktoken` library. If `encoding` is `str`, the tokenizer will be loaded with
            `tiktoken.get_encoding(encoding)`.
        output_dir (`str`):
            Save path for converted tokenizer configuration file. The directory (including any missing parent
            directories) is created if it does not exist.

    Raises:
        ValueError: If `tiktoken` cannot be imported, or if writing the `tiktoken` vocabulary file fails because of a
            missing import.
    """
    # Import lazily so that `tiktoken` stays an optional dependency; keep the `try` limited to the imports so the
    # "install tiktoken" hint is only shown when `tiktoken` itself is what is missing.
    try:
        from tiktoken import get_encoding
        from tiktoken.load import dump_tiktoken_bpe
    except ImportError as err:
        raise ValueError(
            "`tiktoken` is required to save a `tiktoken` file. Install it with `pip install tiktoken`."
        ) from err

    output_dir = Path(output_dir)
    save_file = output_dir / "tiktoken" / TIKTOKEN_VOCAB_FILE
    tokenizer_file = output_dir / TOKENIZER_FILE

    # `parents=True` lets callers pass a nested path that does not exist yet. Creating the vocab sub-directory
    # also creates `output_dir` itself, and avoids relying on the writer below to create it.
    save_file.parent.mkdir(parents=True, exist_ok=True)

    save_file_absolute = str(save_file.absolute())
    output_file_absolute = str(tokenizer_file.absolute())

    if isinstance(encoding, str):
        encoding = get_encoding(encoding)

    try:
        dump_tiktoken_bpe(encoding._mergeable_ranks, save_file_absolute)
    except ImportError as err:
        # NOTE(review): the writer may import optional dependencies of its own; an `ImportError` is still surfaced
        # as `ValueError` (as before), but with the real cause in the message and chained instead of a generic hint.
        raise ValueError(f"Could not write the `tiktoken` file to {save_file_absolute}: {err}") from err

    # Build the fast tokenizer from the vocabulary file just written, reusing the encoding's split pattern and
    # special tokens, then save it next to the vocabulary directory.
    tokenizer = TikTokenConverter(
        vocab_file=save_file_absolute,
        pattern=encoding._pat_str,
        additional_special_tokens=encoding._special_tokens,
    ).converted()
    tokenizer.save(output_file_absolute)