"""Tokenizer helpers for a code model.

Provides the project's special-token list, a GPT-2 based tokenizer extended
with code-oriented special tokens, and a loader for a tokenizer previously
saved to disk.
"""

from pathlib import Path

from tokenizers import (
    Tokenizer,
    decoders,
    models,
    pre_tokenizers,
    processors,
    trainers,
)
from tokenizers.normalizers import NFC
from transformers import AutoTokenizer, PreTrainedTokenizerFast

# Full special-token list, in a fixed order.
# NOTE(review): the grouping comments below are inferred from the token names
# only; how each token is consumed is not visible in this file.
SPECIAL_TOKENS = [
    # Generic control tokens.
    "<|pad|>",
    "<|bos|>",
    "<|eos|>",
    "<|unk|>",
    "<|sep|>",
    # Fill-in-the-middle sentinels (presumably).
    "<|fim_prefix|>",
    "<|fim_middle|>",
    "<|fim_suffix|>",
    # Programming-language tags (presumably).
    "<|python|>",
    "<|javascript|>",
    "<|typescript|>",
    "<|cpp|>",
    "<|rust|>",
    "<|go|>",
    "<|java|>",
    "<|bash|>",
]

# Extra special tokens registered on top of the stock GPT-2 tokenizer.
# NOTE(review): this is a subset of SPECIAL_TOKENS -- the typescript, cpp,
# java and bash tags are not included. Confirm whether that is intentional
# before changing it: adding tokens alters the vocabulary size and token ids.
_GPT2_CODE_TOKENS = [
    "<|fim_prefix|>",
    "<|fim_middle|>",
    "<|fim_suffix|>",
    "<|python|>",
    "<|javascript|>",
    "<|rust|>",
    "<|go|>",
]


def get_gpt2_tokenizer_for_code():
    """Return the pretrained ``gpt2`` tokenizer extended for code.

    The tokenizer is loaded with ``AutoTokenizer.from_pretrained("gpt2")``,
    its pad token is set to its EOS token, and the tokens in
    ``_GPT2_CODE_TOKENS`` are registered as additional special tokens.

    Returns:
        The configured tokenizer object.

    Note:
        Registering additional special tokens can grow the vocabulary; a
        model used with this tokenizer may need its embedding matrix resized
        accordingly (see the transformers documentation).
    """
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    # Reuse the end-of-sequence token for padding.
    tokenizer.pad_token = tokenizer.eos_token
    # Pass a copy so the module-level list is never shared with the tokenizer.
    tokenizer.add_special_tokens(
        {"additional_special_tokens": list(_GPT2_CODE_TOKENS)}
    )
    return tokenizer


def load_tokenizer(save_dir="./tokenizer") -> PreTrainedTokenizerFast:
    """Load a fast tokenizer from ``save_dir``.

    Args:
        save_dir: Location handed unchanged to
            ``PreTrainedTokenizerFast.from_pretrained``. Defaults to
            ``"./tokenizer"``.

    Returns:
        The loaded ``PreTrainedTokenizerFast`` instance.
    """
    return PreTrainedTokenizerFast.from_pretrained(save_dir)