File size: 928 Bytes
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, processors, decoders
from tokenizers.normalizers import NFC
from transformers import PreTrainedTokenizerFast
from pathlib import Path
# Special tokens defined by this module, listed in a fixed order.
SPECIAL_TOKENS = [
    # Padding / sequence-boundary / unknown / separator tokens.
    "<|pad|>",
    "<|bos|>",
    "<|eos|>",
    "<|unk|>",
    "<|sep|>",
    # fim_* markers (prefix / middle / suffix).
    "<|fim_prefix|>",
    "<|fim_middle|>",
    "<|fim_suffix|>",
    # Per-language tags.
    "<|python|>",
    "<|javascript|>",
    "<|typescript|>",
    "<|cpp|>",
    "<|rust|>",
    "<|go|>",
    "<|java|>",
    "<|bash|>",
]
def get_gpt2_tokenizer_for_code():
    """Return the pretrained ``"gpt2"`` tokenizer extended with code tokens.

    The tokenizer is loaded through ``AutoTokenizer.from_pretrained``, its
    padding token is set to its end-of-sequence token, and the ``fim_*``
    markers plus a set of language tags are registered as additional
    special tokens.

    Returns:
        The configured tokenizer object produced by ``AutoTokenizer``.
    """
    # Imported lazily so AutoTokenizer is only resolved when this is called.
    from transformers import AutoTokenizer

    extra_tokens = [
        "<|fim_prefix|>",
        "<|fim_middle|>",
        "<|fim_suffix|>",
        "<|python|>",
        "<|javascript|>",
        "<|rust|>",
        "<|go|>",
    ]

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    # Reuse the end-of-sequence token for padding.
    tokenizer.pad_token = tokenizer.eos_token
    # NOTE(review): adding tokens presumably grows the vocabulary, so a model
    # paired with this tokenizer may need its embeddings resized — confirm.
    tokenizer.add_special_tokens({"additional_special_tokens": extra_tokens})
    return tokenizer
def load_tokenizer(save_dir="./tokenizer"):
    """Load a ``PreTrainedTokenizerFast`` from ``save_dir``.

    Args:
        save_dir: Location passed straight to
            ``PreTrainedTokenizerFast.from_pretrained``; defaults to
            ``"./tokenizer"``.

    Returns:
        The tokenizer instance returned by ``from_pretrained``.
    """
    loaded = PreTrainedTokenizerFast.from_pretrained(save_dir)
    return loaded