File size: 928 Bytes
8c29328
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, processors, decoders
from tokenizers.normalizers import NFC
from transformers import PreTrainedTokenizerFast
from pathlib import Path

# Special-token strings declared for the code tokenizer. Order is preserved
# exactly; anything that maps list positions to ids depends on it.
SPECIAL_TOKENS = [
    # Padding / sequence-boundary / unknown / separator markers (per their names).
    "<|pad|>",
    "<|bos|>",
    "<|eos|>",
    "<|unk|>",
    "<|sep|>",
    # "fim" markers -- presumably fill-in-the-middle prefix/middle/suffix.
    "<|fim_prefix|>",
    "<|fim_middle|>",
    "<|fim_suffix|>",
    # One tag per programming language.
    "<|python|>",
    "<|javascript|>",
    "<|typescript|>",
    "<|cpp|>",
    "<|rust|>",
    "<|go|>",
    "<|java|>",
    "<|bash|>",
]

def get_gpt2_tokenizer_for_code():
    """Return the pretrained ``"gpt2"`` tokenizer extended with code markers.

    The tokenizer is loaded through ``AutoTokenizer.from_pretrained("gpt2")``,
    its pad token is set to its own end-of-sequence token, and a fixed list of
    "fim" and language-tag strings is registered as additional special tokens.

    Returns:
        The modified tokenizer object produced by ``AutoTokenizer``.
    """
    # Imported locally, as in the original, so the dependency on AutoTokenizer
    # is only resolved when this function is actually called.
    from transformers import AutoTokenizer

    # NOTE(review): this list has no <|typescript|>, <|cpp|>, <|java|> or
    # <|bash|> tag, unlike the module-level SPECIAL_TOKENS -- confirm the
    # omission is intentional before changing it (adding tokens alters ids).
    extra_markers = [
        "<|fim_prefix|>", "<|fim_middle|>", "<|fim_suffix|>",
        "<|python|>", "<|javascript|>", "<|rust|>", "<|go|>",
    ]

    gpt2_tokenizer = AutoTokenizer.from_pretrained("gpt2")
    # Reuse the end-of-sequence token as the padding token.
    gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token
    gpt2_tokenizer.add_special_tokens(
        {"additional_special_tokens": extra_markers}
    )
    return gpt2_tokenizer

def load_tokenizer(save_dir="./tokenizer"):
    """Load a fast tokenizer from ``save_dir``.

    Args:
        save_dir: Location handed unchanged to
            ``PreTrainedTokenizerFast.from_pretrained``; defaults to
            ``"./tokenizer"``.

    Returns:
        The ``PreTrainedTokenizerFast`` instance built by ``from_pretrained``.
    """
    loaded_tokenizer = PreTrainedTokenizerFast.from_pretrained(save_dir)
    return loaded_tokenizer