| from tokenizers import Tokenizer, models, trainers, pre_tokenizers, processors, decoders |
| from tokenizers.normalizers import NFC |
| from transformers import PreTrainedTokenizerFast |
| from pathlib import Path |
|
|
# Special-token strings declared by this module. Order is preserved exactly,
# since consumers may rely on list position. The groupings below follow the
# token names only; how each token is used is not shown in this part of the file.
SPECIAL_TOKENS = [
    # General-purpose markers.
    "<|pad|>",
    "<|bos|>",
    "<|eos|>",
    "<|unk|>",
    "<|sep|>",
    # "fim_*" markers (presumably fill-in-the-middle -- confirm with the trainer).
    "<|fim_prefix|>",
    "<|fim_middle|>",
    "<|fim_suffix|>",
    # Programming-language tags.
    "<|python|>",
    "<|javascript|>",
    "<|typescript|>",
    "<|cpp|>",
    "<|rust|>",
    "<|go|>",
    "<|java|>",
    "<|bash|>",
]
|
|
def get_gpt2_tokenizer_for_code():
    """Return the pretrained ``"gpt2"`` tokenizer extended with code tokens.

    The tokenizer is loaded through ``AutoTokenizer.from_pretrained("gpt2")``,
    its EOS token is reused as the padding token, and the ``fim_*`` markers
    plus four language tags are registered as additional special tokens.

    Returns:
        The configured tokenizer instance.
    """
    # Imported lazily so AutoTokenizer is only resolved when this helper runs.
    from transformers import AutoTokenizer

    # NOTE(review): this list omits the typescript, cpp, java and bash tags
    # that SPECIAL_TOKENS declares -- confirm whether that is intentional.
    extra_tokens = [
        "<|fim_prefix|>", "<|fim_middle|>", "<|fim_suffix|>",
        "<|python|>", "<|javascript|>", "<|rust|>", "<|go|>",
    ]

    gpt2_tokenizer = AutoTokenizer.from_pretrained("gpt2")
    # Reuse EOS as the padding token.
    gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token
    gpt2_tokenizer.add_special_tokens({"additional_special_tokens": extra_tokens})
    return gpt2_tokenizer
|
|
def load_tokenizer(save_dir="./tokenizer"):
    """Load a fast tokenizer from ``save_dir``.

    Args:
        save_dir: Location passed straight to
            ``PreTrainedTokenizerFast.from_pretrained``; defaults to
            ``"./tokenizer"``.

    Returns:
        The ``PreTrainedTokenizerFast`` instance produced by the load.
    """
    tokenizer = PreTrainedTokenizerFast.from_pretrained(save_dir)
    return tokenizer