File size: 400 Bytes
504dd41 dc12aad 4af520c dc12aad 504dd41 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 |
from typing import Dict
from transformers import PreTrainedTokenizer, AddedToken
class CustomTokenizer(PreTrainedTokenizer):
    """Minimal tokenizer that splits text on whitespace and has an empty vocab.

    Both the constructor and ``tokenize`` print a trace message to stdout
    each time they run.
    """

    def __init__(self, **kwargs):
        """Forward all keyword arguments to the base class, then print a trace."""
        super().__init__(**kwargs)
        print("Initializing CustomTokenizer")

    def tokenize(self, text: str, **kwargs):
        """Return the whitespace-separated pieces of *text*.

        Args:
            text: The string to split.
            **kwargs: Accepted for compatibility with the base-class
                ``tokenize(text, **kwargs)`` signature, so callers that
                forward extra keyword arguments do not raise ``TypeError``.
                They are ignored here.

        Returns:
            The result of ``text.split()``: runs of whitespace act as a
            single separator, and an empty or all-whitespace string yields
            an empty list.
        """
        print("Tokenizing text", text)
        return text.split()

    def get_vocab(self) -> Dict[str, int]:
        """Return the token-to-id mapping, which is always empty here."""
        return {}