File size: 400 Bytes
504dd41
dc12aad
 
 
4af520c
dc12aad
 
 
 
 
504dd41
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
from typing import Dict
from transformers import PreTrainedTokenizer, AddedToken

class CustomTokenizer(PreTrainedTokenizer):
    """Minimal ``PreTrainedTokenizer`` subclass that splits text on whitespace.

    The vocabulary reported by :meth:`get_vocab` is empty, so this class only
    provides string-level tokenization; no token-to-id mapping is defined here.
    """

    def __init__(self, **kwargs):
        """Forward all keyword arguments to ``PreTrainedTokenizer.__init__``.

        A diagnostic message is printed once the base initializer returns.
        """
        super().__init__(**kwargs)
        print("Initializing CustomTokenizer")

    def tokenize(self, text: str, **kwargs):
        """Split ``text`` on runs of whitespace and return the pieces.

        Args:
            text: The string to tokenize.
            **kwargs: Accepted and ignored. The base-class ``tokenize`` takes
                ``**kwargs``, and callers written against that signature may
                pass extra options; without this parameter such calls would
                raise ``TypeError``.

        Returns:
            The list produced by ``text.split()``; leading/trailing whitespace
            yields no empty tokens, and an empty string yields ``[]``.
        """
        print("Tokenizing text", text)
        return text.split()

    def get_vocab(self) -> Dict[str, int]:
        """Return the token-to-id mapping, which is always empty here."""
        return {}