Commit
·
504dd41
1
Parent(s):
4af520c
yo
Browse files- custom_tokenizer.py +5 -1
custom_tokenizer.py
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
from transformers import PreTrainedTokenizer, AddedToken
|
2 |
|
3 |
class CustomTokenizer(PreTrainedTokenizer):
|
@@ -7,4 +8,7 @@ class CustomTokenizer(PreTrainedTokenizer):
|
|
7 |
|
8 |
def tokenize(self, text):
|
9 |
print("Tokenizing text", text)
|
10 |
-
return text.split()
|
|
|
|
|
|
|
|
1 |
+
from typing import Dict
|
2 |
from transformers import PreTrainedTokenizer, AddedToken
|
3 |
|
4 |
class CustomTokenizer(PreTrainedTokenizer):
|
|
|
8 |
|
9 |
def tokenize(self, text, **kwargs):
    """Split ``text`` into whitespace-separated tokens.

    Args:
        text: The string to tokenize.
        **kwargs: Accepted and ignored, so that callers which forward
            extra keyword arguments (as the ``PreTrainedTokenizer.tokenize``
            base signature permits) do not raise ``TypeError``.

    Returns:
        The list produced by ``text.split()``: any run of whitespace acts
        as one separator, and an empty or all-whitespace string gives ``[]``.
    """
    # Debug trace kept from the original implementation.
    print("Tokenizing text", text)
    return text.split()
|
12 |
+
|
13 |
+
def get_vocab(self) -> Dict[str, int]:
    """Return the token-to-id mapping, which is always empty here.

    A new dictionary is created on every call, so the caller may mutate
    the result without affecting later calls.
    """
    vocab: Dict[str, int] = {}
    return vocab
|