File size: 1,156 Bytes
8a8dcbd a668866 8a8dcbd a668866 8a8dcbd a668866 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 |
from flair.data import Tokenizer
from typing import List
class StatsTokenizer(Tokenizer):
    """Whitespace tokenizer that also splits on punctuation common in statistics text.

    On top of splitting at whitespace, it drops commas, emits each of
    ``(``, ``)``, ``<`` and ``=`` as a separate token, and removes a single
    trailing period from the end of every whitespace-delimited word.
    """

    def __init__(self):
        super().__init__()

    def tokenize(self, text: str) -> List[str]:
        """Return the tokens of *text*; delegates to :meth:`run_tokenize`."""
        return StatsTokenizer.run_tokenize(text)

    @staticmethod
    def run_tokenize(text: str) -> List[str]:
        """Split *text* into a flat list of token strings.

        Rules, applied to each whitespace-delimited word:

        * ``,`` ends the current token and is discarded.
        * ``(``, ``)``, ``<`` and ``=`` end the current token and are
          emitted as tokens of their own.
        * Any other character is accumulated into the current token.
        * At the end of the word, one trailing ``.`` is stripped from the
          remaining token. Periods elsewhere (e.g. in ``0.05``) are kept.

        Empty tokens are never emitted.

        Args:
            text: The raw input string.

        Returns:
            The tokens in order of appearance; an empty list for empty or
            whitespace-only input.
        """
        tokens: List[str] = []
        for word in text.split():
            token = ""
            for char in word:
                if char == ',':
                    # Commas only act as separators; they are not kept.
                    if token:
                        tokens.append(token)
                    token = ""
                elif char in ('(', ')', '<', '='):
                    # Flush what was collected so far, then keep the
                    # delimiter itself as a standalone token.
                    if token:
                        tokens.append(token)
                    tokens.append(char)
                    token = ""
                else:
                    token += char
            # Drop a sentence-final period, then re-check for emptiness:
            # a leftover segment of just "." (as in "0.05).") must not
            # produce an empty-string token.
            if token.endswith('.'):
                token = token[:-1]
            if token:
                tokens.append(token)
        return tokens