stats-nerd / tokenizer.py
from typing import List

from flair.data import Tokenizer


class StatsTokenizer(Tokenizer):
    """Tokenizer for statistics-flavoured text: splits on whitespace and on the
    characters , ( ) < = while keeping decimal numbers such as 3.5 intact."""

    def __init__(self):
        super().__init__()

    def tokenize(self, text: str) -> List[str]:
        return StatsTokenizer.run_tokenize(text)

    @staticmethod
    def run_tokenize(text: str) -> List[str]:
        tokens: List[str] = []
        # First split on whitespace, then split each word on punctuation.
        words = text.split()
        for word in words:
            token = ""
            for char in word:
                if char == ',':
                    # A comma ends the current token and is dropped.
                    if len(token) > 0:
                        tokens.append(token)
                    token = ""
                elif char in ['(', ')', '<', '=']:
                    # Brackets and operators end the current token and are
                    # emitted as tokens of their own.
                    if len(token) > 0:
                        tokens.append(token)
                    tokens.append(char)
                    token = ""
                else:
                    token += char
            if len(token) > 0:
                # Strip a sentence-final period, e.g. "1.5." -> "1.5",
                # and skip the token if nothing remains.
                if token.endswith('.'):
                    token = token[:-1]
                if len(token) > 0:
                    tokens.append(token)
        return tokens
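

# --- Minimal usage sketch (illustrative; the sample text and the expected
# --- output shown below are assumptions, not part of the original file) ---
if __name__ == "__main__":
    from flair.data import Sentence

    sample = "mean(x) < 3.5, sd = 1.2"

    # Direct use of the static tokenizer.
    print(StatsTokenizer.run_tokenize(sample))
    # -> ['mean', '(', 'x', ')', '<', '3.5', 'sd', '=', '1.2']

    # Use inside flair: pass an instance via Sentence's use_tokenizer argument.
    sentence = Sentence(sample, use_tokenizer=StatsTokenizer())
    print([token.text for token in sentence])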