from flair.data import Tokenizer
from typing import List


class StatsTokenizer(Tokenizer):
    """Whitespace tokenizer with light punctuation handling.

    Splits *text* on whitespace, then within each word:
      - drops commas (they only terminate the current token),
      - emits '(', ')', '<', '=' as stand-alone tokens,
      - strips a single trailing period from the word's final token
        (sentence-ending dot), discarding the token if nothing remains.
    """

    def __init__(self):
        super().__init__()

    def tokenize(self, text: str) -> List[str]:
        """Tokenize *text*; delegates to the static :meth:`run_tokenize`."""
        return StatsTokenizer.run_tokenize(text)

    @staticmethod
    def run_tokenize(text: str) -> List[str]:
        """Return the list of tokens for *text* (see class docstring).

        :param text: input string; may be empty.
        :return: list of non-empty tokens.
        """
        tokens: List[str] = []
        for word in text.split():
            token = ""
            for char in word:
                if char == ',':
                    # Comma is a separator and is itself discarded.
                    if token:
                        tokens.append(token)
                        token = ""
                elif char in ('(', ')', '<', '='):
                    # Separator that is kept as its own token.
                    if token:
                        tokens.append(token)
                    tokens.append(char)
                    token = ""
                else:
                    token += char
            if token:
                # Strip one sentence-ending period from the word's last token.
                if token.endswith('.'):
                    token = token[:-1]
                # Bug fix: a bare "." residue (e.g. from "foo).") used to be
                # appended as an empty string; skip empty tokens instead.
                if token:
                    tokens.append(token)
        return tokens