|
from flair.data import Tokenizer |
|
from typing import List |
|
|
|
|
|
class StatsTokenizer(Tokenizer):
    """Whitespace-based tokenizer with light punctuation handling.

    Splitting rules (per whitespace-separated word):
      * ``,`` acts as a separator and is discarded.
      * ``(``, ``)``, ``<``, ``=`` are emitted as standalone single-char tokens.
      * A single trailing ``.`` on the final token of a word is stripped.
    """

    def __init__(self):
        super().__init__()

    def tokenize(self, text: str) -> List[str]:
        """Return the tokens of *text* (delegates to :meth:`run_tokenize`)."""
        return StatsTokenizer.run_tokenize(text)

    @staticmethod
    def run_tokenize(text: str) -> List[str]:
        """Tokenize *text* according to the rules documented on the class.

        :param text: raw input string
        :return: list of non-empty token strings
        """
        tokens: List[str] = []

        for word in text.split():
            token = ""
            for char in word:
                if char == ',':
                    # Comma is a separator only — flush the pending token,
                    # but do not emit the comma itself.
                    if token:
                        tokens.append(token)
                    token = ""
                elif char in ('(', ')', '<', '='):
                    # These delimiters become standalone tokens.
                    if token:
                        tokens.append(token)
                    tokens.append(char)
                    token = ""
                else:
                    token += char

            if token:
                # Strip one trailing sentence period from the word's last token.
                if token.endswith('.'):
                    token = token[:-1]
                # Re-check: a word that was just "." would otherwise
                # append an empty string to the token list.
                if token:
                    tokens.append(token)

        return tokens