from typing import Union, Tuple, List, Iterable, Dict
import collections
import string
import os
import json
from .WordTokenizer import WordTokenizer, ENGLISH_STOP_WORDS


class WhitespaceTokenizer(WordTokenizer):
""" |
|
Simple and fast white-space tokenizer. Splits sentence based on white spaces. |
|
Punctuation are stripped from tokens. |
|
""" |
|
def __init__(self, vocab: Iterable[str] = [], stop_words: Iterable[str] = ENGLISH_STOP_WORDS, do_lower_case: bool = False): |
|
        self.stop_words = set(stop_words)
        self.do_lower_case = do_lower_case
        self.set_vocab(vocab)

    def get_vocab(self):
        return self.vocab

    def set_vocab(self, vocab: Iterable[str]):
        self.vocab = vocab
        self.word2idx = collections.OrderedDict([(word, idx) for idx, word in enumerate(vocab)])

    def tokenize(self, text: str) -> List[int]:
        if self.do_lower_case:
            text = text.lower()

        tokens = text.split()

        tokens_filtered = []
        for token in tokens:
            # First try the raw token: drop stop words, map known words to their vocab index
            if token in self.stop_words:
                continue
            elif token in self.word2idx:
                tokens_filtered.append(self.word2idx[token])
                continue

            # Second try: strip leading/trailing punctuation and look up again
            token = token.strip(string.punctuation)
            if token in self.stop_words:
                continue
            elif len(token) > 0 and token in self.word2idx:
                tokens_filtered.append(self.word2idx[token])
                continue

            # Last try: lowercase the punctuation-stripped token
            token = token.lower()
            if token in self.stop_words:
                continue
            elif token in self.word2idx:
                tokens_filtered.append(self.word2idx[token])
                continue

        return tokens_filtered

    def save(self, output_path: str):
        with open(os.path.join(output_path, 'whitespacetokenizer_config.json'), 'w') as fOut:
            json.dump({'vocab': list(self.word2idx.keys()), 'stop_words': list(self.stop_words), 'do_lower_case': self.do_lower_case}, fOut)

    @staticmethod
    def load(input_path: str):
        with open(os.path.join(input_path, 'whitespacetokenizer_config.json'), 'r') as fIn:
            config = json.load(fIn)

        return WhitespaceTokenizer(**config)
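# Illustrative usage sketch (not part of the original module; the vocab and the
# directory name below are made up). Because of the relative WordTokenizer import
# above, this file is meant to be imported as part of its package, so the example
# is left as a comment rather than an executable __main__ block:
#
#     tokenizer = WhitespaceTokenizer(vocab=["hello", "world"], do_lower_case=True)
#     tokenizer.tokenize("Hello, world!")      # -> [0, 1]  (punctuation stripped, ids follow vocab order)
#     tokenizer.tokenize("Hello, the world!")  # -> [0, 1]  ("the" is dropped as an English stop word)
#
#     tokenizer.save("some_existing_dir")      # writes whitespacetokenizer_config.json into that directory
#     restored = WhitespaceTokenizer.load("some_existing_dir")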