# Author: Abhinav KM
# Commit: 8dc29a8 ("Initial commit"), file size 1.87 kB
import Stemmer
import re
import string
class ProcessText(object):
    """Normalize a piece of text into a list of cleaned, stemmed tokens.

    Typical use: assign the raw string to ``text`` and call
    ``clean_and_stem()``, which runs the individual filters in order and
    returns the resulting token list (also stored on ``self.tokens``).
    """

    # Shared English stemmer instance (PyStemmer).
    STEMMER = Stemmer.Stemmer("english")
    # Matches any single character from ``string.punctuation``.
    PUNCTUATION = re.compile("[%s]" % re.escape(string.punctuation))
    # top 25 most common words in English and "wikipedia".
    # All entries are lowercase: stopword_filter() is applied after
    # lowercase_filter() in clean_and_stem(), so an uppercase entry such as
    # "I" could never match a token.
    STOPWORDS = {
        "the",
        "be",
        "to",
        "of",
        "and",
        "a",
        "in",
        "that",
        "have",
        "i",
        "it",
        "for",
        "not",
        "on",
        "with",
        "he",
        "as",
        "you",
        "do",
        "at",
        "this",
        "but",
        "his",
        "by",
        "from",
        "wikipedia",
    }

    def __init__(self, text: str = "") -> None:
        """Create a processor, optionally pre-loaded with ``text``.

        Args:
            text: The raw text to process. Defaults to the empty string.
        """
        self._text = text
        # Always defined so the individual filters can be called in any
        # order without raising AttributeError.
        self.tokens = []

    @property
    def text(self):
        """The raw text that ``tokenize()`` splits into tokens."""
        return self._text

    @text.setter
    def text(self, value):
        """Replace the raw text; ``self.tokens`` is left untouched."""
        self._text = value

    def tokenize(self) -> list:
        """Return the raw text split on runs of whitespace."""
        return self.text.split()

    def lowercase_filter(self) -> None:
        """Re-tokenize ``text`` and store the lowercased tokens.

        This is the step that (re)builds ``self.tokens`` from ``text``; the
        other filters only transform the existing ``self.tokens``.
        """
        self.tokens = [token.lower() for token in self.tokenize()]

    def punctuation_filter(self) -> None:
        """Strip every ``string.punctuation`` character from each token.

        Tokens made only of punctuation become empty strings; they are
        removed later by ``check_truthiness()``.
        """
        self.tokens = [self.PUNCTUATION.sub("", token) for token in self.tokens]

    def stopword_filter(self) -> None:
        """Drop tokens that appear in ``STOPWORDS`` (exact match)."""
        self.tokens = [token for token in self.tokens if token not in self.STOPWORDS]

    def stem_filter(self) -> None:
        """Replace each token with its stem using ``STEMMER``."""
        self.tokens = self.STEMMER.stemWords(self.tokens)

    def check_truthiness(self) -> None:
        """Drop falsy tokens, i.e. empty strings left by earlier filters."""
        self.tokens = [token for token in self.tokens if token]

    def clean_and_stem(self) -> list:
        """Run the full pipeline on ``text`` and return the tokens.

        Order: lowercase -> strip punctuation -> remove stopwords -> stem ->
        drop empty tokens.

        Returns:
            The processed token list (the same object stored on
            ``self.tokens``).
        """
        self.lowercase_filter()
        self.punctuation_filter()
        # Stopwords are removed before stemming because STOPWORDS holds
        # unstemmed word forms.
        self.stopword_filter()
        self.stem_filter()
        self.check_truthiness()
        return self.tokens