import re
import string

import Stemmer  # PyStemmer: Snowball stemming library bindings
class ProcessText(object):
    """Normalize raw text into tokens: lowercase, strip punctuation,
    drop stopwords, and stem."""

    STEMMER = Stemmer.Stemmer("english")
    PUNCTUATION = re.compile("[%s]" % re.escape(string.punctuation))
    # Top 25 most common words in English, plus "wikipedia". Kept lowercase
    # because tokens are lowercased before the stopword filter runs.
    STOPWORDS = set(
        [
            "the", "be", "to", "of", "and", "a", "in", "that", "have", "i",
            "it", "for", "not", "on", "with", "he", "as", "you", "do", "at",
            "this", "but", "his", "by", "from", "wikipedia",
        ]
    )
    def __init__(self) -> None:
        self._text = ""
        self.tokens = []

    @property
    def text(self):
        """The raw text to process."""
        return self._text

    @text.setter
    def text(self, value):
        self._text = value
    def tokenize(self):
        """Split the current text on whitespace."""
        return self.text.split()

    def lowercase_filter(self):
        self.tokens = [token.lower() for token in self.tokenize()]

    def punctuation_filter(self):
        self.tokens = [self.PUNCTUATION.sub("", token) for token in self.tokens]

    def stopword_filter(self):
        self.tokens = [token for token in self.tokens if token not in self.STOPWORDS]

    def stem_filter(self):
        self.tokens = self.STEMMER.stemWords(self.tokens)

    def check_truthiness(self):
        # Drop tokens that became empty strings after punctuation removal.
        self.tokens = [token for token in self.tokens if token]
    def clean_and_stem(self):
        self.lowercase_filter()
        self.punctuation_filter()
        self.stopword_filter()
        self.check_truthiness()
        self.stem_filter()
        return self.tokens
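
# A minimal usage sketch (an illustrative assumption, not part of the original
# module): run the full clean_and_stem() pipeline on a sample sentence.
if __name__ == "__main__":
    processor = ProcessText()
    processor.text = "The Wikipedia article explains how stemming works."
    # "the" and "wikipedia" are removed by the stopword filter; the remaining
    # tokens are stemmed, e.g. ["articl", "explain", "how", "stem", "work"].
    print(processor.clean_and_stem())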