File size: 1,871 Bytes
8dc29a8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import Stemmer
import re
import string


class ProcessText(object):
    """Normalize raw text into search-ready tokens.

    Pipeline: whitespace tokenization -> lowercasing -> punctuation
    stripping -> stopword removal -> empty-token removal -> Snowball
    stemming. Set ``.text`` then call :meth:`clean_and_stem`.
    """

    # Snowball English stemmer (PyStemmer), built once per class.
    STEMMER = Stemmer.Stemmer("english")
    # Compiled once: matches any single ASCII punctuation character.
    PUNCTUATION = re.compile("[%s]" % re.escape(string.punctuation))
    # Top 25 most common words in English, plus "wikipedia".
    # Stored lowercase because lowercase_filter() runs before
    # stopword_filter(); the original entry "I" could never match.
    STOPWORDS = {
        "the",
        "be",
        "to",
        "of",
        "and",
        "a",
        "in",
        "that",
        "have",
        "i",  # was "I" — tokens are already lowercased, so "I" never matched
        "it",
        "for",
        "not",
        "on",
        "with",
        "he",
        "as",
        "you",
        "do",
        "at",
        "this",
        "but",
        "his",
        "by",
        "from",
        "wikipedia",
    }

    def __init__(self) -> None:
        self._text = ""
        # Working token list, rebuilt in place by the filter methods.
        # Initialized here so filters are safe to call in any order
        # (previously only lowercase_filter() created the attribute).
        self.tokens: list = []

    @property
    def text(self) -> str:
        """Raw text to be processed."""
        return self._text

    @text.setter
    def text(self, value: str) -> None:
        self._text = value

    # NOTE: the original redundant @text.getter (identical to the
    # @property above) has been removed; behavior is unchanged.

    def tokenize(self) -> list:
        """Split the current text on whitespace and return the pieces."""
        return self.text.split()

    def lowercase_filter(self) -> None:
        """Re-tokenize the text and lowercase every token."""
        self.tokens = [token.lower() for token in self.tokenize()]

    def punctuation_filter(self) -> None:
        """Strip punctuation from each token (may leave empty strings)."""
        self.tokens = [self.PUNCTUATION.sub("", token) for token in self.tokens]

    def stopword_filter(self) -> None:
        """Drop common English stopwords (tokens are already lowercase)."""
        self.tokens = [token for token in self.tokens if token not in self.STOPWORDS]

    def stem_filter(self) -> None:
        """Stem each token with the Snowball English stemmer."""
        self.tokens = self.STEMMER.stemWords(self.tokens)

    def check_truthiness(self) -> None:
        """Remove empty tokens left behind by punctuation stripping."""
        self.tokens = [token for token in self.tokens if token]

    def clean_and_stem(self) -> list:
        """Run the full pipeline on ``self.text`` and return the tokens.

        Bug fix: the original omitted the stemming step despite the
        method name; stem_filter() now runs last, after empty tokens
        have been pruned.
        """
        self.lowercase_filter()
        self.punctuation_filter()
        self.stopword_filter()
        self.check_truthiness()
        self.stem_filter()

        return self.tokens