from nltk.stem import PorterStemmer  # For stemming
from nltk.corpus import stopwords  # For stopword removal
from typing import List, Tuple  # For type hinting
import string  # For punctuation checks
import nltk  # For downloading the stopwords corpus

# Ensure stopwords resource is downloaded
nltk.download('stopwords')

class Preprocessing:
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))  # Initialize a set of stopwords
        self.stemmer = PorterStemmer()  # Initialize the PorterStemmer for stemming

    def check_special_char(self, ch: str) -> bool:
        """
        Checks if a character is a special character or a digit.
        Returns True if it is, otherwise False.
        """
        return ch in string.punctuation or ch.isdigit()

    def remove_special_char(self, text: Tuple[str, str]) -> Tuple[str, str]:
        """
        Removes special characters and digits from the text.
        Replaces them with a space to preserve word boundaries.
        """
        sub, mes = text
        sub = ''.join([' ' if self.check_special_char(c) else c for c in sub])
        mes = ''.join([' ' if self.check_special_char(c) else c for c in mes])
        return sub, mes

    def lowercase_conversion(self, text: Tuple[str, str]) -> Tuple[str, str]:
        """
        Converts all characters in the text to lowercase.
        """
        sub, mes = text
        return sub.lower(), mes.lower()

    def tokenize(self, text: Tuple[str, str]) -> Tuple[List[str], List[str]]:
        """
        Splits the text into individual words (tokens) based on spaces.
        """
        sub, mes = text
        return sub.split(), mes.split()

    def check_stop_words(self, word: str) -> bool:
        """
        Checks if a word is a stopword.
        """
        return word in self.stop_words

    def removal_of_stop_words(self, tokens: Tuple[List[str], List[str]]) -> Tuple[List[str], List[str]]:
        """
        Removes stopwords from the tokenized text.
        """
        sub_tokens, mes_tokens = tokens
        sub_tokens = [word for word in sub_tokens if not self.check_stop_words(word)]
        mes_tokens = [word for word in mes_tokens if not self.check_stop_words(word)]
        return sub_tokens, mes_tokens

    def stem_words(self, tokens: Tuple[List[str], List[str]]) -> List[str]:
        """
        Stems each word in the tokenized text using PorterStemmer.
        Removes duplicates by returning a unique list of stems.
        """
        sub_tokens, mes_tokens = tokens
        unique_stems = set()

        # Stem tokens from both subject and message
        for word in sub_tokens + mes_tokens:
            unique_stems.add(self.stemmer.stem(word))

        return list(unique_stems)
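
# Minimal convenience sketch: chains the steps above into a single call.
# The helper name `run_pipeline` is illustrative and not part of the class API.
def run_pipeline(preprocessor: Preprocessing, text: Tuple[str, str]) -> List[str]:
    """Apply the full pipeline: clean -> lowercase -> tokenize -> filter -> stem."""
    text = preprocessor.remove_special_char(text)         # strip punctuation and digits
    text = preprocessor.lowercase_conversion(text)        # normalise case
    tokens = preprocessor.tokenize(text)                  # split on whitespace
    tokens = preprocessor.removal_of_stop_words(tokens)   # drop English stopwords
    return preprocessor.stem_words(tokens)                # stem and deduplicate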

# Example Usage
if __name__ == "__main__":
    # Example input: (subject, message)
    text = ("HELLO!!! This is an example subject 123.", "This is an example message with special chars!! @@#$")

    preprocessor = Preprocessing()

    # Remove special characters
    text = preprocessor.remove_special_char(text)
    print("After removing special characters:", text)

    # Convert to lowercase
    text = preprocessor.lowercase_conversion(text)
    print("After converting to lowercase:", text)

    # Tokenize
    tokens = preprocessor.tokenize(text)
    print("After tokenizing:", tokens)

    # Remove stopwords
    tokens = preprocessor.removal_of_stop_words(tokens)
    print("After removing stopwords:", tokens)

    # Stem words
    stems = preprocessor.stem_words(tokens)
    print("After stemming:", stems)