Upload 03-TotalWords.py
03-TotalWords.py  (+89 -0)
ADDED
@@ -0,0 +1,89 @@
+
+import re  # For regular expressions
+from nltk.stem import PorterStemmer  # For stemming
+from nltk.corpus import stopwords  # For stopword removal
+import string  # For string operations
+import csv  # For reading and writing CSV files
+
+class Preprocessing:
+    def __init__(self):
+        self.stop_words = set(stopwords.words('english'))  # Initialize a set of stopwords
+        self.stemmer = PorterStemmer()  # Initialize the PorterStemmer for stemming
+
+    def remove_special_char(self, text: tuple[str, str]) -> tuple[str, str]:
+        """
+        Removes special characters from the text, replacing them with spaces.
+        """
+        sub, mes = text
+        sub = re.sub(r'[^\w\s]', ' ', sub)  # Replace punctuation (non-word, non-space characters) with spaces
+        mes = re.sub(r'[^\w\s]', ' ', mes)
+        return sub, mes
+
+    def lowercase_conversion(self, text: tuple[str, str]) -> tuple[str, str]:
+        """
+        Converts all characters in the text to lowercase.
+        """
+        sub, mes = text
+        return sub.lower(), mes.lower()
+
+    def tokenize(self, text: tuple[str, str]) -> tuple[list[str], list[str]]:
+        """
+        Splits the text into individual words (tokens).
+        """
+        sub, mes = text
+        return sub.split(), mes.split()
+
+    def removal_of_stop_words(self, tokens: tuple[list[str], list[str]]) -> tuple[list[str], list[str]]:
+        """
+        Removes stopwords from the tokenized text.
+        """
+        sub_tokens, mes_tokens = tokens
+        sub_tokens = [word for word in sub_tokens if word not in self.stop_words]
+        mes_tokens = [word for word in mes_tokens if word not in self.stop_words]
+        return sub_tokens, mes_tokens
+
+    def stem_words(self, tokens: tuple[list[str], list[str]]) -> list[str]:
+        """
+        Stems each word in the tokenized text.
+        Removes duplicates by returning a unique list of stems.
+        """
+        sub_tokens, mes_tokens = tokens
+        return list({self.stemmer.stem(word) for word in sub_tokens + mes_tokens})
+
+
+# Main program to process the dataset
+if __name__ == "__main__":
+    # Initialize the Preprocessing class
+    preprocessor = Preprocessing()
+
+    # Variables to store unique words
+    unique_words = set()
+
+    # Open the CSV file for reading
+    with open("Final_Dataset.csv", "r", encoding="utf-8") as infile:
+        csv_reader = csv.reader(infile)
+        next(csv_reader)  # Skip the header line
+
+        # Process each row in the dataset
+        for i, row in enumerate(csv_reader):
+            subject = row[0]  # First column is the subject
+            message = row[1]  # Second column is the message
+
+            # Preprocess the subject and message
+            text = (subject, message)
+            text = preprocessor.remove_special_char(text)
+            text = preprocessor.lowercase_conversion(text)
+            tokens = preprocessor.tokenize(text)
+            filtered_tokens = preprocessor.removal_of_stop_words(tokens)
+            stemmed_tokens = preprocessor.stem_words(filtered_tokens)
+
+            # Add stemmed tokens to the unique words set
+            unique_words.update(stemmed_tokens)
+
+            print(f"Processed row {i + 1}")  # Print progress
+
+    # Write unique words to a file
+    with open("processed_data.txt", "w", encoding="utf-8") as outfile:
+        outfile.write(" ".join(unique_words))  # Join words with space and write to file
+
+    print("Unique words have been saved to processed_data.txt.")
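
Note: the script assumes the NLTK stopwords corpus is already installed; without it, stopwords.words('english') raises a LookupError on first use. A minimal setup and sanity-check sketch follows. The sample subject/message pair is made up for illustration, and the Preprocessing class is assumed to be in scope (a module whose filename starts with a digit, like 03-TotalWords.py, cannot be imported directly).

import nltk

# One-time download of the stopword list used by Preprocessing;
# NLTK skips the download if the corpus is already present.
nltk.download('stopwords')

# Illustrative run of the pipeline on a made-up subject/message pair.
pre = Preprocessing()
text = ("Re: Meeting Notes!!", "Please confirm the meeting time, thanks.")
text = pre.remove_special_char(text)        # strip punctuation
text = pre.lowercase_conversion(text)       # normalize case
tokens = pre.tokenize(text)                 # split on whitespace
tokens = pre.removal_of_stop_words(tokens)  # drop English stopwords
print(pre.stem_words(tokens))               # unique stems; order is not guaranteed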