KarthikaRajagopal committed on
Commit 55a464e · verified · 1 Parent(s): e4eb02a

Upload 03-TotalWords.py

Files changed (1)
  1. 03-TotalWords.py +89 -0
03-TotalWords.py ADDED
@@ -0,0 +1,89 @@
+
+ import re  # For regular expressions
+ from nltk.stem import PorterStemmer  # For stemming
+ from nltk.corpus import stopwords  # For stopword removal
+ import string  # For string operations
+ import csv  # For reading and writing CSV files
+
+ class Preprocessing:
+     def __init__(self):
+         self.stop_words = set(stopwords.words('english'))  # Initialize a set of stopwords
+         self.stemmer = PorterStemmer()  # Initialize the PorterStemmer for stemming
+
+     def remove_special_char(self, text: tuple[str, str]) -> tuple[str, str]:
+         """
+         Replaces special (non-word, non-space) characters in the text with spaces.
+         """
+         sub, mes = text
+         sub = re.sub(r'[^\w\s]', ' ', sub)  # Replace non-alphanumeric characters with spaces
+         mes = re.sub(r'[^\w\s]', ' ', mes)
+         return sub, mes
+
+     def lowercase_conversion(self, text: tuple[str, str]) -> tuple[str, str]:
+         """
+         Converts all characters in the text to lowercase.
+         """
+         sub, mes = text
+         return sub.lower(), mes.lower()
+
+     def tokenize(self, text: tuple[str, str]) -> tuple[list[str], list[str]]:
+         """
+         Splits the text into individual words (tokens).
+         """
+         sub, mes = text
+         return sub.split(), mes.split()
+
+     def removal_of_stop_words(self, tokens: tuple[list[str], list[str]]) -> tuple[list[str], list[str]]:
+         """
+         Removes stopwords from the tokenized text.
+         """
+         sub_tokens, mes_tokens = tokens
+         sub_tokens = [word for word in sub_tokens if word not in self.stop_words]
+         mes_tokens = [word for word in mes_tokens if word not in self.stop_words]
+         return sub_tokens, mes_tokens
+
+     def stem_words(self, tokens: tuple[list[str], list[str]]) -> list[str]:
+         """
+         Stems each word in the tokenized text.
+         Removes duplicates by returning a unique list of stems.
+         """
+         sub_tokens, mes_tokens = tokens
+         return list({self.stemmer.stem(word) for word in sub_tokens + mes_tokens})
+
+
+ # Main program to process the dataset
+ if __name__ == "__main__":
+     # Initialize the Preprocessing class
+     preprocessor = Preprocessing()
+
+     # Set to store the unique stemmed words across the whole dataset
+     unique_words = set()
+
+     # Open the CSV file for reading
+     with open("Final_Dataset.csv", "r", encoding="utf-8") as infile:
+         csv_reader = csv.reader(infile)
+         next(csv_reader)  # Skip the header line
+
+         # Process each row in the dataset
+         for i, row in enumerate(csv_reader):
+             subject = row[0]  # First column is the subject
+             message = row[1]  # Second column is the message
+
+             # Preprocess the subject and message
+             text = (subject, message)
+             text = preprocessor.remove_special_char(text)
+             text = preprocessor.lowercase_conversion(text)
+             tokens = preprocessor.tokenize(text)
+             filtered_tokens = preprocessor.removal_of_stop_words(tokens)
+             stemmed_tokens = preprocessor.stem_words(filtered_tokens)
+
+             # Add stemmed tokens to the unique words set
+             unique_words.update(stemmed_tokens)
+
+             print(f"Processed row {i + 1}")  # Print progress
+
+     # Write unique words to a file
+     with open("processed_data.txt", "w", encoding="utf-8") as outfile:
+         outfile.write(" ".join(unique_words))  # Join words with space and write to file
+
+     print("Unique words have been saved to processed_data.txt.")