KarthikaRajagopal committed on
Commit
123bbaa
·
verified ·
1 Parent(s): 253d59f

Upload 3 files

Browse files
08_bagofwords.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """08 - BagOfWords.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/16K9eNawK7Oli4ZnUm0r1nLcTiWRuTYW_
8
+ """
9
+
10
+ # Commented out IPython magic to ensure Python compatibility.
11
+ # %%writefile 08-BagOfWords.py
12
+ # import csv
13
+ #
14
+ # class BagOfWords:
15
+ # def transform(self, processed_data):
16
+ # """
17
+ # This function creates a Bag of Words (BoW) representation of the data.
18
+ #
19
+ # Steps:
20
+ # 1. Read unique words from a file.
21
+ # 2. Process the input data (processed_data) and count the occurrences of each unique word.
22
+ # 3. Save the BoW representation to a CSV file.
23
+ # """
24
+ #
25
+ # # Step 1: Reading the unique words from "unique_words.txt"
26
+ # unique_words = [] # List to store unique words
27
+ # with open("05 - unique words.txt", "r") as in_file:
28
+ # for line in in_file:
29
+ # unique_words.append(line.strip()) # Add each word to the unique_words list
30
+ #
31
+ # print(f"Unique words: {len(unique_words)}") # Print the count of unique words
32
+ #
33
+ # # Step 2: Writing the columns (unique words) in the output "BagOfWords.csv"
34
+ # with open("08 - BagOfWords.csv", mode="w", newline='') as out_file:
35
+ # writer = csv.writer(out_file)
36
+ #
37
+ # # Write the header (unique words)
38
+ # writer.writerow(unique_words)
39
+ #
40
+ # # Step 3: Creating the Bag of Words file
41
+ # for data in processed_data:
42
+ # word_count = {} # Dictionary to store word counts for the current sentence
43
+ #
44
+ # # Count the occurrences of words in the current sentence
45
+ # for word in data:
46
+ # word_count[word] = word_count.get(word, 0) + 1
47
+ #
48
+ # # Write the word counts for each unique word in the CSV file
49
+ # row = []
50
+ # for word in unique_words:
51
+ # if word in word_count:
52
+ # row.append(word_count[word])
53
+ # else:
54
+ # row.append(0)
55
+ #
56
+ # writer.writerow(row) # Write the row to the CSV file
57
+ #
58
+ # print(f"Processed sentence {processed_data.index(data) + 1}")
59
+ #
60
+ #
61
+
62
+ !python /content/08-BagOfWords.py
09_tfidf_py.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """09 - TFIDF.py
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1aDPOMUJa_ml-2rRqyxKp24Gd9hlVFDwv
8
+ """
9
+
10
+ # Commented out IPython magic to ensure Python compatibility.
11
+ # %%writefile 09-TFIDF.py
12
+ # import numpy as np
13
+ # import math
14
+ # import pandas as pd
15
+ # from collections import defaultdict
16
+ #
17
+ # class TFIDF:
18
+ # def __init__(self):
19
+ # self.raw_data_set = [] # Raw input data (list of documents as lists of words)
20
+ # self.vocab_list = [] # List of unique words (vocabulary)
21
+ # self.data_matrix = [] # Matrix of word counts (Bag of Words representation)
22
+ # self.tfidf_matrix = [] # Matrix for TF-IDF values
23
+ # self.num_terms = [] # Number of terms in each document (for TF calculation)
24
+ #
25
+ # def load_data(self, file_path):
26
+ # """Load dataset (CSV file with one row per document)."""
27
+ # print("Loading data...")
28
+ # df = pd.read_csv(file_path, header=None)
29
+ # for index, row in df.iterrows():
30
+ # self.raw_data_set.append(row[0].split()) # Split each document into words
31
+ # print(f"Loaded {len(self.raw_data_set)} documents.")
32
+ #
33
+ # def create_vocab_list(self):
34
+ # """Create a list of unique words (vocabulary) from the raw data."""
35
+ # print("Creating vocabulary list...")
36
+ # vocab_set = set()
37
+ # for document in self.raw_data_set:
38
+ # vocab_set.update(document)
39
+ # self.vocab_list = sorted(list(vocab_set))
40
+ # print(f"Vocabulary size: {len(self.vocab_list)}")
41
+ #
42
+ # def bag_of_words_to_vector(self, document):
43
+ # """Convert a document (list of words) to a vector representation."""
44
+ # word_vector = [0] * len(self.vocab_list)
45
+ # for word in document:
46
+ # if word in self.vocab_list:
47
+ # idx = self.vocab_list.index(word)
48
+ # word_vector[idx] += 1
49
+ # return word_vector
50
+ #
51
+ # def convert_to_matrix(self):
52
+ # """Convert the entire dataset into a Bag of Words matrix."""
53
+ # print("Converting data to Bag of Words matrix...")
54
+ # for document in self.raw_data_set:
55
+ # self.data_matrix.append(self.bag_of_words_to_vector(document))
56
+ # self.num_terms.append(len(document))
57
+ # print(f"Data matrix shape: {len(self.data_matrix)} x {len(self.vocab_list)}")
58
+ #
59
+ # def compute_tfidf(self):
60
+ # """Calculate the TF-IDF matrix."""
61
+ # print("Calculating TF-IDF matrix...")
62
+ # num_docs = len(self.raw_data_set)
63
+ # doc_term_count = [0] * len(self.vocab_list)
64
+ #
65
+ # # Count the number of documents each term appears in (for IDF calculation)
66
+ # for doc_vector in self.data_matrix:
67
+ # for idx, count in enumerate(doc_vector):
68
+ # if count > 0:
69
+ # doc_term_count[idx] += 1
70
+ #
71
+ # # Calculate TF-IDF for each document-term pair
72
+ # for i, doc_vector in enumerate(self.data_matrix):
73
+ # tfidf_vector = []
74
+ # for j, count in enumerate(doc_vector):
75
+ # tf = count / self.num_terms[i] # Term Frequency
76
+ # idf = math.log(num_docs / (1 + doc_term_count[j])) # Inverse Document Frequency
77
+ # tfidf_vector.append(tf * idf)
78
+ # self.tfidf_matrix.append(tfidf_vector)
79
+ # print(f"TF-IDF matrix calculated with shape: {len(self.tfidf_matrix)} x {len(self.vocab_list)}")
80
+ #
81
+ # def save_tfidf_matrix(self, output_file):
82
+ # """Save the TF-IDF matrix to a CSV file."""
83
+ # print(f"Saving TF-IDF matrix to {output_file}...")
84
+ # df = pd.DataFrame(self.tfidf_matrix, columns=self.vocab_list)
85
+ # df.to_csv(output_file, index=False)
86
+ # print(f"TF-IDF matrix saved to {output_file}.")
87
+ #
88
+ # # Explanation of the Python code:
89
+ # # 1.Imports: We use numpy, math, and pandas to handle matrices and math calculations efficiently. We also use defaultdict for easy counting of word occurrences.
90
+ #
91
+ # # 2.Initialization (__init__): Initializes necessary attributes such as raw_data_set, vocab_list, data_matrix, and tfidf_matrix.
92
+ #
93
+ # # 3.Loading Data (load_data): This function reads a CSV file where each row contains a document. The documents are split into words and stored in raw_data_set.
94
+ #
95
+ # # 4.Vocabulary Creation (create_vocab_list): This function extracts the unique words from all documents to create a vocabulary list. It uses a set to avoid duplicates and then sorts it to maintain consistency.
96
+ #
97
+ # # 5.Bag of Words Conversion (bag_of_words_to_vector): Converts a document (list of words) into a vector where each index corresponds to a word in the vocabulary list, and the value is the frequency of that word in the document.
98
+ #
99
+ # # 6.Matrix Conversion (convert_to_matrix): Converts the entire dataset (list of documents) into a Bag of Words matrix. It also calculates the number of terms in each document for TF calculation.
100
+ #
101
+ # # 7.TF-IDF Calculation (compute_tfidf): For each document and each word, the function calculates the TF-IDF value based on the Term Frequency (TF) and Inverse Document Frequency (IDF). It calculates the TF by dividing the word count by the total number of terms in the document. IDF is calculated using the formula log(total_documents / (1 + document_count_for_term)).
102
+ #
103
+ # # 8.Saving the TF-IDF Matrix (save_tfidf_matrix): This function saves the TF-IDF matrix to a CSV file for further use or analysis.
104
+ #
105
+
106
+ !python /content/09-TFIDF.py
10_TF_IDF_and_Naives_Bayes.ipynb ADDED
The diff for this file is too large to render. See raw diff