KarthikaRajagopal committed on
Commit
123bbaa
·
verified ·
1 Parent(s): 253d59f

Upload 3 files

Browse files
08_bagofwords.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """08 - BagOfWords.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/16K9eNawK7Oli4ZnUm0r1nLcTiWRuTYW_
8
+ """
9
+
10
+ # Commented out IPython magic to ensure Python compatibility.
11
+ # %%writefile 08-BagOfWords.py
12
+ # import csv
13
+ #
14
+ # class BagOfWords:
15
+ # def transform(self, processed_data):
16
+ # """
17
+ # This function creates a Bag of Words (BoW) representation of the data.
18
+ #
19
+ # Steps:
20
+ # 1. Read unique words from a file.
21
+ # 2. Process the input data (processed_data) and count the occurrences of each unique word.
22
+ # 3. Save the BoW representation to a CSV file.
23
+ # """
24
+ #
25
+ # # Step 1: Reading the unique words from "unique_words.txt"
26
+ # unique_words = [] # List to store unique words
27
+ # with open("05 - unique words.txt", "r") as in_file:
28
+ # for line in in_file:
29
+ # unique_words.append(line.strip()) # Add each word to the unique_words list
30
+ #
31
+ # print(f"Unique words: {len(unique_words)}") # Print the count of unique words
32
+ #
33
+ # # Step 2: Writing the columns (unique words) in the output "BagOfWords.csv"
34
+ # with open("08 - BagOfWords.csv", mode="w", newline='') as out_file:
35
+ # writer = csv.writer(out_file)
36
+ #
37
+ # # Write the header (unique words)
38
+ # writer.writerow(unique_words)
39
+ #
40
+ # # Step 3: Creating the Bag of Words file
41
+ # for data in processed_data:
42
+ # word_count = {} # Dictionary to store word counts for the current sentence
43
+ #
44
+ # # Count the occurrences of words in the current sentence
45
+ # for word in data:
46
+ # word_count[word] = word_count.get(word, 0) + 1
47
+ #
48
+ # # Write the word counts for each unique word in the CSV file
49
+ # row = []
50
+ # for word in unique_words:
51
+ # if word in word_count:
52
+ # row.append(word_count[word])
53
+ # else:
54
+ # row.append(0)
55
+ #
56
+ # writer.writerow(row) # Write the row to the CSV file
57
+ #
58
+ # print(f"Processed sentence {processed_data.index(data) + 1}")
59
+ #
60
+ #
61
+
62
+ !python /content/08-BagOfWords.py
09_tfidf_py.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """09 - TFIDF.py
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1aDPOMUJa_ml-2rRqyxKp24Gd9hlVFDwv
8
+ """
9
+
10
+ # Commented out IPython magic to ensure Python compatibility.
11
+ # %%writefile 09-TFIDF.py
12
+ # import numpy as np
13
+ # import math
14
+ # import pandas as pd
15
+ # from collections import defaultdict
16
+ #
17
+ # class TFIDF:
18
+ # def __init__(self):
19
+ # self.raw_data_set = [] # Raw input data (list of documents as lists of words)
20
+ # self.vocab_list = [] # List of unique words (vocabulary)
21
+ # self.data_matrix = [] # Matrix of word counts (Bag of Words representation)
22
+ # self.tfidf_matrix = [] # Matrix for TF-IDF values
23
+ # self.num_terms = [] # Number of terms in each document (for TF calculation)
24
+ #
25
+ # def load_data(self, file_path):
26
+ # """Load dataset (CSV file with one row per document)."""
27
+ # print("Loading data...")
28
+ # df = pd.read_csv(file_path, header=None)
29
+ # for index, row in df.iterrows():
30
+ # self.raw_data_set.append(row[0].split()) # Split each document into words
31
+ # print(f"Loaded {len(self.raw_data_set)} documents.")
32
+ #
33
+ # def create_vocab_list(self):
34
+ # """Create a list of unique words (vocabulary) from the raw data."""
35
+ # print("Creating vocabulary list...")
36
+ # vocab_set = set()
37
+ # for document in self.raw_data_set:
38
+ # vocab_set.update(document)
39
+ # self.vocab_list = sorted(list(vocab_set))
40
+ # print(f"Vocabulary size: {len(self.vocab_list)}")
41
+ #
42
+ # def bag_of_words_to_vector(self, document):
43
+ # """Convert a document (list of words) to a vector representation."""
44
+ # word_vector = [0] * len(self.vocab_list)
45
+ # for word in document:
46
+ # if word in self.vocab_list:
47
+ # idx = self.vocab_list.index(word)
48
+ # word_vector[idx] += 1
49
+ # return word_vector
50
+ #
51
+ # def convert_to_matrix(self):
52
+ # """Convert the entire dataset into a Bag of Words matrix."""
53
+ # print("Converting data to Bag of Words matrix...")
54
+ # for document in self.raw_data_set:
55
+ # self.data_matrix.append(self.bag_of_words_to_vector(document))
56
+ # self.num_terms.append(len(document))
57
+ # print(f"Data matrix shape: {len(self.data_matrix)} x {len(self.vocab_list)}")
58
+ #
59
+ # def compute_tfidf(self):
60
+ # """Calculate the TF-IDF matrix."""
61
+ # print("Calculating TF-IDF matrix...")
62
+ # num_docs = len(self.raw_data_set)
63
+ # doc_term_count = [0] * len(self.vocab_list)
64
+ #
65
+ # # Count the number of documents each term appears in (for IDF calculation)
66
+ # for doc_vector in self.data_matrix:
67
+ # for idx, count in enumerate(doc_vector):
68
+ # if count > 0:
69
+ # doc_term_count[idx] += 1
70
+ #
71
+ # # Calculate TF-IDF for each document-term pair
72
+ # for i, doc_vector in enumerate(self.data_matrix):
73
+ # tfidf_vector = []
74
+ # for j, count in enumerate(doc_vector):
75
+ # tf = count / self.num_terms[i] # Term Frequency
76
+ # idf = math.log(num_docs / (1 + doc_term_count[j])) # Inverse Document Frequency
77
+ # tfidf_vector.append(tf * idf)
78
+ # self.tfidf_matrix.append(tfidf_vector)
79
+ # print(f"TF-IDF matrix calculated with shape: {len(self.tfidf_matrix)} x {len(self.vocab_list)}")
80
+ #
81
+ # def save_tfidf_matrix(self, output_file):
82
+ # """Save the TF-IDF matrix to a CSV file."""
83
+ # print(f"Saving TF-IDF matrix to {output_file}...")
84
+ # df = pd.DataFrame(self.tfidf_matrix, columns=self.vocab_list)
85
+ # df.to_csv(output_file, index=False)
86
+ # print(f"TF-IDF matrix saved to {output_file}.")
87
+ #
88
+ # # Explanation of the Python code:
89
+ # # 1.Imports: We use numpy, math, and pandas to handle matrices and math calculations efficiently. We also use defaultdict for easy counting of word occurrences.
90
+ #
91
+ # # 2.Initialization (__init__): Initializes necessary attributes such as raw_data_set, vocab_list, data_matrix, and tfidf_matrix.
92
+ #
93
+ # # 3.Loading Data (load_data): This function reads a CSV file where each row contains a document. The documents are split into words and stored in raw_data_set.
94
+ #
95
+ # # 4.Vocabulary Creation (create_vocab_list): This function extracts the unique words from all documents to create a vocabulary list. It uses a set to avoid duplicates and then sorts it to maintain consistency.
96
+ #
97
+ # # 5.Bag of Words Conversion (bag_of_words_to_vector): Converts a document (list of words) into a vector where each index corresponds to a word in the vocabulary list, and the value is the frequency of that word in the document.
98
+ #
99
+ # # 6.Matrix Conversion (convert_to_matrix): Converts the entire dataset (list of documents) into a Bag of Words matrix. It also calculates the number of terms in each document for TF calculation.
100
+ #
101
+ # # 7.TF-IDF Calculation (compute_tfidf): For each document and each word, the function calculates the TF-IDF value based on the Term Frequency (TF) and Inverse Document Frequency (IDF). It calculates the TF by dividing the word count by the total number of terms in the document. IDF is calculated using the formula log(total_documents / (1 + document_count_for_term)).
102
+ #
103
+ # # 8.Saving the TF-IDF Matrix (save_tfidf_matrix): This function saves the TF-IDF matrix to a CSV file for further use or analysis.
104
+ #
105
+
106
+ !python /content/09-TFIDF.py
10_TF_IDF_and_Naives_Bayes.ipynb ADDED
The diff for this file is too large to render. See raw diff