Krish30 committed on
Commit 1b2e553 · verified · 1 Parent(s): 0b32918

Upload 3 files

Files changed (3)
  1. config.json +1 -0
  2. requirements.txt +18 -0
  3. vectorize_documents.py +129 -0
config.json ADDED
@@ -0,0 +1 @@
+ {"GROQ_API_KEY": "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"}
requirements.txt ADDED
@@ -0,0 +1,18 @@
+ streamlit==1.38.0
+ langchain-community==0.2.16
+ langchain-text-splitters==0.2.4
+ langchain-chroma==0.1.3
+ langchain-huggingface==0.0.3
+ langchain-groq==0.1.9
+ unstructured==0.15.0
+ nltk==3.8.1
+ docx2txt
+ SpeechRecognition
+ deep-translator
+ sounddevice  # Replacement for PyAudio
+ scipy  # Required for WAV file handling with sounddevice
+ vosk
+ google-generativeai
+ PyPDF2
+ streamlit_chat
+ googlesearch-python
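
Install with pip install -r requirements.txt. The two comments above pair sounddevice with scipy as the PyAudio replacement; a minimal sketch of that pairing, where the 16 kHz rate, 5-second duration, and capture.wav filename are illustrative assumptions rather than values from this repo:

import sounddevice as sd
from scipy.io import wavfile

SAMPLE_RATE = 16000  # assumed; Vosk models typically expect 16 kHz mono
DURATION = 5         # seconds to record (illustrative)

# Record mono 16-bit audio from the default input device and block until done.
recording = sd.rec(int(DURATION * SAMPLE_RATE), samplerate=SAMPLE_RATE,
                   channels=1, dtype="int16")
sd.wait()

# scipy handles the WAV serialization that PyAudio code would do by hand.
wavfile.write("capture.wav", SAMPLE_RATE, recording)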
vectorize_documents.py ADDED
@@ -0,0 +1,129 @@
+ # from langchain_text_splitters import CharacterTextSplitter
+ # from langchain_huggingface import HuggingFaceEmbeddings
+ # from langchain_chroma import Chroma
+ # from langchain.docstore.document import Document
+ # import pandas as pd
+ # import os
+ # import glob
+
+ # # Define a function to perform vectorization for multiple CSV files
+ # def vectorize_documents():
+ #     embeddings = HuggingFaceEmbeddings()
+
+ #     # Directory containing multiple CSV files
+ #     csv_directory = "Data"  # Replace with your folder name
+ #     csv_files = glob.glob(os.path.join(csv_directory, "*.csv"))  # Find all CSV files in the folder
+
+ #     documents = []
+
+ #     # Load and concatenate all CSV files
+ #     for file_path in csv_files:
+ #         df = pd.read_csv(file_path)
+ #         for _, row in df.iterrows():
+ #             # Combine all columns in the row into a single string
+ #             row_content = " ".join(row.astype(str))
+ #             documents.append(Document(page_content=row_content))
+
+ #     # Splitting the text and creating chunks of these documents
+ #     text_splitter = CharacterTextSplitter(
+ #         chunk_size=2000,
+ #         chunk_overlap=500
+ #     )
+
+ #     text_chunks = text_splitter.split_documents(documents)
+
+ #     # Process text chunks in batches
+ #     batch_size = 5000  # Chroma's batch size limit is 5461, set a slightly smaller size for safety
+ #     for i in range(0, len(text_chunks), batch_size):
+ #         batch = text_chunks[i:i + batch_size]
+
+ #         # Store the batch in Chroma vector DB
+ #         vectordb = Chroma.from_documents(
+ #             documents=batch,
+ #             embedding=embeddings,
+ #             persist_directory="vector_db_dir"
+ #         )
+
+ #     print("Documents Vectorized and saved in VectorDB")
+
+ # # Expose embeddings if needed
+ # embeddings = HuggingFaceEmbeddings()
+
+
+ # # Main guard to prevent execution on import
+ # if __name__ == "__main__":
+ #     vectorize_documents()
+
+
+ from langchain_text_splitters import CharacterTextSplitter
+ from langchain_huggingface import HuggingFaceEmbeddings
+ from langchain_chroma import Chroma
+ from langchain.docstore.document import Document
+ import pandas as pd
+ import os
+ import glob
+ from PyPDF2 import PdfReader  # Ensure PyPDF2 is installed
+
+ # Define a function to process CSV files
+ def process_csv_files(csv_files):
+     documents = []
+     for file_path in csv_files:
+         df = pd.read_csv(file_path)
+         for _, row in df.iterrows():
+             # Combine all columns in the row into a single string
+             row_content = " ".join(row.astype(str))
+             documents.append(Document(page_content=row_content))
+     return documents
+
+ # Define a function to process PDF files
+ def process_pdf_files(pdf_files):
+     documents = []
+     for file_path in pdf_files:
+         reader = PdfReader(file_path)
+         for page in reader.pages:
+             text = page.extract_text()
+             if text:  # Only add non-empty text
+                 documents.append(Document(page_content=text))
+     return documents
+
+ # Define a function to perform vectorization for CSV and PDF files
+ def vectorize_documents():
+     embeddings = HuggingFaceEmbeddings()
+
+     # Directory containing the files to index
+     data_directory = "Data"  # Replace with your folder name
+     csv_files = glob.glob(os.path.join(data_directory, "*.csv"))
+     pdf_files = glob.glob(os.path.join(data_directory, "*.pdf"))
+
+     # Process CSV and PDF files
+     documents = process_csv_files(csv_files) + process_pdf_files(pdf_files)
+
+     # Split the documents into overlapping chunks
+     text_splitter = CharacterTextSplitter(
+         chunk_size=2000,
+         chunk_overlap=500
+     )
+
+     text_chunks = text_splitter.split_documents(documents)
+
+     # Process text chunks in batches
+     batch_size = 5000  # Chroma's max batch size is 5461; 5000 leaves headroom
+     for i in range(0, len(text_chunks), batch_size):
+         batch = text_chunks[i:i + batch_size]
+
+         # Store the batch in the Chroma vector DB; repeated calls with the same
+         # persist_directory append to the same persisted collection
+         vectordb = Chroma.from_documents(
+             documents=batch,
+             embedding=embeddings,
+             persist_directory="vector_db_dir"
+         )
+
+     print("Documents Vectorized and saved in VectorDB")
+
+ # Expose embeddings at module level so other modules can import them if needed
+ embeddings = HuggingFaceEmbeddings()
+
+ # Main guard to prevent execution on import
+ if __name__ == "__main__":
+     vectorize_documents()
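
A usage sketch for the persisted store: reopen vector_db_dir for retrieval. The query string and k value are illustrative; the constructor and similarity_search call match the langchain-chroma API pinned in requirements.txt:

from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

# Reopen the collection written by vectorize_documents().
vectordb = Chroma(
    persist_directory="vector_db_dir",
    embedding_function=HuggingFaceEmbeddings(),
)

# Illustrative query; k caps the number of returned chunks.
for doc in vectordb.similarity_search("admission requirements", k=3):
    print(doc.page_content[:200])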