Krish30 committed on
Commit 1b2e553 · verified · 1 Parent(s): 0b32918

Upload 3 files

Files changed (3)
  1. config.json +1 -0
  2. requirements.txt +18 -0
  3. vectorize_documents.py +129 -0
config.json ADDED
@@ -0,0 +1 @@
+ {"GROQ_API_KEY": "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"}
requirements.txt ADDED
@@ -0,0 +1,18 @@
+ streamlit==1.38.0
+ langchain-community==0.2.16
+ langchain-text-splitters==0.2.4
+ langchain-chroma==0.1.3
+ langchain-huggingface==0.0.3
+ langchain-groq==0.1.9
+ unstructured==0.15.0
+ nltk==3.8.1
+ docx2txt
+ SpeechRecognition
+ deep-translator
+ sounddevice  # Replacement for PyAudio
+ scipy  # Required for WAV file handling with sounddevice
+ vosk
+ google-generativeai
+ PyPDF2
+ streamlit_chat
+ googlesearch-python
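
Install with pip install -r requirements.txt. The two comments above pair sounddevice with scipy as the PyAudio replacement; a minimal sketch of that pairing, where the 16 kHz rate, 5-second duration, and capture.wav filename are illustrative assumptions rather than values from this repo:

import sounddevice as sd
from scipy.io import wavfile

SAMPLE_RATE = 16000  # assumed; Vosk models typically expect 16 kHz mono
DURATION = 5         # seconds to record (illustrative)

# Record mono 16-bit audio from the default input device and block until done.
recording = sd.rec(int(DURATION * SAMPLE_RATE), samplerate=SAMPLE_RATE,
                   channels=1, dtype="int16")
sd.wait()

# scipy handles the WAV serialization that PyAudio code would do by hand.
wavfile.write("capture.wav", SAMPLE_RATE, recording)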
vectorize_documents.py ADDED
@@ -0,0 +1,129 @@
+ # from langchain_text_splitters import CharacterTextSplitter
+ # from langchain_huggingface import HuggingFaceEmbeddings
+ # from langchain_chroma import Chroma
+ # from langchain.docstore.document import Document
+ # import pandas as pd
+ # import os
+ # import glob
+
+ # # Define a function to perform vectorization for multiple CSV files
+ # def vectorize_documents():
+ #     embeddings = HuggingFaceEmbeddings()
+
+ #     # Directory containing multiple CSV files
+ #     csv_directory = "Data"  # Replace with your folder name
+ #     csv_files = glob.glob(os.path.join(csv_directory, "*.csv"))  # Find all CSV files in the folder
+
+ #     documents = []
+
+ #     # Load and concatenate all CSV files
+ #     for file_path in csv_files:
+ #         df = pd.read_csv(file_path)
+ #         for _, row in df.iterrows():
+ #             # Combine all columns in the row into a single string
+ #             row_content = " ".join(row.astype(str))
+ #             documents.append(Document(page_content=row_content))
+
+ #     # Splitting the text and creating chunks of these documents
+ #     text_splitter = CharacterTextSplitter(
+ #         chunk_size=2000,
+ #         chunk_overlap=500
+ #     )
+
+ #     text_chunks = text_splitter.split_documents(documents)
+
+ #     # Process text chunks in batches
+ #     batch_size = 5000  # Chroma's batch size limit is 5461, set a slightly smaller size for safety
+ #     for i in range(0, len(text_chunks), batch_size):
+ #         batch = text_chunks[i:i + batch_size]
+
+ #         # Store the batch in Chroma vector DB
+ #         vectordb = Chroma.from_documents(
+ #             documents=batch,
+ #             embedding=embeddings,
+ #             persist_directory="vector_db_dir"
+ #         )
+
+ #     print("Documents Vectorized and saved in VectorDB")
+
+ # # Expose embeddings if needed
+ # embeddings = HuggingFaceEmbeddings()
+
+
+ # # Main guard to prevent execution on import
+ # if __name__ == "__main__":
+ #     vectorize_documents()
+
+
+ from langchain_text_splitters import CharacterTextSplitter
+ from langchain_huggingface import HuggingFaceEmbeddings
+ from langchain_chroma import Chroma
+ from langchain.docstore.document import Document
+ import pandas as pd
+ import os
+ import glob
+ from PyPDF2 import PdfReader  # Ensure PyPDF2 is installed
+
+ # Define a function to process CSV files
+ def process_csv_files(csv_files):
+     documents = []
+     for file_path in csv_files:
+         df = pd.read_csv(file_path)
+         for _, row in df.iterrows():
+             # Combine all columns in the row into a single string
+             row_content = " ".join(row.astype(str))
+             documents.append(Document(page_content=row_content))
+     return documents
+
+ # Define a function to process PDF files
+ def process_pdf_files(pdf_files):
+     documents = []
+     for file_path in pdf_files:
+         reader = PdfReader(file_path)
+         for page in reader.pages:
+             text = page.extract_text()
+             if text:  # Only add non-empty text
+                 documents.append(Document(page_content=text))
+     return documents
+
+ # Define a function to perform vectorization for CSV and PDF files
+ def vectorize_documents():
+     embeddings = HuggingFaceEmbeddings()
+
+     # Directory containing the files to index
+     data_directory = "Data"  # Replace with your folder name
+     csv_files = glob.glob(os.path.join(data_directory, "*.csv"))
+     pdf_files = glob.glob(os.path.join(data_directory, "*.pdf"))
+
+     # Process CSV and PDF files
+     documents = process_csv_files(csv_files) + process_pdf_files(pdf_files)
+
+     # Split the documents into overlapping chunks
+     text_splitter = CharacterTextSplitter(
+         chunk_size=2000,
+         chunk_overlap=500
+     )
+
+     text_chunks = text_splitter.split_documents(documents)
+
+     # Process text chunks in batches
+     batch_size = 5000  # Chroma's max batch size is 5461; 5000 leaves headroom
+     for i in range(0, len(text_chunks), batch_size):
+         batch = text_chunks[i:i + batch_size]
+
+         # Store the batch in the Chroma vector DB; repeated calls with the same
+         # persist_directory append to the same persisted collection
+         vectordb = Chroma.from_documents(
+             documents=batch,
+             embedding=embeddings,
+             persist_directory="vector_db_dir"
+         )
+
+     print("Documents Vectorized and saved in VectorDB")
+
+ # Expose embeddings at module level so other modules can import them if needed
+ embeddings = HuggingFaceEmbeddings()
+
+ # Main guard to prevent execution on import
+ if __name__ == "__main__":
+     vectorize_documents()
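
A usage sketch for the persisted store: reopen vector_db_dir for retrieval. The query string and k value are illustrative; the constructor and similarity_search call match the langchain-chroma API pinned in requirements.txt:

from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

# Reopen the collection written by vectorize_documents().
vectordb = Chroma(
    persist_directory="vector_db_dir",
    embedding_function=HuggingFaceEmbeddings(),
)

# Illustrative query; k caps the number of returned chunks.
for doc in vectordb.similarity_search("admission requirements", k=3):
    print(doc.page_content[:200])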