from langchain_text_splitters import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain.docstore.document import Document
import pandas as pd
import os
import glob
from PyPDF2 import PdfReader  # Ensure PyPDF2 is installed
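# Optional: the PyPDF2 project has been superseded by the "pypdf" package;
# "from pypdf import PdfReader" is a drop-in replacement for the usage below
# (same reader.pages / page.extract_text() API) if PyPDF2 gives trouble.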


# Define a function to process CSV files
def process_csv_files(csv_files):
    documents = []
    for file_path in csv_files:
        df = pd.read_csv(file_path)
        for _, row in df.iterrows():
            # Combine all columns in the row into a single string
            row_content = " ".join(row.astype(str))
            documents.append(Document(page_content=row_content))
    return documents


# Define a function to process PDF files
def process_pdf_files(pdf_files):
    documents = []
    for file_path in pdf_files:
        reader = PdfReader(file_path)
        for page in reader.pages:
            text = page.extract_text()
            if text:  # Only add non-empty text
                documents.append(Document(page_content=text))
    return documents


# Define a function to perform vectorization for CSV and PDF files
def vectorize_documents():
    embeddings = HuggingFaceEmbeddings()

    # Directory containing the source files
    data_directory = "Data"  # Replace with your folder name
    csv_files = glob.glob(os.path.join(data_directory, "*.csv"))
    pdf_files = glob.glob(os.path.join(data_directory, "*.pdf"))

    # Process CSV and PDF files
    documents = process_csv_files(csv_files) + process_pdf_files(pdf_files)

    # Split the documents into overlapping chunks
    text_splitter = CharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=500
    )
    text_chunks = text_splitter.split_documents(documents)

    # Process text chunks in batches; Chroma's batch size limit is 5461,
    # so use a slightly smaller size for safety
    batch_size = 5000
    for i in range(0, len(text_chunks), batch_size):
        batch = text_chunks[i:i + batch_size]

        # Store the batch in the Chroma vector DB
        Chroma.from_documents(
            documents=batch,
            embedding=embeddings,
            persist_directory="vector_db_dir"
        )

    print("Documents Vectorized and saved in VectorDB")

# Module-level embeddings instance, exposed for modules that import this file
embeddings = HuggingFaceEmbeddings()

# Main guard to prevent execution on import
if __name__ == "__main__":
    vectorize_documents()
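
# --- Usage sketch ---
# A minimal, illustrative example of querying the persisted store after
# vectorize_documents() has run. It assumes the same default embedding model
# and the "vector_db_dir" directory created above; the query string is
# hypothetical.
#
# from langchain_huggingface import HuggingFaceEmbeddings
# from langchain_chroma import Chroma
#
# vectordb = Chroma(
#     persist_directory="vector_db_dir",
#     embedding_function=HuggingFaceEmbeddings(),
# )
# for doc in vectordb.similarity_search("example query", k=3):
#     print(doc.page_content[:200])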