Spaces:

arpita-23
/

Shlok_Summerizer

Sleeping

App Files Files Community

Shlok_Summerizer / vectorize_documents.py

arpita-23

Upload 4 files

6df9459 verified 3 months ago

raw

history blame

1.89 kB

	from langchain_text_splitters import CharacterTextSplitter
	from langchain_huggingface import HuggingFaceEmbeddings
	from langchain_chroma import Chroma
	from langchain.docstore.document import Document
	import pandas as pd
	import os
	import glob

	# Define a function to perform vectorization for multiple CSV files
	def vectorize_documents(
	    csv_directory="Data",
	    persist_directory="vector_db_dir",
	    batch_size=5000,
	):
	    """Embed every row of the CSV files in a folder and persist them in Chroma.

	    Each CSV row becomes one ``Document`` whose text is the row's values,
	    converted to strings and joined by single spaces. The documents are
	    split with ``CharacterTextSplitter`` (chunk size 2000, overlap 500) and
	    written to a persistent Chroma store in batches.

	    Args:
	        csv_directory: Folder searched (non-recursively) for ``*.csv`` files.
	        persist_directory: Directory in which Chroma persists its data.
	        batch_size: Maximum number of chunks added to Chroma per call. The
	            default of 5000 is kept from the original code, whose comment
	            gives Chroma's batch limit as 5461.

	    Returns:
	        None. Progress is reported on stdout.

	    Raises:
	        ValueError: If ``batch_size`` is smaller than 1.
	    """
	    if batch_size < 1:
	        raise ValueError("batch_size must be a positive integer")

	    # Sorted so the ingestion order is reproducible; glob order is arbitrary.
	    csv_files = sorted(glob.glob(os.path.join(csv_directory, "*.csv")))

	    documents = []
	    for file_path in csv_files:
	        df = pd.read_csv(file_path)
	        # Combine all columns of each row into a single string.
	        documents.extend(
	            Document(page_content=" ".join(row.astype(str)))
	            for _, row in df.iterrows()
	        )

	    # Split the row documents into chunks (only when there is something to split).
	    text_chunks = []
	    if documents:
	        text_splitter = CharacterTextSplitter(
	            chunk_size=2000,
	            chunk_overlap=500,
	        )
	        text_chunks = text_splitter.split_documents(documents)

	    if not text_chunks:
	        # Do not load the embedding model, touch the store, or report
	        # success when there is nothing to index.
	        print(f"No CSV rows found in {csv_directory!r}; nothing was vectorized")
	        return

	    embeddings = HuggingFaceEmbeddings()

	    # Open the persistent store once and append each batch to it, instead of
	    # rebuilding a store object via Chroma.from_documents() for every batch.
	    vectordb = Chroma(
	        embedding_function=embeddings,
	        persist_directory=persist_directory,
	    )
	    for start in range(0, len(text_chunks), batch_size):
	        vectordb.add_documents(text_chunks[start:start + batch_size])

	    print("Documents Vectorized and saved in VectorDB")

	# Module-level embeddings object, constructed with no arguments (library
	# defaults) every time this module is imported or run. It is a separate
	# instance from the one built inside vectorize_documents().
	# NOTE(review): presumably kept so other modules can import `embeddings`;
	# no importer is visible in this file -- confirm before removing.
	embeddings = HuggingFaceEmbeddings()



	# Script entry point: build the vector store only when this file is run
	# directly, never as a side effect of importing it.
	if __name__ == "__main__":
	    vectorize_documents()