from langchain_text_splitters import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_core.documents import Document
import pandas as pd
import os
import glob

def vectorize_documents():
    # Embedding model; HuggingFaceEmbeddings defaults to
    # sentence-transformers/all-mpnet-base-v2
    embeddings = HuggingFaceEmbeddings()

    # Collect every CSV file in the Data directory
    csv_directory = "Data"
    csv_files = glob.glob(os.path.join(csv_directory, "*.csv"))

    documents = []

    # Turn each CSV row into one Document whose text is the
    # space-joined string of all column values
    for file_path in csv_files:
        df = pd.read_csv(file_path)
        for _, row in df.iterrows():
            row_content = " ".join(row.astype(str))
            documents.append(Document(page_content=row_content))

    # Split long documents into overlapping chunks
    text_splitter = CharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=500
    )
    text_chunks = text_splitter.split_documents(documents)

    # Embed and persist in batches: Chroma caps the number of records
    # accepted per insert, so large corpora are written in slices.
    # Each call appends to the same persisted collection in House_vectordb.
    batch_size = 5000
    for i in range(0, len(text_chunks), batch_size):
        batch = text_chunks[i:i + batch_size]
        Chroma.from_documents(
            documents=batch,
            embedding=embeddings,
            persist_directory="House_vectordb"
        )

    print("Documents vectorized and saved in House_vectordb")
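

# A minimal usage sketch, not part of the original script: it shows how
# the persisted store could be reloaded and queried later. The helper name
# `query_vectordb`, its parameters, and the default k are illustrative
# assumptions; Chroma(...) and similarity_search(...) are the
# langchain_chroma APIs already used above.
def query_vectordb(query: str, k: int = 4):
    # Reload the persisted collection with the same embedding model
    # that was used at index time.
    embeddings = HuggingFaceEmbeddings()
    vectordb = Chroma(
        persist_directory="House_vectordb",
        embedding_function=embeddings,
    )
    # Return the k chunks closest to the query in embedding space.
    return vectordb.similarity_search(query, k=k)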


if __name__ == "__main__":
    vectorize_documents()