from langchain_community.document_loaders import DirectoryLoader, UnstructuredFileLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma


# Define a function to perform vectorization
def vectorize_documents():
    # Load the embedding model
    embeddings = HuggingFaceEmbeddings()

    # Load every PDF in the Data directory
    loader = DirectoryLoader(
        path="Data",
        glob="./*.pdf",
        loader_cls=UnstructuredFileLoader
    )
    documents = loader.load()

    # Split the text and create overlapping chunks of these documents
    text_splitter = CharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=500
    )
    text_chunks = text_splitter.split_documents(documents)

    # Embed the chunks and store them in a persistent Chroma vector DB
    vectordb = Chroma.from_documents(
        documents=text_chunks,
        embedding=embeddings,
        persist_directory="soil_vectordb"
    )
    print("Documents vectorized and saved in the vector DB")


# Expose embeddings if needed
embeddings = HuggingFaceEmbeddings()

# Main guard to prevent execution on import
if __name__ == "__main__":
    vectorize_documents()
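

# A minimal sketch (not part of the original script) showing how the
# persisted "soil_vectordb" store built above could be reloaded and
# queried later. The function name and the sample query are hypothetical;
# Chroma(...) and similarity_search(...) are standard langchain_chroma calls.
def query_vectordb(query, k=3):
    # Reload the same embedding model that was used at indexing time
    embeddings = HuggingFaceEmbeddings()
    # Reopen the persisted vector store from disk
    vectordb = Chroma(
        persist_directory="soil_vectordb",
        embedding_function=embeddings
    )
    # Return the k chunks most similar to the query
    return vectordb.similarity_search(query, k=k)


# Example usage (hypothetical query):
# results = query_vectordb("nitrogen levels in clay soil")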