# Spaces:
# Runtime error
# Runtime error
from langchain_community.document_loaders import UnstructuredFileLoader | |
from langchain_community.document_loaders import DirectoryLoader | |
from langchain_text_splitters import CharacterTextSplitter | |
from langchain_huggingface import HuggingFaceEmbeddings | |
from langchain_chroma import Chroma | |
# Define a function to perform vectorization
def vectorize_documents(
    data_dir: str = "Data",
    glob_pattern: str = "./*.pdf",
    chunk_size: int = 2000,
    chunk_overlap: int = 500,
    persist_directory: str = "soil_vectordb",
) -> None:
    """Load documents, split them into chunks, and store them in Chroma.

    Called with no arguments this behaves exactly like the original
    hard-coded version: it reads ``./*.pdf`` from ``Data`` and writes the
    vector store to ``soil_vectordb``.

    Args:
        data_dir: Directory handed to ``DirectoryLoader`` as ``path``.
        glob_pattern: Glob handed to ``DirectoryLoader`` to select files.
        chunk_size: ``chunk_size`` passed to ``CharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` passed to ``CharacterTextSplitter``.
        persist_directory: Directory passed to ``Chroma.from_documents`` as
            ``persist_directory``.
    """
    # Embedding model with the library's default configuration.
    embeddings = HuggingFaceEmbeddings()

    loader = DirectoryLoader(
        path=data_dir,
        glob=glob_pattern,
        loader_cls=UnstructuredFileLoader,
    )
    documents = loader.load()

    # Splitting the text and creating chunks of these documents.
    text_splitter = CharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    text_chunks = text_splitter.split_documents(documents)

    # Store in Chroma vector DB. The returned store object is not needed
    # here, so it is not bound to a name.
    Chroma.from_documents(
        documents=text_chunks,
        embedding=embeddings,
        persist_directory=persist_directory,
    )
    print("Documents Vectorized and saved in VectorDB")
# Expose embeddings if needed.
# NOTE: this instance is created at import time, independently of the one
# built inside vectorize_documents().
embeddings = HuggingFaceEmbeddings()

# Main guard to prevent execution on import
if __name__ == "__main__":
    vectorize_documents()
# --- Commented-out copy of the script, kept for reference ---
# # Define a function to perform vectorization
# def vectorize_documents():
#     # Loading the embedding model
#     embeddings = HuggingFaceEmbeddings()
#     loader = DirectoryLoader(
#         path="Data",
#         glob="./*.pdf",
#         loader_cls=UnstructuredFileLoader
#     )
#     documents = loader.load()
#     # Splitting the text and creating chunks of these documents.
#     text_splitter = CharacterTextSplitter(
#         chunk_size=2000,
#         chunk_overlap=500
#     )
#     text_chunks = text_splitter.split_documents(documents)
#     # Store in Chroma vector DB
#     vectordb = Chroma.from_documents(
#         documents=text_chunks,
#         embedding=embeddings,
#         persist_directory="vector_db_dir"
#     )
#     print("Documents Vectorized and saved in VectorDB")
# # Expose embeddings if needed
# embeddings = HuggingFaceEmbeddings()
# # Main guard to prevent execution on import
# if __name__ == "__main__":
#     vectorize_documents()