Spaces:
Runtime error
Runtime error
File size: 2,272 Bytes
785c4f1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 |
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain_community.document_loaders import DirectoryLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
# Define a function to perform vectorization
def vectorize_documents(
    *,
    data_dir: str = "Data",
    glob: str = "./*.pdf",
    persist_directory: str = "soil_vectordb",
    chunk_size: int = 2000,
    chunk_overlap: int = 500,
) -> "Chroma":
    """Load documents from a directory, chunk them, and store them in Chroma.

    Called with no arguments, this behaves as before: files matching
    ``./*.pdf`` under ``Data`` are loaded, split into chunks of 2000
    characters with an overlap of 500, and written to ``soil_vectordb``.

    Args:
        data_dir: Directory passed to ``DirectoryLoader`` as ``path``.
        glob: File pattern passed to ``DirectoryLoader``.
        persist_directory: Directory passed to ``Chroma.from_documents``
            as ``persist_directory``.
        chunk_size: ``chunk_size`` given to ``CharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` given to ``CharacterTextSplitter``.

    Returns:
        The vector store object produced by ``Chroma.from_documents``.
    """
    # A dedicated embeddings instance is created for this run, using the
    # library's default configuration (no model name is supplied).
    embeddings = HuggingFaceEmbeddings()

    loader = DirectoryLoader(
        path=data_dir,
        glob=glob,
        loader_cls=UnstructuredFileLoader,
    )
    documents = loader.load()

    # Split the loaded documents into overlapping chunks.
    text_splitter = CharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    text_chunks = text_splitter.split_documents(documents)

    # Embed the chunks and store them in the Chroma vector DB.
    vectordb = Chroma.from_documents(
        documents=text_chunks,
        embedding=embeddings,
        persist_directory=persist_directory,
    )
    print("Documents Vectorized and saved in VectorDB")

    # Hand the store back instead of discarding it, so callers can query it
    # without reopening the persisted directory.
    return vectordb
# Module-level embeddings object, kept available in case it is needed
# outside vectorize_documents().
embeddings = HuggingFaceEmbeddings()

# Only run the vectorization when this file is executed directly,
# never as a side effect of importing it.
if __name__ == "__main__":
    vectorize_documents()
# # Define a function to perform vectorization
# def vectorize_documents():
# # Loading the embedding model
# embeddings = HuggingFaceEmbeddings()
# loader = DirectoryLoader(
# path="Data",
# glob="./*.pdf",
# loader_cls=UnstructuredFileLoader
# )
# documents = loader.load()
# # Splitting the text and creating chunks of these documents.
# text_splitter = CharacterTextSplitter(
# chunk_size=2000,
# chunk_overlap=500
# )
# text_chunks = text_splitter.split_documents(documents)
# # Store in Chroma vector DB
# vectordb = Chroma.from_documents(
# documents=text_chunks,
# embedding=embeddings,
# persist_directory="vector_db_dir"
# )
# print("Documents Vectorized and saved in VectorDB")
# # Expose embeddings if needed
# embeddings = HuggingFaceEmbeddings()
# # Main guard to prevent execution on import
# if __name__ == "__main__":
# vectorize_documents()