|
from langchain_community.document_loaders import UnstructuredFileLoader
|
|
from langchain_community.document_loaders import DirectoryLoader
|
|
from langchain_text_splitters import CharacterTextSplitter
|
|
from langchain_huggingface import HuggingFaceEmbeddings
|
|
from langchain_chroma import Chroma
|
|
|
|
|
|
|
|
def vectorize_documents(
    *,
    data_dir: str = "cv_data",
    persist_directory: str = "cv_vectordb",
    chunk_size: int = 2000,
    chunk_overlap: int = 500,
) -> None:
    """Load PDFs from a directory, chunk them, and store them in Chroma.

    Calling the function with no arguments reproduces the original
    hard-coded configuration.

    Args:
        data_dir: Directory handed to ``DirectoryLoader``; files matching
            ``./*.pdf`` inside it are loaded with ``UnstructuredFileLoader``.
        persist_directory: Value passed to ``Chroma.from_documents`` as
            ``persist_directory``.
        chunk_size: ``chunk_size`` passed to ``CharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` passed to ``CharacterTextSplitter``.

    Returns:
        None. Progress is reported on stdout.
    """
    loader = DirectoryLoader(
        path=data_dir,
        glob="./*.pdf",
        loader_cls=UnstructuredFileLoader,
    )
    documents = loader.load()

    text_splitter = CharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    text_chunks = text_splitter.split_documents(documents)

    # Without any chunks there is nothing to embed; say so instead of
    # printing a success message for a store that received no data.
    if not text_chunks:
        print(f"No PDF content found in {data_dir!r}; nothing was vectorized.")
        return

    # Created only once there is something to embed, so an empty input
    # directory does not pay for loading the embedding model.
    embeddings = HuggingFaceEmbeddings()

    Chroma.from_documents(
        documents=text_chunks,
        embedding=embeddings,
        persist_directory=persist_directory,
    )

    print("Documents Vectorized and saved in VectorDB")
|
|
|
|
|
|
|
|
|
|
# Module-level embedding model instance, created when this module is loaded.
# NOTE(review): vectorize_documents() builds its own local instance and
# nothing visible in this file reads this one -- confirm whether another
# module imports it before considering removal.
embeddings = HuggingFaceEmbeddings()


# Run the ingestion only when this file is executed as a script.
if __name__ == "__main__":
    vectorize_documents()