# Soil-Crop-Recommendation-Ai / vectorize_documents.py
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain_community.document_loaders import DirectoryLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
# Define a function to perform vectorization
def vectorize_documents():
    # Load the embedding model (defaults to a sentence-transformers model)
    embeddings = HuggingFaceEmbeddings()

    # Load every PDF in the Data directory
    loader = DirectoryLoader(
        path="Data",
        glob="*.pdf",
        loader_cls=UnstructuredFileLoader
    )
    documents = loader.load()

    # Split the text and create overlapping chunks of these documents
    text_splitter = CharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=500
    )
    text_chunks = text_splitter.split_documents(documents)

    # Embed the chunks and store them in a persistent Chroma vector DB
    vectordb = Chroma.from_documents(
        documents=text_chunks,
        embedding=embeddings,
        persist_directory="soil_vectordb"
    )
    print("Documents vectorized and saved in VectorDB")
# Module-level embeddings instance, exposed so other modules can import
# it without re-running the full vectorization
embeddings = HuggingFaceEmbeddings()
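
# A minimal query sketch (an assumption about downstream use, not part of
# the original script): reopen the persisted store with the same embedding
# model and retrieve the chunks most similar to a question. The name
# `example_query` and the sample arguments are hypothetical.
def example_query(question: str, k: int = 3):
    vectordb = Chroma(
        persist_directory="soil_vectordb",
        embedding_function=embeddings,  # reuse the module-level instance
    )
    # similarity_search returns the k chunks closest to the question
    return vectordb.similarity_search(question, k=k)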
# Main guard to prevent execution on import
if __name__ == "__main__":
vectorize_documents()