# RAG-based-RecSys-using-LLMs / create_vectorStore.py
# Builds a persistent Chroma vector store from a CSV dataset using
# DeepInfra-hosted embeddings.
# (Scraped viewer header converted to a comment: first commit, 337af81, MRK4863.)
import sys
sys.path.append("..")
import os.path
import pandas as pd
import time
from tqdm import tqdm
import chromadb
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from openai import OpenAI
from openai import OpenAI
import yaml
# Load runtime configuration (API key, endpoint URL, data path, collection
# name) from the project-level YAML file.
with open("config.yaml", 'r') as cfg_file:
    CONFIG = yaml.safe_load(cfg_file)
# Wrapper exposing DeepInfra's OpenAI-compatible embedding endpoint through
# the embed_documents/embed_query interface LangChain's Chroma wrapper expects.
class DeepInfraEmbeddings:
    def __init__(self, api_key, base_url, model="BAAI/bge-large-en-v1.5"):
        """Set up an OpenAI-compatible client pointed at DeepInfra.

        Args:
            api_key (str): DeepInfra API key.
            base_url (str): URL of the embeddings endpoint.
            model (str, optional): Embedding model id (1024-dim output).
                Defaults to "BAAI/bge-large-en-v1.5".
        """
        self.model = model
        self.client = OpenAI(api_key=api_key, base_url=base_url)

    def embed_documents(self, texts):
        """Embed one or more documents.

        Args:
            texts (str | list[str]): A single text or a list of texts.

        Returns:
            list: One embedding (list of floats) per input text.
        """
        batch = [texts] if isinstance(texts, str) else texts
        response = self.client.embeddings.create(
            model=self.model,
            input=batch,
            encoding_format="float"
        )
        return [item.embedding for item in response.data]

    def embed_query(self, text):
        """Embed a single query string and return its embedding."""
        return self.embed_documents([text])[0]
# CREATE A LOCAL CHROMA_DB WITH PERSISTENT STORAGE
client = chromadb.PersistentClient(path=os.path.join(os.getcwd(), 'vector_stores'))
# LOAD THE DATA_PATH
file_path = os.path.join(CONFIG["DATA_PATH"])
df = pd.read_csv(file_path)
# One metadata dict per row: 'source' is the first CSV column (assumed to be
# an integer id -- TODO confirm against the dataset), 'row' the position.
# df.iloc[i, 0] replaces the original df.loc[i][0], which relied on the
# deprecated positional fallback of label-based Series indexing and breaks
# when the CSV's index is not exactly 0..n-1.
metadatas = [{'source': int(df.iloc[i, 0]), 'row': i} for i in range(len(df))]
# Serialize each row to a JSON string; these become the stored documents.
docs = df.apply(lambda x: x.to_json(), axis=1).tolist()
# Embedding client for DeepInfra, configured from config.yaml.
embeddings = DeepInfraEmbeddings(
    api_key=CONFIG["API_KEY"],
    base_url=CONFIG["BASE_URL"]
)
# LangChain Chroma collection backed by the persistent client created above.
vector_store = Chroma(
    client=client,
    collection_name=CONFIG["COLLECTION_NAME"],
    embedding_function=embeddings,  # custom DeepInfraEmbeddings instance
    persist_directory=os.path.join(os.getcwd(), 'vector_stores')
)
# Store the processed embeddings into the vector_store in chunks.
#
# NOTE: the original for-loop version rebound the loop variable `i` on
# failure, which has no effect in Python (the next iteration overwrites it),
# so failed chunks were never actually retried. A while-loop makes the retry
# real: a failing chunk is re-attempted up to MAX_RETRIES times, with a 1s
# back-off, before being skipped.
retries_dict = {}
CHUNK_SIZE = 32
MAX_RETRIES = 5
progress = tqdm(total=len(docs))
i = 0
while i < len(docs):
    chunk_end = min(i + CHUNK_SIZE, len(docs))  # clamp the final partial chunk
    try:
        vector_store.add_texts(
            texts=docs[i:chunk_end],
            metadatas=metadatas[i:chunk_end],
            # ids must match the number of texts; the clamped range keeps the
            # last (possibly short) chunk from producing surplus ids.
            ids=[str(x) for x in range(i, chunk_end)]
        )
    except Exception as e:
        print(i, e)
        retries_dict[i] = retries_dict.get(i, 0) + 1
        if retries_dict[i] <= MAX_RETRIES:
            time.sleep(1)
            continue  # re-attempt the same chunk
        print(f"Failed to add documents at index {i} after {MAX_RETRIES} retries. Skipping...")
    progress.update(chunk_end - i)
    i = chunk_end
progress.close()