import os.path
import sys
import time

import pandas as pd
import yaml
from tqdm import tqdm

import chromadb
from langchain.vectorstores import Chroma
from openai import OpenAI

sys.path.append("..")

# Read YAML config file
with open("config.yaml", "r") as stream:
    CONFIG = yaml.safe_load(stream)


# Wrapper for DeepInfra embedding generation
class DeepInfraEmbeddings:
    def __init__(self, api_key, base_url, model="BAAI/bge-large-en-v1.5"):
        """Initialise the client used to access the embedding model.

        Args:
            api_key (str): DeepInfra API key.
            base_url (str): URL of the OpenAI-compatible embeddings endpoint.
            model (str, optional): Embedding model producing 1024-dimensional
                vectors. Defaults to "BAAI/bge-large-en-v1.5".
        """
        self.client = OpenAI(api_key=api_key, base_url=base_url)
        self.model = model

    def embed_documents(self, texts):
        """Convert the given input texts to their corresponding embeddings.

        Args:
            texts (str | list[str]): A single document or a list of documents.

        Returns:
            list: A list of embedding vectors, one per input document.
        """
        if isinstance(texts, str):
            texts = [texts]
        embeddings = self.client.embeddings.create(
            model=self.model,
            input=texts,
            encoding_format="float",
        )
        return [embedding.embedding for embedding in embeddings.data]

    def embed_query(self, text):
        return self.embed_documents([text])[0]


# Create a local ChromaDB client with persistent storage
client = chromadb.PersistentClient(path=os.path.join(os.getcwd(), "vector_stores"))

# Load the source data
file_path = os.path.join(CONFIG["DATA_PATH"])
df = pd.read_csv(file_path)
metadatas = [{"source": int(df.iloc[i, 0]), "row": i} for i in range(len(df))]
docs = df.apply(lambda x: x.to_json(), axis=1).tolist()

# Initialize DeepInfraEmbeddings with your API key and base URL
embeddings = DeepInfraEmbeddings(
    api_key=CONFIG["API_KEY"],
    base_url=CONFIG["BASE_URL"],
)

# Create the Chroma collection
vector_store = Chroma(
    collection_name=CONFIG["COLLECTION_NAME"],
    embedding_function=embeddings,  # Pass the DeepInfraEmbeddings instance
    client=client,
    persist_directory=os.path.join(os.getcwd(), "vector_stores"),
)

# Store the documents in the vector store in chunks, retrying each chunk
# up to MAX_RETRIES times before skipping it.
CHUNK_SIZE = 32
MAX_RETRIES = 5
for i in tqdm(range(0, len(docs), CHUNK_SIZE)):
    chunk = docs[i:i + CHUNK_SIZE]
    for attempt in range(MAX_RETRIES + 1):
        try:
            vector_store.add_texts(
                texts=chunk,
                metadatas=metadatas[i:i + CHUNK_SIZE],
                # ids must match the chunk length, which may be shorter
                # than CHUNK_SIZE for the final chunk
                ids=[str(x) for x in range(i, i + len(chunk))],
            )
            break
        except Exception as e:
            print(i, e)
            if attempt == MAX_RETRIES:
                print(f"Failed to add documents at index {i} after {MAX_RETRIES} retries. Skipping...")
                break
            time.sleep(1)
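
# A minimal sketch of querying the populated store, assuming the ingestion
# above completed. The query string below is a hypothetical example.
# Chroma's similarity_search() embeds the query via embed_query() and
# returns the k nearest documents with their metadata.
results = vector_store.similarity_search("example query about the dataset", k=3)
for doc in results:
    print(doc.metadata["row"], doc.page_content[:80])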