# Source: Hugging Face Space (status when captured: Sleeping)
import sys | |
sys.path.append("..") | |
import os.path | |
import pandas as pd | |
import time | |
from tqdm import tqdm | |
import chromadb | |
from langchain.vectorstores import Chroma | |
from langchain.embeddings.openai import OpenAIEmbeddings | |
from openai import OpenAI | |
from openai import OpenAI | |
import yaml | |
# Read YAML file
# The encoding is given explicitly so that parsing does not depend on the
# platform's default text encoding.
with open("config.yaml", encoding="utf-8") as stream:
    # safe_load only builds plain Python objects (no arbitrary object construction).
    CONFIG = yaml.safe_load(stream)
# Wrapper for DeepInfraEmbeddings generation
class DeepInfraEmbeddings:
    """Embedding client backed by an OpenAI-compatible embeddings endpoint.

    Provides ``embed_documents`` and ``embed_query`` so an instance can be
    handed to a vector store as its embedding function.
    """

    def __init__(self, api_key, base_url, model="BAAI/bge-large-en-v1.5"):
        """Initialise the client used to access the embedding model.

        Args:
            api_key (str): Deep-Infra API key.
            base_url (str): URL to access the embeddings.
            model (str, optional): 1024 dimension embeddings.
                Defaults to "BAAI/bge-large-en-v1.5".
        """
        self.client = OpenAI(api_key=api_key, base_url=base_url)
        self.model = model

    def embed_documents(self, texts):
        """Convert the given texts to their corresponding embeddings.

        Args:
            texts (str | Iterable[str]): One text, or an iterable of texts.
                A single string is treated as a one-element batch.

        Returns:
            list: One embedding per input text, in the order returned by the
            endpoint. An empty input yields an empty list without any
            request being sent.
        """
        if isinstance(texts, str):
            texts = [texts]
        else:
            # Materialise tuples/generators so the request gets a plain list.
            texts = list(texts)

        # Nothing to embed: skip the network round-trip entirely.
        if not texts:
            return []

        response = self.client.embeddings.create(
            model=self.model,
            input=texts,
            encoding_format="float",
        )
        return [item.embedding for item in response.data]

    def embed_query(self, text):
        """Embed a single query string.

        Args:
            text (str): The query text.

        Returns:
            list: The embedding of ``text``.
        """
        return self.embed_documents([text])[0]
# CREATE A LOCAL CHROMA_DB WITH PERSISTENT STORAGE
# The storage directory lives under the current working directory; it is
# computed once and reused for both the client and the Chroma wrapper.
persist_dir = os.path.join(os.getcwd(), 'vector_stores')
client = chromadb.PersistentClient(path=persist_dir)

# LOAD THE DATA_PATH
file_path = CONFIG["DATA_PATH"]
df = pd.read_csv(file_path)

# One metadata dict per CSV row: 'source' is the first column's value cast to
# int, 'row' is the zero-based row position. The first column is selected by
# position with iloc, which avoids the deprecated integer-as-position lookup
# on a label-indexed row Series and avoids building a Series per row.
metadatas = [
    {'source': int(source), 'row': row}
    for row, source in enumerate(df.iloc[:, 0])
]

# Each row is serialised to a JSON string; these strings are the documents
# that get embedded.
docs = df.apply(lambda x: x.to_json(), axis=1).tolist()

# Initialize DeepInfraEmbeddings with your API key and base URL
embeddings = DeepInfraEmbeddings(
    api_key=CONFIG["API_KEY"],
    base_url=CONFIG["BASE_URL"],
)

# Create Chroma collection
vector_store = Chroma(
    collection_name=CONFIG["COLLECTION_NAME"],
    embedding_function=embeddings,  # Pass the DeepInfraEmbeddings instance
    client=client,
    persist_directory=persist_dir,
)
# Store the processed embeddings into the vector_store in chunks
CHUNK_SIZE = 32
MAX_RETRIES = 5          # retries allowed per chunk after the first attempt
RETRY_DELAY_SECONDS = 1  # pause between attempts on the same chunk

# Number of failed attempts per chunk, keyed by the chunk's start index.
# Only chunks that failed at least once appear here.
retries_dict = {}

for start in tqdm(range(0, len(docs), CHUNK_SIZE)):
    # Clamp the end so the final (shorter) chunk gets exactly one id per
    # document instead of CHUNK_SIZE ids.
    end = min(start + CHUNK_SIZE, len(docs))

    # A `for` loop variable cannot be rewound, so retries of the same chunk
    # run in this inner loop until the chunk succeeds or is given up on.
    while True:
        try:
            vector_store.add_texts(
                texts=docs[start:end],
                metadatas=metadatas[start:end],
                ids=[str(x) for x in range(start, end)],
            )
            break
        except Exception as e:
            # Best-effort ingestion: report the failure, retry a bounded
            # number of times, then skip the chunk rather than abort the run.
            print(start, e)
            retries_dict[start] = retries_dict.get(start, 0) + 1
            if retries_dict[start] > MAX_RETRIES:
                print(f"Failed to add documents at index {start} after {MAX_RETRIES} retries. Skipping...")
                break
            time.sleep(RETRY_DELAY_SECONDS)