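# Spaces: Runtime error
# NOTE(review): the line above appears to be a status banner picked up from the
# hosting page when this file was captured; it is not part of the program.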
import os | |
import pinecone | |
import time | |
import yaml | |
import pandas as pd | |
from langchain.document_loaders import DataFrameLoader | |
from langchain.embeddings import HuggingFaceEmbeddings | |
from langchain.vectorstores.pinecone import Pinecone | |
from typing import List | |
from dotenv import load_dotenv | |
from pathlib import Path | |
class PinceconeIndex:
    """Pair a named Pinecone index with a HuggingFace embedding model.

    The instance stores the index name and the embedding model; the Pinecone
    client itself is only initialised when :meth:`connect_index` is called.
    """

    # Env file consulted when PINECONE_KEY / PINECONE_ENV are not already set
    # (the original comment says this is for running inside Google Colab).
    _COLAB_DOTENV_PATH = Path("/content/gt-policy-bot/config.env")

    def __init__(self, index_name: str, model_name: str):
        """Store the index name and build the embedding model.

        Args:
            index_name: Name of the Pinecone index to operate on.
            model_name: Passed to ``HuggingFaceEmbeddings(model_name=...)``.
        """
        self.index_name = index_name
        self._embeddingModel = HuggingFaceEmbeddings(model_name=model_name)
        # Set by connect_index(); defined up front so the attribute always
        # exists, even before a connection has been made.
        self._index = None

    def _load_credentials(self):
        """Return ``(api_key, environment)`` read from the process environment.

        If either ``PINECONE_KEY`` or ``PINECONE_ENV`` is unset, the Colab env
        file is loaded first (when it exists).

        Raises:
            RuntimeError: If either variable is still missing or empty.
        """
        if not (os.getenv("PINECONE_KEY") and os.getenv("PINECONE_ENV")):
            # load_dotenv() does nothing for a missing file, so guarding on
            # is_file() keeps the behaviour and avoids a pointless call.
            if self._COLAB_DOTENV_PATH.is_file():
                load_dotenv(dotenv_path=self._COLAB_DOTENV_PATH)

        api_key = os.getenv("PINECONE_KEY")
        environment = os.getenv("PINECONE_ENV")

        missing = [
            name
            for name, value in (
                ("PINECONE_KEY", api_key),
                ("PINECONE_ENV", environment),
            )
            if not value
        ]
        if missing:
            # Fail here with a clear message instead of handing None to
            # pinecone.init() and failing later on the first API call.
            raise RuntimeError(
                "Missing Pinecone configuration: "
                + ", ".join(missing)
                + f" (set the environment variable(s) or provide {self._COLAB_DOTENV_PATH})"
            )
        return api_key, environment

    def connect_index(self, embedding_dimension: int, delete_existing: bool = False):
        """Initialise the Pinecone client and make sure the index exists.

        Args:
            embedding_dimension: Dimension used if the index has to be created.
            delete_existing: When true, an existing index with this name is
                deleted first and then re-created.

        Raises:
            RuntimeError: If the Pinecone API key or environment is not
                configured.
        """
        index_name = self.index_name

        api_key, environment = self._load_credentials()
        pinecone.init(api_key=api_key, environment=environment)

        if index_name in pinecone.list_indexes() and delete_existing:
            pinecone.delete_index(index_name)

        # Listed again on purpose: the index may just have been deleted above.
        if index_name not in pinecone.list_indexes():
            pinecone.create_index(index_name, dimension=embedding_dimension)

        index = pinecone.Index(index_name)
        # Return value is unused; call retained from the original, presumably
        # as a check that the index can be described -- TODO confirm.
        pinecone.describe_index(index_name)
        self._index = index

    def upsert_docs(self, df: pd.DataFrame, text_col: str):
        """Load *df* as documents and pass them to ``Pinecone.from_documents``.

        Args:
            df: Source data frame.
            text_col: Column used as each document's page content.
        """
        loader = DataFrameLoader(df, page_content_column=text_col)
        docs = loader.load()
        Pinecone.from_documents(docs, self._embeddingModel, index_name=self.index_name)

    def get_embedding_model(self):
        """Return the embedding model created in ``__init__``."""
        return self._embeddingModel

    def get_index_name(self):
        """Return the name of the index this instance operates on."""
        return self.index_name

    def query(self, query: str, top_k: int = 5) -> List[str]:
        """Run a similarity search and return the matching page contents.

        Args:
            query: Text to search for.
            top_k: Passed as ``k`` to ``similarity_search``.

        Returns:
            The ``page_content`` of each document returned by the search.
        """
        docsearch = Pinecone.from_existing_index(self.index_name, self._embeddingModel)
        res = docsearch.similarity_search(query, k=top_k)
        return [doc.page_content for doc in res]
def main(config_path: str = "config.yml") -> None:
    """Build the Pinecone index from the configured chunk file, then query it.

    Reads the YAML config at *config_path*, loads the chunk CSV selected by
    ``paths.chunking``, (re)creates the index and upserts the chunks, prints
    how long that took, and finally runs one sample query and prints the
    result.

    Args:
        config_path: Path of the YAML configuration file.
    """
    with open(config_path, "r", encoding="utf-8") as file:
        config = yaml.safe_load(file)
    print(config)

    index_name = config["pinecone"]["index-name"]
    embedding_model = config["sentence-transformers"]["model-name"]
    embedding_dimension = config["sentence-transformers"]["embedding-dimension"]
    # The indexing pass always starts from a fresh index.
    delete_existing = True

    if config["paths"]["chunking"] == "manual":
        print("Using manual chunking")
        file_path_embedding = config["paths"]["manual_chunk_file"]
        # Read with header=None: every row becomes a value in the "chunks" column.
        df = pd.read_csv(file_path_embedding, header=None, names=["chunks"])
    else:
        print("Using automatic chunking")
        file_path_embedding = config["paths"]["auto_chunk_file"]
        df = pd.read_csv(file_path_embedding, index_col=0)
    print(df)

    start_time = time.time()
    index = PinceconeIndex(index_name, embedding_model)
    index.connect_index(embedding_dimension, delete_existing)
    index.upsert_docs(df, "chunks")
    end_time = time.time()
    print(f"Indexing took {end_time - start_time} seconds")

    # Connect again without deleting, so the query runs against the index
    # that was just populated.
    index = PinceconeIndex(index_name, embedding_model)
    index.connect_index(embedding_dimension, delete_existing=False)
    query = "When was the student code of conduct last revised?"
    res = index.query(query, top_k=5)
    # The result length is not asserted (the original check was disabled).
    print(res)


if __name__ == "__main__":
    main()