semsearch / pineconeclient.py
hanch's picture
fix pinecone startup
c9d11ad verified
raw
history blame
2.62 kB
import logging
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Threshold is INFO, so debug() calls made through this logger are not emitted.
logger.setLevel(logging.INFO)
import json
import streamlit as st
from pinecone import Pinecone
from utils import get_variable
# Pinecone connection settings, looked up once at import time through
# utils.get_variable.
# NOTE(review): get_variable is defined outside this file -- presumably it reads
# environment variables or app secrets; confirm against utils.
PINECONE_KEY = get_variable("PINECONE_API_KEY") # app.pinecone.io
PINE_CONE_ENVIRONMENT = get_variable("PINE_CONE_ENVIRONMENT") # app.pinecone.io
@st.cache_resource
def init_pinecone():
    """Build a Pinecone client and return a handle to the "semsearch" index.

    The client is configured from the module-level ``PINECONE_KEY`` and
    ``PINE_CONE_ENVIRONMENT`` values. The function is wrapped in
    ``st.cache_resource``.

    Returns:
        The object returned by ``Pinecone(...).Index("semsearch")``.
    """
    client = Pinecone(api_key=PINECONE_KEY, environment=PINE_CONE_ENVIRONMENT)
    semsearch_index = client.Index("semsearch")
    return semsearch_index
def index_query(xq, top_k, regions=None, countries=None, index_namespace=""):
    """Query the Pinecone index stored in the Streamlit session state.

    Args:
        xq: Query embedding, forwarded to Pinecone as ``vector``.
        top_k: Maximum number of matches to request from the index.
        regions: Optional collection of region values. When non-empty, only
            records whose ``region`` metadata is one of them are matched.
        countries: Optional collection of country values. When non-empty, only
            records whose ``country`` metadata is one of them are matched.
        index_namespace: Pinecone namespace to search in.

    Returns:
        The response of ``index.query``, unmodified.
    """
    # None stands in for "no restriction"; avoids mutable default arguments.
    regions = [] if regions is None else regions
    countries = [] if countries is None else countries

    logger.debug(f"Getting companies from countries: {countries} ")

    # One metadata clause per non-empty restriction.
    clauses = []
    if len(regions) > 0:
        clauses.append({'region': {"$in": regions}})
    if len(countries) > 0:
        clauses.append({'country': {"$in": countries}})

    # A single clause is sent as-is; several are combined with "$and";
    # none yields an empty filter.
    if not clauses:
        filters = {}
    elif len(clauses) == 1:
        filters = clauses[0]
    else:
        filters = {"$and": clauses}

    # Create the index handle on first use within this session.
    if 'index' not in st.session_state:
        st.session_state.index = init_pinecone()

    # top_k is taken from the caller (it used to be hard-coded to 20, which
    # silently ignored the parameter).
    # NOTE(review): the Pinecone query documentation names the flag
    # `include_values`; confirm `include_vectors` is accepted by the installed
    # client before renaming or removing it.
    return st.session_state.index.query(
        vector=xq,
        namespace=index_namespace,
        top_k=top_k,
        filter=filters,
        include_metadata=True,
        include_vectors=False,
    )
def search_index(query, top_k, regions, countries, retriever, index_namespace=""):
    """Embed ``query``, search the index and return flattened match records.

    Args:
        query: Search text.
        top_k: Maximum number of matches, forwarded to ``index_query``.
        regions: Region values to restrict the search to (may be empty).
        countries: Country values to restrict the search to (may be empty).
        retriever: Object whose ``encode([query])`` result has a ``tolist()``
            method; used to produce the query embedding.
            NOTE(review): presumably a sentence-embedding model -- confirm.
        index_namespace: Pinecone namespace to search in.

    Returns:
        A list of dicts, one per match, with the keys ``score``, ``metadata``,
        ``id``, ``name`` (from ``metadata['company_name']``) and
        ``description`` (``""`` when the metadata has none).

    Raises:
        KeyError: If a match's metadata has no ``company_name`` entry.
        Exception: Whatever the second query attempt raises, if the retry
            also fails.
    """
    xq = retriever.encode([query]).tolist()

    try:
        # The namespace is passed on the first attempt too; previously it was
        # only applied on the retry path.
        xc = index_query(
            xq,
            top_k=top_k,
            regions=regions,
            countries=countries,
            index_namespace=index_namespace,
        )
    except Exception:
        logger.warning(
            "Pinecone query failed; rebuilding the index handle and retrying",
            exc_info=True,
        )
        # Force reload: build a fresh client instance and replace the session's
        # index handle. (`Pinecone.init` / `Pinecone.Index` cannot be called on
        # the class itself, so the old recovery path raised instead of
        # recovering.)
        client = Pinecone(api_key=PINECONE_KEY, environment=PINE_CONE_ENVIRONMENT)
        st.session_state.index = client.Index("semsearch")
        xc = index_query(
            xq,
            top_k=top_k,
            regions=regions,
            countries=countries,
            index_namespace=index_namespace,
        )

    results = []
    for match in xc['matches']:
        metadata = match['metadata']
        results.append({
            'score': match['score'],
            'metadata': metadata,
            'id': match['id'],
            'name': metadata['company_name'],
            'description': metadata.get('description', ""),
        })
    return results