Spaces:
Running
Running
import json
import logging

import streamlit as st
from pinecone import Pinecone

from utils import get_variable

# Module-level logger. The INFO threshold set here suppresses this module's
# logger.debug() messages unless the level is lowered.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Pinecone settings, resolved through the project helper ``get_variable``.
# The values themselves are issued at app.pinecone.io.
PINECONE_KEY = get_variable("PINECONE_API_KEY")
PINE_CONE_ENVIRONMENT = get_variable("PINE_CONE_ENVIRONMENT")
def init_pinecone():
    """Create a Pinecone client and return a handle to the "semsearch" index.

    The client is configured from the module-level ``PINECONE_KEY`` and
    ``PINE_CONE_ENVIRONMENT`` settings.
    """
    client = Pinecone(api_key=PINECONE_KEY, environment=PINE_CONE_ENVIRONMENT)
    index = client.Index("semsearch")
    return index
def _build_metadata_filter(regions, countries):
    """Return the Pinecone metadata filter for the given regions/countries.

    No selection yields ``{}``, a single selection yields that clause on its
    own, and two selections are combined under ``$and``.
    """
    clauses = []
    if regions:
        clauses.append({"region": {"$in": regions}})
    if countries:
        clauses.append({"country": {"$in": countries}})

    if not clauses:
        return {}
    if len(clauses) == 1:
        return clauses[0]
    return {"$and": clauses}


def index_query(xq, top_k, regions=None, countries=None, index_namespace=""):
    """Query the Pinecone index held in the Streamlit session state.

    The index handle is created lazily with ``init_pinecone()`` and cached in
    ``st.session_state.index`` so later calls reuse it.

    Args:
        xq: Query embedding, passed to Pinecone as ``vector``.
        top_k: Maximum number of matches to request.
        regions: Optional list of region values; when non-empty, matches are
            restricted to records whose ``region`` metadata is in the list.
        countries: Optional list of country values; when non-empty, matches
            are restricted to records whose ``country`` metadata is in the
            list.
        index_namespace: Pinecone namespace to search.

    Returns:
        The response object returned by ``Index.query`` (metadata included,
        vector values excluded).
    """
    # None instead of a mutable ``[]`` default; normalise before use.
    regions = [] if regions is None else regions
    countries = [] if countries is None else countries

    logger.debug("Getting companies from countries: %s ", countries)
    metadata_filter = _build_metadata_filter(regions, countries)

    if "index" not in st.session_state:
        st.session_state.index = init_pinecone()

    return st.session_state.index.query(
        vector=xq,
        namespace=index_namespace,
        # Honour the caller's limit (previously hard-coded to 20).
        top_k=top_k,
        filter=metadata_filter,
        include_metadata=True,
        # ``include_values`` is the documented flag for omitting the vectors.
        include_values=False,
    )
def search_index(query, top_k, regions, countries, retriever, index_namespace=""):
    """Embed ``query``, search the Pinecone index and return flattened matches.

    If the first query attempt raises, the cached index handle in
    ``st.session_state`` is rebuilt with ``init_pinecone()`` and the query is
    retried once; a failure of the retry propagates to the caller.

    Args:
        query: Text to search for.
        top_k: Maximum number of matches to request.
        regions: Region values to filter on (empty for no region filter).
        countries: Country values to filter on (empty for no country filter).
        retriever: Object whose ``encode([...])`` result exposes ``tolist()``.
        index_namespace: Pinecone namespace to search.

    Returns:
        A list of dicts with the keys ``score``, ``metadata``, ``id``,
        ``name`` (the match's ``company_name`` metadata) and ``description``
        (empty string when the metadata has no description).

    Raises:
        KeyError: If a match's metadata has no ``company_name`` entry.
    """
    # NOTE(review): encode([query]) is called with a one-element batch, so xq
    # is presumably a nested list -- confirm the index accepts that shape.
    xq = retriever.encode([query]).tolist()

    try:
        xc = index_query(
            xq,
            top_k=top_k,
            regions=regions,
            countries=countries,
            index_namespace=index_namespace,
        )
    except Exception:
        # Force a reload of the cached index handle, then retry once.
        logger.warning(
            "Pinecone query failed; re-creating the index handle and retrying",
            exc_info=True,
        )
        st.session_state.index = init_pinecone()
        xc = index_query(
            xq,
            top_k=top_k,
            regions=regions,
            countries=countries,
            index_namespace=index_namespace,
        )

    results = []
    for match in xc["matches"]:
        metadata = match["metadata"]
        results.append(
            {
                "score": match["score"],
                "metadata": metadata,
                "id": match["id"],
                "name": metadata["company_name"],
                "description": metadata["description"] if "description" in metadata else "",
            }
        )
    return results