import logging

import streamlit as st
from pinecone import Pinecone

from utils import get_variable

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

PINECONE_KEY = get_variable("PINECONE_API_KEY")  # app.pinecone.io
PINE_CONE_ENVIRONMENT = get_variable("PINE_CONE_ENVIRONMENT")  # app.pinecone.io
@st.cache_resource
def init_pinecone():
    # Create a Pinecone client once per session and return a handle
    # to the "semsearch" index.
    pinecone = Pinecone(api_key=PINECONE_KEY, environment=PINE_CONE_ENVIRONMENT)
    return pinecone.Index("semsearch")
def index_query(xq, top_k, regions=None, countries=None, index_namespace=""):
    logger.debug(f"Getting companies from countries: {countries}")
    # Build a Pinecone metadata filter from the optional region/country lists.
    filters = []
    if regions:
        filters.append({"region": {"$in": regions}})
    if countries:
        filters.append({"country": {"$in": countries}})
    if len(filters) == 1:
        filters = filters[0]
    elif len(filters) > 1:
        filters = {"$and": filters}
    else:
        filters = {}
    if "index" not in st.session_state:
        st.session_state.index = init_pinecone()
    xc = st.session_state.index.query(
        vector=xq,
        namespace=index_namespace,
        top_k=top_k,
        filter=filters,
        include_metadata=True,
        include_values=False,
    )
    return xc
def search_index(query, top_k, regions, countries, retriever, index_namespace=""):
    # Encode the query text into a vector with the retriever model.
    xq = retriever.encode([query]).tolist()
    try:
        xc = index_query(xq, top_k=top_k, regions=regions, countries=countries, index_namespace=index_namespace)
    except Exception:
        # The cached index handle may have gone stale; reconnect and retry once.
        pinecone = Pinecone(api_key=PINECONE_KEY, environment=PINE_CONE_ENVIRONMENT)
        st.session_state.index = pinecone.Index("semsearch")
        xc = index_query(xq, top_k=top_k, regions=regions, countries=countries, index_namespace=index_namespace)
    # Flatten the Pinecone matches into plain dicts for the UI layer.
    results = []
    for match in xc["matches"]:
        answer = {"score": match["score"], "metadata": match["metadata"]}
        answer["id"] = match["id"]
        answer["name"] = match["metadata"]["company_name"]
        answer["description"] = match["metadata"].get("description", "")
        results.append(answer)
    return results
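

# Example usage (a sketch, not part of the original file). It assumes the
# retriever is a sentence-transformers model, inferred only from the
# .encode([query]).tolist() call above; the model name, query string, and
# filter values below are hypothetical placeholders.
#
# from sentence_transformers import SentenceTransformer
#
# retriever = SentenceTransformer("all-MiniLM-L6-v2")
# hits = search_index(
#     "renewable energy startups",
#     top_k=10,
#     regions=["Europe"],
#     countries=["Germany"],
#     retriever=retriever,
# )
# for hit in hits:
#     st.write(hit["name"], hit["score"], hit["description"])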