import os
import re

import requests
import serpapi
from bs4 import BeautifulSoup
from dotenv import load_dotenv, find_dotenv
from transformers import BartTokenizer

from knowledgebase import create_index, load_retriever

load_dotenv(find_dotenv())
SERPAPI_API_KEY = os.getenv('SERPAPI_API_KEY')
HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
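
# The .env file is expected to define both keys read above (an assumption
# based on the getenv calls; the variable names must match exactly), e.g.:
#   SERPAPI_API_KEY=...
#   HUGGINGFACEHUB_API_TOKEN=...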

def query_pinecone(query, top_k, index, retriever):
    # generate embeddings for the query
    xq = retriever.encode([query], convert_to_tensor=True).tolist()[0]
    # search pinecone index for context passages with the answer
    xc = index.query(vector=xq, top_k=top_k, include_metadata=True)
    return xc
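
# For reference, a sketch (not from the original code) of the response shape
# that query_pinecone returns and format_query below relies on; the exact
# metadata fields depend on how the index was populated in knowledgebase:
#
#   {"matches": [{"id": "...",
#                 "score": 0.87,
#                 "metadata": {"passage_text": "..."}},
#                ...]}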

def format_query(query, context):
    # extract passage_text from Pinecone search result and add the <P> tag
    context = " ".join([f"<P> {m['metadata']['passage_text']}" for m in context['matches']])
    # concatenate the query and context passages
    query = f"Pregunta del usuario: {query} \n Contexto para responder a la pregunta del usuario: {context}"
    return query

def get_question_context(query, top_k):
    # Create the index
    _, index = create_index()
    # Load retriever model
    retriever = load_retriever()
    # search pinecone index for context passages with the answer
    context = query_pinecone(query, top_k, index, retriever)
    # format query with context passages
    query = format_query(query, context)
    return query
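
# Example usage (a sketch; assumes the Pinecone index exists and already
# holds passages, and that the retriever model loads successfully):
#
#   prompt = get_question_context("¿Qué es Pinecone?", top_k=3)
#   # -> "Pregunta del usuario: ¿Qué es Pinecone? \n Contexto para responder
#   #     a la pregunta del usuario: <P> ... <P> ..."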

# Searches Google and extracts the relevant content from the first non-sponsored URL
def google_search_result(query):
    # Make a Google search
    s = serpapi.search(q=query, engine="google", location="Madrid, Spain", hl="es", gl="es", api_key=SERPAPI_API_KEY)
    # Get the first non-ad URL
    url = s["organic_results"][0]["link"]
    # Fetch the page content
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Extract the relevant text from the page
    page_content = soup.get_text()
    page_content = re.sub(r'\n+', ' ', page_content)
    page_content = re.sub(r'\s+', ' ', page_content)
    # Load the BART tokenizer
    tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
    # Tokenize the content to count tokens, truncating to the model's limit
    tokens = tokenizer.encode(page_content, truncation=True, max_length=1000)
    # Decode the tokens back into (truncated) text
    truncated_content = tokenizer.decode(tokens, skip_special_tokens=True)
    # Summarize the page content via the Hugging Face Inference API
    API_URL = "https://api-inference.huggingface.co/models/facebook/bart-large-cnn"
    # Set the API headers
    headers = {"Authorization": "Bearer " + HUGGINGFACEHUB_API_TOKEN}
    # Make a request to the API
    response = requests.post(API_URL, headers=headers, json={"inputs": truncated_content})
    # Get the summary text from the response; on failure the API returns an
    # error dict rather than a list, so check the shape before indexing
    result = response.json()
    if isinstance(result, list) and len(result) > 0:
        return result[0]['summary_text']
    return "No se ha podido obtener un resumen de la página"
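
if __name__ == "__main__":
    # Quick manual check (a sketch, not part of the original file): requires
    # valid SERPAPI_API_KEY and HUGGINGFACEHUB_API_TOKEN in the environment;
    # the query string is only an illustrative example.
    print(google_search_result("noticias inteligencia artificial"))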