from knowledgebase import create_index, load_retriever
from bs4 import BeautifulSoup
import requests
import serpapi
import os
import re
from transformers import BartTokenizer
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())
SERPAPI_API_KEY = os.getenv('SERPAPI_API_KEY')
HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')

def query_pinecone(query, top_k, index, retriever):
    # Generate an embedding for the query
    xq = retriever.encode([query], convert_to_tensor=True).tolist()[0]
    # Search the Pinecone index for context passages that may contain the answer
    xc = index.query(vector=xq, top_k=top_k, include_metadata=True)
    return xc

def format_query(query, context):
    # Extract passage_text from each Pinecone match and prepend the <P> tag
    context = " ".join([f"<P> {m['metadata']['passage_text']}" for m in context['matches']])
    # Concatenate the query and the context passages (Spanish prompt:
    # "User question: ... Context to answer the user's question: ...")
    query = f"Pregunta del usuario: {query} \n Contexto para responder a la pregunta del usuario: {context}"
    return query

def get_question_context(query, top_k):
    # Create / connect to the Pinecone index
    _, index = create_index()
    # Load the retriever model
    retriever = load_retriever()
    # Search the Pinecone index for context passages
    context = query_pinecone(query, top_k, index, retriever)
    # Format the query with the retrieved context passages
    query = format_query(query, context)
    return query

# Searches Google and extracts the relevant content from the first non-sponsored URL
def google_search_result(query):
    # Run a Google search through SerpApi
    s = serpapi.search(q=query, engine="google", location="Madrid, Spain", hl="es", gl="es", api_key=SERPAPI_API_KEY)
    # Get the first organic (non-ad) result URL
    url = s["organic_results"][0]["link"]

    # Fetch the page
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract the visible text from the page
    page_content = soup.get_text()

    # Collapse all runs of whitespace (including newlines) into single spaces
    page_content = re.sub(r'\s+', ' ', page_content)

    # Load the BART tokenizer
    tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")

    # Tokenize the content, truncating it to the model's input budget
    tokens = tokenizer.encode(page_content, truncation=True, max_length=1000)

    # Decode the tokens back into (possibly truncated) text
    truncated_content = tokenizer.decode(tokens, skip_special_tokens=True)

    # Summarize the page content with the Hugging Face Inference API
    API_URL = "https://api-inference.huggingface.co/models/facebook/bart-large-cnn"
    # Set the API headers
    headers = {"Authorization": f"Bearer {HUGGINGFACEHUB_API_TOKEN}"}
    # Make a request to the API
    response = requests.post(API_URL, headers=headers, json={"inputs": truncated_content})
    result = response.json()
    # On success the API returns a list of summaries; on failure, a dict with an "error" key
    if isinstance(result, list) and result and "summary_text" in result[0]:
        return result[0]["summary_text"]
    return "No se ha podido obtener un resumen de la página"
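
# Usage sketch (not part of the original module; the question text is an
# illustrative assumption). Shows how the two entry points above might be
# called, assuming knowledgebase.create_index / load_retriever work and the
# SERPAPI_API_KEY / HUGGINGFACEHUB_API_TOKEN variables in .env are set.
if __name__ == "__main__":
    question = "¿Cuáles son los requisitos para renovar el DNI?"  # hypothetical query
    # Build a prompt from the top-3 Pinecone context passages
    prompt = get_question_context(question, top_k=3)
    print(prompt)
    # Alternatively, answer from the web: Google search + BART summary
    print(google_search_result(question))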