Spaces:

vcasas
/

llama-index_tests

Runtime error

App Files Files Community

vcasas committed on Dec 8, 2024

Commit

87d2c80

verified ·

1 Parent(s): 9ef2fec

Update app.py

Browse files

Files changed (1) hide show

app.py +79 -49

app.py CHANGED Viewed

@@ -1,59 +1,89 @@
 import os
 import requests
-from llama_index.core import VectorStoreIndex, Settings, Document
-from llama_index.readers.file import PDFReader
 import gradio as gr
-from llama_index.embeddings.huggingface import HuggingFaceEmbedding
-# Disable the default LLM
-Settings.llm = None
# Download and save the PDF
def download_pdf(url, destination):
    """Download the file at ``url`` and write it to ``destination``.

    Args:
        url: HTTP(S) address of the PDF.
        destination: File path to write. Missing parent directories are
            created first.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
    """
    # A bare file name has an empty dirname, and os.makedirs("") raises.
    parent_dir = os.path.dirname(destination)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    # Bound the wait so a stalled server cannot hang application start-up.
    response = requests.get(url, timeout=60)
    # Fail loudly instead of saving an HTML error page under a .pdf name.
    response.raise_for_status()
    with open(destination, 'wb') as f:
        f.write(response.content)
# Build an index from a PDF file
def create_index_from_pdf(pdf_path, model_name='nlpaueb/legal-bert-base-uncased'):
    """Load a PDF, index it with a HuggingFace embedding model, and return a query engine.

    Args:
        pdf_path: Path of the PDF handed to ``PDFReader.load_data``.
        model_name: Name passed to ``HuggingFaceEmbedding``.

    Returns:
        The object produced by ``index.as_query_engine`` with
        ``similarity_top_k=5`` and ``response_mode="simple"``.
    """
    # Read the PDF content as documents.
    documents = PDFReader().load_data(file=pdf_path)
    # Embed with the requested model rather than the library default.
    embedder = HuggingFaceEmbedding(model_name=model_name)
    index = VectorStoreIndex.from_documents(documents, embed_model=embedder)
    return index.as_query_engine(similarity_top_k=5, response_mode="simple")
# Search function
def search_pdf(query):
    """Run ``query`` through the module-level ``query_engine`` and return the result's ``response`` attribute."""
    return query_engine.query(query).response
# Configuration: local cache path and source URL of the PDF
pdf_path = './BOE-A-1995-25444-consolidado.pdf'
pdf_url = 'https://www.boe.es/buscar/pdf/1995/BOE-A-1995-25444-consolidado.pdf'

# Download the PDF only when no local copy exists
if not os.path.exists(pdf_path):
    download_pdf(pdf_url, pdf_path)

# Build the semantic index; search_pdf reads this module-level name
query_engine = create_index_from_pdf(pdf_path)

# Gradio interface: one text box in, one text box out
gr.Interface(
    title="Búsqueda en Código Penal PDF",
    description="Sube el Código Penal o pregunta directamente por información específica.",
    fn=search_pdf,
    inputs="text",
    outputs="text",
).launch()

 import os
 import requests
+import re
+from PyPDF2 import PdfReader
+from sentence_transformers import SentenceTransformer, util
 import gradio as gr
# 1. Download the PDF
def download_pdf(url, destination):
    """Download a PDF from a URL and save it at the given path.

    Args:
        url: HTTP(S) address of the PDF.
        destination: File path to write. Missing parent directories are
            created first.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
    """
    # A bare file name has an empty dirname, and os.makedirs("") raises.
    parent_dir = os.path.dirname(destination)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    # Bound the wait so a stalled server cannot hang application start-up.
    response = requests.get(url, timeout=60)
    # Fail loudly instead of saving an HTML error page under a .pdf name.
    response.raise_for_status()
    with open(destination, 'wb') as f:
        f.write(response.content)
# 2. Extract the articles from the PDF
def extract_articles_from_pdf(pdf_path):
    """Split the text of the PDF into articles keyed by their heading line.

    Args:
        pdf_path: Path of the PDF handed to ``PdfReader``.

    Returns:
        A dict mapping the first line of each matched article (for example
        "Artículo 138.") to its remaining lines, stripped and joined with
        single spaces. If the same heading line occurs more than once, the
        last occurrence wins.
    """
    reader = PdfReader(pdf_path)
    # Join once instead of growing a string page by page; ``or ""`` keeps a
    # page without extractable text from breaking the concatenation.
    text = "".join(page.extract_text() or "" for page in reader.pages)
    # An article runs from its "Artículo N." heading up to the next such
    # heading or the end of the text; DOTALL lets the lazy body span newlines.
    # NOTE(review): headings with a suffix (e.g. "Artículo 31 bis.") do not
    # match this pattern and stay inside the preceding article; confirm
    # whether that is intended.
    article_pattern = r'(Artículo \d+\..*?)(?=Artículo \d+\.|$)'
    articles = {}
    for match in re.findall(article_pattern, text, re.DOTALL):
        title, *body_lines = match.strip().split("\n")
        articles[title.strip()] = " ".join(line.strip() for line in body_lines).strip()
    return articles
# 3. Build embeddings for the articles
def create_article_embeddings(articles, model_name="paraphrase-multilingual-mpnet-base-v2"):
    """Encode every article body with a SentenceTransformer model.

    Args:
        articles: Mapping of article heading to article text.
        model_name: Name passed to ``SentenceTransformer``.

    Returns:
        A tuple ``(article_keys, article_embeddings, model)``: the headings
        as a list, the result of encoding the texts in the same order with
        ``convert_to_tensor=True``, and the loaded model.
    """
    encoder = SentenceTransformer(model_name)
    headings = list(articles)
    bodies = list(articles.values())
    vectors = encoder.encode(bodies, convert_to_tensor=True)
    return headings, vectors, encoder
# 4. Find the relevant article
def find_article(question, article_keys, article_embeddings, model, articles):
    """Return the heading and text of the article scoring highest against the question.

    Args:
        question: Text to encode with ``model``.
        article_keys: Article headings, in the same order as ``article_embeddings``.
        article_embeddings: Encoded article texts.
        model: Object whose ``encode`` method embeds the question.
        articles: Mapping of heading to article text.

    Returns:
        The best-scoring heading, a newline, and that article's text.
    """
    query_vector = model.encode(question, convert_to_tensor=True)
    similarity = util.pytorch_cos_sim(query_vector, article_embeddings)
    # argmax picks the position of the highest score across all articles.
    top_key = article_keys[similarity.argmax()]
    top_text = articles[top_key]
    return f"{top_key}\n{top_text}"
# Main flow
def main():
    """Fetch the PDF if needed, index its articles, and launch the Gradio search UI."""
    # Initial configuration
    source_url = 'https://www.boe.es/buscar/pdf/1995/BOE-A-1995-25444-consolidado.pdf'
    local_pdf = './BOE-A-1995-25444-consolidado.pdf'

    # Download only when there is no local copy yet
    if not os.path.exists(local_pdf):
        print("Descargando el Código Penal...")
        download_pdf(source_url, local_pdf)

    # Extract and process the articles
    print("Extrayendo artículos del Código Penal...")
    articles = extract_articles_from_pdf(local_pdf)

    # Build embeddings for the articles
    print("Creando embeddings para los artículos...")
    keys, embeddings, encoder = create_article_embeddings(articles)

    # Closure handed to Gradio; binds the index built above
    def search_law(query):
        return find_article(query, keys, embeddings, encoder, articles)

    # Start the Gradio interface
    print("Lanzando la aplicación...")
    demo = gr.Interface(
        fn=search_law,
        inputs="text",
        outputs="text",
        title="Búsqueda en el Código Penal Español",
        description="Realiza preguntas sobre delitos y penas en el Código Penal Español.",
    )
    demo.launch()


if __name__ == "__main__":
    main()