vcasas committed on
Commit
0dd4627
·
verified ·
1 Parent(s): 888d0e8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -17
app.py CHANGED
@@ -1,43 +1,56 @@
1
  import os
2
  import requests
3
- from llama_index.core import VectorStoreIndex, Settings
4
- from llama_index.readers.file import PDFReader
5
- import gradio as gr
6
  from llama_index.embeddings.huggingface import HuggingFaceEmbedding
 
7
 
8
- # Disable the default LLM
9
- Settings.llm = None
10
-
11
  def download_pdf(url, destination):
12
  os.makedirs(os.path.dirname(destination), exist_ok=True)
13
  response = requests.get(url)
14
  with open(destination, 'wb') as f:
15
  f.write(response.content)
16
 
17
- def create_index_from_pdf(pdf_path):
 
18
  pdf_reader = PDFReader()
 
19
  documents = pdf_reader.load_data(file=pdf_path)
20
 
21
- embed_model = HuggingFaceEmbedding(model_name='sentence-transformers/all-mpnet-base-v2')
22
-
23
  index = VectorStoreIndex.from_documents(
24
  documents,
25
  embed_model=embed_model
26
  )
27
  query_engine = index.as_query_engine(
28
- similarity_top_k=2, # Adjust number of retrieved documents
29
- response_mode="compact" # Prevents generating full response
30
  )
31
  return query_engine
32
 
 
 
 
 
 
 
33
  pdf_url = 'https://www.boe.es/buscar/pdf/1995/BOE-A-1995-25444-consolidado.pdf'
34
  pdf_path = './BOE-A-1995-25444-consolidado.pdf'
35
 
36
- download_pdf(pdf_url, pdf_path)
37
- query_engine = create_index_from_pdf(pdf_path)
 
38
 
39
- def search_pdf(query):
40
- response = query_engine.query(query)
41
- return response.response
42
 
43
- gr.Interface(fn=search_pdf, inputs="text", outputs="text").launch()
 
 
 
 
 
 
 
 
1
  import os
2
  import requests
3
+ from llama_index import VectorStoreIndex, Document
4
+ from llama_index.readers.file.base import PDFReader
 
5
  from llama_index.embeddings.huggingface import HuggingFaceEmbedding
6
+ import gradio as gr
7
 
8
# Download and save a PDF
def download_pdf(url, destination, timeout=60):
    """Download the file at *url* and write it to *destination*.

    Args:
        url: HTTP(S) address of the PDF to fetch.
        destination: Local path to write. Missing parent directories are
            created first.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
    """
    parent_dir = os.path.dirname(destination)
    # A bare filename has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    response = requests.get(url, timeout=timeout)
    # Fail before touching the disk: otherwise an HTML error page would be
    # saved under the PDF's name and mistaken for a valid download later.
    response.raise_for_status()

    with open(destination, 'wb') as f:
        f.write(response.content)
14
 
15
# Build a query engine from a PDF file
def create_index_from_pdf(
    pdf_path,
    model_name='nlpaueb/legal-bert-base-uncased',
    similarity_top_k=5,
    response_mode="simple_summarize",
):
    """Index the PDF at *pdf_path* and return a query engine over it.

    Args:
        pdf_path: Path of the PDF file to read.
        model_name: Hugging Face model used to embed the document chunks.
        similarity_top_k: Number of most-similar chunks retrieved per query.
        response_mode: LlamaIndex response-synthesis mode. It must be one of
            the library's mode names; the previous value "simple" is not
            one of them and is rejected when the engine is built.

    Returns:
        The query engine produced by ``index.as_query_engine``.
    """
    # Read the PDF contents as documents
    pdf_reader = PDFReader()
    documents = pdf_reader.load_data(file=pdf_path)

    # Embed with the requested (domain-specific by default) model
    embed_model = HuggingFaceEmbedding(model_name=model_name)
    index = VectorStoreIndex.from_documents(
        documents,
        embed_model=embed_model,
    )

    query_engine = index.as_query_engine(
        similarity_top_k=similarity_top_k,
        response_mode=response_mode,
    )
    return query_engine
32
 
33
# Search function
def search_pdf(query):
    """Run *query* against the module-level query engine and return the answer text."""
    return query_engine.query(query).response
37
+
38
# Configuration: the BOE PDF to index and where to keep the local copy
pdf_url = 'https://www.boe.es/buscar/pdf/1995/BOE-A-1995-25444-consolidado.pdf'
pdf_path = './BOE-A-1995-25444-consolidado.pdf'

# Download the PDF only when no local copy exists yet
if not os.path.exists(pdf_path):
    download_pdf(pdf_url, pdf_path)

# Build the semantic index once at startup; search_pdf reads this global
query_engine = create_index_from_pdf(pdf_path)

# Gradio interface. The interface has a single text input and the PDF is
# fetched automatically, so the description no longer asks the user to
# upload the document.
gr.Interface(
    fn=search_pdf,
    inputs="text",
    outputs="text",
    title="Búsqueda en Código Penal PDF",
    description="Pregunta directamente por información específica del Código Penal.",
).launch()