sammarigarcia committed on
Commit 82da546 · verified · 1 Parent(s): 5fd3097

Update app.py

Files changed (1)
  1. app.py +36 -32
app.py CHANGED
@@ -4,9 +4,10 @@ from fastapi.staticfiles import StaticFiles
 from fastapi.templating import Jinja2Templates
 
 import os
+import shutil
 from tqdm import tqdm
 
-# LangChain imports
+# LangChain
 from langchain_community.document_loaders import PyPDFLoader
 from langchain_community.vectorstores import FAISS
 from langchain_community.chat_models import ChatOpenAI
@@ -14,12 +15,22 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.chains import RetrievalQA
 from langchain_community.embeddings import HuggingFaceEmbeddings
 
+# ====================
 # FastAPI setup
+# ====================
 app = FastAPI()
 app.mount("/static", StaticFiles(directory="."), name="static")
 templates = Jinja2Templates(directory=".")
 
-# Load and index the PDF documents
+# ====================
+# Clear any previous FAISS index
+# ====================
+if os.path.exists("faiss_index"):
+    shutil.rmtree("faiss_index")
+
+# ====================
+# Load the PDF documents
+# ====================
 def cargar_docs():
     all_docs = []
     for pdf_file in [
@@ -35,48 +46,40 @@ def cargar_docs():
 
 docs = cargar_docs()
 
-# Split into chunks
+# ====================
+# Split into chunks
+# ====================
 text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
 split_docs = text_splitter.split_documents(docs)
 
-# Debug: show the first chunks
+# Debug
 for i, doc in enumerate(split_docs[:5]):
     print(f"Chunk {i+1}:\n{doc.page_content[:300]}\n{'-'*40}")
 
-# ===================
-# Arabic embeddings
-# ===================
-model_name = "CAMeL-Lab/bert-base-arabic-camelbert-mnli"
+# ====================
+# Multilingual embeddings
+# ====================
+model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
 embeddings = HuggingFaceEmbeddings(model_name=model_name)
 
-# Build the FAISS index from the embeddings
-db = FAISS.from_documents(texts, embeddings)
-
-# Save the index if you want (optional, but useful to avoid repeating this step)
-db.save_local("faiss_index")
-
-db = FAISS.load_local("faiss_index", embeddings)
-
-# Create the vectorstore
-batch_size = 100
-vectorstore = None
-
-for i in tqdm(range(0, len(split_docs), batch_size)):
-    batch = split_docs[i:i + batch_size]
-    if vectorstore is None:
-        vectorstore = FAISS.from_documents(batch, embeddings)
-    else:
-        vectorstore.add_documents(batch)
-
-print(f"Indexed {len(split_docs)} chunks.")
-
-# Answer chain
+# ====================
+# Create the FAISS vectorstore
+# ====================
+vectorstore = FAISS.from_documents(split_docs, embeddings)
+vectorstore.save_local("faiss_index")  # Optional, for caching
+
+# ====================
+# QA chain
+# ====================
 qa_chain = RetrievalQA.from_chain_type(
     llm=ChatOpenAI(temperature=0.2, model_name="gpt-4o-mini"),
     chain_type="stuff",
     retriever=vectorstore.as_retriever(search_kwargs={"k": 5})
 )
 
+# ====================
+# FastAPI routes
+# ====================
 @app.get("/", response_class=HTMLResponse)
 async def read_root(request: Request):
     return templates.TemplateResponse("index.html", {"request": request})
@@ -86,8 +89,9 @@ async def preguntar(request: Request, pregunta: str = Form(...)):
     respuesta = qa_chain.run(pregunta)
     return JSONResponse({"respuesta": respuesta})
 
-# For local mode
-import uvicorn
-
+# ====================
+# Run locally
+# ====================
 if __name__ == "__main__":
+    import uvicorn
     uvicorn.run("app:app", host="0.0.0.0", port=7860)
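
As written, the new startup code removes any existing "faiss_index" directory and always rebuilds, so the save_local call never actually serves as a cache across restarts. A minimal load-or-build sketch, assuming the same `embeddings` and `split_docs` objects defined in app.py, and a langchain_community version whose FAISS.load_local accepts the allow_dangerous_deserialization flag (older releases omit it):

# Sketch: reuse the cached FAISS index when present, rebuild otherwise.
# Assumes the `embeddings` and `split_docs` objects defined in app.py.
import os

from langchain_community.vectorstores import FAISS

if os.path.exists("faiss_index"):
    vectorstore = FAISS.load_local(
        "faiss_index",
        embeddings,
        allow_dangerous_deserialization=True,  # required by newer langchain_community
    )
else:
    vectorstore = FAISS.from_documents(split_docs, embeddings)
    vectorstore.save_local("faiss_index")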
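
The commit also swaps the CAMeL-Lab Arabic NLI checkpoint for sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2, a sentence-embedding model that covers Arabic among roughly 50 languages. A quick probe to confirm the model loads and emits fixed-size vectors (384 dimensions for this checkpoint):

# Sketch: confirm the multilingual model loads and returns fixed-size vectors.
from langchain_community.embeddings import HuggingFaceEmbeddings

emb = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)
vec = emb.embed_query("مرحبا بالعالم")  # Arabic input; the model is multilingual
print(len(vec))  # expected: 384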
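
For an end-to-end check once the app is running on port 7860: the decorator for preguntar falls outside these hunks, so the /preguntar path below is an assumption; the form field name pregunta matches the Form(...) parameter.

# Sketch: client-side test of the question endpoint.
# The "/preguntar" path is assumed; check the @app.post decorator in app.py.
import requests

resp = requests.post(
    "http://localhost:7860/preguntar",
    data={"pregunta": "ما هو موضوع هذه الوثائق؟"},  # form field matches Form(...)
)
print(resp.json()["respuesta"])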