from fastapi import FastAPI, Request, Form
from fastapi.responses import HTMLResponse, JSONResponse
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
import os
from tqdm import tqdm
# LangChain imports
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_community.chat_models import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain_community.embeddings import HuggingFaceEmbeddings
# FastAPI setup
app = FastAPI()
app.mount("/static", StaticFiles(directory="."), name="static")
templates = Jinja2Templates(directory=".")
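# Note: static assets and the index.html template are both served from the repository
# root, a common layout for a single-file Hugging Face Space (assumption: index.html
# sits next to app.py).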
# Load the PDF source documents
def cargar_docs():
    all_docs = []
    for pdf_file in [
        "1 مساعد ممارس ملف المحور.pdf",
        "2 مساعد ممارس ملف المحور.pdf",
        "3 مساعد ممارس املف المحور.pdf",
        "4 مساعد ممارس ملف المحور.pdf",
        "ملف المحور 5 مساعد ممارس.pdf"
    ]:
        loader = PyPDFLoader(pdf_file)
        all_docs.extend(loader.load())
    return all_docs
docs = cargar_docs()
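# PyPDFLoader yields one Document per PDF page, so docs is a page-level list
# carrying source/page metadata.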
# Split into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
split_docs = text_splitter.split_documents(docs)
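# RecursiveCharacterTextSplitter measures length in characters by default, so each
# chunk holds up to 1500 characters with 200 characters of overlap between neighbours.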
# Debug: print the first few chunks
for i, doc in enumerate(split_docs[:5]):
    print(f"Chunk {i+1}:\n{doc.page_content[:300]}\n{'-'*40}")
# ===================
# Arabic embeddings
# ===================
model_name = "asafaya/bert-base-arabic"
embeddings = HuggingFaceEmbeddings(model_name=model_name)
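# Note: asafaya/bert-base-arabic is a plain BERT checkpoint rather than a dedicated
# sentence-embedding model; sentence-transformers (used under the hood by
# HuggingFaceEmbeddings) should fall back to default mean pooling over token embeddings.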
# Build the FAISS vector store in batches: the first batch creates the index,
# later batches are appended with add_documents
batch_size = 100
vectorstore = None
for i in tqdm(range(0, len(split_docs), batch_size)):
    batch = split_docs[i:i + batch_size]
    if vectorstore is None:
        vectorstore = FAISS.from_documents(batch, embeddings)
    else:
        vectorstore.add_documents(batch)
print(f"Indexed {len(split_docs)} chunks.")
# Retrieval-QA chain
qa_chain = RetrievalQA.from_chain_type(
llm=ChatOpenAI(temperature=0.2, model_name="gpt-4o-mini"),
chain_type="stuff",
retriever=vectorstore.as_retriever(search_kwargs={"k": 5})
)
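# The "stuff" chain type concatenates all k=5 retrieved chunks into a single prompt
# for the LLM on every question.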
@app.get("/", response_class=HTMLResponse)
async def read_root(request: Request):
    return templates.TemplateResponse("index.html", {"request": request})
@app.post("/preguntar")
async def preguntar(request: Request, pregunta: str = Form(...)):
    respuesta = qa_chain.run(pregunta)
    return JSONResponse({"respuesta": respuesta})
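# Note: qa_chain.run() is synchronous, so this async handler blocks the event loop
# while the model responds; acceptable for a small demo, but running the call in a
# worker thread would be needed under concurrent load.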
# For running locally
import uvicorn

if __name__ == "__main__":
    uvicorn.run("app:app", host="0.0.0.0", port=7860)