|
|
from fastapi import FastAPI, Request, Form |
|
|
from fastapi.responses import HTMLResponse, JSONResponse |
|
|
from fastapi.staticfiles import StaticFiles |
|
|
from fastapi.templating import Jinja2Templates |
|
|
|
|
|
import os |
|
|
from tqdm import tqdm |
|
|
|
|
|
|
|
|
from langchain_community.document_loaders import PyPDFLoader |
|
|
from langchain_community.vectorstores import FAISS |
|
|
from langchain_community.chat_models import ChatOpenAI |
|
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
|
from langchain.chains import RetrievalQA |
|
|
from langchain_community.embeddings import HuggingFaceEmbeddings |
|
|
|
|
|
|
|
|
# FastAPI application serving a RAG (retrieval-augmented generation)
# question-answering UI over a set of Arabic PDF documents.
app = FastAPI()

# NOTE(review): mounting the *current directory* as /static exposes every
# file next to this script (source code, PDFs, env files) over HTTP —
# consider a dedicated static/ folder.
app.mount("/static", StaticFiles(directory="."), name="static")

# Templates (index.html) are also resolved from the current directory.
templates = Jinja2Templates(directory=".")
|
|
|
|
|
|
|
|
def cargar_docs(pdf_files=None):
    """Load PDF documents page by page via PyPDFLoader.

    Parameters
    ----------
    pdf_files : list[str] | None
        Paths of the PDFs to load. Defaults to the five bundled
        "مساعد ممارس" module files, preserving the original behavior.

    Returns
    -------
    list
        All pages of all PDFs as LangChain Document objects, in order.
    """
    if pdf_files is None:
        # Default corpus; `None` sentinel avoids a mutable default argument.
        pdf_files = [
            "1 مساعد ممارس ملف المحور.pdf",
            "2 مساعد ممارس ملف المحور.pdf",
            # NOTE(review): "املف" below looks like a typo for "ملف" —
            # kept byte-identical because it must match the file on disk;
            # verify the actual filename.
            "3 مساعد ممارس املف المحور.pdf",
            "4 مساعد ممارس ملف المحور.pdf",
            "ملف المحور 5 مساعد ممارس.pdf",
        ]
    all_docs = []
    for pdf_file in pdf_files:
        loader = PyPDFLoader(pdf_file)
        # loader.load() yields one Document per page.
        all_docs.extend(loader.load())
    return all_docs
|
|
|
|
|
# Load every page of every PDF at import time (blocks startup until done).
docs = cargar_docs()

# Split pages into overlapping chunks; the 200-char overlap keeps text
# that straddles a chunk boundary retrievable from either side.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
split_docs = text_splitter.split_documents(docs)
|
|
|
|
|
|
|
|
# Sanity-check the splitter by printing the first 300 chars of the
# first five chunks, each followed by a dashed separator line.
for chunk_no, chunk in enumerate(split_docs[:5], start=1):
    separator = "-" * 40
    print(f"Chunk {chunk_no}:\n{chunk.page_content[:300]}\n{separator}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Arabic BERT model used to embed chunks for vector similarity search.
model_name = "asafaya/bert-base-arabic"
embeddings = HuggingFaceEmbeddings(model_name=model_name)
|
|
|
|
|
|
|
|
# Embed and index the chunks into FAISS in batches, with a tqdm progress
# bar, so the embedding model never receives the whole corpus at once.
batch_size = 100
vectorstore = None

total_chunks = len(split_docs)
for start in tqdm(range(0, total_chunks, batch_size)):
    current_batch = split_docs[start:start + batch_size]
    if vectorstore is None:
        # The first batch bootstraps the index.
        vectorstore = FAISS.from_documents(current_batch, embeddings)
    else:
        # Subsequent batches are appended to the existing index.
        vectorstore.add_documents(current_batch)

print(f"Se han indexado {len(split_docs)} chunks.")
|
|
|
|
|
|
|
|
# "stuff" chain: the k=5 retrieved chunks are concatenated into a single
# prompt for the LLM. Low temperature (0.2) keeps answers close to the
# retrieved source text.
qa_chain = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(temperature=0.2, model_name="gpt-4o-mini"),
    chain_type="stuff",
    retriever=vectorstore.as_retriever(search_kwargs={"k": 5})
)
|
|
|
|
|
@app.get("/", response_class=HTMLResponse)
async def read_root(request: Request):
    """Serve the chat front page (index.html from the current directory)."""
    context = {"request": request}
    return templates.TemplateResponse("index.html", context)
|
|
|
|
|
@app.post("/preguntar")
def preguntar(request: Request, pregunta: str = Form(...)):
    """Answer a form-posted question against the indexed PDFs.

    Declared as a plain ``def`` (not ``async def``) on purpose:
    ``qa_chain.run`` is a blocking, synchronous call (embedding lookup +
    LLM request), and inside an ``async def`` it would stall the event
    loop for every concurrent request. FastAPI runs sync handlers in a
    worker thread, so the HTTP interface is unchanged.

    Returns a JSON body of the form {"respuesta": <answer string>}.
    """
    respuesta = qa_chain.run(pregunta)
    return JSONResponse({"respuesta": respuesta})
|
|
|
|
|
|
|
|
import uvicorn

# Run the dev server when executed directly.
# NOTE(review): the "app:app" import string assumes this file is named
# app.py — confirm, or pass the `app` object directly to uvicorn.run.
if __name__ == "__main__":
    uvicorn.run("app:app", host="0.0.0.0", port=7860)
|
|
|