sammarigarcia committed on
Commit
0905dfa
·
verified ·
1 Parent(s): 7f7499f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -12
app.py CHANGED
@@ -3,45 +3,58 @@ from fastapi.responses import HTMLResponse, JSONResponse
3
  from fastapi.staticfiles import StaticFiles
4
  from fastapi.templating import Jinja2Templates
5
 
6
- # OpenAI y LangChain imports
7
  import os
 
 
 
8
  from langchain_community.document_loaders import PyPDFLoader
9
- from langchain_openai import OpenAIEmbeddings
10
  from langchain_community.vectorstores import FAISS
11
  from langchain_community.chat_models import ChatOpenAI
12
  from langchain.text_splitter import RecursiveCharacterTextSplitter
13
  from langchain.chains import RetrievalQA
 
14
 
 
15
  app = FastAPI()
16
-
17
  app.mount("/static", StaticFiles(directory="."), name="static")
18
  templates = Jinja2Templates(directory=".")
19
 
20
- # Carga y procesamiento PDFs (una sola vez al iniciar)
21
  def cargar_docs():
22
  all_docs = []
23
- for pdf_file in ["1 مساعد ممارس ملف المحور.pdf", "2 مساعد ممارس ملف المحور.pdf", "3 مساعد ممارس املف المحور.pdf", "4 مساعد ممارس ملف المحور.pdf", "ملف المحور 5 مساعد ممارس.pdf"]:
 
 
 
 
 
 
24
  loader = PyPDFLoader(pdf_file)
25
  all_docs.extend(loader.load())
26
  return all_docs
27
 
28
  docs = cargar_docs()
29
 
 
30
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
31
  split_docs = text_splitter.split_documents(docs)
 
 
32
  for i, doc in enumerate(split_docs[:5]):
33
  print(f"Chunk {i+1}:\n{doc.page_content[:300]}\n{'-'*40}")
34
 
 
 
 
 
 
35
 
36
- embeddings = OpenAIEmbeddings()
37
- from tqdm import tqdm
38
-
39
- # Dividir en lotes de, por ejemplo, 100 documentos
40
  batch_size = 100
41
  vectorstore = None
42
 
43
  for i in tqdm(range(0, len(split_docs), batch_size)):
44
- batch = split_docs[i:i+batch_size]
45
  if vectorstore is None:
46
  vectorstore = FAISS.from_documents(batch, embeddings)
47
  else:
@@ -49,10 +62,11 @@ for i in tqdm(range(0, len(split_docs), batch_size)):
49
 
50
  print(f"Se han indexado {len(split_docs)} chunks.")
51
 
 
52
  qa_chain = RetrievalQA.from_chain_type(
53
  llm=ChatOpenAI(temperature=0.2, model_name="gpt-4o-mini"),
54
  chain_type="stuff",
55
- retriever = vectorstore.as_retriever(search_kwargs={"k": 3}) # más chunks
56
  )
57
 
58
  @app.get("/", response_class=HTMLResponse)
@@ -64,8 +78,8 @@ async def preguntar(request: Request, pregunta: str = Form(...)):
64
  respuesta = qa_chain.run(pregunta)
65
  return JSONResponse({"respuesta": respuesta})
66
 
 
67
  import uvicorn
68
 
69
  if __name__ == "__main__":
70
  uvicorn.run("app:app", host="0.0.0.0", port=7860)
71
-
 
3
  from fastapi.staticfiles import StaticFiles
4
  from fastapi.templating import Jinja2Templates
5
 
 
6
  import os
7
+ from tqdm import tqdm
8
+
9
+ # LangChain imports
10
  from langchain_community.document_loaders import PyPDFLoader
 
11
  from langchain_community.vectorstores import FAISS
12
  from langchain_community.chat_models import ChatOpenAI
13
  from langchain.text_splitter import RecursiveCharacterTextSplitter
14
  from langchain.chains import RetrievalQA
15
+ from langchain_community.embeddings import HuggingFaceEmbeddings
16
 
17
+ # FastAPI setup
18
  app = FastAPI()
 
19
  app.mount("/static", StaticFiles(directory="."), name="static")
20
  templates = Jinja2Templates(directory=".")
21
 
22
+ # Cargar e indexar los documentos PDF
23
  def cargar_docs():
24
  all_docs = []
25
+ for pdf_file in [
26
+ "1 مساعد ممارس ملف المحور.pdf",
27
+ "2 مساعد ممارس ملف المحور.pdf",
28
+ "3 مساعد ممارس املف المحور.pdf",
29
+ "4 مساعد ممارس ملف المحور.pdf",
30
+ "ملف المحور 5 مساعد ممارس.pdf"
31
+ ]:
32
  loader = PyPDFLoader(pdf_file)
33
  all_docs.extend(loader.load())
34
  return all_docs
35
 
36
  docs = cargar_docs()
37
 
38
+ # Dividir en chunks
39
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
40
  split_docs = text_splitter.split_documents(docs)
41
+
42
+ # Debug: mostrar primeros chunks
43
  for i, doc in enumerate(split_docs[:5]):
44
  print(f"Chunk {i+1}:\n{doc.page_content[:300]}\n{'-'*40}")
45
 
46
+ # ===================
47
+ # Embeddings árabes
48
+ # ===================
49
+ model_name = "asafaya/bert-base-arabic"
50
+ embeddings = HuggingFaceEmbeddings(model_name=model_name)
51
 
52
+ # Crear vectorstore
 
 
 
53
  batch_size = 100
54
  vectorstore = None
55
 
56
  for i in tqdm(range(0, len(split_docs), batch_size)):
57
+ batch = split_docs[i:i + batch_size]
58
  if vectorstore is None:
59
  vectorstore = FAISS.from_documents(batch, embeddings)
60
  else:
 
62
 
63
  print(f"Se han indexado {len(split_docs)} chunks.")
64
 
65
+ # Cadena de respuesta
66
  qa_chain = RetrievalQA.from_chain_type(
67
  llm=ChatOpenAI(temperature=0.2, model_name="gpt-4o-mini"),
68
  chain_type="stuff",
69
+ retriever=vectorstore.as_retriever(search_kwargs={"k": 5})
70
  )
71
 
72
  @app.get("/", response_class=HTMLResponse)
 
78
  respuesta = qa_chain.run(pregunta)
79
  return JSONResponse({"respuesta": respuesta})
80
 
81
+ # Para modo local
82
  import uvicorn
83
 
84
  if __name__ == "__main__":
85
  uvicorn.run("app:app", host="0.0.0.0", port=7860)