Praga-6000 committed on
Commit 82d5a2f · verified · 1 Parent(s): 857c266

Update app.py

Files changed (1)
  1. app.py +82 -209
app.py CHANGED
@@ -1,219 +1,92 @@
- # app.py
- import os
- import io
- import json
- import requests
- from typing import List, Dict, Optional
- import numpy as np
- import faiss
- import pathlib
- import hashlib
- import time
-
  import gradio as gr
- import torch
- from transformers import AutoTokenizer, AutoModelForCausalLM
  from sentence_transformers import SentenceTransformer

- # PDF lib (fallback)
- import PyPDF2

- # ---------- CONFIG ----------
- EMBED_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2" # CPU-friendly
- LLM_MODEL = "microsoft/Phi-3-mini-4k-instruct" # chosen model
- DATA_DIR = "/tmp/rag_data" # persistent within Space runtime
- os.makedirs(DATA_DIR, exist_ok=True)
-
- # ---------- DEVICE ----------
- device = "cuda" if torch.cuda.is_available() else "cpu"
-
- # ---------- LOAD MODELS ----------
- print("Loading embedding model...")
- embedder = SentenceTransformer(EMBED_MODEL_NAME)
- embed_dim = embedder.get_sentence_embedding_dimension()
- print(f"Embedding dim: {embed_dim}")
-
- print("Loading tokenizer and LLM (may take a while)...")
- tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL, use_fast=True)
- model = AutoModelForCausalLM.from_pretrained(LLM_MODEL, trust_remote_code=True)
- model.to(device)
- model.eval()
-
- # ---------- UTILITIES ----------
- def sha256_text(s: str) -> str:
-     return hashlib.sha256(s.encode("utf-8")).hexdigest()
-
- def extract_text_from_pdf_url(url: str) -> Optional[str]:
-     try:
-         resp = requests.get(url, timeout=20)
-         resp.raise_for_status()
-         pdf_bytes = io.BytesIO(resp.content)
-         reader = PyPDF2.PdfReader(pdf_bytes)
-         text_parts = []
-         for p in reader.pages:
-             page_text = p.extract_text()
-             if page_text:
-                 text_parts.append(page_text)
-         if not text_parts:
-             return None
-         return "\n".join(text_parts)
-     except Exception as e:
-         print("PDF extraction error:", e)
-         return None
-
- def chunk_text_token_aware(text: str, max_tokens=800, overlap_tokens=128):
-     # approximate by splitting on sentences/words, then measuring token length with tokenizer
      words = text.split()
      chunks = []
-     i = 0
-     while i < len(words):
-         # grow until ~max_tokens
-         j = min(len(words), i + max_tokens)
-         chunk = " ".join(words[i:j])
-         # if too long by tokens, shrink
-         enc = tokenizer.encode(chunk, add_special_tokens=False)
-         if len(enc) > max_tokens:
-             # binary shrink loop
-             high = j
-             low = i
-             while high - low > 1:
-                 mid = (high + low) // 2
-                 c = " ".join(words[i:mid])
-                 if len(tokenizer.encode(c, add_special_tokens=False)) <= max_tokens:
-                     low = mid
-                 else:
-                     high = mid
-             chunk = " ".join(words[i:low])
-             j = low
          chunks.append(chunk)
-         # advance by chunk_size - overlap
-         i = max(i + max(1, len(tokenizer.encode(chunk, add_special_tokens=False)) - overlap_tokens), j)
      return chunks

- def build_or_load_index(paper_id: str, chunks: List[str]):
-     """
-     If index exists on disk for paper_id, load it. Otherwise build FAISS index from chunks.
-     Returns (index, chunks)
-     """
-     safe_id = sha256_text(paper_id)
-     index_path = os.path.join(DATA_DIR, f"{safe_id}.index")
-     meta_path = os.path.join(DATA_DIR, f"{safe_id}.chunks.json")
-     if os.path.exists(index_path) and os.path.exists(meta_path):
-         # load
-         print("Loading existing index:", index_path)
-         index = faiss.read_index(index_path)
-         with open(meta_path, "r", encoding="utf-8") as f:
-             stored_chunks = json.load(f)
-         return index, stored_chunks
-
-     # build embeddings
-     print("Encoding chunks:", len(chunks))
-     embeddings = embedder.encode(chunks, show_progress_bar=False, convert_to_numpy=True)
-     # normalize for cosine similarity (IndexFlatIP)
-     norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
-     norms[norms==0] = 1e-10
-     embeddings = embeddings / norms
-
-     # create index
-     index = faiss.IndexFlatIP(embeddings.shape[1])
-     index.add(embeddings.astype('float32'))
-     # persist
-     faiss.write_index(index, index_path)
-     with open(meta_path, "w", encoding="utf-8") as f:
-         json.dump(chunks, f)
-     print("Index written:", index_path)
-     return index, chunks
-
- def retrieve_relevant(index, chunks, query, k=4):
      q_emb = embedder.encode([query], convert_to_numpy=True)
-     q_emb = q_emb / (np.linalg.norm(q_emb, axis=1, keepdims=True) + 1e-10)
-     D, I = index.search(q_emb.astype('float32'), k)
-     results = []
-     for idx in I[0]:
-         if idx < len(chunks):
-             results.append(chunks[idx])
-     return results
-
- def generate_answer(question: str, context_chunks: List[str], chat_history: List[Dict]):
-     # Build a safe prompt: limited context
-     context = "\n\n---\n\n".join(context_chunks)
-     # Keep last few messages
-     history_text = ""
-     for msg in (chat_history or [])[-6:]:
-         role = "User" if msg.get("role")=="user" else "Assistant"
-         history_text += f"{role}: {msg.get('content')}\n"
-     prompt = f"""You are a helpful research assistant. Use the provided paper content to answer the user's question concisely and cite which chunk the answer came from when relevant.
-
- Paper Context:
- {context}
-
- Conversation History:
- {history_text}
-
- User: {question}
-
- Assistant:"""
-     # tokenize & truncate if needed
-     inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=tokenizer.model_max_length).to(device)
-     gen = model.generate(
-         **inputs,
-         max_new_tokens=256,
-         temperature=0.0,
-         do_sample=False,
-         eos_token_id=tokenizer.eos_token_id,
-         pad_token_id=tokenizer.eos_token_id
-     )
-     out = tokenizer.decode(gen[0], skip_special_tokens=True)
-     # post-process to return assistant text only
-     if "Assistant:" in out:
-         out = out.split("Assistant:")[-1].strip()
-     return out
-
- # ---------- MAIN PROCESS ----------
- def process_paper_and_answer(paper_id, title, abstract, url, question, chat_history):
-     # derive unique id (paper_id or url)
-     pid = paper_id or url or title
-     if not pid:
-         pid = str(time.time())
-     # Try to load or extract text
-     full_text = None
-     if url and url.lower().endswith(".pdf"):
-         full_text = extract_text_from_pdf_url(url)
-     if not full_text:
-         full_text = abstract or title or "No content"
-     # chunk
-     chunks = chunk_text_token_aware(full_text, max_tokens=800, overlap_tokens=128)
-     # build or load index (persisted)
-     index, stored_chunks = build_or_load_index(pid, chunks)
-     # retrieve
-     relevant = retrieve_relevant(index, stored_chunks, question, k=4)
-     # generate
-     answer = generate_answer(question, relevant, chat_history)
-     return answer
-
- # ---------- GRADIO API ----------
- def chat_api(paper_id, paper_title, paper_abstract, paper_url, question, chat_history_json):
-     # chat_history_json might be None or a JSON string
-     chat_history = chat_history_json or []
-     try:
-         return process_paper_and_answer(paper_id, paper_title, paper_abstract, paper_url, question, chat_history)
-     except Exception as e:
-         print("Error:", e)
-         return "Sorry, an internal error occurred."
-
- iface = gr.Interface(
-     fn=chat_api,
-     inputs=[
-         gr.Textbox(label="Paper ID", lines=1),
-         gr.Textbox(label="Paper Title", lines=1),
-         gr.Textbox(label="Paper Abstract", lines=4),
-         gr.Textbox(label="Paper URL", lines=1),
-         gr.Textbox(label="Question", lines=2),
-         gr.JSON(label="Chat History")
-     ],
-     outputs=gr.Textbox(label="Answer"),
-     title="Paper Chat RAG (Space)",
-     description="Upload a paper URL (PDF) or paste abstract and ask questions."
- )
-
- app = iface.app # expose as API in Space

  import gradio as gr
+ import PyPDF2
  from sentence_transformers import SentenceTransformer
+ import faiss
+ import numpy as np
+ from transformers import pipeline

+ # Load models (lightweight for CPU)
+ embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+ qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")
+
+ # Initialize FAISS index (for semantic search)
+ embedding_size = 384 # MiniLM-L6-v2 output dimension
+ index = faiss.IndexFlatL2(embedding_size)

+ # Storage for documents and embeddings
+ doc_chunks = []
+ doc_embeddings = None
+
+
+ def extract_text_from_pdf(file):
+     """Extract raw text from uploaded PDF."""
+     reader = PyPDF2.PdfReader(file)
+     text = ""
+     for page in reader.pages:
+         text += (page.extract_text() or "") + " "  # extract_text() may return None for image-only pages
+     return text
+
+
+ def chunk_text(text, chunk_size=300, overlap=50):
+     """Split text into overlapping chunks."""
      words = text.split()
      chunks = []
+     for i in range(0, len(words), chunk_size - overlap):
+         chunk = " ".join(words[i:i + chunk_size])
          chunks.append(chunk)
      return chunks

+
+ def build_index(pdf_file):
+     """Process PDF, create embeddings, and store in FAISS."""
+     global doc_chunks, doc_embeddings, index
+
+     # Extract + chunk
+     text = extract_text_from_pdf(pdf_file)
+     doc_chunks = chunk_text(text)
+
+     # Encode chunks
+     doc_embeddings = embedder.encode(doc_chunks, convert_to_numpy=True)
+
+     # Reset and add to FAISS
+     index = faiss.IndexFlatL2(embedding_size)
+     index.add(doc_embeddings)
+
+     return f"PDF processed! {len(doc_chunks)} chunks indexed."
+
+
+ def answer_question(query, top_k=3):
+     """Retrieve relevant chunks and answer user query."""
+     if doc_embeddings is None:
+         return "Please upload and process a PDF first."
+
+     # Embed question
      q_emb = embedder.encode([query], convert_to_numpy=True)
+     distances, indices = index.search(q_emb, top_k)
+
+     # Gather top chunks
+     context = " ".join([doc_chunks[i] for i in indices[0]])
+
+     # Run QA pipeline
+     result = qa_pipeline(question=query, context=context)
+     return result["answer"]
+
+
+ # Gradio UI
+ with gr.Blocks() as demo:
+     gr.Markdown("# 📚 PDF Q&A App\nUpload a PDF and ask questions about it!")
+
+     with gr.Row():
+         pdf_input = gr.File(label="Upload PDF", type="filepath")
+         process_btn = gr.Button("Process PDF")
+
+     status = gr.Textbox(label="Status", interactive=False)
+
+     with gr.Row():
+         question = gr.Textbox(label="Ask a Question")
+         answer = gr.Textbox(label="Answer", interactive=False)
+
+     process_btn.click(build_index, inputs=pdf_input, outputs=status)
+     question.submit(answer_question, inputs=question, outputs=answer)
+
+ demo.launch()