"""Gradio app: upload a PDF, index it with FAISS, and answer questions about it."""

from typing import List

import gradio as gr
import PyPDF2
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import pipeline

# Load models (lightweight for CPU)
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

# Initialize FAISS index (for semantic search)
embedding_size = 384  # MiniLM-L6-v2 output dimension
index = faiss.IndexFlatL2(embedding_size)

# Storage for documents and embeddings
doc_chunks = []
doc_embeddings = None


def extract_text_from_pdf(file) -> str:
    """Extract raw text from uploaded PDF.

    Args:
        file: Anything ``PyPDF2.PdfReader`` accepts (the UI passes a file path).

    Returns:
        The text of every page, each followed by a single space.
    """
    reader = PyPDF2.PdfReader(file)
    # `or ""` guards against a page yielding no text object, which would
    # otherwise make the concatenation raise TypeError.
    return "".join((page.extract_text() or "") + " " for page in reader.pages)


def chunk_text(text: str, chunk_size: int = 300, overlap: int = 50) -> List[str]:
    """Split text into overlapping chunks of whitespace-separated words.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        The chunks in document order; an empty list when ``text`` has no words.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # overlap >= chunk_size gives a zero/negative stride; a negative
        # overlap would silently skip words between chunks.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    stride = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), stride):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a chunk reaches the last word, any further chunk would be a
        # strict subset of this one, so stop instead of emitting duplicates.
        if start + chunk_size >= len(words):
            break
    return chunks


def build_index(pdf_file) -> str:
    """Process PDF, create embeddings, and store in FAISS.

    The module-level ``doc_chunks``, ``doc_embeddings`` and ``index`` are only
    replaced after the new document has been fully embedded and indexed, so a
    failure leaves the previously indexed document intact and consistent.

    Args:
        pdf_file: The uploaded PDF (a file path from the UI), or ``None`` when
            nothing has been uploaded.

    Returns:
        A status message for display in the UI.
    """
    global doc_chunks, doc_embeddings, index

    if pdf_file is None:
        return "Please upload a PDF first."

    # Extract + chunk
    text = extract_text_from_pdf(pdf_file)
    new_chunks = chunk_text(text)
    if not new_chunks:
        # E.g. a scanned/image-only PDF: there is nothing to embed or index.
        return "No extractable text found in this PDF."

    # Encode chunks; FAISS expects a contiguous float32 matrix.
    new_embeddings = np.ascontiguousarray(
        embedder.encode(new_chunks, convert_to_numpy=True), dtype=np.float32
    )

    # Build a fresh index so chunks from an earlier PDF are dropped
    new_index = faiss.IndexFlatL2(embedding_size)
    new_index.add(new_embeddings)

    # Publish the new state in one step, only after everything succeeded.
    doc_chunks, doc_embeddings, index = new_chunks, new_embeddings, new_index

    return f"PDF processed! {len(doc_chunks)} chunks indexed."


def answer_question(query: str, top_k: int = 3) -> str:
    """Retrieve relevant chunks and answer user query.

    Args:
        query: The user's question.
        top_k: Maximum number of chunks to retrieve as context.

    Returns:
        The answer span extracted by the QA pipeline, or a prompt message when
        no PDF has been indexed yet or the question is blank.
    """
    if doc_embeddings is None or not doc_chunks:
        return "Please upload and process a PDF first."
    if not query or not query.strip():
        return "Please enter a question."

    # Embed question
    q_emb = np.ascontiguousarray(
        embedder.encode([query], convert_to_numpy=True), dtype=np.float32
    )

    # Never ask for more neighbours than there are chunks: FAISS pads missing
    # results with -1, which would index doc_chunks from the end.
    k = min(top_k, len(doc_chunks))
    _distances, indices = index.search(q_emb, k)

    # Gather top chunks (skip any -1 padding defensively)
    context = " ".join(doc_chunks[i] for i in indices[0] if i >= 0)

    # Run QA pipeline
    result = qa_pipeline(question=query, context=context)
    return result["answer"]


# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# 📚 PDF Q&A App\nUpload a PDF and ask questions about it!")

    with gr.Row():
        pdf_input = gr.File(label="Upload PDF", type="filepath")
        process_btn = gr.Button("Process PDF")

    status = gr.Textbox(label="Status", interactive=False)

    with gr.Row():
        question = gr.Textbox(label="Ask a Question")
        answer = gr.Textbox(label="Answer", interactive=False)

    process_btn.click(build_index, inputs=pdf_input, outputs=status)
    question.submit(answer_question, inputs=question, outputs=answer)

demo.launch()