Spaces:
Runtime error
Runtime error
import streamlit as st | |
import pdfplumber | |
import faiss | |
import torch | |
import numpy as np | |
from sentence_transformers import SentenceTransformer | |
from transformers import pipeline | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
# Load embedding model | |
@st.cache_resource
def _load_models():
    """Load the sentence-embedding model and the extractive QA pipeline.

    Streamlit re-executes this script from the top on every user
    interaction; caching the loader as a resource means the models are
    created once per server process instead of on every rerun.

    Returns:
        A ``(embedding_model, qa_pipeline)`` tuple.
    """
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    reader = pipeline(
        "question-answering",
        model="distilbert-base-uncased-distilled-squad",
    )
    return embedder, reader


# Module-level names are kept because the helper functions below read them.
embedding_model, qa_pipeline = _load_models()
# Function to extract text from PDF | |
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_file: Whatever ``pdfplumber.open`` accepts (the app passes the
            uploaded file object).

    Returns:
        The page texts concatenated in page order, each followed by a
        newline, with leading/trailing whitespace stripped. Pages that
        yield no text contribute only their newline.
    """
    with pdfplumber.open(pdf_file) as pdf:
        # extract_text() may give None (or "") for pages without a text
        # layer, e.g. scanned images; treat those as empty instead of
        # crashing on ``None + "\n"``.
        page_texts = [(page.extract_text() or "") + "\n" for page in pdf.pages]
    return "".join(page_texts).strip()
# Chunking text | |
def chunk_text(text, chunk_size=500, overlap=100):
    """Split *text* into overlapping chunks.

    Args:
        text: The full text to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The list of chunks produced by
        ``RecursiveCharacterTextSplitter.split_text``.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
    ).split_text(text)
# Generate embeddings | |
def generate_embeddings(text_chunks):
    """Encode the given text chunks with the module-level embedding model.

    Args:
        text_chunks: The chunks of text to embed.

    Returns:
        The result of ``embedding_model.encode`` called with
        ``convert_to_numpy=True``.
    """
    vectors = embedding_model.encode(text_chunks, convert_to_numpy=True)
    return vectors
# Create FAISS index | |
def create_faiss_index(embeddings):
    """Build a flat L2 FAISS index over the given embedding matrix.

    Args:
        embeddings: A 2-D array-like of shape ``(n_vectors, dimension)``.

    Returns:
        A ``faiss.IndexFlatL2`` containing every row of *embeddings*.

    Raises:
        ValueError: If *embeddings* is empty or not two-dimensional (for
            example when the source document produced no chunks).
    """
    # FAISS expects a C-contiguous float32 matrix; coerce defensively so a
    # float64 or non-contiguous input does not fail inside the C++ layer.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2 or vectors.shape[0] == 0:
        raise ValueError(
            "embeddings must be a non-empty 2-D array of shape "
            "(n_vectors, dimension)"
        )
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index
# Retrieve relevant context (Increased context size) | |
def retrieve_context(query, index, text_chunks, top_k=7):
    """Return the chunks most similar to *query*, joined by newlines.

    Args:
        query: The question text to embed and search for.
        index: A FAISS index built over the embeddings of *text_chunks*.
        text_chunks: The chunks, in the same order they were indexed.
        top_k: Maximum number of chunks to retrieve.

    Returns:
        The retrieved chunks in the order returned by the index search,
        joined with ``"\\n"``. An empty string when there is nothing to
        retrieve.
    """
    chunk_count = len(text_chunks)
    # Never ask for more neighbours than there are chunks: FAISS pads the
    # missing results with -1, which would silently index the LAST chunk.
    k = min(top_k, chunk_count)
    if k <= 0:
        return ""
    query_embedding = embedding_model.encode([query], convert_to_numpy=True)
    _distances, indices = index.search(query_embedding, k)
    # Still drop any padding / out-of-range ids in case the index holds
    # fewer vectors than text_chunks.
    return "\n".join(
        text_chunks[i] for i in indices[0] if 0 <= i < chunk_count
    )
# Generate Answer (Allow longer answers) | |
def answer_question(query, faiss_index, book_chunks):
    """Answer *query* from the book and append a preview of the context used.

    Args:
        query: The user's question.
        faiss_index: FAISS index built over the embeddings of *book_chunks*.
        book_chunks: The text chunks of the book.

    Returns:
        The extracted answer followed by an "Additional Context" section
        showing up to the first 400 characters of the retrieved context,
        or a short notice when no context could be retrieved.
    """
    context = retrieve_context(query, faiss_index, book_chunks)
    if not context.strip():
        # The question-answering pipeline rejects an empty context, so
        # report the situation instead of letting it raise.
        return "No relevant context was found in the book."
    result = qa_pipeline(question=query, context=context, max_answer_len=150)
    preview = context[:400]
    if len(context) > 400:
        # Only mark the preview as truncated when it actually was.
        preview += "..."
    return result["answer"] + "\n\n**Additional Context:** " + preview
# Streamlit UI | |
# NOTE(review): the leading "π" in the title looks like a mis-decoded emoji;
# confirm the intended character before changing the string.
st.title("π Book-Based Question Answering System")
st.write("Upload a book (PDF) and ask any question!")

# File uploader
uploaded_file = st.file_uploader("Upload a PDF book", type="pdf")

if uploaded_file:
    # Streamlit re-executes this script on every interaction (including each
    # submitted question), so keep the chunks and index in session state and
    # rebuild them only when a different file is uploaded.
    file_key = (uploaded_file.name, uploaded_file.size)
    if st.session_state.get("book_key") != file_key:
        st.write("Processing book...")
        book_text = extract_text_from_pdf(uploaded_file)
        book_chunks = chunk_text(book_text)
        if book_chunks:
            chunk_embeddings = generate_embeddings(book_chunks)
            faiss_index = create_faiss_index(chunk_embeddings)
        else:
            # Nothing to embed (e.g. a scanned PDF without a text layer).
            faiss_index = None
        st.session_state["book_key"] = file_key
        st.session_state["book_chunks"] = book_chunks
        st.session_state["faiss_index"] = faiss_index

    book_chunks = st.session_state["book_chunks"]
    faiss_index = st.session_state["faiss_index"]

    if not book_chunks:
        st.error("No extractable text was found in this PDF.")
    else:
        st.success(f"Book processed successfully! ({len(book_chunks)} chunks)")
        query = st.text_input("Ask a question based on the book:")
        if query:
            answer = answer_question(query, faiss_index, book_chunks)
            st.write(f"**Answer:** {answer}")