Spaces:

aaporosh
/

Smart_PDF_QA

Sleeping

App Files Files

xet

Community

Smart_PDF_QA / app.py

aaporosh

Update app.py

ce4cfdb verified 2 months ago

raw

history blame

7.5 kB

	import streamlit as st
	import logging
	import os
	from io import BytesIO
	import pdfplumber
	from langchain.text_splitter import CharacterTextSplitter
	from langchain_community.vectorstores import FAISS
	from sentence_transformers import SentenceTransformer
	from transformers import pipeline

	# Setup logging for Spaces
	logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
	logger = logging.getLogger(__name__)

	# Lazy load models
	@st.cache_resource(ttl=3600)
	def _load_embeddings_model_cached():
	    """Build the sentence-embedding model; the result is cached for one hour.

	    Errors are deliberately NOT handled here: ``st.cache_resource`` stores
	    whatever the function returns, so returning ``None`` on failure would
	    cache the failure for the whole TTL. Raising keeps failures uncached.
	    """
	    logger.info("Loading embeddings model")
	    return SentenceTransformer("all-MiniLM-L6-v2")


	def load_embeddings_model():
	    """Return the cached ``all-MiniLM-L6-v2`` embedding model, or None on failure.

	    Returns:
	        The ``SentenceTransformer`` instance, or ``None`` if loading raised.
	        On failure the error is logged and shown with ``st.error``; because
	        the failure is not cached, the next call retries the load.
	    """
	    try:
	        return _load_embeddings_model_cached()
	    except Exception as e:
	        logger.error(f"Embeddings load error: {str(e)}")
	        st.error(f"Embedding model error: {str(e)}")
	        return None

	@st.cache_resource(ttl=3600)
	def _load_qa_pipeline_cached():
	    """Build the text2text generation pipeline; the result is cached for one hour.

	    Errors are deliberately NOT handled here: ``st.cache_resource`` stores
	    whatever the function returns, so returning ``None`` on failure would
	    cache the failure for the whole TTL. Raising keeps failures uncached.
	    """
	    logger.info("Loading QA pipeline")
	    return pipeline("text2text-generation", model="google/flan-t5-base", max_length=300)


	def load_qa_pipeline():
	    """Return the cached ``google/flan-t5-base`` pipeline, or None on failure.

	    Returns:
	        The ``text2text-generation`` pipeline (default ``max_length=300``),
	        or ``None`` if loading raised. On failure the error is logged and
	        shown with ``st.error``; because the failure is not cached, the next
	        call retries the load.
	    """
	    try:
	        return _load_qa_pipeline_cached()
	    except Exception as e:
	        logger.error(f"QA model load error: {str(e)}")
	        st.error(f"QA model error: {str(e)}")
	        return None

	# Process PDF
	def process_pdf(uploaded_file):
	    """Extract the text of an uploaded PDF and index it for similarity search.

	    Args:
	        uploaded_file: Object whose ``getvalue()`` returns the PDF bytes
	            (the UI passes the value returned by ``st.file_uploader``).

	    Returns:
	        A ``(vector_store, text)`` tuple:

	        * ``(FAISS store, text)`` on success;
	        * ``(None, text)`` when the embedding model could not be loaded
	          (the extracted text is still returned so it can be summarized);
	        * ``(None, "")`` on any error, after the error has been logged and
	          shown with ``st.error``.
	    """
	    logger.info("Processing PDF")
	    try:
	        with pdfplumber.open(BytesIO(uploaded_file.getvalue())) as pdf:
	            page_texts = [page.extract_text() for page in pdf.pages]
	        # Pages without a text layer yield None/"" and are skipped.
	        text = "".join(f"{page_text}\n" for page_text in page_texts if page_text)

	        # Whitespace-only text would produce zero chunks and fail later with an
	        # obscure error, so reject it here. Scanned PDFs would need OCR
	        # (e.g. pdf2image + pytesseract), which is not enabled in this app.
	        if not text.strip():
	            raise ValueError("No text extracted from PDF")

	        text_splitter = CharacterTextSplitter(separator="\n", chunk_size=600, chunk_overlap=150)
	        chunks = text_splitter.split_text(text)

	        embeddings_model = load_embeddings_model()
	        if embeddings_model is None:
	            return None, text

	        # One batched encode call instead of one model call per chunk.
	        embeddings = embeddings_model.encode(chunks)
	        # Materialize the pairs: a bare zip() is a one-shot iterator and breaks
	        # if the library walks the pairs more than once.
	        text_embeddings = list(zip(chunks, embeddings))
	        vector_store = FAISS.from_embeddings(text_embeddings, embeddings_model.encode)
	        logger.info("PDF processed successfully")
	        return vector_store, text
	    except Exception as e:
	        logger.error(f"PDF processing error: {str(e)}")
	        st.error(f"PDF error: {str(e)}")
	        return None, ""

	# Summarize PDF
	def summarize_pdf(text):
	    """Summarize the beginning of ``text`` with the generation pipeline.

	    The text is split into ~1000-character chunks, only the first three are
	    summarized, and the joined result is capped at 200 words.

	    Args:
	        text: Full document text.

	    Returns:
	        The combined summary string, ``"Summary model unavailable."`` when the
	        pipeline cannot be loaded, or an ``"Error summarizing: ..."`` message
	        if anything raises.
	    """
	    logger.info("Generating summary")
	    try:
	        generator = load_qa_pipeline()
	        if not generator:
	            return "Summary model unavailable."

	        splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=100)
	        # Only the first three chunks are summarized, for brevity.
	        leading_chunks = splitter.split_text(text)[:3]

	        partial_summaries = []
	        for piece in leading_chunks:
	            outputs = generator(
	                f"Summarize this text in 60-80 words, highlighting key points:\n{piece}",
	                max_length=100,
	            )
	            partial_summaries.append(outputs[0]['generated_text'].strip())

	        result = " ".join(partial_summaries)
	        words = result.split()
	        if len(words) > 200:
	            # Hard cap on the overall summary length.
	            result = " ".join(words[:200])
	        logger.info("Summary generated")
	        return result
	    except Exception as e:
	        detail = str(e)
	        logger.error(f"Summary error: {detail}")
	        return f"Error summarizing: {detail}"

	# Answer question
	def answer_question(vector_store, query):
	    """Answer ``query`` using the three most similar chunks as context.

	    Args:
	        vector_store: Store exposing ``similarity_search(query, k=...)``;
	            a falsy value means no PDF has been processed yet.
	        query: The user's question.

	    Returns:
	        The stripped generated answer, a fixed notice when the store or the
	        pipeline is missing, or an ``"Error answering: ..."`` message if
	        anything raises.
	    """
	    logger.info(f"Processing query: {query}")
	    try:
	        if not vector_store:
	            return "Please upload a PDF first."

	        generator = load_qa_pipeline()
	        if not generator:
	            return "QA model unavailable."

	        matches = vector_store.similarity_search(query, k=3)
	        context = "\n".join(match.page_content for match in matches)
	        outputs = generator(f"Context: {context}\nQuestion: {query}\nAnswer concisely:")
	        answer = outputs[0]['generated_text']
	        logger.info("Answer generated")
	        return answer.strip()
	    except Exception as e:
	        detail = str(e)
	        logger.error(f"Query error: {detail}")
	        return f"Error answering: {detail}"

	# Streamlit UI
	try:
	    st.set_page_config(page_title="Smart PDF Q&A", page_icon="📄")
	    st.title("Smart PDF Q&A")
	    st.markdown("""
	Upload a PDF to ask questions or get a summary (up to 200 words). Chat history is preserved.
	<style>
	.stChatMessage { border-radius: 10px; padding: 10px; margin: 5px; }
	.stChatMessage.user { background-color: #e6f3ff; }
	.stChatMessage.assistant { background-color: #f0f0f0; }
	</style>
	""", unsafe_allow_html=True)

	    # Initialize session state (survives Streamlit reruns within a session)
	    if "messages" not in st.session_state:
	        st.session_state.messages = []
	    if "vector_store" not in st.session_state:
	        st.session_state.vector_store = None
	    if "pdf_text" not in st.session_state:
	        st.session_state.pdf_text = ""

	    # PDF upload
	    uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
	    if uploaded_file:
	        col1, col2 = st.columns([1, 1])
	        with col1:
	            if st.button("Process PDF"):
	                with st.spinner("Processing PDF..."):
	                    st.session_state.vector_store, st.session_state.pdf_text = process_pdf(uploaded_file)
	                if st.session_state.vector_store:
	                    st.success("PDF processed! Ask questions or summarize.")
	                    # A new document starts a fresh conversation.
	                    st.session_state.messages = []
	                else:
	                    st.error("Failed to process PDF.")
	        with col2:
	            if st.button("Summarize PDF"):
	                if st.session_state.pdf_text:
	                    with st.spinner("Generating summary..."):
	                        summary = summarize_pdf(st.session_state.pdf_text)
	                    # Only record the summary here; the history loop below
	                    # renders it, so it is not shown twice in the same run.
	                    st.session_state.messages.append({"role": "assistant", "content": f"Summary: {summary}"})
	                else:
	                    # Previously a click without processed text did nothing.
	                    st.warning("Process the PDF before requesting a summary.")

	    # Display chat history BEFORE handling a new prompt, so the new exchange
	    # is rendered exactly once per run and appears below the older messages.
	    for message in st.session_state.messages:
	        with st.chat_message(message["role"]):
	            st.markdown(message["content"])

	    # Chat interface
	    if st.session_state.vector_store:
	        prompt = st.chat_input("Ask a question about the PDF:")
	        if prompt:
	            with st.chat_message("user"):
	                st.markdown(prompt)
	            st.session_state.messages.append({"role": "user", "content": prompt})
	            with st.chat_message("assistant"):
	                with st.spinner("Generating answer..."):
	                    answer = answer_question(st.session_state.vector_store, prompt)
	                st.markdown(answer)
	            st.session_state.messages.append({"role": "assistant", "content": answer})

	    # Download chat history (includes any exchange added during this run)
	    if st.session_state.messages:
	        chat_text = "\n".join(f"{m['role'].capitalize()}: {m['content']}" for m in st.session_state.messages)
	        st.download_button("Download Chat History", chat_text, "chat_history.txt")

	except Exception as e:
	    logger.error(f"App initialization failed: {str(e)}")
	    st.error(f"App failed to start: {str(e)}. Check Spaces logs or contact support.")