import os
import re
import pdfminer
from pdfminer.high_level import extract_pages
from transformers import pipeline, TFBertForQuestionAnswering, AutoTokenizer
import tensorflow as tf
import streamlit as st
def preprocess_text(element):
    """Return the normalized text of a horizontal text box, or "" otherwise.

    For a ``pdfminer.layout.LTTextBoxHorizontal`` element the text is
    stripped of surrounding whitespace, every character that is neither a
    word character nor whitespace is removed, and the result is lower-cased.
    Any other kind of layout element yields an empty string.
    """
    # Anything that is not a horizontal text box contributes no text.
    if not isinstance(element, pdfminer.layout.LTTextBoxHorizontal):
        return ""

    raw = element.get_text().strip()
    # Drop punctuation/symbols, keeping only word characters and whitespace.
    without_symbols = re.sub(r'[^\w\s]', '', raw)
    return without_symbols.lower()
@st.cache_resource
def _load_qa_components(model_name):
    """Load the QA model and tokenizer once and reuse them across reruns.

    Streamlit re-executes the whole script on every interaction, so without
    caching the (very large) model would be reloaded for every question.
    """
    qa_model = TFBertForQuestionAnswering.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return qa_model, tokenizer


def answer_question(text, question, max_length=512):
    """Answer a question about ``text`` with a pre-trained extractive QA model.

    Args:
        text: Context to search for the answer.
        question: The question to answer.
        max_length: Maximum number of tokens (question + context) fed to the
            model; input beyond this limit is truncated, so only the leading
            part of a long ``text`` is considered.

    Returns:
        The extracted answer span as a string, or ``"No answer found."`` when
        the inputs are empty or the model does not yield a usable span.
    """
    no_answer = "No answer found."

    # Nothing to search, or nothing asked: skip loading/running the model.
    if not text or not text.strip() or not question or not question.strip():
        return no_answer

    qa_model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
    qa_model, tokenizer = _load_qa_components(qa_model_name)

    # Add special tokens and tokenize the (question, context) pair.
    inputs = tokenizer(
        question,
        text,
        return_tensors="tf",
        padding=True,
        truncation=True,
        max_length=max_length,
    )

    # Model prediction
    outputs = qa_model(inputs)

    # Most likely start token and (exclusive) end token of the answer span.
    answer_start = int(tf.argmax(outputs.start_logits, axis=1).numpy()[0])
    answer_end = int(tf.argmax(outputs.end_logits, axis=1).numpy()[0]) + 1

    # An end before the start means the model produced no coherent span.
    if answer_end <= answer_start:
        return no_answer

    # Decode the span, dropping special tokens such as [CLS]/[SEP] that would
    # otherwise leak into the answer shown to the user.
    answer_ids = inputs["input_ids"][0][answer_start:answer_end].numpy().tolist()
    answer = tokenizer.decode(answer_ids, skip_special_tokens=True).strip()
    return answer if answer else no_answer
# Streamlit app
st.set_page_config(page_title="PDF Summarizer and Q&A")
st.header("PDF Summarizer and Q&A")


@st.cache_resource
def _load_summarizer(model_name):
    """Build the summarization pipeline once and reuse it across reruns."""
    return pipeline("summarization", model=model_name)


# User options
min_summary_length = st.slider("Minimum Summary Length", min_value=50, max_value=500, value=100)
summarization_model = "facebook/bart-large-cnn"

# File upload and processing
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
if uploaded_file is not None:
    with st.spinner("Processing..."):
        cleaned_chunks = (
            preprocess_text(element)
            for page_layout in extract_pages(uploaded_file)
            for element in page_layout
        )
        # Skip elements that produced no text so a PDF without any text
        # yields an empty string rather than a run of newlines.
        text = "\n".join(chunk for chunk in cleaned_chunks if chunk.strip())

    if text:
        question = st.text_input("Ask a question about the PDF:")
        summarize_button = st.button("Generate Summary")

        if summarize_button:
            with st.spinner("Summarizing..."):
                summarizer = _load_summarizer(summarization_model)
                summary_response = summarizer(
                    text,
                    min_length=min_summary_length,
                    # Keep max_length above the chosen minimum; otherwise the
                    # model's default maximum can be smaller than min_length.
                    max_length=2 * min_summary_length,
                    # Long documents exceed the model's input limit.
                    truncation=True,
                )
            st.subheader("Summary")
            st.write(summary_response[0]["summary_text"])

        if question:
            with st.spinner("Answering..."):
                answer = answer_question(text, question)
            st.subheader("Answer")
            st.write(answer)
    else:
        st.error("No text found in the PDF.")