import os
import re

import pdfminer
from pdfminer.high_level import extract_pages
from transformers import pipeline, TFBertForQuestionAnswering, AutoTokenizer
import tensorflow as tf
import streamlit as st

# Matches every character that is neither a word character nor whitespace.
_NON_TEXT_RE = re.compile(r'[^\w\s]')

# NOTE(review): "bert-base-uncased" is a base checkpoint; its question-answering
# head is presumably not fine-tuned, so answers will not be meaningful until this
# is replaced with a checkpoint fine-tuned for extractive QA -- confirm.
QA_MODEL_NAME = "bert-base-uncased"

NO_ANSWER = "No answer found."

# Extra tokens allowed above the requested minimum summary length.
SUMMARY_LENGTH_MARGIN = 50


def preprocess_text(element) -> str:
    """Return the cleaned text of a PDF layout element.

    Only ``LTTextBoxHorizontal`` elements carry text here; for them the text
    is stripped, all punctuation/symbol characters are removed and the result
    is lower-cased.

    Args:
        element: A pdfminer layout element.

    Returns:
        The preprocessed text, or an empty string for any other element type.
    """
    if not isinstance(element, pdfminer.layout.LTTextBoxHorizontal):
        return ""
    text = element.get_text().strip()
    text = _NON_TEXT_RE.sub('', text)
    return text.lower()


@st.cache_resource
def _load_qa_model(model_name: str):
    """Load the QA tokenizer and model once and reuse them across reruns."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    qa_model = TFBertForQuestionAnswering.from_pretrained(model_name)
    return tokenizer, qa_model


@st.cache_resource
def _load_summarizer(model_name: str):
    """Build the summarization pipeline once and reuse it across reruns."""
    return pipeline("summarization", model=model_name)


def answer_question(text: str, question: str, max_length: int = 512) -> str:
    """Answer a question from the given text with a pre-trained QA model.

    The question and text are encoded as one sequence pair; the predicted
    start/end positions are token positions in that encoded sequence, so the
    answer is recovered by decoding the selected token ids rather than by
    slicing ``text``.

    Args:
        text: The preprocessed text from the PDF.
        question: The user's question.
        max_length: Maximum length of the encoded question + text pair, in
            tokens. It is capped at the tokenizer's own limit. Only the text
            is truncated to fit, never the question.

    Returns:
        The decoded answer span, or "No answer found." when the inputs are
        empty or the model selects an empty/inverted span.
    """
    if not text.strip() or not question.strip():
        return NO_ANSWER

    tokenizer, qa_model = _load_qa_model(QA_MODEL_NAME)

    inputs = tokenizer(
        question,
        text,
        return_tensors="tf",
        truncation="only_second",
        max_length=min(max_length, tokenizer.model_max_length),
    )
    outputs = qa_model(inputs)

    answer_start = int(tf.argmax(outputs.start_logits, axis=1).numpy()[0])
    # +1 turns the inclusive end position into an exclusive slice bound.
    answer_end = int(tf.argmax(outputs.end_logits, axis=1).numpy()[0]) + 1
    if answer_end <= answer_start:
        return NO_ANSWER

    token_ids = inputs["input_ids"][0].numpy().tolist()
    answer = tokenizer.decode(
        token_ids[answer_start:answer_end], skip_special_tokens=True
    ).strip()
    return answer if answer else NO_ANSWER


def _extract_text(pdf_file) -> str:
    """Return the preprocessed text of every text box in the PDF.

    Elements that yield no text are skipped, so the result is empty when the
    PDF contains no extractable text.
    """
    pieces = []
    for page_layout in extract_pages(pdf_file):
        for element in page_layout:
            piece = preprocess_text(element)
            if piece:
                pieces.append(piece)
    return "\n".join(pieces)


## Streamlit app
st.set_page_config(page_title="PDF Summarizer and Q&A")
st.header("PDF Summarizer and Q&A")

# User options
st.subheader("Settings")
min_summary_length = st.slider(
    "Minimum Summary Length", min_value=50, max_value=500, value=100
)
summarization_model = "facebook/bart-large-cnn"

# File upload and processing
uploaded_file = st.file_uploader("Choose a PDF file")

if uploaded_file is not None:
    with st.spinner("Processing..."):
        text = _extract_text(uploaded_file)

    if text:
        question = st.text_input("Ask a question about the PDF:")
        summarize_button = st.button("Generate Summary")

        if summarize_button:
            with st.spinner("Summarizing..."):
                # Character-level cap on the summarizer input; adjust to the model.
                max_input_length = 1024
                truncated_text = text[:max_input_length]
                summarizer = _load_summarizer(summarization_model)
                # Pass max_length explicitly so it always exceeds the minimum
                # chosen on the slider.
                summary_response = summarizer(
                    truncated_text,
                    min_length=min_summary_length,
                    max_length=min_summary_length + SUMMARY_LENGTH_MARGIN,
                )
            st.subheader("Summary")
            st.write(summary_response[0]["summary_text"])

        if question:
            with st.spinner("Answering..."):
                answer = answer_question(text, question)
            st.subheader("Answer")
            st.write(answer)
    else:
        st.error("No text found in the PDF.")