# Page-capture residue ("Spaces: Sleeping"); kept as a comment, not part of the program.
import os
import re

import pdfminer
import pdfminer.layout
import streamlit as st
import tensorflow as tf
from pdfminer.high_level import extract_pages
from transformers import AutoTokenizer, TFBertForQuestionAnswering, pipeline
# Holds the English stop-word set so the NLTK corpus is read once instead of
# once per PDF element.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    if "english" not in _STOP_WORDS_CACHE:
        # Imported lazily (as the original code did) so NLTK is only required
        # once text is actually being preprocessed.
        from nltk.corpus import stopwords

        _STOP_WORDS_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOP_WORDS_CACHE["english"]


def preprocess_text(element):
    """Preprocess a text element from the PDF.

    Punctuation is removed, the text is lower-cased, and English stop words
    are dropped.

    Args:
        element: A PDFminer layout element.

    Returns:
        The cleaned, lower-cased text with words separated by single spaces,
        or an empty string when ``element`` is not an
        ``LTTextBoxHorizontal``.

    Raises:
        LookupError: Raised by NLTK when the ``stopwords`` corpus has not
            been downloaded.
    """
    if not isinstance(element, pdfminer.layout.LTTextBoxHorizontal):
        return ""

    text = element.get_text().strip()
    # Drop everything that is neither a word character nor whitespace.
    text = re.sub(r'[^\w\s]', '', text)
    # Lower-case BEFORE filtering: the NLTK stop-word list is lowercase, so
    # filtering first would let capitalised stop words ("The", "And") through.
    text = text.lower()
    stop_words = _english_stop_words()
    return " ".join(word for word in text.split() if word not in stop_words)
@st.cache_resource
def _load_qa_components(model_name):
    """Load the QA model and tokenizer once and reuse them across Streamlit reruns."""
    qa_model = TFBertForQuestionAnswering.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return qa_model, tokenizer


def answer_question(text, question, max_length=512):
    """Answer a question using the provided text and a pre-trained model.

    The question and text are encoded together as one sequence pair, the model
    predicts start and end token positions, and the tokens in that span are
    decoded back into a string.

    Args:
        text: The preprocessed text from the PDF, used as the context.
        question: The user's question.
        max_length: Maximum number of tokens in the encoded question+context
            pair. Longer input is truncated, so only the beginning of a long
            document is searched. Values above the tokenizer's own limit are
            clamped to that limit.

    Returns:
        The answer extracted from the text using the model, or
        "No answer found." when no non-empty span is predicted.
    """
    no_answer = "No answer found."
    if not text or not text.strip() or not question or not question.strip():
        return no_answer

    # NOTE(review): "bert-base-uncased" is the base checkpoint, not one
    # fine-tuned for question answering, so its span-prediction head is not
    # trained. Replace with a QA-fine-tuned checkpoint for meaningful answers.
    qa_model_name = "bert-base-uncased"  # Replace with your model
    qa_model, tokenizer = _load_qa_components(qa_model_name)

    # Truncate by tokens rather than characters, and never exceed what the
    # tokenizer reports as the model's maximum input size.
    max_tokens = min(max_length, tokenizer.model_max_length)
    inputs = tokenizer(
        question,
        text,
        return_tensors="tf",
        padding="max_length",
        truncation=True,
        max_length=max_tokens,
    )
    outputs = qa_model(inputs)

    # argmax yields TOKEN positions within the encoded pair.
    answer_start = int(tf.argmax(outputs.start_logits, axis=1).numpy()[0])
    # +1 turns the inclusive end position into an exclusive slice bound.
    answer_end = int(tf.argmax(outputs.end_logits, axis=1).numpy()[0]) + 1
    if answer_end <= answer_start:
        return no_answer

    # The positions index the token ids, not the characters of `text`, so the
    # answer is recovered by decoding the selected tokens.
    answer_ids = inputs["input_ids"][0][answer_start:answer_end].numpy().tolist()
    answer = tokenizer.decode(answer_ids, skip_special_tokens=True).strip()
    return answer if answer else no_answer
## Streamlit app
# Page layout: settings, a PDF uploader, then (once text has been extracted)
# a question box and a summary button.
st.set_page_config(page_title="PDF Summarizer and Q&A")
st.header("PDF Summarizer and Q&A")

# User options
st.subheader("Settings")
# NOTE(review): only min_length is passed to the summarizer below; the upper
# slider values may exceed the model's default maximum summary length.
min_summary_length = st.slider("Minimum Summary Length", min_value=50, max_value=500, value=100)
summarization_model = "facebook/bart-large-cnn"

# File upload and processing
# Restrict the picker to PDFs so other file types are not handed to pdfminer.
uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf"])
if uploaded_file is not None:
    with st.spinner("Processing..."):
        # Collect the non-empty text chunks and join them once. Elements that
        # are not text boxes yield "" and are skipped, so a PDF with no
        # extractable text produces an empty string instead of bare newlines.
        chunks = []
        for page_layout in extract_pages(uploaded_file):
            for element in page_layout:
                chunk = preprocess_text(element)
                if chunk:
                    chunks.append(chunk)
        text = "\n".join(chunks)

    if text.strip():
        question = st.text_input("Ask a question about the PDF:")
        summarize_button = st.button("Generate Summary")

        if summarize_button:
            with st.spinner("Summarizing..."):
                max_input_length = 1024  # Example value, adjust according to your model
                # Character-based cut that keeps the summarizer input short.
                truncated_text = text[:max_input_length]
                summary_response = pipeline("summarization", model=summarization_model)(
                    truncated_text, min_length=min_summary_length
                )
            st.subheader("Summary")
            st.write(summary_response[0]["summary_text"])

        if question:
            with st.spinner("Answering..."):
                answer = answer_question(text, question)
            st.subheader("Answer")
            st.write(answer)
    else:
        st.error("No text found in the PDF.")