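# Streamlit app that summarizes an uploaded PDF and answers questions about its text.
# Assumed dependencies (not pinned in this file): streamlit, pdfminer.six,
# transformers, and tensorflow.
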
import re

from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal
from transformers import pipeline, TFBertForQuestionAnswering, AutoTokenizer
import tensorflow as tf
import streamlit as st

def preprocess_text(element):
    """Preprocesses a text element extracted from the PDF."""
    if isinstance(element, LTTextBoxHorizontal):
        text = element.get_text().strip()
        # Strip punctuation and other non-word characters
        text = re.sub(r'[^\w\s]', '', text)
        # Convert to lowercase
        text = text.lower()
        return text
    else:
        return ""

def answer_question(text, question, max_length=512):
    """Answers a question using the provided text and a pre-trained QA model."""
    qa_model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
    qa_model = TFBertForQuestionAnswering.from_pretrained(qa_model_name)
    tokenizer = AutoTokenizer.from_pretrained(qa_model_name)
    # Tokenize the question/context pair (special tokens are added automatically)
    inputs = tokenizer(question, text, return_tensors="tf", padding=True, truncation=True, max_length=max_length)
    # Model prediction
    outputs = qa_model(inputs)
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits
    # Indices of the most likely start and end of the answer span
    answer_start = tf.argmax(start_logits, axis=1).numpy()[0]
    answer_end = (tf.argmax(end_logits, axis=1) + 1).numpy()[0]
    # Decode the answer span back into a string
    answer = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end])
    )
    return answer if answer else "No answer found."

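# Minimal usage sketch (assumed standalone call, outside the Streamlit flow):
#   context = "The Eiffel Tower was completed in 1889 and stands in Paris."
#   print(answer_question(context, "When was the Eiffel Tower completed?"))
# Note that the model weights are downloaded and loaded on every call, so this is
# slow; caching the loaded model would be a natural optimization.
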
# Streamlit app
st.set_page_config(page_title="PDF Summarizer and Q&A")
st.header("PDF Summarizer and Q&A")

# User options
st.subheader("Settings")
min_summary_length = st.slider("Minimum Summary Length", min_value=50, max_value=500, value=100)
summarization_model = "facebook/bart-large-cnn"

# File upload and processing
uploaded_file = st.file_uploader("Choose a PDF file")
if uploaded_file is not None:
    with st.spinner("Processing..."):
        text = ""
        for page_layout in extract_pages(uploaded_file):
            for element in page_layout:
                text += preprocess_text(element) + "\n"

    # Newlines are appended even for skipped elements, so check the stripped text
    if text.strip():
        question = st.text_input("Ask a question about the PDF:")
        summarize_button = st.button("Generate Summary")

        if summarize_button:
            with st.spinner("Summarizing..."):
                summary_response = pipeline("summarization", model=summarization_model)(
                    text,
                    min_length=min_summary_length,
                    # Keep the generation cap above the user-chosen minimum (chosen margin)
                    max_length=min_summary_length + 150,
                    # Truncate input that exceeds the model's maximum context length
                    truncation=True,
                )
            st.subheader("Summary")
            st.write(summary_response[0]["summary_text"])

        if question:
            with st.spinner("Answering..."):
                answer = answer_question(text, question)
            st.subheader("Answer")
            st.write(answer)
    else:
        st.error("No text found in the PDF.")
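# To run locally (assuming this file is saved as app.py):
#   streamlit run app.py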