import os
import re
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal
from transformers import pipeline, TFBertForQuestionAnswering, AutoTokenizer
import tensorflow as tf
import streamlit as st
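
# Note (environment assumptions): the summarization and question-answering models
# are downloaded from the Hugging Face Hub on first use, and the optional stop-word
# removal below requires the NLTK stopwords corpus (one-time: nltk.download("stopwords")).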

def preprocess_text(element):
    """Preprocesses a text element extracted from the PDF.

    Args:
        element: A pdfminer layout element.

    Returns:
        The preprocessed text, or an empty string for non-text elements.
    """
    if isinstance(element, LTTextBoxHorizontal):
        text = element.get_text().strip()
        # Remove non-textual characters (adjust the regular expression as needed)
        text = re.sub(r'[^\w\s]', '', text)
        # Remove stop words (optional; requires the NLTK stopwords corpus)
        from nltk.corpus import stopwords
        stop_words = set(stopwords.words('english'))
        text = " ".join([word for word in text.split() if word not in stop_words])
        # Convert to lowercase (optional)
        text = text.lower()
        return text
    else:
        return ""

def answer_question(text, question, max_length=512):
    """Answers a question about the provided text using a pre-trained model.

    Args:
        text: The preprocessed text from the PDF.
        question: The user's question.
        max_length: Rough character limit applied to the text before tokenization.

    Returns:
        The answer extracted from the text by the model.
    """
    # bert-base-uncased is not fine-tuned for question answering; replace it with a
    # QA checkpoint (e.g. a SQuAD-fine-tuned BERT) for meaningful answers.
    qa_model_name = "bert-base-uncased"
    # The model is reloaded on every call; consider caching it (e.g. with
    # st.cache_resource) to speed up repeated questions.
    qa_model = TFBertForQuestionAnswering.from_pretrained(qa_model_name)
    tokenizer = AutoTokenizer.from_pretrained(qa_model_name)
    # Rough character-level truncation; the tokenizer below also truncates to the
    # model's maximum sequence length.
    if len(text) > max_length:
        text = text[:max_length]
    # Add special tokens and tokenize the (question, context) pair:
    inputs = tokenizer(
        question, text, return_tensors="tf", padding="max_length", truncation=True
    )
    outputs = qa_model(inputs)
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits
    # Find the most likely start and end token positions of the answer span
    answer_start = int(tf.argmax(start_logits, axis=1).numpy()[0])
    answer_end = int(tf.argmax(end_logits, axis=1).numpy()[0]) + 1  # exclusive end index
    # Decode the answer span from the tokenized input (the logits index tokens,
    # not characters of the original text)
    answer_ids = inputs["input_ids"][0][answer_start:answer_end]
    answer = tokenizer.decode(answer_ids, skip_special_tokens=True).strip()
    return answer if answer else "No answer found."

# Streamlit app
st.set_page_config(page_title="PDF Summarizer and Q&A")
st.header("PDF Summarizer and Q&A")

# User options
st.subheader("Settings")
min_summary_length = st.slider("Minimum Summary Length", min_value=50, max_value=500, value=100)
summarization_model = "facebook/bart-large-cnn"

# File upload and processing
uploaded_file = st.file_uploader("Choose a PDF file")
if uploaded_file is not None:
    with st.spinner("Processing..."):
        text = ""
        for page_layout in extract_pages(uploaded_file):
            for element in page_layout:
                text += preprocess_text(element) + "\n"
    if text.strip():
        question = st.text_input("Ask a question about the PDF:")
        summarize_button = st.button("Generate Summary")
        if summarize_button:
            with st.spinner("Summarizing..."):
                max_input_length = 1024  # Example value; adjust to the summarization model
                truncated_text = text[:max_input_length]  # Truncate the text
                summarizer = pipeline("summarization", model=summarization_model)
                summary_response = summarizer(truncated_text, min_length=min_summary_length)
                st.subheader("Summary")
                st.write(summary_response[0]["summary_text"])
        if question:
            with st.spinner("Answering..."):
                answer = answer_question(text, question)
                st.subheader("Answer")
                st.write(answer)
    else:
        st.error("No text found in the PDF.")
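
# To launch the app locally (assuming this file is saved as app.py):
#     streamlit run app.py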