Spaces:

adinarayana
/

Sample

Sleeping

App Files Files Community

Sample / app.py

adinarayana

Update app.py

8d21acd verified about 1 year ago

raw

history blame

3.95 kB

	import os
	import re
	import pdfminer
	from pdfminer.high_level import extract_pages
	from transformers import pipeline, TFBertForQuestionAnswering, AutoTokenizer

	import tensorflow as tf
	import streamlit as st


	def preprocess_text(element) -> str:
	    """Clean the text of a single PDFMiner layout element.

	    Punctuation is stripped, the text is lowercased and English stop words
	    are dropped. Tokens are re-joined with single spaces.

	    Args:
	        element: A PDFMiner layout element. Only
	            ``pdfminer.layout.LTTextBoxHorizontal`` instances carry text
	            that is processed here.

	    Returns:
	        The preprocessed text, or an empty string when ``element`` is not
	        a horizontal text box.
	    """
	    if not isinstance(element, pdfminer.layout.LTTextBoxHorizontal):
	        return ""

	    # Imported lazily, as in the original, so NLTK is only needed once a
	    # text box is actually processed.
	    from nltk.corpus import stopwords

	    text = element.get_text().strip()
	    # Drop everything that is neither a word character nor whitespace
	    # (i.e. punctuation and symbols).
	    text = re.sub(r'[^\w\s]', '', text)

	    # Lowercase BEFORE filtering: NLTK's English stop-word list is all
	    # lowercase, so capitalised stop words ("The", "And") would otherwise
	    # slip through the filter.
	    text = text.lower()

	    stop_words = set(stopwords.words('english'))
	    return " ".join(word for word in text.split() if word not in stop_words)


	def answer_question(text, question, max_length=512):
	    """Answer a question about ``text`` with an extractive BERT QA model.

	    Args:
	        text: The preprocessed text from the PDF, used as the context.
	        question: The user's question.
	        max_length: Maximum number of tokens (question + context + special
	            tokens) passed to the model. Capped at the tokenizer's own
	            ``model_max_length``.

	    Returns:
	        The answer decoded from the predicted start/end token span, or
	        "No answer found." when that span is empty.
	    """
	    # NOTE(review): this is a base checkpoint, not one fine-tuned for
	    # question answering; presumably the QA head is freshly initialised, so
	    # answers will not be meaningful until it is replaced - confirm.
	    # NOTE(review): model and tokenizer are reloaded on every call; consider
	    # caching them if the installed Streamlit version offers a resource cache.
	    qa_model_name = "bert-base-uncased"  # Replace with your model

	    qa_model = TFBertForQuestionAnswering.from_pretrained(qa_model_name)
	    tokenizer = AutoTokenizer.from_pretrained(qa_model_name)

	    # Let the tokenizer truncate by TOKEN count. Slicing the raw string by
	    # characters (as before) discarded most of the usable context.
	    token_budget = min(max_length, tokenizer.model_max_length)
	    inputs = tokenizer(
	        question,
	        text,
	        return_tensors="tf",
	        truncation=True,
	        max_length=token_budget,
	    )
	    # No padding is requested: with a single example it is unnecessary and
	    # padded positions could otherwise win the argmax below.

	    outputs = qa_model(inputs)

	    # Most likely start token and (exclusive) end token of the answer span.
	    answer_start = int(tf.argmax(outputs.start_logits, axis=1).numpy()[0])
	    answer_end = int(tf.argmax(outputs.end_logits, axis=1).numpy()[0]) + 1

	    # The predicted positions index the tokenised (question, context) pair,
	    # not the characters of ``text``; decode the token ids to get the answer.
	    answer_ids = inputs["input_ids"][0][answer_start:answer_end].numpy().tolist()
	    answer = tokenizer.decode(answer_ids, skip_special_tokens=True).strip()

	    return answer if answer else "No answer found."


	## Streamlit app
	# Top-level Streamlit script: upload a PDF, extract and preprocess its text,
	# then optionally summarise it and/or answer a question about it.

	st.set_page_config(page_title="PDF Summarizer and Q&A")
	st.header("PDF Summarizer and Q&A")

	# User options
	st.subheader("Settings")
	min_summary_length = st.slider("Minimum Summary Length", min_value=50, max_value=500, value=100)
	summarization_model = "facebook/bart-large-cnn"

	# File upload and processing
	uploaded_file = st.file_uploader("Choose a PDF file")
	if uploaded_file is not None:
	    with st.spinner("Processing..."):
	        pieces = []
	        for page_layout in extract_pages(uploaded_file):
	            for element in page_layout:
	                piece = preprocess_text(element)
	                # Non-text elements yield "", so skip them. Appending a
	                # newline for each one made `text` non-empty even when the
	                # PDF had no extractable text, and the error branch below
	                # could never run.
	                if piece:
	                    pieces.append(piece)
	        text = "\n".join(pieces)

	    if text:
	        question = st.text_input("Ask a question about the PDF:")
	        summarize_button = st.button("Generate Summary")

	        if summarize_button:
	            with st.spinner("Summarizing..."):
	                max_input_length = 1024  # Example value, adjust according to your model
	                truncated_text = text[:max_input_length]  # Truncate the text (by characters)
	                # NOTE(review): only min_length is passed; slider values above
	                # the model's default max_length may conflict - confirm.
	                summary_response = pipeline("summarization", model=summarization_model)(
	                    truncated_text, min_length=min_summary_length
	                )
	            st.subheader("Summary")
	            st.write(summary_response[0]["summary_text"])

	        if question:
	            with st.spinner("Answering..."):
	                answer = answer_question(text, question)
	            st.subheader("Answer")
	            st.write(answer)
	    else:
	        st.error("No text found in the PDF.")