Spaces:

adinarayana
/

Sample

Sleeping

App Files Files Community

Sample / app.py

adinarayana

Update app.py

cd78587 verified about 1 year ago

raw

history blame

1.93 kB

	import re
	import pdfminer
	from pdfminer.high_level import extract_pages
	from transformers import pipeline

	import streamlit as st

	def preprocess_text(element):
	    """Return the cleaned text of a single PDF layout element.

	    Elements that expose no ``get_text`` method (pdfminer also yields
	    non-text layout objects, e.g. figures or drawn lines) produce an empty
	    string instead of raising ``AttributeError``.

	    Args:
	        element: A layout object obtained by iterating a page returned from
	            ``extract_pages``.

	    Returns:
	        The element's text with surrounding whitespace stripped and every
	        character that is neither a word character nor whitespace removed.
	        Stop words are kept and case is left unchanged.
	    """
	    get_text = getattr(element, "get_text", None)
	    if get_text is None:
	        # Not a text-bearing element: nothing to contribute.
	        return ""

	    # Strip first, then drop punctuation/symbols (same order as before, so
	    # inner whitespace left behind by removed characters is preserved).
	    text = get_text().strip()
	    return re.sub(r'[^\w\s]', '', text)

	@st.cache_resource
	def _load_summarizer(model):
	    """Build the summarization pipeline for *model* once and reuse it across reruns."""
	    return pipeline("summarization", model=model)


	def get_openai_response(text, length=100, model="gpt-3.5-turbo-instruct"):
	    """Summarize *text* with a transformers summarization pipeline.

	    Args:
	        text: The text to summarize.
	        length: Passed to the pipeline as ``max_length`` for the summary.
	        model: Model identifier handed to ``pipeline("summarization", ...)``.

	    Returns:
	        Whatever the pipeline returns; the caller in this file reads
	        ``response[0]["summary_text"]`` from it.

	    NOTE(review): despite the function name, no OpenAI API is called here.
	    The default "gpt-3.5-turbo-instruct" looks like an OpenAI model name
	    rather than a Hugging Face Hub checkpoint, so loading it is expected to
	    fail - confirm and pick a Hub summarization model if so.
	    """
	    # Loading a model is expensive; reuse the cached pipeline per model id.
	    summarizer = _load_summarizer(model)
	    # Truncate over-long input so a large PDF does not exceed the model's
	    # maximum input length.
	    return summarizer(text, max_length=length, truncation=True)

	## Streamlit app
	# Page flow: choose settings, upload a PDF, show its extracted text, then
	# summarize it on demand.

	st.set_page_config(page_title="Trail Demo")
	st.header("PDF Summarizer")

	# User options
	st.subheader("Settings")
	summary_length = st.slider("Summary Length", min_value=50, max_value=500, value=100)
	# NOTE(review): "gpt-3.5-turbo-instruct" looks like an OpenAI model name, not a
	# Hugging Face Hub checkpoint; selecting it is expected to fail when the
	# transformers pipeline tries to load it - confirm and replace if so.
	summarization_model = st.selectbox("Summarization Model", ["gpt-3.5-turbo-instruct", "t5-small"])

	# File upload and processing (restricted to PDFs so the parser is not fed
	# arbitrary files)
	uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf"])
	if uploaded_file is not None:
	    with st.spinner("Processing..."):
	        pieces = []
	        for page_layout in extract_pages(uploaded_file):
	            for element in page_layout:
	                # Pages also contain non-text layout objects; skip them.
	                if not hasattr(element, "get_text"):
	                    continue
	                cleaned = preprocess_text(element)
	                # Keep only elements that still have content, so a PDF
	                # without real text ends up with an empty string below.
	                if cleaned:
	                    pieces.append(cleaned)
	        text = "\n".join(pieces)

	    if text:
	        st.subheader("Extracted Text")
	        st.write(text)
	        if st.button("Generate Summary"):
	            # st.spinner only shows while used as a context manager.
	            with st.spinner("Summarizing..."):
	                response = get_openai_response(text, length=summary_length, model=summarization_model)
	            st.subheader("Summary")
	            st.write(response[0]["summary_text"])
	    else:
	        st.error("No text found in the PDF.")