Sample / app.py
adinarayana's picture
Update app.py
cd78587 verified
raw
history blame
1.93 kB
import re
import pdfminer
from pdfminer.high_level import extract_pages
from transformers import pipeline
import streamlit as st
def preprocess_text(element):
    """Return the cleaned text carried by a PDF layout element.

    Args:
        element: An object yielded while iterating a page layout. It is
            expected to expose ``get_text()``; objects that do not are
            treated as carrying no text.

    Returns:
        The element's text with surrounding whitespace stripped and every
        character that is neither a word character nor whitespace removed.
        An empty string when the element has no ``get_text`` method.
    """
    # Page layouts also contain non-text objects; calling get_text()
    # unconditionally on those raised AttributeError and aborted extraction.
    get_text = getattr(element, "get_text", None)
    if get_text is None:
        return ""

    text = get_text().strip()
    # Drop punctuation/symbols; word characters (including "_") and
    # whitespace are kept. Stop-word removal and lowercasing are
    # deliberately not applied.
    return re.sub(r'[^\w\s]', '', text)
def get_openai_response(text, length=100, model="gpt-3.5-turbo-instruct"):
    """Summarize ``text`` with a transformers summarization pipeline.

    Args:
        text: The text to summarize.
        length: Passed to the pipeline as ``max_length`` for the summary.
        model: Model identifier handed to ``pipeline("summarization", ...)``.

    Returns:
        Whatever the summarization pipeline returns for ``text``; the caller
        in this file reads ``result[0]["summary_text"]``.
    """
    # NOTE(review): the default "gpt-3.5-turbo-instruct" looks like an OpenAI
    # API model name, not a transformers checkpoint -- confirm it can actually
    # be loaded by pipeline(); the default is kept for interface compatibility.
    summarizer = pipeline("summarization", model=model)
    # Text extracted from a whole PDF routinely exceeds the model's maximum
    # input size; truncate the input instead of failing or degrading on it.
    return summarizer(text, max_length=length, truncation=True)
## Streamlit app
# Page flow: choose settings, upload a PDF, show its extracted text, then
# summarize that text on demand.
st.set_page_config(page_title="Trail Demo")
st.header("PDF Summarizer")

# User options
st.subheader("Settings")
summary_length = st.slider("Summary Length", min_value=50, max_value=500, value=100)
summarization_model = st.selectbox("Summarization Model", ["gpt-3.5-turbo-instruct", "t5-small"])

# File upload and processing
uploaded_file = st.file_uploader("Choose a PDF file")
if uploaded_file is not None:
    with st.spinner("Processing..."):
        pieces = []
        for page_layout in extract_pages(uploaded_file):
            for element in page_layout:
                # Only elements exposing get_text() carry text; skip the
                # rest so they can neither crash extraction nor add blank
                # lines to the output.
                if not hasattr(element, "get_text"):
                    continue
                cleaned = preprocess_text(element)
                if cleaned:
                    pieces.append(cleaned)
        # Joining only non-empty pieces keeps `text` empty when the PDF has
        # no usable text, so the error branch below is reachable.
        text = "\n".join(pieces)

    if text:
        st.subheader("Extracted Text")
        st.write(text)
        if st.button("Generate Summary"):
            response = None
            # st.spinner only displays while used as a context manager.
            with st.spinner("Summarizing..."):
                try:
                    response = get_openai_response(
                        text,
                        length=summary_length,
                        model=summarization_model,
                    )
                except OSError as err:
                    # presumably what the model loader raises when the
                    # selected identifier cannot be resolved -- TODO confirm
                    st.error(f"Could not load model '{summarization_model}': {err}")
            if response:
                st.subheader("Summary")
                st.write(response[0]["summary_text"])
    else:
        st.error("No text found in the PDF.")