Spaces:
Sleeping
Sleeping
import streamlit as st | |
from transformers import pipeline | |
import fitz # PyMuPDF | |
import docx | |
import concurrent.futures | |
# Summarization pipeline.
# Streamlit re-executes this script from top to bottom on every widget
# interaction, so the model is built inside a cached factory: the pipeline
# is constructed once and reused across reruns instead of being reloaded
# each time the user clicks something.
@st.cache_resource
def _load_summarizer():
    """Build the ``facebook/bart-large-cnn`` summarization pipeline."""
    return pipeline("summarization", model="facebook/bart-large-cnn")


pipe = _load_summarizer()
def chunk_text(text, chunk_size=512):
    """Split *text* into consecutive pieces of at most *chunk_size* characters.

    The split is purely positional (by character count), so a piece may
    begin or end in the middle of a word.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per piece; must be a
            positive integer.

    Returns:
        A list of substrings that concatenate back to *text*. Empty input
        yields an empty list.

    Raises:
        ValueError: If *chunk_size* is not positive.
    """
    # A negative step would make ``range`` empty and silently drop the whole
    # text; zero would surface as an unrelated-looking error from ``range``.
    if chunk_size <= 0:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )
    return [
        text[start:start + chunk_size]
        for start in range(0, len(text), chunk_size)
    ]
def summarize_chunk(chunk):
    """Summarize one piece of text with the module-level ``pipe``.

    ``pipe`` is called with the chunk, and the ``summary_text`` entry of the
    first element of its result is returned.
    """
    results = pipe(chunk)
    first_result = results[0]
    return first_result['summary_text']
def extract_text_from_pdf(file):
    """Return the concatenated text of every page in a PDF.

    Args:
        file: A binary file-like object whose ``read()`` returns the raw
            PDF bytes.

    Returns:
        The text of all pages joined in page order, with no separator added
        between pages.
    """
    # Open from memory; the context manager closes the document even if
    # text extraction raises part-way through.
    with fitz.open(stream=file.read(), filetype="pdf") as doc:
        # join() avoids rebuilding the string on every page as ``+=`` does.
        return "".join(page.get_text() for page in doc)
def extract_text_from_docx(file):
    """Return the text of a Word document, one paragraph per line.

    ``file`` is handed straight to ``docx.Document``; the ``text`` of each
    entry in the document's ``paragraphs`` is joined with newline characters.
    """
    document = docx.Document(file)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)
def _summarize_and_show(text):
    """Summarize *text* chunk by chunk and render the joined summaries."""
    chunks = chunk_text(text)
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # executor.map preserves input order, so summaries line up with chunks.
        summaries = list(executor.map(summarize_chunk, chunks))
    st.subheader("Summary")
    st.write(' '.join(summaries))


def _extract_uploaded_text(uploaded_file):
    """Return the text of an uploaded file, or None after reporting an error."""
    mime_type = uploaded_file.type
    if mime_type == "application/pdf":
        return extract_text_from_pdf(uploaded_file)
    if mime_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        return extract_text_from_docx(uploaded_file)
    if mime_type == "text/plain":
        try:
            return uploaded_file.read().decode("utf-8")
        except UnicodeDecodeError:
            # Report the problem in the UI instead of crashing the script run.
            st.error("The uploaded text file is not valid UTF-8")
            return None
    # Reached for any other type, e.g. a 'doc' upload: the uploader accepts
    # that extension but no branch above handles it.
    st.error("Unsupported file type")
    return None


def main():
    """Render the summarization UI and handle the "Summarize" button.

    Text typed into the text area takes precedence over an uploaded file.
    When the button is pressed the chosen text is split into chunks, each
    chunk is summarized, and the joined summaries are displayed. A warning
    is shown when there is nothing to summarize.
    """
    st.title("Text Summarization App")
    input_text = st.text_area("Enter Text (Due to the Free CPU Basic Hardware being used, it takes more time for the output, please keep the prompt minimal)")
    uploaded_file = st.file_uploader("Upload a file", type=['pdf', 'txt', 'doc', 'docx'])

    if not st.button("Summarize"):
        return

    if input_text and input_text.strip():
        text = input_text
    elif uploaded_file is not None:
        text = _extract_uploaded_text(uploaded_file)
        if text is None:
            # The helper has already shown an error message.
            return
    else:
        st.warning("Please enter some text or upload a file to summarize.")
        return

    # Avoid invoking the model when there is no actual content to summarize.
    if not text.strip():
        st.warning("No text could be extracted from the uploaded file.")
        return

    _summarize_and_show(text)
# Script entry point: build the UI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()