Spaces:
Sleeping
Sleeping
File size: 2,209 Bytes
99606c7 36dcb40 99606c7 36dcb40 99606c7 36dcb40 99606c7 36dcb40 99606c7 36dcb40 99606c7 36dcb40 8fab35e 36dcb40 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 |
import streamlit as st
from transformers import pipeline
import fitz # PyMuPDF
import docx
import concurrent.futures
# Streamlit re-executes this script on every widget interaction; caching the
# loader keeps the summarization model from being rebuilt on each rerun.
# Releases without st.cache_resource fall back to an uncached loader.
_cache_resource = getattr(st, "cache_resource", lambda func: func)


@_cache_resource
def _load_summarizer():
    """Build the summarization pipeline backed by facebook/bart-large-cnn."""
    return pipeline("summarization", model="facebook/bart-large-cnn")


# Summarization pipeline
pipe = _load_summarizer()
def chunk_text(text, chunk_size=512):
    """Split ``text`` into consecutive pieces of at most ``chunk_size`` characters.

    The split is purely positional: a boundary falls every ``chunk_size``
    characters regardless of word or sentence limits, and the last piece
    holds whatever remains.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per piece; must be positive.

    Returns:
        A list of substrings that concatenate back to ``text``. Empty input
        yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative.
    """
    if chunk_size <= 0:
        # A negative step makes range() empty, which would silently drop
        # the entire text instead of reporting the bad argument.
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    return [
        text[start:start + chunk_size]
        for start in range(0, len(text), chunk_size)
    ]
def summarize_chunk(chunk):
    """Summarize one piece of text with the module-level ``pipe``.

    Args:
        chunk: Text passed as the sole argument to ``pipe``.

    Returns:
        The ``'summary_text'`` entry of the first result ``pipe`` returns.
    """
    first_result = pipe(chunk)[0]
    return first_result['summary_text']
def extract_text_from_pdf(file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file: A binary file-like object; its remaining bytes are consumed
            with ``read()`` and handed to PyMuPDF as an in-memory stream.

    Returns:
        The ``get_text()`` output of each page joined with no separator.
    """
    # The ``with`` block closes the document even if a page fails to extract;
    # the join runs inside it, so every page is read before the close.
    with fitz.open(stream=file.read(), filetype="pdf") as doc:
        return "".join(page.get_text() for page in doc)
def extract_text_from_docx(file):
    """Return the paragraph text of a .docx file, one paragraph per line.

    Args:
        file: The value handed to ``docx.Document`` to open the document.

    Returns:
        The ``text`` of each entry in the document's ``paragraphs``, joined
        with newline characters.
    """
    document = docx.Document(file)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)
_DOCX_MIME_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"


def _summarize_and_display(text):
    """Summarize ``text`` chunk by chunk and render it under a "Summary" heading."""
    if not text.strip():
        # Nothing to feed the model; say so instead of rendering an empty summary.
        st.warning("No text found to summarize.")
        return
    chunks = chunk_text(text)
    # executor.map yields results in input order, so the partial summaries
    # stay in the same order as the chunks they came from.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        summaries = list(executor.map(summarize_chunk, chunks))
    st.subheader("Summary")
    st.write(' '.join(summaries))


def _read_uploaded_file(uploaded_file):
    """Return the text of an uploaded file, or ``None`` if its MIME type is unsupported."""
    mime_type = uploaded_file.type
    if mime_type == "application/pdf":
        return extract_text_from_pdf(uploaded_file)
    if mime_type == _DOCX_MIME_TYPE:
        return extract_text_from_docx(uploaded_file)
    if mime_type == "text/plain":
        return uploaded_file.read().decode("utf-8")
    return None


def main():
    """Render the summarization UI and handle clicks on the "Summarize" button.

    Typed text takes priority over an uploaded file. When the text area is
    empty or whitespace-only, the uploaded file (if any) is summarized
    instead; when neither is provided, a warning is shown.
    """
    st.title("Text Summarization App")
    input_text = st.text_area("Enter Text (Due to the Free CPU Basic Hardware being used, it takes more time for the output, please keep the prompt minimal)")
    # NOTE(review): 'doc' is offered here, but no branch below handles legacy
    # .doc uploads, so they presumably end in "Unsupported file type" - confirm
    # whether it should be dropped from this list.
    uploaded_file = st.file_uploader("Upload a file", type=['pdf', 'txt', 'doc', 'docx'])

    if not st.button("Summarize"):
        return

    if input_text and input_text.strip():
        _summarize_and_display(input_text)
        return

    if uploaded_file is None:
        st.warning("Please enter some text or upload a file to summarize.")
        return

    try:
        file_text = _read_uploaded_file(uploaded_file)
    except UnicodeDecodeError:
        # Plain-text uploads are decoded as UTF-8; report a bad encoding
        # instead of crashing the app.
        st.error("Could not decode the text file as UTF-8")
        return

    if file_text is None:
        st.error("Unsupported file type")
        return

    _summarize_and_display(file_text)


if __name__ == "__main__":
    main()
|