# NOTE(review): page-scrape residue from the hosting Space ("Spaces: Runtime error")
# converted to a comment so the file parses as Python.
| import streamlit as st | |
| from transformers import pipeline | |
| from qa.qa import file_to_doc | |
| from transformers import AutoTokenizer | |
| from typing import Text, Union | |
def summarization_model(
    model_name: str = "facebook/bart-large-cnn",
    custom_tokenizer: Union[AutoTokenizer, bool] = False
):
    """Build and return a HuggingFace summarization pipeline.

    Args:
        model_name: checkpoint to load for the summarization model.
        custom_tokenizer: a tokenizer to use instead of the model's own;
            leave as ``False`` (default) to load the tokenizer that matches
            ``model_name``.

    Returns:
        A ``transformers`` pipeline configured for the ``summarization`` task.
    """
    # A falsy custom_tokenizer means "use the model's own tokenizer";
    # `or` replaces the original non-idiomatic `== False` comparison.
    summarizer = pipeline(
        model=model_name,
        tokenizer=custom_tokenizer or model_name,
        task="summarization"
    )
    return summarizer
def split_string_into_token_chunks(s: Text, _tokenizer: AutoTokenizer, chunk_size: int):
    """Break ``s`` into substrings of at most ``chunk_size`` tokens each.

    The whole string is tokenized once, the resulting token ids are sliced
    into consecutive windows of ``chunk_size``, and every window is decoded
    back into text.
    """
    all_ids = _tokenizer.encode(s)
    pieces = []
    for start in range(0, len(all_ids), chunk_size):
        window = all_ids[start:start + chunk_size]
        pieces.append(_tokenizer.decode(window))
    return pieces
def summarization_main():
    """Render the Streamlit text-summarization page.

    Collects input text (typed directly or extracted from an uploaded file),
    splits it into chunks that fit the model's context window, summarizes each
    chunk, and displays the concatenated summaries.
    """
    st.markdown("<h2 style='text-align: center'>Text Summarization</h2>", unsafe_allow_html=True)
    st.markdown("<h3 style='text-align: left'><b>What is text summarization about?<b></h3>", unsafe_allow_html=True)
    st.write("""
Text summarization is common NLP task concerned with producing a shorter version of a given text while preserving the important information
contained in such text
""")
    OPTION_1 = "I want to input some text"
    OPTION_2 = "I want to upload a file"
    option = st.radio("How would you like to start? Choose an option below", [OPTION_1, OPTION_2])
    # greenlight: becomes True once we actually have some text to summarize
    text_is_given = False
    if option == OPTION_1:
        sample_text = ""
        text = st.text_area(
            "Input a text in English (10,000 characters max)",
            value=sample_text,
            max_chars=10_000,
            height=330)
        # any text different from the (empty) sample counts as input
        if text != sample_text:
            text_is_given = True
    elif option == OPTION_2:
        uploaded_file = st.file_uploader(
            "Upload a pdf, docx, or txt file (scanned documents not supported)",
            type=["pdf", "docx", "txt"],
            help="Scanned documents are not supported yet 🥲"
        )
        if uploaded_file is not None:
            # parse the file using custom parsers and build a concatenation for the summarizer
            text = " ".join(file_to_doc(uploaded_file))
            text_is_given = True
    if text_is_given:
        # min/max number of tokens allowed in each chunk's summary
        min_length, max_length = 30, 200
        user_max_length = max_length
        summarizer_downloaded = False
        # tokenizer used to split the input document into model-sized chunks
        model_name = "facebook/bart-large-cnn"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        # leave ~12% headroom under the model limit for special tokens added by the tokenizer
        chunk_size = int(0.88 * tokenizer.model_max_length)
        # loading the summarization model considered
        with st.spinner(text="Loading summarization model..."):
            summarizer = summarization_model(model_name=model_name)
            summarizer_downloaded = True
        if summarizer_downloaded:
            button = st.button("Summarize!")
            if button:
                with st.spinner(text="Summarizing text..."):
                    # summarize chunk-by-chunk so no single call exceeds the model's token limit
                    summary = ""
                    chunks = split_string_into_token_chunks(text, tokenizer, chunk_size)
                    for chunk in chunks:
                        chunk_summary = summarizer(chunk, max_length=user_max_length, min_length=min_length)
                        summary += "\n" + chunk_summary[0]["summary_text"]
                st.markdown("<h3 style='text-align: left'><b>Summary<b></h3>", unsafe_allow_html=True)
                st.write(summary)