Spaces:

IrisDeng
/

UST_Deep_learning_asm1

Running

App Files Files Community

UST_Deep_learning_asm1 / app.py

IrisDeng

Update app.py

9e09636 verified 1 day ago

raw

history blame

3.43 kB

	import streamlit as st
	from PIL import Image
	from transformers import pipeline
	from gtts import gTTS
	import torch

	# Set the browser-tab title and icon used for this Streamlit page.
	st.set_page_config(
	    page_title="Image-to-Audio Story Generator",
	    page_icon="🦜",
	)


	@st.cache_resource
	def _load_caption_pipeline():
	    """Build the BLIP image-captioning pipeline once and reuse it.

	    Streamlit re-executes the script on every user interaction; caching the
	    pipeline as a resource avoids re-instantiating the model for each upload.
	    """
	    return pipeline(
	        "image-to-text",
	        model="Salesforce/blip-image-captioning-base",
	    )


	def create_image_caption(image_file):
	    """Generate a text caption for an image.

	    Args:
	        image_file: Anything ``PIL.Image.open`` accepts, e.g. a path or a
	            binary file-like object such as a Streamlit upload.

	    Returns:
	        The ``"generated_text"`` field of the first result returned by the
	        captioning pipeline.
	    """
	    pil_image = Image.open(image_file)
	    caption_generator = _load_caption_pipeline()
	    caption_result = caption_generator(pil_image)
	    return caption_result[0]["generated_text"]

	@st.cache_resource
	def _load_story_pipeline():
	    """Build the Qwen text-generation pipeline once and reuse it.

	    Streamlit re-executes the script on every user interaction; caching the
	    pipeline as a resource avoids re-instantiating the model for each story.
	    """
	    return pipeline(
	        "text-generation",
	        model="Qwen/Qwen2.5-0.5B-Instruct",
	    )


	def _trim_to_complete_sentence(story_text, max_words=95):
	    """Cap *story_text* at *max_words* words and end it on punctuation.

	    Text longer than *max_words* whitespace-separated words is cut to the
	    first *max_words* words and then back to the last ".", "!" or "?" if one
	    is present. Non-empty text that does not end in one of those characters
	    gets a trailing ".". Empty text is returned unchanged.
	    """
	    words = story_text.split()
	    if len(words) > max_words:
	        story_text = " ".join(words[:max_words])
	        # Cut back to the last sentence terminator so the story does not
	        # stop in the middle of a sentence.
	        last_terminator = max(story_text.rfind(mark) for mark in ".!?")
	        if last_terminator != -1:
	            story_text = story_text[: last_terminator + 1]

	    if story_text and not story_text.endswith((".", "!", "?")):
	        story_text += "."
	    return story_text


	def build_children_story(image_description):
	    """Write a short children's story based on an image caption.

	    Args:
	        image_description: Caption text describing the image; it is embedded
	            in the prompt sent to the text-generation model.

	    Returns:
	        The generated story, capped at 95 words and ending in ".", "!" or
	        "?" (an empty string if no story text was produced).
	    """
	    story_generator = _load_story_pipeline()

	    # The prompt must interpolate this function's own argument; the original
	    # referenced an undefined name here and raised NameError.
	    story_prompt = (
	        "You are a talented and imaginative storyteller for children aged 3 to 10. "
	        "Using the details derived from the image below, craft a complete and captivating tale that includes three main characters, "
	        "an adventurous journey, and delightful surprises. "
	        "Your story should have a clear beginning, middle, and end, and be between 80 and 100 words in length.\n\n"
	        f"Image Details: {image_description}\n\nStory:"
	    )

	    generated_output = story_generator(
	        story_prompt,
	        max_new_tokens=150,
	        num_return_sequences=1,
	        do_sample=True,
	    )
	    raw_story = generated_output[0]["generated_text"]

	    # Keep only what follows the first "Story:" marker (the prompt ends with
	    # it); if the marker is absent, use the whole output.
	    _, marker, continuation = raw_story.partition("Story:")
	    story_text = (continuation if marker else raw_story).strip()

	    return _trim_to_complete_sentence(story_text)


	def convert_text_to_audio(story_content, audio_file_name="output.mp3"):
	    """Synthesize English speech for a story and save it to an audio file.

	    Args:
	        story_content: Text to be spoken.
	        audio_file_name: Path the audio is written to.

	    Returns:
	        ``audio_file_name``, i.e. the path the audio was saved under.
	    """
	    gTTS(text=story_content, lang="en").save(audio_file_name)
	    return audio_file_name


	def main_app():
	    """Render the page: upload an image, caption it, narrate a story, play it."""
	    st.markdown(
	        "<h1 style='text-align: center;'>Image-to-Audio Story Generator 🦜</h1>",
	        unsafe_allow_html=True,
	    )
	    st.write(
	        "Upload an image below to generate an engaging story from the picture, then convert the story into audio playback!"
	    )

	    upload = st.file_uploader("Select Image", type=["png", "jpg", "jpeg"])
	    if upload is None:
	        # Nothing to process until the user picks a file.
	        return

	    preview = Image.open(upload)
	    st.image(preview, caption="Uploaded Image", use_container_width=True)

	    # Step 1: image -> caption.
	    with st.spinner("Generating image caption..."):
	        caption = create_image_caption(upload)
	    st.write("Image Caption:", caption)

	    # Step 2: caption -> story.
	    with st.spinner("Building story narrative..."):
	        story = build_children_story(caption)
	    st.write("Story:")
	    st.write(story)

	    # Step 3: story -> audio file, then embed a player for it.
	    with st.spinner("Converting story to audio..."):
	        audio_path = convert_text_to_audio(story)
	    st.audio(audio_path, format="audio/mp3")


	# Build the UI only when this file is executed as the entry script.
	if __name__ == "__main__":
	    main_app()