Spaces:

IrisDeng
/

UST_Deep_learning_asm1

Running

App Files Files Community

UST_Deep_learning_asm1 / app.py

IrisDeng

Update app.py

4299953 verified about 21 hours ago

raw

history blame contribute delete

3.43 kB

	import streamlit as st
	from PIL import Image
	from transformers import pipeline
	from gtts import gTTS
	import torch

	st.set_page_config(page_title="Image-to-Audio Story Generator", page_icon="🦜")

	def create_image_caption(image_file):
	pil_image = Image.open(image_file)
	caption_generator = pipeline(
	"image-to-text",
	model="Salesforce/blip-image-captioning-base",
	)
	caption_result = caption_generator(pil_image)
	generated_caption = caption_result[0]["generated_text"]
	return generated_caption

	def build_children_story(image_caption):
	story_generator = pipeline(
	"text-generation",
	model="Qwen/Qwen2.5-0.5B-Instruct",
	)

	story_prompt = (
	"You are a talented and imaginative storyteller for children aged 3 to 10. "
	"Using the details derived from the image below, craft a complete and captivating tale that includes three main characters, "
	"an adventurous journey, and delightful surprises. "
	"Your story should have a clear beginning, middle, and end, and be between 80 and 100 words in length.\n\n"
	f"Image Details: {image_caption}\n\nStory:"
	)

	generated_output = story_generator(
	story_prompt,
	max_new_tokens=150,
	num_return_sequences=1,
	do_sample=True
	)

	raw_story = generated_output[0]["generated_text"]

	if "Story:" in raw_story:
	story_text = raw_story.split("Story:", 1)[1].strip()
	else:
	story_text = raw_story.strip()

	story_words = story_text.split()
	if len(story_words) > 95:
	trimmed_story = " ".join(story_words[:95])
	last_period = trimmed_story.rfind(".")
	last_exclamation = trimmed_story.rfind("!")
	last_question = trimmed_story.rfind("?")
	index = max(last_period, last_exclamation, last_question)

	if index != -1:
	trimmed_story = trimmed_story[:index + 1]

	story_text = trimmed_story

	if story_text and story_text[-1] not in ".!?":
	story_text += "."

	return story_text

	def convert_text_to_audio(story_content, audio_file_name="output.mp3"):
	tts = gTTS(text=story_content, lang="en")
	tts.save(audio_file_name)
	return audio_file_name

	def main_app():
	st.markdown("<h1 style='text-align: center;'>Image-to-Audio Story Generator 🦜</h1>", unsafe_allow_html=True)
	st.write("Upload an image below to generate an engaging story from the picture, then convert the story into audio playback!")

	uploaded_image_file = st.file_uploader("Select Image", type=["png", "jpg", "jpeg"])

	if uploaded_image_file is not None:
	pil_image = Image.open(uploaded_image_file)
	st.image(pil_image, caption="Uploaded Image", use_container_width=True)

	with st.spinner("Generating image caption..."):
	image_caption = create_image_caption(uploaded_image_file)
	st.write("Image Caption:", image_caption)

	with st.spinner("Building story narrative..."):
	story_content = build_children_story(image_caption)
	st.write("Story:")
	st.write(story_content)

	with st.spinner("Converting story to audio..."):
	audio_file_name = convert_text_to_audio(story_content)
	st.audio(audio_file_name, format="audio/mp3")

	if __name__ == "__main__":
	main_app()