# IrisDeng's picture
# Update app.py
# 9e09636 verified
# raw
# history blame
# 3.43 kB
import streamlit as st
from PIL import Image
from transformers import pipeline
from gtts import gTTS
import torch
# Configure the Streamlit page (title and icon) before any UI is rendered.
st.set_page_config(page_title="Image-to-Audio Story Generator", page_icon="🦜")
def create_image_caption(image_file):
    """Produce a text caption for an image.

    Args:
        image_file: Anything ``PIL.Image.open`` accepts (the app passes the
            object returned by ``st.file_uploader``).

    Returns:
        The ``"generated_text"`` field of the first result returned by the
        BLIP image-captioning pipeline.
    """
    picture = Image.open(image_file)
    # NOTE(review): the pipeline is constructed on every call, so the model is
    # re-initialised per caption request.
    captioner = pipeline(
        "image-to-text",
        model="Salesforce/blip-image-captioning-base",
    )
    first_result = captioner(picture)[0]
    return first_result["generated_text"]
def build_children_story(image_description):
    """Generate a short children's story from an image caption.

    Args:
        image_description: Caption text describing the image; it is inserted
            into the prompt after "Image Details:".

    Returns:
        The story text. If the model output exceeds 95 words it is truncated
        to 95 words and then cut back to the last sentence terminator
        (".", "!" or "?") found in that span. A non-empty result always ends
        with one of those terminators.
    """
    story_generator = pipeline(
        "text-generation",
        model="Qwen/Qwen2.5-0.5B-Instruct",
    )
    story_prompt = (
        "You are a talented and imaginative storyteller for children aged 3 to 10. "
        "Using the details derived from the image below, craft a complete and captivating tale that includes three main characters, "
        "an adventurous journey, and delightful surprises. "
        "Your story should have a clear beginning, middle, and end, and be between 80 and 100 words in length.\n\n"
        # Fixed: the original interpolated an undefined name (caption_result)
        # instead of this function's parameter.
        f"Image Details: {image_description}\n\nStory:"
    )
    # Fixed: the original called an undefined name (story_pipeline) rather
    # than the pipeline object created above.
    generated_output = story_generator(
        story_prompt,
        max_new_tokens=150,
        num_return_sequences=1,
        do_sample=True,
    )
    raw_story = generated_output[0]["generated_text"]

    # When the output contains the "Story:" marker (e.g. the prompt is echoed
    # back in the generated text), keep only what follows its first occurrence.
    if "Story:" in raw_story:
        story_text = raw_story.split("Story:", 1)[1].strip()
    else:
        story_text = raw_story.strip()

    story_words = story_text.split()
    if len(story_words) > 95:
        trimmed_story = " ".join(story_words[:95])
        # Cut back to the last complete sentence so the story does not stop
        # mid-sentence; rfind returns -1 when a terminator is absent.
        index = max(trimmed_story.rfind(mark) for mark in ".!?")
        if index != -1:
            trimmed_story = trimmed_story[:index + 1]
        story_text = trimmed_story

    # Guarantee terminal punctuation on any non-empty story.
    if story_text and not story_text.endswith((".", "!", "?")):
        story_text += "."
    return story_text
def convert_text_to_audio(story_content, audio_file_name="output.mp3"):
    """Convert story text to speech with gTTS and save it to a file.

    Args:
        story_content: Text to be spoken; synthesised with ``lang="en"``.
        audio_file_name: Destination path handed to ``gTTS.save``.

    Returns:
        ``audio_file_name`` unchanged, so callers can pass it on to a player.
    """
    gTTS(text=story_content, lang="en").save(audio_file_name)
    return audio_file_name
def main_app():
    """Render the app page and run the image -> caption -> story -> audio flow.

    Shows a title and instructions, then waits for an uploaded PNG/JPG/JPEG.
    Once a file is present it displays the image, captions it, builds a story
    from the caption, converts the story to audio and embeds an audio player.
    """
    st.markdown(
        "<h1 style='text-align: center;'>Image-to-Audio Story Generator 🦜</h1>",
        unsafe_allow_html=True,
    )
    st.write("Upload an image below to generate an engaging story from the picture, then convert the story into audio playback!")

    upload = st.file_uploader("Select Image", type=["png", "jpg", "jpeg"])
    if upload is None:
        # Nothing uploaded yet; only the header and uploader are shown.
        return

    preview = Image.open(upload)
    st.image(preview, caption="Uploaded Image", use_container_width=True)

    # Step 1: caption the uploaded image.
    with st.spinner("Generating image caption..."):
        caption_text = create_image_caption(upload)
    st.write("**Image Caption:**", caption_text)

    # Step 2: turn the caption into a story.
    with st.spinner("Building story narrative..."):
        story = build_children_story(caption_text)
    st.write("**Story:**")
    st.write(story)

    # Step 3: synthesise the story and offer it for playback.
    with st.spinner("Converting story to audio..."):
        audio_path = convert_text_to_audio(story)
    st.audio(audio_path, format="audio/mp3")
if __name__ == "__main__":
main_app()