# Image-to-Audio Story Generator 🦜

import streamlit as st
from PIL import Image
from transformers import pipeline
from gtts import gTTS
import torch

# Set the page title and icon shown for this Streamlit app.
st.set_page_config(page_title="Image-to-Audio Story Generator", page_icon="🦜")


def create_image_caption(image_file):
    """Caption an image with the BLIP image-captioning model.

    Args:
        image_file: Anything ``PIL.Image.open`` accepts; ``main_app`` passes
            the uploaded file object.

    Returns:
        The ``generated_text`` field of the first result produced by the
        ``image-to-text`` pipeline.
    """
    picture = Image.open(image_file)
    captioner = pipeline(
        "image-to-text",
        model="Salesforce/blip-image-captioning-base",
    )
    # The pipeline returns a list of result dicts; only the first is used.
    return captioner(picture)[0]["generated_text"]

def _trim_to_last_sentence(text, max_words):
    """Cap *text* at ``max_words`` words, ending on a complete sentence when possible."""
    words = text.split()
    if len(words) > max_words:
        text = " ".join(words[:max_words])
        # Cut back to the last sentence terminator so the story does not
        # stop mid-sentence; if there is none, keep the word-capped text.
        cut = max(text.rfind(mark) for mark in ".!?")
        if cut != -1:
            text = text[: cut + 1]

    # Guarantee terminal punctuation on any non-empty story.
    if text and text[-1] not in ".!?":
        text += "."
    return text


def build_children_story(image_description, max_words=95):
    """Generate a short children's story from an image caption.

    Args:
        image_description: Caption text describing the image; it is embedded
            in the prompt sent to the text-generation model.
        max_words: Maximum number of words kept from the generated story
            before trimming back to the last complete sentence.

    Returns:
        The story text, stripped of the prompt, capped at ``max_words`` words
        and ending in ``.``, ``!`` or ``?`` (or an empty string if the model
        produced no text after the prompt).
    """
    story_generator = pipeline(
        "text-generation",
        model="Qwen/Qwen2.5-0.5B-Instruct",
    )

    story_prompt = (
        "You are a talented and imaginative storyteller for children aged 3 to 10. "
        "Using the details derived from the image below, craft a complete and captivating tale that includes three main characters, "
        "an adventurous journey, and delightful surprises. "
        "Your story should have a clear beginning, middle, and end, and be between 80 and 100 words in length.\n\n"
        f"Image Details: {image_description}\n\nStory:"
    )

    generated_output = story_generator(
        story_prompt,
        max_new_tokens=150,
        num_return_sequences=1,
        do_sample=True,
    )
    raw_story = generated_output[0]["generated_text"]

    # The generated text may echo the prompt; keep only what follows the
    # first "Story:" marker when it is present.
    if "Story:" in raw_story:
        story_text = raw_story.split("Story:", 1)[1].strip()
    else:
        story_text = raw_story.strip()

    return _trim_to_last_sentence(story_text, max_words)


def convert_text_to_audio(story_content, audio_file_name="output.mp3"):
    """Synthesize English speech for the story with gTTS and save it to a file.

    Args:
        story_content: Text to be spoken.
        audio_file_name: Destination path for the saved audio.

    Returns:
        ``audio_file_name``, unchanged, so callers can pass it on to a player.
    """
    gTTS(text=story_content, lang="en").save(audio_file_name)
    return audio_file_name


def main_app():
    """Render the page: upload an image, caption it, write a story, play it as audio."""
    st.markdown(
        "<h1 style='text-align: center;'>Image-to-Audio Story Generator 🦜</h1>",
        unsafe_allow_html=True,
    )
    st.write(
        "Upload an image below to generate an engaging story from the picture, "
        "then convert the story into audio playback!"
    )

    upload = st.file_uploader("Select Image", type=["png", "jpg", "jpeg"])
    # Nothing to process until the user has picked a file.
    if upload is None:
        return

    preview = Image.open(upload)
    st.image(preview, caption="Uploaded Image", use_container_width=True)

    # Step 1: describe the picture.
    with st.spinner("Generating image caption..."):
        caption = create_image_caption(upload)
    st.write("**Image Caption:**", caption)

    # Step 2: turn the caption into a story.
    with st.spinner("Building story narrative..."):
        story = build_children_story(caption)
    st.write("**Story:**")
    st.write(story)

    # Step 3: narrate the story and expose it in an audio player.
    with st.spinner("Converting story to audio..."):
        audio_path = convert_text_to_audio(story)
    st.audio(audio_path, format="audio/mp3")


# Launch the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main_app()