# Your Image to Audio Story 🦜

import streamlit as st
from PIL import Image
from transformers import pipeline
from gtts import gTTS
import torch

# Set the app's page title and icon.
st.set_page_config(page_title="Your Image to Audio Story", page_icon="🦜")


@st.cache_resource(show_spinner=False)
def _load_caption_pipeline():
    """Build the BLIP image-captioning pipeline once and reuse it.

    Streamlit re-executes the script on every interaction, so constructing
    the pipeline inside ``generate_caption`` reloaded the model each time.
    Caching it as a resource keeps a single instance per server process.
    """
    return pipeline(
        "image-to-text",
        model="Salesforce/blip-image-captioning-base",
    )


def generate_caption(image_file):
    """Generate a caption for an image.

    Args:
        image_file: A path or binary file-like object accepted by
            ``PIL.Image.open`` (for example a Streamlit uploaded file).

    Returns:
        The ``generated_text`` of the first captioning result.
    """
    image = Image.open(image_file)
    caption_generator = _load_caption_pipeline()
    caption_results = caption_generator(image)
    return caption_results[0]["generated_text"]

@st.cache_resource(show_spinner=False)
def _load_story_pipeline():
    """Build the Qwen text-generation pipeline once and reuse it.

    Streamlit re-executes the script on every interaction, so constructing
    the pipeline inside ``generate_story`` reloaded the model each time.
    Caching it as a resource keeps a single instance per server process.
    """
    return pipeline(
        "text-generation",
        model="Qwen/Qwen2.5-0.5B-Instruct",
    )


def generate_story(caption):
    """Generate a short children's story from an image caption.

    Args:
        caption: Text describing the image; it is embedded in the prompt
            after "Image Details:".

    Returns:
        The generated continuation of the prompt with surrounding
        whitespace stripped.
    """
    story_generator = _load_story_pipeline()
    prompt = (
        "You are a highly imaginative children's story writer celebrated for your creativity and captivating narratives. "
        "Using the image details provided below, please craft an enchanting tale tailored for children aged 3 to 10. "
        "Rather than simply reiterating the image details, enhance your story with imaginative characters, quirky adventures, "
        "and delightful surprises that ignite wonder in every young heart. Let your narrative flow naturally and ensure that your story is complete, with a clear beginning, middle, and end. "
        "Please ensure the total word count does not exceed 80 words, and do not leave the story incomplete.\n\n"
        f"Image Details: {caption}\n\nStory:"
    )

    result = story_generator(
        prompt,
        max_new_tokens=100,
        num_return_sequences=1,
        do_sample=True,
        temperature=1.0,
        # Ask the pipeline for the continuation only. Splitting the echoed
        # prompt on "Story:" would cut in the wrong place if the caption
        # itself happened to contain that marker.
        return_full_text=False,
    )
    return result[0]["generated_text"].strip()

def text_to_speech(text, output_file="output.mp3"):
    """Convert ``text`` to English speech with gTTS and save it to a file.

    Args:
        text: The text to be spoken.
        output_file: Path the audio is saved to.

    Returns:
        ``output_file``, as passed in.
    """
    gTTS(text=text, lang="en").save(output_file)
    return output_file

def main():
    """Render the app: upload an image, caption it, write a story, play it.

    The page shows a title and an image uploader. Once an image is chosen it
    is displayed, captioned, turned into a story, and the story is converted
    to audio that is played back in the page.
    """
    # Function-scope imports keep this block self-contained.
    import tempfile
    from pathlib import Path

    st.markdown("<h1 style='text-align: center;'>Your Image to Audio Story 🦜</h1>", unsafe_allow_html=True)
    st.write("Upload an image below and we will generate an engaging story from the picture, then convert the story into an audio playback!")

    uploaded_file = st.file_uploader("Select Image", type=["png", "jpg", "jpeg"])
    if uploaded_file is None:
        # Nothing to process until the user picks an image.
        return

    image = Image.open(uploaded_file)
    st.image(image, caption="Uploaded image", use_container_width=True)

    with st.spinner("Image caption being generated..."):
        caption = generate_caption(uploaded_file)
    st.write("**Image Caption:**", caption)

    with st.spinner("Generating story..."):
        story = generate_story(caption)
    st.write("**Story:**")
    st.write(story)

    if not story.strip():
        # gTTS rejects empty text; tell the user instead of crashing.
        st.warning("No story was generated, so there is nothing to read aloud.")
        return

    with st.spinner("Converting to voice..."):
        # Write to a per-run temporary file rather than a shared
        # "output.mp3": concurrent sessions would otherwise overwrite each
        # other's audio, and files would be left in the working directory.
        with tempfile.TemporaryDirectory() as tmp_dir:
            audio_path = text_to_speech(
                story, output_file=str(Path(tmp_dir) / "story.mp3")
            )
            # Read the bytes before the temporary directory is removed.
            audio_bytes = Path(audio_path).read_bytes()
    st.audio(audio_bytes, format="audio/mp3")

# Run the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()