Spaces:

IrisDeng
/

UST_Deep_learning_asm1

Running

File size: 3,425 Bytes

a4a0cba
 
 
 
97efbf3
a4a0cba
97efbf3
a4a0cba
97efbf3
 
 
 
 
 
 
 
 
16f5ed7
1ab26ef
97efbf3
 
 
 
 
 
 
 
 
 
1e8859c
97efbf3
 
1e8859c
97efbf3
 
 
 
 
 
 
 
 
 
 
 
 
 
9e09636
 
97efbf3
 
 
9e09636
2e3e400
9e09636
 
2e3e400
97efbf3
 
 
 
8498664
97efbf3
a4a0cba
97efbf3
 
 
 
16f5ed7
97efbf3
 
 
 
 
 
 
 
 
f1d429a
97efbf3
 
 
f1d429a
97efbf3
 
f1d429a
97efbf3
f1d429a
97efbf3
 
 
1ab26ef
4299953
1ab26ef

import streamlit as st
from PIL import Image
from transformers import pipeline
from gtts import gTTS
import torch

# Configure the page title and icon used by this Streamlit app.
st.set_page_config(
    page_title="Image-to-Audio Story Generator",
    page_icon="🦜",
)

@st.cache_resource(show_spinner=False)
def _load_caption_pipeline():
    """Build the image-captioning pipeline once and reuse it across reruns.

    Streamlit re-executes the script on every interaction; caching the
    pipeline avoids re-instantiating the model each time a caption is needed.
    """
    return pipeline(
        "image-to-text",
        model="Salesforce/blip-image-captioning-base",
    )


def create_image_caption(image_file):
    """Generate a caption describing an image.

    Args:
        image_file: A path or file-like object that ``PIL.Image.open`` accepts.

    Returns:
        The ``"generated_text"`` entry of the first result produced by the
        captioning pipeline.
    """
    pil_image = Image.open(image_file)
    caption_result = _load_caption_pipeline()(pil_image)
    return caption_result[0]["generated_text"]

@st.cache_resource(show_spinner=False)
def _load_story_pipeline():
    """Build the text-generation pipeline once and reuse it across reruns."""
    return pipeline(
        "text-generation",
        model="Qwen/Qwen2.5-0.5B-Instruct",
    )


def _trim_to_complete_story(story_text, max_words):
    """Cap the story at ``max_words`` words and make it end with punctuation.

    When the text is longer than ``max_words`` words it is cut to that many
    words and then shortened to the last ``.``, ``!`` or ``?`` (if any) so it
    does not stop mid-sentence. A trailing period is appended to any non-empty
    result that does not already end with sentence punctuation.
    """
    story_words = story_text.split()
    if len(story_words) > max_words:
        trimmed_story = " ".join(story_words[:max_words])
        # Position of the last sentence terminator, or -1 when there is none.
        cut_index = max(trimmed_story.rfind(mark) for mark in ".!?")
        if cut_index != -1:
            trimmed_story = trimmed_story[:cut_index + 1]
        story_text = trimmed_story

    if story_text and story_text[-1] not in ".!?":
        story_text += "."

    return story_text


def build_children_story(image_caption, max_words=95):
    """Generate a short children's story from an image caption.

    Args:
        image_caption: Text describing the image; it is embedded in the prompt.
        max_words: Maximum number of words kept from the generated story.
            Defaults to 95, matching the previous hard-coded limit.

    Returns:
        The generated story, trimmed to at most ``max_words`` words and ending
        with ``.``, ``!`` or ``?``. An empty string is returned when the model
        produces no text.
    """
    story_prompt = (
        "You are a talented and imaginative storyteller for children aged 3 to 10. "
        "Using the details derived from the image below, craft a complete and captivating tale that includes three main characters, "
        "an adventurous journey, and delightful surprises. "
        "Your story should have a clear beginning, middle, and end, and be between 80 and 100 words in length.\n\n"
        f"Image Details: {image_caption}\n\nStory:"
    )

    generated_output = _load_story_pipeline()(
        story_prompt,
        max_new_tokens=150,
        num_return_sequences=1,
        do_sample=True,
        # Return only the continuation. Splitting the echoed prompt on
        # "Story:" would pick the wrong position whenever the caption itself
        # contains that marker.
        return_full_text=False,
    )

    story_text = generated_output[0]["generated_text"].strip()
    return _trim_to_complete_story(story_text, max_words)

def convert_text_to_audio(story_content, audio_file_name="output.mp3"):
    """Synthesize English speech for the story text and save it to a file.

    Args:
        story_content: The text to be spoken.
        audio_file_name: Destination path for the saved audio.

    Returns:
        The same ``audio_file_name`` that was passed in.
    """
    gTTS(text=story_content, lang="en").save(audio_file_name)
    return audio_file_name

def main_app():
    """Render the page and run the image -> caption -> story -> audio flow.

    Nothing beyond the title, intro text and uploader is shown until an image
    has been uploaded. If story generation yields an empty string, a warning
    is displayed and the audio step is skipped.
    """
    st.markdown("<h1 style='text-align: center;'>Image-to-Audio Story Generator 🦜</h1>", unsafe_allow_html=True)
    st.write("Upload an image below to generate an engaging story from the picture, then convert the story into audio playback!")

    uploaded_image_file = st.file_uploader("Select Image", type=["png", "jpg", "jpeg"])
    if uploaded_image_file is None:
        return

    pil_image = Image.open(uploaded_image_file)
    st.image(pil_image, caption="Uploaded Image", use_container_width=True)

    with st.spinner("Generating image caption..."):
        image_caption = create_image_caption(uploaded_image_file)
    st.write("**Image Caption:**", image_caption)

    with st.spinner("Building story narrative..."):
        story_content = build_children_story(image_caption)

    if not story_content:
        # build_children_story can return an empty string, and gTTS rejects
        # empty text, so stop here instead of crashing the page.
        st.warning("No story could be generated for this image. Please try again or upload a different image.")
        return

    st.write("**Story:**")
    st.write(story_content)

    with st.spinner("Converting story to audio..."):
        audio_file_name = convert_text_to_audio(story_content)
    st.audio(audio_file_name, format="audio/mp3")

# Run the app only when this file is executed as the main script.
if __name__ == "__main__":
    main_app()