Spaces:

IrisDeng
/

UST_Deep_learning_asm1

Running

File size: 2,625 Bytes

a4a0cba
 
 
 
96644c9
a4a0cba
 
 
f1d429a
 
 
 
 
a4a0cba
f1d429a
 
 
a4a0cba
 
f1d429a
 
 
 
 
 
 
 
 
 
a4a0cba
 
f1d429a
 
 
a4a0cba
 
f1d429a
 
a4a0cba
f1d429a
75961c1
f1d429a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75961c1

import streamlit as st
from PIL import Image
from transformers import pipeline
from gtts import gTTS
import torch

st.set_page_config(page_title="Image to Audio Story", page_icon="🦜")


# Streamlit re-executes this script from the top on every user interaction,
# so calling pipeline(...) directly at module level would rebuild both models
# on each rerun. st.cache_resource keeps a single shared instance alive and
# hands it back on later reruns.
@st.cache_resource
def _load_caption_pipeline():
    """Build the image-captioning pipeline; the result is cached by Streamlit."""
    return pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")


@st.cache_resource
def _load_story_pipeline():
    """Build the text-generation pipeline; the result is cached by Streamlit."""
    return pipeline("text-generation", model="Qwen/Qwen2-1.5B")


# Module-level handles used by the helper functions below.
caption_pipeline = _load_caption_pipeline()
story_pipeline = _load_story_pipeline()


def extract_image_caption(image_data):
    """Return a text caption for the image read from ``image_data``.

    Args:
        image_data: Anything ``PIL.Image.open`` accepts (the app passes the
            uploaded file object).

    Returns:
        The ``generated_text`` field of the first result produced by the
        module-level ``caption_pipeline``.
    """
    picture = Image.open(image_data)
    first_result = caption_pipeline(picture)[0]
    return first_result["generated_text"]

def compose_story_from_caption(caption_detail, max_new_tokens=400):
    """Generate a short children's story inspired by an image caption.

    Args:
        caption_detail: Caption text describing the image; it is embedded in
            the prompt under "Image Details:".
        max_new_tokens: Upper bound on the number of tokens the model may
            generate. The default is chosen to leave room for roughly the
            300-word maximum the prompt asks for; without a bound the length
            is left to the model's own generation defaults.

    Returns:
        The generated continuation (prompt excluded) with leading and
        trailing whitespace removed.
    """
    prompt_text = (
        "You are a talented and imaginative storyteller for children aged 3 to 10. "
        "Using the details derived from the image below, craft a captivating tale that goes beyond merely describing the scene. "
        "Let your creativity shine by introducing engaging characters, adventurous journeys, and delightful surprises. "
        "Your story should be vivid, original, and between 100 and 300 words in length.\n\n"
        f"Image Details: {caption_detail}\n\nStory:"
    )
    story_results = story_pipeline(
        prompt_text,
        num_return_sequences=1,
        max_new_tokens=max_new_tokens,
        # Ask the pipeline for the continuation only. Splitting the full text
        # on the first "Story:" cuts in the wrong place whenever the caption
        # itself happens to contain that marker.
        return_full_text=False,
    )
    return story_results[0]['generated_text'].strip()

def convert_text_to_audio(text_content, audio_path="output.mp3"):
    """Synthesize English speech for ``text_content`` and save it to a file.

    Args:
        text_content: The text to be spoken.
        audio_path: Destination file path handed to gTTS's ``save``.

    Returns:
        ``audio_path``, unchanged, so the caller can pass it on to a player.
    """
    gTTS(text=text_content, lang="en").save(audio_path)
    return audio_path

def run_app():
    """Render the page: upload an image, caption it, tell a story, play it.

    Draws the title and instructions, then waits for an uploaded image. Once
    one is present it shows the image, generates a caption, turns the caption
    into a story, converts the story to audio and embeds an audio player.
    """
    st.markdown("<h1 style='text-align: center;'>Your Image to Audio Story 🦜</h1>", unsafe_allow_html=True)
    st.write("Upload an image below and we will generate an engaging story from the picture, then convert the story into an audio playback!")

    upload = st.file_uploader("Select an Image", type=["png", "jpg", "jpeg"])
    if upload is None:
        # Nothing has been uploaded yet, so only the header and uploader show.
        return

    st.image(Image.open(upload), caption="Uploaded Image", use_container_width=True)

    # Step 1: image -> caption
    with st.spinner("Generating caption for the image..."):
        caption = extract_image_caption(upload)
    st.write("**Generated Caption:**", caption)

    # Step 2: caption -> story
    with st.spinner("Composing story..."):
        story = compose_story_from_caption(caption)
    st.write("**Story:**")
    st.write(story)

    # Step 3: story -> audio file, then embed a player for it
    with st.spinner("Converting text to audio..."):
        mp3_path = convert_text_to_audio(story)
    st.audio(mp3_path, format="audio/mp3")

# Script entry point: build the page when this file is run as the main module.
if __name__ == "__main__":
    run_app()