File size: 2,864 Bytes
1fb6258
7f4a56e
06bce28
6b8bebd
06bce28
7577927
06bce28
e775448
 
06bce28
 
 
 
 
 
 
 
 
7f4a56e
06bce28
 
 
c819855
06bce28
ab58353
c819855
 
 
 
ab58353
 
 
 
e61b3fb
 
 
 
 
 
 
ab58353
 
 
 
 
 
 
06bce28
 
 
 
 
 
e775448
06bce28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a31b925
06bce28
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import streamlit as st
from PIL import Image
from transformers import pipeline
from gtts import gTTS
import torch

# Set the page title and icon shown for this app.
st.set_page_config(
    page_title="Your Image to Audio Story",
    page_icon="🦜",
)


def generate_caption(image_file):
    """Describe an image with a one-sentence caption.

    Args:
        image_file: Anything ``PIL.Image.open`` accepts (a path or a binary
            file-like object such as a Streamlit upload).

    Returns:
        The ``generated_text`` of the first result returned by the
        image-to-text pipeline.
    """

    @st.cache_resource
    def _load_caption_generator():
        # Building the pipeline loads the model weights, which is by far the
        # slowest step. Caching it as a Streamlit resource lets every later
        # call reuse the same pipeline instead of loading the model again.
        return pipeline(
            "image-to-text",
            model="Salesforce/blip-image-captioning-base",
        )

    image = Image.open(image_file)
    caption_results = _load_caption_generator()(image)
    return caption_results[0]['generated_text']

def generate_story(caption, max_new_tokens=400):
    """Write a short children's story inspired by an image caption.

    Args:
        caption: Text describing the image; it is inserted into the prompt
            after "Image Details:".
        max_new_tokens: Upper bound on the number of tokens generated for the
            story itself (the prompt is not counted). The default is meant to
            leave room for roughly the 300-word upper limit the prompt asks
            for.

    Returns:
        The generated story text with surrounding whitespace removed. The
        prompt is not included.
    """

    @st.cache_resource
    def _load_story_generator():
        # Building the pipeline loads the model weights; cache it as a
        # Streamlit resource so later calls reuse the same pipeline instead
        # of loading the model again.
        return pipeline(
            "text-generation",
            model="Qwen/Qwen2.5-0.5B-Instruct",
        )

    prompt = (
        "You are a highly imaginative children's story writer celebrated for your creativity and captivating narratives. "
        "Using the image details provided below, please craft an enchanting tale tailored for children aged 3 to 10. "
        "Rather than simply reiterating the image details, enhance your story with imaginative characters, quirky adventures, "
        "and delightful surprises that ignite wonder in every young heart. Let your narrative flow naturally and kindle the magic of storytelling. "
        "Please ensure that your story is engaging, coherent, and falls between 100 and 300 words in length.\n\n"
        f"Image Details: {caption}\n\nStory:"
    )

    result = _load_story_generator()(
        prompt,
        # `max_length` would count the prompt tokens as well, so this long
        # instruction prompt would consume much of the budget and cut the
        # story short. `max_new_tokens` limits only the generated text.
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        do_sample=True,
        temperature=1.0,
        # Return only the continuation, so the prompt never has to be
        # stripped by searching for the "Story:" marker (which would break
        # if the caption itself contained that marker).
        return_full_text=False,
    )
    return result[0]['generated_text'].strip()

def text_to_speech(text, output_file="output.mp3"):
    """Convert text to English speech with gTTS and save it as an MP3 file.

    Args:
        text: The text to speak. Must contain at least one non-whitespace
            character.
        output_file: Path the MP3 audio is written to.

    Returns:
        ``output_file``, so the caller can hand the path straight to a player.

    Raises:
        ValueError: If ``text`` is ``None``, empty, or whitespace-only.
    """
    # Fail early with a clear message instead of letting the TTS backend
    # reject text that has nothing to speak.
    if not text or not text.strip():
        raise ValueError("text_to_speech requires non-empty text")

    tts = gTTS(text=text, lang="en")
    tts.save(output_file)
    return output_file

def main():
    """Render the app: upload an image, caption it, write a story, play it.

    Nothing beyond the title, introduction and uploader is rendered until the
    user uploads an image. The audio step is skipped with a warning when the
    generated story is empty.
    """
    # Imported locally so this function brings its own standard-library
    # dependencies into scope.
    import tempfile
    from pathlib import Path

    st.markdown("<h1 style='text-align: center;'>Your Image to Audio Story 🦜</h1>", unsafe_allow_html=True)
    st.write("Upload an image below and we will generate an engaging story from the picture, then convert the story into an audio playback!")

    uploaded_file = st.file_uploader("Select Image", type=["png", "jpg", "jpeg"])
    if uploaded_file is None:
        return

    image = Image.open(uploaded_file)
    st.image(image, caption="Uploaded image", use_container_width=True)

    with st.spinner("Image caption being generated..."):
        caption = generate_caption(uploaded_file)
    st.write("**Image Caption:**", caption)

    with st.spinner("Generating story..."):
        story = generate_story(caption)
    st.write("**Story:**")
    st.write(story)

    if not story or not story.strip():
        # There is nothing to synthesize; stop here rather than crash in TTS.
        st.warning("No story text was generated, so there is nothing to read aloud.")
        return

    # Write the MP3 into a per-run temporary directory instead of a fixed
    # "output.mp3" in the working directory: concurrent sessions would
    # otherwise overwrite each other's audio, and the file would be left
    # behind. The bytes are read before the directory is removed.
    with st.spinner("Converting to voice..."), tempfile.TemporaryDirectory() as tmp_dir:
        audio_path = text_to_speech(story, output_file=str(Path(tmp_dir) / "story.mp3"))
        audio_bytes = Path(audio_path).read_bytes()
    st.audio(audio_bytes, format="audio/mp3")

# Start the app only when this file is run as the entry script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()