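# Image-to-Audio Story: a Streamlit app that captions an uploaded image with BLIP,
# expands the caption into a children's fairy tale with a chat model, and converts
# the story to speech with gTTS.
# Run with: streamlit run <path-to-this-file>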
import streamlit as st
from PIL import Image
from transformers import pipeline
from gtts import gTTS

st.set_page_config(page_title="Your Image to Audio Story", page_icon="🦜")

@st.cache_resource
def load_caption_generator():
    # Cache the BLIP captioning pipeline so the model is downloaded and loaded only once per session.
    return pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

def generate_caption(image_file):
    image = Image.open(image_file)
    caption_generator = load_caption_generator()
    caption_results = caption_generator(image)
    caption = caption_results[0]['generated_text']
    return caption

@st.cache_resource
def load_story_generator():
    # Cache the story pipeline; Qwen2.5-14B-Instruct-1M is large, so load it only once.
    return pipeline("text-generation", model="Qwen/Qwen2.5-14B-Instruct-1M")

def generate_story(caption):
    story_generator = load_story_generator()
    messages = [
        {
            "role": "user",
            "content": f"Based on the following image caption: '{caption}', write a complete fairy tale story for children, at least 100 and at most 300 words long."
        }
    ]
    # max_new_tokens bounds the generated story itself; max_length would also count
    # the prompt tokens and could truncate the output.
    result = story_generator(messages, max_new_tokens=512, num_return_sequences=1)
    # With chat-style input, generated_text holds the whole conversation;
    # the assistant's reply is the last message.
    story = result[0]['generated_text'][-1]['content']
    return story

# Optional illustration step (disabled). The decorator is commented out too;
# left bare, it would have attached to text_to_speech below. Re-enabling this
# block also needs:
# import torch
# from diffusers import DiffusionPipeline

# @st.cache_resource
# def load_image_generator():
#     device = "cuda" if torch.cuda.is_available() else "cpu"
#     torch_dtype = torch.float16 if device == "cuda" else torch.float32
#     pipe = DiffusionPipeline.from_pretrained(
#         "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch_dtype
#     )
#     pipe = pipe.to(device)
#     return pipe

# def generate_illustration(prompt):
#     pipe = load_image_generator()
#     image_result = pipe(prompt)
#     generated_image = image_result.images[0]
#     return generated_image

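# gTTS synthesizes speech through Google's online TTS endpoint,
# so this step needs network access.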
def text_to_speech(text, output_file="output.mp3"):
    tts = gTTS(text=text, lang="en")
    tts.save(output_file)
    return output_file

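# Streamlit reruns the whole script on every interaction; the cached loaders
# above keep the heavy models from being reloaded on each rerun.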
def main():
    st.markdown("<h1 style='text-align: center;'>Your Image to Audio Story 🦜</h1>", unsafe_allow_html=True)
    st.write("Upload an image below and we will generate an engaging story from the picture, then convert the story into an audio playback!")
    
    uploaded_file = st.file_uploader("Select Image", type=["png", "jpg", "jpeg"])
    
    if uploaded_file is not None:
        image = Image.open(uploaded_file)
        st.image(image, caption="Uploaded image", use_container_width=True)
        
        with st.spinner("Image caption being generated..."):
            caption = generate_caption(uploaded_file)
        st.write("**Image Caption:**", caption)
        
        with st.spinner("Generating story..."):
            story = generate_story(caption)
        st.write("**Story:**")
        st.write(story)
        
        # with st.spinner("Generating illustration..."):
        #     illustration = generate_illustration(story[:200])
        # st.write("### Story Illustrations:")
        # st.image(illustration, caption="Story Illustrations", use_container_width=True)
        
        with st.spinner("Converting to voice...."):
            audio_file = text_to_speech(story)
        st.audio(audio_file, format="audio/mp3")

if __name__ == "__main__":
    main()