import streamlit as st
from PIL import Image
from transformers import pipeline
from gtts import gTTS
# Configure the browser tab title and icon for the app.
st.set_page_config(page_title="Your Image to Audio Story", page_icon="🦜")
@st.cache_resource
def _load_caption_pipeline():
    """Load the BLIP image-captioning pipeline once and reuse it across reruns."""
    return pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

def generate_caption(image_file):
    """Generate a short English caption for an uploaded image.

    Args:
        image_file: A path or file-like object accepted by ``PIL.Image.open``
            (e.g. a Streamlit ``UploadedFile``).

    Returns:
        str: The caption text produced by the BLIP model.
    """
    image = Image.open(image_file)
    # The pipeline is cached: previously it reloaded the model from disk on
    # every call (i.e. on every Streamlit rerun), which is very slow.
    caption_generator = _load_caption_pipeline()
    caption_results = caption_generator(image)
    return caption_results[0]['generated_text']
@st.cache_resource
def _load_story_pipeline():
    """Load the chat text-generation pipeline once and reuse it across reruns."""
    return pipeline("text-generation", model="Qwen/Qwen2.5-14B-Instruct-1M")

def generate_story(caption):
    """Generate a children's fairy tale based on an image caption.

    Args:
        caption: The image caption to base the story on.

    Returns:
        str: The generated story text.
    """
    # Cached pipeline: re-instantiating a 14B-parameter model per call would
    # reload it from disk on every Streamlit rerun.
    story_generator = _load_story_pipeline()
    messages = [
        {
            "role": "user",
            "content": f"Please based on following image caption: '{caption}', generate a complete fairy tale story for children with at least 100 words and max 300 words"
        }
    ]
    # max_new_tokens bounds only the generated continuation; max_length counts
    # the prompt tokens too and could truncate the story below the length the
    # prompt asks for.
    result = story_generator(messages, max_new_tokens=400, num_return_sequences=1)
    generated = result[0]['generated_text']
    # With chat-style (list-of-messages) input, the transformers pipeline
    # returns the full conversation as a message list; the assistant's reply
    # is the content of the last message.
    if isinstance(generated, list):
        return generated[-1]['content']
    return generated
# NOTE(review): the @st.cache_resource decorator that used to sit here belonged
# to the commented-out load_image_generator below. Left dangling, it actually
# decorated text_to_speech, silently caching its return value per input text.
# It is now commented out together with the function it was written for.
# @st.cache_resource
# def load_image_generator():
#     device = "cuda" if torch.cuda.is_available() else "cpu"
#     torch_dtype = torch.float16 if device == "cuda" else torch.float32
#     pipe = DiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5")
#     pipe = pipe.to(device)
#     return pipe

# def generate_illustration(prompt):
#     pipe = load_image_generator()
#     image_result = pipe(prompt)
#     generated_image = image_result.images[0]
#     return generated_image

def text_to_speech(text, output_file="output.mp3"):
    """Synthesize English speech for *text* and save it as an mp3 file.

    Args:
        text: The text to convert to speech.
        output_file: Path of the mp3 file to write (default "output.mp3").

    Returns:
        str: The path of the saved audio file.
    """
    tts = gTTS(text=text, lang="en")
    tts.save(output_file)
    return output_file
def main():
    """Streamlit page: upload an image, caption it, generate a story, play audio.

    Flow: file upload -> preview -> BLIP caption -> story generation ->
    gTTS audio playback. Each stage renders its result as it completes.
    """
    st.markdown("<h1 style='text-align: center;'>Your Image to Audio Story 🦜</h1>", unsafe_allow_html=True)
    st.write("Upload an image below and we will generate an engaging story from the picture, then convert the story into an audio playback!")
    uploaded_file = st.file_uploader("Select Image", type=["png", "jpg", "jpeg"])
    # Guard clause: nothing to do until the user uploads a file.
    if uploaded_file is None:
        return
    image = Image.open(uploaded_file)
    st.image(image, caption="Uploaded image", use_container_width=True)
    # Rendering the preview can leave the upload buffer at EOF; rewind so
    # generate_caption can re-open the same file-like object from the start.
    uploaded_file.seek(0)
    with st.spinner("Image caption being generated..."):
        caption = generate_caption(uploaded_file)
    st.write("**Image Caption:**", caption)
    with st.spinner("Generating story..."):
        story = generate_story(caption)
    st.write("**Story:**")
    st.write(story)
    # with st.spinner("Generating illustration..."):
    #     illustration = generate_illustration(story[:200])
    #     st.write("### Story Illustrations:")
    #     st.image(illustration, caption="Story Illustrations", use_container_width=True)
    with st.spinner("Converting to voice...."):
        audio_file = text_to_speech(story)
    st.audio(audio_file, format="audio/mp3")
# Script entry point. (Removed a stray trailing "|" left over from the page
# extraction that made the line a syntax error.)
if __name__ == "__main__":
    main()