# NOTE: the three lines below are file-viewer residue (file size, per-line
# commit hashes, line-number gutter), kept as comments so the module parses.
# File size: 3,425 Bytes
# a4a0cba 97efbf3 a4a0cba 97efbf3 a4a0cba 97efbf3 16f5ed7 1ab26ef 97efbf3 1e8859c 97efbf3 1e8859c 97efbf3 9e09636 97efbf3 9e09636 2e3e400 9e09636 2e3e400 97efbf3 8498664 97efbf3 a4a0cba 97efbf3 16f5ed7 97efbf3 f1d429a 97efbf3 f1d429a 97efbf3 f1d429a 97efbf3 f1d429a 97efbf3 1ab26ef 4299953 1ab26ef |
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 |
import streamlit as st
from PIL import Image
from transformers import pipeline
from gtts import gTTS
import torch
# Browser-tab title and icon for the app page.
# NOTE(review): the icon string looks mis-encoded (probably an emoji in the
# original source) — confirm the intended character.
st.set_page_config(
    page_title="Image-to-Audio Story Generator",
    page_icon="π¦",
)
def _load_caption_pipeline():
    """Build the BLIP image-captioning pipeline (this loads the model)."""
    return pipeline(
        "image-to-text",
        model="Salesforce/blip-image-captioning-base",
    )


def create_image_caption(image_file):
    """Generate a caption describing an image.

    Args:
        image_file: Anything ``PIL.Image.open`` accepts, e.g. a path or a
            binary file-like object such as a Streamlit upload.

    Returns:
        The ``generated_text`` of the first caption the model returns.
    """
    # Building the pipeline loads the model, which is far more expensive than
    # captioning itself. st.cache_resource keeps a single pipeline alive
    # across Streamlit reruns and sessions instead of rebuilding it per call.
    cached_loader = st.cache_resource(show_spinner=False)(_load_caption_pipeline)
    caption_generator = cached_loader()

    # The context manager releases PIL's handle on the image once the
    # pipeline has consumed it.
    with Image.open(image_file) as pil_image:
        caption_result = caption_generator(pil_image)

    return caption_result[0]["generated_text"]
def _load_story_pipeline():
    """Build the Qwen text-generation pipeline (this loads the model)."""
    return pipeline(
        "text-generation",
        model="Qwen/Qwen2.5-0.5B-Instruct",
    )


def _trim_to_full_sentences(text, max_words=95):
    """Cap *text* at ``max_words`` words and make it end on punctuation.

    When the text is too long it is cut to ``max_words`` words and then
    shortened to the last ``.``, ``!`` or ``?`` found in that prefix (if any).
    A final period is appended when the result does not already end a sentence.
    """
    words = text.split()
    if len(words) > max_words:
        text = " ".join(words[:max_words])
        last_stop = max(text.rfind(mark) for mark in ".!?")
        if last_stop != -1:
            text = text[: last_stop + 1]
    if text and text[-1] not in ".!?":
        text += "."
    return text


def build_children_story(image_caption):
    """Generate a short children's story inspired by an image caption.

    Args:
        image_caption: Text describing the image; inserted into the prompt.

    Returns:
        The generated story, limited to at most 95 words and ending with
        sentence punctuation. May be an empty string if the model generates
        nothing. Sampling is enabled, so results vary between calls.
    """
    story_prompt = (
        "You are a talented and imaginative storyteller for children aged 3 to 10. "
        "Using the details derived from the image below, craft a complete and captivating tale that includes three main characters, "
        "an adventurous journey, and delightful surprises. "
        "Your story should have a clear beginning, middle, and end, and be between 80 and 100 words in length.\n\n"
        f"Image Details: {image_caption}\n\nStory:"
    )

    # Reuse one pipeline across Streamlit reruns and sessions; rebuilding it
    # on every call reloads the model each time.
    cached_loader = st.cache_resource(show_spinner=False)(_load_story_pipeline)
    story_generator = cached_loader()

    generated_output = story_generator(
        story_prompt,
        max_new_tokens=150,
        num_return_sequences=1,
        do_sample=True,
        # Ask for the continuation only. Splitting the full text on "Story:"
        # to drop the prompt breaks when the caption contains that marker.
        return_full_text=False,
    )
    story_text = generated_output[0]["generated_text"].strip()
    return _trim_to_full_sentences(story_text)
def convert_text_to_audio(story_content, audio_file_name="output.mp3"):
    """Synthesize English speech for a text with gTTS and save it as a file.

    Args:
        story_content: The text to read aloud.
        audio_file_name: Path the audio is written to.

    Returns:
        ``audio_file_name``, unchanged, for convenient chaining.

    Raises:
        ValueError: If ``story_content`` is empty or only whitespace.
    """
    # Fail early with a clear message: there is nothing to speak, and handing
    # blank text to gTTS only produces a less helpful failure later on.
    if not story_content or not story_content.strip():
        raise ValueError("Cannot convert empty text to audio.")

    tts = gTTS(text=story_content, lang="en")
    tts.save(audio_file_name)
    return audio_file_name
def main_app():
    """Render the page: upload an image, caption it, tell a story, play audio.

    Nothing beyond the title and uploader is shown until an image is uploaded.
    If the story generator returns no text, a warning is displayed and the
    audio step is skipped.
    """
    import os
    import tempfile

    st.markdown("<h1 style='text-align: center;'>Image-to-Audio Story Generator π¦</h1>", unsafe_allow_html=True)
    st.write("Upload an image below to generate an engaging story from the picture, then convert the story into audio playback!")

    uploaded_image_file = st.file_uploader("Select Image", type=["png", "jpg", "jpeg"])
    if uploaded_image_file is None:
        return

    pil_image = Image.open(uploaded_image_file)
    st.image(pil_image, caption="Uploaded Image", use_container_width=True)

    with st.spinner("Generating image caption..."):
        image_caption = create_image_caption(uploaded_image_file)
    st.write("**Image Caption:**", image_caption)

    with st.spinner("Building story narrative..."):
        story_content = build_children_story(image_caption)

    # An empty story cannot be spoken; stop with a message instead of letting
    # the text-to-speech step raise.
    if not story_content or not story_content.strip():
        st.warning("No story could be generated for this image. Please try again or upload a different image.")
        return

    st.write("**Story:**")
    st.write(story_content)

    with st.spinner("Converting story to audio..."):
        # Write to a unique temporary file rather than a fixed name in the
        # working directory, so simultaneous sessions cannot overwrite each
        # other's audio and no stale file is left behind.
        handle, audio_path = tempfile.mkstemp(suffix=".mp3")
        os.close(handle)
        try:
            convert_text_to_audio(story_content, audio_path)
            with open(audio_path, "rb") as audio_file:
                audio_bytes = audio_file.read()
        finally:
            os.remove(audio_path)
    st.audio(audio_bytes, format="audio/mp3")
# Launch the app when this file is executed as a script.
if __name__ == "__main__":
    main_app()