import streamlit as st from PIL import Image from transformers import pipeline from gtts import gTTS import torch st.set_page_config(page_title="Image-to-Audio Story Generator", page_icon="🦜") def create_image_caption(image_file): pil_image = Image.open(image_file) caption_generator = pipeline( "image-to-text", model="Salesforce/blip-image-captioning-base", ) caption_result = caption_generator(pil_image) generated_caption = caption_result[0]["generated_text"] return generated_caption def build_children_story(image_description): story_generator = pipeline( "text-generation", model="Qwen/Qwen2.5-0.5B-Instruct", ) story_prompt = ( "You are a talented and imaginative storyteller for children aged 3 to 10. " "Using the details derived from the image below, craft a complete and captivating tale that includes three main characters, " "an adventurous journey, and delightful surprises. " "Your story should have a clear beginning, middle, and end, and be between 80 and 100 words in length.\n\n" f"Image Details: {caption_result}\n\nStory:" ) generated_output = story_pipeline( story_prompt, max_new_tokens=150, num_return_sequences=1, do_sample=True ) raw_story = generated_output[0]["generated_text"] if "Story:" in raw_story: story_text = raw_story.split("Story:", 1)[1].strip() else: story_text = raw_story.strip() story_words = story_text.split() if len(story_words) > 95: trimmed_story = " ".join(story_words[:95]) last_period = trimmed_story.rfind(".") last_exclamation = trimmed_story.rfind("!") last_question = trimmed_story.rfind("?") index = max(last_period, last_exclamation, last_question) if index != -1: trimmed_story = trimmed_story[:index + 1] story_text = trimmed_story if story_text and story_text[-1] not in ".!?": story_text += "." return story_text def convert_text_to_audio(story_content, audio_file_name="output.mp3"): tts = gTTS(text=story_content, lang="en") tts.save(audio_file_name) return audio_file_name def main_app(): st.markdown("