import streamlit as st from PIL import Image from transformers import pipeline from gtts import gTTS st.set_page_config(page_title="Your Image to Audio Story", page_icon="🦜") def generate_caption(image_file): image = Image.open(image_file) caption_generator = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base") caption_results = caption_generator(image) caption = caption_results[0]['generated_text'] return caption def generate_story(caption): story_generator = pipeline("text-generation", model="Qwen/QwQ-32B") messages = [ { "role": "user", "content": f"Please based on following image caption: '{caption}', generate a complete fairy tale story for children with at least 100 words and max 300 words" } ] result = story_generator(messages, max_length=300, num_return_sequences=1) story = result[0]['generated_text'] return story @st.cache_resource # def load_image_generator(): # device = "cuda" if torch.cuda.is_available() else "cpu" # torch_dtype = torch.float16 if device == "cuda" else torch.float32 # pipe = DiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5") # pipe = pipe.to(device) # return pipe # def generate_illustration(prompt): # pipe = load_image_generator() # image_result = pipe(prompt) # generated_image = image_result.images[0] # return generated_image def text_to_speech(text, output_file="output.mp3"): tts = gTTS(text=text, lang="en") tts.save(output_file) return output_file def main(): st.markdown("