"""Streamlit app that turns an uploaded image into a narrated story.

Pipeline: image -> caption (BLIP) -> story (text-generation model) ->
speech (gTTS), with each stage rendered in the page as it completes.
"""

import streamlit as st
from PIL import Image
from transformers import pipeline
from gtts import gTTS

st.set_page_config(page_title="Your Image to Audio Story", page_icon="🦜")

# Budget for newly generated story tokens. The prompt asks for up to 300
# words; a token cap must sit above that because tokens are shorter than
# words, and it must not count the prompt itself (which `max_length` does).
STORY_MAX_NEW_TOKENS = 512


@st.cache_resource
def _load_caption_pipeline():
    """Build the image-captioning pipeline once and reuse it across reruns."""
    return pipeline(
        "image-to-text", model="Salesforce/blip-image-captioning-base"
    )


@st.cache_resource
def _load_story_pipeline():
    """Build the story text-generation pipeline once and reuse it across reruns."""
    return pipeline("text-generation", model="Qwen/QwQ-32B")


def generate_caption(image_file):
    """Return a caption for the image stored in *image_file*.

    Args:
        image_file: Anything ``PIL.Image.open`` accepts (a path or a
            binary file-like object such as a Streamlit upload).

    Returns:
        The ``generated_text`` of the first captioning result.
    """
    image = Image.open(image_file)
    caption_results = _load_caption_pipeline()(image)
    return caption_results[0]["generated_text"]


def generate_story(caption):
    """Generate a children's fairy tale based on *caption*.

    Args:
        caption: Image caption text that is embedded in the prompt.

    Returns:
        The generated story as a plain string.
    """
    messages = [
        {
            "role": "user",
            "content": (
                f"Please based on following image caption: '{caption}', "
                "generate a complete fairy tale story for children "
                "with at least 100 words and max 300 words"
            ),
        }
    ]
    result = _load_story_pipeline()(
        messages,
        max_new_tokens=STORY_MAX_NEW_TOKENS,
        num_return_sequences=1,
    )
    generated = result[0]["generated_text"]
    if isinstance(generated, str):
        return generated
    # For chat-style input the pipeline hands back the whole conversation
    # as a list of message dicts; the model's reply is the last entry.
    # Returning the list itself would break both st.write and gTTS.
    return generated[-1]["content"]


# Illustration generation is currently disabled. It is kept here, fully
# commented out (decorator included), so that the caching decorator can no
# longer attach itself to the next live function in the file.
#
# @st.cache_resource
# def load_image_generator():
#     device = "cuda" if torch.cuda.is_available() else "cpu"
#     torch_dtype = torch.float16 if device == "cuda" else torch.float32
#     pipe = DiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5")
#     pipe = pipe.to(device)
#     return pipe
#
# def generate_illustration(prompt):
#     pipe = load_image_generator()
#     image_result = pipe(prompt)
#     generated_image = image_result.images[0]
#     return generated_image


def text_to_speech(text, output_file="output.mp3"):
    """Synthesize English speech for *text* and save it as an MP3.

    Deliberately not cached: the result is a file written to a fixed path,
    so a cache hit would skip the write and leave stale audio on disk.

    Args:
        text: The text to speak.
        output_file: Destination path of the MP3 file.

    Returns:
        ``output_file``, unchanged, for convenient chaining.
    """
    tts = gTTS(text=text, lang="en")
    tts.save(output_file)
    return output_file


def main():
    """Render the page and run caption -> story -> audio for an upload."""
    # NOTE(review): this literal was split across physical lines (a syntax
    # error) and, given unsafe_allow_html=True, presumably once carried HTML
    # heading markup that has been lost. The visible text is preserved
    # exactly; restore the intended tags if they are known.
    st.markdown(
        "\n\nYour Image to Audio Story 🦜\n\n", unsafe_allow_html=True
    )
    st.write(
        "Upload an image below and we will generate an engaging story "
        "from the picture, then convert the story into an audio playback!"
    )

    uploaded_file = st.file_uploader(
        "Select Image", type=["png", "jpg", "jpeg"]
    )
    if uploaded_file is None:
        # Nothing to process until the user picks a file.
        return

    image = Image.open(uploaded_file)
    st.image(image, caption="Uploaded image", use_container_width=True)

    with st.spinner("Image caption being generated..."):
        caption = generate_caption(uploaded_file)
    st.write("**Image Caption:**", caption)

    with st.spinner("Generating story..."):
        story = generate_story(caption)
    st.write("**Story:**")
    st.write(story)

    # with st.spinner("Generating illustration..."):
    #     illustration = generate_illustration(story[:200])
    # st.write("### Story Illustrations:")
    # st.image(illustration, caption="Story Illustrations", use_container_width=True)

    with st.spinner("Converting to voice...."):
        audio_file = text_to_speech(story)
    st.audio(audio_file, format="audio/mp3")


if __name__ == "__main__":
    main()