"""Streamlit app: caption an uploaded image, expand it into a story, read it aloud."""

import streamlit as st
from gtts import gTTS
from PIL import Image
from transformers import pipeline

st.set_page_config(page_title="Your Image to Audio Story", page_icon="🦜")

# A story shorter than this many whitespace-separated words triggers one
# additional generation pass in generate_story().
MIN_STORY_WORDS = 100


@st.cache_resource
def _load_caption_generator():
    """Build the image-captioning pipeline once and reuse it across reruns."""
    return pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")


@st.cache_resource
def _load_story_generator():
    """Build the text-generation pipeline once and reuse it across reruns."""
    return pipeline("text-generation", model="gpt2")


def generate_caption(image_file) -> str:
    """Return a generated caption for an image.

    Args:
        image_file: Anything ``PIL.Image.open`` accepts (a path or a binary
            file-like object such as a Streamlit upload).

    Returns:
        The ``generated_text`` of the first caption result.
    """
    image = Image.open(image_file)
    caption_generator = _load_caption_generator()
    caption_results = caption_generator(image)
    return caption_results[0]["generated_text"]


def generate_story(prompt: str) -> str:
    """Generate a story from a text prompt.

    A first pass is generated with ``max_length=300``. If it contains fewer
    than ``MIN_STORY_WORDS`` words, a second pass with ``max_length=350`` is
    generated from the same prompt and appended after a single space.

    Args:
        prompt: The text handed to the text-generation pipeline.

    Returns:
        The generated story text.
    """
    story_generator = _load_story_generator()
    result = story_generator(prompt, max_length=300, num_return_sequences=1)
    story = result[0]["generated_text"]

    if len(story.split()) < MIN_STORY_WORDS:
        additional = story_generator(
            prompt, max_length=350, num_return_sequences=1
        )[0]["generated_text"]
        story += " " + additional

    return story


# ----------------------------
# generate_illustration
# ----------------------------
# NOTE: illustration support is disabled. The code below is kept for
# reference only; it needs `torch` and `DiffusionPipeline`, neither of which
# is imported by this file.
#
# @st.cache_resource
# def load_image_generator():
#     device = "cuda" if torch.cuda.is_available() else "cpu"
#     torch_dtype = torch.float16 if device == "cuda" else torch.float32
#     pipe = DiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5")
#     pipe = pipe.to(device)  # TODO confirm: the right-hand side was lost in this file
#     return pipe
#
# def generate_illustration(prompt):
#     pipe = load_image_generator()
#     image_result = pipe(prompt)
#     generated_image = image_result.images[0]
#     return generated_image


def text_to_speech(text: str, output_file: str = "output.mp3") -> str:
    """Synthesize English speech for ``text`` and write it to ``output_file``.

    Args:
        text: The text to speak.
        output_file: Destination path of the MP3 file.

    Returns:
        ``output_file``, so the caller can hand the path to an audio player.
    """
    tts = gTTS(text=text, lang="en")
    # Without this call nothing is written and the returned path is dangling.
    tts.save(output_file)
    return output_file


def main() -> None:
    """Render the page and run the upload -> caption -> story -> audio flow."""
    # Raw HTML is used for the title, hence unsafe_allow_html=True; the markup
    # is a fixed literal and contains no user input.
    st.markdown(
        "<h1 style='text-align: center;'>Your Image to Audio Story 🦜</h1>",
        unsafe_allow_html=True,
    )
    st.write(
        "Upload an image below and we will generate an engaging story from the picture, then convert the story into an audio playback!"
    )

    uploaded_file = st.file_uploader("Select Image", type=["png", "jpg", "jpeg"])
    if uploaded_file is None:
        return

    image = Image.open(uploaded_file)
    st.image(image, caption="Uploaded image", use_column_width=True)

    with st.spinner("Image caption being generated..."):
        caption = generate_caption(uploaded_file)
    st.write("**Image Caption:**", caption)

    with st.spinner("Generating story..."):
        story_prompt = f"Please generate a children's story based on this description: {caption}"
        story = generate_story(story_prompt)
    st.write("**Story:**")
    st.write(story)

    # Disabled together with generate_illustration above.
    # with st.spinner("Generating illustration..."):
    #     illustration = generate_illustration(story[:200])
    # st.write("### Story Illustrations:")
    # st.image(illustration, caption="Story Illustrations", use_column_width=True)

    with st.spinner("Converting to voice...."):
        audio_file = text_to_speech(story)
    st.audio(audio_file, format="audio/mp3")


if __name__ == "__main__":
    main()