"""Streamlit app: caption an uploaded image, turn the caption into a short
story, and read the story aloud."""

import os

import streamlit as st
from gtts import gTTS
from PIL import Image
from transformers import pipeline


@st.cache_resource
def load_models():
    """Build the captioning and story-generation pipelines.

    Streamlit re-executes the whole script on every interaction, so the
    pipelines are cached with ``st.cache_resource`` and constructed only
    once per server process instead of on each rerun.

    Returns:
        A ``(image_to_text, storyteller)`` tuple of ``transformers`` pipelines.
    """
    image_to_text = pipeline(
        "image-to-text",
        model="Salesforce/blip-image-captioning-base",
    )
    storyteller = pipeline(
        "text-generation",
        model="pranavpsv/gpt2-genre-story-generator",
        temperature=0.75,
        max_length=100,
    )
    return image_to_text, storyteller


def generate_caption(image, image_to_text):
    """Return a caption for *image* produced by the *image_to_text* pipeline.

    Falls back to ``"No caption generated."`` when the pipeline returns an
    empty result.
    """
    result = image_to_text(image)
    return result[0]["generated_text"] if result else "No caption generated."


def generate_story(text, storyteller):
    """Generate a short narrative seeded with *text*.

    Args:
        text: Seed text (here, the image caption).
        storyteller: A ``text-generation`` pipeline.

    Returns:
        The generated text with surrounding whitespace removed, or
        ``"No story generated."`` when nothing usable comes back.
    """
    # NOTE(review): the model card for this checkpoint appears to describe a
    # "<BOS> <genre>" prompt prefix; none is present here -- confirm whether
    # markup was lost from this prompt at some point.
    prompt = f" {text}"
    story = storyteller(prompt, max_length=100, num_return_sequences=1)
    if not story:
        return "No story generated."

    # Only the prompt's leading space has to go. The previous
    # ``.replace(" ", "")`` removed *every* space and glued all the words of
    # the story together.
    generated_story = story[0]["generated_text"].strip()
    return generated_story if generated_story else "No story generated."


def text_to_speech(text, filename="output.mp3"):
    """Synthesize *text* with gTTS, save it to *filename*, and return the path.

    NOTE(review): the default filename is a fixed path in the working
    directory, so concurrent sessions would presumably overwrite each
    other's audio -- confirm whether that matters for this deployment.
    """
    tts = gTTS(text)
    tts.save(filename)
    return filename


def main():
    """Run the app: upload -> caption -> story -> speech."""
    st.title("AI-Powered Image Captioning and Storytelling")

    image_to_text, storyteller = load_models()

    uploaded_file = st.file_uploader(
        "Upload an image...", type=["jpg", "png", "jpeg"]
    )

    if uploaded_file is not None:
        # Convert the uploaded file to a PIL image.
        image = Image.open(uploaded_file)
        st.image(image, caption="Uploaded Image", use_container_width=True)

        with st.spinner("Generating caption..."):
            caption = generate_caption(image, image_to_text)
        st.write("### Image Caption:")
        st.write(caption)

        with st.spinner("Generating story..."):
            story = generate_story(caption, storyteller)
        st.write("### Generated Story:")
        st.write(story)

        with st.spinner("Generating speech..."):
            audio_file = text_to_speech(story)
        st.audio(audio_file, format="audio/mp3")


if __name__ == "__main__":
    main()