# torch: used to check for GPU availability
import torch
# gradio: used to build the web interface
import gradio as gr
# PIL: used to open and type-hint images
from PIL import Image
# scipy: used to write the generated audio to a WAV file
import scipy.io.wavfile as wavfile

# Use a pipeline as a high-level helper
from transformers import pipeline

# device: 0 for GPU, -1 for CPU
device = 0 if torch.cuda.is_available() else -1

# Text-to-speech model (English)
narrator = pipeline(
    "text-to-speech",
    model="facebook/mms-tts-eng",
    device=device
)

# Load the pretrained image-captioning model
caption_image = pipeline(
    "image-to-text",
    model="Salesforce/blip-image-captioning-base",
    device=device
)


# Define the function to generate audio from text
def generate_audio(text):
    # Run the TTS model on the caption text
    narrated_text = narrator(text)

    # The pipeline returns a dict with "audio" and "sampling_rate"
    audio = narrated_text["audio"]
    # Some versions return a list of arrays; take the first one
    if isinstance(audio, list):
        audio = audio[0]
    # Flatten a possible (1, num_samples) array to 1-D so scipy writes a mono WAV file
    audio = audio.squeeze()

    # Save the audio to a WAV file
    output_path = "output.wav"
    wavfile.write(output_path, rate=narrated_text["sampling_rate"], data=audio)

    # Return the path to the saved output WAV file
    return output_path


def caption_my_image(pil_image: Image.Image):
    # Call the pipeline with a positional input (no `images=` keyword)
    result = caption_image(pil_image)

    # The result is usually a list of dicts
    if isinstance(result, list):
        semantics = result[0]["generated_text"]
    else:
        semantics = result["generated_text"]

    audio = generate_audio(semantics)
    # Return both the caption text and the audio file path
    return semantics, audio


# gr.close_all()  # not needed; removed to avoid issues
demo = gr.Interface(
    fn=caption_my_image,
    inputs=[gr.Image(label="Select Image", type="pil")],
    outputs=[
        gr.Textbox(label="Image Caption"),
        gr.Audio(label="Image Caption Audio")
    ],
    title="Image Captioning with Audio Output",
    description="This application captions images with the help of AI and reads the caption aloud."
)
demo.launch()
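
# Optional usage note (an assumption, not part of the original script): Gradio's
# launch() also accepts share=True to create a temporary public URL, which can be
# handy when running from a notebook or a remote machine, e.g.:
#
#     demo.launch(share=True)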