# PyTorch is used here to detect GPU availability
import torch

# Gradio provides the web interface
import gradio as gr

# Pillow's Image type is used for the image input
from PIL import Image

# scipy writes the generated speech to a WAV file
import scipy.io.wavfile as wavfile

# Hugging Face pipelines are a high-level helper around pretrained models
from transformers import pipeline

# device: 0 for GPU, -1 for CPU
device = 0 if torch.cuda.is_available() else -1

# Text-to-speech model (English)
narrator = pipeline(
    "text-to-speech",
    model="facebook/mms-tts-eng",
    device=device
)

# Load the pretrained image captioning model
caption_image = pipeline(
    "image-to-text",
    model="Salesforce/blip-image-captioning-base",
    device=device
)
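
# Optional sanity check (a hedged sketch, not part of the app): the captioning
# pipeline can be tried on its own before wiring up the UI. "example.jpg" is a
# hypothetical local file; the pipeline typically returns a list of dicts such
# as [{"generated_text": "a dog running on the beach"}].
#
# print(caption_image(Image.open("example.jpg")))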

# Generate speech audio from text and save it as a WAV file
def generate_audio(text):
    # Run the text-to-speech pipeline
    narrated_text = narrator(text)

    # The pipeline returns a dict with "audio" and "sampling_rate";
    # the audio is typically a float32 array of shape (1, num_samples)
    audio = narrated_text["audio"]
    # Some versions return a list of arrays; take the first one
    if isinstance(audio, list):
        audio = audio[0]
    # Drop any leading batch dimension so scipy writes a mono waveform
    if getattr(audio, "ndim", 1) > 1:
        audio = audio[0]

    # Save the audio to a WAV file
    output_path = "output.wav"
    wavfile.write(output_path, rate=narrated_text["sampling_rate"], data=audio)

    # Return the path to the saved WAV file
    return output_path

# Caption an image, then narrate the caption
def caption_my_image(pil_image: Image.Image):
    # Call the captioning pipeline on the PIL image
    result = caption_image(pil_image)

    # The pipeline usually returns a list of dicts like [{"generated_text": "..."}]
    if isinstance(result, list):
        semantics = result[0]["generated_text"]
    else:
        semantics = result["generated_text"]

    audio = generate_audio(semantics)
    return semantics, audio  # return both the caption text and the audio file path


# Note: gr.close_all() is not needed here, so it is left out.

demo = gr.Interface(
    fn=caption_my_image,
    inputs=[gr.Image(label="Select Image", type="pil")],
    outputs=[
        gr.Textbox(label="Image Caption"),
        gr.Audio(label="Image Caption Audio")
    ],
    title="Image Captioning with Audio Output",
    description="Upload an image to generate a caption and hear it narrated."
)

demo.launch()
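
# To run this script (a rough sketch, assuming a fresh environment; versions are
# not pinned): pip install torch transformers gradio pillow scipy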