# PyTorch is used here to detect GPU availability
import torch

# Gradio provides the web interface
import gradio as gr

# Pillow's Image type is used for the image input
from PIL import Image

# scipy writes the generated speech to a WAV file
import scipy.io.wavfile as wavfile

# Hugging Face pipelines are a high-level helper around pretrained models
from transformers import pipeline

# device: 0 for GPU, -1 for CPU
device = 0 if torch.cuda.is_available() else -1

# Text-to-speech model (English)
narrator = pipeline(
    "text-to-speech",
    model="facebook/mms-tts-eng",
    device=device
)

# Load the pretrained image captioning model
caption_image = pipeline(
    "image-to-text",
    model="Salesforce/blip-image-captioning-base",
    device=device
)
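
# Optional sanity check (a hedged sketch, not part of the app): the captioning
# pipeline can be tried on its own before wiring up the UI. "example.jpg" is a
# hypothetical local file; the pipeline typically returns a list of dicts such
# as [{"generated_text": "a dog running on the beach"}].
#
# print(caption_image(Image.open("example.jpg")))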

# Generate speech audio from text and save it as a WAV file
def generate_audio(text):
    # Run the text-to-speech pipeline
    narrated_text = narrator(text)

    # The pipeline returns a dict with "audio" and "sampling_rate";
    # the audio is typically a float32 array of shape (1, num_samples)
    audio = narrated_text["audio"]
    # Some versions return a list of arrays; take the first one
    if isinstance(audio, list):
        audio = audio[0]
    # Drop any leading batch dimension so scipy writes a mono waveform
    if getattr(audio, "ndim", 1) > 1:
        audio = audio[0]

    # Save the audio to a WAV file
    output_path = "output.wav"
    wavfile.write(output_path, rate=narrated_text["sampling_rate"], data=audio)

    # Return the path to the saved WAV file
    return output_path

# Caption an image, then narrate the caption
def caption_my_image(pil_image: Image.Image):
    # Call the captioning pipeline on the PIL image
    result = caption_image(pil_image)

    # The pipeline usually returns a list of dicts like [{"generated_text": "..."}]
    if isinstance(result, list):
        semantics = result[0]["generated_text"]
    else:
        semantics = result["generated_text"]

    audio = generate_audio(semantics)
    return semantics, audio  # return both the caption text and the audio file path


# Note: gr.close_all() is not needed here, so it is left out.

demo = gr.Interface(
    fn=caption_my_image,
    inputs=[gr.Image(label="Select Image", type="pil")],
    outputs=[
        gr.Textbox(label="Image Caption"),
        gr.Audio(label="Image Caption Audio")
    ],
    title="Image Captioning with Audio Output",
    description="Upload an image to generate a caption and hear it narrated."
)

demo.launch()
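
# To run this script (a rough sketch, assuming a fresh environment; versions are
# not pinned): pip install torch transformers gradio pillow scipy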