# torch is only used here to check for GPU availability
import torch
# Gradio builds the web interface
import gradio as gr
# PIL image type used by the Gradio Image input
from PIL import Image
# scipy writes the generated speech to a WAV file
import scipy.io.wavfile as wavfile
# Hugging Face pipelines give high-level access to the models
from transformers import pipeline
# device: 0 for GPU, -1 for CPU
device = 0 if torch.cuda.is_available() else -1
# Text-to-speech model (English)
narrator = pipeline(
    "text-to-speech",
    model="facebook/mms-tts-eng",
    device=device
)
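# Note on the output format (observed for this model; treat the exact array
# shape as an assumption rather than a guarantee):
#   narrator("Hello there")
#   -> {"audio": <numpy array, often shape (1, num_samples)>, "sampling_rate": 16000}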
# Load the pretrained image captioning model
caption_image = pipeline(
    "image-to-text",
    model="Salesforce/blip-image-captioning-base",
    device=device
)
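# Note: the image-to-text pipeline typically returns a list of dicts, e.g.
#   caption_image(Image.open("example.jpg"))
#   -> [{"generated_text": "a photo of ..."}]
# ("example.jpg" is only an illustrative local file, not part of this app)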
# Generate a narrated WAV file from a text caption
def generate_audio(text):
    # Run the text-to-speech model
    narrated_text = narrator(text)
    # The pipeline returns a dict with "audio" and "sampling_rate"
    audio = narrated_text["audio"]
    # Some pipeline versions return a list of arrays; take the first one
    if isinstance(audio, list):
        audio = audio[0]
    # MMS-TTS usually returns a 2-D array of shape (1, num_samples); flatten it
    if hasattr(audio, "ndim") and audio.ndim > 1:
        audio = audio.squeeze()
    # Save the audio to a WAV file
    output_path = "output.wav"
    wavfile.write(output_path, rate=narrated_text["sampling_rate"], data=audio)
    # Return the path to the saved WAV file
    return output_path
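# Quick sanity check (a minimal sketch, run outside the app after calling
# generate_audio; scipy's reader returns the sample rate and the sample array):
#   rate, data = wavfile.read("output.wav")
#   print(rate, data.shape)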
# Caption an image and narrate the caption
def caption_my_image(pil_image: Image.Image):
    # Call the pipeline with a positional input (an `images=` keyword also works)
    result = caption_image(pil_image)
    # The result is usually a list of dicts; be defensive about the shape
    if isinstance(result, list):
        semantics = result[0]["generated_text"]
    else:
        semantics = result["generated_text"]
    # Narrate the caption and get the path of the saved WAV file
    audio = generate_audio(semantics)
    # Return both the caption text and the audio file path
    return semantics, audio
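# Standalone usage sketch (assumes a local file "example.jpg"; not needed when
# running through the Gradio interface below):
#   caption, wav_path = caption_my_image(Image.open("example.jpg"))
#   print(caption, wav_path)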
# Build the Gradio interface: image in, caption text and narrated audio out
demo = gr.Interface(
    fn=caption_my_image,
    inputs=[gr.Image(label="Select Image", type="pil")],
    outputs=[
        gr.Textbox(label="Image Caption"),
        gr.Audio(label="Image Caption Audio")
    ],
    title="Image Captioning with Audio Output",
    description="Upload an image to get an AI-generated caption and hear it read aloud."
)
demo.launch()
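# To also get a temporary public link (requires outbound network access),
# the app could instead be launched with:
#   demo.launch(share=True)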