import gradio as gr
from transformers import pipeline

# Image-captioning pipeline: BLIP generates a text description of an image.
img_text_pipe = pipeline("image-to-text",
                         model="./models/Salesforce/blip-image-captioning-base")

# Text-to-speech pipeline: VITS synthesizes speech from the caption.
narrator = pipeline("text-to-speech",
                    model="./models/kakao-enterprise/vits-ljs")

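# Quick sanity check, as a sketch ("sample.jpg" is a hypothetical local file):
#     caption = img_text_pipe("sample.jpg")[0]["generated_text"]
#     speech = narrator(caption)  # dict with "audio" and "sampling_rate" keys
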
def describe_image(image):
    # Caption the uploaded PIL image with the BLIP pipeline.
    img_text_pipe_output = img_text_pipe(image)
    description_text = img_text_pipe_output[0]["generated_text"]

    # Synthesize speech from the caption; the pipeline returns a dict with
    # an "audio" array of shape (1, num_samples) and its "sampling_rate".
    narration = narrator(description_text)

    # Gradio's audio output expects a (sampling_rate, waveform) tuple.
    return narration["sampling_rate"], narration["audio"][0]

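# As an alternative sketch (assumes scipy is installed), the function could
# write the narration to a WAV file and return the file path instead:
#     import scipy.io.wavfile as wavfile
#     wavfile.write("narration.wav", narration["sampling_rate"],
#                   narration["audio"][0])
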
iface = gr.Interface(fn=describe_image,
                     inputs=gr.Image(label="Input image", type="pil"),
                     outputs="audio")

iface.launch()
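# Optional: launch(share=True) creates a temporary public link, and
# launch(server_name="0.0.0.0") exposes the app on the local network.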