import gradio as gr
from transformers import pipeline

# Model pipelines are loaded once at module import (expensive), not per request.
# Both models are read from local paths; they must exist on disk.
img_text_pipe = pipeline(
    "image-to-text",
    model="./models/Salesforce/blip-image-captioning-base",
)
narrator = pipeline(
    "text-to-speech",
    model="./models/kakao-enterprise/vits-ljs",
)


def describe_image(file_path):
    """Caption an image and narrate the caption as speech.

    Args:
        file_path: Image input from the Gradio component (a PIL image,
            given ``type="pil"`` on the input component).

    Returns:
        A ``(sampling_rate, waveform)`` tuple, which is the format the
        Gradio ``audio`` output component expects for numpy audio data.
    """
    caption_output = img_text_pipe(file_path)
    description_text = caption_output[0]["generated_text"]
    narrated = narrator(description_text)
    # BUG FIX: the original returned only narrated["audio"][0] (the raw
    # waveform). Gradio's Audio component needs the sampling rate paired
    # with the data, otherwise playback speed/pitch is wrong or the
    # component rejects the value. The TTS pipeline supplies it.
    return narrated["sampling_rate"], narrated["audio"][0]


iface = gr.Interface(
    fn=describe_image,
    inputs=gr.Image(label="Input image", type="pil"),
    outputs="audio",
)
iface.launch()