Spaces:
Runtime error
Runtime error
File size: 5,111 Bytes
1e11ba3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 |
import data
import torch
import gradio as gr
from models import imagebind_model
from models.imagebind_model import ModalityType
# Use the first CUDA GPU when PyTorch reports one as available; otherwise run on the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Build the ImageBind "huge" model once, at import time, requesting pretrained weights.
# The zero-shot functions in this file all read this single module-level instance.
model = imagebind_model.imagebind_huge(pretrained=True)
# Switch to evaluation mode; this file only runs inference.
model.eval()
# Place the model on the selected device; inputs are loaded onto the same `device`.
model.to(device)
def _parse_labels(text_list):
    """Split a "|"-separated string into a clean list of candidate labels.

    Surrounding whitespace is stripped from every label, blank entries (for
    example the one produced by a trailing "|") are dropped, and repeated
    labels are kept only once, in first-seen order.

    Args:
        text_list: Candidate texts separated by "|". ``None`` is treated
            like an empty string.

    Returns:
        A non-empty list of unique label strings.

    Raises:
        ValueError: If no non-blank label remains.
    """
    pieces = (part.strip() for part in (text_list or "").split("|"))
    # dict.fromkeys de-duplicates while preserving order. A repeated label
    # would otherwise take a share of the softmax mass and then be collapsed
    # into a single key of the result dict.
    labels = list(dict.fromkeys(piece for piece in pieces if piece))
    if not labels:
        raise ValueError(
            'Please provide at least one candidate text (separate several with "|").'
        )
    return labels


def _require_media(media, kind):
    """Return ``media`` unchanged, or raise ValueError when it is missing."""
    if not media:
        raise ValueError(f"Please provide an input {kind} for this task.")
    return media


def _zeroshot_scores(labels, modality, load_media, media_path):
    """Score one media file against candidate labels with the shared model.

    Args:
        labels: Candidate label strings.
        modality: The ``ModalityType`` key under which the media is embedded.
        load_media: Loader called as ``load_media([media_path], device)``.
        media_path: The single media input to classify.

    Returns:
        A dict mapping each label to its softmax score.
    """
    inputs = {
        ModalityType.TEXT: data.load_and_transform_text(labels, device),
        modality: load_media([media_path], device),
    }
    # Inference only: no gradients are needed.
    with torch.no_grad():
        embeddings = model(inputs)
    # Similarity of the media embedding to every text embedding, normalised
    # over the labels (last dimension).
    similarity = embeddings[modality] @ embeddings[ModalityType.TEXT].T
    scores = torch.softmax(similarity, dim=-1).squeeze(0).tolist()
    return dict(zip(labels, scores))


def image_text_zeroshot(image, text_list):
    """Classify an image against "|"-separated candidate texts.

    Args:
        image: The image input, passed to ``data.load_and_transform_vision_data``.
        text_list: Candidate texts separated by "|".

    Returns:
        A dict mapping each candidate text to its softmax score.

    Raises:
        ValueError: If no candidate text or no image is provided.
    """
    # Validate the inputs first so a bad request fails with a clear message
    # before any model work starts.
    labels = _parse_labels(text_list)
    image_path = _require_media(image, "image")
    return _zeroshot_scores(
        labels,
        ModalityType.VISION,
        data.load_and_transform_vision_data,
        image_path,
    )


def audio_text_zeroshot(audio, text_list):
    """Classify an audio clip against "|"-separated candidate texts.

    Args:
        audio: The audio input, passed to ``data.load_and_transform_audio_data``.
        text_list: Candidate texts separated by "|".

    Returns:
        A dict mapping each candidate text to its softmax score.

    Raises:
        ValueError: If no candidate text or no audio is provided.
    """
    labels = _parse_labels(text_list)
    audio_path = _require_media(audio, "audio")
    return _zeroshot_scores(
        labels,
        ModalityType.AUDIO,
        data.load_and_transform_audio_data,
        audio_path,
    )


def video_text_zeroshot(video, text_list):
    """Classify a video against "|"-separated candidate texts.

    The video is embedded under the ``ModalityType.VISION`` key, the same key
    used for still images.

    Args:
        video: The video input, passed to ``data.load_and_transform_video_data``.
        text_list: Candidate texts separated by "|".

    Returns:
        A dict mapping each candidate text to its softmax score.

    Raises:
        ValueError: If no candidate text or no video is provided.
    """
    labels = _parse_labels(text_list)
    video_path = _require_media(video, "video")
    return _zeroshot_scores(
        labels,
        ModalityType.VISION,
        data.load_and_transform_video_data,
        video_path,
    )
def inference(
    task,
    text_list=None,
    image=None,
    audio=None,
    video=None,
):
    """Route a demo request to the zero-shot function for the chosen task.

    Args:
        task: One of "image-text", "audio-text" or "video-text".
        text_list: Candidate texts, forwarded unchanged to the chosen function.
        image: Image input; only used when ``task`` is "image-text".
        audio: Audio input; only used when ``task`` is "audio-text".
        video: Video input; only used when ``task`` is "video-text".

    Returns:
        Whatever the selected zero-shot function returns.

    Raises:
        NotImplementedError: If ``task`` is not one of the three known tasks.
    """
    if task == "image-text":
        return image_text_zeroshot(image, text_list)
    if task == "audio-text":
        return audio_text_zeroshot(audio, text_list)
    if task == "video-text":
        return video_text_zeroshot(video, text_list)
    # Anything else is a task this demo does not implement.
    raise NotImplementedError
def main():
    """Build the Gradio interface for the demo and launch it.

    The interface takes five inputs in the order expected by ``inference``:
    task, candidate texts, image, audio and video. The result is rendered
    with Gradio's "label" output.

    Components come from the top-level ``gr`` namespace (``gr.Radio``,
    ``gr.Textbox``, ...) with ``value=`` for the initial selection. The legacy
    ``gr.inputs.*`` classes and their ``default=`` keyword are deprecated in
    Gradio 3.x and no longer available in Gradio 4.x.
    """
    inputs = [
        gr.Radio(
            choices=[
                "image-text",
                "audio-text",
                "video-text",
            ],
            type="value",
            value="image-text",
            label="Task",
        ),
        gr.Textbox(lines=1, label="Candidate texts"),
        # type="filepath" hands the handler a path on disk instead of decoded data.
        gr.Image(type="filepath", label="Input image"),
        gr.Audio(type="filepath", label="Input audio"),
        gr.Video(label="Input video"),
    ]

    iface = gr.Interface(
        inference,
        inputs,
        "label",
        # Each example row matches the input order above; unused media slots are None.
        examples=[
            ["image-text", "A dog|A car|A bird", "assets/dog_image.jpg", None, None],
            ["image-text", "A dog|A car|A bird", "assets/car_image.jpg", None, None],
            ["audio-text", "A dog|A car|A bird", None, "assets/bird_audio.wav", None],
            ["video-text", "A dog|A car|A bird", None, None, "assets/dog_video.mp4"],
        ],
        # NOTE(review): the "Duplicate Space" link below points at
        # OFA-Sys/chinese-clip-zero-shot-image-classification, not an ImageBind
        # Space. It looks copied from another demo; confirm the intended Space ID.
        description="""<p>This is a simple demo of ImageBind for zero-shot cross-modal understanding (now including image classification, audio classification, and video classification). Please refer to the original <a href='https://arxiv.org/abs/2305.05665' target='_blank'>paper</a> and <a href='https://github.com/facebookresearch/ImageBind' target='_blank'>repo</a> for more details.<br>
To test your own cases, you can upload an image, an audio or a video, and provide the candidate texts separated by "|".<br>
You can duplicate this space and run it privately: <a href='https://huggingface.co/spaces/OFA-Sys/chinese-clip-zero-shot-image-classification?duplicate=true'><img src='https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAAXNSR0IArs4c6QAAAP5JREFUOE+lk7FqAkEURY+ltunEgFXS2sZGIbXfEPdLlnxJyDdYB62sbbUKpLbVNhyYFzbrrA74YJlh9r079973psed0cvUD4A+4HoCjsA85X0Dfn/RBLBgBDxnQPfAEJgBY+A9gALA4tcbamSzS4xq4FOQAJgCDwV2CPKV8tZAJcAjMMkUe1vX+U+SMhfAJEHasQIWmXNN3abzDwHUrgcRGmYcgKe0bxrblHEB4E/pndMazNpSZGcsZdBlYJcEL9Afo75molJyM2FxmPgmgPqlWNLGfwZGG6UiyEvLzHYDmoPkDDiNm9JR9uboiONcBXrpY1qmgs21x1QwyZcpvxt9NS09PlsPAAAAAElFTkSuQmCC&logoWidth=14' alt='Duplicate Space'></a></p>""",
        title="ImageBind: Zero-shot Cross-modal Understanding",
    )

    iface.launch()
# Launch the demo only when this file is run as a script, not when it is imported.
if __name__ == "__main__":
    main()
|