import os
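# Build and install the project's extension modules in development mode before the
# demo imports code that depends on them (presumably custom ops used by the detector).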
os.system("python setup.py build develop --user")
import gradio as gr
from app_util import ContextDetDemo
header = '''
<div align=center>
<h1 style="font-weight: 900; margin-bottom: 7px;">
Contextual Object Detection with Multimodal Large Language Models
</h1>
</div>
'''
abstract = '''
This is the official Gradio demo for <b>Contextual Object Detection with Multimodal Large Language Models</b>.
Our goal is to promote object detection with better `context understanding` and to enable `interactive feedback`
through a `human language vocabulary`, all made possible by multimodal large language models!
This demo is still under construction. Your comments or suggestions are welcome!
For faster inference without waiting in the queue, you may duplicate the Space and use the GPU setting:
<a href="https://huggingface.co/spaces/yuhangzang/ContextDet-Demo?duplicate=true">
<img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
<p/>
'''
footer = r'''
**GitHub Repo**

We would be grateful if you would consider starring our <a href="https://github.com/yuhangzang/ContextDET">GitHub repo</a>.

**Citation**

If you find our work useful, we would be grateful if you would consider citing it:
```bibtex
@article{zang2023contextual,
  author = {Zang, Yuhang and Li, Wei and Han, Jun and Zhou, Kaiyang and Loy, Chen Change},
title = {Contextual Object Detection with Multimodal Large Language Models},
journal = {arXiv preprint arXiv:2305.18279},
year = {2023}
}
```

**License**

This project is licensed under
<a rel="license" href="https://github.com/sczhou/CodeFormer/blob/master/LICENSE">S-Lab License 1.0</a>.
Redistribution and use for non-commercial purposes should follow this license.

**Contact**

If you have any questions, please feel free to contact Yuhang Zang <b>([email protected])</b>.
'''
css = '''
h1#title {
text-align: center;
}
'''
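
# Example inputs bundled with the Space for each contextual task.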
cloze_samples = [
["main_4.jpg", "A teacher is helping a <mask> with her homework at desk."],
["main_5.jpg", "A man crossing a busy <mask> with his <mask> up."],
]
captioning_samples = [
["main_1.jpg"],
["main_2.jpg"],
["main_4.jpg"],
["main_6.jpeg"],
]
qa_samples = [
["main_5.jpg", "What is his career?"],
["main_6.jpeg", "What are they doing?"],
]
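
# Load the ContextDET demo model from the checkpoint shipped with the Space.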
contextdet_model = ContextDetDemo('./ckpt.pth')


def inference_fn_select(image_input, text_input, task_button, history=None):
    # Run the selected contextual task (Cloze Test / Captioning / Question Answering).
    return contextdet_model.forward(image_input, text_input, task_button, history or [])
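

# Example-click handlers: populate the image and text inputs and switch the task
# selector to match the selected sample.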
def set_cloze_samples(example: list) -> tuple:
    return gr.Image.update(example[0]), gr.Textbox.update(example[1]), 'Cloze Test'


def set_captioning_samples(example: list) -> tuple:
    return gr.Image.update(example[0]), gr.Textbox.update(''), 'Captioning'


def set_qa_samples(example: list) -> tuple:
    return gr.Image.update(example[0]), gr.Textbox.update(example[1]), 'Question Answering'
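

# Assemble the Gradio UI: image and prompt inputs, a task selector, detection and
# chat outputs, and clickable example datasets for each contextual task.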
with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
gr.Markdown(header)
gr.Markdown(abstract)
state = gr.State([])
    with gr.Row():
        with gr.Column(scale=0.5, min_width=500):
            image_input = gr.Image(type="pil", interactive=True, label="Upload an image").style(height=250)
        with gr.Column(scale=0.5, min_width=500):
            chat_input = gr.Textbox(label="Type your text prompt")
            task_button = gr.Radio(label="Contextual Task type", interactive=True,
                                   choices=['Cloze Test', 'Captioning', 'Question Answering'],
                                   value='Cloze Test')
            with gr.Row():
                submit_button = gr.Button(value="Run", interactive=True, variant="primary")
                clear_button = gr.Button(value="Clear", interactive=True)
with gr.Row():
with gr.Column(scale=0.5, min_width=500):
image_output = gr.Image(type='pil', interactive=False, label="Detection output")
with gr.Column(scale=0.5, min_width=500):
chat_output = gr.Chatbot(label="Text output").style(height=300)
with gr.Row():
with gr.Column(scale=0.33, min_width=330):
cloze_examples = gr.Dataset(
label='Contextual Cloze Test Examples',
components=[image_input, chat_input],
samples=cloze_samples,
)
with gr.Column(scale=0.33, min_width=330):
qa_examples = gr.Dataset(
label='Contextual Question Answering Examples',
components=[image_input, chat_input],
samples=qa_samples,
)
with gr.Column(scale=0.33, min_width=330):
captioning_examples = gr.Dataset(
label='Contextual Captioning Examples',
components=[image_input, ],
samples=captioning_samples,
)
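
    # Wire UI events: run inference on submit, reset outputs on clear or when a new
    # image is uploaded, and fill the inputs when an example is clicked.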
submit_button.click(
inference_fn_select,
[image_input, chat_input, task_button, state],
[image_output, chat_output, state],
)
clear_button.click(
lambda: (None, None, "", [], [], 'Question Answering'),
[],
[image_input, image_output, chat_input, chat_output, state, task_button],
queue=False,
)
image_input.change(
lambda: (None, "", []),
[],
[image_output, chat_output, state],
queue=False,
)
cloze_examples.click(
fn=set_cloze_samples,
inputs=[cloze_examples],
outputs=[image_input, chat_input, task_button],
)
captioning_examples.click(
fn=set_captioning_samples,
inputs=[captioning_examples],
outputs=[image_input, chat_input, task_button],
)
qa_examples.click(
fn=set_qa_samples,
inputs=[qa_examples],
outputs=[image_input, chat_input, task_button],
)
gr.Markdown(footer)
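
# Serve the demo; queueing is enabled so inference requests are processed in order.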
demo.launch(enable_queue=True, share=False)
# demo.launch(enable_queue=True, share=True)