import os

# Build and install the project in develop mode at startup (compiles the package before the demo runs).
os.system("python setup.py build develop --user")

import gradio as gr

from app_util import ContextDetDemo

header = '''
<div align=center>
<h1 style="font-weight: 900; margin-bottom: 7px;">
Contextual Object Detection with Multimodal Large Language Models
</h1>
</div>
'''

abstract = '''
🤗 This is the official Gradio demo for <b>Contextual Object Detection with Multimodal Large Language Models</b>.

🆒 Our goal is to promote object detection with better `context understanding` and to enable `interactive feedback`
through a `human language vocabulary`, all made possible by multimodal large language models!

🤝 This demo is still under construction. Your comments and suggestions are welcome!

⚡ For faster inference without waiting in the queue, you may duplicate the space and use the GPU setting:
<a href="https://huggingface.co/spaces/yuhangzang/ContextDet-Demo?duplicate=true">
<img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
<p/>
'''

footer = r'''
🦁 **Github Repo**
We would be grateful if you would consider starring our <a href="https://github.com/yuhangzang/ContextDET">github repo</a>.

📝 **Citation**
If you find our work useful, we would be grateful if you would consider citing it:
```bibtex
@article{zang2023contextual,
  author = {Zang, Yuhang and Li, Wei and Han, Jun and Zhou, Kaiyang and Loy, Chen Change},
  title = {Contextual Object Detection with Multimodal Large Language Models},
  journal = {arXiv preprint arXiv:2305.18279},
  year = {2023}
}
```

📋 **License**
This project is licensed under
<a rel="license" href="https://github.com/sczhou/CodeFormer/blob/master/LICENSE">S-Lab License 1.0</a>.
Redistribution and use for non-commercial purposes should follow this license.

📧 **Contact**
If you have any questions, please feel free to contact Yuhang Zang <b>([email protected])</b>.
'''

css = '''
h1#title {
  text-align: center;
}
'''

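# Example inputs for each contextual task; in the cloze samples, <mask> marks the word the model
# should fill in and localize in the image.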
cloze_samples = [
    ["main_4.jpg", "A teacher is helping a <mask> with her homework at desk."],
    ["main_5.jpg", "A man crossing a busy <mask> with his <mask> up."],
]


captioning_samples = [
    ["main_1.jpg"],
    ["main_2.jpg"],
    ["main_4.jpg"],
    ["main_6.jpeg"],
]

qa_samples = [
    ["main_5.jpg", "What is his career?"],
    ["main_6.jpeg", "What are they doing?"],
]

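# Load the pretrained ContextDET checkpoint into the demo wrapper (ckpt.pth is expected next to this script).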
contextdet_model = ContextDetDemo('./ckpt.pth')


def inference_fn_select(image_input, text_input, task_button, history=None):
    # Dispatch the selected contextual task (cloze test, captioning, or QA) to the model.
    history = history if history is not None else []
    return contextdet_model.forward(image_input, text_input, task_button, history)


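# Example-click handlers: fill the image/text inputs and switch the task radio to the matching task.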
def set_cloze_samples(example: list) -> tuple:
    return gr.Image.update(example[0]), gr.Textbox.update(example[1]), 'Cloze Test'


def set_captioning_samples(example: list) -> tuple:
    return gr.Image.update(example[0]), gr.Textbox.update(''), 'Captioning'


def set_qa_samples(example: list) -> tuple:
    return gr.Image.update(example[0]), gr.Textbox.update(example[1]), 'Question Answering'


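# Assemble the Gradio Blocks UI: input widgets on top, detection/chat outputs below, example datasets last.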
with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
    gr.Markdown(header)
    gr.Markdown(abstract)
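    # Per-session chat history, threaded through every inference call.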
    state = gr.State([])

    with gr.Row():
        with gr.Column(scale=0.5, min_width=500):
            image_input = gr.Image(type="pil", interactive=True, label="Upload an image 📁").style(height=250)
        with gr.Column(scale=0.5, min_width=500):
            chat_input = gr.Textbox(label="Type your text prompt ⬇️")
            task_button = gr.Radio(label="Contextual Task type", interactive=True,
                                   choices=['Cloze Test', 'Captioning', 'Question Answering'],
                                   value='Cloze Test')
            with gr.Row():
                submit_button = gr.Button(value="🏃 Run", interactive=True, variant="primary")
                clear_button = gr.Button(value="🔄 Clear", interactive=True)

    with gr.Row():
        with gr.Column(scale=0.5, min_width=500):
            image_output = gr.Image(type='pil', interactive=False, label="Detection output")
        with gr.Column(scale=0.5, min_width=500):
            chat_output = gr.Chatbot(label="Text output").style(height=300)

    with gr.Row():
        with gr.Column(scale=0.33, min_width=330):
            cloze_examples = gr.Dataset(
                label='Contextual Cloze Test Examples',
                components=[image_input, chat_input],
                samples=cloze_samples,
            )
        with gr.Column(scale=0.33, min_width=330):
            qa_examples = gr.Dataset(
                label='Contextual Question Answering Examples',
                components=[image_input, chat_input],
                samples=qa_samples,
            )
        with gr.Column(scale=0.33, min_width=330):
            captioning_examples = gr.Dataset(
                label='Contextual Captioning Examples',
                components=[image_input, ],
                samples=captioning_samples,
            )

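    # Event wiring: run inference on submit, reset outputs/state on clear or new image,
    # and populate the inputs when an example is clicked.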
    submit_button.click(
        inference_fn_select,
        [image_input, chat_input, task_button, state],
        [image_output, chat_output, state],
    )
    clear_button.click(
        lambda: (None, None, "", [], [], 'Question Answering'),
        [],
        [image_input, image_output, chat_input, chat_output, state, task_button],
        queue=False,
    )
    image_input.change(
        lambda: (None, "", []),
        [],
        [image_output, chat_output, state],
        queue=False,
    )
    cloze_examples.click(
        fn=set_cloze_samples,
        inputs=[cloze_examples],
        outputs=[image_input, chat_input, task_button],
    )
    captioning_examples.click(
        fn=set_captioning_samples,
        inputs=[captioning_examples],
        outputs=[image_input, chat_input, task_button],
    )
    qa_examples.click(
        fn=set_qa_samples,
        inputs=[qa_examples],
        outputs=[image_input, chat_input, task_button],
    )

    gr.Markdown(footer)

demo.launch(enable_queue=True, share=False)
# demo.launch(enable_queue=True, share=True)