import os

# Build and install the local extension in-place before importing modules that depend on it.
os.system("python setup.py build develop --user")

import gradio as gr

from app_util import ContextDetDemo

header = '''

<h1 id="title">Contextual Object Detection with Multimodal Large Language Models</h1>

'''

abstract = '''
🤗 This is the official Gradio demo for Contextual Object Detection with Multimodal Large Language Models.

🆒 Our goal is to promote object detection with better `context understanding` and to enable `interactive feedback` through `human language vocabulary`, all made possible by multimodal large language models!

🤝 This demo is still under construction. Your comments or suggestions are welcome!

⚡ For faster inference without waiting in the queue, you may duplicate this Space and use a GPU: Duplicate Space

'''

footer = r'''
🦁 **Github Repo**

We would be grateful if you consider starring our GitHub repo.

📝 **Citation**

We would be grateful if you consider citing our work if you find it useful:
```bibtex
@article{ }
```

📋 **License**

This project is licensed under S-Lab License 1.0. Redistribution and use for non-commercial purposes should follow this license.

📧 **Contact**

If you have any questions, please feel free to contact Yuhang Zang (zang0012@ntu.edu.sg).
'''

css = '''
h1#title { text-align: center; }
'''

# Example inputs for each contextual task. The cloze-test sentences deliberately
# omit the words that the model is asked to fill in.
cloze_samples = [
    ["main_4.jpg", "A teacher is helping a with her homework at desk."],
    ["main_5.jpg", "A man crossing a busy with his up."],
]

captioning_samples = [
    ["main_1.jpg"],
    ["main_2.jpg"],
    ["main_4.jpg"],
    ["main_6.jpeg"],
]

qa_samples = [
    ["main_5.jpg", "What is his career?"],
    ["main_6.jpeg", "What are they doing?"],
]

# Load the ContextDET model once at startup from the local checkpoint.
contextdet_model = ContextDetDemo('./ckpt.pth')


def inference_fn_select(image_input, text_input, task_button, history=[]):
    # The chat history is threaded through a gr.State component.
    return contextdet_model.forward(image_input, text_input, task_button, history)


def set_cloze_samples(example: list) -> tuple:
    return gr.Image.update(example[0]), gr.Textbox.update(example[1]), 'Cloze Test'


def set_captioning_samples(example: list) -> tuple:
    return gr.Image.update(example[0]), gr.Textbox.update(''), 'Captioning'


def set_qa_samples(example: list) -> tuple:
    return gr.Image.update(example[0]), gr.Textbox.update(example[1]), 'Question Answering'


with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
    gr.Markdown(header)
    gr.Markdown(abstract)
    state = gr.State([])

    # Input panel: image upload, text prompt, task selector, and run/clear buttons.
    with gr.Row():
        with gr.Column(scale=0.5, min_width=500):
            image_input = gr.Image(type="pil", interactive=True, label="Upload an image 📁").style(height=250)
        with gr.Column(scale=0.5, min_width=500):
            chat_input = gr.Textbox(label="Type your text prompt ⤵️")
            task_button = gr.Radio(label="Contextual Task type", interactive=True,
                                   choices=['Cloze Test', 'Captioning', 'Question Answering'],
                                   value='Cloze Test')
            with gr.Row():
                submit_button = gr.Button(value="🏃 Run", interactive=True, variant="primary")
                clear_button = gr.Button(value="🔄 Clear", interactive=True)

    # Output panel: annotated detection image and chatbot-style text output.
    with gr.Row():
        with gr.Column(scale=0.5, min_width=500):
            image_output = gr.Image(type='pil', interactive=False, label="Detection output")
        with gr.Column(scale=0.5, min_width=500):
            chat_output = gr.Chatbot(label="Text output").style(height=300)

    with gr.Row():
        with gr.Column(scale=0.33, min_width=330):
            cloze_examples = gr.Dataset(
                label='Contextual Cloze Test Examples',
                components=[image_input, chat_input],
                samples=cloze_samples,
            )
        with gr.Column(scale=0.33, min_width=330):
            qa_examples = gr.Dataset(
                label='Contextual Question Answering Examples',
                components=[image_input, chat_input],
                samples=qa_samples,
            )
        with gr.Column(scale=0.33, min_width=330):
            captioning_examples = gr.Dataset(
                label='Contextual Captioning Examples',
                components=[image_input],
                samples=captioning_samples,
            )

    # Event wiring: run inference, clear the UI, and populate inputs from the examples.
    submit_button.click(
        inference_fn_select,
        [image_input, chat_input, task_button, state],
        [image_output, chat_output, state],
    )
    clear_button.click(
        lambda: (None, None, "", [], [], 'Question Answering'),
        [],
        [image_input, image_output, chat_input, chat_output, state, task_button],
        queue=False,
    )
    image_input.change(
        lambda: (None, "", []),
        [],
        [image_output, chat_output, state],
        queue=False,
    )
    cloze_examples.click(
        fn=set_cloze_samples,
        inputs=[cloze_examples],
        outputs=[image_input, chat_input, task_button],
    )
    captioning_examples.click(
        fn=set_captioning_samples,
        inputs=[captioning_examples],
        outputs=[image_input, chat_input, task_button],
    )
    qa_examples.click(
        fn=set_qa_samples,
        inputs=[qa_examples],
        outputs=[image_input, chat_input, task_button],
    )
    gr.Markdown(footer)

demo.launch(enable_queue=True, share=False)
# demo.launch(enable_queue=True, share=True)
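# Note: this script targets the Gradio 3.x API (`.style(...)`, `gr.Image.update`,
# `enable_queue=`), all of which were removed in Gradio 4. Under Gradio 4+, the
# equivalent launch would likely be the following (an untested assumption, since
# the components above would also need migrating):
#
#     demo.queue().launch(share=False)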