Update app.py
app.py CHANGED
@@ -52,7 +52,7 @@ def rescale_bounding_boxes(bounding_boxes, original_width, original_height, scal
 
 
 @spaces.GPU
-def run_example(image, text_input, model_id="Qwen/Qwen2-VL-7B-Instruct"):
+def run_example(image, text_input, system_prompt, model_id="Qwen/Qwen2-VL-7B-Instruct"):
     model = models[model_id].eval()
     processor = processors[model_id]
 
@@ -61,7 +61,7 @@ def run_example(image, text_input, model_id="Qwen/Qwen2-VL-7B-Instruct"):
             "role": "user",
             "content": [
                 {"type": "image", "image": f"data:image;base64,{image_to_base64(image)}"},
-                {"type": "text", "text":
+                {"type": "text", "text": system_prompt},
                 {"type": "text", "text": f"detect {text_input}"},
             ],
         }
@@ -109,6 +109,7 @@ with gr.Blocks(css=css) as demo:
         with gr.Column():
             input_img = gr.Image(label="Input Picture", type="pil")
             model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value="Qwen/Qwen2-VL-7B-Instruct")
+            system_prompt = gr.Textbox(label="System Prompt", value="You are a helpful assistant to detect objects in images. When asked to detect elements based on a description, you return bounding boxes for all elements in the form of [xmin, ymin, xmax, ymax], with the values being scaled to 1000 by 1000 pixels. When there is more than one result, answer with a list of bounding boxes in the form of [[xmin, ymin, xmax, ymax], [xmin, ymin, xmax, ymax], ...].")
             text_input = gr.Textbox(label="Description of Localization Target")
             submit_btn = gr.Button(value="Submit")
         with gr.Column():
@@ -116,6 +117,6 @@ with gr.Blocks(css=css) as demo:
             parsed_boxes = gr.Textbox(label="Parsed Boxes")
             annotated_image = gr.Image(label="Annotated Picture")
 
-    submit_btn.click(run_example, [input_img, text_input, model_selector], [model_output_text, parsed_boxes, annotated_image])
+    submit_btn.click(run_example, [input_img, text_input, system_prompt, model_selector], [model_output_text, parsed_boxes, annotated_image])
 
 demo.launch(debug=True)
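
In short, the commit moves the hardcoded instruction text out of the messages list and into an editable System Prompt textbox: run_example gains a system_prompt parameter, the prompt string becomes the gr.Textbox default, and submit_btn.click passes the textbox through as an extra input. The diff does not show the rest of run_example; a minimal sketch of the standard Qwen2-VL inference pattern the omitted body presumably follows (process_vision_info comes from the qwen_vl_utils package, and max_new_tokens is an assumed setting, not from this commit):

    # ... continuing run_example after the messages list above
    from qwen_vl_utils import process_vision_info  # resolves the base64 image entry

    # Render the chat template, collect vision inputs, and generate.
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(model.device)

    generated_ids = model.generate(**inputs, max_new_tokens=1024)  # assumed token budget
    trimmed = [out[len(inp):] for inp, out in zip(inputs.input_ids, generated_ids)]
    output_text = processor.batch_decode(trimmed, skip_special_tokens=True)[0]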
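The image entry in the messages list relies on image_to_base64, whose body is also outside the diff; it presumably serializes the PIL image from gr.Image(type="pil") into a base64 string for the data URI. A sketch under that assumption (PNG is an assumed format):

    import base64
    from io import BytesIO

    def image_to_base64(image):
        # Encode a PIL image as base64 PNG for the "data:image;base64,..." URI.
        buffer = BytesIO()
        image.save(buffer, format="PNG")
        return base64.b64encode(buffer.getvalue()).decode("utf-8")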
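Finally, because the system prompt asks the model for coordinates on a 1000-by-1000 grid, the boxes have to be mapped back to the real image size before drawing. Only a truncated signature of rescale_bounding_boxes appears in the hunk header, so the trailing parameters and their 1000-pixel defaults below are assumptions; a plausible sketch:

    def rescale_bounding_boxes(bounding_boxes, original_width, original_height,
                               scaled_width=1000, scaled_height=1000):
        # Map [xmin, ymin, xmax, ymax] boxes from the model's fixed
        # 1000x1000 coordinate space back to the original image size.
        x_scale = original_width / scaled_width
        y_scale = original_height / scaled_height
        return [
            [round(xmin * x_scale), round(ymin * y_scale),
             round(xmax * x_scale), round(ymax * y_scale)]
            for xmin, ymin, xmax, ymax in bounding_boxes
        ]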