Spaces:

omlab
/

VLM-R1-Referral-Expression

Runtime error

App Files Files Community

SZhanZ committed on Feb 16

Commit

4b25987

1 Parent(s): 6c162e9

init commit

Browse files

Files changed (6) hide show

.gitattributes +1 -0
README.md +0 -0
app.py +95 -55
examples/image1.jpg +3 -0
examples/image2.jpg +0 -0
requirements.txt +5 -1

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+examples/image1.jpg filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

File without changes

app.py CHANGED Viewed

@@ -1,64 +1,104 @@
-import gradio as gr
-from huggingface_hub import InferenceClient
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [{"role": "system", "content": system_message}]
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-    messages.append({"role": "user", "content": message})
-    response = ""
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-        response += token
-        yield response
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
-)
 if __name__ == "__main__":
-    demo.launch()

+import re
+import torch
+from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+from PIL import Image, ImageDraw
+def draw_bbox(image, bbox):
+    """Draw a red rectangle for ``bbox`` on ``image`` and return the image.
+
+    Args:
+        image: PIL image to annotate. It is modified in place, so pass a copy
+            if the original must stay untouched.
+        bbox: Four numbers ``(x1, y1, x2, y2)`` giving two opposite corners.
+
+    Returns:
+        The same ``image`` object, with the rectangle drawn on it.
+    """
+    x1, y1, x2, y2 = bbox
+    # Recent Pillow releases raise ValueError when x2 < x1 or y2 < y1, so put
+    # the corners in top-left / bottom-right order before drawing.
+    left, right = sorted((x1, x2))
+    top, bottom = sorted((y1, y2))
+    draw = ImageDraw.Draw(image)
+    draw.rectangle((left, top, right, bottom), outline="red", width=5)
+    return image
+def extract_bbox_answer(content):
+    """Extract an ``[x1, y1, x2, y2]`` integer box from the model output.
+
+    The body of the ``<answer>`` tag is searched first so that bracketed
+    numbers in the reasoning text cannot shadow the final answer; the whole
+    ``content`` is used as a fallback.
+
+    Args:
+        content: Raw text generated by the model.
+
+    Returns:
+        A list of four ints, or ``[0, 0, 0, 0]`` when no box is found.
+    """
+    # re.DOTALL lets the braces and the list sit on different lines, which is
+    # what pretty-printed JSON looks like.
+    bbox_pattern = r'\{.*\[\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\].*\}'
+    candidates = []
+    answer_match = re.search(r'<answer>(.*?)</answer>', content, re.DOTALL)
+    if answer_match:
+        candidates.append(answer_match.group(1))
+    candidates.append(content)
+    for candidate in candidates:
+        bbox_match = re.search(bbox_pattern, candidate, re.DOTALL)
+        if bbox_match:
+            return [int(value) for value in bbox_match.groups()]
+    return [0, 0, 0, 0]
+def process_image_and_text(image, text):
+    """Run the referring-expression model and annotate the image.
+
+    Args:
+        image: PIL image to search in.
+        text: Description of the region to locate.
+
+    Returns:
+        A ``(thinking_process, result_image)`` tuple: the text found between
+        the ``<think>`` tags (or a placeholder message) and a copy of
+        ``image`` with the predicted box drawn on it. The copy is returned
+        unannotated when no box could be parsed from the model output.
+
+    Raises:
+        ValueError: If ``image`` is ``None``.
+    """
+    if image is None:
+        raise ValueError("An input image is required.")
+    question = f"Please provide the bounding box coordinate of the region this sentence describes: {text}."
+    QUESTION_TEMPLATE = "{Question} First output the thinking process in <think> </think> tags and then output the final answer in <answer> </answer> tags. Output the final answer in JSON format."
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image"},
+                {"type": "text", "text": QUESTION_TEMPLATE.format(Question=question)},
+            ],
+        }
+    ]
+    # Use a separate name for the rendered chat prompt so the caller's
+    # ``text`` argument is not overwritten.
+    prompt = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+    inputs = processor(
+        text=[prompt],
+        images=image,
+        return_tensors="pt",
+        padding=True,
+        padding_side="left",
+        add_special_tokens=False,
+    )
+    # generate() needs the inputs on the same device as the model weights.
+    inputs = inputs.to(model.device)
+    with torch.no_grad():
+        generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=256, do_sample=False)
+    # Drop the echoed prompt tokens so only the completion is decoded.
+    prompt_length = inputs.input_ids.shape[1]
+    generated_ids_trimmed = [out_ids[prompt_length:] for out_ids in generated_ids]
+    output_text = processor.batch_decode(
+        generated_ids_trimmed, skip_special_tokens=True
+    )[0]
+    print("output_text: ", output_text)
+    # Extract thinking process
+    think_match = re.search(r'<think>(.*?)</think>', output_text, re.DOTALL)
+    thinking_process = think_match.group(1).strip() if think_match else "No thinking process found"
+    bbox = extract_bbox_answer(output_text)
+    # Annotate a copy so the caller's image is left untouched.
+    result_image = image.copy()
+    # extract_bbox_answer() returns all zeros when nothing was parsed; skip
+    # drawing in that case instead of painting a stray mark at the origin.
+    # NOTE(review): the box is drawn in the original image's pixel space;
+    # confirm the model's coordinates are not relative to a resized image.
+    if any(bbox):
+        result_image = draw_bbox(result_image, bbox)
+    return thinking_process, result_image
 if __name__ == "__main__":
+    # Gradio is only needed when the demo is launched, so import it here.
+    import gradio as gr
+    # Hub checkpoint that provides both the model weights and the processor.
+    model_path = "SZhanZ/Qwen2.5VL-VLM-R1-REC-step500"
+    # Module-level globals read by process_image_and_text().
+    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_path)
+    processor = AutoProcessor.from_pretrained(model_path)
+    demo = gr.Interface(
+        fn=process_image_and_text,
+        inputs=[
+            gr.Image(type="pil", label="Input Image"),
+            gr.Textbox(label="Description Text")
+        ],
+        outputs=[
+            gr.Textbox(label="Thinking Process"),
+            gr.Image(type="pil", label="Result with Bbox")
+        ],
+        title="Visual Referring Expression Demo",
+        description="Upload an image and input description text, the system will return the thinking process and region annotation",
+        examples=[
+            ["examples/image1.jpg", "food with the highest protein"],
+            ["examples/image2.jpg", "the cheapest laptop"],
+        ]
+    )
+    # share=True is intentionally not hard-coded: share links are not
+    # supported on Hugging Face Spaces, and outside Spaces it would open a
+    # public tunnel without the operator asking for one.
+    demo.launch(server_name="0.0.0.0", server_port=7860)

examples/image1.jpg ADDED Viewed

Git LFS Details

SHA256: e779913142b5db662be50e6e5e8d9b598913dc3a1c2c27abfbbd1dd44630cdd9
Pointer size: 132 Bytes
Size of remote file: 1.24 MB

examples/image2.jpg ADDED Viewed

requirements.txt CHANGED Viewed

@@ -1 +1,5 @@
-huggingface_hub==0.25.2

+torch>=2.0.0
+git+https://github.com/huggingface/transformers
+Pillow>=10.0.0
+httpx[socks]
+accelerate>=0.26.0