Create app.py
app.py
ADDED
@@ -0,0 +1,332 @@
import os
import subprocess
import gradio as gr
from PIL import Image as PILImage
import torchvision.transforms.functional as TF
import numpy as np
import torch
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
from qwen_vl_utils import process_vision_info
import re
import io
import base64
import cv2
from typing import List, Tuple, Optional

# ------------------ Install the SAM2 dependency ------------------
def install_sam2():
    """Check for SAM2 and install it together with its checkpoints if missing."""
    sam2_dir = "third_party/sam2"
    if not os.path.exists(sam2_dir):
        print("Installing SAM2...")
        os.makedirs("third_party", exist_ok=True)
        subprocess.run(["git", "clone", "https://github.com/facebookresearch/sam2.git", sam2_dir], check=True)

        # Install the package
        subprocess.run(["pip", "install", "-e", sam2_dir], check=True)

        # Download the checkpoints (run the repo's script from inside the checkpoints directory)
        checkpoints_dir = os.path.join(sam2_dir, "checkpoints")
        os.makedirs(checkpoints_dir, exist_ok=True)
        subprocess.run(["bash", "download_ckpts.sh"], cwd=checkpoints_dir, check=True)
        print("SAM2 installed successfully!")
    else:
        print("SAM2 already installed.")

# Make sure SAM2 is available before the models are loaded
install_sam2()

# ------------------ Initialize models ------------------
# Use relative paths
MODEL_PATH = "geshang/Seg-R1-COD"
SAM_CHECKPOINT = "third_party/sam2/checkpoints/sam2.1_hiera_large.pt"

# Auto-detect the device
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
RESIZE_SIZE = (768, 768)

# Load the Qwen model
try:
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        MODEL_PATH,
        torch_dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32,
        device_map="auto" if DEVICE == "cuda" else None,
    )
    if DEVICE != "cuda":
        # Only move the model manually when it was not dispatched via device_map
        model = model.to(DEVICE)
    processor = AutoProcessor.from_pretrained(MODEL_PATH, use_fast=True)
    print(f"Qwen model loaded on {DEVICE}")
except Exception as e:
    print(f"Error loading Qwen model: {e}")
    # Fall back to None so the app can still start and report the failure
    model = None
    processor = None

# SAM Wrapper
class CustomSAMWrapper:
    def __init__(self, model_path: str, device: str = DEVICE):
        try:
            from sam2.build_sam import build_sam2
            from sam2.sam2_image_predictor import SAM2ImagePredictor

            self.device = torch.device(device)
            model_cfg = os.path.join("third_party/sam2", "configs/sam2.1/sam2.1_hiera_l.yaml")
            sam_model = build_sam2(model_cfg, model_path)
            sam_model = sam_model.to(self.device)
            self.predictor = SAM2ImagePredictor(sam_model)
            self.last_mask = None
            print(f"SAM model loaded on {device}")
        except Exception as e:
            print(f"Error loading SAM model: {e}")
            self.predictor = None

    def predict(self, image: PILImage.Image,
                points: Optional[List[Tuple[int, int]]],
                labels: Optional[List[int]],
                bbox: Optional[List[List[int]]] = None) -> Tuple[np.ndarray, float]:
        if not self.predictor:
            return np.zeros((image.height, image.width), dtype=bool), 0.0

        try:
            input_points = np.array(points) if points else None
            input_labels = np.array(labels) if labels else None
            input_bboxes = np.array(bbox) if bbox else None

            # PIL images are already RGB, so the array can be used directly
            rgb_image = np.array(image.convert("RGB"))

            self.predictor.set_image(rgb_image)

            mask_pred, score, logits = self.predictor.predict(
                point_coords=input_points,
                point_labels=input_labels,
                box=input_bboxes,
                multimask_output=False,
            )

            self.last_mask = mask_pred[0]
            return mask_pred[0], score[0]
        except Exception as e:
            print(f"SAM prediction error: {e}")
            return np.zeros((image.height, image.width), dtype=bool), 0.0

# Initialize the SAM wrapper
sam_wrapper = CustomSAMWrapper(SAM_CHECKPOINT, device=DEVICE)

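# Illustrative usage of the wrapper above (not invoked directly; run_pipeline below drives it):
# point labels use 1 for foreground and 0 for background, and each box is [x1, y1, x2, y2]
# in pixel coordinates of the image passed in. Coordinates here are made up for the example.
#   mask, score = sam_wrapper.predict(img, [(320, 240)], [1], bbox=[[100, 80, 600, 500]])
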
# ------------------ Inference helpers ------------------

def parse_custom_format(content: str):
    point_pattern = r"<points>\s*(\[\s*(?:\[\s*\d+\s*,\s*\d+\s*\]\s*,?\s*)+\])\s*</points>"
    label_pattern = r"<labels>\s*(\[\s*(?:\d+\s*,?\s*)+\])\s*</labels>"
    bbox_pattern = r"<bbox>\s*(\[\s*\d+\s*,\s*\d+\s*,\s*\d+\s*,\s*\d+\s*\])\s*</bbox>"

    point_match = re.search(point_pattern, content)
    label_match = re.search(label_pattern, content)
    bbox_matches = re.findall(bbox_pattern, content)

    try:
        # The captured groups contain only digits, commas and brackets, so eval is constrained here
        points = np.array(eval(point_match.group(1))) if point_match else None
        labels = np.array(eval(label_match.group(1))) if label_match else None

        if points is not None and labels is not None:
            if not (len(points.shape) == 2 and points.shape[1] == 2 and len(labels) == points.shape[0]):
                points, labels = None, None

        bboxes = []
        for bbox_str in bbox_matches:
            bbox = np.array(eval(bbox_str))
            if len(bbox.shape) == 1 and bbox.shape[0] == 4:
                bboxes.append(bbox)

        bboxes = np.stack(bboxes, axis=0) if bboxes else None

        return points, labels, bboxes

    except Exception as e:
        print("Error parsing content:", e)
        return None, None, None

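# An illustrative (made-up) model response that the parser above accepts:
#   <think>The camouflaged animal blends into the rocks near the centre.</think>
#   <bbox>[120, 80, 540, 600]</bbox> <points>[[300, 260], [40, 50]]</points> <labels>[1, 0]</labels>
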
def prepare_test_messages(image, prompt):
    buffered = io.BytesIO()
    image = TF.resize(image, RESIZE_SIZE)
    image.save(buffered, format="JPEG")
    img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')

    if "segment" in prompt or "mask" in prompt:
        SYSTEM_PROMPT = (
            "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant "
            "first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning "
            "process should enclosed within <think> </think> tags, and the bounding box, points and points labels should be enclosed within <bbox></bbox>, <points></points>, and <labels></labels>, respectively. i.e., "
            "<think> reasoning process here </think> <bbox>[x1,y1,x2,y2]</bbox>, <points>[[x3,y3],[x4,y4],...]</points>, <labels>[1,0,...]</labels>"
            "Where 1 indicates a foreground (object) point, and 0 indicates a background point."
        )
    else:
        SYSTEM_PROMPT = "You're a helpful visual assistant."

    messages = [
        {"role": "system", "content": [{"type": "text", "text": SYSTEM_PROMPT}]},
        {
            "role": "user",
            "content": [
                {"type": "image", "image": f"data:image/jpeg;base64,{img_base64}"},
                {"type": "text", "text": prompt},
            ],
        },
    ]
    return [messages]

def answer_question(batch_messages):
    if not model or not processor:
        return ["Model not loaded. Please check logs."]

    try:
        text = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in batch_messages]
        image_inputs, video_inputs = process_vision_info(batch_messages)
        inputs = processor(text=text, images=image_inputs, videos=video_inputs, return_tensors="pt", padding=True).to(DEVICE)
        outputs = model.generate(**inputs, use_cache=True, max_new_tokens=1024)
        trimmed = [out[len(inp):] for inp, out in zip(inputs.input_ids, outputs)]
        return processor.batch_decode(trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    except Exception as e:
        print(f"Error generating answer: {e}")
        return ["Error generating response"]

def visualize_masks_on_image(
    image: PILImage.Image,
    masks_np: list,
    colors=[(255, 0, 0), (0, 255, 0), (0, 0, 255),
            (255, 255, 0), (255, 0, 255), (0, 255, 255),
            (128, 128, 255)],
    alpha=0.5,
):
    if not masks_np:
        return image

    # Force RGB so uploads with an alpha channel or grayscale images blend correctly
    image_np = np.array(image.convert("RGB"))
    color_mask = np.zeros((image_np.shape[0], image_np.shape[1], 3), dtype=np.uint8)

    for i, mask in enumerate(masks_np):
        color = colors[i % len(colors)]
        mask = mask.astype(np.uint8)

        if mask.shape[:2] != image_np.shape[:2]:
            mask = cv2.resize(mask, (image_np.shape[1], image_np.shape[0]))

        color_mask[:, :, 0] = color_mask[:, :, 0] | (mask * color[0])
        color_mask[:, :, 1] = color_mask[:, :, 1] | (mask * color[1])
        color_mask[:, :, 2] = color_mask[:, :, 2] | (mask * color[2])

    blended = cv2.addWeighted(image_np, 1 - alpha, color_mask, alpha, 0)
    return PILImage.fromarray(blended)

def run_pipeline(image: PILImage.Image, prompt: str):
    if not model or not processor:
        return "Models not loaded. Please check logs.", None

    try:
        img_original = image.copy()
        img_resized = TF.resize(image, RESIZE_SIZE)

        messages = prepare_test_messages(img_resized, prompt)
        output_text = answer_question(messages)[0]
        print(f"Model output: {output_text}")

        points, labels, bbox = parse_custom_format(output_text)

        mask_pred = None
        final_mask = np.zeros(RESIZE_SIZE[::-1], dtype=bool)

        if (points is not None and labels is not None) or (bbox is not None):
            img = img_resized

            if bbox is not None and len(bbox.shape) == 2:
                # One SAM call per predicted box, keeping only the points that fall inside it
                for b in bbox:
                    b = b.tolist()
                    if points is not None and labels is not None:
                        in_bbox_mask = (
                            (points[:, 0] >= b[0]) & (points[:, 0] <= b[2]) &
                            (points[:, 1] >= b[1]) & (points[:, 1] <= b[3])
                        )
                        selected_points = points[in_bbox_mask]
                        selected_labels = labels[in_bbox_mask]
                    else:
                        selected_points, selected_labels = None, None

                    try:
                        mask, _ = sam_wrapper.predict(
                            img,
                            selected_points.tolist() if selected_points is not None and len(selected_points) > 0 else None,
                            selected_labels.tolist() if selected_labels is not None and len(selected_labels) > 0 else None,
                            b
                        )
                        final_mask |= (mask > 0)
                    except Exception as e:
                        print(f"Mask prediction error for bbox: {e}")
                        continue

                mask_pred = final_mask
            else:
                try:
                    mask_pred, _ = sam_wrapper.predict(
                        img,
                        points.tolist() if points is not None else None,
                        labels.tolist() if labels is not None else None,
                        bbox.tolist() if bbox is not None else None
                    )
                    mask_pred = mask_pred > 0
                except Exception as e:
                    print(f"Mask prediction error: {e}")
                    mask_pred = np.zeros(RESIZE_SIZE[::-1], dtype=bool)
        else:
            return output_text, None

        # Resize the mask back to the original image size
        mask_np = mask_pred
        mask_img = PILImage.fromarray((mask_np * 255).astype(np.uint8)).resize(img_original.size)
        mask_img = mask_img.convert("L")
        mask_np = np.array(mask_img) > 128

        # Visualize the result
        visualized_img = visualize_masks_on_image(
            img_original,
            masks_np=[mask_np],
            alpha=0.6
        )
        return output_text, visualized_img
    except Exception as e:
        print(f"Pipeline error: {e}")
        return f"Error processing request: {str(e)}", None

# ------------------ Launch Gradio ------------------

with gr.Blocks(title="Seg-R1") as demo:
    gr.Markdown("# Seg-R1: Visual Segmentation Assistant")
    gr.Markdown("Upload an image and ask questions about segmentation.")

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil", label="Upload Image")
            text_input = gr.Textbox(lines=2, label="Question", placeholder="Ask about objects in the image...")
            submit_btn = gr.Button("Submit", variant="primary")

        with gr.Column():
            text_output = gr.Textbox(label="Model Response", interactive=False)
            image_output = gr.Image(type="pil", label="Segmentation Result", interactive=False)

    submit_btn.click(
        fn=run_pipeline,
        inputs=[image_input, text_input],
        outputs=[text_output, image_output]
    )

    # gr.Examples(
    #     examples=[
    #         ["examples/dog.jpg", "Segment the dog in the image"],
    #         ["examples/street.jpg", "Find all cars in the image"],
    #         ["examples/fruits.jpg", "Identify the apples"]
    #     ],
    #     inputs=[image_input, text_input],
    #     outputs=[text_output, image_output],
    #     fn=run_pipeline,
    #     cache_examples=False
    # )

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)