Wendy-Fly
/

ACL-2025

Model card Files and versions Community

Wendy committed on Jan 25

Commit

9144502

verified ·

1 Parent(s): 4b515a5

Upload cogagent_infer.py with huggingface_hub

Browse files

Files changed (1) hide show

cogagent_infer.py +213 -0

cogagent_infer.py ADDED Viewed

	@@ -0,0 +1,213 @@

+import argparse
+import os
+import re
+import torch
+from PIL import Image, ImageDraw
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+from typing import List
+import json
+from tqdm import tqdm
+#class
+def draw_boxes_on_image(image: Image.Image, boxes: List[List[float]], save_path: str):
+    """
+    Draws red bounding boxes on the given image and saves it.
+    Parameters:
+    - image (PIL.Image.Image): The image on which to draw the bounding boxes.
+    - boxes (List[List[float]]): A list of bounding boxes, each defined as [x_min, y_min, x_max, y_max].
+      Coordinates are expected to be normalized (0 to 1).
+    - save_path (str): The path to save the updated image.
+    Description:
+    Each box coordinate is a fraction of the image dimension. This function converts them to actual pixel
+    coordinates and draws a red rectangle to mark the area. The annotated image is then saved to the specified path.
+    """
+    draw = ImageDraw.Draw(image)
+    for box in boxes:
+        x_min = int(box[0] * image.width)
+        y_min = int(box[1] * image.height)
+        x_max = int(box[2] * image.width)
+        y_max = int(box[3] * image.height)
+        draw.rectangle([x_min, y_min, x_max, y_max], outline="red", width=3)
+    image.save(save_path)
+def main():
+    """
+    A continuous interactive demo using the CogAgent1.5 model with selectable format prompts.
+    The output_image_path is interpreted as a directory. For each round of interaction,
+    the annotated image will be saved in the directory with the filename:
+    {original_image_name_without_extension}_{round_number}.png
+    Example:
+    python cli_demo.py --model_dir THUDM/cogagent-9b-20241220 --platform "Mac" --max_length 4096 --top_k 1 \
+                     --output_image_path ./results --format_key status_action_op_sensitive
+    """
+    parser = argparse.ArgumentParser(
+        description="Continuous interactive demo with CogAgent model and selectable format."
+    )
+    parser.add_argument(
+        "--model_dir", required=True, help="Path or identifier of the model."
+    )
+    parser.add_argument(
+        "--platform",
+        default="Mac",
+        help="Platform information string (e.g., 'Mac', 'WIN').",
+    )
+    parser.add_argument(
+        "--max_length", type=int, default=4096, help="Maximum generation length."
+    )
+    parser.add_argument(
+        "--top_k", type=int, default=1, help="Top-k sampling parameter."
+    )
+    parser.add_argument(
+        "--output_image_path",
+        default="results",
+        help="Directory to save the annotated images.",
+    )
+    parser.add_argument(
+        "--input_json",
+        default="/Users/baixuehai/Downloads/2025_2/AITM_Test_General_BBox_v0.json",
+        help="Directory to save the annotated images.",
+    )
+    parser.add_argument(
+        "--output_json",
+        default="/Users/baixuehai/Downloads/2025_2/AITM_Test_General_BBox_v0.json",
+        help="Directory to save the annotated images.",
+    )
+    parser.add_argument(
+        "--format_key",
+        default="action_op_sensitive",
+        help="Key to select the prompt format.",
+    )
+    args = parser.parse_args()
+    # Dictionary mapping format keys to format strings
+    format_dict = {
+        "action_op_sensitive": "(Answer in Action-Operation-Sensitive format.)",
+        "status_plan_action_op": "(Answer in Status-Plan-Action-Operation format.)",
+        "status_action_op_sensitive": "(Answer in Status-Action-Operation-Sensitive format.)",
+        "status_action_op": "(Answer in Status-Action-Operation format.)",
+        "action_op": "(Answer in Action-Operation format.)",
+    }
+    # Ensure the provided format_key is valid
+    if args.format_key not in format_dict:
+        raise ValueError(
+            f"Invalid format_key. Available keys are: {list(format_dict.keys())}"
+        )
+    # Ensure the output directory exists
+    os.makedirs(args.output_image_path, exist_ok=True)
+    # Load the tokenizer and model
+    tokenizer = AutoTokenizer.from_pretrained(args.model_dir, trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(
+        args.model_dir,
+        torch_dtype=torch.bfloat16,
+        trust_remote_code=True,
+        device_map="auto",
+        # quantization_config=BitsAndBytesConfig(load_in_8bit=True), # For INT8 quantization
+        # quantization_config=BitsAndBytesConfig(load_in_4bit=True), # For INT4 quantization
+    ).eval()
+    # Initialize platform and selected format strings
+    platform_str = f"(Platform: {args.platform})\n"
+    format_str = format_dict[args.format_key]
+    # Initialize history lists
+    history_step = []
+    history_action = []
+    round_num = 1
+    with open(args.input_json, "r") as f:
+        data = json.load(f)
+    res = []
+    for i in tqdm(range(len(data))):
+        x = data[i]
+        img_path = x['image']
+        image = Image.open(img_path).convert("RGB")
+        task = x['conversations'][0]['value']
+        # Verify history lengths match
+        try:
+            if len(history_step) != len(history_action):
+                raise ValueError("Mismatch in lengths of history_step and history_action.")
+        except ValueError as e:
+            print(f"警告: {e} - 跳过当前案例")
+        # Format history steps for output
+        history_str = "\nHistory steps: "
+        for index, (step, action) in enumerate(zip(history_step, history_action)):
+            history_str += f"\n{index}. {step}\t{action}"
+        # Compose the query with task, platform, and selected format instructions
+        query = f"Task: {task}{history_str}\n{platform_str}{format_str}"
+        #print(f"Round {round_num} query:\n{query}")
+        inputs = tokenizer.apply_chat_template(
+            [{"role": "user", "image": image, "content": query}],
+            add_generation_prompt=True,
+            tokenize=True,
+            return_tensors="pt",
+            return_dict=True,
+        ).to(model.device)
+        # Generation parameters
+        gen_kwargs = {
+            "max_length": args.max_length,
+            "do_sample": True,
+            "top_k": args.top_k,
+        }
+        # Generate response
+        with torch.no_grad():
+            outputs = model.generate(**inputs, **gen_kwargs)
+            outputs = outputs[:, inputs["input_ids"].shape[1]:]
+            response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+            #print(f"Model response:\n{response}")
+        # Extract grounded operation and action
+        grounded_pattern = r"Grounded Operation:\s*(.*)"
+        action_pattern = r"Action:\s*(.*)"
+        matches_history = re.search(grounded_pattern, response)
+        matches_actions = re.search(action_pattern, response)
+        if matches_history:
+            grounded_operation = matches_history.group(1)
+            history_step.append(grounded_operation)
+        if matches_actions:
+            action_operation = matches_actions.group(1)
+            history_action.append(action_operation)
+        # Extract bounding boxes from the response
+        box_pattern = r"box=\[\[?(\d+),(\d+),(\d+),(\d+)\]?\]"
+        matches = re.findall(box_pattern, response)
+        if matches:
+            boxes = [[int(x) / 1000 for x in match] for match in matches]
+            # Extract base name of the user's input image (without extension)
+            base_name = os.path.splitext(os.path.basename(img_path))[0]
+            # Construct the output file name with round number
+            output_file_name = f"{base_name}_{round_num}.png"
+            output_path = os.path.join(args.output_image_path, output_file_name)
+            draw_boxes_on_image(image, boxes, output_path)
+            #print(f"Annotated image saved at: {output_path}")
+        ans = {
+            'query': f"Round {round_num} query:\n{query}",
+            'response': response,
+            'output_path': output_path
+        }
+        res.append(ans)
+        round_num += 1
+    #print(res)
+    with open(args.output_json, "w", encoding="utf-8") as file:
+        json.dump(res, file, ensure_ascii=False, indent=4)
+    # with open(args.output_json,"w", encoding="utf-8")as f:
+    #     json.dump(res,f,ensure_ascii=False, indent=4)
+if __name__ == "__main__":  # Run the batch inference only when executed as a script, not on import.
+    main()