import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from os import path as osp
from io import BytesIO
from pathlib import Path

from mbench.ytvos_ref import build as build_ytvos_ref
import argparse
import opts

import skimage
import numpy as np
import pandas as pd
import regex as re
import json

import cv2
from PIL import Image, ImageDraw
import torch
from torchvision.transforms import functional as F

from skimage import measure
from shapely.geometry import Polygon, MultiPolygon

import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.collections import PatchCollection
from matplotlib.patches import Rectangle
import textwrap

import ipywidgets as widgets
from IPython.display import display, clear_output

from openai import OpenAI
import base64
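

# -----------------------------------------------------------------------------
# Pipeline: draw numbered markers on the objects of sampled Refer-YouTube-VOS
# training frames, ask GPT-4o-mini whether the labeled objects of a category
# perform clearly distinct actions, and, if so, request one action-centric
# referring expression per object. Captions and the valid object ids per
# category are finally written to JSON.
# -----------------------------------------------------------------------------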


def number_objects_and_encode(idx, color_mask=False):
    """Draw per-object ID labels (and, with color_mask=True, translucent masks)
    on every sampled frame of video `idx`, then return the frames as
    base64-encoded JPEGs grouped by category, together with per-frame object
    counts and a contour-only copy of each frame."""
    encoded_frames = {}
    contoured_frames = {}
    vid_cat_cnts = {}

    vid_meta = metas[idx]
    vid_data = train_dataset[idx]
    vid_id = vid_meta['video']
    frame_indx = vid_meta['sample_indx']
    cat_names = set(vid_meta['obj_id_cat'].values())
    imgs = vid_data[0]

    for cat in cat_names:
        cat_frames = []
        contour_frames = []
        frame_cat_cnts = {}

        for i in range(imgs.size(0)):
            frame_name = frame_indx[i]
            frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
            frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())

            frame_data = vid_data[2][frame_name]
            obj_ids = list(frame_data.keys())

            cat_cnt = 0

            for j in range(len(obj_ids)):
                obj_id = obj_ids[j]
                obj_data = frame_data[obj_id]
                obj_bbox = obj_data['bbox']
                obj_valid = obj_data['valid']
                obj_mask = obj_data['mask'].numpy().astype(np.uint8)
                obj_cat = obj_data['category_name']

                if obj_cat == cat and obj_valid:
                    cat_cnt += 1
                    # Cycle the palette so frames with more objects than colors do not raise IndexError.
                    color = colors[j % len(colors)]

                    if not color_mask:
                        # Outline the object and print its ID at each contour's centroid.
                        contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
                        cv2.drawContours(frame, contours, -1, color, 3)
                        for contour in contours:
                            moments = cv2.moments(contour)
                            if moments["m00"] != 0:
                                cx = int(moments["m10"] / moments["m00"])
                                cy = int(moments["m01"] / moments["m00"])
                            else:
                                cx, cy = contour[0][0]

                            font = cv2.FONT_HERSHEY_SIMPLEX
                            text = obj_id
                            text_size = cv2.getTextSize(text, font, 1, 2)[0]
                            text_w, text_h = text_size

                            # Black background box behind the white ID label.
                            cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
                                          (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
                            cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
                                        font, 1, (255, 255, 255), 2)
                    else:
                        # Blend a translucent colored mask over the object, then outline it.
                        alpha = 0.08

                        colored_obj_mask = np.zeros_like(frame)
                        colored_obj_mask[obj_mask == 1] = color
                        frame[obj_mask == 1] = (
                            (1 - alpha) * frame[obj_mask == 1]
                            + alpha * colored_obj_mask[obj_mask == 1]
                        )

                        contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
                        cv2.drawContours(frame, contours, -1, color, 2)
                        cv2.drawContours(frame_for_contour, contours, -1, color, 2)

                        if len(contours) > 0:
                            # Place the ID label near the centroid of the largest contour.
                            largest_contour = max(contours, key=cv2.contourArea)
                            M = cv2.moments(largest_contour)
                            if M["m00"] != 0:
                                center_x = int(M["m10"] / M["m00"])
                                center_y = int(M["m01"] / M["m00"])
                            else:
                                center_x, center_y = 0, 0

                            font = cv2.FONT_HERSHEY_SIMPLEX
                            text = obj_id
                            font_scale = 0.9
                            text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
                            text_x = center_x - text_size[0]
                            text_y = center_y

                            rect_start = (text_x - 5, text_y - text_size[1] - 5)
                            rect_end = (text_x + text_size[0] + 5, text_y)

                            cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
                            # Draw the label at the same scale used to size its background box.
                            cv2.putText(frame, text, (text_x, text_y), font, font_scale, (255, 255, 255), 2)

            # Encode the annotated frame, then the contour-only frame, as base64 JPEGs.
            buffer = BytesIO()
            frame = Image.fromarray(frame)
            frame.save(buffer, format='jpeg')
            buffer.seek(0)
            cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
            frame_cat_cnts[frame_name] = cat_cnt

            buffer.seek(0)
            buffer.truncate()
            frame_for_contour = Image.fromarray(frame_for_contour)
            frame_for_contour.save(buffer, format='jpeg')
            buffer.seek(0)
            contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))

        encoded_frames[cat] = cat_frames
        contoured_frames[cat] = contour_frames
        vid_cat_cnts[cat] = frame_cat_cnts

    return encoded_frames, vid_cat_cnts, contoured_frames


def getCaption(idx, color_mask=True):
    """For video `idx`, query GPT-4o-mini in two passes: first check whether the
    labeled objects of a category perform clearly distinct actions, then request
    one action-centric referring expression per object. Non-movable categories and
    frames with fewer than two objects receive a None caption. Returns the video id,
    the per-category/per-frame captions, and the valid object ids per movable category."""
    vid_meta = metas[idx]
    vid_data = train_dataset[idx]
    vid_id = vid_meta['video']
    print(f"vid id: {vid_id}\n")

    frame_indx = vid_meta['sample_indx']
    cat_names = set(vid_meta['obj_id_cat'].values())
    all_captions = dict()

    base64_frames, vid_cat_cnts, contoured_frames = number_objects_and_encode(idx, color_mask)
    marked = "mask with boundary" if color_mask else "boundary"

    for cat_name in list(cat_names):
        is_movable = cat_name in ytvos_category_valid_list

        if not is_movable:
            print(f"Skipping {cat_name}: determined to be non-movable.", end='\n\n')

        image_captions = {}
        captioner = OpenAI()
        cat_base64_frames = base64_frames[cat_name]
        cont_base64_frames = contoured_frames[cat_name]

        for i in range(len(cat_base64_frames)):
            frame_name = frame_indx[i]
            cont_base64_image = cont_base64_frames[i]
            base64_image = cat_base64_frames[i]
            should_filter = False
            frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]

            if frame_cat_cnts >= 2:
                should_filter = True
            else:
                print(f"Skipping {cat_name}: there is a single object or none.", end='\n\n')

            if is_movable and should_filter:
                print(f"-----------category name: {cat_name}, frame name: {frame_name}")
                # First pass: ask whether the labeled objects show clearly different actions.
                caption_filter_text = f"""
                You are a visual assistant analyzing a single frame from a video.
                In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.

                Are the {cat_name}s in the image performing clearly different and recognizable actions or postures?
                Consider differences in body pose (standing, sitting, holding hands up, grabbing an object, facing towards, walking...), motion cues (inferred from the momentary stance or position),
                facial expressions, and any notable interactions with objects or other {cat_name}s or people.

                Only focus on obvious, prominent actions that can be reliably identified from this single frame.

                - Respond with "YES" if:
                1) Most of the {cat_name}s exhibit clearly different, unique actions or poses.
                2) There are visible, significant differences in action and posture that an observer can identify at a glance.
                3) Each action is unambiguously recognizable and distinct.

                - Respond with "NONE" if:
                1) The actions or poses are not clearly differentiable or are too similar.
                2) They show no noticeable action beyond standing or minor movements.

                Answer strictly with either "YES" or "NONE".
                """

                response1 = captioner.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=[
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "text",
                                    "text": caption_filter_text,
                                },
                                {
                                    "type": "image_url",
                                    "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                                }
                            ],
                        }
                    ],
                )
                response_content = response1.choices[0].message.content
                should_caption = "yes" in response_content.lower()
                print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')

            else:
                should_caption = False

            # Note: dense_caption_prompt_1 is kept for reference but is not sent to the
            # model; only dense_caption_prompt below is used.
            dense_caption_prompt_1 = f"""You are a visual assistant that can analyze a single frame of a video and create referring expressions for each object.
            In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
            I want to use your expressions to create an action-centric referring expression dataset.
            Therefore, your expressions for these {cat_name}s should describe the unique action of each object.

            1. Focus only on clear, unique, and prominent actions that distinguish each object.
            2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
            3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
            4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
            5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
            6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
            7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
            8. Include interactions with objects or other entities when they are prominent and observable.
            9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
            10. Do not include descriptions of appearance such as clothes, color, size, shape etc.
            11. Do not include relative position between objects such as 'the left elephant' because left/right can be ambiguous.
            12. Do not mention object IDs.
            13. Use '{cat_name}' as the noun for the referring expressions.

            Keep in mind that you should not group the objects, e.g., 2-5. people: xxx; be sure to describe each object separately (one by one).
            Output referring expressions for each object id.
            """

            dense_caption_prompt = f"""
            You are a visual assistant analyzing a single frame of a video.
            In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
            I want to use your expressions to create an action-centric referring expression dataset.
            Please describe each {cat_name} using **clearly observable** and **specific** actions.

            ## Guidelines:
            1. Focus on visible, prominent actions only (e.g., running, pushing, grasping an object).
            2. Avoid describing minor or ambiguous actions (e.g., slightly moving a paw).
            3. Do not include subjective or speculative descriptions (e.g., “it seems excited” or “it might be preparing to jump”).
            4. Do not use vague expressions like "interacting with something" or "engaging with another object."
               Instead, specify the interaction in detail (e.g., "grabbing a stick," "pressing a button").
            5. Use dynamic action verbs (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
            6. If multiple {cat_name}s appear, ensure each description is detailed enough to differentiate their actions.
            7. Base your description on the following action definitions:
               - Facial expressions with object manipulation
               - General body movement, body position or pattern
               - Movements when interacting with a specific, named object (e.g., "kicking a ball" instead of "interacting with an object").
               - Body movements in person or animal interaction (e.g., "pushing another person" instead of "engaging with someone").

            ## Output Format:
            - For each labeled {cat_name}, output one line in the format:
              ID. action-oriented description

            Example:
            1. a bear grasping the edge of a wood with its front paws
            2. the bear pushing another bear, leaning forward

            **Do not include** appearance details (e.g., color, size, shape) or relative positioning (e.g., “on the left/right”).
            **Do not mention object IDs** in the text of your sentence—just use them as labels for your output lines.
            Keep in mind that you should not group the objects, e.g., 2-5. people: xxx; be sure to describe each object separately (one by one).
            For each labeled {cat_name}, output referring expressions for each object id.
            """

            if should_caption:
                # Second pass: request one action-centric referring expression per labeled object.
                response2 = captioner.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=[
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "text",
                                    "text": dense_caption_prompt,
                                },
                                {
                                    "type": "image_url",
                                    "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                                },
                            ],
                        }
                    ],
                )
                caption = response2.choices[0].message.content
            else:
                caption = None

            image_captions[frame_name] = caption

        all_captions[cat_name] = image_captions

    # For each movable category, collect the object ids annotated with that category.
    valid_obj_ids = dict()
    for cat in cat_names:
        if cat in ytvos_category_valid_list:
            obj_id_cat = vid_meta['obj_id_cat']
            valid_cat_ids = []
            for obj_id in list(obj_id_cat.keys()):
                if obj_id_cat[obj_id] == cat:
                    valid_cat_ids.append(obj_id)
            valid_obj_ids[cat] = valid_cat_ids

    return vid_id, all_captions, valid_obj_ids
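

# -----------------------------------------------------------------------------
# Driver: build the Refer-YouTube-VOS training split, caption the sampled
# videos, and dump the captions and valid object ids to JSON.
# -----------------------------------------------------------------------------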
if __name__ == '__main__':
    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
    parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions.json")
    parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids.json")

    args = parser.parse_args()

    train_dataset = build_ytvos_ref(image_set='train', args=args)
    metas = train_dataset.metas

    colors = [
        (255, 0, 0),
        (0, 255, 0),
        (0, 0, 255),
        (255, 255, 0),
        (255, 0, 255),
        (0, 255, 255),
        (128, 0, 128),
        (255, 165, 0)
    ]

    ytvos_category_valid_list = [
        'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
        'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
        'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
        'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
        'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
        'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
    ]

    # The OpenAI client reads the API key from the OPENAI_API_KEY environment
    # variable; set it in your shell rather than hard-coding a secret here.
    assert os.environ.get('OPENAI_API_KEY'), "Set the OPENAI_API_KEY environment variable before running."

    result_captions = {}
    result_valid_obj_ids = {}

    # Caption the first 370 videos of the training split.
    for i in range(370):
        vid_id, all_captions, valid_obj_ids = getCaption(i, True)

        if vid_id not in result_captions:
            result_captions[vid_id] = all_captions
        if vid_id not in result_valid_obj_ids:
            result_valid_obj_ids[vid_id] = valid_obj_ids

    print("Finished!", flush=True)

    with open(args.save_caption_path, "w") as file:
        json.dump(result_captions, file, indent=4)

    with open(args.save_valid_obj_ids_path, "w") as file:
        json.dump(result_valid_obj_ids, file, indent=4)
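
# Usage sketch (hypothetical invocation; the actual script path and the remaining
# dataset/model arguments exposed by `opts.get_args_parser()` depend on your setup):
#   OPENAI_API_KEY=<your key> python <path/to/this_script>.py \
#       --save_caption_path mbench/numbered_captions.json \
#       --save_valid_obj_ids_path mbench/numbered_valid_obj_ids.json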