# Generate action-centric frame captions and per-object referring expressions
# for Refer-YouTube-VOS style videos using the OpenAI chat API.

import sys
from os import path as osp

sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))

from mbench.ytvos_ref import build as build_ytvos_ref

import argparse
import opts

import os
import json
import base64
from io import BytesIO
from pathlib import Path

import numpy as np
import pandas as pd
import regex as re

import cv2
import skimage
import skimage.io  # imread is used below; import the submodule explicitly
from skimage import measure
from shapely.geometry import Polygon, MultiPolygon
from PIL import Image, ImageDraw

import torch
from torchvision.transforms import functional as F

import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.collections import PatchCollection
from matplotlib.patches import Rectangle

import ipywidgets as widgets
from IPython.display import display, clear_output

from openai import OpenAI


def encode_image(image_path):
    """Read an image file and return its base64-encoded contents as a UTF-8 string."""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")


# Categories from the Refer-YouTube-VOS label set treated as "movable"
# (people, animals, vehicles); only these categories are captioned below.
ytvos_category_valid_list = [
    'airplane', 'ape', 'bear', 'bike', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
    'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
    'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
    'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
    'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
    'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
]
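
# getCaption runs a two-stage prompting loop for one video: for every category
# present in the first annotated frame, each sampled frame is first gated with a
# YES/NONE question (are there distinct, recognizable actions?) and, only if the
# answer is YES, captioned with a dense action-centric description.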
def getCaption(video_id, json_data):
    """Return (all_captions, valid_obj_ids) for one video.

    all_captions maps category_name -> {frame_name: caption or None};
    valid_obj_ids lists the object ids whose category received captions.
    """
    video_data = json_data[video_id]
    frame_names = video_data['frame_names']
    video_path = video_data['video_path']

    # Collect the category names present in the first annotated frame.
    cat_names = set()
    all_captions = dict()
    for obj_id in list(video_data['annotations'][0].keys()):
        cat_names.add(video_data['annotations'][0][obj_id]['category_name'])
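
    # Caption the video once per category; each sampled frame is handled independently.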
    for cat_name in list(cat_names):
        image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names]
        image_captions = {}

        captioner = OpenAI()

        # Only "movable" categories (people, animals, vehicles) get captions.
        is_movable = False
        if cat_name in ytvos_category_valid_list:
            is_movable = True
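
        # Non-movable categories are skipped entirely: they never appear in
        # all_captions and therefore contribute no valid object ids.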
        if not is_movable:
            print(f"Skipping {cat_name}: Determined to be non-movable.")
            continue

        for image_path, frame_name in zip(image_paths, frame_names):
            base64_image = encode_image(image_path)
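
            # Stage 1: a YES/NONE gate asking whether this frame shows objects of the
            # current category performing distinct, clearly recognizable actions.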
            response1 = captioner.chat.completions.create(
                model="chatgpt-4o-latest",
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": f"""Are there multiple {cat_name}s in the image, each performing distinct and recognizable actions?
Focus only on clear and prominent actions, avoiding minor or ambiguous ones.
Each action should be unique and clearly associated with a specific object.

Respond with YES if:
- The {cat_name}s are people, animals or vehicles, and their actions are distinct and recognizable.
- The {cat_name}s involve clear, distinguishable actions performed independently.

Respond with NONE if:
- The {cat_name}s are objects (e.g., snowboard, tree, books) and do not involve direct interaction with a person.
- Actions are ambiguous, minor, or not clearly visible.

If the {cat_name} is 'snowboard' and it is not actively being used or interacted with by a person, output NONE.
If the {cat_name} is 'person' and their actions are distinct and clear, output YES.

Answer only YES or NONE.""",
                            },
                            {
                                "type": "image_url",
                                "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                            },
                        ],
                    }
                ],
            )
            response_content = response1.choices[0].message.content
            should_caption = "yes" in response_content.lower()
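
            # Stage 2: only frames that passed the gate get a dense caption;
            # all other frames are recorded as None.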
            if should_caption:
                response2 = captioner.chat.completions.create(
                    model="chatgpt-4o-latest",
                    messages=[
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "text",
                                    "text": f"""
Generate a detailed action-centric caption describing the actions of the {cat_name}s in the image.
1. Focus only on clear, unique, and prominent actions that distinguish each object.
2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
8. Include interactions with objects or other entities when they are prominent and observable.
9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
Output only the caption.""",
                                },
                                {
                                    "type": "image_url",
                                    "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                                },
                            ],
                        }
                    ],
                )
                caption = response2.choices[0].message.content
            else:
                caption = None

            image_captions[frame_name] = caption
        all_captions[cat_name] = image_captions
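
    # An object id is kept only if its category actually produced captions above.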
    valid_obj_ids = []
    valid_cat_names = list(all_captions.keys())
    for obj_id in list(video_data['annotations'][0].keys()):
        cat = video_data['annotations'][0][obj_id]['category_name']
        if cat in valid_cat_names:
            valid_obj_ids.append(obj_id)

    return all_captions, valid_obj_ids
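

# getRefExp generates one referring expression for a single object in a single
# frame: the object's box is drawn in red, a crop-level identifiability check is
# run first, the expression is generated from the dense caption, and two
# follow-up checks mark the result as valid or not.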
def getRefExp(video_id, frame_name, caption, obj_id, json_data):
    """Return a dict with keys ref_exp, caption, cat_name, file_name, isValid.

    Returns an empty dict if the object is not present in this frame.
    """
    video_data = json_data[video_id]
    frame_names = video_data['frame_names']
    video_path = video_data['video_path']
    I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
    frame_indx = frame_names.index(frame_name)
    obj_data = video_data['annotations'][frame_indx][obj_id]

    bbox = obj_data['bbox']
    cat_name = obj_data['category_name']
    valid = obj_data['valid']

    if valid == 0:
        print("Object not in this frame!")
        return {}
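
    # Draw the target's bounding box in red on the full frame; this boxed frame
    # (and a tight crop of it) is what gets sent to the model below.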
    x_min, y_min, x_max, y_max = bbox
    x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
    cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (255, 0, 0), 2)
    plt.figure()
    plt.imshow(I)
    plt.axis('off')
    plt.show()

    # Crop of the boxed object, base64-encoded for the identifiability check.
    cropped_I = I[y_min:y_max, x_min:x_max]
    pil_cropped_I = Image.fromarray(cropped_I)
    buff_crop = BytesIO()
    pil_cropped_I.save(buff_crop, format='JPEG')
    base64_cropped_I = base64.b64encode(buff_crop.getvalue()).decode("utf-8")

    # Full frame (with the red box drawn in), base64-encoded for the later calls.
    pil_I = Image.fromarray(I)
    buff = BytesIO()
    pil_I.save(buff, format='JPEG')
    base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")
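
    # Pre-check: if the crop alone cannot be recognized as the target category,
    # skip expression generation for this object/frame and mark it invalid.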
    generator = OpenAI()
    response_check = generator.chat.completions.create(
        model="chatgpt-4o-latest",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"""Can the {cat_name} in the provided cropped image be clearly identified as belonging to the category {cat_name}?
Focus on whether the cropped image provides enough visible features (e.g., ears, head shape, fur texture) to confirm that it is a {cat_name}, even if the full body is not visible.

Guidelines:
- If the visible features (like ears, fur texture or head shape) are sufficient to identify the {cat_name}, respond with YES.
- If multiple {cat_name}s are entangled or overlapping, making it difficult to distinguish one from another, respond with NONE.
- If the object is clearly visible and identifiable as a {cat_name}, respond with YES.

Output only either YES or NONE.
""",
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{base64_cropped_I}"},
                    },
                ],
            },
        ],
    )

    response_check_content = response_check.choices[0].message.content.strip().lower()

    if "yes" not in response_check_content:
        print(f"Referring expression not generated: {cat_name} is ambiguous in this frame.")
        return {"ref_exp": "NONE", "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid": False}
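
    # Main generation: ask for a referring expression for the boxed object,
    # conditioned on the dense caption appended at the end of the prompt.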
    response = generator.chat.completions.create(
        model="chatgpt-4o-latest",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box, corresponding to Object ID {obj_id}.
Guidelines for creating the referring expression:
1. The referring expression should describe the prominent actions or poses of the highlighted {cat_name} (Object ID {obj_id}).
2. Focus on the behavior or pose described in the caption that is specifically associated with this {cat_name}. Do not include actions or poses of other {cat_name}s.
3. If multiple {cat_name}s are present, ensure that the referring expression exclusively describes the {cat_name} corresponding to Object ID {obj_id}.
4. Avoid ambiguous or subjective terms. Use specific and clear action verbs to describe the highlighted {cat_name}.
5. The referring expression should only describe Object ID {obj_id} and not any other objects or entities.
6. Use '{cat_name}' as the noun for the referring expressions.
Output only the referring expression for the highlighted {cat_name} (Object ID {obj_id}).

{caption}
""",
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
                    },
                ],
            }
        ],
    )

    ref_exp = response.choices[0].message.content.strip()
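
    # Verification: the expression must describe the boxed object (first check
    # answers YES) and must not describe an unboxed one (second check answers NO).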
    verifier = OpenAI()  # renamed from `filter` to avoid shadowing the builtin
    response1 = verifier.chat.completions.create(
        model="chatgpt-4o-latest",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.
{ref_exp}""",
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
                    },
                ],
            }
        ],
    )

    response1_content = response1.choices[0].message.content
    describesHighlighted = "yes" in response1_content.lower()

    response2 = verifier.chat.completions.create(
        model="chatgpt-4o-latest",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"""Does the given expression describe a {cat_name} that is not highlighted with the red box? If so, only return YES and if not, NO.
{ref_exp}""",
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
                    },
                ],
            }
        ],
    )

    response2_content = response2.choices[0].message.content
    notDescribesNotHighlighted = "yes" not in response2_content.lower()

    isValid = describesHighlighted and notDescribesNotHighlighted

    return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid": isValid}
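

# Driver: reads the sampled-frame metadata, generates captions and referring
# expressions for the sampled videos, and writes the nested result dict to disk.
# OPENAI_API_KEY must be set in the environment before running.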
if __name__ == '__main__':
    with open('mbench/sampled_frame3.json', 'r') as file:
        data = json.load(file)

    vid_ids = list(data.keys())
    all_ref_exps = {}

    # The OpenAI API key must be supplied via the environment (e.g.
    # `export OPENAI_API_KEY=...`); never hard-code it in the source.
    assert 'OPENAI_API_KEY' in os.environ, "Set OPENAI_API_KEY before running this script."
    # Currently only the first sampled video is processed.
    for i in range(1):
        vid_id = vid_ids[i]

        captions, valid_obj_ids = getCaption(vid_id, data)
        cats_in_vid = list(captions.keys())

        ref_expressions = {}

        for cat_name in cats_in_vid:
            if cat_name not in ref_expressions:
                ref_expressions[cat_name] = {}

            for frame_name in data[vid_id]['frame_names']:
                if frame_name not in ref_expressions[cat_name]:
                    ref_expressions[cat_name][frame_name] = {}
                caption = captions[cat_name][frame_name]
                if not caption:
                    continue

                for obj_id in valid_obj_ids:
                    ref_exp = getRefExp(vid_id, frame_name, caption, obj_id, data)
                    ref_expressions[cat_name][frame_name][obj_id] = ref_exp

        all_ref_exps[vid_id] = ref_expressions

    with open('mbench/result_revised.json', 'w') as file:
        json.dump(all_ref_exps, file, indent=4)
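
    # Output layout: {video_id: {category: {frame_name: {obj_id: {"ref_exp", "caption",
    # "cat_name", "file_name", "isValid"}}}}}; one entry per object per captioned frame.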