from datasets import build_dataset
import argparse
import opts

import sys
from pathlib import Path
import os
from os import path as osp
import skimage.io  # skimage.io.imread is used below, so import the submodule explicitly
from io import BytesIO

import numpy as np
import pandas as pd
import regex as re
import json

import cv2
from PIL import Image, ImageDraw
import torch
from torchvision.transforms import functional as F

from skimage import measure
from shapely.geometry import Polygon, MultiPolygon

import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.collections import PatchCollection
from matplotlib.patches import Rectangle

import ipywidgets as widgets
from IPython.display import display, clear_output

from openai import OpenAI
import base64

# The OpenAI client reads OPENAI_API_KEY from the environment; export it before
# running this script instead of hard-coding the key in the source.
assert os.environ.get('OPENAI_API_KEY'), 'OPENAI_API_KEY must be set in the environment'


def encode_image(image_path):
    """Read an image file and return its contents as a base64-encoded UTF-8 string."""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")


def getCaption(video_id, json_data):
    """Caption every frame of a video, focusing on the objects' actions.

    Returns a dict mapping frame name -> caption (None for frames that do not
    contain multiple objects distinguishable by action), or -1 if the video
    contains more than one object category.
    """
    video_data = json_data[video_id]
    frame_names = video_data['frame_names']
    video_path = video_data['video_path']

    # All annotated objects must share a single category name.
    cat_names = set()
    for obj_id in list(video_data['annotations'][0].keys()):
        cat_names.add(video_data['annotations'][0][obj_id]['category_name'])

    if len(cat_names) == 1:
        cat_name = next(iter(cat_names))
    else:
        print("more than one category")
        return -1

    image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names]
    image_captions = {}

    captioner = OpenAI()
    for image_path, frame_name in zip(image_paths, frame_names):
        base64_image = encode_image(image_path)

        # First check whether the frame contains multiple objects of the category
        # that can be told apart by their actions; only such frames are captioned.
        response1 = captioner.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": f"Are there multiple {cat_name}s that can be distinguished by action? Each action should be prominent and describe the corresponding object only. If so, only output YES. If not, only output None",
                        },
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                        },
                    ],
                }
            ],
        )
        response_content = response1.choices[0].message.content
        should_caption = "yes" in response_content.lower()

        if should_caption:
            response2 = captioner.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": f"""
                                Describe the image in detail focusing on the {cat_name}s' actions.
                                1. Each action should be prominent, clear and unique, describing the corresponding object only.
                                2. Avoid overly detailed or indeterminate details such as ‘in anticipation’.
                                3. Avoid subjective descriptions such as ‘soft’, ‘controlled’, ‘attentive’, ‘skilled’, ‘casual atmosphere’ and descriptions of the setting.
                                4. Do not include actions that needs to be guessed or suggested.""",
                            },
                            {
                                "type": "image_url",
                                "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                            },
                        ],
                    }
                ],
            )
            caption = response2.choices[0].message.content
        else:
            caption = None

        image_captions[frame_name] = caption
    return image_captions


def getRefExp(video_id, frame_name, caption, obj_id, json_data):
    """Generate and validate a referring expression for one object in one frame.

    Returns a dict with the expression, the frame caption, the category name,
    the frame name and an `isValid` flag, or an empty dict when the object is
    not present in the frame.
    """
    video_data = json_data[video_id]
    frame_names = video_data['frame_names']
    video_path = video_data['video_path']
    I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
    frame_indx = frame_names.index(frame_name)
    obj_data = video_data['annotations'][frame_indx][obj_id]

    bbox = obj_data['bbox']
    cat_name = obj_data['category_name']
    valid = obj_data['valid']

    if valid == 0:
        print("Object not in this frame!")
        return {}

    # Draw a red box around the target object so the model knows which instance
    # the expression must refer to.
    x_min, y_min, x_max, y_max = map(int, bbox)
    cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (255, 0, 0), 2)
    plt.figure()
    plt.imshow(I)
    plt.axis('off')
    plt.show()

    pil_I = Image.fromarray(I)
    buff = BytesIO()
    pil_I.save(buff, format='JPEG')
    base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")

    generator = OpenAI()
    response = generator.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box.
                        1. The referring expression describes the action and does not contain information about appearance or location in the picture.
                        2. Focus only on prominent actions and avoid overly detailed or indeterminate details.
                        3. Avoid subjective terms describing emotion such as ‘in anticipation’, ‘attentively’ or ‘relaxed’ and professional, difficult words.
                        4. The referring expression should only describe the highlighted {cat_name} and not any other.
                        5. Use '{cat_name}' as the noun for the referring expressions.
                        Output only the referring expression.
                        {caption}""",
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
                    },
                ],
            }
        ],
    )

    ref_exp = response.choices[0].message.content

    # Validate the expression with two checks: it must describe the highlighted
    # object and must not describe any non-highlighted object.
    validator = OpenAI()
    response1 = validator.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.
                        {ref_exp}""",
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
                    },
                ],
            }
        ],
    )

    response1_content = response1.choices[0].message.content
    describesHighlighted = "yes" in response1_content.lower()

    response2 = validator.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"""Does the given expression describe the {cat_name} not highlighted with the red box? If so, only return YES and if not, NO.
                        {ref_exp}""",
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
                    },
                ],
            }
        ],
    )

    response2_content = response2.choices[0].message.content
    describesNotHighlighted = "yes" in response2_content.lower()

    isValid = describesHighlighted and not describesNotHighlighted

    print(f"describesHighlighted: {describesHighlighted}, describesNotHighlighted: {describesNotHighlighted}")

    return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid": isValid}


def createRefExp(video_id, json_data):
    """Build referring expressions for every object in each captioned frame of a video.

    Returns None when the video contains more than one object category.
    """
    video_data = json_data[video_id]
    obj_ids = list(video_data['annotations'][0].keys())
    frame_names = video_data['frame_names']

    captions_per_frame = getCaption(video_id, json_data)

    if captions_per_frame == -1:
        print("The video contains more than one category; skipping.")
        return

    video_ref_exps = {}

    for frame_name in frame_names:
        frame_caption = captions_per_frame[frame_name]

        if frame_caption is None:
            video_ref_exps[frame_name] = None
        else:
            frame_ref_exps = {}
            for obj_id in obj_ids:
                exp_per_obj = getRefExp(video_id, frame_name, frame_caption, obj_id, json_data)
                frame_ref_exps[obj_id] = exp_per_obj
            video_ref_exps[frame_name] = frame_ref_exps

    return video_ref_exps


if __name__ == '__main__':
    with open('mbench/sampled_frame3.json', 'r') as file:
        data = json.load(file)

    # Manually selected frames (loaded here but not used further below).
    with open('make_ref-ytvos/selected_frames.jsonl', 'r') as file:
        manual_select = [json.loads(line) for line in file]

    all_video_refs = {}
    for video_id in list(data.keys())[:10]:
        video_ref = createRefExp(video_id, data)
        all_video_refs[video_id] = video_ref
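    # Persist the generated expressions for later inspection. This is a minimal
    # sketch: the output path below is an assumption, not part of the original
    # pipeline, so adjust it to match the repository layout.
    save_path = 'mbench/all_video_refs.json'
    with open(save_path, 'w') as out_file:
        json.dump(all_video_refs, out_file, indent=4)
    print(f"Saved referring expressions for {len(all_video_refs)} videos to {save_path}")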