from datasets import build_dataset |
import argparse |
import opts |
import sys |
from pathlib import Path |
import os |
from os import path as osp |
import skimage |
from io import BytesIO |
import numpy as np |
import pandas as pd |
import regex as re |
import json |
import cv2 |
from PIL import Image, ImageDraw |
import torch |
from torchvision.transforms import functional as F |
from skimage import measure |
from shapely.geometry import Polygon, MultiPolygon |
import matplotlib.pyplot as plt |
import matplotlib.patches as patches |
from matplotlib.collections import PatchCollection |
from matplotlib.patches import Rectangle |
import ipywidgets as widgets |
from IPython.display import display, clear_output |
from openai import OpenAI |
import base64 |
# SECURITY: an OpenAI API key used to be hard-coded on this line. A key that has
# been committed to a source file must be treated as leaked: revoke it and
# provide a fresh one through the OPENAI_API_KEY environment variable (shell
# export, .env loader, secret manager, ...). This file only ever consumed the
# key through os.environ, so nothing else needs to change.
if not os.environ.get('OPENAI_API_KEY'):
    # Warn instead of raising so that importing this module keeps working
    # without credentials; only the API calls themselves need the key.
    print(
        "OPENAI_API_KEY is not set; OpenAI API calls will fail until it is exported.",
        file=sys.stderr,
    )
def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as base64 text.

    The file is read in binary mode, base64-encoded, and the encoded bytes
    are decoded as UTF-8 so the result is a ``str``.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
def getCaption(video_id, json_data):
    """Caption each frame of a video, focusing on the objects' actions.

    For every entry of ``json_data[video_id]['frame_names']`` the frame
    ``<video_path>/<frame_name>.jpg`` is sent to ``gpt-4o-mini`` twice at most:
    first a YES/None question asking whether several objects of the video's
    category can be told apart by action, and, only when the reply contains
    "yes" (case-insensitive), a second request for an action-focused caption.

    Args:
        video_id: key into ``json_data``.
        json_data: mapping whose entries provide ``'frame_names'``,
            ``'video_path'`` and ``'annotations'``; the objects of
            ``annotations[0]`` each carry a ``'category_name'``.

    Returns:
        A dict mapping each frame name to its caption, or to ``None`` when the
        first question was not answered with YES. Returns ``-1`` (kept for
        existing callers) when the objects of the first annotation entry do not
        share exactly one category name.
    """
    video_data = json_data[video_id]
    frame_names = video_data['frame_names']
    video_path = video_data['video_path']

    # Categories are taken from the first annotation entry only.
    cat_names = {
        obj['category_name'] for obj in video_data['annotations'][0].values()
    }
    if len(cat_names) != 1:
        # This branch is also taken for zero categories, so report the real count.
        print(f"expected exactly 1 category, found {len(cat_names)}")
        return -1
    cat_name = next(iter(cat_names))

    # Both prompts depend only on the category, so build them once.
    gate_prompt = (
        f"Are there multiple {cat_name}s that can be distinguished by action? "
        "Each action should be prominent and describe the corresponding object only. "
        "If so, only output YES. If not, only output None"
    )
    caption_prompt = (
        "\n"
        f"Describe the image in detail focusing on the {cat_name}s' actions.\n"
        "1. Each action should be prominent, clear and unique, describing the corresponding object only.\n"
        "2. Avoid overly detailed or indeterminate details such as ‘in anticipation’.\n"
        "3. Avoid subjective descriptions such as ‘soft’, ‘controlled’, ‘attentive’, ‘skilled’, ‘casual atmosphere’ and descriptions of the setting.\n"
        "4. Do not include actions that needs to be guessed or suggested."
    )

    captioner = OpenAI()

    def ask(prompt, base64_image):
        """Send one text prompt plus one base64 JPEG; return the reply text (may be None)."""
        response = captioner.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                        },
                    ],
                }
            ],
        )
        return response.choices[0].message.content

    image_captions = {}
    for frame_name in frame_names:
        image_path = os.path.join(video_path, frame_name + '.jpg')
        base64_image = encode_image(image_path)

        # The API may return no text content at all; treat that as "not YES"
        # instead of crashing on None.lower().
        verdict = ask(gate_prompt, base64_image) or ""
        if "yes" in verdict.lower():
            image_captions[frame_name] = ask(caption_prompt, base64_image)
        else:
            image_captions[frame_name] = None
    return image_captions
def getRefExp(video_id, frame_name, caption, obj_id, json_data, show=True):
    """Generate and self-check a referring expression for one object in one frame.

    The object's bounding box is drawn on the frame, the boxed frame plus
    ``caption`` are sent to ``gpt-4o-mini`` to obtain a referring expression,
    and two follow-up YES/NO questions check that the expression describes the
    boxed object and does not describe an un-boxed one.

    Args:
        video_id: key into ``json_data``.
        frame_name: frame to process; must be in the video's ``'frame_names'``
            (``list.index`` raises ``ValueError`` otherwise).
        caption: dense caption of the frame, appended to the generation prompt.
        obj_id: key of the object inside the frame's annotation entry.
        json_data: mapping with ``'frame_names'``, ``'video_path'`` and
            ``'annotations'`` per video; each object annotation provides
            ``'bbox'`` (x_min, y_min, x_max, y_max), ``'category_name'`` and
            ``'valid'``.
        show: when true (the default, matching the previous behaviour) the
            boxed frame is displayed with matplotlib. Pass ``False`` for
            unattended batch runs.

    Returns:
        ``{}`` when the annotation's ``'valid'`` flag is 0; otherwise a dict
        with keys ``'ref_exp'``, ``'caption'``, ``'cat_name'``, ``'file_name'``
        and ``'isValid'``.
    """
    video_data = json_data[video_id]
    frame_names = video_data['frame_names']
    video_path = video_data['video_path']

    frame_indx = frame_names.index(frame_name)
    obj_data = video_data['annotations'][frame_indx][obj_id]
    cat_name = obj_data['category_name']

    # Check validity before touching the disk or the API: nothing below is
    # needed when the object is absent from this frame.
    if obj_data['valid'] == 0:
        print("Object not in this frame!")
        return {}

    # Import the submodule explicitly; older scikit-image releases do not
    # expose `skimage.io` after a bare `import skimage`.
    import skimage.io

    frame_image = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
    x_min, y_min, x_max, y_max = (int(v) for v in obj_data['bbox'])
    # skimage loads RGB, so (225, 0, 0) renders as the "red box" the prompts mention.
    cv2.rectangle(frame_image, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2)

    if show:
        fig = plt.figure()
        plt.imshow(frame_image)
        plt.axis('off')
        plt.show()
        # Without this every call leaves one more open figure behind.
        plt.close(fig)

    # Re-encode the boxed frame as an in-memory JPEG for the API.
    buff = BytesIO()
    Image.fromarray(frame_image).save(buff, format='JPEG')
    base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")

    # One client serves all three requests.
    client = OpenAI()

    def ask(prompt):
        """Send ``prompt`` together with the boxed frame; return the reply text (may be None)."""
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
                        },
                    ],
                }
            ],
        )
        return response.choices[0].message.content

    def says_yes(reply):
        """True when the reply text contains "yes" (case-insensitive); None counts as no."""
        return "yes" in (reply or "").lower()

    ref_exp = ask(
        f"Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box.\n"
        "1. The referring expression describes the action and does not contain information about appearance or location in the picture.\n"
        "2. Focus only on prominent actions and avoid overly detailed or indeterminate details.\n"
        "3. Avoid subjective terms describing emotion such as ‘in anticipation’, ‘attentively’ or ‘relaxed’ and professional, difficult words.\n"
        f"4. The referring expression should only describe the highlighted {cat_name} and not any other.\n"
        f"5. Use '{cat_name}' as the noun for the referring expressions.\n"
        "Output only the referring expression.\n"
        f"{caption}"
    )

    describesHighlighted = says_yes(ask(
        f"Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.\n"
        f"{ref_exp}"
    ))
    # This prompt used to say "person" regardless of the category, which made
    # the check meaningless for every other category.
    describesNotHighlighted = says_yes(ask(
        f"Does the given expression describe the {cat_name} not highlighted with the red box? If so, only return YES and if not, NO.\n"
        f"{ref_exp}"
    ))

    isValid = describesHighlighted and not describesNotHighlighted
    print(f"describesHighlighted: {describesHighlighted}, describesNotHighlighted: {describesNotHighlighted}")
    return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid": isValid}
def createRefExp(video_id, json_data):
    """Build referring expressions for every object in every captioned frame.

    Frames are captioned with ``getCaption``; for each frame that received a
    caption, ``getRefExp`` is called once per object id found in the first
    annotation entry of the video.

    Args:
        video_id: key into ``json_data``.
        json_data: mapping with ``'frame_names'`` and ``'annotations'`` per video.

    Returns:
        A dict mapping each frame name to ``None`` (frame has no caption) or to
        a dict ``{obj_id: result of getRefExp}``. Returns ``None`` when
        ``getCaption`` reports ``-1`` for the video.
    """
    video_data = json_data[video_id]
    obj_ids = list(video_data['annotations'][0].keys())
    frame_names = video_data['frame_names']

    captions_per_frame = getCaption(video_id, json_data)
    if captions_per_frame == -1:
        # -1 is getCaption's signal that the video is not single-category.
        print("Video skipped: it does not contain exactly one category")
        return None

    video_ref_exps = {}
    for frame_name in frame_names:
        frame_caption = captions_per_frame[frame_name]
        if frame_caption is None:
            video_ref_exps[frame_name] = None
            continue
        video_ref_exps[frame_name] = {
            obj_id: getRefExp(video_id, frame_name, frame_caption, obj_id, json_data)
            for obj_id in obj_ids
        }
    return video_ref_exps
if __name__ == '__main__':
    # Per-video metadata (frame names, video path, annotations) keyed by video id.
    with open('mbench/sampled_frame3.json', 'r', encoding='utf-8') as file:
        data = json.load(file)

    # JSON Lines file: one JSON document per line.
    with open('make_ref-ytvos/selected_frames.jsonl', 'r', encoding='utf-8') as file:
        manual_select = list(file)
    # The loop here used to bind `json.loads` itself to a name without calling
    # it, so no line was ever parsed. Parse each non-blank line instead.
    selected_frames = [json.loads(frame) for frame in manual_select if frame.strip()]

    # Process the first 10 videos; slicing copes with files holding fewer than 10.
    all_video_refs = {}
    for video_id in list(data)[:10]:
        all_video_refs[video_id] = createRefExp(video_id, data)