import sys |
import os |
import argparse |
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) |
import opts |
import numpy as np |
import cv2 |
from PIL import Image |
import json |
from mbench.ytvos_ref import build as build_ytvos_ref |
import t2v_metrics |
import matplotlib.pyplot as plt |
import textwrap |
def _label_contours(frame, obj_mask, label, color, font):
    """Outline every external contour of ``obj_mask`` and stamp ``label`` on each one (in place)."""
    contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cv2.drawContours(frame, contours, -1, color, 3)

    # The label box size depends only on the text, so measure it once.
    text_w, text_h = cv2.getTextSize(label, font, 1, 2)[0]
    for contour in contours:
        moments = cv2.moments(contour)
        if moments["m00"] != 0:
            cx = int(moments["m10"] / moments["m00"])
            cy = int(moments["m01"] / moments["m00"])
        else:
            # Zero-area contour (single point / line): fall back to its first point.
            # Cast to plain ints so the drawing calls receive ordinary Python ints.
            cx, cy = (int(v) for v in contour[0][0])

        # Black box centred on the centroid, white label on top of it.
        cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
                      (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
        cv2.putText(frame, label, (cx - text_w // 2, cy + text_h // 2),
                    font, 1, (255, 255, 255), 2)


def _tint_and_label(frame, obj_mask, label, color, font):
    """Lightly tint the object, outline it, and label its largest contour (in place)."""
    alpha = 0.08
    region = obj_mask == 1
    # Blend in float; the assignment casts back to the frame dtype.
    frame[region] = (1 - alpha) * frame[region] + alpha * np.asarray(color, dtype=np.float64)

    contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cv2.drawContours(frame, contours, -1, color, 2)
    if not contours:
        return

    largest_contour = max(contours, key=cv2.contourArea)
    M = cv2.moments(largest_contour)
    if M["m00"] != 0:
        center_x = int(M["m10"] / M["m00"])
        center_y = int(M["m01"] / M["m00"])
    else:
        center_x, center_y = 0, 0

    # NOTE(review): the box is measured at scale 0.9 but the text is drawn at
    # scale 1, and the label is shifted left by its full width (the original
    # wrote ``// 1``, possibly meaning ``// 2``). Both are kept as-is so the
    # rendered frames, and therefore the scores, do not change.
    font_scale = 0.9
    text_size = cv2.getTextSize(label, font, font_scale, 2)[0]
    text_x = center_x - text_size[0]
    text_y = center_y

    rect_start = (text_x - 5, text_y - text_size[1] - 5)
    rect_end = (text_x + text_size[0] + 5, text_y)
    cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
    cv2.putText(frame, label, (text_x, text_y), font, 1, (255, 255, 255), 2)


def scoreCaption(idx, all_captions, all_valid_obj_ids, clip_flant5_score, color_mask = False):
    """Score every caption of one video against its object-annotated frames.

    For each category of the video and each sampled frame, the objects whose
    ids are listed as valid for that category are marked on a copy of the
    frame (outline + numeric label, optionally with a light colour tint), and
    the annotated frame is scored against the frame's caption.

    Reads the module-level globals ``metas``, ``train_dataset`` and ``colors``.

    Args:
        idx: Index into ``metas`` selecting the video.
        all_captions: ``{video_id: {category: {frame_key: caption}}}``. Frame
            keys are strings that convert to an index into the video's
            ``frames`` list.
        all_valid_obj_ids: ``{video_id: {category: [object ids]}}``. Ids are
            compared against ``str(mask_value)``, so they are presumably
            stored as strings -- TODO confirm against the JSON file.
        clip_flant5_score: Callable invoked as
            ``clip_flant5_score(images=[PIL.Image], texts=[caption])``.
        color_mask: When false, outline and label every contour of each valid
            object; when true, tint the object and label only its largest
            contour.

    Returns:
        ``(video_id, result)`` where ``result[category][frame_key]`` is
        ``{"caption": ..., "score": ...}``. ``score`` is whatever the scorer
        returned, or ``None`` when the frame has no caption.
    """
    # ``font`` was never defined anywhere the original function could see it,
    # so the first label drawn raised NameError.
    font = cv2.FONT_HERSHEY_SIMPLEX

    vid_meta = metas[idx]
    vid_id = vid_meta['video']
    frames = vid_meta['frames']

    vid_captions = all_captions[vid_id]
    if not vid_captions:
        # No categories at all: nothing to score.
        return vid_id, {}

    # The sampled frame keys are taken from the first category and reused for all.
    first_cat = next(iter(vid_captions))
    sampled_frames = list(vid_captions[first_cat])

    # Load each sampled frame and its annotation mask once; they are copied per category.
    img_root = str(train_dataset.img_folder)
    imgs = []
    masks = []
    for frame_indx in sampled_frames:
        frame_name = frames[int(frame_indx)]
        img_path = os.path.join(img_root, 'JPEGImages', vid_id, frame_name + '.jpg')
        mask_path = os.path.join(img_root, 'Annotations', vid_id, frame_name + '.png')
        with Image.open(img_path) as img_file:
            imgs.append(np.array(img_file.convert('RGB')))
        with Image.open(mask_path) as mask_file:
            masks.append(np.array(mask_file.convert('P')))

    draw_object = _tint_and_label if color_mask else _label_contours
    video_valid_ids = all_valid_obj_ids.get(vid_id, {})

    vid_result = {}
    # Iterate the dict directly (not a set) so the output key order is deterministic.
    for cat, cat_captions in vid_captions.items():
        valid_obj_ids = video_valid_ids.get(cat, [])
        cat_result = {}

        for frame_key, img, mask in zip(sampled_frames, imgs, masks):
            frame = img.copy()

            # Non-zero mask values are object ids; 0 is background.
            obj_ids = [int(v) for v in np.unique(mask) if v != 0]
            for j, obj_id in enumerate(obj_ids):
                label = str(obj_id)
                if label not in valid_obj_ids:
                    continue
                obj_mask = (mask == obj_id).astype(np.uint8)
                # Wrap around the palette so frames with many objects do not raise IndexError.
                color = colors[j % len(colors)]
                draw_object(frame, obj_mask, label, color, font)

            # A category may lack a caption for a frame sampled from the first category.
            frame_caption = cat_captions.get(frame_key)
            if frame_caption:
                score = clip_flant5_score(images=[Image.fromarray(frame)], texts=[frame_caption])
            else:
                score = None

            cat_result[frame_key] = {
                "caption" : frame_caption,
                "score" : score
            }

        vid_result[cat] = cat_result

    return vid_id, vid_result
if __name__ == '__main__':
    # Script entry point: score the GPT-4o captions of the first few training
    # videos and write the results to a JSON file.
    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
    args = parser.parse_args()

    # scoreCaption reads `train_dataset`, `metas` and `colors` as module-level
    # globals, so they must keep exactly these names.
    train_dataset = build_ytvos_ref(image_set = 'train', args = args)
    metas = train_dataset.metas

    with open('mbench/numbered_captions_gpt-4o_final.json', 'r') as file:
        all_captions = json.load(file)

    with open('mbench/numbered_valid_obj_ids_gpt-4o_final.json', 'r') as file:
        all_valid_obj_ids = json.load(file)

    # One colour per object slot in a frame.
    colors = [
        (255, 0, 0),
        (0, 255, 0),
        (0, 0, 255),
        (255, 255, 0),
        (255, 0, 255),
        (0, 255, 255),
        (128, 0, 128),
        (255, 165, 0),
    ]

    clip_flant5_score = t2v_metrics.VQAScore(model='clip-flant5-xxl')

    def _to_jsonable(value):
        """Convert array-like scores to plain lists for json.

        The scorer presumably returns a tensor; anything exposing ``tolist()``
        is converted, everything else is rejected as json itself would.
        """
        if hasattr(value, 'tolist'):
            return value.tolist()
        raise TypeError(f'Object of type {type(value).__name__} is not JSON serializable')

    # Only the first five videos are scored; never index past the end of `metas`.
    num_videos = min(5, len(metas))

    all_scores = {}
    for i in range(num_videos):
        vid_id, vid_result = scoreCaption(i, all_captions, all_valid_obj_ids, clip_flant5_score, False)
        all_scores[vid_id] = vid_result

    with open('mbench/numbered_captions_gpt-4o_final_scores.json', 'w', encoding='utf-8') as json_file:
        # The original call omitted the file object, which raises TypeError
        # and leaves an empty output file behind.
        json.dump(all_scores, json_file, indent=4, ensure_ascii=False, default=_to_jsonable)

    # Korean for "The JSON file was saved successfully!" (restored from mojibake).
    print("JSON 파일이 성공적으로 저장되었습니다!")