import sys
import os
import argparse

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

import opts

import numpy as np
import cv2
from PIL import Image
import json

from mbench.ytvos_ref import build as build_ytvos_ref
import t2v_metrics

import matplotlib.pyplot as plt
import textwrap


def scoreCaption(idx, all_captions, all_valid_obj_ids, clip_flant5_score, color_mask=False):
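    """Score every category caption of video `idx` with VQAScore.

    Relies on the module-level `metas`, `train_dataset`, and `colors` defined in
    the __main__ block. Each sampled frame is annotated with its valid object ids
    (contours plus id labels, or a translucent fill when `color_mask` is True)
    before being scored against the caption for that frame.

    Returns the video id and a dict of the form
    {category: {frame_name: {"caption": ..., "score": ...}}}.
    """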
    vid_meta = metas[idx]
    vid_id = vid_meta['video']
    frames = vid_meta['frames']

    # Sampled frame indices are taken from the first category (the code assumes all categories share them).
    first_cat = list(all_captions[vid_id].keys())[0]
    sampled_frames = list(all_captions[vid_id][first_cat].keys())
    imgs = []
    masks = []
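    # Load the sampled RGB frames and their palette-mode annotation masks.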
    for frame_indx in sampled_frames:
        frame_name = frames[int(frame_indx)]
        img_path = os.path.join(str(train_dataset.img_folder), 'JPEGImages', vid_id, frame_name + '.jpg')
        mask_path = os.path.join(str(train_dataset.img_folder), 'Annotations', vid_id, frame_name + '.png')
        img = Image.open(img_path).convert('RGB')
        imgs.append(img)
        mask = Image.open(mask_path).convert('P')
        mask = np.array(mask)
        masks.append(mask)
    vid_captions = all_captions[vid_id]
    cat_names = set(vid_captions.keys())

    vid_result = {}
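    # Score every caption category of this video, frame by frame.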
    for cat in cat_names:
        cat_captions = vid_captions[cat]
        cat_result = {}

        for i in range(len(imgs)):
            frame_name = sampled_frames[i]
            frame = np.copy(np.array(imgs[i]))
            frame_for_contour = np.copy(np.array(imgs[i]))

            mask = masks[i]

            all_obj_ids = np.unique(mask).astype(int)
            all_obj_ids = [str(obj_id) for obj_id in all_obj_ids if obj_id != 0]

            if cat in all_valid_obj_ids[vid_id]:
                valid_obj_ids = all_valid_obj_ids[vid_id][cat]
            else:
                valid_obj_ids = []
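            # Draw contours and id labels for the objects that belong to this category.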
            for j in range(len(all_obj_ids)):
                obj_id = all_obj_ids[j]
                obj_mask = (mask == int(obj_id)).astype(np.uint8)

                if obj_id in valid_obj_ids:
                    if not color_mask:
                        contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
                        cv2.drawContours(frame, contours, -1, colors[j], 3)
                        for contour in contours:
                            moments = cv2.moments(contour)
                            if moments["m00"] != 0:
                                cx = int(moments["m10"] / moments["m00"])
                                cy = int(moments["m01"] / moments["m00"])
                            else:
                                cx, cy = contour[0][0]

                            font = cv2.FONT_HERSHEY_SIMPLEX
                            text = obj_id
                            text_size = cv2.getTextSize(text, font, 1, 2)[0]
                            text_w, text_h = text_size

                            cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
                                          (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
                            cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
                                        font, 1, (255, 255, 255), 2)
                    else:
                        # Blend a translucent color fill over the object region.
                        alpha = 0.08
                        colored_obj_mask = np.zeros_like(frame)
                        colored_obj_mask[obj_mask == 1] = colors[j]
                        frame[obj_mask == 1] = (
                            (1 - alpha) * frame[obj_mask == 1]
                            + alpha * colored_obj_mask[obj_mask == 1]
                        )

                        contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
                        cv2.drawContours(frame, contours, -1, colors[j], 2)
                        cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)

                        if len(contours) > 0:
                            largest_contour = max(contours, key=cv2.contourArea)
                            M = cv2.moments(largest_contour)
                            if M["m00"] != 0:
                                center_x = int(M["m10"] / M["m00"])
                                center_y = int(M["m01"] / M["m00"])
                            else:
                                center_x, center_y = 0, 0

                            font = cv2.FONT_HERSHEY_SIMPLEX
                            text = obj_id
                            font_scale = 0.9
                            text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
                            text_x = center_x - text_size[0]
                            text_y = center_y

                            rect_start = (text_x - 5, text_y - text_size[1] - 5)
                            rect_end = (text_x + text_size[0] + 5, text_y)

                            cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
                            cv2.putText(frame, text, (text_x, text_y), font, font_scale, (255, 255, 255), 2)
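            # Score the annotated frame against this frame's caption; frames without a caption get no score.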
            frame_caption = cat_captions[frame_name]
            if frame_caption:
                frame = Image.fromarray(frame)
                score = clip_flant5_score(images=[frame], texts=[frame_caption])
                # Cast to a plain Python float so the score can be serialized to JSON below.
                score = float(score)
            else:
                score = None

            cat_result[frame_name] = {
                "caption": frame_caption,
                "score": score
            }
        vid_result[cat] = cat_result

    return vid_id, vid_result


if __name__ == '__main__':
    parser = argparse.ArgumentParser('Caption scoring script', parents=[opts.get_args_parser()])
    args = parser.parse_args()
    train_dataset = build_ytvos_ref(image_set='train', args=args)
    metas = train_dataset.metas
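    # Numbered captions and per-category valid object ids (generated with GPT-4o, per the file names).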
    with open('mbench/numbered_captions_gpt-4o_final.json', 'r') as file:
        all_captions = json.load(file)

    with open('mbench/numbered_valid_obj_ids_gpt-4o_final.json', 'r') as file:
        all_valid_obj_ids = json.load(file)
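    # Distinct colors for drawing up to eight objects per frame.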
    colors = [
        (255, 0, 0),
        (0, 255, 0),
        (0, 0, 255),
        (255, 255, 0),
        (255, 0, 255),
        (0, 255, 255),
        (128, 0, 128),
        (255, 165, 0)
    ]
    clip_flant5_score = t2v_metrics.VQAScore(model='clip-flant5-xxl')

    all_scores = {}
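    # Score only the first five videos in this run.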
    for i in range(5):
        vid_id, vid_result = scoreCaption(i, all_captions, all_valid_obj_ids, clip_flant5_score, False)
        all_scores[vid_id] = vid_result
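    # Write the per-video, per-category scores to disk.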
    with open('mbench/numbered_captions_gpt-4o_final_scores.json', 'w', encoding='utf-8') as json_file:
        json.dump(all_scores, json_file, indent=4, ensure_ascii=False)

    print("The JSON file was saved successfully!")