File size: 8,642 Bytes

2c58401

import sys
import os
import argparse
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

import opts

import numpy as np
import cv2
from PIL import Image
import json

from mbench.ytvos_ref import build as build_ytvos_ref
import t2v_metrics

import matplotlib.pyplot as plt
import textwrap


def _contour_centroid(contour):
    """Return the integer (x, y) centroid of *contour*, or None when its zeroth moment is 0."""
    moments = cv2.moments(contour)
    if moments["m00"] == 0:
        return None
    return int(moments["m10"] / moments["m00"]), int(moments["m01"] / moments["m00"])


def _draw_outlined_object(frame, obj_mask, label, color):
    """Draw the object's external contours on *frame* and put *label* at every contour center."""
    contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cv2.drawContours(frame, contours, -1, color, 3)

    font = cv2.FONT_HERSHEY_SIMPLEX
    text_w, text_h = cv2.getTextSize(label, font, 1, 2)[0]

    for contour in contours:
        center = _contour_centroid(contour)
        if center is None:
            # Centroid is undefined for a zero-moment contour: fall back to its
            # first point. Cast to plain ints so OpenCV receives Python ints
            # rather than numpy scalars in the point tuples below.
            first_x, first_y = contour[0][0]
            center = (int(first_x), int(first_y))
        cx, cy = center

        # Filled black box behind the label, then the label in white.
        cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
                      (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
        cv2.putText(frame, label, (cx - text_w // 2, cy + text_h // 2),
                    font, 1, (255, 255, 255), 2)


def _draw_tinted_object(frame, obj_mask, label, color):
    """Tint the object's pixels on *frame*, outline it, and label its largest contour."""
    alpha = 0.08
    region = obj_mask == 1
    # Alpha-blend the palette color into the object's pixels; the float result
    # is cast back to the frame dtype on assignment.
    frame[region] = (1 - alpha) * frame[region] + alpha * np.array(color, dtype=frame.dtype)

    contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cv2.drawContours(frame, contours, -1, color, 2)

    if len(contours) == 0:
        # No contour means there is nowhere to anchor the label; skip it
        # instead of reusing coordinates from a previously drawn object.
        return

    largest_contour = max(contours, key=cv2.contourArea)
    center = _contour_centroid(largest_contour)
    center_x, center_y = center if center is not None else (0, 0)

    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 0.9
    text_size = cv2.getTextSize(label, font, font_scale, 2)[0]
    # NOTE(review): the original computed `center_x - text_size[0] // 1`, i.e.
    # the label ends at the centroid rather than being centered on it (which
    # would be `// 2`). Kept as-is because it changes the rendered image fed
    # to the scorer; confirm the intended placement.
    text_x = center_x - text_size[0]
    text_y = center_y

    # Background rectangle behind the label.
    rect_start = (text_x - 5, text_y - text_size[1] - 5)
    rect_end = (text_x + text_size[0] + 5, text_y)

    cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
    # NOTE(review): text is measured at font_scale 0.9 but drawn at scale 1,
    # as in the original.
    cv2.putText(frame, label, (text_x, text_y), font, 1, (255, 255, 255), 2)


def scoreCaption(idx, all_captions, all_valid_obj_ids, clip_flant5_score, color_mask = False):
    """Score every per-frame caption of one video with a VQA scorer.

    For each category of the video and each sampled frame, the valid objects
    of that category are marked on a copy of the frame (numbered contours, or
    a tinted mask when ``color_mask`` is true) and the marked frame is scored
    against the frame caption.

    Relies on the module-level names ``metas``, ``train_dataset`` and
    ``colors`` being defined before the call.

    Args:
        idx: Index into ``metas`` selecting the video.
        all_captions: Mapping ``vid_id -> category -> frame index -> caption``.
            The sampled frame indices are taken from the first category.
        all_valid_obj_ids: Mapping ``vid_id -> category -> valid object ids``.
            Ids are compared against the mask ids as strings
            (presumably stored as strings in the JSON; verify against the data).
        clip_flant5_score: Callable invoked as
            ``clip_flant5_score(images=[PIL image], texts=[caption])``.
        color_mask: When true, tint objects and label only the largest contour;
            otherwise draw contours and label every contour.

    Returns:
        ``(vid_id, vid_result)`` where ``vid_result[cat][frame_index]`` is
        ``{"caption": ..., "score": ...}``. ``score`` is whatever the scorer
        returned, or ``None`` when the caption is empty.

    Raises:
        KeyError: If ``vid_id`` is missing from either mapping, or a category
            has no entry for one of the sampled frames.
    """
    vid_meta = metas[idx]
    vid_id = vid_meta['video']
    frames = vid_meta['frames']

    vid_captions = all_captions[vid_id]
    if not vid_captions:
        # No categories were captioned for this video: nothing to score.
        return vid_id, {}

    first_cat = next(iter(vid_captions))
    sampled_frames = list(vid_captions[first_cat].keys())

    # Load every sampled frame and its annotation mask once, up front.
    imgs = []
    masks = []
    img_root = str(train_dataset.img_folder)
    for frame_indx in sampled_frames:
        frame_name = frames[int(frame_indx)]
        img_path = os.path.join(img_root, 'JPEGImages', vid_id, frame_name + '.jpg')
        mask_path = os.path.join(img_root, 'Annotations', vid_id, frame_name + '.png')
        imgs.append(np.array(Image.open(img_path).convert('RGB')))
        masks.append(np.array(Image.open(mask_path).convert('P')))

    draw_object = _draw_tinted_object if color_mask else _draw_outlined_object
    vid_valid_obj_ids = all_valid_obj_ids[vid_id]

    vid_result = {}

    # Iterate the dict directly (not a set) so the output order is deterministic.
    for cat, cat_captions in vid_captions.items():
        valid_obj_ids = vid_valid_obj_ids.get(cat, [])
        cat_result = {}

        for frame_name, img, mask in zip(sampled_frames, imgs, masks):
            frame = img.copy()

            # Object ids present in this frame, background (0) excluded.
            frame_obj_ids = [str(obj_id) for obj_id in np.unique(mask).astype(int) if obj_id != 0]

            for j, obj_id in enumerate(frame_obj_ids):
                if obj_id not in valid_obj_ids:
                    continue
                obj_mask = (mask == int(obj_id)).astype(np.uint8)
                # Cycle through the palette so frames with more objects than
                # colors do not raise IndexError.
                draw_object(frame, obj_mask, obj_id, colors[j % len(colors)])

            frame_caption = cat_captions[frame_name]
            if frame_caption:
                # Calculate the VQA score of the marked frame against its caption.
                score = clip_flant5_score(images=[Image.fromarray(frame)], texts=[frame_caption])
            else:
                score = None

            cat_result[frame_name] = {
                "caption" : frame_caption,
                "score" : score
            }

        vid_result[cat] = cat_result

    return vid_id, vid_result

            

if __name__ == '__main__':
    # Entry point: score the captions of the first few training videos and
    # write the results to a JSON file.
    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
    args = parser.parse_args()

    #================== Load data ===================
    # Full training dataset
    train_dataset = build_ytvos_ref(image_set = 'train', args = args)

    # Metadata for the full dataset
    metas = train_dataset.metas

    # Caption data
    with open('mbench/numbered_captions_gpt-4o_final.json', 'r', encoding='utf-8') as file:
        all_captions = json.load(file)

    # Valid object id data
    with open('mbench/numbered_valid_obj_ids_gpt-4o_final.json', 'r', encoding='utf-8') as file:
        all_valid_obj_ids = json.load(file)

    # Eight candidate colors (RGB)
    colors = [
        (255, 0, 0),    # Red
        (0, 255, 0),    # Green
        (0, 0, 255),    # Blue
        (255, 255, 0),  # Yellow
        (255, 0, 255),  # Magenta
        (0, 255, 255),  # Cyan
        (128, 0, 128),  # Purple
        (255, 165, 0)   # Orange
    ]

    #================== Load the VQA score model ===================
    clip_flant5_score = t2v_metrics.VQAScore(model='clip-flant5-xxl')

    #================== Compute the VQA scores ===================
    # Only the first few videos are scored; never index past the dataset end.
    num_videos = min(5, len(metas))
    all_scores = {}
    for i in range(num_videos):
        vid_id, vid_result = scoreCaption(i, all_captions, all_valid_obj_ids, clip_flant5_score, False)
        all_scores[vid_id] = vid_result

    def _to_jsonable(obj):
        """Convert array-like score objects to plain lists for json.dump."""
        # Scores are presumably tensors/arrays exposing ``tolist()`` - verify
        # against the scorer's return type. Anything else is a genuine error.
        if hasattr(obj, 'tolist'):
            return obj.tolist()
        raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")

    with open('mbench/numbered_captions_gpt-4o_final_scores.json', 'w', encoding='utf-8') as json_file:
        # The file object is required; without it json.dump raises TypeError
        # and nothing is written.
        json.dump(all_scores, json_file, indent=4, ensure_ascii=False, default=_to_jsonable)

    print("JSON 파일이 성공적으로 저장되었습니다!")