# VRIS_vip/make_ref-ytvos/annotate_ref_ytvos.py
from datasets import build_dataset
import argparse
import opts
import sys
from pathlib import Path
from os import path as osp
import numpy as np
import pandas as pd
import regex as re
import json
import cv2
from PIL import Image
import torch
from torchvision.transforms import functional as F
from skimage import io, measure # (pip install scikit-image)
from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
import matplotlib.pyplot as plt
from matplotlib.collections import PatchCollection
from matplotlib.patches import Polygon as MplPolygon, Rectangle
import ipywidgets as widgets
from IPython.display import display, clear_output
parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
args = parser.parse_args()
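# A hedged example invocation; the exact flags come from opts.get_args_parser(),
# and --ytvos_path here is assumed from the ReferFormer arg parser:
# python annotate_ref_ytvos.py --dataset_file ytvos --ytvos_path /path/to/ref-youtube-vos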
#================== Load data ===================
# Full dataset
train_dataset = build_dataset('ytvos', image_set='train', args=args)
# Metadata for the full dataset
metas = train_dataset.metas
# Frames selected by the earlier filtering step
selected_frames_df = pd.read_json("selected_frames4.jsonl", lines=True)
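# A hedged sketch of the row schema this script assumes for selected_frames4.jsonl,
# inferred from the field accesses in create_dict_from_selected_images below
# (the concrete values are hypothetical):
# {"video": "abcdef1234", "frame_id": 25,
#  "objects": {"0": ["a penguin walking on the sand", "walking"]}}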
#================== Mask-creation helpers ===================
def prepare_mask_for_pil(mask_tensor):
    # Convert a (1, H, W) {0, 1} torch mask into an 8-bit PIL image (0 / 255)
    mask_array = mask_tensor.squeeze(0).cpu().numpy()
    mask_array = (mask_array * 255).astype(np.uint8)
    mask_image = Image.fromarray(mask_array)
    return mask_image
def create_sub_masks(mask_image):
    width, height = mask_image.size
    sub_masks = {}
    for x in range(width):
        for y in range(height):
            # Get the value of the pixel (a single int, since the mask is grayscale)
            pixel = mask_image.getpixel((x, y))
            # If the pixel is not black...
            if pixel != 0:
                # Check to see if we've created a sub-mask...
                pixel_str = str(pixel)
                sub_mask = sub_masks.get(pixel_str)
                if sub_mask is None:
                    # Create a sub-mask (one bit per pixel) and add to the dictionary
                    # Note: we add 1 pixel of padding in each direction
                    # because the contours module doesn't handle cases
                    # where pixels bleed to the edge of the image
                    sub_masks[pixel_str] = Image.new('1', (width + 2, height + 2))
                # Set the pixel value to 1 (default is 0), accounting for padding
                sub_masks[pixel_str].putpixel((x + 1, y + 1), 1)
    return sub_masks
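# A minimal sanity check (hedged): the masks produced by prepare_mask_for_pil
# are binary with foreground 255, so create_sub_masks should return a single
# sub-mask keyed by '255', padded by one pixel on every side.
# toy = Image.fromarray(np.array([[0, 255], [255, 255]], dtype=np.uint8))
# sub = create_sub_masks(toy)
# assert list(sub.keys()) == ['255'] and sub['255'].size == (4, 4)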
#================== Mask-annotation builder ===================
def create_sub_mask_annotation(sub_mask, image_id, annotation_id, is_crowd):
    # Find contours (boundary lines) around each sub-mask
    # Note: there could be multiple contours if the object
    # is partially occluded. (E.g. an elephant behind a tree)
    contours = measure.find_contours(sub_mask, 0.5, positive_orientation='low')
    segmentations = []
    polygons = []
    for contour in contours:
        # Flip from (row, col) representation to (x, y)
        # and subtract the padding pixel
        for i in range(len(contour)):
            row, col = contour[i]
            contour[i] = (col - 1, row - 1)
        # Make a polygon and simplify it
        poly = Polygon(contour)
        poly = poly.simplify(1.0, preserve_topology=False)
        if poly.is_empty:
            # Simplification can collapse tiny contours to nothing; skip those
            continue
        polygons.append(poly)
        segmentation = np.array(poly.exterior.coords).ravel().tolist()
        segmentations.append(segmentation)
    # Combine the polygons to calculate the bounding box and area
    multi_poly = MultiPolygon(polygons)
    x, y, max_x, max_y = multi_poly.bounds
    width = max_x - x
    height = max_y - y
    bbox = (x, y, width, height)
    area = multi_poly.area
    annotation = {
        'segmentation': segmentations,
        'iscrowd': is_crowd,
        'image_id': image_id,
        'id': annotation_id,
        'bbox': bbox,
        'area': area
    }
    return annotation
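# A hedged usage sketch on a hand-made mask (note: the 1.0 simplification
# tolerance is coarse at toy scale, so expect only approximate numbers;
# a 5x5 block of ones should give an area in the neighborhood of 25):
# toy = np.zeros((10, 10), dtype=np.uint8); toy[3:8, 3:8] = 1
# ann = create_sub_mask_annotation(toy, image_id=0, annotation_id=0, is_crowd=0)
# print(ann['bbox'], ann['area'])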
#================== Visualization helper ===================
# Takes an annotation dictionary as input
def showRef(annotation, image_dir, seg_box='seg'):
    ax = plt.gca()
    I = io.imread(osp.join(image_dir, annotation['file_name']))
    ax.imshow(I)
    for sid, sent in enumerate(annotation['sentences']):
        print('%s. %s' % (sid + 1, sent))
    if seg_box == 'seg':
        polygons = []
        color = []
        c = (np.random.random((1, 3)) * 0.6 + 0.4).tolist()[0]
        if type(annotation['segmentation'][0]) == list:
            # polygon used for refcoco*
            for seg in annotation['segmentation']:
                poly = np.array(seg).reshape((int(len(seg) / 2), 2))
                polygons.append(MplPolygon(poly))
                color.append(c)
            p = PatchCollection(polygons,
                                facecolors=(221/255, 160/255, 221/255),  # plum (light purple)
                                linewidths=0,
                                alpha=0.4)
            ax.add_collection(p)
            p = PatchCollection(polygons,
                                facecolors='none',
                                edgecolors=color,
                                linewidths=2)
            ax.add_collection(p)
        # else:
        #     # mask used for refclef
        #     rle = annotation['segmentation']
        #     m = mask.decode(rle)
        #     img = np.ones((m.shape[0], m.shape[1], 3))
        #     color_mask = np.array([2.0, 166.0, 101.0]) / 255
        #     for i in range(3):
        #         img[:, :, i] = color_mask[i]
        #     ax.imshow(np.dstack((img, m * 0.5)))
    # bounding box
    elif seg_box == 'box':
        bbox = annotation['bbox']
        box_plot = Rectangle((bbox[0], bbox[1]),
                             bbox[2],
                             bbox[3],
                             fill=False,
                             edgecolor='green',
                             linewidth=3)
        ax.add_patch(box_plot)
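# A hedged usage sketch (assumes the annotation already carries the
# 'file_name' and 'sentences' fields added in the main routine below):
# plt.figure()
# showRef(annotation, "/home/yejin/data/data/dataset/VRIS/mbench/ytvos/selected_frames")
# plt.axis('off')
# plt.show()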
#================== Main routine tying everything together ===================
def create_dict_from_selected_images(selected_frames_df):
    image_id = 0
    anno_id = 0
    train_idx = 0
    with open("/home/yejin/data/data/dataset/VRIS/mbench/ytvos/selected_instances2.jsonl", "w") as f:
        for selected_idx in range(len(selected_frames_df)):
            selected = selected_frames_df.loc[selected_idx]
            selected_vid_id = selected['video']
            selected_frame_id = selected['frame_id']
            for obj_id in selected['objects'].keys():
                selected_exp = selected['objects'][obj_id][0]  # caption
                selected_verb = selected['objects'][obj_id][1]  # verb
                # Locate the matching sample in the training set
                train_idx = next(
                    idx for idx, meta in enumerate(metas)
                    if meta['video'] == selected_vid_id
                    and meta['frame_id'] == selected_frame_id
                    and meta['obj_id'] == int(obj_id)
                    and meta['exp'] == selected_exp
                )
                train_frames, train_info = train_dataset[train_idx]
                try:
                    valid_frame_loc = train_info['frames_idx'].tolist().index(selected_frame_id)  # index of the valid frame
                except ValueError:
                    print(f"selected vid id: {selected_vid_id}, metas['frame_id']: {metas[train_idx]['frame_id']}, selected frame id: {selected_frame_id}, train_info['frames_idx']: {train_info['frames_idx'].tolist()}")
                    continue  # skip this object; valid_frame_loc is undefined otherwise
                frame = train_frames[valid_frame_loc]  # the selected frame
                frame = F.to_pil_image(frame)
                image_file_name = f"{selected_vid_id}_{str(selected_frame_id).rjust(5, '0')}"
                # Save the original frame
                save_dir = Path("/home/yejin/data/data/dataset/VRIS/mbench/ytvos/selected_frames")
                # save_dir.mkdir(exist_ok=True)
                save_path = save_dir / f"{image_file_name}.png"
                # frame.save(save_path)
                # Category
                label = train_info['labels'][valid_frame_loc].item()  # category id
                category_name = metas[train_idx]['category']  # category name
                # Box info
                box = train_info['boxes'][valid_frame_loc]
                # Annotation tools ########################################################################
                mask = train_info['masks'][valid_frame_loc]
                # print(mask.shape)
                # Sanity check that the frame and mask line up
                # plt.imshow(frame.permute(1, 2, 0))
                # mask_color = np.zeros((*mask.shape, 3), dtype=np.uint8)
                # mask_color[mask == 1] = [255, 0, 0]
                # plt.imshow(mask_color, alpha=0.5)
                # plt.show()
                mask_image = prepare_mask_for_pil(mask)
                sub_masks = create_sub_masks(mask_image)
                # The mask is binary, so this loop runs exactly once
                for color, sub_mask in sub_masks.items():
                    # print(f"Color: {color}, Sub-mask size: {sub_mask.size}")
                    sub_mask_array = np.array(sub_mask, dtype=np.uint8)
                    annotation = create_sub_mask_annotation(sub_mask_array, image_id, anno_id, is_crowd=0)
                    anno_id += 1
                image_id += 1
                # Add the file path
                annotation['file_name'] = f"{image_file_name}.png"
                # Drop fields we don't need
                annotation.pop('iscrowd', None)
                annotation.pop('image_id', None)
                annotation.pop('id', None)
                valid = train_info['valid'][valid_frame_loc]
                orig_size = train_info['orig_size']
                size = train_info['size']
                caption = metas[train_idx]['exp']
                # Add filename, height, width
                # annotation['file_name'] = save_path
                annotation['height'] = orig_size[0].item()
                annotation['width'] = orig_size[1].item()
                # Add category id, category name, and the sentence dictionary
                annotation['label'] = label
                annotation['category_name'] = category_name
                sentence_dict = {
                    "tokens": caption.split(' '),
                    "raw": caption,
                    "sent": re.sub(r'[^A-Za-z0-9\s]+', '', caption.lower())
                }
                annotation['sentences'] = sentence_dict
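                # A hedged example of the resulting sentence_dict for the
                # hypothetical caption "A man's bike, moving fast.":
                # {"tokens": ["A", "man's", "bike,", "moving", "fast."],
                #  "raw": "A man's bike, moving fast.",
                #  "sent": "a mans bike moving fast"}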
                ############################################################################################
                # Double-check the segmentation annotation
                # orig_img_np = draw_polygon_on_image(frame, annotation['segmentation'])
                # plt.imshow(orig_img_np)
                # plt.axis('off')
                # plt.show()
                # showRef(annotation, save_dir)
                ############################################################################################
                # Final write
                f.write(json.dumps(annotation) + "\n")
                f.flush()
if __name__ == '__main__':
    create_dict_from_selected_images(selected_frames_df)