diff --git "a/LAVT-RIS/angle_vis.ipynb" "b/LAVT-RIS/angle_vis.ipynb" new file mode 100644--- /dev/null +++ "b/LAVT-RIS/angle_vis.ipynb" @@ -0,0 +1,802 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting easydict\n", + " Downloading easydict-1.13-py3-none-any.whl.metadata (4.2 kB)\n", + "Downloading easydict-1.13-py3-none-any.whl (6.8 kB)\n", + "Installing collected packages: easydict\n", + "Successfully installed easydict-1.13\n" + ] + } + ], + "source": [ + "!pip install easydict" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'utils.dataset'; 'utils' is not a package", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[3], line 23\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39measydict\u001b[39;00m \u001b[39mimport\u001b[39;00m EasyDict\n\u001b[1;32m 21\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mfunctools\u001b[39;00m \u001b[39mimport\u001b[39;00m partial\n\u001b[0;32m---> 23\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mengine\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mengine\u001b[39;00m \u001b[39mimport\u001b[39;00m train, validate \n\u001b[1;32m 24\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mutils\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mdataset\u001b[39;00m \u001b[39mimport\u001b[39;00m RefDataset \u001b[39mas\u001b[39;00m origDataset\n\u001b[1;32m 25\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mutils\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39msimple_tokenizer\u001b[39;00m \u001b[39mimport\u001b[39;00m SimpleTokenizer \u001b[39mas\u001b[39;00m _Tokenizer\n", + "File \u001b[0;32m/data2/projects/chaeyun/VerbCentric_RIS/engine/engine.py:14\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mwandb\u001b[39;00m\n\u001b[1;32m 13\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mloguru\u001b[39;00m \u001b[39mimport\u001b[39;00m logger\n\u001b[0;32m---> 14\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mutils\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mdataset\u001b[39;00m \u001b[39mimport\u001b[39;00m tokenize\n\u001b[1;32m 15\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mutils\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mmisc\u001b[39;00m \u001b[39mimport\u001b[39;00m (AverageMeter, ProgressMeter, concat_all_gather,\n\u001b[1;32m 16\u001b[0m trainMetricGPU)\n\u001b[1;32m 19\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mreturn_mask\u001b[39m(emb_distance):\n", + "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'utils.dataset'; 'utils' is not a package" + ] + } + ], + "source": [ + "\n", + "import os\n", + "import sys\n", + "import cv2\n", + "import json\n", + "import time\n", + "import math\n", + "from tqdm import tqdm\n", + "\n", + "import torch\n", + "import torch.utils.data as data\n", + "import torch.nn.functional as F\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import argparse\n", + "\n", + "HOME_ = '/data2/projects/chaeyun/VerbCentric_RIS'\n", + "sys.path.append(HOME_)\n", + "\n", + "\n", + "from easydict import EasyDict\n", + "from functools import partial\n", + "\n", + "from engine.engine import train, validate \n", + "from utils.dataset import RefDataset as origDataset\n", + "from utils.simple_tokenizer import SimpleTokenizer as _Tokenizer\n", + "\n", + "train_set_pth = '/data2/projects/chaeyun/VerbCentric_RIS/datasets/anns/refcocog_u/train.json'\n", + "val_set_pth = '/data2/projects/chaeyun/VerbCentric_RIS/datasets/anns/refcocog_u/val.json'\n", + "test_set_pth = '/data2/projects/chaeyun/VerbCentric_RIS/datasets/anns/refcocog_u/test.json'\n", + "mask_root = '/data2/projects/chaeyun/VerbCentric_RIS/datasets/masks/refcocog_u'\n", + "\n", + "# ORIGINAL MODEL CONFIG AND WEIGHTS\n", + "orig_config_path = '/data2/projects/chaeyun/VerbCentric_RIS/config/cris_r50.yaml'\n", + "orig_model_path = '/data2/projects/chaeyun/VerbCentric_RIS/exp/CRIS_R50/best_model_miou.pth'\n", + "\n", + "# NEW MODEL CONFIG AND WEIGHTS\n", + "new_config_path = '/data2/projects/chaeyun/VerbCentric_RIS/config/cris_verbonly_b64_nopos.yaml'\n", + "exo_name = 'ACE_hp10_m08_tmp005_b64_v1'\n", + "exo_name2 = 'ACE_hp10_m10_tmp005_b64_v1'\n", + "exo_name3 = 'ACE_hp10_m15_tmp005_b64_v1'\n", + "\n", + "# /data2/projects/chaeyun/VerbCentric_RIS/exp/ACE_hp10_m08_tmp005_b64_v1/\n", + "new_model_path = f'/data2/projects/chaeyun/VerbCentric_RIS/exp/{exo_name}/best_model_miou.pth'\n", + "new_model_path2=f'/data2/projects/chaeyun/VerbCentric_RIS/exp/{exo_name2}/best_model_miou.pth'\n", + "new_model_path3=f'/data2/projects/chaeyun/VerbCentric_RIS/exp/{exo_name3}/best_model_miou.pth'\n", + "\n", + "# IMAGE MIN AND STD \n", + "img_mean = np.array([0.48145466, 0.4578275,0.40821073]).reshape(1, 1, 3)\n", + "img_std = np.array([0.26862954, 0.26130258,0.27577711]).reshape(1, 1, 3)\n", + "\n", + "\n", + "os.environ['CUDA_VISIBLE_DEVICES'] = '3'" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'/data2/projects/chaeyun/LAVT-RIS'" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pwd" + ] + }, + { + "cell_type": "code", + "execution_count": 128, + "metadata": {}, + "outputs": [], + "source": [ + "import utils.config as config\n", + "from utils.dataset_verbonly import RefDataset\n", + "from utils.misc import (init_random_seed, set_random_seed, setup_logger,\n", + " worker_init_fn)\n", + "\n", + "from model_ import build_segmenter_original, build_segmenter_pos_rev2\n", + "from model_.clip import build_model\n", + "\n", + "from .segmenter import CRIS\n", + "from .segmenter_verbonly import CRIS_PosOnly\n", + "from .segmenter_verbonly_fin import CRIS_PosOnly_rev\n", + "from .segmenter_verbonly_ver3 import CRIS_PosOnly_ver3\n" + ] + }, + { + "cell_type": "code", + "execution_count": 129, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'dataset': 'refcocog_u', 'train_lmdb': '/home/seunghoon/research/VerbCentric_RIS/datasets/lmdb/refcocog_u/train.lmdb', 'train_split': 'train', 'val_lmdb': '/home/seunghoon/research/VerbCentric_RIS/datasets/lmdb/refcocog_u/val.lmdb', 'val_split': 'val', 'mask_root': '/home/seunghoon/research/VerbCentric_RIS/datasets/masks/refcocog_u', 'clip_pretrain': '/home/seunghoon/research/VerbCentric_RIS/pretrain/RN50.pt', 'input_size': 416, 'word_len': 22, 'word_dim': 1024, 'vis_dim': 512, 'fpn_in': [512, 1024, 1024], 'fpn_out': [256, 512, 1024], 'sync_bn': True, 'freeze': True, 'train_text_encoder': False, 'train_visual_encoder': False, 'num_layers': 3, 'num_head': 8, 'dim_ffn': 2048, 'dropout': 0.1, 'intermediate': False, 'workers': 4, 'workers_val': 4, 'epochs': 50, 'milestones': [35], 'start_epoch': 0, 'batch_size': 64, 'batch_size_val': 64, 'base_lr': 0.0001, 'lr_decay': 0.1, 'lr_multi': 0.1, 'weight_decay': 0.0, 'max_norm': 0.0, 'manual_seed': 0, 'print_freq': 100, 'metric_learning': True, 'metric_mode': 'original', 'exclude_multiobj': True, 'exclude_pos': True, 'loss_option': 'ACL_verbonly', 'metric_loss_weight': 0.1, 'hn_prob': 0.0, 'hn_celoss': True, 'margin_value': 20, 'temperature': 0.05, 'exp_name': 'CRIS_AML_verbonly_pos25_b32', 'output_folder': 'exp/refcocog_u/exclude_multiobj', 'save_freq': 1, 'weight': None, 'resume': 'latest', 'evaluate': True, 'dist_url': 'tcp://localhost:7024', 'dist_backend': 'nccl', 'multiprocessing_distributed': True, 'world_size': 1, 'rank': 0, 'test_split': 'val-test', 'test_lmdb': '/data2/projects/seunghoon/VerbRIS/VerbCentric_CY/datasets/lmdb/refcocog_u/val.lmdb', 'visualize': False}\n", + "CLIP FROZEN !!\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[32m2024-12-10 01:09:00.796\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mmodel_\u001b[0m:\u001b[36mbuild_segmenter_pos_rev2\u001b[0m:\u001b[36m19\u001b[0m - \u001b[1mBackbone with decay=325, Head=124\u001b[0m\n" + ] + } + ], + "source": [ + "parser = argparse.ArgumentParser(\n", + " description='Pytorch Referring Expression Segmentation')\n", + "parser.add_argument('--config',\n", + " default=orig_config_path,\n", + " type=str,\n", + " help='config file')\n", + "parser.add_argument('--opts',\n", + " default=None,\n", + " nargs=argparse.REMAINDER,\n", + " help='override some settings in the config.')\n", + "\n", + "args = parser.parse_args([])\n", + "\n", + "cfg = config.load_cfg_from_cfg_file(orig_config_path)\n", + "cfg.metric_learning = True\n", + "args = EasyDict(cfg)\n", + "print(args)\n", + "original_model, original_param_list = build_segmenter_pos_rev2(args)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 130, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'dataset': 'refcocog_u', 'train_lmdb': '/home/seunghoon/research/VerbCentric_RIS/datasets/lmdb/refcocog_u/train.lmdb', 'train_split': 'train', 'val_lmdb': '/home/seunghoon/research/VerbCentric_RIS/datasets/lmdb/refcocog_u/val.lmdb', 'val_split': 'val', 'mask_root': '/home/seunghoon/research/VerbCentric_RIS/datasets/masks/refcocog_u', 'clip_pretrain': '/home/seunghoon/research/VerbCentric_RIS/pretrain/RN50.pt', 'input_size': 416, 'word_len': 22, 'word_dim': 1024, 'vis_dim': 512, 'fpn_in': [512, 1024, 1024], 'fpn_out': [256, 512, 1024], 'sync_bn': True, 'freeze': True, 'train_text_encoder': False, 'train_visual_encoder': False, 'num_layers': 3, 'num_head': 8, 'dim_ffn': 2048, 'dropout': 0.1, 'intermediate': False, 'workers': 0, 'workers_val': 0, 'epochs': 50, 'milestones': [35], 'start_epoch': 0, 'batch_size': 64, 'batch_size_val': 64, 'base_lr': 0.0001, 'textft_lr': 1e-05, 'visft_lr': 0.0, 'lr_decay': 0.1, 'lr_multi': 0.1, 'weight_decay': 0.0, 'max_norm': 0.0, 'manual_seed': 0, 'print_freq': 100, 'metric_learning': True, 'metric_mode': 'hardpos_only_ver3', 'exclude_multiobj': True, 'exclude_pos': True, 'loss_option': 'ranking_embed', 'metric_loss_weight': 0.1, 'hn_prob': 0.0, 'hn_celoss': False, 'margin_value': 10, 'temperature': 0.05, 'exp_name': 'CRIS_AML_verbonly_pos25_b32', 'output_folder': 'exp/refcocog_u/exclude_multiobj', 'save_freq': 1, 'weight': None, 'resume': 'latest', 'evaluate': True, 'dist_url': 'tcp://localhost:7024', 'dist_backend': 'nccl', 'multiprocessing_distributed': True, 'world_size': 1, 'rank': 0, 'test_split': 'val-test', 'test_lmdb': '/home/seunghoon/research/VerbCentric_RIS/datasets/lmdb/refcocog_u/val.lmdb', 'visualize': False}\n", + "CLIP FROZEN !!\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[32m2024-12-10 01:09:02.415\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mmodel_\u001b[0m:\u001b[36mbuild_segmenter_pos_rev2\u001b[0m:\u001b[36m19\u001b[0m - \u001b[1mBackbone with decay=325, Head=124\u001b[0m\n" + ] + } + ], + "source": [ + "parser = argparse.ArgumentParser(\n", + " description='Pytorch Referring Expression Segmentation')\n", + "parser.add_argument('--config',\n", + " default=new_config_path,\n", + " type=str,\n", + " help='config file')\n", + "parser.add_argument('--opts',\n", + " default=None,\n", + " nargs=argparse.REMAINDER,\n", + " help='override some settings in the config.')\n", + "\n", + "args = parser.parse_args([])\n", + "\n", + "cfg = config.load_cfg_from_cfg_file(new_config_path)\n", + "cfg.metric_learning = True\n", + "args = EasyDict(cfg)\n", + "print(args)\n", + "new_model, new_param_list = build_segmenter_pos_rev2(args)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define dataloader and util functions" + ] + }, + { + "cell_type": "code", + "execution_count": 131, + "metadata": {}, + "outputs": [], + "source": [ + "init_fn = partial(worker_init_fn,\n", + " num_workers=args.workers,\n", + " rank=args.rank,\n", + " seed=args.manual_seed)\n", + "val_data = origDataset(lmdb_dir=args.val_lmdb,\n", + " mask_dir=args.mask_root,\n", + " dataset=args.dataset,\n", + " split=args.val_split,\n", + " mode='val',\n", + " input_size=args.input_size,\n", + " word_length=args.word_len,\n", + " args=args)\n", + "val_loader = data.DataLoader(val_data,\n", + " batch_size=32,\n", + " shuffle=False,\n", + " num_workers=args.workers_val,\n", + " pin_memory=True,\n", + " drop_last=True)\n", + "\n", + "train_data = RefDataset(lmdb_dir=args.train_lmdb,\n", + " mask_dir=args.mask_root,\n", + " dataset=args.dataset,\n", + " split=args.train_split,\n", + " mode='train',\n", + " input_size=args.input_size,\n", + " word_length=args.word_len,\n", + " args=args)\n", + "train_loader = data.DataLoader(train_data,\n", + " batch_size=32,\n", + " shuffle=False,\n", + " num_workers=args.workers,\n", + " pin_memory=True,\n", + " worker_init_fn=init_fn,\n", + " drop_last=True)\n", + "# detokenizer\n", + "tokenizer = _Tokenizer()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 132, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([32, 3, 416, 416]) torch.Size([32, 22]) torch.Size([32, 416, 416]) torch.Size([32, 22])\n", + "tensor([[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0],\n", + " [49406, 1265, 11688, 49407, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0],\n", + " [49406, 1960, 320, 31777, 8172, 49407, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0],\n", + " [49406, 1312, 536, 49407, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0],\n", + " [49406, 7435, 320, 3101, 2972, 49407, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0],\n", + " [49406, 2862, 530, 2184, 49407, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0],\n", + " [49406, 2862, 525, 49407, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0],\n", + " [49406, 1265, 9729, 1952, 49407, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0],\n", + " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0],\n", + " [49406, 4919, 525, 49407, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0],\n", + " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0],\n", + " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0],\n", + " [49406, 1171, 874, 49407, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0],\n", + " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0],\n", + " [49406, 2862, 7619, 531, 518, 1823, 49407, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0],\n", + " [49406, 2862, 1131, 531, 49407, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0],\n", + " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0],\n", + " [49406, 1265, 530, 518, 1112, 4657, 49407, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0],\n", + " [49406, 1401, 1074, 320, 3470, 1063, 49407, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0],\n", + " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0],\n", + " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0],\n", + " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0],\n", + " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0],\n", + " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0],\n", + " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0],\n", + " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0],\n", + " [49406, 1629, 5084, 49407, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0],\n", + " [49406, 1265, 32025, 49407, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0],\n", + " [49406, 530, 7860, 49407, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0],\n", + " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0],\n", + " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0],\n", + " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0]])\n" + ] + } + ], + "source": [ + "for data in train_loader:\n", + " img, word_vec, mask, hardpos = data\n", + " print(img.shape, word_vec.shape, mask.shape, hardpos.shape)\n", + " print(hardpos)\n", + " break" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load pretrained models" + ] + }, + { + "cell_type": "code", + "execution_count": 133, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "device: cuda\n" + ] + } + ], + "source": [ + "# get current device\n", + "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", + "print('device:', device)" + ] + }, + { + "cell_type": "code", + "execution_count": 134, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_1173351/383024302.py:1: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", + " original_checkpoint = torch.load(orig_model_path, map_location='cuda')\n", + "/tmp/ipykernel_1173351/383024302.py:11: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", + " new_checkpoint = torch.load(new_model_path, map_location='cuda')\n" + ] + } + ], + "source": [ + "original_checkpoint = torch.load(orig_model_path, map_location='cuda')\n", + "args.start_epoch = original_checkpoint['epoch']\n", + "best_IoU = original_checkpoint[\"best_iou\"]\n", + "best_oIoU = original_checkpoint[\"best_oiou\"]\n", + "original_model = torch.nn.DataParallel(original_model)\n", + "original_model.load_state_dict(original_checkpoint['state_dict'])\n", + "original_model = original_model.to(device)\n", + "#original_model.eval()\n", + "\n", + "\n", + "new_checkpoint = torch.load(new_model_path, map_location='cuda')\n", + "args.start_epoch = new_checkpoint['epoch']\n", + "best_IoU = new_checkpoint[\"best_iou\"]\n", + "best_oIoU = new_checkpoint[\"best_oiou\"]\n", + "new_model = torch.nn.DataParallel(new_model)\n", + "new_model.load_state_dict(new_checkpoint['state_dict'])\n", + "new_model = new_model.to(device)\n", + "#new_model.eval()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 135, + "metadata": {}, + "outputs": [], + "source": [ + "# similarity matrix calc\n", + "def calc_sim(embeddings):\n", + " # embeddings : (b, c, h, w)\n", + " print(\"embedding shape: \", embeddings.size())\n", + " if len(embeddings.size()) == 3:\n", + " b, c, hw = embeddings.size()\n", + " embeddings = embeddings.view(b, c, -1).mean(dim=2)\n", + " \n", + " # cosine similarity\n", + " sim = F.cosine_similarity(embeddings.unsqueeze(1), embeddings.unsqueeze(0), dim=2)\n", + " return sim\n", + "\n", + "def draw_result(imgs, preds, masks, texts):\n", + " for img, pred, mask, orig_text in zip(imgs, preds, masks, texts):\n", + " # drop zeros of text\n", + " orig_text = orig_text[orig_text != 0]\n", + " orig_text = orig_text[1:-1]\n", + " orig_text = tokenizer.decode(orig_text)\n", + " img = img.permute(1, 2, 0).cpu().numpy()\n", + " # unscale using image_mean and image_std\n", + " img = img * img_std + img_mean\n", + " # and clamp to [0, 1]\n", + " img = np.clip(img, 0, 1)\n", + "\n", + " pred = pred.cpu().numpy()\n", + " mask = mask.cpu().numpy()\n", + " plt.title(f\"text: {orig_text}\")\n", + " plt.imshow(img)\n", + " plt.imshow(pred, alpha=0.5)\n", + " #plt.imshow(mask, alpha=0.5)\n", + " plt.show()\n", + "\n", + "\n", + "def return_mask(emb_distance, positive_verbs, negative_verbs, posneg_verbs=None, verb_mask=None):\n", + "\n", + " assert torch.sum(verb_mask) == torch.sum(positive_verbs + negative_verbs), \"Verb mask does not match the sum of positive and negative verbs.\"\n", + " assert len(positive_verbs) == len(negative_verbs), \"Positive and negative verbs do not have the same length.\"\n", + "\n", + " B_, B_ = emb_distance.shape\n", + " positive_mask = torch.zeros_like(emb_distance)\n", + " negative_mask = torch.ones_like(emb_distance)\n", + " hard_negative_mask = torch.zeros_like(emb_distance)\n", + " positive_mask.fill_diagonal_(1)\n", + "\n", + " # print(\"positive verbs: \", positive_verbs)\n", + " # print(\"negative verbs: \", negative_verbs)\n", + " # print(\"emb shape: \", emb_distance.shape)\n", + "\n", + " if B_ < len(verb_mask):\n", + " # Considering only verbs that pass the verb_mask filter\n", + " positive_verbs = torch.tensor(positive_verbs)[verb_mask]\n", + " negative_verbs = torch.tensor(negative_verbs)[verb_mask]\n", + "\n", + " # Exclude hard negatives from both masks (diagonal)\n", + " for i in range(B_):\n", + " if negative_verbs[i] == 1:\n", + " positive_mask[i, i] = 0\n", + " negative_mask[i, i] = 0\n", + " # Set the entire row and column for the hard negative, except the diagonal\n", + " hard_negative_mask[i, :] = 1 # Mark the i-th row\n", + " hard_negative_mask[:, i] = 1 # Mark the i-th column\n", + " hard_negative_mask[i, i] = 0 # Ensure diagonal element (i, i) is 0\n", + "\n", + " i = 0\n", + " while i < B_:\n", + " if positive_verbs[i] == 1: \n", + " if i + 1 < B_ and positive_verbs[i + 1] == 1:\n", + " positive_mask[i, i + 1] = 1\n", + " positive_mask[i + 1, i] = 1\n", + " i += 2 \n", + " else:\n", + " i += 1\n", + " else:\n", + " # Exclude hard negatives from both masks (diagonal)\n", + " for i in range(B_):\n", + " if negative_verbs[i] == 1:\n", + " positive_mask[i, i] = 0\n", + " negative_mask[i, i] = 0\n", + " # Set the entire row and column for the hard negative, except the diagonal\n", + " hard_negative_mask[i, :] = 1 # Mark the i-th row\n", + " hard_negative_mask[:, i] = 1 # Mark the i-th column\n", + " hard_negative_mask[i, i] = 0 # Ensure diagonal element (i, i) is 0\n", + "\n", + " # Apply the positive pairs logic similarly as above\n", + " i = 0\n", + " while i < B_:\n", + " if positive_verbs[i] == 1 and i + 1 < B_ and positive_verbs[i + 1] == 1:\n", + " positive_mask[i, i + 1] = 1\n", + " positive_mask[i + 1, i] = 1\n", + " i += 2\n", + " else:\n", + " i += 1\n", + "\n", + " negative_mask = negative_mask - positive_mask\n", + " negative_mask[hard_negative_mask.bool()] = 0 # Set hard negative indices to 0 in negative_mask\n", + " # print(\"positive mask: \", positive_mask)\n", + " # print(\"negative mask: \", negative_mask)\n", + " return positive_mask, negative_mask, hard_negative_mask\n", + "\n", + "\n", + "def infer_result(model_in_use, data_loader, vis = False, trial = 25):\n", + " with torch.no_grad(): \n", + " sims = []\n", + " pos_masks = []\n", + " neg_masks = []\n", + " iter = 0\n", + " for i, (imgs, texts, target, hardpos) in enumerate(train_loader):\n", + " # data\n", + " imgs = imgs.cuda(non_blocking=True)\n", + " texts = texts.cuda(non_blocking=True)\n", + " target = target.cuda(non_blocking=True).unsqueeze(1)\n", + " hardpos = hardpos.cuda(non_blocking=True)\n", + " #print(hardpos)\n", + " #print(\"shapes: \", imgs.shape, texts.shape, target.shape, hardpos.shape)\n", + "\n", + " # inference\n", + " # preds, tgts, loss, embeddings = model_in_use(image, text, target, hardpos)\n", + " preds, tgts, loss, sim_matrix, posverb_mask, negverb_mask = model_in_use(imgs, texts, target, hardpos)\n", + " verb_mask = posverb_mask + negverb_mask \n", + " positive_mask, negative_mask, hard_negative_mask = return_mask(sim_matrix, posverb_mask, negverb_mask, verb_mask=verb_mask)\n", + " # print(\"mask shapes: \", positive_mask.shape, negative_mask.shape)\n", + " # print(\"pos mask: \", positive_mask)\n", + " # print(\"neg mask: \", negative_mask)\n", + " # print(\"similarity matrix: \", sim_matrix)\n", + "\n", + " # calc similarity matrix\n", + " #sim = calc_sim(embeddings)\n", + " #print(sim_matrix.size())\n", + " #print(posverb_mask)\n", + " sims.append(sim_matrix.detach().cpu().numpy())\n", + " pos_masks.append(positive_mask.detach().cpu().numpy())\n", + " neg_masks.append(negative_mask.detach().cpu().numpy())\n", + " preds = torch.sigmoid(preds)\n", + " if preds.shape[-2:] != imgs.shape[-2:]:\n", + " preds = F.interpolate(preds,\n", + " size=imgs.shape[-2:],\n", + " mode='bicubic',\n", + " align_corners=True).squeeze(1)\n", + " # overlay image and preds mask\n", + " imgs = imgs.detach().cpu()\n", + " texts = texts.detach().cpu()\n", + " preds = preds.detach().cpu()\n", + " \n", + " #masks = masks.detach().cpu()\n", + " # texts to numpy\n", + " texts = texts.numpy()\n", + " \n", + " if vis:\n", + " draw_result(imgs, preds, tgts, texts)\n", + " # draw_result(imgs, preds, masks, texts)\n", + " iter+=1\n", + " if iter > trial:\n", + " break\n", + " return sims, pos_masks, neg_masks " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sims_orig, positive_mask_orig, negative_mask_orig = infer_result(original_model, train_loader, vis=False, trial=100)\n", + "sims_new, positive_mask_new, negative_mask_new = infer_result(new_model, train_loader, vis=False, trial=100)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_1173351/1865868274.py:13: RuntimeWarning: invalid value encountered in arccos\n", + " sim = np.arccos(sim)\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "angle_vals = []\n", + "\n", + "def get_posneg_sim(sims, poses, negs, combined=False):\n", + " rad_ = 180 / math.pi\n", + " pos_vals = []\n", + " neg_vals = []\n", + " combined_vals = []\n", + " for sim, pos_, neg_ in zip(sims, poses, negs):\n", + " #print(sim.shape, pos_.shape, neg_.shape)\n", + " # make diagonals to 0\n", + " pos_ = pos_ - np.eye(pos_.shape[0])\n", + " neg_ = neg_ - np.eye(neg_.shape[0])\n", + " sim = np.arccos(sim)\n", + " # append, except for values with <0.05 \n", + " pos_vals.extend(sim[pos_ == 1].flatten())\n", + " neg_vals.extend(sim[neg_ == 1].flatten())\n", + " if combined :\n", + " combined_vals.extend(sim[pos_ == 1].flatten())\n", + " combined_vals.extend(sim[neg_ == 1].flatten())\n", + " \n", + " if combined : \n", + " return np.array(pos_vals) * rad_, np.array(neg_vals) * rad_, np.array(combined_vals) * rad_\n", + " else : \n", + " return np.array(pos_vals) * rad_, np.array(neg_vals) * rad_\n", + "\n", + "# pos_vals_orig, neg_vals_orig = get_posneg_sim(sims_orig, positive_mask_orig, negative_mask_orig)\n", + "# pos_vals_new, neg_vals_new = get_posneg_sim(sims_new, positive_mask_new, negative_mask_new)\n", + "\n", + "pos_vals_orig, neg_vals_orig, combined_vals_orig = get_posneg_sim(sims_orig, positive_mask_orig, negative_mask_orig, combined=True)\n", + "pos_vals_new, neg_vals_new, combined_vals_new = get_posneg_sim(sims_new, positive_mask_new, negative_mask_new, combined=True)\n", + "\n", + "\n", + "\n", + "\n", + "plt.hist(pos_vals_orig.repeat(10), bins=100, label=f'pos_orig, $\\mu$ = {np.mean(pos_vals_orig):.2f}', alpha=0.5)\n", + "plt.hist(neg_vals_orig, bins=100, label=f'neg_orig, $\\mu$ = {np.mean(neg_vals_orig):.2f}', alpha=0.5)\n", + "plt.xlabel('Angle (degree)', fontsize=12)\n", + "plt.ylabel('# Samples', fontsize=12)\n", + "plt.title('Pairwise Angular Distribution of Samples, CRIS')\n", + "plt.grid()\n", + "plt.legend()\n", + "plt.show()\n", + "\n", + "plt.hist(90 - pos_vals_new.repeat(10), bins=100, label=f'pos_ACE, $\\mu$ = {90 - np.mean(pos_vals_new):.2f}', alpha=0.5)\n", + "plt.hist(90 - neg_vals_new, bins=100, label=f'neg_ACE, $\\mu$ = {90 - np.mean(neg_vals_new):.2f}', alpha=0.5)\n", + "plt.xlabel('Angle (degree)', fontsize=12)\n", + "plt.ylabel('# Samples', fontsize=12)\n", + "plt.title('Pairwise Angular Distribution of Samples, CRIS+ACE')\n", + "plt.grid()\n", + "plt.legend()\n", + "plt.show()\n", + "\n", + "# for sim in sims:\n", + "# # get off-diagonal elements\n", + "# sim = sim\n", + "# # arccos\n", + "# sim = np.arccos(sim)\n", + "# # append, except for values with <0.05 \n", + "# sim = sim[sim > 0.05]\n", + "# angle_vals.extend(sim.flatten())\n", + "\n", + "# angle_vals_new = []\n", + "# for sim in sims_new:\n", + "# # get off-diagonal elements\n", + "# sim = sim\n", + "# # arccos\n", + "# sim = np.arccos(sim)\n", + "# # append, except for values with <0.05 \n", + "# sim = sim[sim > 0.05]\n", + "# angle_vals_new.extend(sim.flatten())\n", + "\n", + "# plt.hist(angle_vals, bins=100, label='original', alpha=0.5)\n", + "# plt.hist(angle_vals_new, bins=100, label='new', alpha=0.5)\n", + "# plt.legend()\n", + "# plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "cris_new", + "language": "python", + "name": "cris_new" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.20" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}