{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import argparse\n",
    "import sys\n",
    "import opts\n",
    "import matplotlib.pyplot as plt\n",
    "import matplotlib.patches as patches\n",
    "import textwrap\n",
    "\n",
    "from PIL import Image, ImageDraw\n",
    "import json\n",
    "import numpy as np\n",
    "from mbench.ytvos_ref import build as build_ytvos_ref"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Root of the dataset split; frames are read from its JPEGImages/ and Annotations/ sub-folders.\n",
    "img_folder = 'data/ref-youtube-vos/train'\n",
    "# Caption colours, indexed by each annotation's isValid value.\n",
    "text_colors = ['red', 'blue']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Nested annotations: data[video_id][category][frame][object_id] -> {'ref_exp': ..., 'isValid': ...}.\n",
    "# JSON text is UTF-8 by specification, so decode explicitly instead of relying on the platform locale.\n",
    "with open('mbench/result_revised50.json', encoding='utf-8') as file:\n",
    "    data = json.load(file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "def bounding_box(img):\n",
    "    \"\"\"Return the inclusive bounds of the truthy region of a mask.\n",
    "\n",
    "    Args:\n",
    "        img: Array reduced along axis 1 (rows) and axis 0 (columns);\n",
    "            assumes a 2-D mask -- TODO confirm against callers.\n",
    "\n",
    "    Returns:\n",
    "        Tuple ``(y1, y2, x1, x2)``: first and last row index, then first and\n",
    "        last column index, holding at least one truthy entry.\n",
    "\n",
    "    Raises:\n",
    "        IndexError: If ``img`` contains no truthy entry.\n",
    "    \"\"\"\n",
    "    occupied_rows = np.nonzero(np.any(img, axis=1))[0]\n",
    "    occupied_cols = np.nonzero(np.any(img, axis=0))[0]\n",
    "    y1, y2 = occupied_rows[0], occupied_rows[-1]\n",
    "    x1, x2 = occupied_cols[0], occupied_cols[-1]\n",
    "    return y1, y2, x1, x2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [],
   "source": [
    "def showImageRef(vid_id):\n",
    "    \"\"\"Show every annotated frame of one video with object boxes and captions.\n",
    "\n",
    "    For each category and frame stored under ``data[vid_id]``, the frame image\n",
    "    and its annotation mask are read from ``img_folder`` and one figure is\n",
    "    shown with one panel per object id. Each panel that has annotation data\n",
    "    shows the frame, a red rectangle spanning the object's mask region, and\n",
    "    the object's referring expression coloured by ``text_colors[isValid]``.\n",
    "    Frames without annotations are skipped without reading any file.\n",
    "\n",
    "    Args:\n",
    "        vid_id: Key into the module-level ``data`` dict; also used as the\n",
    "            per-video directory name under ``JPEGImages`` and ``Annotations``.\n",
    "\n",
    "    Raises:\n",
    "        KeyError: If ``vid_id`` is not a key of ``data``.\n",
    "        FileNotFoundError: If the image or mask of an annotated frame is\n",
    "            missing on disk.\n",
    "    \"\"\"\n",
    "    for cat, cat_data in data[vid_id].items():\n",
    "        for frame, frame_data in cat_data.items():\n",
    "            # Nothing is drawn for frames without annotations, so skip them\n",
    "            # before doing any disk I/O.\n",
    "            if not frame_data:\n",
    "                continue\n",
    "\n",
    "            img_path = os.path.join(img_folder, 'JPEGImages', vid_id, frame + '.jpg')\n",
    "            mask_path = os.path.join(img_folder, 'Annotations', vid_id, frame + '.png')\n",
    "            # Context managers release the file handles once the pixel data\n",
    "            # has been copied out.\n",
    "            with Image.open(img_path) as img_file:\n",
    "                img = img_file.convert('RGB')\n",
    "            with Image.open(mask_path) as mask_file:\n",
    "                mask = np.array(mask_file.convert('P'))\n",
    "\n",
    "            obj_ids = list(frame_data.keys())\n",
    "            obj_nums = len(obj_ids)\n",
    "\n",
    "            # squeeze=False always returns a 2-D axes array, so a single\n",
    "            # object needs no special case.\n",
    "            fig, axes = plt.subplots(1, obj_nums, figsize=(16, obj_nums), squeeze=False)\n",
    "\n",
    "            for ax, obj_id in zip(axes[0], obj_ids):\n",
    "                obj_data = frame_data[obj_id]\n",
    "                if not obj_data:\n",
    "                    continue  # panel stays empty when the object has no annotation\n",
    "\n",
    "                ref_exp = obj_data['ref_exp']\n",
    "                isValid = obj_data['isValid']\n",
    "\n",
    "                # Pixels of this object are those whose mask value equals the object id.\n",
    "                obj_mask = mask == int(obj_id)\n",
    "                if obj_mask.any():\n",
    "                    y1, y2, x1, x2 = bounding_box(obj_mask)\n",
    "                else:\n",
    "                    # Object has no pixels in this frame: use a degenerate box at\n",
    "                    # the origin instead of reusing coordinates from a previous object.\n",
    "                    x1 = y1 = x2 = y2 = 0\n",
    "\n",
    "                ax.imshow(img)\n",
    "                rect = patches.Rectangle((x1, y1), x2 - x1, y2 - y1, linewidth=2, edgecolor='red', facecolor='none')\n",
    "                ax.add_patch(rect)\n",
    "\n",
    "                wrapped_text = \"\\n\".join(textwrap.wrap(ref_exp, width=30))\n",
    "                # assumes isValid is a 0/1 (or bool) flag usable as a list index -- TODO confirm\n",
    "                ax.annotate(wrapped_text, xy=(0.5, -1.5), xycoords=\"axes fraction\", ha=\"center\", color=text_colors[isValid])\n",
    "\n",
    "            fig.suptitle(f\"video: {vid_id} - cat: {cat} - frame: {frame}\")\n",
    "            plt.show()\n",
    "            # One figure is created per frame; close it so figures do not pile up.\n",
    "            plt.close(fig)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 142,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "04667fabaa\n"
     ]
    }
   ],
   "source": [
    "# Select the video at position 49 of the annotation file and visualise it.\n",
    "vid_id = list(data)[49]\n",
    "print(vid_id)\n",
    "showImageRef(vid_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "referformer",
   "language": "python",
   "name": "referformer"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}