{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/nfs/zhujinguo/datasets/visual_genome/annotations\n"
     ]
    }
   ],
   "source": [
    "import glob \n",
    "import json\n",
    "import os\n",
    "print(os.getcwd())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# \"region_descriptions.json\" holds about 5.41M region captions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the region-description annotations (about 5.41M captions) into memory.\n",
    "with open(\"region_descriptions.json\") as fp:\n",
    "    captions = json.load(fp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dict_keys(['regions', 'id'])\n"
     ]
    }
   ],
   "source": [
    "print(captions[1].keys())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "captions[1]['id']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'region_id': 1387,\n",
       "  'width': 43,\n",
       "  'height': 17,\n",
       "  'image_id': 2,\n",
       "  'phrase': 'walk sign is lit up',\n",
       "  'y': 193,\n",
       "  'x': 465},\n",
       " {'region_id': 1388,\n",
       "  'width': 133,\n",
       "  'height': 253,\n",
       "  'image_id': 2,\n",
       "  'phrase': 'man wearing silver backpack',\n",
       "  'y': 322,\n",
       "  'x': 331}]"
      ]
     },
     "execution_count": 90,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "captions[1]['regions'][:2]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "256\n"
     ]
    }
   ],
   "source": [
    "print(len(captions[1]['regions']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'region_id': 1387,\n",
       " 'width': 43,\n",
       " 'height': 17,\n",
       " 'image_id': 2,\n",
       " 'phrase': 'walk sign is lit up',\n",
       " 'y': 193,\n",
       " 'x': 465}"
      ]
     },
     "execution_count": 92,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "captions[1]['regions'][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 108077/108077 [00:01<00:00, 58748.17it/s]\n"
     ]
    }
   ],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 108077/108077 [00:03<00:00, 34841.72it/s]\n"
     ]
    }
   ],
   "source": [
    "from collections import defaultdict\n",
    "from tqdm import tqdm\n",
    "\n",
    "# Regions are kept only when their box area (height * width) reaches this threshold.\n",
    "MIN_REGION_AREA = 128 * 128\n",
    "\n",
    "# image_id -> set of phrases; using a set drops phrases repeated within one image.\n",
    "iid2captions = defaultdict(set)\n",
    "for image_entry in tqdm(captions):\n",
    "    for region in image_entry[\"regions\"]:\n",
    "        region_area = int(region[\"height\"]) * int(region[\"width\"])\n",
    "        if region_area >= MIN_REGION_AREA:\n",
    "            iid2captions[region[\"image_id\"]].add(region[\"phrase\"])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sets cannot be written by json.dump, and their iteration order for strings changes\n",
    "# between interpreter runs; sorting gives every image a stable caption list.\n",
    "# Rebuilding as a plain dict also stops later [] lookups from silently adding keys.\n",
    "iid2captions = {iid: sorted(phrases) for iid, phrases in iid2captions.items()}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "107823"
      ]
     },
     "execution_count": 110,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(iid2captions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# .get() rather than []: indexing a defaultdict inserts a default for a missing id,\n",
    "# and an inserted empty set would make the later json.dump fail.\n",
    "iid2captions.get(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "not all images have caption annotations\n",
      "108249 107823 107823 107823\n"
     ]
    }
   ],
   "source": [
    "import random\n",
    "from glob import glob\n",
    "\n",
    "# Fixed seed so the shuffled order, and with it the key order of iid2subset, is the same on every run.\n",
    "SHUFFLE_SEED = 0\n",
    "\n",
    "# glob order depends on the filesystem, so sort before the seeded shuffle.\n",
    "paths = sorted(glob(\"../images/VG_100K/*.jpg\") + glob(\"../images/VG_100K_2/*.jpg\"))\n",
    "random.Random(SHUFFLE_SEED).shuffle(paths)\n",
    "\n",
    "# One pass, parsing each path once: the file stem is the image id and\n",
    "# \"<parent dir>/<file name>\" is stored as the image's relative location.\n",
    "caption_paths = []\n",
    "iid2subset = {}\n",
    "for path in paths:\n",
    "    parent_dir, file_name = os.path.split(path)\n",
    "    image_id = int(os.path.splitext(file_name)[0])\n",
    "    if image_id in iid2captions:\n",
    "        caption_paths.append(path)\n",
    "        iid2subset[image_id] = os.path.join(os.path.basename(parent_dir), file_name)\n",
    "\n",
    "if len(paths) == len(caption_paths):\n",
    "    print(\"all images have caption annotations\")\n",
    "else:\n",
    "    print(\"not all images have caption annotations\")\n",
    "print(len(paths), len(caption_paths), len(iid2captions), len(iid2subset))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show a small sample only; the full list holds over 100k paths.\n",
    "paths[:5]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1645544\n"
     ]
    }
   ],
   "source": [
    "# Total number of captions kept across all images.\n",
    "num = sum(len(phrases) for phrases in iid2captions.values())\n",
    "print(num)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "25614848"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'VG_100K_2/1.jpg'"
      ]
     },
     "execution_count": 107,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "iid2subset[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bundle the per-image captions and each image's relative path into one JSON file.\n",
    "data = {\n",
    "    \"phrase\": iid2captions,\n",
    "    \"subset\": iid2subset,\n",
    "}\n",
    "# Context manager so the file is flushed and closed as soon as the dump finishes.\n",
    "with open(\"vg_captions_128filter.json\", \"w\") as fp:\n",
    "    json.dump(data, fp)\n"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "a745cf6333d4d8275ecd56c526d26202f2d2beb96e1206fac92576cf98b427be"
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}