{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Visual Genome region captions -> `vg_captions_128filter.json`\n",
    "\n",
    "Builds a per-image caption index from the Visual Genome `region_descriptions.json` annotations, keeping only phrases whose region area (`width * height`) is at least `MIN_REGION_AREA`.\n",
    "\n",
    "Run this notebook from the `visual_genome/annotations` directory: every path below is relative to the working directory.\n",
    "\n",
    "Figures reported by an earlier run (outputs have since been cleared; use them as a sanity reference only):\n",
    "\n",
    "- `region_descriptions.json` holds roughly 5.41M region captions (original author's note) spread over 108,077 annotation entries\n",
    "- 108,249 image files on disk, 107,823 of them with at least one kept caption\n",
    "- 1,645,544 kept captions in total"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "from collections import defaultdict\n",
    "from glob import glob\n",
    "\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Everything a reader might want to tune lives here.\n",
    "REGION_FILE = \"region_descriptions.json\"\n",
    "IMAGE_GLOBS = (\"../images/VG_100K/*.jpg\", \"../images/VG_100K_2/*.jpg\")\n",
    "OUTPUT_FILE = \"vg_captions_128filter.json\"\n",
    "MIN_REGION_AREA = 128 * 128  # a region is kept when width * height >= this value\n",
    "\n",
    "# The relative paths above only resolve from the annotations directory.\n",
    "print(os.getcwd())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load annotations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(REGION_FILE, \"r\") as fp:\n",
    "    captions = json.load(fp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Each entry carries an image `id` and a list of `regions`.\n",
    "print(captions[1].keys())\n",
    "print(captions[1][\"id\"], len(captions[1][\"regions\"]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "captions[1][\"regions\"][:2]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Collect captions per image\n",
    "\n",
    "Phrases are gathered into a set per `image_id`, so a phrase repeated within one image is stored once. Regions smaller than `MIN_REGION_AREA` are skipped."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "iid2caption_sets = defaultdict(set)\n",
    "for entry in tqdm(captions):\n",
    "    for region in entry[\"regions\"]:\n",
    "        region_area = int(region[\"height\"]) * int(region[\"width\"])\n",
    "        if region_area >= MIN_REGION_AREA:\n",
    "            iid2caption_sets[region[\"image_id\"]].add(region[\"phrase\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build a new plain dict instead of mutating the defaultdict in place:\n",
    "# - sets are not JSON serializable, so every value becomes a list;\n",
    "# - sorting makes the caption order reproducible between runs;\n",
    "# - a plain dict cannot silently grow an empty entry when a missing id is looked up.\n",
    "iid2captions = {iid: sorted(phrases) for iid, phrases in iid2caption_sets.items()}\n",
    "len(iid2captions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "iid2captions[1][:5]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Match captions to image files\n",
    "\n",
    "Image files are named `<image_id>.jpg` and live in either `VG_100K` or `VG_100K_2`; `iid2subset` records which directory each captioned image was found in."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def image_id_from_path(path):\n",
    "    \"\"\"Return the integer image id encoded in a file name such as '.../VG_100K/123.jpg'.\"\"\"\n",
    "    stem, _ext = os.path.splitext(os.path.basename(path))\n",
    "    return int(stem)\n",
    "\n",
    "\n",
    "def subset_relpath(path):\n",
    "    \"\"\"Return '<parent directory>/<file name>' for an image path, e.g. 'VG_100K_2/1.jpg'.\"\"\"\n",
    "    parent_dir = os.path.basename(os.path.dirname(path))\n",
    "    return os.path.join(parent_dir, os.path.basename(path))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sorted so that the key order of the exported mapping is the same on every run.\n",
    "paths = sorted(path for pattern in IMAGE_GLOBS for path in glob(pattern))\n",
    "\n",
    "caption_paths = []\n",
    "iid2subset = {}\n",
    "for path in paths:\n",
    "    iid = image_id_from_path(path)\n",
    "    if iid in iid2captions:\n",
    "        caption_paths.append(path)\n",
    "        iid2subset[iid] = subset_relpath(path)\n",
    "\n",
    "if len(paths) == len(caption_paths):\n",
    "    print(\"all images have caption annotations\")\n",
    "else:\n",
    "    print(\"not all images have caption annotations\")\n",
    "print(len(paths), len(caption_paths), len(iid2captions), len(iid2subset))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "paths[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Total number of kept captions across all images.\n",
    "num_captions = sum(len(phrases) for phrases in iid2captions.values())\n",
    "print(num_captions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "iid2subset[1]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Write output\n",
    "\n",
    "The file has two top-level keys: `phrase` (image id -> list of captions) and `subset` (image id -> `<directory>/<file>.jpg`). `json.dump` writes the integer image ids as string keys."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = {\n",
    "    \"phrase\": iid2captions,\n",
    "    \"subset\": iid2subset,\n",
    "}\n",
    "# The context manager closes (and therefore flushes) the file as soon as the dump finishes.\n",
    "with open(OUTPUT_FILE, \"w\") as fp:\n",
    "    json.dump(data, fp)"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "a745cf6333d4d8275ecd56c526d26202f2d2beb96e1206fac92576cf98b427be"
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}