{ "cells": [ { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "import os\n", "import json\n", "from collections import defaultdict\n", "import csv" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "dict_keys(['info', 'videos', 'sentences'])" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset_info = json.load(open('train_val_videodatainfo.json' ))\n", "dataset_info.keys()\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2990" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_dataset_info = json.load(open('test_videodatainfo.json' ))\n", "test_dataset_info.keys()\n", "len(test_dataset_info['videos'])" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "import pandas as pd \n", "train_9k_video_id = pd.read_csv(open('msrvtt_data/MSRVTT_train.9k.csv'))\n", "train_9k_video_id = train_9k_video_id['video_id'].tolist()\n" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "9000" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(train_9k_video_id)" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "\n", "images_field = []\n", "images_val_field = []\n", "id2split = {}\n", "\n", "for video_info in dataset_info['videos']:\n", " if video_info['video_id'] in train_9k_video_id:\n", " images_field.append(\n", " {\n", " \"id\": int(video_info['video_id'].split('video')[-1]),\n", " 'file_name': video_info['video_id']\n", " }\n", " )\n", " \n", " id2split[video_info['video_id']] = 'train'\n", " else:\n", " images_val_field.append(\n", " {\n", " \"id\": int(video_info['video_id'].split('video')[-1]),\n", " 'file_name': video_info['video_id']\n", " }\n", " )\n", " \n", " id2split[video_info['video_id']] = 'test'\n", "\n", "\n", "for video_info in test_dataset_info['videos']:\n", "\n", " if video_info['video_id'] in train_9k_video_id:\n", " images_field.append(\n", " {\n", " \"id\": int(video_info['video_id'].split('video')[-1]),\n", " 'file_name': video_info['video_id']\n", " }\n", " )\n", " \n", " id2split[video_info['video_id']] = 'train'\n", " else:\n", " images_val_field.append(\n", " {\n", " \"id\": int(video_info['video_id'].split('video')[-1]),\n", " 'file_name': video_info['video_id']\n", " }\n", " )\n", " \n", " id2split[video_info['video_id']] = 'test'\n" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "length train: 9000 test: 1000\n" ] } ], "source": [ "\n", "print(\"length train: {} test: {}\".format(len(images_field), len(images_val_field)))" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "annotations_field = []\n", "\n", "for sentence_info in dataset_info['sentences']:\n", " if id2split[sentence_info['video_id']] == 'train':\n", " video_id = int(sentence_info['video_id'].split('video')[-1])\n", "\n", " annotations_field.append(\n", " {\n", " \"image_id\": video_id,\n", " 'id': sentence_info['sen_id'],\n", " \"caption\": sentence_info['caption']\n", " }\n", " )\n", "\n", "for sentence_info in test_dataset_info['sentences']:\n", " if id2split[sentence_info['video_id']] == 'train':\n", " \n", " video_id = int(sentence_info['video_id'].split('video')[-1])\n", "\n", " annotations_field.append(\n", " {\n", " \"image_id\": video_id,\n", " 'id': sentence_info['sen_id'],\n", " \"caption\": sentence_info['caption']\n", " }\n", " )\n" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "180000" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(annotations_field)" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "\n", "data = {\n", " 'images': images_field,\n", " \"annotations\": annotations_field\n", " }\n", "json.dump(data, open('annotations_new/caption_msrvtt_1k_trainval_cocostyle.json', 'w'))" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "test_info = pd.read_csv('msrvtt_data/MSRVTT_JSFUSION_test.csv')\n", "videoids = test_info['video_id'].tolist()\n", "sentences = test_info['sentence'].tolist()\n", "images_field = []\n", "\n", "annotations_field = []\n", "\n", "for video_id, sentence in zip(videoids, sentences):\n", " images_field.append(\n", " {\n", " \"id\": int(video_id.split('video')[-1]),\n", " 'file_name': video_id\n", " }\n", " )\n", " annotations_field.append(\n", " {\n", " \"image_id\": int(video_id.split('video')[-1]),\n", " 'id': int(video_id.split('video')[-1]),\n", " \"caption\": sentence\n", " }\n", " )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "data = {\n", " 'images': images_field,\n", " \"annotations\": annotations_field\n", " }\n", "json.dump(data, open('annotations_new/caption_msrvtt_1k_test_cocostyle.json', 'w'))" ] } ], "metadata": { "interpreter": { "hash": "a745cf6333d4d8275ecd56c526d26202f2d2beb96e1206fac92576cf98b427be" }, "kernelspec": { "display_name": "Python 3.7.11 64-bit ('xmodaler': conda)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.11" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }