{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "from collections import  defaultdict\n",
    "import csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['info', 'videos', 'sentences'])"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset_info = json.load(open('train_val_videodatainfo.json' ))\n",
    "dataset_info.keys()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2990"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_dataset_info = json.load(open('test_videodatainfo.json' ))\n",
    "test_dataset_info.keys()\n",
    "len(test_dataset_info['videos'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd \n",
    "train_9k_video_id = pd.read_csv(open('msrvtt_data/MSRVTT_train.9k.csv'))\n",
    "train_9k_video_id = train_9k_video_id['video_id'].tolist()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "9000"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(train_9k_video_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "images_field = []\n",
    "images_val_field = []\n",
    "id2split = {}\n",
    "\n",
    "for video_info in  dataset_info['videos']:\n",
    "    if video_info['video_id'] in train_9k_video_id:\n",
    "        images_field.append(\n",
    "            {\n",
    "                \"id\": int(video_info['video_id'].split('video')[-1]),\n",
    "                'file_name': video_info['video_id']\n",
    "            }\n",
    "        )\n",
    "    \n",
    "        id2split[video_info['video_id']] = 'train'\n",
    "    else:\n",
    "        images_val_field.append(\n",
    "            {\n",
    "                \"id\": int(video_info['video_id'].split('video')[-1]),\n",
    "                'file_name': video_info['video_id']\n",
    "            }\n",
    "        )\n",
    "    \n",
    "        id2split[video_info['video_id']] = 'test'\n",
    "\n",
    "\n",
    "for video_info in  test_dataset_info['videos']:\n",
    "\n",
    "    if video_info['video_id'] in train_9k_video_id:\n",
    "        images_field.append(\n",
    "            {\n",
    "                \"id\": int(video_info['video_id'].split('video')[-1]),\n",
    "                'file_name': video_info['video_id']\n",
    "            }\n",
    "        )\n",
    "    \n",
    "        id2split[video_info['video_id']] = 'train'\n",
    "    else:\n",
    "        images_val_field.append(\n",
    "            {\n",
    "                \"id\": int(video_info['video_id'].split('video')[-1]),\n",
    "                'file_name': video_info['video_id']\n",
    "            }\n",
    "        )\n",
    "    \n",
    "        id2split[video_info['video_id']] = 'test'\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "length train: 9000 test: 1000\n"
     ]
    }
   ],
   "source": [
    "\n",
    "print(\"length train: {} test: {}\".format(len(images_field), len(images_val_field)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "annotations_field = []\n",
    "\n",
    "for sentence_info in  dataset_info['sentences']:\n",
    "    if id2split[sentence_info['video_id']] == 'train':\n",
    "        video_id = int(sentence_info['video_id'].split('video')[-1])\n",
    "\n",
    "        annotations_field.append(\n",
    "            {\n",
    "                \"image_id\": video_id,\n",
    "                'id': sentence_info['sen_id'],\n",
    "                \"caption\": sentence_info['caption']\n",
    "            }\n",
    "        )\n",
    "\n",
    "for sentence_info in  test_dataset_info['sentences']:\n",
    "    if id2split[sentence_info['video_id']] == 'train':\n",
    "    \n",
    "        video_id = int(sentence_info['video_id'].split('video')[-1])\n",
    "\n",
    "        annotations_field.append(\n",
    "            {\n",
    "                \"image_id\": video_id,\n",
    "                'id': sentence_info['sen_id'],\n",
    "                \"caption\": sentence_info['caption']\n",
    "            }\n",
    "        )\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "180000"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(annotations_field)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "data = {\n",
    "        'images': images_field,\n",
    "        \"annotations\": annotations_field\n",
    "    }\n",
    "json.dump(data, open('annotations_new/caption_msrvtt_1k_trainval_cocostyle.json', 'w'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_info = pd.read_csv('msrvtt_data/MSRVTT_JSFUSION_test.csv')\n",
    "videoids = test_info['video_id'].tolist()\n",
    "sentences = test_info['sentence'].tolist()\n",
    "images_field = []\n",
    "\n",
    "annotations_field = []\n",
    "\n",
    "for video_id, sentence in zip(videoids, sentences):\n",
    "    images_field.append(\n",
    "            {\n",
    "                \"id\": int(video_id.split('video')[-1]),\n",
    "                'file_name': video_id\n",
    "            }\n",
    "        )\n",
    "    annotations_field.append(\n",
    "            {\n",
    "                \"image_id\": int(video_id.split('video')[-1]),\n",
    "                'id': int(video_id.split('video')[-1]),\n",
    "                \"caption\": sentence\n",
    "            }\n",
    "        )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = {\n",
    "        'images': images_field,\n",
    "        \"annotations\": annotations_field\n",
    "    }\n",
    "json.dump(data, open('annotations_new/caption_msrvtt_1k_test_cocostyle.json', 'w'))"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "a745cf6333d4d8275ecd56c526d26202f2d2beb96e1206fac92576cf98b427be"
  },
  "kernelspec": {
   "display_name": "Python 3.7.11 64-bit ('xmodaler': conda)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.11"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}