{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "61c272f2-edbe-4b7d-8fec-3ab431400cd3",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "e9dfd7d7-1685-4fc7-bbb9-3905c32d8ba1",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"metadata.json\", \"rb\") as f:\n",
    "    metadata = json.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "70bdba48-db01-42ac-8d89-edc69d7d7672",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "595375"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(metadata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "59e193cc-0dd8-4f7e-959a-fbad0133d76c",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"blip_laion_cc_sbu_558k.jsonblip_laion_cc_sbu_558k.json\", \"rb\") as f:\n",
    "    data = json.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "f3157f41-269b-4f7a-b3ba-9be711babe02",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'id': '004539375',\n",
       " 'image': '00453/004539375.jpg',\n",
       " 'conversations': [{'from': 'human',\n",
       "   'value': 'Render a clear and concise summary of the photo.\\n<image>'},\n",
       "  {'from': 'gpt',\n",
       "   'value': 'select luxury furniture 3 - inch gel memory foam mattress topper'}]}"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "50d8a051-1526-47dd-ad71-d3c66f7bd34e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'id': '004374662',\n",
       " 'image': '00437/004374662.jpg',\n",
       " 'conversations': [{'from': 'human',\n",
       "   'value': 'Give a brief description of the image.\\n<image>'},\n",
       "  {'from': 'gpt', 'value': 'the north face duffel bag camo large'}]}"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data[234]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "2e6d5664-4583-49a6-93cc-079ee2d1ff6c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "558128"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "11ed106d-6bef-482c-a456-5eaaf2025534",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'id': 'GCC_train_001749371',\n",
       " 'image': 'GCC_train_001749371.jpg',\n",
       " 'caption': 'if you are dreaming of simpler or off - the - grid living , a yurt is a fantastic option',\n",
       " 'blip_caption': 'a white and tan yurt sitting on a dirt road',\n",
       " 'url': 'https://i.pinimg.com/736x/14/7b/64/147b64467ee966d9a578097bb70475ad--yurt-kits-small-space-living.jpg'}"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "metadata[67]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "ce8adcec-2499-4be3-be1d-7313fe54e96a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'id': '000466761',\n",
       " 'image': '00046/000466761.jpg',\n",
       " 'conversations': [{'from': 'human',\n",
       "   'value': '<image>\\nProvide a brief description of the given image.'},\n",
       "  {'from': 'gpt',\n",
       "   'value': 'a clipboard and a pen with the words public health emergency next to it on a white table'}]}"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data[67]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "068313b6-6379-4ca2-892c-682634d3581e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "list"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "9ec33b51-4a0b-4a1e-81f7-2fda7cddb25f",
   "metadata": {},
   "outputs": [],
   "source": [
    "sample_data = data[:200000]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "095685e5-40f1-4d84-8280-ef74fa56c5a2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "200000"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(sample_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "ffbad552-23fd-475f-8e9a-7118bcc4f51e",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"llava-phi/pretrain_data/blip_sample.json\", \"w\") as f:\n",
    "    json.dump(sample_data, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "69a05d25-6f3b-40c0-a3b5-e185ff526471",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"llava-phi/pretrain_data/blip_sample.json\", \"rb\") as f:\n",
    "    sample = json.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "200eea06-dfd6-4b3a-bb91-82af7d363951",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "200000"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(sample)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f86caa1e-edea-4a9c-934f-5420ede80d0d",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}