{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "9fe51ce7-4c87-4186-9fd3-0fb18ac43e56",
   "metadata": {},
   "outputs": [],
   "source": [
    "from PIL import Image\n",
    "import requests\n",
    "from transformers import AutoProcessor, CLIPVisionModel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "0f4c21dd-4258-461d-8511-5be089d068a8",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = CLIPVisionModel.from_pretrained(\"openai/clip-vit-base-patch32\", device_map=\"cuda:0\")\n",
    "processor = AutoProcessor.from_pretrained(\"openai/clip-vit-base-patch32\", device_map=\"cuda:0\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "98b9f906-ffaa-4be4-8671-4ecf65f12c49",
   "metadata": {},
   "outputs": [],
   "source": [
    "# url = \"http://images.cocodataset.org/val2017/000000039769.jpg\"\n",
    "# image = Image.open(requests.get(url, stream=True).raw)\n",
    "image = Image.open(\"002579.jpg\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "54b2e4ce-b77b-4314-87f6-ca2a1970fc79",
   "metadata": {},
   "outputs": [],
   "source": [
    "# image"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "cdd65c58-007f-450b-8deb-f8b4f372a823",
   "metadata": {},
   "outputs": [],
   "source": [
    "# image = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "e9066c2e-c78b-49d1-979b-10d0f4f09441",
   "metadata": {},
   "outputs": [],
   "source": [
    "inputs = processor(images=image, return_tensors=\"pt\", device_map=\"cuda:0\")"
   ]
  },
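   {
    "cell_type": "code",
    "execution_count": null,
    "id": "c0ffee01-1111-4a2b-9c3d-000000000001",
    "metadata": {},
    "outputs": [],
    "source": [
     "# A quick look at what the processor does to the image (not a new pipeline step):\n",
     "# resize the short side to 224, center-crop to 224x224, rescale to [0, 1], then\n",
     "# normalize with CLIP's mean/std. The settings live on the wrapped image processor.\n",
     "processor.image_processor"
    ]
   },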
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "e98b211d-29d9-4662-be0b-e011e89b0101",
   "metadata": {},
   "outputs": [],
   "source": [
    "# inputs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "b030bd3d-4282-4074-98fe-97e658bd0f50",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([1, 3, 224, 224])"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "inputs[\"pixel_values\"].shape"
   ]
  },
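   {
    "cell_type": "code",
    "execution_count": null,
    "id": "c0ffee02-2222-4a2b-9c3d-000000000002",
    "metadata": {},
    "outputs": [],
    "source": [
     "# Sanity check of the token count for a ViT-B/32 input: a 224x224 image split into\n",
     "# 32x32 patches gives (224 // 32) ** 2 = 49 patch tokens, plus 1 CLS token = 50,\n",
     "# which matches the sequence length of last_hidden_state below.\n",
     "num_patches = (224 // 32) ** 2\n",
     "num_patches + 1"
    ]
   },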
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "0ce68f11-1c88-4dd7-8b17-0d1de5811fe6",
   "metadata": {},
   "outputs": [],
   "source": [
    "outputs = model(inputs[\"pixel_values\"].to(\"cuda:0\"))\n",
    "last_hidden_state = outputs.last_hidden_state\n",
    "pooled_output = outputs.pooler_output  # pooled CLS states"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "30cb0918-a30e-4246-b540-6b8e0d876807",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([1, 768])"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pooled_output.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "6399543a-f23f-426d-8289-3bb52d293ece",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([1, 50, 768])"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "last_hidden_state.shape"
   ]
  },
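   {
    "cell_type": "code",
    "execution_count": null,
    "id": "c0ffee03-3333-4a2b-9c3d-000000000003",
    "metadata": {},
    "outputs": [],
    "source": [
     "# How the two outputs relate (a sketch that relies on CLIPVisionTransformer internals\n",
     "# in transformers 4.36): pooler_output is the CLS token (position 0) of\n",
     "# last_hidden_state passed through the final post_layernorm.\n",
     "import torch\n",
     "\n",
     "cls_token = last_hidden_state[:, 0, :]\n",
     "torch.allclose(model.vision_model.post_layernorm(cls_token), pooled_output, atol=1e-5)"
    ]
   },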
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "19a70443-5942-4937-b3ea-6a52d76e2b08",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([1, 768])"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "outputs[1].shape"
   ]
  },
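   {
    "cell_type": "code",
    "execution_count": null,
    "id": "c0ffee04-4444-4a2b-9c3d-000000000004",
    "metadata": {},
    "outputs": [],
    "source": [
     "# outputs is a BaseModelOutputWithPooling, so integer indexing and attribute access\n",
     "# refer to the same tensors: index 1 is pooler_output here.\n",
     "(outputs[1] == outputs.pooler_output).all()"
    ]
   },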
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "fa13903f-a94a-4839-ae5a-8df4f55c68b6",
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "from torch import nn\n",
    "from transformers import CLIPVisionConfig,CLIPPreTrainedModel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "b2bd9198-42f0-40c3-80e1-d167c0b038fb",
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'Optional' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[9], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mclass\u001b[39;00m \u001b[38;5;21;01mCLIPVisionModelWithProjection\u001b[39;00m(CLIPPreTrainedModel):\n\u001b[1;32m      2\u001b[0m     config_class \u001b[38;5;241m=\u001b[39m CLIPVisionConfig\n\u001b[1;32m      3\u001b[0m     main_input_name \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpixel_values\u001b[39m\u001b[38;5;124m\"\u001b[39m\n",
      "Cell \u001b[0;32mIn[9], line 20\u001b[0m, in \u001b[0;36mCLIPVisionModelWithProjection\u001b[0;34m()\u001b[0m\n\u001b[1;32m     15\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_input_embeddings\u001b[39m(\u001b[38;5;28mself\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m nn\u001b[38;5;241m.\u001b[39mModule:\n\u001b[1;32m     16\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvision_model\u001b[38;5;241m.\u001b[39membeddings\u001b[38;5;241m.\u001b[39mpatch_embedding\n\u001b[1;32m     18\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\n\u001b[1;32m     19\u001b[0m     \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m---> 20\u001b[0m     pixel_values: \u001b[43mOptional\u001b[49m[torch\u001b[38;5;241m.\u001b[39mFloatTensor] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m     21\u001b[0m     output_attentions: Optional[\u001b[38;5;28mbool\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m     22\u001b[0m     output_hidden_states: Optional[\u001b[38;5;28mbool\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m     23\u001b[0m     return_dict: Optional[\u001b[38;5;28mbool\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m     24\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Union[Tuple, CLIPVisionModelOutput]:\n\u001b[1;32m     25\u001b[0m     return_dict \u001b[38;5;241m=\u001b[39m return_dict \u001b[38;5;28;01mif\u001b[39;00m return_dict \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39muse_return_dict\n\u001b[1;32m     27\u001b[0m     vision_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvision_model(\n\u001b[1;32m     28\u001b[0m         pixel_values\u001b[38;5;241m=\u001b[39mpixel_values,\n\u001b[1;32m     29\u001b[0m         output_attentions\u001b[38;5;241m=\u001b[39moutput_attentions,\n\u001b[1;32m     30\u001b[0m         output_hidden_states\u001b[38;5;241m=\u001b[39moutput_hidden_states,\n\u001b[1;32m     31\u001b[0m         return_dict\u001b[38;5;241m=\u001b[39mreturn_dict,\n\u001b[1;32m     32\u001b[0m     )\n",
      "\u001b[0;31mNameError\u001b[0m: name 'Optional' is not defined"
     ]
    }
   ],
   "source": [
    "class CLIPVisionModelWithProjection(CLIPPreTrainedModel):\n",
    "    config_class = CLIPVisionConfig\n",
    "    main_input_name = \"pixel_values\"\n",
    "\n",
    "    def __init__(self, config: CLIPVisionConfig):\n",
    "        super().__init__(config)\n",
    "\n",
    "        self.vision_model = CLIPVisionTransformer(config)\n",
    "\n",
    "        self.visual_projection = nn.Linear(config.hidden_size, config.projection_dim, bias=False)\n",
    "\n",
    "        # Initialize weights and apply final processing\n",
    "        self.post_init()\n",
    "\n",
    "    def get_input_embeddings(self) -> nn.Module:\n",
    "        return self.vision_model.embeddings.patch_embedding\n",
    "\n",
    "    def forward(\n",
    "        self,\n",
    "        pixel_values: Optional[torch.FloatTensor] = None,\n",
    "        output_attentions: Optional[bool] = None,\n",
    "        output_hidden_states: Optional[bool] = None,\n",
    "        return_dict: Optional[bool] = None,\n",
    "    ) -> Union[Tuple, CLIPVisionModelOutput]:\n",
    "        return_dict = return_dict if return_dict is not None else self.config.use_return_dict\n",
    "\n",
    "        vision_outputs = self.vision_model(\n",
    "            pixel_values=pixel_values,\n",
    "            output_attentions=output_attentions,\n",
    "            output_hidden_states=output_hidden_states,\n",
    "            return_dict=return_dict,\n",
    "        )\n",
    "\n",
    "        pooled_output = vision_outputs[1]  # pooled_output\n",
    "\n",
    "        image_embeds = self.visual_projection(pooled_output)\n",
    "\n",
    "        if not return_dict:\n",
    "            outputs = (image_embeds, vision_outputs[0]) + vision_outputs[2:]\n",
    "            return tuple(output for output in outputs if output is not None)\n",
    "\n",
    "        return CLIPVisionModelOutput(\n",
    "            image_embeds=image_embeds,\n",
    "            last_hidden_state=vision_outputs.last_hidden_state,\n",
    "            hidden_states=vision_outputs.hidden_states,\n",
    "            attentions=vision_outputs.attentions,\n",
    "        )"
   ]
  },
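   {
    "cell_type": "code",
    "execution_count": null,
    "id": "c0ffee05-5555-4a2b-9c3d-000000000005",
    "metadata": {},
    "outputs": [],
    "source": [
     "# A quick try of the class above (it mirrors the stock CLIPVisionModelWithProjection,\n",
     "# so visual_projection.weight should load from the checkpoint). proj_model and\n",
     "# proj_outputs are just illustrative names; the expected embedding shape is [1, 512].\n",
     "proj_model = CLIPVisionModelWithProjection.from_pretrained(\"openai/clip-vit-base-patch32\")\n",
     "proj_outputs = proj_model(inputs[\"pixel_values\"])\n",
     "proj_outputs.image_embeds.shape"
    ]
   },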
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "68a9ee4a-d977-4725-842d-e64e0dd2f61d",
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "loading configuration file config.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/config.json\n",
      "`text_config` is `None`. Initializing the `CLIPTextConfig` with default values.\n",
      "`vision_config` is `None`. initializing the `CLIPVisionConfig` with default values.\n",
      "Model config CLIPConfig {\n",
      "  \"_name_or_path\": \"openai/clip-vit-base-patch32\",\n",
      "  \"architectures\": [\n",
      "    \"CLIPModel\"\n",
      "  ],\n",
      "  \"initializer_factor\": 1.0,\n",
      "  \"logit_scale_init_value\": 2.6592,\n",
      "  \"model_type\": \"clip\",\n",
      "  \"projection_dim\": 512,\n",
      "  \"text_config\": {\n",
      "    \"bos_token_id\": 0,\n",
      "    \"dropout\": 0.0,\n",
      "    \"eos_token_id\": 2,\n",
      "    \"model_type\": \"clip_text_model\"\n",
      "  },\n",
      "  \"transformers_version\": \"4.36.2\",\n",
      "  \"vision_config\": {\n",
      "    \"dropout\": 0.0,\n",
      "    \"model_type\": \"clip_vision_model\"\n",
      "  }\n",
      "}\n",
      "\n",
      "loading weights file pytorch_model.bin from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/pytorch_model.bin\n",
      "All model checkpoint weights were used when initializing CLIPModel.\n",
      "\n",
      "All the weights of CLIPModel were initialized from the model checkpoint at openai/clip-vit-base-patch32.\n",
      "If your task is similar to the task the model of the checkpoint was trained on, you can already use CLIPModel for predictions without further training.\n",
      "loading configuration file preprocessor_config.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/preprocessor_config.json\n",
      "loading configuration file preprocessor_config.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/preprocessor_config.json\n",
      "loading configuration file config.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/config.json\n",
      "`text_config` is `None`. Initializing the `CLIPTextConfig` with default values.\n",
      "`vision_config` is `None`. initializing the `CLIPVisionConfig` with default values.\n",
      "Model config CLIPConfig {\n",
      "  \"_name_or_path\": \"openai/clip-vit-base-patch32\",\n",
      "  \"architectures\": [\n",
      "    \"CLIPModel\"\n",
      "  ],\n",
      "  \"initializer_factor\": 1.0,\n",
      "  \"logit_scale_init_value\": 2.6592,\n",
      "  \"model_type\": \"clip\",\n",
      "  \"projection_dim\": 512,\n",
      "  \"text_config\": {\n",
      "    \"bos_token_id\": 0,\n",
      "    \"dropout\": 0.0,\n",
      "    \"eos_token_id\": 2,\n",
      "    \"model_type\": \"clip_text_model\"\n",
      "  },\n",
      "  \"transformers_version\": \"4.36.2\",\n",
      "  \"vision_config\": {\n",
      "    \"dropout\": 0.0,\n",
      "    \"model_type\": \"clip_vision_model\"\n",
      "  }\n",
      "}\n",
      "\n",
      "loading configuration file preprocessor_config.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/preprocessor_config.json\n",
      "size should be a dictionary on of the following set of keys: ({'width', 'height'}, {'shortest_edge'}, {'longest_edge', 'shortest_edge'}, {'longest_edge'}), got 224. Converted to {'shortest_edge': 224}.\n",
      "crop_size should be a dictionary on of the following set of keys: ({'width', 'height'}, {'shortest_edge'}, {'longest_edge', 'shortest_edge'}, {'longest_edge'}), got 224. Converted to {'height': 224, 'width': 224}.\n",
      "Image processor CLIPImageProcessor {\n",
      "  \"crop_size\": {\n",
      "    \"height\": 224,\n",
      "    \"width\": 224\n",
      "  },\n",
      "  \"do_center_crop\": true,\n",
      "  \"do_convert_rgb\": true,\n",
      "  \"do_normalize\": true,\n",
      "  \"do_rescale\": true,\n",
      "  \"do_resize\": true,\n",
      "  \"feature_extractor_type\": \"CLIPFeatureExtractor\",\n",
      "  \"image_mean\": [\n",
      "    0.48145466,\n",
      "    0.4578275,\n",
      "    0.40821073\n",
      "  ],\n",
      "  \"image_processor_type\": \"CLIPImageProcessor\",\n",
      "  \"image_std\": [\n",
      "    0.26862954,\n",
      "    0.26130258,\n",
      "    0.27577711\n",
      "  ],\n",
      "  \"resample\": 3,\n",
      "  \"rescale_factor\": 0.00392156862745098,\n",
      "  \"size\": {\n",
      "    \"shortest_edge\": 224\n",
      "  }\n",
      "}\n",
      "\n",
      "loading file vocab.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/vocab.json\n",
      "loading file merges.txt from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/merges.txt\n",
      "loading file tokenizer.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/tokenizer.json\n",
      "loading file added_tokens.json from cache at None\n",
      "loading file special_tokens_map.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/special_tokens_map.json\n",
      "loading file tokenizer_config.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/tokenizer_config.json\n",
      "loading configuration file config.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/config.json\n",
      "`text_config` is `None`. Initializing the `CLIPTextConfig` with default values.\n",
      "`vision_config` is `None`. initializing the `CLIPVisionConfig` with default values.\n",
      "Model config CLIPConfig {\n",
      "  \"_name_or_path\": \"openai/clip-vit-base-patch32\",\n",
      "  \"architectures\": [\n",
      "    \"CLIPModel\"\n",
      "  ],\n",
      "  \"initializer_factor\": 1.0,\n",
      "  \"logit_scale_init_value\": 2.6592,\n",
      "  \"model_type\": \"clip\",\n",
      "  \"projection_dim\": 512,\n",
      "  \"text_config\": {\n",
      "    \"bos_token_id\": 0,\n",
      "    \"dropout\": 0.0,\n",
      "    \"eos_token_id\": 2,\n",
      "    \"model_type\": \"clip_text_model\"\n",
      "  },\n",
      "  \"transformers_version\": \"4.36.2\",\n",
      "  \"vision_config\": {\n",
      "    \"dropout\": 0.0,\n",
      "    \"model_type\": \"clip_vision_model\"\n",
      "  }\n",
      "}\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from PIL import Image\n",
    "import requests\n",
    "from transformers import AutoProcessor, CLIPModel\n",
    "\n",
    "model = CLIPModel.from_pretrained(\"openai/clip-vit-base-patch32\")\n",
    "processor = AutoProcessor.from_pretrained(\"openai/clip-vit-base-patch32\")\n",
    "\n",
    "url = \"http://images.cocodataset.org/val2017/000000039769.jpg\"\n",
    "image = Image.open(requests.get(url, stream=True).raw)\n",
    "\n",
    "inputs = processor(images=image, return_tensors=\"pt\")\n",
    "\n",
    "image_features = model.get_image_features(**inputs)"
   ]
  },
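   {
    "cell_type": "code",
    "execution_count": null,
    "id": "c0ffee06-6666-4a2b-9c3d-000000000006",
    "metadata": {},
    "outputs": [],
    "source": [
     "# What get_image_features does under the hood (a sketch based on CLIPModel in\n",
     "# transformers 4.36): run the vision tower, take pooler_output, and project it\n",
     "# through visual_projection. manual_features is just an illustrative name.\n",
     "with torch.no_grad():\n",
     "    pooled = model.vision_model(**inputs).pooler_output\n",
     "    manual_features = model.visual_projection(pooled)\n",
     "torch.allclose(manual_features, image_features, atol=1e-5)"
    ]
   },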
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "9ff63766-b706-452b-b735-bf9000fb9c20",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([1, 512])"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "image_features.shape"
   ]
  },
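   {
    "cell_type": "code",
    "execution_count": null,
    "id": "c0ffee07-7777-4a2b-9c3d-000000000007",
    "metadata": {},
    "outputs": [],
    "source": [
     "# Usage note: for similarity/retrieval, CLIP embeddings are usually L2-normalized first\n",
     "# (get_image_features itself returns the unnormalized projection).\n",
     "normed = image_features / image_features.norm(p=2, dim=-1, keepdim=True)\n",
     "normed.norm(dim=-1)"
    ]
   },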
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "82566e7b-3c91-421a-94c5-f1e2b3e91c8c",
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "loading configuration file config.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/config.json\n",
      "Model config CLIPVisionConfig {\n",
      "  \"attention_dropout\": 0.0,\n",
      "  \"dropout\": 0.0,\n",
      "  \"hidden_act\": \"quick_gelu\",\n",
      "  \"hidden_size\": 768,\n",
      "  \"image_size\": 224,\n",
      "  \"initializer_factor\": 1.0,\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"intermediate_size\": 3072,\n",
      "  \"layer_norm_eps\": 1e-05,\n",
      "  \"model_type\": \"clip_vision_model\",\n",
      "  \"num_attention_heads\": 12,\n",
      "  \"num_channels\": 3,\n",
      "  \"num_hidden_layers\": 12,\n",
      "  \"patch_size\": 32,\n",
      "  \"projection_dim\": 512,\n",
      "  \"transformers_version\": \"4.36.2\"\n",
      "}\n",
      "\n",
      "loading weights file pytorch_model.bin from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/pytorch_model.bin\n",
      "Some weights of the model checkpoint at openai/clip-vit-base-patch32 were not used when initializing CLIPVisionModel: ['text_model.encoder.layers.8.mlp.fc2.weight', 'text_model.encoder.layers.4.self_attn.v_proj.weight', 'text_model.encoder.layers.2.mlp.fc2.bias', 'text_model.encoder.layers.2.self_attn.q_proj.bias', 'text_model.encoder.layers.4.self_attn.v_proj.bias', 'text_model.encoder.layers.6.self_attn.k_proj.bias', 'text_model.encoder.layers.9.self_attn.k_proj.weight', 'text_model.encoder.layers.11.self_attn.q_proj.bias', 'text_model.encoder.layers.3.self_attn.out_proj.bias', 'text_model.encoder.layers.11.self_attn.k_proj.bias', 'text_model.encoder.layers.7.self_attn.k_proj.bias', 'text_model.encoder.layers.1.self_attn.q_proj.weight', 'text_model.encoder.layers.11.layer_norm1.bias', 'text_model.encoder.layers.11.mlp.fc2.bias', 'text_model.encoder.layers.10.layer_norm1.bias', 'text_model.encoder.layers.7.self_attn.q_proj.bias', 'text_model.encoder.layers.11.self_attn.k_proj.weight', 'text_model.encoder.layers.5.self_attn.q_proj.bias', 'text_model.encoder.layers.0.self_attn.v_proj.bias', 'logit_scale', 'text_model.encoder.layers.8.mlp.fc1.bias', 'text_model.encoder.layers.6.layer_norm1.bias', 'text_model.encoder.layers.5.self_attn.out_proj.weight', 'text_model.encoder.layers.7.self_attn.out_proj.bias', 'text_model.embeddings.token_embedding.weight', 'text_model.encoder.layers.8.layer_norm2.bias', 'text_model.encoder.layers.9.self_attn.v_proj.weight', 'text_model.encoder.layers.1.mlp.fc1.weight', 'text_model.encoder.layers.0.layer_norm1.weight', 'text_model.encoder.layers.6.self_attn.k_proj.weight', 'text_model.encoder.layers.3.self_attn.q_proj.weight', 'text_model.encoder.layers.2.layer_norm1.weight', 'text_model.encoder.layers.0.self_attn.v_proj.weight', 'text_model.encoder.layers.7.self_attn.q_proj.weight', 'text_model.encoder.layers.1.layer_norm2.weight', 'text_model.encoder.layers.2.self_attn.out_proj.weight', 'text_model.encoder.layers.3.self_attn.k_proj.weight', 'text_model.encoder.layers.7.mlp.fc2.bias', 'text_model.encoder.layers.10.self_attn.out_proj.weight', 'text_model.encoder.layers.2.self_attn.q_proj.weight', 'text_model.encoder.layers.1.self_attn.k_proj.weight', 'text_model.encoder.layers.4.layer_norm1.weight', 'text_model.encoder.layers.0.self_attn.q_proj.weight', 'text_model.encoder.layers.0.self_attn.out_proj.bias', 'text_model.encoder.layers.4.self_attn.out_proj.bias', 'text_model.encoder.layers.5.self_attn.k_proj.weight', 'visual_projection.weight', 'text_model.encoder.layers.6.layer_norm2.bias', 'text_model.encoder.layers.6.layer_norm1.weight', 'text_model.encoder.layers.4.self_attn.out_proj.weight', 'text_model.encoder.layers.10.mlp.fc2.bias', 'text_model.encoder.layers.10.mlp.fc1.weight', 'text_model.encoder.layers.6.self_attn.out_proj.weight', 'text_model.encoder.layers.9.layer_norm1.weight', 'text_model.encoder.layers.11.layer_norm2.weight', 'text_model.encoder.layers.6.self_attn.q_proj.bias', 'text_model.encoder.layers.5.mlp.fc1.weight', 'text_model.encoder.layers.2.mlp.fc1.weight', 'text_model.encoder.layers.11.self_attn.out_proj.weight', 'text_model.encoder.layers.0.self_attn.out_proj.weight', 'text_model.encoder.layers.11.mlp.fc2.weight', 'text_model.encoder.layers.7.layer_norm2.weight', 'text_model.encoder.layers.10.self_attn.v_proj.bias', 'text_model.encoder.layers.9.mlp.fc1.bias', 'text_model.encoder.layers.8.self_attn.v_proj.weight', 'text_model.encoder.layers.3.layer_norm1.bias', 'text_model.encoder.layers.6.self_attn.v_proj.bias', 
'text_model.encoder.layers.1.self_attn.v_proj.bias', 'text_model.encoder.layers.9.self_attn.q_proj.weight', 'text_model.encoder.layers.4.self_attn.k_proj.weight', 'text_model.encoder.layers.7.layer_norm1.weight', 'text_model.encoder.layers.10.self_attn.k_proj.weight', 'text_model.encoder.layers.7.self_attn.v_proj.bias', 'text_model.encoder.layers.7.mlp.fc1.bias', 'text_model.encoder.layers.11.mlp.fc1.weight', 'text_model.encoder.layers.2.mlp.fc1.bias', 'text_model.encoder.layers.3.mlp.fc2.bias', 'text_model.encoder.layers.8.self_attn.q_proj.weight', 'text_model.encoder.layers.0.mlp.fc1.weight', 'text_model.encoder.layers.11.self_attn.out_proj.bias', 'text_model.encoder.layers.1.self_attn.v_proj.weight', 'text_model.encoder.layers.0.self_attn.k_proj.weight', 'text_model.encoder.layers.9.layer_norm1.bias', 'text_model.final_layer_norm.weight', 'text_model.encoder.layers.3.layer_norm1.weight', 'text_model.encoder.layers.4.mlp.fc1.bias', 'text_model.encoder.layers.1.layer_norm1.weight', 'text_model.encoder.layers.10.layer_norm2.bias', 'text_model.encoder.layers.9.self_attn.v_proj.bias', 'text_model.encoder.layers.10.self_attn.k_proj.bias', 'text_model.encoder.layers.8.mlp.fc2.bias', 'text_model.encoder.layers.5.mlp.fc2.bias', 'text_model.encoder.layers.6.self_attn.q_proj.weight', 'text_model.encoder.layers.5.self_attn.out_proj.bias', 'text_model.encoder.layers.9.mlp.fc2.bias', 'text_model.encoder.layers.5.layer_norm2.weight', 'text_model.encoder.layers.2.mlp.fc2.weight', 'text_model.encoder.layers.3.self_attn.out_proj.weight', 'text_model.encoder.layers.6.mlp.fc2.weight', 'text_model.encoder.layers.1.self_attn.out_proj.weight', 'text_model.encoder.layers.1.mlp.fc2.bias', 'text_model.encoder.layers.7.mlp.fc2.weight', 'text_model.encoder.layers.10.self_attn.v_proj.weight', 'text_model.encoder.layers.11.self_attn.v_proj.bias', 'text_model.encoder.layers.4.layer_norm1.bias', 'text_model.encoder.layers.4.layer_norm2.bias', 'text_model.encoder.layers.8.self_attn.q_proj.bias', 'text_model.embeddings.position_ids', 'text_model.encoder.layers.10.layer_norm2.weight', 'text_model.encoder.layers.1.self_attn.out_proj.bias', 'text_model.encoder.layers.2.layer_norm2.weight', 'text_model.encoder.layers.10.self_attn.q_proj.weight', 'text_model.encoder.layers.4.mlp.fc1.weight', 'text_model.encoder.layers.8.layer_norm1.bias', 'text_model.encoder.layers.2.self_attn.k_proj.weight', 'text_model.encoder.layers.5.mlp.fc1.bias', 'text_model.encoder.layers.9.self_attn.out_proj.bias', 'text_model.encoder.layers.7.self_attn.v_proj.weight', 'text_model.encoder.layers.2.self_attn.k_proj.bias', 'text_model.encoder.layers.5.self_attn.k_proj.bias', 'text_model.encoder.layers.8.self_attn.out_proj.bias', 'text_model.encoder.layers.7.self_attn.k_proj.weight', 'text_model.encoder.layers.6.mlp.fc1.weight', 'text_model.encoder.layers.6.mlp.fc1.bias', 'text_model.encoder.layers.3.self_attn.v_proj.weight', 'text_model.encoder.layers.3.self_attn.q_proj.bias', 'text_model.encoder.layers.9.self_attn.out_proj.weight', 'text_model.encoder.layers.3.mlp.fc1.bias', 'text_model.encoder.layers.0.self_attn.q_proj.bias', 'text_model.encoder.layers.1.layer_norm2.bias', 'text_model.encoder.layers.8.layer_norm2.weight', 'text_model.encoder.layers.5.self_attn.q_proj.weight', 'text_model.encoder.layers.4.layer_norm2.weight', 'text_model.encoder.layers.4.mlp.fc2.bias', 'text_model.encoder.layers.9.mlp.fc2.weight', 'text_model.encoder.layers.8.self_attn.k_proj.weight', 'text_model.encoder.layers.10.layer_norm1.weight', 
'text_model.encoder.layers.0.self_attn.k_proj.bias', 'text_model.encoder.layers.8.self_attn.k_proj.bias', 'text_model.encoder.layers.9.layer_norm2.weight', 'text_model.encoder.layers.4.self_attn.k_proj.bias', 'text_model.encoder.layers.6.layer_norm2.weight', 'text_model.encoder.layers.0.layer_norm2.weight', 'text_model.encoder.layers.5.self_attn.v_proj.bias', 'text_model.encoder.layers.3.layer_norm2.bias', 'text_model.encoder.layers.8.mlp.fc1.weight', 'text_model.encoder.layers.4.self_attn.q_proj.bias', 'text_model.encoder.layers.8.layer_norm1.weight', 'text_model.encoder.layers.2.self_attn.v_proj.weight', 'text_model.encoder.layers.3.self_attn.v_proj.bias', 'text_model.encoder.layers.11.mlp.fc1.bias', 'text_model.encoder.layers.6.mlp.fc2.bias', 'text_model.encoder.layers.1.mlp.fc1.bias', 'text_model.encoder.layers.2.self_attn.v_proj.bias', 'text_model.encoder.layers.5.mlp.fc2.weight', 'text_model.encoder.layers.8.self_attn.v_proj.bias', 'text_model.encoder.layers.10.self_attn.out_proj.bias', 'text_model.encoder.layers.5.layer_norm1.bias', 'text_model.encoder.layers.5.self_attn.v_proj.weight', 'text_model.encoder.layers.10.self_attn.q_proj.bias', 'text_model.encoder.layers.2.layer_norm2.bias', 'text_model.encoder.layers.7.layer_norm1.bias', 'text_model.encoder.layers.4.mlp.fc2.weight', 'text_model.encoder.layers.10.mlp.fc2.weight', 'text_model.encoder.layers.3.mlp.fc1.weight', 'text_model.encoder.layers.5.layer_norm2.bias', 'text_model.encoder.layers.9.self_attn.q_proj.bias', 'text_model.encoder.layers.1.self_attn.k_proj.bias', 'text_model.encoder.layers.7.self_attn.out_proj.weight', 'text_model.encoder.layers.0.mlp.fc2.weight', 'text_model.encoder.layers.11.self_attn.v_proj.weight', 'text_model.encoder.layers.1.layer_norm1.bias', 'text_model.encoder.layers.1.mlp.fc2.weight', 'text_model.encoder.layers.9.layer_norm2.bias', 'text_model.encoder.layers.9.self_attn.k_proj.bias', 'text_model.encoder.layers.11.layer_norm1.weight', 'text_model.encoder.layers.8.self_attn.out_proj.weight', 'text_model.encoder.layers.0.layer_norm1.bias', 'text_model.encoder.layers.7.mlp.fc1.weight', 'text_model.encoder.layers.0.mlp.fc1.bias', 'text_model.encoder.layers.0.layer_norm2.bias', 'text_model.encoder.layers.3.self_attn.k_proj.bias', 'text_model.encoder.layers.5.layer_norm1.weight', 'text_model.encoder.layers.3.layer_norm2.weight', 'text_model.encoder.layers.1.self_attn.q_proj.bias', 'text_model.encoder.layers.2.self_attn.out_proj.bias', 'text_model.encoder.layers.3.mlp.fc2.weight', 'text_model.encoder.layers.11.self_attn.q_proj.weight', 'text_model.final_layer_norm.bias', 'text_model.encoder.layers.6.self_attn.v_proj.weight', 'text_model.encoder.layers.0.mlp.fc2.bias', 'text_model.encoder.layers.7.layer_norm2.bias', 'text_model.encoder.layers.10.mlp.fc1.bias', 'text_model.embeddings.position_embedding.weight', 'text_model.encoder.layers.6.self_attn.out_proj.bias', 'text_model.encoder.layers.2.layer_norm1.bias', 'text_model.encoder.layers.9.mlp.fc1.weight', 'text_projection.weight', 'text_model.encoder.layers.11.layer_norm2.bias', 'text_model.encoder.layers.4.self_attn.q_proj.weight']\n",
      "- This IS expected if you are initializing CLIPVisionModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing CLIPVisionModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "All the weights of CLIPVisionModel were initialized from the model checkpoint at openai/clip-vit-base-patch32.\n",
      "If your task is similar to the task the model of the checkpoint was trained on, you can already use CLIPVisionModel for predictions without further training.\n",
      "loading configuration file preprocessor_config.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/preprocessor_config.json\n",
      "loading configuration file preprocessor_config.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/preprocessor_config.json\n",
      "loading configuration file config.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/config.json\n",
      "`text_config` is `None`. Initializing the `CLIPTextConfig` with default values.\n",
      "`vision_config` is `None`. initializing the `CLIPVisionConfig` with default values.\n",
      "Model config CLIPConfig {\n",
      "  \"_name_or_path\": \"openai/clip-vit-base-patch32\",\n",
      "  \"architectures\": [\n",
      "    \"CLIPModel\"\n",
      "  ],\n",
      "  \"initializer_factor\": 1.0,\n",
      "  \"logit_scale_init_value\": 2.6592,\n",
      "  \"model_type\": \"clip\",\n",
      "  \"projection_dim\": 512,\n",
      "  \"text_config\": {\n",
      "    \"bos_token_id\": 0,\n",
      "    \"dropout\": 0.0,\n",
      "    \"eos_token_id\": 2,\n",
      "    \"model_type\": \"clip_text_model\"\n",
      "  },\n",
      "  \"transformers_version\": \"4.36.2\",\n",
      "  \"vision_config\": {\n",
      "    \"dropout\": 0.0,\n",
      "    \"model_type\": \"clip_vision_model\"\n",
      "  }\n",
      "}\n",
      "\n",
      "loading configuration file preprocessor_config.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/preprocessor_config.json\n",
      "size should be a dictionary on of the following set of keys: ({'width', 'height'}, {'shortest_edge'}, {'longest_edge', 'shortest_edge'}, {'longest_edge'}), got 224. Converted to {'shortest_edge': 224}.\n",
      "crop_size should be a dictionary on of the following set of keys: ({'width', 'height'}, {'shortest_edge'}, {'longest_edge', 'shortest_edge'}, {'longest_edge'}), got 224. Converted to {'height': 224, 'width': 224}.\n",
      "Image processor CLIPImageProcessor {\n",
      "  \"crop_size\": {\n",
      "    \"height\": 224,\n",
      "    \"width\": 224\n",
      "  },\n",
      "  \"do_center_crop\": true,\n",
      "  \"do_convert_rgb\": true,\n",
      "  \"do_normalize\": true,\n",
      "  \"do_rescale\": true,\n",
      "  \"do_resize\": true,\n",
      "  \"feature_extractor_type\": \"CLIPFeatureExtractor\",\n",
      "  \"image_mean\": [\n",
      "    0.48145466,\n",
      "    0.4578275,\n",
      "    0.40821073\n",
      "  ],\n",
      "  \"image_processor_type\": \"CLIPImageProcessor\",\n",
      "  \"image_std\": [\n",
      "    0.26862954,\n",
      "    0.26130258,\n",
      "    0.27577711\n",
      "  ],\n",
      "  \"resample\": 3,\n",
      "  \"rescale_factor\": 0.00392156862745098,\n",
      "  \"size\": {\n",
      "    \"shortest_edge\": 224\n",
      "  }\n",
      "}\n",
      "\n",
      "loading file vocab.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/vocab.json\n",
      "loading file merges.txt from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/merges.txt\n",
      "loading file tokenizer.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/tokenizer.json\n",
      "loading file added_tokens.json from cache at None\n",
      "loading file special_tokens_map.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/special_tokens_map.json\n",
      "loading file tokenizer_config.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/tokenizer_config.json\n",
      "loading configuration file config.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/config.json\n",
      "`text_config` is `None`. Initializing the `CLIPTextConfig` with default values.\n",
      "`vision_config` is `None`. initializing the `CLIPVisionConfig` with default values.\n",
      "Model config CLIPConfig {\n",
      "  \"_name_or_path\": \"openai/clip-vit-base-patch32\",\n",
      "  \"architectures\": [\n",
      "    \"CLIPModel\"\n",
      "  ],\n",
      "  \"initializer_factor\": 1.0,\n",
      "  \"logit_scale_init_value\": 2.6592,\n",
      "  \"model_type\": \"clip\",\n",
      "  \"projection_dim\": 512,\n",
      "  \"text_config\": {\n",
      "    \"bos_token_id\": 0,\n",
      "    \"dropout\": 0.0,\n",
      "    \"eos_token_id\": 2,\n",
      "    \"model_type\": \"clip_text_model\"\n",
      "  },\n",
      "  \"transformers_version\": \"4.36.2\",\n",
      "  \"vision_config\": {\n",
      "    \"dropout\": 0.0,\n",
      "    \"model_type\": \"clip_vision_model\"\n",
      "  }\n",
      "}\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from PIL import Image\n",
    "import requests\n",
    "from transformers import AutoProcessor, CLIPVisionModel\n",
    "\n",
    "model = CLIPVisionModel.from_pretrained(\"openai/clip-vit-base-patch32\")\n",
    "processor = AutoProcessor.from_pretrained(\"openai/clip-vit-base-patch32\")\n",
    "\n",
    "url = \"http://images.cocodataset.org/val2017/000000039769.jpg\"\n",
    "image = Image.open(requests.get(url, stream=True).raw)\n",
    "\n",
    "inputs = processor(images=image, return_tensors=\"pt\")\n",
    "\n",
    "outputs = model(**inputs)\n",
    "last_hidden_state = outputs.last_hidden_state\n",
    "pooled_output = outputs.pooler_output  # pooled CLS states"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "bcf0a7b3-6cbb-492e-bc2c-42e3edbe6a0c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([1, 768])"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pooled_output.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "67240294-c7a0-4e94-a8c1-86bfe1b21977",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import CLIPPreTrainedModel\n",
    "from transformers.models.clip.modeling_clip import CLIPVisionModelOutput, CLIPVisionTransformer\n",
    "from typing import Optional, Union, Tuple"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "cc9b20db-7f84-44c3-9c78-e84164ccc192",
   "metadata": {},
   "outputs": [],
   "source": [
    "class VisionLanguageConnector(nn.Module):\n",
    "    def __init__(self, hidden_size, projection_dim):\n",
    "        super().__init__()\n",
    "        self.mlp = nn.Sequential(\n",
    "            nn.Linear(hidden_size, hidden_size, bias=False),\n",
    "            nn.GELU(),\n",
    "            nn.Linear(hidden_size, projection_dim, bias=False)\n",
    "        )\n",
    "\n",
    "    def forward(self, x):\n",
    "        return self.mlp(x)\n",
    "        \n",
    "class ClipWithProjection(CLIPPreTrainedModel):\n",
    "    config_class = CLIPVisionConfig\n",
    "    main_input_name = \"pixel_values\"\n",
    "\n",
    "    def __init__(self, config: CLIPVisionConfig):\n",
    "        super().__init__(config)\n",
    "\n",
    "        self.vision_model = CLIPVisionTransformer(config)\n",
    "        self.vision_model.\n",
    "        self.vision_language_connector = VisionLanguageConnector(config.hidden_size, config.projection_dim)\n",
    "\n",
    "        # Initialize weights and apply final processing\n",
    "        self.post_init()\n",
    "\n",
    "    def forward(\n",
    "        self,\n",
    "        pixel_values: Optional[torch.FloatTensor] = None,\n",
    "        output_attentions: Optional[bool] = None,\n",
    "        output_hidden_states: Optional[bool] = None,\n",
    "        return_dict: Optional[bool] = None,\n",
    "    ) -> Union[Tuple, CLIPVisionModelOutput]:\n",
    "        return_dict = return_dict if return_dict is not None else self.config.use_return_dict\n",
    "\n",
    "        vision_outputs = self.vision_model(\n",
    "            pixel_values=pixel_values,\n",
    "            output_attentions=output_attentions,\n",
    "            output_hidden_states=output_hidden_states,\n",
    "            return_dict=return_dict,\n",
    "        )\n",
    "\n",
    "        pooled_output = vision_outputs[1]  # pooled_output\n",
    "\n",
    "        image_embeds = self.vision_language_connector(pooled_output)\n",
    "\n",
    "        if not return_dict:\n",
    "            outputs = (image_embeds, vision_outputs[0]) + vision_outputs[2:]\n",
    "            return tuple(output for output in outputs if output is not None)\n",
    "\n",
    "        return CLIPVisionModelOutput(\n",
    "            image_embeds=image_embeds,\n",
    "            last_hidden_state=vision_outputs.last_hidden_state,\n",
    "            hidden_states=vision_outputs.hidden_states,\n",
    "            attentions=vision_outputs.attentions,\n",
    "        )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "id": "a4892ab8-39d2-41c9-ad2a-04711c22b95f",
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "loading configuration file config.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/config.json\n",
      "Model config CLIPVisionConfig {\n",
      "  \"attention_dropout\": 0.0,\n",
      "  \"dropout\": 0.0,\n",
      "  \"hidden_act\": \"quick_gelu\",\n",
      "  \"hidden_size\": 768,\n",
      "  \"image_size\": 224,\n",
      "  \"initializer_factor\": 1.0,\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"intermediate_size\": 3072,\n",
      "  \"layer_norm_eps\": 1e-05,\n",
      "  \"model_type\": \"clip_vision_model\",\n",
      "  \"num_attention_heads\": 12,\n",
      "  \"num_channels\": 3,\n",
      "  \"num_hidden_layers\": 12,\n",
      "  \"patch_size\": 32,\n",
      "  \"projection_dim\": 512,\n",
      "  \"transformers_version\": \"4.36.2\"\n",
      "}\n",
      "\n",
      "loading weights file pytorch_model.bin from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/pytorch_model.bin\n",
      "Some weights of the model checkpoint at openai/clip-vit-base-patch32 were not used when initializing ClipWithProjection: ['text_model.encoder.layers.8.mlp.fc2.weight', 'text_model.encoder.layers.4.self_attn.v_proj.weight', 'text_model.encoder.layers.2.mlp.fc2.bias', 'text_model.encoder.layers.2.self_attn.q_proj.bias', 'text_model.encoder.layers.4.self_attn.v_proj.bias', 'text_model.encoder.layers.6.self_attn.k_proj.bias', 'text_model.encoder.layers.9.self_attn.k_proj.weight', 'text_model.encoder.layers.11.self_attn.q_proj.bias', 'text_model.encoder.layers.3.self_attn.out_proj.bias', 'text_model.encoder.layers.11.self_attn.k_proj.bias', 'text_model.encoder.layers.7.self_attn.k_proj.bias', 'text_model.encoder.layers.1.self_attn.q_proj.weight', 'text_model.encoder.layers.11.layer_norm1.bias', 'text_model.encoder.layers.11.mlp.fc2.bias', 'text_model.encoder.layers.10.layer_norm1.bias', 'text_model.encoder.layers.7.self_attn.q_proj.bias', 'text_model.encoder.layers.11.self_attn.k_proj.weight', 'text_model.encoder.layers.5.self_attn.q_proj.bias', 'text_model.encoder.layers.0.self_attn.v_proj.bias', 'logit_scale', 'text_model.encoder.layers.8.mlp.fc1.bias', 'text_model.encoder.layers.6.layer_norm1.bias', 'text_model.encoder.layers.5.self_attn.out_proj.weight', 'text_model.encoder.layers.7.self_attn.out_proj.bias', 'text_model.embeddings.token_embedding.weight', 'text_model.encoder.layers.8.layer_norm2.bias', 'text_model.encoder.layers.9.self_attn.v_proj.weight', 'text_model.encoder.layers.1.mlp.fc1.weight', 'text_model.encoder.layers.0.layer_norm1.weight', 'text_model.encoder.layers.6.self_attn.k_proj.weight', 'text_model.encoder.layers.3.self_attn.q_proj.weight', 'text_model.encoder.layers.2.layer_norm1.weight', 'text_model.encoder.layers.0.self_attn.v_proj.weight', 'text_model.encoder.layers.7.self_attn.q_proj.weight', 'text_model.encoder.layers.1.layer_norm2.weight', 'text_model.encoder.layers.2.self_attn.out_proj.weight', 'text_model.encoder.layers.3.self_attn.k_proj.weight', 'text_model.encoder.layers.7.mlp.fc2.bias', 'text_model.encoder.layers.10.self_attn.out_proj.weight', 'text_model.encoder.layers.2.self_attn.q_proj.weight', 'text_model.encoder.layers.1.self_attn.k_proj.weight', 'text_model.encoder.layers.4.layer_norm1.weight', 'text_model.encoder.layers.0.self_attn.q_proj.weight', 'text_model.encoder.layers.0.self_attn.out_proj.bias', 'text_model.encoder.layers.4.self_attn.out_proj.bias', 'text_model.encoder.layers.5.self_attn.k_proj.weight', 'visual_projection.weight', 'text_model.encoder.layers.6.layer_norm2.bias', 'text_model.encoder.layers.6.layer_norm1.weight', 'text_model.encoder.layers.4.self_attn.out_proj.weight', 'text_model.encoder.layers.10.mlp.fc2.bias', 'text_model.encoder.layers.10.mlp.fc1.weight', 'text_model.encoder.layers.6.self_attn.out_proj.weight', 'text_model.encoder.layers.9.layer_norm1.weight', 'text_model.encoder.layers.11.layer_norm2.weight', 'text_model.encoder.layers.6.self_attn.q_proj.bias', 'text_model.encoder.layers.5.mlp.fc1.weight', 'text_model.encoder.layers.2.mlp.fc1.weight', 'text_model.encoder.layers.11.self_attn.out_proj.weight', 'text_model.encoder.layers.0.self_attn.out_proj.weight', 'text_model.encoder.layers.11.mlp.fc2.weight', 'text_model.encoder.layers.7.layer_norm2.weight', 'text_model.encoder.layers.10.self_attn.v_proj.bias', 'text_model.encoder.layers.9.mlp.fc1.bias', 'text_model.encoder.layers.8.self_attn.v_proj.weight', 'text_model.encoder.layers.3.layer_norm1.bias', 'text_model.encoder.layers.6.self_attn.v_proj.bias', 
'text_model.encoder.layers.1.self_attn.v_proj.bias', 'text_model.encoder.layers.9.self_attn.q_proj.weight', 'text_model.encoder.layers.4.self_attn.k_proj.weight', 'text_model.encoder.layers.7.layer_norm1.weight', 'text_model.encoder.layers.10.self_attn.k_proj.weight', 'text_model.encoder.layers.7.self_attn.v_proj.bias', 'text_model.encoder.layers.7.mlp.fc1.bias', 'text_model.encoder.layers.11.mlp.fc1.weight', 'text_model.encoder.layers.2.mlp.fc1.bias', 'text_model.encoder.layers.3.mlp.fc2.bias', 'text_model.encoder.layers.8.self_attn.q_proj.weight', 'text_model.encoder.layers.0.mlp.fc1.weight', 'text_model.encoder.layers.11.self_attn.out_proj.bias', 'text_model.encoder.layers.1.self_attn.v_proj.weight', 'text_model.encoder.layers.0.self_attn.k_proj.weight', 'text_model.encoder.layers.9.layer_norm1.bias', 'text_model.final_layer_norm.weight', 'text_model.encoder.layers.3.layer_norm1.weight', 'text_model.encoder.layers.4.mlp.fc1.bias', 'text_model.encoder.layers.1.layer_norm1.weight', 'text_model.encoder.layers.10.layer_norm2.bias', 'text_model.encoder.layers.9.self_attn.v_proj.bias', 'text_model.encoder.layers.10.self_attn.k_proj.bias', 'text_model.encoder.layers.8.mlp.fc2.bias', 'text_model.encoder.layers.5.mlp.fc2.bias', 'text_model.encoder.layers.6.self_attn.q_proj.weight', 'text_model.encoder.layers.5.self_attn.out_proj.bias', 'text_model.encoder.layers.9.mlp.fc2.bias', 'text_model.encoder.layers.5.layer_norm2.weight', 'text_model.encoder.layers.2.mlp.fc2.weight', 'text_model.encoder.layers.3.self_attn.out_proj.weight', 'text_model.encoder.layers.6.mlp.fc2.weight', 'text_model.encoder.layers.1.self_attn.out_proj.weight', 'text_model.encoder.layers.1.mlp.fc2.bias', 'text_model.encoder.layers.7.mlp.fc2.weight', 'text_model.encoder.layers.10.self_attn.v_proj.weight', 'text_model.encoder.layers.11.self_attn.v_proj.bias', 'text_model.encoder.layers.4.layer_norm1.bias', 'text_model.encoder.layers.4.layer_norm2.bias', 'text_model.encoder.layers.8.self_attn.q_proj.bias', 'text_model.embeddings.position_ids', 'text_model.encoder.layers.10.layer_norm2.weight', 'text_model.encoder.layers.1.self_attn.out_proj.bias', 'text_model.encoder.layers.2.layer_norm2.weight', 'text_model.encoder.layers.10.self_attn.q_proj.weight', 'text_model.encoder.layers.4.mlp.fc1.weight', 'text_model.encoder.layers.8.layer_norm1.bias', 'text_model.encoder.layers.2.self_attn.k_proj.weight', 'text_model.encoder.layers.5.mlp.fc1.bias', 'text_model.encoder.layers.9.self_attn.out_proj.bias', 'text_model.encoder.layers.7.self_attn.v_proj.weight', 'text_model.encoder.layers.2.self_attn.k_proj.bias', 'text_model.encoder.layers.5.self_attn.k_proj.bias', 'text_model.encoder.layers.8.self_attn.out_proj.bias', 'text_model.encoder.layers.7.self_attn.k_proj.weight', 'text_model.encoder.layers.6.mlp.fc1.weight', 'text_model.encoder.layers.6.mlp.fc1.bias', 'text_model.encoder.layers.3.self_attn.v_proj.weight', 'text_model.encoder.layers.3.self_attn.q_proj.bias', 'text_model.encoder.layers.9.self_attn.out_proj.weight', 'text_model.encoder.layers.3.mlp.fc1.bias', 'text_model.encoder.layers.0.self_attn.q_proj.bias', 'text_model.encoder.layers.1.layer_norm2.bias', 'text_model.encoder.layers.8.layer_norm2.weight', 'text_model.encoder.layers.5.self_attn.q_proj.weight', 'text_model.encoder.layers.4.layer_norm2.weight', 'text_model.encoder.layers.4.mlp.fc2.bias', 'text_model.encoder.layers.9.mlp.fc2.weight', 'text_model.encoder.layers.8.self_attn.k_proj.weight', 'text_model.encoder.layers.10.layer_norm1.weight', 
'text_model.encoder.layers.0.self_attn.k_proj.bias', 'text_model.encoder.layers.8.self_attn.k_proj.bias', 'text_model.encoder.layers.9.layer_norm2.weight', 'text_model.encoder.layers.4.self_attn.k_proj.bias', 'text_model.encoder.layers.6.layer_norm2.weight', 'text_model.encoder.layers.0.layer_norm2.weight', 'text_model.encoder.layers.5.self_attn.v_proj.bias', 'text_model.encoder.layers.3.layer_norm2.bias', 'text_model.encoder.layers.8.mlp.fc1.weight', 'text_model.encoder.layers.4.self_attn.q_proj.bias', 'text_model.encoder.layers.8.layer_norm1.weight', 'text_model.encoder.layers.2.self_attn.v_proj.weight', 'text_model.encoder.layers.3.self_attn.v_proj.bias', 'text_model.encoder.layers.11.mlp.fc1.bias', 'text_model.encoder.layers.6.mlp.fc2.bias', 'text_model.encoder.layers.1.mlp.fc1.bias', 'text_model.encoder.layers.2.self_attn.v_proj.bias', 'text_model.encoder.layers.5.mlp.fc2.weight', 'text_model.encoder.layers.8.self_attn.v_proj.bias', 'text_model.encoder.layers.10.self_attn.out_proj.bias', 'text_model.encoder.layers.5.layer_norm1.bias', 'text_model.encoder.layers.5.self_attn.v_proj.weight', 'text_model.encoder.layers.10.self_attn.q_proj.bias', 'text_model.encoder.layers.2.layer_norm2.bias', 'text_model.encoder.layers.7.layer_norm1.bias', 'text_model.encoder.layers.4.mlp.fc2.weight', 'text_model.encoder.layers.10.mlp.fc2.weight', 'text_model.encoder.layers.3.mlp.fc1.weight', 'text_model.encoder.layers.5.layer_norm2.bias', 'text_model.encoder.layers.9.self_attn.q_proj.bias', 'text_model.encoder.layers.1.self_attn.k_proj.bias', 'text_model.encoder.layers.7.self_attn.out_proj.weight', 'text_model.encoder.layers.0.mlp.fc2.weight', 'text_model.encoder.layers.11.self_attn.v_proj.weight', 'text_model.encoder.layers.1.layer_norm1.bias', 'text_model.encoder.layers.1.mlp.fc2.weight', 'text_model.encoder.layers.9.layer_norm2.bias', 'text_model.encoder.layers.9.self_attn.k_proj.bias', 'text_model.encoder.layers.11.layer_norm1.weight', 'text_model.encoder.layers.8.self_attn.out_proj.weight', 'text_model.encoder.layers.0.layer_norm1.bias', 'text_model.encoder.layers.7.mlp.fc1.weight', 'text_model.encoder.layers.0.mlp.fc1.bias', 'text_model.encoder.layers.0.layer_norm2.bias', 'text_model.encoder.layers.3.self_attn.k_proj.bias', 'text_model.encoder.layers.5.layer_norm1.weight', 'text_model.encoder.layers.3.layer_norm2.weight', 'text_model.encoder.layers.1.self_attn.q_proj.bias', 'text_model.encoder.layers.2.self_attn.out_proj.bias', 'text_model.encoder.layers.3.mlp.fc2.weight', 'text_model.encoder.layers.11.self_attn.q_proj.weight', 'text_model.final_layer_norm.bias', 'text_model.encoder.layers.6.self_attn.v_proj.weight', 'text_model.encoder.layers.0.mlp.fc2.bias', 'text_model.encoder.layers.7.layer_norm2.bias', 'text_model.encoder.layers.10.mlp.fc1.bias', 'text_model.embeddings.position_embedding.weight', 'text_model.encoder.layers.6.self_attn.out_proj.bias', 'text_model.encoder.layers.2.layer_norm1.bias', 'text_model.encoder.layers.9.mlp.fc1.weight', 'text_projection.weight', 'text_model.encoder.layers.11.layer_norm2.bias', 'text_model.encoder.layers.4.self_attn.q_proj.weight']\n",
      "- This IS expected if you are initializing ClipWithProjection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing ClipWithProjection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "Some weights of ClipWithProjection were not initialized from the model checkpoint at openai/clip-vit-base-patch32 and are newly initialized: ['vision_language_connector.mlp.2.weight', 'vision_language_connector.mlp.0.weight']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    }
   ],
   "source": [
    "model = ClipWithProjection.from_pretrained(\"openai/clip-vit-base-patch32\")"
   ]
  },
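   {
    "cell_type": "code",
    "execution_count": null,
    "id": "c0ffee08-8888-4a2b-9c3d-000000000008",
    "metadata": {},
    "outputs": [],
    "source": [
     "# The connector is the only newly initialized part: two bias-free linear layers,\n",
     "# 768*768 + 768*512 = 983,040 parameters that would still need to be trained.\n",
     "sum(p.numel() for p in model.vision_language_connector.parameters())"
    ]
   },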
  {
   "cell_type": "code",
   "execution_count": 56,
   "id": "588ef914-5be9-49e1-b68d-b899e0e74edd",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "768"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.config.hidden_size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "id": "05d95b9e-9831-4415-860e-94793e29d210",
   "metadata": {},
   "outputs": [],
   "source": [
    "outputs = model(**inputs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "id": "185b1bff-6ffe-4cce-9255-ee7629feba54",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([1, 512])"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "outputs[0].shape"
   ]
  },
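   {
    "cell_type": "code",
    "execution_count": null,
    "id": "c0ffee09-9999-4a2b-9c3d-000000000009",
    "metadata": {},
    "outputs": [],
    "source": [
     "# The same projected embedding, accessed by name on the returned CLIPVisionModelOutput.\n",
     "outputs.image_embeds.shape"
    ]
   },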
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f983313c-8e0f-4805-af14-25bb69afd04c",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}