{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "99576983-f881-47c8-8b5e-c6f561a93e71",
   "metadata": {},
   "outputs": [],
   "source": [
    "import transformers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "58ba19f2-4b91-4f90-a33d-4c1ed17e202a",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import LlavaForConditionalGeneration, LlavaConfig, CLIPVisionConfig, PhiConfig\n",
    "\n",
    "# Initializing a CLIP-vision config\n",
    "vision_config = CLIPVisionConfig()\n",
    "\n",
    "# Initializing a Llama config\n",
    "text_config = PhiConfig()\n",
    "\n",
    "# Initializing a Llava llava-1.5-7b style configuration\n",
    "configuration = LlavaConfig(vision_config, text_config)\n",
    "\n",
    "# Initializing a model from the llava-1.5-7b style configuration\n",
    "model = LlavaForConditionalGeneration(configuration)\n",
    "\n",
    "# Accessing the model configuration\n",
    "configuration = model.config"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "a806a07a-fe72-45a3-8ceb-8e942c6c845d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LlavaConfig {\n",
       "  \"ignore_index\": -100,\n",
       "  \"image_token_index\": 32000,\n",
       "  \"model_type\": \"llava\",\n",
       "  \"projector_hidden_act\": \"gelu\",\n",
       "  \"text_config\": {\n",
       "    \"embd_pdrop\": 0.0,\n",
       "    \"hidden_act\": \"gelu_new\",\n",
       "    \"hidden_size\": 2048,\n",
       "    \"intermediate_size\": 8192,\n",
       "    \"layer_norm_eps\": 1e-05,\n",
       "    \"model_type\": \"phi\",\n",
       "    \"num_hidden_layers\": 24,\n",
       "    \"partial_rotary_factor\": 0.5,\n",
       "    \"qk_layernorm\": false,\n",
       "    \"resid_pdrop\": 0.0,\n",
       "    \"vocab_size\": 51200\n",
       "  },\n",
       "  \"transformers_version\": \"4.36.2\",\n",
       "  \"vision_config\": {\n",
       "    \"hidden_size\": 768,\n",
       "    \"image_size\": 224,\n",
       "    \"intermediate_size\": 3072,\n",
       "    \"model_type\": \"clip_vision_model\",\n",
       "    \"num_attention_heads\": 12,\n",
       "    \"num_hidden_layers\": 12,\n",
       "    \"patch_size\": 32,\n",
       "    \"projection_dim\": 512\n",
       "  },\n",
       "  \"vision_feature_layer\": -2,\n",
       "  \"vision_feature_select_strategy\": \"default\",\n",
       "  \"vocab_size\": 32000\n",
       "}"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.config"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "79efbc6b-f005-4a5c-82a1-112fa37f1904",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Cloning into 'llava-phi'...\n",
      "remote: Enumerating objects: 151, done.\u001b[K\n",
      "remote: Counting objects: 100% (151/151), done.\u001b[K\n",
      "remote: Compressing objects: 100% (116/116), done.\u001b[K\n",
      "remote: Total 151 (delta 36), reused 133 (delta 25), pack-reused 0\u001b[K\n",
      "Receiving objects: 100% (151/151), 333.89 KiB | 112.00 KiB/s, done.\n",
      "Resolving deltas: 100% (36/36), done.\n"
     ]
    }
   ],
   "source": [
    "!git clone https://github.com/zhuyiche/llava-phi.git"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cf827184-f334-4d86-ace1-fe9c92f84d66",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}