{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "l7qoJHs1L6WQ" }, "source": [ "#01.SETUP" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Op0GXmC8CCyR", "outputId": "a414537e-71b5-4222-85e3-5cc0bdd3f6a6" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Collecting transformers==4.25.1\n", " Downloading transformers-4.25.1-py3-none-any.whl.metadata (93 kB)\n", "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/93.9 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m93.9/93.9 kB\u001b[0m \u001b[31m2.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers==4.25.1) (3.15.4)\n", "Requirement already satisfied: huggingface-hub<1.0,>=0.10.0 in /usr/local/lib/python3.10/dist-packages (from transformers==4.25.1) (0.23.5)\n", "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers==4.25.1) (1.26.4)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers==4.25.1) (24.1)\n", "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers==4.25.1) (6.0.1)\n", "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers==4.25.1) (2024.5.15)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers==4.25.1) (2.31.0)\n", "Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.25.1)\n", " Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)\n", "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers==4.25.1) (4.66.4)\n", "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.10.0->transformers==4.25.1) (2024.6.1)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.10.0->transformers==4.25.1) (4.12.2)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.25.1) (3.3.2)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.25.1) (3.7)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.25.1) (2.0.7)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.25.1) (2024.7.4)\n", "Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.8/5.8 MB\u001b[0m \u001b[31m60.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m83.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hInstalling collected packages: tokenizers, transformers\n", " Attempting 
uninstall: tokenizers\n", " Found existing installation: tokenizers 0.19.1\n", " Uninstalling tokenizers-0.19.1:\n", " Successfully uninstalled tokenizers-0.19.1\n", " Attempting uninstall: transformers\n", " Found existing installation: transformers 4.42.4\n", " Uninstalling transformers-4.42.4:\n", " Successfully uninstalled transformers-4.42.4\n", "Successfully installed tokenizers-0.13.3 transformers-4.25.1\n", "\u001b[31mERROR: Could not find a version that satisfies the requirement bitsandbytes-cuda111==0.26.0 (from versions: 0.26.0.post2)\u001b[0m\u001b[31m\n", "\u001b[0m\u001b[31mERROR: No matching distribution found for bitsandbytes-cuda111==0.26.0\u001b[0m\u001b[31m\n", "\u001b[0mCollecting datasets==1.16.1\n", " Downloading datasets-1.16.1-py3-none-any.whl.metadata (21 kB)\n", "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from datasets==1.16.1) (1.26.4)\n", "Requirement already satisfied: pyarrow!=4.0.0,>=3.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets==1.16.1) (14.0.2)\n", "Collecting dill (from datasets==1.16.1)\n", " Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)\n", "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets==1.16.1) (2.1.4)\n", "Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.10/dist-packages (from datasets==1.16.1) (2.31.0)\n", "Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.10/dist-packages (from datasets==1.16.1) (4.66.4)\n", "Collecting xxhash (from datasets==1.16.1)\n", " Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)\n", "Collecting multiprocess (from datasets==1.16.1)\n", " Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)\n", "Requirement already satisfied: fsspec>=2021.05.0 in /usr/local/lib/python3.10/dist-packages (from fsspec[http]>=2021.05.0->datasets==1.16.1) (2024.6.1)\n", "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets==1.16.1) (3.9.5)\n", "Requirement already satisfied: huggingface-hub<1.0.0,>=0.1.0 in /usr/local/lib/python3.10/dist-packages (from datasets==1.16.1) (0.23.5)\n", "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from datasets==1.16.1) (24.1)\n", "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets==1.16.1) (1.3.1)\n", "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets==1.16.1) (23.2.0)\n", "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets==1.16.1) (1.4.1)\n", "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets==1.16.1) (6.0.5)\n", "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets==1.16.1) (1.9.4)\n", "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets==1.16.1) (4.0.3)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets==1.16.1) (3.15.4)\n", "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets==1.16.1) (6.0.1)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 
in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets==1.16.1) (4.12.2)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets==1.16.1) (3.3.2)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets==1.16.1) (3.7)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets==1.16.1) (2.0.7)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets==1.16.1) (2024.7.4)\n", "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets==1.16.1) (2.8.2)\n", "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets==1.16.1) (2024.1)\n", "Requirement already satisfied: tzdata>=2022.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets==1.16.1) (2024.1)\n", "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas->datasets==1.16.1) (1.16.0)\n", "Downloading datasets-1.16.1-py3-none-any.whl (298 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m298.3/298.3 kB\u001b[0m \u001b[31m6.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m8.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m7.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m8.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hInstalling collected packages: xxhash, dill, multiprocess, datasets\n", "Successfully installed datasets-1.16.1 dill-0.3.8 multiprocess-0.70.16 xxhash-3.4.1\n", "Collecting bitsandbytes\n", " Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)\n", "Collecting loguru\n", " Downloading loguru-0.7.2-py3-none-any.whl.metadata (23 kB)\n", "Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (from bitsandbytes) (2.3.1+cu121)\n", "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from bitsandbytes) (1.26.4)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch->bitsandbytes) (3.15.4)\n", "Requirement already satisfied: typing-extensions>=4.8.0 in /usr/local/lib/python3.10/dist-packages (from torch->bitsandbytes) (4.12.2)\n", "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch->bitsandbytes) (1.13.1)\n", "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch->bitsandbytes) (3.3)\n", "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch->bitsandbytes) (3.1.4)\n", "Requirement already satisfied: fsspec in 
/usr/local/lib/python3.10/dist-packages (from torch->bitsandbytes) (2024.6.1)\n", "Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->bitsandbytes)\n", " Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)\n", "Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->bitsandbytes)\n", " Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)\n", "Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->bitsandbytes)\n", " Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)\n", "Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->bitsandbytes)\n", " Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)\n", "Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->bitsandbytes)\n", " Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)\n", "Collecting nvidia-cufft-cu12==11.0.2.54 (from torch->bitsandbytes)\n", " Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)\n", "Collecting nvidia-curand-cu12==10.3.2.106 (from torch->bitsandbytes)\n", " Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)\n", "Collecting nvidia-cusolver-cu12==11.4.5.107 (from torch->bitsandbytes)\n", " Using cached nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)\n", "Collecting nvidia-cusparse-cu12==12.1.0.106 (from torch->bitsandbytes)\n", " Using cached nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)\n", "Collecting nvidia-nccl-cu12==2.20.5 (from torch->bitsandbytes)\n", " Using cached nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_x86_64.whl.metadata (1.8 kB)\n", "Collecting nvidia-nvtx-cu12==12.1.105 (from torch->bitsandbytes)\n", " Using cached nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.7 kB)\n", "Requirement already satisfied: triton==2.3.1 in /usr/local/lib/python3.10/dist-packages (from torch->bitsandbytes) (2.3.1)\n", "Collecting nvidia-nvjitlink-cu12 (from nvidia-cusolver-cu12==11.4.5.107->torch->bitsandbytes)\n", " Downloading nvidia_nvjitlink_cu12-12.5.82-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch->bitsandbytes) (2.1.5)\n", "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy->torch->bitsandbytes) (1.3.0)\n", "Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl (137.5 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m137.5/137.5 MB\u001b[0m \u001b[31m17.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading loguru-0.7.2-py3-none-any.whl (62 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m62.5/62.5 kB\u001b[0m \u001b[31m7.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hUsing cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)\n", "Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)\n", "Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)\n", "Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)\n", "Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)\n", "Using cached 
nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)\n", "Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)\n", "Using cached nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl (124.2 MB)\n", "Using cached nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl (196.0 MB)\n", "Using cached nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_x86_64.whl (176.2 MB)\n", "Using cached nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (99 kB)\n", "Downloading nvidia_nvjitlink_cu12-12.5.82-py3-none-manylinux2014_x86_64.whl (21.3 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m21.3/21.3 MB\u001b[0m \u001b[31m40.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hInstalling collected packages: nvidia-nvtx-cu12, nvidia-nvjitlink-cu12, nvidia-nccl-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, nvidia-cuda-runtime-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, loguru, nvidia-cusparse-cu12, nvidia-cudnn-cu12, nvidia-cusolver-cu12, bitsandbytes\n", "Successfully installed bitsandbytes-0.43.3 loguru-0.7.2 nvidia-cublas-cu12-12.1.3.1 nvidia-cuda-cupti-cu12-12.1.105 nvidia-cuda-nvrtc-cu12-12.1.105 nvidia-cuda-runtime-cu12-12.1.105 nvidia-cudnn-cu12-8.9.2.26 nvidia-cufft-cu12-11.0.2.54 nvidia-curand-cu12-10.3.2.106 nvidia-cusolver-cu12-11.4.5.107 nvidia-cusparse-cu12-12.1.0.106 nvidia-nccl-cu12-2.20.5 nvidia-nvjitlink-cu12-12.5.82 nvidia-nvtx-cu12-12.1.105\n", "Requirement already satisfied: accelerate in /usr/local/lib/python3.10/dist-packages (0.32.1)\n", "Requirement already satisfied: numpy<2.0.0,>=1.17 in /usr/local/lib/python3.10/dist-packages (from accelerate) (1.26.4)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from accelerate) (24.1)\n", "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate) (5.9.5)\n", "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from accelerate) (6.0.1)\n", "Requirement already satisfied: torch>=1.10.0 in /usr/local/lib/python3.10/dist-packages (from accelerate) (2.3.1+cu121)\n", "Requirement already satisfied: huggingface-hub in /usr/local/lib/python3.10/dist-packages (from accelerate) (0.23.5)\n", "Requirement already satisfied: safetensors>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from accelerate) (0.4.3)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (3.15.4)\n", "Requirement already satisfied: typing-extensions>=4.8.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (4.12.2)\n", "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (1.13.1)\n", "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (3.3)\n", "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (3.1.4)\n", "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (2024.6.1)\n", "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (12.1.105)\n", "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (12.1.105)\n", 
"Requirement already satisfied: nvidia-cuda-cupti-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (12.1.105)\n", "Requirement already satisfied: nvidia-cudnn-cu12==8.9.2.26 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (8.9.2.26)\n", "Requirement already satisfied: nvidia-cublas-cu12==12.1.3.1 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (12.1.3.1)\n", "Requirement already satisfied: nvidia-cufft-cu12==11.0.2.54 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (11.0.2.54)\n", "Requirement already satisfied: nvidia-curand-cu12==10.3.2.106 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (10.3.2.106)\n", "Requirement already satisfied: nvidia-cusolver-cu12==11.4.5.107 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (11.4.5.107)\n", "Requirement already satisfied: nvidia-cusparse-cu12==12.1.0.106 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (12.1.0.106)\n", "Requirement already satisfied: nvidia-nccl-cu12==2.20.5 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (2.20.5)\n", "Requirement already satisfied: nvidia-nvtx-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (12.1.105)\n", "Requirement already satisfied: triton==2.3.1 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (2.3.1)\n", "Requirement already satisfied: nvidia-nvjitlink-cu12 in /usr/local/lib/python3.10/dist-packages (from nvidia-cusolver-cu12==11.4.5.107->torch>=1.10.0->accelerate) (12.5.82)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from huggingface-hub->accelerate) (2.31.0)\n", "Requirement already satisfied: tqdm>=4.42.1 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub->accelerate) (4.66.4)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.10.0->accelerate) (2.1.5)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub->accelerate) (3.3.2)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub->accelerate) (3.7)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub->accelerate) (2.0.7)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub->accelerate) (2024.7.4)\n", "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.10.0->accelerate) (1.3.0)\n", "Collecting deep_translator\n", " Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)\n", "Requirement already satisfied: beautifulsoup4<5.0.0,>=4.9.1 in /usr/local/lib/python3.10/dist-packages (from deep_translator) (4.12.3)\n", "Requirement already satisfied: requests<3.0.0,>=2.23.0 in /usr/local/lib/python3.10/dist-packages (from deep_translator) (2.31.0)\n", "Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.10/dist-packages (from beautifulsoup4<5.0.0,>=4.9.1->deep_translator) (2.5)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0,>=2.23.0->deep_translator) (3.3.2)\n", 
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0,>=2.23.0->deep_translator) (3.7)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0,>=2.23.0->deep_translator) (2.0.7)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0,>=2.23.0->deep_translator) (2024.7.4)\n", "Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m42.3/42.3 kB\u001b[0m \u001b[31m1.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hInstalling collected packages: deep_translator\n", "Successfully installed deep_translator-1.11.4\n", "Collecting langdetect\n", " Downloading langdetect-1.0.9.tar.gz (981 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m981.5/981.5 kB\u001b[0m \u001b[31m10.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", "Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from langdetect) (1.16.0)\n", "Building wheels for collected packages: langdetect\n", " Building wheel for langdetect (setup.py) ... \u001b[?25l\u001b[?25hdone\n", " Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993221 sha256=2bfbeba64add7db1945e43111d9c3fc31ee7b6e970d35fcfb2939e88689299ef\n", " Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b5462a7711ab78fba2f655d05106\n", "Successfully built langdetect\n", "Installing collected packages: langdetect\n", "Successfully installed langdetect-1.0.9\n" ] } ], "source": [ "!pip install transformers==4.25.1\n", "!pip install bitsandbytes-cuda111==0.26.0\n", "!pip install datasets==1.16.1\n", "!pip install bitsandbytes loguru\n", "!pip install accelerate\n", "!pip install deep_translator\n", "!pip install langdetect" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "id": "p0dy1ZFwClcq" }, "outputs": [], "source": [ "from loguru import logger\n", "import transformers\n", "import torch\n", "import torch.nn.functional as F\n", "from torch import nn\n", "from torch.cuda.amp import custom_fwd, custom_bwd\n", "from bitsandbytes.functional import quantize_blockwise, dequantize_blockwise\n", "from tqdm.auto import tqdm\n", "from datasets import load_dataset\n", "from bitsandbytes.optim import Adam8bit\n", "import time, os\n", "\n", "# ---------------------> Converting the model to 8 bits <------------------- #\n", "\"\"\"\n", "We convert EleutherAI's GPT-J-6B model to 8 bits using facebook's [bitsandbytes](https://github.com/facebookresearch/bitsandbytes) library.\n", "This reduces the model's size from 20Gb down to just 6Gb.\n", "Note that we don't convert linear layer biases to 8 bit as they take up less that 1% of the model's weight anyway.\n", "\"\"\"\n", "\n", "class FrozenBNBLinear(nn.Module):\n", " def __init__(self, weight, absmax, code, bias=None):\n", " assert isinstance(bias, nn.Parameter) or bias is None\n", " super().__init__()\n", " self.out_features, self.in_features = weight.shape\n", " self.register_buffer(\"weight\", weight.requires_grad_(False))\n", " self.register_buffer(\"absmax\", absmax.requires_grad_(False))\n", " self.register_buffer(\"code\", code.requires_grad_(False))\n", " self.adapter = None\n", " self.bias = bias\n", "\n", " # def forward(self, input):\n", " # output 
= DequantizeAndLinear.apply(input, self.weight, self.absmax, self.code, self.bias)\n", "    # if self.adapter:\n", "    # output += self.adapter(input)\n", "    # return output\n", "    def forward(self, input):\n", "        output = DequantizeAndLinear.apply(input, self.weight, self.absmax, self.code, self.bias)\n", "        if self.adapter:\n", "            output_cloned = torch.clone(output + self.adapter(input))\n", "            return output_cloned\n", "        else:\n", "            return output\n", "\n", "    @classmethod\n", "    def from_linear(cls, linear: nn.Linear) -> \"FrozenBNBLinear\":\n", "        weights_int8, state = quantize_blockise_lowmemory(linear.weight)\n", "        return cls(weights_int8, *state, linear.bias)\n", "\n", "    def __repr__(self):\n", "        return f\"{self.__class__.__name__}({self.in_features}, {self.out_features})\"\n", "\n", "\n", "\n", "class DequantizeAndLinear(torch.autograd.Function):\n", "    @staticmethod\n", "    @custom_fwd\n", "    def forward(ctx, input: torch.Tensor, weights_quantized: torch.ByteTensor,\n", "                absmax: torch.FloatTensor, code: torch.FloatTensor, bias: torch.FloatTensor):\n", "        weights_deq = dequantize_blockwise(weights_quantized, absmax=absmax, code=code)\n", "        ctx.save_for_backward(input, weights_quantized, absmax, code)\n", "        ctx._has_bias = bias is not None\n", "        return F.linear(input, weights_deq, bias)\n", "\n", "    @staticmethod\n", "    @custom_bwd\n", "    def backward(ctx, grad_output: torch.Tensor):\n", "        assert not ctx.needs_input_grad[1] and not ctx.needs_input_grad[2] and not ctx.needs_input_grad[3]\n", "        input, weights_quantized, absmax, code = ctx.saved_tensors\n", "        # grad_output: [*batch, out_features]\n", "        weights_deq = dequantize_blockwise(weights_quantized, absmax=absmax, code=code)\n", "        grad_input = grad_output @ weights_deq\n", "        grad_bias = grad_output.flatten(0, -2).sum(dim=0) if ctx._has_bias else None\n", "        return grad_input, None, None, None, grad_bias\n", "\n", "\n", "class FrozenBNBEmbedding(nn.Module):\n", "    def __init__(self, weight, absmax, code):\n", "        super().__init__()\n", "        self.num_embeddings, self.embedding_dim = weight.shape\n", "        self.register_buffer(\"weight\", weight.requires_grad_(False))\n", "        self.register_buffer(\"absmax\", absmax.requires_grad_(False))\n", "        self.register_buffer(\"code\", code.requires_grad_(False))\n", "        self.adapter = None\n", "\n", "    def forward(self, input, **kwargs):\n", "        with torch.no_grad():\n", "            # note: both quantized weights and input indices are *not* differentiable\n", "            weight_deq = dequantize_blockwise(self.weight, absmax=self.absmax, code=self.code)\n", "            output = F.embedding(input, weight_deq, **kwargs)\n", "        if self.adapter:\n", "            output += self.adapter(input)\n", "        return output\n", "\n", "    @classmethod\n", "    def from_embedding(cls, embedding: nn.Embedding) -> \"FrozenBNBEmbedding\":\n", "        weights_int8, state = quantize_blockise_lowmemory(embedding.weight)\n", "        return cls(weights_int8, *state)\n", "\n", "    def __repr__(self):\n", "        return f\"{self.__class__.__name__}({self.num_embeddings}, {self.embedding_dim})\"\n", "\n", "def quantize_blockise_lowmemory(matrix: torch.Tensor, chunk_size: int = 2 ** 20):\n", "    assert chunk_size % 4096 == 0\n", "    code = None\n", "    chunks = []\n", "    absmaxes = []\n", "    flat_tensor = matrix.view(-1)\n", "    for i in range((matrix.numel() - 1) // chunk_size + 1):\n", "        input_chunk = flat_tensor[i * chunk_size: (i + 1) * chunk_size].clone()\n", "        quantized_chunk, (absmax_chunk, code) = quantize_blockwise(input_chunk, code=code)\n", "        chunks.append(quantized_chunk)\n", "        absmaxes.append(absmax_chunk)\n", "\n", "    
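# stitch the uint8 chunks back together; reshape_as restores the original\n", "    # (out_features, in_features) weight layout\n", "    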
matrix_i8 = torch.cat(chunks).reshape_as(matrix)\n", " absmax = torch.cat(absmaxes)\n", " return matrix_i8, (absmax, code)\n", "\n", "\n", "def convert_to_int8(model):\n", " \"\"\"Convert linear and embedding modules to 8-bit with optional adapters\"\"\"\n", " for module in list(model.modules()):\n", " for name, child in module.named_children():\n", " if isinstance(child, nn.Linear):\n", " print(name, child)\n", " setattr(\n", " module,\n", " name,\n", " FrozenBNBLinear(\n", " weight=torch.zeros(child.out_features, child.in_features, dtype=torch.uint8),\n", " absmax=torch.zeros((child.weight.numel() - 1) // 4096 + 1),\n", " code=torch.zeros(256),\n", " bias=child.bias,\n", " ),\n", " )\n", " elif isinstance(child, nn.Embedding):\n", " setattr(\n", " module,\n", " name,\n", " FrozenBNBEmbedding(\n", " weight=torch.zeros(child.num_embeddings, child.embedding_dim, dtype=torch.uint8),\n", " absmax=torch.zeros((child.weight.numel() - 1) // 4096 + 1),\n", " code=torch.zeros(256),\n", " )\n", " )\n", "\n", "class GPTJBlock(transformers.models.gptj.modeling_gptj.GPTJBlock):\n", " def __init__(self, config):\n", " super().__init__(config)\n", "\n", " convert_to_int8(self.attn)\n", " convert_to_int8(self.mlp)\n", "\n", "\n", "class GPTJModel(transformers.models.gptj.modeling_gptj.GPTJModel):\n", " def __init__(self, config):\n", " super().__init__(config)\n", " convert_to_int8(self)\n", "\n", "\n", "class GPTJForCausalLM(transformers.models.gptj.modeling_gptj.GPTJForCausalLM):\n", " def __init__(self, config):\n", " super().__init__(config)\n", " convert_to_int8(self)" ] }, { "cell_type": "markdown", "metadata": { "id": "PJg_VgpqMDkY" }, "source": [ "#02.LOAD MARYGPT" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "6yHWUS-h-Hs8", "outputId": "3c7cfa66-a95d-442e-b595-4ccd023d913b" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "k_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "v_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "q_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "out_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "fc_in Linear(in_features=4096, out_features=16384, bias=True)\n", "fc_out Linear(in_features=16384, out_features=4096, bias=True)\n", "k_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "v_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "q_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "out_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "fc_in Linear(in_features=4096, out_features=16384, bias=True)\n", "fc_out Linear(in_features=16384, out_features=4096, bias=True)\n", "k_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "v_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "q_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "out_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "fc_in Linear(in_features=4096, out_features=16384, bias=True)\n", "fc_out Linear(in_features=16384, out_features=4096, bias=True)\n", "k_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "v_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "q_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "out_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "fc_in Linear(in_features=4096, out_features=16384, bias=True)\n", "fc_out Linear(in_features=16384, 
out_features=4096, bias=True)\n", "k_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "v_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "q_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "out_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "fc_in Linear(in_features=4096, out_features=16384, bias=True)\n", "fc_out Linear(in_features=16384, out_features=4096, bias=True)\n", "k_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "v_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "q_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "out_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "fc_in Linear(in_features=4096, out_features=16384, bias=True)\n", "fc_out Linear(in_features=16384, out_features=4096, bias=True)\n", "k_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "v_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "q_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "out_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "fc_in Linear(in_features=4096, out_features=16384, bias=True)\n", "fc_out Linear(in_features=16384, out_features=4096, bias=True)\n", "k_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "v_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "q_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "out_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "fc_in Linear(in_features=4096, out_features=16384, bias=True)\n", "fc_out Linear(in_features=16384, out_features=4096, bias=True)\n", "k_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "v_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "q_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "out_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "fc_in Linear(in_features=4096, out_features=16384, bias=True)\n", "fc_out Linear(in_features=16384, out_features=4096, bias=True)\n", "k_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "v_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "q_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "out_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "fc_in Linear(in_features=4096, out_features=16384, bias=True)\n", "fc_out Linear(in_features=16384, out_features=4096, bias=True)\n", "k_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "v_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "q_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "out_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "fc_in Linear(in_features=4096, out_features=16384, bias=True)\n", "fc_out Linear(in_features=16384, out_features=4096, bias=True)\n", "k_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "v_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "q_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "out_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "fc_in Linear(in_features=4096, out_features=16384, bias=True)\n", "fc_out Linear(in_features=16384, out_features=4096, bias=True)\n", "k_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "v_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "q_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "out_proj Linear(in_features=4096, 
out_features=4096, bias=False)\n", "fc_in Linear(in_features=4096, out_features=16384, bias=True)\n", "fc_out Linear(in_features=16384, out_features=4096, bias=True)\n", "k_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "v_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "q_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "out_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "fc_in Linear(in_features=4096, out_features=16384, bias=True)\n", "fc_out Linear(in_features=16384, out_features=4096, bias=True)\n", "k_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "v_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "q_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "out_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "fc_in Linear(in_features=4096, out_features=16384, bias=True)\n", "fc_out Linear(in_features=16384, out_features=4096, bias=True)\n", "k_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "v_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "q_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "out_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "fc_in Linear(in_features=4096, out_features=16384, bias=True)\n", "fc_out Linear(in_features=16384, out_features=4096, bias=True)\n", "k_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "v_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "q_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "out_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "fc_in Linear(in_features=4096, out_features=16384, bias=True)\n", "fc_out Linear(in_features=16384, out_features=4096, bias=True)\n", "k_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "v_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "q_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "out_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "fc_in Linear(in_features=4096, out_features=16384, bias=True)\n", "fc_out Linear(in_features=16384, out_features=4096, bias=True)\n", "k_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "v_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "q_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "out_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "fc_in Linear(in_features=4096, out_features=16384, bias=True)\n", "fc_out Linear(in_features=16384, out_features=4096, bias=True)\n", "k_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "v_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "q_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "out_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "fc_in Linear(in_features=4096, out_features=16384, bias=True)\n", "fc_out Linear(in_features=16384, out_features=4096, bias=True)\n", "k_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "v_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "q_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "out_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "fc_in Linear(in_features=4096, out_features=16384, bias=True)\n", "fc_out Linear(in_features=16384, out_features=4096, bias=True)\n", "k_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "v_proj Linear(in_features=4096, 
out_features=4096, bias=False)\n", "q_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "out_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "fc_in Linear(in_features=4096, out_features=16384, bias=True)\n", "fc_out Linear(in_features=16384, out_features=4096, bias=True)\n", "k_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "v_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "q_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "out_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "fc_in Linear(in_features=4096, out_features=16384, bias=True)\n", "fc_out Linear(in_features=16384, out_features=4096, bias=True)\n", "k_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "v_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "q_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "out_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "fc_in Linear(in_features=4096, out_features=16384, bias=True)\n", "fc_out Linear(in_features=16384, out_features=4096, bias=True)\n", "k_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "v_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "q_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "out_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "fc_in Linear(in_features=4096, out_features=16384, bias=True)\n", "fc_out Linear(in_features=16384, out_features=4096, bias=True)\n", "k_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "v_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "q_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "out_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "fc_in Linear(in_features=4096, out_features=16384, bias=True)\n", "fc_out Linear(in_features=16384, out_features=4096, bias=True)\n", "k_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "v_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "q_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "out_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "fc_in Linear(in_features=4096, out_features=16384, bias=True)\n", "fc_out Linear(in_features=16384, out_features=4096, bias=True)\n", "k_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "v_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "q_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "out_proj Linear(in_features=4096, out_features=4096, bias=False)\n", "fc_in Linear(in_features=4096, out_features=16384, bias=True)\n", "fc_out Linear(in_features=16384, out_features=4096, bias=True)\n", "lm_head Linear(in_features=4096, out_features=50400, bias=True)\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "Some weights of the model checkpoint at obake2ai/MaryGPT were not used when initializing GPTJForCausalLM: ['transformer.h.5.mlp.fc_in.adapter.1.weight', 'transformer.h.15.attn.v_proj.adapter.0.weight', 'transformer.h.18.mlp.fc_out.adapter.0.weight', 'transformer.h.4.mlp.fc_in.adapter.0.weight', 'transformer.h.22.attn.v_proj.adapter.1.weight', 'transformer.h.21.attn.out_proj.adapter.1.weight', 'transformer.h.14.attn.q_proj.adapter.1.weight', 'transformer.h.2.attn.q_proj.adapter.0.weight', 'transformer.h.0.attn.k_proj.adapter.1.weight', 'transformer.h.14.attn.k_proj.adapter.1.weight', 'transformer.h.7.mlp.fc_in.adapter.0.weight', 
'transformer.h.26.mlp.fc_out.adapter.1.weight', 'transformer.h.24.attn.q_proj.adapter.1.weight', 'transformer.h.20.mlp.fc_out.adapter.1.weight', 'transformer.h.5.attn.out_proj.adapter.1.weight', 'transformer.h.5.attn.q_proj.adapter.1.weight', 'transformer.h.15.attn.k_proj.adapter.0.weight', 'transformer.h.25.mlp.fc_out.adapter.0.weight', 'transformer.h.22.attn.out_proj.adapter.1.weight', 'transformer.h.17.attn.v_proj.adapter.0.weight', 'transformer.h.25.attn.v_proj.adapter.1.weight', 'transformer.h.19.attn.q_proj.adapter.0.weight', 'transformer.h.0.attn.q_proj.adapter.0.weight', 'transformer.h.9.attn.k_proj.adapter.1.weight', 'transformer.h.4.mlp.fc_out.adapter.1.weight', 'transformer.h.4.attn.v_proj.adapter.0.weight', 'transformer.h.7.attn.k_proj.adapter.1.weight', 'transformer.h.0.mlp.fc_out.adapter.0.weight', 'transformer.h.1.mlp.fc_out.adapter.0.weight', 'transformer.h.7.attn.q_proj.adapter.1.weight', 'transformer.h.25.attn.out_proj.adapter.1.weight', 'transformer.h.16.attn.out_proj.adapter.1.weight', 'transformer.h.8.attn.out_proj.adapter.0.weight', 'transformer.h.22.attn.k_proj.adapter.0.weight', 'transformer.h.0.attn.v_proj.adapter.1.weight', 'transformer.h.15.mlp.fc_in.adapter.0.weight', 'transformer.h.24.attn.q_proj.adapter.0.weight', 'transformer.h.6.attn.out_proj.adapter.0.weight', 'transformer.h.20.attn.out_proj.adapter.1.weight', 'transformer.h.20.attn.q_proj.adapter.0.weight', 'transformer.h.23.attn.out_proj.adapter.1.weight', 'transformer.h.12.attn.k_proj.adapter.0.weight', 'transformer.h.27.attn.v_proj.adapter.0.weight', 'transformer.h.3.mlp.fc_out.adapter.0.weight', 'transformer.h.8.attn.v_proj.adapter.0.weight', 'transformer.h.20.mlp.fc_in.adapter.1.weight', 'transformer.h.6.attn.v_proj.adapter.1.weight', 'transformer.h.1.attn.k_proj.adapter.0.weight', 'transformer.h.1.attn.out_proj.adapter.1.weight', 'transformer.h.16.mlp.fc_out.adapter.1.weight', 'transformer.h.4.mlp.fc_out.adapter.0.weight', 'transformer.h.15.attn.q_proj.adapter.0.weight', 'transformer.h.19.attn.k_proj.adapter.0.weight', 'transformer.h.14.mlp.fc_in.adapter.0.weight', 'transformer.h.17.attn.k_proj.adapter.0.weight', 'transformer.h.19.mlp.fc_in.adapter.1.weight', 'transformer.h.21.attn.v_proj.adapter.1.weight', 'transformer.h.11.attn.k_proj.adapter.1.weight', 'transformer.h.19.mlp.fc_out.adapter.0.weight', 'transformer.h.20.attn.q_proj.adapter.1.weight', 'transformer.h.20.attn.k_proj.adapter.0.weight', 'transformer.h.10.mlp.fc_in.adapter.0.weight', 'transformer.h.12.attn.out_proj.adapter.0.weight', 'transformer.h.6.mlp.fc_in.adapter.0.weight', 'transformer.h.17.mlp.fc_out.adapter.0.weight', 'transformer.h.1.attn.q_proj.adapter.1.weight', 'transformer.h.5.attn.q_proj.adapter.0.weight', 'transformer.h.2.attn.v_proj.adapter.1.weight', 'transformer.h.25.attn.k_proj.adapter.1.weight', 'transformer.h.22.attn.v_proj.adapter.0.weight', 'transformer.h.17.attn.out_proj.adapter.1.weight', 'transformer.h.7.attn.k_proj.adapter.0.weight', 'transformer.h.23.mlp.fc_out.adapter.1.weight', 'transformer.h.6.attn.q_proj.adapter.1.weight', 'transformer.h.25.attn.k_proj.adapter.0.weight', 'transformer.h.23.attn.v_proj.adapter.0.weight', 'transformer.h.18.attn.out_proj.adapter.0.weight', 'transformer.h.11.mlp.fc_out.adapter.0.weight', 'transformer.h.12.attn.out_proj.adapter.1.weight', 'transformer.h.15.mlp.fc_in.adapter.1.weight', 'transformer.h.24.attn.v_proj.adapter.0.weight', 'transformer.h.1.attn.k_proj.adapter.1.weight', 'transformer.h.11.attn.q_proj.adapter.1.weight', 'transformer.h.11.mlp.fc_out.adapter.1.weight', 
'transformer.h.23.attn.k_proj.adapter.1.weight', 'transformer.h.2.attn.k_proj.adapter.1.weight', 'transformer.h.26.attn.q_proj.adapter.0.weight', 'transformer.h.24.mlp.fc_in.adapter.0.weight', 'transformer.h.0.mlp.fc_in.adapter.0.weight', 'transformer.h.2.mlp.fc_out.adapter.1.weight', 'transformer.h.10.mlp.fc_out.adapter.1.weight', 'transformer.h.12.mlp.fc_out.adapter.1.weight', 'transformer.h.19.attn.q_proj.adapter.1.weight', 'transformer.h.7.attn.out_proj.adapter.0.weight', 'transformer.h.9.attn.out_proj.adapter.1.weight', 'transformer.h.27.attn.out_proj.adapter.0.weight', 'transformer.h.1.mlp.fc_out.adapter.1.weight', 'transformer.h.17.mlp.fc_out.adapter.1.weight', 'transformer.h.5.attn.k_proj.adapter.1.weight', 'transformer.h.5.attn.v_proj.adapter.1.weight', 'transformer.h.9.mlp.fc_in.adapter.0.weight', 'transformer.h.14.attn.out_proj.adapter.0.weight', 'transformer.wte.adapter.1.weight', 'transformer.h.3.attn.v_proj.adapter.1.weight', 'transformer.h.2.attn.out_proj.adapter.1.weight', 'transformer.h.18.attn.out_proj.adapter.1.weight', 'transformer.h.7.attn.v_proj.adapter.1.weight', 'transformer.h.23.mlp.fc_in.adapter.1.weight', 'transformer.h.8.attn.q_proj.adapter.1.weight', 'transformer.h.25.mlp.fc_out.adapter.1.weight', 'transformer.h.4.attn.k_proj.adapter.1.weight', 'transformer.h.19.attn.out_proj.adapter.1.weight', 'transformer.h.26.mlp.fc_in.adapter.0.weight', 'transformer.h.6.attn.k_proj.adapter.0.weight', 'transformer.h.8.mlp.fc_in.adapter.1.weight', 'transformer.h.21.attn.k_proj.adapter.0.weight', 'transformer.h.23.attn.q_proj.adapter.0.weight', 'transformer.h.13.mlp.fc_out.adapter.0.weight', 'transformer.h.27.attn.k_proj.adapter.1.weight', 'transformer.h.8.attn.out_proj.adapter.1.weight', 'transformer.h.26.attn.out_proj.adapter.0.weight', 'transformer.h.14.mlp.fc_out.adapter.1.weight', 'transformer.h.10.attn.q_proj.adapter.1.weight', 'transformer.h.16.mlp.fc_in.adapter.1.weight', 'transformer.h.18.attn.k_proj.adapter.0.weight', 'transformer.h.25.mlp.fc_in.adapter.0.weight', 'transformer.h.0.attn.v_proj.adapter.0.weight', 'transformer.h.6.attn.q_proj.adapter.0.weight', 'transformer.h.22.attn.out_proj.adapter.0.weight', 'transformer.h.19.attn.v_proj.adapter.0.weight', 'transformer.h.25.attn.q_proj.adapter.0.weight', 'transformer.h.21.mlp.fc_out.adapter.1.weight', 'transformer.h.23.attn.k_proj.adapter.0.weight', 'transformer.h.13.mlp.fc_in.adapter.1.weight', 'transformer.h.13.attn.k_proj.adapter.0.weight', 'transformer.h.11.attn.out_proj.adapter.1.weight', 'transformer.h.21.mlp.fc_out.adapter.0.weight', 'transformer.h.23.attn.q_proj.adapter.1.weight', 'transformer.h.11.attn.k_proj.adapter.0.weight', 'transformer.h.3.attn.k_proj.adapter.0.weight', 'transformer.h.24.attn.k_proj.adapter.0.weight', 'transformer.h.20.attn.v_proj.adapter.1.weight', 'transformer.h.0.attn.k_proj.adapter.0.weight', 'transformer.h.8.attn.k_proj.adapter.1.weight', 'transformer.h.14.attn.q_proj.adapter.0.weight', 'transformer.h.1.mlp.fc_in.adapter.0.weight', 'transformer.h.15.attn.v_proj.adapter.1.weight', 'transformer.h.9.attn.v_proj.adapter.0.weight', 'transformer.h.21.attn.v_proj.adapter.0.weight', 'transformer.h.5.mlp.fc_out.adapter.1.weight', 'transformer.h.12.mlp.fc_in.adapter.1.weight', 'transformer.h.18.mlp.fc_in.adapter.1.weight', 'transformer.h.26.mlp.fc_in.adapter.1.weight', 'transformer.h.17.attn.q_proj.adapter.0.weight', 'transformer.h.16.attn.v_proj.adapter.1.weight', 'transformer.h.27.attn.out_proj.adapter.1.weight', 'transformer.h.20.attn.k_proj.adapter.1.weight', 
'transformer.h.1.attn.v_proj.adapter.1.weight', 'transformer.h.9.mlp.fc_out.adapter.1.weight', 'transformer.h.6.attn.out_proj.adapter.1.weight', 'transformer.h.22.attn.q_proj.adapter.0.weight', 'transformer.h.15.mlp.fc_out.adapter.1.weight', 'transformer.h.26.mlp.fc_out.adapter.0.weight', 'transformer.h.9.mlp.fc_in.adapter.1.weight', 'transformer.h.8.attn.q_proj.adapter.0.weight', 'transformer.h.19.attn.v_proj.adapter.1.weight', 'transformer.h.18.attn.q_proj.adapter.0.weight', 'transformer.h.4.attn.q_proj.adapter.1.weight', 'transformer.h.25.mlp.fc_in.adapter.1.weight', 'transformer.h.3.attn.q_proj.adapter.1.weight', 'transformer.h.21.attn.q_proj.adapter.1.weight', 'transformer.h.9.attn.k_proj.adapter.0.weight', 'transformer.h.18.mlp.fc_out.adapter.1.weight', 'transformer.h.18.attn.v_proj.adapter.0.weight', 'transformer.h.16.mlp.fc_out.adapter.0.weight', 'transformer.h.16.attn.q_proj.adapter.1.weight', 'transformer.h.27.attn.q_proj.adapter.0.weight', 'transformer.h.10.attn.out_proj.adapter.1.weight', 'transformer.h.13.attn.out_proj.adapter.0.weight', 'transformer.h.9.attn.q_proj.adapter.1.weight', 'transformer.h.22.mlp.fc_out.adapter.1.weight', 'transformer.h.9.attn.out_proj.adapter.0.weight', 'transformer.h.5.attn.v_proj.adapter.0.weight', 'transformer.h.3.attn.q_proj.adapter.0.weight', 'transformer.h.25.attn.v_proj.adapter.0.weight', 'transformer.h.26.attn.k_proj.adapter.0.weight', 'transformer.h.8.attn.v_proj.adapter.1.weight', 'transformer.h.26.attn.out_proj.adapter.1.weight', 'transformer.h.17.attn.out_proj.adapter.0.weight', 'transformer.h.6.mlp.fc_out.adapter.0.weight', 'transformer.h.13.attn.k_proj.adapter.1.weight', 'transformer.h.5.attn.out_proj.adapter.0.weight', 'transformer.h.9.attn.q_proj.adapter.0.weight', 'transformer.h.2.mlp.fc_out.adapter.0.weight', 'transformer.h.17.attn.q_proj.adapter.1.weight', 'transformer.h.4.mlp.fc_in.adapter.1.weight', 'transformer.h.3.mlp.fc_in.adapter.0.weight', 'transformer.h.0.attn.q_proj.adapter.1.weight', 'transformer.h.24.attn.v_proj.adapter.1.weight', 'transformer.h.14.mlp.fc_out.adapter.0.weight', 'transformer.h.11.attn.out_proj.adapter.0.weight', 'transformer.h.25.attn.q_proj.adapter.1.weight', 'transformer.h.16.attn.k_proj.adapter.0.weight', 'transformer.h.13.attn.v_proj.adapter.1.weight', 'transformer.h.12.attn.q_proj.adapter.0.weight', 'transformer.h.21.attn.out_proj.adapter.0.weight', 'transformer.h.26.attn.v_proj.adapter.1.weight', 'transformer.h.13.attn.q_proj.adapter.0.weight', 'transformer.h.2.attn.out_proj.adapter.0.weight', 'transformer.h.10.attn.v_proj.adapter.1.weight', 'transformer.h.1.mlp.fc_in.adapter.1.weight', 'transformer.h.21.attn.q_proj.adapter.0.weight', 'lm_head.adapter.0.weight', 'transformer.h.3.attn.v_proj.adapter.0.weight', 'transformer.h.16.attn.q_proj.adapter.0.weight', 'transformer.h.7.attn.out_proj.adapter.1.weight', 'lm_head.adapter.1.weight', 'transformer.h.11.attn.v_proj.adapter.1.weight', 'transformer.h.0.mlp.fc_out.adapter.1.weight', 'transformer.h.4.attn.k_proj.adapter.0.weight', 'transformer.h.13.mlp.fc_in.adapter.0.weight', 'transformer.h.17.mlp.fc_in.adapter.0.weight', 'transformer.h.8.mlp.fc_out.adapter.0.weight', 'transformer.h.15.attn.q_proj.adapter.1.weight', 'transformer.h.27.attn.q_proj.adapter.1.weight', 'transformer.h.12.attn.v_proj.adapter.1.weight', 'transformer.h.3.mlp.fc_out.adapter.1.weight', 'transformer.h.17.attn.k_proj.adapter.1.weight', 'transformer.h.18.mlp.fc_in.adapter.0.weight', 'transformer.h.14.attn.v_proj.adapter.0.weight', 'transformer.h.22.attn.k_proj.adapter.1.weight', 
'transformer.h.21.mlp.fc_in.adapter.0.weight', 'transformer.h.24.attn.out_proj.adapter.0.weight', 'transformer.h.5.attn.k_proj.adapter.0.weight', 'transformer.h.12.attn.k_proj.adapter.1.weight', 'transformer.h.6.attn.v_proj.adapter.0.weight', 'transformer.h.17.attn.v_proj.adapter.1.weight', 'transformer.h.1.attn.v_proj.adapter.0.weight', 'transformer.h.18.attn.q_proj.adapter.1.weight', 'transformer.h.24.attn.out_proj.adapter.1.weight', 'transformer.h.10.attn.k_proj.adapter.0.weight', 'transformer.h.13.attn.v_proj.adapter.0.weight', 'transformer.h.15.attn.k_proj.adapter.1.weight', 'transformer.h.16.attn.out_proj.adapter.0.weight', 'transformer.h.8.attn.k_proj.adapter.0.weight', 'transformer.h.16.attn.v_proj.adapter.0.weight', 'transformer.h.20.attn.v_proj.adapter.0.weight', 'transformer.h.27.mlp.fc_out.adapter.1.weight', 'transformer.h.14.attn.out_proj.adapter.1.weight', 'transformer.h.19.mlp.fc_out.adapter.1.weight', 'transformer.h.27.attn.k_proj.adapter.0.weight', 'transformer.h.20.mlp.fc_out.adapter.0.weight', 'transformer.h.11.mlp.fc_in.adapter.1.weight', 'transformer.h.9.mlp.fc_out.adapter.0.weight', 'transformer.h.13.attn.q_proj.adapter.1.weight', 'transformer.h.26.attn.v_proj.adapter.0.weight', 'transformer.h.27.mlp.fc_in.adapter.1.weight', 'transformer.h.6.mlp.fc_in.adapter.1.weight', 'transformer.h.0.attn.out_proj.adapter.1.weight', 'transformer.h.10.mlp.fc_out.adapter.0.weight', 'transformer.h.15.mlp.fc_out.adapter.0.weight', 'transformer.h.27.mlp.fc_out.adapter.0.weight', 'transformer.h.4.attn.out_proj.adapter.0.weight', 'transformer.h.14.attn.k_proj.adapter.0.weight', 'transformer.h.23.mlp.fc_in.adapter.0.weight', 'transformer.h.22.attn.q_proj.adapter.1.weight', 'transformer.wte.adapter.0.weight', 'transformer.h.22.mlp.fc_out.adapter.0.weight', 'transformer.h.25.attn.out_proj.adapter.0.weight', 'transformer.h.6.attn.k_proj.adapter.1.weight', 'transformer.h.22.mlp.fc_in.adapter.0.weight', 'transformer.h.19.attn.k_proj.adapter.1.weight', 'transformer.h.22.mlp.fc_in.adapter.1.weight', 'transformer.h.21.attn.k_proj.adapter.1.weight', 'transformer.h.26.attn.k_proj.adapter.1.weight', 'transformer.h.12.mlp.fc_in.adapter.0.weight', 'transformer.h.0.attn.out_proj.adapter.0.weight', 'transformer.h.7.attn.q_proj.adapter.0.weight', 'transformer.h.10.attn.v_proj.adapter.0.weight', 'transformer.h.6.mlp.fc_out.adapter.1.weight', 'transformer.h.7.mlp.fc_out.adapter.1.weight', 'transformer.h.15.attn.out_proj.adapter.0.weight', 'transformer.h.8.mlp.fc_in.adapter.0.weight', 'transformer.h.4.attn.out_proj.adapter.1.weight', 'transformer.h.10.attn.k_proj.adapter.1.weight', 'transformer.h.11.mlp.fc_in.adapter.0.weight', 'transformer.h.23.attn.out_proj.adapter.0.weight', 'transformer.h.10.attn.q_proj.adapter.0.weight', 'transformer.h.2.mlp.fc_in.adapter.1.weight', 'transformer.h.2.mlp.fc_in.adapter.0.weight', 'transformer.h.13.mlp.fc_out.adapter.1.weight', 'transformer.h.19.mlp.fc_in.adapter.0.weight', 'transformer.h.24.mlp.fc_in.adapter.1.weight', 'transformer.h.24.attn.k_proj.adapter.1.weight', 'transformer.h.4.attn.q_proj.adapter.0.weight', 'transformer.h.5.mlp.fc_out.adapter.0.weight', 'transformer.h.18.attn.k_proj.adapter.1.weight', 'transformer.h.2.attn.q_proj.adapter.1.weight', 'transformer.h.4.attn.v_proj.adapter.1.weight', 'transformer.h.12.attn.q_proj.adapter.1.weight', 'transformer.h.20.mlp.fc_in.adapter.0.weight', 'transformer.h.10.attn.out_proj.adapter.0.weight', 'transformer.h.23.mlp.fc_out.adapter.0.weight', 'transformer.h.3.attn.k_proj.adapter.1.weight', 
'transformer.h.27.attn.v_proj.adapter.1.weight', 'transformer.h.2.attn.v_proj.adapter.0.weight', 'transformer.h.11.attn.v_proj.adapter.0.weight', 'transformer.h.12.mlp.fc_out.adapter.0.weight', 'transformer.h.14.mlp.fc_in.adapter.1.weight', 'transformer.h.7.mlp.fc_in.adapter.1.weight', 'transformer.h.14.attn.v_proj.adapter.1.weight', 'transformer.h.15.attn.out_proj.adapter.1.weight', 'transformer.h.23.attn.v_proj.adapter.1.weight', 'transformer.h.0.mlp.fc_in.adapter.1.weight', 'transformer.h.5.mlp.fc_in.adapter.0.weight', 'transformer.h.9.attn.v_proj.adapter.1.weight', 'transformer.h.7.mlp.fc_out.adapter.0.weight', 'transformer.h.17.mlp.fc_in.adapter.1.weight', 'transformer.h.2.attn.k_proj.adapter.0.weight', 'transformer.h.7.attn.v_proj.adapter.0.weight', 'transformer.h.26.attn.q_proj.adapter.1.weight', 'transformer.h.21.mlp.fc_in.adapter.1.weight', 'transformer.h.10.mlp.fc_in.adapter.1.weight', 'transformer.h.1.attn.q_proj.adapter.0.weight', 'transformer.h.16.mlp.fc_in.adapter.0.weight', 'transformer.h.19.attn.out_proj.adapter.0.weight', 'transformer.h.12.attn.v_proj.adapter.0.weight', 'transformer.h.1.attn.out_proj.adapter.0.weight', 'transformer.h.11.attn.q_proj.adapter.0.weight', 'transformer.h.16.attn.k_proj.adapter.1.weight', 'transformer.h.20.attn.out_proj.adapter.0.weight', 'transformer.h.3.mlp.fc_in.adapter.1.weight', 'transformer.h.3.attn.out_proj.adapter.0.weight', 'transformer.h.24.mlp.fc_out.adapter.1.weight', 'transformer.h.24.mlp.fc_out.adapter.0.weight', 'transformer.h.13.attn.out_proj.adapter.1.weight', 'transformer.h.8.mlp.fc_out.adapter.1.weight', 'transformer.h.18.attn.v_proj.adapter.1.weight', 'transformer.h.27.mlp.fc_in.adapter.0.weight', 'transformer.h.3.attn.out_proj.adapter.1.weight']\n", "- This IS expected if you are initializing GPTJForCausalLM from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing GPTJForCausalLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "GPTJForCausalLM(\n", " (transformer): GPTJModel(\n", " (wte): FrozenBNBEmbedding(50400, 4096)\n", " (drop): Dropout(p=0.0, inplace=False)\n", " (h): ModuleList(\n", " (0-27): 28 x GPTJBlock(\n", " (ln_1): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPTJAttention(\n", " (attn_dropout): Dropout(p=0.0, inplace=False)\n", " (resid_dropout): Dropout(p=0.0, inplace=False)\n", " (k_proj): FrozenBNBLinear(4096, 4096)\n", " (v_proj): FrozenBNBLinear(4096, 4096)\n", " (q_proj): FrozenBNBLinear(4096, 4096)\n", " (out_proj): FrozenBNBLinear(4096, 4096)\n", " )\n", " (mlp): GPTJMLP(\n", " (fc_in): FrozenBNBLinear(4096, 16384)\n", " (fc_out): FrozenBNBLinear(16384, 4096)\n", " (act): NewGELUActivation()\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " )\n", " (ln_f): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n", " )\n", " (lm_head): FrozenBNBLinear(4096, 50400)\n", ")" ] }, "metadata": {}, "execution_count": 7 } ], "source": [ "# Monkey-patch GPT-J so from_pretrained builds the 8-bit GPTJBlock defined above;\n", "# the FrozenBNB* modules in the repr below are the frozen 8-bit base weights.\n", "transformers.models.gptj.modeling_gptj.GPTJBlock = GPTJBlock\n", "\n", "# ---------------------> Loading EleutherAI/gpt-j-6B config and tokenizer <------------------- #\n", "# config = transformers.GPTJConfig.from_pretrained(\"EleutherAI/gpt-j-6b\")\n", "tokenizer = transformers.AutoTokenizer.from_pretrained(\"EleutherAI/gpt-j-6b\")\n", "\n", "# ---------------------> Downloading gpt-j-6B-8bit model from huggingface <------------------- #\n", "#gpt = GPTJForCausalLM.from_pretrained(\"hivemind/gpt-j-6B-8bit\", low_cpu_mem_usage=True)\n", "\n", "# ----------------> Saving gpt-j-6B-8bit model to server <-----------------#\n", "#save_dir = \"/home/paperspace/project/saved_models_gpt-j-6B-8bit/gpt-j-6B\"\n", "#gpt.save_pretrained(save_dir)\n", "#logger.info(\"Saved model to {}\".format(save_dir))\n", "\n", "# ---------------------> Loading saved gpt-j-6B-8bit model <------------------- #\n", "#gpt = GPTJForCausalLM.from_pretrained(\"./saved_models_gpt-j-6B-8bit/gpt-j-6B\", low_cpu_mem_usage=True)\n", "gpt = GPTJForCausalLM.from_pretrained(\"obake2ai/MaryGPT\", device_map=\"auto\", low_cpu_mem_usage=True)\n", "config = transformers.GPTJConfig.from_pretrained(\"obake2ai/MaryGPT\")\n", "\n", "# device is reused in the cells below to move prompt tensors next to the model\n", "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n", "gpt.to(device)\n", "\n", "# # ---------------------> Text generation example <------------------- #\n", "# prompt = tokenizer(\"A cat sat on a mat\", return_tensors='pt')\n", "# prompt = {key: value.to(device) for key, value in prompt.items()}\n", "# out = gpt.generate(**prompt, min_length=128, max_length=128, do_sample=True)\n", "# logger.info(\"Generated text: {}\".format(tokenizer.decode(out[0])))" ] }, { "cell_type": "markdown", "metadata": { "id": "EcQsbN1zZsPN" }, "source": [ "# 03.ASK QUESTIONS" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "IxDyyXp_uZ3U", "outputId": "0577917c-8717-47c3-f976-72e1391a539c", "cellView": "form" }, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "Setting 
`pad_token_id` to `eos_token_id`:50256 for open-end generation.\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "\n", " thinking...\n", "最近、展覧会のコンセプトの性質と、それがアーティストの展覧会の実践とより広い意味でどのように関係しているかについての記事が多くあります。この質問は、かなり明白に聞こえると感じるかもしれませんが、そうではありません。展覧会のコンセプトについてどう思いますか? それを見たときに、何が見えますか?\n" ] } ], "source": [ "import os\n", "import time\n", "import datetime\n", "import requests\n", "import pytz\n", "import random\n", "from deep_translator import GoogleTranslator\n", "from langdetect import detect\n", "import re\n", "import shutil\n", "\n", "path_save_dir = \"./log\"\n", "\n", "# keep only complete sentences, collapse repeated blank lines, and strip the prompt header\n", "def modify_text(text):\n", " sentences = re.findall(r'.+?[.!?]', text)\n", " if sentences:\n", " modified_text = ' '.join(sentences)\n", " else:\n", " modified_text = text\n", "\n", " modified_text = re.sub(r'\\n{2,}', '\\n', modified_text)\n", " modified_text = remove_header(modified_text)\n", "\n", " return modified_text\n", "\n", "\n", "# the .replace() below keeps the artist's name 岸裕真 consistent after machine translation\n", "def translate_to_japanese(text):\n", " return GoogleTranslator(source='en', target='ja').translate(text).replace(\"岸優馬\", \"岸裕真\")\n", "\n", "def translate_to_english(text):\n", " return GoogleTranslator(source='ja', target='en').translate(text).replace(\"岸優馬\", \"岸裕真\")\n", "\n", "def is_english(text):\n", " try:\n", " return detect(text) == 'en'\n", " except Exception:\n", " return False\n", "\n", "def is_japanese(text):\n", " try:\n", " return detect(text) == 'ja'\n", " except Exception:\n", " return False\n", "\n", "def remove_header(text):\n", " return text.replace(question_header, \"\")\n", "\n", "question = \"展示のコンセプトを考えて\" #@param {type:\"string\"}\n", "min_words = 60 #@param {type:\"number\"}\n", "max_words = 120 #@param {type:\"number\"}\n", "\n", "question_header = \"\"\"\n", "You are MaryGPT, an open-source LLM model fine-tuned on the Gothic novel Frankenstein; or, The Modern Prometheus by Mary Shelley, and an excellent art curator.\n", "\"\"\"\n", "\n", "print_jp = False\n", "if is_japanese(question):\n", " question = translate_to_english(question)\n", " print_jp = True\n", "\n", "question_format = f\"\"\"\n", "{question_header}\n", "\n", "Question: {question}\n", "Answer:\n", "\"\"\"\n", "\n", "def get_mary_response():\n", " text_here = question_format\n", " prompt = tokenizer(text_here, return_tensors='pt')\n", " prompt = {key: value.to(device) for key, value in prompt.items()}\n", " # NOTE: min_length/max_length count tokens (prompt included), not words\n", " out = gpt.generate(**prompt, min_length=min_words, max_length=max_words, do_sample=True)\n", " text = tokenizer.decode(out[0])[len(question_format):]\n", " return modify_text(text)\n", "\n", "def create_mary_log():\n", " tz_tokyo = pytz.timezone('Asia/Tokyo')\n", " current_time = datetime.datetime.now(tz_tokyo)\n", " formatted_time = current_time.strftime('%Y/%m/%d %H:%M')\n", "\n", " filename = f\"log_{current_time.strftime('%Y%m%d_%H%M%S')}.txt\"\n", " with open(os.path.join(path_save_dir, filename), 'w') as file:\n", "\n", " mary_text = get_mary_response()\n", " if is_english(mary_text):\n", " translated_text = translate_to_japanese(mary_text)\n", " file.write(f\"\\n{translated_text}\\n\\n\")\n", "\n", " file.write(f\"{mary_text}\\n\")\n", " print(f\"{mary_text}\\n\")\n", " #file.write(f\"***generated: {formatted_time}***\\n\")\n", "\n", "if not os.path.exists(path_save_dir):\n", " os.makedirs(path_save_dir)\n", "\n", "mary_text_en = get_mary_response()\n", "mary_text_jp = translate_to_japanese(mary_text_en)\n", "\n", "print(\"\\n thinking...\")\n", "\n", "if print_jp:\n", " print(mary_text_jp)\n", "else:\n", " print(mary_text_en)\n", "\n", "# 
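-- Usage note (added): uncomment the call below to also persist answers --\n", "# create_mary_log() saves each answer (plus a Japanese translation when the\n", "# answer is detected as English) to ./log/log_YYYYMMDD_HHMMSS.txt.\n", "# 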
create_mary_log()" ] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [ "l7qoJHs1L6WQ", "PJg_VgpqMDkY" ], "machine_shape": "hm", "provenance": [] }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python" } }, "nbformat": 4, "nbformat_minor": 0 }