{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# How to FineTune Llama 3 with  SFTTrainer and  Unsloth\n",
    "Hello everyone, today we are going to show how we can Fine Tune Llama 3 with SFTTrainer and Unsloth\n",
    "First we are going to perform a simmple Fine Tunning by using SFTTrainer\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 1 - Installation of Pytorch\n",
    "The first step is install pythorch v 2.2.1 with Cuda 12.1 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: pip in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (23.3)\n",
      "Collecting pip\n",
      "  Downloading pip-24.0-py3-none-any.whl.metadata (3.6 kB)\n",
      "Downloading pip-24.0-py3-none-any.whl (2.1 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.1/2.1 MB\u001b[0m \u001b[31m38.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
      "\u001b[?25hInstalling collected packages: pip\n",
      "  Attempting uninstall: pip\n",
      "    Found existing installation: pip 23.3\n",
      "    Uninstalling pip-23.3:\n",
      "      Successfully uninstalled pip-23.3\n",
      "Successfully installed pip-24.0\n",
      "Looking in indexes: https://download.pytorch.org/whl/cu121\n",
      "Collecting torch==2.2.1\n",
      "  Downloading https://download.pytorch.org/whl/cu121/torch-2.2.1%2Bcu121-cp310-cp310-linux_x86_64.whl (757.3 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m757.3/757.3 MB\u001b[0m \u001b[31m9.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: torchvision in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (0.15.2)\n",
      "Collecting torchaudio\n",
      "  Downloading https://download.pytorch.org/whl/cu121/torchaudio-2.3.0%2Bcu121-cp310-cp310-linux_x86_64.whl (3.4 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.4/3.4 MB\u001b[0m \u001b[31m86.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m\n",
      "\u001b[?25hCollecting xformers\n",
      "  Downloading https://download.pytorch.org/whl/cu121/xformers-0.0.26.post1-cp310-cp310-manylinux2014_x86_64.whl (222.7 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m222.7/222.7 MB\u001b[0m \u001b[31m33.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: filelock in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from torch==2.2.1) (3.9.0)\n",
      "Collecting typing-extensions>=4.8.0 (from torch==2.2.1)\n",
      "  Downloading https://download.pytorch.org/whl/typing_extensions-4.9.0-py3-none-any.whl (32 kB)\n",
      "Requirement already satisfied: sympy in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from torch==2.2.1) (1.12)\n",
      "Requirement already satisfied: networkx in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from torch==2.2.1) (2.8.4)\n",
      "Requirement already satisfied: jinja2 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from torch==2.2.1) (3.1.3)\n",
      "Requirement already satisfied: fsspec in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from torch==2.2.1) (2022.11.0)\n",
      "Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.2.1)\n",
      "  Downloading https://download.pytorch.org/whl/cu121/nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m23.7/23.7 MB\u001b[0m \u001b[31m97.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.2.1)\n",
      "  Downloading https://download.pytorch.org/whl/cu121/nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m823.6/823.6 kB\u001b[0m \u001b[31m50.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.2.1)\n",
      "  Downloading https://download.pytorch.org/whl/cu121/nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.1/14.1 MB\u001b[0m \u001b[31m120.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hCollecting nvidia-cudnn-cu12==8.9.2.26 (from torch==2.2.1)\n",
      "  Downloading https://download.pytorch.org/whl/cu121/nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m731.7/731.7 MB\u001b[0m \u001b[31m12.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hCollecting nvidia-cublas-cu12==12.1.3.1 (from torch==2.2.1)\n",
      "  Downloading https://download.pytorch.org/whl/cu121/nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m410.6/410.6 MB\u001b[0m \u001b[31m20.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hCollecting nvidia-cufft-cu12==11.0.2.54 (from torch==2.2.1)\n",
      "  Downloading https://download.pytorch.org/whl/cu121/nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m121.6/121.6 MB\u001b[0m \u001b[31m50.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hCollecting nvidia-curand-cu12==10.3.2.106 (from torch==2.2.1)\n",
      "  Downloading https://download.pytorch.org/whl/cu121/nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.5/56.5 MB\u001b[0m \u001b[31m76.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hCollecting nvidia-cusolver-cu12==11.4.5.107 (from torch==2.2.1)\n",
      "  Downloading https://download.pytorch.org/whl/cu121/nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl (124.2 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m124.2/124.2 MB\u001b[0m \u001b[31m52.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hCollecting nvidia-cusparse-cu12==12.1.0.106 (from torch==2.2.1)\n",
      "  Downloading https://download.pytorch.org/whl/cu121/nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl (196.0 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m196.0/196.0 MB\u001b[0m \u001b[31m36.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hCollecting nvidia-nccl-cu12==2.19.3 (from torch==2.2.1)\n",
      "  Downloading https://download.pytorch.org/whl/cu121/nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl (166.0 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m166.0/166.0 MB\u001b[0m \u001b[31m40.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hCollecting nvidia-nvtx-cu12==12.1.105 (from torch==2.2.1)\n",
      "  Downloading https://download.pytorch.org/whl/cu121/nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (99 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m99.1/99.1 kB\u001b[0m \u001b[31m14.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hCollecting triton==2.2.0 (from torch==2.2.1)\n",
      "  Downloading https://download.pytorch.org/whl/triton-2.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (167.9 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m167.9/167.9 MB\u001b[0m \u001b[31m19.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hCollecting nvidia-nvjitlink-cu12 (from nvidia-cusolver-cu12==11.4.5.107->torch==2.2.1)\n",
      "  Downloading https://download.pytorch.org/whl/cu121/nvidia_nvjitlink_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (19.8 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m19.8/19.8 MB\u001b[0m \u001b[31m125.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: numpy in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from torchvision) (1.23.5)\n",
      "Requirement already satisfied: requests in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from torchvision) (2.31.0)\n",
      "Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from torchvision) (10.3.0)\n",
      "INFO: pip is looking at multiple versions of torchaudio to determine which version is compatible with other requirements. This could take a while.\n",
      "Collecting torchaudio\n",
      "  Downloading https://download.pytorch.org/whl/cu121/torchaudio-2.2.2%2Bcu121-cp310-cp310-linux_x86_64.whl (3.4 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.4/3.4 MB\u001b[0m \u001b[31m129.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25h  Downloading https://download.pytorch.org/whl/cu121/torchaudio-2.2.1%2Bcu121-cp310-cp310-linux_x86_64.whl (3.4 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.4/3.4 MB\u001b[0m \u001b[31m87.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m\n",
      "\u001b[?25hINFO: pip is looking at multiple versions of xformers to determine which version is compatible with other requirements. This could take a while.\n",
      "Collecting xformers\n",
      "  Downloading https://download.pytorch.org/whl/cu121/xformers-0.0.26-cp310-cp310-manylinux2014_x86_64.whl (222.6 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m222.6/222.6 MB\u001b[0m \u001b[31m11.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25h  Downloading https://download.pytorch.org/whl/cu121/xformers-0.0.25.post1-cp310-cp310-manylinux2014_x86_64.whl (222.5 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m222.5/222.5 MB\u001b[0m \u001b[31m34.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25h  Downloading https://download.pytorch.org/whl/cu121/xformers-0.0.25-cp310-cp310-manylinux2014_x86_64.whl (222.5 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m222.5/222.5 MB\u001b[0m \u001b[31m20.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: MarkupSafe>=2.0 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from jinja2->torch==2.2.1) (2.1.1)\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from requests->torchvision) (2.0.4)\n",
      "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from requests->torchvision) (3.7)\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from requests->torchvision) (1.26.18)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from requests->torchvision) (2024.2.2)\n",
      "Requirement already satisfied: mpmath>=0.19 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from sympy->torch==2.2.1) (1.3.0)\n",
      "Installing collected packages: typing-extensions, triton, nvidia-nvtx-cu12, nvidia-nvjitlink-cu12, nvidia-nccl-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, nvidia-cuda-runtime-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, nvidia-cusparse-cu12, nvidia-cudnn-cu12, nvidia-cusolver-cu12, torch, xformers, torchaudio\n",
      "  Attempting uninstall: typing-extensions\n",
      "    Found existing installation: typing_extensions 4.4.0\n",
      "    Uninstalling typing_extensions-4.4.0:\n",
      "      Successfully uninstalled typing_extensions-4.4.0\n",
      "  Attempting uninstall: torch\n",
      "    Found existing installation: torch 2.0.1\n",
      "    Uninstalling torch-2.0.1:\n",
      "      Successfully uninstalled torch-2.0.1\n",
      "Successfully installed nvidia-cublas-cu12-12.1.3.1 nvidia-cuda-cupti-cu12-12.1.105 nvidia-cuda-nvrtc-cu12-12.1.105 nvidia-cuda-runtime-cu12-12.1.105 nvidia-cudnn-cu12-8.9.2.26 nvidia-cufft-cu12-11.0.2.54 nvidia-curand-cu12-10.3.2.106 nvidia-cusolver-cu12-11.4.5.107 nvidia-cusparse-cu12-12.1.0.106 nvidia-nccl-cu12-2.19.3 nvidia-nvjitlink-cu12-12.1.105 nvidia-nvtx-cu12-12.1.105 torch-2.2.1+cu121 torchaudio-2.2.1+cu121 triton-2.2.0 typing-extensions-4.9.0 xformers-0.0.25\n"
     ]
    }
   ],
   "source": [
    "!python -m pip install --upgrade pip\n",
    "!pip3 install torch==2.2.1 torchvision torchaudio   xformers --index-url https://download.pytorch.org/whl/cu121"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 3 - Installation of Uslotch packages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)\n",
      "  Cloning https://github.com/unslothai/unsloth.git to /tmp/wsuser/pip-install-8a93kdi0/unsloth_56c62c14bb3f4be29d884342054fdd22\n",
      "  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/wsuser/pip-install-8a93kdi0/unsloth_56c62c14bb3f4be29d884342054fdd22\n",
      "  Resolved https://github.com/unslothai/unsloth.git to commit 4211cc01409e3ced4f7abebaf68e244193b46e2c\n",
      "  Installing build dependencies ... \u001b[?25ldone\n",
      "\u001b[?25h  Getting requirements to build wheel ... \u001b[?25ldone\n",
      "\u001b[?25h  Installing backend dependencies ... \u001b[?25ldone\n",
      "\u001b[?25h  Preparing metadata (pyproject.toml) ... \u001b[?25ldone\n",
      "\u001b[?25hRequirement already satisfied: tyro in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git) (0.8.3)\n",
      "Requirement already satisfied: transformers>=4.38.2 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git) (4.40.2)\n",
      "Requirement already satisfied: datasets>=2.16.0 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git) (2.19.1)\n",
      "Requirement already satisfied: sentencepiece in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git) (0.1.97)\n",
      "Requirement already satisfied: tqdm in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git) (4.65.0)\n",
      "Requirement already satisfied: psutil in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git) (5.9.0)\n",
      "Requirement already satisfied: wheel>=0.42.0 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git) (0.43.0)\n",
      "Requirement already satisfied: numpy in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git) (1.23.5)\n",
      "Requirement already satisfied: protobuf<4.0.0 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git) (3.20.3)\n",
      "Requirement already satisfied: filelock in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from datasets>=2.16.0->unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git) (3.9.0)\n",
      "Requirement already satisfied: pyarrow>=12.0.0 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from datasets>=2.16.0->unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git) (16.0.0)\n",
      "Requirement already satisfied: pyarrow-hotfix in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from datasets>=2.16.0->unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git) (0.6)\n",
      "Requirement already satisfied: dill<0.3.9,>=0.3.0 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from datasets>=2.16.0->unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git) (0.3.8)\n",
      "Requirement already satisfied: pandas in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from datasets>=2.16.0->unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git) (1.5.3)\n",
      "Requirement already satisfied: requests>=2.19.0 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from datasets>=2.16.0->unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git) (2.31.0)\n",
      "Requirement already satisfied: xxhash in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from datasets>=2.16.0->unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git) (3.4.1)\n",
      "Requirement already satisfied: multiprocess in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from datasets>=2.16.0->unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git) (0.70.16)\n",
      "Requirement already satisfied: fsspec<=2024.3.1,>=2023.1.0 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from fsspec[http]<=2024.3.1,>=2023.1.0->datasets>=2.16.0->unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git) (2024.3.1)\n",
      "Requirement already satisfied: aiohttp in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from datasets>=2.16.0->unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git) (3.9.3)\n",
      "Requirement already satisfied: huggingface-hub>=0.21.2 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from datasets>=2.16.0->unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git) (0.23.0)\n",
      "Requirement already satisfied: packaging in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from datasets>=2.16.0->unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git) (23.0)\n",
      "Requirement already satisfied: pyyaml>=5.1 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from datasets>=2.16.0->unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git) (6.0)\n",
      "Requirement already satisfied: regex!=2019.12.17 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from transformers>=4.38.2->unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git) (2022.3.15)\n",
      "Requirement already satisfied: tokenizers<0.20,>=0.19 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from transformers>=4.38.2->unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git) (0.19.1)\n",
      "Requirement already satisfied: safetensors>=0.4.1 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from transformers>=4.38.2->unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git) (0.4.3)\n",
      "Requirement already satisfied: docstring-parser>=0.14.1 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from tyro->unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git) (0.16)\n",
      "Requirement already satisfied: typing-extensions>=4.7.0 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from tyro->unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git) (4.9.0)\n",
      "Requirement already satisfied: rich>=11.1.0 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from tyro->unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git) (13.7.1)\n",
      "Requirement already satisfied: shtab>=1.5.6 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from tyro->unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git) (1.7.1)\n",
      "Requirement already satisfied: aiosignal>=1.1.2 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from aiohttp->datasets>=2.16.0->unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git) (1.2.0)\n",
      "Requirement already satisfied: attrs>=17.3.0 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from aiohttp->datasets>=2.16.0->unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git) (23.1.0)\n",
      "Requirement already satisfied: frozenlist>=1.1.1 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from aiohttp->datasets>=2.16.0->unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git) (1.3.3)\n",
      "Requirement already satisfied: multidict<7.0,>=4.5 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from aiohttp->datasets>=2.16.0->unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git) (6.0.2)\n",
      "Requirement already satisfied: yarl<2.0,>=1.0 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from aiohttp->datasets>=2.16.0->unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git) (1.8.1)\n",
      "Requirement already satisfied: async-timeout<5.0,>=4.0 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from aiohttp->datasets>=2.16.0->unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git) (4.0.2)\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from requests>=2.19.0->datasets>=2.16.0->unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git) (2.0.4)\n",
      "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from requests>=2.19.0->datasets>=2.16.0->unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git) (3.7)\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from requests>=2.19.0->datasets>=2.16.0->unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git) (1.26.18)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from requests>=2.19.0->datasets>=2.16.0->unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git) (2024.2.2)\n",
      "Requirement already satisfied: markdown-it-py>=2.2.0 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from rich>=11.1.0->tyro->unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git) (3.0.0)\n",
      "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from rich>=11.1.0->tyro->unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git) (2.15.1)\n",
      "Requirement already satisfied: python-dateutil>=2.8.1 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from pandas->datasets>=2.16.0->unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git) (2.8.2)\n",
      "Requirement already satisfied: pytz>=2020.1 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from pandas->datasets>=2.16.0->unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git) (2022.7)\n",
      "Requirement already satisfied: mdurl~=0.1 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from markdown-it-py>=2.2.0->rich>=11.1.0->tyro->unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git) (0.1.2)\n",
      "Requirement already satisfied: six>=1.5 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from python-dateutil>=2.8.1->pandas->datasets>=2.16.0->unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git) (1.16.0)\n",
      "Requirement already satisfied: trl in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (0.8.6)\n",
      "Requirement already satisfied: peft in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (0.10.0)\n",
      "Requirement already satisfied: accelerate in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (0.30.0)\n",
      "Requirement already satisfied: bitsandbytes in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (0.43.1)\n",
      "Requirement already satisfied: datasets in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (2.19.1)\n",
      "Requirement already satisfied: filelock in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from datasets) (3.9.0)\n",
      "Requirement already satisfied: numpy>=1.17 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from datasets) (1.23.5)\n",
      "Requirement already satisfied: pyarrow>=12.0.0 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from datasets) (16.0.0)\n",
      "Requirement already satisfied: pyarrow-hotfix in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from datasets) (0.6)\n",
      "Requirement already satisfied: dill<0.3.9,>=0.3.0 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from datasets) (0.3.8)\n",
      "Requirement already satisfied: pandas in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from datasets) (1.5.3)\n",
      "Requirement already satisfied: requests>=2.19.0 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from datasets) (2.31.0)\n",
      "Requirement already satisfied: tqdm>=4.62.1 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from datasets) (4.65.0)\n",
      "Requirement already satisfied: xxhash in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from datasets) (3.4.1)\n",
      "Requirement already satisfied: multiprocess in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from datasets) (0.70.16)\n",
      "Requirement already satisfied: fsspec<=2024.3.1,>=2023.1.0 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from fsspec[http]<=2024.3.1,>=2023.1.0->datasets) (2024.3.1)\n",
      "Requirement already satisfied: aiohttp in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from datasets) (3.9.3)\n",
      "Requirement already satisfied: huggingface-hub>=0.21.2 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from datasets) (0.23.0)\n",
      "Requirement already satisfied: packaging in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from datasets) (23.0)\n",
      "Requirement already satisfied: pyyaml>=5.1 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from datasets) (6.0)\n",
      "Requirement already satisfied: aiosignal>=1.1.2 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from aiohttp->datasets) (1.2.0)\n",
      "Requirement already satisfied: attrs>=17.3.0 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from aiohttp->datasets) (23.1.0)\n",
      "Requirement already satisfied: frozenlist>=1.1.1 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from aiohttp->datasets) (1.3.3)\n",
      "Requirement already satisfied: multidict<7.0,>=4.5 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from aiohttp->datasets) (6.0.2)\n",
      "Requirement already satisfied: yarl<2.0,>=1.0 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from aiohttp->datasets) (1.8.1)\n",
      "Requirement already satisfied: async-timeout<5.0,>=4.0 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from aiohttp->datasets) (4.0.2)\n",
      "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from huggingface-hub>=0.21.2->datasets) (4.9.0)\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from requests>=2.19.0->datasets) (2.0.4)\n",
      "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from requests>=2.19.0->datasets) (3.7)\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from requests>=2.19.0->datasets) (1.26.18)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from requests>=2.19.0->datasets) (2024.2.2)\n",
      "Requirement already satisfied: python-dateutil>=2.8.1 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from pandas->datasets) (2.8.2)\n",
      "Requirement already satisfied: pytz>=2020.1 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from pandas->datasets) (2022.7)\n",
      "Requirement already satisfied: six>=1.5 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from python-dateutil>=2.8.1->pandas->datasets) (1.16.0)\n",
      "Requirement already satisfied: hyperopt in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (0.2.5)\n",
      "Requirement already satisfied: numpy in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from hyperopt) (1.23.5)\n",
      "Requirement already satisfied: scipy in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from hyperopt) (1.10.1)\n",
      "Requirement already satisfied: six in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from hyperopt) (1.16.0)\n",
      "Requirement already satisfied: networkx>=2.2 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from hyperopt) (2.8.4)\n",
      "Requirement already satisfied: future in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from hyperopt) (0.18.3)\n",
      "Requirement already satisfied: tqdm in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from hyperopt) (4.65.0)\n",
      "Requirement already satisfied: cloudpickle in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from hyperopt) (2.2.1)\n",
      "Requirement already satisfied: optuna in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (3.6.1)\n",
      "Requirement already satisfied: alembic>=1.5.0 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from optuna) (1.13.1)\n",
      "Requirement already satisfied: colorlog in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from optuna) (6.8.2)\n",
      "Requirement already satisfied: numpy in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from optuna) (1.23.5)\n",
      "Requirement already satisfied: packaging>=20.0 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from optuna) (23.0)\n",
      "Requirement already satisfied: sqlalchemy>=1.3.0 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from optuna) (1.4.39)\n",
      "Requirement already satisfied: tqdm in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from optuna) (4.65.0)\n",
      "Requirement already satisfied: PyYAML in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from optuna) (6.0)\n",
      "Requirement already satisfied: Mako in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from alembic>=1.5.0->optuna) (1.3.3)\n",
      "Requirement already satisfied: typing-extensions>=4 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from alembic>=1.5.0->optuna) (4.9.0)\n",
      "Requirement already satisfied: greenlet!=0.4.17 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from sqlalchemy>=1.3.0->optuna) (2.0.1)\n",
      "Requirement already satisfied: MarkupSafe>=0.9.2 in /opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages (from Mako->alembic>=1.5.0->optuna) (2.1.1)\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "major_version, minor_version = torch.cuda.get_device_capability()\n",
    "# Must install separately since Colab has torch 2.2.1, which breaks packages\n",
    "!pip install \"unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git\"\n",
    "if major_version >= 8:\n",
    "    # Use this for new GPUs like Ampere, Hopper GPUs (RTX 30xx, RTX 40xx, A100, H100, L40)\n",
    "    !pip install --no-deps packaging ninja einops flash-attn xformers trl peft \\\n",
    "    accelerate bitsandbytes\n",
    "else:\n",
    "    # Use this for older GPUs (V100, Tesla T4, RTX 20xx)\n",
    "    !pip install --no-deps  trl peft accelerate bitsandbytes\n",
    "!pip install datasets\n",
    "!pip install hyperopt\n",
    "!pip install optuna    \n",
    "pass"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 4 - Analysis of our infrastructure\n",
    "In ordering to perform any training it is important to know our system in order to take the full advantage of the system."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "dH4JvbO9oiHE",
    "outputId": "399bc210-c095-4807-900f-6b4cf2fe133f"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Unable to find python bindings at /usr/local/dcgm/bindings/python3. No data will be captured.\n",
      "xFormers 0.0.25\n",
      "memory_efficient_attention.ckF:                    unavailable\n",
      "memory_efficient_attention.ckB:                    unavailable\n",
      "memory_efficient_attention.ck_decoderF:            unavailable\n",
      "memory_efficient_attention.ck_splitKF:             unavailable\n",
      "memory_efficient_attention.cutlassF:               available\n",
      "memory_efficient_attention.cutlassB:               available\n",
      "memory_efficient_attention.decoderF:               available\n",
      "memory_efficient_attention.flshattF@v2.5.6:        available\n",
      "memory_efficient_attention.flshattB@v2.5.6:        available\n",
      "memory_efficient_attention.smallkF:                available\n",
      "memory_efficient_attention.smallkB:                available\n",
      "memory_efficient_attention.triton_splitKF:         unavailable\n",
      "indexing.scaled_index_addF:                        unavailable\n",
      "indexing.scaled_index_addB:                        unavailable\n",
      "indexing.index_select:                             unavailable\n",
      "sequence_parallel_fused.write_values:              available\n",
      "sequence_parallel_fused.wait_values:               available\n",
      "sequence_parallel_fused.cuda_memset_32b_async:     available\n",
      "sp24.sparse24_sparsify_both_ways:                  available\n",
      "sp24.sparse24_apply:                               available\n",
      "sp24.sparse24_apply_dense_output:                  available\n",
      "sp24._sparse24_gemm:                               available\n",
      "sp24._cslt_sparse_mm@0.4.0:                        available\n",
      "swiglu.dual_gemm_silu:                             available\n",
      "swiglu.gemm_fused_operand_sum:                     available\n",
      "swiglu.fused.p.cpp:                                available\n",
      "is_triton_available:                               False\n",
      "pytorch.version:                                   2.2.1+cu121\n",
      "pytorch.cuda:                                      available\n",
      "gpu.compute_capability:                            7.0\n",
      "gpu.name:                                          Tesla V100-PCIE-16GB\n",
      "dcgm_profiler:                                     unavailable\n",
      "build.info:                                        available\n",
      "build.cuda_version:                                1201\n",
      "build.hip_version:                                 None\n",
      "build.python_version:                              3.10.13\n",
      "build.torch_version:                               2.2.1+cu121\n",
      "build.env.TORCH_CUDA_ARCH_LIST:                    5.0+PTX 6.0 6.1 7.0 7.5 8.0+PTX 9.0\n",
      "build.env.PYTORCH_ROCM_ARCH:                       None\n",
      "build.env.XFORMERS_BUILD_TYPE:                     Release\n",
      "build.env.XFORMERS_ENABLE_DEBUG_ASSERTIONS:        None\n",
      "build.env.NVCC_FLAGS:                              None\n",
      "build.env.XFORMERS_PACKAGE_FROM:                   wheel-v0.0.25\n",
      "build.nvcc_version:                                12.1.66\n",
      "source.privacy:                                    open source\n",
      "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n",
      "++++++++++++++++++ BUG REPORT INFORMATION ++++++++++++++++++\n",
      "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n",
      "++++++++++++++++++++++++++ OTHER +++++++++++++++++++++++++++\n",
      "CUDA specs: CUDASpecs(highest_compute_capability=(7, 0), cuda_version_string='121', cuda_version_tuple=(12, 1))\n",
      "PyTorch settings found: CUDA_VERSION=121, Highest Compute Capability: (7, 0).\n",
      "To manually override the PyTorch CUDA version please see: https://github.com/TimDettmers/bitsandbytes/blob/main/docs/source/nonpytorchcuda.mdx\n",
      "WARNING: Compute capability < 7.5 detected! Only slow 8-bit matmul is supported for your GPU!\n",
      "If you run into issues with 8-bit matmul, you can try 4-bit quantization:\n",
      "https://huggingface.co/blog/4bit-transformers-bitsandbytes\n",
      "The directory listed in your path is found to be non-existent: /usr/local/nvidia/lib\n",
      "The directory listed in your path is found to be non-existent: //private.runtime.dataplatform.cloud.ibm.com\n",
      "The directory listed in your path is found to be non-existent: /home/wsuser/jars/*\n",
      "The directory listed in your path is found to be non-existent: /opt/jdbc/*\n",
      "The directory listed in your path is found to be non-existent: bluemix/prod\n",
      "The directory listed in your path is found to be non-existent: //api.dataplatform.cloud.ibm.com\n",
      "The directory listed in your path is found to be non-existent: //matplotlib_inline.backend_inline\n",
      "The directory listed in your path is found to be non-existent: --xla_gpu_cuda_data_dir=/opt/conda/envs/Python-RT23.1-CUDA\n",
      "CUDA SETUP: WARNING! CUDA runtime files not found in any environmental path.\n",
      "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n",
      "++++++++++++++++++++++ DEBUG INFO END ++++++++++++++++++++++\n",
      "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n",
      "Checking that the library is importable and CUDA is callable...\n",
      "SUCCESS!\n",
      "Installation was successful!\n",
      "Thu May  9 19:59:13 2024       \n",
      "+---------------------------------------------------------------------------------------+\n",
      "| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |\n",
      "|-----------------------------------------+----------------------+----------------------+\n",
      "| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |\n",
      "| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |\n",
      "|                                         |                      |               MIG M. |\n",
      "|=========================================+======================+======================|\n",
      "|   0  Tesla V100-PCIE-16GB           Off | 00000000:AF:00.0 Off |                    0 |\n",
      "| N/A   33C    P0              40W / 250W |      4MiB / 16384MiB |      0%      Default |\n",
      "|                                         |                      |                  N/A |\n",
      "+-----------------------------------------+----------------------+----------------------+\n",
      "|   1  Tesla V100-PCIE-16GB           Off | 00000000:D8:00.0 Off |                    0 |\n",
      "| N/A   33C    P0              26W / 250W |      4MiB / 16384MiB |      0%      Default |\n",
      "|                                         |                      |                  N/A |\n",
      "+-----------------------------------------+----------------------+----------------------+\n",
      "                                                                                         \n",
      "+---------------------------------------------------------------------------------------+\n",
      "| Processes:                                                                            |\n",
      "|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |\n",
      "|        ID   ID                                                             Usage      |\n",
      "|=======================================================================================|\n",
      "|  No running processes found                                                           |\n",
      "+---------------------------------------------------------------------------------------+\n"
     ]
    }
   ],
   "source": [
    "!python -m xformers.info\n",
    "!python -m bitsandbytes\n",
    "!nvidia-smi"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 5 Login to Hugging Face"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.\n",
      "Token is valid (permission: write).\n",
      "Your token has been saved to /home/wsuser/.cache/huggingface/token\n",
      "Login successful\n"
     ]
    }
   ],
   "source": [
    "token=\"hf_\"\n",
    "from huggingface_hub import login, logout\n",
    "login(token) # non-blocking login"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 5  Simple Fine Tunning Method\n",
    "\n",
    "First let us show the simplest method that is given by  SFTTrainer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "25db645610ac40e6a8a647896dec0f16",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a8d6ac36539a4c12a5dc6eafbc58d9ca",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "53fa0f7afe9849c99fa1761aafd363c1",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Loading dataset shards:   0%|          | 0/18 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "da22a855c9524144870d16abcf850047",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "6770b2824fa64dc1a3dd6279683e6915",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "cd128b3e68ac4bf6a87d812e1c0c248e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0fb4fe0fc5384b3182762db3f8c42b9e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7f9cfeabcf424ccc98a2b9d69d770a0a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "8244c9b77e7f4afe8524963a8f7b02e2",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "690662c1a9244cdd9ea9f1162ce3cd30",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "3813d56b85b74450a93e9ade68a807b5",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ebb06c38dd79488ab8eb273359406577",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "5251b05267db420d8f14ce05b7d52b81",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "947b7ad7d14f4201902e1a292ec7f513",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "1ddcf0ae14934fd79d8926aee0972abe",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7423975680ef41ff9ac362c72b2c0907",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/100 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.\n",
      "max_steps is given, it will override any value given in num_train_epochs\n",
      "/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/utils/checkpoint.py:460: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='1' max='1' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [1/1 00:04, Epoch 0/1]\n",
       "    </div>\n",
       "    <table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       " <tr style=\"text-align: left;\">\n",
       "      <th>Step</th>\n",
       "      <th>Training Loss</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>2.346700</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table><p>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "TrainOutput(global_step=1, training_loss=2.346719980239868, metrics={'train_runtime': 8.2949, 'train_samples_per_second': 0.482, 'train_steps_per_second': 0.121, 'total_flos': 74593973698560.0, 'train_loss': 2.346719980239868, 'epoch': 0.04})"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from datasets import load_dataset\n",
    "import torch\n",
    "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer\n",
    "from peft import LoraConfig\n",
    "from trl import SFTTrainer\n",
    "from transformers import TrainingArguments\n",
    "# Load the dataset\n",
    "dataset_name = \"ruslanmv/ai-medical-dataset\"\n",
    "dataset = load_dataset(dataset_name, split=\"train\")\n",
    "# Select the first 1000 rows of the dataset\n",
    "dataset = dataset.select(range(100))\n",
    "# Device map\n",
    "device_map = 'auto'  # for PP and running with `python test_sft.py`\n",
    "# Load the model + tokenizer\n",
    "model_name = \"meta-llama/Meta-Llama-3-8B-Instruct\"\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
    "tokenizer.pad_token = tokenizer.eos_token\n",
    "bnb_config = BitsAndBytesConfig(\n",
    "    load_in_4bit=True,\n",
    "    bnb_4bit_quant_type=\"nf4\",\n",
    "    bnb_4bit_compute_dtype=torch.float16,\n",
    ")\n",
    "model = AutoModelForCausalLM.from_pretrained(\n",
    "    model_name,\n",
    "    quantization_config=bnb_config,\n",
    "    trust_remote_code=True,\n",
    "    use_cache=False,\n",
    "    device_map=device_map\n",
    ")\n",
    "# PEFT config\n",
    "lora_alpha = 16\n",
    "lora_dropout = 0.1\n",
    "lora_r = 32  # 64\n",
    "peft_config = LoraConfig(\n",
    "    lora_alpha=lora_alpha,\n",
    "    lora_dropout=lora_dropout,\n",
    "    r=lora_r,\n",
    "    bias=\"none\",\n",
    "    task_type=\"CAUSAL_LM\",\n",
    "    target_modules=[\"k_proj\", \"q_proj\", \"v_proj\", \"up_proj\", \"down_proj\", \"gate_proj\"],\n",
    "    modules_to_save=[\"embed_tokens\", \"input_layernorm\", \"post_attention_layernorm\", \"norm\"],\n",
    ")\n",
    "# Args\n",
    "max_seq_length = 512\n",
    "output_dir = \"./results\"\n",
    "per_device_train_batch_size = 2  # reduced batch size to avoid OOM\n",
    "gradient_accumulation_steps = 2\n",
    "optim = \"adamw_torch\"\n",
    "save_steps = 10\n",
    "logging_steps = 1\n",
    "learning_rate = 2e-4\n",
    "max_grad_norm = 0.3\n",
    "max_steps = 1  \n",
    "warmup_ratio = 0.1\n",
    "lr_scheduler_type = \"cosine\"\n",
    "training_arguments = TrainingArguments(\n",
    "    output_dir=output_dir,\n",
    "    per_device_train_batch_size=per_device_train_batch_size,\n",
    "    gradient_accumulation_steps=gradient_accumulation_steps,\n",
    "    optim=optim,\n",
    "    save_steps=save_steps,\n",
    "    logging_steps=logging_steps,\n",
    "    learning_rate=learning_rate,\n",
    "    fp16=True,\n",
    "    max_grad_norm=max_grad_norm,\n",
    "    max_steps=max_steps,\n",
    "    warmup_ratio=warmup_ratio,\n",
    "    group_by_length=True,\n",
    "    lr_scheduler_type=lr_scheduler_type,\n",
    "    gradient_checkpointing=True,  # gradient checkpointing\n",
    "    #report_to=\"wandb\",\n",
    ")\n",
    "# Trainer\n",
    "trainer = SFTTrainer(\n",
    "    model=model,\n",
    "    train_dataset=dataset,\n",
    "    peft_config=peft_config,\n",
    "    dataset_text_field=\"context\",\n",
    "    max_seq_length=max_seq_length,\n",
    "    tokenizer=tokenizer,\n",
    "    args=training_arguments,\n",
    ")\n",
    "\n",
    "# Train :)\n",
    "trainer.train()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.\n",
      "Token is valid (permission: write).\n",
      "Your token has been saved to /home/wsuser/.cache/huggingface/token\n",
      "Login successful\n"
     ]
    }
   ],
   "source": [
    "token=\"\"\n",
    "from huggingface_hub import login, logout\n",
    "login(token) # non-blocking login"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Multiple GPUS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
      "  warnings.warn(\n",
      "Some weights of BartForCausalLM were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['decoder.embed_tokens.weight', 'lm_head.weight']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
    "from datasets import load_dataset\n",
    "from trl import SFTTrainer\n",
    "from transformers import TrainingArguments\n",
    "import os\n",
    "import socket\n",
    "\n",
    "# Distributed training setup (assuming all GPUs are available on a single machine)\n",
    "def init_distributed(rank, world_size):\n",
    "    \"\"\"Initializes distributed training using `nccl` backend.\"\"\"\n",
    "    if rank == 0:\n",
    "        os.environ[\"MASTER_ADDR\"] = socket.gethostname()  # Set MASTER_ADDR using rank 0's hostname\n",
    "    else:\n",
    "        # Wait a bit to ensure MASTER_ADDR is set before other ranks try to use it\n",
    "        import time\n",
    "        time.sleep(5)\n",
    "    os.environ[\"MASTER_PORT\"] = \"12345\"  # Set MASTER_PORT environment variable\n",
    "    os.environ[\"RANK\"] = str(rank)  # Set RANK environment variable\n",
    "    os.environ[\"WORLD_SIZE\"] = str(world_size)  # Set WORLD_SIZE environment variable\n",
    "    torch.distributed.init_process_group(backend='nccl', init_method='env://')\n",
    "\n",
    "# Cleanup after training\n",
    "def cleanup_distributed():\n",
    "    if torch.distributed.is_initialized():\n",
    "        torch.distributed.destroy_process_group()\n",
    "\n",
    "# Model and tokenizer selection\n",
    "model_name = \"facebook/bart-base\"  # Replace with your desired model\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
    "model = AutoModelForCausalLM.from_pretrained(model_name)\n",
    "\n",
    "# Dataset loading (replace with your dataset and field names)\n",
    "dataset = load_dataset(\"glue\", \"mnli\", split=\"train\")\n",
    "text_field = \"premise\"  # Assuming premise is the field containing text for prediction\n",
    "\n",
    "# Training arguments (adjust hyperparameters as needed)\n",
    "training_args = TrainingArguments(\n",
    "    output_dir=\"./results\",\n",
    "    per_device_train_batch_size=2,  # Adjust based on GPU memory (might need to adjust)\n",
    "    save_steps=500,\n",
    "    save_total_limit=2,\n",
    "    num_train_epochs=3,  # Adjust training time as needed\n",
    ")\n",
    "\n",
    "world_size = torch.cuda.device_count()\n",
    "if world_size > 1:\n",
    "    # Initialize distributed training\n",
    "    init_distributed(rank=0, world_size=world_size)  # Rank is assumed to be 0 here\n",
    "\n",
    "    # Wrap model in DDP for distributed training\n",
    "    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[torch.cuda.current_device()])\n",
    "\n",
    "    # Create SFTTrainer with distributed settings\n",
    "    trainer = SFTTrainer(\n",
    "        model=model,\n",
    "        args=training_args,  # Pass training_args as 'args' instead of 'training_args'\n",
    "        train_dataset=dataset,\n",
    "        dataset_text_field=text_field,\n",
    "        compute_metrics=None,  # You can define your custom metrics here\n",
    "    )\n",
    "    print(\"Trainer For distributed training loaded\")\n",
    "else:\n",
    "    # For single-GPU training\n",
    "    trainer = SFTTrainer(\n",
    "        model=model,\n",
    "        args=training_args,  # Pass training_args as 'args' instead of 'training_args'\n",
    "        train_dataset=dataset,\n",
    "        dataset_text_field=text_field,\n",
    "        compute_metrics=None,  # You can define your custom metrics here\n",
    "    )\n",
    "    print(\"Trainer For single-GPU loaded\")\n",
    "\n",
    "# Start training\n",
    "trainer.train()\n",
    "\n",
    "# Cleanup after training\n",
    "cleanup_distributed()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
      "  warnings.warn(\n",
      "Some weights of BartForCausalLM were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['decoder.embed_tokens.weight', 'lm_head.weight']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
      "/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
      "  warnings.warn(\n",
      "/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:246: UserWarning: You didn't pass a `max_seq_length` argument to the SFTTrainer, this will default to 1024\n",
      "  warnings.warn(\n",
      "Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Trainer For single-GPU loaded\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
      "  warnings.warn('Was asked to gather along dimension 0, but all '\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='329' max='294528' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [   329/294528 00:36 < 9:12:58, 8.87 it/s, Epoch 0.00/3]\n",
       "    </div>\n",
       "    <table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       " <tr style=\"text-align: left;\">\n",
       "      <th>Step</th>\n",
       "      <th>Training Loss</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table><p>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[1], line 66\u001b[0m\n\u001b[1;32m     63\u001b[0m   \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTrainer For single-GPU loaded\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m     65\u001b[0m \u001b[38;5;66;03m# Start training\u001b[39;00m\n\u001b[0;32m---> 66\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     68\u001b[0m \u001b[38;5;66;03m# Cleanup after training\u001b[39;00m\n\u001b[1;32m     69\u001b[0m cleanup_distributed()\n",
      "File \u001b[0;32m/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:361\u001b[0m, in \u001b[0;36mSFTTrainer.train\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m    358\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mneftune_noise_alpha \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_trainer_supports_neftune:\n\u001b[1;32m    359\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_trl_activate_neftune(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel)\n\u001b[0;32m--> 361\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    363\u001b[0m \u001b[38;5;66;03m# After training we make sure to retrieve back the original forward pass method\u001b[39;00m\n\u001b[1;32m    364\u001b[0m \u001b[38;5;66;03m# for the embedding layer by removing the forward post hook.\u001b[39;00m\n\u001b[1;32m    365\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mneftune_noise_alpha \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_trainer_supports_neftune:\n",
      "File \u001b[0;32m/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/transformers/trainer.py:1859\u001b[0m, in \u001b[0;36mTrainer.train\u001b[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[0m\n\u001b[1;32m   1857\u001b[0m         hf_hub_utils\u001b[38;5;241m.\u001b[39menable_progress_bars()\n\u001b[1;32m   1858\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1859\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43minner_training_loop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   1860\u001b[0m \u001b[43m        \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1861\u001b[0m \u001b[43m        \u001b[49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1862\u001b[0m \u001b[43m        \u001b[49m\u001b[43mtrial\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrial\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1863\u001b[0m \u001b[43m        \u001b[49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1864\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/transformers/trainer.py:2203\u001b[0m, in \u001b[0;36mTrainer._inner_training_loop\u001b[0;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[0m\n\u001b[1;32m   2200\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcallback_handler\u001b[38;5;241m.\u001b[39mon_step_begin(args, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol)\n\u001b[1;32m   2202\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maccelerator\u001b[38;5;241m.\u001b[39maccumulate(model):\n\u001b[0;32m-> 2203\u001b[0m     tr_loss_step \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtraining_step\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   2205\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m   2206\u001b[0m     args\u001b[38;5;241m.\u001b[39mlogging_nan_inf_filter\n\u001b[1;32m   2207\u001b[0m     \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_torch_xla_available()\n\u001b[1;32m   2208\u001b[0m     \u001b[38;5;129;01mand\u001b[39;00m (torch\u001b[38;5;241m.\u001b[39misnan(tr_loss_step) \u001b[38;5;129;01mor\u001b[39;00m torch\u001b[38;5;241m.\u001b[39misinf(tr_loss_step))\n\u001b[1;32m   2209\u001b[0m ):\n\u001b[1;32m   2210\u001b[0m     \u001b[38;5;66;03m# if loss is nan or inf simply add the average of previous logged losses\u001b[39;00m\n\u001b[1;32m   2211\u001b[0m     tr_loss \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m tr_loss \u001b[38;5;241m/\u001b[39m (\u001b[38;5;241m1\u001b[39m \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mglobal_step \u001b[38;5;241m-\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_globalstep_last_logged)\n",
      "File \u001b[0;32m/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/transformers/trainer.py:3138\u001b[0m, in \u001b[0;36mTrainer.training_step\u001b[0;34m(self, model, inputs)\u001b[0m\n\u001b[1;32m   3135\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m loss_mb\u001b[38;5;241m.\u001b[39mreduce_mean()\u001b[38;5;241m.\u001b[39mdetach()\u001b[38;5;241m.\u001b[39mto(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mdevice)\n\u001b[1;32m   3137\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcompute_loss_context_manager():\n\u001b[0;32m-> 3138\u001b[0m     loss \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompute_loss\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   3140\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mn_gpu \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m   3141\u001b[0m     loss \u001b[38;5;241m=\u001b[39m loss\u001b[38;5;241m.\u001b[39mmean()  \u001b[38;5;66;03m# mean() to average on multi-gpu parallel training\u001b[39;00m\n",
      "File \u001b[0;32m/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/transformers/trainer.py:3161\u001b[0m, in \u001b[0;36mTrainer.compute_loss\u001b[0;34m(self, model, inputs, return_outputs)\u001b[0m\n\u001b[1;32m   3159\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m   3160\u001b[0m     labels \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 3161\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   3162\u001b[0m \u001b[38;5;66;03m# Save past state if it exists\u001b[39;00m\n\u001b[1;32m   3163\u001b[0m \u001b[38;5;66;03m# TODO: this needs to be fixed and made cleaner later.\u001b[39;00m\n\u001b[1;32m   3164\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mpast_index \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m:\n",
      "File \u001b[0;32m/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/nn/modules/module.py:1511\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1509\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)  \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m   1510\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1511\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/nn/modules/module.py:1520\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1515\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m   1516\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m   1517\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m   1518\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m   1519\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1520\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1522\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m   1523\u001b[0m     result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
      "File \u001b[0;32m/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py:185\u001b[0m, in \u001b[0;36mDataParallel.forward\u001b[0;34m(self, *inputs, **kwargs)\u001b[0m\n\u001b[1;32m    183\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodule(\u001b[38;5;241m*\u001b[39minputs[\u001b[38;5;241m0\u001b[39m], \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mmodule_kwargs[\u001b[38;5;241m0\u001b[39m])\n\u001b[1;32m    184\u001b[0m replicas \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreplicate(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodule, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdevice_ids[:\u001b[38;5;28mlen\u001b[39m(inputs)])\n\u001b[0;32m--> 185\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mparallel_apply\u001b[49m\u001b[43m(\u001b[49m\u001b[43mreplicas\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodule_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    186\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgather(outputs, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput_device)\n",
      "File \u001b[0;32m/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py:200\u001b[0m, in \u001b[0;36mDataParallel.parallel_apply\u001b[0;34m(self, replicas, inputs, kwargs)\u001b[0m\n\u001b[1;32m    199\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mparallel_apply\u001b[39m(\u001b[38;5;28mself\u001b[39m, replicas: Sequence[T], inputs: Sequence[Any], kwargs: Any) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m List[Any]:\n\u001b[0;32m--> 200\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mparallel_apply\u001b[49m\u001b[43m(\u001b[49m\u001b[43mreplicas\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdevice_ids\u001b[49m\u001b[43m[\u001b[49m\u001b[43m:\u001b[49m\u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mreplicas\u001b[49m\u001b[43m)\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py:100\u001b[0m, in \u001b[0;36mparallel_apply\u001b[0;34m(modules, inputs, kwargs_tup, devices)\u001b[0m\n\u001b[1;32m     98\u001b[0m         thread\u001b[38;5;241m.\u001b[39mstart()\n\u001b[1;32m     99\u001b[0m     \u001b[38;5;28;01mfor\u001b[39;00m thread \u001b[38;5;129;01min\u001b[39;00m threads:\n\u001b[0;32m--> 100\u001b[0m         \u001b[43mthread\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mjoin\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    101\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m    102\u001b[0m     _worker(\u001b[38;5;241m0\u001b[39m, modules[\u001b[38;5;241m0\u001b[39m], inputs[\u001b[38;5;241m0\u001b[39m], kwargs_tup[\u001b[38;5;241m0\u001b[39m], devices[\u001b[38;5;241m0\u001b[39m], streams[\u001b[38;5;241m0\u001b[39m])\n",
      "File \u001b[0;32m/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/threading.py:1096\u001b[0m, in \u001b[0;36mThread.join\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m   1093\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcannot join current thread\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m   1095\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m-> 1096\u001b[0m     \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_wait_for_tstate_lock\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1097\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m   1098\u001b[0m     \u001b[38;5;66;03m# the behavior of a negative timeout isn't documented, but\u001b[39;00m\n\u001b[1;32m   1099\u001b[0m     \u001b[38;5;66;03m# historically .join(timeout=x) for x<0 has acted as if timeout=0\u001b[39;00m\n\u001b[1;32m   1100\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_wait_for_tstate_lock(timeout\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mmax\u001b[39m(timeout, \u001b[38;5;241m0\u001b[39m))\n",
      "File \u001b[0;32m/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/threading.py:1116\u001b[0m, in \u001b[0;36mThread._wait_for_tstate_lock\u001b[0;34m(self, block, timeout)\u001b[0m\n\u001b[1;32m   1113\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[1;32m   1115\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 1116\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[43mlock\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43macquire\u001b[49m\u001b[43m(\u001b[49m\u001b[43mblock\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[1;32m   1117\u001b[0m         lock\u001b[38;5;241m.\u001b[39mrelease()\n\u001b[1;32m   1118\u001b[0m         \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_stop()\n",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "import torch\n",
    "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
    "from datasets import load_dataset\n",
    "from trl import SFTTrainer\n",
    "from transformers import TrainingArguments\n",
    "import os\n",
    "\n",
    "# Distributed training setup (assuming all GPUs are available on a single machine)\n",
    "def init_distributed():\n",
    "  # Replace with actual hostname or IP if using multiple machines\n",
    "  os.environ[\"MASTER_ADDR\"] = \"localhost\"\n",
    "  os.environ[\"MASTER_PORT\"] = \"12345\"\n",
    "  torch.distributed.init_process_group(backend='nccl', world_size=torch.cuda.device_count(), rank=rank)\n",
    "\n",
    "def cleanup_distributed():\n",
    "  torch.distributed.destroy_process_group()\n",
    "\n",
    "# Model and tokenizer selection\n",
    "model_name = \"facebook/bart-base\"  # Replace with your desired model\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
    "model = AutoModelForCausalLM.from_pretrained(model_name)\n",
    "\n",
    "# Dataset loading (replace with your dataset and field names)\n",
    "dataset = load_dataset(\"glue\", \"mnli\", split=\"train\")\n",
    "text_field = \"premise\"  # Assuming premise is the field containing text for prediction\n",
    "\n",
    "# Training arguments (adjust hyperparameters as needed)\n",
    "training_args = TrainingArguments(\n",
    "    output_dir=\"./results\",\n",
    "    per_device_train_batch_size=2,  # Adjust based on GPU memory\n",
    "    save_steps=500,\n",
    "    save_total_limit=2,\n",
    "    num_train_epochs=3,  # Adjust training time as needed\n",
    ")\n",
    "\n",
    "# Distributed training setup with SFTTrainer\n",
    "if torch.distributed.is_initialized():\n",
    "  rank = torch.distributed.get_rank()\n",
    "  world_size = torch.distributed.get_world_size()\n",
    "  # Wrap model in DDP for distributed training\n",
    "  model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[rank])\n",
    "\n",
    "  # Create SFTTrainer with distributed settings\n",
    "  trainer = SFTTrainer(\n",
    "      model=model,\n",
    "      args=training_args,  # Pass training_args as 'args' instead of 'training_args'\n",
    "      train_dataset=dataset,\n",
    "      dataset_text_field=text_field,\n",
    "      compute_metrics=None,  # You can define your custom metrics here\n",
    "      world_size=world_size,\n",
    "      rank=rank,\n",
    "  )\n",
    "  print(f\"Trainer For distributed training loaded on rank {rank}\")\n",
    "else:\n",
    "  # For single-GPU training\n",
    "  trainer = SFTTrainer(\n",
    "      model=model,\n",
    "      args=training_args,  # Pass training_args as 'args' instead of 'training_args'\n",
    "      train_dataset=dataset,\n",
    "      dataset_text_field=text_field,\n",
    "      compute_metrics=None,  # You can define your custom metrics here\n",
    "  )\n",
    "  print(\"Trainer For single-GPU loaded\")\n",
    "\n",
    "# Start training\n",
    "trainer.train()\n",
    "\n",
    "# Cleanup after training\n",
    "cleanup_distributed()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Traceback (most recent call last):\n",
      "  File \"<string>\", line 1, in <module>\n",
      "  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/multiprocessing/spawn.py\", line 116, in spawn_main\n",
      "    exitcode = _main(fd, parent_sentinel)\n",
      "  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/multiprocessing/spawn.py\", line 126, in _main\n",
      "    self = reduction.pickle.load(from_parent)\n",
      "AttributeError: Can't get attribute 'main_worker' on <module '__main__' (built-in)>\n",
      "Traceback (most recent call last):\n",
      "  File \"<string>\", line 1, in <module>\n",
      "  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/multiprocessing/spawn.py\", line 116, in spawn_main\n",
      "    exitcode = _main(fd, parent_sentinel)\n",
      "  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/multiprocessing/spawn.py\", line 126, in _main\n",
      "    self = reduction.pickle.load(from_parent)\n",
      "AttributeError: Can't get attribute 'main_worker' on <module '__main__' (built-in)>\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import torch\n",
    "import torch.multiprocessing as mp\n",
    "from datasets import load_dataset\n",
    "import torch\n",
    "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer\n",
    "from peft import LoraConfig\n",
    "from trl import SFTTrainer\n",
    "from transformers import TrainingArguments\n",
    "from torch.nn.parallel import DistributedDataParallel as DDP\n",
    "\n",
    "\n",
    "# Distributed training setup\n",
    "def init_distributed():\n",
    "    os.environ[\"MASTER_ADDR\"] = \"localhost\"\n",
    "    os.environ[\"MASTER_PORT\"] = \"12345\"\n",
    "    torch.distributed.init_process_group(backend='nccl', world_size=torch.cuda.device_count(), rank=rank)\n",
    "\n",
    "def cleanup_distributed():\n",
    "    torch.distributed.destroy_process_group()\n",
    "\n",
    "def main_worker(rank, world_size):\n",
    "    init_distributed()\n",
    "\n",
    "    # Your model training and fine-tuning code goes here\n",
    "    # Load the dataset\n",
    "    dataset_name = \"ruslanmv/ai-medical-dataset\"\n",
    "    dataset = load_dataset(dataset_name, split=\"train\")\n",
    "    # Select the first 1M rows of the dataset\n",
    "    dataset = dataset.select(range(100))\n",
    "\n",
    "    # Load the model + tokenizer\n",
    "    model_name = \"meta-llama/Meta-Llama-3-8B-Instruct\"\n",
    "    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
    "    tokenizer.pad_token = tokenizer.eos_token\n",
    "    bnb_config = BitsAndBytesConfig(\n",
    "        load_in_4bit=True,\n",
    "        bnb_4bit_quant_type=\"nf4\",\n",
    "        bnb_4bit_compute_dtype=torch.float16,\n",
    "    )\n",
    "    model = AutoModelForCausalLM.from_pretrained(\n",
    "        model_name,\n",
    "        quantization_config=bnb_config,\n",
    "        trust_remote_code=True,\n",
    "        use_cache=False,\n",
    "    )\n",
    "\n",
    "    # Check for available GPUs\n",
    "    device = torch.device(f\"cuda:{rank}\" if torch.cuda.is_available() else \"cpu\")\n",
    "\n",
    "    # PEFT config\n",
    "    lora_alpha = 1\n",
    "    lora_dropout = 0.1\n",
    "    lora_r = 32  # 64\n",
    "    peft_config = LoraConfig(\n",
    "        lora_alpha=lora_alpha,\n",
    "        lora_dropout=lora_dropout,\n",
    "        task_type=\"CAUSAL_LM\",\n",
    "        target_modules=[\"k_proj\", \"q_proj\", \"v_proj\", \"up_proj\", \"down_proj\", \"gate_proj\"],\n",
    "        modules_to_save=[\"embed_tokens\", \"input_layernorm\", \"post_attention_layernorm\", \"norm\"],\n",
    "    )\n",
    "\n",
    "    # Args\n",
    "    max_seq_length = 512\n",
    "    output_dir = \"./results\"\n",
    "    per_device_train_batch_size = 2  # reduced batch size to avoid OOM\n",
    "    gradient_accumulation_steps = 2\n",
    "    optim = \"adamw_torch\"\n",
    "    save_steps = 10\n",
    "    logging_steps = 1\n",
    "    learning_rate = 2e-4\n",
    "    max_grad_norm = 0.3\n",
    "    max_steps = 1  # 300 Approx the size of guanaco at bs 8, ga 2, 2 GPUs.\n",
    "    warmup_ratio = 0.1\n",
    "    lr_scheduler_type = \"cosine\"\n",
    "    training_arguments = TrainingArguments(\n",
    "        output_dir=output_dir,\n",
    "        per_device_train_batch_size=per_device_train_batch_size,\n",
    "        gradient_accumulation_steps=gradient_accumulation_steps,\n",
    "        optim=optim,\n",
    "        save_steps=save_steps,\n",
    "        logging_steps=logging_steps,\n",
    "        learning_rate=learning_rate,\n",
    "        fp16=True,\n",
    "        max_grad_norm=max_grad_norm,\n",
    "        max_steps=max_steps,\n",
    "        warmup_ratio=warmup_ratio,\n",
    "        group_by_length=True,\n",
    "        lr_scheduler_type=lr_scheduler_type,\n",
    "        gradient_checkpointing=True,  # gradient checkpointing\n",
    "        #report_to=\"wandb\",\n",
    "    )\n",
    "\n",
    "    # Trainer\n",
    "    trainer = SFTTrainer(\n",
    "        model=model,\n",
    "        train_dataset=dataset,\n",
    "        peft_config=peft_config,\n",
    "        dataset_text_field=\"context\",\n",
    "        max_seq_length=max_seq_length,\n",
    "        tokenizer=tokenizer,\n",
    "        args=training_arguments,\n",
    "    )\n",
    "\n",
    "    # Train :)\n",
    "    trainer.train()\n",
    "    cleanup_distributed()\n",
    "\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    world_size = torch.cuda.device_count()\n",
    "    mp.set_start_method('spawn')  # Add this line to fix the error\n",
    "    processes = []\n",
    "    for rank in range(world_size):\n",
    "        p = mp.Process(target=main_worker, args=(rank, world_size))\n",
    "        p.start()\n",
    "        processes.append(p)\n",
    "    for p in processes:\n",
    "        p.join()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "Supervised Finetuning Trainer (SFT Trainer)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "def finetune():\n",
    "    from datasets import load_dataset\n",
    "    import torch\n",
    "    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer\n",
    "    from peft import LoraConfig\n",
    "    from trl import SFTTrainer\n",
    "    from transformers import TrainingArguments\n",
    "    from torch.nn.parallel import DistributedDataParallel as DDP\n",
    "    # Load the dataset\n",
    "    dataset_name = \"ruslanmv/ai-medical-dataset\"\n",
    "    dataset = load_dataset(dataset_name, split=\"train\")\n",
    "    # Select the first 1M rows of the dataset\n",
    "    dataset = dataset.select(range(100))\n",
    "    # Load the model + tokenizer\n",
    "    model_name = \"meta-llama/Meta-Llama-3-8B-Instruct\"\n",
    "    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
    "    tokenizer.pad_token = tokenizer.eos_token\n",
    "    bnb_config = BitsAndBytesConfig(\n",
    "      load_in_4bit=True,\n",
    "      bnb_4bit_quant_type=\"nf4\",\n",
    "      bnb_4bit_compute_dtype=torch.float16,\n",
    "    )\n",
    "    model = AutoModelForCausalLM.from_pretrained(\n",
    "      model_name,\n",
    "      quantization_config=bnb_config,\n",
    "      trust_remote_code=True,\n",
    "      use_cache=False,\n",
    "    )\n",
    "    # Check for available GPUs\n",
    "    if torch.cuda.device_count() > 1:\n",
    "      print(\"Multiple GPUs detected, enabling DataParallel...\")\n",
    "      model = DDP(model)  # Wrap the model with DDP\n",
    "    else:\n",
    "      print(\"Using single GPU...\")\n",
    "    # PEFT config\n",
    "    lora_alpha = 16\n",
    "    lora_dropout = 0.1\n",
    "    lora_r = 32  # 64\n",
    "    peft_config = LoraConfig(\n",
    "      lora_alpha=lora_alpha,\n",
    "      lora_dropout=lora_dropout,\n",
    "      r=lora_r,\n",
    "      bias=\"none\",\n",
    "      task_type=\"CAUSAL_LM\",\n",
    "      target_modules=[\"k_proj\", \"q_proj\", \"v_proj\", \"up_proj\", \"down_proj\", \"gate_proj\"],\n",
    "      modules_to_save=[\"embed_tokens\", \"input_layernorm\", \"post_attention_layernorm\", \"norm\"],\n",
    "    )\n",
    "    # Args\n",
    "    max_seq_length = 512\n",
    "    output_dir = \"./results\"\n",
    "    per_device_train_batch_size = 2  # reduced batch size to avoid OOM\n",
    "    gradient_accumulation_steps = 2\n",
    "    optim = \"adamw_torch\"\n",
    "    save_steps = 10\n",
    "    logging_steps = 1\n",
    "    learning_rate = 2e-4\n",
    "    max_grad_norm = 0.3\n",
    "    max_steps = 1  # 300 Approx the size of guanaco at bs 8, ga 2, 2 GPUs.\n",
    "    warmup_ratio = 0.1\n",
    "    lr_scheduler_type = \"cosine\"\n",
    "\n",
    "    training_arguments = TrainingArguments(\n",
    "      output_dir=output_dir,\n",
    "      per_device_train_batch_size=per_device_train_batch_size,\n",
    "      gradient_accumulation_steps=gradient_accumulation_steps,\n",
    "      optim=optim,\n",
    "      save_steps=save_steps,\n",
    "      logging_steps=logging_steps,\n",
    "      learning_rate=learning_rate,\n",
    "      fp16=True,\n",
    "      max_grad_norm=max_grad_norm,\n",
    "      max_steps=max_steps,\n",
    "      warmup_ratio=warmup_ratio,\n",
    "      group_by_length=True,\n",
    "      lr_scheduler_type=lr_scheduler_type,\n",
    "      gradient_checkpointing=True,  # gradient checkpointing\n",
    "      #report_to=\"wandb\",\n",
    "    )\n",
    "    # Trainer\n",
    "    trainer = SFTTrainer(\n",
    "      model=model,\n",
    "      train_dataset=dataset,\n",
    "      peft_config=peft_config,\n",
    "      dataset_text_field=\"context\",\n",
    "      max_seq_length=max_seq_length,\n",
    "      tokenizer=tokenizer,\n",
    "      args=training_arguments,\n",
    "    )\n",
    "    # Train :)\n",
    "    trainer.train()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Initializing distributed process group...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "d32deaece84b4e2382865810c9c3f1f4",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "6b4dd257b7a447ac98d8d3c926b41a85",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "48dea94b34a84052a77c073ede9289e4",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Loading dataset shards:   0%|          | 0/18 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
      "  warnings.warn(\n",
      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
      "`low_cpu_mem_usage` was None, now set to True since model is quantized.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "6b29e51f80e84dcd9192be7755f12d79",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
      "  warnings.warn(\n",
      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
      "`low_cpu_mem_usage` was None, now set to True since model is quantized.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "449bb46e71164a8687ecbdb75000da92",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Multiple GPUs detected, enabling DataParallel...\n",
      "Multiple GPUs detected, enabling DataParallel...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "torch.distributed process group is initialized, but parallel_mode != ParallelMode.DISTRIBUTED. In order to use Torch DDP, launch your script with `python -m torch.distributed.launch\n",
      "torch.distributed process group is initialized, but parallel_mode != ParallelMode.DISTRIBUTED. In order to use Torch DDP, launch your script with `python -m torch.distributed.launch\n",
      "Process Process-3:\n",
      "Traceback (most recent call last):\n",
      "  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/multiprocessing/process.py\", line 314, in _bootstrap\n",
      "    self.run()\n",
      "  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/multiprocessing/process.py\", line 108, in run\n",
      "    self._target(*self._args, **self._kwargs)\n",
      "  File \"/tmp/wsuser/ipykernel_4766/3134301475.py\", line 21, in main_worker\n",
      "    finetune()  # Move model to assigned GPU\n",
      "  File \"/tmp/wsuser/ipykernel_4766/2145007916.py\", line 80, in finetune\n",
      "    trainer = SFTTrainer(\n",
      "  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/trl/trainer/sft_trainer.py\", line 226, in __init__\n",
      "    model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)\n",
      "  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/nn/modules/module.py\", line 1688, in __getattr__\n",
      "    raise AttributeError(f\"'{type(self).__name__}' object has no attribute '{name}'\")\n",
      "AttributeError: 'DistributedDataParallel' object has no attribute 'get_input_embeddings'\n",
      "Process Process-4:\n",
      "Traceback (most recent call last):\n",
      "  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/multiprocessing/process.py\", line 314, in _bootstrap\n",
      "    self.run()\n",
      "  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/multiprocessing/process.py\", line 108, in run\n",
      "    self._target(*self._args, **self._kwargs)\n",
      "  File \"/tmp/wsuser/ipykernel_4766/3134301475.py\", line 21, in main_worker\n",
      "    finetune()  # Move model to assigned GPU\n",
      "  File \"/tmp/wsuser/ipykernel_4766/2145007916.py\", line 80, in finetune\n",
      "    trainer = SFTTrainer(\n",
      "  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/trl/trainer/sft_trainer.py\", line 226, in __init__\n",
      "    model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)\n",
      "  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/nn/modules/module.py\", line 1688, in __getattr__\n",
      "    raise AttributeError(f\"'{type(self).__name__}' object has no attribute '{name}'\")\n",
      "AttributeError: 'DistributedDataParallel' object has no attribute 'get_input_embeddings'\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import torch\n",
    "import torch.multiprocessing as mp\n",
    "\n",
    "def init_distributed(rank, world_size, local_rank=0):  # Add local_rank argument\n",
    "    os.environ[\"MASTER_ADDR\"] = \"localhost\"\n",
    "    os.environ[\"MASTER_PORT\"] = \"12345\"  # Adjust port if needed\n",
    "    if rank == 0:\n",
    "        print(\"Initializing distributed process group...\")\n",
    "    torch.distributed.init_process_group(backend='nccl', world_size=world_size, rank=rank)\n",
    "    torch.cuda.set_device(local_rank)  # Set unique GPU device for each process\n",
    "\n",
    "def cleanup_distributed():\n",
    "    torch.distributed.destroy_process_group()\n",
    "\n",
    "def main_worker(rank, world_size):\n",
    "    local_rank = rank % torch.cuda.device_count()  # Assign unique local rank\n",
    "    init_distributed(rank, world_size, local_rank)\n",
    "    # Your model training and fine-tuning code goes here with model on local_rank GPU\n",
    "    finetune()  # Move model to assigned GPU\n",
    "    cleanup_distributed()\n",
    "if __name__ == \"__main__\":\n",
    "    world_size = torch.cuda.device_count()\n",
    "\n",
    "    # Workaround for Jupyter Notebook and interactive environments\n",
    "    processes = []\n",
    "    for rank in range(world_size):\n",
    "        p = mp.Process(target=main_worker, args=(rank, world_size))\n",
    "        p.start()\n",
    "        processes.append(p)\n",
    "\n",
    "    for p in processes:\n",
    "        p.join()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# How to FineTune Llama 3 with Unsloth\n",
    "Hello everyone, today we are going to show how we can Fine Tune Llama 2 with a Usloth package."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 5 -  Loading packages\n",
    "Once we have installed all the packages we load the packages."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2.2.1+cu121\n",
      "12.1\n",
      "The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.\n",
      "Token is valid (permission: write).\n",
      "Your token has been saved to /home/wsuser/.cache/huggingface/token\n",
      "Login successful\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "import torch\n",
    "from datasets import load_dataset\n",
    "from huggingface_hub import notebook_login\n",
    "from transformers import TrainingArguments\n",
    "from trl import SFTTrainer\n",
    "from unsloth import FastLanguageModel\n",
    "print(torch.__version__)\n",
    "print(torch.version.cuda)\n",
    "token=\"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\"\n",
    "from huggingface_hub import login, logout\n",
    "login(token) # non-blocking login"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 6 -  Setup configuration"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "**Model Configuration**\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "model_config={ \"model_config\": {\n",
    "    \"base_model\": \"meta-llama/Meta-Llama-3-8B-Instruct\",  # The base model\n",
    "    \"finetuned_model\": \"ruslanmv/Medical-Mind-Llama-3-8b-1M\",  # The finetuned model\n",
    "    \"finetuned_name\": \"Medical-Mind-Llama-3-8b-v1M\",\n",
    "    \"max_seq_length\": 2048,  # The maximum sequence length\n",
    "    \"dtype\": None,  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+\n",
    "    \"load_in_4bit\": True,  # Load the model in 4-bit\n",
    "}}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "* `base_model`: specifies the pre-trained model to use as the base model for fine-tuning.\n",
    "* `finetuned_model`: specifies the finetuned model to use for fine-tuning.\n",
    "* `finetuned_name`: specifies the name of the finetuned model.\n",
    "* `max_seq_length`: specifies the maximum sequence length that the model can process.\n",
    "* `dtype`: specifies the data type to use for the model's weights and activations. `None` means auto-detection, which will choose the most suitable data type based on the hardware.\n",
    "* `load_in_4bit`: specifies whether to load the model i 4-bit precision, which can reduce memory usage and improve performance.\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**LoRA Configuration**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "lora_config={\"lora_config\": {\n",
    "    \"r\": 16,  # The number of LoRA layers 8, 16, 32, 64\n",
    "    \"target_modules\": [\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\",\n",
    "                      \"gate_proj\", \"up_proj\", \"down_proj\"],  # The target modules\n",
    "    \"lora_alpha\": 16,  # The alpha value for LoRA\n",
    "    \"lora_dropout\": 0,  # The dropout value for LoRA\n",
    "    \"bias\": \"none\",  # The bias for LoRA\n",
    "    \"use_gradient_checkpointing\": True,  # Use gradient checkpointing\n",
    "    \"use_rslora\": False,  # Use RSLora\n",
    "    \"use_dora\": False,  # Use DoRa\n",
    "    \"loftq_config\": None  # The LoFTQ configuration\n",
    "}\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "* `r`: specifies the number of LoRA layers to use.\n",
    "* `target_modules`: specifies the modules to which LoRA should be applied.\n",
    "* `lora_alpha`: specifies the alpha value for LoRA, which controls the strength of the LoRA layers.\n",
    "* `lora_dropout`: specifies the dropout value for LoRA, which controls the random dropping of neurons during training.\n",
    "* `bias`: specifies the bias for LoRA, which can be set to \"none\" or a specific value.\n",
    "* `use_gradient_checkpointing`: specifies whether to use gradient checkpointing, which can reduce memory usage during training.\n",
    "* `use_rslora` and `use_dora`: specify whether to use RSLora and DoRa, respectively, which are variants of LoRA.\n",
    "* `loftq_config`: specifies the LoFTQ configuration, which is not used in this example.\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Training Configuration**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "training_config={\"training_config\": {\n",
    "    \"per_device_train_batch_size\": 2,  # The batch size\n",
    "    \"gradient_accumulation_steps\": 4,  # The gradient accumulation steps\n",
    "    \"warmup_steps\": 5,  # The warmup steps\n",
    "    \"max_steps\": 0,  # The maximum steps (0 if the epochs are defined)\n",
    "    \"num_train_epochs\": 1,  # The number of training epochs\n",
    "    \"learning_rate\": 2e-4,  # The learning rate\n",
    "    \"fp16\": not torch.cuda.is_bf16_supported(),  # The fp16\n",
    "    \"bf16\": torch.cuda.is_bf16_supported(),  # The bf16\n",
    "    \"logging_steps\":  1,  # The logging steps\n",
    "    \"optim\": \"adamw_8bit\",  # The optimizer\n",
    "    \"weight_decay\": 0.0,  # The weight decay\n",
    "    \"lr_scheduler_type\": \"linear\",  # The learning rate scheduler\n",
    "    \"seed\": 42,  # The seed\n",
    "    \"output_dir\": \"outputs\",  # The output directory\n",
    "}\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "* `per_device_train_batch_size`: specifies the batch size to use for training.\n",
    "* `gradient_accumulation_steps`: specifies the number of steps to accumulate gradients before updating the model.\n",
    "* `warmup_steps`: specifies the number of warmup steps to perform before starting training.\n",
    "* `max_steps`: specifies the maximum number of steps to train for. If set to 0, the model will train for the specified number of epochs.\n",
    "* `num_train_epochs`: specifies the number of epochs to train for.\n",
    "* `learning_rate`: specifies the initial learning rate to use for training.\n",
    "* `fp16` and `bf16`: specify whether to use 16-bit floating-point precision (fp16) or 16-bit bfloat precision (bf16) for training.\n",
    "* `logging_steps`: specifies the number of steps to log training metrics.\n",
    "* `optim`: specifies the optimizer to use for training.\n",
    "* `weight_decay`: specifies the weight decay rate to use for regularization.\n",
    "* `lr_scheduler_type`: specifies the learning rate scheduler to use.\n",
    "* `seed`: specifies the random seed to use for training.\n",
    "* `output_dir`: specifies the output directory to save training results."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Hugging Face Username**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "hugging_face_username={\"hugging_face_username\": \"ruslanmv\"}\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Training Dataset**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "training_dataset={\"training_dataset\": {\n",
    "        \"name\": \"ruslanmv/ai-medical-dataset\", # The dataset name(huggingface/datasets)\n",
    "        \"split\": \"train\",  # The dataset split\n",
    "        \"input_fields\": [\"question\", \"context\"] ,# The input fields\n",
    "        \"input_field\": \"text\",# The input field\n",
    "    },\n",
    "                }"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**`training_dataset`**: This is the top-level key for the dataset configuration.\n",
    "\n",
    "**`name`**: This specifies the name of the dataset. In this case, it's `ruslanmv/ai-medical-dataset`, which is a dataset hosted on the Hugging Face Hub. The format is `username/dataset_name`.\n",
    "\n",
    "**`split`**: This specifies the split of the dataset to use for training. In this case, it's set to `\"train\"`, which means the model will be trained on the training split of the dataset.\n",
    "\n",
    "**`input_fields`**: This specifies the input fields of the dataset that will be used for trainine, it's a list containing two fields: `\"question\"` and `\"context\"`. These fields are likely to be the input features of the dataset.\n",
    "\n",
    "**`input_field`**: This specifies the primary input field of the dataset. In this case, it's set to `\"text\"`. This field is likely to be the text input that the model will process.\n",
    "\n",
    "Here's an example of what this dataset might look like:\n",
    "\n",
    "| question | context | text |\n",
    "| --- | --- | --- |\n",
    "| How does COVID-19 spread? | COVID-19 is a respiratory disease... | The COVID-19 is.. |\n",
    "| ... | ... | ... |"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "config = {}\n",
    "config.update(hugging_face_username)\n",
    "config.update(model_config)\n",
    "config.update(lora_config)\n",
    "config.update(training_config)\n",
    "config.update(training_dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "def save_config_to_json(config, file_path):\n",
    "    with open(file_path, 'w') as f:\n",
    "        json.dump(config, f, indent=4)\n",
    "file_path = \"original_config.json\"\n",
    "save_config_to_json(config, file_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "==((====))==  Unsloth: Fast Llama patching release 2024.4\n",
      "   \\\\   /|    GPU: Tesla V100-PCIE-16GB. Max memory: 15.773 GB. Platform = Linux.\n",
      "O^O/ \\_/ \\    Pytorch: 2.2.1+cu121. CUDA = 7.0. CUDA Toolkit = 12.1.\n",
      "\\        /    Bfloat16 = FALSE. Xformers = 0.0.25. FA = False.\n",
      " \"-____-\"     Free Apache license: http://github.com/unslothai/unsloth\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.\n",
      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
     ]
    }
   ],
   "source": [
    "# Loading the model and the tokenizer for the model\n",
    "model, tokenizer = FastLanguageModel.from_pretrained(\n",
    "    model_name=config[\"model_config\"].get(\"base_model\"),\n",
    "    max_seq_length=config[\"model_config\"].get(\"max_seq_length\"),\n",
    "    dtype=config[\"model_config\"].get(\"dtype\"),\n",
    "    load_in_4bit=config[\"model_config\"].get(\"load_in_4bit\"),\n",
    "\n",
    "\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Setup for QLoRA/LoRA peft of the base model\n",
    "model = FastLanguageModel.get_peft_model(\n",
    "    model,\n",
    "    r = config.get(\"lora_config\").get(\"r\"),\n",
    "    target_modules = config.get(\"lora_config\").get(\"target_modules\"),\n",
    "    lora_alpha = config.get(\"lora_config\").get(\"lora_alpha\"),\n",
    "    lora_dropout = config.get(\"lora_config\").get(\"lora_dropout\"),\n",
    "    bias = config.get(\"lora_config\").get(\"bias\"),\n",
    "    use_gradient_checkpointing = config.get(\"lora_config\").get(\"use_gradient_checkpointing\"),\n",
    "    random_state = 42,\n",
    "    use_rslora = config.get(\"lora_config\").get(\"use_rslora\"),\n",
    "    use_dora = config.get(\"lora_config\").get(\"use_dora\"),\n",
    "    loftq_config = config.get(\"lora_config\").get(\"loftq_config\"),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "#from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer\n",
    "#tokenizer = AutoTokenizer.from_pretrained(config.get(\"model_config\").get(\"base_model\"))\n",
    "tokenizer.add_eos_token = True\n",
    "tokenizer.pad_token_id = 0\n",
    "tokenizer.padding_side = \"left\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset does not exist.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "2f16e19d432e4cfb8518dc80b1f54552",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading readme:   0%|          | 0.00/2.97k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "05063beb14274cbbaf203a403477efde",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "2833c846be424c57a2240601530d430d",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f45497aa51104f80b0501fd0c8a2efe4",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading data:   0%|          | 0/18 [00:00<?, ?files/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "2a9cd3f9409b4055a2e8f25249c6be07",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating train split:   0%|          | 0/21210000 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "d2deb39e743440abb7d7fbca11e7e864",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Loading dataset shards:   0%|          | 0/18 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "c46c30d45dbe4234a6390594356ed6cb",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/100 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0dfd1af07b0849a1ba9476cea77f6c5b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "is_test=True\n",
    "import datasets\n",
    "import os\n",
    "dataset_path = \"train_dataset\"\n",
    "if os.path.exists(dataset_path):\n",
    "    print(\"Dataset exists!\")\n",
    "    train_dataset = datasets.load_from_disk(\"train_dataset\")\n",
    "else:\n",
    "    print(\"Dataset does not exist.\")\n",
    "    # Loading the training dataset\n",
    "    train_dataset = load_dataset(config.get(\"training_dataset\").get(\"name\"), split = config.get(\"training_dataset\").get(\"split\"))    \n",
    "    \n",
    "    if is_test:\n",
    "        # Select the first 1M rows of the dataset\n",
    "        train_dataset = train_dataset.select(range(100))\n",
    "        \n",
    "    medical_prompt = \"\"\"You are an AI Medical Assistant Chatbot, trained to answer medical questions. Below is an instruction that describes a task, paired with an response context. Write a response that appropriately completes the request.\n",
    "    ### Instruction:\n",
    "    {}\n",
    "\n",
    "    ### Response:\n",
    "    {}\"\"\"\n",
    "    EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN\n",
    "    def formatting_prompts_func(examples):\n",
    "        instructions = examples[\"question\"]\n",
    "        outputs      = examples[\"context\"]\n",
    "        texts = []\n",
    "        for instruction, output in zip(instructions,  outputs):\n",
    "            # Must add EOS_TOKEN, otherwise your generation will go on forever!\n",
    "            text = medical_prompt.format(instruction,  output) + EOS_TOKEN\n",
    "            texts.append(text)\n",
    "        return { \"text\" : texts, }\n",
    "    pass\n",
    "    train_dataset= train_dataset.map(formatting_prompts_func, batched = True,)\n",
    "    train_dataset['text'][1]    \n",
    "    import datasets\n",
    "    # assume 'test_dataset' is a Dataset object\n",
    "    train_dataset.save_to_disk(\"train_dataset\")    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['question', 'context', 'text'],\n",
       "    num_rows: 100\n",
       "})"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "is_multiple=True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Multiple GPUs enabled\n"
     ]
    }
   ],
   "source": [
    "if is_multiple:\n",
    "    # Set up GPU acceleration\n",
    "    if torch.cuda.device_count() > 1:\n",
    "        print(\"Multiple GPUs enabled\")\n",
    "        devices = [f\"cuda:{i}\" for i in range(torch.cuda.device_count())]\n",
    "        model_parallel = torch.nn.DataParallel(model, device_ids= devices ) #[0, 1]\n",
    "        # Access the original model from the DataParallel object\n",
    "        model = model_parallel.module\n",
    "    else:\n",
    "        print(\"No DataParallel \")\n",
    "        device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
    "        model.to(device)        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['cuda:0', 'cuda:1']"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "devices"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "PeftModelForCausalLM(\n",
       "  (base_model): LoraModel(\n",
       "    (model): LlamaForCausalLM(\n",
       "      (model): LlamaModel(\n",
       "        (embed_tokens): Embedding(128256, 4096)\n",
       "        (layers): ModuleList(\n",
       "          (0-31): 32 x LlamaDecoderLayer(\n",
       "            (self_attn): LlamaSdpaAttention(\n",
       "              (q_proj): lora.Linear4bit(\n",
       "                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n",
       "                (lora_dropout): ModuleDict(\n",
       "                  (default): Identity()\n",
       "                )\n",
       "                (lora_A): ModuleDict(\n",
       "                  (default): Linear(in_features=4096, out_features=16, bias=False)\n",
       "                )\n",
       "                (lora_B): ModuleDict(\n",
       "                  (default): Linear(in_features=16, out_features=4096, bias=False)\n",
       "                )\n",
       "                (lora_embedding_A): ParameterDict()\n",
       "                (lora_embedding_B): ParameterDict()\n",
       "              )\n",
       "              (k_proj): lora.Linear4bit(\n",
       "                (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n",
       "                (lora_dropout): ModuleDict(\n",
       "                  (default): Identity()\n",
       "                )\n",
       "                (lora_A): ModuleDict(\n",
       "                  (default): Linear(in_features=4096, out_features=16, bias=False)\n",
       "                )\n",
       "                (lora_B): ModuleDict(\n",
       "                  (default): Linear(in_features=16, out_features=1024, bias=False)\n",
       "                )\n",
       "                (lora_embedding_A): ParameterDict()\n",
       "                (lora_embedding_B): ParameterDict()\n",
       "              )\n",
       "              (v_proj): lora.Linear4bit(\n",
       "                (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n",
       "                (lora_dropout): ModuleDict(\n",
       "                  (default): Identity()\n",
       "                )\n",
       "                (lora_A): ModuleDict(\n",
       "                  (default): Linear(in_features=4096, out_features=16, bias=False)\n",
       "                )\n",
       "                (lora_B): ModuleDict(\n",
       "                  (default): Linear(in_features=16, out_features=1024, bias=False)\n",
       "                )\n",
       "                (lora_embedding_A): ParameterDict()\n",
       "                (lora_embedding_B): ParameterDict()\n",
       "              )\n",
       "              (o_proj): lora.Linear4bit(\n",
       "                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n",
       "                (lora_dropout): ModuleDict(\n",
       "                  (default): Identity()\n",
       "                )\n",
       "                (lora_A): ModuleDict(\n",
       "                  (default): Linear(in_features=4096, out_features=16, bias=False)\n",
       "                )\n",
       "                (lora_B): ModuleDict(\n",
       "                  (default): Linear(in_features=16, out_features=4096, bias=False)\n",
       "                )\n",
       "                (lora_embedding_A): ParameterDict()\n",
       "                (lora_embedding_B): ParameterDict()\n",
       "              )\n",
       "              (rotary_emb): LlamaRotaryEmbedding()\n",
       "            )\n",
       "            (mlp): LlamaMLP(\n",
       "              (gate_proj): lora.Linear4bit(\n",
       "                (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n",
       "                (lora_dropout): ModuleDict(\n",
       "                  (default): Identity()\n",
       "                )\n",
       "                (lora_A): ModuleDict(\n",
       "                  (default): Linear(in_features=4096, out_features=16, bias=False)\n",
       "                )\n",
       "                (lora_B): ModuleDict(\n",
       "                  (default): Linear(in_features=16, out_features=14336, bias=False)\n",
       "                )\n",
       "                (lora_embedding_A): ParameterDict()\n",
       "                (lora_embedding_B): ParameterDict()\n",
       "              )\n",
       "              (up_proj): lora.Linear4bit(\n",
       "                (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n",
       "                (lora_dropout): ModuleDict(\n",
       "                  (default): Identity()\n",
       "                )\n",
       "                (lora_A): ModuleDict(\n",
       "                  (default): Linear(in_features=4096, out_features=16, bias=False)\n",
       "                )\n",
       "                (lora_B): ModuleDict(\n",
       "                  (default): Linear(in_features=16, out_features=14336, bias=False)\n",
       "                )\n",
       "                (lora_embedding_A): ParameterDict()\n",
       "                (lora_embedding_B): ParameterDict()\n",
       "              )\n",
       "              (down_proj): lora.Linear4bit(\n",
       "                (base_layer): Linear4bit(in_features=14336, out_features=4096, bias=False)\n",
       "                (lora_dropout): ModuleDict(\n",
       "                  (default): Identity()\n",
       "                )\n",
       "                (lora_A): ModuleDict(\n",
       "                  (default): Linear(in_features=14336, out_features=16, bias=False)\n",
       "                )\n",
       "                (lora_B): ModuleDict(\n",
       "                  (default): Linear(in_features=16, out_features=4096, bias=False)\n",
       "                )\n",
       "                (lora_embedding_A): ParameterDict()\n",
       "                (lora_embedding_B): ParameterDict()\n",
       "              )\n",
       "              (act_fn): SiLU()\n",
       "            )\n",
       "            (input_layernorm): LlamaRMSNorm()\n",
       "            (post_attention_layernorm): LlamaRMSNorm()\n",
       "          )\n",
       "        )\n",
       "        (norm): LlamaRMSNorm()\n",
       "      )\n",
       "      (lm_head): Linear(in_features=4096, out_features=128256, bias=False)\n",
       "    )\n",
       "  )\n",
       ")"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "abb0b4aadff048d9b612a7a8e6ebd4a9",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:318: UserWarning: You passed a tokenizer with `padding_side` not equal to `right` to the SFTTrainer. This might lead to some unexpected behaviour due to overflow issues when training a model in half-precision. You might consider adding `tokenizer.padding_side = 'right'` to your code.\n",
      "  warnings.warn(\n",
      "Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.\n"
     ]
    }
   ],
   "source": [
    "# Setting up the trainer for the model\n",
    "trainer = SFTTrainer(\n",
    "    model = model,\n",
    "    tokenizer = tokenizer,\n",
    "    train_dataset = train_dataset,\n",
    "    dataset_text_field = config.get(\"training_dataset\").get(\"input_field\"),\n",
    "    max_seq_length = config.get(\"model_config\").get(\"max_seq_length\"),\n",
    "    dataset_num_proc = 2,\n",
    "    packing = False,\n",
    "    args = TrainingArguments(\n",
    "        per_device_train_batch_size = config.get(\"training_config\").get(\"per_device_train_batch_size\"),\n",
    "        gradient_accumulation_steps = config.get(\"training_config\").get(\"gradient_accumulation_steps\"),\n",
    "        warmup_steps = config.get(\"training_config\").get(\"warmup_steps\"),\n",
    "        max_steps = config.get(\"training_config\").get(\"max_steps\"),\n",
    "        num_train_epochs= config.get(\"training_config\").get(\"num_train_epochs\"),\n",
    "        learning_rate = config.get(\"training_config\").get(\"learning_rate\"),\n",
    "        fp16 = config.get(\"training_config\").get(\"fp16\"),\n",
    "        bf16 = config.get(\"training_config\").get(\"bf16\"),\n",
    "        logging_steps = config.get(\"training_config\").get(\"logging_steps\"),\n",
    "        optim = config.get(\"training_config\").get(\"optim\"),\n",
    "        weight_decay = config.get(\"training_config\").get(\"weight_decay\"),\n",
    "        lr_scheduler_type = config.get(\"training_config\").get(\"lr_scheduler_type\"),\n",
    "        seed = 42,\n",
    "        output_dir = config.get(\"training_config\").get(\"output_dir\"),\n",
    "        \n",
    "    ),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reserved Memory: 11.19GB\n",
      "Max Memory: 15.77GB\n"
     ]
    }
   ],
   "source": [
    "# Memory statistics before training\n",
    "gpu_statistics = torch.cuda.get_device_properties(0)\n",
    "reserved_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 2)\n",
    "max_memory = round(gpu_statistics.total_memory / 1024**3, 2)\n",
    "print(f\"Reserved Memory: {reserved_memory}GB\")\n",
    "print(f\"Max Memory: {max_memory}GB\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "id": "Q-g-4RvNXkyD"
   },
   "outputs": [],
   "source": [
    "##  [ 1038/2651250 53:49 < 2295:10:28, 0.32 it/s, Epoch 0.00/1] old"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 948
    },
    "id": "yI9mEQ7ZOUx2",
    "outputId": "6466d591-76f8-45e2-e665-39ad9bf8ae7f",
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1\n",
      "   \\\\   /|    Num examples = 100 | Num Epochs = 1\n",
      "O^O/ \\_/ \\    Batch size per device = 4 | Gradient Accumulation steps = 4\n",
      "\\        /    Total batch size = 16 | Total steps = 6\n",
      " \"-____-\"     Number of trainable parameters = 41,943,040\n"
     ]
    },
    {
     "ename": "RuntimeError",
     "evalue": "Caught RuntimeError in replica 1 on device 1.\nOriginal Traceback (most recent call last):\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py\", line 83, in _worker\n    output = module(*input, **kwargs)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/nn/modules/module.py\", line 1511, in _wrapped_call_impl\n    return self._call_impl(*args, **kwargs)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/nn/modules/module.py\", line 1520, in _call_impl\n    return forward_call(*args, **kwargs)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/unsloth/models/llama.py\", line 882, in PeftModelForCausalLM_fast_forward\n    return self.base_model(\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/nn/modules/module.py\", line 1511, in _wrapped_call_impl\n    return self._call_impl(*args, **kwargs)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/nn/modules/module.py\", line 1520, in _call_impl\n    return forward_call(*args, **kwargs)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/peft/tuners/tuners_utils.py\", line 161, in forward\n    return self.model.forward(*args, **kwargs)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/accelerate/hooks.py\", line 166, in new_forward\n    output = module._old_forward(*args, **kwargs)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/unsloth/models/llama.py\", line 813, in _CausalLM_fast_forward\n    outputs = self.model(\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/nn/modules/module.py\", line 1511, in _wrapped_call_impl\n    return self._call_impl(*args, **kwargs)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/nn/modules/module.py\", line 1520, in _call_impl\n    return forward_call(*args, **kwargs)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/accelerate/hooks.py\", line 166, in new_forward\n    output = module._old_forward(*args, **kwargs)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/unsloth/models/llama.py\", line 668, in LlamaModel_fast_forward\n    layer_outputs = torch.utils.checkpoint.checkpoint(\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/_compile.py\", line 24, in inner\n    return torch._dynamo.disable(fn, recursive)(*args, **kwargs)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py\", line 489, in _fn\n    return fn(*args, **kwargs)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/_dynamo/external_utils.py\", line 17, in inner\n    return fn(*args, **kwargs)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/utils/checkpoint.py\", line 482, in checkpoint\n    return CheckpointFunction.apply(function, preserve, *args)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/autograd/function.py\", line 553, in apply\n    return super().apply(*args, **kwargs)  # type: ignore[misc]\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/utils/checkpoint.py\", line 261, in forward\n    outputs = run_function(*args)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/unsloth/models/llama.py\", line 664, in custom_forward\n    return module(*inputs, past_key_value, output_attentions, padding_mask = padding_mask)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/nn/modules/module.py\", line 1511, in _wrapped_call_impl\n    return self._call_impl(*args, **kwargs)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/nn/modules/module.py\", line 1520, in _call_impl\n    return forward_call(*args, **kwargs)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/accelerate/hooks.py\", line 166, in new_forward\n    output = module._old_forward(*args, **kwargs)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/unsloth/models/llama.py\", line 433, in LlamaDecoderLayer_fast_forward\n    hidden_states, self_attn_weights, present_key_value = self.self_attn(\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/nn/modules/module.py\", line 1511, in _wrapped_call_impl\n    return self._call_impl(*args, **kwargs)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/nn/modules/module.py\", line 1520, in _call_impl\n    return forward_call(*args, **kwargs)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/accelerate/hooks.py\", line 166, in new_forward\n    output = module._old_forward(*args, **kwargs)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/unsloth/models/llama.py\", line 308, in LlamaAttention_fast_forward\n    Q, K, V = self.apply_qkv(self, hidden_states)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/unsloth/kernels/fast_lora.py\", line 312, in apply_lora_qkv\n    Q, K, V = LoRA_QKV.apply(X,\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/autograd/function.py\", line 553, in apply\n    return super().apply(*args, **kwargs)  # type: ignore[misc]\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/cuda/amp/autocast_mode.py\", line 115, in decorate_fwd\n    return fwd(*args, **kwargs)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/unsloth/kernels/fast_lora.py\", line 227, in forward\n    Q = matmul_lora(X, QW, QW_quant, QA, QB, QS)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/unsloth/kernels/utils.py\", line 225, in matmul_lora\n    W = fast_dequantize(W.t(), W_quant)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/unsloth/kernels/utils.py\", line 104, in fast_dequantize\n    out_absmax += offset\nRuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:0!\n",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mRuntimeError\u001b[0m                              Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[28], line 2\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;66;03m# Training the model\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m trainer_stats \u001b[38;5;241m=\u001b[39m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:361\u001b[0m, in \u001b[0;36mSFTTrainer.train\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m    358\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mneftune_noise_alpha \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_trainer_supports_neftune:\n\u001b[1;32m    359\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_trl_activate_neftune(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel)\n\u001b[0;32m--> 361\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    363\u001b[0m \u001b[38;5;66;03m# After training we make sure to retrieve back the original forward pass method\u001b[39;00m\n\u001b[1;32m    364\u001b[0m \u001b[38;5;66;03m# for the embedding layer by removing the forward post hook.\u001b[39;00m\n\u001b[1;32m    365\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mneftune_noise_alpha \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_trainer_supports_neftune:\n",
      "File \u001b[0;32m/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/transformers/trainer.py:1859\u001b[0m, in \u001b[0;36mTrainer.train\u001b[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[0m\n\u001b[1;32m   1857\u001b[0m         hf_hub_utils\u001b[38;5;241m.\u001b[39menable_progress_bars()\n\u001b[1;32m   1858\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1859\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43minner_training_loop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   1860\u001b[0m \u001b[43m        \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1861\u001b[0m \u001b[43m        \u001b[49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1862\u001b[0m \u001b[43m        \u001b[49m\u001b[43mtrial\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrial\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1863\u001b[0m \u001b[43m        \u001b[49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1864\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m<string>:361\u001b[0m, in \u001b[0;36m_fast_inner_training_loop\u001b[0;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[0m\n",
      "File \u001b[0;32m/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/transformers/trainer.py:3138\u001b[0m, in \u001b[0;36mTrainer.training_step\u001b[0;34m(self, model, inputs)\u001b[0m\n\u001b[1;32m   3135\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m loss_mb\u001b[38;5;241m.\u001b[39mreduce_mean()\u001b[38;5;241m.\u001b[39mdetach()\u001b[38;5;241m.\u001b[39mto(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mdevice)\n\u001b[1;32m   3137\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcompute_loss_context_manager():\n\u001b[0;32m-> 3138\u001b[0m     loss \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompute_loss\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   3140\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mn_gpu \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m   3141\u001b[0m     loss \u001b[38;5;241m=\u001b[39m loss\u001b[38;5;241m.\u001b[39mmean()  \u001b[38;5;66;03m# mean() to average on multi-gpu parallel training\u001b[39;00m\n",
      "File \u001b[0;32m/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/transformers/trainer.py:3161\u001b[0m, in \u001b[0;36mTrainer.compute_loss\u001b[0;34m(self, model, inputs, return_outputs)\u001b[0m\n\u001b[1;32m   3159\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m   3160\u001b[0m     labels \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 3161\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   3162\u001b[0m \u001b[38;5;66;03m# Save past state if it exists\u001b[39;00m\n\u001b[1;32m   3163\u001b[0m \u001b[38;5;66;03m# TODO: this needs to be fixed and made cleaner later.\u001b[39;00m\n\u001b[1;32m   3164\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mpast_index \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m:\n",
      "File \u001b[0;32m/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/nn/modules/module.py:1511\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1509\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)  \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m   1510\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1511\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/nn/modules/module.py:1520\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1515\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m   1516\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m   1517\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m   1518\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m   1519\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1520\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1522\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m   1523\u001b[0m     result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
      "File \u001b[0;32m/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py:185\u001b[0m, in \u001b[0;36mDataParallel.forward\u001b[0;34m(self, *inputs, **kwargs)\u001b[0m\n\u001b[1;32m    183\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodule(\u001b[38;5;241m*\u001b[39minputs[\u001b[38;5;241m0\u001b[39m], \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mmodule_kwargs[\u001b[38;5;241m0\u001b[39m])\n\u001b[1;32m    184\u001b[0m replicas \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreplicate(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodule, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdevice_ids[:\u001b[38;5;28mlen\u001b[39m(inputs)])\n\u001b[0;32m--> 185\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mparallel_apply\u001b[49m\u001b[43m(\u001b[49m\u001b[43mreplicas\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodule_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    186\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgather(outputs, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput_device)\n",
      "File \u001b[0;32m/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py:200\u001b[0m, in \u001b[0;36mDataParallel.parallel_apply\u001b[0;34m(self, replicas, inputs, kwargs)\u001b[0m\n\u001b[1;32m    199\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mparallel_apply\u001b[39m(\u001b[38;5;28mself\u001b[39m, replicas: Sequence[T], inputs: Sequence[Any], kwargs: Any) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m List[Any]:\n\u001b[0;32m--> 200\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mparallel_apply\u001b[49m\u001b[43m(\u001b[49m\u001b[43mreplicas\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdevice_ids\u001b[49m\u001b[43m[\u001b[49m\u001b[43m:\u001b[49m\u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mreplicas\u001b[49m\u001b[43m)\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py:108\u001b[0m, in \u001b[0;36mparallel_apply\u001b[0;34m(modules, inputs, kwargs_tup, devices)\u001b[0m\n\u001b[1;32m    106\u001b[0m     output \u001b[38;5;241m=\u001b[39m results[i]\n\u001b[1;32m    107\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(output, ExceptionWrapper):\n\u001b[0;32m--> 108\u001b[0m         \u001b[43moutput\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mreraise\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    109\u001b[0m     outputs\u001b[38;5;241m.\u001b[39mappend(output)\n\u001b[1;32m    110\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m outputs\n",
      "File \u001b[0;32m/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/_utils.py:722\u001b[0m, in \u001b[0;36mExceptionWrapper.reraise\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    718\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m    719\u001b[0m     \u001b[38;5;66;03m# If the exception takes multiple arguments, don't try to\u001b[39;00m\n\u001b[1;32m    720\u001b[0m     \u001b[38;5;66;03m# instantiate since we don't know how to\u001b[39;00m\n\u001b[1;32m    721\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(msg) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m--> 722\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m exception\n",
      "\u001b[0;31mRuntimeError\u001b[0m: Caught RuntimeError in replica 1 on device 1.\nOriginal Traceback (most recent call last):\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py\", line 83, in _worker\n    output = module(*input, **kwargs)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/nn/modules/module.py\", line 1511, in _wrapped_call_impl\n    return self._call_impl(*args, **kwargs)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/nn/modules/module.py\", line 1520, in _call_impl\n    return forward_call(*args, **kwargs)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/unsloth/models/llama.py\", line 882, in PeftModelForCausalLM_fast_forward\n    return self.base_model(\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/nn/modules/module.py\", line 1511, in _wrapped_call_impl\n    return self._call_impl(*args, **kwargs)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/nn/modules/module.py\", line 1520, in _call_impl\n    return forward_call(*args, **kwargs)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/peft/tuners/tuners_utils.py\", line 161, in forward\n    return self.model.forward(*args, **kwargs)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/accelerate/hooks.py\", line 166, in new_forward\n    output = module._old_forward(*args, **kwargs)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/unsloth/models/llama.py\", line 813, in _CausalLM_fast_forward\n    outputs = self.model(\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/nn/modules/module.py\", line 1511, in _wrapped_call_impl\n    return self._call_impl(*args, **kwargs)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/nn/modules/module.py\", line 1520, in _call_impl\n    return forward_call(*args, **kwargs)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/accelerate/hooks.py\", line 166, in new_forward\n    output = module._old_forward(*args, **kwargs)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/unsloth/models/llama.py\", line 668, in LlamaModel_fast_forward\n    layer_outputs = torch.utils.checkpoint.checkpoint(\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/_compile.py\", line 24, in inner\n    return torch._dynamo.disable(fn, recursive)(*args, **kwargs)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py\", line 489, in _fn\n    return fn(*args, **kwargs)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/_dynamo/external_utils.py\", line 17, in inner\n    return fn(*args, **kwargs)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/utils/checkpoint.py\", line 482, in checkpoint\n    return CheckpointFunction.apply(function, preserve, *args)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/autograd/function.py\", line 553, in apply\n    return super().apply(*args, **kwargs)  # type: ignore[misc]\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/utils/checkpoint.py\", line 261, in forward\n    outputs = run_function(*args)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/unsloth/models/llama.py\", line 664, in custom_forward\n    return module(*inputs, past_key_value, output_attentions, padding_mask = padding_mask)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/nn/modules/module.py\", line 1511, in _wrapped_call_impl\n    return self._call_impl(*args, **kwargs)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/nn/modules/module.py\", line 1520, in _call_impl\n    return forward_call(*args, **kwargs)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/accelerate/hooks.py\", line 166, in new_forward\n    output = module._old_forward(*args, **kwargs)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/unsloth/models/llama.py\", line 433, in LlamaDecoderLayer_fast_forward\n    hidden_states, self_attn_weights, present_key_value = self.self_attn(\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/nn/modules/module.py\", line 1511, in _wrapped_call_impl\n    return self._call_impl(*args, **kwargs)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/nn/modules/module.py\", line 1520, in _call_impl\n    return forward_call(*args, **kwargs)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/accelerate/hooks.py\", line 166, in new_forward\n    output = module._old_forward(*args, **kwargs)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/unsloth/models/llama.py\", line 308, in LlamaAttention_fast_forward\n    Q, K, V = self.apply_qkv(self, hidden_states)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/unsloth/kernels/fast_lora.py\", line 312, in apply_lora_qkv\n    Q, K, V = LoRA_QKV.apply(X,\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/autograd/function.py\", line 553, in apply\n    return super().apply(*args, **kwargs)  # type: ignore[misc]\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/torch/cuda/amp/autocast_mode.py\", line 115, in decorate_fwd\n    return fwd(*args, **kwargs)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/unsloth/kernels/fast_lora.py\", line 227, in forward\n    Q = matmul_lora(X, QW, QW_quant, QA, QB, QS)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/unsloth/kernels/utils.py\", line 225, in matmul_lora\n    W = fast_dequantize(W.t(), W_quant)\n  File \"/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/unsloth/kernels/utils.py\", line 104, in fast_dequantize\n    out_absmax += offset\nRuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:0!\n"
     ]
    }
   ],
   "source": [
    "# Training the model\n",
    "trainer_stats = trainer.train()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "YQFEr64koXp1",
    "outputId": "2e1b3775-1f5d-4b8e-a0ef-32266cb7fa2a"
   },
   "outputs": [],
   "source": [
    "# Memory statistics after training\n",
    "used_memory = round(torch.cuda.max_memory_allocated() / 1024**3, 2)\n",
    "used_memory_lora = round(used_memory - reserved_memory, 2)\n",
    "used_memory_persentage = round((used_memory / max_memory) * 100, 2)\n",
    "used_memory_lora_persentage = round((used_memory_lora / max_memory) * 100, 2)\n",
    "print(f\"Used Memory: {used_memory}GB ({used_memory_persentage}%)\")\n",
    "print(f\"Used Memory for training(fine-tuning) LoRA: {used_memory_lora}GB ({used_memory_lora_persentage}%)\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "new_model=config.get(\"model_config\").get(\"finetuned_model\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "new_model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "1YJB4bZyoXp1"
   },
   "outputs": [],
   "source": [
    "# Saving the trainer stats\n",
    "with open(\"trainer_stats.json\", \"w\") as f:\n",
    "    json.dump(trainer_stats, f, indent=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Save and push the adapter to HF\n",
    "import os\n",
    "current_directory = os.getcwd()\n",
    "# New model name\n",
    "new_model = config.get(\"model_config\").get(\"finetuned_name\") #\"Medical-Mind-Llama-3-8b\"\n",
    "# Save the fine-tuned model\n",
    "save_path = os.path.join(current_directory, \"models\", new_model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#os.makedirs(save_path, exist_ok=True)  # Create directory if it doesn't exist\n",
    "#trainer.model.save_pretrained(save_path)\n",
    "tokenizer.save_pretrained(save_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "help(model.save_pretrained_merged)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To save the final model as LoRA adapters, either use Huggingface's push_to_hub for an online save or save_pretrained for a local save.\n",
    "\n",
    "[NOTE] This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the model to the created directory\n",
    "# `lora`: Save LoRA adapters with no merging. Useful for HF inference.\n",
    "#model.save_pretrained(save_path)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Saving the model using merged_16bit(float16), \n",
    "#`16bit`: Merge LoRA into float16 weights. Useful for GGUF / llama.cpp.\n",
    "#model.save_pretrained_merged(save_path, tokenizer, save_method = \"merged_16bit\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# `4bit`: Merge LoRA into int4 weights. Useful for DPO / HF inference.\n",
    "model.save_pretrained_merged(save_path, tokenizer, save_method = \"merged_4bit_forced\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "save_path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get the list of files in the directory\n",
    "files_in_model_dir = os.listdir(save_path)\n",
    "# Print the list of files\n",
    "print(\"Files in the directory:\")\n",
    "for file in files_in_model_dir:\n",
    "    print(file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from huggingface_hub import HfApi\n",
    "def upload_folder(folder_path, repository_name, path_in_repo):\n",
    "    api = HfApi()\n",
    "    \n",
    "    # Check if the repository exists, if not, create it\n",
    "    repo_exists = api.repo_exists(repository_name)\n",
    "    if not repo_exists:\n",
    "        api.create_repo(repository_name)\n",
    "        print(f\"Repository '{repository_name}' created on Hugging Face Hub.\")\n",
    "\n",
    "    for root, dirs, files in os.walk(folder_path):\n",
    "        for file in files:\n",
    "            file_path = os.path.join(root, file)\n",
    "            relative_path = os.path.relpath(file_path, folder_path)\n",
    "            repo_path = os.path.join(path_in_repo, relative_path)\n",
    "            api.upload_file(path_or_fileobj=file_path, repo_id=repository_name, path_in_repo=repo_path)\n",
    "            print(f\"{repo_path} uploaded to {repository_name}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define the repository name and path in the repository\n",
    "repository_name = \"ruslanmv/\"+new_model\n",
    "path_in_repo = \"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "repository_name"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Upload the folder and its contents to the repository\n",
    "upload_folder(save_path, repository_name, path_in_repo)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#help(model.push_to_hub_merged)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#save_path='/home/wsuser/work/models/Medical-Mind-Llama-3-8b'\n",
    "#repo_id='ruslanmv/Medical-Mind-Llama-3-8b'\n",
    "#commit_message=\"Uploading Model\"\n",
    "#model.push_to_hub_merged(repo_id, tokenizer, save_method = \"merged_16bit\",commit_message=commit_message)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#model.push_to_hub_merged(config.get(\"model_config\").get(\"finetuned_model\"), tokenizer, save_method = \"merged_4bit\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#model.save_pretrained_gguf(config.get(\"model_config\").get(\"finetuned_model\"), tokenizer)\n",
    "#model.push_to_hub_gguf(config.get(\"model_config\").get(\"finetuned_model\"), tokenizer,repository_private=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#model.save_pretrained_gguf(config.get(\"model_config\").get(\"finetuned_model\"), tokenizer, quantization_method = \"q4_k_m\")\n",
    "#model.push_to_hub_gguf(config.get(\"model_config\").get(\"finetuned_model\"), tokenizer, quantization_method = \"q4_k_m\",private=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/conda/envs/Python-RT23.1-CUDA/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "8f1cc811458d464c906abab21b49d230",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "==((====))==  Unsloth: Fast Llama patching release 2024.4\n",
      "   \\\\   /|    GPU: Tesla V100-PCIE-16GB. Max memory: 15.773 GB. Platform = Linux.\n",
      "O^O/ \\_/ \\    Pytorch: 2.2.1+cu121. CUDA = 7.0. CUDA Toolkit = 12.1.\n",
      "\\        /    Bfloat16 = FALSE. Xformers = 0.0.25. FA = False.\n",
      " \"-____-\"     Free Apache license: http://github.com/unslothai/unsloth\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Unused kwargs: ['quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "c900fe27e6ab4026bf82f8d9fc8127ce",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "model.safetensors.index.json:   0%|          | 0.00/132k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "71c34d1aab5b4eafa1ac4bedea7a99e1",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a86b815277ad420c98f60f36e03d1db6",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "model-00001-of-00002.safetensors:   0%|          | 0.00/4.65G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "97cb140a6fc0412eb54980d172002cb8",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "model-00002-of-00002.safetensors:   0%|          | 0.00/1.05G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7dfd858cf250481795c44a4a255e8963",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7712983e35464f43b533191252bc6c26",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "generation_config.json:   0%|          | 0.00/143 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "dc245edfb09f47aeb8d2ca1c0e4b3085",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "2089efc7021b4f98a904e6310b8552a6",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "241ce29274d845819b32f30e334f2cad",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "special_tokens_map.json:   0%|          | 0.00/321 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Answer: I'd be happy to help you with that!\n",
      "\n",
      "The main cause of inflammatory CD4+ T cells is a complex process involving multiple factors. CD4+ T cells, also known as T helper cells, play a crucial role in the immune response. Inflammation occurs when these cells become activated and produce pro-inflammatory cytokines, leading to an imbalance in the immune response.\n",
      "\n",
      "Some common causes of inflammatory CD4+ T cells include:\n",
      "\n",
      "1. **Infections**: Bacterial, viral, or fungal infections can trigger an immune response, leading to the activation of CD4+ T cells and subsequent inflammation.\n",
      "2. **Autoimmune disorders**: Conditions like rheumatoid arthritis, lupus, or multiple sclerosis can cause CD4+ T cells to become chronically activated, leading to chronic inflammation.\n",
      "3. **Allergies**: Allergic reactions can trigger the activation of CD4+ T cells, resulting in the release of pro-inflammatory cytokines and the development of inflammation.\n",
      "4. **Cancer**: Tumor cells can stimulate the activation of CD4+ T cells, leading to an inflammatory response.\n",
      "5. **Environmental factors**: Exposure to pollutants, toxins, or other environmental stressors can contribute to the activation of CD4+ T cells and the development\n"
     ]
    }
   ],
   "source": [
    "is_inference=False\n",
    "if is_inference:\n",
    "    from unsloth import FastLanguageModel\n",
    "    import torch\n",
    "    max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!\n",
    "    dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+\n",
    "    load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.\n",
    "    model, tokenizer = FastLanguageModel.from_pretrained(\n",
    "        model_name = \"ruslanmv/Medical-Mind-Llama-3-8b-1M\",\n",
    "        max_seq_length = max_seq_length,\n",
    "        dtype = dtype,\n",
    "        load_in_4bit = load_in_4bit,\n",
    "    )\n",
    "    # Using FastLanguageModel for fast inference\n",
    "    FastLanguageModel.for_inference(model)\n",
    "    question=\"This is the question: What was the main cause of the inflammatory CD4+ T cells?\"\n",
    "    prompt=f\"<|start_header_id|>system<|end_header_id|> You are a Medical AI chatbot assistant .<|eot_id|><|start_header_id|> user <|end_header_id|>{question}<|eot_id|>\"\n",
    "    # Tokenizing the input and generating the output\n",
    "    inputs = tokenizer([prompt], return_tensors = \"pt\").to(\"cuda\")\n",
    "    outputs = model.generate(**inputs, max_new_tokens = 256, use_cache = True)\n",
    "    answer = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]  # Get the first element from the batch\n",
    "\n",
    "    # Split the answer at the first line break, assuming system intro and question are on separate lines\n",
    "    answer_parts = answer.split(\"\\n\", 1)\n",
    "\n",
    "    # If there are multiple parts, consider the second part as the answer\n",
    "    if len(answer_parts) > 1:\n",
    "      answer = answer_parts[1].strip()  # Remove leading/trailing whitespaces\n",
    "    else:\n",
    "      answer = \"\"  # If no split possible, set answer to empty string\n",
    "\n",
    "    print(f\"Answer: {answer}\")    \n",
    " "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Hyperparameter search\n",
    "**Step 1: Define the Hyperparameter Search Space**\n",
    "We need to define the search space for the hyperparameters we want to tune. For example, let's say we want to tune the following hyperparameters:\n",
    "\n",
    "* `learning_rate`\n",
    "* `per_device_train_batch_size`\n",
    "* `gradient_accumulation_steps`\n",
    "* `warmup_steps`\n",
    "* `num_train_epochs`\n",
    "* `lora_alpha`\n",
    "* `lora_dropout`\n",
    "\n",
    "We can define the search space as follows:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import hyperopt\n",
    "from hyperopt import hp\n",
    "from hyperopt import Trials\n",
    "from hyperopt import fmin, tpe, Trials\n",
    "# Define the search space for hyperparameters\n",
    "space = {\n",
    "  'learning_rate': hp.loguniform('learning_rate', -5, -1),  # Learning rate in log scale\n",
    "  #'lora_alpha': hp.quniform('lora_alpha', 1, 32, 1),  # LoRA alpha with quantized steps\n",
    "  #'lora_dropout': hp.uniform('lora_dropout', 0, 0.5),  # LoRA dropout rate\n",
    "\n",
    "  'per_device_train_batch_size': hp.quniform('per_device_train_batch_size', 2, 16, q=1),  \n",
    "  'gradient_accumulation_steps': hp.quniform('gradient_accumulation_steps', 1, 8, 1),  # Added for exploration\n",
    "  # Uncomment these if you want to tune other hyperparameters\n",
    "  # 'warmup_steps': hp.quniform('warmup_steps', 0, 1000, 1),\n",
    "  # 'num_train_epochs': hp.quniform('num_train_epochs', 1, 5, 1),    \n",
    "\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Step 2. Define the Objective Function**\n",
    "\n",
    "The objective function is a function that takes in the hyperparameters, sets them in the `config` dictionary, trains the model, and returns the loss or metric to minimize. We need to modify the previous fine-tuning code to define the objective function."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def objective(params):\n",
    "  # Set hyperparameters in the config dictionary (assuming it's defined elsewhere)\n",
    "  config['training_config']['learning_rate'] = params['learning_rate']\n",
    " # config['lora_config']['lora_alpha'] = params['lora_alpha']\n",
    " # config['lora_config']['lora_dropout'] = params['lora_dropout']   \n",
    "  config['training_config']['per_device_train_batch_size'] = params['per_device_train_batch_size']\n",
    "  config['training_config']['gradient_accumulation_steps'] = params['gradient_accumulation_steps']\n",
    "  # ... Set other hyperparameters from params dictionary ...   \n",
    "  #config['training_config']['warmup_steps'] = params['warmup_steps']\n",
    "  #config['training_config']['num_train_epochs'] = params['num_train_epochs']\n",
    "\n",
    "  # Load the model and tokenizer (assuming these are defined elsewhere)\n",
    "  try:\n",
    "      model, tokenizer = FastLanguageModel.from_pretrained(\n",
    "          model_name=config.get(\"model_config\").get(\"base_model\"),\n",
    "          max_seq_length=config.get(\"model_config\").get(\"max_seq_length\"),\n",
    "          dtype=config.get(\"model_config\").get(\"dtype\"),\n",
    "          load_in_4bit=config.get(\"model_config\").get(\"load_in_4bit\"),\n",
    "      )\n",
    "  except Exception as e:\n",
    "      print(f\"Error loading model and tokenizer: {e}\")\n",
    "      return float(\"inf\")  # Return high value for errors\n",
    "\n",
    "  # Setup LoRA for the model (assuming FastLanguageModel supports LoRA)\n",
    "  try:\n",
    "      model = FastLanguageModel.get_peft_model(\n",
    "          model,\n",
    "          r=config.get(\"lora_config\").get(\"r\"),\n",
    "          target_modules=config.get(\"lora_config\").get(\"target_modules\"),\n",
    "          lora_alpha=params['lora_alpha'],\n",
    "          lora_dropout=params['lora_dropout'],\n",
    "          bias=config.get(\"lora_config\").get(\"bias\"),\n",
    "          use_gradient_checkpointing=config.get(\"lora_config\").get(\"use_gradient_checkpointing\"),\n",
    "          random_state=42,\n",
    "          use_rslora=config.get(\"lora_config\").get(\"use_rslora\"),\n",
    "          use_dora=config.get(\"lora_config\").get(\"use_dora\"),\n",
    "          loftq_config=config.get(\"lora_config\").get(\"loftq_config\")\n",
    "      )\n",
    "  except Exception as e:\n",
    "      print(f\"Error setting up LoRA: {e}\")\n",
    "      return float(\"inf\")  # Return high value for errors\n",
    "  # Train the model on the test dataset (assuming SFTTrainer and training arguments are defined)\n",
    "  try:\n",
    "      trainer = SFTTrainer(\n",
    "          model=model,\n",
    "          tokenizer=tokenizer,\n",
    "          train_dataset=train_dataset,\n",
    "          dataset_text_field=config.get(\"training_dataset\").get(\"input_field\"),\n",
    "          max_seq_length=config.get(\"model_config\").get(\"max_seq_length\"),\n",
    "          dataset_num_proc=2,\n",
    "          packing=False,\n",
    "          args=TrainingArguments(\n",
    "              per_device_train_batch_size=int(params['per_device_train_batch_size']),\n",
    "              gradient_accumulation_steps=params['gradient_accumulation_steps'],\n",
    "              warmup_steps=params['warmup_steps'],\n",
    "              max_steps=config.get(\"training_config\").get(\"max_steps\"),\n",
    "              num_train_epochs=params['num_train_epochs'],\n",
    "              learning_rate=params['learning_rate'],\n",
    "              fp16=config.get(\"training_config\").get(\"fp16\"),\n",
    "              bf16=config.get(\"training_config\").get(\"bf16\"),\n",
    "              logging_steps=config.get(\"training_config\").get(\"logging_steps\"),\n",
    "              optim=config.get(\"training_config\").get(\"optim\"),\n",
    "              weight_decay=config.get(\"training_config\").get(\"weight_decay\"),\n",
    "              lr_scheduler_type=config.get(\"training_config\").get(\"lr_scheduler_type\"),\n",
    "              seed=42,\n",
    "              output_dir=config.get(\"training_config\").get(\"output_dir\")\n",
    "          )\n",
    "      )\n",
    "      trainer_stats = trainer.train()\n",
    "      return trainer_stats.loss  # Assuming loss is the metric to minimize\n",
    "  except Exception as e:\n",
    "      print(f\"Error during training: {e}\")\n",
    "      return float(\"inf\")  # Return high value for failed trials\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Step 3: Perform Hyperparameter Search**\n",
    "\n",
    "Now that we have defined the objective function, we can perform the hyperparameter search using Hyperopt's `fmin` function. We need to specify the objective function, the search space, and the maximum number of evaluations."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# Create a Trials object to track hyperparameter evaluations\n",
    "trials = Trials()\n",
    "# Run hyperparameter optimization using TPE algorithm\n",
    "best = fmin(objective, space, algo=tpe.suggest, trials=trials, max_evals=2)\n",
    "# Print the best hyperparameters found during optimization\n",
    "print(\"Best Hyperparameters:\", best)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "import gc\n",
    "def reset_gpu_memory():\n",
    "    torch.cuda.empty_cache()\n",
    "    gc.collect()\n",
    "    print(\"GPU memory cleared!\")\n",
    "# Example usage:\n",
    "reset_gpu_memory()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Full code version"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "####  Fixed Version\n",
    "import hyperopt\n",
    "from hyperopt import hp\n",
    "from hyperopt import Trials\n",
    "from hyperopt import fmin, tpe, Trials\n",
    "# Define the search space for hyperparameters\n",
    "space = {\n",
    "  'learning_rate'              : hp.loguniform('learning_rate', -5, -1),  # Learning rate in log scale\n",
    "  'per_device_train_batch_size': hp.quniform('per_device_train_batch_size', 2, 16, 1),   \n",
    "  'gradient_accumulation_steps': hp.quniform('gradient_accumulation_steps', 1, 8, 1), \n",
    "  # Uncomment these if you want to tune them\n",
    "  #'lora_alpha'                : hp.quniform('lora_alpha', 1, 32, 1),  # LoRA alpha with quantized steps\n",
    "  #'lora_dropout'              : hp.uniform('lora_dropout', 0, 0.5),  # LoRA dropout rate\n",
    "  # 'warmup_steps'             : hp.quniform('warmup_steps', 0, 1000, 1),\n",
    "  # 'num_train_epochs'         : hp.quniform('num_train_epochs', 1, 5, 1),\n",
    "}\n",
    "def objective(params):\n",
    "    # Set hyperparameters in the config dictionary (assuming it's defined elsewhere)\n",
    "    config['training_config']['learning_rate']=params['learning_rate']\n",
    "    config['training_config']['per_device_train_batch_size'] = params['per_device_train_batch_size']\n",
    "    config['training_config']['gradient_accumulation_steps'] = params['gradient_accumulation_steps']    \n",
    "    # config['lora_config']['lora_alpha'] = params['lora_alpha']\n",
    "    # config['lora_config']['lora_dropout'] = params['lora_dropout']\n",
    "    # ... Set other hyperparameters from params dictionary ...\n",
    "    #config['training_config']['warmup_steps'] = params['warmup_steps']\n",
    "    #config['training_config']['num_train_epochs'] = params['num_train_epochs']\n",
    "    # Load the model and tokenizer (assuming these are defined elsewhere)    \n",
    "    try:\n",
    "            model, tokenizer    = FastLanguageModel.from_pretrained(\n",
    "            model_name          = config.get(\"model_config\").get(\"base_model\"),\n",
    "            max_seq_length      = config.get(\"model_config\").get(\"max_seq_length\"),\n",
    "            dtype               = config.get(\"model_config\").get(\"dtype\"),\n",
    "            load_in_4bit        = config.get(\"model_config\").get(\"load_in_4bit\")\n",
    "            )\n",
    "    except Exception as e:\n",
    "        print(f\"Error loading model and tokenizer: {e}\")\n",
    "        return float(\"inf\")  # Return high value for errors\n",
    "\n",
    "    # Setup LoRA for the model (assuming FastLanguageModel supports LoRA)\n",
    "    try:\n",
    "        model = FastLanguageModel.get_peft_model(\n",
    "        model,\n",
    "        r                            = config.get(\"lora_config\").get(\"r\"),\n",
    "        target_modules               = config.get(\"lora_config\").get(\"target_modules\"),\n",
    "        lora_alpha                   = config.get(\"lora_config\").get('lora_alpha'), #params['lora_alpha'],\n",
    "        lora_dropout                 = config.get(\"lora_config\").get('lora_dropout'),#params['lora_dropout'],\n",
    "        bias                         = config.get(\"lora_config\").get(\"bias\"),\n",
    "        use_gradient_checkpointing   = config.get(\"lora_config\").get(\"use_gradient_checkpointing\"),\n",
    "        random_state                 = 42,\n",
    "        use_rslora                   = config.get(\"lora_config\").get(\"use_rslora\"),\n",
    "        use_dora                     = config.get(\"lora_config\").get(\"use_dora\"),\n",
    "        loftq_config                 = config.get(\"lora_config\").get(\"loftq_config\")\n",
    "        )\n",
    "    except Exception as e:\n",
    "        print(f\"Error setting up LoRA: {e}\")\n",
    "        return float(\"inf\")  # Return high value for errors\n",
    "    # Train the model on the test dataset (assuming SFTTrainer and training arguments are defined)\n",
    "    try:\n",
    "        trainer = SFTTrainer(\n",
    "              model=model,\n",
    "              tokenizer            =  tokenizer,\n",
    "              train_dataset        = train_dataset,\n",
    "              dataset_text_field   = config.get(\"training_dataset\").get(\"input_field\"),\n",
    "              max_seq_length       = config.get(\"model_config\").get(\"max_seq_length\"),\n",
    "              dataset_num_proc     = 2,\n",
    "              packing              = False,\n",
    "              args=TrainingArguments(\n",
    "                  per_device_train_batch_size = int(params['per_device_train_batch_size']), #config.get(\"training_config\").get('per_device_train_batch_size'),\n",
    "                  gradient_accumulation_steps = params['gradient_accumulation_steps'], #config.get(\"training_config\").get('gradient_accumulation_steps'),\n",
    "                  warmup_steps                = config.get(\"training_config\").get('warmup_steps'),#params['warmup_steps'],\n",
    "                  max_steps                   = config.get(\"training_config\").get(\"max_steps\"),\n",
    "                  num_train_epochs            = config.get(\"training_config\").get('num_train_epochs'),#params['num_train_epochs'],\n",
    "                  learning_rate               = params['learning_rate'],\n",
    "                  fp16                        = config.get(\"training_config\").get(\"fp16\"),\n",
    "                  bf16                        = config.get(\"training_config\").get(\"bf16\"),\n",
    "                  logging_steps               = config.get(\"training_config\").get(\"logging_steps\"),\n",
    "                  optim                       = config.get(\"training_config\").get(\"optim\"),\n",
    "                  weight_decay                = config.get(\"training_config\").get(\"weight_decay\"),\n",
    "                  lr_scheduler_type           = config.get(\"training_config\").get(\"lr_scheduler_type\"),\n",
    "                  seed                        = 42,\n",
    "                  output_dir                  = config.get(\"training_config\").get(\"output_dir\")\n",
    "              )\n",
    "          )\n",
    "        trainer_stats = trainer.train()\n",
    "        return trainer_stats.loss  # Assuming loss is the metric to minimize\n",
    "    except Exception as e:\n",
    "        print(f\"Error during training: {e}\")\n",
    "        return float(\"inf\")  # Return high value for failed trials    \n",
    "# Create a Trials object to track hyperparameter evaluations\n",
    "trials = Trials()\n",
    "# Run hyperparameter optimization using TPE algorithm\n",
    "best = fmin(objective, space, algo=tpe.suggest, trials=trials, max_evals=2)\n",
    "# Print the best hyperparameters found during optimization\n",
    "print(\"Best Hyperparameters:\", best)   "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Analyzing Hyperparameters:\n",
    "\n",
    "*  **Batch Size**: Generally, increasing the batch size can improve training speed by utilizing hardware resources more efficiently. However, there's a limit beyond which performance degrades. You can tune the batch size within a reasonable range (e.g., 2, 4, 8, 16) to see its impact.\n",
    "\n",
    "* **Learning Rate**: A higher learning rate can accelerate training initially. But, a too high value can lead to unstable training and potentially slower convergence. Consider a range of learning rates (e.g., log-uniform distribution between 1e-5 and 1e-3) for exploration.\n",
    "\n",
    "* **Gradient Accumulation Steps**: This technique accumulates gradients over multiple batches before updating model weights. It can help reduce memory requirements but might slow down training per epoch. Experiment with different accumulation steps (e.g., 1, 2, 4) to find a balance.\n",
    "\n",
    "* **Optimizer Choice**: Some optimizers like Adam or SGD with momentum can be faster than others depending on the model and dataset. Explore different optimizers and their hyperparameters (e.g., momentum coefficient) to see if they lead to faster convergence.\n",
    "\n",
    "## Additional Considerations:\n",
    "\n",
    "Early Stopping: Implement early stopping to automatically terminate training if the validation loss doesn't improve for a certain number of epochs. This can save training time if the model starts overfitting.\n",
    "Warmup Steps: A gradual increase in the learning rate during the initial training phase (warmup steps) can improve stability and potentially accelerate convergence compared to a fixed learning rate from the beginning.\n",
    "\n",
    "\n",
    "* Experimentation and Profiling:\n",
    "\n",
    "The best hyperparameters for faster training depend on your specific model, dataset, and hardware. You'll need to experiment with different configurations using tools like Hyperopt to find the optimal settings.\n",
    "Consider using profiling tools to identify bottlenecks in your training pipeline. This can help you focus on optimizing specific parts of the training process that are most time-consuming.\n",
    "By analyzing these hyperparameters and implementing techniques like early stopping and warmup steps, you can potentially achieve faster fine-tuning while maintaining good model performance."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Method 1  Optuna"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from optuna import create_study, Trial\n",
    "import time  # Assuming you can use time.time() to measure training time\n",
    "\n",
    "# Define search space with additional hyperparameter\n",
    "search_space = {\n",
    "  \"learning_rate\": [1e-5, 5e-5, 1e-4, 2e-4],\n",
    "  \"per_device_train_batch_size\": [2, 4, 8],\n",
    "  \"lora_alpha\": [8, 16, 32],\n",
    "  \"gradient_accumulation_steps\": [1, 2, 4, 8],  # Added gradient accumulation steps\n",
    "}\n",
    "\n",
    "def objective(trial):\n",
    "  # Set hyperparameters based on trial values\n",
    "  config[\"training_config\"][\"learning_rate\"] = trial.suggest_float(\"learning_rate\", search_space[\"learning_rate\"][0], search_space[\"learning_rate\"][-1])\n",
    "  config[\"training_config\"][\"per_device_train_batch_size\"] = trial.suggest_int(\"per_device_train_batch_size\", search_space[\"per_device_train_batch_size\"][0], search_space[\"per_device_train_batch_size\"][-1])\n",
    "  config[\"training_config\"][\"gradient_accumulation_steps\"] = trial.suggest_int(\"gradient_accumulation_steps\", search_space[\"gradient_accumulation_steps\"][0], search_space[\"gradient_accumulation_steps\"][-1])\n",
    "  config[\"lora_config\"][\"lora_alpha\"] = trial.suggest_int(\"lora_alpha\", search_space[\"lora_alpha\"][0], search_space[\"lora_alpha\"][-1])\n",
    "\n",
    "  # Train the model with the current hyperparameters\n",
    "  start_time = time.time()\n",
    "  try:\n",
    "      trainer_stats = trainer_test.train()\n",
    "      training_time = time.time() - start_time\n",
    "      return training_time  # Minimize training time\n",
    "  except Exception as e:\n",
    "      return float(\"inf\")  # Assign a high value if training fails\n",
    "\n",
    "study = create_study(direction=\"minimize\")\n",
    "study.optimize(objective, n_trials=2)  # Adjust the number of trials\n",
    "\n",
    "# Access the best trial and its hyperparameters after optimization\n",
    "best_trial = study.best_trial\n",
    "best_params = best_trial.params\n",
    "\n",
    "print(\"Best Trial:\", best_trial.number)\n",
    "print(\"Best Hyperparameters (Likely Fastest):\", best_params)\n",
    "print(\"Best Training Time:\", best_trial.value, \"seconds\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "gpuType": "L4",
   "machine_shape": "hm",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3.10",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  },
  "widgets": {
   "application/vnd.jupyter.widget-state+json": {
    "0005f2d9fe1e4cc98ea58b0c2868b433": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "FloatProgressModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "FloatProgressModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "ProgressView",
      "bar_style": "success",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_45c1d5b0df0e420a87f791dd4cf0e425",
      "max": 100,
      "min": 0,
      "orientation": "horizontal",
      "style": "IPY_MODEL_9ed49f1a099846a3a65cd6608bafb0e4",
      "value": 100
     }
    },
    "0058ed544fed4272848a891a68b9adc0": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "00eea4b0c6e44c62900ea8e7d919efe9": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "02fc530028ea4d538b7f6b48463ae700": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "ProgressStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "ProgressStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "bar_color": null,
      "description_width": ""
     }
    },
    "058b2b9959b84b6f9f5d3862ef53d029": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_7807f312425b4f4d9249aa1ac77d7461",
      "placeholder": "​",
      "style": "IPY_MODEL_d8e7ea9552a84b8284b31d77090b54af",
      "value": "Map (num_proc=2): 100%"
     }
    },
    "0f55ae30c2704632941cca4727c1c4f2": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "11dc1dcf6b29471580c32c818fa41d88": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HBoxModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HBoxModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HBoxView",
      "box_style": "",
      "children": [
       "IPY_MODEL_9344b22940c64654a82bb2ce06530e30",
       "IPY_MODEL_4f68a26f64e844c7be21cc180eb6c1a2",
       "IPY_MODEL_769b40273bab41af8eb66e494b613241"
      ],
      "layout": "IPY_MODEL_320c09781518483e82defa86c28316d1"
     }
    },
    "1634ba52355b4681a913039666926f85": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "FloatProgressModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "FloatProgressModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "ProgressView",
      "bar_style": "success",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_eff94d2d010e4b4f93a6dfcb61103a52",
      "max": 18,
      "min": 0,
      "orientation": "horizontal",
      "style": "IPY_MODEL_da5cd094aaae45f4a0ca051ad5babd78",
      "value": 18
     }
    },
    "1850ab17bafd4a43b5ab5899d1875a40": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "1a72b512e1374e67a858edf2844fc157": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "FloatProgressModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "FloatProgressModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "ProgressView",
      "bar_style": "success",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_284192f01a924f87afd8b5087ca9af6c",
      "max": 18,
      "min": 0,
      "orientation": "horizontal",
      "style": "IPY_MODEL_273bf76f74bc4fb492ccb67d9e202f7b",
      "value": 18
     }
    },
    "217ca5cd404d4756a399fba3aa4fbc15": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_8f88a5b04723482ea430679e504c65f9",
      "placeholder": "​",
      "style": "IPY_MODEL_8d153f070a8d4ad1b32996a9fd82beda",
      "value": " 18/18 [00:00&lt;00:00,  9.43it/s]"
     }
    },
    "22ea45365d21439fb5069974bbe69711": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "23a71f8847e647daba35e495706fc846": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_22ea45365d21439fb5069974bbe69711",
      "placeholder": "​",
      "style": "IPY_MODEL_bd087d0aa3214c5dbecc9b0bd4d976df",
      "value": "Resolving data files: 100%"
     }
    },
    "273bf76f74bc4fb492ccb67d9e202f7b": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "ProgressStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "ProgressStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "bar_color": null,
      "description_width": ""
     }
    },
    "284192f01a924f87afd8b5087ca9af6c": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "2c5564fb033346afbe7692a24a52b302": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "31a203cdd2f54cda8a05214844888156": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "320c09781518483e82defa86c28316d1": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "32cff795f8bc490dbf63ed130e1f581f": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "33fb10908c23457aa4796626102fc8c5": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "ProgressStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "ProgressStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "bar_color": null,
      "description_width": ""
     }
    },
    "341dca5ac74348dd9b5a347e38fa0b40": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "3564e3cf0fe84281838d84525794e735": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HBoxModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HBoxModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HBoxView",
      "box_style": "",
      "children": [
       "IPY_MODEL_912164947c5847908424f3e60c5adb64",
       "IPY_MODEL_7517ce80636040e29665a9353afab183",
       "IPY_MODEL_e14b9d980a1a41fb9e81385cb0f73d3a"
      ],
      "layout": "IPY_MODEL_ada78aafba3f47ab8eb45cf3c83a6805"
     }
    },
    "37803098ceed4528bb690ebee028c840": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "ProgressStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "ProgressStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "bar_color": null,
      "description_width": ""
     }
    },
    "39d3b72ab6214bcf9b0bb6b6294e957c": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "3a97281be4c1433aa3abe6c25b7113e2": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HBoxModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HBoxModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HBoxView",
      "box_style": "",
      "children": [
       "IPY_MODEL_4e19e78059b842a5832ccae2f765a30c",
       "IPY_MODEL_1a72b512e1374e67a858edf2844fc157",
       "IPY_MODEL_c9cfd66b68a1437d946c83163fa877df"
      ],
      "layout": "IPY_MODEL_cccd970273ae43d2a6e60ac421bdc882"
     }
    },
    "3f7afd4bd28842cbb73e62c155667030": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "FloatProgressModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "FloatProgressModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "ProgressView",
      "bar_style": "success",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_9a5fd3a68fd1445f92bea51a7fec3e6b",
      "max": 18,
      "min": 0,
      "orientation": "horizontal",
      "style": "IPY_MODEL_37803098ceed4528bb690ebee028c840",
      "value": 18
     }
    },
    "44f189b81bbd48ca8cb146ead641d2b5": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_e903140c8c794c48b231924d3975b7a6",
      "placeholder": "​",
      "style": "IPY_MODEL_7e74d789c82747e0b5066a00b9e36c1d",
      "value": " 100/100 [00:00&lt;00:00, 125.88 examples/s]"
     }
    },
    "45b3259e3cac4de8bd19d12f07de2adb": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "45c1d5b0df0e420a87f791dd4cf0e425": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "4a0426a353ca41cba39d4dfeba925451": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "4e19e78059b842a5832ccae2f765a30c": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_32cff795f8bc490dbf63ed130e1f581f",
      "placeholder": "​",
      "style": "IPY_MODEL_4a0426a353ca41cba39d4dfeba925451",
      "value": "Resolving data files: 100%"
     }
    },
    "4f68a26f64e844c7be21cc180eb6c1a2": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "FloatProgressModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "FloatProgressModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "ProgressView",
      "bar_style": "success",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_341dca5ac74348dd9b5a347e38fa0b40",
      "max": 18,
      "min": 0,
      "orientation": "horizontal",
      "style": "IPY_MODEL_8ba6fd1bf16a4680b8a8c9c55ecf23e7",
      "value": 18
     }
    },
    "51a6d3c97480476e8c22d9ad670bdc47": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "53ee8f5e8b7d4076bdb0167baf2e5729": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "58b932a03b2c4aa4891d541f186244b9": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "5d1fbd3c62d94df7befdefc451221414": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HBoxModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HBoxModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HBoxView",
      "box_style": "",
      "children": [
       "IPY_MODEL_8ad6abb48f38469f9d399eea8f5e5b70",
       "IPY_MODEL_6cea0da24cf54811a43168c606759bab",
       "IPY_MODEL_eb8c88f5c06c49fe9099371b3cf112ae"
      ],
      "layout": "IPY_MODEL_89a1354722e640758978befc06ed4a78"
     }
    },
    "64539b4212fe4d989976f56369bb746b": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "67b9a3505ae644dbb3c4fc14781a2731": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "FloatProgressModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "FloatProgressModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "ProgressView",
      "bar_style": "success",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_53ee8f5e8b7d4076bdb0167baf2e5729",
      "max": 100,
      "min": 0,
      "orientation": "horizontal",
      "style": "IPY_MODEL_d70fd9035f9b4d82892fae34c28c46d5",
      "value": 100
     }
    },
    "696e82ec6a174974a90d5abc7c101ee7": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "6cea0da24cf54811a43168c606759bab": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "FloatProgressModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "FloatProgressModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "ProgressView",
      "bar_style": "success",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_dade882aca304a31b693a2c58807d825",
      "max": 18,
      "min": 0,
      "orientation": "horizontal",
      "style": "IPY_MODEL_02fc530028ea4d538b7f6b48463ae700",
      "value": 18
     }
    },
    "72eca1e2871b458abd3383d9711215a2": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HBoxModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HBoxModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HBoxView",
      "box_style": "",
      "children": [
       "IPY_MODEL_058b2b9959b84b6f9f5d3862ef53d029",
       "IPY_MODEL_85d4879bd7d64766905db34cef052fed",
       "IPY_MODEL_44f189b81bbd48ca8cb146ead641d2b5"
      ],
      "layout": "IPY_MODEL_f89c5c949e984361bce7f97d86d2a2e5"
     }
    },
    "734b6d3e3406403293c4bc955a643528": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_dc3b2edc3f5d480a93b57b15b4444608",
      "placeholder": "​",
      "style": "IPY_MODEL_7967d420aff1414e9fe53eb04c928eb4",
      "value": "Map: 100%"
     }
    },
    "7517ce80636040e29665a9353afab183": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "FloatProgressModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "FloatProgressModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "ProgressView",
      "bar_style": "success",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_bb078c8c1f6a48359dc654d91ece684d",
      "max": 18,
      "min": 0,
      "orientation": "horizontal",
      "style": "IPY_MODEL_9b9322336b564a409086955ebda07fc3",
      "value": 18
     }
    },
    "769b40273bab41af8eb66e494b613241": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_dc85f5e365f4488fa185d0ae35fde806",
      "placeholder": "​",
      "style": "IPY_MODEL_51a6d3c97480476e8c22d9ad670bdc47",
      "value": " 18/18 [00:00&lt;00:00, 1567.70it/s]"
     }
    },
    "7807f312425b4f4d9249aa1ac77d7461": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "793f49f397b54daab63194cee8d04256": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "7967d420aff1414e9fe53eb04c928eb4": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "7e11cccce8be49008f8db3a0c3ea603d": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "7e74d789c82747e0b5066a00b9e36c1d": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "82c6c2752a0746f3935e069c0f8811d6": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "85d4879bd7d64766905db34cef052fed": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "FloatProgressModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "FloatProgressModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "ProgressView",
      "bar_style": "success",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_0058ed544fed4272848a891a68b9adc0",
      "max": 100,
      "min": 0,
      "orientation": "horizontal",
      "style": "IPY_MODEL_33fb10908c23457aa4796626102fc8c5",
      "value": 100
     }
    },
    "89a1354722e640758978befc06ed4a78": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "8a195771bdc0462e8f9fbb60eb9141b1": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "8a8d3a006ee24c4393d7c2f2d040ce52": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "8ad6abb48f38469f9d399eea8f5e5b70": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_39d3b72ab6214bcf9b0bb6b6294e957c",
      "placeholder": "​",
      "style": "IPY_MODEL_696e82ec6a174974a90d5abc7c101ee7",
      "value": "Resolving data files: 100%"
     }
    },
    "8ba6fd1bf16a4680b8a8c9c55ecf23e7": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "ProgressStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "ProgressStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "bar_color": null,
      "description_width": ""
     }
    },
    "8d153f070a8d4ad1b32996a9fd82beda": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "8f88a5b04723482ea430679e504c65f9": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "912164947c5847908424f3e60c5adb64": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_ff108c92fb5547869ee545cf9a094b07",
      "placeholder": "​",
      "style": "IPY_MODEL_2c5564fb033346afbe7692a24a52b302",
      "value": "Loading dataset shards: 100%"
     }
    },
    "9344b22940c64654a82bb2ce06530e30": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_793f49f397b54daab63194cee8d04256",
      "placeholder": "​",
      "style": "IPY_MODEL_fa79cfa23f3a430dab69a59d93383cd0",
      "value": "Resolving data files: 100%"
     }
    },
    "963c0aa5620b4ea8b5a903894646121c": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "9a5fd3a68fd1445f92bea51a7fec3e6b": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "9b9322336b564a409086955ebda07fc3": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "ProgressStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "ProgressStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "bar_color": null,
      "description_width": ""
     }
    },
    "9bceb9eddb2147c1abbf3391c70e6784": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "9ed49f1a099846a3a65cd6608bafb0e4": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "ProgressStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "ProgressStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "bar_color": null,
      "description_width": ""
     }
    },
    "9f91f7ce62e243f59d72e5ba36f97b8f": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_af0233735d744b7e838f50f52c9d6cbe",
      "placeholder": "​",
      "style": "IPY_MODEL_8a8d3a006ee24c4393d7c2f2d040ce52",
      "value": "Loading dataset shards: 100%"
     }
    },
    "a419499622cd4374937423a79677298f": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_b93514308ae44afbb1a0511f5f9c6ddf",
      "placeholder": "​",
      "style": "IPY_MODEL_58b932a03b2c4aa4891d541f186244b9",
      "value": " 18/18 [00:00&lt;00:00, 1458.49it/s]"
     }
    },
    "ada78aafba3f47ab8eb45cf3c83a6805": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "af0096de28414303ba5324f4087cd92e": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "af0233735d744b7e838f50f52c9d6cbe": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "b7e7896aeac74b6eae27de0677100e57": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "b8b277831f1a45109b3a4a3565fbdb9d": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HBoxModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HBoxModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HBoxView",
      "box_style": "",
      "children": [
       "IPY_MODEL_9f91f7ce62e243f59d72e5ba36f97b8f",
       "IPY_MODEL_1634ba52355b4681a913039666926f85",
       "IPY_MODEL_217ca5cd404d4756a399fba3aa4fbc15"
      ],
      "layout": "IPY_MODEL_bc6d92cb8837428bb7038d75e6af604e"
     }
    },
    "b93514308ae44afbb1a0511f5f9c6ddf": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "bb078c8c1f6a48359dc654d91ece684d": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "bb1156b7d349440d9cc8a2f0328465a7": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HBoxModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HBoxModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HBoxView",
      "box_style": "",
      "children": [
       "IPY_MODEL_23a71f8847e647daba35e495706fc846",
       "IPY_MODEL_3f7afd4bd28842cbb73e62c155667030",
       "IPY_MODEL_a419499622cd4374937423a79677298f"
      ],
      "layout": "IPY_MODEL_64539b4212fe4d989976f56369bb746b"
     }
    },
    "bc6d92cb8837428bb7038d75e6af604e": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "bd087d0aa3214c5dbecc9b0bd4d976df": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "be6162f66e594d3ebd8c53ebab3bbfa6": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_963c0aa5620b4ea8b5a903894646121c",
      "placeholder": "​",
      "style": "IPY_MODEL_31a203cdd2f54cda8a05214844888156",
      "value": " 100/100 [00:00&lt;00:00, 5440.44 examples/s]"
     }
    },
    "c4d39c87c16c4961b942d896742ff7ce": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_af0096de28414303ba5324f4087cd92e",
      "placeholder": "​",
      "style": "IPY_MODEL_0f55ae30c2704632941cca4727c1c4f2",
      "value": " 100/100 [00:01&lt;00:00, 113.55 examples/s]"
     }
    },
    "c9cfd66b68a1437d946c83163fa877df": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_45b3259e3cac4de8bd19d12f07de2adb",
      "placeholder": "​",
      "style": "IPY_MODEL_b7e7896aeac74b6eae27de0677100e57",
      "value": " 18/18 [00:00&lt;00:00,  1.32it/s]"
     }
    },
    "cccd970273ae43d2a6e60ac421bdc882": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "d70fd9035f9b4d82892fae34c28c46d5": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "ProgressStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "ProgressStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "bar_color": null,
      "description_width": ""
     }
    },
    "d8e7ea9552a84b8284b31d77090b54af": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "da5cd094aaae45f4a0ca051ad5babd78": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "ProgressStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "ProgressStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "bar_color": null,
      "description_width": ""
     }
    },
    "dade882aca304a31b693a2c58807d825": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "dc3b2edc3f5d480a93b57b15b4444608": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "dc85f5e365f4488fa185d0ae35fde806": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "e14b9d980a1a41fb9e81385cb0f73d3a": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_9bceb9eddb2147c1abbf3391c70e6784",
      "placeholder": "​",
      "style": "IPY_MODEL_8a195771bdc0462e8f9fbb60eb9141b1",
      "value": " 18/18 [00:35&lt;00:00,  1.20it/s]"
     }
    },
    "e257e4a2bfdb48038102173d397ab2e4": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_82c6c2752a0746f3935e069c0f8811d6",
      "placeholder": "​",
      "style": "IPY_MODEL_1850ab17bafd4a43b5ab5899d1875a40",
      "value": "Map (num_proc=2): 100%"
     }
    },
    "e3bd7f85ce194cd4b697c2eb82038658": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HBoxModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HBoxModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HBoxView",
      "box_style": "",
      "children": [
       "IPY_MODEL_734b6d3e3406403293c4bc955a643528",
       "IPY_MODEL_0005f2d9fe1e4cc98ea58b0c2868b433",
       "IPY_MODEL_be6162f66e594d3ebd8c53ebab3bbfa6"
      ],
      "layout": "IPY_MODEL_7e11cccce8be49008f8db3a0c3ea603d"
     }
    },
    "e5880b946aae4b84a94226a5d6acaf45": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "e903140c8c794c48b231924d3975b7a6": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "eb8c88f5c06c49fe9099371b3cf112ae": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_00eea4b0c6e44c62900ea8e7d919efe9",
      "placeholder": "​",
      "style": "IPY_MODEL_fe17bedb5ef04d8b9e064fa1e0d75185",
      "value": " 18/18 [00:00&lt;00:00,  1.42it/s]"
     }
    },
    "eff94d2d010e4b4f93a6dfcb61103a52": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "f89c5c949e984361bce7f97d86d2a2e5": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "fa79cfa23f3a430dab69a59d93383cd0": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "fe17bedb5ef04d8b9e064fa1e0d75185": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "ff108c92fb5547869ee545cf9a094b07": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "ffa74977e7464cebb16d3cf8ee976d51": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HBoxModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HBoxModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HBoxView",
      "box_style": "",
      "children": [
       "IPY_MODEL_e257e4a2bfdb48038102173d397ab2e4",
       "IPY_MODEL_67b9a3505ae644dbb3c4fc14781a2731",
       "IPY_MODEL_c4d39c87c16c4961b942d896742ff7ce"
      ],
      "layout": "IPY_MODEL_e5880b946aae4b84a94226a5d6acaf45"
     }
    }
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}