Upload folder using huggingface_hub
- .gitattributes +4 -0
- .ipynb_checkpoints/FalconDataSet-checkpoint.ipynb +394 -0
- .ipynb_checkpoints/language_modeling-checkpoint.ipynb +1186 -0
- .ipynb_checkpoints/language_modeling-checkpoint.py +187 -0
- FalconData.csv +3 -0
- FalconData2.csv +3 -0
- FalconDataSet.ipynb +717 -0
- FalconData_train.csv +3 -0
- FalconData_train2.csv +3 -0
- FalconData_validation.csv +0 -0
- FalconData_validation2.csv +0 -0
- LICENSE +21 -0
- README.md +71 -0
- language_modeling.ipynb +932 -0
- language_modeling.py +187 -0
- short_gpt/.ipynb_checkpoints/short_hf-checkpoint.ipynb +1679 -0
- short_gpt/.ipynb_checkpoints/short_llama-checkpoint.py +219 -0
- short_gpt/layer_removal.py +23 -0
- short_gpt/metrics.py +26 -0
- short_gpt/short_hf.ipynb +1679 -0
- short_gpt/short_llama.ipynb +573 -0
- short_gpt/short_llama.py +219 -0
.gitattributes
CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+FalconData.csv filter=lfs diff=lfs merge=lfs -text
+FalconData2.csv filter=lfs diff=lfs merge=lfs -text
+FalconData_train.csv filter=lfs diff=lfs merge=lfs -text
+FalconData_train2.csv filter=lfs diff=lfs merge=lfs -text
.ipynb_checkpoints/FalconDataSet-checkpoint.ipynb
ADDED
@@ -0,0 +1,394 @@
In [7]:
```python
import requests
import time
import random

# Pick 10 random row offsets from the ~968M-row dataset.
pages = [random.randint(1, 968000015) for _ in range(10)]
# print(pages)

base_url = "https://datasets-server.huggingface.co/rows"
params = {
    "dataset": "tiiuae/falcon-refinedweb",
    "config": "default",
    "split": "train",
}
num_rows_per_page = 100
retry_limit = 10
retry_delay = 5
Falcon = []

def fetch_data_for_page(page):
    params["offset"] = page
    params["limit"] = num_rows_per_page
    attempt = 0
    while attempt < retry_limit:
        try:
            response = requests.get(base_url, params=params)
            response.raise_for_status()  # raises an HTTPError on an unsuccessful status code
            for row in response.json()["rows"]:
                Falcon.append(row["row"]["content"])
            print("Fetched data for all the Pages.")
            break
        except requests.exceptions.HTTPError:
            attempt += 1
            print(f"Failed to fetch data, retrying. Attempt {attempt}/{retry_limit}")
            if attempt < retry_limit:
                time.sleep(retry_delay)  # wait before the next retry
            else:
                print("Maximum retry limit reached. Unable to fetch data.")
                raise

for page in pages:
    fetch_data_for_page(page)
```

Output:
```
Fetched data for all the Pages.
Fetched data for all the Pages.
Fetched data for all the Pages.
Fetched data for all the Pages.
Fetched data for all the Pages.
Fetched data for all the Pages.
Fetched data for all the Pages.
Fetched data for all the Pages.
Fetched data for all the Pages.
Fetched data for all the Pages.
```
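Because the offsets above are drawn independently with `random.randint`, two pages can collide or overlap within the same 100-row window and yield duplicate rows. A minimal sketch of one way to avoid that, assuming the same page size as the cell above (the helper name is hypothetical, not part of the notebook):

```python
import random

def sample_disjoint_offsets(n_pages, rows_per_page, dataset_size=968000015, seed=None):
    """Pick page offsets so that no two fetched windows overlap."""
    rng = random.Random(seed)
    # Choose whole windows without replacement, then convert to row offsets.
    windows = rng.sample(range(dataset_size // rows_per_page), n_pages)
    return [w * rows_per_page for w in windows]

pages = sample_disjoint_offsets(10, 100)
```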
In [8]:
```python
import csv

# Open the CSV file for writing
with open("FalconDataEval2.csv", "w", newline="") as csvfile:
    # Create a CSV writer object
    writer = csv.writer(csvfile)

    # Write the header row
    writer.writerow(["Text"])

    # Write each element in the list as a row in the CSV file
    for element in Falcon:
        writer.writerow([element])
```

In [9]:
```python
import pandas as pd

# Read the CSV file into a pandas DataFrame
df = pd.read_csv("FalconDataEval2.csv")

# Check for duplicate rows
duplicate_rows = df[df.duplicated()]

# Print the number of duplicate rows
print(f"Number of duplicate rows: {len(duplicate_rows)}")

# Print the duplicate rows
print(duplicate_rows)
```

Output:
```
Number of duplicate rows: 0
Empty DataFrame
Columns: [Text]
Index: []
```
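No duplicates turned up in this run, but if random offsets ever do collide, a short follow-up on the same `df` would clean them up (a sketch, not part of the notebook):

```python
# Drop exact duplicate rows and rewrite the CSV without the index column.
df = df.drop_duplicates(ignore_index=True)
df.to_csv("FalconDataEval2.csv", index=False)
```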
In [10]:
```python
df.head()
```

Output:
```
                                                Text
0  Our Annual Garden Party is a fun-filled event ...
1  Photos by Philip Cosores\n“There were many poi...
2  Media Matters Also Wants To Throw Out The Firs...
3  [More]\nWhile bringing in your own cup is fine...
4  Read at : Google Alert – gardening\nHow to Bui...
```

In [11]:
```python
df.tail()
```

Output:
```
                                                  Text
995  The Ketologic review Diaries\nShould you have ...
996  A pack of hand cooked sea salted and red wine ...
997  この広告は、90日以上更新していないブログに表示しています。\nsniperspy free...
998  Arthur Koestler - Wikipedia.\nEssay - Merriam-...
999  Serving Software Downloads in 976 Categories, ...
```

In [6] (run before the cells above, as its execution count shows):
```python
target_row = 48
specific_row = df.iloc[target_row]
specific_row
```

Output:
```
Text    To imagine delaying myself. Hard cock, selling...
Name: 48, dtype: object
```

In [13]:
```python
print(specific_row)
```

Output (saved from an earlier run with a different `target_row`):
```
Text    The old wireline Bell telephone system was bui...
Name: 19995, dtype: object
```

In [14]:
```python
print(specific_row.to_string())
```

Output:
```
Text    The old wireline Bell telephone system was bui...
```

In [17]:
```python
print(len(specific_row.to_string()))
```

Output:
```
57
```

In [ ]: (empty cell)

Notebook metadata: kernel "Python 3 (ipykernel)", Python 3.10.12, nbformat 4.5.
.ipynb_checkpoints/language_modeling-checkpoint.ipynb
ADDED
@@ -0,0 +1,1186 @@
In [2]:
```python
# Transformers installation
# ! pip install transformers datasets
# To install from source instead of the last release, comment the command above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/transformers.git
```

Output:
```
Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.40.2)
Requirement already satisfied: datasets in /usr/local/lib/python3.10/dist-packages (2.19.1)
Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.13.1)
Requirement already satisfied: huggingface-hub<1.0,>=0.19.3 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.23.0)
Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.26.2)
Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (23.2)
Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.1)
Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2024.4.28)
Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.31.0)
Requirement already satisfied: tokenizers<0.20,>=0.19 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.19.1)
Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.4.3)
Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.66.2)
Requirement already satisfied: pyarrow>=12.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (16.0.0)
Requirement already satisfied: pyarrow-hotfix in /usr/local/lib/python3.10/dist-packages (from datasets) (0.6)
Requirement already satisfied: dill<0.3.9,>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.3.8)
Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (2.2.2)
Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets) (3.4.1)
Requirement already satisfied: multiprocess in /usr/local/lib/python3.10/dist-packages (from datasets) (0.70.16)
Requirement already satisfied: fsspec<=2024.3.1,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from fsspec[http]<=2024.3.1,>=2023.1.0->datasets) (2023.10.0)
Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.9.0b0)
Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (23.1.0)
Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.0.5)
Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.9.4)
Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.4.1)
Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)
Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (4.0.3)
Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.19.3->transformers) (4.8.0)
Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.3.2)
Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.6)
Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.1.0)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2023.11.17)
Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.1)
Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.1)
Requirement already satisfied: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
```

# Causal language modeling

There are two types of language modeling, causal and masked. This guide illustrates causal language modeling. Causal language models are frequently used for text generation. You can use these models for creative applications like choosing your own text adventure or an intelligent coding assistant like Copilot or CodeParrot.

In [4]:
```python
# #@title
# from IPython.display import HTML

# HTML('<iframe width="560" height="315" src="https://www.youtube.com/embed/Vpjb1lu0MDk?rel=0&controls=0&showinfo=0" frameborder="0" allowfullscreen></iframe>')
```

Causal language modeling predicts the next token in a sequence of tokens, and the model can only attend to tokens on the left. This means the model cannot see future tokens. GPT-2 is an example of a causal language model.
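As a quick concrete illustration of "can only attend to tokens on the left", here is a minimal next-token sketch with the DistilGPT2 checkpoint this guide finetunes (illustrative only; the prompt string is arbitrary):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("distilgpt2")
model = AutoModelForCausalLM.from_pretrained("distilgpt2")

enc = tok("The Eiffel Tower is located in", return_tensors="pt")
with torch.no_grad():
    logits = model(**enc).logits
# The logits at the final position score every candidate *next* token;
# positions to the right do not exist yet, so attention is strictly causal.
next_id = int(logits[0, -1].argmax())
print(tok.decode(next_id))
```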
This guide will show you how to:

1. Finetune [DistilGPT2](https://huggingface.co/distilgpt2) on the [r/askscience](https://www.reddit.com/r/askscience/) subset of the [ELI5](https://huggingface.co/datasets/eli5) dataset.
2. Use your finetuned model for inference.

<Tip>
You can finetune other architectures for causal language modeling following the same steps in this guide.
Choose one of the following architectures:

<!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->
[BART](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/bart), [BERT](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/bert), [Bert Generation](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/bert-generation), [BigBird](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/big_bird), [BigBird-Pegasus](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/bigbird_pegasus), [BioGpt](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/biogpt), [Blenderbot](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/blenderbot), [BlenderbotSmall](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/blenderbot-small), [BLOOM](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/bloom), [CamemBERT](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/camembert), [CodeGen](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/codegen), [CPM-Ant](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/cpmant), [CTRL](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/ctrl), [Data2VecText](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/data2vec-text), [ELECTRA](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/electra), [ERNIE](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/ernie), [GIT](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/git), [GPT-Sw3](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/gpt-sw3), [OpenAI GPT-2](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/gpt2), [GPTBigCode](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/gpt_bigcode), [GPT Neo](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/gpt_neo), [GPT NeoX](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/gpt_neox), [GPT NeoX Japanese](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/gpt_neox_japanese), [GPT-J](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/gptj), [LLaMA](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/llama), [Marian](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/marian), [mBART](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/mbart), [MEGA](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/mega), [Megatron-BERT](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/megatron-bert), [MVP](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/mvp), [OpenLlama](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/open-llama), [OpenAI GPT](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/openai-gpt), [OPT](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/opt), [Pegasus](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/pegasus), [PLBart](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/plbart), [ProphetNet](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/prophetnet), [QDQBert](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/qdqbert), [Reformer](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/reformer), [RemBERT](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/rembert), [RoBERTa](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/roberta), [RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/roberta-prelayernorm), [RoCBert](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/roc_bert), [RoFormer](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/roformer), [RWKV](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/rwkv), [Speech2Text2](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/speech_to_text_2), [Transformer-XL](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/transfo-xl), [TrOCR](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/trocr), [XGLM](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/xglm), [XLM](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/xlm), [XLM-ProphetNet](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/xlm-prophetnet), [XLM-RoBERTa](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/xlm-roberta), [XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/xlm-roberta-xl), [XLNet](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/xlnet), [X-MOD](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/xmod)

<!--End of the generated tip-->

</Tip>

Before you begin, make sure you have all the necessary libraries installed:

```bash
pip install transformers datasets evaluate
```

We encourage you to log in to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to log in:
In [5]:
```python
# from huggingface_hub import notebook_login

# notebook_login()
```

Output (login widget):
```
VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…
```

## Load ELI5 dataset

Start by loading a smaller subset of the r/askscience subset of the ELI5 dataset from the 🤗 Datasets library. This'll give you a chance to experiment and make sure everything works before spending more time training on the full dataset.

In [ ]:
```python
# from datasets import load_dataset

# eli5 = load_dataset("eli5", split="train_asks[:5000]")
```

In [1]:
```python
from datasets import load_dataset

# Falcon = load_dataset("csv", data_files="FalconData.csv")
Falcon = load_dataset('csv', data_files={"train": 'FalconData.csv', "validation": 'FalconDataEval.csv'})
```

Output:
```
Generating train split: 0 examples [00:00, ? examples/s]
Generating validation split: 0 examples [00:00, ? examples/s]
```

Split the dataset's `train_asks` split into a train and test set with the [train_test_split](https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.train_test_split) method:

In [8]:
```python
# Falcon = Falcon.train_test_split(test_size=0.10)
```

Then take a look at an example:

In [2]:
```python
Falcon['train'][0]
```

Output:
```
{'Text': 'Allow me to clarify a genuine fast for amateur online users What exactly is Youtube . com? Youtube . com is probably the most in-demand web site on the web which allow you to view and publish video lessons for free. These are generally submitted by Vimeo members on this video discussing system. Yet another thing YouTube registration is provided for free so anyone can join, however account is not required for watching video lessons. In order to sometimes observe video clips or post your own video lessons so that you can show to your friends, loved ones as well as other Vimeo members. Once you get dependent at viewing video clip, it is possible to phone yourself a YouTuber!\n- Everything you are unable to upload? Nonetheless there are some regulations or YouTube\'s regards to use that you should.\n- Observing a Vimeo movie is really simple, you just need to.\nObserving a You tube movie is absolutely simple, you just need to variety your best song or television set plan from the research discipline click on "Research" option and that\'s it. It will approach your demand and give you a list of related results. You are able to click on a outcome and this will commence taking part in the recording. youtube downloader\nAble to click on a outcome and\nSo, just how to publish your chosen videos? Youtube . com is very popular online video discussing foundation that allows one to publish their video lessons. Uploading a relevant video online is an easy process, just select any video submit through your computer on your YouTube accounts webpage and it will surely begin posting the video. Nonetheless Vimeo will not offer any choice to down load a printed video that you will be seeing, you can easily take note of the site Link so that you can view it later, which seems handy for YouTube users.\nEverything you cannot upload? Nevertheless there are a few regulations or YouTube\'s terms of use that you have to comply with, specifically you happen to be unacceptable to upload any restricted content or erotic information. Nevertheless you can use it to showcase your products online.\nA few regulations or\nOnline video good quality once you upload Vimeo permits you to post all popular movie formats and produces good quality probable. Whenever you post a youtube video to Youtube . com, you ought to anticipate that high quality will slightly be changed, it is because YouTube optimizes the video for speedier packing. You can even add Hi-def or Hi-def video lessons nevertheless it will take much longer to weight once you observe it. Greater the high quality more slowly movie will load.\nYou upload Vimeo\nProbably the most well-known movie web sites online is You tube as well as for certain, you can find videos inside the web site you want to create you everywhere and adding it inside your PSP device might be what you need. However, YouTube video lessons will not be quickly down loadable. You might need a downloader to download the recording through the website and shop it inside your personal computer. video downloader\nAfter you have saved the recording, it may possibly not certainly be around the preferred format which can be legible along with your Playstation portable. For those who have saved a structure not in mp4, you may want to transform the submit with your Computer in to a Playstation portable-pleasant structure. You may need a video clip converter for this task, and when you have changed the video tutorials, anyone can down load these to your Playstation portable.\nWith your Playstation portable For those\nIn accessing, simply link up your Playstation portable to the laptop or computer by means of its cord, use the Universal serial bus setting and download the video lessons and music that you want to bring along.\nThat will help you look for a converter or a video downloader, specifically if you want to obtain video clips from Vimeo, be involved in forums and discover topics relevant to this. Certainly, you will also find a great deal of PSP movie information that may also assist you in making the best from your gadget and help you learn to see a number of videos on your gadget.\nAlso find a great deal of PSP\nYou can even get into membership web sites where PSP enthusiast collect and discuss information and facts and even more importantly, offers you the tools and software program that you will want to save music, videos and media records to your devices and permit you to enjoy the gizmo a lot more. Although these membership internet sites require only a minimum cost, it really is however vital that you are working with and creating dealings in a guaranteed and harmless internet site.\n- You can even get into membership websites.\n- One of the more preferred video clip sites on.\n- Video quality when you post Vimeo enables you.\n0 thoughts on “The Most Effective and Well-liked you tube downloader6675”'}
```

In [3]:
```python
Falcon['validation'][0]
```

Output:
```
{'Text': 'For some reason, removing motor grease from cotton-poly blend is perceived as one of the more difficult laundry problems out there. The truth is, that there are several methods that you can use to get rid of this type of stain, which are listed here. While some of these methods may seem a little strange, each and every one of them will work. All you need to do is be willing to try it. If you are hesitant about using any of these methods at all, be sure to test them out on a similar piece of fabric to see what the end result will be. If there is any damage to your particular piece of fabric, than do not use the method to happen to have a few white t-shirts, blouses, or button-up shirts, then chances are you know the pain of having to ...Discover More\nTablecloths are not cheap, and it is always a great idea to protect anything that is expensive. Cleaning tablecloths is ...Discover More\nWhile it can be annoying to find that your white apparel and linens have turned yellow in the laundry, it no longer needs ...Discover More\nFREE SERVICE: Get tips like this every week in Cleaning Tips from Tips.Net. Enter your address and click "Subscribe."\nView most recent newsletter.\n2015-08-29 08:54:35\nJune\nComing from a long line of mechanics, I\'ve always kept a bottle of LESTOIL around...works GREAT on auto grease, and cooking grease as well, just follow the directions on the bottle.\nFREE SERVICE: Get tips like this every week in Cleaning Tips from Tips.Net. Enter your address and click "Subscribe."\n(Your e-mail address is not shared with anyone, ever.)\nView the most recent newsletter.'}
```

While this may look like a lot, you're only really interested in the `text` field. What's cool about language modeling tasks is you don't need labels (also known as an unsupervised task) because the next word *is* the label.
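Since the next word is the label, the `labels` passed to a causal LM are simply the `input_ids` themselves; the model shifts them by one position internally when computing the loss. A minimal sketch (assumes the `tok` and `model` from the DistilGPT2 illustration earlier):

```python
enc = tok("Language modeling needs no separate labels.", return_tensors="pt")
out = model(**enc, labels=enc["input_ids"])  # labels == inputs; the shift happens inside
print(out.loss)  # average next-token cross-entropy over the sequence
```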
## Preprocess

In [ ]:
```python
# #@title
# from IPython.display import HTML

# HTML('<iframe width="560" height="315" src="https://www.youtube.com/embed/ma1TrR7gE7I?rel=0&controls=0&showinfo=0" frameborder="0" allowfullscreen></iframe>')
```

Output (saved from a previous run):
```
<IPython.core.display.HTML object>
```

The next step is to load a DistilGPT2 tokenizer to process the `text` subfield:

In [4]:
```python
from transformers import AutoTokenizer, GPT2TokenizerFast

# tokenizer = AutoTokenizer.from_pretrained("distilgpt2")

tokenizer = GPT2TokenizerFast.from_pretrained("Xenova/gpt-4")  # , cache_dir=cache_dir)
tokenizer.pad_token = tokenizer.eos_token
```

Output (stderr):
```
/usr/local/lib/python3.10/dist-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
  warnings.warn(
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
```
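Note that the cell above swaps in the `Xenova/gpt-4` (cl100k-style) tokenizer rather than the DistilGPT2 tokenizer the prose describes. A quick sanity check of what it produces (a sketch; the slice length is arbitrary):

```python
sample = Falcon["train"][0]["Text"][:200]
enc = tokenizer(sample)
print(len(enc["input_ids"]), enc["input_ids"][:10])
print(tokenizer.pad_token == tokenizer.eos_token)  # True after the assignment above
```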
354 |
+
{
|
355 |
+
"cell_type": "markdown",
|
356 |
+
"metadata": {},
|
357 |
+
"source": [
|
358 |
+
"You'll notice from the example above, the `text` field is actually nested inside `answers`. This means you'll need to\n",
|
359 |
+
"extract the `text` subfield from its nested structure with the [`flatten`](https://huggingface.co/docs/datasets/process.html#flatten) method:"
|
360 |
+
]
|
361 |
+
},
|
362 |
+
{
|
363 |
+
"cell_type": "code",
|
364 |
+
"execution_count": 5,
|
365 |
+
"metadata": {},
|
366 |
+
"outputs": [
|
367 |
+
{
|
368 |
+
"data": {
|
369 |
+
"text/plain": [
|
370 |
+
"{'Text': 'Allow me to clarify a genuine fast for amateur online users What exactly is Youtube . com? Youtube . com is probably the most in-demand web site on the web which allow you to view and publish video lessons for free. These are generally submitted by Vimeo members on this video discussing system. Yet another thing YouTube registration is provided for free so anyone can join, however account is not required for watching video lessons. In order to sometimes observe video clips or post your own video lessons so that you can show to your friends, loved ones as well as other Vimeo members. Once you get dependent at viewing video clip, it is possible to phone yourself a YouTuber!\\n- Everything you are unable to upload? Nonetheless there are some regulations or YouTube\\'s regards to use that you should.\\n- Observing a Vimeo movie is really simple, you just need to.\\nObserving a You tube movie is absolutely simple, you just need to variety your best song or television set plan from the research discipline click on \"Research\" option and that\\'s it. It will approach your demand and give you a list of related results. You are able to click on a outcome and this will commence taking part in the recording. youtube downloader\\nAble to click on a outcome and\\nSo, just how to publish your chosen videos? Youtube . com is very popular online video discussing foundation that allows one to publish their video lessons. Uploading a relevant video online is an easy process, just select any video submit through your computer on your YouTube accounts webpage and it will surely begin posting the video. Nonetheless Vimeo will not offer any choice to down load a printed video that you will be seeing, you can easily take note of the site Link so that you can view it later, which seems handy for YouTube users.\\nEverything you cannot upload? Nevertheless there are a few regulations or YouTube\\'s terms of use that you have to comply with, specifically you happen to be unacceptable to upload any restricted content or erotic information. Nevertheless you can use it to showcase your products online.\\nA few regulations or\\nOnline video good quality once you upload Vimeo permits you to post all popular movie formats and produces good quality probable. Whenever you post a youtube video to Youtube . com, you ought to anticipate that high quality will slightly be changed, it is because YouTube optimizes the video for speedier packing. You can even add Hi-def or Hi-def video lessons nevertheless it will take much longer to weight once you observe it. Greater the high quality more slowly movie will load.\\nYou upload Vimeo\\nProbably the most well-known movie web sites online is You tube as well as for certain, you can find videos inside the web site you want to create you everywhere and adding it inside your PSP device might be what you need. However, YouTube video lessons will not be quickly down loadable. You might need a downloader to download the recording through the website and shop it inside your personal computer. video downloader\\nAfter you have saved the recording, it may possibly not certainly be around the preferred format which can be legible along with your Playstation portable. For those who have saved a structure not in mp4, you may want to transform the submit with your Computer in to a Playstation portable-pleasant structure. 
You may need a video clip converter for this task, and when you have changed the video tutorials, anyone can down load these to your Playstation portable.\\nWith your Playstation portable For those\\nIn accessing, simply link up your Playstation portable to the laptop or computer by means of its cord, use the Universal serial bus setting and download the video lessons and music that you want to bring along.\\nThat will help you look for a converter or a video downloader, specifically if you want to obtain video clips from Vimeo, be involved in forums and discover topics relevant to this. Certainly, you will also find a great deal of PSP movie information that may also assist you in making the best from your gadget and help you learn to see a number of videos on your gadget.\\nAlso find a great deal of PSP\\nYou can even get into membership web sites where PSP enthusiast collect and discuss information and facts and even more importantly, offers you the tools and software program that you will want to save music, videos and media records to your devices and permit you to enjoy the gizmo a lot more. Although these membership internet sites require only a minimum cost, it really is however vital that you are working with and creating dealings in a guaranteed and harmless internet site.\\n- You can even get into membership websites.\\n- One of the more preferred video clip sites on.\\n- Video quality when you post Vimeo enables you.\\n0 thoughts on “The Most Effective and Well-liked you tube downloader6675”'}"
|
371 |
+
]
|
372 |
+
},
|
373 |
+
"execution_count": 5,
|
374 |
+
"metadata": {},
|
375 |
+
"output_type": "execute_result"
|
376 |
+
}
|
377 |
+
],
|
378 |
+
"source": [
|
379 |
+
"Falcon = Falcon.flatten()\n",
|
380 |
+
"Falcon[\"train\"][0]"
|
381 |
+
]
|
382 |
+
},
|
383 |
+
{
|
384 |
+
"cell_type": "markdown",
|
385 |
+
"metadata": {},
|
386 |
+
"source": [
|
387 |
+
"Each subfield is now a separate column as indicated by the `answers` prefix, and the `text` field is a list now. Instead\n",
|
388 |
+
"of tokenizing each sentence separately, convert the list to a string so you can jointly tokenize them.\n",
|
389 |
+
"\n",
|
390 |
+
"Here is a first preprocessing function to join the list of strings for each example and tokenize the result:"
|
391 |
+
]
|
392 |
+
},
|
393 |
+
{
|
394 |
+
"cell_type": "code",
|
395 |
+
"execution_count": 6,
|
396 |
+
"metadata": {},
|
397 |
+
"outputs": [],
|
398 |
+
"source": [
|
399 |
+
"def preprocess_function(examples):\n",
|
400 |
+
" return tokenizer([\" \".join(x) for x in examples[\"Text\"]])"
|
401 |
+
]
|
402 |
+
},
|
403 |
+
{
|
404 |
+
"cell_type": "markdown",
|
405 |
+
"metadata": {},
|
406 |
+
"source": [
|
407 |
+
"To apply this preprocessing function over the entire dataset, use the 🤗 Datasets [map](https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.map) method. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once, and increasing the number of processes with `num_proc`. Remove any columns you don't need:"
|
408 |
+
]
|
409 |
+
},
|
410 |
+
{
|
411 |
+
"cell_type": "code",
|
412 |
+
"execution_count": 7,
|
413 |
+
"metadata": {},
|
414 |
+
"outputs": [
|
415 |
+
{
|
416 |
+
"name": "stdout",
|
417 |
+
"output_type": "stream",
|
418 |
+
"text": [
|
419 |
+
"The OrderedVocab you are attempting to save contains holes for indices [100256, 100261, 100262, 100263, 100266, 100267, 100268, 100269, 100270, 100271, 100272, 100273, 100274, 100275], your vocabulary could be corrupted !\n"
|
420 |
+
]
|
421 |
+
},
|
422 |
+
{
|
423 |
+
"data": {
|
424 |
+
"application/vnd.jupyter.widget-view+json": {
|
425 |
+
"model_id": "51bff46d94664c468064b17d1a8bf1c0",
|
426 |
+
"version_major": 2,
|
427 |
+
"version_minor": 0
|
428 |
+
},
|
429 |
+
"text/plain": [
|
430 |
+
"Map (num_proc=4): 0%| | 0/20000 [00:00<?, ? examples/s]"
|
431 |
+
]
|
432 |
+
},
|
433 |
+
"metadata": {},
|
434 |
+
"output_type": "display_data"
|
435 |
+
},
|
436 |
+
{
|
437 |
+
"name": "stderr",
|
438 |
+
"output_type": "stream",
|
439 |
+
"text": [
|
440 |
+
"Token indices sequence length is longer than the specified maximum sequence length for this model (8569 > 8192). Running this sequence through the model will result in indexing errors\n",
|
441 |
+
"Token indices sequence length is longer than the specified maximum sequence length for this model (14224 > 8192). Running this sequence through the model will result in indexing errors\n",
|
442 |
+
"Token indices sequence length is longer than the specified maximum sequence length for this model (15104 > 8192). Running this sequence through the model will result in indexing errors\n",
|
443 |
+
"Token indices sequence length is longer than the specified maximum sequence length for this model (32874 > 8192). Running this sequence through the model will result in indexing errors\n"
|
444 |
+
]
|
445 |
+
},
|
446 |
+
{
|
447 |
+
"name": "stdout",
|
448 |
+
"output_type": "stream",
|
449 |
+
"text": [
|
450 |
+
"The OrderedVocab you are attempting to save contains holes for indices [100256, 100261, 100262, 100263, 100266, 100267, 100268, 100269, 100270, 100271, 100272, 100273, 100274, 100275], your vocabulary could be corrupted !\n"
|
451 |
+
]
|
452 |
+
},
|
453 |
+
{
|
454 |
+
"data": {
|
455 |
+
"application/vnd.jupyter.widget-view+json": {
|
456 |
+
"model_id": "5a093fd9868042a9ac76ed1c141711a7",
|
457 |
+
"version_major": 2,
|
458 |
+
"version_minor": 0
|
459 |
+
},
|
460 |
+
"text/plain": [
|
461 |
+
"Map (num_proc=4): 0%| | 0/2000 [00:00<?, ? examples/s]"
|
462 |
+
]
|
463 |
+
},
|
464 |
+
"metadata": {},
|
465 |
+
"output_type": "display_data"
|
466 |
+
},
|
467 |
+
{
|
468 |
+
"name": "stderr",
|
469 |
+
"output_type": "stream",
|
470 |
+
"text": [
|
471 |
+
"Token indices sequence length is longer than the specified maximum sequence length for this model (8414 > 8192). Running this sequence through the model will result in indexing errors\n",
|
472 |
+
"Token indices sequence length is longer than the specified maximum sequence length for this model (11892 > 8192). Running this sequence through the model will result in indexing errors\n",
|
473 |
+
"Token indices sequence length is longer than the specified maximum sequence length for this model (22303 > 8192). Running this sequence through the model will result in indexing errors\n",
|
474 |
+
"Token indices sequence length is longer than the specified maximum sequence length for this model (12749 > 8192). Running this sequence through the model will result in indexing errors\n"
|
475 |
+
]
|
476 |
+
}
|
477 |
+
],
|
478 |
+
"source": [
|
479 |
+
"tokenized_Falcon = Falcon.map(\n",
|
480 |
+
" preprocess_function,\n",
|
481 |
+
" batched=True,\n",
|
482 |
+
" num_proc=4,\n",
|
483 |
+
" remove_columns=Falcon[\"train\"].column_names,\n",
|
484 |
+
")"
|
485 |
+
]
|
486 |
+
},
|
487 |
+
{
|
488 |
+
"cell_type": "markdown",
|
489 |
+
"metadata": {},
|
490 |
+
"source": [
|
491 |
+
"This dataset contains the token sequences, but some of these are longer than the maximum input length for the model.\n",
|
492 |
+
"\n",
|
493 |
+
"You can now use a second preprocessing function to\n",
|
494 |
+
"- concatenate all the sequences\n",
|
495 |
+
"- split the concatenated sequences into shorter chunks defined by `block_size`, which should be both shorter than the maximum input length and short enough for your GPU RAM."
|
496 |
+
]
|
497 |
+
},
|
498 |
+
{
|
499 |
+
"cell_type": "code",
|
500 |
+
"execution_count": 8,
|
501 |
+
"metadata": {},
|
502 |
+
"outputs": [],
|
503 |
+
"source": [
|
504 |
+
"block_size = 1048\n",
|
505 |
+
"\n",
|
506 |
+
"\n",
|
507 |
+
"def group_texts(examples):\n",
|
508 |
+
" # Concatenate all texts.\n",
|
509 |
+
" concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}\n",
|
510 |
+
" total_length = len(concatenated_examples[list(examples.keys())[0]])\n",
|
511 |
+
" # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can\n",
|
512 |
+
" # customize this part to your needs.\n",
|
513 |
+
" if total_length >= block_size:\n",
|
514 |
+
" total_length = (total_length // block_size) * block_size\n",
|
515 |
+
" # Split by chunks of block_size.\n",
|
516 |
+
" result = {\n",
|
517 |
+
" k: [t[i : i + block_size] for i in range(0, total_length, block_size)]\n",
|
518 |
+
" for k, t in concatenated_examples.items()\n",
|
519 |
+
" }\n",
|
520 |
+
" result[\"labels\"] = result[\"input_ids\"].copy()\n",
|
521 |
+
" return result"
|
522 |
+
]
|
523 |
+
},
|
524 |
+
{
|
525 |
+
"cell_type": "markdown",
|
526 |
+
"metadata": {},
|
527 |
+
"source": [
|
528 |
+
"Apply the `group_texts` function over the entire dataset:"
|
529 |
+
]
|
530 |
+
},
|
531 |
+
{
|
532 |
+
"cell_type": "code",
|
533 |
+
"execution_count": 9,
|
534 |
+
"metadata": {},
|
535 |
+
"outputs": [
|
536 |
+
{
|
537 |
+
"data": {
|
538 |
+
"application/vnd.jupyter.widget-view+json": {
|
539 |
+
"model_id": "6134c09493054ce3940da711dc2e965e",
|
540 |
+
"version_major": 2,
|
541 |
+
"version_minor": 0
|
542 |
+
},
|
543 |
+
"text/plain": [
|
544 |
+
"Map (num_proc=4): 0%| | 0/20000 [00:00<?, ? examples/s]"
|
545 |
+
]
|
546 |
+
},
|
547 |
+
"metadata": {},
|
548 |
+
"output_type": "display_data"
|
549 |
+
},
|
550 |
+
{
|
551 |
+
"data": {
|
552 |
+
"application/vnd.jupyter.widget-view+json": {
|
553 |
+
"model_id": "bd3f26e9c76f42f1827aa11aa45416df",
|
554 |
+
"version_major": 2,
|
555 |
+
"version_minor": 0
|
556 |
+
},
|
557 |
+
"text/plain": [
|
558 |
+
"Map (num_proc=4): 0%| | 0/2000 [00:00<?, ? examples/s]"
|
559 |
+
]
|
560 |
+
},
|
561 |
+
"metadata": {},
|
562 |
+
"output_type": "display_data"
|
563 |
+
}
|
564 |
+
],
|
565 |
+
"source": [
|
566 |
+
"lm_dataset = tokenized_Falcon.map(group_texts, batched=True, num_proc=4)"
|
567 |
+
]
|
568 |
+
},
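To make the chunking logic concrete, here is a toy walkthrough of what `group_texts` computes, with an illustrative block size of 3 (the notebook itself uses `block_size = 1048`):

```python
# Toy illustration of the concatenate-then-chunk logic in group_texts.
examples = {"input_ids": [[1, 2], [3, 4, 5], [6, 7]]}
concatenated = sum(examples["input_ids"], [])   # [1, 2, 3, 4, 5, 6, 7]
block = 3
total = (len(concatenated) // block) * block    # 6; the trailing token 7 is dropped
chunks = [concatenated[i:i + block] for i in range(0, total, block)]
print(chunks)                                   # [[1, 2, 3], [4, 5, 6]]
```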
|
569 |
+
{
|
570 |
+
"cell_type": "markdown",
|
571 |
+
"metadata": {},
|
572 |
+
"source": [
|
573 |
+
"Now create a batch of examples using [DataCollatorForLanguageModeling](https://huggingface.co/docs/transformers/main/en/main_classes/data_collator#transformers.DataCollatorForLanguageModeling). It's more efficient to *dynamically pad* the\n",
|
574 |
+
"sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length.\n",
|
575 |
+
"\n",
|
576 |
+
"Use the end-of-sequence token as the padding token and set `mlm=False`. This will use the inputs as labels shifted to the right by one element:"
|
577 |
+
]
|
578 |
+
},
|
579 |
+
{
|
580 |
+
"cell_type": "code",
|
581 |
+
"execution_count": 10,
|
582 |
+
"metadata": {},
|
583 |
+
"outputs": [],
|
584 |
+
"source": [
|
585 |
+
"from transformers import DataCollatorForLanguageModeling\n",
|
586 |
+
"\n",
|
587 |
+
"tokenizer.pad_token = tokenizer.eos_token\n",
|
588 |
+
"data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)"
|
589 |
+
]
|
590 |
+
},
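As a quick sanity check, the collator with `mlm=False` copies `input_ids` into `labels` and masks padded positions with -100; the actual one-position shift happens inside the model's forward pass. A minimal sketch, assuming the `tokenizer` and `data_collator` defined above:

```python
features = [tokenizer("hello world"), tokenizer("hi")]
batch = data_collator(features)
print(batch["input_ids"])  # the shorter row is padded with the EOS id
print(batch["labels"])     # same ids, with padded positions replaced by -100
```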
|
591 |
+
{
|
592 |
+
"cell_type": "markdown",
|
593 |
+
"metadata": {},
|
594 |
+
"source": [
|
595 |
+
"## Train"
|
596 |
+
]
|
597 |
+
},
|
598 |
+
{
|
599 |
+
"cell_type": "markdown",
|
600 |
+
"metadata": {},
|
601 |
+
"source": [
|
602 |
+
"<Tip>\n",
|
603 |
+
"\n",
|
604 |
+
"If you aren't familiar with finetuning a model with the [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer), take a look at the [basic tutorial](https://huggingface.co/docs/transformers/main/en/tasks/../training#train-with-pytorch-trainer)!\n",
|
605 |
+
"\n",
|
606 |
+
"</Tip>\n",
|
607 |
+
"\n",
|
608 |
+
"You're ready to start training your model now! Load DistilGPT2 with [AutoModelForCausalLM](https://huggingface.co/docs/transformers/main/en/model_doc/auto#transformers.AutoModelForCausalLM):"
|
609 |
+
]
|
610 |
+
},
|
611 |
+
{
|
612 |
+
"cell_type": "code",
|
613 |
+
"execution_count": 11,
|
614 |
+
"metadata": {},
|
615 |
+
"outputs": [
|
616 |
+
{
|
617 |
+
"name": "stderr",
|
618 |
+
"output_type": "stream",
|
619 |
+
"text": [
|
620 |
+
"/usr/local/lib/python3.10/dist-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
|
621 |
+
" warnings.warn(\n"
|
622 |
+
]
|
623 |
+
},
|
624 |
+
{
|
625 |
+
"data": {
|
626 |
+
"application/vnd.jupyter.widget-view+json": {
|
627 |
+
"model_id": "f55ae69743a74a08943641e2da03e791",
|
628 |
+
"version_major": 2,
|
629 |
+
"version_minor": 0
|
630 |
+
},
|
631 |
+
"text/plain": [
|
632 |
+
"Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s]"
|
633 |
+
]
|
634 |
+
},
|
635 |
+
"metadata": {},
|
636 |
+
"output_type": "display_data"
|
637 |
+
}
|
638 |
+
],
|
639 |
+
"source": [
|
640 |
+
"from transformers import AutoModelForCausalLM, TrainingArguments, Trainer\n",
|
641 |
+
"import torch\n",
|
642 |
+
"model = AutoModelForCausalLM.from_pretrained(\"tensorplex-labs/pretraining-sn9-7B-5\", torch_dtype=torch.bfloat16) "
|
643 |
+
]
|
644 |
+
},
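Loading in `torch.bfloat16` halves the weight memory relative to fp32. A rough back-of-the-envelope check (the ~7B parameter count is an assumption read off the checkpoint name):

```python
# Weight memory only; activations, gradients, and optimizer state come on top.
n_params = 7e9
print(f"{n_params * 2 / 1e9:.0f} GB in bf16 vs {n_params * 4 / 1e9:.0f} GB in fp32")  # 14 GB vs 28 GB
```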
|
645 |
+
{
|
646 |
+
"cell_type": "markdown",
|
647 |
+
"metadata": {},
|
648 |
+
"source": [
|
649 |
+
"At this point, only three steps remain:\n",
|
650 |
+
"\n",
|
651 |
+
"1. Define your training hyperparameters in [TrainingArguments](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments). The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model).\n",
|
652 |
+
"2. Pass the training arguments to [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer) along with the model, datasets, and data collator.\n",
|
653 |
+
"3. Call [train()](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.train) to finetune your model."
|
654 |
+
]
|
655 |
+
},
|
656 |
+
{
|
657 |
+
"cell_type": "code",
|
658 |
+
"execution_count": 28,
|
659 |
+
"metadata": {},
|
660 |
+
"outputs": [],
|
661 |
+
"source": [
|
662 |
+
"import torch\n",
|
663 |
+
"torch.cuda.empty_cache()\n"
|
664 |
+
]
|
665 |
+
},
|
666 |
+
{
|
667 |
+
"cell_type": "code",
|
668 |
+
"execution_count": 14,
|
669 |
+
"metadata": {},
|
670 |
+
"outputs": [],
|
671 |
+
"source": [
|
672 |
+
"import torch\n",
|
673 |
+
"import gc\n",
|
674 |
+
"\n",
|
675 |
+
"# del tensor_name # Delete the tensor\n",
|
676 |
+
"gc.collect() # Collect garbage\n",
|
677 |
+
"torch.cuda.empty_cache() # Clear cache"
|
678 |
+
]
|
679 |
+
},
|
680 |
+
{
|
681 |
+
"cell_type": "code",
|
682 |
+
"execution_count": 21,
|
683 |
+
"metadata": {},
|
684 |
+
"outputs": [],
|
685 |
+
"source": [
|
686 |
+
"torch.cuda.empty_cache()"
|
687 |
+
]
|
688 |
+
},
|
689 |
+
{
|
690 |
+
"cell_type": "code",
|
691 |
+
"execution_count": 20,
|
692 |
+
"metadata": {},
|
693 |
+
"outputs": [
|
694 |
+
{
|
695 |
+
"data": {
|
696 |
+
"text/plain": [
|
697 |
+
"<torch.autograd.grad_mode.no_grad at 0x7f41880db6d0>"
|
698 |
+
]
|
699 |
+
},
|
700 |
+
"execution_count": 20,
|
701 |
+
"metadata": {},
|
702 |
+
"output_type": "execute_result"
|
703 |
+
}
|
704 |
+
],
|
705 |
+
"source": [
|
706 |
+
"torch.no_grad()"
|
707 |
+
]
|
708 |
+
},
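Note that a bare `torch.no_grad()` expression, as above, only constructs the context manager (hence the `<torch.autograd.grad_mode.no_grad at ...>` repr in the next cell) and does not disable gradient tracking by itself. To actually disable it, enter the context:

```python
import torch

with torch.no_grad():
    x = torch.ones(2, 2, requires_grad=True)
    y = x * 2
    print(y.requires_grad)  # False: no graph is recorded inside the block
```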
|
709 |
+
{
|
710 |
+
"cell_type": "code",
|
711 |
+
"execution_count": 12,
|
712 |
+
"metadata": {},
|
713 |
+
"outputs": [
|
714 |
+
{
|
715 |
+
"data": {
|
716 |
+
"text/plain": [
|
717 |
+
"LlamaForCausalLM(\n",
|
718 |
+
" (model): LlamaModel(\n",
|
719 |
+
" (embed_tokens): Embedding(100288, 4096)\n",
|
720 |
+
" (layers): ModuleList(\n",
|
721 |
+
" (0-29): 30 x LlamaDecoderLayer(\n",
|
722 |
+
" (self_attn): LlamaSdpaAttention(\n",
|
723 |
+
" (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n",
|
724 |
+
" (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n",
|
725 |
+
" (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n",
|
726 |
+
" (o_proj): Linear(in_features=4096, out_features=4096, bias=False)\n",
|
727 |
+
" (rotary_emb): LlamaRotaryEmbedding()\n",
|
728 |
+
" )\n",
|
729 |
+
" (mlp): LlamaMLP(\n",
|
730 |
+
" (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)\n",
|
731 |
+
" (up_proj): Linear(in_features=4096, out_features=11008, bias=False)\n",
|
732 |
+
" (down_proj): Linear(in_features=11008, out_features=4096, bias=False)\n",
|
733 |
+
" (act_fn): SiLU()\n",
|
734 |
+
" )\n",
|
735 |
+
" (input_layernorm): LlamaRMSNorm()\n",
|
736 |
+
" (post_attention_layernorm): LlamaRMSNorm()\n",
|
737 |
+
" )\n",
|
738 |
+
" )\n",
|
739 |
+
" (norm): LlamaRMSNorm()\n",
|
740 |
+
" )\n",
|
741 |
+
" (lm_head): Linear(in_features=4096, out_features=100288, bias=False)\n",
|
742 |
+
")"
|
743 |
+
]
|
744 |
+
},
|
745 |
+
"execution_count": 12,
|
746 |
+
"metadata": {},
|
747 |
+
"output_type": "execute_result"
|
748 |
+
}
|
749 |
+
],
|
750 |
+
"source": [
|
751 |
+
"model.to('cuda')"
|
752 |
+
]
|
753 |
+
},
|
754 |
+
{
|
755 |
+
"cell_type": "code",
|
756 |
+
"execution_count": 14,
|
757 |
+
"metadata": {},
|
758 |
+
"outputs": [],
|
759 |
+
"source": [
|
760 |
+
"training_args = TrainingArguments(\n",
|
761 |
+
" output_dir=\"Fine-Tuned-S9\",\n",
|
762 |
+
" bf16=True,\n",
|
763 |
+
" # evaluation_strategy=\"epoch\",\n",
|
764 |
+
" evaluation_strategy=\"steps\",\n",
|
765 |
+
" learning_rate=2e-5,\n",
|
766 |
+
" weight_decay=0.01,\n",
|
767 |
+
" num_train_epochs=1,\n",
|
768 |
+
" per_device_train_batch_size=2,\n",
|
769 |
+
" per_device_eval_batch_size=2,\n",
|
770 |
+
" # lr_scheduler_type = 'cosine',\n",
|
771 |
+
" push_to_hub=False,\n",
|
772 |
+
" save_total_limit = 2\n",
|
773 |
+
" # save_strategy = “no”\n",
|
774 |
+
" load_best_model_at_end=False\n",
|
775 |
+
")\n",
|
776 |
+
"\n",
|
777 |
+
"trainer = Trainer(\n",
|
778 |
+
" model=model,\n",
|
779 |
+
" args=training_args,\n",
|
780 |
+
" train_dataset=lm_dataset[\"train\"],\n",
|
781 |
+
" eval_dataset=lm_dataset[\"validation\"],\n",
|
782 |
+
" # eval_dataset=lm_dataset[\"test\"],\n",
|
783 |
+
" data_collator=data_collator,\n",
|
784 |
+
")\n",
|
785 |
+
"\n",
|
786 |
+
"# trainer.train()"
|
787 |
+
]
|
788 |
+
},
|
789 |
+
{
|
790 |
+
"cell_type": "code",
|
791 |
+
"execution_count": 7,
|
792 |
+
"metadata": {},
|
793 |
+
"outputs": [],
|
794 |
+
"source": [
|
795 |
+
"trainer.train()"
|
796 |
+
]
|
797 |
+
},
|
798 |
+
{
|
799 |
+
"cell_type": "markdown",
|
800 |
+
"metadata": {},
|
801 |
+
"source": [
|
802 |
+
"Once training is completed, use the [evaluate()](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.evaluate) method to evaluate your model and get its perplexity:"
|
803 |
+
]
|
804 |
+
},
|
805 |
+
{
|
806 |
+
"cell_type": "code",
|
807 |
+
"execution_count": 17,
|
808 |
+
"metadata": {},
|
809 |
+
"outputs": [
|
810 |
+
{
|
811 |
+
"name": "stdout",
|
812 |
+
"output_type": "stream",
|
813 |
+
"text": [
|
814 |
+
"Perplexity: 2.21\n"
|
815 |
+
]
|
816 |
+
}
|
817 |
+
],
|
818 |
+
"source": [
|
819 |
+
"import math\n",
|
820 |
+
"\n",
|
821 |
+
"eval_results = trainer.evaluate()\n",
|
822 |
+
"print(f\"Perplexity: {math.exp(eval_results['eval_loss']):.2f}\")"
|
823 |
+
]
|
824 |
+
},
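Perplexity is the exponential of the mean cross-entropy loss, so the printed value can be checked by hand: a perplexity of 2.21 corresponds to an `eval_loss` of ln(2.21) ≈ 0.79:

```python
import math

print(math.log(2.21))   # ≈ 0.793, the eval_loss implied by the report above
print(math.exp(0.793))  # ≈ 2.21, recovering the perplexity
```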
|
825 |
+
{
|
826 |
+
"cell_type": "markdown",
|
827 |
+
"metadata": {},
|
828 |
+
"source": [
|
829 |
+
"Then share your model to the Hub with the [push_to_hub()](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.push_to_hub) method so everyone can use your model:"
|
830 |
+
]
|
831 |
+
},
|
832 |
+
{
|
833 |
+
"cell_type": "code",
|
834 |
+
"execution_count": null,
|
835 |
+
"metadata": {},
|
836 |
+
"outputs": [],
|
837 |
+
"source": [
|
838 |
+
"# trainer.push_to_hub()"
|
839 |
+
]
|
840 |
+
},
|
841 |
+
{
|
842 |
+
"cell_type": "markdown",
|
843 |
+
"metadata": {},
|
844 |
+
"source": [
|
845 |
+
"<Tip>\n",
|
846 |
+
"\n",
|
847 |
+
"For a more in-depth example of how to finetune a model for causal language modeling, take a look at the corresponding\n",
|
848 |
+
"[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)\n",
|
849 |
+
"or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb).\n",
|
850 |
+
"\n",
|
851 |
+
"</Tip>"
|
852 |
+
]
|
853 |
+
},
|
854 |
+
{
|
855 |
+
"cell_type": "markdown",
|
856 |
+
"metadata": {},
|
857 |
+
"source": [
|
858 |
+
"## Inference"
|
859 |
+
]
|
860 |
+
},
|
861 |
+
{
|
862 |
+
"cell_type": "markdown",
|
863 |
+
"metadata": {},
|
864 |
+
"source": [
|
865 |
+
"Great, now that you've finetuned a model, you can use it for inference!\n",
|
866 |
+
"\n",
|
867 |
+
"Come up with a prompt you'd like to generate text from:"
|
868 |
+
]
|
869 |
+
},
|
870 |
+
{
|
871 |
+
"cell_type": "code",
|
872 |
+
"execution_count": 2,
|
873 |
+
"metadata": {},
|
874 |
+
"outputs": [],
|
875 |
+
"source": [
|
876 |
+
"# prompt = \"Somatic hypermutation allows the immune system to\""
|
877 |
+
]
|
878 |
+
},
|
879 |
+
{
|
880 |
+
"cell_type": "markdown",
|
881 |
+
"metadata": {},
|
882 |
+
"source": [
|
883 |
+
"The simplest way to try out your finetuned model for inference is to use it in a [pipeline()](https://huggingface.co/docs/transformers/main/en/main_classes/pipelines#transformers.pipeline). Instantiate a `pipeline` for text generation with your model, and pass your text to it:"
|
884 |
+
]
|
885 |
+
},
|
886 |
+
{
|
887 |
+
"cell_type": "code",
|
888 |
+
"execution_count": 20,
|
889 |
+
"metadata": {},
|
890 |
+
"outputs": [
|
891 |
+
{
|
892 |
+
"ename": "ValueError",
|
893 |
+
"evalue": "Could not load model Fine-Tuned-S9/checkpoint-4000 with any of the following classes: (<class 'transformers.models.auto.modeling_auto.AutoModelForCausalLM'>, <class 'transformers.models.llama.modeling_llama.LlamaForCausalLM'>). See the original errors:\n\nwhile loading with AutoModelForCausalLM, an error is thrown:\nTraceback (most recent call last):\n File \"/usr/local/lib/python3.10/dist-packages/transformers/pipelines/base.py\", line 283, in infer_framework_load_model\n model = model_class.from_pretrained(model, **kwargs)\n File \"/usr/local/lib/python3.10/dist-packages/transformers/models/auto/auto_factory.py\", line 563, in from_pretrained\n return model_class.from_pretrained(\n File \"/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py\", line 3260, in from_pretrained\n raise EnvironmentError(\nOSError: Error no file named pytorch_model.bin, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory Fine-Tuned-S9/checkpoint-4000.\n\nwhile loading with LlamaForCausalLM, an error is thrown:\nTraceback (most recent call last):\n File \"/usr/local/lib/python3.10/dist-packages/transformers/pipelines/base.py\", line 283, in infer_framework_load_model\n model = model_class.from_pretrained(model, **kwargs)\n File \"/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py\", line 3260, in from_pretrained\n raise EnvironmentError(\nOSError: Error no file named pytorch_model.bin, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory Fine-Tuned-S9/checkpoint-4000.\n\n\n",
|
894 |
+
"output_type": "error",
|
895 |
+
"traceback": [
|
896 |
+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
897 |
+
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
|
898 |
+
"Cell \u001b[0;32mIn[20], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtransformers\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m pipeline\n\u001b[1;32m 2\u001b[0m \u001b[38;5;66;03m# checkpoint-4000\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m generator \u001b[38;5;241m=\u001b[39m \u001b[43mpipeline\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mtext-generation\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mFine-Tuned-S9/checkpoint-4000\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 4\u001b[0m generator(prompt)\n",
|
899 |
+
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/pipelines/__init__.py:906\u001b[0m, in \u001b[0;36mpipeline\u001b[0;34m(task, model, config, tokenizer, feature_extractor, image_processor, framework, revision, use_fast, token, device, device_map, torch_dtype, trust_remote_code, model_kwargs, pipeline_class, **kwargs)\u001b[0m\n\u001b[1;32m 904\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(model, \u001b[38;5;28mstr\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m framework \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 905\u001b[0m model_classes \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtf\u001b[39m\u001b[38;5;124m\"\u001b[39m: targeted_task[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtf\u001b[39m\u001b[38;5;124m\"\u001b[39m], \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpt\u001b[39m\u001b[38;5;124m\"\u001b[39m: targeted_task[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpt\u001b[39m\u001b[38;5;124m\"\u001b[39m]}\n\u001b[0;32m--> 906\u001b[0m framework, model \u001b[38;5;241m=\u001b[39m \u001b[43minfer_framework_load_model\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 907\u001b[0m \u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 908\u001b[0m \u001b[43m \u001b[49m\u001b[43mmodel_classes\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodel_classes\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 909\u001b[0m \u001b[43m \u001b[49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 910\u001b[0m \u001b[43m \u001b[49m\u001b[43mframework\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mframework\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 911\u001b[0m \u001b[43m \u001b[49m\u001b[43mtask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 912\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mhub_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 913\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mmodel_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 914\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 916\u001b[0m model_config \u001b[38;5;241m=\u001b[39m model\u001b[38;5;241m.\u001b[39mconfig\n\u001b[1;32m 917\u001b[0m hub_kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_commit_hash\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m model\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39m_commit_hash\n",
|
900 |
+
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/pipelines/base.py:296\u001b[0m, in \u001b[0;36minfer_framework_load_model\u001b[0;34m(model, config, model_classes, task, framework, **model_kwargs)\u001b[0m\n\u001b[1;32m 294\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m class_name, trace \u001b[38;5;129;01min\u001b[39;00m all_traceback\u001b[38;5;241m.\u001b[39mitems():\n\u001b[1;32m 295\u001b[0m error \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwhile loading with \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mclass_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, an error is thrown:\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;132;01m{\u001b[39;00mtrace\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m--> 296\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 297\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCould not load model \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmodel\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m with any of the following classes: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mclass_tuple\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m. See the original errors:\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;132;01m{\u001b[39;00merror\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 298\u001b[0m )\n\u001b[1;32m 300\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m framework \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 301\u001b[0m framework \u001b[38;5;241m=\u001b[39m infer_framework(model\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m)\n",
|
901 |
+
"\u001b[0;31mValueError\u001b[0m: Could not load model Fine-Tuned-S9/checkpoint-4000 with any of the following classes: (<class 'transformers.models.auto.modeling_auto.AutoModelForCausalLM'>, <class 'transformers.models.llama.modeling_llama.LlamaForCausalLM'>). See the original errors:\n\nwhile loading with AutoModelForCausalLM, an error is thrown:\nTraceback (most recent call last):\n File \"/usr/local/lib/python3.10/dist-packages/transformers/pipelines/base.py\", line 283, in infer_framework_load_model\n model = model_class.from_pretrained(model, **kwargs)\n File \"/usr/local/lib/python3.10/dist-packages/transformers/models/auto/auto_factory.py\", line 563, in from_pretrained\n return model_class.from_pretrained(\n File \"/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py\", line 3260, in from_pretrained\n raise EnvironmentError(\nOSError: Error no file named pytorch_model.bin, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory Fine-Tuned-S9/checkpoint-4000.\n\nwhile loading with LlamaForCausalLM, an error is thrown:\nTraceback (most recent call last):\n File \"/usr/local/lib/python3.10/dist-packages/transformers/pipelines/base.py\", line 283, in infer_framework_load_model\n model = model_class.from_pretrained(model, **kwargs)\n File \"/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py\", line 3260, in from_pretrained\n raise EnvironmentError(\nOSError: Error no file named pytorch_model.bin, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory Fine-Tuned-S9/checkpoint-4000.\n\n\n"
|
902 |
+
]
|
903 |
+
}
|
904 |
+
],
|
905 |
+
"source": [
|
906 |
+
"# from transformers import pipeline\n",
|
907 |
+
"# # checkpoint-4000\n",
|
908 |
+
"# generator = pipeline(\"text-generation\", model=\"Fine-Tuned-S9/checkpoint-4000\")\n",
|
909 |
+
"# generator(prompt)"
|
910 |
+
]
|
911 |
+
},
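The `ValueError` above is raised because the loader only looks for `pytorch_model.bin` (and the TF/Flax equivalents); one plausible cause is that the checkpoint was written as safetensors shards the installed `transformers` version does not pick up, another is that `checkpoint-4000` holds only trainer state. A hedged workaround sketch (the checkpoint path and the `Xenova/gpt-4` tokenizer are taken from this notebook; re-saving is an assumption about intent):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# If the checkpoint directory lacks loadable weights, re-save the in-memory
# model first, e.g. trainer.save_model("Fine-Tuned-S9/final").
ckpt = "Fine-Tuned-S9/checkpoint-4000"
model = AutoModelForCausalLM.from_pretrained(ckpt)
tok = AutoTokenizer.from_pretrained("Xenova/gpt-4")
generator = pipeline("text-generation", model=model, tokenizer=tok)
```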
|
912 |
+
{
|
913 |
+
"cell_type": "markdown",
|
914 |
+
"metadata": {},
|
915 |
+
"source": [
|
916 |
+
"Tokenize the text and return the `input_ids` as PyTorch tensors:"
|
917 |
+
]
|
918 |
+
},
|
919 |
+
{
|
920 |
+
"cell_type": "code",
|
921 |
+
"execution_count": 3,
|
922 |
+
"metadata": {},
|
923 |
+
"outputs": [
|
924 |
+
{
|
925 |
+
"name": "stderr",
|
926 |
+
"output_type": "stream",
|
927 |
+
"text": [
|
928 |
+
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
|
929 |
+
]
|
930 |
+
}
|
931 |
+
],
|
932 |
+
"source": [
|
933 |
+
"# from transformers import AutoTokenizer\n",
|
934 |
+
"\n",
|
935 |
+
"# tokenizer = AutoTokenizer.from_pretrained(\"Xenova/gpt-4\")\n",
|
936 |
+
"# inputs = tokenizer(prompt, return_tensors=\"pt\").input_ids"
|
937 |
+
]
|
938 |
+
},
|
939 |
+
{
|
940 |
+
"cell_type": "markdown",
|
941 |
+
"metadata": {},
|
942 |
+
"source": [
|
943 |
+
"Use the [generate()](https://huggingface.co/docs/transformers/main/en/main_classes/text_generation#transformers.GenerationMixin.generate) method to generate text.\n",
|
944 |
+
"For more details about the different text generation strategies and parameters for controlling generation, check out the [Text generation strategies](https://huggingface.co/docs/transformers/main/en/tasks/../generation_strategies) page."
|
945 |
+
]
|
946 |
+
},
|
947 |
+
{
|
948 |
+
"cell_type": "code",
|
949 |
+
"execution_count": 4,
|
950 |
+
"metadata": {},
|
951 |
+
"outputs": [
|
952 |
+
{
|
953 |
+
"data": {
|
954 |
+
"application/vnd.jupyter.widget-view+json": {
|
955 |
+
"model_id": "7ba147780e8548d28a00a655e81e588a",
|
956 |
+
"version_major": 2,
|
957 |
+
"version_minor": 0
|
958 |
+
},
|
959 |
+
"text/plain": [
|
960 |
+
"config.json: 0%| | 0.00/688 [00:00<?, ?B/s]"
|
961 |
+
]
|
962 |
+
},
|
963 |
+
"metadata": {},
|
964 |
+
"output_type": "display_data"
|
965 |
+
},
|
966 |
+
{
|
967 |
+
"name": "stderr",
|
968 |
+
"output_type": "stream",
|
969 |
+
"text": [
|
970 |
+
"/usr/local/lib/python3.10/dist-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
|
971 |
+
" warnings.warn(\n"
|
972 |
+
]
|
973 |
+
},
|
974 |
+
{
|
975 |
+
"data": {
|
976 |
+
"application/vnd.jupyter.widget-view+json": {
|
977 |
+
"model_id": "04e2f536d4d1492bbb4dcf72abbf2cc3",
|
978 |
+
"version_major": 2,
|
979 |
+
"version_minor": 0
|
980 |
+
},
|
981 |
+
"text/plain": [
|
982 |
+
"model.safetensors.index.json: 0%| | 0.00/22.5k [00:00<?, ?B/s]"
|
983 |
+
]
|
984 |
+
},
|
985 |
+
"metadata": {},
|
986 |
+
"output_type": "display_data"
|
987 |
+
},
|
988 |
+
{
|
989 |
+
"data": {
|
990 |
+
"application/vnd.jupyter.widget-view+json": {
|
991 |
+
"model_id": "df7e14c799c0457f8422442a065f3b03",
|
992 |
+
"version_major": 2,
|
993 |
+
"version_minor": 0
|
994 |
+
},
|
995 |
+
"text/plain": [
|
996 |
+
"Downloading shards: 0%| | 0/3 [00:00<?, ?it/s]"
|
997 |
+
]
|
998 |
+
},
|
999 |
+
"metadata": {},
|
1000 |
+
"output_type": "display_data"
|
1001 |
+
},
|
1002 |
+
{
|
1003 |
+
"data": {
|
1004 |
+
"application/vnd.jupyter.widget-view+json": {
|
1005 |
+
"model_id": "ee74102a34694e6cb57a00210d34cf19",
|
1006 |
+
"version_major": 2,
|
1007 |
+
"version_minor": 0
|
1008 |
+
},
|
1009 |
+
"text/plain": [
|
1010 |
+
"model-00001-of-00003.safetensors: 0%| | 0.00/4.97G [00:00<?, ?B/s]"
|
1011 |
+
]
|
1012 |
+
},
|
1013 |
+
"metadata": {},
|
1014 |
+
"output_type": "display_data"
|
1015 |
+
},
|
1016 |
+
{
|
1017 |
+
"data": {
|
1018 |
+
"application/vnd.jupyter.widget-view+json": {
|
1019 |
+
"model_id": "978d214714044affb97e1b31ab6deafc",
|
1020 |
+
"version_major": 2,
|
1021 |
+
"version_minor": 0
|
1022 |
+
},
|
1023 |
+
"text/plain": [
|
1024 |
+
"model-00002-of-00003.safetensors: 0%| | 0.00/4.98G [00:00<?, ?B/s]"
|
1025 |
+
]
|
1026 |
+
},
|
1027 |
+
"metadata": {},
|
1028 |
+
"output_type": "display_data"
|
1029 |
+
},
|
1030 |
+
{
|
1031 |
+
"data": {
|
1032 |
+
"application/vnd.jupyter.widget-view+json": {
|
1033 |
+
"model_id": "0a2fb5b3f2ec4e3e8d7bc9db54a0635e",
|
1034 |
+
"version_major": 2,
|
1035 |
+
"version_minor": 0
|
1036 |
+
},
|
1037 |
+
"text/plain": [
|
1038 |
+
"model-00003-of-00003.safetensors: 0%| | 0.00/3.84G [00:00<?, ?B/s]"
|
1039 |
+
]
|
1040 |
+
},
|
1041 |
+
"metadata": {},
|
1042 |
+
"output_type": "display_data"
|
1043 |
+
},
|
1044 |
+
{
|
1045 |
+
"name": "stderr",
|
1046 |
+
"output_type": "stream",
|
1047 |
+
"text": [
|
1048 |
+
"Error while downloading from https://cdn-lfs-us-1.huggingface.co/repos/54/cf/54cf63a091d3be4443d28131b5c3686f6dd17bc8fe13dfd74b30bc4eafc5b3e2/4c4148f267d0c0cb2979c9cf8e60f11fb91770076c28a2a79f4446ea30bff523?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27model-00003-of-00003.safetensors%3B+filename%3D%22model-00003-of-00003.safetensors%22%3B&Expires=1715867899&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcxNTg2Nzg5OX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zLzU0L2NmLzU0Y2Y2M2EwOTFkM2JlNDQ0M2QyODEzMWI1YzM2ODZmNmRkMTdiYzhmZTEzZGZkNzRiMzBiYzRlYWZjNWIzZTIvNGM0MTQ4ZjI2N2QwYzBjYjI5NzljOWNmOGU2MGYxMWZiOTE3NzAwNzZjMjhhMmE3OWY0NDQ2ZWEzMGJmZjUyMz9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=NRnXWL-gncnyNfcEhT0Xqi7WNbx5rVxELBfBIjnfb3zk7DCNDIqSPi-iNcrXmNkEmINWGbghFy4ifzUqvzNOmm0cJF10hMi%7E6R5DBKRBK0DRGtC2fC72sXzk9ysyJ6mQRSegUeDZy2KZqUL3wzwRC2Xhv8baK%7ENi0FGjUSh0Hmpg7Wgbs2quZRMM7lXqI-y3bkKh7L6OBXnx3W55Mlzzt87CgYLyotXuFIUrQ1W5lN6R3LWZuDvJ0ClLVuSKjTGwBv9MRQYLewybb4yqSmmEDfTkmuCphg2%7EfzNJ53Q2kqMEVC6gRPf67v8NDR9j57zOtoNSc1-SdaCem95aycbC7A__&Key-Pair-Id=KCD77M1F0VK2B: HTTPSConnectionPool(host='cdn-lfs-us-1.huggingface.co', port=443): Read timed out.\n",
|
1049 |
+
"Trying to resume download...\n"
|
1050 |
+
]
|
1051 |
+
},
|
1052 |
+
{
|
1053 |
+
"data": {
|
1054 |
+
"application/vnd.jupyter.widget-view+json": {
|
1055 |
+
"model_id": "635db10feaa74dff93285752d9e79520",
|
1056 |
+
"version_major": 2,
|
1057 |
+
"version_minor": 0
|
1058 |
+
},
|
1059 |
+
"text/plain": [
|
1060 |
+
"model-00003-of-00003.safetensors: 71%|####### | 2.71G/3.84G [00:00<?, ?B/s]"
|
1061 |
+
]
|
1062 |
+
},
|
1063 |
+
"metadata": {},
|
1064 |
+
"output_type": "display_data"
|
1065 |
+
},
|
1066 |
+
{
|
1067 |
+
"data": {
|
1068 |
+
"application/vnd.jupyter.widget-view+json": {
|
1069 |
+
"model_id": "38e479e6424d4edc8d00795ce084d4c2",
|
1070 |
+
"version_major": 2,
|
1071 |
+
"version_minor": 0
|
1072 |
+
},
|
1073 |
+
"text/plain": [
|
1074 |
+
"Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s]"
|
1075 |
+
]
|
1076 |
+
},
|
1077 |
+
"metadata": {},
|
1078 |
+
"output_type": "display_data"
|
1079 |
+
},
|
1080 |
+
{
|
1081 |
+
"data": {
|
1082 |
+
"application/vnd.jupyter.widget-view+json": {
|
1083 |
+
"model_id": "602b879326a44c58bc0909a3b86cd666",
|
1084 |
+
"version_major": 2,
|
1085 |
+
"version_minor": 0
|
1086 |
+
},
|
1087 |
+
"text/plain": [
|
1088 |
+
"generation_config.json: 0%| | 0.00/121 [00:00<?, ?B/s]"
|
1089 |
+
]
|
1090 |
+
},
|
1091 |
+
"metadata": {},
|
1092 |
+
"output_type": "display_data"
|
1093 |
+
},
|
1094 |
+
{
|
1095 |
+
"name": "stderr",
|
1096 |
+
"output_type": "stream",
|
1097 |
+
"text": [
|
1098 |
+
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
|
1099 |
+
"Setting `pad_token_id` to `eos_token_id`:100257 for open-end generation.\n"
|
1100 |
+
]
|
1101 |
+
}
|
1102 |
+
],
|
1103 |
+
"source": [
|
1104 |
+
"# from transformers import AutoModelForCausalLM\n",
|
1105 |
+
"\n",
|
1106 |
+
"# model = AutoModelForCausalLM.from_pretrained(\"deepnet/SN6-BestLlama\")\n",
|
1107 |
+
"# outputs = model.generate(inputs, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95)"
|
1108 |
+
]
|
1109 |
+
},
|
1110 |
+
{
|
1111 |
+
"cell_type": "markdown",
|
1112 |
+
"metadata": {},
|
1113 |
+
"source": [
|
1114 |
+
"Decode the generated token ids back into text:"
|
1115 |
+
]
|
1116 |
+
},
|
1117 |
+
{
|
1118 |
+
"cell_type": "code",
|
1119 |
+
"execution_count": 5,
|
1120 |
+
"metadata": {},
|
1121 |
+
"outputs": [
|
1122 |
+
{
|
1123 |
+
"data": {
|
1124 |
+
"text/plain": [
|
1125 |
+
"['Somatic hypermutation allows the immune system to recognize foreign proteins. \\n - . \\n - \\n 1 . 3 \\n S e t s \\n 0 \\n A c c e p t s \\n A l m o s t \\n 1 \\n C l o s e d \\n T o p i c s \\n P a p e r s \\n 0 \\n P a p e r s \\n B e a r i n g \\n P a g e s \\n 0 \\n P a g e s \\n R e c o']"
|
1126 |
+
]
|
1127 |
+
},
|
1128 |
+
"execution_count": 5,
|
1129 |
+
"metadata": {},
|
1130 |
+
"output_type": "execute_result"
|
1131 |
+
}
|
1132 |
+
],
|
1133 |
+
"source": [
|
1134 |
+
"# tokenizer.batch_decode(outputs, skip_special_tokens=True)"
|
1135 |
+
]
|
1136 |
+
},
|
1137 |
+
{
|
1138 |
+
"cell_type": "code",
|
1139 |
+
"execution_count": 6,
|
1140 |
+
"metadata": {},
|
1141 |
+
"outputs": [
|
1142 |
+
{
|
1143 |
+
"data": {
|
1144 |
+
"text/plain": [
|
1145 |
+
"['Somatic hypermutation allows the immune system to recognize foreign proteins. \\n - . \\n - \\n 1 . 3 \\n S e t s \\n 0 \\n A c c e p t s \\n A l m o s t \\n 1 \\n C l o s e d \\n T o p i c s \\n P a p e r s \\n 0 \\n P a p e r s \\n B e a r i n g \\n P a g e s \\n 0 \\n P a g e s \\n R e c o']"
|
1146 |
+
]
|
1147 |
+
},
|
1148 |
+
"execution_count": 6,
|
1149 |
+
"metadata": {},
|
1150 |
+
"output_type": "execute_result"
|
1151 |
+
}
|
1152 |
+
],
|
1153 |
+
"source": [
|
1154 |
+
"# tokenizer.batch_decode(outputs, skip_special_tokens=True)"
|
1155 |
+
]
|
1156 |
+
},
|
1157 |
+
{
|
1158 |
+
"cell_type": "code",
|
1159 |
+
"execution_count": null,
|
1160 |
+
"metadata": {},
|
1161 |
+
"outputs": [],
|
1162 |
+
"source": []
|
1163 |
+
}
|
1164 |
+
],
|
1165 |
+
"metadata": {
|
1166 |
+
"kernelspec": {
|
1167 |
+
"display_name": "Python 3 (ipykernel)",
|
1168 |
+
"language": "python",
|
1169 |
+
"name": "python3"
|
1170 |
+
},
|
1171 |
+
"language_info": {
|
1172 |
+
"codemirror_mode": {
|
1173 |
+
"name": "ipython",
|
1174 |
+
"version": 3
|
1175 |
+
},
|
1176 |
+
"file_extension": ".py",
|
1177 |
+
"mimetype": "text/x-python",
|
1178 |
+
"name": "python",
|
1179 |
+
"nbconvert_exporter": "python",
|
1180 |
+
"pygments_lexer": "ipython3",
|
1181 |
+
"version": "3.10.12"
|
1182 |
+
}
|
1183 |
+
},
|
1184 |
+
"nbformat": 4,
|
1185 |
+
"nbformat_minor": 4
|
1186 |
+
}
|
.ipynb_checkpoints/language_modeling-checkpoint.py
ADDED
@@ -0,0 +1,187 @@
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
|
3 |
+
|
4 |
+
# Transformers installation
|
5 |
+
# ! pip install transformers datasets
|
6 |
+
# To install from source instead of the last release, comment the command above and uncomment the following one.
|
7 |
+
# ! pip install git+https://github.com/huggingface/transformers.git
|
8 |
+
|
9 |
+
|
10 |
+
|
11 |
+
# #@title
|
12 |
+
# from IPython.display import HTML
|
13 |
+
|
14 |
+
# HTML('<iframe width="560" height="315" src="https://www.youtube.com/embed/Vpjb1lu0MDk?rel=0&controls=0&showinfo=0" frameborder="0" allowfullscreen></iframe>')
|
15 |
+
|
16 |
+
|
17 |
+
|
18 |
+
# from huggingface_hub import notebook_login
|
19 |
+
|
20 |
+
# notebook_login()
|
21 |
+
|
22 |
+
|
23 |
+
|
24 |
+
# from datasets import load_dataset
|
25 |
+
|
26 |
+
# eli5 = load_dataset("eli5", split="train_asks[:5000]")
|
27 |
+
|
28 |
+
from datasets import load_dataset
|
29 |
+
# Falcon = load_dataset("csv", data_files="FalconData.csv")
|
30 |
+
Falcon = load_dataset('csv', data_files={"train": 'FalconData_train.csv', "validation": 'FalconData_validation.csv'})
|
31 |
+
|
32 |
+
print('Dataset Loaded!')
|
33 |
+
|
34 |
+
# Falcon = Falcon.train_test_split(test_size=0.10)
|
35 |
+
|
36 |
+
"""Then take a look at an example:"""
|
37 |
+
|
38 |
+
Falcon['train'][0]
|
39 |
+
|
40 |
+
Falcon['validation'][0]
|
41 |
+
|
42 |
+
|
43 |
+
|
44 |
+
# #@title
|
45 |
+
# from IPython.display import HTML
|
46 |
+
|
47 |
+
# HTML('<iframe width="560" height="315" src="https://www.youtube.com/embed/ma1TrR7gE7I?rel=0&controls=0&showinfo=0" frameborder="0" allowfullscreen></iframe>')
|
48 |
+
|
49 |
+
"""The next step is to load a DistilGPT2 tokenizer to process the `text` subfield:"""
|
50 |
+
|
51 |
+
from transformers import AutoTokenizer, GPT2TokenizerFast
|
52 |
+
|
53 |
+
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
|
54 |
+
|
55 |
+
|
56 |
+
# tokenizer = GPT2TokenizerFast.from_pretrained("Xenova/gpt-4")#, cache_dir=cache_dir)
|
57 |
+
# tokenizer.pad_token
|
58 |
+
|
59 |
+
# tokenizer.eos_token=128000
|
60 |
+
# tokenizer.bos_token='128000'
|
61 |
+
# tokenizer.eos_token='128001'
|
62 |
+
|
63 |
+
tokenizer.pad_token = tokenizer.eos_token
|
64 |
+
|
65 |
+
Falcon = Falcon.flatten()
|
66 |
+
Falcon["train"][0]
|
67 |
+
|
68 |
+
|
69 |
+
|
70 |
+
def preprocess_function(examples):
|
71 |
+
return tokenizer([" ".join(x) for x in examples["Text"]])
|
72 |
+
|
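# Annotation (not in the original script): as noted for the notebook above, if the
# "Text" column holds plain strings, " ".join(x) space-separates each string's
# characters; tokenizing the rows directly would look like:
#     return tokenizer(examples["Text"])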
73 |
+
|
74 |
+
|
75 |
+
tokenized_Falcon = Falcon.map(
|
76 |
+
preprocess_function,
|
77 |
+
batched=True,
|
78 |
+
num_proc=4,
|
79 |
+
remove_columns=Falcon["train"].column_names,
|
80 |
+
)
|
81 |
+
|
82 |
+
|
83 |
+
block_size = tokenizer.model_max_length
|
84 |
+
# block_size = 2048
|
85 |
+
|
86 |
+
|
87 |
+
def group_texts(examples):
|
88 |
+
# Concatenate all texts.
|
89 |
+
concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
|
90 |
+
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
91 |
+
# We drop the small remainder; we could add padding instead if the model supported it. You can
|
92 |
+
# customize this part to your needs.
|
93 |
+
if total_length >= block_size:
|
94 |
+
total_length = (total_length // block_size) * block_size
|
95 |
+
# Split by chunks of block_size.
|
96 |
+
result = {
|
97 |
+
k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
|
98 |
+
for k, t in concatenated_examples.items()
|
99 |
+
}
|
100 |
+
result["labels"] = result["input_ids"].copy()
|
101 |
+
return result
|
102 |
+
|
103 |
+
"""Apply the `group_texts` function over the entire dataset:"""
|
104 |
+
|
105 |
+
lm_dataset = tokenized_Falcon.map(group_texts, batched=True, num_proc=4)
|
106 |
+
|
107 |
+
|
108 |
+
|
109 |
+
from transformers import DataCollatorForLanguageModeling
|
110 |
+
|
111 |
+
# tokenizer.pad_token
|
112 |
+
# tokenizer.bos_token='128000'
|
113 |
+
# tokenizer.eos_token='128001'
|
114 |
+
|
115 |
+
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
|
116 |
+
|
117 |
+
|
118 |
+
|
119 |
+
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
|
120 |
+
import torch
|
121 |
+
model = AutoModelForCausalLM.from_pretrained("rwh/tinytoo", torch_dtype=torch.bfloat16)
|
122 |
+
|
123 |
+
print('Model Loaded!')
|
124 |
+
|
125 |
+
# import torch
|
126 |
+
# torch.cuda.empty_cache()
|
127 |
+
|
128 |
+
# import torch
|
129 |
+
# import gc
|
130 |
+
|
131 |
+
# # del tensor_name # Delete the tensor
|
132 |
+
# gc.collect() # Collect garbage
|
133 |
+
# torch.cuda.empty_cache() # Clear cache
|
134 |
+
|
135 |
+
# torch.cuda.empty_cache()
|
136 |
+
|
137 |
+
# torch.no_grad()
|
138 |
+
|
139 |
+
model.to('cuda')
|
140 |
+
|
141 |
+
OutputDir = "ReadyModel3"
|
142 |
+
|
143 |
+
training_args = TrainingArguments(
|
144 |
+
output_dir=OutputDir,
|
145 |
+
overwrite_output_dir=True,
|
146 |
+
bf16=True,
|
147 |
+
# evaluation_strategy="epoch",
|
148 |
+
evaluation_strategy="steps",
|
149 |
+
# learning_rate=3.25e-06,
|
150 |
+
# learning_rate=2e-5,
|
151 |
+
learning_rate=1e-5,
|
152 |
+
# weight_decay=0.01,
|
153 |
+
weight_decay=0.001,
|
154 |
+
num_train_epochs=5,
|
155 |
+
per_device_train_batch_size=8,
|
156 |
+
per_device_eval_batch_size=8,
|
157 |
+
# lr_scheduler_type = 'cosine',
|
158 |
+
lr_scheduler_type = 'linear',
|
159 |
+
push_to_hub=False,
|
160 |
+
save_total_limit = 2,
|
161 |
+
save_strategy = "steps",
|
162 |
+
load_best_model_at_end=True,
|
163 |
+
save_safetensors=True,
|
164 |
+
)
|
165 |
+
|
166 |
+
trainer = Trainer(
|
167 |
+
model=model,
|
168 |
+
args=training_args,
|
169 |
+
train_dataset=lm_dataset["train"],
|
170 |
+
eval_dataset=lm_dataset["validation"],
|
171 |
+
# eval_dataset=lm_dataset["test"],
|
172 |
+
data_collator=data_collator,
|
173 |
+
)
|
174 |
+
|
175 |
+
# trainer.train()
|
176 |
+
print('Started Training!')
|
177 |
+
trainer.train()
|
178 |
+
|
179 |
+
trainer.save_model(OutputDir)
|
180 |
+
print('Saved Model Path:', OutputDir)
|
181 |
+
|
182 |
+
import math
|
183 |
+
|
184 |
+
eval_results = trainer.evaluate()
|
185 |
+
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
|
186 |
+
|
187 |
+
|
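# Annotation (not in the original script): checkpoints land under OutputDir, so an
# interrupted run can be resumed with the stock Trainer API:
#     trainer.train(resume_from_checkpoint=True)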
FalconData.csv
ADDED
@@ -0,0 +1,3 @@
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4da726ba49818c96e679a57343b2b03c3c34af0cff0fe5b84725d6ccbc2405c8
|
3 |
+
size 25530585
|
FalconData2.csv
ADDED
@@ -0,0 +1,3 @@
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c40586cddb6904a918b7f6e2f1b09293434df3c62f77ccae9664cc08df4aa7ef
|
3 |
+
size 129479461
|
FalconDataSet.ipynb
ADDED
@@ -0,0 +1,717 @@
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 14,
|
6 |
+
"id": "460d90da-b986-4c1c-8a66-eab144b0ba8d",
|
7 |
+
"metadata": {},
|
8 |
+
"outputs": [
|
9 |
+
{
|
10 |
+
"name": "stdout",
|
11 |
+
"output_type": "stream",
|
12 |
+
"text": [
|
13 |
+
"Started Fetching Data\n",
|
14 |
+
"Failed to fetch data, retrying. Attempt 1/10\n",
|
15 |
+
"Failed to fetch data, retrying. Attempt 1/10\n",
|
16 |
+
"Fetched data for all the Pages.\n"
|
17 |
+
]
|
18 |
+
}
|
19 |
+
],
|
20 |
+
"source": [
|
21 |
+
"import requests\n",
|
22 |
+
"import time\n",
|
23 |
+
"\n",
|
24 |
+
"import random\n",
|
25 |
+
"pages = [\n",
|
26 |
+
" random.randint(1, 968000015)\n",
|
27 |
+
" for _ in range(500)\n",
|
28 |
+
" ]\n",
|
29 |
+
"# print(pages)\n",
|
30 |
+
"\n",
|
31 |
+
"base_url = \"https://datasets-server.huggingface.co/rows\"\n",
|
32 |
+
"params = {\n",
|
33 |
+
" \"dataset\": \"tiiuae/falcon-refinedweb\",\n",
|
34 |
+
" \"config\": \"default\",\n",
|
35 |
+
" \"split\": \"train\",\n",
|
36 |
+
" }\n",
|
37 |
+
"# response = requests.get(base_url, params=params)\n",
|
38 |
+
"# response.raise_for_status()\n",
|
39 |
+
"# for row in response.json()[\"rows\"]:\n",
|
40 |
+
"# content = row[\"row\"][\"content\"]\n",
|
41 |
+
"num_rows_per_page = 100\n",
|
42 |
+
"retry_limit = 10\n",
|
43 |
+
"retry_delay = 5\n",
|
44 |
+
"Falcon = []\n",
|
45 |
+
"\n",
|
46 |
+
"print('Started Fetching Data')\n",
|
47 |
+
"def fetch_data_for_page(page):\n",
|
48 |
+
" params[\"offset\"] = page\n",
|
49 |
+
" params[\"limit\"] = num_rows_per_page\n",
|
50 |
+
" attempt = 0\n",
|
51 |
+
" while attempt < retry_limit:\n",
|
52 |
+
" try:\n",
|
53 |
+
" response = requests.get(base_url, params=params)\n",
|
54 |
+
" response.raise_for_status() # This will raise an HTTPError if the HTTP request returned an unsuccessful status code\n",
|
55 |
+
" for row in response.json()[\"rows\"]:\n",
|
56 |
+
" content = row[\"row\"][\"content\"]\n",
|
57 |
+
" Falcon.append(content)\n",
|
58 |
+
" len(Falcon)\n",
|
59 |
+
" #print(f\"Fetched data for all the Pages.\")\n",
|
60 |
+
" break\n",
|
61 |
+
" except requests.exceptions.HTTPError as e:\n",
|
62 |
+
" attempt += 1\n",
|
63 |
+
" print(\n",
|
64 |
+
" f\"Failed to fetch data, retrying. Attempt {attempt}/{retry_limit}\"\n",
|
65 |
+
" )\n",
|
66 |
+
" if attempt < retry_limit:\n",
|
67 |
+
" time.sleep(retry_delay) # Wait before the next retry\n",
|
68 |
+
" else:\n",
|
69 |
+
" print(\n",
|
70 |
+
" \"Maximum retry limit reached. Unable to fetch data.\"\n",
|
71 |
+
" )\n",
|
72 |
+
" raise\n",
|
73 |
+
"\n",
|
74 |
+
"for page in pages:\n",
|
75 |
+
" fetch_data_for_page(page)\n",
|
76 |
+
"\n",
|
77 |
+
"print(f\"Fetched data for all the Pages.\")"
|
78 |
+
]
|
79 |
+
},
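The datasets-server REST API used above caps `limit` at 100 rows per request, hence the manual paging and retries. An alternative sketch (an assumption about approach, not what this notebook ran) is to stream the same dataset through 🤗 Datasets, which handles paging internally; note it yields rows in order (optionally shuffled with a buffer) rather than at random offsets:

```python
from itertools import islice
from datasets import load_dataset

# Stream rows lazily instead of paging the REST endpoint by hand.
stream = load_dataset("tiiuae/falcon-refinedweb", split="train", streaming=True)
stream = stream.shuffle(seed=42, buffer_size=10_000)  # approximate randomness
Falcon = [row["content"] for row in islice(stream, 50_000)]
```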
|
80 |
+
{
|
81 |
+
"cell_type": "code",
|
82 |
+
"execution_count": 15,
|
83 |
+
"id": "f8f3baf1-5480-450b-a456-174a5c114d3e",
|
84 |
+
"metadata": {},
|
85 |
+
"outputs": [],
|
86 |
+
"source": [
|
87 |
+
"import csv\n",
|
88 |
+
"\n",
|
89 |
+
"# Open the CSV file for writing\n",
|
90 |
+
"with open(\"FalconData2.csv\", \"w\", newline=\"\") as csvfile:\n",
|
91 |
+
" # Create a CSV writer object\n",
|
92 |
+
" writer = csv.writer(csvfile)\n",
|
93 |
+
"\n",
|
94 |
+
" # Write the header row\n",
|
95 |
+
" writer.writerow([\"Text\"])\n",
|
96 |
+
"\n",
|
97 |
+
" # Write each element in the list as a row in the CSV file\n",
|
98 |
+
" for element in Falcon:\n",
|
99 |
+
" writer.writerow([element])\n"
|
100 |
+
]
|
101 |
+
},
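Equivalently, a one-step sketch with pandas (same in-memory `Falcon` list assumed), which handles quoting and escaping for you:

```python
import pandas as pd

pd.DataFrame({"Text": Falcon}).to_csv("FalconData2.csv", index=False)
```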
|
102 |
+
{
|
103 |
+
"cell_type": "code",
|
104 |
+
"execution_count": 30,
|
105 |
+
"id": "ea47c936-2c2b-4414-ba57-74fb6827ec0a",
|
106 |
+
"metadata": {},
|
107 |
+
"outputs": [
|
108 |
+
{
|
109 |
+
"name": "stdout",
|
110 |
+
"output_type": "stream",
|
111 |
+
"text": [
|
112 |
+
"Number of duplicate rows: 5\n",
|
113 |
+
" Text\n",
|
114 |
+
"522 Name:\n",
|
115 |
+
"11746 Description.\\nReviews\\nThere are no reviews yet.\n",
|
116 |
+
"17606 Description.\\nReviews\\nThere are no reviews yet.\n",
|
117 |
+
"30436 NaN\n",
|
118 |
+
"42549 !\\n\n"
|
119 |
+
]
|
120 |
+
}
|
121 |
+
],
|
122 |
+
"source": [
|
123 |
+
"import pandas as pd\n",
|
124 |
+
"\n",
|
125 |
+
"# Read the CSV file into a pandas DataFrame\n",
|
126 |
+
"df = pd.read_csv(\"FalconData2.csv\")\n",
|
127 |
+
"\n",
|
128 |
+
"# Check for duplicate rows\n",
|
129 |
+
"duplicate_rows = df[df.duplicated()]\n",
|
130 |
+
"\n",
|
131 |
+
"# Print the number of duplicate rows\n",
|
132 |
+
"print(f\"Number of duplicate rows: {len(duplicate_rows)}\")\n",
|
133 |
+
"\n",
|
134 |
+
"# Print the duplicate rows\n",
|
135 |
+
"print(duplicate_rows)"
|
136 |
+
]
|
137 |
+
},
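Since the check above surfaces both duplicate rows and at least one NaN row, a small cleanup sketch before splitting (overwriting the CSV is an assumption about intent):

```python
# Drop exact duplicates and empty Text rows, then persist the cleaned file.
df = df.drop_duplicates().dropna(subset=["Text"]).reset_index(drop=True)
df.to_csv("FalconData2.csv", index=False)
```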
|
138 |
+
{
|
139 |
+
"cell_type": "code",
|
140 |
+
"execution_count": 31,
|
141 |
+
"id": "f4178cd6-747f-4e05-a9bf-17b97f959e06",
|
142 |
+
"metadata": {},
|
143 |
+
"outputs": [
|
144 |
+
{
|
145 |
+
"data": {
|
146 |
+
"text/html": [
|
147 |
+
"<div>\n",
|
148 |
+
"<style scoped>\n",
|
149 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
150 |
+
" vertical-align: middle;\n",
|
151 |
+
" }\n",
|
152 |
+
"\n",
|
153 |
+
" .dataframe tbody tr th {\n",
|
154 |
+
" vertical-align: top;\n",
|
155 |
+
" }\n",
|
156 |
+
"\n",
|
157 |
+
" .dataframe thead th {\n",
|
158 |
+
" text-align: right;\n",
|
159 |
+
" }\n",
|
160 |
+
"</style>\n",
|
161 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
162 |
+
" <thead>\n",
|
163 |
+
" <tr style=\"text-align: right;\">\n",
|
164 |
+
" <th></th>\n",
|
165 |
+
" <th>Text</th>\n",
|
166 |
+
" </tr>\n",
|
167 |
+
" </thead>\n",
|
168 |
+
" <tbody>\n",
|
169 |
+
" <tr>\n",
|
170 |
+
" <th>0</th>\n",
|
171 |
+
" <td>[…]\\nM&S bank […]\\nLowest unsecured loan rate...</td>\n",
|
172 |
+
" </tr>\n",
|
173 |
+
" <tr>\n",
|
174 |
+
" <th>1</th>\n",
|
175 |
+
" <td>JavaScript seems to be disabled in your browse...</td>\n",
|
176 |
+
" </tr>\n",
|
177 |
+
" <tr>\n",
|
178 |
+
" <th>2</th>\n",
|
179 |
+
" <td>CMTech has designed a game to foster social in...</td>\n",
|
180 |
+
" </tr>\n",
|
181 |
+
" <tr>\n",
|
182 |
+
" <th>3</th>\n",
|
183 |
+
" <td>A Storyteller's Point of View\\nMy\\nWriting\\nLe...</td>\n",
|
184 |
+
" </tr>\n",
|
185 |
+
" <tr>\n",
|
186 |
+
" <th>4</th>\n",
|
187 |
+
" <td>mspu.us was registered 1 decade 3 years ago. I...</td>\n",
|
188 |
+
" </tr>\n",
|
189 |
+
" </tbody>\n",
|
190 |
+
"</table>\n",
|
191 |
+
"</div>"
|
192 |
+
],
|
193 |
+
"text/plain": [
|
194 |
+
" Text\n",
|
195 |
+
"0 […]\\nM&S bank […]\\nLowest unsecured loan rate...\n",
|
196 |
+
"1 JavaScript seems to be disabled in your browse...\n",
|
197 |
+
"2 CMTech has designed a game to foster social in...\n",
|
198 |
+
"3 A Storyteller's Point of View\\nMy\\nWriting\\nLe...\n",
|
199 |
+
"4 mspu.us was registered 1 decade 3 years ago. I..."
|
200 |
+
]
|
201 |
+
},
|
202 |
+
"execution_count": 31,
|
203 |
+
"metadata": {},
|
204 |
+
"output_type": "execute_result"
|
205 |
+
}
|
206 |
+
],
|
207 |
+
"source": [
|
208 |
+
"df.head()"
|
209 |
+
]
|
210 |
+
},
|
211 |
+
{
|
212 |
+
"cell_type": "code",
|
213 |
+
"execution_count": 32,
|
214 |
+
"id": "264548c1-4cf4-441f-a433-2f5d57861dc4",
|
215 |
+
"metadata": {},
|
216 |
+
"outputs": [
|
217 |
+
{
|
218 |
+
"data": {
|
219 |
+
"text/html": [
|
220 |
+
"<div>\n",
|
221 |
+
"<style scoped>\n",
|
222 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
223 |
+
" vertical-align: middle;\n",
|
224 |
+
" }\n",
|
225 |
+
"\n",
|
226 |
+
" .dataframe tbody tr th {\n",
|
227 |
+
" vertical-align: top;\n",
|
228 |
+
" }\n",
|
229 |
+
"\n",
|
230 |
+
" .dataframe thead th {\n",
|
231 |
+
" text-align: right;\n",
|
232 |
+
" }\n",
|
233 |
+
"</style>\n",
|
234 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
235 |
+
" <thead>\n",
|
236 |
+
" <tr style=\"text-align: right;\">\n",
|
237 |
+
" <th></th>\n",
|
238 |
+
" <th>Text</th>\n",
|
239 |
+
" </tr>\n",
|
240 |
+
" </thead>\n",
|
241 |
+
" <tbody>\n",
|
242 |
+
" <tr>\n",
|
243 |
+
" <th>49995</th>\n",
|
244 |
+
" <td>Alumni in Action: Grace Heyne Lybrand\\nWhen Gr...</td>\n",
|
245 |
+
" </tr>\n",
|
246 |
+
" <tr>\n",
|
247 |
+
" <th>49996</th>\n",
|
248 |
+
" <td>This.\\n51.351813 -105.220438\\n12 replies on “L...</td>\n",
|
249 |
+
" </tr>\n",
|
250 |
+
" <tr>\n",
|
251 |
+
" <th>49997</th>\n",
|
252 |
+
" <td>VIDEO 1: Panel discussion with John Nichols, a...</td>\n",
|
253 |
+
" </tr>\n",
|
254 |
+
" <tr>\n",
|
255 |
+
" <th>49998</th>\n",
|
256 |
+
" <td>The Prototype DA-2A made its first flight on M...</td>\n",
|
257 |
+
" </tr>\n",
|
258 |
+
" <tr>\n",
|
259 |
+
" <th>49999</th>\n",
|
260 |
+
" <td>default search action\\nBibTeX record journals/...</td>\n",
|
261 |
+
" </tr>\n",
|
262 |
+
" </tbody>\n",
|
263 |
+
"</table>\n",
|
264 |
+
"</div>"
|
265 |
+
],
|
266 |
+
"text/plain": [
|
267 |
+
" Text\n",
|
268 |
+
"49995 Alumni in Action: Grace Heyne Lybrand\\nWhen Gr...\n",
|
269 |
+
"49996 This.\\n51.351813 -105.220438\\n12 replies on “L...\n",
|
270 |
+
"49997 VIDEO 1: Panel discussion with John Nichols, a...\n",
|
271 |
+
"49998 The Prototype DA-2A made its first flight on M...\n",
|
272 |
+
"49999 default search action\\nBibTeX record journals/..."
|
273 |
+
]
|
274 |
+
},
|
275 |
+
"execution_count": 32,
|
276 |
+
"metadata": {},
|
277 |
+
"output_type": "execute_result"
|
278 |
+
}
|
279 |
+
],
|
280 |
+
"source": [
|
281 |
+
"df.tail()"
|
282 |
+
]
|
283 |
+
},
|
284 |
+
{
|
285 |
+
"cell_type": "code",
|
286 |
+
"execution_count": 33,
|
287 |
+
"id": "3f215b09-8050-4477-860c-d3ed0a19f45d",
|
288 |
+
"metadata": {},
|
289 |
+
"outputs": [
|
290 |
+
{
|
291 |
+
"name": "stdout",
|
292 |
+
"output_type": "stream",
|
293 |
+
"text": [
|
294 |
+
"Number of Words:\n",
|
295 |
+
"0 65\n",
|
296 |
+
"1 79\n",
|
297 |
+
"2 287\n",
|
298 |
+
"3 302\n",
|
299 |
+
"4 130\n",
|
300 |
+
" ... \n",
|
301 |
+
"49995 64\n",
|
302 |
+
"49996 325\n",
|
303 |
+
"49997 58\n",
|
304 |
+
"49998 623\n",
|
305 |
+
"49999 67\n",
|
306 |
+
"Name: Text, Length: 50000, dtype: int64\n",
|
307 |
+
"Smallest Row:\n",
|
308 |
+
"Text This\n",
|
309 |
+
"Name: 270, dtype: object\n",
|
310 |
+
"\n",
|
311 |
+
"Largest Row:\n",
|
312 |
+
"Text MAMMALS\\n400. Abu Jafar, M.Z., and C. Hays-Sha...\n",
|
313 |
+
"Name: 33020, dtype: object\n"
|
314 |
+
]
|
315 |
+
}
|
316 |
+
],
|
317 |
+
"source": [
|
318 |
+
"# Calculate the word count for each row without storing it as a column\n",
|
319 |
+
"word_counts = df['Text'].apply(lambda x: len(str(x).split()))\n",
|
320 |
+
"\n",
|
321 |
+
"\n",
|
322 |
+
"print(\"Number of Words:\")\n",
|
323 |
+
"print(word_counts)\n",
|
324 |
+
"\n",
|
325 |
+
"# print(\"Smallest Count\")\n",
|
326 |
+
"# print(word_counts.min())\n",
|
327 |
+
"\n",
|
328 |
+
"# print(\"Largest Count\")\n",
|
329 |
+
"# print(word_counts.max())\n",
|
330 |
+
"\n",
|
331 |
+
"# Find the row with the smallest word count\n",
|
332 |
+
"smallest_row = df.loc[word_counts.idxmin()]\n",
|
333 |
+
"\n",
|
334 |
+
"# Find the row with the largest word count\n",
|
335 |
+
"largest_row = df.loc[word_counts.idxmax()]\n",
|
336 |
+
"\n",
|
337 |
+
"# Display the smallest and largest rows\n",
|
338 |
+
"print(\"Smallest Row:\")\n",
|
339 |
+
"print(smallest_row)\n",
|
340 |
+
"\n",
|
341 |
+
"print(\"\\nLargest Row:\")\n",
|
342 |
+
"print(largest_row)\n"
|
343 |
+
]
|
344 |
+
},
|
345 |
+
{
|
346 |
+
"cell_type": "code",
|
347 |
+
"execution_count": 34,
|
348 |
+
"id": "be5a87a8-cfee-4f63-992e-8fa1d4a5cdbb",
|
349 |
+
"metadata": {},
|
350 |
+
"outputs": [
|
351 |
+
{
|
352 |
+
"data": {
|
353 |
+
"text/plain": [
|
354 |
+
"Text NaN\n",
|
355 |
+
"Name: 30436, dtype: object"
|
356 |
+
]
|
357 |
+
},
|
358 |
+
"execution_count": 34,
|
359 |
+
"metadata": {},
|
360 |
+
"output_type": "execute_result"
|
361 |
+
}
|
362 |
+
],
|
363 |
+
"source": [
|
364 |
+
"target_row=30436\n",
|
365 |
+
"specific_row = df.iloc[target_row]\n",
|
366 |
+
"specific_row"
|
367 |
+
]
|
368 |
+
},
|
369 |
+
{
|
370 |
+
"cell_type": "code",
|
371 |
+
"execution_count": 13,
|
372 |
+
"id": "e97d9e18-eaa0-4a1b-96ab-c89a0f4c738d",
|
373 |
+
"metadata": {},
|
374 |
+
"outputs": [
|
375 |
+
{
|
376 |
+
"name": "stdout",
|
377 |
+
"output_type": "stream",
|
378 |
+
"text": [
|
379 |
+
"Text The old wireline Bell telephone system was bui...\n",
|
380 |
+
"Name: 19995, dtype: object\n"
|
381 |
+
]
|
382 |
+
}
|
383 |
+
],
|
384 |
+
"source": [
|
385 |
+
"print(specific_row)"
|
386 |
+
]
|
387 |
+
},
|
388 |
+
{
|
389 |
+
"cell_type": "code",
|
390 |
+
"execution_count": 14,
|
391 |
+
"id": "940ef35f-7517-403d-9f42-73760182dcaa",
|
392 |
+
"metadata": {},
|
393 |
+
"outputs": [
|
394 |
+
{
|
395 |
+
"name": "stdout",
|
396 |
+
"output_type": "stream",
|
397 |
+
"text": [
|
398 |
+
"Text The old wireline Bell telephone system was bui...\n"
|
399 |
+
]
|
400 |
+
}
|
401 |
+
],
|
402 |
+
"source": [
|
403 |
+
"print(specific_row.to_string())"
|
404 |
+
]
|
405 |
+
},
|
406 |
+
{
|
407 |
+
"cell_type": "code",
|
408 |
+
"execution_count": 17,
|
409 |
+
"id": "915ac669-718f-47f5-b175-a5f928b407db",
|
410 |
+
"metadata": {},
|
411 |
+
"outputs": [
|
412 |
+
{
|
413 |
+
"name": "stdout",
|
414 |
+
"output_type": "stream",
|
415 |
+
"text": [
|
416 |
+
"57\n"
|
417 |
+
]
|
418 |
+
}
|
419 |
+
],
|
420 |
+
"source": [
|
421 |
+
"print(len(specific_row.to_string()))"
|
422 |
+
]
|
423 |
+
},
|
424 |
+
{
|
425 |
+
"cell_type": "code",
|
426 |
+
"execution_count": 24,
|
427 |
+
"id": "ab5ee254-9ba7-496b-97c7-3b6185c21971",
|
428 |
+
"metadata": {},
|
429 |
+
"outputs": [
|
430 |
+
{
|
431 |
+
"name": "stdout",
|
432 |
+
"output_type": "stream",
|
433 |
+
"text": [
|
434 |
+
"Training set size: 49000\n",
|
435 |
+
"Validation set size: 1000\n"
|
436 |
+
]
|
437 |
+
}
|
438 |
+
],
|
439 |
+
"source": [
|
440 |
+
"# import pandas as pd\n",
|
441 |
+
"\n",
|
442 |
+
"# # Load the dataset\n",
|
443 |
+
"# df = pd.read_csv(\"FalconData2.csv\")\n",
|
444 |
+
"\n",
|
445 |
+
"# # Calculate the index to split the data at the last 10%\n",
|
446 |
+
"# split_index = int(len(df) * 0.980)\n",
|
447 |
+
"\n",
|
448 |
+
"# # Split the data into training and validation sets\n",
|
449 |
+
"# train_df = df.iloc[:split_index] # First 90% for training\n",
|
450 |
+
"# validation_df = df.iloc[split_index:] # Last 10% for validation\n",
|
451 |
+
"\n",
|
452 |
+
"# # Display the sizes of the training and validation sets\n",
|
453 |
+
"# print(f\"Training set size: {len(train_df)}\")\n",
|
454 |
+
"# print(f\"Validation set size: {len(validation_df)}\")\n",
|
455 |
+
"\n",
|
456 |
+
"# # Optionally, save the datasets to new CSV files\n",
|
457 |
+
"# train_df.to_csv(\"FalconData_train2.csv\", index=False)\n",
|
458 |
+
"# validation_df.to_csv(\"FalconData_validation2.csv\", index=False)\n"
|
459 |
+
]
|
460 |
+
},
|
461 |
+
{
|
462 |
+
"cell_type": "code",
|
463 |
+
"execution_count": 35,
|
464 |
+
"id": "7a16fb10-40cd-4668-b363-57ca64819ad3",
|
465 |
+
"metadata": {},
|
466 |
+
"outputs": [
|
467 |
+
{
|
468 |
+
"name": "stdout",
|
469 |
+
"output_type": "stream",
|
470 |
+
"text": [
|
471 |
+
"Number of rows removed due to NaN values: 2\n",
|
472 |
+
"Training set size: 48998\n",
|
473 |
+
"Validation set size: 1000\n"
|
474 |
+
]
|
475 |
+
}
|
476 |
+
],
|
477 |
+
"source": [
|
478 |
+
"import pandas as pd\n",
|
479 |
+
"\n",
|
480 |
+
"# Load the dataset\n",
|
481 |
+
"df = pd.read_csv(\"FalconData2.csv\")\n",
|
482 |
+
"\n",
|
483 |
+
"# Check for NaN values and remove rows with NaN values\n",
|
484 |
+
"# df = df.dropna()\n",
|
485 |
+
"original_length = len(df)\n",
|
486 |
+
"\n",
|
487 |
+
"df = df.dropna()\n",
|
488 |
+
"\n",
|
489 |
+
"removed_rows = original_length - len(df)\n",
|
490 |
+
"print(f\"Number of rows removed due to NaN values: {removed_rows}\")\n",
|
491 |
+
"\n",
|
492 |
+
"# Calculate the index to split the data at the last 2%\n",
|
493 |
+
"split_index = int(len(df) * 0.98)\n",
|
494 |
+
"\n",
|
495 |
+
"# Split the data into training and validation sets\n",
|
496 |
+
"train_df = df.iloc[:split_index] # First 98% for training\n",
|
497 |
+
"validation_df = df.iloc[split_index:] # Last 2% for validation\n",
|
498 |
+
"\n",
|
499 |
+
"# Display the sizes of the training and validation sets\n",
|
500 |
+
"print(f\"Training set size: {len(train_df)}\")\n",
|
501 |
+
"print(f\"Validation set size: {len(validation_df)}\")\n",
|
502 |
+
"\n",
|
503 |
+
"# Save the datasets to new CSV files\n",
|
504 |
+
"train_df.to_csv(\"FalconData_train2.csv\", index=False)\n",
|
505 |
+
"validation_df.to_csv(\"FalconData_validation2.csv\", index=False)\n"
|
506 |
+
]
|
507 |
+
},
|
508 |
+
{
|
509 |
+
"cell_type": "code",
|
510 |
+
"execution_count": 36,
|
511 |
+
"id": "55d929c5-c198-4a91-b31d-65dd83fa00d2",
|
512 |
+
"metadata": {},
|
513 |
+
"outputs": [
|
514 |
+
{
|
515 |
+
"name": "stdout",
|
516 |
+
"output_type": "stream",
|
517 |
+
"text": [
|
518 |
+
"Number of duplicate rows: 4\n",
|
519 |
+
" Text\n",
|
520 |
+
"522 Name:\n",
|
521 |
+
"11745 Description.\\nReviews\\nThere are no reviews yet.\n",
|
522 |
+
"17605 Description.\\nReviews\\nThere are no reviews yet.\n",
|
523 |
+
"42547 !\\n\n"
|
524 |
+
]
|
525 |
+
}
|
526 |
+
],
|
527 |
+
"source": [
|
528 |
+
"# Read the CSV file into a pandas DataFrame\n",
|
529 |
+
"df1 = pd.read_csv(\"FalconData_train2.csv\")\n",
|
530 |
+
"\n",
|
531 |
+
"# Check for duplicate rows\n",
|
532 |
+
"duplicate_rows1 = df1[df1.duplicated()]\n",
|
533 |
+
"\n",
|
534 |
+
"# Print the number of duplicate rows\n",
|
535 |
+
"print(f\"Number of duplicate rows: {len(duplicate_rows1)}\")\n",
|
536 |
+
"\n",
|
537 |
+
"# Print the duplicate rows\n",
|
538 |
+
"print(duplicate_rows1)"
|
539 |
+
]
|
540 |
+
},
|
541 |
+
{
|
542 |
+
"cell_type": "code",
|
543 |
+
"execution_count": 26,
|
544 |
+
"id": "3cc404d9-e85e-48ff-aa34-750ebe3e3d3c",
|
545 |
+
"metadata": {},
|
546 |
+
"outputs": [
|
547 |
+
{
|
548 |
+
"data": {
|
549 |
+
"text/html": [
|
550 |
+
"<div>\n",
|
551 |
+
"<style scoped>\n",
|
552 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
553 |
+
" vertical-align: middle;\n",
|
554 |
+
" }\n",
|
555 |
+
"\n",
|
556 |
+
" .dataframe tbody tr th {\n",
|
557 |
+
" vertical-align: top;\n",
|
558 |
+
" }\n",
|
559 |
+
"\n",
|
560 |
+
" .dataframe thead th {\n",
|
561 |
+
" text-align: right;\n",
|
562 |
+
" }\n",
|
563 |
+
"</style>\n",
|
564 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
565 |
+
" <thead>\n",
|
566 |
+
" <tr style=\"text-align: right;\">\n",
|
567 |
+
" <th></th>\n",
|
568 |
+
" <th>Text</th>\n",
|
569 |
+
" </tr>\n",
|
570 |
+
" </thead>\n",
|
571 |
+
" <tbody>\n",
|
572 |
+
" <tr>\n",
|
573 |
+
" <th>0</th>\n",
|
574 |
+
" <td>[…]\\nM&S bank […]\\nLowest unsecured loan rate...</td>\n",
|
575 |
+
" </tr>\n",
|
576 |
+
" <tr>\n",
|
577 |
+
" <th>1</th>\n",
|
578 |
+
" <td>JavaScript seems to be disabled in your browse...</td>\n",
|
579 |
+
" </tr>\n",
|
580 |
+
" <tr>\n",
|
581 |
+
" <th>2</th>\n",
|
582 |
+
" <td>CMTech has designed a game to foster social in...</td>\n",
|
583 |
+
" </tr>\n",
|
584 |
+
" <tr>\n",
|
585 |
+
" <th>3</th>\n",
|
586 |
+
" <td>A Storyteller's Point of View\\nMy\\nWriting\\nLe...</td>\n",
|
587 |
+
" </tr>\n",
|
588 |
+
" <tr>\n",
|
589 |
+
" <th>4</th>\n",
|
590 |
+
" <td>mspu.us was registered 1 decade 3 years ago. I...</td>\n",
|
591 |
+
" </tr>\n",
|
592 |
+
" </tbody>\n",
|
593 |
+
"</table>\n",
|
594 |
+
"</div>"
|
595 |
+
],
|
596 |
+
"text/plain": [
|
597 |
+
" Text\n",
|
598 |
+
"0 […]\\nM&S bank […]\\nLowest unsecured loan rate...\n",
|
599 |
+
"1 JavaScript seems to be disabled in your browse...\n",
|
600 |
+
"2 CMTech has designed a game to foster social in...\n",
|
601 |
+
"3 A Storyteller's Point of View\\nMy\\nWriting\\nLe...\n",
|
602 |
+
"4 mspu.us was registered 1 decade 3 years ago. I..."
|
603 |
+
]
|
604 |
+
},
|
605 |
+
"execution_count": 26,
|
606 |
+
"metadata": {},
|
607 |
+
"output_type": "execute_result"
|
608 |
+
}
|
609 |
+
],
|
610 |
+
"source": [
|
611 |
+
"df1.head()"
|
612 |
+
]
|
613 |
+
},
|
614 |
+
{
|
615 |
+
"cell_type": "code",
|
616 |
+
"execution_count": 27,
|
617 |
+
"id": "641c606f-6f7f-4097-a8de-a9f6be0047b1",
|
618 |
+
"metadata": {},
|
619 |
+
"outputs": [
|
620 |
+
{
|
621 |
+
"data": {
|
622 |
+
"text/html": [
|
623 |
+
"<div>\n",
|
624 |
+
"<style scoped>\n",
|
625 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
626 |
+
" vertical-align: middle;\n",
|
627 |
+
" }\n",
|
628 |
+
"\n",
|
629 |
+
" .dataframe tbody tr th {\n",
|
630 |
+
" vertical-align: top;\n",
|
631 |
+
" }\n",
|
632 |
+
"\n",
|
633 |
+
" .dataframe thead th {\n",
|
634 |
+
" text-align: right;\n",
|
635 |
+
" }\n",
|
636 |
+
"</style>\n",
|
637 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
638 |
+
" <thead>\n",
|
639 |
+
" <tr style=\"text-align: right;\">\n",
|
640 |
+
" <th></th>\n",
|
641 |
+
" <th>Text</th>\n",
|
642 |
+
" </tr>\n",
|
643 |
+
" </thead>\n",
|
644 |
+
" <tbody>\n",
|
645 |
+
" <tr>\n",
|
646 |
+
" <th>48995</th>\n",
|
647 |
+
" <td>A Chenango County man was charged Wednesday wi...</td>\n",
|
648 |
+
" </tr>\n",
|
649 |
+
" <tr>\n",
|
650 |
+
" <th>48996</th>\n",
|
651 |
+
" <td>2-Tone Black Personalized Embroidered One Init...</td>\n",
|
652 |
+
" </tr>\n",
|
653 |
+
" <tr>\n",
|
654 |
+
" <th>48997</th>\n",
|
655 |
+
" <td>NARAL Pro-Choice America PAC Endorses Colleen ...</td>\n",
|
656 |
+
" </tr>\n",
|
657 |
+
" <tr>\n",
|
658 |
+
" <th>48998</th>\n",
|
659 |
+
" <td>Posts Tagged by Thomas Paine\\nAEI Hosts Peter ...</td>\n",
|
660 |
+
" </tr>\n",
|
661 |
+
" <tr>\n",
|
662 |
+
" <th>48999</th>\n",
|
663 |
+
" <td>Pantry feeds families in need\\n- Details\\n- Ca...</td>\n",
|
664 |
+
" </tr>\n",
|
665 |
+
" </tbody>\n",
|
666 |
+
"</table>\n",
|
667 |
+
"</div>"
|
668 |
+
],
|
669 |
+
"text/plain": [
|
670 |
+
" Text\n",
|
671 |
+
"48995 A Chenango County man was charged Wednesday wi...\n",
|
672 |
+
"48996 2-Tone Black Personalized Embroidered One Init...\n",
|
673 |
+
"48997 NARAL Pro-Choice America PAC Endorses Colleen ...\n",
|
674 |
+
"48998 Posts Tagged by Thomas Paine\\nAEI Hosts Peter ...\n",
|
675 |
+
"48999 Pantry feeds families in need\\n- Details\\n- Ca..."
|
676 |
+
]
|
677 |
+
},
|
678 |
+
"execution_count": 27,
|
679 |
+
"metadata": {},
|
680 |
+
"output_type": "execute_result"
|
681 |
+
}
|
682 |
+
],
|
683 |
+
"source": [
|
684 |
+
"df1.tail()"
|
685 |
+
]
|
686 |
+
},
|
687 |
+
{
|
688 |
+
"cell_type": "code",
|
689 |
+
"execution_count": null,
|
690 |
+
"id": "b8f7dbf6-5d74-4f8f-85d0-e890a5b8d152",
|
691 |
+
"metadata": {},
|
692 |
+
"outputs": [],
|
693 |
+
"source": []
|
694 |
+
}
|
695 |
+
],
|
696 |
+
"metadata": {
|
697 |
+
"kernelspec": {
|
698 |
+
"display_name": "Python 3 (ipykernel)",
|
699 |
+
"language": "python",
|
700 |
+
"name": "python3"
|
701 |
+
},
|
702 |
+
"language_info": {
|
703 |
+
"codemirror_mode": {
|
704 |
+
"name": "ipython",
|
705 |
+
"version": 3
|
706 |
+
},
|
707 |
+
"file_extension": ".py",
|
708 |
+
"mimetype": "text/x-python",
|
709 |
+
"name": "python",
|
710 |
+
"nbconvert_exporter": "python",
|
711 |
+
"pygments_lexer": "ipython3",
|
712 |
+
"version": "3.11.9"
|
713 |
+
}
|
714 |
+
},
|
715 |
+
"nbformat": 4,
|
716 |
+
"nbformat_minor": 5
|
717 |
+
}
|
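The notebook above amounts to a small cleaning-and-splitting pipeline: drop NaN rows, take the last 2% of rows as validation, and check the training split for exact duplicates. A condensed, self-contained sketch of the same steps (file names follow the notebook; the `drop_duplicates` note is an illustrative extension, not something the notebook performs):

```python
import pandas as pd

# Load the scraped corpus and drop rows whose Text is NaN.
df = pd.read_csv("FalconData2.csv").dropna()

# Positional (not shuffled) split: first 98% train, last 2% validation.
split_index = int(len(df) * 0.98)
train_df = df.iloc[:split_index]
validation_df = df.iloc[split_index:]

# Report exact-duplicate rows that survive into the training split;
# train_df.drop_duplicates() would remove them (illustrative extension).
print(f"duplicates in train: {train_df.duplicated().sum()}")

train_df.to_csv("FalconData_train2.csv", index=False)
validation_df.to_csv("FalconData_validation2.csv", index=False)
```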
FalconData_train.csv ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e7f50d9deb1ffab95c3a7026107c574f6024ae3791849e11c1705f8951caa6a2
+size 23342205
FalconData_train2.csv ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8c9f267ded526b4479e8a8754d2554ac84100531c971a962cb3fc0d0a74c52de
+size 126785171
FalconData_validation.csv ADDED
The diff for this file is too large to render.

FalconData_validation2.csv ADDED
The diff for this file is too large to render.
LICENSE ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Shivaen
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
README.md ADDED
@@ -0,0 +1,71 @@
+# ShortGPT
+Unofficial implementations of:
+- ["ShortGPT: Layers in Large Language Models are More Redundant Than You Expect"](https://arxiv.org/pdf/2403.03853)
+- ["The Unreasonable Ineffectiveness of the Deeper Layers"](https://arxiv.org/abs/2403.17887)
+
+### To Use
+- Follow Llama 2 setup found [here](https://github.com/facebookresearch/llama).
+- Reference `short_gpt/short_llama.ipynb` for necessary function calls.
+- For HuggingFace models, reference this [branch](https://github.com/sramshetty/ShortGPT/tree/hf-models).
+
+
+### Details
+- Use a wrapper around Llama to collect hidden states and compute BI (block influence).
+    - BI implementation may be subject to change or improvements if others find issues, thanks in advance!
+- Sum importance values across layers while inferencing on [pg19](https://huggingface.co/datasets/pg19).
+    - Dataset can be slow to load from huggingface so you may want to use an alternative.
+- Use sorted layer-wise importance values to determine which layers are least important and subject to removal.
+- Demonstrate *model-healing* with Mistral-7B-v0.1 described in "The Unreasonable Ineffectiveness of the Deeper Layers", where finetuning with LoRA after layer removal can recover downstream model performance.
+
+
+### Results
+Comparison of ShortGPT layers removed on Llama-2-7B (9 least important layers):
+
+Paper: [27, 26, 25, 28, 24, 29, 23, 21, 22] \
+This Implementation: [25, 27, 24, 26, 28, 29, 23, 22, 21]
+
+Same layers but different order.
+
+### TODO:
+- [x] Is order significant -> Authors mention that layer order varies between datasets but their relative ordering suggests "similar levels of importance" [link](https://huggingface.co/papers/2403.03853#65f028667c916f24c80e93b3).
+- [x] Add more models and metrics -> Add experimental support for HF models on this [branch](https://github.com/sramshetty/ShortGPT/tree/hf-models).
+- [x] Add angular distance metric
+- [x] Demonstrate model healing using HuggingFace model [here](https://github.com/sramshetty/ShortGPT/blob/hf-models/short_gpt/short_hf.ipynb).
+
+### Citations
+```bibtex
+@misc{men2024shortgpt,
+    title={ShortGPT: Layers in Large Language Models are More Redundant Than You Expect},
+    author={Xin Men and Mingyu Xu and Qingyu Zhang and Bingning Wang and Hongyu Lin and Yaojie Lu and Xianpei Han and Weipeng Chen},
+    year={2024},
+    eprint={2403.03853},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}
+
+@misc{gromov2024unreasonable,
+    title={The Unreasonable Ineffectiveness of the Deeper Layers},
+    author={Andrey Gromov and Kushal Tirumala and Hassan Shapourian and Paolo Glorioso and Daniel A. Roberts},
+    year={2024},
+    eprint={2403.17887},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}
+
+@misc{song2024sleb,
+    title={SLEB: Streamlining LLMs through Redundancy Verification and Elimination of Transformer Blocks},
+    author={Jiwon Song and Kyungseok Oh and Taesu Kim and Hyungjun Kim and Yulhwa Kim and Jae-Joon Kim},
+    year={2024},
+    eprint={2402.09025},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}
+
+@article{raecompressive2019,
+    author = {Rae, Jack W and Potapenko, Anna and Jayakumar, Siddhant M and Hillier, Chloe and Lillicrap, Timothy P},
+    title = {Compressive Transformers for Long-Range Sequence Modelling},
+    journal = {arXiv preprint},
+    url = {https://arxiv.org/abs/1911.05507},
+    year = {2019},
+}
+```
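The block-influence (BI) score the README describes is, per the ShortGPT paper, one minus the expected cosine similarity between the hidden states entering and leaving a layer: layers whose output barely rotates the residual stream score low and are candidates for removal. A minimal sketch of that metric (the function name and tensor shapes here are illustrative, not the repo's actual API):

```python
import torch
import torch.nn.functional as F

def block_influence(h_in: torch.Tensor, h_out: torch.Tensor) -> torch.Tensor:
    """BI of one layer: 1 - mean cosine similarity between the hidden states
    entering and leaving it, averaged over all (batch, token) positions.

    h_in, h_out: (batch, seq_len, hidden)
    """
    return 1.0 - F.cosine_similarity(h_in, h_out, dim=-1).mean()

# Accumulate per-layer scores over a corpus (e.g. pg19), then sort ascending
# to find the least important layers:
# importance[i] += block_influence(hidden_states[i], hidden_states[i + 1]).item()
```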
language_modeling.ipynb ADDED
@@ -0,0 +1,932 @@
+{
+"cells": [
+{
+"cell_type": "code",
+"execution_count": 1,
+"metadata": {},
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"Requirement already satisfied: transformers in /usr/local/lib/python3.11/dist-packages (4.45.0.dev0)\n",
+"Requirement already satisfied: datasets in /usr/local/lib/python3.11/dist-packages (2.21.0)\n",
+"Requirement already satisfied: filelock in /usr/local/lib/python3.11/dist-packages (from transformers) (3.15.4)\n",
+"Requirement already satisfied: huggingface-hub<1.0,>=0.23.2 in /usr/local/lib/python3.11/dist-packages (from transformers) (0.24.6)\n",
+"Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.11/dist-packages (from transformers) (1.26.4)\n",
+"Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.11/dist-packages (from transformers) (24.1)\n",
+"Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.11/dist-packages (from transformers) (6.0.2)\n",
+"Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.11/dist-packages (from transformers) (2024.7.24)\n",
+"Requirement already satisfied: requests in /usr/local/lib/python3.11/dist-packages (from transformers) (2.32.3)\n",
+"Requirement already satisfied: tokenizers<0.20,>=0.19 in /usr/local/lib/python3.11/dist-packages (from transformers) (0.19.1)\n",
+"Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.11/dist-packages (from transformers) (0.4.5)\n",
+"Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.11/dist-packages (from transformers) (4.66.5)\n",
+"Requirement already satisfied: pyarrow>=15.0.0 in /usr/local/lib/python3.11/dist-packages (from datasets) (17.0.0)\n",
+"Requirement already satisfied: dill<0.3.9,>=0.3.0 in /usr/local/lib/python3.11/dist-packages (from datasets) (0.3.8)\n",
+"Requirement already satisfied: pandas in /usr/local/lib/python3.11/dist-packages (from datasets) (2.2.2)\n",
+"Requirement already satisfied: xxhash in /usr/local/lib/python3.11/dist-packages (from datasets) (3.5.0)\n",
+"Requirement already satisfied: multiprocess in /usr/local/lib/python3.11/dist-packages (from datasets) (0.70.16)\n",
+"Requirement already satisfied: fsspec<=2024.6.1,>=2023.1.0 in /usr/local/lib/python3.11/dist-packages (from fsspec[http]<=2024.6.1,>=2023.1.0->datasets) (2024.6.1)\n",
+"Requirement already satisfied: aiohttp in /usr/local/lib/python3.11/dist-packages (from datasets) (3.10.5)\n",
+"Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp->datasets) (2.4.0)\n",
+"Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.11/dist-packages (from aiohttp->datasets) (1.3.1)\n",
+"Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp->datasets) (24.2.0)\n",
+"Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.11/dist-packages (from aiohttp->datasets) (1.4.1)\n",
+"Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.11/dist-packages (from aiohttp->datasets) (6.1.0)\n",
+"Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp->datasets) (1.11.1)\n",
+"Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub<1.0,>=0.23.2->transformers) (4.12.2)\n",
+"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests->transformers) (3.3.2)\n",
+"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests->transformers) (3.7)\n",
+"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests->transformers) (2.2.2)\n",
+"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests->transformers) (2024.7.4)\n",
+"Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.11/dist-packages (from pandas->datasets) (2.9.0.post0)\n",
+"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/dist-packages (from pandas->datasets) (2024.1)\n",
+"Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.11/dist-packages (from pandas->datasets) (2024.1)\n",
+"Requirement already satisfied: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)\n",
+"\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable.It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.\u001b[0m\u001b[33m\n",
+"\u001b[0mCollecting git+https://github.com/huggingface/transformers.git\n",
+"  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-sok4bqyk\n",
+"  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-sok4bqyk\n",
+"  Resolved https://github.com/huggingface/transformers.git to commit 96429e74a8191521bcb4b99f48ad1fbc8f9e6873\n",
+"  Installing build dependencies ... \u001b[?25ldone\n",
+"\u001b[?25h  Getting requirements to build wheel ... \u001b[?25ldone\n",
+"\u001b[?25h  Preparing metadata (pyproject.toml) ... \u001b[?25ldone\n",
+"\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.11/dist-packages (from transformers==4.45.0.dev0) (3.15.4)\n",
+"Requirement already satisfied: huggingface-hub<1.0,>=0.23.2 in /usr/local/lib/python3.11/dist-packages (from transformers==4.45.0.dev0) (0.24.6)\n",
+"Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.11/dist-packages (from transformers==4.45.0.dev0) (1.26.4)\n",
+"Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.11/dist-packages (from transformers==4.45.0.dev0) (24.1)\n",
+"Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.11/dist-packages (from transformers==4.45.0.dev0) (6.0.2)\n",
+"Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.11/dist-packages (from transformers==4.45.0.dev0) (2024.7.24)\n",
+"Requirement already satisfied: requests in /usr/local/lib/python3.11/dist-packages (from transformers==4.45.0.dev0) (2.32.3)\n",
+"Requirement already satisfied: tokenizers<0.20,>=0.19 in /usr/local/lib/python3.11/dist-packages (from transformers==4.45.0.dev0) (0.19.1)\n",
+"Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.11/dist-packages (from transformers==4.45.0.dev0) (0.4.5)\n",
+"Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.11/dist-packages (from transformers==4.45.0.dev0) (4.66.5)\n",
+"Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub<1.0,>=0.23.2->transformers==4.45.0.dev0) (2024.6.1)\n",
+"Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub<1.0,>=0.23.2->transformers==4.45.0.dev0) (4.12.2)\n",
+"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests->transformers==4.45.0.dev0) (3.3.2)\n",
+"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests->transformers==4.45.0.dev0) (3.7)\n",
+"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests->transformers==4.45.0.dev0) (2.2.2)\n",
+"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests->transformers==4.45.0.dev0) (2024.7.4)\n",
+"\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable.It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.\u001b[0m\u001b[33m\n",
+"\u001b[0m"
+]
+}
+],
+"source": [
+"# Transformers installation\n",
+"! pip install transformers datasets\n",
+"# To install from source instead of the last release, comment the command above and uncomment the following one.\n",
+"! pip install git+https://github.com/huggingface/transformers.git"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 2,
+"metadata": {},
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"Requirement already satisfied: accelerate in /usr/local/lib/python3.11/dist-packages (0.34.2)\n",
+"Requirement already satisfied: numpy<3.0.0,>=1.17 in /usr/local/lib/python3.11/dist-packages (from accelerate) (1.26.4)\n",
+"Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.11/dist-packages (from accelerate) (24.1)\n",
+"Requirement already satisfied: psutil in /usr/local/lib/python3.11/dist-packages (from accelerate) (6.0.0)\n",
+"Requirement already satisfied: pyyaml in /usr/local/lib/python3.11/dist-packages (from accelerate) (6.0.2)\n",
+"Requirement already satisfied: torch>=1.10.0 in /usr/local/lib/python3.11/dist-packages (from accelerate) (2.4.0)\n",
+"Requirement already satisfied: huggingface-hub>=0.21.0 in /usr/local/lib/python3.11/dist-packages (from accelerate) (0.24.6)\n",
+"Requirement already satisfied: safetensors>=0.4.3 in /usr/local/lib/python3.11/dist-packages (from accelerate) (0.4.5)\n",
+"Requirement already satisfied: filelock in /usr/local/lib/python3.11/dist-packages (from huggingface-hub>=0.21.0->accelerate) (3.15.4)\n",
+"Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub>=0.21.0->accelerate) (2024.6.1)\n",
+"Requirement already satisfied: requests in /usr/local/lib/python3.11/dist-packages (from huggingface-hub>=0.21.0->accelerate) (2.32.3)\n",
+"Requirement already satisfied: tqdm>=4.42.1 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub>=0.21.0->accelerate) (4.66.5)\n",
+"Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub>=0.21.0->accelerate) (4.12.2)\n",
+"Requirement already satisfied: sympy in /usr/local/lib/python3.11/dist-packages (from torch>=1.10.0->accelerate) (1.13.2)\n",
+"Requirement already satisfied: networkx in /usr/local/lib/python3.11/dist-packages (from torch>=1.10.0->accelerate) (3.3)\n",
+"Requirement already satisfied: jinja2 in /usr/local/lib/python3.11/dist-packages (from torch>=1.10.0->accelerate) (3.1.4)\n",
+"Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.1.105 in /usr/local/lib/python3.11/dist-packages (from torch>=1.10.0->accelerate) (12.1.105)\n",
+"Requirement already satisfied: nvidia-cuda-runtime-cu12==12.1.105 in /usr/local/lib/python3.11/dist-packages (from torch>=1.10.0->accelerate) (12.1.105)\n",
+"Requirement already satisfied: nvidia-cuda-cupti-cu12==12.1.105 in /usr/local/lib/python3.11/dist-packages (from torch>=1.10.0->accelerate) (12.1.105)\n",
+"Requirement already satisfied: nvidia-cudnn-cu12==9.1.0.70 in /usr/local/lib/python3.11/dist-packages (from torch>=1.10.0->accelerate) (9.1.0.70)\n",
+"Requirement already satisfied: nvidia-cublas-cu12==12.1.3.1 in /usr/local/lib/python3.11/dist-packages (from torch>=1.10.0->accelerate) (12.1.3.1)\n",
+"Requirement already satisfied: nvidia-cufft-cu12==11.0.2.54 in /usr/local/lib/python3.11/dist-packages (from torch>=1.10.0->accelerate) (11.0.2.54)\n",
+"Requirement already satisfied: nvidia-curand-cu12==10.3.2.106 in /usr/local/lib/python3.11/dist-packages (from torch>=1.10.0->accelerate) (10.3.2.106)\n",
+"Requirement already satisfied: nvidia-cusolver-cu12==11.4.5.107 in /usr/local/lib/python3.11/dist-packages (from torch>=1.10.0->accelerate) (11.4.5.107)\n",
+"Requirement already satisfied: nvidia-cusparse-cu12==12.1.0.106 in /usr/local/lib/python3.11/dist-packages (from torch>=1.10.0->accelerate) (12.1.0.106)\n",
+"Requirement already satisfied: nvidia-nccl-cu12==2.20.5 in /usr/local/lib/python3.11/dist-packages (from torch>=1.10.0->accelerate) (2.20.5)\n",
+"Requirement already satisfied: nvidia-nvtx-cu12==12.1.105 in /usr/local/lib/python3.11/dist-packages (from torch>=1.10.0->accelerate) (12.1.105)\n",
+"Requirement already satisfied: triton==3.0.0 in /usr/local/lib/python3.11/dist-packages (from torch>=1.10.0->accelerate) (3.0.0)\n",
+"Requirement already satisfied: nvidia-nvjitlink-cu12 in /usr/local/lib/python3.11/dist-packages (from nvidia-cusolver-cu12==11.4.5.107->torch>=1.10.0->accelerate) (12.6.20)\n",
+"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.11/dist-packages (from jinja2->torch>=1.10.0->accelerate) (2.1.5)\n",
+"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests->huggingface-hub>=0.21.0->accelerate) (3.3.2)\n",
+"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests->huggingface-hub>=0.21.0->accelerate) (3.7)\n",
+"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests->huggingface-hub>=0.21.0->accelerate) (2.2.2)\n",
+"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests->huggingface-hub>=0.21.0->accelerate) (2024.7.4)\n",
+"Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.11/dist-packages (from sympy->torch>=1.10.0->accelerate) (1.3.0)\n",
+"\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable.It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.\u001b[0m\u001b[33m\n",
+"\u001b[0mRequirement already satisfied: transformers in /usr/local/lib/python3.11/dist-packages (4.45.0.dev0)\n",
+"Requirement already satisfied: filelock in /usr/local/lib/python3.11/dist-packages (from transformers) (3.15.4)\n",
+"Requirement already satisfied: huggingface-hub<1.0,>=0.23.2 in /usr/local/lib/python3.11/dist-packages (from transformers) (0.24.6)\n",
+"Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.11/dist-packages (from transformers) (1.26.4)\n",
+"Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.11/dist-packages (from transformers) (24.1)\n",
+"Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.11/dist-packages (from transformers) (6.0.2)\n",
+"Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.11/dist-packages (from transformers) (2024.7.24)\n",
+"Requirement already satisfied: requests in /usr/local/lib/python3.11/dist-packages (from transformers) (2.32.3)\n",
+"Requirement already satisfied: tokenizers<0.20,>=0.19 in /usr/local/lib/python3.11/dist-packages (from transformers) (0.19.1)\n",
+"Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.11/dist-packages (from transformers) (0.4.5)\n",
+"Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.11/dist-packages (from transformers) (4.66.5)\n",
+"Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub<1.0,>=0.23.2->transformers) (2024.6.1)\n",
+"Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub<1.0,>=0.23.2->transformers) (4.12.2)\n",
+"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests->transformers) (3.3.2)\n",
+"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests->transformers) (3.7)\n",
+"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests->transformers) (2.2.2)\n",
+"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests->transformers) (2024.7.4)\n",
+"\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable.It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.\u001b[0m\u001b[33m\n",
+"\u001b[0m"
+]
+}
+],
+"source": [
+"! pip install -U accelerate\n",
+"! pip install -U transformers"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 3,
+"metadata": {},
+"outputs": [],
+"source": [
+"# !pip install accelerate"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 4,
+"metadata": {},
+"outputs": [],
+"source": [
+"# !pip install transformers[torch]"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"# Causal language modeling"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"There are two types of language modeling, causal and masked. This guide illustrates causal language modeling.\n",
+"Causal language models are frequently used for text generation. You can use these models for creative applications like\n",
+"choosing your own text adventure or an intelligent coding assistant like Copilot or CodeParrot."
+]
+},
+{
+"cell_type": "code",
+"execution_count": 5,
+"metadata": {
+"cellView": "form",
+"hide_input": true
+},
+"outputs": [],
+"source": [
+"# #@title\n",
+"# from IPython.display import HTML\n",
+"\n",
+"# HTML('<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/Vpjb1lu0MDk?rel=0&controls=0&showinfo=0\" frameborder=\"0\" allowfullscreen></iframe>')"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"Causal language modeling predicts the next token in a sequence of tokens, and the model can only attend to tokens on\n",
+"the left. This means the model cannot see future tokens. GPT-2 is an example of a causal language model.\n",
+"\n",
+"This guide will show you how to:\n",
+"\n",
+"1. Finetune [DistilGPT2](https://huggingface.co/distilgpt2) on the [r/askscience](https://www.reddit.com/r/askscience/) subset of the [ELI5](https://huggingface.co/datasets/eli5) dataset.\n",
+"2. Use your finetuned model for inference.\n",
+"\n",
+"<Tip>\n",
+"You can finetune other architectures for causal language modeling following the same steps in this guide.\n",
+"Choose one of the following architectures:\n",
+"\n",
+"<!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->\n",
+"[BART](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/bart), [BERT](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/bert), [Bert Generation](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/bert-generation), [BigBird](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/big_bird), [BigBird-Pegasus](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/bigbird_pegasus), [BioGpt](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/biogpt), [Blenderbot](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/blenderbot), [BlenderbotSmall](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/blenderbot-small), [BLOOM](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/bloom), [CamemBERT](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/camembert), [CodeGen](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/codegen), [CPM-Ant](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/cpmant), [CTRL](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/ctrl), [Data2VecText](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/data2vec-text), [ELECTRA](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/electra), [ERNIE](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/ernie), [GIT](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/git), [GPT-Sw3](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/gpt-sw3), [OpenAI GPT-2](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/gpt2), [GPTBigCode](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/gpt_bigcode), [GPT Neo](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/gpt_neo), [GPT NeoX](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/gpt_neox), [GPT NeoX Japanese](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/gpt_neox_japanese), [GPT-J](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/gptj), [LLaMA](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/llama), [Marian](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/marian), [mBART](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/mbart), [MEGA](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/mega), [Megatron-BERT](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/megatron-bert), [MVP](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/mvp), [OpenLlama](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/open-llama), [OpenAI GPT](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/openai-gpt), [OPT](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/opt), [Pegasus](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/pegasus), [PLBart](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/plbart), [ProphetNet](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/prophetnet), [QDQBert](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/qdqbert), [Reformer](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/reformer), [RemBERT](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/rembert), [RoBERTa](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/roberta), [RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/roberta-prelayernorm), [RoCBert](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/roc_bert), [RoFormer](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/roformer), [RWKV](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/rwkv), [Speech2Text2](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/speech_to_text_2), [Transformer-XL](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/transfo-xl), [TrOCR](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/trocr), [XGLM](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/xglm), [XLM](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/xlm), [XLM-ProphetNet](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/xlm-prophetnet), [XLM-RoBERTa](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/xlm-roberta), [XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/xlm-roberta-xl), [XLNet](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/xlnet), [X-MOD](https://huggingface.co/docs/transformers/main/en/tasks/../model_doc/xmod)\n",
+"\n",
+"\n",
+"<!--End of the generated tip-->\n",
+"\n",
+"</Tip>\n",
+"\n",
+"Before you begin, make sure you have all the necessary libraries installed:\n",
+"\n",
+"```bash\n",
+"pip install transformers datasets evaluate\n",
+"```\n",
+"\n",
+"We encourage you to log in to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to log in:"
+]
+},
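To make the left-to-right constraint concrete, here is a minimal generation example with the same DistilGPT2 checkpoint the guide finetunes (an editorial sketch, not a cell from the uploaded notebook):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
model = AutoModelForCausalLM.from_pretrained("distilgpt2")

# Each new token is predicted from the tokens to its left only.
inputs = tokenizer("Causal language models predict", return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=20, do_sample=False)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```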
|
236 |
+
{
|
237 |
+
"cell_type": "code",
|
238 |
+
"execution_count": 6,
|
239 |
+
"metadata": {},
|
240 |
+
"outputs": [],
|
241 |
+
"source": [
|
242 |
+
"# from huggingface_hub import notebook_login\n",
|
243 |
+
"\n",
|
244 |
+
"# notebook_login()"
|
245 |
+
]
|
246 |
+
},
|
247 |
+
{
|
248 |
+
"cell_type": "markdown",
|
249 |
+
"metadata": {},
|
250 |
+
"source": [
|
251 |
+
"## Load ELI5 dataset"
|
252 |
+
]
|
253 |
+
},
|
254 |
+
{
|
255 |
+
"cell_type": "markdown",
|
256 |
+
"metadata": {},
|
257 |
+
"source": [
|
258 |
+
"Start by loading a smaller subset of the r/askscience subset of the ELI5 dataset from the 🤗 Datasets library.\n",
|
259 |
+
" This'll give you a chance to experiment and make sure everything works before spending more time training on the full dataset."
|
260 |
+
]
|
261 |
+
},
|
262 |
+
{
|
263 |
+
"cell_type": "code",
|
264 |
+
"execution_count": 7,
|
265 |
+
"metadata": {},
|
266 |
+
"outputs": [],
|
267 |
+
"source": [
|
268 |
+
"# from datasets import load_dataset\n",
|
269 |
+
"\n",
|
270 |
+
"# eli5 = load_dataset(\"eli5\", split=\"train_asks[:5000]\")"
|
271 |
+
]
|
272 |
+
},
|
273 |
+
{
|
274 |
+
"cell_type": "code",
|
275 |
+
"execution_count": 8,
|
276 |
+
"metadata": {},
|
277 |
+
"outputs": [],
|
278 |
+
"source": [
|
279 |
+
"from datasets import load_dataset\n",
|
280 |
+
"# Falcon = load_dataset(\"csv\", data_files=\"FalconData.csv\")\n",
|
281 |
+
"Falcon = load_dataset('csv', data_files={\"train\": 'FalconData_train.csv', \"validation\": 'FalconData_validation.csv'})"
|
282 |
+
]
|
283 |
+
},
|
284 |
+
{
|
285 |
+
"cell_type": "markdown",
|
286 |
+
"metadata": {},
|
287 |
+
"source": [
|
288 |
+
"Split the dataset's `train_asks` split into a train and test set with the [train_test_split](https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.train_test_split) method:"
|
289 |
+
]
|
290 |
+
},
|
291 |
+
{
|
292 |
+
"cell_type": "code",
|
293 |
+
"execution_count": 9,
|
294 |
+
"metadata": {},
|
295 |
+
"outputs": [],
|
296 |
+
"source": [
|
297 |
+
"# Falcon = Falcon.train_test_split(test_size=0.10)"
|
298 |
+
]
|
299 |
+
},
|
300 |
+
{
|
301 |
+
"cell_type": "markdown",
|
302 |
+
"metadata": {},
|
303 |
+
"source": [
|
304 |
+
"Then take a look at an example:"
|
305 |
+
]
|
306 |
+
},
|
307 |
+
{
|
308 |
+
"cell_type": "code",
|
309 |
+
"execution_count": 10,
|
310 |
+
"metadata": {},
|
311 |
+
"outputs": [
|
312 |
+
{
|
313 |
+
"data": {
|
314 |
+
"text/plain": [
|
315 |
+
"{'Text': 'Once the kind of organization is decided, right now is the time for the purpose of the huge talk with the parents. Additionally, you will have to credit your company while using the board. Right now there a few techniques which usually you can get started on the cellular phone restoration organization.\\nBecause you develop your organization, you can want to realize how to raise your skill sets and tactics. After formulating your firm notion and organizing the funds, the next idea to perform is to check out the organization. In addition , if occur to be certainly not in the automobile business yet work via the internet with consumers via the net and email, after that some of your suggestions you are going to see are certain to get the work performed to get you too.\\nWhat you will requirement for your company depends upon a great deal of factors, therefore is actually ideal to pay a visit to the Nevada Department of Insurance internet site to get detailed info. Once you wish to start up your unique enterprise, then simply it is important to apply entitlements of your have firm. The few males and ladies in little business want to know more and carry out more with a great deal fewer. For illustration, the ordinary organization runs the data centre 10 hours every day. Even more businesses experience began to take notice of the huge benefits of giving birth to a business program analyst in staff. As you take your small business to the world-wide market segments, it is going to become important to think about a lot a large number of things to ascertain the organization efficiently. Decide what kind of business being you desire to allocate to your panorama business.\\nRecuperate this will depend after the sort of assistance you give. Right now there are a lot of different varieties of Web service yet I will list the most typical types out there. Found in addition, you will need high-speed on the net service to mail and acquire job data files to your consumers.\\nMany people today are unsuccessful in organization given that they make avoidable mistakes! A put together organization is a great likelihood to communicate the fine art just the way that you like it. You can actually without difficulty control the company if it’s legitimate. While not efficient communication, the businesses could not discover the strategies to create the business and website link while using the all over the world clients and companions. A great excellent car shop tools business will make sure you experience all owners and parts manuals alongside one another with service plan directives for all of you heavy machines you purchase or perhaps let out.\\nIn case you blowing wind up going, where you began your company won’t change! It’s actually now possible to advertise your business to anybody anywhere for the purpose of practically no selling price. So you may absolutely cost-free to pay attention to different important things that matter to you such as growing your business and a lot more. If the service is mostly an operation product, you should supply a replicate within the operation contract. Websites like craigslist and or perhaps Tradelit That is certainly, in the event people are likely to build a company. Presently a days and nights Many businesses are unaware of the significance of SEO in improving the internet occurrence. 
If you expect to have carrying out a fee-for-service tutoring organization, then you might preference to think about signing up your company considering the state.\\nKind of organization Primarily based upon at the sort of business, you need to do business with a variety of organizations. Not only a single company are able to take advantage of a similar well-known. If an organization can better figure out their normal user’s requires, it will develop into a excellent less complicated to guarantee that every consumer has a confident knowledge in handling your business with regards to a entire. Even firms want a huge data stats official certifications prior to taking the help of a person. As a result, all of them over the world are inclined to take full advantage of technology, on particular, cordless devices and public hotspots. The organization should also be capable of offering any kind of teaching vital to buy and sell each machine safely. Daily, an increasing number of businesses are putting up or perhaps establishing an electronic business. For more info read right here whatsbakingsd.com .'}"
|
316 |
+
]
|
317 |
+
},
|
318 |
+
"execution_count": 10,
|
319 |
+
"metadata": {},
|
320 |
+
"output_type": "execute_result"
|
321 |
+
}
|
322 |
+
],
|
323 |
+
"source": [
|
324 |
+
"Falcon['train'][0]"
|
325 |
+
]
|
326 |
+
},
|
327 |
+
{
|
328 |
+
"cell_type": "code",
|
329 |
+
"execution_count": 11,
|
330 |
+
"metadata": {},
|
331 |
+
"outputs": [
|
332 |
+
{
|
333 |
+
"data": {
|
334 |
+
"text/plain": [
|
335 |
+
"{'Text': ', John Morris (19282003), historian\\nOxford Biography Index Number 101089999 [what is this?] Primary authority: Oxford DNB\\nColin Lucas, Roberts, John Morris (19282003), first published\\nJan 2007; online edn, Oct 2009, 1683 words, with portrait illustration\\n> View John Roberts complete biography [Oxford DNB subscription required; no subscription?]\\n> View John Roberts complete biography\\n[WWW subscription required; no subscription?]'}"
|
336 |
+
]
|
337 |
+
},
|
338 |
+
"execution_count": 11,
|
339 |
+
"metadata": {},
|
340 |
+
"output_type": "execute_result"
|
341 |
+
}
|
342 |
+
],
|
343 |
+
"source": [
|
344 |
+
"Falcon['validation'][0]"
|
345 |
+
]
|
346 |
+
},
|
347 |
+
{
|
348 |
+
"cell_type": "markdown",
|
349 |
+
"metadata": {},
|
350 |
+
"source": [
|
351 |
+
"While this may look like a lot, you're only really interested in the `text` field. What's cool about language modeling\n",
|
352 |
+
"tasks is you don't need labels (also known as an unsupervised task) because the next word *is* the label."
|
353 |
+
]
|
354 |
+
},
|
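To make "the next word *is* the label" concrete, here is a minimal illustration (the string and variable names are purely illustrative; it assumes the tokenizer loaded below):

```python
# For causal language modeling, the labels are simply the input ids;
# Transformers shifts them internally so position i is scored on token i+1.
ids = tokenizer("the cat sat on the mat")["input_ids"]
inputs, labels = ids, list(ids)  # identical sequences
# Conceptually the model reads ids[:-1] and is evaluated against ids[1:].
```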
355 |
+
{
|
356 |
+
"cell_type": "markdown",
|
357 |
+
"metadata": {},
|
358 |
+
"source": [
|
359 |
+
"## Preprocess"
|
360 |
+
]
|
361 |
+
},
|
362 |
+
{
|
363 |
+
"cell_type": "code",
|
364 |
+
"execution_count": 12,
|
365 |
+
"metadata": {
|
366 |
+
"cellView": "form",
|
367 |
+
"hide_input": true
|
368 |
+
},
|
369 |
+
"outputs": [],
|
370 |
+
"source": [
|
371 |
+
"# #@title\n",
|
372 |
+
"# from IPython.display import HTML\n",
|
373 |
+
"\n",
|
374 |
+
"# HTML('<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/ma1TrR7gE7I?rel=0&controls=0&showinfo=0\" frameborder=\"0\" allowfullscreen></iframe>')"
|
375 |
+
]
|
376 |
+
},
|
377 |
+
{
|
378 |
+
"cell_type": "markdown",
|
379 |
+
"metadata": {},
|
380 |
+
"source": [
|
381 |
+
"The next step is to load a DistilGPT2 tokenizer to process the `text` subfield:"
|
382 |
+
]
|
383 |
+
},
|
384 |
+
{
|
385 |
+
"cell_type": "code",
|
386 |
+
"execution_count": 28,
|
387 |
+
"metadata": {},
|
388 |
+
"outputs": [
|
389 |
+
{
|
390 |
+
"name": "stderr",
|
391 |
+
"output_type": "stream",
|
392 |
+
"text": [
|
393 |
+
"/usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:1614: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be deprecated in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
|
394 |
+
" warnings.warn(\n"
|
395 |
+
]
|
396 |
+
}
|
397 |
+
],
|
398 |
+
"source": [
|
399 |
+
"from transformers import AutoTokenizer, GPT2TokenizerFast\n",
|
400 |
+
"\n",
|
401 |
+
"tokenizer = AutoTokenizer.from_pretrained(\"distilgpt2\")\n",
|
402 |
+
"\n",
|
403 |
+
"\n",
|
404 |
+
"# tokenizer = GPT2TokenizerFast.from_pretrained(\"Xenova/gpt-4\")#, cache_dir=cache_dir)\n",
|
405 |
+
"tokenizer.pad_token = tokenizer.eos_token"
|
406 |
+
]
|
407 |
+
},
|
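As a quick sanity check, the tokenizer can be applied to a short string (a sketch; the exact ids depend on the DistilGPT2 vocabulary):

```python
enc = tokenizer("Hello world")
print(enc["input_ids"])       # a short list of integer token ids
print(enc["attention_mask"])  # 1 for every real (non-padding) token
```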
408 |
+
{
|
409 |
+
"cell_type": "markdown",
|
410 |
+
"metadata": {},
|
411 |
+
"source": [
|
412 |
+
"You'll notice from the example above, the `text` field is actually nested inside `answers`. This means you'll need to\n",
|
413 |
+
"extract the `text` subfield from its nested structure with the [`flatten`](https://huggingface.co/docs/datasets/process.html#flatten) method:"
|
414 |
+
]
|
415 |
+
},
|
416 |
+
{
|
417 |
+
"cell_type": "code",
|
418 |
+
"execution_count": 14,
|
419 |
+
"metadata": {},
|
420 |
+
"outputs": [
|
421 |
+
{
|
422 |
+
"data": {
|
423 |
+
"text/plain": [
|
424 |
+
"{'Text': 'Once the kind of organization is decided, right now is the time for the purpose of the huge talk with the parents. Additionally, you will have to credit your company while using the board. Right now there a few techniques which usually you can get started on the cellular phone restoration organization.\\nBecause you develop your organization, you can want to realize how to raise your skill sets and tactics. After formulating your firm notion and organizing the funds, the next idea to perform is to check out the organization. In addition , if occur to be certainly not in the automobile business yet work via the internet with consumers via the net and email, after that some of your suggestions you are going to see are certain to get the work performed to get you too.\\nWhat you will requirement for your company depends upon a great deal of factors, therefore is actually ideal to pay a visit to the Nevada Department of Insurance internet site to get detailed info. Once you wish to start up your unique enterprise, then simply it is important to apply entitlements of your have firm. The few males and ladies in little business want to know more and carry out more with a great deal fewer. For illustration, the ordinary organization runs the data centre 10 hours every day. Even more businesses experience began to take notice of the huge benefits of giving birth to a business program analyst in staff. As you take your small business to the world-wide market segments, it is going to become important to think about a lot a large number of things to ascertain the organization efficiently. Decide what kind of business being you desire to allocate to your panorama business.\\nRecuperate this will depend after the sort of assistance you give. Right now there are a lot of different varieties of Web service yet I will list the most typical types out there. Found in addition, you will need high-speed on the net service to mail and acquire job data files to your consumers.\\nMany people today are unsuccessful in organization given that they make avoidable mistakes! A put together organization is a great likelihood to communicate the fine art just the way that you like it. You can actually without difficulty control the company if it’s legitimate. While not efficient communication, the businesses could not discover the strategies to create the business and website link while using the all over the world clients and companions. A great excellent car shop tools business will make sure you experience all owners and parts manuals alongside one another with service plan directives for all of you heavy machines you purchase or perhaps let out.\\nIn case you blowing wind up going, where you began your company won’t change! It’s actually now possible to advertise your business to anybody anywhere for the purpose of practically no selling price. So you may absolutely cost-free to pay attention to different important things that matter to you such as growing your business and a lot more. If the service is mostly an operation product, you should supply a replicate within the operation contract. Websites like craigslist and or perhaps Tradelit That is certainly, in the event people are likely to build a company. Presently a days and nights Many businesses are unaware of the significance of SEO in improving the internet occurrence. 
If you expect to have carrying out a fee-for-service tutoring organization, then you might preference to think about signing up your company considering the state.\\nKind of organization Primarily based upon at the sort of business, you need to do business with a variety of organizations. Not only a single company are able to take advantage of a similar well-known. If an organization can better figure out their normal user’s requires, it will develop into a excellent less complicated to guarantee that every consumer has a confident knowledge in handling your business with regards to a entire. Even firms want a huge data stats official certifications prior to taking the help of a person. As a result, all of them over the world are inclined to take full advantage of technology, on particular, cordless devices and public hotspots. The organization should also be capable of offering any kind of teaching vital to buy and sell each machine safely. Daily, an increasing number of businesses are putting up or perhaps establishing an electronic business. For more info read right here whatsbakingsd.com .'}"
|
425 |
+
]
|
426 |
+
},
|
427 |
+
"execution_count": 14,
|
428 |
+
"metadata": {},
|
429 |
+
"output_type": "execute_result"
|
430 |
+
}
|
431 |
+
],
|
432 |
+
"source": [
|
433 |
+
"Falcon = Falcon.flatten()\n",
|
434 |
+
"Falcon[\"train\"][0]"
|
435 |
+
]
|
436 |
+
},
|
437 |
+
{
|
438 |
+
"cell_type": "markdown",
|
439 |
+
"metadata": {},
|
440 |
+
"source": [
|
441 |
+
"Each subfield is now a separate column as indicated by the `answers` prefix, and the `text` field is a list now. Instead\n",
|
442 |
+
"of tokenizing each sentence separately, convert the list to a string so you can jointly tokenize them.\n",
|
443 |
+
"\n",
|
444 |
+
"Here is a first preprocessing function to join the list of strings for each example and tokenize the result:"
|
445 |
+
]
|
446 |
+
},
|
447 |
+
{
|
448 |
+
"cell_type": "code",
|
449 |
+
"execution_count": 15,
|
450 |
+
"metadata": {},
|
451 |
+
"outputs": [],
|
452 |
+
"source": [
|
453 |
+
"def preprocess_function(examples):\n",
|
454 |
+
" return tokenizer([\" \".join(x) for x in examples[\"Text\"]])"
|
455 |
+
]
|
456 |
+
},
|
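Called on a batch, the function returns one list of token ids per example; a minimal illustration with hypothetical text values:

```python
batch = {"Text": ["first document", "second document"]}
out = preprocess_function(batch)
print(len(out["input_ids"]))    # 2: one id list per document
print(out["input_ids"][0][:5])  # first few token ids of document 0
```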
457 |
+
{
|
458 |
+
"cell_type": "markdown",
|
459 |
+
"metadata": {},
|
460 |
+
"source": [
|
461 |
+
"To apply this preprocessing function over the entire dataset, use the 🤗 Datasets [map](https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.map) method. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once, and increasing the number of processes with `num_proc`. Remove any columns you don't need:"
|
462 |
+
]
|
463 |
+
},
|
464 |
+
{
|
465 |
+
"cell_type": "code",
|
466 |
+
"execution_count": 16,
|
467 |
+
"metadata": {},
|
468 |
+
"outputs": [],
|
469 |
+
"source": [
|
470 |
+
"tokenized_Falcon = Falcon.map(\n",
|
471 |
+
" preprocess_function,\n",
|
472 |
+
" batched=True,\n",
|
473 |
+
" num_proc=4,\n",
|
474 |
+
" remove_columns=Falcon[\"train\"].column_names,\n",
|
475 |
+
")"
|
476 |
+
]
|
477 |
+
},
|
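You can confirm the mapping worked: after `remove_columns`, the tokenized splits carry only the tokenizer's outputs (a sketch):

```python
print(tokenized_Falcon["train"].column_names)          # ['input_ids', 'attention_mask']
print(len(tokenized_Falcon["train"][0]["input_ids"]))  # token count of example 0
```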
478 |
+
{
|
479 |
+
"cell_type": "markdown",
|
480 |
+
"metadata": {},
|
481 |
+
"source": [
|
482 |
+
"This dataset contains the token sequences, but some of these are longer than the maximum input length for the model.\n",
|
483 |
+
"\n",
|
484 |
+
"You can now use a second preprocessing function to\n",
|
485 |
+
"- concatenate all the sequences\n",
|
486 |
+
"- split the concatenated sequences into shorter chunks defined by `block_size`, which should be both shorter than the maximum input length and short enough for your GPU RAM."
|
487 |
+
]
|
488 |
+
},
|
489 |
+
{
|
490 |
+
"cell_type": "code",
|
491 |
+
"execution_count": 17,
|
492 |
+
"metadata": {},
|
493 |
+
"outputs": [],
|
494 |
+
"source": [
|
495 |
+
"block_size = 1048\n",
|
496 |
+
"\n",
|
497 |
+
"\n",
|
498 |
+
"def group_texts(examples):\n",
|
499 |
+
" # Concatenate all texts.\n",
|
500 |
+
" concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}\n",
|
501 |
+
" total_length = len(concatenated_examples[list(examples.keys())[0]])\n",
|
502 |
+
" # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can\n",
|
503 |
+
" # customize this part to your needs.\n",
|
504 |
+
" if total_length >= block_size:\n",
|
505 |
+
" total_length = (total_length // block_size) * block_size\n",
|
506 |
+
" # Split by chunks of block_size.\n",
|
507 |
+
" result = {\n",
|
508 |
+
" k: [t[i : i + block_size] for i in range(0, total_length, block_size)]\n",
|
509 |
+
" for k, t in concatenated_examples.items()\n",
|
510 |
+
" }\n",
|
511 |
+
" result[\"labels\"] = result[\"input_ids\"].copy()\n",
|
512 |
+
" return result"
|
513 |
+
]
|
514 |
+
},
|
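A quick worked example of the chunking arithmetic: with `block_size = 1024`, a batch whose concatenated length is 2,500 tokens gives `total_length = (2500 // 1024) * 1024 = 2048`, i.e. two full blocks, with the trailing 452 tokens dropped. A standalone check of just that logic:

```python
block_size = 1024
total_length = 2500
usable = (total_length // block_size) * block_size
chunks = [(i, i + block_size) for i in range(0, usable, block_size)]
print(usable, chunks)  # 2048 [(0, 1024), (1024, 2048)]
```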
515 |
+
{
|
516 |
+
"cell_type": "markdown",
|
517 |
+
"metadata": {},
|
518 |
+
"source": [
|
519 |
+
"Apply the `group_texts` function over the entire dataset:"
|
520 |
+
]
|
521 |
+
},
|
522 |
+
{
|
523 |
+
"cell_type": "code",
|
524 |
+
"execution_count": 30,
|
525 |
+
"metadata": {},
|
526 |
+
"outputs": [],
|
527 |
+
"source": [
|
528 |
+
"lm_dataset = tokenized_Falcon.map(group_texts, batched=True, num_proc=4)"
|
529 |
+
]
|
530 |
+
},
|
531 |
+
{
|
532 |
+
"cell_type": "markdown",
|
533 |
+
"metadata": {},
|
534 |
+
"source": [
|
535 |
+
"Now create a batch of examples using [DataCollatorForLanguageModeling](https://huggingface.co/docs/transformers/main/en/main_classes/data_collator#transformers.DataCollatorForLanguageModeling). It's more efficient to *dynamically pad* the\n",
|
536 |
+
"sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length.\n",
|
537 |
+
"\n",
|
538 |
+
"Use the end-of-sequence token as the padding token and set `mlm=False`. This will use the inputs as labels shifted to the right by one element:"
|
539 |
+
]
|
540 |
+
},
|
541 |
+
{
|
542 |
+
"cell_type": "code",
|
543 |
+
"execution_count": 29,
|
544 |
+
"metadata": {},
|
545 |
+
"outputs": [],
|
546 |
+
"source": [
|
547 |
+
"from transformers import DataCollatorForLanguageModeling\n",
|
548 |
+
"\n",
|
549 |
+
"tokenizer.pad_token = tokenizer.eos_token\n",
|
550 |
+
"data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)"
|
551 |
+
]
|
552 |
+
},
|
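A small sketch of what the collator produces for a toy batch (the id values are illustrative; padding uses the EOS token set above):

```python
features = [{"input_ids": [5, 6, 7]}, {"input_ids": [8, 9]}]
batch = data_collator(features)
print(batch["input_ids"].shape)  # torch.Size([2, 3]) after padding
print(batch["labels"])           # input_ids, with padded positions set to -100
```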
553 |
+
{
|
554 |
+
"cell_type": "markdown",
|
555 |
+
"metadata": {},
|
556 |
+
"source": [
|
557 |
+
"## Train"
|
558 |
+
]
|
559 |
+
},
|
560 |
+
{
|
561 |
+
"cell_type": "markdown",
|
562 |
+
"metadata": {},
|
563 |
+
"source": [
|
564 |
+
"<Tip>\n",
|
565 |
+
"\n",
|
566 |
+
"If you aren't familiar with finetuning a model with the [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer), take a look at the [basic tutorial](https://huggingface.co/docs/transformers/main/en/tasks/../training#train-with-pytorch-trainer)!\n",
|
567 |
+
"\n",
|
568 |
+
"</Tip>\n",
|
569 |
+
"\n",
|
570 |
+
"You're ready to start training your model now! Load DistilGPT2 with [AutoModelForCausalLM](https://huggingface.co/docs/transformers/main/en/model_doc/auto#transformers.AutoModelForCausalLM):"
|
571 |
+
]
|
572 |
+
},
|
573 |
+
{
|
574 |
+
"cell_type": "code",
|
575 |
+
"execution_count": 20,
|
576 |
+
"metadata": {},
|
577 |
+
"outputs": [],
|
578 |
+
"source": [
|
579 |
+
"from transformers import AutoModelForCausalLM, TrainingArguments, Trainer\n",
|
580 |
+
"import torch\n",
|
581 |
+
"model = AutoModelForCausalLM.from_pretrained(\"rwh/tinytoo\", torch_dtype=torch.bfloat16) "
|
582 |
+
]
|
583 |
+
},
|
584 |
+
{
|
585 |
+
"cell_type": "markdown",
|
586 |
+
"metadata": {},
|
587 |
+
"source": [
|
588 |
+
"At this point, only three steps remain:\n",
|
589 |
+
"\n",
|
590 |
+
"1. Define your training hyperparameters in [TrainingArguments](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments). The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model).\n",
|
591 |
+
"2. Pass the training arguments to [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer) along with the model, datasets, and data collator.\n",
|
592 |
+
"3. Call [train()](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.train) to finetune your model."
|
593 |
+
]
|
594 |
+
},
|
595 |
+
{
|
596 |
+
"cell_type": "code",
|
597 |
+
"execution_count": 40,
|
598 |
+
"metadata": {},
|
599 |
+
"outputs": [],
|
600 |
+
"source": [
|
601 |
+
"import torch\n",
|
602 |
+
"torch.cuda.empty_cache()"
|
603 |
+
]
|
604 |
+
},
|
605 |
+
{
|
606 |
+
"cell_type": "code",
|
607 |
+
"execution_count": 41,
|
608 |
+
"metadata": {},
|
609 |
+
"outputs": [],
|
610 |
+
"source": [
|
611 |
+
"import torch\n",
|
612 |
+
"import gc\n",
|
613 |
+
"\n",
|
614 |
+
"# del tensor_name # Delete the tensor\n",
|
615 |
+
"gc.collect() # Collect garbage\n",
|
616 |
+
"torch.cuda.empty_cache() # Clear cache"
|
617 |
+
]
|
618 |
+
},
|
619 |
+
{
|
620 |
+
"cell_type": "code",
|
621 |
+
"execution_count": 44,
|
622 |
+
"metadata": {},
|
623 |
+
"outputs": [],
|
624 |
+
"source": [
|
625 |
+
"torch.cuda.empty_cache()"
|
626 |
+
]
|
627 |
+
},
|
628 |
+
{
|
629 |
+
"cell_type": "code",
|
630 |
+
"execution_count": 45,
|
631 |
+
"metadata": {},
|
632 |
+
"outputs": [
|
633 |
+
{
|
634 |
+
"data": {
|
635 |
+
"text/plain": [
|
636 |
+
"<torch.autograd.grad_mode.no_grad at 0x7f0a24519350>"
|
637 |
+
]
|
638 |
+
},
|
639 |
+
"execution_count": 45,
|
640 |
+
"metadata": {},
|
641 |
+
"output_type": "execute_result"
|
642 |
+
}
|
643 |
+
],
|
644 |
+
"source": [
|
645 |
+
"torch.no_grad()"
|
646 |
+
]
|
647 |
+
},
|
648 |
+
{
|
649 |
+
"cell_type": "code",
|
650 |
+
"execution_count": 25,
|
651 |
+
"metadata": {},
|
652 |
+
"outputs": [
|
653 |
+
{
|
654 |
+
"data": {
|
655 |
+
"text/plain": [
|
656 |
+
"LlamaForCausalLM(\n",
|
657 |
+
" (model): LlamaModel(\n",
|
658 |
+
" (embed_tokens): Embedding(50257, 1408)\n",
|
659 |
+
" (layers): ModuleList(\n",
|
660 |
+
" (0-23): 24 x LlamaDecoderLayer(\n",
|
661 |
+
" (self_attn): LlamaSdpaAttention(\n",
|
662 |
+
" (q_proj): Linear(in_features=1408, out_features=1408, bias=False)\n",
|
663 |
+
" (k_proj): Linear(in_features=1408, out_features=1408, bias=False)\n",
|
664 |
+
" (v_proj): Linear(in_features=1408, out_features=1408, bias=False)\n",
|
665 |
+
" (o_proj): Linear(in_features=1408, out_features=1408, bias=False)\n",
|
666 |
+
" (rotary_emb): LlamaRotaryEmbedding()\n",
|
667 |
+
" )\n",
|
668 |
+
" (mlp): LlamaMLP(\n",
|
669 |
+
" (gate_proj): Linear(in_features=1408, out_features=4340, bias=False)\n",
|
670 |
+
" (up_proj): Linear(in_features=1408, out_features=4340, bias=False)\n",
|
671 |
+
" (down_proj): Linear(in_features=4340, out_features=1408, bias=False)\n",
|
672 |
+
" (act_fn): SiLU()\n",
|
673 |
+
" )\n",
|
674 |
+
" (input_layernorm): LlamaRMSNorm((1408,), eps=1e-05)\n",
|
675 |
+
" (post_attention_layernorm): LlamaRMSNorm((1408,), eps=1e-05)\n",
|
676 |
+
" )\n",
|
677 |
+
" )\n",
|
678 |
+
" (norm): LlamaRMSNorm((1408,), eps=1e-05)\n",
|
679 |
+
" (rotary_emb): LlamaRotaryEmbedding()\n",
|
680 |
+
" )\n",
|
681 |
+
" (lm_head): Linear(in_features=1408, out_features=50257, bias=False)\n",
|
682 |
+
")"
|
683 |
+
]
|
684 |
+
},
|
685 |
+
"execution_count": 25,
|
686 |
+
"metadata": {},
|
687 |
+
"output_type": "execute_result"
|
688 |
+
}
|
689 |
+
],
|
690 |
+
"source": [
|
691 |
+
"model.to('cuda')"
|
692 |
+
]
|
693 |
+
},
|
694 |
+
{
|
695 |
+
"cell_type": "code",
|
696 |
+
"execution_count": 31,
|
697 |
+
"metadata": {},
|
698 |
+
"outputs": [
|
699 |
+
{
|
700 |
+
"name": "stderr",
|
701 |
+
"output_type": "stream",
|
702 |
+
"text": [
|
703 |
+
"/usr/local/lib/python3.11/dist-packages/transformers/training_args.py:1541: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead\n",
|
704 |
+
" warnings.warn(\n"
|
705 |
+
]
|
706 |
+
}
|
707 |
+
],
|
708 |
+
"source": [
|
709 |
+
"training_args = TrainingArguments(\n",
|
710 |
+
" output_dir=\"Fine-Tuned-S9\",\n",
|
711 |
+
" bf16=True,\n",
|
712 |
+
" # evaluation_strategy=\"epoch\",\n",
|
713 |
+
" evaluation_strategy=\"steps\",\n",
|
714 |
+
" learning_rate=2e-5,\n",
|
715 |
+
" weight_decay=0.01,\n",
|
716 |
+
" num_train_epochs=1,\n",
|
717 |
+
" per_device_train_batch_size=2,\n",
|
718 |
+
" per_device_eval_batch_size=2,\n",
|
719 |
+
" # lr_scheduler_type = 'cosine',\n",
|
720 |
+
" push_to_hub=False,\n",
|
721 |
+
" save_total_limit = 2,\n",
|
722 |
+
" # save_strategy = “no”\n",
|
723 |
+
" load_best_model_at_end=False\n",
|
724 |
+
")\n",
|
725 |
+
"\n",
|
726 |
+
"trainer = Trainer(\n",
|
727 |
+
" model=model,\n",
|
728 |
+
" args=training_args,\n",
|
729 |
+
" train_dataset=lm_dataset[\"train\"],\n",
|
730 |
+
" eval_dataset=lm_dataset[\"validation\"],\n",
|
731 |
+
" # eval_dataset=lm_dataset[\"test\"],\n",
|
732 |
+
" data_collator=data_collator,\n",
|
733 |
+
")\n",
|
734 |
+
"\n",
|
735 |
+
"# trainer.train()"
|
736 |
+
]
|
737 |
+
},
|
738 |
+
{
|
739 |
+
"cell_type": "code",
|
740 |
+
"execution_count": null,
|
741 |
+
"metadata": {},
|
742 |
+
"outputs": [],
|
743 |
+
"source": [
|
744 |
+
"trainer.train()"
|
745 |
+
]
|
746 |
+
},
|
747 |
+
{
|
748 |
+
"cell_type": "markdown",
|
749 |
+
"metadata": {},
|
750 |
+
"source": [
|
751 |
+
"Once training is completed, use the [evaluate()](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.evaluate) method to evaluate your model and get its perplexity:"
|
752 |
+
]
|
753 |
+
},
|
754 |
+
{
|
755 |
+
"cell_type": "code",
|
756 |
+
"execution_count": null,
|
757 |
+
"metadata": {},
|
758 |
+
"outputs": [],
|
759 |
+
"source": [
|
760 |
+
"import math\n",
|
761 |
+
"\n",
|
762 |
+
"eval_results = trainer.evaluate()\n",
|
763 |
+
"print(f\"Perplexity: {math.exp(eval_results['eval_loss']):.2f}\")"
|
764 |
+
]
|
765 |
+
},
|
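For reference, the printed perplexity is simply the exponentiated mean token-level cross-entropy over the evaluation set:

```latex
\mathrm{PPL} = \exp\!\left(-\frac{1}{N}\sum_{i=1}^{N}\log p_\theta(x_i \mid x_{<i})\right)
```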
766 |
+
{
|
767 |
+
"cell_type": "markdown",
|
768 |
+
"metadata": {},
|
769 |
+
"source": [
|
770 |
+
"Then share your model to the Hub with the [push_to_hub()](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.push_to_hub) method so everyone can use your model:"
|
771 |
+
]
|
772 |
+
},
|
773 |
+
{
|
774 |
+
"cell_type": "code",
|
775 |
+
"execution_count": null,
|
776 |
+
"metadata": {},
|
777 |
+
"outputs": [],
|
778 |
+
"source": [
|
779 |
+
"# trainer.push_to_hub()"
|
780 |
+
]
|
781 |
+
},
|
782 |
+
{
|
783 |
+
"cell_type": "markdown",
|
784 |
+
"metadata": {},
|
785 |
+
"source": [
|
786 |
+
"<Tip>\n",
|
787 |
+
"\n",
|
788 |
+
"For a more in-depth example of how to finetune a model for causal language modeling, take a look at the corresponding\n",
|
789 |
+
"[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)\n",
|
790 |
+
"or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb).\n",
|
791 |
+
"\n",
|
792 |
+
"</Tip>"
|
793 |
+
]
|
794 |
+
},
|
795 |
+
{
|
796 |
+
"cell_type": "markdown",
|
797 |
+
"metadata": {},
|
798 |
+
"source": [
|
799 |
+
"## Inference"
|
800 |
+
]
|
801 |
+
},
|
802 |
+
{
|
803 |
+
"cell_type": "markdown",
|
804 |
+
"metadata": {},
|
805 |
+
"source": [
|
806 |
+
"Great, now that you've finetuned a model, you can use it for inference!\n",
|
807 |
+
"\n",
|
808 |
+
"Come up with a prompt you'd like to generate text from:"
|
809 |
+
]
|
810 |
+
},
|
811 |
+
{
|
812 |
+
"cell_type": "code",
|
813 |
+
"execution_count": null,
|
814 |
+
"metadata": {},
|
815 |
+
"outputs": [],
|
816 |
+
"source": [
|
817 |
+
"# prompt = \"Somatic hypermutation allows the immune system to\""
|
818 |
+
]
|
819 |
+
},
|
820 |
+
{
|
821 |
+
"cell_type": "markdown",
|
822 |
+
"metadata": {},
|
823 |
+
"source": [
|
824 |
+
"The simplest way to try out your finetuned model for inference is to use it in a [pipeline()](https://huggingface.co/docs/transformers/main/en/main_classes/pipelines#transformers.pipeline). Instantiate a `pipeline` for text generation with your model, and pass your text to it:"
|
825 |
+
]
|
826 |
+
},
|
827 |
+
{
|
828 |
+
"cell_type": "code",
|
829 |
+
"execution_count": null,
|
830 |
+
"metadata": {},
|
831 |
+
"outputs": [],
|
832 |
+
"source": [
|
833 |
+
"# from transformers import pipeline\n",
|
834 |
+
"# # checkpoint-4000\n",
|
835 |
+
"# generator = pipeline(\"text-generation\", model=\"Fine-Tuned-S9/checkpoint-4000\")\n",
|
836 |
+
"# generator(prompt)"
|
837 |
+
]
|
838 |
+
},
|
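An uncommented version of the same idea, for once training has produced a checkpoint (the path `Fine-Tuned-S9/checkpoint-4000` is only an example; use whatever `Trainer` actually saved under `output_dir`):

```python
from transformers import pipeline

prompt = "Somatic hypermutation allows the immune system to"
generator = pipeline("text-generation", model="Fine-Tuned-S9/checkpoint-4000")
print(generator(prompt)[0]["generated_text"])
```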
839 |
+
{
|
840 |
+
"cell_type": "markdown",
|
841 |
+
"metadata": {},
|
842 |
+
"source": [
|
843 |
+
"Tokenize the text and return the `input_ids` as PyTorch tensors:"
|
844 |
+
]
|
845 |
+
},
|
846 |
+
{
|
847 |
+
"cell_type": "code",
|
848 |
+
"execution_count": null,
|
849 |
+
"metadata": {},
|
850 |
+
"outputs": [],
|
851 |
+
"source": [
|
852 |
+
"# from transformers import AutoTokenizer\n",
|
853 |
+
"\n",
|
854 |
+
"# tokenizer = AutoTokenizer.from_pretrained(\"Xenova/gpt-4\")\n",
|
855 |
+
"# inputs = tokenizer(prompt, return_tensors=\"pt\").input_ids"
|
856 |
+
]
|
857 |
+
},
|
858 |
+
{
|
859 |
+
"cell_type": "markdown",
|
860 |
+
"metadata": {},
|
861 |
+
"source": [
|
862 |
+
"Use the [generate()](https://huggingface.co/docs/transformers/main/en/main_classes/text_generation#transformers.GenerationMixin.generate) method to generate text.\n",
|
863 |
+
"For more details about the different text generation strategies and parameters for controlling generation, check out the [Text generation strategies](https://huggingface.co/docs/transformers/main/en/tasks/../generation_strategies) page."
|
864 |
+
]
|
865 |
+
},
|
866 |
+
{
|
867 |
+
"cell_type": "code",
|
868 |
+
"execution_count": null,
|
869 |
+
"metadata": {},
|
870 |
+
"outputs": [],
|
871 |
+
"source": [
|
872 |
+
"# from transformers import AutoModelForCausalLM\n",
|
873 |
+
"\n",
|
874 |
+
"# model = AutoModelForCausalLM.from_pretrained(\"deepnet/SN6-BestLlama\")\n",
|
875 |
+
"# outputs = model.generate(inputs, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95)"
|
876 |
+
]
|
877 |
+
},
|
878 |
+
{
|
879 |
+
"cell_type": "markdown",
|
880 |
+
"metadata": {},
|
881 |
+
"source": [
|
882 |
+
"Decode the generated token ids back into text:"
|
883 |
+
]
|
884 |
+
},
|
885 |
+
{
|
886 |
+
"cell_type": "code",
|
887 |
+
"execution_count": null,
|
888 |
+
"metadata": {},
|
889 |
+
"outputs": [],
|
890 |
+
"source": [
|
891 |
+
"# tokenizer.batch_decode(outputs, skip_special_tokens=True)"
|
892 |
+
]
|
893 |
+
},
|
894 |
+
{
|
895 |
+
"cell_type": "code",
|
896 |
+
"execution_count": null,
|
897 |
+
"metadata": {},
|
898 |
+
"outputs": [],
|
899 |
+
"source": [
|
900 |
+
"# tokenizer.batch_decode(outputs, skip_special_tokens=True)"
|
901 |
+
]
|
902 |
+
},
|
903 |
+
{
|
904 |
+
"cell_type": "code",
|
905 |
+
"execution_count": null,
|
906 |
+
"metadata": {},
|
907 |
+
"outputs": [],
|
908 |
+
"source": []
|
909 |
+
}
|
910 |
+
],
|
911 |
+
"metadata": {
|
912 |
+
"kernelspec": {
|
913 |
+
"display_name": "Python 3 (ipykernel)",
|
914 |
+
"language": "python",
|
915 |
+
"name": "python3"
|
916 |
+
},
|
917 |
+
"language_info": {
|
918 |
+
"codemirror_mode": {
|
919 |
+
"name": "ipython",
|
920 |
+
"version": 3
|
921 |
+
},
|
922 |
+
"file_extension": ".py",
|
923 |
+
"mimetype": "text/x-python",
|
924 |
+
"name": "python",
|
925 |
+
"nbconvert_exporter": "python",
|
926 |
+
"pygments_lexer": "ipython3",
|
927 |
+
"version": "3.11.9"
|
928 |
+
}
|
929 |
+
},
|
930 |
+
"nbformat": 4,
|
931 |
+
"nbformat_minor": 4
|
932 |
+
}
|
language_modeling.py
ADDED
@@ -0,0 +1,187 @@
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
|
3 |
+
|
4 |
+
# Transformers installation
|
5 |
+
# ! pip install transformers datasets
|
6 |
+
# To install from source instead of the last release, comment the command above and uncomment the following one.
|
7 |
+
# ! pip install git+https://github.com/huggingface/transformers.git
|
8 |
+
|
9 |
+
|
10 |
+
|
11 |
+
# #@title
|
12 |
+
# from IPython.display import HTML
|
13 |
+
|
14 |
+
# HTML('<iframe width="560" height="315" src="https://www.youtube.com/embed/Vpjb1lu0MDk?rel=0&controls=0&showinfo=0" frameborder="0" allowfullscreen></iframe>')
|
15 |
+
|
16 |
+
|
17 |
+
|
18 |
+
# from huggingface_hub import notebook_login
|
19 |
+
|
20 |
+
# notebook_login()
|
21 |
+
|
22 |
+
|
23 |
+
|
24 |
+
# from datasets import load_dataset
|
25 |
+
|
26 |
+
# eli5 = load_dataset("eli5", split="train_asks[:5000]")
|
27 |
+
|
28 |
+
from datasets import load_dataset
|
29 |
+
# Falcon = load_dataset("csv", data_files="FalconData.csv")
|
30 |
+
Falcon = load_dataset('csv', data_files={"train": 'FalconData_train2.csv', "validation": 'FalconData_validation2.csv'})
|
31 |
+
|
32 |
+
print('Dataset Loaded!')
|
33 |
+
|
34 |
+
# Falcon = Falcon.train_test_split(test_size=0.10)
|
35 |
+
|
36 |
+
"""Then take a look at an example:"""
|
37 |
+
|
38 |
+
Falcon['train'][0]
|
39 |
+
|
40 |
+
Falcon['validation'][0]
|
41 |
+
|
42 |
+
|
43 |
+
|
44 |
+
# #@title
|
45 |
+
# from IPython.display import HTML
|
46 |
+
|
47 |
+
# HTML('<iframe width="560" height="315" src="https://www.youtube.com/embed/ma1TrR7gE7I?rel=0&controls=0&showinfo=0" frameborder="0" allowfullscreen></iframe>')
|
48 |
+
|
49 |
+
"""The next step is to load a DistilGPT2 tokenizer to process the `text` subfield:"""
|
50 |
+
|
51 |
+
from transformers import AutoTokenizer, GPT2TokenizerFast
|
52 |
+
|
53 |
+
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
|
54 |
+
|
55 |
+
|
56 |
+
# tokenizer = GPT2TokenizerFast.from_pretrained("Xenova/gpt-4")#, cache_dir=cache_dir)
|
57 |
+
# tokenizer.pad_token
|
58 |
+
|
59 |
+
# tokenizer.eos_token=128000
|
60 |
+
# tokenizer.bos_token='128000'
|
61 |
+
# tokenizer.eos_token='128001'
|
62 |
+
|
63 |
+
tokenizer.pad_token = tokenizer.eos_token
|
64 |
+
|
65 |
+
Falcon = Falcon.flatten()
|
66 |
+
Falcon["train"][0]
|
67 |
+
|
68 |
+
|
69 |
+
|
70 |
+
def preprocess_function(examples):
|
71 |
+
return tokenizer([" ".join(x) for x in examples["Text"]])
|
72 |
+
|
73 |
+
|
74 |
+
|
75 |
+
tokenized_Falcon = Falcon.map(
|
76 |
+
preprocess_function,
|
77 |
+
batched=True,
|
78 |
+
num_proc=4,
|
79 |
+
remove_columns=Falcon["train"].column_names,
|
80 |
+
)
|
81 |
+
|
82 |
+
|
83 |
+
block_size = tokenizer.model_max_length
|
84 |
+
# block_size = 2048
|
85 |
+
|
86 |
+
|
87 |
+
def group_texts(examples):
|
88 |
+
# Concatenate all texts.
|
89 |
+
concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
|
90 |
+
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
91 |
+
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
|
92 |
+
# customize this part to your needs.
|
93 |
+
if total_length >= block_size:
|
94 |
+
total_length = (total_length // block_size) * block_size
|
95 |
+
# Split by chunks of block_size.
|
96 |
+
result = {
|
97 |
+
k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
|
98 |
+
for k, t in concatenated_examples.items()
|
99 |
+
}
|
100 |
+
result["labels"] = result["input_ids"].copy()
|
101 |
+
return result
|
102 |
+
|
103 |
+
"""Apply the `group_texts` function over the entire dataset:"""
|
104 |
+
|
105 |
+
lm_dataset = tokenized_Falcon.map(group_texts, batched=True, num_proc=4)
|
106 |
+
|
107 |
+
|
108 |
+
|
109 |
+
from transformers import DataCollatorForLanguageModeling
|
110 |
+
|
111 |
+
# tokenizer.pad_token
|
112 |
+
# tokenizer.bos_token='128000'
|
113 |
+
# tokenizer.eos_token='128001'
|
114 |
+
|
115 |
+
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
|
116 |
+
|
117 |
+
|
118 |
+
|
119 |
+
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
|
120 |
+
import torch
|
121 |
+
model = AutoModelForCausalLM.from_pretrained("rwh/tiny8", torch_dtype=torch.bfloat16)
|
122 |
+
|
123 |
+
print('Model Loaded!')
|
124 |
+
|
125 |
+
# import torch
|
126 |
+
# torch.cuda.empty_cache()
|
127 |
+
|
128 |
+
# import torch
|
129 |
+
# import gc
|
130 |
+
|
131 |
+
# # del tensor_name # Delete the tensor
|
132 |
+
# gc.collect() # Collect garbage
|
133 |
+
# torch.cuda.empty_cache() # Clear cache
|
134 |
+
|
135 |
+
# torch.cuda.empty_cache()
|
136 |
+
|
137 |
+
# torch.no_grad()
|
138 |
+
|
139 |
+
model.to('cuda')
|
140 |
+
|
141 |
+
OutputDir = "C1ReadyModel"
|
142 |
+
|
143 |
+
training_args = TrainingArguments(
|
144 |
+
output_dir=OutputDir,
|
145 |
+
overwrite_output_dir=True,
|
146 |
+
bf16=True,
|
147 |
+
# evaluation_strategy="epoch",
|
148 |
+
evaluation_strategy="steps",
|
149 |
+
# learning_rate=3.25e-06,
|
150 |
+
# learning_rate=2e-5,
|
151 |
+
learning_rate=1e-5,
|
152 |
+
weight_decay=0.01,
|
153 |
+
# weight_decay=0.001,
|
154 |
+
num_train_epochs=6,
|
155 |
+
per_device_train_batch_size=8,
|
156 |
+
per_device_eval_batch_size=8,
|
157 |
+
# lr_scheduler_type = 'cosine',
|
158 |
+
lr_scheduler_type = 'linear',
|
159 |
+
push_to_hub=False,
|
160 |
+
save_total_limit = 2,
|
161 |
+
save_strategy = "steps",
|
162 |
+
load_best_model_at_end=True,
|
163 |
+
save_safetensors=True,
|
164 |
+
)
|
165 |
+
|
166 |
+
trainer = Trainer(
|
167 |
+
model=model,
|
168 |
+
args=training_args,
|
169 |
+
train_dataset=lm_dataset["train"],
|
170 |
+
eval_dataset=lm_dataset["validation"],
|
171 |
+
# eval_dataset=lm_dataset["test"],
|
172 |
+
data_collator=data_collator,
|
173 |
+
)
|
174 |
+
|
175 |
+
# trainer.train()
|
176 |
+
print('Started Training!')
|
177 |
+
trainer.train()
|
178 |
+
|
179 |
+
trainer.save_model(OutputDir)
|
180 |
+
print('Saved Model Path:', OutputDir)
|
181 |
+
|
182 |
+
import math
|
183 |
+
|
184 |
+
eval_results = trainer.evaluate()
|
185 |
+
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
|
186 |
+
|
187 |
+
|
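Once the script completes, the saved checkpoint can be reloaded from `OutputDir` for inference; a minimal sketch (the tokenizer is reloaded from `distilgpt2` because the script does not save it alongside the model):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("C1ReadyModel")
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token  # mirror the training setup
```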
short_gpt/.ipynb_checkpoints/short_hf-checkpoint.ipynb
ADDED
@@ -0,0 +1,1679 @@
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 1,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [
|
8 |
+
{
|
9 |
+
"name": "stdout",
|
10 |
+
"output_type": "stream",
|
11 |
+
"text": [
|
12 |
+
"Requirement already satisfied: datasets in /usr/local/lib/python3.10/dist-packages (2.19.1)\n",
|
13 |
+
"Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (2.1.1)\n",
|
14 |
+
"Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.40.2)\n",
|
15 |
+
"Requirement already satisfied: peft in /usr/local/lib/python3.10/dist-packages (0.10.0)\n",
|
16 |
+
"Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from datasets) (3.13.1)\n",
|
17 |
+
"Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from datasets) (1.26.2)\n",
|
18 |
+
"Requirement already satisfied: pyarrow>=12.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (16.0.0)\n",
|
19 |
+
"Requirement already satisfied: pyarrow-hotfix in /usr/local/lib/python3.10/dist-packages (from datasets) (0.6)\n",
|
20 |
+
"Requirement already satisfied: dill<0.3.9,>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.3.8)\n",
|
21 |
+
"Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (2.2.2)\n",
|
22 |
+
"Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (2.31.0)\n",
|
23 |
+
"Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (4.66.2)\n",
|
24 |
+
"Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets) (3.4.1)\n",
|
25 |
+
"Requirement already satisfied: multiprocess in /usr/local/lib/python3.10/dist-packages (from datasets) (0.70.16)\n",
|
26 |
+
"Requirement already satisfied: fsspec<=2024.3.1,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from fsspec[http]<=2024.3.1,>=2023.1.0->datasets) (2023.10.0)\n",
|
27 |
+
"Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.9.0b0)\n",
|
28 |
+
"Requirement already satisfied: huggingface-hub>=0.21.2 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.23.0)\n",
|
29 |
+
"Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from datasets) (23.2)\n",
|
30 |
+
"Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (6.0.1)\n",
|
31 |
+
"Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch) (4.8.0)\n",
|
32 |
+
"Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch) (1.12)\n",
|
33 |
+
"Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch) (3.2.1)\n",
|
34 |
+
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch) (3.1.2)\n",
|
35 |
+
"Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch) (12.1.105)\n",
|
36 |
+
"Requirement already satisfied: nvidia-cuda-runtime-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch) (12.1.105)\n",
|
37 |
+
"Requirement already satisfied: nvidia-cuda-cupti-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch) (12.1.105)\n",
|
38 |
+
"Requirement already satisfied: nvidia-cudnn-cu12==8.9.2.26 in /usr/local/lib/python3.10/dist-packages (from torch) (8.9.2.26)\n",
|
39 |
+
"Requirement already satisfied: nvidia-cublas-cu12==12.1.3.1 in /usr/local/lib/python3.10/dist-packages (from torch) (12.1.3.1)\n",
|
40 |
+
"Requirement already satisfied: nvidia-cufft-cu12==11.0.2.54 in /usr/local/lib/python3.10/dist-packages (from torch) (11.0.2.54)\n",
|
41 |
+
"Requirement already satisfied: nvidia-curand-cu12==10.3.2.106 in /usr/local/lib/python3.10/dist-packages (from torch) (10.3.2.106)\n",
|
42 |
+
"Requirement already satisfied: nvidia-cusolver-cu12==11.4.5.107 in /usr/local/lib/python3.10/dist-packages (from torch) (11.4.5.107)\n",
|
43 |
+
"Requirement already satisfied: nvidia-cusparse-cu12==12.1.0.106 in /usr/local/lib/python3.10/dist-packages (from torch) (12.1.0.106)\n",
|
44 |
+
"Requirement already satisfied: nvidia-nccl-cu12==2.18.1 in /usr/local/lib/python3.10/dist-packages (from torch) (2.18.1)\n",
|
45 |
+
"Requirement already satisfied: nvidia-nvtx-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch) (12.1.105)\n",
|
46 |
+
"Requirement already satisfied: triton==2.1.0 in /usr/local/lib/python3.10/dist-packages (from torch) (2.1.0)\n",
|
47 |
+
"Requirement already satisfied: nvidia-nvjitlink-cu12 in /usr/local/lib/python3.10/dist-packages (from nvidia-cusolver-cu12==11.4.5.107->torch) (12.3.101)\n",
|
48 |
+
"Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2024.4.28)\n",
|
49 |
+
"Requirement already satisfied: tokenizers<0.20,>=0.19 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.19.1)\n",
|
50 |
+
"Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.4.3)\n",
|
51 |
+
"Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from peft) (5.9.6)\n",
|
52 |
+
"Requirement already satisfied: accelerate>=0.21.0 in /usr/local/lib/python3.10/dist-packages (from peft) (0.30.0)\n",
|
53 |
+
"Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (23.1.0)\n",
|
54 |
+
"Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.0.5)\n",
|
55 |
+
"Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.9.4)\n",
|
56 |
+
"Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.4.1)\n",
|
57 |
+
"Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)\n",
|
58 |
+
"Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (4.0.3)\n",
|
59 |
+
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (3.3.2)\n",
|
60 |
+
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (3.6)\n",
|
61 |
+
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (2.1.0)\n",
|
62 |
+
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (2023.11.17)\n",
|
63 |
+
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch) (2.1.3)\n",
|
64 |
+
"Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2.8.2)\n",
|
65 |
+
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.1)\n",
|
66 |
+
"Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.1)\n",
|
67 |
+
"Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch) (1.3.0)\n",
|
68 |
+
"Requirement already satisfied: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)\n",
|
69 |
+
"\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
|
70 |
+
"\u001b[0mNote: you may need to restart the kernel to use updated packages.\n"
|
71 |
+
]
|
72 |
+
}
|
73 |
+
],
|
74 |
+
"source": [
|
75 |
+
"pip install datasets torch transformers peft"
|
76 |
+
]
|
77 |
+
},
|
78 |
+
{
|
79 |
+
"cell_type": "code",
|
80 |
+
"execution_count": 4,
|
81 |
+
"metadata": {},
|
82 |
+
"outputs": [],
|
83 |
+
"source": [
|
84 |
+
"from tqdm.notebook import tqdm\n",
|
85 |
+
"\n",
|
86 |
+
"from datasets import load_dataset\n",
|
87 |
+
"import torch\n",
|
88 |
+
"from torch.utils.data import DataLoader\n",
|
89 |
+
"\n",
|
90 |
+
"from peft import (\n",
|
91 |
+
" get_peft_model,\n",
|
92 |
+
" LoraConfig,\n",
|
93 |
+
" TaskType,\n",
|
94 |
+
")\n",
|
95 |
+
"from transformers import default_data_collator, Trainer, TrainingArguments\n",
|
96 |
+
"\n",
|
97 |
+
"from short_hf import ShortHFModel"
|
98 |
+
]
|
99 |
+
},
|
100 |
+
{
|
101 |
+
"cell_type": "markdown",
|
102 |
+
"metadata": {},
|
103 |
+
"source": [
|
104 |
+
"### Load Data"
|
105 |
+
]
|
106 |
+
},
|
107 |
+
{
|
108 |
+
"cell_type": "code",
|
109 |
+
"execution_count": null,
|
110 |
+
"metadata": {},
|
111 |
+
"outputs": [],
|
112 |
+
"source": [
|
113 |
+
"# data = load_dataset(\"pg19\", split=\"validation\") # authors sample 10,000 texts to compute block influences\n",
|
114 |
+
"# dataloader = DataLoader(\n",
|
115 |
+
"# data,\n",
|
116 |
+
"# batch_size=2,\n",
|
117 |
+
"# shuffle=True,\n",
|
118 |
+
"# )"
|
119 |
+
]
|
120 |
+
},
|
121 |
+
{
|
122 |
+
"cell_type": "code",
|
123 |
+
"execution_count": 5,
|
124 |
+
"metadata": {},
|
125 |
+
"outputs": [],
|
126 |
+
"source": [
|
127 |
+
"data = load_dataset(\"wikitext\", \"wikitext-103-raw-v1\", split=\"validation\") # authors sample 10,000 texts to compute block influences\n",
|
128 |
+
"dataloader = DataLoader(\n",
|
129 |
+
" data,\n",
|
130 |
+
" batch_size=1,\n",
|
131 |
+
" shuffle=True,\n",
|
132 |
+
")"
|
133 |
+
]
|
134 |
+
},
|
135 |
+
{
|
136 |
+
"cell_type": "markdown",
|
137 |
+
"metadata": {},
|
138 |
+
"source": [
|
139 |
+
"### Load Model"
|
140 |
+
]
|
141 |
+
},
|
142 |
+
{
|
143 |
+
"cell_type": "code",
|
144 |
+
"execution_count": 3,
|
145 |
+
"metadata": {},
|
146 |
+
"outputs": [],
|
147 |
+
"source": [
|
148 |
+
"# !huggingface-cli login\n",
|
149 |
+
"# pip install huggingface_hub\n",
|
150 |
+
"!python3 -c \"from huggingface_hub.hf_api import HfFolder; HfFolder.save_token('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')\""
|
151 |
+
]
|
152 |
+
},
|
153 |
+
{
|
154 |
+
"cell_type": "code",
|
155 |
+
"execution_count": null,
|
156 |
+
"metadata": {},
|
157 |
+
"outputs": [],
|
158 |
+
"source": [
|
159 |
+
"#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"
|
160 |
+
]
|
161 |
+
},
|
162 |
+
{
|
163 |
+
"cell_type": "code",
|
164 |
+
"execution_count": 3,
|
165 |
+
"metadata": {},
|
166 |
+
"outputs": [
|
167 |
+
{
|
168 |
+
"name": "stdout",
|
169 |
+
"output_type": "stream",
|
170 |
+
"text": [
|
171 |
+
"asifahmed\n"
|
172 |
+
]
|
173 |
+
}
|
174 |
+
],
|
175 |
+
"source": [
|
176 |
+
"!huggingface-cli whoami"
|
177 |
+
]
|
178 |
+
},
|
179 |
+
{
|
180 |
+
"cell_type": "code",
|
181 |
+
"execution_count": 2,
|
182 |
+
"metadata": {},
|
183 |
+
"outputs": [],
|
184 |
+
"source": [
|
185 |
+
"# pip install git+https://github.com/tri-ml/linear_open_lm.git\n"
|
186 |
+
]
|
187 |
+
},
|
188 |
+
{
|
189 |
+
"cell_type": "code",
|
190 |
+
"execution_count": 6,
|
191 |
+
"metadata": {},
|
192 |
+
"outputs": [
|
193 |
+
{
|
194 |
+
"name": "stderr",
|
195 |
+
"output_type": "stream",
|
196 |
+
"text": [
|
197 |
+
"/usr/local/lib/python3.10/dist-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
|
198 |
+
" warnings.warn(\n"
|
199 |
+
]
|
200 |
+
},
|
201 |
+
{
|
202 |
+
"data": {
|
203 |
+
"application/vnd.jupyter.widget-view+json": {
|
204 |
+
"model_id": "9fcf366ecc414808b39285438599f5b9",
|
205 |
+
"version_major": 2,
|
206 |
+
"version_minor": 0
|
207 |
+
},
|
208 |
+
"text/plain": [
|
209 |
+
"Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]"
|
210 |
+
]
|
211 |
+
},
|
212 |
+
"metadata": {},
|
213 |
+
"output_type": "display_data"
|
214 |
+
}
|
215 |
+
],
|
216 |
+
"source": [
|
217 |
+
"# from open_lm.open_lm_hf import *\n",
|
218 |
+
"\n",
|
219 |
+
"MAX_SEQ_LEN = 2048\n",
|
220 |
+
"short_model = ShortHFModel(\n",
|
221 |
+
" # model_name=\"tiiuae/falcon-7b\",\n",
|
222 |
+
" model_name=\"mistralai/Mistral-7B-v0.1\",\n",
|
223 |
+
" layers_path=\"model.layers\",\n",
|
224 |
+
" n_prune_layers=2\n",
|
225 |
+
")\n",
|
226 |
+
"# short_model.model"
|
227 |
+
]
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"cell_type": "code",
|
231 |
+
"execution_count": 7,
|
232 |
+
"metadata": {},
|
233 |
+
"outputs": [
|
234 |
+
{
|
235 |
+
"data": {
|
236 |
+
"text/plain": [
|
237 |
+
"MistralForCausalLM(\n",
|
238 |
+
" (model): MistralModel(\n",
|
239 |
+
" (embed_tokens): Embedding(32000, 4096)\n",
|
240 |
+
" (layers): ModuleList(\n",
|
241 |
+
" (0-31): 32 x MistralDecoderLayer(\n",
|
242 |
+
" (self_attn): MistralSdpaAttention(\n",
|
243 |
+
" (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n",
|
244 |
+
" (k_proj): Linear(in_features=4096, out_features=1024, bias=False)\n",
|
245 |
+
" (v_proj): Linear(in_features=4096, out_features=1024, bias=False)\n",
|
246 |
+
" (o_proj): Linear(in_features=4096, out_features=4096, bias=False)\n",
|
247 |
+
" (rotary_emb): MistralRotaryEmbedding()\n",
|
248 |
+
" )\n",
|
249 |
+
" (mlp): MistralMLP(\n",
|
250 |
+
" (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n",
|
251 |
+
" (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n",
|
252 |
+
" (down_proj): Linear(in_features=14336, out_features=4096, bias=False)\n",
|
253 |
+
" (act_fn): SiLU()\n",
|
254 |
+
" )\n",
|
255 |
+
" (input_layernorm): MistralRMSNorm()\n",
|
256 |
+
" (post_attention_layernorm): MistralRMSNorm()\n",
|
257 |
+
" )\n",
|
258 |
+
" )\n",
|
259 |
+
" (norm): MistralRMSNorm()\n",
|
260 |
+
" )\n",
|
261 |
+
" (lm_head): Linear(in_features=4096, out_features=32000, bias=False)\n",
|
262 |
+
")"
|
263 |
+
]
|
264 |
+
},
|
265 |
+
"execution_count": 7,
|
266 |
+
"metadata": {},
|
267 |
+
"output_type": "execute_result"
|
268 |
+
}
|
269 |
+
],
|
270 |
+
"source": [
|
271 |
+
"short_model.model"
|
272 |
+
]
|
273 |
+
},
|
274 |
+
{
|
275 |
+
"cell_type": "code",
|
276 |
+
"execution_count": null,
|
277 |
+
"metadata": {},
|
278 |
+
"outputs": [],
|
279 |
+
"source": [
|
280 |
+
"# AutoModelForCausalLM.from_pretrained(\n",
|
281 |
+
"# pretrained_model_name_or_path=model_dir,\n",
|
282 |
+
"# local_files_only=True,\n",
|
283 |
+
"# use_safetensors=True,\n",
|
284 |
+
"# torch_dtype=torch.bfloat16,\n",
|
285 |
+
"# )"
|
286 |
+
]
|
287 |
+
},
|
288 |
+
{
|
289 |
+
"cell_type": "code",
|
290 |
+
"execution_count": 8,
|
291 |
+
"metadata": {},
|
292 |
+
"outputs": [
|
293 |
+
{
|
294 |
+
"data": {
|
295 |
+
"text/plain": [
|
296 |
+
"<generator object Module.parameters at 0x7f00b3917840>"
|
297 |
+
]
|
298 |
+
},
|
299 |
+
"execution_count": 8,
|
300 |
+
"metadata": {},
|
301 |
+
"output_type": "execute_result"
|
302 |
+
}
|
303 |
+
],
|
304 |
+
"source": [
|
305 |
+
"short_model.model.parameters()"
|
306 |
+
]
|
307 |
+
},
|
308 |
+
{
|
309 |
+
"cell_type": "code",
|
310 |
+
"execution_count": 9,
|
311 |
+
"metadata": {},
|
312 |
+
"outputs": [
|
313 |
+
{
|
314 |
+
"data": {
|
315 |
+
"text/plain": [
|
316 |
+
"7241732096"
|
317 |
+
]
|
318 |
+
},
|
319 |
+
"execution_count": 9,
|
320 |
+
"metadata": {},
|
321 |
+
"output_type": "execute_result"
|
322 |
+
}
|
323 |
+
],
|
324 |
+
"source": [
|
325 |
+
"pytorch_total_params = sum(p.numel() for p in short_model.model.parameters())\n",
|
326 |
+
"pytorch_total_params"
|
327 |
+
]
|
328 |
+
},
|
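That count is roughly 7.24B parameters, so in bf16 (2 bytes per parameter) the weights alone occupy about 14.5 GB (≈ 13.5 GiB). A one-liner to reproduce the arithmetic:

```python
print(f"{pytorch_total_params / 1e9:.2f}B params, "
      f"{pytorch_total_params * 2 / 1024**3:.1f} GiB in bf16")
```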
329 |
+
{
|
330 |
+
"cell_type": "code",
|
331 |
+
"execution_count": 36,
|
332 |
+
"metadata": {},
|
333 |
+
"outputs": [],
|
334 |
+
"source": [
|
335 |
+
" # Save the model state to the specified path.\n",
|
336 |
+
"# model_dir='ShortModelSaved/'\n",
|
337 |
+
"# short_model.model.save_pretrained(\n",
|
338 |
+
"# save_directory=model_dir,\n",
|
339 |
+
"# safe_serialization=True,\n",
|
340 |
+
"# )"
|
341 |
+
]
|
342 |
+
},
|
343 |
+
{
|
344 |
+
"cell_type": "code",
|
345 |
+
"execution_count": 10,
|
346 |
+
"metadata": {},
|
347 |
+
"outputs": [
|
348 |
+
{
|
349 |
+
"data": {
|
350 |
+
"text/plain": [
|
351 |
+
"MistralDecoderLayer(\n",
|
352 |
+
" (self_attn): MistralSdpaAttention(\n",
|
353 |
+
" (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n",
|
354 |
+
" (k_proj): Linear(in_features=4096, out_features=1024, bias=False)\n",
|
355 |
+
" (v_proj): Linear(in_features=4096, out_features=1024, bias=False)\n",
|
356 |
+
" (o_proj): Linear(in_features=4096, out_features=4096, bias=False)\n",
|
357 |
+
" (rotary_emb): MistralRotaryEmbedding()\n",
|
358 |
+
" )\n",
|
359 |
+
" (mlp): MistralMLP(\n",
|
360 |
+
" (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n",
|
361 |
+
" (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n",
|
362 |
+
" (down_proj): Linear(in_features=14336, out_features=4096, bias=False)\n",
|
363 |
+
" (act_fn): SiLU()\n",
|
364 |
+
" )\n",
|
365 |
+
" (input_layernorm): MistralRMSNorm()\n",
|
366 |
+
" (post_attention_layernorm): MistralRMSNorm()\n",
|
367 |
+
")"
|
368 |
+
]
|
369 |
+
},
|
370 |
+
"execution_count": 10,
|
371 |
+
"metadata": {},
|
372 |
+
"output_type": "execute_result"
|
373 |
+
}
|
374 |
+
],
|
375 |
+
"source": [
|
376 |
+
"short_model.layers[0]"
|
377 |
+
]
|
378 |
+
},
|
379 |
+
{
|
380 |
+
"cell_type": "code",
|
381 |
+
"execution_count": 12,
|
382 |
+
"metadata": {},
|
383 |
+
"outputs": [
|
384 |
+
{
|
385 |
+
"name": "stderr",
|
386 |
+
"output_type": "stream",
|
387 |
+
"text": [
|
388 |
+
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
|
389 |
+
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n"
|
390 |
+
]
|
391 |
+
},
|
392 |
+
{
|
393 |
+
"data": {
|
394 |
+
"text/plain": [
|
395 |
+
"['I am an avid fan of 3D printing. I have been using 3D printers for over 10 years and have been involved in the development of several 3D printers. I have also been involved in the development of several 3D printing software packages.\\n\\nI have been involved in the development of several 3D printing software packages. I have also been involved in the development of several 3D printing software packages. I have also been involved in the development of several 3D printing software packages. I have also been involved in the development of several 3D printing software packages. I have also been involved in the development of several 3D printing software packages. I have also been involved in the development of several 3D printing software packages. I have also been involved in the development of several 3D printing software packages. I have also been involved in the development of several 3D printing software packages. I have also been involved in the development of several 3D printing software packages. I have also been involved in the development of several 3D printing software packages. I have also been involved in the development of several 3D printing software packages. I have also been involved in the development of several 3D printing software packages.']"
|
396 |
+
]
|
397 |
+
},
|
398 |
+
"execution_count": 12,
|
399 |
+
"metadata": {},
|
400 |
+
"output_type": "execute_result"
|
401 |
+
}
|
402 |
+
],
|
403 |
+
"source": [
|
404 |
+
"# sample generationThe evolution of AI has lead to \n",
|
405 |
+
"gen = short_model.model.generate(\n",
|
406 |
+
" short_model.tokenizer([\"I am an avid fan of \"], return_tensors='pt').input_ids.to(\"cuda\"),\n",
|
407 |
+
" max_new_tokens=256\n",
|
408 |
+
")\n",
|
409 |
+
"short_model.tokenizer.batch_decode(gen, skip_special_tokens=True)"
|
410 |
+
]
|
411 |
+
},
|
412 |
+
{
|
413 |
+
"cell_type": "code",
|
414 |
+
"execution_count": 2,
|
415 |
+
"metadata": {},
|
416 |
+
"outputs": [],
|
417 |
+
"source": [
|
418 |
+
"# # sample generation\n",
|
419 |
+
"# gen = short_model.model.generate(\n",
|
420 |
+
"# short_model.tokenizer([\"The evolution of AI has lead to \"], return_tensors='pt').input_ids.to(\"cuda\"),\n",
|
421 |
+
"# max_new_tokens=256\n",
|
422 |
+
"# )\n",
|
423 |
+
"# short_model.tokenizer.batch_decode(gen, skip_special_tokens=True)"
|
424 |
+
]
|
425 |
+
},
|
426 |
+
{
|
427 |
+
"cell_type": "markdown",
|
428 |
+
"metadata": {},
|
429 |
+
"source": [
|
430 |
+
"### Compute Importances"
|
431 |
+
]
|
432 |
+
},
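{
"cell_type": "markdown",
"metadata": {},
"source": [
"The commented-out loop below drives `eval_importance` over the dataloader. For reference, here is a minimal sketch of the cosine-based Block Influence score described in the ShortGPT paper; `block_influence_sketch` is a hypothetical name, and the metric actually applied is whatever `eval_importance` implements in the accompanying `short_gpt` code:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"import torch.nn.functional as F\n",
"\n",
"\n",
"def block_influence_sketch(hidden_in: torch.Tensor, hidden_out: torch.Tensor) -> torch.Tensor:\n",
"    # One minus the mean cosine similarity between a layer's input and output\n",
"    # hidden states (shape [batch, seq_len, hidden_dim]). A low score means the\n",
"    # layer barely transforms its input, making it a removal candidate.\n",
"    cos = F.cosine_similarity(hidden_in, hidden_out, dim=-1)\n",
"    return (1.0 - cos).mean()"
]
},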
|
433 |
+
{
|
434 |
+
"cell_type": "code",
|
435 |
+
"execution_count": 50,
|
436 |
+
"metadata": {},
|
437 |
+
"outputs": [],
|
438 |
+
"source": [
|
439 |
+
"# for i, batch in enumerate(tqdm(dataloader)):\n",
|
440 |
+
"# prompts = batch['text']\n",
|
441 |
+
"\n",
|
442 |
+
"# short_model.eval_importance(\n",
|
443 |
+
"# prompts=prompts,\n",
|
444 |
+
"# max_seq_len=MAX_SEQ_LEN,\n",
|
445 |
+
"# stride=256,\n",
|
446 |
+
"# max_gen_len=0\n",
|
447 |
+
"# )"
|
448 |
+
]
|
449 |
+
},
|
450 |
+
{
|
451 |
+
"cell_type": "code",
|
452 |
+
"execution_count": 51,
|
453 |
+
"metadata": {},
|
454 |
+
"outputs": [],
|
455 |
+
"source": [
|
456 |
+
"# short_model.importances"
|
457 |
+
]
|
458 |
+
},
|
459 |
+
{
|
460 |
+
"cell_type": "markdown",
|
461 |
+
"metadata": {},
|
462 |
+
"source": [
|
463 |
+
"### Remove unimportant layers\n",
|
464 |
+
"\n",
|
465 |
+
"Layers removed when using subset of pg19 val set: [25, 26, 24, 27, 22, 23, 28, 21, 29]\n",
|
466 |
+
"\n",
|
467 |
+
"Authors mention that the layer order is quite nuanced and can vary with different datasets. However, relative order suggests similar importance."
|
468 |
+
]
|
469 |
+
},
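{
"cell_type": "markdown",
"metadata": {},
"source": [
"Given per-layer scores, this removal path amounts to dropping the k lowest-scoring layers, which is how a removal list like the one above arises. A hedged sketch of that selection (`least_important_indices` is a hypothetical name, not the repo's API):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def least_important_indices(importances, k=9):\n",
"    # Indices of the k lowest importance scores, returned in ascending order.\n",
"    # NaN entries in the scores would need masking before this step.\n",
"    ranked = sorted(range(len(importances)), key=lambda i: importances[i])\n",
"    return sorted(ranked[:k])"
]
},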
|
470 |
+
{
|
471 |
+
"cell_type": "code",
|
472 |
+
"execution_count": 55,
|
473 |
+
"metadata": {},
|
474 |
+
"outputs": [],
|
475 |
+
"source": [
|
476 |
+
"# short_model.remove_layers()"
|
477 |
+
]
|
478 |
+
},
|
479 |
+
{
|
480 |
+
"cell_type": "code",
|
481 |
+
"execution_count": 54,
|
482 |
+
"metadata": {},
|
483 |
+
"outputs": [],
|
484 |
+
"source": [
|
485 |
+
"# short_model.remove_layers()"
|
486 |
+
]
|
487 |
+
},
|
488 |
+
{
|
489 |
+
"cell_type": "code",
|
490 |
+
"execution_count": 56,
|
491 |
+
"metadata": {},
|
492 |
+
"outputs": [],
|
493 |
+
"source": [
|
494 |
+
"# short_model.layers"
|
495 |
+
]
|
496 |
+
},
|
497 |
+
{
|
498 |
+
"cell_type": "code",
|
499 |
+
"execution_count": 48,
|
500 |
+
"metadata": {},
|
501 |
+
"outputs": [],
|
502 |
+
"source": [
|
503 |
+
"# # reassign layer_idx to attentions for caching\n",
|
504 |
+
"# for layer_idx, module in enumerate(short_model.layers):\n",
|
505 |
+
"# module.self_attn.layer_idx = layer_idx"
|
506 |
+
]
|
507 |
+
},
|
508 |
+
{
|
509 |
+
"cell_type": "code",
|
510 |
+
"execution_count": 20,
|
511 |
+
"metadata": {},
|
512 |
+
"outputs": [
|
513 |
+
{
|
514 |
+
"data": {
|
515 |
+
"text/plain": [
|
516 |
+
"<generator object Module.parameters at 0x7f625768a2d0>"
|
517 |
+
]
|
518 |
+
},
|
519 |
+
"execution_count": 20,
|
520 |
+
"metadata": {},
|
521 |
+
"output_type": "execute_result"
|
522 |
+
}
|
523 |
+
],
|
524 |
+
"source": [
|
525 |
+
"# short_model.model.parameters()"
|
526 |
+
]
|
527 |
+
},
|
528 |
+
{
|
529 |
+
"cell_type": "code",
|
530 |
+
"execution_count": 68,
|
531 |
+
"metadata": {},
|
532 |
+
"outputs": [
|
533 |
+
{
|
534 |
+
"data": {
|
535 |
+
"text/plain": [
|
536 |
+
"7241732096"
|
537 |
+
]
|
538 |
+
},
|
539 |
+
"execution_count": 68,
|
540 |
+
"metadata": {},
|
541 |
+
"output_type": "execute_result"
|
542 |
+
}
|
543 |
+
],
|
544 |
+
"source": [
|
545 |
+
"# pytorch_total_params = sum(p.numel() for p in short_model.model.parameters())\n",
|
546 |
+
"# pytorch_total_params"
|
547 |
+
]
|
548 |
+
},
|
549 |
+
{
|
550 |
+
"cell_type": "markdown",
|
551 |
+
"metadata": {},
|
552 |
+
"source": [
|
553 |
+
"As the paper states: \\\n",
|
554 |
+
" - \"Our experiments reveal that the effect of layer removal is significantly more pronounced on generative\n",
|
555 |
+
" tasks compared to multiple-choice tasks. On benchmarks such as GSM8K (Cobbe et al., 2021) and\n",
|
556 |
+
" HumanEval (Chen et al., 2021), removing 25% of the layers often leads to a severe performance\n",
|
557 |
+
" drop, with scores approaching zero.\""
|
558 |
+
]
|
559 |
+
},
|
560 |
+
{
|
561 |
+
"cell_type": "code",
|
562 |
+
"execution_count": 53,
|
563 |
+
"metadata": {},
|
564 |
+
"outputs": [],
|
565 |
+
"source": [
|
566 |
+
"# gen = short_model.model.generate(\n",
|
567 |
+
"# short_model.tokenizer([\"I am an avid fan of \"], return_tensors='pt').input_ids.to(\"cuda\"),\n",
|
568 |
+
"# max_new_tokens=20,\n",
|
569 |
+
"# use_cache=True\n",
|
570 |
+
"# )\n",
|
571 |
+
"# short_model.tokenizer.batch_decode(gen, skip_special_tokens=True)"
|
572 |
+
]
|
573 |
+
},
|
574 |
+
{
|
575 |
+
"cell_type": "code",
|
576 |
+
"execution_count": 52,
|
577 |
+
"metadata": {},
|
578 |
+
"outputs": [],
|
579 |
+
"source": [
|
580 |
+
"# gen = short_model.model.generate(I am an avid fan of \n",
|
581 |
+
"# short_model.tokenizer([\"The evolution of AI has lead to \"], return_tensors='pt').input_ids.to(\"cuda\"),\n",
|
582 |
+
"# max_new_tokens=20,\n",
|
583 |
+
"# use_cache=True\n",
|
584 |
+
"# )\n",
|
585 |
+
"# short_model.tokenizer.batch_decode(gen, skip_special_tokens=True)"
|
586 |
+
]
|
587 |
+
},
|
588 |
+
{
|
589 |
+
"cell_type": "markdown",
|
590 |
+
"metadata": {},
|
591 |
+
"source": [
|
592 |
+
"### Compute Angular Importances"
|
593 |
+
]
|
594 |
+
},
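{
"cell_type": "markdown",
"metadata": {},
"source": [
"For the `angular=True` path, the per-layer score is plausibly the arccos of the same cosine similarity, normalized by pi, as in the layer-pruning literature. A minimal sketch under that assumption (`angular_distance_sketch` is a hypothetical name):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"import torch.nn.functional as F\n",
"\n",
"\n",
"def angular_distance_sketch(hidden_in, hidden_out):\n",
"    # Angular distance in [0, 1]: arccos of the cosine similarity, divided by pi.\n",
"    cos = F.cosine_similarity(hidden_in, hidden_out, dim=-1).clamp(-1.0, 1.0)\n",
"    return (torch.arccos(cos) / torch.pi).mean()"
]
},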
|
595 |
+
{
|
596 |
+
"cell_type": "code",
|
597 |
+
"execution_count": 16,
|
598 |
+
"metadata": {},
|
599 |
+
"outputs": [
|
600 |
+
{
|
601 |
+
"data": {
|
602 |
+
"application/vnd.jupyter.widget-view+json": {
|
603 |
+
"model_id": "a6fd2bf4360b4aba801085bab0755a06",
|
604 |
+
"version_major": 2,
|
605 |
+
"version_minor": 0
|
606 |
+
},
|
607 |
+
"text/plain": [
|
608 |
+
" 0%| | 0/3760 [00:00<?, ?it/s]"
|
609 |
+
]
|
610 |
+
},
|
611 |
+
"metadata": {},
|
612 |
+
"output_type": "display_data"
|
613 |
+
}
|
614 |
+
],
|
615 |
+
"source": [
|
616 |
+
"for i, batch in enumerate(tqdm(dataloader)):\n",
|
617 |
+
" prompts = batch['text']\n",
|
618 |
+
"\n",
|
619 |
+
" short_model.eval_importance(\n",
|
620 |
+
" prompts=prompts,\n",
|
621 |
+
" max_seq_len=MAX_SEQ_LEN,\n",
|
622 |
+
" stride=256,\n",
|
623 |
+
" max_gen_len=0,\n",
|
624 |
+
" angular=True\n",
|
625 |
+
" )"
|
626 |
+
]
|
627 |
+
},
|
628 |
+
{
|
629 |
+
"cell_type": "code",
|
630 |
+
"execution_count": 17,
|
631 |
+
"metadata": {},
|
632 |
+
"outputs": [
|
633 |
+
{
|
634 |
+
"data": {
|
635 |
+
"text/plain": [
|
636 |
+
"[128390.1328125,\n",
|
637 |
+
" 80922.06787109375,\n",
|
638 |
+
" 61075.2890625,\n",
|
639 |
+
" nan,\n",
|
640 |
+
" nan,\n",
|
641 |
+
" 56557.81268310547,\n",
|
642 |
+
" nan,\n",
|
643 |
+
" 52294.552001953125,\n",
|
644 |
+
" 47928.185302734375,\n",
|
645 |
+
" 42335.215576171875,\n",
|
646 |
+
" 40547.564208984375,\n",
|
647 |
+
" 37178.684326171875,\n",
|
648 |
+
" 34713.912841796875,\n",
|
649 |
+
" 33843.728271484375,\n",
|
650 |
+
" 35384.353271484375,\n",
|
651 |
+
" 35603.388427734375,\n",
|
652 |
+
" 35621.970458984375,\n",
|
653 |
+
" 35356.719482421875,\n",
|
654 |
+
" 35365.243896484375,\n",
|
655 |
+
" 34914.025146484375,\n",
|
656 |
+
" 27854.576904296875,\n",
|
657 |
+
" 24398.073974609375,\n",
|
658 |
+
" 20450.390380859375,\n",
|
659 |
+
" 19501.300537109375,\n",
|
660 |
+
" 18430.427490234375,\n",
|
661 |
+
" 18231.873779296875,\n",
|
662 |
+
" 17917.493896484375,\n",
|
663 |
+
" 17806.815185546875,\n",
|
664 |
+
" 21227.195068359375,\n",
|
665 |
+
" 23928.313018798828,\n",
|
666 |
+
" 22738.702880859375,\n",
|
667 |
+
" 86123.783203125]"
|
668 |
+
]
|
669 |
+
},
|
670 |
+
"execution_count": 17,
|
671 |
+
"metadata": {},
|
672 |
+
"output_type": "execute_result"
|
673 |
+
}
|
674 |
+
],
|
675 |
+
"source": [
|
676 |
+
"short_model.importances"
|
677 |
+
]
|
678 |
+
},
|
679 |
+
{
|
680 |
+
"cell_type": "markdown",
|
681 |
+
"metadata": {},
|
682 |
+
"source": [
|
683 |
+
"### Remove unimportant layers"
|
684 |
+
]
|
685 |
+
},
|
686 |
+
{
|
687 |
+
"cell_type": "code",
|
688 |
+
"execution_count": 18,
|
689 |
+
"metadata": {},
|
690 |
+
"outputs": [
|
691 |
+
{
|
692 |
+
"data": {
|
693 |
+
"text/plain": [
|
694 |
+
"[27, 28]"
|
695 |
+
]
|
696 |
+
},
|
697 |
+
"execution_count": 18,
|
698 |
+
"metadata": {},
|
699 |
+
"output_type": "execute_result"
|
700 |
+
}
|
701 |
+
],
|
702 |
+
"source": [
|
703 |
+
"short_model.remove_layers(angular=True)"
|
704 |
+
]
|
705 |
+
},
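{
"cell_type": "markdown",
"metadata": {},
"source": [
"`remove_layers(angular=True)` reports the dropped indices ([27, 28] here), consistent with removing one contiguous block of layers. A hedged sketch of that selection and deletion; `remove_block_sketch` and `n_prune` are hypothetical names, not the repo's API:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch.nn as nn\n",
"\n",
"\n",
"def remove_block_sketch(layers: nn.ModuleList, block_scores, n_prune=2):\n",
"    # block_scores[i] is assumed to score the block starting at layer i, e.g.\n",
"    # the angular distance between the inputs of layers i and i + n_prune.\n",
"    start = min(range(len(block_scores)), key=lambda i: block_scores[i])\n",
"    kept = list(layers[:start]) + list(layers[start + n_prune:])\n",
"    return nn.ModuleList(kept), list(range(start, start + n_prune))"
]
},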
|
706 |
+
{
|
707 |
+
"cell_type": "code",
|
708 |
+
"execution_count": 20,
|
709 |
+
"metadata": {},
|
710 |
+
"outputs": [
|
711 |
+
{
|
712 |
+
"data": {
|
713 |
+
"text/plain": [
|
714 |
+
"MistralDecoderLayer(\n",
|
715 |
+
" (self_attn): MistralSdpaAttention(\n",
|
716 |
+
" (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n",
|
717 |
+
" (k_proj): Linear(in_features=4096, out_features=1024, bias=False)\n",
|
718 |
+
" (v_proj): Linear(in_features=4096, out_features=1024, bias=False)\n",
|
719 |
+
" (o_proj): Linear(in_features=4096, out_features=4096, bias=False)\n",
|
720 |
+
" (rotary_emb): MistralRotaryEmbedding()\n",
|
721 |
+
" )\n",
|
722 |
+
" (mlp): MistralMLP(\n",
|
723 |
+
" (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n",
|
724 |
+
" (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n",
|
725 |
+
" (down_proj): Linear(in_features=14336, out_features=4096, bias=False)\n",
|
726 |
+
" (act_fn): SiLU()\n",
|
727 |
+
" )\n",
|
728 |
+
" (input_layernorm): MistralRMSNorm()\n",
|
729 |
+
" (post_attention_layernorm): MistralRMSNorm()\n",
|
730 |
+
")"
|
731 |
+
]
|
732 |
+
},
|
733 |
+
"execution_count": 20,
|
734 |
+
"metadata": {},
|
735 |
+
"output_type": "execute_result"
|
736 |
+
}
|
737 |
+
],
|
738 |
+
"source": [
|
739 |
+
"short_model.layers[0]"
|
740 |
+
]
|
741 |
+
},
|
742 |
+
{
|
743 |
+
"cell_type": "code",
|
744 |
+
"execution_count": 21,
|
745 |
+
"metadata": {},
|
746 |
+
"outputs": [
|
747 |
+
{
|
748 |
+
"data": {
|
749 |
+
"text/plain": [
|
750 |
+
"ModuleList(\n",
|
751 |
+
" (0-29): 30 x MistralDecoderLayer(\n",
|
752 |
+
" (self_attn): MistralSdpaAttention(\n",
|
753 |
+
" (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n",
|
754 |
+
" (k_proj): Linear(in_features=4096, out_features=1024, bias=False)\n",
|
755 |
+
" (v_proj): Linear(in_features=4096, out_features=1024, bias=False)\n",
|
756 |
+
" (o_proj): Linear(in_features=4096, out_features=4096, bias=False)\n",
|
757 |
+
" (rotary_emb): MistralRotaryEmbedding()\n",
|
758 |
+
" )\n",
|
759 |
+
" (mlp): MistralMLP(\n",
|
760 |
+
" (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n",
|
761 |
+
" (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n",
|
762 |
+
" (down_proj): Linear(in_features=14336, out_features=4096, bias=False)\n",
|
763 |
+
" (act_fn): SiLU()\n",
|
764 |
+
" )\n",
|
765 |
+
" (input_layernorm): MistralRMSNorm()\n",
|
766 |
+
" (post_attention_layernorm): MistralRMSNorm()\n",
|
767 |
+
" )\n",
|
768 |
+
")"
|
769 |
+
]
|
770 |
+
},
|
771 |
+
"execution_count": 21,
|
772 |
+
"metadata": {},
|
773 |
+
"output_type": "execute_result"
|
774 |
+
}
|
775 |
+
],
|
776 |
+
"source": [
|
777 |
+
"short_model.layers"
|
778 |
+
]
|
779 |
+
},
|
780 |
+
{
|
781 |
+
"cell_type": "code",
|
782 |
+
"execution_count": 22,
|
783 |
+
"metadata": {},
|
784 |
+
"outputs": [],
|
785 |
+
"source": [
|
786 |
+
"# reassign layer_idx to attentions for caching\n",
|
787 |
+
"for layer_idx, module in enumerate(short_model.layers):\n",
|
788 |
+
" module.self_attn.layer_idx = layer_idx"
|
789 |
+
]
|
790 |
+
},
|
791 |
+
{
|
792 |
+
"cell_type": "code",
|
793 |
+
"execution_count": 23,
|
794 |
+
"metadata": {},
|
795 |
+
"outputs": [
|
796 |
+
{
|
797 |
+
"data": {
|
798 |
+
"text/plain": [
|
799 |
+
"ModuleList(\n",
|
800 |
+
" (0-29): 30 x MistralDecoderLayer(\n",
|
801 |
+
" (self_attn): MistralSdpaAttention(\n",
|
802 |
+
" (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n",
|
803 |
+
" (k_proj): Linear(in_features=4096, out_features=1024, bias=False)\n",
|
804 |
+
" (v_proj): Linear(in_features=4096, out_features=1024, bias=False)\n",
|
805 |
+
" (o_proj): Linear(in_features=4096, out_features=4096, bias=False)\n",
|
806 |
+
" (rotary_emb): MistralRotaryEmbedding()\n",
|
807 |
+
" )\n",
|
808 |
+
" (mlp): MistralMLP(\n",
|
809 |
+
" (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n",
|
810 |
+
" (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n",
|
811 |
+
" (down_proj): Linear(in_features=14336, out_features=4096, bias=False)\n",
|
812 |
+
" (act_fn): SiLU()\n",
|
813 |
+
" )\n",
|
814 |
+
" (input_layernorm): MistralRMSNorm()\n",
|
815 |
+
" (post_attention_layernorm): MistralRMSNorm()\n",
|
816 |
+
" )\n",
|
817 |
+
")"
|
818 |
+
]
|
819 |
+
},
|
820 |
+
"execution_count": 23,
|
821 |
+
"metadata": {},
|
822 |
+
"output_type": "execute_result"
|
823 |
+
}
|
824 |
+
],
|
825 |
+
"source": [
|
826 |
+
"short_model.layers"
|
827 |
+
]
|
828 |
+
},
|
829 |
+
{
|
830 |
+
"cell_type": "code",
|
831 |
+
"execution_count": 24,
|
832 |
+
"metadata": {},
|
833 |
+
"outputs": [
|
834 |
+
{
|
835 |
+
"name": "stderr",
|
836 |
+
"output_type": "stream",
|
837 |
+
"text": [
|
838 |
+
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
|
839 |
+
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n"
|
840 |
+
]
|
841 |
+
},
|
842 |
+
{
|
843 |
+
"data": {
|
844 |
+
"text/plain": [
|
845 |
+
"['I am an avid fan of 19th century American literature. I have read all of the classics, and I have also read many of the lesser known works. I have a particular interest in the works of Charles Dickens, and I have read all of his novels. I have also read many of the novels of other 19th century authors, such as Jane Austen, William Shakespeare, and William Blake.\\n\\nI have a particular interest in the works of Charles Dickens, and I have read all of his novels. I have also read many of the novels of other 19th century authors, such as Jane Austen, William Shakespeare, and William Blake.\\n\\nI have a particular interest in the works of Charles Dickens, and I have read all of his novels. I have also read many of the novels of other 19th century authors, such as Jane Austen, William Shakespeare, and William Blake.\\n\\nI have a particular interest in the works of Charles Dickens, and I have read all of his novels. I have also read many of the novels of other 19th century authors, such as Jane Austen, William Shakespeare, and William Blake.\\n\\nI have a particular interest in the works of Charles Dickens']"
|
846 |
+
]
|
847 |
+
},
|
848 |
+
"execution_count": 24,
|
849 |
+
"metadata": {},
|
850 |
+
"output_type": "execute_result"
|
851 |
+
}
|
852 |
+
],
|
853 |
+
"source": [
|
854 |
+
"gen = short_model.model.generate(\n",
|
855 |
+
" short_model.tokenizer([\"I am an avid fan of \"], return_tensors='pt').input_ids.to(\"cuda\"),\n",
|
856 |
+
" max_new_tokens=256,\n",
|
857 |
+
" use_cache=True\n",
|
858 |
+
")\n",
|
859 |
+
"short_model.tokenizer.batch_decode(gen, skip_special_tokens=True)"
|
860 |
+
]
|
861 |
+
},
|
862 |
+
{
|
863 |
+
"cell_type": "code",
|
864 |
+
"execution_count": 27,
|
865 |
+
"metadata": {},
|
866 |
+
"outputs": [
|
867 |
+
{
|
868 |
+
"name": "stderr",
|
869 |
+
"output_type": "stream",
|
870 |
+
"text": [
|
871 |
+
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
|
872 |
+
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n"
|
873 |
+
]
|
874 |
+
},
|
875 |
+
{
|
876 |
+
"data": {
|
877 |
+
"text/plain": [
|
878 |
+
"['The evolution of AI has lead to 3 major types of AI:\\n\\n1. Strong AI\\n2. Weak AI\\n3. Super AI\\n\\nStrong AI is the type of AI that is capable of performing any task that a human can perform. This type of AI is still in the development phase and is not yet available in the market.\\n\\nWeak AI is the type of AI that is capable of performing a specific task. This type of AI is available in the market and is used in a variety of applications.\\n\\nSuper AI is the type of AI that is capable of performing any task that a human can perform and is also capable of learning and adapting. This type of AI is still in the development phase and is not yet available in the market.\\n\\n## What is the difference between AI and AI?\\n\\nThe difference between AI and AI is that AI is a type of artificial intelligence that is capable of performing a specific task, while AI is a type of artificial intelligence that is capable of performing any task.\\n\\n## What is the difference between AI and AI?\\n\\nThe difference between AI and AI is that AI is a type of artificial intelligence that is capable of performing a specific task, while AI is a type of artificial intelligence that is capable']"
|
879 |
+
]
|
880 |
+
},
|
881 |
+
"execution_count": 27,
|
882 |
+
"metadata": {},
|
883 |
+
"output_type": "execute_result"
|
884 |
+
}
|
885 |
+
],
|
886 |
+
"source": [
|
887 |
+
"# gen = short_model.model.generate(I am an avid fan of \n",
|
888 |
+
"# short_model.tokenizer([\"The evolution of AI has lead to \"], return_tensors='pt').input_ids.to(\"cuda\"),\n",
|
889 |
+
"# max_new_tokens=256,\n",
|
890 |
+
"# use_cache=True\n",
|
891 |
+
"# )\n",
|
892 |
+
"# short_model.tokenizer.batch_decode(gen, skip_special_tokens=True)\n",
|
893 |
+
"\n",
|
894 |
+
"\n",
|
895 |
+
"gen = short_model.model.generate(\n",
|
896 |
+
" short_model.tokenizer([\"The evolution of AI has lead to \"], return_tensors='pt').input_ids.to(\"cuda\"),\n",
|
897 |
+
" max_new_tokens=256,\n",
|
898 |
+
" use_cache=True\n",
|
899 |
+
")\n",
|
900 |
+
"short_model.tokenizer.batch_decode(gen, skip_special_tokens=True)"
|
901 |
+
]
|
902 |
+
},
|
903 |
+
{
|
904 |
+
"cell_type": "code",
|
905 |
+
"execution_count": 28,
|
906 |
+
"metadata": {},
|
907 |
+
"outputs": [
|
908 |
+
{
|
909 |
+
"data": {
|
910 |
+
"text/plain": [
|
911 |
+
"6805508096"
|
912 |
+
]
|
913 |
+
},
|
914 |
+
"execution_count": 28,
|
915 |
+
"metadata": {},
|
916 |
+
"output_type": "execute_result"
|
917 |
+
}
|
918 |
+
],
|
919 |
+
"source": [
|
920 |
+
"pytorch_total_params = sum(p.numel() for p in short_model.model.parameters())\n",
|
921 |
+
"pytorch_total_params"
|
922 |
+
]
|
923 |
+
},
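{
"cell_type": "markdown",
"metadata": {},
"source": [
"Relative to the 7,241,732,096 parameters counted before pruning, dropping the two-layer block removes about 6% of the model:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Share of parameters removed by pruning the two-layer block.\n",
"full_params = 7241732096  # count recorded earlier in this notebook\n",
"print(f\"Removed {1 - pytorch_total_params / full_params:.1%} of the parameters\")"
]
},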
|
924 |
+
{
|
925 |
+
"cell_type": "code",
|
926 |
+
"execution_count": 35,
|
927 |
+
"metadata": {},
|
928 |
+
"outputs": [],
|
929 |
+
"source": [
|
930 |
+
" # Save the model state to the specified path.\n",
|
931 |
+
"model_dir='SmallModelSaved/'\n",
|
932 |
+
"short_model.model.save_pretrained(\n",
|
933 |
+
" save_directory=model_dir,\n",
|
934 |
+
" safe_serialization=True,\n",
|
935 |
+
" )"
|
936 |
+
]
|
937 |
+
},
|
938 |
+
{
|
939 |
+
"cell_type": "markdown",
|
940 |
+
"metadata": {},
|
941 |
+
"source": [
|
942 |
+
"### Model Healing"
|
943 |
+
]
|
944 |
+
},
|
945 |
+
{
|
946 |
+
"cell_type": "code",
|
947 |
+
"execution_count": 36,
|
948 |
+
"metadata": {},
|
949 |
+
"outputs": [],
|
950 |
+
"source": [
|
951 |
+
"# tokenizer = short_model.tokenizer\n",
|
952 |
+
"model = short_model.model"
|
953 |
+
]
|
954 |
+
},
|
955 |
+
{
|
956 |
+
"cell_type": "code",
|
957 |
+
"execution_count": 37,
|
958 |
+
"metadata": {},
|
959 |
+
"outputs": [
|
960 |
+
{
|
961 |
+
"name": "stdout",
|
962 |
+
"output_type": "stream",
|
963 |
+
"text": [
|
964 |
+
"Datset Loaded!\n"
|
965 |
+
]
|
966 |
+
}
|
967 |
+
],
|
968 |
+
"source": [
|
969 |
+
"from datasets import load_dataset\n",
|
970 |
+
"# Falcon = load_dataset(\"csv\", data_files=\"FalconData.csv\")\n",
|
971 |
+
"Falcon = load_dataset('csv', data_files={\"train\": 'FalconData2.csv', \"validation\": 'FalconDataEval2.csv'})\n",
|
972 |
+
"\n",
|
973 |
+
"print('Datset Loaded!')\n"
|
974 |
+
]
|
975 |
+
},
|
976 |
+
{
|
977 |
+
"cell_type": "code",
|
978 |
+
"execution_count": 38,
|
979 |
+
"metadata": {},
|
980 |
+
"outputs": [
|
981 |
+
{
|
982 |
+
"data": {
|
983 |
+
"text/plain": [
|
984 |
+
"{'Text': 'School Picture Gallery\\nFrance Ski School\\nChildren from Year 5 & 6 travelled to France from Newcastle airport to take part in a week of Ski School. The children had already spent 3 weeks learning the basics of skiing at Silksworth Ski School in Sunderland. When the children arrived in France they took part in a daily Ski School, during which the children made OUTSTANDING progress. The children also took part in French activities, explored local landmarks and took part in shopping activities in Chamonix. It was an incredible adventure for the children and staff!'}"
|
985 |
+
]
|
986 |
+
},
|
987 |
+
"execution_count": 38,
|
988 |
+
"metadata": {},
|
989 |
+
"output_type": "execute_result"
|
990 |
+
}
|
991 |
+
],
|
992 |
+
"source": [
|
993 |
+
"# Falcon = Falcon.train_test_split(test_size=0.10)\n",
|
994 |
+
"\n",
|
995 |
+
"\"\"\"Then take a look at an example:\"\"\"\n",
|
996 |
+
"\n",
|
997 |
+
"Falcon['train'][0]\n"
|
998 |
+
]
|
999 |
+
},
|
1000 |
+
{
|
1001 |
+
"cell_type": "code",
|
1002 |
+
"execution_count": 39,
|
1003 |
+
"metadata": {},
|
1004 |
+
"outputs": [
|
1005 |
+
{
|
1006 |
+
"data": {
|
1007 |
+
"text/plain": [
|
1008 |
+
"{'Text': 'Our Annual Garden Party is a fun-filled event with a ton of landscaping and garden supplies; gardening demonstrations, experts, and vendors; activities for kids; live bands; and local food. It’s been so popular that we’re extending it to TWO DAYS this year!\\nFestivities at 10am – 4pm Saturday and 11am – 3pm Sunday\\nShopping from 9am – 6pm both days\\nThroughout the winter, we collect gently-used and surplus lawn & garden supplies as well as outdoor décor and furniture. Then, we put it all out for your shopping pleasure! The sale begins at 9:00 am Saturday, but folks start lining up outside the gates even earlier, eager to dig through piles of flowerpots and shovels. (If you can’t get there in the morning, don’t worry – the staff continues to bring out items throughout the weekend.)\\nThe Garden Sale 1st.\\nThere will be prizes for people and pets dressed in garden party finery.\\nPhoto by Carrie Delesky\\nSo find yourself a dapper suit or fancy hat, and check out all the activities in store for you:\\nAnacostia Watershed Society\\nPrince George’s Chapter, Maryland Master Gardeners\\nMOM’s Organic Market\\nTreincarnation\\nVeteran Compost\\nPhoto by Carrie Delesky\\nSaturday the Forklift’s Matt Menke and Gary Barnhart of GL Barnhart Construction. Drop in for a while, or stay the whole.'}"
|
1009 |
+
]
|
1010 |
+
},
|
1011 |
+
"execution_count": 39,
|
1012 |
+
"metadata": {},
|
1013 |
+
"output_type": "execute_result"
|
1014 |
+
}
|
1015 |
+
],
|
1016 |
+
"source": [
|
1017 |
+
"Falcon['validation'][0]\n"
|
1018 |
+
]
|
1019 |
+
},
|
1020 |
+
{
|
1021 |
+
"cell_type": "code",
|
1022 |
+
"execution_count": 41,
|
1023 |
+
"metadata": {},
|
1024 |
+
"outputs": [
|
1025 |
+
{
|
1026 |
+
"name": "stderr",
|
1027 |
+
"output_type": "stream",
|
1028 |
+
"text": [
|
1029 |
+
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
|
1030 |
+
]
|
1031 |
+
}
|
1032 |
+
],
|
1033 |
+
"source": [
|
1034 |
+
"\"\"\"The next step is to load a DistilGPT2 tokenizer to process the `text` subfield:\"\"\"\n",
|
1035 |
+
"\n",
|
1036 |
+
"from transformers import AutoTokenizer, GPT2TokenizerFast\n",
|
1037 |
+
"\n",
|
1038 |
+
"# tokenizer = AutoTokenizer.from_pretrained(\"distilgpt2\")\n",
|
1039 |
+
"\n",
|
1040 |
+
"\n",
|
1041 |
+
"tokenizer = GPT2TokenizerFast.from_pretrained(\"Xenova/gpt-4\")#, cache_dir=cache_dir)\n",
|
1042 |
+
"tokenizer.pad_token = tokenizer.eos_token\n"
|
1043 |
+
]
|
1044 |
+
},
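{
"cell_type": "markdown",
"metadata": {},
"source": [
"One thing worth checking before training (an observation added as an assumption, not verified in this run): the Xenova/gpt-4 tokenizer has a vocabulary of roughly 100k tokens, while this pruned Mistral checkpoint's embedding matrix is only 32,000 wide, so any token id at or above 32,000 would raise an indexing error at train time:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Compare the tokenizer's vocabulary size with the model's embedding width.\n",
"print(len(tokenizer), model.get_input_embeddings().num_embeddings)\n",
"# If they disagree, one option is to grow the embedding matrix:\n",
"# model.resize_token_embeddings(len(tokenizer))"
]
},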
|
1045 |
+
{
|
1046 |
+
"cell_type": "code",
|
1047 |
+
"execution_count": 42,
|
1048 |
+
"metadata": {},
|
1049 |
+
"outputs": [
|
1050 |
+
{
|
1051 |
+
"data": {
|
1052 |
+
"text/plain": [
|
1053 |
+
"{'Text': 'School Picture Gallery\\nFrance Ski School\\nChildren from Year 5 & 6 travelled to France from Newcastle airport to take part in a week of Ski School. The children had already spent 3 weeks learning the basics of skiing at Silksworth Ski School in Sunderland. When the children arrived in France they took part in a daily Ski School, during which the children made OUTSTANDING progress. The children also took part in French activities, explored local landmarks and took part in shopping activities in Chamonix. It was an incredible adventure for the children and staff!'}"
|
1054 |
+
]
|
1055 |
+
},
|
1056 |
+
"execution_count": 42,
|
1057 |
+
"metadata": {},
|
1058 |
+
"output_type": "execute_result"
|
1059 |
+
}
|
1060 |
+
],
|
1061 |
+
"source": [
|
1062 |
+
"Falcon = Falcon.flatten()\n",
|
1063 |
+
"Falcon[\"train\"][0]"
|
1064 |
+
]
|
1065 |
+
},
|
1066 |
+
{
|
1067 |
+
"cell_type": "code",
|
1068 |
+
"execution_count": 43,
|
1069 |
+
"metadata": {},
|
1070 |
+
"outputs": [
|
1071 |
+
{
|
1072 |
+
"name": "stdout",
|
1073 |
+
"output_type": "stream",
|
1074 |
+
"text": [
|
1075 |
+
"The OrderedVocab you are attempting to save contains holes for indices [100256, 100261, 100262, 100263, 100266, 100267, 100268, 100269, 100270, 100271, 100272, 100273, 100274, 100275], your vocabulary could be corrupted !\n"
|
1076 |
+
]
|
1077 |
+
},
|
1078 |
+
{
|
1079 |
+
"data": {
|
1080 |
+
"application/vnd.jupyter.widget-view+json": {
|
1081 |
+
"model_id": "d2182d4fa561406ab7eb5fc6c19c6d17",
|
1082 |
+
"version_major": 2,
|
1083 |
+
"version_minor": 0
|
1084 |
+
},
|
1085 |
+
"text/plain": [
|
1086 |
+
"Map (num_proc=4): 0%| | 0/10000 [00:00<?, ? examples/s]"
|
1087 |
+
]
|
1088 |
+
},
|
1089 |
+
"metadata": {},
|
1090 |
+
"output_type": "display_data"
|
1091 |
+
},
|
1092 |
+
{
|
1093 |
+
"name": "stderr",
|
1094 |
+
"output_type": "stream",
|
1095 |
+
"text": [
|
1096 |
+
"Token indices sequence length is longer than the specified maximum sequence length for this model (10412 > 8192). Running this sequence through the model will result in indexing errors\n",
|
1097 |
+
"Token indices sequence length is longer than the specified maximum sequence length for this model (10738 > 8192). Running this sequence through the model will result in indexing errors\n",
|
1098 |
+
"Token indices sequence length is longer than the specified maximum sequence length for this model (12860 > 8192). Running this sequence through the model will result in indexing errors\n",
|
1099 |
+
"Token indices sequence length is longer than the specified maximum sequence length for this model (23091 > 8192). Running this sequence through the model will result in indexing errors\n"
|
1100 |
+
]
|
1101 |
+
},
|
1102 |
+
{
|
1103 |
+
"name": "stdout",
|
1104 |
+
"output_type": "stream",
|
1105 |
+
"text": [
|
1106 |
+
"The OrderedVocab you are attempting to save contains holes for indices [100256, 100261, 100262, 100263, 100266, 100267, 100268, 100269, 100270, 100271, 100272, 100273, 100274, 100275], your vocabulary could be corrupted !\n"
|
1107 |
+
]
|
1108 |
+
},
|
1109 |
+
{
|
1110 |
+
"data": {
|
1111 |
+
"application/vnd.jupyter.widget-view+json": {
|
1112 |
+
"model_id": "121ffe72baf143f4aeea4616bee88405",
|
1113 |
+
"version_major": 2,
|
1114 |
+
"version_minor": 0
|
1115 |
+
},
|
1116 |
+
"text/plain": [
|
1117 |
+
"Map (num_proc=4): 0%| | 0/1000 [00:00<?, ? examples/s]"
|
1118 |
+
]
|
1119 |
+
},
|
1120 |
+
"metadata": {},
|
1121 |
+
"output_type": "display_data"
|
1122 |
+
},
|
1123 |
+
{
|
1124 |
+
"name": "stderr",
|
1125 |
+
"output_type": "stream",
|
1126 |
+
"text": [
|
1127 |
+
"Token indices sequence length is longer than the specified maximum sequence length for this model (9078 > 8192). Running this sequence through the model will result in indexing errors\n",
|
1128 |
+
"Token indices sequence length is longer than the specified maximum sequence length for this model (15886 > 8192). Running this sequence through the model will result in indexing errors\n",
|
1129 |
+
"Token indices sequence length is longer than the specified maximum sequence length for this model (28727 > 8192). Running this sequence through the model will result in indexing errors\n",
|
1130 |
+
"Token indices sequence length is longer than the specified maximum sequence length for this model (8257 > 8192). Running this sequence through the model will result in indexing errors\n"
|
1131 |
+
]
|
1132 |
+
}
|
1133 |
+
],
|
1134 |
+
"source": [
|
1135 |
+
"def preprocess_function(examples):\n",
|
1136 |
+
" return tokenizer([\" \".join(x) for x in examples[\"Text\"]])\n",
|
1137 |
+
"\n",
|
1138 |
+
"\n",
|
1139 |
+
"\n",
|
1140 |
+
"tokenized_Falcon = Falcon.map(\n",
|
1141 |
+
" preprocess_function,\n",
|
1142 |
+
" batched=True,\n",
|
1143 |
+
" num_proc=4,\n",
|
1144 |
+
" remove_columns=Falcon[\"train\"].column_names,\n",
|
1145 |
+
")"
|
1146 |
+
]
|
1147 |
+
},
|
1148 |
+
{
|
1149 |
+
"cell_type": "code",
|
1150 |
+
"execution_count": 44,
|
1151 |
+
"metadata": {},
|
1152 |
+
"outputs": [
|
1153 |
+
{
|
1154 |
+
"data": {
|
1155 |
+
"application/vnd.jupyter.widget-view+json": {
|
1156 |
+
"model_id": "6d7b13436ae54624bd96973987373482",
|
1157 |
+
"version_major": 2,
|
1158 |
+
"version_minor": 0
|
1159 |
+
},
|
1160 |
+
"text/plain": [
|
1161 |
+
"Map (num_proc=4): 0%| | 0/10000 [00:00<?, ? examples/s]"
|
1162 |
+
]
|
1163 |
+
},
|
1164 |
+
"metadata": {},
|
1165 |
+
"output_type": "display_data"
|
1166 |
+
},
|
1167 |
+
{
|
1168 |
+
"data": {
|
1169 |
+
"application/vnd.jupyter.widget-view+json": {
|
1170 |
+
"model_id": "beade64b537441ef99a54830bb66eef2",
|
1171 |
+
"version_major": 2,
|
1172 |
+
"version_minor": 0
|
1173 |
+
},
|
1174 |
+
"text/plain": [
|
1175 |
+
"Map (num_proc=4): 0%| | 0/1000 [00:00<?, ? examples/s]"
|
1176 |
+
]
|
1177 |
+
},
|
1178 |
+
"metadata": {},
|
1179 |
+
"output_type": "display_data"
|
1180 |
+
}
|
1181 |
+
],
|
1182 |
+
"source": [
|
1183 |
+
"# block_size = tokenizer.model_max_length\n",
|
1184 |
+
"block_size = 2048\n",
|
1185 |
+
"\n",
|
1186 |
+
"\n",
|
1187 |
+
"def group_texts(examples):\n",
|
1188 |
+
" # Concatenate all texts.\n",
|
1189 |
+
" concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}\n",
|
1190 |
+
" total_length = len(concatenated_examples[list(examples.keys())[0]])\n",
|
1191 |
+
" # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can\n",
|
1192 |
+
" # customize this part to your needs.\n",
|
1193 |
+
" if total_length >= block_size:\n",
|
1194 |
+
" total_length = (total_length // block_size) * block_size\n",
|
1195 |
+
" # Split by chunks of block_size.\n",
|
1196 |
+
" result = {\n",
|
1197 |
+
" k: [t[i : i + block_size] for i in range(0, total_length, block_size)]\n",
|
1198 |
+
" for k, t in concatenated_examples.items()\n",
|
1199 |
+
" }\n",
|
1200 |
+
" result[\"labels\"] = result[\"input_ids\"].copy()\n",
|
1201 |
+
" return result\n",
|
1202 |
+
"\n",
|
1203 |
+
"\"\"\"Apply the `group_texts` function over the entire dataset:\"\"\"\n",
|
1204 |
+
"\n",
|
1205 |
+
"lm_dataset = tokenized_Falcon.map(group_texts, batched=True, num_proc=4)\n"
|
1206 |
+
]
|
1207 |
+
},
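{
"cell_type": "markdown",
"metadata": {},
"source": [
"The long-sequence warnings during tokenization are expected: tokenization happens before chunking, and `group_texts` then re-packs everything into `block_size`-token pieces. A toy check of the packing logic, with the block size set to 2 purely for illustration:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Toy check of the packing logic in group_texts (illustrative only).\n",
"toy = {\"input_ids\": [[1, 2, 3], [4, 5, 6, 7]]}\n",
"concatenated = sum(toy[\"input_ids\"], [])\n",
"total = (len(concatenated) // 2) * 2  # block size of 2 for the toy example\n",
"print([concatenated[i:i + 2] for i in range(0, total, 2)])\n",
"# -> [[1, 2], [3, 4], [5, 6]]; the trailing 7 is dropped, as in group_texts"
]
},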
|
1208 |
+
{
|
1209 |
+
"cell_type": "code",
|
1210 |
+
"execution_count": 45,
|
1211 |
+
"metadata": {},
|
1212 |
+
"outputs": [],
|
1213 |
+
"source": [
|
1214 |
+
"from transformers import DataCollatorForLanguageModeling\n",
|
1215 |
+
"\n",
|
1216 |
+
"# tokenizer.pad_token = tokenizer.eos_token\n",
|
1217 |
+
"data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)\n"
|
1218 |
+
]
|
1219 |
+
},
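{
"cell_type": "markdown",
"metadata": {},
"source": [
"With `mlm=False`, the collator builds `labels` from `input_ids` (masking pad positions with -100) and stacks the examples into tensors. One quick way to inspect a batch (not executed here):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Inspect the shapes of one collated batch of two packed examples.\n",
"batch = data_collator([lm_dataset[\"train\"][i] for i in range(2)])\n",
"print(batch[\"input_ids\"].shape, batch[\"labels\"].shape)"
]
},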
|
1220 |
+
{
|
1221 |
+
"cell_type": "code",
|
1222 |
+
"execution_count": null,
|
1223 |
+
"metadata": {},
|
1224 |
+
"outputs": [],
|
1225 |
+
"source": [
|
1226 |
+
"# from transformers import AutoModelForCausalLM, TrainingArguments, Trainer\n",
|
1227 |
+
"# import torch\n",
|
1228 |
+
"# model = AutoModelForCausalLM.from_pretrained(\"tensorplex-labs/pretraining-sn9-7B-5\", torch_dtype=torch.bfloat16)\n",
|
1229 |
+
"\n",
|
1230 |
+
"# print('Model Loaded!')\n"
|
1231 |
+
]
|
1232 |
+
},
|
1233 |
+
{
|
1234 |
+
"cell_type": "code",
|
1235 |
+
"execution_count": 46,
|
1236 |
+
"metadata": {},
|
1237 |
+
"outputs": [
|
1238 |
+
{
|
1239 |
+
"data": {
|
1240 |
+
"text/plain": [
|
1241 |
+
"MistralForCausalLM(\n",
|
1242 |
+
" (model): MistralModel(\n",
|
1243 |
+
" (embed_tokens): Embedding(32000, 4096)\n",
|
1244 |
+
" (layers): ModuleList(\n",
|
1245 |
+
" (0-29): 30 x MistralDecoderLayer(\n",
|
1246 |
+
" (self_attn): MistralSdpaAttention(\n",
|
1247 |
+
" (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n",
|
1248 |
+
" (k_proj): Linear(in_features=4096, out_features=1024, bias=False)\n",
|
1249 |
+
" (v_proj): Linear(in_features=4096, out_features=1024, bias=False)\n",
|
1250 |
+
" (o_proj): Linear(in_features=4096, out_features=4096, bias=False)\n",
|
1251 |
+
" (rotary_emb): MistralRotaryEmbedding()\n",
|
1252 |
+
" )\n",
|
1253 |
+
" (mlp): MistralMLP(\n",
|
1254 |
+
" (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n",
|
1255 |
+
" (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n",
|
1256 |
+
" (down_proj): Linear(in_features=14336, out_features=4096, bias=False)\n",
|
1257 |
+
" (act_fn): SiLU()\n",
|
1258 |
+
" )\n",
|
1259 |
+
" (input_layernorm): MistralRMSNorm()\n",
|
1260 |
+
" (post_attention_layernorm): MistralRMSNorm()\n",
|
1261 |
+
" )\n",
|
1262 |
+
" )\n",
|
1263 |
+
" (norm): MistralRMSNorm()\n",
|
1264 |
+
" )\n",
|
1265 |
+
" (lm_head): Linear(in_features=4096, out_features=32000, bias=False)\n",
|
1266 |
+
")"
|
1267 |
+
]
|
1268 |
+
},
|
1269 |
+
"execution_count": 46,
|
1270 |
+
"metadata": {},
|
1271 |
+
"output_type": "execute_result"
|
1272 |
+
}
|
1273 |
+
],
|
1274 |
+
"source": [
|
1275 |
+
"model.to('cuda')"
|
1276 |
+
]
|
1277 |
+
},
|
1278 |
+
{
|
1279 |
+
"cell_type": "code",
|
1280 |
+
"execution_count": 47,
|
1281 |
+
"metadata": {},
|
1282 |
+
"outputs": [
|
1283 |
+
{
|
1284 |
+
"data": {
|
1285 |
+
"text/plain": [
|
1286 |
+
"6805508096"
|
1287 |
+
]
|
1288 |
+
},
|
1289 |
+
"execution_count": 47,
|
1290 |
+
"metadata": {},
|
1291 |
+
"output_type": "execute_result"
|
1292 |
+
}
|
1293 |
+
],
|
1294 |
+
"source": [
|
1295 |
+
"pytorch_total_params = sum(p.numel() for p in model.parameters())\n",
|
1296 |
+
"pytorch_total_params"
|
1297 |
+
]
|
1298 |
+
},
|
1299 |
+
{
|
1300 |
+
"cell_type": "code",
|
1301 |
+
"execution_count": 48,
|
1302 |
+
"metadata": {},
|
1303 |
+
"outputs": [],
|
1304 |
+
"source": [
|
1305 |
+
"training_args = TrainingArguments(\n",
|
1306 |
+
" output_dir=\"Fine-Tuned-S9-2\",\n",
|
1307 |
+
" overwrite_output_dir=True,\n",
|
1308 |
+
" bf16=True,\n",
|
1309 |
+
" # evaluation_strategy=\"epoch\",\n",
|
1310 |
+
" evaluation_strategy=\"steps\",\n",
|
1311 |
+
" learning_rate=2e-5,\n",
|
1312 |
+
" weight_decay=0.01,\n",
|
1313 |
+
" num_train_epochs=1,\n",
|
1314 |
+
" per_device_train_batch_size=2,\n",
|
1315 |
+
" per_device_eval_batch_size=2,\n",
|
1316 |
+
" lr_scheduler_type = 'cosine',\n",
|
1317 |
+
" push_to_hub=False,\n",
|
1318 |
+
" save_total_limit = 2,\n",
|
1319 |
+
" # save_strategy = “no”\n",
|
1320 |
+
" load_best_model_at_end=False,\n",
|
1321 |
+
")\n",
|
1322 |
+
"\n",
|
1323 |
+
"trainer = Trainer(\n",
|
1324 |
+
" model=model,\n",
|
1325 |
+
" args=training_args,\n",
|
1326 |
+
" train_dataset=lm_dataset[\"train\"],\n",
|
1327 |
+
" eval_dataset=lm_dataset[\"validation\"],\n",
|
1328 |
+
" # eval_dataset=lm_dataset[\"test\"],\n",
|
1329 |
+
" data_collator=data_collator,\n",
|
1330 |
+
")"
|
1331 |
+
]
|
1332 |
+
},
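{
"cell_type": "markdown",
"metadata": {},
"source": [
"A standard follow-up once training finishes is to report validation perplexity as `exp(eval_loss)`; sketched below but not run, since the training cell that follows hit a CUDA out-of-memory error:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import math\n",
"\n",
"# eval_results = trainer.evaluate()\n",
"# print(f\"Perplexity: {math.exp(eval_results['eval_loss']):.2f}\")"
]
},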
|
1333 |
+
{
|
1334 |
+
"cell_type": "code",
|
1335 |
+
"execution_count": 49,
|
1336 |
+
"metadata": {},
|
1337 |
+
"outputs": [
|
1338 |
+
{
|
1339 |
+
"name": "stdout",
|
1340 |
+
"output_type": "stream",
|
1341 |
+
"text": [
|
1342 |
+
"Started Training!\n"
|
1343 |
+
]
|
1344 |
+
},
|
1345 |
+
{
|
1346 |
+
"name": "stderr",
|
1347 |
+
"output_type": "stream",
|
1348 |
+
"text": [
|
1349 |
+
"\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mthatmlguy\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n"
|
1350 |
+
]
|
1351 |
+
},
|
1352 |
+
{
|
1353 |
+
"data": {
|
1354 |
+
"text/html": [
|
1355 |
+
"Tracking run with wandb version 0.17.0"
|
1356 |
+
],
|
1357 |
+
"text/plain": [
|
1358 |
+
"<IPython.core.display.HTML object>"
|
1359 |
+
]
|
1360 |
+
},
|
1361 |
+
"metadata": {},
|
1362 |
+
"output_type": "display_data"
|
1363 |
+
},
|
1364 |
+
{
|
1365 |
+
"data": {
|
1366 |
+
"text/html": [
|
1367 |
+
"Run data is saved locally in <code>/workspace/ShortGPT/short_gpt/wandb/run-20240516_090043-ni1hktjg</code>"
|
1368 |
+
],
|
1369 |
+
"text/plain": [
|
1370 |
+
"<IPython.core.display.HTML object>"
|
1371 |
+
]
|
1372 |
+
},
|
1373 |
+
"metadata": {},
|
1374 |
+
"output_type": "display_data"
|
1375 |
+
},
|
1376 |
+
{
|
1377 |
+
"data": {
|
1378 |
+
"text/html": [
|
1379 |
+
"Syncing run <strong><a href='https://wandb.ai/thatmlguy/huggingface/runs/ni1hktjg' target=\"_blank\">misty-serenity-4</a></strong> to <a href='https://wandb.ai/thatmlguy/huggingface' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/run' target=\"_blank\">docs</a>)<br/>"
|
1380 |
+
],
|
1381 |
+
"text/plain": [
|
1382 |
+
"<IPython.core.display.HTML object>"
|
1383 |
+
]
|
1384 |
+
},
|
1385 |
+
"metadata": {},
|
1386 |
+
"output_type": "display_data"
|
1387 |
+
},
|
1388 |
+
{
|
1389 |
+
"data": {
|
1390 |
+
"text/html": [
|
1391 |
+
" View project at <a href='https://wandb.ai/thatmlguy/huggingface' target=\"_blank\">https://wandb.ai/thatmlguy/huggingface</a>"
|
1392 |
+
],
|
1393 |
+
"text/plain": [
|
1394 |
+
"<IPython.core.display.HTML object>"
|
1395 |
+
]
|
1396 |
+
},
|
1397 |
+
"metadata": {},
|
1398 |
+
"output_type": "display_data"
|
1399 |
+
},
|
1400 |
+
{
|
1401 |
+
"data": {
|
1402 |
+
"text/html": [
|
1403 |
+
" View run at <a href='https://wandb.ai/thatmlguy/huggingface/runs/ni1hktjg' target=\"_blank\">https://wandb.ai/thatmlguy/huggingface/runs/ni1hktjg</a>"
|
1404 |
+
],
|
1405 |
+
"text/plain": [
|
1406 |
+
"<IPython.core.display.HTML object>"
|
1407 |
+
]
|
1408 |
+
},
|
1409 |
+
"metadata": {},
|
1410 |
+
"output_type": "display_data"
|
1411 |
+
},
|
1412 |
+
{
|
1413 |
+
"data": {
|
1414 |
+
"text/html": [
|
1415 |
+
"\n",
|
1416 |
+
" <div>\n",
|
1417 |
+
" \n",
|
1418 |
+
" <progress value='2' max='6459' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
|
1419 |
+
" [ 2/6459 : < :, Epoch 0.00/1]\n",
|
1420 |
+
" </div>\n",
|
1421 |
+
" <table border=\"1\" class=\"dataframe\">\n",
|
1422 |
+
" <thead>\n",
|
1423 |
+
" <tr style=\"text-align: left;\">\n",
|
1424 |
+
" <th>Step</th>\n",
|
1425 |
+
" <th>Training Loss</th>\n",
|
1426 |
+
" <th>Validation Loss</th>\n",
|
1427 |
+
" </tr>\n",
|
1428 |
+
" </thead>\n",
|
1429 |
+
" <tbody>\n",
|
1430 |
+
" </tbody>\n",
|
1431 |
+
"</table><p>"
|
1432 |
+
],
|
1433 |
+
"text/plain": [
|
1434 |
+
"<IPython.core.display.HTML object>"
|
1435 |
+
]
|
1436 |
+
},
|
1437 |
+
"metadata": {},
|
1438 |
+
"output_type": "display_data"
|
1439 |
+
},
|
1440 |
+
{
|
1441 |
+
"ename": "OutOfMemoryError",
|
1442 |
+
"evalue": "CUDA out of memory. Tried to allocate 112.00 MiB. GPU ",
|
1443 |
+
"output_type": "error",
|
1444 |
+
"traceback": [
|
1445 |
+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
1446 |
+
"\u001b[0;31mOutOfMemoryError\u001b[0m Traceback (most recent call last)",
|
1447 |
+
"Cell \u001b[0;32mIn[49], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# trainer.train()\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mStarted Training!\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m----> 3\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
|
1448 |
+
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/trainer.py:1859\u001b[0m, in \u001b[0;36mTrainer.train\u001b[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[0m\n\u001b[1;32m 1857\u001b[0m hf_hub_utils\u001b[38;5;241m.\u001b[39menable_progress_bars()\n\u001b[1;32m 1858\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1859\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43minner_training_loop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1860\u001b[0m \u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1861\u001b[0m \u001b[43m \u001b[49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1862\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrial\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrial\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1863\u001b[0m \u001b[43m \u001b[49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1864\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n",
|
1449 |
+
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/trainer.py:2203\u001b[0m, in \u001b[0;36mTrainer._inner_training_loop\u001b[0;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[0m\n\u001b[1;32m 2200\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcallback_handler\u001b[38;5;241m.\u001b[39mon_step_begin(args, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol)\n\u001b[1;32m 2202\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maccelerator\u001b[38;5;241m.\u001b[39maccumulate(model):\n\u001b[0;32m-> 2203\u001b[0m tr_loss_step \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtraining_step\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2205\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 2206\u001b[0m args\u001b[38;5;241m.\u001b[39mlogging_nan_inf_filter\n\u001b[1;32m 2207\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_torch_xla_available()\n\u001b[1;32m 2208\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m (torch\u001b[38;5;241m.\u001b[39misnan(tr_loss_step) \u001b[38;5;129;01mor\u001b[39;00m torch\u001b[38;5;241m.\u001b[39misinf(tr_loss_step))\n\u001b[1;32m 2209\u001b[0m ):\n\u001b[1;32m 2210\u001b[0m \u001b[38;5;66;03m# if loss is nan or inf simply add the average of previous logged losses\u001b[39;00m\n\u001b[1;32m 2211\u001b[0m tr_loss \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m tr_loss \u001b[38;5;241m/\u001b[39m (\u001b[38;5;241m1\u001b[39m \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mglobal_step \u001b[38;5;241m-\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_globalstep_last_logged)\n",
|
1450 |
+
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/trainer.py:3138\u001b[0m, in \u001b[0;36mTrainer.training_step\u001b[0;34m(self, model, inputs)\u001b[0m\n\u001b[1;32m 3135\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m loss_mb\u001b[38;5;241m.\u001b[39mreduce_mean()\u001b[38;5;241m.\u001b[39mdetach()\u001b[38;5;241m.\u001b[39mto(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mdevice)\n\u001b[1;32m 3137\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcompute_loss_context_manager():\n\u001b[0;32m-> 3138\u001b[0m loss \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompute_loss\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3140\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mn_gpu \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 3141\u001b[0m loss \u001b[38;5;241m=\u001b[39m loss\u001b[38;5;241m.\u001b[39mmean() \u001b[38;5;66;03m# mean() to average on multi-gpu parallel training\u001b[39;00m\n",
|
1451 |
+
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/trainer.py:3161\u001b[0m, in \u001b[0;36mTrainer.compute_loss\u001b[0;34m(self, model, inputs, return_outputs)\u001b[0m\n\u001b[1;32m 3159\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 3160\u001b[0m labels \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 3161\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3162\u001b[0m \u001b[38;5;66;03m# Save past state if it exists\u001b[39;00m\n\u001b[1;32m 3163\u001b[0m \u001b[38;5;66;03m# TODO: this needs to be fixed and made cleaner later.\u001b[39;00m\n\u001b[1;32m 3164\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mpast_index \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m:\n",
|
1452 |
+
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1532\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1530\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1531\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1532\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
1453 |
+
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1541\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1536\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1537\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1538\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1539\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1540\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1541\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1543\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1544\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
|
1454 |
+
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/accelerate/utils/operations.py:822\u001b[0m, in \u001b[0;36mconvert_outputs_to_fp32.<locals>.forward\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 821\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m--> 822\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmodel_forward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
1455 |
+
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/accelerate/utils/operations.py:810\u001b[0m, in \u001b[0;36mConvertOutputsToFp32.__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 809\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m--> 810\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m convert_to_fp32(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel_forward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m)\n",
|
1456 |
+
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/amp/autocast_mode.py:16\u001b[0m, in \u001b[0;36mautocast_decorator.<locals>.decorate_autocast\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(func)\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecorate_autocast\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 15\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m autocast_instance:\n\u001b[0;32m---> 16\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
1457 |
+
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/models/mistral/modeling_mistral.py:1158\u001b[0m, in \u001b[0;36mMistralForCausalLM.forward\u001b[0;34m(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m 1155\u001b[0m return_dict \u001b[38;5;241m=\u001b[39m return_dict \u001b[38;5;28;01mif\u001b[39;00m return_dict \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39muse_return_dict\n\u001b[1;32m 1157\u001b[0m \u001b[38;5;66;03m# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)\u001b[39;00m\n\u001b[0;32m-> 1158\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1159\u001b[0m \u001b[43m \u001b[49m\u001b[43minput_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minput_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1160\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1161\u001b[0m \u001b[43m \u001b[49m\u001b[43mposition_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mposition_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1162\u001b[0m \u001b[43m \u001b[49m\u001b[43mpast_key_values\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpast_key_values\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1163\u001b[0m \u001b[43m \u001b[49m\u001b[43minputs_embeds\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs_embeds\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1164\u001b[0m \u001b[43m \u001b[49m\u001b[43muse_cache\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_cache\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1165\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1166\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1167\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_dict\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1168\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1170\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 1171\u001b[0m logits \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlm_head(hidden_states)\n",
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1532\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1530\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1531\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1532\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1541\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1536\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1537\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1538\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1539\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1540\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1541\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1543\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1544\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/models/mistral/modeling_mistral.py:1043\u001b[0m, in \u001b[0;36mMistralModel.forward\u001b[0;34m(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m 1033\u001b[0m layer_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_gradient_checkpointing_func(\n\u001b[1;32m 1034\u001b[0m decoder_layer\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__call__\u001b[39m,\n\u001b[1;32m 1035\u001b[0m hidden_states,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1040\u001b[0m use_cache,\n\u001b[1;32m 1041\u001b[0m )\n\u001b[1;32m 1042\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1043\u001b[0m layer_outputs \u001b[38;5;241m=\u001b[39m \u001b[43mdecoder_layer\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1044\u001b[0m \u001b[43m \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1045\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1046\u001b[0m \u001b[43m \u001b[49m\u001b[43mposition_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mposition_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1047\u001b[0m \u001b[43m \u001b[49m\u001b[43mpast_key_value\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpast_key_values\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1048\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1049\u001b[0m \u001b[43m \u001b[49m\u001b[43muse_cache\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_cache\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1050\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1052\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m layer_outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 1054\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m use_cache:\n",
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1532\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1530\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1531\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1532\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1541\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1536\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1537\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1538\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1539\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1540\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1541\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1543\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1544\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/models/mistral/modeling_mistral.py:770\u001b[0m, in \u001b[0;36mMistralDecoderLayer.forward\u001b[0;34m(self, hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache, **kwargs)\u001b[0m\n\u001b[1;32m 768\u001b[0m residual \u001b[38;5;241m=\u001b[39m hidden_states\n\u001b[1;32m 769\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpost_attention_layernorm(hidden_states)\n\u001b[0;32m--> 770\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmlp\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 771\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m residual \u001b[38;5;241m+\u001b[39m hidden_states\n\u001b[1;32m 773\u001b[0m outputs \u001b[38;5;241m=\u001b[39m (hidden_states,)\n",
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1532\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1530\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1531\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1532\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1541\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1536\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1537\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1538\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1539\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1540\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1541\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1543\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1544\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/models/mistral/modeling_mistral.py:179\u001b[0m, in \u001b[0;36mMistralMLP.forward\u001b[0;34m(self, x)\u001b[0m\n\u001b[1;32m 178\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, x):\n\u001b[0;32m--> 179\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdown_proj(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mact_fn(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgate_proj\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[43m)\u001b[49m) \u001b[38;5;241m*\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mup_proj(x))\n",
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1532\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1530\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1531\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1532\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1541\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1536\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1537\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1538\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1539\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1540\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1541\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1543\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1544\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/linear.py:116\u001b[0m, in \u001b[0;36mLinear.forward\u001b[0;34m(self, input)\u001b[0m\n\u001b[1;32m 115\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28minput\u001b[39m: Tensor) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tensor:\n\u001b[0;32m--> 116\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mF\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlinear\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbias\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[0;31mOutOfMemoryError\u001b[0m: CUDA out of memory. Tried to allocate 112.00 MiB. GPU "
]
}
],
"source": [
"# trainer.train()\n",
"print('Started Training!')\n",
"trainer.train()"
]
},
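{
"cell_type": "markdown",
"metadata": {},
"source": [
"The cell above crashed with a CUDA `OutOfMemoryError` inside the Mistral MLP forward pass. A minimal sketch of the standard mitigations, using stock `TrainingArguments` flags (the values below are illustrative assumptions, not settings from this run):\n",
"\n",
"```python\n",
"training_args = TrainingArguments(\n",
"    output_dir=\"tmp/\",\n",
"    per_device_train_batch_size=1,   # shrink the per-step batch\n",
"    gradient_accumulation_steps=8,   # recover the effective batch size\n",
"    gradient_checkpointing=True,     # trade compute for activation memory\n",
"    bf16=True,                       # half-precision activations and gradients\n",
")\n",
"```"
]
},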
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import math\n",
"\n",
"eval_results = trainer.evaluate()\n",
"print(f\"Perplexity: {math.exp(eval_results['eval_loss']):.2f}\")\n"
]
},
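{
"cell_type": "markdown",
"metadata": {},
"source": [
"`eval_loss` is the mean per-token cross-entropy, so perplexity is simply its exponential: an eval loss of 2.0, for example, corresponds to a perplexity of exp(2.0) ≈ 7.39."
]
},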
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"# # referencing https://github.com/meta-llama/llama-recipes/blob/main/recipes/finetuning/huggingface_trainer/peft_finetuning.ipynb\n",
"# eval_prompt = \"\"\"\n",
"# Summarize this dialog:\n",
"# A: Hi Tom, are you busy tomorrow's afternoon?\n",
"# B: I'm pretty sure I am. What's up?\n",
"# A: Can you go with me to the animal shelter?.\n",
"# B: What do you want to do?\n",
"# A: I want to get a puppy for my son.\n",
"# B: That will make him so happy.\n",
"# A: Yeah, we've discussed it many times. I think he's ready now.\n",
"# B: That's good. Raising a dog is a tough issue. Like having a baby ;-) \n",
"# A: I'll get him one of those little dogs.\n",
"# B: One that won't grow up too big;-)\n",
"# A: And eat too much;-))\n",
"# B: Do you know which one he would like?\n",
"# A: Oh, yes, I took him there last Monday. He showed me one that he really liked.\n",
"# B: I bet you had to drag him away.\n",
"# A: He wanted to take it home right away ;-).\n",
"# B: I wonder what he'll name it.\n",
"# A: He said he'd name it after his dead hamster - Lemmy - he's a great Motorhead fan :-)))\n",
"# ---\n",
"# Summary:\n",
"# \"\"\"\n",
"\n",
"# model_input = tokenizer(eval_prompt, return_tensors=\"pt\").to(\"cuda\")\n",
"\n",
"# model.eval()\n",
"# with torch.no_grad():\n",
"#     print(tokenizer.decode(model.generate(**model_input, max_new_tokens=100, use_cache=True)[0], skip_special_tokens=True))"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"# def get_preprocessed_samsum():\n",
"#     dataset = load_dataset(\"samsum\", split=\"train\")\n",
"\n",
"#     prompt = (\n",
"#         f\"Summarize this dialog:\\n{{dialog}}\\n---\\nSummary:\\n\"\n",
"#     )\n",
"\n",
"#     def apply_prompt_template(sample):\n",
"#         return {\n",
"#             \"prompt\": prompt.format(dialog=sample[\"dialogue\"]),\n",
"#             \"summary\": sample[\"summary\"],\n",
"#         }\n",
"\n",
"#     dataset = dataset.map(apply_prompt_template, remove_columns=list(dataset.features))\n",
"\n",
"#     def tokenize_add_label(sample):\n",
"#         prompt = tokenizer.encode(tokenizer.bos_token + sample[\"prompt\"], add_special_tokens=False)\n",
"#         summary = tokenizer.encode(sample[\"summary\"] + tokenizer.eos_token, add_special_tokens=False)\n",
"#         sample = {\n",
"#             \"input_ids\": prompt + summary,\n",
"#             \"attention_mask\" : [1] * (len(prompt) + len(summary)),\n",
"#             \"labels\": [-100] * len(prompt) + summary,\n",
"#         }\n",
"\n",
"#         return sample\n",
"\n",
"#     dataset = dataset.map(tokenize_add_label, remove_columns=list(dataset.features))\n",
"\n",
"#     return dataset"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"# model.train()\n",
"\n",
"# def create_peft_config(model):\n",
"#     peft_config = LoraConfig(\n",
"#         task_type=TaskType.CAUSAL_LM,\n",
"#         inference_mode=False,\n",
"#         r=8,\n",
"#         lora_alpha=32,\n",
"#         lora_dropout=0.05,\n",
"#         target_modules = [\"q_proj\", \"v_proj\"]\n",
"#     )\n",
"\n",
"#     model = get_peft_model(model, peft_config)\n",
"#     model.print_trainable_parameters()\n",
"#     return model, peft_config\n",
"\n",
"# # create peft config\n",
"# model, lora_config = create_peft_config(model)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"# output_dir = \"tmp/\"\n",
"\n",
"# config = {\n",
"#     'lora_config': lora_config,\n",
"#     'learning_rate': 1e-6,\n",
"#     'num_train_epochs': 1,\n",
"#     'per_device_train_batch_size': 1,\n",
"#     'gradient_checkpointing': False,\n",
"# }\n"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"# training_args = TrainingArguments(\n",
"#     output_dir=output_dir,\n",
"#     overwrite_output_dir=True,\n",
"#     # logging strategies\n",
"#     logging_strategy=\"steps\",\n",
"#     logging_steps=10,\n",
"#     save_strategy=\"no\",\n",
"#     optim=\"adamw_torch_fused\",\n",
"#     **{k:v for k,v in config.items() if k != 'lora_config'}\n",
"# )\n",
"\n",
"# # Create Trainer instance\n",
"# trainer = Trainer(\n",
"#     model=model,\n",
"#     args=training_args,\n",
"#     train_dataset=get_preprocessed_samsum(),\n",
"#     data_collator=default_data_collator,\n",
"#     callbacks=[],\n",
"# )\n",
"\n",
"# # Start training\n",
"# trainer.train()"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"# model.eval()\n",
"# with torch.no_grad():\n",
"#     print(tokenizer.decode(model.generate(**model_input, max_new_tokens=100)[0], skip_special_tokens=True))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
short_gpt/.ipynb_checkpoints/short_llama-checkpoint.py
ADDED
@@ -0,0 +1,219 @@
from typing import List, Optional

import numpy as np
import torch

from llama import Llama, Transformer

from metrics import *


def sample_top_p(probs: torch.Tensor, p: float):
    """
    Perform top-p (nucleus) sampling on a probability distribution.

    Args:
        probs (torch.Tensor): Probability distribution tensor.
        p (float): Probability threshold for top-p sampling.

    Returns:
        torch.Tensor: Sampled token indices.

    Note:
        Top-p sampling selects the smallest set of tokens whose cumulative probability mass
        exceeds the threshold p. The distribution is renormalized based on the selected tokens.
    """
    probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
    probs_sum = torch.cumsum(probs_sort, dim=-1)
    mask = probs_sum - probs_sort > p
    probs_sort[mask] = 0.0
    probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
    next_token = torch.multinomial(probs_sort, num_samples=1)
    next_token = torch.gather(probs_idx, -1, next_token)
    return next_token
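

# A minimal, illustrative check (the values are assumptions, not from the paper):
# with p=0.7 and the toy distribution below, only tokens 0 and 1 fall inside the
# nucleus (the cumulative mass before token 2 is already 0.8 > 0.7), so sampling
# can only return index 0 or 1.
def _demo_sample_top_p():
    probs = torch.tensor([[0.5, 0.3, 0.15, 0.05]])
    next_token = sample_top_p(probs, p=0.7)
    assert next_token.item() in (0, 1)
    return next_token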


class TransformerWrapper(Transformer):
    def __init__(self, model):
        self.__dict__ = model.__dict__.copy()

    @torch.inference_mode()
    def forward(
        self,
        tokens: torch.Tensor,
        start_pos: int,
        return_hiddens: Optional[bool] = False):
        """
        Perform a forward pass through the Transformer model.

        Args:
            tokens (torch.Tensor): Input token indices.
            start_pos (int): Starting position for attention caching.
            (Optional) return_hiddens (bool): Whether to return hidden states. Defaults to False.

        Returns:
            torch.Tensor: Output logits after applying the Transformer model.
            (Optional) List[torch.Tensor]: Hidden states for each transformer block.
        """
        _bsz, seqlen = tokens.shape
        h = self.tok_embeddings(tokens)
        self.freqs_cis = self.freqs_cis.to(h.device)
        freqs_cis = self.freqs_cis[start_pos : start_pos + seqlen]

        mask = None
        if seqlen > 1:
            mask = torch.full(
                (seqlen, seqlen), float("-inf"), device=tokens.device
            )

            mask = torch.triu(mask, diagonal=1)

            # When performing key-value caching, we compute the attention scores
            # only for the new sequence. Thus, the matrix of scores is of size
            # (seqlen, cache_len + seqlen), and the only masked entries are (i, j) for
            # j > cache_len + i, since row i corresponds to token cache_len + i.
            mask = torch.hstack([
                torch.zeros((seqlen, start_pos), device=tokens.device),
                mask
            ]).type_as(h)

        hiddens = [h]
        for layer in self.layers:
            h = layer(h, start_pos, freqs_cis, mask)
            if return_hiddens:
                hiddens.append(h)

        h = self.norm(h)
        output = self.output(h).float()

        if return_hiddens:
            return output, hiddens

        return output


class ShortLlama():

    def __init__(self, llama: Llama, n_prune_layers: Optional[int] = None):
        checkpoint = llama.model.state_dict()
        llama.model = TransformerWrapper(llama.model)  # wrap transformer to collect hidden states
        llama.model.load_state_dict(checkpoint, strict=False)
        self.llama = llama

        self.n_prune_layers = n_prune_layers
        self.importances = [0 for _ in self.llama.model.layers]  # layer-wise importance scores

    def remove_layers(
        self,
        layers_to_remove: Optional[List[int]] = [],
        angular: Optional[bool] = False
    ):
        if angular:
            assert self.importances, "Need to compute importances with eval_importance()"
            assert self.n_prune_layers, "Need number of layers to prune, set `n_prune_layers`"
            # valid start layers are 0 .. L - n, i.e. the first L - n + 1 importance
            # entries (a len()-based slice, since `[:-n + 1]` breaks for n == 1)
            n_candidates = len(self.importances) - self.n_prune_layers + 1
            start_layer = np.argsort(np.array(self.importances[:n_candidates]))[0]
            layers_to_remove = list(range(start_layer, start_layer + self.n_prune_layers))
        elif not layers_to_remove and self.n_prune_layers:
            assert self.importances, "Need to compute importances with eval_importance()"
            layers_to_remove = np.argsort(np.array(self.importances))[:self.n_prune_layers].tolist()

        # remove layers in reverse to avoid indexing errors
        for layer_idx in sorted(layers_to_remove, reverse=True):
            try:
                del self.llama.model.layers[layer_idx]
            except IndexError:
                print(f"layer {layer_idx} does not exist, function may have already been called")
                return []

        return layers_to_remove

    def compute_bi(self, hiddens: List[torch.Tensor], angular: bool):
        n = 1
        if angular:
            assert self.n_prune_layers is not None, "Set number of layers to prune to use angular importance"
            n = self.n_prune_layers

        for i in range(len(hiddens) - n):
            in_hidden = hiddens[i]
            out_hidden = hiddens[i+n]
            if angular:
                # use only last token for angular distance as described in section 3.2
                # https://arxiv.org/pdf/2403.17887.pdf
                in_hidden = in_hidden[:,-1:]
                out_hidden = out_hidden[:,-1:]

            self.importances[i] += block_influence(
                in_hidden,
                out_hidden,
                angular=angular
            ).sum().cpu().item()

    @torch.inference_mode()
    def eval_importance(
        self,
        prompt_tokens: List[List[int]],
        max_gen_len: Optional[int] = 0,
        temperature: Optional[float] = 0.6,
        top_p: Optional[float] = 0.9,
        angular: Optional[bool] = False
    ):
        """
        Computes layer-wise importances over input tokens.

        NOTE: The ShortGPT paper performs no generation during importance computation, which suggests `max_gen_len` = 0.

        Args:
            prompt_tokens (List[List[int]]): List of tokenized prompts, where each prompt is represented as a list of integers.
            (Optional) max_gen_len (int): Maximum length of the generated text sequence.
            (Optional) temperature (float): Temperature value for controlling randomness in sampling. Defaults to 0.6.
            (Optional) top_p (float): Top-p probability threshold for nucleus sampling. Defaults to 0.9.
            (Optional) angular (bool): Whether to use angular distance. Defaults to False.

        Returns:
            None
        """
        params = self.llama.model.params
        bsz = len(prompt_tokens)
        assert bsz <= params.max_batch_size, (bsz, params.max_batch_size)

        min_prompt_len = min(len(t) for t in prompt_tokens)
        max_prompt_len = max(len(t) for t in prompt_tokens)
        assert max_prompt_len <= params.max_seq_len
        total_len = min(params.max_seq_len, max_gen_len + max_prompt_len)

        pad_id = self.llama.tokenizer.pad_id
        tokens = torch.full((bsz, total_len), pad_id, dtype=torch.long, device="cuda")
        for k, t in enumerate(prompt_tokens):
            tokens[k, : len(t)] = torch.tensor(t, dtype=torch.long, device="cuda")

        prev_pos = 0
        eos_reached = torch.tensor([False] * bsz, device="cuda")
        input_text_mask = tokens != pad_id

        for cur_pos in range(min_prompt_len, total_len):
            logits = self.llama.model.forward(tokens[:, prev_pos:cur_pos], prev_pos)
            if temperature > 0:
                probs = torch.softmax(logits[:, -1] / temperature, dim=-1)
                next_token = sample_top_p(probs, top_p)
            else:
                next_token = torch.argmax(logits[:, -1], dim=-1)

            next_token = next_token.reshape(-1)
            # only replace token if prompt has already been generated
            next_token = torch.where(
                input_text_mask[:, cur_pos], tokens[:, cur_pos], next_token
            )
            tokens[:, cur_pos] = next_token
            eos_reached |= (~input_text_mask[:, cur_pos]) & (
                next_token == self.llama.tokenizer.eos_id
            )
            prev_pos = cur_pos
            if all(eos_reached):
                break

        # compute block influence over full sequences rather than at each token
        _, hiddens = self.llama.model.forward(tokens, 0, return_hiddens=True)
        self.compute_bi(hiddens, angular=angular)

        return
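

# A minimal end-to-end sketch of the intended workflow (the paths, prune count,
# and prompt source below are illustrative assumptions, not from this file):
#
#   llama = Llama.build(
#       ckpt_dir="llama-2-7b/",
#       tokenizer_path="tokenizer.model",
#       max_seq_len=2048,
#       max_batch_size=4,
#   )
#   short_llama = ShortLlama(llama, n_prune_layers=2)
#   for batch in tokenized_prompt_batches:  # e.g. samples drawn from PG19
#       short_llama.eval_importance(prompt_tokens=batch, max_gen_len=0)
#   removed = short_llama.remove_layers()   # drops the least important layers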
short_gpt/layer_removal.py
ADDED
@@ -0,0 +1,23 @@
from collections import OrderedDict

import torch.nn as nn


def layer_removal(
    model: nn.Module,
    layers_to_remove: OrderedDict
):
    """
    Generic removal implementation
    """

    for layer_name, layer_idx in layers_to_remove.items():
        modules = layer_name.split(".")
        mod = model
        for m in modules[:-1]:
            mod = getattr(mod, m)

        if layer_idx is None:
            # deleting a plain attribute: `del getattr(...)` is a syntax error,
            # so use delattr on the parent module instead
            delattr(mod, modules[-1])
        else:
            del getattr(mod, modules[-1])[layer_idx]
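

# Illustrative usage (the module paths below are assumptions for a HF-style
# decoder; any dotted path into the model works):
#
#   from collections import OrderedDict
#   to_remove = OrderedDict([
#       ("model.layers", 5),   # delete block 5 from the ModuleList
#       ("model.norm", None),  # None index: delete the attribute itself
#   ])
#   layer_removal(model, to_remove)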
short_gpt/metrics.py
ADDED
@@ -0,0 +1,26 @@
import torch


def block_influence(
    input_hidden_state: torch.Tensor,
    output_hidden_state: torch.Tensor,
    angular=False,
):
    """
    input_hidden_state: B, S, D
    output_hidden_state: B, S, D
    """
    _, _, d = input_hidden_state.shape
    input_hidden_state = input_hidden_state.reshape(-1, d)
    output_hidden_state = output_hidden_state.reshape(-1, d)

    norm_input = input_hidden_state.norm(dim=-1, keepdim=True)
    norm_output = output_hidden_state.norm(dim=-1, keepdim=True)

    # entry (i, i) of this matrix is the cosine similarity between token i's
    # input and output hidden states; only the diagonal is used below
    sim = (input_hidden_state @ output_hidden_state.T) / (norm_input * norm_output)
    sim = sim.diagonal().nan_to_num(nan=0.5)

    if angular:
        return (torch.arccos(sim) / torch.pi)

    return 1 - sim
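

# A quick illustrative sanity check (an editorial sketch, with assumed toy
# shapes): an identity block has zero influence, and a block that flips the
# sign of every hidden state has the maximum cosine-distance influence of 2.
def _demo_block_influence():
    h = torch.randn(2, 4, 8)  # B=2, S=4, D=8 -> 8 token states
    assert torch.allclose(block_influence(h, h), torch.zeros(8), atol=1e-5)
    assert torch.allclose(block_influence(h, -h), 2 * torch.ones(8), atol=1e-5)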
short_gpt/short_hf.ipynb
ADDED
@@ -0,0 +1,1679 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: datasets in /usr/local/lib/python3.10/dist-packages (2.19.1)\n",
"Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (2.1.1)\n",
"Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.40.2)\n",
"Requirement already satisfied: peft in /usr/local/lib/python3.10/dist-packages (0.10.0)\n",
"Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from datasets) (3.13.1)\n",
"Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from datasets) (1.26.2)\n",
"Requirement already satisfied: pyarrow>=12.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (16.0.0)\n",
"Requirement already satisfied: pyarrow-hotfix in /usr/local/lib/python3.10/dist-packages (from datasets) (0.6)\n",
"Requirement already satisfied: dill<0.3.9,>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.3.8)\n",
"Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (2.2.2)\n",
"Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (2.31.0)\n",
"Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (4.66.2)\n",
"Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets) (3.4.1)\n",
"Requirement already satisfied: multiprocess in /usr/local/lib/python3.10/dist-packages (from datasets) (0.70.16)\n",
"Requirement already satisfied: fsspec<=2024.3.1,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from fsspec[http]<=2024.3.1,>=2023.1.0->datasets) (2023.10.0)\n",
"Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.9.0b0)\n",
"Requirement already satisfied: huggingface-hub>=0.21.2 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.23.0)\n",
"Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from datasets) (23.2)\n",
"Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (6.0.1)\n",
"Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch) (4.8.0)\n",
"Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch) (1.12)\n",
"Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch) (3.2.1)\n",
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch) (3.1.2)\n",
"Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch) (12.1.105)\n",
"Requirement already satisfied: nvidia-cuda-runtime-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch) (12.1.105)\n",
"Requirement already satisfied: nvidia-cuda-cupti-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch) (12.1.105)\n",
"Requirement already satisfied: nvidia-cudnn-cu12==8.9.2.26 in /usr/local/lib/python3.10/dist-packages (from torch) (8.9.2.26)\n",
"Requirement already satisfied: nvidia-cublas-cu12==12.1.3.1 in /usr/local/lib/python3.10/dist-packages (from torch) (12.1.3.1)\n",
"Requirement already satisfied: nvidia-cufft-cu12==11.0.2.54 in /usr/local/lib/python3.10/dist-packages (from torch) (11.0.2.54)\n",
"Requirement already satisfied: nvidia-curand-cu12==10.3.2.106 in /usr/local/lib/python3.10/dist-packages (from torch) (10.3.2.106)\n",
"Requirement already satisfied: nvidia-cusolver-cu12==11.4.5.107 in /usr/local/lib/python3.10/dist-packages (from torch) (11.4.5.107)\n",
"Requirement already satisfied: nvidia-cusparse-cu12==12.1.0.106 in /usr/local/lib/python3.10/dist-packages (from torch) (12.1.0.106)\n",
"Requirement already satisfied: nvidia-nccl-cu12==2.18.1 in /usr/local/lib/python3.10/dist-packages (from torch) (2.18.1)\n",
"Requirement already satisfied: nvidia-nvtx-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch) (12.1.105)\n",
"Requirement already satisfied: triton==2.1.0 in /usr/local/lib/python3.10/dist-packages (from torch) (2.1.0)\n",
"Requirement already satisfied: nvidia-nvjitlink-cu12 in /usr/local/lib/python3.10/dist-packages (from nvidia-cusolver-cu12==11.4.5.107->torch) (12.3.101)\n",
"Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2024.4.28)\n",
"Requirement already satisfied: tokenizers<0.20,>=0.19 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.19.1)\n",
"Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.4.3)\n",
"Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from peft) (5.9.6)\n",
"Requirement already satisfied: accelerate>=0.21.0 in /usr/local/lib/python3.10/dist-packages (from peft) (0.30.0)\n",
"Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (23.1.0)\n",
"Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.0.5)\n",
"Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.9.4)\n",
"Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.4.1)\n",
"Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)\n",
"Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (4.0.3)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (3.3.2)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (3.6)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (2.1.0)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (2023.11.17)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch) (2.1.3)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2.8.2)\n",
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.1)\n",
"Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.1)\n",
"Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch) (1.3.0)\n",
"Requirement already satisfied: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)\n",
"\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
"\u001b[0mNote: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"pip install datasets torch transformers peft"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"from tqdm.notebook import tqdm\n",
"\n",
"from datasets import load_dataset\n",
"import torch\n",
"from torch.utils.data import DataLoader\n",
"\n",
"from peft import (\n",
"    get_peft_model,\n",
"    LoraConfig,\n",
"    TaskType,\n",
")\n",
"from transformers import default_data_collator, Trainer, TrainingArguments\n",
"\n",
"from short_hf import ShortHFModel"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Load Data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# data = load_dataset(\"pg19\", split=\"validation\")  # authors sample 10,000 texts to compute block influences\n",
"# dataloader = DataLoader(\n",
"#     data,\n",
"#     batch_size=2,\n",
"#     shuffle=True,\n",
"# )"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"data = load_dataset(\"wikitext\", \"wikitext-103-raw-v1\", split=\"validation\")  # authors sample 10,000 texts to compute block influences\n",
"dataloader = DataLoader(\n",
"    data,\n",
"    batch_size=1,\n",
"    shuffle=True,\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Load Model"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# !huggingface-cli login\n",
"# pip install huggingface_hub\n",
"!python3 -c \"from huggingface_hub.hf_api import HfFolder; HfFolder.save_token('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')\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#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"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"asifahmed\n"
]
}
],
"source": [
"!huggingface-cli whoami"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# pip install git+https://github.com/tri-ml/linear_open_lm.git\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.10/dist-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
"  warnings.warn(\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "9fcf366ecc414808b39285438599f5b9",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# from open_lm.open_lm_hf import *\n",
"\n",
"MAX_SEQ_LEN = 2048\n",
"short_model = ShortHFModel(\n",
"    # model_name=\"tiiuae/falcon-7b\",\n",
"    model_name=\"mistralai/Mistral-7B-v0.1\",\n",
"    layers_path=\"model.layers\",\n",
"    n_prune_layers=2\n",
")\n",
"# short_model.model"
]
},
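{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of the intended importance pass (an assumption: `ShortHFModel` is taken to mirror the `ShortLlama` API in `short_llama.py`, and the argument names below are illustrative):\n",
"\n",
"```python\n",
"for batch in tqdm(dataloader):\n",
"    short_model.eval_importance(prompts=batch[\"text\"], max_seq_len=MAX_SEQ_LEN)\n",
"print(short_model.remove_layers())  # drops the n_prune_layers least important blocks\n",
"```"
]
},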
{
|
230 |
+
"cell_type": "code",
|
231 |
+
"execution_count": 7,
|
232 |
+
"metadata": {},
|
233 |
+
"outputs": [
|
234 |
+
{
|
235 |
+
"data": {
|
236 |
+
"text/plain": [
|
237 |
+
"MistralForCausalLM(\n",
|
238 |
+
" (model): MistralModel(\n",
|
239 |
+
" (embed_tokens): Embedding(32000, 4096)\n",
|
240 |
+
" (layers): ModuleList(\n",
|
241 |
+
" (0-31): 32 x MistralDecoderLayer(\n",
|
242 |
+
" (self_attn): MistralSdpaAttention(\n",
|
243 |
+
" (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n",
|
244 |
+
" (k_proj): Linear(in_features=4096, out_features=1024, bias=False)\n",
|
245 |
+
" (v_proj): Linear(in_features=4096, out_features=1024, bias=False)\n",
|
246 |
+
" (o_proj): Linear(in_features=4096, out_features=4096, bias=False)\n",
|
247 |
+
" (rotary_emb): MistralRotaryEmbedding()\n",
|
248 |
+
" )\n",
|
249 |
+
" (mlp): MistralMLP(\n",
|
250 |
+
" (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n",
|
251 |
+
" (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n",
|
252 |
+
" (down_proj): Linear(in_features=14336, out_features=4096, bias=False)\n",
|
253 |
+
" (act_fn): SiLU()\n",
|
254 |
+
" )\n",
|
255 |
+
" (input_layernorm): MistralRMSNorm()\n",
|
256 |
+
" (post_attention_layernorm): MistralRMSNorm()\n",
|
257 |
+
" )\n",
|
258 |
+
" )\n",
|
259 |
+
" (norm): MistralRMSNorm()\n",
|
260 |
+
" )\n",
|
261 |
+
" (lm_head): Linear(in_features=4096, out_features=32000, bias=False)\n",
|
262 |
+
")"
|
263 |
+
]
|
264 |
+
},
|
265 |
+
"execution_count": 7,
|
266 |
+
"metadata": {},
|
267 |
+
"output_type": "execute_result"
|
268 |
+
}
|
269 |
+
],
|
270 |
+
"source": [
|
271 |
+
"short_model.model"
|
272 |
+
]
|
273 |
+
},
|
274 |
+
{
|
275 |
+
"cell_type": "code",
|
276 |
+
"execution_count": null,
|
277 |
+
"metadata": {},
|
278 |
+
"outputs": [],
|
279 |
+
"source": [
|
280 |
+
"# AutoModelForCausalLM.from_pretrained(\n",
|
281 |
+
"# pretrained_model_name_or_path=model_dir,\n",
|
282 |
+
"# local_files_only=True,\n",
|
283 |
+
"# use_safetensors=True,\n",
|
284 |
+
"# torch_dtype=torch.bfloat16,\n",
|
285 |
+
"# )"
|
286 |
+
]
|
287 |
+
},
|
288 |
+
{
|
289 |
+
"cell_type": "code",
|
290 |
+
"execution_count": 8,
|
291 |
+
"metadata": {},
|
292 |
+
"outputs": [
|
293 |
+
{
|
294 |
+
"data": {
|
295 |
+
"text/plain": [
|
296 |
+
"<generator object Module.parameters at 0x7f00b3917840>"
|
297 |
+
]
|
298 |
+
},
|
299 |
+
"execution_count": 8,
|
300 |
+
"metadata": {},
|
301 |
+
"output_type": "execute_result"
|
302 |
+
}
|
303 |
+
],
|
304 |
+
"source": [
|
305 |
+
"short_model.model.parameters()"
|
306 |
+
]
|
307 |
+
},
|
308 |
+
{
|
309 |
+
"cell_type": "code",
|
310 |
+
"execution_count": 9,
|
311 |
+
"metadata": {},
|
312 |
+
"outputs": [
|
313 |
+
{
|
314 |
+
"data": {
|
315 |
+
"text/plain": [
|
316 |
+
"7241732096"
|
317 |
+
]
|
318 |
+
},
|
319 |
+
"execution_count": 9,
|
320 |
+
"metadata": {},
|
321 |
+
"output_type": "execute_result"
|
322 |
+
}
|
323 |
+
],
|
324 |
+
"source": [
|
325 |
+
"pytorch_total_params = sum(p.numel() for p in short_model.model.parameters())\n",
|
326 |
+
"pytorch_total_params"
|
327 |
+
]
|
328 |
+
},
|
329 |
+
{
|
330 |
+
"cell_type": "code",
|
331 |
+
"execution_count": 36,
|
332 |
+
"metadata": {},
|
333 |
+
"outputs": [],
|
334 |
+
"source": [
|
335 |
+
" # Save the model state to the specified path.\n",
|
336 |
+
"# model_dir='ShortModelSaved/'\n",
|
337 |
+
"# short_model.model.save_pretrained(\n",
|
338 |
+
"# save_directory=model_dir,\n",
|
339 |
+
"# safe_serialization=True,\n",
|
340 |
+
"# )"
|
341 |
+
]
|
342 |
+
},
|
343 |
+
{
|
344 |
+
"cell_type": "code",
|
345 |
+
"execution_count": 10,
|
346 |
+
"metadata": {},
|
347 |
+
"outputs": [
|
348 |
+
{
|
349 |
+
"data": {
|
350 |
+
"text/plain": [
|
351 |
+
"MistralDecoderLayer(\n",
|
352 |
+
" (self_attn): MistralSdpaAttention(\n",
|
353 |
+
" (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n",
|
354 |
+
" (k_proj): Linear(in_features=4096, out_features=1024, bias=False)\n",
|
355 |
+
" (v_proj): Linear(in_features=4096, out_features=1024, bias=False)\n",
|
356 |
+
" (o_proj): Linear(in_features=4096, out_features=4096, bias=False)\n",
|
357 |
+
" (rotary_emb): MistralRotaryEmbedding()\n",
|
358 |
+
" )\n",
|
359 |
+
" (mlp): MistralMLP(\n",
|
360 |
+
" (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n",
|
361 |
+
" (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n",
|
362 |
+
" (down_proj): Linear(in_features=14336, out_features=4096, bias=False)\n",
|
363 |
+
" (act_fn): SiLU()\n",
|
364 |
+
" )\n",
|
365 |
+
" (input_layernorm): MistralRMSNorm()\n",
|
366 |
+
" (post_attention_layernorm): MistralRMSNorm()\n",
|
367 |
+
")"
|
368 |
+
]
|
369 |
+
},
|
370 |
+
"execution_count": 10,
|
371 |
+
"metadata": {},
|
372 |
+
"output_type": "execute_result"
|
373 |
+
}
|
374 |
+
],
|
375 |
+
"source": [
|
376 |
+
"short_model.layers[0]"
|
377 |
+
]
|
378 |
+
},
|
379 |
+
{
|
380 |
+
"cell_type": "code",
|
381 |
+
"execution_count": 12,
|
382 |
+
"metadata": {},
|
383 |
+
"outputs": [
|
384 |
+
{
|
385 |
+
"name": "stderr",
|
386 |
+
"output_type": "stream",
|
387 |
+
"text": [
|
388 |
+
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
|
389 |
+
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n"
|
390 |
+
]
|
391 |
+
},
|
392 |
+
{
|
393 |
+
"data": {
|
394 |
+
"text/plain": [
|
395 |
+
"['I am an avid fan of 3D printing. I have been using 3D printers for over 10 years and have been involved in the development of several 3D printers. I have also been involved in the development of several 3D printing software packages.\\n\\nI have been involved in the development of several 3D printing software packages. I have also been involved in the development of several 3D printing software packages. I have also been involved in the development of several 3D printing software packages. I have also been involved in the development of several 3D printing software packages. I have also been involved in the development of several 3D printing software packages. I have also been involved in the development of several 3D printing software packages. I have also been involved in the development of several 3D printing software packages. I have also been involved in the development of several 3D printing software packages. I have also been involved in the development of several 3D printing software packages. I have also been involved in the development of several 3D printing software packages. I have also been involved in the development of several 3D printing software packages. I have also been involved in the development of several 3D printing software packages.']"
|
396 |
+
]
|
397 |
+
},
|
398 |
+
"execution_count": 12,
|
399 |
+
"metadata": {},
|
400 |
+
"output_type": "execute_result"
|
401 |
+
}
|
402 |
+
],
|
403 |
+
"source": [
|
404 |
+
"# sample generationThe evolution of AI has lead to \n",
|
405 |
+
"gen = short_model.model.generate(\n",
|
406 |
+
" short_model.tokenizer([\"I am an avid fan of \"], return_tensors='pt').input_ids.to(\"cuda\"),\n",
|
407 |
+
" max_new_tokens=256\n",
|
408 |
+
")\n",
|
409 |
+
"short_model.tokenizer.batch_decode(gen, skip_special_tokens=True)"
|
410 |
+
]
|
411 |
+
},
|
412 |
+
{
|
413 |
+
"cell_type": "code",
|
414 |
+
"execution_count": 2,
|
415 |
+
"metadata": {},
|
416 |
+
"outputs": [],
|
417 |
+
"source": [
|
418 |
+
"# # sample generation\n",
|
419 |
+
"# gen = short_model.model.generate(\n",
|
420 |
+
"# short_model.tokenizer([\"The evolution of AI has lead to \"], return_tensors='pt').input_ids.to(\"cuda\"),\n",
|
421 |
+
"# max_new_tokens=256\n",
|
422 |
+
"# )\n",
|
423 |
+
"# short_model.tokenizer.batch_decode(gen, skip_special_tokens=True)"
|
424 |
+
]
|
425 |
+
},
|
426 |
+
{
|
427 |
+
"cell_type": "markdown",
|
428 |
+
"metadata": {},
|
429 |
+
"source": [
|
430 |
+
"### Compute Importances"
|
431 |
+
]
|
432 |
+
},
|
433 |
+
{
|
434 |
+
"cell_type": "code",
|
435 |
+
"execution_count": 50,
|
436 |
+
"metadata": {},
|
437 |
+
"outputs": [],
|
438 |
+
"source": [
|
439 |
+
"# for i, batch in enumerate(tqdm(dataloader)):\n",
|
440 |
+
"# prompts = batch['text']\n",
|
441 |
+
"\n",
|
442 |
+
"# short_model.eval_importance(\n",
|
443 |
+
"# prompts=prompts,\n",
|
444 |
+
"# max_seq_len=MAX_SEQ_LEN,\n",
|
445 |
+
"# stride=256,\n",
|
446 |
+
"# max_gen_len=0\n",
|
447 |
+
"# )"
|
448 |
+
]
|
449 |
+
},
|
450 |
+
{
|
451 |
+
"cell_type": "code",
|
452 |
+
"execution_count": 51,
|
453 |
+
"metadata": {},
|
454 |
+
"outputs": [],
|
455 |
+
"source": [
|
456 |
+
"# short_model.importances"
|
457 |
+
]
|
458 |
+
},
|
459 |
+
{
|
460 |
+
"cell_type": "markdown",
|
461 |
+
"metadata": {},
|
462 |
+
"source": [
|
463 |
+
"### Remove unimportant layers\n",
|
464 |
+
"\n",
|
465 |
+
"Layers removed when using subset of pg19 val set: [25, 26, 24, 27, 22, 23, 28, 21, 29]\n",
|
466 |
+
"\n",
|
467 |
+
"Authors mention that the layer order is quite nuanced and can vary with different datasets. However, relative order suggests similar importance."
|
468 |
+
]
|
469 |
+
},
|
470 |
+
{
|
471 |
+
"cell_type": "code",
|
472 |
+
"execution_count": 55,
|
473 |
+
"metadata": {},
|
474 |
+
"outputs": [],
|
475 |
+
"source": [
|
476 |
+
"# short_model.remove_layers()"
|
477 |
+
]
|
478 |
+
},
|
479 |
+
{
|
480 |
+
"cell_type": "code",
|
481 |
+
"execution_count": 54,
|
482 |
+
"metadata": {},
|
483 |
+
"outputs": [],
|
484 |
+
"source": [
|
485 |
+
"# short_model.remove_layers()"
|
486 |
+
]
|
487 |
+
},
|
488 |
+
{
|
489 |
+
"cell_type": "code",
|
490 |
+
"execution_count": 56,
|
491 |
+
"metadata": {},
|
492 |
+
"outputs": [],
|
493 |
+
"source": [
|
494 |
+
"# short_model.layers"
|
495 |
+
]
|
496 |
+
},
|
497 |
+
{
|
498 |
+
"cell_type": "code",
|
499 |
+
"execution_count": 48,
|
500 |
+
"metadata": {},
|
501 |
+
"outputs": [],
|
502 |
+
"source": [
|
503 |
+
"# # reassign layer_idx to attentions for caching\n",
|
504 |
+
"# for layer_idx, module in enumerate(short_model.layers):\n",
|
505 |
+
"# module.self_attn.layer_idx = layer_idx"
|
506 |
+
]
|
507 |
+
},
|
508 |
+
{
|
509 |
+
"cell_type": "code",
|
510 |
+
"execution_count": 20,
|
511 |
+
"metadata": {},
|
512 |
+
"outputs": [
|
513 |
+
{
|
514 |
+
"data": {
|
515 |
+
"text/plain": [
|
516 |
+
"<generator object Module.parameters at 0x7f625768a2d0>"
|
517 |
+
]
|
518 |
+
},
|
519 |
+
"execution_count": 20,
|
520 |
+
"metadata": {},
|
521 |
+
"output_type": "execute_result"
|
522 |
+
}
|
523 |
+
],
|
524 |
+
"source": [
|
525 |
+
"# short_model.model.parameters()"
|
526 |
+
]
|
527 |
+
},
|
528 |
+
{
|
529 |
+
"cell_type": "code",
|
530 |
+
"execution_count": 68,
|
531 |
+
"metadata": {},
|
532 |
+
"outputs": [
|
533 |
+
{
|
534 |
+
"data": {
|
535 |
+
"text/plain": [
|
536 |
+
"7241732096"
|
537 |
+
]
|
538 |
+
},
|
539 |
+
"execution_count": 68,
|
540 |
+
"metadata": {},
|
541 |
+
"output_type": "execute_result"
|
542 |
+
}
|
543 |
+
],
|
544 |
+
"source": [
|
545 |
+
"# pytorch_total_params = sum(p.numel() for p in short_model.model.parameters())\n",
|
546 |
+
"# pytorch_total_params"
|
547 |
+
]
|
548 |
+
},
|
549 |
+
{
|
550 |
+
"cell_type": "markdown",
|
551 |
+
"metadata": {},
|
552 |
+
"source": [
|
553 |
+
"As the paper states: \\\n",
|
554 |
+
" - \"Our experiments reveal that the effect of layer removal is significantly more pronounced on generative\n",
|
555 |
+
" tasks compared to multiple-choice tasks. On benchmarks such as GSM8K (Cobbe et al., 2021) and\n",
|
556 |
+
" HumanEval (Chen et al., 2021), removing 25% of the layers often leads to a severe performance\n",
|
557 |
+
" drop, with scores approaching zero.\""
|
558 |
+
]
|
559 |
+
},
|
560 |
+
{
|
561 |
+
"cell_type": "code",
|
562 |
+
"execution_count": 53,
|
563 |
+
"metadata": {},
|
564 |
+
"outputs": [],
|
565 |
+
"source": [
|
566 |
+
"# gen = short_model.model.generate(\n",
|
567 |
+
"# short_model.tokenizer([\"I am an avid fan of \"], return_tensors='pt').input_ids.to(\"cuda\"),\n",
|
568 |
+
"# max_new_tokens=20,\n",
|
569 |
+
"# use_cache=True\n",
|
570 |
+
"# )\n",
|
571 |
+
"# short_model.tokenizer.batch_decode(gen, skip_special_tokens=True)"
|
572 |
+
]
|
573 |
+
},
|
574 |
+
{
|
575 |
+
"cell_type": "code",
|
576 |
+
"execution_count": 52,
|
577 |
+
"metadata": {},
|
578 |
+
"outputs": [],
|
579 |
+
"source": [
|
580 |
+
"# gen = short_model.model.generate(I am an avid fan of \n",
|
581 |
+
"# short_model.tokenizer([\"The evolution of AI has lead to \"], return_tensors='pt').input_ids.to(\"cuda\"),\n",
|
582 |
+
"# max_new_tokens=20,\n",
|
583 |
+
"# use_cache=True\n",
|
584 |
+
"# )\n",
|
585 |
+
"# short_model.tokenizer.batch_decode(gen, skip_special_tokens=True)"
|
586 |
+
]
|
587 |
+
},
|
588 |
+
{
|
589 |
+
"cell_type": "markdown",
|
590 |
+
"metadata": {},
|
591 |
+
"source": [
|
592 |
+
"### Compute Angular Importances"
|
593 |
+
]
|
594 |
+
},
|
595 |
+
{
|
596 |
+
"cell_type": "code",
|
597 |
+
"execution_count": 16,
|
598 |
+
"metadata": {},
|
599 |
+
"outputs": [
|
600 |
+
{
|
601 |
+
"data": {
|
602 |
+
"application/vnd.jupyter.widget-view+json": {
|
603 |
+
"model_id": "a6fd2bf4360b4aba801085bab0755a06",
|
604 |
+
"version_major": 2,
|
605 |
+
"version_minor": 0
|
606 |
+
},
|
607 |
+
"text/plain": [
|
608 |
+
" 0%| | 0/3760 [00:00<?, ?it/s]"
|
609 |
+
]
|
610 |
+
},
|
611 |
+
"metadata": {},
|
612 |
+
"output_type": "display_data"
|
613 |
+
}
|
614 |
+
],
|
615 |
+
"source": [
|
616 |
+
"for i, batch in enumerate(tqdm(dataloader)):\n",
|
617 |
+
" prompts = batch['text']\n",
|
618 |
+
"\n",
|
619 |
+
" short_model.eval_importance(\n",
|
620 |
+
" prompts=prompts,\n",
|
621 |
+
" max_seq_len=MAX_SEQ_LEN,\n",
|
622 |
+
" stride=256,\n",
|
623 |
+
" max_gen_len=0,\n",
|
624 |
+
" angular=True\n",
|
625 |
+
" )"
|
626 |
+
]
|
627 |
+
},
|
628 |
+
{
|
629 |
+
"cell_type": "code",
|
630 |
+
"execution_count": 17,
|
631 |
+
"metadata": {},
|
632 |
+
"outputs": [
|
633 |
+
{
|
634 |
+
"data": {
|
635 |
+
"text/plain": [
|
636 |
+
"[128390.1328125,\n",
|
637 |
+
" 80922.06787109375,\n",
|
638 |
+
" 61075.2890625,\n",
|
639 |
+
" nan,\n",
|
640 |
+
" nan,\n",
|
641 |
+
" 56557.81268310547,\n",
|
642 |
+
" nan,\n",
|
643 |
+
" 52294.552001953125,\n",
|
644 |
+
" 47928.185302734375,\n",
|
645 |
+
" 42335.215576171875,\n",
|
646 |
+
" 40547.564208984375,\n",
|
647 |
+
" 37178.684326171875,\n",
|
648 |
+
" 34713.912841796875,\n",
|
649 |
+
" 33843.728271484375,\n",
|
650 |
+
" 35384.353271484375,\n",
|
651 |
+
" 35603.388427734375,\n",
|
652 |
+
" 35621.970458984375,\n",
|
653 |
+
" 35356.719482421875,\n",
|
654 |
+
" 35365.243896484375,\n",
|
655 |
+
" 34914.025146484375,\n",
|
656 |
+
" 27854.576904296875,\n",
|
657 |
+
" 24398.073974609375,\n",
|
658 |
+
" 20450.390380859375,\n",
|
659 |
+
" 19501.300537109375,\n",
|
660 |
+
" 18430.427490234375,\n",
|
661 |
+
" 18231.873779296875,\n",
|
662 |
+
" 17917.493896484375,\n",
|
663 |
+
" 17806.815185546875,\n",
|
664 |
+
" 21227.195068359375,\n",
|
665 |
+
" 23928.313018798828,\n",
|
666 |
+
" 22738.702880859375,\n",
|
667 |
+
" 86123.783203125]"
|
668 |
+
]
|
669 |
+
},
|
670 |
+
"execution_count": 17,
|
671 |
+
"metadata": {},
|
672 |
+
"output_type": "execute_result"
|
673 |
+
}
|
674 |
+
],
|
675 |
+
"source": [
|
676 |
+
"short_model.importances"
|
677 |
+
]
|
678 |
+
},
|
679 |
+
{
|
680 |
+
"cell_type": "markdown",
|
681 |
+
"metadata": {},
|
682 |
+
"source": [
|
683 |
+
"### Remove unimportant layers"
|
684 |
+
]
|
685 |
+
},
|
686 |
+
{
|
687 |
+
"cell_type": "code",
|
688 |
+
"execution_count": 18,
|
689 |
+
"metadata": {},
|
690 |
+
"outputs": [
|
691 |
+
{
|
692 |
+
"data": {
|
693 |
+
"text/plain": [
|
694 |
+
"[27, 28]"
|
695 |
+
]
|
696 |
+
},
|
697 |
+
"execution_count": 18,
|
698 |
+
"metadata": {},
|
699 |
+
"output_type": "execute_result"
|
700 |
+
}
|
701 |
+
],
|
702 |
+
"source": [
|
703 |
+
"short_model.remove_layers(angular=True)"
|
704 |
+
]
|
705 |
+
},
|
706 |
+
{
|
707 |
+
"cell_type": "code",
|
708 |
+
"execution_count": 20,
|
709 |
+
"metadata": {},
|
710 |
+
"outputs": [
|
711 |
+
{
|
712 |
+
"data": {
|
713 |
+
"text/plain": [
|
714 |
+
"MistralDecoderLayer(\n",
|
715 |
+
" (self_attn): MistralSdpaAttention(\n",
|
716 |
+
" (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n",
|
717 |
+
" (k_proj): Linear(in_features=4096, out_features=1024, bias=False)\n",
|
718 |
+
" (v_proj): Linear(in_features=4096, out_features=1024, bias=False)\n",
|
719 |
+
" (o_proj): Linear(in_features=4096, out_features=4096, bias=False)\n",
|
720 |
+
" (rotary_emb): MistralRotaryEmbedding()\n",
|
721 |
+
" )\n",
|
722 |
+
" (mlp): MistralMLP(\n",
|
723 |
+
" (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n",
|
724 |
+
" (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n",
|
725 |
+
" (down_proj): Linear(in_features=14336, out_features=4096, bias=False)\n",
|
726 |
+
" (act_fn): SiLU()\n",
|
727 |
+
" )\n",
|
728 |
+
" (input_layernorm): MistralRMSNorm()\n",
|
729 |
+
" (post_attention_layernorm): MistralRMSNorm()\n",
|
730 |
+
")"
|
731 |
+
]
|
732 |
+
},
|
733 |
+
"execution_count": 20,
|
734 |
+
"metadata": {},
|
735 |
+
"output_type": "execute_result"
|
736 |
+
}
|
737 |
+
],
|
738 |
+
"source": [
|
739 |
+
"short_model.layers[0]"
|
740 |
+
]
|
741 |
+
},
|
742 |
+
{
|
743 |
+
"cell_type": "code",
|
744 |
+
"execution_count": 21,
|
745 |
+
"metadata": {},
|
746 |
+
"outputs": [
|
747 |
+
{
|
748 |
+
"data": {
|
749 |
+
"text/plain": [
|
750 |
+
"ModuleList(\n",
|
751 |
+
" (0-29): 30 x MistralDecoderLayer(\n",
|
752 |
+
" (self_attn): MistralSdpaAttention(\n",
|
753 |
+
" (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n",
|
754 |
+
" (k_proj): Linear(in_features=4096, out_features=1024, bias=False)\n",
|
755 |
+
" (v_proj): Linear(in_features=4096, out_features=1024, bias=False)\n",
|
756 |
+
" (o_proj): Linear(in_features=4096, out_features=4096, bias=False)\n",
|
757 |
+
" (rotary_emb): MistralRotaryEmbedding()\n",
|
758 |
+
" )\n",
|
759 |
+
" (mlp): MistralMLP(\n",
|
760 |
+
" (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n",
|
761 |
+
" (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n",
|
762 |
+
" (down_proj): Linear(in_features=14336, out_features=4096, bias=False)\n",
|
763 |
+
" (act_fn): SiLU()\n",
|
764 |
+
" )\n",
|
765 |
+
" (input_layernorm): MistralRMSNorm()\n",
|
766 |
+
" (post_attention_layernorm): MistralRMSNorm()\n",
|
767 |
+
" )\n",
|
768 |
+
")"
|
769 |
+
]
|
770 |
+
},
|
771 |
+
"execution_count": 21,
|
772 |
+
"metadata": {},
|
773 |
+
"output_type": "execute_result"
|
774 |
+
}
|
775 |
+
],
|
776 |
+
"source": [
|
777 |
+
"short_model.layers"
|
778 |
+
]
|
779 |
+
},
|
780 |
+
{
|
781 |
+
"cell_type": "code",
|
782 |
+
"execution_count": 22,
|
783 |
+
"metadata": {},
|
784 |
+
"outputs": [],
|
785 |
+
"source": [
|
786 |
+
"# reassign layer_idx to attentions for caching\n",
|
787 |
+
"for layer_idx, module in enumerate(short_model.layers):\n",
|
788 |
+
" module.self_attn.layer_idx = layer_idx"
|
789 |
+
]
|
790 |
+
},
|
791 |
+
{
|
792 |
+
"cell_type": "code",
|
793 |
+
"execution_count": 23,
|
794 |
+
"metadata": {},
|
795 |
+
"outputs": [
|
796 |
+
{
|
797 |
+
"data": {
|
798 |
+
"text/plain": [
|
799 |
+
"ModuleList(\n",
|
800 |
+
" (0-29): 30 x MistralDecoderLayer(\n",
|
801 |
+
" (self_attn): MistralSdpaAttention(\n",
|
802 |
+
" (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n",
|
803 |
+
" (k_proj): Linear(in_features=4096, out_features=1024, bias=False)\n",
|
804 |
+
" (v_proj): Linear(in_features=4096, out_features=1024, bias=False)\n",
|
805 |
+
" (o_proj): Linear(in_features=4096, out_features=4096, bias=False)\n",
|
806 |
+
" (rotary_emb): MistralRotaryEmbedding()\n",
|
807 |
+
" )\n",
|
808 |
+
" (mlp): MistralMLP(\n",
|
809 |
+
" (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n",
|
810 |
+
" (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n",
|
811 |
+
" (down_proj): Linear(in_features=14336, out_features=4096, bias=False)\n",
|
812 |
+
" (act_fn): SiLU()\n",
|
813 |
+
" )\n",
|
814 |
+
" (input_layernorm): MistralRMSNorm()\n",
|
815 |
+
" (post_attention_layernorm): MistralRMSNorm()\n",
|
816 |
+
" )\n",
|
817 |
+
")"
|
818 |
+
]
|
819 |
+
},
|
820 |
+
"execution_count": 23,
|
821 |
+
"metadata": {},
|
822 |
+
"output_type": "execute_result"
|
823 |
+
}
|
824 |
+
],
|
825 |
+
"source": [
|
826 |
+
"short_model.layers"
|
827 |
+
]
|
828 |
+
},
|
829 |
+
{
|
830 |
+
"cell_type": "code",
|
831 |
+
"execution_count": 24,
|
832 |
+
"metadata": {},
|
833 |
+
"outputs": [
|
834 |
+
{
|
835 |
+
"name": "stderr",
|
836 |
+
"output_type": "stream",
|
837 |
+
"text": [
|
838 |
+
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
|
839 |
+
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n"
|
840 |
+
]
|
841 |
+
},
|
842 |
+
{
|
843 |
+
"data": {
|
844 |
+
"text/plain": [
|
845 |
+
"['I am an avid fan of 19th century American literature. I have read all of the classics, and I have also read many of the lesser known works. I have a particular interest in the works of Charles Dickens, and I have read all of his novels. I have also read many of the novels of other 19th century authors, such as Jane Austen, William Shakespeare, and William Blake.\\n\\nI have a particular interest in the works of Charles Dickens, and I have read all of his novels. I have also read many of the novels of other 19th century authors, such as Jane Austen, William Shakespeare, and William Blake.\\n\\nI have a particular interest in the works of Charles Dickens, and I have read all of his novels. I have also read many of the novels of other 19th century authors, such as Jane Austen, William Shakespeare, and William Blake.\\n\\nI have a particular interest in the works of Charles Dickens, and I have read all of his novels. I have also read many of the novels of other 19th century authors, such as Jane Austen, William Shakespeare, and William Blake.\\n\\nI have a particular interest in the works of Charles Dickens']"
|
846 |
+
]
|
847 |
+
},
|
848 |
+
"execution_count": 24,
|
849 |
+
"metadata": {},
|
850 |
+
"output_type": "execute_result"
|
851 |
+
}
|
852 |
+
],
|
853 |
+
"source": [
|
854 |
+
"gen = short_model.model.generate(\n",
|
855 |
+
" short_model.tokenizer([\"I am an avid fan of \"], return_tensors='pt').input_ids.to(\"cuda\"),\n",
|
856 |
+
" max_new_tokens=256,\n",
|
857 |
+
" use_cache=True\n",
|
858 |
+
")\n",
|
859 |
+
"short_model.tokenizer.batch_decode(gen, skip_special_tokens=True)"
|
860 |
+
]
|
861 |
+
},
|
862 |
+
{
|
863 |
+
"cell_type": "code",
|
864 |
+
"execution_count": 27,
|
865 |
+
"metadata": {},
|
866 |
+
"outputs": [
|
867 |
+
{
|
868 |
+
"name": "stderr",
|
869 |
+
"output_type": "stream",
|
870 |
+
"text": [
|
871 |
+
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
|
872 |
+
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n"
|
873 |
+
]
|
874 |
+
},
|
875 |
+
{
|
876 |
+
"data": {
|
877 |
+
"text/plain": [
|
878 |
+
"['The evolution of AI has lead to 3 major types of AI:\\n\\n1. Strong AI\\n2. Weak AI\\n3. Super AI\\n\\nStrong AI is the type of AI that is capable of performing any task that a human can perform. This type of AI is still in the development phase and is not yet available in the market.\\n\\nWeak AI is the type of AI that is capable of performing a specific task. This type of AI is available in the market and is used in a variety of applications.\\n\\nSuper AI is the type of AI that is capable of performing any task that a human can perform and is also capable of learning and adapting. This type of AI is still in the development phase and is not yet available in the market.\\n\\n## What is the difference between AI and AI?\\n\\nThe difference between AI and AI is that AI is a type of artificial intelligence that is capable of performing a specific task, while AI is a type of artificial intelligence that is capable of performing any task.\\n\\n## What is the difference between AI and AI?\\n\\nThe difference between AI and AI is that AI is a type of artificial intelligence that is capable of performing a specific task, while AI is a type of artificial intelligence that is capable']"
|
879 |
+
]
|
880 |
+
},
|
881 |
+
"execution_count": 27,
|
882 |
+
"metadata": {},
|
883 |
+
"output_type": "execute_result"
|
884 |
+
}
|
885 |
+
],
|
886 |
+
"source": [
|
887 |
+
"# gen = short_model.model.generate(I am an avid fan of \n",
|
888 |
+
"# short_model.tokenizer([\"The evolution of AI has lead to \"], return_tensors='pt').input_ids.to(\"cuda\"),\n",
|
889 |
+
"# max_new_tokens=256,\n",
|
890 |
+
"# use_cache=True\n",
|
891 |
+
"# )\n",
|
892 |
+
"# short_model.tokenizer.batch_decode(gen, skip_special_tokens=True)\n",
|
893 |
+
"\n",
|
894 |
+
"\n",
|
895 |
+
"gen = short_model.model.generate(\n",
|
896 |
+
" short_model.tokenizer([\"The evolution of AI has lead to \"], return_tensors='pt').input_ids.to(\"cuda\"),\n",
|
897 |
+
" max_new_tokens=256,\n",
|
898 |
+
" use_cache=True\n",
|
899 |
+
")\n",
|
900 |
+
"short_model.tokenizer.batch_decode(gen, skip_special_tokens=True)"
|
901 |
+
]
|
902 |
+
},
|
903 |
+
{
|
904 |
+
"cell_type": "code",
|
905 |
+
"execution_count": 28,
|
906 |
+
"metadata": {},
|
907 |
+
"outputs": [
|
908 |
+
{
|
909 |
+
"data": {
|
910 |
+
"text/plain": [
|
911 |
+
"6805508096"
|
912 |
+
]
|
913 |
+
},
|
914 |
+
"execution_count": 28,
|
915 |
+
"metadata": {},
|
916 |
+
"output_type": "execute_result"
|
917 |
+
}
|
918 |
+
],
|
919 |
+
"source": [
|
920 |
+
"pytorch_total_params = sum(p.numel() for p in short_model.model.parameters())\n",
|
921 |
+
"pytorch_total_params"
|
922 |
+
]
|
923 |
+
},
|
924 |
+
{
|
925 |
+
"cell_type": "code",
|
926 |
+
"execution_count": 35,
|
927 |
+
"metadata": {},
|
928 |
+
"outputs": [],
|
929 |
+
"source": [
|
930 |
+
" # Save the model state to the specified path.\n",
|
931 |
+
"model_dir='SmallModelSaved/'\n",
|
932 |
+
"short_model.model.save_pretrained(\n",
|
933 |
+
" save_directory=model_dir,\n",
|
934 |
+
" safe_serialization=True,\n",
|
935 |
+
" )"
|
936 |
+
]
|
937 |
+
},
|
938 |
+
{
|
939 |
+
"cell_type": "markdown",
|
940 |
+
"metadata": {},
|
941 |
+
"source": [
|
942 |
+
"### Model Healing"
|
943 |
+
]
|
944 |
+
},
|
945 |
+
{
|
946 |
+
"cell_type": "code",
|
947 |
+
"execution_count": 36,
|
948 |
+
"metadata": {},
|
949 |
+
"outputs": [],
|
950 |
+
"source": [
|
951 |
+
"# tokenizer = short_model.tokenizer\n",
|
952 |
+
"model = short_model.model"
|
953 |
+
]
|
954 |
+
},
|
955 |
+
{
|
956 |
+
"cell_type": "code",
|
957 |
+
"execution_count": 37,
|
958 |
+
"metadata": {},
|
959 |
+
"outputs": [
|
960 |
+
{
|
961 |
+
"name": "stdout",
|
962 |
+
"output_type": "stream",
|
963 |
+
"text": [
|
964 |
+
"Datset Loaded!\n"
|
965 |
+
]
|
966 |
+
}
|
967 |
+
],
|
968 |
+
"source": [
|
969 |
+
"from datasets import load_dataset\n",
|
970 |
+
"# Falcon = load_dataset(\"csv\", data_files=\"FalconData.csv\")\n",
|
971 |
+
"Falcon = load_dataset('csv', data_files={\"train\": 'FalconData2.csv', \"validation\": 'FalconDataEval2.csv'})\n",
|
972 |
+
"\n",
|
973 |
+
"print('Datset Loaded!')\n"
|
974 |
+
]
|
975 |
+
},
|
976 |
+
{
|
977 |
+
"cell_type": "code",
|
978 |
+
"execution_count": 38,
|
979 |
+
"metadata": {},
|
980 |
+
"outputs": [
|
981 |
+
{
|
982 |
+
"data": {
|
983 |
+
"text/plain": [
|
984 |
+
"{'Text': 'School Picture Gallery\\nFrance Ski School\\nChildren from Year 5 & 6 travelled to France from Newcastle airport to take part in a week of Ski School. The children had already spent 3 weeks learning the basics of skiing at Silksworth Ski School in Sunderland. When the children arrived in France they took part in a daily Ski School, during which the children made OUTSTANDING progress. The children also took part in French activities, explored local landmarks and took part in shopping activities in Chamonix. It was an incredible adventure for the children and staff!'}"
|
985 |
+
]
|
986 |
+
},
|
987 |
+
"execution_count": 38,
|
988 |
+
"metadata": {},
|
989 |
+
"output_type": "execute_result"
|
990 |
+
}
|
991 |
+
],
|
992 |
+
"source": [
|
993 |
+
"# Falcon = Falcon.train_test_split(test_size=0.10)\n",
|
994 |
+
"\n",
|
995 |
+
"\"\"\"Then take a look at an example:\"\"\"\n",
|
996 |
+
"\n",
|
997 |
+
"Falcon['train'][0]\n"
|
998 |
+
]
|
999 |
+
},
|
1000 |
+
{
|
1001 |
+
"cell_type": "code",
|
1002 |
+
"execution_count": 39,
|
1003 |
+
"metadata": {},
|
1004 |
+
"outputs": [
|
1005 |
+
{
|
1006 |
+
"data": {
|
1007 |
+
"text/plain": [
|
1008 |
+
"{'Text': 'Our Annual Garden Party is a fun-filled event with a ton of landscaping and garden supplies; gardening demonstrations, experts, and vendors; activities for kids; live bands; and local food. It’s been so popular that we’re extending it to TWO DAYS this year!\\nFestivities at 10am – 4pm Saturday and 11am – 3pm Sunday\\nShopping from 9am – 6pm both days\\nThroughout the winter, we collect gently-used and surplus lawn & garden supplies as well as outdoor décor and furniture. Then, we put it all out for your shopping pleasure! The sale begins at 9:00 am Saturday, but folks start lining up outside the gates even earlier, eager to dig through piles of flowerpots and shovels. (If you can’t get there in the morning, don’t worry – the staff continues to bring out items throughout the weekend.)\\nThe Garden Sale 1st.\\nThere will be prizes for people and pets dressed in garden party finery.\\nPhoto by Carrie Delesky\\nSo find yourself a dapper suit or fancy hat, and check out all the activities in store for you:\\nAnacostia Watershed Society\\nPrince George’s Chapter, Maryland Master Gardeners\\nMOM’s Organic Market\\nTreincarnation\\nVeteran Compost\\nPhoto by Carrie Delesky\\nSaturday the Forklift’s Matt Menke and Gary Barnhart of GL Barnhart Construction. Drop in for a while, or stay the whole.'}"
|
1009 |
+
]
|
1010 |
+
},
|
1011 |
+
"execution_count": 39,
|
1012 |
+
"metadata": {},
|
1013 |
+
"output_type": "execute_result"
|
1014 |
+
}
|
1015 |
+
],
|
1016 |
+
"source": [
|
1017 |
+
"Falcon['validation'][0]\n"
|
1018 |
+
]
|
1019 |
+
},
|
1020 |
+
{
|
1021 |
+
"cell_type": "code",
|
1022 |
+
"execution_count": 41,
|
1023 |
+
"metadata": {},
|
1024 |
+
"outputs": [
|
1025 |
+
{
|
1026 |
+
"name": "stderr",
|
1027 |
+
"output_type": "stream",
|
1028 |
+
"text": [
|
1029 |
+
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
|
1030 |
+
]
|
1031 |
+
}
|
1032 |
+
],
|
1033 |
+
"source": [
|
1034 |
+
"\"\"\"The next step is to load a DistilGPT2 tokenizer to process the `text` subfield:\"\"\"\n",
|
1035 |
+
"\n",
|
1036 |
+
"from transformers import AutoTokenizer, GPT2TokenizerFast\n",
|
1037 |
+
"\n",
|
1038 |
+
"# tokenizer = AutoTokenizer.from_pretrained(\"distilgpt2\")\n",
|
1039 |
+
"\n",
|
1040 |
+
"\n",
|
1041 |
+
"tokenizer = GPT2TokenizerFast.from_pretrained(\"Xenova/gpt-4\")#, cache_dir=cache_dir)\n",
|
1042 |
+
"tokenizer.pad_token = tokenizer.eos_token\n"
|
1043 |
+
]
|
1044 |
+
},
|
1045 |
+
{
|
1046 |
+
"cell_type": "code",
|
1047 |
+
"execution_count": 42,
|
1048 |
+
"metadata": {},
|
1049 |
+
"outputs": [
|
1050 |
+
{
|
1051 |
+
"data": {
|
1052 |
+
"text/plain": [
|
1053 |
+
"{'Text': 'School Picture Gallery\\nFrance Ski School\\nChildren from Year 5 & 6 travelled to France from Newcastle airport to take part in a week of Ski School. The children had already spent 3 weeks learning the basics of skiing at Silksworth Ski School in Sunderland. When the children arrived in France they took part in a daily Ski School, during which the children made OUTSTANDING progress. The children also took part in French activities, explored local landmarks and took part in shopping activities in Chamonix. It was an incredible adventure for the children and staff!'}"
|
1054 |
+
]
|
1055 |
+
},
|
1056 |
+
"execution_count": 42,
|
1057 |
+
"metadata": {},
|
1058 |
+
"output_type": "execute_result"
|
1059 |
+
}
|
1060 |
+
],
|
1061 |
+
"source": [
|
1062 |
+
"Falcon = Falcon.flatten()\n",
|
1063 |
+
"Falcon[\"train\"][0]"
|
1064 |
+
]
|
1065 |
+
},
|
1066 |
+
{
|
1067 |
+
"cell_type": "code",
|
1068 |
+
"execution_count": 43,
|
1069 |
+
"metadata": {},
|
1070 |
+
"outputs": [
|
1071 |
+
{
|
1072 |
+
"name": "stdout",
|
1073 |
+
"output_type": "stream",
|
1074 |
+
"text": [
|
1075 |
+
"The OrderedVocab you are attempting to save contains holes for indices [100256, 100261, 100262, 100263, 100266, 100267, 100268, 100269, 100270, 100271, 100272, 100273, 100274, 100275], your vocabulary could be corrupted !\n"
|
1076 |
+
]
|
1077 |
+
},
|
1078 |
+
{
|
1079 |
+
"data": {
|
1080 |
+
"application/vnd.jupyter.widget-view+json": {
|
1081 |
+
"model_id": "d2182d4fa561406ab7eb5fc6c19c6d17",
|
1082 |
+
"version_major": 2,
|
1083 |
+
"version_minor": 0
|
1084 |
+
},
|
1085 |
+
"text/plain": [
|
1086 |
+
"Map (num_proc=4): 0%| | 0/10000 [00:00<?, ? examples/s]"
|
1087 |
+
]
|
1088 |
+
},
|
1089 |
+
"metadata": {},
|
1090 |
+
"output_type": "display_data"
|
1091 |
+
},
|
1092 |
+
{
|
1093 |
+
"name": "stderr",
|
1094 |
+
"output_type": "stream",
|
1095 |
+
"text": [
|
1096 |
+
"Token indices sequence length is longer than the specified maximum sequence length for this model (10412 > 8192). Running this sequence through the model will result in indexing errors\n",
|
1097 |
+
"Token indices sequence length is longer than the specified maximum sequence length for this model (10738 > 8192). Running this sequence through the model will result in indexing errors\n",
|
1098 |
+
"Token indices sequence length is longer than the specified maximum sequence length for this model (12860 > 8192). Running this sequence through the model will result in indexing errors\n",
|
1099 |
+
"Token indices sequence length is longer than the specified maximum sequence length for this model (23091 > 8192). Running this sequence through the model will result in indexing errors\n"
|
1100 |
+
]
|
1101 |
+
},
|
1102 |
+
{
|
1103 |
+
"name": "stdout",
|
1104 |
+
"output_type": "stream",
|
1105 |
+
"text": [
|
1106 |
+
"The OrderedVocab you are attempting to save contains holes for indices [100256, 100261, 100262, 100263, 100266, 100267, 100268, 100269, 100270, 100271, 100272, 100273, 100274, 100275], your vocabulary could be corrupted !\n"
|
1107 |
+
]
|
1108 |
+
},
|
1109 |
+
{
|
1110 |
+
"data": {
|
1111 |
+
"application/vnd.jupyter.widget-view+json": {
|
1112 |
+
"model_id": "121ffe72baf143f4aeea4616bee88405",
|
1113 |
+
"version_major": 2,
|
1114 |
+
"version_minor": 0
|
1115 |
+
},
|
1116 |
+
"text/plain": [
|
1117 |
+
"Map (num_proc=4): 0%| | 0/1000 [00:00<?, ? examples/s]"
|
1118 |
+
]
|
1119 |
+
},
|
1120 |
+
"metadata": {},
|
1121 |
+
"output_type": "display_data"
|
1122 |
+
},
|
1123 |
+
{
|
1124 |
+
"name": "stderr",
|
1125 |
+
"output_type": "stream",
|
1126 |
+
"text": [
|
1127 |
+
"Token indices sequence length is longer than the specified maximum sequence length for this model (9078 > 8192). Running this sequence through the model will result in indexing errors\n",
|
1128 |
+
"Token indices sequence length is longer than the specified maximum sequence length for this model (15886 > 8192). Running this sequence through the model will result in indexing errors\n",
|
1129 |
+
"Token indices sequence length is longer than the specified maximum sequence length for this model (28727 > 8192). Running this sequence through the model will result in indexing errors\n",
|
1130 |
+
"Token indices sequence length is longer than the specified maximum sequence length for this model (8257 > 8192). Running this sequence through the model will result in indexing errors\n"
|
1131 |
+
]
|
1132 |
+
}
|
1133 |
+
],
|
1134 |
+
"source": [
|
1135 |
+
"def preprocess_function(examples):\n",
|
1136 |
+
" return tokenizer([\" \".join(x) for x in examples[\"Text\"]])\n",
|
1137 |
+
"\n",
|
1138 |
+
"\n",
|
1139 |
+
"\n",
|
1140 |
+
"tokenized_Falcon = Falcon.map(\n",
|
1141 |
+
" preprocess_function,\n",
|
1142 |
+
" batched=True,\n",
|
1143 |
+
" num_proc=4,\n",
|
1144 |
+
" remove_columns=Falcon[\"train\"].column_names,\n",
|
1145 |
+
")"
|
1146 |
+
]
|
1147 |
+
},
|
1148 |
+
{
|
1149 |
+
"cell_type": "code",
|
1150 |
+
"execution_count": 44,
|
1151 |
+
"metadata": {},
|
1152 |
+
"outputs": [
|
1153 |
+
{
|
1154 |
+
"data": {
|
1155 |
+
"application/vnd.jupyter.widget-view+json": {
|
1156 |
+
"model_id": "6d7b13436ae54624bd96973987373482",
|
1157 |
+
"version_major": 2,
|
1158 |
+
"version_minor": 0
|
1159 |
+
},
|
1160 |
+
"text/plain": [
|
1161 |
+
"Map (num_proc=4): 0%| | 0/10000 [00:00<?, ? examples/s]"
|
1162 |
+
]
|
1163 |
+
},
|
1164 |
+
"metadata": {},
|
1165 |
+
"output_type": "display_data"
|
1166 |
+
},
|
1167 |
+
{
|
1168 |
+
"data": {
|
1169 |
+
"application/vnd.jupyter.widget-view+json": {
|
1170 |
+
"model_id": "beade64b537441ef99a54830bb66eef2",
|
1171 |
+
"version_major": 2,
|
1172 |
+
"version_minor": 0
|
1173 |
+
},
|
1174 |
+
"text/plain": [
|
1175 |
+
"Map (num_proc=4): 0%| | 0/1000 [00:00<?, ? examples/s]"
|
1176 |
+
]
|
1177 |
+
},
|
1178 |
+
"metadata": {},
|
1179 |
+
"output_type": "display_data"
|
1180 |
+
}
|
1181 |
+
],
|
1182 |
+
"source": [
|
1183 |
+
"# block_size = tokenizer.model_max_length\n",
|
1184 |
+
"block_size = 2048\n",
|
1185 |
+
"\n",
|
1186 |
+
"\n",
|
1187 |
+
"def group_texts(examples):\n",
|
1188 |
+
" # Concatenate all texts.\n",
|
1189 |
+
" concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}\n",
|
1190 |
+
" total_length = len(concatenated_examples[list(examples.keys())[0]])\n",
|
1191 |
+
" # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can\n",
|
1192 |
+
" # customize this part to your needs.\n",
|
1193 |
+
" if total_length >= block_size:\n",
|
1194 |
+
" total_length = (total_length // block_size) * block_size\n",
|
1195 |
+
" # Split by chunks of block_size.\n",
|
1196 |
+
" result = {\n",
|
1197 |
+
" k: [t[i : i + block_size] for i in range(0, total_length, block_size)]\n",
|
1198 |
+
" for k, t in concatenated_examples.items()\n",
|
1199 |
+
" }\n",
|
1200 |
+
" result[\"labels\"] = result[\"input_ids\"].copy()\n",
|
1201 |
+
" return result\n",
|
1202 |
+
"\n",
|
1203 |
+
"\"\"\"Apply the `group_texts` function over the entire dataset:\"\"\"\n",
|
1204 |
+
"\n",
|
1205 |
+
"lm_dataset = tokenized_Falcon.map(group_texts, batched=True, num_proc=4)\n"
|
1206 |
+
]
|
1207 |
+
},
|
1208 |
+
{
|
1209 |
+
"cell_type": "code",
|
1210 |
+
"execution_count": 45,
|
1211 |
+
"metadata": {},
|
1212 |
+
"outputs": [],
|
1213 |
+
"source": [
|
1214 |
+
"from transformers import DataCollatorForLanguageModeling\n",
|
1215 |
+
"\n",
|
1216 |
+
"# tokenizer.pad_token = tokenizer.eos_token\n",
|
1217 |
+
"data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)\n"
|
1218 |
+
]
|
1219 |
+
},
|
1220 |
+
{
|
1221 |
+
"cell_type": "code",
|
1222 |
+
"execution_count": null,
|
1223 |
+
"metadata": {},
|
1224 |
+
"outputs": [],
|
1225 |
+
"source": [
|
1226 |
+
"# from transformers import AutoModelForCausalLM, TrainingArguments, Trainer\n",
|
1227 |
+
"# import torch\n",
|
1228 |
+
"# model = AutoModelForCausalLM.from_pretrained(\"tensorplex-labs/pretraining-sn9-7B-5\", torch_dtype=torch.bfloat16)\n",
|
1229 |
+
"\n",
|
1230 |
+
"# print('Model Loaded!')\n"
|
1231 |
+
]
|
1232 |
+
},
|
1233 |
+
{
|
1234 |
+
"cell_type": "code",
|
1235 |
+
"execution_count": 46,
|
1236 |
+
"metadata": {},
|
1237 |
+
"outputs": [
|
1238 |
+
{
|
1239 |
+
"data": {
|
1240 |
+
"text/plain": [
|
1241 |
+
"MistralForCausalLM(\n",
|
1242 |
+
" (model): MistralModel(\n",
|
1243 |
+
" (embed_tokens): Embedding(32000, 4096)\n",
|
1244 |
+
" (layers): ModuleList(\n",
|
1245 |
+
" (0-29): 30 x MistralDecoderLayer(\n",
|
1246 |
+
" (self_attn): MistralSdpaAttention(\n",
|
1247 |
+
" (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n",
|
1248 |
+
" (k_proj): Linear(in_features=4096, out_features=1024, bias=False)\n",
|
1249 |
+
" (v_proj): Linear(in_features=4096, out_features=1024, bias=False)\n",
|
1250 |
+
" (o_proj): Linear(in_features=4096, out_features=4096, bias=False)\n",
|
1251 |
+
" (rotary_emb): MistralRotaryEmbedding()\n",
|
1252 |
+
" )\n",
|
1253 |
+
" (mlp): MistralMLP(\n",
|
1254 |
+
" (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n",
|
1255 |
+
" (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n",
|
1256 |
+
" (down_proj): Linear(in_features=14336, out_features=4096, bias=False)\n",
|
1257 |
+
" (act_fn): SiLU()\n",
|
1258 |
+
" )\n",
|
1259 |
+
" (input_layernorm): MistralRMSNorm()\n",
|
1260 |
+
" (post_attention_layernorm): MistralRMSNorm()\n",
|
1261 |
+
" )\n",
|
1262 |
+
" )\n",
|
1263 |
+
" (norm): MistralRMSNorm()\n",
|
1264 |
+
" )\n",
|
1265 |
+
" (lm_head): Linear(in_features=4096, out_features=32000, bias=False)\n",
|
1266 |
+
")"
|
1267 |
+
]
|
1268 |
+
},
|
1269 |
+
"execution_count": 46,
|
1270 |
+
"metadata": {},
|
1271 |
+
"output_type": "execute_result"
|
1272 |
+
}
|
1273 |
+
],
|
1274 |
+
"source": [
|
1275 |
+
"model.to('cuda')"
|
1276 |
+
]
|
1277 |
+
},
|
1278 |
+
{
|
1279 |
+
"cell_type": "code",
|
1280 |
+
"execution_count": 47,
|
1281 |
+
"metadata": {},
|
1282 |
+
"outputs": [
|
1283 |
+
{
|
1284 |
+
"data": {
|
1285 |
+
"text/plain": [
|
1286 |
+
"6805508096"
|
1287 |
+
]
|
1288 |
+
},
|
1289 |
+
"execution_count": 47,
|
1290 |
+
"metadata": {},
|
1291 |
+
"output_type": "execute_result"
|
1292 |
+
}
|
1293 |
+
],
|
1294 |
+
"source": [
|
1295 |
+
"pytorch_total_params = sum(p.numel() for p in model.parameters())\n",
|
1296 |
+
"pytorch_total_params"
|
1297 |
+
]
|
1298 |
+
},
|
1299 |
+
{
|
1300 |
+
"cell_type": "code",
|
1301 |
+
"execution_count": 48,
|
1302 |
+
"metadata": {},
|
1303 |
+
"outputs": [],
|
1304 |
+
"source": [
|
1305 |
+
"training_args = TrainingArguments(\n",
|
1306 |
+
" output_dir=\"Fine-Tuned-S9-2\",\n",
|
1307 |
+
" overwrite_output_dir=True,\n",
|
1308 |
+
" bf16=True,\n",
|
1309 |
+
" # evaluation_strategy=\"epoch\",\n",
|
1310 |
+
" evaluation_strategy=\"steps\",\n",
|
1311 |
+
" learning_rate=2e-5,\n",
|
1312 |
+
" weight_decay=0.01,\n",
|
1313 |
+
" num_train_epochs=1,\n",
|
1314 |
+
" per_device_train_batch_size=2,\n",
|
1315 |
+
" per_device_eval_batch_size=2,\n",
|
1316 |
+
" lr_scheduler_type = 'cosine',\n",
|
1317 |
+
" push_to_hub=False,\n",
|
1318 |
+
" save_total_limit = 2,\n",
|
1319 |
+
" # save_strategy = “no”\n",
|
1320 |
+
" load_best_model_at_end=False,\n",
|
1321 |
+
")\n",
|
1322 |
+
"\n",
|
1323 |
+
"trainer = Trainer(\n",
|
1324 |
+
" model=model,\n",
|
1325 |
+
" args=training_args,\n",
|
1326 |
+
" train_dataset=lm_dataset[\"train\"],\n",
|
1327 |
+
" eval_dataset=lm_dataset[\"validation\"],\n",
|
1328 |
+
" # eval_dataset=lm_dataset[\"test\"],\n",
|
1329 |
+
" data_collator=data_collator,\n",
|
1330 |
+
")"
|
1331 |
+
]
|
1332 |
+
},
|
1333 |
+
{
|
1334 |
+
"cell_type": "code",
|
1335 |
+
"execution_count": 49,
|
1336 |
+
"metadata": {},
|
1337 |
+
"outputs": [
|
1338 |
+
{
|
1339 |
+
"name": "stdout",
|
1340 |
+
"output_type": "stream",
|
1341 |
+
"text": [
|
1342 |
+
"Started Training!\n"
|
1343 |
+
]
|
1344 |
+
},
|
1345 |
+
{
|
1346 |
+
"name": "stderr",
|
1347 |
+
"output_type": "stream",
|
1348 |
+
"text": [
|
1349 |
+
"\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mthatmlguy\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n"
|
1350 |
+
]
|
1351 |
+
},
|
1352 |
+
{
|
1353 |
+
"data": {
|
1354 |
+
"text/html": [
|
1355 |
+
"Tracking run with wandb version 0.17.0"
|
1356 |
+
],
|
1357 |
+
"text/plain": [
|
1358 |
+
"<IPython.core.display.HTML object>"
|
1359 |
+
]
|
1360 |
+
},
|
1361 |
+
"metadata": {},
|
1362 |
+
"output_type": "display_data"
|
1363 |
+
},
|
1364 |
+
{
|
1365 |
+
"data": {
|
1366 |
+
"text/html": [
|
1367 |
+
"Run data is saved locally in <code>/workspace/ShortGPT/short_gpt/wandb/run-20240516_090043-ni1hktjg</code>"
|
1368 |
+
],
|
1369 |
+
"text/plain": [
|
1370 |
+
"<IPython.core.display.HTML object>"
|
1371 |
+
]
|
1372 |
+
},
|
1373 |
+
"metadata": {},
|
1374 |
+
"output_type": "display_data"
|
1375 |
+
},
|
1376 |
+
{
|
1377 |
+
"data": {
|
1378 |
+
"text/html": [
|
1379 |
+
"Syncing run <strong><a href='https://wandb.ai/thatmlguy/huggingface/runs/ni1hktjg' target=\"_blank\">misty-serenity-4</a></strong> to <a href='https://wandb.ai/thatmlguy/huggingface' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/run' target=\"_blank\">docs</a>)<br/>"
|
1380 |
+
],
|
1381 |
+
"text/plain": [
|
1382 |
+
"<IPython.core.display.HTML object>"
|
1383 |
+
]
|
1384 |
+
},
|
1385 |
+
"metadata": {},
|
1386 |
+
"output_type": "display_data"
|
1387 |
+
},
|
1388 |
+
{
|
1389 |
+
"data": {
|
1390 |
+
"text/html": [
|
1391 |
+
" View project at <a href='https://wandb.ai/thatmlguy/huggingface' target=\"_blank\">https://wandb.ai/thatmlguy/huggingface</a>"
|
1392 |
+
],
|
1393 |
+
"text/plain": [
|
1394 |
+
"<IPython.core.display.HTML object>"
|
1395 |
+
]
|
1396 |
+
},
|
1397 |
+
"metadata": {},
|
1398 |
+
"output_type": "display_data"
|
1399 |
+
},
|
1400 |
+
{
|
1401 |
+
"data": {
|
1402 |
+
"text/html": [
|
1403 |
+
" View run at <a href='https://wandb.ai/thatmlguy/huggingface/runs/ni1hktjg' target=\"_blank\">https://wandb.ai/thatmlguy/huggingface/runs/ni1hktjg</a>"
|
1404 |
+
],
|
1405 |
+
"text/plain": [
|
1406 |
+
"<IPython.core.display.HTML object>"
|
1407 |
+
]
|
1408 |
+
},
|
1409 |
+
"metadata": {},
|
1410 |
+
"output_type": "display_data"
|
1411 |
+
},
|
1412 |
+
{
|
1413 |
+
"data": {
|
1414 |
+
"text/html": [
|
1415 |
+
"\n",
|
1416 |
+
" <div>\n",
|
1417 |
+
" \n",
|
1418 |
+
" <progress value='2' max='6459' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
|
1419 |
+
" [ 2/6459 : < :, Epoch 0.00/1]\n",
|
1420 |
+
" </div>\n",
|
1421 |
+
" <table border=\"1\" class=\"dataframe\">\n",
|
1422 |
+
" <thead>\n",
|
1423 |
+
" <tr style=\"text-align: left;\">\n",
|
1424 |
+
" <th>Step</th>\n",
|
1425 |
+
" <th>Training Loss</th>\n",
|
1426 |
+
" <th>Validation Loss</th>\n",
|
1427 |
+
" </tr>\n",
|
1428 |
+
" </thead>\n",
|
1429 |
+
" <tbody>\n",
|
1430 |
+
" </tbody>\n",
|
1431 |
+
"</table><p>"
|
1432 |
+
],
|
1433 |
+
"text/plain": [
|
1434 |
+
"<IPython.core.display.HTML object>"
|
1435 |
+
]
|
1436 |
+
},
|
1437 |
+
"metadata": {},
|
1438 |
+
"output_type": "display_data"
|
1439 |
+
},
|
1440 |
+
{
|
1441 |
+
"ename": "OutOfMemoryError",
|
1442 |
+
"evalue": "CUDA out of memory. Tried to allocate 112.00 MiB. GPU ",
|
1443 |
+
"output_type": "error",
|
1444 |
+
"traceback": [
|
1445 |
+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
1446 |
+
"\u001b[0;31mOutOfMemoryError\u001b[0m Traceback (most recent call last)",
|
1447 |
+
"Cell \u001b[0;32mIn[49], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# trainer.train()\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mStarted Training!\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m----> 3\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
|
1448 |
+
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/trainer.py:1859\u001b[0m, in \u001b[0;36mTrainer.train\u001b[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[0m\n\u001b[1;32m 1857\u001b[0m hf_hub_utils\u001b[38;5;241m.\u001b[39menable_progress_bars()\n\u001b[1;32m 1858\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1859\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43minner_training_loop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1860\u001b[0m \u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1861\u001b[0m \u001b[43m \u001b[49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1862\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrial\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrial\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1863\u001b[0m \u001b[43m \u001b[49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1864\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n",
|
1449 |
+
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/trainer.py:2203\u001b[0m, in \u001b[0;36mTrainer._inner_training_loop\u001b[0;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[0m\n\u001b[1;32m 2200\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcallback_handler\u001b[38;5;241m.\u001b[39mon_step_begin(args, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol)\n\u001b[1;32m 2202\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maccelerator\u001b[38;5;241m.\u001b[39maccumulate(model):\n\u001b[0;32m-> 2203\u001b[0m tr_loss_step \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtraining_step\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2205\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 2206\u001b[0m args\u001b[38;5;241m.\u001b[39mlogging_nan_inf_filter\n\u001b[1;32m 2207\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_torch_xla_available()\n\u001b[1;32m 2208\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m (torch\u001b[38;5;241m.\u001b[39misnan(tr_loss_step) \u001b[38;5;129;01mor\u001b[39;00m torch\u001b[38;5;241m.\u001b[39misinf(tr_loss_step))\n\u001b[1;32m 2209\u001b[0m ):\n\u001b[1;32m 2210\u001b[0m \u001b[38;5;66;03m# if loss is nan or inf simply add the average of previous logged losses\u001b[39;00m\n\u001b[1;32m 2211\u001b[0m tr_loss \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m tr_loss \u001b[38;5;241m/\u001b[39m (\u001b[38;5;241m1\u001b[39m \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mglobal_step \u001b[38;5;241m-\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_globalstep_last_logged)\n",
|
1450 |
+
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/trainer.py:3138\u001b[0m, in \u001b[0;36mTrainer.training_step\u001b[0;34m(self, model, inputs)\u001b[0m\n\u001b[1;32m 3135\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m loss_mb\u001b[38;5;241m.\u001b[39mreduce_mean()\u001b[38;5;241m.\u001b[39mdetach()\u001b[38;5;241m.\u001b[39mto(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mdevice)\n\u001b[1;32m 3137\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcompute_loss_context_manager():\n\u001b[0;32m-> 3138\u001b[0m loss \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompute_loss\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3140\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mn_gpu \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 3141\u001b[0m loss \u001b[38;5;241m=\u001b[39m loss\u001b[38;5;241m.\u001b[39mmean() \u001b[38;5;66;03m# mean() to average on multi-gpu parallel training\u001b[39;00m\n",
|
1451 |
+
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/trainer.py:3161\u001b[0m, in \u001b[0;36mTrainer.compute_loss\u001b[0;34m(self, model, inputs, return_outputs)\u001b[0m\n\u001b[1;32m 3159\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 3160\u001b[0m labels \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 3161\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3162\u001b[0m \u001b[38;5;66;03m# Save past state if it exists\u001b[39;00m\n\u001b[1;32m 3163\u001b[0m \u001b[38;5;66;03m# TODO: this needs to be fixed and made cleaner later.\u001b[39;00m\n\u001b[1;32m 3164\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mpast_index \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m:\n",
|
1452 |
+
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1532\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1530\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1531\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1532\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
1453 |
+
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1541\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1536\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1537\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1538\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1539\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1540\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1541\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1543\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1544\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
|
1454 |
+
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/accelerate/utils/operations.py:822\u001b[0m, in \u001b[0;36mconvert_outputs_to_fp32.<locals>.forward\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 821\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m--> 822\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmodel_forward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
1455 |
+
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/accelerate/utils/operations.py:810\u001b[0m, in \u001b[0;36mConvertOutputsToFp32.__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 809\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m--> 810\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m convert_to_fp32(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel_forward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m)\n",
|
1456 |
+
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/amp/autocast_mode.py:16\u001b[0m, in \u001b[0;36mautocast_decorator.<locals>.decorate_autocast\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(func)\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecorate_autocast\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 15\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m autocast_instance:\n\u001b[0;32m---> 16\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
1457 |
+
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/models/mistral/modeling_mistral.py:1158\u001b[0m, in \u001b[0;36mMistralForCausalLM.forward\u001b[0;34m(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m 1155\u001b[0m return_dict \u001b[38;5;241m=\u001b[39m return_dict \u001b[38;5;28;01mif\u001b[39;00m return_dict \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39muse_return_dict\n\u001b[1;32m 1157\u001b[0m \u001b[38;5;66;03m# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)\u001b[39;00m\n\u001b[0;32m-> 1158\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1159\u001b[0m \u001b[43m \u001b[49m\u001b[43minput_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minput_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1160\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1161\u001b[0m \u001b[43m \u001b[49m\u001b[43mposition_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mposition_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1162\u001b[0m \u001b[43m \u001b[49m\u001b[43mpast_key_values\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpast_key_values\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1163\u001b[0m \u001b[43m \u001b[49m\u001b[43minputs_embeds\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs_embeds\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1164\u001b[0m \u001b[43m \u001b[49m\u001b[43muse_cache\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_cache\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1165\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1166\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1167\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_dict\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1168\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1170\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 1171\u001b[0m logits \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlm_head(hidden_states)\n",
|
1458 |
+
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1532\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1530\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1531\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1532\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
1459 |
+
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1541\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1536\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1537\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1538\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1539\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1540\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1541\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1543\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1544\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
|
1460 |
+
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/models/mistral/modeling_mistral.py:1043\u001b[0m, in \u001b[0;36mMistralModel.forward\u001b[0;34m(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m 1033\u001b[0m layer_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_gradient_checkpointing_func(\n\u001b[1;32m 1034\u001b[0m decoder_layer\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__call__\u001b[39m,\n\u001b[1;32m 1035\u001b[0m hidden_states,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1040\u001b[0m use_cache,\n\u001b[1;32m 1041\u001b[0m )\n\u001b[1;32m 1042\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1043\u001b[0m layer_outputs \u001b[38;5;241m=\u001b[39m \u001b[43mdecoder_layer\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1044\u001b[0m \u001b[43m \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1045\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1046\u001b[0m \u001b[43m \u001b[49m\u001b[43mposition_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mposition_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1047\u001b[0m \u001b[43m \u001b[49m\u001b[43mpast_key_value\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpast_key_values\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1048\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1049\u001b[0m \u001b[43m \u001b[49m\u001b[43muse_cache\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_cache\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1050\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1052\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m layer_outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 1054\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m use_cache:\n",
|
1461 |
+
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1532\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1530\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1531\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1532\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
1462 |
+
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1541\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1536\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1537\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1538\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1539\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1540\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1541\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1543\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1544\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
|
1463 |
+
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/models/mistral/modeling_mistral.py:770\u001b[0m, in \u001b[0;36mMistralDecoderLayer.forward\u001b[0;34m(self, hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache, **kwargs)\u001b[0m\n\u001b[1;32m 768\u001b[0m residual \u001b[38;5;241m=\u001b[39m hidden_states\n\u001b[1;32m 769\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpost_attention_layernorm(hidden_states)\n\u001b[0;32m--> 770\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmlp\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 771\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m residual \u001b[38;5;241m+\u001b[39m hidden_states\n\u001b[1;32m 773\u001b[0m outputs \u001b[38;5;241m=\u001b[39m (hidden_states,)\n",
|
1464 |
+
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1532\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1530\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1531\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1532\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
1465 |
+
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1541\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1536\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1537\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1538\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1539\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1540\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1541\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1543\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1544\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
|
1466 |
+
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/models/mistral/modeling_mistral.py:179\u001b[0m, in \u001b[0;36mMistralMLP.forward\u001b[0;34m(self, x)\u001b[0m\n\u001b[1;32m 178\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, x):\n\u001b[0;32m--> 179\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdown_proj(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mact_fn(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgate_proj\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[43m)\u001b[49m) \u001b[38;5;241m*\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mup_proj(x))\n",
|
1467 |
+
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1532\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1530\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1531\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1532\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
1468 |
+
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1541\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1536\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1537\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1538\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1539\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1540\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1541\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1543\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1544\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
|
1469 |
+
"File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/linear.py:116\u001b[0m, in \u001b[0;36mLinear.forward\u001b[0;34m(self, input)\u001b[0m\n\u001b[1;32m 115\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28minput\u001b[39m: Tensor) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tensor:\n\u001b[0;32m--> 116\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mF\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlinear\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbias\u001b[49m\u001b[43m)\u001b[49m\n",
|
1470 |
+
"\u001b[0;31mOutOfMemoryError\u001b[0m: CUDA out of memory. Tried to allocate 112.00 MiB. GPU "
|
1471 |
+
]
|
1472 |
+
}
|
1473 |
+
],
|
1474 |
+
"source": [
|
1475 |
+
"# trainer.train()\n",
|
1476 |
+
"print('Started Training!')\n",
|
1477 |
+
"trainer.train()"
|
1478 |
+
]
|
1479 |
+
},
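The cell above fails with a CUDA OutOfMemoryError inside trainer.train(). A minimal sketch of common mitigations, assuming the Trainer/TrainingArguments pattern used elsewhere in this notebook (all values below are illustrative, not taken from this diff):

import torch
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="tmp/",
    per_device_train_batch_size=1,        # shrink the per-step activation footprint
    gradient_accumulation_steps=8,        # keep the effective batch size at 8
    gradient_checkpointing=True,          # recompute activations in backward to save memory
    bf16=torch.cuda.is_bf16_supported(),  # lower-precision activations where the GPU supports it
)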
|
1480 |
+
{
|
1481 |
+
"cell_type": "code",
|
1482 |
+
"execution_count": null,
|
1483 |
+
"metadata": {},
|
1484 |
+
"outputs": [],
|
1485 |
+
"source": [
|
1486 |
+
"import math\n",
|
1487 |
+
"\n",
|
1488 |
+
"eval_results = trainer.evaluate()\n",
|
1489 |
+
"print(f\"Perplexity: {math.exp(eval_results['eval_loss']):.2f}\")\n"
|
1490 |
+
]
|
1491 |
+
},
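Perplexity is the exponential of the mean cross-entropy loss reported by the trainer, so an eval_loss of, say, 2.0 corresponds to a perplexity of exp(2.0) ≈ 7.39.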
|
1492 |
+
{
|
1493 |
+
"cell_type": "code",
|
1494 |
+
"execution_count": 29,
|
1495 |
+
"metadata": {},
|
1496 |
+
"outputs": [],
|
1497 |
+
"source": [
|
1498 |
+
"# # referencing https://github.com/meta-llama/llama-recipes/blob/main/recipes/finetuning/huggingface_trainer/peft_finetuning.ipynb\n",
|
1499 |
+
"# eval_prompt = \"\"\"\n",
|
1500 |
+
"# Summarize this dialog:\n",
|
1501 |
+
"# A: Hi Tom, are you busy tomorrow's afternoon?\n",
|
1502 |
+
"# B: I'm pretty sure I am. What's up?\n",
|
1503 |
+
"# A: Can you go with me to the animal shelter?.\n",
|
1504 |
+
"# B: What do you want to do?\n",
|
1505 |
+
"# A: I want to get a puppy for my son.\n",
|
1506 |
+
"# B: That will make him so happy.\n",
|
1507 |
+
"# A: Yeah, we've discussed it many times. I think he's ready now.\n",
|
1508 |
+
"# B: That's good. Raising a dog is a tough issue. Like having a baby ;-) \n",
|
1509 |
+
"# A: I'll get him one of those little dogs.\n",
|
1510 |
+
"# B: One that won't grow up too big;-)\n",
|
1511 |
+
"# A: And eat too much;-))\n",
|
1512 |
+
"# B: Do you know which one he would like?\n",
|
1513 |
+
"# A: Oh, yes, I took him there last Monday. He showed me one that he really liked.\n",
|
1514 |
+
"# B: I bet you had to drag him away.\n",
|
1515 |
+
"# A: He wanted to take it home right away ;-).\n",
|
1516 |
+
"# B: I wonder what he'll name it.\n",
|
1517 |
+
"# A: He said he'd name it after his dead hamster - Lemmy - he's a great Motorhead fan :-)))\n",
|
1518 |
+
"# ---\n",
|
1519 |
+
"# Summary:\n",
|
1520 |
+
"# \"\"\"\n",
|
1521 |
+
"\n",
|
1522 |
+
"# model_input = tokenizer(eval_prompt, return_tensors=\"pt\").to(\"cuda\")\n",
|
1523 |
+
"\n",
|
1524 |
+
"# model.eval()\n",
|
1525 |
+
"# with torch.no_grad():\n",
|
1526 |
+
"# print(tokenizer.decode(model.generate(**model_input, max_new_tokens=100, use_cache=True)[0], skip_special_tokens=True))"
|
1527 |
+
]
|
1528 |
+
},
|
1529 |
+
{
|
1530 |
+
"cell_type": "code",
|
1531 |
+
"execution_count": 30,
|
1532 |
+
"metadata": {},
|
1533 |
+
"outputs": [],
|
1534 |
+
"source": [
|
1535 |
+
"# def get_preprocessed_samsum():\n",
|
1536 |
+
"# dataset = load_dataset(\"samsum\", split=\"train\")\n",
|
1537 |
+
"\n",
|
1538 |
+
"# prompt = (\n",
|
1539 |
+
"# f\"Summarize this dialog:\\n{{dialog}}\\n---\\nSummary:\\n\"\n",
|
1540 |
+
"# )\n",
|
1541 |
+
"\n",
|
1542 |
+
"# def apply_prompt_template(sample):\n",
|
1543 |
+
"# return {\n",
|
1544 |
+
"# \"prompt\": prompt.format(dialog=sample[\"dialogue\"]),\n",
|
1545 |
+
"# \"summary\": sample[\"summary\"],\n",
|
1546 |
+
"# }\n",
|
1547 |
+
"\n",
|
1548 |
+
"# dataset = dataset.map(apply_prompt_template, remove_columns=list(dataset.features))\n",
|
1549 |
+
"\n",
|
1550 |
+
"# def tokenize_add_label(sample):\n",
|
1551 |
+
"# prompt = tokenizer.encode(tokenizer.bos_token + sample[\"prompt\"], add_special_tokens=False)\n",
|
1552 |
+
"# summary = tokenizer.encode(sample[\"summary\"] + tokenizer.eos_token, add_special_tokens=False)\n",
|
1553 |
+
"# sample = {\n",
|
1554 |
+
"# \"input_ids\": prompt + summary,\n",
|
1555 |
+
"# \"attention_mask\" : [1] * (len(prompt) + len(summary)),\n",
|
1556 |
+
"# \"labels\": [-100] * len(prompt) + summary,\n",
|
1557 |
+
"# }\n",
|
1558 |
+
"\n",
|
1559 |
+
"# return sample\n",
|
1560 |
+
"\n",
|
1561 |
+
"# dataset = dataset.map(tokenize_add_label, remove_columns=list(dataset.features))\n",
|
1562 |
+
"\n",
|
1563 |
+
"# return dataset"
|
1564 |
+
]
|
1565 |
+
},
|
1566 |
+
{
|
1567 |
+
"cell_type": "code",
|
1568 |
+
"execution_count": 31,
|
1569 |
+
"metadata": {},
|
1570 |
+
"outputs": [],
|
1571 |
+
"source": [
|
1572 |
+
"# model.train()\n",
|
1573 |
+
"\n",
|
1574 |
+
"# def create_peft_config(model):\n",
|
1575 |
+
"# peft_config = LoraConfig(\n",
|
1576 |
+
"# task_type=TaskType.CAUSAL_LM,\n",
|
1577 |
+
"# inference_mode=False,\n",
|
1578 |
+
"# r=8,\n",
|
1579 |
+
"# lora_alpha=32,\n",
|
1580 |
+
"# lora_dropout=0.05,\n",
|
1581 |
+
"# target_modules = [\"q_proj\", \"v_proj\"]\n",
|
1582 |
+
"# )\n",
|
1583 |
+
"\n",
|
1584 |
+
"# model = get_peft_model(model, peft_config)\n",
|
1585 |
+
"# model.print_trainable_parameters()\n",
|
1586 |
+
"# return model, peft_config\n",
|
1587 |
+
"\n",
|
1588 |
+
"# # create peft config\n",
|
1589 |
+
"# model, lora_config = create_peft_config(model)"
|
1590 |
+
]
|
1591 |
+
},
|
1592 |
+
{
|
1593 |
+
"cell_type": "code",
|
1594 |
+
"execution_count": 32,
|
1595 |
+
"metadata": {},
|
1596 |
+
"outputs": [],
|
1597 |
+
"source": [
|
1598 |
+
"# output_dir = \"tmp/\"\n",
|
1599 |
+
"\n",
|
1600 |
+
"# config = {\n",
|
1601 |
+
"# 'lora_config': lora_config,\n",
|
1602 |
+
"# 'learning_rate': 1e-6,\n",
|
1603 |
+
"# 'num_train_epochs': 1,\n",
|
1604 |
+
"# 'per_device_train_batch_size': 1,\n",
|
1605 |
+
"# 'gradient_checkpointing': False,\n",
|
1606 |
+
"# }\n"
|
1607 |
+
]
|
1608 |
+
},
|
1609 |
+
{
|
1610 |
+
"cell_type": "code",
|
1611 |
+
"execution_count": 33,
|
1612 |
+
"metadata": {},
|
1613 |
+
"outputs": [],
|
1614 |
+
"source": [
|
1615 |
+
"# training_args = TrainingArguments(\n",
|
1616 |
+
"# output_dir=output_dir,\n",
|
1617 |
+
"# overwrite_output_dir=True,\n",
|
1618 |
+
"# # logging strategies\n",
|
1619 |
+
"# logging_strategy=\"steps\",\n",
|
1620 |
+
"# logging_steps=10,\n",
|
1621 |
+
"# save_strategy=\"no\",\n",
|
1622 |
+
"# optim=\"adamw_torch_fused\",\n",
|
1623 |
+
"# **{k:v for k,v in config.items() if k != 'lora_config'}\n",
|
1624 |
+
"# )\n",
|
1625 |
+
"\n",
|
1626 |
+
"# # Create Trainer instance\n",
|
1627 |
+
"# trainer = Trainer(\n",
|
1628 |
+
"# model=model,\n",
|
1629 |
+
"# args=training_args,\n",
|
1630 |
+
"# train_dataset=get_preprocessed_samsum(),\n",
|
1631 |
+
"# data_collator=default_data_collator,\n",
|
1632 |
+
"# callbacks=[],\n",
|
1633 |
+
"# )\n",
|
1634 |
+
"\n",
|
1635 |
+
"# # Start training\n",
|
1636 |
+
"# trainer.train()"
|
1637 |
+
]
|
1638 |
+
},
|
1639 |
+
{
|
1640 |
+
"cell_type": "code",
|
1641 |
+
"execution_count": 34,
|
1642 |
+
"metadata": {},
|
1643 |
+
"outputs": [],
|
1644 |
+
"source": [
|
1645 |
+
"# model.eval()\n",
|
1646 |
+
"# with torch.no_grad():\n",
|
1647 |
+
"# print(tokenizer.decode(model.generate(**model_input, max_new_tokens=100)[0], skip_special_tokens=True))"
|
1648 |
+
]
|
1649 |
+
},
|
1650 |
+
{
|
1651 |
+
"cell_type": "code",
|
1652 |
+
"execution_count": null,
|
1653 |
+
"metadata": {},
|
1654 |
+
"outputs": [],
|
1655 |
+
"source": []
|
1656 |
+
}
|
1657 |
+
],
|
1658 |
+
"metadata": {
|
1659 |
+
"kernelspec": {
|
1660 |
+
"display_name": "Python 3 (ipykernel)",
|
1661 |
+
"language": "python",
|
1662 |
+
"name": "python3"
|
1663 |
+
},
|
1664 |
+
"language_info": {
|
1665 |
+
"codemirror_mode": {
|
1666 |
+
"name": "ipython",
|
1667 |
+
"version": 3
|
1668 |
+
},
|
1669 |
+
"file_extension": ".py",
|
1670 |
+
"mimetype": "text/x-python",
|
1671 |
+
"name": "python",
|
1672 |
+
"nbconvert_exporter": "python",
|
1673 |
+
"pygments_lexer": "ipython3",
|
1674 |
+
"version": "3.11.9"
|
1675 |
+
}
|
1676 |
+
},
|
1677 |
+
"nbformat": 4,
|
1678 |
+
"nbformat_minor": 4
|
1679 |
+
}
|
short_gpt/short_llama.ipynb
ADDED
@@ -0,0 +1,573 @@
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 1,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [],
|
8 |
+
"source": [
|
9 |
+
"from tqdm.notebook import tqdm\n",
|
10 |
+
"\n",
|
11 |
+
"from datasets import load_dataset\n",
|
12 |
+
"import torch\n",
|
13 |
+
"from torch.utils.data import DataLoader\n",
|
14 |
+
"\n",
|
15 |
+
"from llama import Llama\n",
|
16 |
+
"\n",
|
17 |
+
"from short_llama import ShortLlama"
|
18 |
+
]
|
19 |
+
},
|
20 |
+
{
|
21 |
+
"cell_type": "markdown",
|
22 |
+
"metadata": {},
|
23 |
+
"source": [
|
24 |
+
"### Load Data"
|
25 |
+
]
|
26 |
+
},
|
27 |
+
{
|
28 |
+
"cell_type": "code",
|
29 |
+
"execution_count": 2,
|
30 |
+
"metadata": {},
|
31 |
+
"outputs": [
|
32 |
+
{
|
33 |
+
"name": "stderr",
|
34 |
+
"output_type": "stream",
|
35 |
+
"text": [
|
36 |
+
"c:\\Users\\Shivaen\\anaconda3\\envs\\shortgpt\\lib\\site-packages\\datasets\\load.py:1461: FutureWarning: The repository for pg19 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/pg19\n",
|
37 |
+
"You can avoid this message in future by passing the argument `trust_remote_code=True`.\n",
|
38 |
+
"Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.\n",
|
39 |
+
" warnings.warn(\n"
|
40 |
+
]
|
41 |
+
}
|
42 |
+
],
|
43 |
+
"source": [
|
44 |
+
"data = load_dataset(\"pg19\", split=\"validation\") # authors sample 10,000 texts to compute block influences\n",
|
45 |
+
"dataloader = DataLoader(\n",
|
46 |
+
" data,\n",
|
47 |
+
" batch_size=1,\n",
|
48 |
+
" shuffle=True,\n",
|
49 |
+
" generator=torch.Generator(device=\"cuda\")\n",
|
50 |
+
")"
|
51 |
+
]
|
52 |
+
},
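The FutureWarning above can be avoided by passing the argument the message itself recommends:

data = load_dataset("pg19", split="validation", trust_remote_code=True)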
|
53 |
+
{
|
54 |
+
"cell_type": "markdown",
|
55 |
+
"metadata": {},
|
56 |
+
"source": [
|
57 |
+
"### Fetch and Wrap Model"
|
58 |
+
]
|
59 |
+
},
|
60 |
+
{
|
61 |
+
"cell_type": "code",
|
62 |
+
"execution_count": 3,
|
63 |
+
"metadata": {},
|
64 |
+
"outputs": [
|
65 |
+
{
|
66 |
+
"name": "stdout",
|
67 |
+
"output_type": "stream",
|
68 |
+
"text": [
|
69 |
+
"> initializing model parallel with size 1\n",
|
70 |
+
"> initializing ddp with size 1\n",
|
71 |
+
"> initializing pipeline with size 1\n"
|
72 |
+
]
|
73 |
+
},
|
74 |
+
{
|
75 |
+
"name": "stderr",
|
76 |
+
"output_type": "stream",
|
77 |
+
"text": [
|
78 |
+
"c:\\Users\\Shivaen\\anaconda3\\envs\\shortgpt\\lib\\site-packages\\torch\\__init__.py:696: UserWarning: torch.set_default_tensor_type() is deprecated as of PyTorch 2.1, please use torch.set_default_dtype() and torch.set_default_device() as alternatives. (Triggered internally at C:\\cb\\pytorch_1000000000000\\work\\torch\\csrc\\tensor\\python_tensor.cpp:453.)\n",
|
79 |
+
" _C._set_default_tensor_type(t)\n"
|
80 |
+
]
|
81 |
+
},
|
82 |
+
{
|
83 |
+
"name": "stdout",
|
84 |
+
"output_type": "stream",
|
85 |
+
"text": [
|
86 |
+
"Loaded in 10.96 seconds\n"
|
87 |
+
]
|
88 |
+
}
|
89 |
+
],
|
90 |
+
"source": [
|
91 |
+
"MAX_SEQ_LEN = 1024 # authors use a context width of 1024\n",
|
92 |
+
"llama = Llama.build(\n",
|
93 |
+
" ckpt_dir=\"../../llama/llama-2-7b\",\n",
|
94 |
+
" tokenizer_path=\"../../llama/tokenizer.model\",\n",
|
95 |
+
" max_seq_len=MAX_SEQ_LEN,\n",
|
96 |
+
" max_batch_size=1,\n",
|
97 |
+
")"
|
98 |
+
]
|
99 |
+
},
|
100 |
+
{
|
101 |
+
"cell_type": "code",
|
102 |
+
"execution_count": 4,
|
103 |
+
"metadata": {},
|
104 |
+
"outputs": [
|
105 |
+
{
|
106 |
+
"data": {
|
107 |
+
"text/plain": [
|
108 |
+
"ModuleList(\n",
|
109 |
+
" (0-31): 32 x TransformerBlock(\n",
|
110 |
+
" (attention): Attention(\n",
|
111 |
+
" (wq): ColumnParallelLinear()\n",
|
112 |
+
" (wk): ColumnParallelLinear()\n",
|
113 |
+
" (wv): ColumnParallelLinear()\n",
|
114 |
+
" (wo): RowParallelLinear()\n",
|
115 |
+
" )\n",
|
116 |
+
" (feed_forward): FeedForward(\n",
|
117 |
+
" (w1): ColumnParallelLinear()\n",
|
118 |
+
" (w2): RowParallelLinear()\n",
|
119 |
+
" (w3): ColumnParallelLinear()\n",
|
120 |
+
" )\n",
|
121 |
+
" (attention_norm): RMSNorm()\n",
|
122 |
+
" (ffn_norm): RMSNorm()\n",
|
123 |
+
" )\n",
|
124 |
+
")"
|
125 |
+
]
|
126 |
+
},
|
127 |
+
"execution_count": 4,
|
128 |
+
"metadata": {},
|
129 |
+
"output_type": "execute_result"
|
130 |
+
}
|
131 |
+
],
|
132 |
+
"source": [
|
133 |
+
"short_llama = ShortLlama(llama=llama, n_prune_layers=9)\n",
|
134 |
+
"\n",
|
135 |
+
"short_llama.llama.model.layers"
|
136 |
+
]
|
137 |
+
},
|
138 |
+
{
|
139 |
+
"cell_type": "code",
|
140 |
+
"execution_count": 5,
|
141 |
+
"metadata": {},
|
142 |
+
"outputs": [
|
143 |
+
{
|
144 |
+
"data": {
|
145 |
+
"text/plain": [
|
146 |
+
"[{'generation': '1960s-70s era pop music. I grew up listening to the radio'}]"
|
147 |
+
]
|
148 |
+
},
|
149 |
+
"execution_count": 5,
|
150 |
+
"metadata": {},
|
151 |
+
"output_type": "execute_result"
|
152 |
+
}
|
153 |
+
],
|
154 |
+
"source": [
|
155 |
+
"# sample generation\n",
|
156 |
+
"short_llama.llama.text_completion(\n",
|
157 |
+
" prompts=[\"I am an avid fan of \"],\n",
|
158 |
+
" max_gen_len=20\n",
|
159 |
+
")"
|
160 |
+
]
|
161 |
+
},
|
162 |
+
{
|
163 |
+
"cell_type": "markdown",
|
164 |
+
"metadata": {},
|
165 |
+
"source": [
|
166 |
+
"### Compute Importances"
|
167 |
+
]
|
168 |
+
},
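The loop below accumulates per-layer scores via block_influence from metrics.py (listed in this commit but not shown in this section). A minimal sketch of that metric, assuming the ShortGPT definition (one minus the cosine similarity between a block's input and output hidden states):

import torch

def block_influence(in_hidden: torch.Tensor, out_hidden: torch.Tensor, angular: bool = False) -> torch.Tensor:
    # per-token cosine similarity between a block's input and output, over the hidden dimension
    sim = torch.nn.functional.cosine_similarity(in_hidden, out_hidden, dim=-1).clamp(-1.0, 1.0)
    if angular:
        # angular distance (arXiv:2403.17887, section 3.2): arccos(sim) / pi
        return torch.arccos(sim) / torch.pi
    # block influence: blocks that barely change their input score low and are removal candidates
    return 1.0 - sim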
|
169 |
+
{
|
170 |
+
"cell_type": "code",
|
171 |
+
"execution_count": 6,
|
172 |
+
"metadata": {},
|
173 |
+
"outputs": [
|
174 |
+
{
|
175 |
+
"data": {
|
176 |
+
"application/vnd.jupyter.widget-view+json": {
|
177 |
+
"model_id": "bf50ed0464aa454386d996e71b4541b4",
|
178 |
+
"version_major": 2,
|
179 |
+
"version_minor": 0
|
180 |
+
},
|
181 |
+
"text/plain": [
|
182 |
+
" 0%| | 0/50 [00:00<?, ?it/s]"
|
183 |
+
]
|
184 |
+
},
|
185 |
+
"metadata": {},
|
186 |
+
"output_type": "display_data"
|
187 |
+
}
|
188 |
+
],
|
189 |
+
"source": [
|
190 |
+
"for batch in tqdm(dataloader):\n",
|
191 |
+
" prompts = batch['text']\n",
|
192 |
+
"\n",
|
193 |
+
" prompt_tokens = [short_llama.llama.tokenizer.encode(x, bos=True, eos=False) for x in prompts]\n",
|
194 |
+
" max_prompt_len = max(len(t) for t in prompt_tokens)\n",
|
195 |
+
"\n",
|
196 |
+
" # authors use a sliding window of size 1024 with a shift of 256\n",
|
197 |
+
" for start in range(0, max_prompt_len, 256):\n",
|
198 |
+
"\n",
|
199 |
+
" inputs = [p[start:start+MAX_SEQ_LEN] for p in prompt_tokens if len(p) > start]\n",
|
200 |
+
"\n",
|
201 |
+
" short_llama.eval_importance(\n",
|
202 |
+
" prompt_tokens=inputs,\n",
|
203 |
+
" max_gen_len=0\n",
|
204 |
+
" )"
|
205 |
+
]
|
206 |
+
},
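To make the window arithmetic in the loop above concrete, a small illustration with hypothetical token ids (window size 1024, shift 256, matching the loop):

MAX_SEQ_LEN = 1024
tokens = list(range(2000))  # hypothetical 2,000-token document
for start in range(0, len(tokens), 256):
    window = tokens[start:start + MAX_SEQ_LEN]
    # (start, len(window)): (0, 1024), (256, 1024), (512, 1024), (768, 1024),
    # (1024, 976), (1280, 720), (1536, 464), (1792, 208)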
|
207 |
+
{
|
208 |
+
"cell_type": "code",
|
209 |
+
"execution_count": 7,
|
210 |
+
"metadata": {},
|
211 |
+
"outputs": [
|
212 |
+
{
|
213 |
+
"data": {
|
214 |
+
"text/plain": [
|
215 |
+
"[8358921.716796875,\n",
|
216 |
+
" 5211709.220703125,\n",
|
217 |
+
" 3259066.66796875,\n",
|
218 |
+
" 3164092.5087890625,\n",
|
219 |
+
" 3518517.248046875,\n",
|
220 |
+
" 3153696.0009765625,\n",
|
221 |
+
" 3062620.751953125,\n",
|
222 |
+
" 2856062.2998046875,\n",
|
223 |
+
" 2674124.23828125,\n",
|
224 |
+
" 2545894.03125,\n",
|
225 |
+
" 2382950.501953125,\n",
|
226 |
+
" 2194983.1455078125,\n",
|
227 |
+
" 2146358.5107421875,\n",
|
228 |
+
" 2180816.779296875,\n",
|
229 |
+
" 2145900.15234375,\n",
|
230 |
+
" 2126212.3974609375,\n",
|
231 |
+
" 2180678.5244140625,\n",
|
232 |
+
" 1686190.7548828125,\n",
|
233 |
+
" 1524035.5732421875,\n",
|
234 |
+
" 1270041.162109375,\n",
|
235 |
+
" 1368594.52734375,\n",
|
236 |
+
" 954588.056640625,\n",
|
237 |
+
" 944560.7900390625,\n",
|
238 |
+
" 780482.943359375,\n",
|
239 |
+
" 743930.5283203125,\n",
|
240 |
+
" 732873.1806640625,\n",
|
241 |
+
" 745402.265625,\n",
|
242 |
+
" 733417.81640625,\n",
|
243 |
+
" 762292.994140625,\n",
|
244 |
+
" 771143.9541015625,\n",
|
245 |
+
" 1303522.251953125,\n",
|
246 |
+
" 5824847.5546875]"
|
247 |
+
]
|
248 |
+
},
|
249 |
+
"execution_count": 7,
|
250 |
+
"metadata": {},
|
251 |
+
"output_type": "execute_result"
|
252 |
+
}
|
253 |
+
],
|
254 |
+
"source": [
|
255 |
+
"short_llama.importances"
|
256 |
+
]
|
257 |
+
},
|
258 |
+
{
|
259 |
+
"cell_type": "markdown",
|
260 |
+
"metadata": {},
|
261 |
+
"source": [
|
262 |
+
"### Remove unimportant layers\n",
|
263 |
+
"\n",
|
264 |
+
"Layers removed when using pg19 val set: [25, 27, 24, 26, 28, 29, 23, 22, 21]\n",
|
265 |
+
"\n",
|
266 |
+
"Note: Different order than paper but same 9 least important layers -> [27, 26, 25, 28, 24, 29, 23, 21, 22]\n",
|
267 |
+
"\n",
|
268 |
+
"Additionally, authors mention that the layer order is quite nuanced and can vary with different datasets. However, relative order suggests similar importance."
|
269 |
+
]
|
270 |
+
},
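As a sanity check, the list above is just the argsort of the printed importances; a quick snippet, assuming short_llama.importances still holds the values shown earlier:

import numpy as np
print(np.argsort(short_llama.importances)[:9].tolist())
# -> [25, 27, 24, 26, 28, 29, 23, 22, 21]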
|
271 |
+
{
|
272 |
+
"cell_type": "code",
|
273 |
+
"execution_count": 8,
|
274 |
+
"metadata": {},
|
275 |
+
"outputs": [
|
276 |
+
{
|
277 |
+
"data": {
|
278 |
+
"text/plain": [
|
279 |
+
"[25, 27, 24, 26, 28, 29, 23, 22, 21]"
|
280 |
+
]
|
281 |
+
},
|
282 |
+
"execution_count": 8,
|
283 |
+
"metadata": {},
|
284 |
+
"output_type": "execute_result"
|
285 |
+
}
|
286 |
+
],
|
287 |
+
"source": [
|
288 |
+
"short_llama.remove_layers()"
|
289 |
+
]
|
290 |
+
},
|
291 |
+
{
|
292 |
+
"cell_type": "code",
|
293 |
+
"execution_count": 9,
|
294 |
+
"metadata": {},
|
295 |
+
"outputs": [
|
296 |
+
{
|
297 |
+
"data": {
|
298 |
+
"text/plain": [
|
299 |
+
"ModuleList(\n",
|
300 |
+
" (0-22): 23 x TransformerBlock(\n",
|
301 |
+
" (attention): Attention(\n",
|
302 |
+
" (wq): ColumnParallelLinear()\n",
|
303 |
+
" (wk): ColumnParallelLinear()\n",
|
304 |
+
" (wv): ColumnParallelLinear()\n",
|
305 |
+
" (wo): RowParallelLinear()\n",
|
306 |
+
" )\n",
|
307 |
+
" (feed_forward): FeedForward(\n",
|
308 |
+
" (w1): ColumnParallelLinear()\n",
|
309 |
+
" (w2): RowParallelLinear()\n",
|
310 |
+
" (w3): ColumnParallelLinear()\n",
|
311 |
+
" )\n",
|
312 |
+
" (attention_norm): RMSNorm()\n",
|
313 |
+
" (ffn_norm): RMSNorm()\n",
|
314 |
+
" )\n",
|
315 |
+
")"
|
316 |
+
]
|
317 |
+
},
|
318 |
+
"execution_count": 9,
|
319 |
+
"metadata": {},
|
320 |
+
"output_type": "execute_result"
|
321 |
+
}
|
322 |
+
],
|
323 |
+
"source": [
|
324 |
+
"short_llama.llama.model.layers"
|
325 |
+
]
|
326 |
+
},
|
327 |
+
{
|
328 |
+
"cell_type": "markdown",
|
329 |
+
"metadata": {},
|
330 |
+
"source": [
|
331 |
+
"As the paper states: \\\n",
|
332 |
+
" - \"Our experiments reveal that the effect of layer removal is significantly more pronounced on generative\n",
|
333 |
+
" tasks compared to multiple-choice tasks. On benchmarks such as GSM8K (Cobbe et al., 2021) and\n",
|
334 |
+
" HumanEval (Chen et al., 2021), removing 25% of the layers often leads to a severe performance\n",
|
335 |
+
" drop, with scores approaching zero.\""
|
336 |
+
]
|
337 |
+
},
|
338 |
+
{
|
339 |
+
"cell_type": "code",
|
340 |
+
"execution_count": 10,
|
341 |
+
"metadata": {},
|
342 |
+
"outputs": [
|
343 |
+
{
|
344 |
+
"data": {
|
345 |
+
"text/plain": [
|
346 |
+
"[{'generation': 'Đo n Khơ 20th Century. Hinweis: In = ,t and lồ'}]"
|
347 |
+
]
|
348 |
+
},
|
349 |
+
"execution_count": 10,
|
350 |
+
"metadata": {},
|
351 |
+
"output_type": "execute_result"
|
352 |
+
}
|
353 |
+
],
|
354 |
+
"source": [
|
355 |
+
"short_llama.llama.text_completion(\n",
|
356 |
+
" prompts=[\"I am an avid fan of \"],\n",
|
357 |
+
" max_gen_len=20\n",
|
358 |
+
")"
|
359 |
+
]
|
360 |
+
},
|
361 |
+
{
|
362 |
+
"cell_type": "markdown",
|
363 |
+
"metadata": {},
|
364 |
+
"source": [
|
365 |
+
"### Compute Angular Importances"
|
366 |
+
]
|
367 |
+
},
|
368 |
+
{
|
369 |
+
"cell_type": "code",
|
370 |
+
"execution_count": 6,
|
371 |
+
"metadata": {},
|
372 |
+
"outputs": [
|
373 |
+
{
|
374 |
+
"data": {
|
375 |
+
"application/vnd.jupyter.widget-view+json": {
|
376 |
+
"model_id": "8ae0be70aa9344edbd252648c84e08e0",
|
377 |
+
"version_major": 2,
|
378 |
+
"version_minor": 0
|
379 |
+
},
|
380 |
+
"text/plain": [
|
381 |
+
" 0%| | 0/50 [00:00<?, ?it/s]"
|
382 |
+
]
|
383 |
+
},
|
384 |
+
"metadata": {},
|
385 |
+
"output_type": "display_data"
|
386 |
+
}
|
387 |
+
],
|
388 |
+
"source": [
|
389 |
+
"for batch in tqdm(dataloader):\n",
|
390 |
+
" prompts = batch['text']\n",
|
391 |
+
"\n",
|
392 |
+
" prompt_tokens = [short_llama.llama.tokenizer.encode(x, bos=True, eos=False) for x in prompts]\n",
|
393 |
+
" max_prompt_len = max(len(t) for t in prompt_tokens)\n",
|
394 |
+
"\n",
|
395 |
+
" # authors use a sliding window of size 1024 with a shift of 256\n",
|
396 |
+
" for start in range(0, max_prompt_len, 256):\n",
|
397 |
+
"\n",
|
398 |
+
" inputs = [p[start:start+MAX_SEQ_LEN] for p in prompt_tokens if len(p) > start]\n",
|
399 |
+
"\n",
|
400 |
+
" short_llama.eval_importance(\n",
|
401 |
+
" prompt_tokens=inputs,\n",
|
402 |
+
" max_gen_len=0,\n",
|
403 |
+
" angular=True\n",
|
404 |
+
" )"
|
405 |
+
]
|
406 |
+
},
|
407 |
+
{
|
408 |
+
"cell_type": "code",
|
409 |
+
"execution_count": 7,
|
410 |
+
"metadata": {},
|
411 |
+
"outputs": [
|
412 |
+
{
|
413 |
+
"data": {
|
414 |
+
"text/plain": [
|
415 |
+
"[8640.460205078125,\n",
|
416 |
+
" 7881.541015625,\n",
|
417 |
+
" 7303.3876953125,\n",
|
418 |
+
" 7156.226318359375,\n",
|
419 |
+
" 7003.533935546875,\n",
|
420 |
+
" 6749.5189208984375,\n",
|
421 |
+
" 6630.6031494140625,\n",
|
422 |
+
" 6494.6051025390625,\n",
|
423 |
+
" 6475.490295410156,\n",
|
424 |
+
" 6482.81884765625,\n",
|
425 |
+
" 6489.277587890625,\n",
|
426 |
+
" 6479.0064697265625,\n",
|
427 |
+
" 6486.2188720703125,\n",
|
428 |
+
" 6440.6580810546875,\n",
|
429 |
+
" 6338.8604736328125,\n",
|
430 |
+
" 6196.098876953125,\n",
|
431 |
+
" 6014.3204345703125,\n",
|
432 |
+
" 5677.5113525390625,\n",
|
433 |
+
" 5532.0673828125,\n",
|
434 |
+
" 5384.6334228515625,\n",
|
435 |
+
" 5314.61669921875,\n",
|
436 |
+
" 5176.587646484375,\n",
|
437 |
+
" 5425.315673828125,\n",
|
438 |
+
" 7029.1893310546875,\n",
|
439 |
+
" 0,\n",
|
440 |
+
" 0,\n",
|
441 |
+
" 0,\n",
|
442 |
+
" 0,\n",
|
443 |
+
" 0,\n",
|
444 |
+
" 0,\n",
|
445 |
+
" 0,\n",
|
446 |
+
" 0]"
|
447 |
+
]
|
448 |
+
},
|
449 |
+
"execution_count": 7,
|
450 |
+
"metadata": {},
|
451 |
+
"output_type": "execute_result"
|
452 |
+
}
|
453 |
+
],
|
454 |
+
"source": [
|
455 |
+
"short_llama.importances"
|
456 |
+
]
|
457 |
+
},
|
458 |
+
{
|
459 |
+
"cell_type": "markdown",
|
460 |
+
"metadata": {},
|
461 |
+
"source": [
|
462 |
+
"### Remove unimportant layers"
|
463 |
+
]
|
464 |
+
},
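Unlike the plain BI case, remove_layers(angular=True) removes one contiguous window of n_prune_layers layers, starting at the argmin of the windowed angular distances. A minimal sketch mirroring the logic in short_llama.py below:

import numpy as np
n = 9
valid = short_llama.importances[: len(short_llama.importances) - n + 1]  # window must fit inside the stack
start = int(np.argmin(valid))
print(list(range(start, start + n)))  # -> [21, 22, 23, 24, 25, 26, 27, 28, 29]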
|
465 |
+
{
|
466 |
+
"cell_type": "code",
|
467 |
+
"execution_count": 8,
|
468 |
+
"metadata": {},
|
469 |
+
"outputs": [
|
470 |
+
{
|
471 |
+
"data": {
|
472 |
+
"text/plain": [
|
473 |
+
"[21, 22, 23, 24, 25, 26, 27, 28, 29]"
|
474 |
+
]
|
475 |
+
},
|
476 |
+
"execution_count": 8,
|
477 |
+
"metadata": {},
|
478 |
+
"output_type": "execute_result"
|
479 |
+
}
|
480 |
+
],
|
481 |
+
"source": [
|
482 |
+
"short_llama.remove_layers(angular=True)"
|
483 |
+
]
|
484 |
+
},
|
485 |
+
{
|
486 |
+
"cell_type": "code",
|
487 |
+
"execution_count": 9,
|
488 |
+
"metadata": {},
|
489 |
+
"outputs": [
|
490 |
+
{
|
491 |
+
"data": {
|
492 |
+
"text/plain": [
|
493 |
+
"ModuleList(\n",
|
494 |
+
" (0-22): 23 x TransformerBlock(\n",
|
495 |
+
" (attention): Attention(\n",
|
496 |
+
" (wq): ColumnParallelLinear()\n",
|
497 |
+
" (wk): ColumnParallelLinear()\n",
|
498 |
+
" (wv): ColumnParallelLinear()\n",
|
499 |
+
" (wo): RowParallelLinear()\n",
|
500 |
+
" )\n",
|
501 |
+
" (feed_forward): FeedForward(\n",
|
502 |
+
" (w1): ColumnParallelLinear()\n",
|
503 |
+
" (w2): RowParallelLinear()\n",
|
504 |
+
" (w3): ColumnParallelLinear()\n",
|
505 |
+
" )\n",
|
506 |
+
" (attention_norm): RMSNorm()\n",
|
507 |
+
" (ffn_norm): RMSNorm()\n",
|
508 |
+
" )\n",
|
509 |
+
")"
|
510 |
+
]
|
511 |
+
},
|
512 |
+
"execution_count": 9,
|
513 |
+
"metadata": {},
|
514 |
+
"output_type": "execute_result"
|
515 |
+
}
|
516 |
+
],
|
517 |
+
"source": [
|
518 |
+
"short_llama.llama.model.layers"
|
519 |
+
]
|
520 |
+
},
|
521 |
+
{
|
522 |
+
"cell_type": "code",
|
523 |
+
"execution_count": 10,
|
524 |
+
"metadata": {},
|
525 |
+
"outputs": [
|
526 |
+
{
|
527 |
+
"data": {
|
528 |
+
"text/plain": [
|
529 |
+
"[{'generation': 'Đo n Khơ 20th Century. Hinweis: In = ,t and lồ'}]"
|
530 |
+
]
|
531 |
+
},
|
532 |
+
"execution_count": 10,
|
533 |
+
"metadata": {},
|
534 |
+
"output_type": "execute_result"
|
535 |
+
}
|
536 |
+
],
|
537 |
+
"source": [
|
538 |
+
"short_llama.llama.text_completion(\n",
|
539 |
+
" prompts=[\"I am an avid fan of \"],\n",
|
540 |
+
" max_gen_len=20\n",
|
541 |
+
")"
|
542 |
+
]
|
543 |
+
},
|
544 |
+
{
|
545 |
+
"cell_type": "code",
|
546 |
+
"execution_count": null,
|
547 |
+
"metadata": {},
|
548 |
+
"outputs": [],
|
549 |
+
"source": []
|
550 |
+
}
|
551 |
+
],
|
552 |
+
"metadata": {
|
553 |
+
"kernelspec": {
|
554 |
+
"display_name": "shortgpt",
|
555 |
+
"language": "python",
|
556 |
+
"name": "python3"
|
557 |
+
},
|
558 |
+
"language_info": {
|
559 |
+
"codemirror_mode": {
|
560 |
+
"name": "ipython",
|
561 |
+
"version": 3
|
562 |
+
},
|
563 |
+
"file_extension": ".py",
|
564 |
+
"mimetype": "text/x-python",
|
565 |
+
"name": "python",
|
566 |
+
"nbconvert_exporter": "python",
|
567 |
+
"pygments_lexer": "ipython3",
|
568 |
+
"version": "3.9.18"
|
569 |
+
}
|
570 |
+
},
|
571 |
+
"nbformat": 4,
|
572 |
+
"nbformat_minor": 2
|
573 |
+
}
|
short_gpt/short_llama.py
ADDED
@@ -0,0 +1,219 @@
1 |
+
from typing import List, Optional
|
2 |
+
|
3 |
+
import numpy as np
|
4 |
+
import torch
|
5 |
+
|
6 |
+
from llama import Llama, Transformer
|
7 |
+
|
8 |
+
from metrics import *
|
9 |
+
|
10 |
+
|
11 |
+
def sample_top_p(probs: torch.Tensor, p: float):
|
12 |
+
"""
|
13 |
+
Perform top-p (nucleus) sampling on a probability distribution.
|
14 |
+
|
15 |
+
Args:
|
16 |
+
probs (torch.Tensor): Probability distribution tensor.
|
17 |
+
p (float): Probability threshold for top-p sampling.
|
18 |
+
|
19 |
+
Returns:
|
20 |
+
torch.Tensor: Sampled token indices.
|
21 |
+
|
22 |
+
Note:
|
23 |
+
Top-p sampling selects the smallest set of tokens whose cumulative probability mass
|
24 |
+
exceeds the threshold p. The distribution is renormalized based on the selected tokens.
|
25 |
+
|
26 |
+
"""
|
27 |
+
probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
|
28 |
+
probs_sum = torch.cumsum(probs_sort, dim=-1)
|
29 |
+
mask = probs_sum - probs_sort > p
|
30 |
+
probs_sort[mask] = 0.0
|
31 |
+
probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
|
32 |
+
next_token = torch.multinomial(probs_sort, num_samples=1)
|
33 |
+
next_token = torch.gather(probs_idx, -1, next_token)
|
34 |
+
return next_token
|
35 |
+
|
36 |
+
|
37 |
+
class TransformerWrapper(Transformer):
|
38 |
+
def __init__(self, model):
|
39 |
+
self.__dict__ = model.__dict__.copy()  # adopt the wrapped Transformer's modules and buffers without re-initializing
|
40 |
+
|
41 |
+
@torch.inference_mode()
|
42 |
+
def forward(
|
43 |
+
self,
|
44 |
+
tokens: torch.Tensor,
|
45 |
+
start_pos: int,
|
46 |
+
return_hiddens: Optional[bool] = False):
|
47 |
+
"""
|
48 |
+
Perform a forward pass through the Transformer model.
|
49 |
+
|
50 |
+
Args:
|
51 |
+
tokens (torch.Tensor): Input token indices.
|
52 |
+
start_pos (int): Starting position for attention caching.
|
53 |
+
(Optional) return_hiddens (bool): Whether to return hidden states. Defaults to False.
|
54 |
+
|
55 |
+
Returns:
|
56 |
+
torch.Tensor: Output logits after applying the Transformer model.
|
57 |
+
(Optional) List[torch.Tensor]: Hidden states for each transformer block.
|
58 |
+
"""
|
59 |
+
_bsz, seqlen = tokens.shape
|
60 |
+
h = self.tok_embeddings(tokens)
|
61 |
+
self.freqs_cis = self.freqs_cis.to(h.device)
|
62 |
+
freqs_cis = self.freqs_cis[start_pos : start_pos + seqlen]
|
63 |
+
|
64 |
+
mask = None
|
65 |
+
if seqlen > 1:
|
66 |
+
mask = torch.full(
|
67 |
+
(seqlen, seqlen), float("-inf"), device=tokens.device
|
68 |
+
)
|
69 |
+
|
70 |
+
mask = torch.triu(mask, diagonal=1)
|
71 |
+
|
72 |
+
# When performing key-value caching, we compute the attention scores
|
73 |
+
# only for the new sequence. Thus, the matrix of scores is of size
|
74 |
+
# (seqlen, cache_len + seqlen), and the only masked entries are (i, j) for
|
75 |
+
# j > cache_len + i, since row i corresponds to token cache_len + i.
|
76 |
+
mask = torch.hstack([
|
77 |
+
torch.zeros((seqlen, start_pos), device=tokens.device),
|
78 |
+
mask
|
79 |
+
]).type_as(h)
|
80 |
+
|
81 |
+
hiddens = [h]
|
82 |
+
for layer in self.layers:
|
83 |
+
h = layer(h, start_pos, freqs_cis, mask)
|
84 |
+
if return_hiddens:
|
85 |
+
hiddens.append(h)
|
86 |
+
|
87 |
+
h = self.norm(h)
|
88 |
+
output = self.output(h).float()
|
89 |
+
|
90 |
+
if return_hiddens:
|
91 |
+
return output, hiddens
|
92 |
+
|
93 |
+
return output
|
94 |
+
|
95 |
+
|
96 |
+
class ShortLlama():
|
97 |
+
|
98 |
+
def __init__(self, llama: Llama, n_prune_layers: Optional[int] = None):
|
99 |
+
checkpoint = llama.model.state_dict()
|
100 |
+
llama.model = TransformerWrapper(llama.model) # wrap transformer to collect hidden states
|
101 |
+
llama.model.load_state_dict(checkpoint, strict=False)
|
102 |
+
self.llama = llama
|
103 |
+
|
104 |
+
self.n_prune_layers = n_prune_layers
|
105 |
+
self.importances = [0 for _ in self.llama.model.layers] # layer-wise importance scores
|
106 |
+
|
107 |
+
def remove_layers(
|
108 |
+
self,
|
109 |
+
layers_to_remove: Optional[List[int]] = [],
|
110 |
+
angular: Optional[bool] = False
|
111 |
+
):
|
112 |
+
if angular:
|
113 |
+
assert self.importances, "Need to compute importances with eval_importance()"
|
114 |
+
assert self.n_prune_layers, "Need number of layers to prune, set `n_prune_layers`"
|
115 |
+
# only consider start positions where a full window of n_prune_layers consecutive layers fits
start_layer = np.argsort(np.array(self.importances[:len(self.importances) - self.n_prune_layers + 1]))[0]
|
116 |
+
layers_to_remove = list(range(start_layer, start_layer + self.n_prune_layers))
|
117 |
+
elif not layers_to_remove and self.n_prune_layers:
|
118 |
+
assert self.importances, "Need to compute importances with eval_importance()"
|
119 |
+
layers_to_remove = np.argsort(np.array(self.importances))[:self.n_prune_layers].tolist()
|
120 |
+
|
121 |
+
# remove layers in reverse to avoid indexing errors
|
122 |
+
for layer_idx in sorted(layers_to_remove, reverse=True):
|
123 |
+
try:
|
124 |
+
del self.llama.model.layers[layer_idx]
|
125 |
+
except IndexError:
|
126 |
+
print(f"layer {layer_idx} does not exist, function may have already been called")
|
127 |
+
return []
|
128 |
+
|
129 |
+
return layers_to_remove
|
130 |
+
|
131 |
+
def compute_bi(self, hiddens: List[torch.Tensor], angular: bool):
|
132 |
+
n = 1
|
133 |
+
if angular:
|
134 |
+
assert self.n_prune_layers is not None, "Set number of layers to prune to use angular importance"
|
135 |
+
n = self.n_prune_layers
|
136 |
+
|
137 |
+
for i in range(len(hiddens) - n):
|
138 |
+
in_hidden = hiddens[i]
|
139 |
+
out_hidden = hiddens[i+n]
|
140 |
+
if angular:
|
141 |
+
# use only last token for angular distance as described in section 3.2
|
142 |
+
# https://arxiv.org/pdf/2403.17887.pdf
|
143 |
+
in_hidden = in_hidden[:,-1:]
|
144 |
+
out_hidden = out_hidden[:,-1:]
|
145 |
+
|
146 |
+
self.importances[i] += block_influence(
|
147 |
+
in_hidden,
|
148 |
+
out_hidden,
|
149 |
+
angular=angular
|
150 |
+
).sum().cpu().item()
|
151 |
+
|
152 |
+
@torch.inference_mode()
|
153 |
+
def eval_importance(
|
154 |
+
self,
|
155 |
+
prompt_tokens: List[List[int]],
|
156 |
+
max_gen_len: Optional[int] = 0,
|
157 |
+
temperature: Optional[float] = 0.6,
|
158 |
+
top_p: Optional[float] = 0.9,
|
159 |
+
angular: Optional[bool] = False
|
160 |
+
):
|
161 |
+
"""
|
162 |
+
Computes layer-wise importances over input tokens.
|
163 |
+
|
164 |
+
NOTE: the ShortGPT paper performs no generation during importance computation, which suggests `max_gen_len` = 0.
|
165 |
+
|
166 |
+
Args:
|
167 |
+
prompt_tokens (List[List[int]]): List of tokenized prompts, where each prompt is represented as a list of integers.
|
168 |
+
(Optional) max_gen_len (int): Maximum length of the generated text sequence.
|
169 |
+
(Optional) temperature (float): Temperature value for controlling randomness in sampling. Defaults to 0.6.
|
170 |
+
(Optional) top_p (float): Top-p probability threshold for nucleus sampling. Defaults to 0.9.
|
171 |
+
(Optional) angular (bool): Whether to use angular distance. Defaults to False.
|
172 |
+
|
173 |
+
Returns:
|
174 |
+
None
|
175 |
+
"""
|
176 |
+
params = self.llama.model.params
|
177 |
+
bsz = len(prompt_tokens)
|
178 |
+
assert bsz <= params.max_batch_size, (bsz, params.max_batch_size)
|
179 |
+
|
180 |
+
min_prompt_len = min(len(t) for t in prompt_tokens)
|
181 |
+
max_prompt_len = max(len(t) for t in prompt_tokens)
|
182 |
+
assert max_prompt_len <= params.max_seq_len
|
183 |
+
total_len = min(params.max_seq_len, max_gen_len + max_prompt_len)
|
184 |
+
|
185 |
+
pad_id = self.llama.tokenizer.pad_id
|
186 |
+
tokens = torch.full((bsz, total_len), pad_id, dtype=torch.long, device="cuda")
|
187 |
+
for k, t in enumerate(prompt_tokens):
|
188 |
+
tokens[k, : len(t)] = torch.tensor(t, dtype=torch.long, device="cuda")
|
189 |
+
|
190 |
+
prev_pos = 0
|
191 |
+
eos_reached = torch.tensor([False] * bsz, device="cuda")
|
192 |
+
input_text_mask = tokens != pad_id
|
193 |
+
|
194 |
+
for cur_pos in range(min_prompt_len, total_len):
|
195 |
+
logits = self.llama.model.forward(tokens[:, prev_pos:cur_pos], prev_pos)
|
196 |
+
if temperature > 0:
|
197 |
+
probs = torch.softmax(logits[:, -1] / temperature, dim=-1)
|
198 |
+
next_token = sample_top_p(probs, top_p)
|
199 |
+
else:
|
200 |
+
next_token = torch.argmax(logits[:, -1], dim=-1)
|
201 |
+
|
202 |
+
next_token = next_token.reshape(-1)
|
203 |
+
# only replace token if prompt has already been generated
|
204 |
+
next_token = torch.where(
|
205 |
+
input_text_mask[:, cur_pos], tokens[:, cur_pos], next_token
|
206 |
+
)
|
207 |
+
tokens[:, cur_pos] = next_token
|
208 |
+
eos_reached |= (~input_text_mask[:, cur_pos]) & (
|
209 |
+
next_token == self.llama.tokenizer.eos_id
|
210 |
+
)
|
211 |
+
prev_pos = cur_pos
|
212 |
+
if all(eos_reached):
|
213 |
+
break
|
214 |
+
|
215 |
+
# compute block influence over full sequences rather than at each token
|
216 |
+
_, hiddens = self.llama.model.forward(tokens, 0, return_hiddens=True)
|
217 |
+
self.compute_bi(hiddens, angular=angular)
|
218 |
+
|
219 |
+
return
|