insfsilva committed
Commit edac844 · verified · 1 Parent(s): b41ac6b

Upload Portuguese_Sentiment_Analysis.ipynb

Files changed (1)
  1. Portuguese_Sentiment_Analysis.ipynb +1829 -0
Portuguese_Sentiment_Analysis.ipynb ADDED
@@ -0,0 +1,1829 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {
7
+ "id": "5YjKoArIA-wD"
8
+ },
9
+ "outputs": [],
10
+ "source": [
11
+ "# Instruction to avoid the extense logs for pip install\n",
12
+ "%%capture\n",
13
+ "\n",
14
+ "# This block installs all the dependencies required to train a basic sentiment analyzer.\n",
15
+ "# It only needs to run once at the beginning of the code execution.\n",
16
+ "!pip install -U pt_pump_up\n",
17
+ "!pip install -U datasets\n",
18
+ "!pip install -U transformers"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": null,
24
+ "metadata": {
25
+ "colab": {
26
+ "base_uri": "https://localhost:8080/"
27
+ },
28
+ "id": "HkrJAyMwFC2k",
29
+ "outputId": "c3f8c7f2-d27c-40f0-da62-634abc098551"
30
+ },
31
+ "outputs": [
32
+ {
33
+ "output_type": "stream",
34
+ "name": "stdout",
35
+ "text": [
36
+ "\u001b[33mWARNING: Ignoring invalid distribution -ransformers (/usr/local/lib/python3.10/dist-packages)\u001b[0m\u001b[33m\n",
37
+ "\u001b[0mCollecting numpy==1.26.0\n",
38
+ " Using cached numpy-1.26.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)\n",
39
+ "\u001b[33mWARNING: Ignoring invalid distribution -ransformers (/usr/local/lib/python3.10/dist-packages)\u001b[0m\u001b[33m\n",
40
+ "\u001b[0mInstalling collected packages: numpy\n",
41
+ " Attempting uninstall: numpy\n",
42
+ " Found existing installation: numpy 1.26.4\n",
43
+ " Uninstalling numpy-1.26.4:\n",
44
+ " Successfully uninstalled numpy-1.26.4\n",
45
+ "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
46
+ "cudf-cu12 24.4.1 requires pandas<2.2.2dev0,>=2.0, but you have pandas 2.2.2 which is incompatible.\n",
47
+ "cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 16.1.0 which is incompatible.\n",
48
+ "ibis-framework 8.0.0 requires pyarrow<16,>=2, but you have pyarrow 16.1.0 which is incompatible.\n",
49
+ "pt-pump-up 0.0.11 requires numpy>=1.26.4, but you have numpy 1.26.0 which is incompatible.\u001b[0m\u001b[31m\n",
50
+ "\u001b[0mSuccessfully installed numpy-1.26.0\n"
51
+ ]
52
+ }
53
+ ],
54
+ "source": [
55
+ "# Hotfix to deal with module 'numpy.linalg._umath_linalg' has no attribute '_ilp64' error that only happens in colab\n",
56
+ "!pip install numpy==1.26.0"
57
+ ]
58
+ },
59
+ {
60
+ "cell_type": "code",
61
+ "execution_count": null,
62
+ "metadata": {
63
+ "id": "vaxy_YLFB8PE"
64
+ },
65
+ "outputs": [],
66
+ "source": [
67
+ "\n",
68
+ "import torch\n",
69
+ "\n",
70
+ "# The model needs a GPU to Train. Without GPU it would take ages to train\n",
71
+ "if not torch.cuda.is_available():\n",
72
+ " raise Exception(\"GPU not available. Please enable it in the notebook settings.\")"
73
+ ]
74
+ },
75
+ {
76
+ "cell_type": "code",
77
+ "execution_count": null,
78
+ "metadata": {
79
+ "id": "TLbjjbX2uI2x",
80
+ "colab": {
81
+ "base_uri": "https://localhost:8080/"
82
+ },
83
+ "outputId": "ed180b72-bab9-4e2a-e848-6ae92425c01d"
84
+ },
85
+ "outputs": [
86
+ {
87
+ "output_type": "stream",
88
+ "name": "stdout",
89
+ "text": [
90
+ "\u001b[33mWARNING: Ignoring invalid distribution -ransformers (/usr/local/lib/python3.10/dist-packages)\u001b[0m\u001b[33m\n",
91
+ "\u001b[0mRequirement already satisfied: datasets in /usr/local/lib/python3.10/dist-packages (2.20.0)\n",
92
+ "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from datasets) (3.15.3)\n",
93
+ "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from datasets) (1.26.0)\n",
94
+ "Requirement already satisfied: pyarrow>=15.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (16.1.0)\n",
95
+ "Requirement already satisfied: pyarrow-hotfix in /usr/local/lib/python3.10/dist-packages (from datasets) (0.6)\n",
96
+ "Requirement already satisfied: dill<0.3.9,>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.3.8)\n",
97
+ "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (2.2.2)\n",
98
+ "Requirement already satisfied: requests>=2.32.2 in /usr/local/lib/python3.10/dist-packages (from datasets) (2.32.3)\n",
99
+ "Requirement already satisfied: tqdm>=4.66.3 in /usr/local/lib/python3.10/dist-packages (from datasets) (4.66.4)\n",
100
+ "Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets) (3.4.1)\n",
101
+ "Requirement already satisfied: multiprocess in /usr/local/lib/python3.10/dist-packages (from datasets) (0.70.16)\n",
102
+ "Requirement already satisfied: fsspec[http]<=2024.5.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (2023.6.0)\n",
103
+ "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.9.5)\n",
104
+ "Requirement already satisfied: huggingface-hub>=0.21.2 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.23.4)\n",
105
+ "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from datasets) (24.1)\n",
106
+ "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (6.0.1)\n",
107
+ "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)\n",
108
+ "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (23.2.0)\n",
109
+ "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.4.1)\n",
110
+ "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.0.5)\n",
111
+ "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.9.4)\n",
112
+ "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (4.0.3)\n",
113
+ "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.21.2->datasets) (4.12.2)\n",
114
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (3.3.2)\n",
115
+ "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (3.7)\n",
116
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (2.0.7)\n",
117
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (2024.6.2)\n",
118
+ "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2.8.2)\n",
119
+ "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2023.4)\n",
120
+ "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.1)\n",
121
+ "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)\n",
122
+ "\u001b[33mWARNING: Ignoring invalid distribution -ransformers (/usr/local/lib/python3.10/dist-packages)\u001b[0m\u001b[33m\n",
123
+ "\u001b[0m"
124
+ ]
125
+ }
126
+ ],
127
+ "source": [
128
+ "# HuggingFace standard library to download datasets from the HuggingFace Hub\n",
129
+ "!pip install -U datasets\n",
130
+ "\n",
131
+ "from datasets import load_dataset"
132
+ ]
133
+ },
134
+ {
135
+ "cell_type": "code",
136
+ "execution_count": null,
137
+ "metadata": {
138
+ "id": "NonjqjgkAzkm"
139
+ },
140
+ "outputs": [],
141
+ "source": [
142
+ "# Usage of the PT-Pump-Up library is not mandatory, but it will make your life easier.\n",
143
+ "# It reuses code previously developed for similar NLP tasks. That is already tested and validated.\n",
144
+ "from pt_pump_up.benchmarking import TrainerFactory"
145
+ ]
146
+ },
147
+ {
148
+ "cell_type": "markdown",
149
+ "metadata": {
150
+ "id": "J5-xoiDDXS0R"
151
+ },
152
+ "source": [
153
+ "### Sentiment Analysis with BERT"
154
+ ]
155
+ },
156
+ {
157
+ "cell_type": "code",
158
+ "metadata": {
159
+ "colab": {
160
+ "base_uri": "https://localhost:8080/"
161
+ },
162
+ "id": "zax-hsAmXVMD",
163
+ "outputId": "0e7370f4-15d4-44e1-e04d-700bf3bc50a8"
164
+ },
165
+ "source": [
166
+ "import gc\n",
167
+ "import torch\n",
168
+ "\n",
169
+ "torch.cuda.empty_cache()\n",
170
+ "gc.collect()\n",
171
+ "\n",
172
+ "\n",
173
+ "!nvidia-smi\n"
174
+ ],
175
+ "execution_count": null,
176
+ "outputs": [
177
+ {
178
+ "output_type": "stream",
179
+ "name": "stdout",
180
+ "text": [
181
+ "Thu Aug 1 16:02:17 2024 \n",
182
+ "+---------------------------------------------------------------------------------------+\n",
183
+ "| NVIDIA-SMI 535.104.05 Driver Version: 535.104.05 CUDA Version: 12.2 |\n",
184
+ "|-----------------------------------------+----------------------+----------------------+\n",
185
+ "| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
186
+ "| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n",
187
+ "| | | MIG M. |\n",
188
+ "|=========================================+======================+======================|\n",
189
+ "| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n",
190
+ "| N/A 73C P0 31W / 70W | 11471MiB / 15360MiB | 0% Default |\n",
191
+ "| | | N/A |\n",
192
+ "+-----------------------------------------+----------------------+----------------------+\n",
193
+ " \n",
194
+ "+---------------------------------------------------------------------------------------+\n",
195
+ "| Processes: |\n",
196
+ "| GPU GI CI PID Type Process name GPU Memory |\n",
197
+ "| ID ID Usage |\n",
198
+ "|=======================================================================================|\n",
199
+ "+---------------------------------------------------------------------------------------+\n"
200
+ ]
201
+ }
202
+ ]
203
+ },
204
+ {
205
+ "cell_type": "markdown",
206
+ "metadata": {
207
+ "id": "6YNYgk8EC9f_"
208
+ },
209
+ "source": [
210
+ "### Setup\n",
211
+ "\n",
212
+ "We'll need [the Transformers library](https://huggingface.co/transformers/) by Hugging Face:"
213
+ ]
214
+ },
215
+ {
216
+ "cell_type": "code",
217
+ "metadata": {
218
+ "id": "U4Xpam-YYN0D",
219
+ "colab": {
220
+ "base_uri": "https://localhost:8080/"
221
+ },
222
+ "outputId": "ff83ebe3-63db-4ff3-f4fe-7737566b1a93"
223
+ },
224
+ "source": [
225
+ "!pip install -q -U watermark"
226
+ ],
227
+ "execution_count": null,
228
+ "outputs": [
229
+ {
230
+ "output_type": "stream",
231
+ "name": "stdout",
232
+ "text": [
233
+ "\u001b[33mWARNING: Ignoring invalid distribution -ransformers (/usr/local/lib/python3.10/dist-packages)\u001b[0m\u001b[33m\n",
234
+ "\u001b[0m\u001b[33mWARNING: Ignoring invalid distribution -ransformers (/usr/local/lib/python3.10/dist-packages)\u001b[0m\u001b[33m\n",
235
+ "\u001b[0m"
236
+ ]
237
+ }
238
+ ]
239
+ },
240
+ {
241
+ "cell_type": "code",
242
+ "metadata": {
243
+ "id": "wq4WwHKPDE_x",
244
+ "colab": {
245
+ "base_uri": "https://localhost:8080/"
246
+ },
247
+ "outputId": "4b0610e4-5ae5-4f92-833c-4d6e547d89fe"
248
+ },
249
+ "source": [
250
+ "!pip install -qq transformers"
251
+ ],
252
+ "execution_count": null,
253
+ "outputs": [
254
+ {
255
+ "output_type": "stream",
256
+ "name": "stdout",
257
+ "text": [
258
+ "\u001b[33mWARNING: Ignoring invalid distribution -ransformers (/usr/local/lib/python3.10/dist-packages)\u001b[0m\u001b[33m\n",
259
+ "\u001b[0m\u001b[33mWARNING: Ignoring invalid distribution -ransformers (/usr/local/lib/python3.10/dist-packages)\u001b[0m\u001b[33m\n",
260
+ "\u001b[0m"
261
+ ]
262
+ }
263
+ ]
264
+ },
265
+ {
266
+ "cell_type": "code",
267
+ "metadata": {
268
+ "colab": {
269
+ "base_uri": "https://localhost:8080/"
270
+ },
271
+ "id": "_W04uiKzDGnF",
272
+ "outputId": "d301c90f-ec5a-40db-8781-4623f4a609c5"
273
+ },
274
+ "source": [
275
+ "%reload_ext watermark\n",
276
+ "%watermark -v -p numpy,pandas,torch,transformers"
277
+ ],
278
+ "execution_count": null,
279
+ "outputs": [
280
+ {
281
+ "output_type": "stream",
282
+ "name": "stdout",
283
+ "text": [
284
+ "Python implementation: CPython\n",
285
+ "Python version : 3.10.12\n",
286
+ "IPython version : 7.34.0\n",
287
+ "\n",
288
+ "numpy : 1.26.0\n",
289
+ "pandas : 2.2.2\n",
290
+ "torch : 2.3.0+cu121\n",
291
+ "transformers: 4.43.2\n",
292
+ "\n"
293
+ ]
294
+ }
295
+ ]
296
+ },
297
+ {
298
+ "cell_type": "code",
299
+ "source": [
300
+ "pip install huggingface_hub"
301
+ ],
302
+ "metadata": {
303
+ "id": "CzEnu5jZc4vM"
304
+ },
305
+ "execution_count": null,
306
+ "outputs": []
307
+ },
308
+ {
309
+ "cell_type": "code",
310
+ "source": [
311
+ "!huggingface-cli login"
312
+ ],
313
+ "metadata": {
314
+ "id": "qDortxA9c1dG"
315
+ },
316
+ "execution_count": null,
317
+ "outputs": []
318
+ },
319
+ {
320
+ "cell_type": "code",
321
+ "source": [
322
+ "from collections import defaultdict\n",
323
+ "import torch\n",
324
+ "from huggingface_hub import HfApi, upload_file\n",
325
+ "api = HfApi()\n",
326
+ "\n",
327
+ "# Defina o nome do repositório\n",
328
+ "repo_id = \"insfsilva/SentimentAnalysis-IMDB-Portuguese-bert-large-portuguese-cased\""
329
+ ],
330
+ "metadata": {
331
+ "id": "6fJCtx-McxeG"
332
+ },
333
+ "execution_count": null,
334
+ "outputs": []
335
+ },
336
+ {
337
+ "cell_type": "markdown",
338
+ "metadata": {
339
+ "id": "hexarFBTDo4r"
340
+ },
341
+ "source": [
342
+ "#### Setup & Config"
343
+ ]
344
+ },
345
+ {
346
+ "cell_type": "code",
347
+ "metadata": {
348
+ "id": "wVtdTVF4DazQ"
349
+ },
350
+ "source": [
351
+ "import transformers\n",
352
+ "from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup\n",
353
+ "import torch\n",
354
+ "\n",
355
+ "import numpy as np\n",
356
+ "import pandas as pd\n",
357
+ "import seaborn as sns\n",
358
+ "from pylab import rcParams\n",
359
+ "import matplotlib.pyplot as plt\n",
360
+ "from matplotlib import rc\n",
361
+ "from sklearn.model_selection import train_test_split\n",
362
+ "from sklearn.metrics import confusion_matrix, classification_report\n",
363
+ "from collections import defaultdict\n",
364
+ "from textwrap import wrap\n",
365
+ "\n",
366
+ "from torch import nn, optim\n",
367
+ "from torch.utils.data import Dataset, DataLoader\n",
368
+ "import torch.nn.functional as F\n",
369
+ "\n",
370
+ "%matplotlib inline\n",
371
+ "%config InlineBackend.figure_format='retina'\n",
372
+ "\n",
373
+ "sns.set(style='whitegrid', palette='muted', font_scale=1.2)\n",
374
+ "\n",
375
+ "HAPPY_COLORS_PALETTE = [\"#01BEFE\", \"#FFDD00\", \"#FF7D00\", \"#FF006D\", \"#ADFF02\", \"#8F00FF\"]\n",
376
+ "\n",
377
+ "sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))\n",
378
+ "\n",
379
+ "rcParams['figure.figsize'] = 12, 8\n",
380
+ "\n",
381
+ "RANDOM_SEED = 42\n",
382
+ "np.random.seed(RANDOM_SEED)\n",
383
+ "torch.manual_seed(RANDOM_SEED)\n",
384
+ "\n",
385
+ "device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n",
386
+ "device"
387
+ ],
388
+ "execution_count": null,
389
+ "outputs": []
390
+ },
391
+ {
392
+ "cell_type": "code",
393
+ "metadata": {
394
+ "id": "_PhgD2fiWmTo"
395
+ },
396
+ "source": [
397
+ "df = pd.read_csv('/content/imdb-reviews-pt-br.csv')\n",
398
+ "df.head()"
399
+ ],
400
+ "execution_count": null,
401
+ "outputs": []
402
+ },
403
+ {
404
+ "cell_type": "code",
405
+ "metadata": {
406
+ "id": "9C1BU77AEFgI"
407
+ },
408
+ "source": [
409
+ "df.shape"
410
+ ],
411
+ "execution_count": null,
412
+ "outputs": []
413
+ },
414
+ {
415
+ "cell_type": "markdown",
416
+ "metadata": {
417
+ "id": "WXZDP5WwEKO8"
418
+ },
419
+ "source": [
420
+ "We have about 11k examples. Let's check for missing values:\n",
421
+ "\n"
422
+ ]
423
+ },
424
+ {
425
+ "cell_type": "code",
426
+ "metadata": {
427
+ "id": "iXPjHlkIWZvA"
428
+ },
429
+ "source": [
430
+ "df.info()"
431
+ ],
432
+ "execution_count": null,
433
+ "outputs": []
434
+ },
435
+ {
436
+ "cell_type": "markdown",
437
+ "metadata": {
438
+ "id": "E8xR01alEwOP"
439
+ },
440
+ "source": [
441
+ "Great, no missing values in the score and review texts! Do we have class imbalance?"
442
+ ]
443
+ },
444
+ {
445
+ "cell_type": "code",
446
+ "metadata": {
447
+ "id": "WUfzETcuExZq"
448
+ },
449
+ "source": [
450
+ "sns.countplot(df.sentiment)\n",
451
+ "plt.xlabel('Labels');"
452
+ ],
453
+ "execution_count": null,
454
+ "outputs": []
455
+ },
456
+ {
457
+ "cell_type": "code",
458
+ "metadata": {
459
+ "id": "okU44oZQE7Tt"
460
+ },
461
+ "source": [
462
+ "class_names = ['negative', 'positive']"
463
+ ],
464
+ "execution_count": null,
465
+ "outputs": []
466
+ },
467
+ {
468
+ "cell_type": "code",
469
+ "metadata": {
470
+ "id": "qVxTGOeMFAhw"
471
+ },
472
+ "source": [
473
+ "ax = sns.countplot(df.sentiment)\n",
474
+ "plt.xlabel('review sentiment')\n",
475
+ "ax.set_xticklabels(class_names);"
476
+ ],
477
+ "execution_count": null,
478
+ "outputs": []
479
+ },
480
+ {
481
+ "cell_type": "code",
482
+ "source": [
483
+ "df['sentiment'] = df['sentiment'].map({'neg': 0, 'pos': 1})\n",
484
+ "\n",
485
+ "# Exibe as primeiras linhas do DataFrame após a conversão\n",
486
+ "print(\"\\nApós a conversão:\")\n",
487
+ "print(df.head())\n"
488
+ ],
489
+ "metadata": {
490
+ "id": "HGkxsgVdDlOi"
491
+ },
492
+ "execution_count": null,
493
+ "outputs": []
494
+ },
495
+ {
496
+ "cell_type": "code",
497
+ "source": [
498
+ "df"
499
+ ],
500
+ "metadata": {
501
+ "id": "SgSz7TDyEJCJ"
502
+ },
503
+ "execution_count": null,
504
+ "outputs": []
505
+ },
506
+ {
507
+ "cell_type": "markdown",
508
+ "metadata": {
509
+ "id": "2RIoVgVwFGxN"
510
+ },
511
+ "source": [
512
+ "## Data Preprocessing\n",
513
+ "\n",
514
+ "You might already know that Machine Learning models don't work with raw text. You need to convert text to numbers (of some sort). BERT requires even more attention (good one, right?). Here are the requirements:\n",
515
+ "\n",
516
+ "- Add special tokens to separate sentences and do classification\n",
517
+ "- Pass sequences of constant length (introduce padding)\n",
518
+ "- Create array of 0s (pad token) and 1s (real token) called *attention mask*\n",
519
+ "\n",
520
+ "The Transformers library provides (you've guessed it) a wide variety of Transformer models (including BERT). It works with TensorFlow and PyTorch! It also includes prebuild tokenizers that do the heavy lifting for us!\n"
521
+ ]
522
+ },
523
+ {
524
+ "cell_type": "code",
525
+ "metadata": {
526
+ "id": "sVZo_lRVFDQh"
527
+ },
528
+ "source": [
529
+ "PRE_TRAINED_MODEL_NAME = 'neuralmind/bert-base-portuguese-cased'"
530
+ ],
531
+ "execution_count": null,
532
+ "outputs": []
533
+ },
534
+ {
535
+ "cell_type": "markdown",
536
+ "metadata": {
537
+ "id": "7Qp57OGfHy4M"
538
+ },
539
+ "source": [
540
+ "https://huggingface.co/neuralmind/bert-base-portuguese-cased\n",
541
+ "\n",
542
+ "https://github.com/neuralmind-ai/portuguese-bert"
543
+ ]
544
+ },
545
+ {
546
+ "cell_type": "markdown",
547
+ "metadata": {
548
+ "id": "V-oaIAX6GQd-"
549
+ },
550
+ "source": [
551
+ "Let's load a pre-trained [BertTokenizer](https://huggingface.co/transformers/model_doc/bert.html#berttokenizer):"
552
+ ]
553
+ },
554
+ {
555
+ "cell_type": "code",
556
+ "metadata": {
557
+ "id": "_14-cLRrGGP-"
558
+ },
559
+ "source": [
560
+ "tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)"
561
+ ],
562
+ "execution_count": null,
563
+ "outputs": []
564
+ },
565
+ {
566
+ "cell_type": "markdown",
567
+ "metadata": {
568
+ "id": "CmDWEEIFGU85"
569
+ },
570
+ "source": [
571
+ "We'll use this text to understand the tokenization process:"
572
+ ]
573
+ },
574
+ {
575
+ "cell_type": "code",
576
+ "metadata": {
577
+ "id": "GHf6ubxeGVUo"
578
+ },
579
+ "source": [
580
+ "sample_txt = 'Quem conta um conto aumenta um ponto.'"
581
+ ],
582
+ "execution_count": null,
583
+ "outputs": []
584
+ },
585
+ {
586
+ "cell_type": "markdown",
587
+ "metadata": {
588
+ "id": "ANQLQ87YGuhI"
589
+ },
590
+ "source": [
591
+ "Some basic operations can convert the text to tokens and tokens to unique integers (ids):"
592
+ ]
593
+ },
594
+ {
595
+ "cell_type": "code",
596
+ "metadata": {
597
+ "id": "Q0ZYj5BqGSRY"
598
+ },
599
+ "source": [
600
+ "tokens = tokenizer.tokenize(sample_txt)\n",
601
+ "token_ids = tokenizer.convert_tokens_to_ids(tokens)\n",
602
+ "\n",
603
+ "print(f' Sentence: {sample_txt}')\n",
604
+ "print(f' Tokens: {tokens}')\n",
605
+ "print(f'Token IDs: {token_ids}')"
606
+ ],
607
+ "execution_count": null,
608
+ "outputs": []
609
+ },
610
+ {
611
+ "cell_type": "markdown",
612
+ "metadata": {
613
+ "id": "v_D7UIRrKH9g"
614
+ },
615
+ "source": [
616
+ "### Special Tokens\n",
617
+ "\n",
618
+ "`[SEP]` - marker for ending of a sentence"
619
+ ]
620
+ },
621
+ {
622
+ "cell_type": "code",
623
+ "metadata": {
624
+ "id": "TuUvyBXFKI_0"
625
+ },
626
+ "source": [
627
+ "tokenizer.sep_token, tokenizer.sep_token_id"
628
+ ],
629
+ "execution_count": null,
630
+ "outputs": []
631
+ },
632
+ {
633
+ "cell_type": "markdown",
634
+ "metadata": {
635
+ "id": "bmidBkN1KOyu"
636
+ },
637
+ "source": [
638
+ "`[CLS]` - we must add this token to the start of each sentence, so BERT knows we're doing classification"
639
+ ]
640
+ },
641
+ {
642
+ "cell_type": "code",
643
+ "metadata": {
644
+ "id": "rJLW4_zEKKLU"
645
+ },
646
+ "source": [
647
+ "tokenizer.cls_token, tokenizer.cls_token_id"
648
+ ],
649
+ "execution_count": null,
650
+ "outputs": []
651
+ },
652
+ {
653
+ "cell_type": "markdown",
654
+ "metadata": {
655
+ "id": "rVutL1_1KS16"
656
+ },
657
+ "source": [
658
+ "There is also a special token for padding:"
659
+ ]
660
+ },
661
+ {
662
+ "cell_type": "code",
663
+ "metadata": {
664
+ "id": "E6hw82DTKTSI"
665
+ },
666
+ "source": [
667
+ "tokenizer.pad_token, tokenizer.pad_token_id"
668
+ ],
669
+ "execution_count": null,
670
+ "outputs": []
671
+ },
672
+ {
673
+ "cell_type": "markdown",
674
+ "metadata": {
675
+ "id": "B6nVbxU-KWOZ"
676
+ },
677
+ "source": [
678
+ "BERT understands tokens that were in the training set. Everything else can be encoded using the `[UNK]` (unknown) token:"
679
+ ]
680
+ },
681
+ {
682
+ "cell_type": "code",
683
+ "metadata": {
684
+ "id": "fcoGQ_i4KY5_"
685
+ },
686
+ "source": [
687
+ "tokenizer.unk_token, tokenizer.unk_token_id"
688
+ ],
689
+ "execution_count": null,
690
+ "outputs": []
691
+ },
692
+ {
693
+ "cell_type": "markdown",
694
+ "metadata": {
695
+ "id": "39XPl_0-Kdck"
696
+ },
697
+ "source": [
698
+ "All of that work can be done using the [`encode_plus()`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.PreTrainedTokenizer.encode_plus) method:"
699
+ ]
700
+ },
701
+ {
702
+ "cell_type": "code",
703
+ "metadata": {
704
+ "id": "WuqMkd4UKQtv"
705
+ },
706
+ "source": [
707
+ "encoding = tokenizer.encode_plus(\n",
708
+ " sample_txt,\n",
709
+ " max_length=32,\n",
710
+ " add_special_tokens=True, # Add '[CLS]' and '[SEP]'\n",
711
+ " return_token_type_ids=False,\n",
712
+ " #padding='longest',\n",
713
+ " pad_to_max_length=True,\n",
714
+ " return_attention_mask=True,\n",
715
+ " return_tensors='pt', # Return PyTorch tensors\n",
716
+ ")\n",
717
+ "\n",
718
+ "encoding.keys()"
719
+ ],
720
+ "execution_count": null,
721
+ "outputs": []
722
+ },
723
+ {
724
+ "cell_type": "markdown",
725
+ "metadata": {
726
+ "id": "V64k2WIHKj_W"
727
+ },
728
+ "source": [
729
+ "The token ids are now stored in a Tensor and padded to a length of 32:"
730
+ ]
731
+ },
732
+ {
733
+ "cell_type": "code",
734
+ "metadata": {
735
+ "id": "llb6Q4JoKkVs"
736
+ },
737
+ "source": [
738
+ "print(len(encoding['input_ids'][0]))\n",
739
+ "encoding['input_ids'][0]"
740
+ ],
741
+ "execution_count": null,
742
+ "outputs": []
743
+ },
744
+ {
745
+ "cell_type": "markdown",
746
+ "metadata": {
747
+ "id": "GJ3591WBKoFh"
748
+ },
749
+ "source": [
750
+ "The attention mask has the same length:"
751
+ ]
752
+ },
753
+ {
754
+ "cell_type": "code",
755
+ "metadata": {
756
+ "id": "-YFYP7_zKoW3"
757
+ },
758
+ "source": [
759
+ "print(len(encoding['attention_mask'][0]))\n",
760
+ "encoding['attention_mask']"
761
+ ],
762
+ "execution_count": null,
763
+ "outputs": []
764
+ },
765
+ {
766
+ "cell_type": "markdown",
767
+ "metadata": {
768
+ "id": "KUJXjJVhKrmF"
769
+ },
770
+ "source": [
771
+ "We can inverse the tokenization to have a look at the special tokens:"
772
+ ]
773
+ },
774
+ {
775
+ "cell_type": "code",
776
+ "metadata": {
777
+ "id": "UPlKoS5dKsFG"
778
+ },
779
+ "source": [
780
+ "tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])"
781
+ ],
782
+ "execution_count": null,
783
+ "outputs": []
784
+ },
785
+ {
786
+ "cell_type": "markdown",
787
+ "metadata": {
788
+ "id": "-w0xt3vKKwMq"
789
+ },
790
+ "source": [
791
+ "### Choosing Sequence Length\n"
792
+ ]
793
+ },
794
+ {
795
+ "cell_type": "code",
796
+ "metadata": {
797
+ "id": "36vHlDrzKxj3"
798
+ },
799
+ "source": [
800
+ "token_lens = []\n",
801
+ "\n",
802
+ "for txt in df.text_pt:\n",
803
+ " tokens = tokenizer.encode(txt, max_length=512)\n",
804
+ " token_lens.append(len(tokens))"
805
+ ],
806
+ "execution_count": null,
807
+ "outputs": []
808
+ },
809
+ {
810
+ "cell_type": "markdown",
811
+ "metadata": {
812
+ "id": "YV-5tsnXLntz"
813
+ },
814
+ "source": [
815
+ "and plot the distribution:"
816
+ ]
817
+ },
818
+ {
819
+ "cell_type": "code",
820
+ "metadata": {
821
+ "id": "K7aQTcu8Lp26"
822
+ },
823
+ "source": [
824
+ "sns.distplot(token_lens)\n",
825
+ "plt.xlim([0, 520]);\n",
826
+ "plt.xlabel('Token count');"
827
+ ],
828
+ "execution_count": null,
829
+ "outputs": []
830
+ },
831
+ {
832
+ "cell_type": "code",
833
+ "source": [
834
+ "token_lens[500]\n",
835
+ "max(token_lens)"
836
+ ],
837
+ "metadata": {
838
+ "id": "4WC0s1mLGgiU"
839
+ },
840
+ "execution_count": null,
841
+ "outputs": []
842
+ },
843
+ {
844
+ "cell_type": "markdown",
845
+ "metadata": {
846
+ "id": "djv5kONnLxb-"
847
+ },
848
+ "source": [
849
+ "Most of the reviews seem to contain less than 128 tokens, but we'll be on the safe side and choose a maximum length of 160."
850
+ ]
851
+ },
852
+ {
853
+ "cell_type": "code",
854
+ "metadata": {
855
+ "id": "UQalJqpBLrLB"
856
+ },
857
+ "source": [
858
+ "MAX_LEN = 512"
859
+ ],
860
+ "execution_count": null,
861
+ "outputs": []
862
+ },
863
+ {
864
+ "cell_type": "markdown",
865
+ "metadata": {
866
+ "id": "dKDyNuXWL5KX"
867
+ },
868
+ "source": [
869
+ "We have all building blocks required to create a PyTorch dataset. Let's do it:"
870
+ ]
871
+ },
872
+ {
873
+ "cell_type": "code",
874
+ "metadata": {
875
+ "id": "DxQSt1sZL5kp"
876
+ },
877
+ "source": [
878
+ "class IMDBDataset(Dataset):\n",
879
+ "\n",
880
+ " def __init__(self, texts, labels, tokenizer, max_len):\n",
881
+ " self.texts = texts\n",
882
+ " self.labels = labels\n",
883
+ " self.tokenizer = tokenizer\n",
884
+ " self.max_len = max_len\n",
885
+ "\n",
886
+ " def __len__(self):\n",
887
+ " return len(self.texts)\n",
888
+ "\n",
889
+ " def __getitem__(self, item):\n",
890
+ " text = str(self.texts[item])\n",
891
+ " label = self.labels[item]\n",
892
+ "\n",
893
+ " encoding = self.tokenizer.encode_plus(\n",
894
+ " text,\n",
895
+ " add_special_tokens=True,\n",
896
+ " max_length=self.max_len,\n",
897
+ " return_token_type_ids=False,\n",
898
+ " #padding='longest',\n",
899
+ " pad_to_max_length=True,\n",
900
+ " return_attention_mask=True,\n",
901
+ " return_tensors='pt',\n",
902
+ " )\n",
903
+ "\n",
904
+ " return {\n",
905
+ " 'text': text,\n",
906
+ " 'input_ids': encoding['input_ids'].flatten(),\n",
907
+ " 'attention_mask': encoding['attention_mask'].flatten(),\n",
908
+ " 'labels': torch.tensor(label, dtype=torch.long)\n",
909
+ " }"
910
+ ],
911
+ "execution_count": null,
912
+ "outputs": []
913
+ },
914
+ {
915
+ "cell_type": "markdown",
916
+ "metadata": {
917
+ "id": "2_w5mFC9L-i6"
918
+ },
919
+ "source": [
920
+ "The tokenizer is doing most of the heavy lifting for us. We also return the review texts, so it'll be easier to evaluate the predictions from our model. Let's split the data:"
921
+ ]
922
+ },
923
+ {
924
+ "cell_type": "code",
925
+ "metadata": {
926
+ "id": "aLqEExhdL-_p"
927
+ },
928
+ "source": [
929
+ "df_train, df_test = train_test_split(df, test_size=0.1, random_state=RANDOM_SEED)\n",
930
+ "df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=RANDOM_SEED)"
931
+ ],
932
+ "execution_count": null,
933
+ "outputs": []
934
+ },
935
+ {
936
+ "cell_type": "code",
937
+ "metadata": {
938
+ "id": "tGSB9J8tMBIC"
939
+ },
940
+ "source": [
941
+ "df_train.shape, df_val.shape, df_test.shape"
942
+ ],
943
+ "execution_count": null,
944
+ "outputs": []
945
+ },
946
+ {
947
+ "cell_type": "markdown",
948
+ "metadata": {
949
+ "id": "Tid8js2AMEzJ"
950
+ },
951
+ "source": [
952
+ "We also need to create a couple of data loaders. Here's a helper function to do it:"
953
+ ]
954
+ },
955
+ {
956
+ "cell_type": "code",
957
+ "metadata": {
958
+ "id": "ys08dkFnMFOx"
959
+ },
960
+ "source": [
961
+ "def create_data_loader(df, tokenizer, max_len, batch_size):\n",
962
+ " ds = IMDBDataset(\n",
963
+ " texts=df.text_pt.to_numpy(),\n",
964
+ " labels=df.sentiment.to_numpy(),\n",
965
+ " tokenizer=tokenizer,\n",
966
+ " max_len=max_len\n",
967
+ " )\n",
968
+ "\n",
969
+ " return DataLoader(\n",
970
+ " ds,\n",
971
+ " batch_size=batch_size,\n",
972
+ " num_workers=4\n",
973
+ " )"
974
+ ],
975
+ "execution_count": null,
976
+ "outputs": []
977
+ },
978
+ {
979
+ "cell_type": "code",
980
+ "metadata": {
981
+ "id": "9ht8GyZnMGqi"
982
+ },
983
+ "source": [
984
+ "BATCH_SIZE = 32\n",
985
+ "\n",
986
+ "train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)\n",
987
+ "val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)\n",
988
+ "test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)"
989
+ ],
990
+ "execution_count": null,
991
+ "outputs": []
992
+ },
993
+ {
994
+ "cell_type": "markdown",
995
+ "metadata": {
996
+ "id": "ZTB7u2Y5MOAY"
997
+ },
998
+ "source": [
999
+ "Let's have a look at an example batch from our training data loader:\n",
1000
+ "\n",
1001
+ "\n"
1002
+ ]
1003
+ },
1004
+ {
1005
+ "cell_type": "code",
1006
+ "metadata": {
1007
+ "id": "6KxMc6MleUDu"
1008
+ },
1009
+ "source": [
1010
+ "len(train_data_loader)"
1011
+ ],
1012
+ "execution_count": null,
1013
+ "outputs": []
1014
+ },
1015
+ {
1016
+ "cell_type": "code",
1017
+ "metadata": {
1018
+ "id": "fK1LVloAMJr1"
1019
+ },
1020
+ "source": [
1021
+ "data = next(iter(train_data_loader))\n",
1022
+ "data.keys()"
1023
+ ],
1024
+ "execution_count": null,
1025
+ "outputs": []
1026
+ },
1027
+ {
1028
+ "cell_type": "code",
1029
+ "metadata": {
1030
+ "id": "tkeqE-FWMPiZ"
1031
+ },
1032
+ "source": [
1033
+ "print(data['input_ids'].shape)\n",
1034
+ "print(data['attention_mask'].shape)\n",
1035
+ "print(data['labels'].shape)"
1036
+ ],
1037
+ "execution_count": null,
1038
+ "outputs": []
1039
+ },
1040
+ {
1041
+ "cell_type": "markdown",
1042
+ "metadata": {
1043
+ "id": "4OhEv9k9Mca6"
1044
+ },
1045
+ "source": [
1046
+ "## Sentiment Classification with BERT and Hugging Face"
1047
+ ]
1048
+ },
1049
+ {
1050
+ "cell_type": "markdown",
1051
+ "metadata": {
1052
+ "id": "J8yGbr4vMhYv"
1053
+ },
1054
+ "source": [
1055
+ "There are a lot of helpers that make using BERT easy with the Transformers library. Depending on the task you might want to use [BertForSequenceClassification](https://huggingface.co/transformers/model_doc/bert.html#bertforsequenceclassification), [BertForQuestionAnswering](https://huggingface.co/transformers/model_doc/bert.html#bertforquestionanswering) or something else.\n",
1056
+ "\n",
1057
+ "But who cares, right? We're *hardcore*! We'll use the basic [BertModel](https://huggingface.co/transformers/model_doc/bert.html#bertmodel) and build our sentiment classifier on top of it. Let's load the model:"
1058
+ ]
1059
+ },
1060
+ {
1061
+ "cell_type": "code",
1062
+ "metadata": {
1063
+ "id": "B_Zv3uXgMh9l"
1064
+ },
1065
+ "source": [
1066
+ "bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)"
1067
+ ],
1068
+ "execution_count": null,
1069
+ "outputs": []
1070
+ },
1071
+ {
1072
+ "cell_type": "markdown",
1073
+ "metadata": {
1074
+ "id": "-wc2P7aiMpGC"
1075
+ },
1076
+ "source": [
1077
+ "And try to use it on the encoding of our sample text:"
1078
+ ]
1079
+ },
1080
+ {
1081
+ "cell_type": "code",
1082
+ "metadata": {
1083
+ "id": "tsPxPiG5Mptq"
1084
+ },
1085
+ "source": [
1086
+ "last_hidden_state, pooled_output = bert_model(\n",
1087
+ " input_ids=encoding['input_ids'],\n",
1088
+ " attention_mask=encoding['attention_mask']\n",
1089
+ ")"
1090
+ ],
1091
+ "execution_count": null,
1092
+ "outputs": []
1093
+ },
1094
+ {
1095
+ "cell_type": "markdown",
1096
+ "metadata": {
1097
+ "id": "Z9TpeUr8Ms7b"
1098
+ },
1099
+ "source": [
1100
+ "The `last_hidden_state` is a sequence of hidden states of the last layer of the model. Obtaining the `pooled_output` is done by applying the [BertPooler](https://github.com/huggingface/transformers/blob/edf0582c0be87b60f94f41c659ea779876efc7be/src/transformers/modeling_bert.py#L426) on `last_hidden_state`."
1101
+ ]
1102
+ },
1103
+ {
1104
+ "cell_type": "markdown",
1105
+ "metadata": {
1106
+ "id": "I2tokVklN3OS"
1107
+ },
1108
+ "source": [
1109
+ "You can think of the `pooled_output` as a summary of the content, according to BERT. Albeit, you might try and do better. Let's look at the shape of the output."
1110
+ ]
1111
+ },
1112
+ {
1113
+ "cell_type": "code",
1114
+ "metadata": {
1115
+ "id": "Su91OfW7MtaK"
1116
+ },
1117
+ "source": [
1118
+ "bert_model.config.hidden_size"
1119
+ ],
1120
+ "execution_count": null,
1121
+ "outputs": []
1122
+ },
1123
+ {
1124
+ "cell_type": "markdown",
1125
+ "metadata": {
1126
+ "id": "sadx51ayOJSx"
1127
+ },
1128
+ "source": [
1129
+ "We can use all of this knowledge to create a classifier that uses the BERT model:"
1130
+ ]
1131
+ },
1132
+ {
1133
+ "cell_type": "code",
1134
+ "metadata": {
1135
+ "id": "lM-0eiqhNUbN"
1136
+ },
1137
+ "source": [
1138
+ "class SentimentClassifier(nn.Module):\n",
1139
+ "\n",
1140
+ " def __init__(self, n_classes):\n",
1141
+ " super(SentimentClassifier, self).__init__()\n",
1142
+ " self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME, return_dict=False)\n",
1143
+ " self.drop = nn.Dropout(p=0.3)\n",
1144
+ " #The last_hidden_state is a sequence of hidden states of the last layer of the model\n",
1145
+ " self.out = nn.Linear(self.bert.config.hidden_size, n_classes)\n",
1146
+ "\n",
1147
+ " def forward(self, input_ids, attention_mask):\n",
1148
+ " _, pooled_output = self.bert(\n",
1149
+ " input_ids=input_ids,\n",
1150
+ " attention_mask=attention_mask\n",
1151
+ " )\n",
1152
+ " output = self.drop(pooled_output)\n",
1153
+ " return self.out(output)"
1154
+ ],
1155
+ "execution_count": null,
1156
+ "outputs": []
1157
+ },
1158
+ {
1159
+ "cell_type": "markdown",
1160
+ "metadata": {
1161
+ "id": "oWHPz4z6OMU9"
1162
+ },
1163
+ "source": [
1164
+ "Our classifier delegates most of the heavy lifting to the BertModel. We use a dropout layer for some regularization and a fully-connected layer for our output. Note that we're returning the raw output of the last layer since that is required for the cross-entropy loss function in PyTorch to work.\n",
1165
+ "\n",
1166
+ "This should work like any other PyTorch model. Let's create an instance and move it to the GPU:"
1167
+ ]
1168
+ },
1169
+ {
1170
+ "cell_type": "code",
1171
+ "metadata": {
1172
+ "id": "9Z4pMOzZOMuv"
1173
+ },
1174
+ "source": [
1175
+ "model = SentimentClassifier(len(class_names))\n",
1176
+ "model = model.to(device)"
1177
+ ],
1178
+ "execution_count": null,
1179
+ "outputs": []
1180
+ },
1181
+ {
1182
+ "cell_type": "code",
1183
+ "source": [
1184
+ "len(class_names)"
1185
+ ],
1186
+ "metadata": {
1187
+ "id": "lvADEz3lIjbP"
1188
+ },
1189
+ "execution_count": null,
1190
+ "outputs": []
1191
+ },
1192
+ {
1193
+ "cell_type": "markdown",
1194
+ "metadata": {
1195
+ "id": "OqsR6aKCOT5o"
1196
+ },
1197
+ "source": [
1198
+ "We'll move the example batch of our training data to the GPU:"
1199
+ ]
1200
+ },
1201
+ {
1202
+ "cell_type": "code",
1203
+ "metadata": {
1204
+ "id": "OdKyQQKhOUZ7"
1205
+ },
1206
+ "source": [
1207
+ "input_ids = data['input_ids'].to(device)\n",
1208
+ "attention_mask = data['attention_mask'].to(device)\n",
1209
+ "\n",
1210
+ "print(input_ids.shape) # batch size x seq length\n",
1211
+ "print(attention_mask.shape) # batch size x seq length"
1212
+ ],
1213
+ "execution_count": null,
1214
+ "outputs": []
1215
+ },
1216
+ {
1217
+ "cell_type": "markdown",
1218
+ "metadata": {
1219
+ "id": "Ux4i9Nr1OYWT"
1220
+ },
1221
+ "source": [
1222
+ "To get the predicted probabilities from our trained model, we'll apply the softmax function to the outputs:"
1223
+ ]
1224
+ },
1225
+ {
1226
+ "cell_type": "markdown",
1227
+ "metadata": {
1228
+ "id": "7Q8gFB-7dsYU"
1229
+ },
1230
+ "source": [
1231
+ "### Training\n",
1232
+ "\n",
1233
+ "To reproduce the training procedure from the BERT paper, we'll use the [AdamW](https://huggingface.co/transformers/main_classes/optimizer_schedules.html#adamw) optimizer provided by Hugging Face. It corrects weight decay, so it's similar to the original paper. We'll also use a linear scheduler with no warmup steps:"
1234
+ ]
1235
+ },
1236
+ {
1237
+ "cell_type": "code",
1238
+ "metadata": {
1239
+ "id": "7X1BJqEXEjB-"
1240
+ },
1241
+ "source": [
1242
+ "EPOCHS = 4\n",
1243
+ "\n",
1244
+ "optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)\n",
1245
+ "total_steps = len(train_data_loader) * EPOCHS\n",
1246
+ "\n",
1247
+ "scheduler = get_linear_schedule_with_warmup(\n",
1248
+ " optimizer,\n",
1249
+ " num_warmup_steps=0,\n",
1250
+ " num_training_steps=total_steps\n",
1251
+ ")\n",
1252
+ "\n",
1253
+ "loss_fn = nn.CrossEntropyLoss().to(device)"
1254
+ ],
1255
+ "execution_count": null,
1256
+ "outputs": []
1257
+ },
1258
+ {
1259
+ "cell_type": "markdown",
1260
+ "metadata": {
1261
+ "id": "GJu9X-TEd2-t"
1262
+ },
1263
+ "source": [
1264
+ "How do we come up with all hyperparameters? The BERT authors have some recommendations for fine-tuning:\n",
1265
+ "\n",
1266
+ "- Batch size: 16, 32\n",
1267
+ "- Learning rate (Adam): 5e-5, 3e-5, 2e-5\n",
1268
+ "- Number of epochs: 2, 3, 4\n",
1269
+ "\n",
1270
+ "\n",
1271
+ "Let's continue with writing a helper function for training our model for one epoch:"
1272
+ ]
1273
+ },
1274
+ {
1275
+ "cell_type": "code",
1276
+ "metadata": {
1277
+ "id": "tvGtIud7d1O0"
1278
+ },
1279
+ "source": [
1280
+ "def train_epoch(\n",
1281
+ " model,\n",
1282
+ " data_loader,\n",
1283
+ " loss_fn,\n",
1284
+ " optimizer,\n",
1285
+ " device,\n",
1286
+ " scheduler,\n",
1287
+ " n_examples\n",
1288
+ "):\n",
1289
+ " model = model.train()\n",
1290
+ "\n",
1291
+ " losses = []\n",
1292
+ " correct_predictions = 0\n",
1293
+ "\n",
1294
+ " for d in data_loader:\n",
1295
+ " input_ids = d[\"input_ids\"].to(device)\n",
1296
+ " attention_mask = d[\"attention_mask\"].to(device)\n",
1297
+ " labels = d[\"labels\"].to(device)\n",
1298
+ "\n",
1299
+ " outputs = model(\n",
1300
+ " input_ids=input_ids,\n",
1301
+ " attention_mask=attention_mask\n",
1302
+ " )\n",
1303
+ "\n",
1304
+ " _, preds = torch.max(outputs, dim=1)\n",
1305
+ " loss = loss_fn(outputs, labels)\n",
1306
+ "\n",
1307
+ " correct_predictions += torch.sum(preds == labels)\n",
1308
+ " losses.append(loss.item())\n",
1309
+ "\n",
1310
+ " loss.backward()\n",
1311
+ " nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)\n",
1312
+ " optimizer.step()\n",
1313
+ " scheduler.step()\n",
1314
+ " optimizer.zero_grad()\n",
1315
+ "\n",
1316
+ " return correct_predictions.double() / n_examples, np.mean(losses)"
1317
+ ],
1318
+ "execution_count": null,
1319
+ "outputs": []
1320
+ },
1321
+ {
1322
+ "cell_type": "markdown",
1323
+ "metadata": {
1324
+ "id": "JJQJjc1Hd6Zt"
1325
+ },
1326
+ "source": [
1327
+ "Training the model should look familiar, except for two things. The scheduler gets called every time a batch is fed to the model. We're avoiding exploding gradients by clipping the gradients of the model using [clip_grad_norm_](https://pytorch.org/docs/stable/nn.html#clip-grad-norm).\n",
1328
+ "\n",
1329
+ "Let's write another one that helps us evaluate the model on a given data loader:"
1330
+ ]
1331
+ },
1332
+ {
1333
+ "cell_type": "code",
1334
+ "metadata": {
1335
+ "id": "Xkg8XyzZd5to"
1336
+ },
1337
+ "source": [
1338
+ "def eval_model(model, data_loader, loss_fn, device, n_examples):\n",
1339
+ " model = model.eval()\n",
1340
+ "\n",
1341
+ " losses = []\n",
1342
+ " correct_predictions = 0\n",
1343
+ "\n",
1344
+ " with torch.no_grad():\n",
1345
+ " for d in data_loader:\n",
1346
+ " input_ids = d[\"input_ids\"].to(device)\n",
1347
+ " attention_mask = d[\"attention_mask\"].to(device)\n",
1348
+ " labels = d[\"labels\"].to(device)\n",
1349
+ "\n",
1350
+ " outputs = model(\n",
1351
+ " input_ids=input_ids,\n",
1352
+ " attention_mask=attention_mask\n",
1353
+ " )\n",
1354
+ " _, preds = torch.max(outputs, dim=1)\n",
1355
+ "\n",
1356
+ " loss = loss_fn(outputs, labels)\n",
1357
+ "\n",
1358
+ " correct_predictions += torch.sum(preds == labels)\n",
1359
+ " losses.append(loss.item())\n",
1360
+ "\n",
1361
+ " return correct_predictions.double() / n_examples, np.mean(losses)"
1362
+ ],
1363
+ "execution_count": null,
1364
+ "outputs": []
1365
+ },
1366
+ {
1367
+ "cell_type": "code",
1368
+ "source": [
1369
+ "from huggingface_hub import HfApi\n",
1370
+ "api = HfApi()\n",
1371
+ "api.create_repo(repo_id=repo_id, exist_ok=True)"
1372
+ ],
1373
+ "metadata": {
1374
+ "id": "AfCElgQYekkH"
1375
+ },
1376
+ "execution_count": null,
1377
+ "outputs": []
1378
+ },
1379
+ {
1380
+ "cell_type": "code",
1381
+ "source": [
1382
+ "from huggingface_hub import Repository\n",
1383
+ "repo = Repository(local_dir=\"best_model\", clone_from=repo_id)"
1384
+ ],
1385
+ "metadata": {
1386
+ "id": "iE7vC_74iy64"
1387
+ },
1388
+ "execution_count": null,
1389
+ "outputs": []
1390
+ },
1391
+ {
1392
+ "cell_type": "markdown",
1393
+ "metadata": {
1394
+ "id": "dKtLIg1Qd-gC"
1395
+ },
1396
+ "source": [
1397
+ "Using those two, we can write our training loop. We'll also store the training history:"
1398
+ ]
1399
+ },
1400
+ {
1401
+ "cell_type": "code",
1402
+ "metadata": {
1403
+ "id": "FD7hwSShd-HU"
1404
+ },
1405
+ "source": [
1406
+ "%%time\n",
1407
+ "\n",
1408
+ "history = defaultdict(list)\n",
1409
+ "best_accuracy = 0\n",
1410
+ "\n",
1411
+ "for epoch in range(EPOCHS):\n",
1412
+ "\n",
1413
+ " print(f'Epoch {epoch + 1}/{EPOCHS}')\n",
1414
+ " print('-' * 4)\n",
1415
+ "\n",
1416
+ " train_acc, train_loss = train_epoch(\n",
1417
+ " model,\n",
1418
+ " train_data_loader,\n",
1419
+ " loss_fn,\n",
1420
+ " optimizer,\n",
1421
+ " device,\n",
1422
+ " scheduler,\n",
1423
+ " len(df_train)\n",
1424
+ " )\n",
1425
+ "\n",
1426
+ " print(f'Train loss {train_loss} accuracy {train_acc}')\n",
1427
+ "\n",
1428
+ " val_acc, val_loss = eval_model(\n",
1429
+ " model,\n",
1430
+ " val_data_loader,\n",
1431
+ " loss_fn,\n",
1432
+ " device,\n",
1433
+ " len(df_val)\n",
1434
+ " )\n",
1435
+ "\n",
1436
+ " print(f'Val loss {val_loss} accuracy {val_acc}')\n",
1437
+ " print()\n",
1438
+ "\n",
1439
+ " history['train_acc'].append(train_acc)\n",
1440
+ " history['train_loss'].append(train_loss)\n",
1441
+ " history['val_acc'].append(val_acc)\n",
1442
+ " history['val_loss'].append(val_loss)\n",
1443
+ "\n",
1444
+ "\n",
1445
+ " if val_acc > best_accuracy:\n",
1446
+ " print(\"Melhor modelo encontrado, a guardar e a fazer push para o Hub...\")\n",
1447
+ "\n",
1448
+ " model.save_pretrained('best_model')\n",
1449
+ " tokenizer.save_pretrained('best_model')\n",
1450
+ "\n",
1451
+ "\n",
1452
+ " repo.push_to_hub(commit_message=f\"epoch {epoch + 1} - val_acc {val_acc:.4f}\")\n",
1453
+ "\n",
1454
+ " best_accuracy = val_acc\n",
1455
+ "\n",
1456
+ "\n",
1457
+ "repo.push_to_hub()"
1458
+ ],
1459
+ "execution_count": null,
1460
+ "outputs": []
1461
+ },
1462
+ {
1463
+ "cell_type": "markdown",
1464
+ "metadata": {
1465
+ "id": "5TPmJAz9TKlv"
1466
+ },
1467
+ "source": [
1468
+ "Note that we're storing the state of the best model, indicated by the highest validation accuracy.\n",
1469
+ "\n",
1470
+ "Whoo, this took some time! We can look at the training vs validation accuracy:"
1471
+ ]
1472
+ },
1473
+ {
1474
+ "cell_type": "code",
1475
+ "metadata": {
1476
+ "id": "anE8h-syhBiB"
1477
+ },
1478
+ "source": [
1479
+ "plt.plot(history['train_acc'], label='train accuracy')\n",
1480
+ "plt.plot(history['val_acc'], label='validation accuracy')\n",
1481
+ "\n",
1482
+ "plt.title('Training history')\n",
1483
+ "plt.ylabel('Accuracy')\n",
1484
+ "plt.xlabel('Epoch')\n",
1485
+ "plt.legend()\n",
1486
+ "plt.ylim([0, 1]);"
1487
+ ],
1488
+ "execution_count": null,
1489
+ "outputs": []
1490
+ },
1491
+ {
1492
+ "cell_type": "markdown",
1493
+ "metadata": {
1494
+ "id": "CusxQuuFTPwj"
1495
+ },
1496
+ "source": [
1497
+ "The training accuracy starts to approach 100% after 10 epochs or so. You might try to fine-tune the parameters a bit more, but this will be good enough for us.\n",
1498
+ "\n",
1499
+ "Don't want to wait? Uncomment the next cell to download my pre-trained model:"
1500
+ ]
1501
+ },
1502
+ {
1503
+ "cell_type": "code",
1504
+ "metadata": {
1505
+ "id": "NdKQjFZaTMj7"
1506
+ },
1507
+ "source": [
1508
+ "# !gdown --id 1-2Lnf3e1wN25WwhX35tMOqQM_8Ds-2Ey #==> PUT MY ID 1-2Lnf3e1wN25WwhX35tMOqQM_8Ds-2Ey/\n",
1509
+ "\n",
1510
+ "# model = SentimentClassifier(len(class_names))\n",
1511
+ "# model.load_state_dict(torch.load('best_model_state.bin'))\n",
1512
+ "# model = model.to(device)"
1513
+ ],
1514
+ "execution_count": null,
1515
+ "outputs": []
1516
+ },
1517
+ {
1518
+ "cell_type": "markdown",
1519
+ "metadata": {
1520
+ "id": "Ie-jdgYtTVMl"
1521
+ },
1522
+ "source": [
1523
+ "### Evaluation\n",
1524
+ "\n",
1525
+ "So how good is our model on predicting sentiment? Let's start by calculating the accuracy on the test data:"
1526
+ ]
1527
+ },
1528
+ {
1529
+ "cell_type": "code",
1530
+ "metadata": {
1531
+ "id": "VZko6upATa7I"
1532
+ },
1533
+ "source": [
1534
+ "test_acc, _ = eval_model(\n",
1535
+ " model,\n",
1536
+ " test_data_loader,\n",
1537
+ " loss_fn,\n",
1538
+ " device,\n",
1539
+ " len(df_test)\n",
1540
+ ")\n",
1541
+ "\n",
1542
+ "test_acc.item()"
1543
+ ],
1544
+ "execution_count": null,
1545
+ "outputs": []
1546
+ },
1547
+ {
1548
+ "cell_type": "markdown",
1549
+ "metadata": {
1550
+ "id": "viRJUSeOTneC"
1551
+ },
1552
+ "source": [
1553
+ "The accuracy is about 1% lower on the test set. Our model seems to generalize well.\n",
1554
+ "\n",
1555
+ "We'll define a helper function to get the predictions from our model:"
1556
+ ]
1557
+ },
1558
+ {
1559
+ "cell_type": "code",
1560
+ "metadata": {
1561
+ "id": "eWVt9c9yTbY2"
1562
+ },
1563
+ "source": [
1564
+ "\n",
1565
+ "def get_predictions(model, data_loader):\n",
1566
+ " model = model.eval()\n",
1567
+ "\n",
1568
+ " texts_pt = []\n",
1569
+ " predictions = []\n",
1570
+ " prediction_probs = []\n",
1571
+ " real_values = []\n",
1572
+ "\n",
1573
+ " with torch.no_grad():\n",
1574
+ " for d in data_loader:\n",
1575
+ "\n",
1576
+ " texts = d[\"text\"]\n",
1577
+ " input_ids = d[\"input_ids\"].to(device)\n",
1578
+ " attention_mask = d[\"attention_mask\"].to(device)\n",
1579
+ " labels = d[\"labels\"].to(device)\n",
1580
+ "\n",
1581
+ " outputs = model(\n",
1582
+ " input_ids=input_ids,\n",
1583
+ " attention_mask=attention_mask\n",
1584
+ " )\n",
1585
+ " _, preds = torch.max(outputs, dim=1)\n",
1586
+ "\n",
1587
+ " probs = F.softmax(outputs, dim=1)\n",
1588
+ "\n",
1589
+ " texts_pt.extend(texts)\n",
1590
+ " predictions.extend(preds)\n",
1591
+ " prediction_probs.extend(probs)\n",
1592
+ " real_values.extend(labels)\n",
1593
+ "\n",
1594
+ " predictions = torch.stack(predictions).cpu()\n",
1595
+ " prediction_probs = torch.stack(prediction_probs).cpu()\n",
1596
+ " real_values = torch.stack(real_values).cpu()\n",
1597
+ " return texts_pt, predictions, prediction_probs, real_values"
1598
+ ],
1599
+ "execution_count": null,
1600
+ "outputs": []
1601
+ },
1602
+ {
1603
+ "cell_type": "markdown",
1604
+ "metadata": {
1605
+ "id": "bE7DFsi1T4pk"
1606
+ },
1607
+ "source": [
1608
+ "This is similar to the evaluation function, except that we're storing the text of the reviews and the predicted probabilities (by applying the softmax on the model outputs):"
1609
+ ]
1610
+ },
1611
+ {
1612
+ "cell_type": "code",
1613
+ "metadata": {
1614
+ "id": "wqDtEAAET2qD"
1615
+ },
1616
+ "source": [
1617
+ "y_text_pt, y_pred, y_pred_probs, y_test = get_predictions(\n",
1618
+ " model,\n",
1619
+ " test_data_loader\n",
1620
+ ")"
1621
+ ],
1622
+ "execution_count": null,
1623
+ "outputs": []
1624
+ },
1625
+ {
1626
+ "cell_type": "markdown",
1627
+ "metadata": {
1628
+ "id": "D3fNSJqdT8oT"
1629
+ },
1630
+ "source": [
1631
+ "Let's have a look at the classification report\n",
1632
+ "\n"
1633
+ ]
1634
+ },
1635
+ {
1636
+ "cell_type": "code",
1637
+ "metadata": {
1638
+ "id": "4nx51SfMT69t"
1639
+ },
1640
+ "source": [
1641
+ "print(classification_report(y_test, y_pred, target_names=class_names))"
1642
+ ],
1643
+ "execution_count": null,
1644
+ "outputs": []
1645
+ },
1646
+ {
1647
+ "cell_type": "code",
1648
+ "metadata": {
1649
+ "id": "LISbHZAIT-it"
1650
+ },
1651
+ "source": [
1652
+ "def show_confusion_matrix(confusion_matrix):\n",
1653
+ " hmap = sns.heatmap(confusion_matrix, annot=True, fmt=\"d\", cmap=\"Blues\")\n",
1654
+ " hmap.yaxis.set_ticklabels(hmap.yaxis.get_ticklabels(), rotation=0, ha='right')\n",
1655
+ " hmap.xaxis.set_ticklabels(hmap.xaxis.get_ticklabels(), rotation=30, ha='right')\n",
1656
+ " plt.ylabel('True sentiment')\n",
1657
+ " plt.xlabel('Predicted sentiment');\n",
1658
+ "\n",
1659
+ "cm = confusion_matrix(y_test, y_pred)\n",
1660
+ "df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)\n",
1661
+ "show_confusion_matrix(df_cm)"
1662
+ ],
1663
+ "execution_count": null,
1664
+ "outputs": []
1665
+ },
1666
+ {
1667
+ "cell_type": "markdown",
1668
+ "metadata": {
1669
+ "id": "SGrauBCKUNs4"
1670
+ },
1671
+ "source": [
1672
+ "\n",
1673
+ "This confirms that our model is having difficulty classifying neutral reviews. It mistakes those for negative and positive at a roughly equal frequency.\n",
1674
+ "\n",
1675
+ "That's a good overview of the performance of our model. But let's have a look at an example from our test data:"
1676
+ ]
1677
+ },
1678
+ {
1679
+ "cell_type": "code",
1680
+ "metadata": {
1681
+ "id": "ryOEAWnCUDes"
1682
+ },
1683
+ "source": [
1684
+ "idx = 10\n",
1685
+ "\n",
1686
+ "texts = y_text_pt[idx]\n",
1687
+ "true_sentiment = y_test[idx]\n",
1688
+ "pred_df = pd.DataFrame({\n",
1689
+ " 'class_names': class_names,\n",
1690
+ " 'values': y_pred_probs[idx]\n",
1691
+ "})"
1692
+ ],
1693
+ "execution_count": null,
1694
+ "outputs": []
1695
+ },
1696
+ {
1697
+ "cell_type": "code",
1698
+ "metadata": {
1699
+ "id": "NfUZerrFUQgF"
1700
+ },
1701
+ "source": [
1702
+ "print(\"\\n\".join(wrap(texts)))\n",
1703
+ "print()\n",
1704
+ "print(f'True sentiment: {class_names[true_sentiment]}')"
1705
+ ],
1706
+ "execution_count": null,
1707
+ "outputs": []
1708
+ },
1709
+ {
1710
+ "cell_type": "markdown",
1711
+ "metadata": {
1712
+ "id": "__zvq7oAUoTq"
1713
+ },
1714
+ "source": [
1715
+ "Now we can look at the confidence of each sentiment of our model:\n",
1716
+ "\n"
1717
+ ]
1718
+ },
1719
+ {
1720
+ "cell_type": "code",
1721
+ "metadata": {
1722
+ "id": "yI2foZLwUa-D"
1723
+ },
1724
+ "source": [
1725
+ "sns.barplot(x='values', y='class_names', data=pred_df, orient='h')\n",
1726
+ "plt.ylabel('sentiment')\n",
1727
+ "plt.xlabel('probability')\n",
1728
+ "plt.xlim([0, 1]);"
1729
+ ],
1730
+ "execution_count": null,
1731
+ "outputs": []
1732
+ },
1733
+ {
1734
+ "cell_type": "markdown",
1735
+ "metadata": {
1736
+ "id": "VKKFcZWGUtPT"
1737
+ },
1738
+ "source": [
1739
+ "### Predicting on Raw Text\n",
1740
+ "\n",
1741
+ "Let's use our model to predict the sentiment of some raw text:"
1742
+ ]
1743
+ },
1744
+ {
1745
+ "cell_type": "markdown",
1746
+ "metadata": {
1747
+ "id": "Y_uMQh4iU7oz"
1748
+ },
1749
+ "source": [
1750
+ "We have to use the tokenizer to encode the text:\n",
1751
+ "\n"
1752
+ ]
1753
+ },
1754
+ {
1755
+ "cell_type": "code",
1756
+ "metadata": {
1757
+ "id": "qBhmfageU0KH"
1758
+ },
1759
+ "source": [
1760
+ "sample_text = \"Eu não sei\""
1761
+ ],
1762
+ "execution_count": null,
1763
+ "outputs": []
1764
+ },
1765
+ {
1766
+ "cell_type": "code",
1767
+ "metadata": {
1768
+ "id": "IhBP47RrU7Fo"
1769
+ },
1770
+ "source": [
1771
+ "encoded_text = tokenizer.encode_plus(\n",
1772
+ " sample_text,\n",
1773
+ " max_length=MAX_LEN,\n",
1774
+ " add_special_tokens=True,\n",
1775
+ " return_token_type_ids=False,\n",
1776
+ " pad_to_max_length=True,\n",
1777
+ " return_attention_mask=True,\n",
1778
+ " return_tensors='pt',\n",
1779
+ ")"
1780
+ ],
1781
+ "execution_count": null,
1782
+ "outputs": []
1783
+ },
1784
+ {
1785
+ "cell_type": "markdown",
1786
+ "metadata": {
1787
+ "id": "78iajgyHU_np"
1788
+ },
1789
+ "source": [
1790
+ "Let's get the predictions from our model:\n",
1791
+ "\n"
1792
+ ]
1793
+ },
1794
+ {
1795
+ "cell_type": "code",
1796
+ "metadata": {
1797
+ "id": "l7yUhQJoU9yb"
1798
+ },
1799
+ "source": [
1800
+ "input_ids = encoded_text['input_ids'].to(device)\n",
1801
+ "attention_mask = encoded_text['attention_mask'].to(device)\n",
1802
+ "\n",
1803
+ "output = model(input_ids, attention_mask)\n",
1804
+ "_, prediction = torch.max(output, dim=1)\n",
1805
+ "\n",
1806
+ "print(f'Text: {sample_text}')\n",
1807
+ "print(f'Sentiment : {class_names[prediction]}')"
1808
+ ],
1809
+ "execution_count": null,
1810
+ "outputs": []
1811
+ }
1812
+ ],
1813
+ "metadata": {
1814
+ "accelerator": "GPU",
1815
+ "colab": {
1816
+ "gpuType": "T4",
1817
+ "provenance": []
1818
+ },
1819
+ "kernelspec": {
1820
+ "display_name": "Python 3",
1821
+ "name": "python3"
1822
+ },
1823
+ "language_info": {
1824
+ "name": "python"
1825
+ }
1826
+ },
1827
+ "nbformat": 4,
1828
+ "nbformat_minor": 0
1829
+ }