{ "cells": [ { "cell_type": "code", "execution_count": 4, "metadata": { "id": "I-5YovjdgLu0" }, "outputs": [ { "ename": "ModuleNotFoundError", "evalue": "No module named 'torch'", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", "\u001b[1;32mc:\\Users\\LENOVO\\OneDrive\\Desktop\\Twitter_sentiment\\Twitter-Sentiment-Analysis-Web\\sentiment_analysis_Twitter.ipynb Cell 1\u001b[0m line \u001b[0;36m2\n\u001b[0;32m 1\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mos\u001b[39;00m\n\u001b[1;32m----> 2\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mtorch\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mtorch\u001b[39;00m \u001b[39mimport\u001b[39;00m nn\n\u001b[0;32m 4\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mtorch\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mutils\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mdata\u001b[39;00m \u001b[39mimport\u001b[39;00m DataLoader, Dataset\n", "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'torch'" ] } ], "source": [ "import os\n", "import torch\n", "from torch import nn\n", "from torch.utils.data import DataLoader, Dataset\n", "from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import accuracy_score, classification_report\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "NHhBadHEDv0e", "outputId": "94c8fdab-c4cd-4b6f-d2f4-9ad5675054d7" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/content\n" ] } ], "source": [ "import os\n", "current_directory = os.getcwd()\n", "print(current_directory)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ll5qSoY5gT5Y", "outputId": "1a477d0d-f4a1-4b95-eac9-9d7129a1353f" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Defaulting to user installation because normal site-packages is not writeable\n", "Collecting transformers\n", " Downloading transformers-4.35.2-py3-none-any.whl.metadata (123 kB)\n", " ---------------------------------------- 0.0/123.5 kB ? eta -:--:--\n", " --- ------------------------------------ 10.2/123.5 kB ? eta -:--:--\n", " --- ------------------------------------ 10.2/123.5 kB ? eta -:--:--\n", " --- ------------------------------------ 10.2/123.5 kB ? eta -:--:--\n", " --- ------------------------------------ 10.2/123.5 kB ? 
eta -:--:--\n", " ------------ ------------------------ 41.0/123.5 kB 164.3 kB/s eta 0:00:01\n", " ------------------ ------------------ 61.4/123.5 kB 204.8 kB/s eta 0:00:01\n", " ------------------ ------------------ 61.4/123.5 kB 204.8 kB/s eta 0:00:01\n", " ------------------ ------------------ 61.4/123.5 kB 204.8 kB/s eta 0:00:01\n", " -------------------------------- --- 112.6/123.5 kB 252.2 kB/s eta 0:00:01\n", " ------------------------------------ 123.5/123.5 kB 278.9 kB/s eta 0:00:00\n", "Requirement already satisfied: filelock in c:\\users\\lenovo\\appdata\\roaming\\python\\python312\\site-packages (from transformers) (3.13.1)\n", "Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)\n", " Downloading huggingface_hub-0.19.4-py3-none-any.whl.metadata (14 kB)\n", "Requirement already satisfied: numpy>=1.17 in c:\\users\\lenovo\\appdata\\roaming\\python\\python312\\site-packages (from transformers) (1.26.2)\n", "Requirement already satisfied: packaging>=20.0 in c:\\users\\lenovo\\appdata\\roaming\\python\\python312\\site-packages (from transformers) (23.2)\n", "Requirement already satisfied: pyyaml>=5.1 in c:\\users\\lenovo\\appdata\\roaming\\python\\python312\\site-packages (from transformers) (6.0.1)\n", "Requirement already satisfied: regex!=2019.12.17 in c:\\users\\lenovo\\appdata\\roaming\\python\\python312\\site-packages (from transformers) (2023.10.3)\n", "Requirement already satisfied: requests in c:\\users\\lenovo\\appdata\\roaming\\python\\python312\\site-packages (from transformers) (2.31.0)\n", "Collecting tokenizers<0.19,>=0.14 (from transformers)\n", " Downloading tokenizers-0.15.0.tar.gz (318 kB)\n", " ---------------------------------------- 0.0/318.5 kB ? eta -:--:--\n", " ---- -------------------------------- 41.0/318.5 kB 991.0 kB/s eta 0:00:01\n", " ---------- ---------------------------- 81.9/318.5 kB 1.2 MB/s eta 0:00:01\n", " -------------- ----------------------- 122.9/318.5 kB 1.0 MB/s eta 0:00:01\n", " ---------------- ------------------- 143.4/318.5 kB 853.3 kB/s eta 0:00:01\n", " ------------------ ----------------- 163.8/318.5 kB 893.0 kB/s eta 0:00:01\n", " ------------------- ---------------- 174.1/318.5 kB 655.4 kB/s eta 0:00:01\n", " ----------------------- ------------ 204.8/318.5 kB 692.4 kB/s eta 0:00:01\n", " --------------------------- -------- 245.8/318.5 kB 754.9 kB/s eta 0:00:01\n", " ---------------------------- ------- 256.0/318.5 kB 655.4 kB/s eta 0:00:01\n", " ------------------------------------ 318.5/318.5 kB 730.5 kB/s eta 0:00:00\n", " Installing build dependencies: started\n", " Installing build dependencies: finished with status 'done'\n", " Getting requirements to build wheel: started\n", " Getting requirements to build wheel: finished with status 'done'\n", " Preparing metadata (pyproject.toml): started\n", " Preparing metadata (pyproject.toml): finished with status 'error'\n", "Note: you may need to restart the kernel to use updated packages.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " error: subprocess-exited-with-error\n", " \n", " × Preparing metadata (pyproject.toml) did not run successfully.\n", " │ exit code: 1\n", " ╰─> [6 lines of output]\n", " Checking for Rust toolchain....\n", " \n", " Cargo, the Rust package manager, is not installed or is not on PATH.\n", " This package requires Rust and Cargo to compile extensions. 
Install it through\n", " the system's package manager or via https://rustup.rs/\n", " \n", " [end of output]\n", " \n", " note: This error originates from a subprocess, and is likely not a problem with pip.\n", "error: metadata-generation-failed\n", "\n", "× Encountered error while generating package metadata.\n", "╰─> See above for output.\n", "\n", "note: This is an issue with the package mentioned above, not pip.\n", "hint: See above for details.\n" ] } ], "source": [ "pip install transformers\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "dOvDqAjGp3Gl", "outputId": "97e2bcf5-725f-4f2c-c626-1b97980c0f36" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Mounted at /content/drive\n" ] } ], "source": [ "from google.colab import drive\n", "drive.mount('/content/drive')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "vrm1tJu8H9Hb" }, "outputs": [], "source": [ "df=pd.read_csv(\"/Twitter_Data.csv\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 419 }, "id": "lUZsmGDtmoLl", "outputId": "a673f581-3f2e-40f5-966c-cfbff840c647" }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
clean_textcategory
0when modi promised “minimum government maximum...-1.0
1talk all the nonsense and continue all the dra...0.0
2what did just say vote for modi welcome bjp t...1.0
3asking his supporters prefix chowkidar their n...1.0
4answer who among these the most powerful world...1.0
.........
139418per capita income average indians have gone wh...1.0
139419wow what response behold all south bangalorean...1.0
139420modi also not visited there0.0
139421does his categorisation cover bjp many states ...1.0
139422problembut take responsibility for unprecedent...NaN
\n", "

139423 rows × 2 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ], "text/plain": [ " clean_text category\n", "0 when modi promised “minimum government maximum... -1.0\n", "1 talk all the nonsense and continue all the dra... 0.0\n", "2 what did just say vote for modi welcome bjp t... 1.0\n", "3 asking his supporters prefix chowkidar their n... 1.0\n", "4 answer who among these the most powerful world... 1.0\n", "... ... ...\n", "139418 per capita income average indians have gone wh... 1.0\n", "139419 wow what response behold all south bangalorean... 1.0\n", "139420 modi also not visited there 0.0\n", "139421 does his categorisation cover bjp many states ... 1.0\n", "139422 problembut take responsibility for unprecedent... NaN\n", "\n", "[139423 rows x 2 columns]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "SZIvOfIqQBr6" }, "outputs": [], "source": [ "df = df[df['category'] != 0.0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 419 }, "id": "7wyu27xmRRXr", "outputId": "5ec7a455-93fc-41e1-8711-097b07e08069" }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
clean_textcategory
0when modi promised “minimum government maximum...-1.0
2what did just say vote for modi welcome bjp t...1.0
3asking his supporters prefix chowkidar their n...1.0
4answer who among these the most powerful world...1.0
8with upcoming election india saga going import...1.0
.........
16084this scheme very good but the congress governm...1.0
16085sorry modi came 2001 kindly see the condition ...1.0
16086feels like total conspiracy holding the seat t...1.0
16088told them love and respect their president the...1.0
16089imagine the hatred modi and shah hNaN
\n", "

10518 rows × 2 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ], "text/plain": [ " clean_text category\n", "0 when modi promised “minimum government maximum... -1.0\n", "2 what did just say vote for modi welcome bjp t... 1.0\n", "3 asking his supporters prefix chowkidar their n... 1.0\n", "4 answer who among these the most powerful world... 1.0\n", "8 with upcoming election india saga going import... 1.0\n", "... ... ...\n", "16084 this scheme very good but the congress governm... 1.0\n", "16085 sorry modi came 2001 kindly see the condition ... 1.0\n", "16086 feels like total conspiracy holding the seat t... 1.0\n", "16088 told them love and respect their president the... 1.0\n", "16089 imagine the hatred modi and shah h NaN\n", "\n", "[10518 rows x 2 columns]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "HBwdWFxwwJU2" }, "outputs": [], "source": [ "import pandas as pd\n", "\n", "def load_imdb_data(df):\n", " df = df.sample(3000)\n", " texts = df['clean_text'].tolist()\n", "\n", " labels = []\n", " for sentiment in df['category'].tolist():\n", " if sentiment == 1.0:\n", " labels.append(1)\n", "\n", " else: # Assuming all other sentiments are \"negative\"\n", " labels.append(0)\n", "\n", " return texts, labels\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "El7JG36wxttq" }, "outputs": [], "source": [ "#data_file = \"/Twitter_Data.csv\"\n", "texts, labels = load_imdb_data(df)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "sD-Dlfdux1Ql" }, "outputs": [], "source": [ "#TextClassificationDataset\n", "class TextClassificationDataset(Dataset):\n", " def __init__(self, texts, labels, tokenizer, max_length):\n", " self.texts = texts\n", " self.labels = labels\n", " self.tokenizer = tokenizer\n", " self.max_length = max_length\n", "\n", " def __len__(self):\n", " return len(self.texts)\n", "\n", " def __getitem__(self, idx):\n", " text = self.texts[idx]\n", " label = self.labels[idx]\n", " encoding = self.tokenizer(text, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)\n", " return {'input_ids': encoding['input_ids'].flatten(), 'attention_mask': encoding['attention_mask'].flatten(), 'label': torch.tensor(label)}\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "rMg3hgB4x4gH" }, "outputs": [], "source": [ "class BERTClassifier(nn.Module):\n", " def __init__(self, bert_model_name, num_classes):\n", " super(BERTClassifier, self).__init__()\n", " self.bert = BertModel.from_pretrained(bert_model_name)\n", " self.dropout = nn.Dropout(0.1)\n", " self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)\n", "\n", "def forward(self, input_ids, attention_mask):\n", " outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)\n", " pooled_output = outputs.pooler_output\n", " x = self.dropout(pooled_output)\n", " logits = self.fc(x)\n", " return logits" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "e8rNru3fyK0w" }, "outputs": [], "source": [ "# Training loop\n", "def train(model, data_loader, optimizer, scheduler, device):\n", " model.train()\n", " for batch in data_loader:\n", " optimizer.zero_grad()\n", " input_ids = batch['input_ids'].to(device)\n", " attention_mask = batch['attention_mask'].to(device)\n", " labels = batch['label'].to(device)\n", " outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels) # Corrected this line\n", " loss = 
outputs.loss\n", " loss.backward()\n", " optimizer.step()\n", " scheduler.step()\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "AE1YVOA2yNrH" }, "outputs": [], "source": [ "def evaluate(model, data_loader, device):\n", " model.eval()\n", " predictions = []\n", " actual_labels = []\n", "\n", " with torch.no_grad():\n", " for batch in data_loader:\n", " input_ids = batch['input_ids'].to(device)\n", " attention_mask = batch['attention_mask'].to(device)\n", " labels = batch['label'].to(device)\n", "\n", " outputs = model(input_ids=input_ids, attention_mask=attention_mask)\n", " logits = outputs.logits # Access the logits\n", "\n", " _, preds = torch.max(logits, dim=1) # Apply argmax to logits\n", " predictions.extend(preds.cpu().tolist())\n", " actual_labels.extend(labels.cpu().tolist())\n", "\n", " accuracy = accuracy_score(actual_labels, predictions)\n", " report = classification_report(actual_labels, predictions, target_names=['class_0', 'class_1']) # Adjust target_names as needed\n", "\n", " return accuracy, report\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "PRrj9l1fyQbN" }, "outputs": [], "source": [ "import torch\n", "\n", "def predict_sentiment(text, model, tokenizer, device, max_length=128):\n", " # Tokenize input text\n", " inputs = tokenizer(text, max_length=max_length, padding=True, truncation=True, return_tensors=\"pt\")\n", " input_ids = inputs[\"input_ids\"].to(device)\n", " attention_mask = inputs[\"attention_mask\"].to(device)\n", "\n", " # Make prediction\n", " with torch.no_grad():\n", " outputs = model(input_ids=input_ids, attention_mask=attention_mask)\n", " logits = outputs.logits\n", " predicted_class = torch.argmax(logits, dim=1).item()\n", "\n", " if predicted_class == 1:\n", " return \"positive\"\n", " else:\n", " return \"negative\"\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Kwnh-gU-ySrE" }, "outputs": [], "source": [ "# Set up parameters\n", "bert_model_name = 'bert-base-uncased'\n", "num_classes = 2\n", "max_length = 128\n", "batch_size = 16\n", "num_epochs = 1\n", "learning_rate = 2e-5\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "pI91QNlYynSk" }, "outputs": [], "source": [ "train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "zUIGzsGpy32I" }, "outputs": [], "source": [ "tokenizer = BertTokenizer.from_pretrained(bert_model_name)\n", "train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)\n", "val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)\n", "train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)\n", "val_dataloader = DataLoader(val_dataset, batch_size=batch_size)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "irjPZM5Py_qv", "outputId": "eab4ec72-3ff7-4f1c-c9d6-6f521a9b9fa6" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" ] } ], "source": [ "device = torch.device(\"cuda\" if torch.cuda.is_available() else 
\"cpu\")\n", "model = BERTClassifier(bert_model_name, num_classes).to(device)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "XofYUsehzGDb", "outputId": "c5f91997-362d-4157-eb85-f2350e85a5a6" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/usr/local/lib/python3.10/dist-packages/transformers/optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", " warnings.warn(\n" ] } ], "source": [ "optimizer = AdamW(model.parameters(), lr=learning_rate)\n", "total_steps = len(train_dataloader) * num_epochs\n", "scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "mnQNKANpzRRD" }, "outputs": [], "source": [ "pip install accelerate>=0.20.1\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "7W5kAcnPzqVT" }, "outputs": [], "source": [ "from transformers import TrainingArguments, Trainer\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Egab1OA-0v9R" }, "outputs": [], "source": [ "\n", "pip install transformers[torch,accelerate]>=0.20.1\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "8DUWCwa4086I" }, "outputs": [], "source": [ "def forward(self, input_ids, attention_mask):\n", " outputs = self.bert(input_ids, attention_mask=attention_mask)\n", " pooled_output = outputs.pooler_output\n", " x = self.dropout(pooled_output)\n", " logits = self.fc(x)\n", " return logits\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "kLCayimP1UWO" }, "outputs": [], "source": [ "from transformers import BertForSequenceClassification\n", "\n", "class BERTClassifier(nn.Module):\n", " def __init__(self, bert_model_name, num_classes):\n", " super(BERTClassifier, self).__init__()\n", " self.bert = BertForSequenceClassification.from_pretrained(bert_model_name, num_labels=num_classes)\n", "\n", " def forward(self, input_ids, attention_mask):\n", " outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)\n", " logits = outputs.logits\n", " return logits\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "E63X_f522Olk", "outputId": "dbf380bc-d8f3-45d8-d760-5f03a2c160e6" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/bin/bash: line 1: 5.0.0: No such file or directory\n" ] } ], "source": [ "pip install transformers>=4.11.0,<5.0.0 torch>=1.8.0,<1.9.0\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "qLXID90a3NKs", "outputId": "28301248-f07f-4fc5-9efe-de471173254a" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.35.1)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.13.1)\n", "Requirement already satisfied: huggingface-hub<1.0,>=0.16.4 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.17.3)\n", "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.23.5)\n", "Requirement already 
satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (23.2)\n", "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.1)\n", "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2023.6.3)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.31.0)\n", "Requirement already satisfied: tokenizers<0.15,>=0.14 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.14.1)\n", "Requirement already satisfied: safetensors>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.4.0)\n", "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.66.1)\n", "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.16.4->transformers) (2023.6.0)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.16.4->transformers) (4.5.0)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.3.2)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.7)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2023.7.22)\n" ] } ], "source": [ "pip install transformers" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "o8pR9XQx3uzM", "outputId": "9e8fdcb2-0874-40d5-d888-8e7fe20011b4" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/2\n", "Validation Accuracy: 0.7583\n", " precision recall f1-score support\n", "\n", " class_0 0.64 0.64 0.64 202\n", " class_1 0.82 0.82 0.82 398\n", "\n", " accuracy 0.76 600\n", " macro avg 0.73 0.73 0.73 600\n", "weighted avg 0.76 0.76 0.76 600\n", "\n", "Epoch 2/2\n", "Validation Accuracy: 0.7917\n", " precision recall f1-score support\n", "\n", " class_0 0.71 0.64 0.68 202\n", " class_1 0.83 0.87 0.85 398\n", "\n", " accuracy 0.79 600\n", " macro avg 0.77 0.76 0.76 600\n", "weighted avg 0.79 0.79 0.79 600\n", "\n" ] } ], "source": [ "import torch\n", "import pandas as pd\n", "from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup\n", "from torch.utils.data import DataLoader, Dataset\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import accuracy_score, classification_report\n", "\n", "# Define your dataset loading function and TextClassificationDataset class\n", "\n", "# Define your BERT model and training functions\n", "\n", "# Set up parameters\n", "bert_model_name = 'bert-base-uncased'\n", "num_classes = 2\n", "max_length = 128\n", "batch_size = 16\n", 
"num_epochs = 2\n", "learning_rate = 2e-5\n", "\n", "# Load and preprocess your data\n", "\n", "# Initialize BERT tokenizer\n", "tokenizer = BertTokenizer.from_pretrained(bert_model_name)\n", "\n", "# Create datasets and dataloaders\n", "\n", "# Set up device\n", "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", "\n", "# Initialize BERT model\n", "model = BertForSequenceClassification.from_pretrained(bert_model_name, num_labels=num_classes)\n", "model.to(device)\n", "\n", "# Initialize optimizer and scheduler\n", "optimizer = AdamW(model.parameters(), lr=learning_rate)\n", "total_steps = len(train_dataloader) * num_epochs\n", "scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)\n", "\n", "# Training and evaluation loop\n", "for epoch in range(num_epochs):\n", " print(f\"Epoch {epoch + 1}/{num_epochs}\")\n", "\n", " # Training phase\n", " train(model, train_dataloader, optimizer, scheduler, device)\n", "\n", " # Evaluation phase\n", " accuracy, report = evaluate(model, val_dataloader, device)\n", " print(f\"Validation Accuracy: {accuracy:.4f}\")\n", " print(report)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "xvRmDWYCt_zA" }, "outputs": [ { "ename": "NameError", "evalue": "name 'torch' is not defined", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", "\u001b[1;32mc:\\Users\\LENOVO\\OneDrive\\Desktop\\Twitter_sentiment\\Twitter-Sentiment-Analysis-Web\\sentiment_analysis_Twitter.ipynb Cell 29\u001b[0m line \u001b[0;36m3\n\u001b[0;32m 1\u001b[0m \u001b[39m#saving model\u001b[39;00m\n\u001b[1;32m----> 3\u001b[0m torch\u001b[39m.\u001b[39msave(model\u001b[39m.\u001b[39mstate_dict(), \u001b[39m\"\u001b[39m\u001b[39mbert_classifier.pth\u001b[39m\u001b[39m\"\u001b[39m)\n", "\u001b[1;31mNameError\u001b[0m: name 'torch' is not defined" ] } ], "source": [ "#saving model\n", "\n", "torch.save(model.state_dict(), \"bert_classifier.pth\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "8GyBtZuAup5A", "outputId": "4640da62-ddef-4d5c-81b9-41122a5fafb4" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Predicted sentiment: negative\n" ] } ], "source": [ "# Test sentiment prediction\n", "test_text = \"modi is a prime minister,but made many worst changes\"\n", "sentiment = predict_sentiment(test_text, model, tokenizer, device)\n", "\n", "\n", "print(f\"Predicted sentiment: {sentiment}\")" ] }, { "cell_type": "markdown", "metadata": { "id": "uy4mW35_6ER8" }, "source": [ "**TWITTER** **DATASET**" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "eMM0FwF_6Soy" }, "outputs": [], "source": [ "df=pd.read_csv(\"/content/drive/MyDrive/Colab Notebooks/Twitter_Data.csv\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 423 }, "id": "bpx1HJ3F6atE", "outputId": "67272dbb-abcf-4b2e-90c9-b19fb55ec4d7" }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
clean_textcategory
0when modi promised “minimum government maximum...-1.0
1talk all the nonsense and continue all the dra...0.0
2what did just say vote for modi welcome bjp t...1.0
3asking his supporters prefix chowkidar their n...1.0
4answer who among these the most powerful world...1.0
.........
162975why these 456 crores paid neerav modi not reco...-1.0
162976dear rss terrorist payal gawar what about modi...-1.0
162977did you cover her interaction forum where she ...0.0
162978there big project came into india modi dream p...0.0
162979have you ever listen about like gurukul where ...1.0
\n", "

162980 rows × 2 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ], "text/plain": [ " clean_text category\n", "0 when modi promised “minimum government maximum... -1.0\n", "1 talk all the nonsense and continue all the dra... 0.0\n", "2 what did just say vote for modi welcome bjp t... 1.0\n", "3 asking his supporters prefix chowkidar their n... 1.0\n", "4 answer who among these the most powerful world... 1.0\n", "... ... ...\n", "162975 why these 456 crores paid neerav modi not reco... -1.0\n", "162976 dear rss terrorist payal gawar what about modi... -1.0\n", "162977 did you cover her interaction forum where she ... 0.0\n", "162978 there big project came into india modi dream p... 0.0\n", "162979 have you ever listen about like gurukul where ... 1.0\n", "\n", "[162980 rows x 2 columns]" ] }, "execution_count": 76, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "6hM2_89O6mWk" }, "outputs": [], "source": [ "import pandas as pd\n", "\n", "def load_twitter_data(data_file):\n", " df = pd.read_csv(data_file)\n", " df = df.sample(3000) # Sample 3000 rows if needed\n", "\n", " texts = df['clean_text'].tolist()\n", "\n", " labels = []\n", " for category in df['category'].tolist():\n", " # Assuming that the categories in the Twitter dataset are \"positive,\" \"neutral,\" and \"negative\"\n", " if category == \"positive\":\n", " labels.append(1)\n", " elif category == \"neutral\":\n", " labels.append(0)\n", " else:\n", " labels.append(2) # Assuming all other categories are \"negative\"\n", "\n", " return texts, labels\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 192 }, "id": "xRjswru97QNc", "outputId": "8b3b1473-5637-4af2-e840-1153e4860409" }, "outputs": [ { "ename": "NameError", "evalue": "ignored", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mdata_file\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"/content/drive/MyDrive/Colab Notebooks/Twitter_Data.csv\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mtexts\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabels\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload_twitter_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata_file\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mNameError\u001b[0m: name 'load_twitter_data' is not defined" ] } ], "source": [ "data_file = \"/content/drive/MyDrive/Colab Notebooks/Twitter_Data.csv\"\n", "texts, labels = load_twitter_data(data_file)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "cQJf7kj1750W" }, "outputs": [], "source": [ "from torch.utils.data import Dataset\n", "import torch\n", "\n", "class TwitterClassificationDataset(Dataset):\n", " def __init__(self, texts, labels, tokenizer, max_length):\n", " self.texts = texts\n", " self.labels = labels\n", " self.tokenizer = tokenizer\n", " self.max_length = max_length\n", "\n", " def __len__(self):\n", " return len(self.texts)\n", "\n", " def __getitem__(self, idx):\n", " text = self.texts[idx]\n", " label = self.labels[idx]\n", " encoding = self.tokenizer(text, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)\n", " return 
{ "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "opOVTWRu8d88", "outputId": "6603a4fb-9e91-467f-c674-7c2a30f82461" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.34.0)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.12.4)\n", "Requirement already satisfied: huggingface-hub<1.0,>=0.16.4 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.17.3)\n", "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.23.5)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (23.2)\n", "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.1)\n", "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2023.6.3)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.31.0)\n", "Requirement already satisfied: tokenizers<0.15,>=0.14 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.14.1)\n", "Requirement already satisfied: safetensors>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.4.0)\n", "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.66.1)\n", "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.16.4->transformers) (2023.6.0)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.16.4->transformers) (4.5.0)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.3.0)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.6)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2023.7.22)\n" ] } ], "source": [ "pip install transformers" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "PSQ2cqTz8Q24" }, "outputs": [], "source": [ "import torch\n", "import torch.nn as nn\n", "from transformers import BertModel\n", "\n", "class BERTClassifier(nn.Module):\n", "    def __init__(self, bert_model_name, num_classes):\n", "        super(BERTClassifier, self).__init__()\n", "        self.bert = BertModel.from_pretrained(bert_model_name)\n", "        self.dropout = nn.Dropout(0.1)\n", "        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)\n", "\n", "    def forward(self, input_ids, attention_mask):\n", "        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)\n", "        pooled_output = outputs.pooler_output\n", "        x = self.dropout(pooled_output)\n", "        logits = self.fc(x)\n", "        return logits\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "5DXhUD5S8w0K" }, "outputs": [], "source": [ "def train(model, 
data_loader, optimizer, scheduler, device):\n", "    model.train()\n", "    for batch in data_loader:\n", "        optimizer.zero_grad()\n", "        input_ids = batch['input_ids'].to(device)\n", "        attention_mask = batch['attention_mask'].to(device)\n", "        labels = batch['label'].to(device)\n", "        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)\n", "        loss = outputs.loss\n", "        loss.backward()\n", "        optimizer.step()\n", "        scheduler.step()\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "qxaWGnSm9xkM" }, "outputs": [], "source": [ "def evaluate(model, data_loader, device):\n", "    model.eval()\n", "    predictions = []\n", "    actual_labels = []\n", "\n", "    with torch.no_grad():\n", "        for batch in data_loader:\n", "            input_ids = batch['input_ids'].to(device)\n", "            attention_mask = batch['attention_mask'].to(device)\n", "            labels = batch['label'].to(device)\n", "\n", "            outputs = model(input_ids=input_ids, attention_mask=attention_mask)\n", "            logits = outputs.logits  # read the logits off the HF output object\n", "\n", "            _, preds = torch.max(logits, dim=1)  # argmax over the class dimension\n", "            predictions.extend(preds.cpu().tolist())\n", "            actual_labels.extend(labels.cpu().tolist())\n", "\n", "    # target_names must follow label order 0, 1, 2 from the mapping above: neutral, positive, negative\n", "    target_names = ['neutral', 'positive', 'negative']\n", "\n", "    accuracy = accuracy_score(actual_labels, predictions)\n", "    report = classification_report(actual_labels, predictions, target_names=target_names)\n", "\n", "    return accuracy, report\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "vYpL6c1i-GKr" }, "outputs": [], "source": [ "import torch\n", "\n", "def predict_sentiment(text, model, tokenizer, device, max_length=128):\n", "    # Tokenize and prepare the input\n", "    inputs = tokenizer(\n", "        text,\n", "        truncation=True,\n", "        padding=True,\n", "        max_length=max_length,\n", "        return_tensors=\"pt\"\n", "    )\n", "    input_ids = inputs[\"input_ids\"].to(device)\n", "    attention_mask = inputs[\"attention_mask\"].to(device)\n", "\n", "    # Get model predictions\n", "    with torch.no_grad():\n", "        outputs = model(input_ids=input_ids, attention_mask=attention_mask)\n", "        logits = outputs.logits\n", "        _, preds = torch.max(logits, dim=1)\n", "\n", "    if preds.item() == 1:\n", "        return \"positive\"\n", "    elif preds.item() == 0:\n", "        return \"neutral\"\n", "    else:\n", "        return \"negative\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "q1YkfWK2-eH5" }, "outputs": [], "source": [ "# Set up parameters\n", "bert_model_name = 'bert-base-uncased'\n", "num_classes = 3  # neutral / positive / negative; 2 cannot index three labels\n", "max_length = 128\n", "batch_size = 4\n", "num_epochs = 1\n", "learning_rate = 2e-5" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "nyMyH8d--jdo" }, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "SSHcueM1-oGP" }, "outputs": [], "source": [ "from transformers import BertTokenizer\n", "\n", "# Define your BERT model and tokenizer\n", "bert_model_name = 'bert-base-uncased'\n", "tokenizer = BertTokenizer.from_pretrained(bert_model_name)\n", "\n", "# Create your datasets and dataloaders\n", "train_dataset = TwitterClassificationDataset(train_texts, train_labels, tokenizer, max_length)\n", "val_dataset = TwitterClassificationDataset(val_texts, val_labels, tokenizer, max_length)\n", "\n",
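"# shuffle only the training split; a fixed validation order keeps per-epoch reports comparable\n",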
"train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)\n", "val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "xboCkOAL_MCq", "outputId": "6e1da11f-d319-4c10-e1e6-988cfa1fee96" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/usr/local/lib/python3.10/dist-packages/transformers/optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", " warnings.warn(\n" ] } ], "source": [ "# Define your optimizer and scheduler\n", "optimizer = AdamW(model.parameters(), lr=learning_rate)\n", "total_steps = len(train_dataloader) * num_epochs\n", "scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "bmP7dnfMaFZc" }, "outputs": [], "source": [ "pip install accelerate>=0.20.1" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Y8y12Nc7aJHy" }, "outputs": [], "source": [ "from transformers import TrainingArguments, Trainer" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "OTIjUKf8aLMu" }, "outputs": [], "source": [ "pip install transformers[torch,accelerate]>=0.20.1" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "UWKbABLKaNGf" }, "outputs": [], "source": [ "def forward(self, input_ids, attention_mask):\n", " outputs = self.bert(input_ids, attention_mask=attention_mask)\n", " pooled_output = outputs.pooler_output\n", " x = self.dropout(pooled_output)\n", " logits = self.fc(x)\n", " return logits" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "cksBAEZ9bKCW" }, "outputs": [], "source": [ "from transformers import BertForSequenceClassification\n", "\n", "class BERTTwitterClassifier(nn.Module):\n", " def __init__(self, bert_model_name, num_classes):\n", " super(BERTTwitterClassifier, self).__init__()\n", " self.bert = BertForSequenceClassification.from_pretrained(bert_model_name, num_labels=num_classes)\n", "\n", " def forward(self, input_ids, attention_mask):\n", " outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)\n", " logits = outputs.logits\n", " return logits\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "7K4usv5CbNTC", "outputId": "777a5573-a5d4-40f3-b390-a99971b78c2f" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/bin/bash: line 1: 5.0.0: No such file or directory\n" ] } ], "source": [ "pip install transformers>=4.11.0,<5.0.0 torch>=1.8.0,<1.9.0" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "bMqJF1TibZvO", "outputId": "ed3eff75-65d6-4a8b-a7c6-ea13abcb7d0c" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.34.0)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.12.4)\n", "Requirement already satisfied: huggingface-hub<1.0,>=0.16.4 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.17.3)\n", "Requirement 
already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.23.5)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (23.2)\n", "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.1)\n", "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2023.6.3)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.31.0)\n", "Requirement already satisfied: tokenizers<0.15,>=0.14 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.14.1)\n", "Requirement already satisfied: safetensors>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.4.0)\n", "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.66.1)\n", "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.16.4->transformers) (2023.6.0)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.16.4->transformers) (4.5.0)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.3.0)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.6)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2023.7.22)\n" ] } ], "source": [ "pip install transformers" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 576 }, "id": "JxX0DWNKbcOg", "outputId": "a8c51cfe-5ddf-4291-97b4-58553a08805c" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", "/usr/local/lib/python3.10/dist-packages/transformers/optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", " warnings.warn(\n", "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n", "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. 
Use `zero_division` parameter to control this behavior.\n", "  _warn_prf(average, modifier, msg_start, len(result))\n", "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", "  _warn_prf(average, modifier, msg_start, len(result))\n", "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", "  _warn_prf(average, modifier, msg_start, len(result))\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/1\n", "Validation Accuracy: 1.0\n", "Classification Report:\n", "              precision    recall  f1-score   support\n", "\n", "     neutral       0.00      0.00      0.00         0\n", "    positive       0.00      0.00      0.00         0\n", "    negative       1.00      1.00      1.00       600\n", "\n", "   micro avg       1.00      1.00      1.00       600\n", "   macro avg       0.33      0.33      0.33       600\n", "weighted avg       1.00      1.00      1.00       600\n", "\n" ] }, { "data": { "application/vnd.google.colaboratory.intrinsic+json": { "type": "string" }, "text/plain": [ "'text_to_predict = \"I love this product!\"\\npredicted_sentiment = predict_sentiment(text_to_predict, model, tokenizer, device)\\nprint(f\"Predicted Sentiment: {predicted_sentiment}\")'" ] }, "execution_count": 96, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import torch\n", "import torch.nn as nn\n", "from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup\n", "from torch.utils.data import DataLoader, Dataset\n", "from sklearn.metrics import accuracy_score, classification_report\n", "from sklearn.model_selection import train_test_split\n", "import pandas as pd\n", "\n", "# Load the Twitter dataset\n", "data_file = \"/content/drive/MyDrive/Colab Notebooks/Twitter_Data.csv\"\n", "df = pd.read_csv(data_file)\n", "\n", "# Define a function to load and preprocess the Twitter data\n", "def load_twitter_data(data_file, num_samples=None):\n", "    df = pd.read_csv(data_file)\n", "    if num_samples is not None:\n", "        df = df.sample(num_samples)  # Sample a subset of the data if needed\n", "\n", "    texts = df['clean_text'].tolist()\n", "\n", "    labels = []\n", "    # category holds numeric codes (-1.0 / 0.0 / 1.0); the run recorded above compared against strings,\n", "    # so every row fell through to class 2, which is why the report shows only 'negative' support\n", "    for category in df['category'].tolist():\n", "        if category == 1.0:    # positive\n", "            labels.append(1)\n", "        elif category == 0.0:  # neutral\n", "            labels.append(0)\n", "        else:                  # -1.0: negative\n", "            labels.append(2)\n", "\n", "    return texts, labels\n", "\n", "# Split the dataset into train and validation sets\n", "texts, labels = load_twitter_data(data_file, num_samples=3000)\n", "train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)\n", "\n", "# Define BERT-specific parameters\n", "bert_model_name = 
'bert-base-uncased'\n", "max_length = 128\n", "batch_size = 4\n", "num_epochs = 1\n", "learning_rate = 2e-5\n", "\n", "# Load the BERT tokenizer\n", "tokenizer = BertTokenizer.from_pretrained(bert_model_name)\n", "\n", "# Define the Twitter classification dataset\n", "class TwitterClassificationDataset(Dataset):\n", " def __init__(self, texts, labels, tokenizer, max_length):\n", " self.texts = texts\n", " self.labels = labels\n", " self.tokenizer = tokenizer\n", " self.max_length = max_length\n", "\n", " def __len__(self):\n", " return len(self.texts)\n", "\n", " def __getitem__(self, idx):\n", " text = self.texts[idx]\n", " label = self.labels[idx]\n", " encoding = self.tokenizer(text, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)\n", " return {'input_ids': encoding['input_ids'].flatten(), 'attention_mask': encoding['attention_mask'].flatten(), 'label': torch.tensor(label)}\n", "\n", "# Create dataloaders for training and validation\n", "train_dataset = TwitterClassificationDataset(train_texts, train_labels, tokenizer, max_length)\n", "val_dataset = TwitterClassificationDataset(val_texts, val_labels, tokenizer, max_length)\n", "\n", "train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)\n", "val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)\n", "\n", "# Define the BERT-based sentiment classifier\n", "class BERTTwitterClassifier(nn.Module):\n", " def __init__(self, bert_model_name, num_classes):\n", " super(BERTTwitterClassifier, self).__init__()\n", " self.bert = BertForSequenceClassification.from_pretrained(bert_model_name, num_labels=num_classes)\n", "\n", " def forward(self, input_ids, attention_mask):\n", " outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)\n", " logits = outputs.logits\n", " return logits\n", "\n", "# Create the model and move it to the GPU if available\n", "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", "model = BERTTwitterClassifier(bert_model_name, num_classes=3).to(device)\n", "\n", "# Define the optimizer and learning rate scheduler\n", "optimizer = AdamW(model.parameters(), lr=learning_rate)\n", "total_steps = len(train_dataloader) * num_epochs\n", "scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)\n", "\n", "# Training loop\n", "for epoch in range(num_epochs):\n", " model.train()\n", " for batch in train_dataloader:\n", " optimizer.zero_grad()\n", " input_ids = batch['input_ids'].to(device)\n", " attention_mask = batch['attention_mask'].to(device)\n", " labels = batch['label'].to(device)\n", " logits = model(input_ids=input_ids, attention_mask=attention_mask)\n", " loss = nn.CrossEntropyLoss()(logits, labels) # Cross-Entropy loss\n", " loss.backward()\n", " optimizer.step()\n", " scheduler.step()\n", "\n", " model.eval()\n", " val_predictions = []\n", " val_actual_labels = []\n", "\n", " with torch.no_grad():\n", " for batch in val_dataloader:\n", " input_ids = batch['input_ids'].to(device)\n", " attention_mask = batch['attention_mask'].to(device)\n", " labels = batch['label'].to(device)\n", "\n", " logits = model(input_ids=input_ids, attention_mask=attention_mask)\n", " _, preds = torch.max(logits, dim=1)\n", " val_predictions.extend(preds.cpu().tolist())\n", " val_actual_labels.extend(labels.cpu().tolist())\n", "\n", " val_accuracy = accuracy_score(val_actual_labels, val_predictions)\n", " val_report = classification_report(val_actual_labels, 
val_predictions, labels=[0, 1, 2], target_names=['neutral', 'positive', 'negative'])\n", "\n", "    print(f\"Epoch {epoch + 1}/{num_epochs}\")\n", "    print(f\"Validation Accuracy: {val_accuracy}\")\n", "    print(f\"Classification Report:\\n{val_report}\")\n", "\n", "# Save the trained model if needed\n", "torch.save(model.state_dict(), 'twitter_sentiment_model.pth')\n", "\n", "# Function to predict sentiment for a given text\n", "def predict_sentiment(text, model, tokenizer, device, max_length=128):\n", "    inputs = tokenizer(\n", "        text,\n", "        truncation=True,\n", "        padding=True,\n", "        max_length=max_length,\n", "        return_tensors=\"pt\"\n", "    )\n", "    input_ids = inputs[\"input_ids\"].to(device)\n", "    attention_mask = inputs[\"attention_mask\"].to(device)\n", "\n", "    with torch.no_grad():\n", "        logits = model(input_ids=input_ids, attention_mask=attention_mask)\n", "        predicted_class = torch.argmax(logits, dim=1).item()\n", "\n", "    if predicted_class == 1:\n", "        return \"positive\"\n", "    elif predicted_class == 0:\n", "        return \"neutral\"\n", "    else:\n", "        return \"negative\"\n", "\n", "# Example usage of predict_sentiment\n", "'''text_to_predict = \"I love this product!\"\n", "predicted_sentiment = predict_sentiment(text_to_predict, model, tokenizer, device)\n", "print(f\"Predicted Sentiment: {predicted_sentiment}\")'''\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "g_OS-neQpFOo" }, "outputs": [], "source": [ "torch.save(model.state_dict(), 'twitter_sentiment_model.pth')\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "2pzvsdiZN5Ee", "outputId": "36969ce1-0023-4f5f-c0e7-207bd04dcfac" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Predicted Sentiment: negative\n" ] } ], "source": [ "# Example usage of predict_sentiment\n", "text_to_predict = \"I love this product!\"\n", "predicted_sentiment = predict_sentiment(text_to_predict, model, tokenizer, device)\n", "print(f\"Predicted Sentiment: {predicted_sentiment}\")" ] },
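{ "cell_type": "markdown", "metadata": {}, "source": [ "\"I love this product!\" coming back as `negative` is the footprint of the label-mapping bug noted above: with string comparisons against the numeric `category` column, every training row was labelled 2, so the model can only ever predict \"negative\". A quick look at the label distribution makes this visible (a small check, assuming `train_labels` from the cell above):" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from collections import Counter\n", "print(Counter(train_labels))  # a healthy mapping shows all three classes, not a single key\n" ] },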
{ "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "R_at4FydN_6B", "outputId": "3ac0cb09-3d3c-446d-ddbb-81481d59bcf9" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", "/usr/local/lib/python3.10/dist-packages/transformers/optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", " warnings.warn(\n", "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n", "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n", "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n", "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n", "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n", "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/2\n", "Validation Accuracy: 1.0\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " neutral 0.00 0.00 0.00 0\n", " positive 0.00 0.00 0.00 0\n", " negative 1.00 1.00 1.00 600\n", "\n", " micro avg 1.00 1.00 1.00 600\n", " macro avg 0.33 0.33 0.33 600\n", "weighted avg 1.00 1.00 1.00 600\n", "\n", "Epoch 2/2\n", "Validation Accuracy: 1.0\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " neutral 0.00 0.00 0.00 0\n", " positive 0.00 0.00 0.00 0\n", " negative 1.00 1.00 1.00 600\n", "\n", " micro avg 1.00 1.00 1.00 600\n", " macro avg 0.33 0.33 0.33 600\n", "weighted avg 1.00 1.00 1.00 600\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n", "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n", "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n", "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. 
Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n", "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n", "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n" ] } ], "source": [ "import torch\n", "import torch.nn as nn\n",
"# BertForSequenceClassification and Dataset were missing from the original imports\n", "from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup\n", "from torch.utils.data import DataLoader, Dataset\n", "from sklearn.metrics import accuracy_score, classification_report\n", "from sklearn.model_selection import train_test_split\n", "import pandas as pd\n", "\n",
"# Load the Twitter dataset\n", "data_file = \"/content/drive/MyDrive/Colab Notebooks/Twitter_Data.csv\"\n", "df = pd.read_csv(data_file)\n", "\n",
"# Define a function to load and preprocess the Twitter data\n", "def load_twitter_data(data_file, num_samples=None):\n", "    df = pd.read_csv(data_file)\n", "    if num_samples is not None:\n", "        df = df.sample(num_samples)  # Sample a subset of the data if needed\n", "\n", "    texts = df['clean_text'].tolist()\n", "\n", "    labels = []\n", "    for category in df['category'].tolist():\n", "        # The category column may hold numeric codes (-1/0/1) rather than strings;\n", "        # the original string comparison then routed every row to label 2, which\n", "        # matches the all-negative validation report below. Handle both encodings.\n", "        if category == 1 or category == \"positive\":\n", "            labels.append(1)\n", "        elif category == 0 or category == \"neutral\":\n", "            labels.append(0)\n", "        else:\n", "            labels.append(2)  # negative sentiment\n", "\n", "    return texts, labels\n", "\n",
"# Split the dataset into train and validation sets\n", "texts, labels = load_twitter_data(data_file, num_samples=3000)\n", "train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)\n",
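"\n", "# Hedged sanity check (added; not in the original run): the report below shows\n", "# support for only one class, so confirm the label distribution before training.\n", "from collections import Counter\n", "print('train label counts:', Counter(train_labels))\n", "print('val label counts:  ', Counter(val_labels))\n",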
"\n", "# Define BERT-specific parameters\n", "bert_model_name = 'bert-base-uncased'\n", "max_length = 128\n", "batch_size = 4\n", "num_epochs = 2\n", "learning_rate = 2e-5\n", "\n",
"# Load the BERT tokenizer\n", "tokenizer = BertTokenizer.from_pretrained(bert_model_name)\n", "\n",
"# Define the Twitter classification dataset\n", "class TwitterClassificationDataset(Dataset):\n", "    def __init__(self, texts, labels, tokenizer, max_length):\n", "        self.texts = texts\n", "        self.labels = labels\n", "        self.tokenizer = tokenizer\n", "        self.max_length = max_length\n", "\n", "    def __len__(self):\n", "        return len(self.texts)\n", "\n", "    def __getitem__(self, idx):\n", "        text = self.texts[idx]\n", "        label = self.labels[idx]\n", "        encoding = self.tokenizer(text, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)\n", "        return {'input_ids': encoding['input_ids'].flatten(), 'attention_mask': encoding['attention_mask'].flatten(), 'label': torch.tensor(label)}\n", "\n",
"# Create dataloaders for training and validation\n", "train_dataset = TwitterClassificationDataset(train_texts, train_labels, tokenizer, max_length)\n", "val_dataset = TwitterClassificationDataset(val_texts, val_labels, tokenizer, max_length)\n", "\n", "train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)\n", "val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)\n", "\n",
"# Define the BERT-based sentiment classifier\n", "class BERTTwitterClassifier(nn.Module):\n", "    def __init__(self, bert_model_name, num_classes):\n", "        super(BERTTwitterClassifier, self).__init__()\n", "        self.bert = BertForSequenceClassification.from_pretrained(bert_model_name, num_labels=num_classes)\n", "\n", "    def forward(self, input_ids, attention_mask):\n", "        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)\n", "        logits = outputs.logits\n", "        return logits\n", "\n",
"# Create the model and move it to the GPU if available\n", "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", "\n",
"# The classification head outputs 3 classes (0=neutral, 1=positive, 2=negative)\n", "model = BERTTwitterClassifier(bert_model_name, num_classes=3).to(device)\n", "\n",
"# Define the optimizer and learning rate scheduler\n", "# (transformers' AdamW is deprecated -- see the FutureWarning in the output;\n", "# torch.optim.AdamW is the recommended replacement)\n", "optimizer = AdamW(model.parameters(), lr=learning_rate)\n", "total_steps = len(train_dataloader) * num_epochs\n", "scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)\n", "\n",
"# Training loop\n", "for epoch in range(num_epochs):\n", "    model.train()\n", "    for batch in train_dataloader:\n", "        optimizer.zero_grad()\n", "        input_ids = batch['input_ids'].to(device)\n", "        attention_mask = batch['attention_mask'].to(device)\n", "        labels = batch['label'].to(device)\n", "        logits = model(input_ids=input_ids, attention_mask=attention_mask)\n", "\n", "        # Use CrossEntropyLoss for multi-class classification\n", "        loss = nn.CrossEntropyLoss()(logits, labels)\n", "        loss.backward()\n", "        optimizer.step()\n", "        scheduler.step()\n", "\n",
"    model.eval()\n", "    val_predictions = []\n", "    val_actual_labels = []\n", "\n", "    with torch.no_grad():\n", "        for batch in val_dataloader:\n", "            input_ids = batch['input_ids'].to(device)\n", "            attention_mask = batch['attention_mask'].to(device)\n", "            labels = batch['label'].to(device)\n", "\n", "            logits = model(input_ids=input_ids, attention_mask=attention_mask)\n", "            _, preds = torch.max(logits, dim=1)\n", "            val_predictions.extend(preds.cpu().tolist())\n", "            val_actual_labels.extend(labels.cpu().tolist())\n", "\n",
"    val_accuracy = accuracy_score(val_actual_labels, val_predictions)\n", "    # Passing zero_division=0 here would silence the UndefinedMetricWarnings\n", "    # emitted when a class has no predicted or true samples\n", "    val_report = classification_report(val_actual_labels, val_predictions, labels=[0, 1, 2], target_names=['neutral', 'positive', 'negative'])\n", "\n", "    print(f\"Epoch {epoch + 1}/{num_epochs}\")\n", "    print(f\"Validation Accuracy: {val_accuracy}\")\n", "    print(f\"Classification Report:\\n{val_report}\")\n" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "0zWdn7lDPjJd" }, "outputs": [], "source": [ "class TwitterClassificationDataset(Dataset):\n", "    def __init__(self, texts, labels, tokenizer, max_length):\n", "        self.texts = texts\n", "        self.labels = labels\n", "        self.tokenizer = tokenizer\n", "        self.max_length = max_length\n", "\n", "    def __len__(self):\n", "        return len(self.texts)\n", "\n", "    def __getitem__(self, idx):\n", "        text = self.texts[idx]\n", "        label = self.labels[idx]\n", "\n", "        # Tokenize the single text input\n", "        encoding = self.tokenizer(\n", "            text,\n", "            return_tensors='pt',\n", "            max_length=self.max_length,\n", "            padding='max_length',\n", "            truncation=True\n", "        )\n", "\n", "        return {'input_ids': encoding['input_ids'].flatten(), 'attention_mask': encoding['attention_mask'].flatten(), 'label': torch.tensor(label)}\n" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "r7TkTK02QJp3", "outputId": 
"fbfd095e-2bd9-47ef-f303-0c631e3cd1d8" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Predicted Sentiment: negative\n" ] } ], "source": [ "# Example usage of predict_sentiment\n", "text_to_predict = \"The weather is cold.\"\n", "predicted_sentiment = predict_sentiment(text_to_predict, model, tokenizer, device)\n", "print(f\"Predicted Sentiment: {predicted_sentiment}\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "p5-Nn8jQRZjX", "outputId": "ea1b5f71-7758-488d-c97c-de3afa164835" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: nbconvert in /usr/local/lib/python3.10/dist-packages (6.5.4)\n", "Requirement already satisfied: lxml in /usr/local/lib/python3.10/dist-packages (from nbconvert) (4.9.3)\n", "Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (4.11.2)\n", "Requirement already satisfied: bleach in /usr/local/lib/python3.10/dist-packages (from nbconvert) (6.1.0)\n", "Requirement already satisfied: defusedxml in /usr/local/lib/python3.10/dist-packages (from nbconvert) (0.7.1)\n", "Requirement already satisfied: entrypoints>=0.2.2 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (0.4)\n", "Requirement already satisfied: jinja2>=3.0 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (3.1.2)\n", "Requirement already satisfied: jupyter-core>=4.7 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (5.5.0)\n", "Requirement already satisfied: jupyterlab-pygments in /usr/local/lib/python3.10/dist-packages (from nbconvert) (0.2.2)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (2.1.3)\n", "Requirement already satisfied: mistune<2,>=0.8.1 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (0.8.4)\n", "Requirement already satisfied: nbclient>=0.5.0 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (0.9.0)\n", "Requirement already satisfied: nbformat>=5.1 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (5.9.2)\n", "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from nbconvert) (23.2)\n", "Requirement already satisfied: pandocfilters>=1.4.1 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (1.5.0)\n", "Requirement already satisfied: pygments>=2.4.1 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (2.16.1)\n", "Requirement already satisfied: tinycss2 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (1.2.1)\n", "Requirement already satisfied: traitlets>=5.0 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (5.7.1)\n", "Requirement already satisfied: platformdirs>=2.5 in /usr/local/lib/python3.10/dist-packages (from jupyter-core>=4.7->nbconvert) (4.0.0)\n", "Requirement already satisfied: jupyter-client>=6.1.12 in /usr/local/lib/python3.10/dist-packages (from nbclient>=0.5.0->nbconvert) (6.1.12)\n", "Requirement already satisfied: fastjsonschema in /usr/local/lib/python3.10/dist-packages (from nbformat>=5.1->nbconvert) (2.19.0)\n", "Requirement already satisfied: jsonschema>=2.6 in /usr/local/lib/python3.10/dist-packages (from nbformat>=5.1->nbconvert) (4.19.2)\n", "Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.10/dist-packages (from beautifulsoup4->nbconvert) (2.5)\n", "Requirement already satisfied: six>=1.9.0 in /usr/local/lib/python3.10/dist-packages (from 
bleach->nbconvert) (1.16.0)\n", "Requirement already satisfied: webencodings in /usr/local/lib/python3.10/dist-packages (from bleach->nbconvert) (0.5.1)\n", "Requirement already satisfied: attrs>=22.2.0 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=2.6->nbformat>=5.1->nbconvert) (23.1.0)\n", "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=2.6->nbformat>=5.1->nbconvert) (2023.11.1)\n", "Requirement already satisfied: referencing>=0.28.4 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=2.6->nbformat>=5.1->nbconvert) (0.31.0)\n", "Requirement already satisfied: rpds-py>=0.7.1 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=2.6->nbformat>=5.1->nbconvert) (0.12.0)\n", "Requirement already satisfied: pyzmq>=13 in /usr/local/lib/python3.10/dist-packages (from jupyter-client>=6.1.12->nbclient>=0.5.0->nbconvert) (23.2.1)\n", "Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.10/dist-packages (from jupyter-client>=6.1.12->nbclient>=0.5.0->nbconvert) (2.8.2)\n", "Requirement already satisfied: tornado>=4.1 in /usr/local/lib/python3.10/dist-packages (from jupyter-client>=6.1.12->nbclient>=0.5.0->nbconvert) (6.3.2)\n" ] } ], "source": [ "!pip install nbconvert\n" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "5jVto0CSTK9d" }, "outputs": [], "source": [ "from nbconvert import HTMLExporter\n", "import nbformat\n", "\n", "# Load the notebook\n", "notebook_path = '/content/drive/MyDrive/Colab Notebooks/sentiment_analysis_Twitter.ipynb'\n", "with open(notebook_path) as f:\n", "    notebook_content = nbformat.read(f, as_version=4)\n", "\n", "# Convert the notebook to HTML using the default template\n", "html_exporter = HTMLExporter()\n", "(body, resources) = html_exporter.from_notebook_node(notebook_content)\n", "\n", "# Save the HTML content to a file\n", "html_output_path = 'notebook.html'\n", "with open(html_output_path, 'w', encoding='utf-8') as f:\n", "    f.write(body)\n" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "wnISl5lhHPX9", "outputId": "a02ec205-2cee-41b4-ea06-4eca7c24d080" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[NbConvertApp] WARNING | pattern 'sentiment_analysis_Twitter.ipynb' matched no files\n", "This application is used to convert notebook files (*.ipynb)\n", " to various other formats.\n", "\n", " WARNING: THE COMMANDLINE INTERFACE MAY CHANGE IN FUTURE RELEASES.\n", "\n", "Options\n", "=======\n", "The options below are convenience aliases to configurable class-options,\n", "as listed in the \"Equivalent to\" description-line of the aliases.\n", "To see all configurable class-options for some <app>, use:\n", " <app> --help-all\n", "\n", "--debug\n", " set log level to logging.DEBUG (maximize logging output)\n", " Equivalent to: [--Application.log_level=10]\n", "--show-config\n", " Show the application's configuration (human-readable format)\n", " Equivalent to: [--Application.show_config=True]\n", "--show-config-json\n", " Show the application's configuration (json format)\n", " Equivalent to: [--Application.show_config_json=True]\n", "--generate-config\n", " generate default config file\n", " Equivalent to: [--JupyterApp.generate_config=True]\n", "-y\n", " Answer yes to any questions instead of prompting.\n", " Equivalent to: [--JupyterApp.answer_yes=True]\n", "--execute\n", " Execute the notebook prior to export.\n", " 
Equivalent to: [--ExecutePreprocessor.enabled=True]\n", "--allow-errors\n", " Continue notebook execution even if one of the cells throws an error and include the error message in the cell output (the default behaviour is to abort conversion). This flag is only relevant if '--execute' was specified, too.\n", " Equivalent to: [--ExecutePreprocessor.allow_errors=True]\n", "--stdin\n", " read a single notebook file from stdin. Write the resulting notebook with default basename 'notebook.*'\n", " Equivalent to: [--NbConvertApp.from_stdin=True]\n", "--stdout\n", " Write notebook output to stdout instead of files.\n", " Equivalent to: [--NbConvertApp.writer_class=StdoutWriter]\n", "--inplace\n", " Run nbconvert in place, overwriting the existing notebook (only\n", " relevant when converting to notebook format)\n", " Equivalent to: [--NbConvertApp.use_output_suffix=False --NbConvertApp.export_format=notebook --FilesWriter.build_directory=]\n", "--clear-output\n", " Clear output of current file and save in place,\n", " overwriting the existing notebook.\n", " Equivalent to: [--NbConvertApp.use_output_suffix=False --NbConvertApp.export_format=notebook --FilesWriter.build_directory= --ClearOutputPreprocessor.enabled=True]\n", "--no-prompt\n", " Exclude input and output prompts from converted document.\n", " Equivalent to: [--TemplateExporter.exclude_input_prompt=True --TemplateExporter.exclude_output_prompt=True]\n", "--no-input\n", " Exclude input cells and output prompts from converted document.\n", " This mode is ideal for generating code-free reports.\n", " Equivalent to: [--TemplateExporter.exclude_output_prompt=True --TemplateExporter.exclude_input=True --TemplateExporter.exclude_input_prompt=True]\n", "--allow-chromium-download\n", " Whether to allow downloading chromium if no suitable version is found on the system.\n", " Equivalent to: [--WebPDFExporter.allow_chromium_download=True]\n", "--disable-chromium-sandbox\n", " Disable chromium security sandbox when converting to PDF..\n", " Equivalent to: [--WebPDFExporter.disable_sandbox=True]\n", "--show-input\n", " Shows code input. This flag is only useful for dejavu users.\n", " Equivalent to: [--TemplateExporter.exclude_input=False]\n", "--embed-images\n", " Embed the images as base64 dataurls in the output. 
This flag is only useful for the HTML/WebPDF/Slides exports.\n", " Equivalent to: [--HTMLExporter.embed_images=True]\n", "--sanitize-html\n", " Whether the HTML in Markdown cells and cell outputs should be sanitized..\n", " Equivalent to: [--HTMLExporter.sanitize_html=True]\n", "--log-level=<Enum>\n", " Set the log level by value or name.\n", " Choices: any of [0, 10, 20, 30, 40, 50, 'DEBUG', 'INFO', 'WARN', 'ERROR', 'CRITICAL']\n", " Default: 30\n", " Equivalent to: [--Application.log_level]\n", "--config=<Unicode>\n", " Full path of a config file.\n", " Default: ''\n", " Equivalent to: [--JupyterApp.config_file]\n", "--to=<Unicode>\n", " The export format to be used, either one of the built-in formats\n", " ['asciidoc', 'custom', 'html', 'latex', 'markdown', 'notebook', 'pdf', 'python', 'rst', 'script', 'slides', 'webpdf']\n", " or a dotted object name that represents the import path for an\n", " ``Exporter`` class\n", " Default: ''\n", " Equivalent to: [--NbConvertApp.export_format]\n", "--template=<Unicode>\n", " Name of the template to use\n", " Default: ''\n", " Equivalent to: [--TemplateExporter.template_name]\n", "--template-file=<Unicode>\n", " Name of the template file to use\n", " Default: None\n", " Equivalent to: [--TemplateExporter.template_file]\n", "--theme=<Unicode>\n", " Template specific theme(e.g. the name of a JupyterLab CSS theme distributed\n", " as prebuilt extension for the lab template)\n", " Default: 'light'\n", " Equivalent to: [--HTMLExporter.theme]\n", "--sanitize_html=<Bool>\n", " Whether the HTML in Markdown cells and cell outputs should be sanitized.This\n", " should be set to True by nbviewer or similar tools.\n", " Default: False\n", " Equivalent to: [--HTMLExporter.sanitize_html]\n", "--writer=<DottedObjectName>\n", " Writer class used to write the\n", " results of the conversion\n", " Default: 'FilesWriter'\n", " Equivalent to: [--NbConvertApp.writer_class]\n", "--post=<DottedOrNone>\n", " PostProcessor class used to write the\n", " results of the conversion\n", " Default: ''\n", " Equivalent to: [--NbConvertApp.postprocessor_class]\n", "--output=<Unicode>\n", " overwrite base name use for output files.\n", " can only be used when converting one notebook at a time.\n", " Default: ''\n", " Equivalent to: [--NbConvertApp.output_base]\n", "--output-dir=<Unicode>\n", " Directory to write output(s) to. Defaults\n", " to output to the directory of each notebook. To recover\n", " previous default behaviour (outputting to the current\n", " working directory) use . 
as the flag value.\n", " Default: ''\n", " Equivalent to: [--FilesWriter.build_directory]\n", "--reveal-prefix=<Unicode>\n", " The URL prefix for reveal.js (version 3.x).\n", " This defaults to the reveal CDN, but can be any url pointing to a copy\n", " of reveal.js.\n", " For speaker notes to work, this must be a relative path to a local\n", " copy of reveal.js: e.g., \"reveal.js\".\n", " If a relative path is given, it must be a subdirectory of the\n", " current directory (from which the server is run).\n", " See the usage documentation\n", " (https://nbconvert.readthedocs.io/en/latest/usage.html#reveal-js-html-slideshow)\n", " for more details.\n", " Default: ''\n", " Equivalent to: [--SlidesExporter.reveal_url_prefix]\n", "--nbformat=<Enum>\n", " The nbformat version to write.\n", " Use this to downgrade notebooks.\n", " Choices: any of [1, 2, 3, 4]\n", " Default: 4\n", " Equivalent to: [--NotebookExporter.nbformat_version]\n", "\n", "Examples\n", "--------\n", "\n", " The simplest way to use nbconvert is\n", "\n", " > jupyter nbconvert mynotebook.ipynb --to html\n", "\n", " Options include ['asciidoc', 'custom', 'html', 'latex', 'markdown', 'notebook', 'pdf', 'python', 'rst', 'script', 'slides', 'webpdf'].\n", "\n", " > jupyter nbconvert --to latex mynotebook.ipynb\n", "\n", " Both HTML and LaTeX support multiple output templates. LaTeX includes\n", " 'base', 'article' and 'report'. HTML includes 'basic', 'lab' and\n", " 'classic'. You can specify the flavor of the format used.\n", "\n", " > jupyter nbconvert --to html --template lab mynotebook.ipynb\n", "\n", " You can also pipe the output to stdout, rather than a file\n", "\n", " > jupyter nbconvert mynotebook.ipynb --stdout\n", "\n", " PDF is generated via latex\n", "\n", " > jupyter nbconvert mynotebook.ipynb --to pdf\n", "\n", " You can get (and serve) a Reveal.js-powered slideshow\n", "\n", " > jupyter nbconvert myslides.ipynb --to slides --post serve\n", "\n", " Multiple notebooks can be given at the command line in a couple of\n", " different ways:\n", "\n", " > jupyter nbconvert notebook*.ipynb\n", " > jupyter nbconvert notebook1.ipynb notebook2.ipynb\n", "\n", " or you can specify the notebooks list in a config file, containing::\n", "\n", " c.NbConvertApp.notebooks = [\"my_notebook.ipynb\"]\n", "\n", " > jupyter nbconvert --config mycfg.py\n", "\n", "To see all available configurables, use `--help-all`.\n", "\n" ] } ], "source": [ "# The recorded run above failed ('matched no files') because the notebook is not\n", "# in the working directory; point nbconvert at the full path when running in Colab.\n", "!jupyter nbconvert --to html sentiment_analysis_Twitter.ipynb" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "w3y4HAp1IVe7" }, "outputs": [], "source": [] } ], "metadata": { "colab": { "provenance": [] }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.0" } }, "nbformat": 4, "nbformat_minor": 0 }