{ "cells": [ { "cell_type": "markdown", "source": [ " **Midterm: Neural Network-Based Language\n", "Model for Next Token Prediction**\n" ], "metadata": { "id": "hOqh8ZFysD6G" } }, { "cell_type": "code", "source": [ "import torch\n", "import torch.nn as nn\n", "import torch.optim as optim\n", "from torch.utils.data import Dataset, DataLoader\n", "import numpy as np\n", "import re\n", "from collections import Counter\n", "from sklearn.model_selection import train_test_split\n" ], "metadata": { "id": "14bhPtpVtqk5" }, "execution_count": 16, "outputs": [] }, { "cell_type": "markdown", "source": [ "**Load Icelandic and English text files**" ], "metadata": { "id": "6mDLDJqTUtS1" } }, { "cell_type": "code", "source": [ "# Load Icelandic and English text files\n", "with open('/content/Icelandic_sampled.txt', 'r', encoding='utf-8') as f:\n", " icelandic_text = f.read()\n", "\n", "with open('/content/alpaca_sampled.txt', 'r', encoding='utf-8') as f:\n", " english_text = f.read()\n", "\n", "print(\"Datasets loaded successfully.\")\n", "\n", "# Preprocessing function to clean text\n", "def preprocess_text(text):\n", " text = text.lower()\n", " text = re.sub(r'[^a-zA-ZÍÚÁÉÓÖÞÆÉíúáéóöþæ ]', '', text) # Retain Icelandic letters\n", " return text\n", "\n", "# Apply preprocessing to both datasets\n", "english_text = preprocess_text(english_text)\n", "icelandic_text = preprocess_text(icelandic_text)\n", "\n", "print(f\"Sample of English Text: {english_text[:100]}\")\n", "print(f\"Sample of Icelandic Text: {icelandic_text[:100]}\")\n", "\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "43x2PjLihFr2", "outputId": "286106bb-9e22-4afc-dba8-c44fa351d36c" }, "execution_count": 2, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Datasets loaded successfully.\n", "Sample of English Text: instruction explain newtons third law of motion input output newtons third law of motion states tha\n", "Sample of Icelandic Text: instruction kindly translate the given sentence into icelandic input write a comment praising the gi\n" ] } ] }, { "cell_type": "markdown", "source": [ "** Tokenization function**" ], "metadata": { "id": "gtpfe1seU1IP" } }, { "cell_type": "code", "source": [ "# Tokenization function\n", "def tokenize(text):\n", " return text.split()\n", "\n", "english_tokens = tokenize(english_text)\n", "icelandic_tokens = tokenize(icelandic_text)\n", "\n", "print(f\"English tokens: {len(english_tokens)}\")\n", "print(f\"Icelandic tokens: {len(icelandic_tokens)}\")\n", "\n", "# Build vocabulary\n", "def build_vocab(tokens):\n", " vocab = Counter(tokens)\n", " vocab = {word: i for i, (word, _) in enumerate(vocab.items())}\n", " return vocab\n", "\n", "# Create vocabularies for both languages\n", "english_vocab = build_vocab(english_tokens)\n", "icelandic_vocab = build_vocab(icelandic_tokens)\n", "\n", "print(f\"English Vocabulary Size: {len(english_vocab)}\")\n", "print(f\"Icelandic Vocabulary Size: {len(icelandic_vocab)}\")\n", "\n", "# Convert tokens to indices\n", "english_data = [english_vocab[word] for word in english_tokens]\n", "icelandic_data = [icelandic_vocab[word] for word in icelandic_tokens]\n", "\n", "# Combine datasets\n", "combined_data = english_data + icelandic_data\n", "print(f\"Combined dataset size: {len(combined_data)}\")\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "3LnwrEBKhFuT", "outputId": "7c931294-0b22-45b1-927f-4013a97a1b53" }, "execution_count": 3, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "English tokens: 32477\n", "Icelandic tokens: 36422\n", "English Vocabulary Size: 6221\n", "Icelandic Vocabulary Size: 9601\n", "Combined dataset size: 68899\n" ] } ] }, { "cell_type": "markdown", "source": [ "**Dataset class for sequence prediction**" ], "metadata": { "id": "wsGscGEvU7bM" } }, { "cell_type": "code", "source": [ "# Dataset class for sequence prediction\n", "class TextDataset(Dataset):\n", " def __init__(self, data, sequence_length):\n", " self.data = data\n", " self.sequence_length = sequence_length\n", "\n", " def __len__(self):\n", " return len(self.data) - self.sequence_length\n", "\n", " def __getitem__(self, idx):\n", " return (torch.tensor(self.data[idx:idx + self.sequence_length]),\n", " torch.tensor(self.data[idx + self.sequence_length]))\n", "\n", "# Sequence length for training\n", "sequence_length = 5\n", "\n", "# Create the combined dataset\n", "combined_dataset = TextDataset(combined_data, sequence_length)\n", "print(f\"Dataset length: {len(combined_dataset)}\")\n", "\n", "# Split into training and validation sets\n", "train_data, val_data = train_test_split(combined_dataset, test_size=0.1)\n", "print(f\"Training samples: {len(train_data)}, Validation samples: {len(val_data)}\")\n", "\n", "# Create DataLoaders for training and validation\n", "batch_size = 64\n", "train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)\n", "val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)\n", "\n", "print(f\"Batch size: {batch_size}\")\n", "print(f\"Training batches: {len(train_loader)}, Validation batches: {len(val_loader)}\")\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "6eyfZ2klhFw1", "outputId": "058f0dba-c5d3-48b4-edd0-95c12a4521aa" }, "execution_count": 4, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Dataset length: 68894\n", "Training samples: 62004, Validation samples: 6890\n", "Batch size: 64\n", "Training batches: 969, Validation batches: 108\n" ] } ] }, { "cell_type": "markdown", "source": [ "**Loss function and optimizer**" ], "metadata": { "id": "cYim4C6HVM6p" } }, { "cell_type": "code", "source": [ "# Loss function and optimizer\n", "criterion = nn.CrossEntropyLoss()\n", "optimizer = optim.Adam(combined_model.parameters(), lr=0.001)\n", "\n", "# Function to train the model and validate\n", "def train_model(model, train_loader, val_loader, optimizer, num_epochs, checkpoint_path):\n", " model.train()\n", " train_losses, val_losses = [], []\n", "\n", " for epoch in range(num_epochs):\n", " epoch_train_loss = 0\n", " for inputs, targets in train_loader:\n", " optimizer.zero_grad()\n", " outputs = model(inputs)\n", " loss = criterion(outputs, targets)\n", " loss.backward()\n", " optimizer.step()\n", " epoch_train_loss += loss.item()\n", "\n", " # Validation step\n", " model.eval()\n", " val_loss = 0\n", " with torch.no_grad():\n", " for inputs, targets in val_loader:\n", " outputs = model(inputs)\n", " loss = criterion(outputs, targets)\n", " val_loss += loss.item()\n", "\n", " train_losses.append(epoch_train_loss / len(train_loader))\n", " val_losses.append(val_loss / len(val_loader))\n", "\n", " # Save checkpoint for every epoch\n", " torch.save(model.state_dict(), f'{checkpoint_path}_epoch{epoch+1}.pth')\n", "\n", " print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_losses[-1]:.4f}, Val Loss: {val_losses[-1]:.4f}')\n", "\n", " return train_losses, val_losses\n", "\n", " import pandas as pd\n", "import numpy as np\n", "\n", "# Train the model\n", "num_epochs = 10\n", "train_losses, val_losses = train_model(combined_model, train_loader, val_loader, optimizer, num_epochs, 'combined_model_checkpoint')\n", "\n", "# Save training and validation losses to a CSV file\n", "losses_df = pd.DataFrame({\n", " 'Epoch': range(1, num_epochs + 1),\n", " 'Train_Loss': train_losses,\n", " 'Val_Loss': val_losses\n", "})\n", "losses_df.to_csv('training_validation_losses.csv', index=False)\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ESRw-m00hFz_", "outputId": "9a489cb6-90b8-47c0-e647-799213667927" }, "execution_count": 7, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Epoch 1/10, Train Loss: 7.6275, Val Loss: 7.1583\n", "Epoch 2/10, Train Loss: 6.6923, Val Loss: 6.9158\n", "Epoch 3/10, Train Loss: 5.9922, Val Loss: 6.8872\n", "Epoch 4/10, Train Loss: 5.1496, Val Loss: 7.0510\n", "Epoch 5/10, Train Loss: 4.2123, Val Loss: 7.2246\n", "Epoch 6/10, Train Loss: 3.2722, Val Loss: 7.5333\n", "Epoch 7/10, Train Loss: 2.4096, Val Loss: 7.8671\n", "Epoch 8/10, Train Loss: 1.6956, Val Loss: 8.1713\n", "Epoch 9/10, Train Loss: 1.1467, Val Loss: 8.4934\n", "Epoch 10/10, Train Loss: 0.7460, Val Loss: 8.7863\n" ] } ] }, { "cell_type": "markdown", "source": [ "**final model**" ], "metadata": { "id": "Xu12K0UbVR_g" } }, { "cell_type": "code", "source": [ "# Save the final model\n", "torch.save(combined_model.state_dict(), 'combined_model.pth')" ], "metadata": { "id": "lg4Fzwker0IQ" }, "execution_count": 15, "outputs": [] }, { "cell_type": "code", "source": [ "\n", "# Plot losses\n", "import matplotlib.pyplot as plt\n", "\n", "def plot_losses(train_losses, val_losses, title):\n", " plt.plot(train_losses, label='Training Loss')\n", " plt.plot(val_losses, label='Validation Loss')\n", " plt.title(title)\n", " plt.xlabel('Epoch')\n", " plt.ylabel('Loss')\n", " plt.legend()\n", " plt.savefig('model_loss.png')\n", " plt.show()\n", "\n", "# Plot combined model losses\n", "plot_losses(train_losses, val_losses, 'Combined Model Loss')\n", "\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 472 }, "id": "cGOHhDm8hF2z", "outputId": "69498d44-8384-4b62-85bd-c3a04eb68e9c" }, "execution_count": 14, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": {} } ] }, { "cell_type": "markdown", "source": [ "**perplexity based on the validation set**" ], "metadata": { "id": "Zs9yNsPpVX39" } }, { "cell_type": "code", "source": [ "# Calculate perplexity based on the validation set\n", "def calculate_perplexity(model, val_loader):\n", " model.eval()\n", " total_loss = 0\n", " total_words = 0\n", "\n", " with torch.no_grad():\n", " for inputs, targets in val_loader:\n", " outputs = model(inputs)\n", " loss = criterion(outputs, targets)\n", " total_loss += loss.item()\n", " total_words += targets.size(0)\n", "\n", " avg_loss = total_loss / len(val_loader)\n", " perplexity = np.exp(avg_loss)\n", "\n", " return perplexity\n", "\n", "# Perplexity calculation for the combined model\n", "combined_perplexity = calculate_perplexity(combined_model, val_loader)\n", "print(f'Combined Model Perplexity: {combined_perplexity:.2f}')" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "j29uMcyKjKge", "outputId": "21b4437a-54e3-445a-f7c5-7017882d6645" }, "execution_count": 9, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Combined Model Perplexity: 6543.78\n" ] } ] }, { "cell_type": "code", "source": [ "import numpy as np\n", "import torch\n", "import matplotlib.pyplot as plt\n", "\n", "# Calculate perplexity based on the validation set\n", "def calculate_perplexity(model, val_loader):\n", " model.eval()\n", " perplexity_values = [] # Store perplexity for each batch\n", " total_loss = 0\n", "\n", " with torch.no_grad():\n", " for inputs, targets in val_loader:\n", " outputs = model(inputs)\n", " loss = criterion(outputs, targets)\n", " total_loss += loss.item()\n", " avg_loss = total_loss / (len(perplexity_values) + 1) # Average loss after each batch\n", " perplexity = np.exp(avg_loss)\n", " perplexity_values.append(perplexity) # Append current perplexity\n", "\n", " return perplexity_values\n", "\n", "# Perplexity calculation for the combined model\n", "combined_perplexity = calculate_perplexity(combined_model, val_loader)\n", "\n", "# Plotting perplexity values\n", "plt.figure(figsize=(10, 6))\n", "plt.plot(combined_perplexity, label='Model Perplexity', marker='o')\n", "plt.title('Perplexity over Validation Set')\n", "plt.xlabel('Batch Number')\n", "plt.ylabel('Perplexity')\n", "plt.yscale('log') # Log scale can help visualize perplexity better if the values vary widely\n", "plt.legend()\n", "plt.grid()\n", "plt.show()\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 504 }, "id": "a_z_MqsySii7", "outputId": "5dbed4bb-5977-4570-9ba7-e9846f99d31e" }, "execution_count": 24, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": {} } ] }, { "cell_type": "markdown", "source": [ "**Generate text**" ], "metadata": { "id": "07kkXXJmViX8" } }, { "cell_type": "code", "source": [ "import torch.nn.functional as F\n", "\n", "# Ensure '' token is in the vocabularies\n", "if '' not in english_vocab:\n", " english_vocab[''] = len(english_vocab)\n", "if '' not in icelandic_vocab:\n", " icelandic_vocab[''] = len(icelandic_vocab)\n", "\n", "# Reverse vocab dictionaries\n", "english_reverse_vocab = {idx: word for word, idx in english_vocab.items()}\n", "icelandic_reverse_vocab = {idx: word for word, idx in icelandic_vocab.items()}\n", "\n", "# Function to generate text\n", "def generate_text(model, vocab, reverse_vocab, seed_text, max_length=50):\n", " model.eval()\n", "\n", " # Tokenize and convert seed text to indices\n", " seed_tokens = [vocab.get(word, vocab['']) for word in seed_text.split()]\n", " input_seq = torch.tensor(seed_tokens).unsqueeze(0) # Add batch dimension\n", "\n", " generated_text = seed_text.split()\n", "\n", " with torch.no_grad():\n", " for _ in range(max_length):\n", " output = model(input_seq) # Forward pass\n", "\n", " # Check the shape of the output\n", " if len(output.shape) == 2:\n", " # Handle (batch_size, vocab_size)\n", " predictions = F.softmax(output, dim=-1)\n", " else:\n", " # Handle (batch_size, sequence_length, vocab_size)\n", " predictions = F.softmax(output[:, -1, :], dim=-1)\n", "\n", " next_token_idx = torch.argmax(predictions, dim=-1).item()\n", "\n", " # Append the predicted token\n", " next_token_word = reverse_vocab.get(next_token_idx, '')\n", " generated_text.append(next_token_word)\n", "\n", " # Update input sequence with the predicted token\n", " input_seq = torch.cat([input_seq, torch.tensor([[next_token_idx]])], dim=1)\n", "\n", " return ' '.join(generated_text)\n", "\n", "# Generate text in English\n", "print(\"Generating text in English...\")\n", "seed_text = \"Today is a good ohh yes\"\n", "generated_english = generate_text(combined_model, english_vocab, english_reverse_vocab, seed_text)\n", "print(\"Generated English Text:\", generated_english)\n", "\n", "# Generate text in Icelandic\n", "print(\"Generating text in Icelandic...\")\n", "seed_text_icelandic = \"þetta mun auka\"\n", "generated_icelandic = generate_text(combined_model, icelandic_vocab, icelandic_reverse_vocab, seed_text_icelandic)\n", "print(\"Generated Icelandic Text:\", generated_icelandic)\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "N09nTZPNqRXw", "outputId": "bd5a1748-694c-493f-c631-fcb29ffe9374" }, "execution_count": 22, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Generating text in English...\n", "Generated English Text: Today is a good ohh yes in three research input output one is an object input output there are several types of these disasters can vary these can can can find the instance and accuracy the behavior of the given polynomial input x x x x can can add as the sentence but the speaker she\n", "Generating text in Icelandic...\n", "Generated Icelandic Text: þetta mun auka áberandi í utan eins og vieigandi alaandi og hjálpa til a gera gera get um a afslætti sínu sé einföld og sigrast og sanngjarnan til vinnu og gera er almennt á núverandi me frammistöu og getu getu okkar eins og okkar okkar til þátttöku og málverk til a draga úr\n" ] } ] }, { "cell_type": "code", "source": [], "metadata": { "id": "3_URP8RbqRLT" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "END END" ], "metadata": { "id": "BENy15FBq52n" } } ], "metadata": { "colab": { "provenance": [] }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python" } }, "nbformat": 4, "nbformat_minor": 0 }