# -*- coding: utf-8 -*-
"""thota14.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1Muegp4GV6UHkiBYRzqf2uztvmPHWe8V_

**Midterm: Neural Network-Based Language Model for Next Token Prediction**
"""

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import re
from collections import Counter
from sklearn.model_selection import train_test_split

"""**Load Icelandic and English text files**"""

# Load Icelandic and English text files
with open('/content/Icelandic_sampled.txt', 'r', encoding='utf-8') as f:
    icelandic_text = f.read()

with open('/content/alpaca_sampled.txt', 'r', encoding='utf-8') as f:
    english_text = f.read()

print("Datasets loaded successfully.")

# Preprocessing function to clean text
def preprocess_text(text):
    text = text.lower()
    # The text is lowercased first, so only lowercase letters need to be kept;
    # the class retains the full Icelandic alphabet (including ð and ý)
    text = re.sub(r'[^a-záðéíóúýþæö ]', '', text)
    return text

# Apply preprocessing to both datasets
english_text = preprocess_text(english_text)
icelandic_text = preprocess_text(icelandic_text)

print(f"Sample of English Text: {english_text[:100]}")
print(f"Sample of Icelandic Text: {icelandic_text[:100]}")

"""**Tokenization function**"""

# Tokenization function: whitespace split over the cleaned text
def tokenize(text):
    return text.split()

english_tokens = tokenize(english_text)
icelandic_tokens = tokenize(icelandic_text)

print(f"English tokens: {len(english_tokens)}")
print(f"Icelandic tokens: {len(icelandic_tokens)}")

# Build vocabulary: map each distinct word to an integer index
def build_vocab(tokens):
    vocab = Counter(tokens)
    vocab = {word: i for i, (word, _) in enumerate(vocab.items())}
    return vocab

# Create vocabularies for both languages
english_vocab = build_vocab(english_tokens)
icelandic_vocab = build_vocab(icelandic_tokens)

print(f"English Vocabulary Size: {len(english_vocab)}")
print(f"Icelandic Vocabulary Size: {len(icelandic_vocab)}")

# Convert tokens to indices
english_data = [english_vocab[word] for word in english_tokens]
icelandic_data = [icelandic_vocab[word] for word in icelandic_tokens]

# Combine datasets. Note that the two vocabularies use overlapping index
# ranges, so the model must size its embedding to the larger of the two.
combined_data = english_data + icelandic_data
print(f"Combined dataset size: {len(combined_data)}")

"""**Dataset class for sequence prediction**"""

# Dataset class for sequence prediction: each sample is a window of
# `sequence_length` token indices plus the index of the token that follows it
class TextDataset(Dataset):
    def __init__(self, data, sequence_length):
        self.data = data
        self.sequence_length = sequence_length

    def __len__(self):
        return len(self.data) - self.sequence_length

    def __getitem__(self, idx):
        return (torch.tensor(self.data[idx:idx + self.sequence_length]),
                torch.tensor(self.data[idx + self.sequence_length]))

# Sequence length for training
sequence_length = 5

# Create the combined dataset
combined_dataset = TextDataset(combined_data, sequence_length)
print(f"Dataset length: {len(combined_dataset)}")

# Split into training and validation sets (train_test_split materializes the
# dataset into plain lists of (input, target) pairs, which DataLoader accepts)
train_data, val_data = train_test_split(combined_dataset, test_size=0.1)
print(f"Training samples: {len(train_data)}, Validation samples: {len(val_data)}")

# Create DataLoaders for training and validation
batch_size = 64
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

print(f"Batch size: {batch_size}")
print(f"Training batches: {len(train_loader)}, Validation batches: {len(val_loader)}")
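
"""**Model definition (reconstructed)**

The notebook references `combined_model` below without ever defining it, so
this cell is a minimal sketch, assuming an embedding + LSTM next-token
architecture. The class name `LanguageModel` and the sizes `embed_dim` and
`hidden_dim` are illustrative assumptions, not taken from the original.
"""

class LanguageModel(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, hidden_dim=256):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        embedded = self.embedding(x)      # (batch, seq_len, embed_dim)
        output, _ = self.lstm(embedded)   # (batch, seq_len, hidden_dim)
        return self.fc(output[:, -1, :])  # logits over the vocabulary for the next token

# English and Icelandic tokens share one index space in combined_data, so the
# embedding must cover the larger vocabulary; +1 leaves room for the <unk>
# token added before generation.
vocab_size = max(len(english_vocab), len(icelandic_vocab)) + 1
combined_model = LanguageModel(vocab_size)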

"""**Loss function and optimizer**"""

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(combined_model.parameters(), lr=0.001)

# Function to train the model and validate
def train_model(model, train_loader, val_loader, optimizer, num_epochs, checkpoint_path):
    train_losses, val_losses = [], []
    for epoch in range(num_epochs):
        # Re-enable training mode each epoch (validation below switches to eval)
        model.train()
        epoch_train_loss = 0
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            epoch_train_loss += loss.item()

        # Validation step
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for inputs, targets in val_loader:
                outputs = model(inputs)
                loss = criterion(outputs, targets)
                val_loss += loss.item()

        train_losses.append(epoch_train_loss / len(train_loader))
        val_losses.append(val_loss / len(val_loader))

        # Save a checkpoint for every epoch
        torch.save(model.state_dict(), f'{checkpoint_path}_epoch{epoch+1}.pth')
        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_losses[-1]:.4f}, Val Loss: {val_losses[-1]:.4f}')

    return train_losses, val_losses

import pandas as pd

# Train the model
num_epochs = 10
train_losses, val_losses = train_model(combined_model, train_loader, val_loader,
                                       optimizer, num_epochs, 'combined_model_checkpoint')

# Save training and validation losses to a CSV file
losses_df = pd.DataFrame({
    'Epoch': range(1, num_epochs + 1),
    'Train_Loss': train_losses,
    'Val_Loss': val_losses
})
losses_df.to_csv('training_validation_losses.csv', index=False)

"""**Final model**"""

# Save the final model
torch.save(combined_model.state_dict(), 'combined_model.pth')

# Plot losses
import matplotlib.pyplot as plt

def plot_losses(train_losses, val_losses, title):
    plt.plot(train_losses, label='Training Loss')
    plt.plot(val_losses, label='Validation Loss')
    plt.title(title)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.savefig('model_loss.png')
    plt.show()

# Plot combined model losses
plot_losses(train_losses, val_losses, 'Combined Model Loss')

"""**Perplexity based on the validation set**"""

# Calculate perplexity over the validation set: the exp of the mean
# cross-entropy loss per batch
def calculate_perplexity(model, val_loader):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for inputs, targets in val_loader:
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            total_loss += loss.item()
    avg_loss = total_loss / len(val_loader)
    perplexity = np.exp(avg_loss)
    return perplexity

# Perplexity calculation for the combined model
combined_perplexity = calculate_perplexity(combined_model, val_loader)
print(f'Combined Model Perplexity: {combined_perplexity:.2f}')

# Running perplexity over the validation set: after each batch, take the exp
# of the average loss over all batches seen so far
def calculate_running_perplexity(model, val_loader):
    model.eval()
    perplexity_values = []  # running perplexity after each batch
    total_loss = 0
    with torch.no_grad():
        for inputs, targets in val_loader:
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            total_loss += loss.item()
            avg_loss = total_loss / (len(perplexity_values) + 1)  # batches seen so far
            perplexity_values.append(np.exp(avg_loss))
    return perplexity_values

# Running perplexity for the combined model
running_perplexity = calculate_running_perplexity(combined_model, val_loader)

# Plotting perplexity values
plt.figure(figsize=(10, 6))
plt.plot(running_perplexity, label='Model Perplexity', marker='o')
plt.title('Perplexity over Validation Set')
plt.xlabel('Batch Number')
plt.ylabel('Perplexity')
plt.yscale('log')  # a log scale helps when the values vary widely
plt.legend()
plt.grid()
plt.show()
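
"""**Loading a saved checkpoint (sketch)**

`train_model` writes one checkpoint per epoch; the snippet below is a minimal
sketch of how one of them could be restored for further evaluation or
generation. The epoch number in the filename is illustrative.
"""

# Rebuild the architecture, then load the saved weights; map_location keeps
# this working whether or not a GPU is available
restored_model = LanguageModel(vocab_size)
state_dict = torch.load('combined_model_checkpoint_epoch10.pth',
                        map_location=torch.device('cpu'))
restored_model.load_state_dict(state_dict)
restored_model.eval()  # inference mode for evaluation or generation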

"""**Generate text**"""

import torch.nn.functional as F

# Ensure the '<unk>' token is in the vocabularies
if '<unk>' not in english_vocab:
    english_vocab['<unk>'] = len(english_vocab)
if '<unk>' not in icelandic_vocab:
    icelandic_vocab['<unk>'] = len(icelandic_vocab)

# Reverse vocab dictionaries (index -> word)
english_reverse_vocab = {idx: word for word, idx in english_vocab.items()}
icelandic_reverse_vocab = {idx: word for word, idx in icelandic_vocab.items()}

# Function to generate text by repeatedly predicting the next token
def generate_text(model, vocab, reverse_vocab, seed_text, max_length=50):
    model.eval()
    # Tokenize and convert seed text to indices, mapping unknown words to '<unk>'
    seed_tokens = [vocab.get(word, vocab['<unk>']) for word in seed_text.split()]
    input_seq = torch.tensor(seed_tokens).unsqueeze(0)  # add batch dimension
    generated_text = seed_text.split()

    with torch.no_grad():
        for _ in range(max_length):
            output = model(input_seq)  # forward pass
            # Handle both output shapes: (batch, vocab) or (batch, seq, vocab)
            if len(output.shape) == 2:
                predictions = F.softmax(output, dim=-1)
            else:
                predictions = F.softmax(output[:, -1, :], dim=-1)
            next_token_idx = torch.argmax(predictions, dim=-1).item()

            # Append the predicted token
            next_token_word = reverse_vocab.get(next_token_idx, '<unk>')
            generated_text.append(next_token_word)

            # Slide the context window: append the prediction, then keep only
            # the last `sequence_length` tokens to match the training windows
            input_seq = torch.cat([input_seq, torch.tensor([[next_token_idx]])], dim=1)
            input_seq = input_seq[:, -sequence_length:]

    return ' '.join(generated_text)

# Generate text in English
print("Generating text in English...")
seed_text = "Today is a good ohh yes"
generated_english = generate_text(combined_model, english_vocab, english_reverse_vocab, seed_text)
print("Generated English Text:", generated_english)

# Generate text in Icelandic
print("Generating text in Icelandic...")
seed_text_icelandic = "þetta mun auka"
generated_icelandic = generate_text(combined_model, icelandic_vocab, icelandic_reverse_vocab, seed_text_icelandic)
print("Generated Icelandic Text:", generated_icelandic)
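
"""**Sampling-based decoding (sketch)**

Greedy argmax decoding, used above, tends to repeat itself once it settles
into a loop. The variant below sketches temperature sampling, which is not in
the original notebook: logits are scaled by a temperature and the next token
is drawn from the resulting distribution. `generate_text_sampled` and
`temperature` are assumed names.
"""

def generate_text_sampled(model, vocab, reverse_vocab, seed_text,
                          max_length=50, temperature=1.0):
    model.eval()
    seed_tokens = [vocab.get(word, vocab['<unk>']) for word in seed_text.split()]
    input_seq = torch.tensor(seed_tokens).unsqueeze(0)
    generated_text = seed_text.split()

    with torch.no_grad():
        for _ in range(max_length):
            output = model(input_seq)
            logits = output if len(output.shape) == 2 else output[:, -1, :]
            # Lower temperature -> sharper distribution; 1.0 leaves it unchanged
            probs = F.softmax(logits / temperature, dim=-1)
            next_token_idx = torch.multinomial(probs, num_samples=1).item()
            generated_text.append(reverse_vocab.get(next_token_idx, '<unk>'))
            input_seq = torch.cat([input_seq, torch.tensor([[next_token_idx]])], dim=1)
            input_seq = input_seq[:, -sequence_length:]

    return ' '.join(generated_text)

# Example: a slightly sharpened distribution
print(generate_text_sampled(combined_model, english_vocab, english_reverse_vocab,
                            "Today is a good", temperature=0.8))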