|
|
|
"""thota14.ipynb |
|
|
|
Automatically generated by Colab. |
|
|
|
Original file is located at |
|
https://colab.research.google.com/drive/1Muegp4GV6UHkiBYRzqf2uztvmPHWe8V_ |
|
|
|
**Midterm: Neural Network-Based Language Model for Next Token Prediction**
"""
|
|
|
import torch |
|
import torch.nn as nn |
|
import torch.optim as optim |
|
from torch.utils.data import Dataset, DataLoader |
|
import numpy as np |
|
import re |
|
from collections import Counter |
|
from sklearn.model_selection import train_test_split |
|
|
|
"""**Load Icelandic and English text files**""" |
|
|
|
|
|
with open('/content/Icelandic_sampled.txt', 'r', encoding='utf-8') as f: |
|
icelandic_text = f.read() |
|
|
|
with open('/content/alpaca_sampled.txt', 'r', encoding='utf-8') as f: |
|
english_text = f.read() |
|
|
|
print("Datasets loaded successfully.") |
|
|
|
|
|
def preprocess_text(text):
    text = text.lower()
    # Replace anything that is not an English or Icelandic letter with a
    # space (so line breaks do not glue adjacent words together), then
    # collapse runs of whitespace. The text is already lowercased, and the
    # Icelandic letters ð and ý are included in the allowed set.
    text = re.sub(r'[^a-záðéíóúýþæö ]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text
|
|
|
|
|
english_text = preprocess_text(english_text) |
|
icelandic_text = preprocess_text(icelandic_text) |
|
|
|
print(f"Sample of English Text: {english_text[:100]}") |
|
print(f"Sample of Icelandic Text: {icelandic_text[:100]}") |
|
|
|
"""** Tokenization function**""" |
|
|
|
|
|
def tokenize(text): |
|
return text.split() |
|
|
|
english_tokens = tokenize(english_text) |
|
icelandic_tokens = tokenize(icelandic_text) |
|
|
|
print(f"English tokens: {len(english_tokens)}") |
|
print(f"Icelandic tokens: {len(icelandic_tokens)}") |
|
|
|
|
|
def build_vocab(tokens):
    # Assign each distinct word an integer id in first-seen order; the
    # Counter frequencies themselves are not used.
    vocab = Counter(tokens)
    vocab = {word: i for i, (word, _) in enumerate(vocab.items())}
    return vocab
|
|
|
|
|
english_vocab = build_vocab(english_tokens) |
|
icelandic_vocab = build_vocab(icelandic_tokens) |
|
|
|
print(f"English Vocabulary Size: {len(english_vocab)}") |
|
print(f"Icelandic Vocabulary Size: {len(icelandic_vocab)}") |
|
|
|
|
|
english_data = [english_vocab[word] for word in english_tokens] |
|
icelandic_data = [icelandic_vocab[word] for word in icelandic_tokens] |
|
|
|
|
|
combined_data = english_data + icelandic_data |
|
print(f"Combined dataset size: {len(combined_data)}") |
|
|
|
"""**Dataset class for sequence prediction**""" |
|
|
|
|
|
class TextDataset(Dataset):
    """Yields (input sequence, next token) pairs from a flat list of token ids."""

    def __init__(self, data, sequence_length):
        self.data = data
        self.sequence_length = sequence_length

    def __len__(self):
        return len(self.data) - self.sequence_length

    def __getitem__(self, idx):
        return (torch.tensor(self.data[idx:idx + self.sequence_length]),
                torch.tensor(self.data[idx + self.sequence_length]))
|
|
|
|
|
sequence_length = 5 |
|
|
|
|
|
combined_dataset = TextDataset(combined_data, sequence_length) |
|
print(f"Dataset length: {len(combined_dataset)}") |
|
|
|
|
|
train_data, val_data = train_test_split(combined_dataset, test_size=0.1) |
|
print(f"Training samples: {len(train_data)}, Validation samples: {len(val_data)}") |
|
|
|
|
|
batch_size = 64 |
|
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True) |
|
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False) |
|
|
|
print(f"Batch size: {batch_size}") |
|
print(f"Training batches: {len(train_loader)}, Validation batches: {len(val_loader)}") |
|
|
|
"""**Loss function and optimizer**""" |
|
|
|
|
|
criterion = nn.CrossEntropyLoss() |
|
optimizer = optim.Adam(combined_model.parameters(), lr=0.001) |
|
|
|
|
|
def train_model(model, train_loader, val_loader, optimizer, num_epochs, checkpoint_path):
    train_losses, val_losses = [], []

    for epoch in range(num_epochs):
        # Re-enable training mode each epoch (validation below sets eval mode).
        model.train()
        epoch_train_loss = 0
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            epoch_train_loss += loss.item()

        model.eval()
        val_loss = 0
        with torch.no_grad():
            for inputs, targets in val_loader:
                outputs = model(inputs)
                loss = criterion(outputs, targets)
                val_loss += loss.item()

        train_losses.append(epoch_train_loss / len(train_loader))
        val_losses.append(val_loss / len(val_loader))

        torch.save(model.state_dict(), f'{checkpoint_path}_epoch{epoch+1}.pth')

        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_losses[-1]:.4f}, Val Loss: {val_losses[-1]:.4f}')

    return train_losses, val_losses
|
|
|
import pandas as pd |
|
|
|
|
|
|
num_epochs = 10 |
|
train_losses, val_losses = train_model(combined_model, train_loader, val_loader, optimizer, num_epochs, 'combined_model_checkpoint') |
|
|
|
|
|
losses_df = pd.DataFrame({ |
|
'Epoch': range(1, num_epochs + 1), |
|
'Train_Loss': train_losses, |
|
'Val_Loss': val_losses |
|
}) |
|
losses_df.to_csv('training_validation_losses.csv', index=False) |
|
|
|
"""**final model**""" |
|
|
|
|
|
torch.save(combined_model.state_dict(), 'combined_model.pth') |
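# To restore the saved weights later (a sketch, assuming the same model class):
# model = NextTokenLSTM(vocab_size)
# model.load_state_dict(torch.load('combined_model.pth'))
# model.eval()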
|
|
|
|
|
import matplotlib.pyplot as plt |
|
|
|
def plot_losses(train_losses, val_losses, title): |
|
plt.plot(train_losses, label='Training Loss') |
|
plt.plot(val_losses, label='Validation Loss') |
|
plt.title(title) |
|
plt.xlabel('Epoch') |
|
plt.ylabel('Loss') |
|
plt.legend() |
|
plt.savefig('model_loss.png') |
|
plt.show() |
|
|
|
|
|
plot_losses(train_losses, val_losses, 'Combined Model Loss') |
|
|
|
"""**perplexity based on the validation set**""" |
|
|
|
|
|
def calculate_perplexity(model, val_loader):
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for inputs, targets in val_loader:
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            total_loss += loss.item()

    # Perplexity is the exponential of the mean cross-entropy loss.
    avg_loss = total_loss / len(val_loader)
    perplexity = np.exp(avg_loss)

    return perplexity
|
|
|
|
|
combined_perplexity = calculate_perplexity(combined_model, val_loader) |
|
print(f'Combined Model Perplexity: {combined_perplexity:.2f}') |
|
|
|
|
|
|
|
|
def calculate_running_perplexity(model, val_loader):
    # Running-average perplexity after each validation batch, showing how the
    # estimate stabilizes as more batches are seen.
    model.eval()
    perplexity_values = []
    total_loss = 0

    with torch.no_grad():
        for num_batches, (inputs, targets) in enumerate(val_loader, start=1):
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            total_loss += loss.item()
            perplexity_values.append(np.exp(total_loss / num_batches))

    return perplexity_values
|
|
|
|
|
perplexity_curve = calculate_running_perplexity(combined_model, val_loader)
|
|
|
|
|
plt.figure(figsize=(10, 6)) |
|
plt.plot(perplexity_curve, label='Running Perplexity', marker='o')
|
plt.title('Perplexity over Validation Set') |
|
plt.xlabel('Batch Number') |
|
plt.ylabel('Perplexity') |
|
plt.yscale('log') |
|
plt.legend() |
|
plt.grid() |
|
plt.show() |
|
|
|
"""**Generate text**""" |
|
|
|
import torch.nn.functional as F |
|
|
|
|
|
# Reserve an <UNK> id for words that fall outside each vocabulary.
if '<UNK>' not in english_vocab:
    english_vocab['<UNK>'] = len(english_vocab)
if '<UNK>' not in icelandic_vocab:
    icelandic_vocab['<UNK>'] = len(icelandic_vocab)
|
|
|
|
|
english_reverse_vocab = {idx: word for word, idx in english_vocab.items()} |
|
icelandic_reverse_vocab = {idx: word for word, idx in icelandic_vocab.items()} |
|
|
|
|
|
def generate_text(model, vocab, reverse_vocab, seed_text, max_length=50):
    model.eval()

    # Encode the seed text, mapping out-of-vocabulary words to <UNK>.
    seed_tokens = [vocab.get(word, vocab['<UNK>']) for word in seed_text.split()]
    input_seq = torch.tensor(seed_tokens).unsqueeze(0)

    generated_text = seed_text.split()

    with torch.no_grad():
        for _ in range(max_length):
            output = model(input_seq)

            # The model may return (batch, vocab) logits or a full
            # (batch, seq_len, vocab) sequence; take the last step either way.
            if len(output.shape) == 2:
                predictions = F.softmax(output, dim=-1)
            else:
                predictions = F.softmax(output[:, -1, :], dim=-1)

            # Greedy decoding: always pick the most probable next token.
            next_token_idx = torch.argmax(predictions, dim=-1).item()

            next_token_word = reverse_vocab.get(next_token_idx, '<UNK>')
            generated_text.append(next_token_word)

            # Append the prediction and feed the extended sequence back in.
            input_seq = torch.cat([input_seq, torch.tensor([[next_token_idx]])], dim=1)

    return ' '.join(generated_text)
|
|
|
|
|
print("Generating text in English...") |
|
seed_text = "Today is a good ohh yes" |
|
generated_english = generate_text(combined_model, english_vocab, english_reverse_vocab, seed_text) |
|
print("Generated English Text:", generated_english) |
|
|
|
|
|
print("Generating text in Icelandic...") |
|
seed_text_icelandic = "þetta mun auka" |
|
generated_icelandic = generate_text(combined_model, icelandic_vocab, icelandic_reverse_vocab, seed_text_icelandic) |
|
print("Generated Icelandic Text:", generated_icelandic) |
|
|
|
|
|
|
|
"""END END""" |