|
|
|
"""thota14.ipynb |
|
|
|
Automatically generated by Colab. |
|
|
|
Original file is located at |
|
https://colab.research.google.com/drive/1Muegp4GV6UHkiBYRzqf2uztvmPHWe8V_ |
|
|
|
**Midterm: Neural Network-Based Language Model for Next Token Prediction**
"""
|
|
|
import torch |
|
import torch.nn as nn |
|
import torch.optim as optim |
|
from torch.utils.data import Dataset, DataLoader |
|
import numpy as np |
|
import re |
|
from collections import Counter |
|
from sklearn.model_selection import train_test_split |
|
|
|
"""**Load Icelandic and English text files**""" |
|
|
|
|
|
with open('/content/Icelandic_sampled.txt', 'r', encoding='utf-8') as f: |
|
icelandic_text = f.read() |
|
|
|
with open('/content/alpaca_sampled.txt', 'r', encoding='utf-8') as f: |
|
english_text = f.read() |
|
|
|
print("Datasets loaded successfully.") |
|
|
|
|
|
def preprocess_text(text):
    text = text.lower()
    # Replace anything that is not an English or Icelandic letter with a
    # space (so line breaks do not glue adjacent words together), then
    # collapse runs of whitespace. The text is already lowercased, and the
    # Icelandic letters ð and ý are included in the allowed set.
    text = re.sub(r'[^a-záðéíóúýþæö ]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text
|
|
|
|
|
english_text = preprocess_text(english_text) |
|
icelandic_text = preprocess_text(icelandic_text) |
|
|
|
print(f"Sample of English Text: {english_text[:100]}") |
|
print(f"Sample of Icelandic Text: {icelandic_text[:100]}") |
|
|
|
"""** Tokenization function**""" |
|
|
|
|
|
def tokenize(text): |
|
return text.split() |
|
|
|
english_tokens = tokenize(english_text) |
|
icelandic_tokens = tokenize(icelandic_text) |
|
|
|
print(f"English tokens: {len(english_tokens)}") |
|
print(f"Icelandic tokens: {len(icelandic_tokens)}") |
|
|
|
|
|
def build_vocab(tokens):
    # Assign each distinct word an integer id in first-seen order; the
    # Counter frequencies themselves are not used.
    vocab = Counter(tokens)
    vocab = {word: i for i, (word, _) in enumerate(vocab.items())}
    return vocab
|
|
|
|
|
english_vocab = build_vocab(english_tokens) |
|
icelandic_vocab = build_vocab(icelandic_tokens) |
|
|
|
print(f"English Vocabulary Size: {len(english_vocab)}") |
|
print(f"Icelandic Vocabulary Size: {len(icelandic_vocab)}") |
|
|
|
|
|
english_data = [english_vocab[word] for word in english_tokens] |
|
icelandic_data = [icelandic_vocab[word] for word in icelandic_tokens] |
|
|
|
|
|
combined_data = english_data + icelandic_data |
|
print(f"Combined dataset size: {len(combined_data)}") |
|
|
|
"""**Dataset class for sequence prediction**""" |
|
|
|
|
|
class TextDataset(Dataset):
    """Yields (input sequence, next token) pairs from a flat list of token ids."""

    def __init__(self, data, sequence_length):
        self.data = data
        self.sequence_length = sequence_length

    def __len__(self):
        return len(self.data) - self.sequence_length

    def __getitem__(self, idx):
        return (torch.tensor(self.data[idx:idx + self.sequence_length]),
                torch.tensor(self.data[idx + self.sequence_length]))
|
|
|
|
|
sequence_length = 5 |
|
|
|
|
|
combined_dataset = TextDataset(combined_data, sequence_length) |
|
print(f"Dataset length: {len(combined_dataset)}") |
|
|
|
|
|
train_data, val_data = train_test_split(combined_dataset, test_size=0.1) |
|
print(f"Training samples: {len(train_data)}, Validation samples: {len(val_data)}") |
|
|
|
|
|
batch_size = 64 |
|
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True) |
|
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False) |
|
|
|
print(f"Batch size: {batch_size}") |
|
print(f"Training batches: {len(train_loader)}, Validation batches: {len(val_loader)}") |
|
|
|
"""**Loss function and optimizer**""" |
|
|
|
|
|
criterion = nn.CrossEntropyLoss() |
|
optimizer = optim.Adam(combined_model.parameters(), lr=0.001) |
|
|
|
|
|
def train_model(model, train_loader, val_loader, optimizer, num_epochs, checkpoint_path):
    train_losses, val_losses = [], []

    for epoch in range(num_epochs):
        # Re-enable training mode each epoch (validation below sets eval mode).
        model.train()
        epoch_train_loss = 0
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            epoch_train_loss += loss.item()

        model.eval()
        val_loss = 0
        with torch.no_grad():
            for inputs, targets in val_loader:
                outputs = model(inputs)
                loss = criterion(outputs, targets)
                val_loss += loss.item()

        train_losses.append(epoch_train_loss / len(train_loader))
        val_losses.append(val_loss / len(val_loader))

        torch.save(model.state_dict(), f'{checkpoint_path}_epoch{epoch+1}.pth')

        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_losses[-1]:.4f}, Val Loss: {val_losses[-1]:.4f}')

    return train_losses, val_losses
|
|
|
import pandas as pd |
|
|
|
|
|
|
num_epochs = 10 |
|
train_losses, val_losses = train_model(combined_model, train_loader, val_loader, optimizer, num_epochs, 'combined_model_checkpoint') |
|
|
|
|
|
losses_df = pd.DataFrame({ |
|
'Epoch': range(1, num_epochs + 1), |
|
'Train_Loss': train_losses, |
|
'Val_Loss': val_losses |
|
}) |
|
losses_df.to_csv('training_validation_losses.csv', index=False) |
|
|
|
"""**final model**""" |
|
|
|
|
|
torch.save(combined_model.state_dict(), 'combined_model.pth') |
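# To restore the saved weights later (a sketch, assuming the same model class):
# model = NextTokenLSTM(vocab_size)
# model.load_state_dict(torch.load('combined_model.pth'))
# model.eval()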
|
|
|
|
|
import matplotlib.pyplot as plt |
|
|
|
def plot_losses(train_losses, val_losses, title): |
|
plt.plot(train_losses, label='Training Loss') |
|
plt.plot(val_losses, label='Validation Loss') |
|
plt.title(title) |
|
plt.xlabel('Epoch') |
|
plt.ylabel('Loss') |
|
plt.legend() |
|
plt.savefig('model_loss.png') |
|
plt.show() |
|
|
|
|
|
plot_losses(train_losses, val_losses, 'Combined Model Loss') |
|
|
|
"""**perplexity based on the validation set**""" |
|
|
|
|
|
def calculate_perplexity(model, val_loader):
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for inputs, targets in val_loader:
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            total_loss += loss.item()

    # Perplexity is the exponential of the mean cross-entropy loss.
    avg_loss = total_loss / len(val_loader)
    perplexity = np.exp(avg_loss)

    return perplexity
|
|
|
|
|
combined_perplexity = calculate_perplexity(combined_model, val_loader) |
|
print(f'Combined Model Perplexity: {combined_perplexity:.2f}') |
|
|
|
|
|
|
|
|
def calculate_running_perplexity(model, val_loader):
    # Running-average perplexity after each validation batch, showing how the
    # estimate stabilizes as more batches are seen.
    model.eval()
    perplexity_values = []
    total_loss = 0

    with torch.no_grad():
        for num_batches, (inputs, targets) in enumerate(val_loader, start=1):
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            total_loss += loss.item()
            perplexity_values.append(np.exp(total_loss / num_batches))

    return perplexity_values
|
|
|
|
|
perplexity_curve = calculate_running_perplexity(combined_model, val_loader)
|
|
|
|
|
plt.figure(figsize=(10, 6)) |
|
plt.plot(perplexity_curve, label='Running Perplexity', marker='o')
|
plt.title('Perplexity over Validation Set') |
|
plt.xlabel('Batch Number') |
|
plt.ylabel('Perplexity') |
|
plt.yscale('log') |
|
plt.legend() |
|
plt.grid() |
|
plt.show() |
|
|
|
"""**Generate text**""" |
|
|
|
import torch.nn.functional as F |
|
|
|
|
|
# Reserve an <UNK> id for words that fall outside each vocabulary.
if '<UNK>' not in english_vocab:
    english_vocab['<UNK>'] = len(english_vocab)
if '<UNK>' not in icelandic_vocab:
    icelandic_vocab['<UNK>'] = len(icelandic_vocab)
|
|
|
|
|
english_reverse_vocab = {idx: word for word, idx in english_vocab.items()} |
|
icelandic_reverse_vocab = {idx: word for word, idx in icelandic_vocab.items()} |
|
|
|
|
|
def generate_text(model, vocab, reverse_vocab, seed_text, max_length=50):
    model.eval()

    # Encode the seed text, mapping out-of-vocabulary words to <UNK>.
    seed_tokens = [vocab.get(word, vocab['<UNK>']) for word in seed_text.split()]
    input_seq = torch.tensor(seed_tokens).unsqueeze(0)

    generated_text = seed_text.split()

    with torch.no_grad():
        for _ in range(max_length):
            output = model(input_seq)

            # The model may return (batch, vocab) logits or a full
            # (batch, seq_len, vocab) sequence; take the last step either way.
            if len(output.shape) == 2:
                predictions = F.softmax(output, dim=-1)
            else:
                predictions = F.softmax(output[:, -1, :], dim=-1)

            # Greedy decoding: always pick the most probable next token.
            next_token_idx = torch.argmax(predictions, dim=-1).item()

            next_token_word = reverse_vocab.get(next_token_idx, '<UNK>')
            generated_text.append(next_token_word)

            # Append the prediction and feed the extended sequence back in.
            input_seq = torch.cat([input_seq, torch.tensor([[next_token_idx]])], dim=1)

    return ' '.join(generated_text)
|
|
|
|
|
print("Generating text in English...") |
|
seed_text = "Today is a good ohh yes" |
|
generated_english = generate_text(combined_model, english_vocab, english_reverse_vocab, seed_text) |
|
print("Generated English Text:", generated_english) |
|
|
|
|
|
print("Generating text in Icelandic...") |
|
seed_text_icelandic = "þetta mun auka" |
|
generated_icelandic = generate_text(combined_model, icelandic_vocab, icelandic_reverse_vocab, seed_text_icelandic) |
|
print("Generated Icelandic Text:", generated_icelandic) |
|
|
|
|
|
|
|
"""END END""" |