In [1]:
pip install sentencepiece torch torchvision torchaudio pandas scikit-learn


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
 Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
 Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
 Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
 Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
 Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
 Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
 Downloading nvidia_curand_cu12-10.3.5.147-py

In [2]:


!pip install sentencepiece --quiet

In [3]:

"""
Model File for Roman Urdu Poetry Generation

This file contains the complete code for:
 - Data loading, cleaning, and tokenization using SentencePiece
 - Train/Test/Validation split creation
 - Dataset and DataLoader creation
 - Definition of a BiLSTM Language Model (with 3 layers, dropout, etc.)
 - Training, validation, and testing routines
 - Saving the trained model weights
 - A poetry generation function using nucleus (top-p) sampling with formatted output

Run this file to train and test the model. The trained weights will be saved to a file and loaded on subsequent runs.
"""

'\nModel File for Roman Urdu Poetry Generation\n\nThis file contains the complete code for:\n - Data loading, cleaning, and tokenization using SentencePiece\n - Train/Test/Validation split creation\n - Dataset and DataLoader creation\n - Definition of a BiLSTM Language Model (with 3 layers, dropout, etc.)\n - Training, validation, and testing routines\n - Saving the trained model weights\n - A poetry generation function using nucleus (top-p) sampling with formatted output\n\nRun this file to train and test the model. The trained weights will be saved to a file and loaded on subsequent runs.\n'

In [4]:
# -------------------------
# 1. Import Libraries
# -------------------------
import os
import random
import numpy as np
import pandas as pd
import sentencepiece as spm
import re
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import unicodedata
from sklearn.model_selection import train_test_split

In [5]:

# -------------------------
# 2. Set Random Seeds and Device
# -------------------------
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [6]:
# -------------------------
# 3. Load and Clean Dataset
# -------------------------
DATA_PATH = "Roman-Urdu-Poetry.csv" # Make sure this file exists in your working directory
df = pd.read_csv(DATA_PATH)

def remove_diacritics(text: str) -> str:
 """
 Removes Unicode diacritical marks from the text.
 """
 return ''.join(ch for ch in unicodedata.normalize('NFD', text)
 if not unicodedata.combining(ch))

def clean_text(text):
 """
 Cleans the input text by removing diacritics, extra spaces, and unwanted punctuation.
 """
 text = remove_diacritics(text)
 text = re.sub(r"\s+", " ", text)
 text = re.sub(r"[^\w\s\.\,\;\:\'\?\!\-]+", "", text)
 return text.strip()

df["Poetry"] = df["Poetry"].astype(str).apply(clean_text)
texts = df["Poetry"].tolist()
print(f"Total number of poetry lines: {len(texts)}")

Total number of poetry lines: 1314


In [7]:
# -------------------------
# 4. Train/Test/Validation Split (80/10/10)
# -------------------------
train_texts, test_texts = train_test_split(texts, test_size=0.1, random_state=SEED)
train_texts, val_texts = train_test_split(train_texts, test_size=0.1111, random_state=SEED)
print(f"Train samples: {len(train_texts)}")
print(f"Validation samples: {len(val_texts)}")
print(f"Test samples: {len(test_texts)}")

Train samples: 1050
Validation samples: 132
Test samples: 132


In [8]:
# -------------------------
# 5. Train a SentencePiece BPE Tokenizer
# -------------------------
all_texts_file = "all_texts.txt"
if not os.path.exists(all_texts_file):
 with open(all_texts_file, "w", encoding="utf-8") as f:
 for line in texts:
 f.write(line.strip() + "\n")
else:
 print(f"{all_texts_file} already exists; skipping file creation.")


sp_model_prefix = "urdu_sp"
model_file = f"{sp_model_prefix}.model"
vocab_file = f"{sp_model_prefix}.vocab"

vocab_size = 12000 # Adjust as needed
model_type = "bpe"

if not (os.path.exists(model_file) and os.path.exists(vocab_file)):
 print("SentencePiece model or vocab not found. Training...")
 spm.SentencePieceTrainer.Train(
 f"--input={all_texts_file} "
 f"--model_prefix={sp_model_prefix} "
 f"--vocab_size={vocab_size} "
 f"--model_type={model_type} "
 "--character_coverage=1.0 "
 "--pad_id=0 --unk_id=1 --bos_id=2 --eos_id=3"
 )
else:
 print("SentencePiece model & vocab found; skipping training.")

# Load the SentencePiece model
sp = spm.SentencePieceProcessor()
sp.load(model_file)
print("Loaded SentencePiece model with vocab size:", sp.get_piece_size())


Loaded SentencePiece model with vocab size: 12000


In [9]:
# -------------------------
# 6. Tokenize Data
# -------------------------
train_ids = [sp.encode_as_ids(t) for t in train_texts]
val_ids = [sp.encode_as_ids(t) for t in val_texts]
test_ids = [sp.encode_as_ids(t) for t in test_texts]

In [10]:
# -------------------------
# 7. Create Dataset and DataLoader
# -------------------------
class PoetryDataset(Dataset):
 def __init__(self, token_ids_list, max_length=250):
 self.data = token_ids_list
 self.max_length = max_length

 def __len__(self):
 return len(self.data)

 def __getitem__(self, idx):
 # Truncate tokens to max_length
 token_ids = self.data[idx][:self.max_length]
 # Create input by adding BOS token (2) at the beginning
 input_ids = [2] + token_ids
 # Create target by appending EOS token (3) at the end
 target_ids = token_ids + [3]
 return torch.tensor(input_ids, dtype=torch.long), torch.tensor(target_ids, dtype=torch.long)

def collate_fn(batch):
 inputs, targets = zip(*batch)
 max_len = max(len(x) for x in inputs)
 padded_inputs = [torch.cat([x, torch.zeros(max_len - len(x), dtype=torch.long)]) for x in inputs]
 padded_targets = [torch.cat([t, torch.zeros(max_len - len(t), dtype=torch.long)]) for t in targets]
 return torch.stack(padded_inputs), torch.stack(padded_targets)

In [11]:
train_dataset = PoetryDataset(train_ids, max_length=250)
val_dataset = PoetryDataset(val_ids, max_length=250)
test_dataset = PoetryDataset(test_ids, max_length=250)

batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, drop_last=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn, drop_last=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn, drop_last=True)

In [12]:
# -------------------------
# 8. Define the BiLSTM Language Model
# -------------------------
class BiLSTMLanguageModel(nn.Module):
 def __init__(self, vocab_size, embed_dim=512, hidden_dim=768, num_layers=3, dropout=0.2):
 super(BiLSTMLanguageModel, self).__init__()
 self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
 # Stacked Bi-LSTM layers
 self.lstm = nn.LSTM(
 input_size=embed_dim,
 hidden_size=hidden_dim,
 num_layers=num_layers,
 batch_first=True,
 bidirectional=True,
 dropout=dropout
 )
 # Linear layer to project LSTM outputs to vocabulary size
 self.fc = nn.Linear(hidden_dim * 2, vocab_size)

 def forward(self, x, hidden=None):
 emb = self.embed(x)
 out, hidden = self.lstm(emb, hidden)
 logits = self.fc(out)
 return logits, hidden

In [13]:
vocab_size = sp.get_piece_size()
model = BiLSTMLanguageModel(vocab_size, embed_dim=512, hidden_dim=768, num_layers=3, dropout=0.2)
model = model.to(device)

In [14]:
# -------------------------
# 9. Training Setup (Loss, Optimizer, Scheduler)
# -------------------------
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.5)

def evaluate(model, data_loader):
 model.eval()
 total_loss, total_tokens = 0, 0
 with torch.no_grad():
 for inputs, targets in data_loader:
 inputs = inputs.to(device)
 targets = targets.to(device)
 logits, _ = model(inputs)
 logits = logits.view(-1, vocab_size)
 targets = targets.view(-1)
 loss = criterion(logits, targets)
 total_loss += loss.item() * (targets != 0).sum().item()
 total_tokens += (targets != 0).sum().item()
 return total_loss / total_tokens

In [15]:
# -------------------------
# 10. Training Loop with Testing Code and Weight Saving
# -------------------------
num_epochs = 10
weights_path = "model_weights.pth"

if not os.path.exists(weights_path):
 for epoch in range(num_epochs):
 model.train()
 total_loss, total_tokens = 0, 0
 for inputs, targets in train_loader:
 inputs = inputs.to(device)
 targets = targets.to(device)
 optimizer.zero_grad()
 logits, _ = model(inputs)
 logits = logits.view(-1, vocab_size)
 targets = targets.view(-1)
 loss = criterion(logits, targets)
 loss.backward()
 torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
 optimizer.step()
 total_loss += loss.item() * (targets != 0).sum().item()
 total_tokens += (targets != 0).sum().item()
 train_loss = total_loss / total_tokens
 val_loss = evaluate(model, val_loader)
 scheduler.step()
 print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")
 test_loss = evaluate(model, test_loader)
 print(f"Test Loss: {test_loss:.4f}")
 torch.save(model.state_dict(), weights_path)
else:
 print("Loading pre-trained model weights...")
 model.load_state_dict(torch.load(weights_path, map_location=device))

Epoch [1/10], Train Loss: 7.1034, Val Loss: 6.2269
Epoch [2/10], Train Loss: 5.7528, Val Loss: 5.4652
Epoch [3/10], Train Loss: 5.0948, Val Loss: 4.9459
Epoch [4/10], Train Loss: 4.4997, Val Loss: 4.2981
Epoch [5/10], Train Loss: 3.9654, Val Loss: 3.9398
Epoch [6/10], Train Loss: 3.6264, Val Loss: 3.6214
Epoch [7/10], Train Loss: 3.3671, Val Loss: 3.4665
Epoch [8/10], Train Loss: 3.2082, Val Loss: 3.3188
Epoch [9/10], Train Loss: 3.0880, Val Loss: 3.2478
Epoch [10/10], Train Loss: 3.0126, Val Loss: 3.1772
Test Loss: 3.1696


In [16]:


def generate_poetry_nucleus(model, sp, start_word, num_words=12, temperature=1.2, top_p=0.85):
 """
 Generate a poetry sequence using nucleus (top-p) sampling.
 The output is formatted so that every 6 words appear on a new line.
 If num_words is specified, it means 1 starting word + (num_words - 1) generated tokens.
 """
 model.eval()
 start_ids = sp.encode_as_ids(start_word)
 input_ids = [2] + start_ids # Insert BOS (token 2)
 input_tensor = torch.tensor([input_ids], dtype=torch.long, device=device)
 hidden = None

 with torch.no_grad():
 logits, hidden = model(input_tensor, hidden)

 generated_ids = input_ids[:] # Copy initial tokens

 for _ in range(num_words - 1): # Generate one less token
 # Get the logits of the last generated token
 last_logits = logits[:, -1, :] # Shape: (1, vocab_size)
 scaled_logits = last_logits / temperature

 # Sort the logits in descending order
 sorted_logits, sorted_indices = torch.sort(scaled_logits, descending=True)
 cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

 # Filter out tokens with cumulative probability above top_p
 filtered_indices = cumulative_probs > top_p
 if torch.all(filtered_indices):
 filtered_indices[-1] = False # Ensure at least one token remains
 sorted_indices = sorted_indices[~filtered_indices]
 sorted_logits = sorted_logits[~filtered_indices]

 # Sample the next token from the filtered distribution
 if len(sorted_indices) > 0:
 next_token_id = sorted_indices[torch.multinomial(F.softmax(sorted_logits, dim=-1), 1).item()].item()
 else:
 next_token_id = torch.argmax(last_logits).item()
 generated_ids.append(next_token_id)

 # Prepare next input and update hidden state
 next_input = torch.tensor([[next_token_id]], dtype=torch.long, device=device)
 logits, hidden = model(next_input, hidden)

 # Decode generated tokens (skip BOS) and format output: 6 words per line
 generated_text = sp.decode_ids(generated_ids[1:])
 words = generated_text.split()
 formatted_text = "\n".join([" ".join(words[i:i+6]) for i in range(0, len(words), 6)])
 return formatted_text


In [18]:


# -------------------------
# 12. Example Usage for Testing (Optional)
# -------------------------
if __name__ == "__main__":
 # Test the generation function in the notebook/script
 start_word = "ishq"
 print("Generated Poetry:\n", generate_poetry_nucleus(model, sp, start_word, num_words=12, temperature=1.2, top_p=0.85))


Generated Poetry:
 ishq nishan tum phir kar phir
ik baat aur phir ye phir
