Feat: Upload app files
- app.py +145 -0
- config.py +149 -0
- inference.py +102 -0
- last.ckpt +3 -0
- requirements.txt +15 -0
- smollmv2.py +243 -0
- smollv2_lightning.py +498 -0
app.py
ADDED
@@ -0,0 +1,145 @@
#! /usr/bin/env python3
"""
This script is a simple text generator using the SmollmV2 model.
It uses Gradio to create a web interface for generating text.
"""
# Third-Party Imports
import torch
import torch.nn.functional as F
import gradio as gr
from transformers import GPT2Tokenizer
import spaces
import os
from pathlib import Path

# Local imports
from smollmv2 import SmollmV2
from config import SmollmConfig, DataConfig
from smollv2_lightning import LitSmollmv2


def combine_model_parts(model_dir="split_models", output_file="checkpoints/last.ckpt"):
    """
    Combine split model parts into a single checkpoint file.
    """
    # Create the checkpoints directory if it doesn't exist
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

    # Check if the combined model already exists
    if os.path.exists(output_file):
        print(f"Model already combined at: {output_file}")
        return output_file

    # Ensure the model parts exist
    if not os.path.exists(model_dir):
        raise FileNotFoundError(f"Model directory {model_dir} not found")

    # Combine the parts
    parts = sorted(Path(model_dir).glob("last.ckpt.part_*"))
    if not parts:
        raise FileNotFoundError("No model parts found")

    print("Combining model parts...")
    with open(output_file, 'wb') as outfile:
        for part in parts:
            print(f"Processing part: {part}")
            with open(part, 'rb') as infile:
                outfile.write(infile.read())

    print(f"Model combined successfully: {output_file}")
    return output_file


def load_model():
    """
    Load the SmollmV2 model and tokenizer.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Combine the model parts and get the checkpoint path
    checkpoint_path = combine_model_parts()

    # Load the model from the combined checkpoint using the Lightning module
    model = LitSmollmv2.load_from_checkpoint(
        checkpoint_path,
        model_config=SmollmConfig,
        strict=False
    )

    model.to(device)
    model.eval()

    # Initialize the tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained(DataConfig.tokenizer_path)
    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer, device


@spaces.GPU(enable_queue=True)
def generate_text(prompt, num_tokens, temperature=0.8, top_p=0.9):
    """
    Generate text using the SmollmV2 model.
    """
    # Ensure num_tokens doesn't exceed the model's block size
    num_tokens = min(num_tokens, SmollmConfig.block_size)

    # Tokenize the input prompt
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

    # Generate tokens one at a time
    for _ in range(num_tokens):
        # Get the model's predictions
        with torch.no_grad():
            with torch.autocast(device_type=device, dtype=torch.bfloat16):
                logits, _ = model.model(input_ids)

        # Get the next-token probabilities
        logits = logits[:, -1, :] / temperature
        probs = F.softmax(logits, dim=-1)

        # Apply top-p (nucleus) sampling
        if top_p > 0:
            sorted_probs, sorted_indices = torch.sort(probs, descending=True)
            cumsum_probs = torch.cumsum(sorted_probs, dim=-1)
            sorted_indices_to_keep = cumsum_probs <= top_p
            sorted_indices_to_keep[..., 1:] = sorted_indices_to_keep[..., :-1].clone()
            sorted_indices_to_keep[..., 0] = 1
            indices_to_keep = torch.zeros_like(probs, dtype=torch.bool).scatter_(-1, sorted_indices, sorted_indices_to_keep)
            probs = torch.where(indices_to_keep, probs, torch.zeros_like(probs))
            probs = probs / probs.sum(dim=-1, keepdim=True)

        # Sample the next token
        next_token = torch.multinomial(probs, num_samples=1)

        # Append to input_ids
        input_ids = torch.cat([input_ids, next_token], dim=-1)

        # Stop if we generate an EOS token
        if next_token.item() == tokenizer.eos_token_id:
            break

    # Decode and return the generated text
    generated_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
    return generated_text


# Load the model globally
model, tokenizer, device = load_model()

# Create the Gradio interface
demo = gr.Interface(
    fn=generate_text,
    inputs=[
        gr.Textbox(label="Enter your prompt", value="Once upon a time"),
        gr.Slider(minimum=1, maximum=SmollmConfig.block_size, value=100, step=1, label="Number of tokens to generate"),
        gr.Slider(minimum=0.1, maximum=2.0, value=0.8, step=0.1, label="Temperature (higher = more random)"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.1, label="Top-p (nucleus sampling)")
    ],
    outputs=gr.Textbox(label="Generated Text"),
    title="SmollmV2 Text Generator",
    description="Generate text using the SmollmV2 model",
    allow_flagging="never",
    cache_examples=True
)

if __name__ == "__main__":
    demo.launch()
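Note that combine_model_parts() expects the checkpoint to already be present as ordered chunks named last.ckpt.part_* inside split_models/. The script that produces those chunks is not part of this upload; the sketch below is one way such parts could be generated (the helper name, chunk size, and zero-padded suffix are assumptions), chosen so that sorted(Path(model_dir).glob("last.ckpt.part_*")) reassembles them in the right order.

#! /usr/bin/env python3
# Hypothetical helper (not part of this upload): split a large checkpoint into
# chunks that combine_model_parts() in app.py can glob and reassemble.
import os
from pathlib import Path


def split_checkpoint(src="checkpoints/last.ckpt", out_dir="split_models", chunk_mb=100):
    """Write src as out_dir/last.ckpt.part_000, part_001, ... in chunk_mb-sized pieces."""
    os.makedirs(out_dir, exist_ok=True)
    chunk_size = chunk_mb * 1024 * 1024
    with open(src, "rb") as infile:
        index = 0
        while True:
            chunk = infile.read(chunk_size)
            if not chunk:
                break
            # Zero-padded suffix keeps lexicographic order equal to byte order
            part_path = Path(out_dir) / f"last.ckpt.part_{index:03d}"
            with open(part_path, "wb") as outfile:
                outfile.write(chunk)
            index += 1


if __name__ == "__main__":
    split_checkpoint()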
config.py
ADDED
@@ -0,0 +1,149 @@
#!/usr/bin/env python3
"""
Configuration classes for the GPT model
Author: Shilpaj Bhalerao
Date: 2025-01-19
"""
# Standard Library Imports
from dataclasses import dataclass, field


@dataclass
class RoPEConfig:
    """
    Configuration for Rotary Position Embeddings
    """
    base: int = 10000                   # Base for the angle calculations
    scaling_factor: float = 1.0         # Scaling factor for rotary embeddings
    head_dim_fraction: float = 0.3125   # Set to get exactly kv_dim=24 (216 total)
    round_multiple: int = 8             # Round kv_dim to the nearest multiple of this number


@dataclass
class SmollmConfig:
    """
    Configuration for the Smollm training setup
    """
    # Model configuration
    block_size: int = 2048      # max sequence length
    vocab_size: int = 49152     # vocabulary size
    n_layer: int = 30           # number of transformer layers
    n_head: int = 9             # number of attention heads
    n_embd: int = 576           # embedding dimension
    mlp_ratio: float = 2.67     # Based on the MLP implementation (1536/576)
    dropout: float = 0.0        # No dropout used in the implementation

    # Training configuration
    batch_size: int = 1                 # Minimum batch size (from smollv2_lightning.py)
    num_workers: int = 0                # No additional workers, to save memory
    shuffle_buffer_size: int = 1000     # Shuffle buffer size for the dataset
    max_length: int = 2048              # Sequence length for training
    learning_rate: float = 3e-5         # From LitGPT initialization
    weight_decay: float = 1e-4          # From LitGPT initialization

    # Generation configuration
    max_new_tokens: int = 100           # From the generation code in training_step

    # Training control
    seed: int = 1337
    max_steps: int = 5000
    clear_cache_every: int = 1000       # Clear the GPU cache every N steps, 0 to disable

    # Generation parameters
    context_length: int = 10            # Number of tokens to use as context
    temperature: float = 1.0            # Sampling temperature
    top_k: int = 50                     # Top-k sampling parameter


@dataclass
class CheckpointConfig:
    """
    Configuration for checkpointing
    """
    checkpoint_dir: str = "checkpoints"
    checkpoint_every: int = 500             # Save a checkpoint every 500 steps
    save_last: bool = True
    save_top_k: int = 1                     # Changed from checkpoint_save_top_k
    save_weights_only: bool = True          # Changed from checkpoint_save_weights_only
    monitor: str = "train_loss"             # Monitor training loss for checkpointing
    mode: str = "min"                       # Mode for the monitored metric
    save_on_train_epoch_end: bool = False   # Whether to save at the end of each training epoch


@dataclass
class LoggingConfig:
    """
    Configuration for logging
    """
    log_every: int = 50          # Log metrics every 50 steps
    generate_every: int = 500    # Generate sample text every 500 steps
    log_metrics: bool = True
    log_progress_bar: bool = True
    log_model_summary: bool = True


@dataclass
class OptimizerConfig:
    """
    Configuration for the optimizer
    """
    optimizer: str = "AdamW"            # Using the AdamW optimizer
    learning_rate: float = 3e-5
    weight_decay: float = 1e-4
    max_lr: float = 3e-4                # max_lr = learning_rate * 10
    div_factor: float = 25.0            # From the OneCycleLR config
    final_div_factor: float = 100.0     # From the OneCycleLR config
    pct_start: float = 0.2              # From the OneCycleLR config

    # Additional optimizer settings
    optimizer_kwargs: dict = field(default_factory=lambda: {
        'betas': (0.9, 0.95),   # Default betas for AdamW
        'eps': 1e-8,            # Default epsilon value
    })
    three_phase: bool = False           # Use a three-phase learning rate schedule
    anneal_strategy: str = 'linear'     # Learning rate annealing strategy


@dataclass
class DataConfig:
    """
    Configuration for the dataset and tokenizer
    """
    # Dataset configuration
    dataset_path: str = "HuggingFaceTB/smollm-corpus"
    dataset_name: str = "cosmopedia-v2"

    # Tokenizer configuration
    tokenizer_path: str = "HuggingFaceTB/cosmo2-tokenizer"

    # DataLoader configuration
    batch_size: int = 32
    num_workers: int = 4
    shuffle_buffer_size: int = 1000
    max_length: int = 512

    # Dataset splits
    validation_split: float = 0.1       # 10% for validation
    pin_memory: bool = True
    streaming: bool = True              # Use streaming mode for the dataset


@dataclass
class TrainerConfig:
    """
    Configuration for the PyTorch Lightning Trainer
    """
    accelerator: str = 'auto'
    devices: int = 1
    precision: str = '16-mixed'
    log_every_n_steps: int = 10
    strategy: str = 'auto'
    deterministic: bool = False
    benchmark: bool = True
    enable_progress_bar: bool = True
    enable_model_summary: bool = True
    profiler: str = 'simple'
    gradient_clip_val: float = 1.0
    accumulate_grad_batches: int = 2
    val_check_interval: int = 1000      # Run validation every N training steps
    check_val_every_n_epoch: int = None # Disable epoch-based validation
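As a quick sanity check of the geometry these dataclasses imply, the illustrative snippet below (not part of the upload) derives the per-head and MLP hidden sizes the model code relies on; the values are taken from SmollmConfig above.

#!/usr/bin/env python3
# Illustrative check of the dimensions implied by SmollmConfig.
from config import SmollmConfig

cfg = SmollmConfig()
head_dim = cfg.n_embd // cfg.n_head                 # 576 / 9 = 64
mlp_hidden = int(cfg.n_embd * cfg.mlp_ratio) - 1    # matches MLP in smollmv2.py -> 1536
print(f"head_dim={head_dim}, mlp_hidden={mlp_hidden}, block_size={cfg.block_size}")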
inference.py
ADDED
@@ -0,0 +1,102 @@
#! /usr/bin/env python
"""
Inference script for the SmollmV2 model
Author: Shilpaj Bhalerao
Date: 2025-01-25
"""
# Third-Party Imports
import torch
from transformers import GPT2Tokenizer

# Local Imports
from smollv2_lightning import LitSmollmv2
from config import SmollmConfig, DataConfig


def load_model(checkpoint_path):
    """
    Load the trained model from a checkpoint.
    """
    model = LitSmollmv2.load_from_checkpoint(
        checkpoint_path,
        model_config=SmollmConfig,
        strict=False
    )
    model.eval()
    return model


def generate_text(model, prompt, max_new_tokens=100, temperature=0.8, top_p=0.9):
    """
    Generate text using the loaded model.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Initialize the tokenizer the same way as in CosmopediaDataModule
    tokenizer = GPT2Tokenizer.from_pretrained(DataConfig.tokenizer_path)
    tokenizer.pad_token = tokenizer.eos_token

    # Tokenize the input prompt
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

    # Generate tokens one at a time
    for _ in range(max_new_tokens):
        # Get the model's predictions
        with torch.no_grad():
            logits, _ = model.model(input_ids)

        # Get the next-token probabilities
        logits = logits[:, -1, :] / temperature
        probs = torch.nn.functional.softmax(logits, dim=-1)

        # Sample from the distribution with top-p (nucleus) filtering
        if top_p > 0:
            sorted_probs, sorted_indices = torch.sort(probs, descending=True)
            cumsum_probs = torch.cumsum(sorted_probs, dim=-1)
            sorted_indices_to_keep = cumsum_probs <= top_p
            sorted_indices_to_keep[..., 1:] = sorted_indices_to_keep[..., :-1].clone()
            sorted_indices_to_keep[..., 0] = 1
            indices_to_keep = torch.zeros_like(probs, dtype=torch.bool).scatter_(-1, sorted_indices, sorted_indices_to_keep)
            probs = torch.where(indices_to_keep, probs, torch.zeros_like(probs))
            probs = probs / probs.sum(dim=-1, keepdim=True)

        # Sample the next token
        next_token = torch.multinomial(probs, num_samples=1)

        # Append to input_ids
        input_ids = torch.cat([input_ids, next_token], dim=-1)

        # Stop if we generate an EOS token
        if next_token.item() == tokenizer.eos_token_id:
            break

    # Decode and return the generated text
    generated_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
    return generated_text


def main():
    # Path to your checkpoint
    checkpoint_path = "./checkpoints/last.ckpt"

    # Load the model
    model = load_model(checkpoint_path)
    print("Model loaded successfully!")

    # Example prompts for generation
    prompts = [
        "Once upon a time",
        "The future of artificial intelligence",
        "In the distant galaxy"
    ]

    # Generate text for each prompt
    for prompt in prompts:
        print("\nPrompt:", prompt)
        generated = generate_text(prompt=prompt, model=model)
        print("Generated:", generated)
        print("-" * 50)


if __name__ == "__main__":
    main()
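Besides running main() directly, the two helpers above can be reused from another script. The short sketch below (illustrative; the prompt and sampling values are arbitrary) loads the same default checkpoint path used in main() and generates from a custom prompt.

#! /usr/bin/env python
# Illustrative reuse of inference.py's helpers with custom sampling settings.
from inference import load_model, generate_text

model = load_model("./checkpoints/last.ckpt")
text = generate_text(model, "The tiny transformer said", max_new_tokens=50, temperature=0.7, top_p=0.95)
print(text)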
last.ckpt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4c7f0b043f2a6492e6f20568c0842d06c64fe20c95ddb03ca3a7fcab5f57e2d4
size 811285105
requirements.txt
ADDED
@@ -0,0 +1,15 @@
# Core ML libraries
torch>=2.0.0
transformers>=4.30.0
lightning>=2.0.0

# Web UI
gradio>=5.13.1

# HuggingFace Space utilities
huggingface-hub>=0.19.0
spaces>=0.19.0

# Optional dependencies for better performance
accelerate>=0.20.0
bitsandbytes>=0.41.0
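Before launching the Space locally, it can be useful to confirm these pins are actually installed. The snippet below is an optional check (not part of the upload) that prints the installed version of each listed distribution.

#!/usr/bin/env python3
# Optional sanity check: print installed versions of the pinned packages.
from importlib.metadata import version, PackageNotFoundError

for pkg in ["torch", "transformers", "lightning", "gradio",
            "huggingface-hub", "spaces", "accelerate", "bitsandbytes"]:
    try:
        print(f"{pkg}: {version(pkg)}")
    except PackageNotFoundError:
        print(f"{pkg}: not installed")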
smollmv2.py
ADDED
@@ -0,0 +1,243 @@
#! /usr/bin/env python
"""
SmollmV2 model implementation
Author: Shilpaj Bhalerao
Date: 2025-01-19
"""
# Third-Party Imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

# Local Imports
from config import SmollmConfig, RoPEConfig


class RoPEAttention:
    """
    Rotary Position Embedding attention with support for different Q/K dimensions
    """
    def __init__(self, head_dim, kv_dim, base=RoPEConfig.base):
        """
        Initialize rotary embeddings
        Args:
            head_dim: Dimension of the query head
            kv_dim: Dimension of the key/value head
            base: Base for the angle calculations (default: 10000)
        """
        super().__init__()

        # Generate the theta parameter for rotary embeddings for both Q and K dimensions
        inv_freq_k = 1.0 / (base ** (torch.arange(0, kv_dim, 2).float() / kv_dim))
        self.register_buffer('inv_freq_k', inv_freq_k)

        self.head_dim = head_dim
        self.kv_dim = kv_dim
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def _update_cos_sin_cache(self, x, seq_len):
        """Update cached cos and sin values for the given sequence length"""
        if seq_len != self.seq_len_cached:
            self.seq_len_cached = seq_len
            t = torch.arange(seq_len, device=x.device).type_as(self.inv_freq_k)
            freqs = torch.einsum('i,j->ij', t, self.inv_freq_k)
            emb = torch.cat((freqs, freqs), dim=-1).to(x.device)

            self.cos_cached = emb.cos()[None, None, :, :]
            self.sin_cached = emb.sin()[None, None, :, :]

    def _rotate_half(self, x):
        """Rotate half the hidden dims of the input."""
        x1 = x[..., :x.shape[-1] // 2]
        x2 = x[..., x.shape[-1] // 2:]
        return torch.cat((-x2, x1), dim=-1)

    def __call__(self, q, k):
        """
        Apply rotary embeddings to the input queries and keys
        Args:
            q: Query tensor of shape (batch, n_head, seq_len, head_dim)
            k: Key tensor of shape (batch, n_head, seq_len, kv_dim)
        Returns:
            q_rot: Rotated query tensor
            k_rot: Rotated key tensor
        """
        seq_len = q.shape[2]
        self._update_cos_sin_cache(k, seq_len)

        # Apply rotary embeddings to keys
        k_cos = self.cos_cached[..., :self.kv_dim]
        k_sin = self.sin_cached[..., :self.kv_dim]
        k_rot = (k * k_cos) + (self._rotate_half(k) * k_sin)

        # For queries, only apply rotation to the part that interacts with keys
        q_part = q[..., :self.kv_dim]
        q_cos = self.cos_cached[..., :self.kv_dim]
        q_sin = self.sin_cached[..., :self.kv_dim]
        q_rot_part = (q_part * q_cos) + (self._rotate_half(q_part) * q_sin)

        # Combine the rotated part with the unrotated parts of the query
        q_rot = torch.cat([q_rot_part, q[..., self.kv_dim:]], dim=-1)

        return q_rot, k_rot

    def register_buffer(self, name, tensor):
        """Helper function to register a buffer"""
        setattr(self, name, tensor)


class CausalSelfAttention(nn.Module):
    """
    Causal self-attention mechanism with reduced KV dimensions and RoPE
    """
    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0

        # Calculate dimensions
        self.head_dim = config.n_embd // config.n_head  # 576 / 9 = 64
        self.n_head = config.n_head
        self.n_embd = config.n_embd

        # Make kv_dim divisible by n_head (189 = 9 * 21 is the closest to 192 that is divisible by 9)
        self.kv_dim = 189
        self.kv_dim_per_head = self.kv_dim // self.n_head  # 21

        # Separate projections with reduced dimensions for k, v
        self.q_proj = nn.Linear(config.n_embd, config.n_embd, bias=False)
        self.k_proj = nn.Linear(config.n_embd, self.kv_dim, bias=False)  # 189 dimensions
        self.v_proj = nn.Linear(config.n_embd, self.kv_dim, bias=False)  # 189 dimensions

        # Output projection
        self.o_proj = nn.Linear(config.n_embd, config.n_embd, bias=False)

        # Rotary embeddings
        self.rope = RoPEAttention(self.head_dim, self.kv_dim_per_head)

    def forward(self, x):
        B, T, C = x.size()

        # Calculate query, key, values
        q = self.q_proj(x)
        k = self.k_proj(x)
        v = self.v_proj(x)

        # Reshape with exact dimensions
        q = q.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
        k = k.view(B, T, self.n_head, self.kv_dim_per_head).transpose(1, 2)
        v = v.view(B, T, self.n_head, self.kv_dim_per_head).transpose(1, 2)

        # Apply rotary embeddings
        q, k = self.rope(q, k)

        # Pad k and v to match the q dimension for attention
        k_pad = torch.zeros_like(q)
        v_pad = torch.zeros_like(q)
        k_pad[..., :self.kv_dim_per_head] = k
        v_pad[..., :self.kv_dim_per_head] = v

        # Flash attention
        y = F.scaled_dot_product_attention(q, k_pad, v_pad, is_causal=True)

        # Reshape back
        y = y.transpose(1, 2).contiguous().view(B, T, C)

        # Output projection
        y = self.o_proj(y)
        return y


class MLP(nn.Module):
    """
    MLP (Multi-Layer Perceptron) layer with a gate/up/down projection structure
    """
    def __init__(self, config):
        super().__init__()
        hidden_dim = int(config.n_embd * config.mlp_ratio) - 1
        self.gate_proj = nn.Linear(config.n_embd, hidden_dim, bias=False)
        self.up_proj = nn.Linear(config.n_embd, hidden_dim, bias=False)
        self.down_proj = nn.Linear(hidden_dim, config.n_embd, bias=False)
        self.down_proj.NANOGPT_SCALE_INIT = 1

    def forward(self, x):
        # SwiGLU activation as used in PaLM, Llama, etc.
        gate = self.gate_proj(x)
        up = self.up_proj(x)
        x = F.silu(gate) * up
        x = self.down_proj(x)
        return x


class Block(nn.Module):
    """
    Transformer block
    """
    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd, bias=False)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd, bias=False)
        self.mlp = MLP(config)

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x


class SmollmV2(nn.Module):
    """
    SmollmV2 model
    """
    def __init__(self, config=SmollmConfig()):
        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte=nn.Embedding(config.vocab_size, config.n_embd),
            h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f=nn.LayerNorm(config.n_embd, bias=False),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # Weight sharing between the token embedding and the LM head
        self.transformer.wte.weight = self.lm_head.weight

        # Weight initialization
        self.apply(self._init_weights)

        # Compile the model if the torch version supports it
        if hasattr(torch, 'compile'):
            self.forward = torch.compile(self.forward)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            std = 0.02
            if hasattr(module, 'NANOGPT_SCALE_INIT'):
                std *= (2 * self.config.n_layer) ** -0.5
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.04)

    def forward(self, idx, targets=None):
        # idx is of shape (B, T)
        B, T = idx.size()
        assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
        # Forward the token embeddings (positions are handled by RoPE inside attention)
        tok_emb = self.transformer.wte(idx)  # token embeddings of shape (B, T, n_embd)
        x = tok_emb
        # Forward the blocks of the transformer
        for block in self.transformer.h:
            x = block(x)
        # Forward the final layernorm and the classifier
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)  # (B, T, vocab_size)
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss
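A small shape check can confirm the forward pass wiring. The sketch below is illustrative only: it shrinks depth and vocabulary via dataclasses.replace while keeping n_embd=576 and n_head=9, because CausalSelfAttention hard-codes kv_dim=189 for that geometry; note that SmollmV2 wraps its forward in torch.compile when available, so the first call may take a moment.

#! /usr/bin/env python
# Illustrative shape check for SmollmV2 with a shrunken config.
import dataclasses
import torch
from config import SmollmConfig
from smollmv2 import SmollmV2

tiny_cfg = dataclasses.replace(SmollmConfig(), n_layer=2, vocab_size=1000, block_size=128)
model = SmollmV2(tiny_cfg)
idx = torch.randint(0, tiny_cfg.vocab_size, (1, 16))   # (B, T)
logits, loss = model(idx, targets=idx)
print(logits.shape, loss.item())                        # expect torch.Size([1, 16, 1000]) and a scalar loss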
smollv2_lightning.py
ADDED
@@ -0,0 +1,498 @@
#!/usr/bin/env python
"""
Lightning module for SmollmV2 model training
"""

# Standard Library Imports
import os
from typing import Tuple

# Third-Party Imports
import torch
import torch.nn as nn
import torch.optim as optim
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger
import matplotlib.pyplot as plt
from tensorboard.backend.event_processing import event_accumulator
import time
import numpy as np
from contextlib import nullcontext
import torch.nn.functional as F

# Local Imports
from config import (SmollmConfig, OptimizerConfig, CheckpointConfig,
                    LoggingConfig, TrainerConfig)
from smollmv2 import SmollmV2
from cosmopedia_datamodule import CosmopediaDataModule


class LitSmollmv2(pl.LightningModule):
    """
    Lightning module for SmollmV2 model training
    """
    def __init__(
            self,
            learning_rate=OptimizerConfig.learning_rate,
            weight_decay=OptimizerConfig.weight_decay,
            total_epochs=None,
            total_steps=None,
            interupt_steps=SmollmConfig.max_steps,
            compile_model=True
    ):
        """
        Constructor
        :param learning_rate: Learning rate for the optimizer
        :param weight_decay: Weight decay for the optimizer
        :param total_epochs: Total number of epochs (optional)
        :param total_steps: Total number of steps (optional)
        :param compile_model: Whether to compile the model for faster training
        Note: Provide either total_epochs or total_steps, not both
        """
        super().__init__()
        self.save_hyperparameters()

        if total_epochs is None and total_steps is None:
            raise ValueError("Must provide either total_epochs or total_steps")
        if total_epochs is not None and total_steps is not None:
            raise ValueError("Provide either total_epochs or total_steps, not both")

        # Set seeds from config
        torch.manual_seed(SmollmConfig.seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(SmollmConfig.seed)

        # Initialize the model
        self.model = SmollmV2(SmollmConfig())

        # Compile the model if requested and supported
        if compile_model and hasattr(torch, 'compile'):
            print("Compiling model for faster training...")
            self.model = torch.compile(self.model)

        # Print total model parameters
        total_params = sum(p.numel() for p in self.model.parameters())
        print(f"Total model parameters: {total_params:,}\n")

        # OneCycleLR parameters from OptimizerConfig
        self.max_lr = OptimizerConfig.max_lr
        self.div_factor = OptimizerConfig.div_factor
        self.final_div_factor = OptimizerConfig.final_div_factor
        self.pct_start = OptimizerConfig.pct_start
        self.total_epochs = total_epochs
        self.total_steps = total_steps

        # Add performance monitoring attributes
        self.iter_num = 0
        self.iter_time = 0.0
        self.tokens_processed = 0
        self.interupt_steps = interupt_steps

    def on_load_checkpoint(self, checkpoint):
        """Restore iter_num when loading from a checkpoint"""
        if 'iter_num' in checkpoint:
            self.iter_num = checkpoint['iter_num']

    def on_save_checkpoint(self, checkpoint):
        """Save iter_num in the checkpoint"""
        checkpoint['iter_num'] = self.iter_num

    def forward(self, x, targets=None):
        """
        Method to forward the input through the model
        """
        return self.model(x, targets)

    def training_step(self, batch, batch_idx):
        """
        Method to perform a training step with performance monitoring
        """
        try:
            # Stop training at max steps from config
            if self.iter_num >= self.interupt_steps:
                self.trainer.should_stop = True
                return None

            # Start timing
            t0 = time.time()

            # Process batch
            input_ids = batch['input_ids']
            labels = batch['labels']
            attention_mask = batch['attention_mask']

            # Clear cache before the forward pass
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            # Forward pass
            logits, loss = self(input_ids, targets=labels)

            # Calculate tokens processed
            tokens_per_iter = np.prod(input_ids.shape)
            self.tokens_processed += tokens_per_iter

            # Ensure CUDA synchronization after the forward pass
            if torch.cuda.is_available():
                torch.cuda.synchronize()

            # Calculate iteration time
            dt = time.time() - t0
            self.iter_time += dt

            # Log metrics
            self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
            self.log('lr', self.trainer.optimizers[0].param_groups[0]['lr'], on_step=True)

            # Generate a sample prediction
            if self.iter_num % LoggingConfig.generate_every == 0:
                # Get a sample input from the batch
                context_length = SmollmConfig.context_length  # Number of tokens to use as context
                sample_input = input_ids[0:1, :context_length]

                # Generate a prediction
                self.model.eval()
                with torch.no_grad():
                    max_new_tokens = SmollmConfig.max_new_tokens
                    temperature = SmollmConfig.temperature
                    top_k = SmollmConfig.top_k

                    for _ in range(max_new_tokens):
                        # Get model predictions
                        logits, _ = self(sample_input)
                        logits = logits[:, -1, :] / temperature

                        # Apply top-k sampling
                        if top_k is not None:
                            v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                            logits[logits < v[:, [-1]]] = -float('Inf')

                        probs = F.softmax(logits, dim=-1)
                        next_token = torch.multinomial(probs, num_samples=1)
                        sample_input = torch.cat([sample_input, next_token], dim=1)

                    # Convert tokens to text using the tokenizer from the datamodule
                    try:
                        input_text = self.trainer.datamodule.tokenizer.decode(sample_input[0, :10].tolist())
                        generated_text = self.trainer.datamodule.tokenizer.decode(sample_input[0, 10:].tolist())
                        print(f"\nStep {self.iter_num} - Sample Generation:")
                        print(f"Input: {input_text}")
                        print(f"Generated: {generated_text}\n")
                    except Exception as e:
                        print(f"Error decoding text: {str(e)}")

                self.model.train()  # Set back to training mode

            # Log performance metrics
            if self.iter_num % LoggingConfig.log_every == 0:
                tokens_per_sec = self.tokens_processed / self.iter_time if self.iter_time > 0 else 0

                self.log('tokens_per_sec', tokens_per_sec, on_step=True)
                self.log('iter_time_ms', dt * 1000, on_step=True)

                print(f"\nstep {self.iter_num} | loss: {loss.item():.4f} | dt: {dt*1000:.2f}ms | tok/sec: {tokens_per_sec:.2f}")

                if torch.cuda.is_available():
                    self.log('gpu_memory', torch.cuda.memory_allocated() / 1e9, on_step=True)
                    self.log('gpu_memory_reserved', torch.cuda.memory_reserved() / 1e9, on_step=True)
                    print(f"GPU Memory: {torch.cuda.memory_allocated() / 1e9:.2f}GB / {torch.cuda.memory_reserved() / 1e9:.2f}GB")

                # Clear the GPU cache periodically if enabled
                if SmollmConfig.clear_cache_every > 0 and self.iter_num % SmollmConfig.clear_cache_every == 0:
                    if torch.cuda.is_available():
                        torch.cuda.empty_cache()

                self.tokens_processed = 0
                self.iter_time = 0.0

            self.iter_num += 1
            return loss

        except RuntimeError as e:
            if "out of memory" in str(e):
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                print(f"WARNING: out of memory - {str(e)}")
                return None
            raise e

    def validation_step(self, batch, batch_idx):
        """
        Method to perform a validation step
        """
        # Start timing for validation
        t0 = time.time()

        # Ensure CUDA synchronization for accurate timing
        if torch.cuda.is_available():
            torch.cuda.synchronize()

        # Process batch - updated for the Cosmopedia format
        input_ids = batch['input_ids']
        labels = batch['labels']
        attention_mask = batch['attention_mask']

        # Forward pass
        logits, loss = self(input_ids, targets=labels)

        # Ensure CUDA synchronization after the forward pass
        if torch.cuda.is_available():
            torch.cuda.synchronize()

        # Calculate validation time
        dt = time.time() - t0

        # Log metrics
        self.log('val_loss', loss, on_step=False, on_epoch=True, prog_bar=True, sync_dist=True)

        if batch_idx == 0:  # Only print for the first batch
            print(f"\nValidation - loss: {loss.item():.4f} | dt: {dt*1000:.2f}ms")
            if torch.cuda.is_available():
                print(f"GPU Memory: {torch.cuda.memory_allocated() / 1e9:.2f}GB / {torch.cuda.memory_reserved() / 1e9:.2f}GB")

        return loss

    def configure_optimizers(self):
        """
        Method to configure the optimizer and scheduler
        """
        # Create an instance of OptimizerConfig
        optim_config = OptimizerConfig()

        optimizer = getattr(optim, optim_config.optimizer)(
            self.parameters(),
            lr=self.hparams.learning_rate,
            weight_decay=self.hparams.weight_decay,
            **optim_config.optimizer_kwargs
        )

        # Calculate total steps
        if self.total_steps is None:
            total_steps = len(self.trainer.datamodule.train_dataloader()) * self.total_epochs
        else:
            total_steps = self.total_steps

        scheduler = {
            'scheduler': optim.lr_scheduler.OneCycleLR(
                optimizer,
                max_lr=self.max_lr,
                total_steps=total_steps,
                pct_start=self.pct_start,
                div_factor=self.div_factor,
                final_div_factor=self.final_div_factor,
                three_phase=optim_config.three_phase,
                anneal_strategy=optim_config.anneal_strategy
            ),
            'interval': 'step'
        }

        return [optimizer], [scheduler]

    def on_train_epoch_end(self):
        """
        Called at the end of the training epoch
        """
        # Reset performance counters at epoch end
        self.tokens_processed = 0
        self.iter_time = 0.0


def plot_learning_rate(log_dir):
    """
    Plot the learning rate from TensorBoard logs
    """
    event_files = []
    for root, dirs, files in os.walk(log_dir):
        for file in files:
            if "events.out.tfevents" in file:
                event_files.append(os.path.join(root, file))

    lr_data = []
    steps = []

    for event_file in event_files:
        ea = event_accumulator.EventAccumulator(
            event_file,
            size_guidance={'scalars': 0}
        )
        ea.Reload()

        if 'lr' in ea.Tags()['scalars']:
            events = ea.Scalars('lr')
            for event in events:
                lr_data.append(event.value)
                steps.append(event.step)

    if lr_data:
        plt.figure(figsize=(10, 6))
        plt.plot(steps, lr_data, '-', linewidth=2)
        plt.title('Learning Rate Schedule')
        plt.xlabel('Training Steps')
        plt.ylabel('Learning Rate')
        plt.grid(True)
        plt.margins(x=0.02)
        plt.ticklabel_format(style='sci', axis='x', scilimits=(0, 0))
        plt.savefig('learning_rate_schedule.png', dpi=300, bbox_inches='tight')
        plt.close()


def train_model(epochs=None, steps=None, ckpt_path=None, interupt_steps=SmollmConfig.max_steps):
    """
    Train the model for the specified number of epochs or steps
    :param epochs: Number of epochs to train (optional)
    :param steps: Number of steps to train (optional)
    :param ckpt_path: Path to a checkpoint for resuming training
    :param interupt_steps: Number of steps after which to interrupt training
    Note: Provide either epochs or steps, not both
    """
    # Set compilation mode for PyTorch 2.0+
    if hasattr(torch, 'compile'):
        torch._dynamo.config.suppress_errors = True
        torch._dynamo.config.verbose = False

    torch.set_float32_matmul_precision('high')

    # Initialize the data module with reduced workers and batch size
    data_module = CosmopediaDataModule(
        batch_size=SmollmConfig.batch_size,        # Reduced from 32
        num_workers=SmollmConfig.num_workers,      # Reduced from 4
        shuffle_buffer_size=SmollmConfig.shuffle_buffer_size,
        max_length=SmollmConfig.block_size
    )

    # Initialize the model
    model = LitSmollmv2(total_epochs=epochs, total_steps=steps, interupt_steps=interupt_steps)

    # Setup callbacks with reduced frequency
    checkpoint_callback = ModelCheckpoint(
        dirpath='checkpoints',
        filename='smollmv2-{step:05d}-{val_loss:.2f}',
        save_top_k=CheckpointConfig.save_top_k,                  # Save only the best model
        monitor=CheckpointConfig.monitor,                        # Monitor training loss instead of validation loss
        mode=CheckpointConfig.mode,
        save_last=CheckpointConfig.save_last,
        every_n_train_steps=CheckpointConfig.checkpoint_every,   # Reduced checkpoint frequency
        save_on_train_epoch_end=CheckpointConfig.save_on_train_epoch_end
    )

    lr_monitor = LearningRateMonitor(logging_interval='step')

    # Setup logger
    logger = TensorBoardLogger("lightning_logs", name="smollmv2", log_graph=True)

    # Add a gradient scaler for mixed precision training
    scaler = torch.cuda.amp.GradScaler() if torch.cuda.is_available() else None

    # Initialize the trainer with performance monitoring
    trainer_kwargs = {
        'accelerator': TrainerConfig.accelerator,
        'devices': TrainerConfig.devices,
        'callbacks': [checkpoint_callback, lr_monitor],
        'logger': logger,
        'precision': TrainerConfig.precision,
        'log_every_n_steps': TrainerConfig.log_every_n_steps,
        'strategy': TrainerConfig.strategy,
        'deterministic': TrainerConfig.deterministic,
        'benchmark': TrainerConfig.benchmark,
        'enable_progress_bar': TrainerConfig.enable_progress_bar,
        'enable_model_summary': TrainerConfig.enable_model_summary,
        'profiler': TrainerConfig.profiler,
        'gradient_clip_val': TrainerConfig.gradient_clip_val,
        'accumulate_grad_batches': TrainerConfig.accumulate_grad_batches,
        'val_check_interval': TrainerConfig.val_check_interval,
        'check_val_every_n_epoch': TrainerConfig.check_val_every_n_epoch
    }

    # Add either max_epochs or max_steps
    if epochs is not None:
        trainer_kwargs['max_epochs'] = epochs
    else:
        trainer_kwargs['max_steps'] = steps

    trainer = pl.Trainer(**trainer_kwargs)

    # Train with performance monitoring
    print("\nStarting training with performance monitoring...")
    print("Format: step | loss | iteration time | tokens per second | GPU memory\n")

    # Enable garbage collection
    import gc
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    try:
        trainer.fit(model, data_module, ckpt_path=ckpt_path)
    except KeyboardInterrupt:
        print("\nTraining interrupted by user. Saving checkpoint...")
        if not os.path.exists('checkpoints'):
            os.makedirs('checkpoints')
        trainer.save_checkpoint("checkpoints/interrupted_training.ckpt")
        print("Checkpoint saved. Exiting...")
    except Exception as e:
        print(f"An error occurred during training: {str(e)}")
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        raise e

    return checkpoint_callback.best_model_path


def get_latest_checkpoint():
    """
    Find the latest checkpoint in the checkpoints directory
    """
    checkpoint_dir = 'checkpoints'
    if not os.path.exists(checkpoint_dir):
        return None

    checkpoints = [f for f in os.listdir(checkpoint_dir) if f.endswith('.ckpt')]
    if not checkpoints:
        return None

    latest_checkpoint = max(
        [os.path.join(checkpoint_dir, f) for f in checkpoints],
        key=os.path.getmtime
    )
    return latest_checkpoint


def main(interupt_steps=SmollmConfig.max_steps):
    """
    Main function to handle the training workflow
    """
    # Ask the user for the training mode
    mode = input("Train by epochs or steps? (e/s): ").lower()

    if mode == 'e':
        total_epochs = int(input("Enter number of epochs: "))
        steps = None
    else:
        steps = int(input("Enter number of steps: "))
        total_epochs = None

    try:
        latest_checkpoint = get_latest_checkpoint()

        if latest_checkpoint and os.path.exists(latest_checkpoint):
            print(f"\nFound existing checkpoint: {latest_checkpoint}")
            user_input = input("Resume training from checkpoint? (y/n): ").lower()

            if user_input == 'y':
                print(f"\nResuming training from checkpoint: {latest_checkpoint}")
                train_model(epochs=total_epochs, steps=steps, ckpt_path=latest_checkpoint, interupt_steps=interupt_steps)
            else:
                print("\nStarting fresh training...")
                best_model_path = train_model(epochs=total_epochs, steps=steps, interupt_steps=interupt_steps)
        else:
            print("\nNo checkpoints found. Starting fresh training...")
            best_model_path = train_model(epochs=total_epochs, steps=steps, interupt_steps=interupt_steps)

        print("\nGenerating learning rate plot...")
        plot_learning_rate("lightning_logs")
        print("Learning rate plot saved as 'learning_rate_schedule.png'")

    except Exception as e:
        print(f"An error occurred during training: {str(e)}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()
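main() above drives training interactively via input() prompts. For scripted runs, train_model() can be called directly, as in the sketch below (illustrative; the step count mirrors SmollmConfig.max_steps). Note that smollv2_lightning.py imports cosmopedia_datamodule, which is not part of this upload, so that module must be available on the path for any training run.

#!/usr/bin/env python
# Illustrative non-interactive training entry point using the helpers above.
from smollv2_lightning import train_model, plot_learning_rate

if __name__ == "__main__":
    best_ckpt = train_model(steps=5000, interupt_steps=5000)   # step-based run
    print(f"Best checkpoint: {best_ckpt}")
    plot_learning_rate("lightning_logs")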