Commit 416db21 · verified · 1 Parent(s): ac8a885
Committed by padmanabhbosamia

Upload 5 files

Files changed (5):
  1. config.py +195 -0
  2. config.yaml +81 -0
  3. model.py +522 -0
  4. smol-lm2-final.ckpt +3 -0
  5. train_script.py +264 -0
config.py ADDED
@@ -0,0 +1,195 @@
import os
import yaml
from dataclasses import dataclass, field
from typing import List, Optional

@dataclass
class ModelConfig:
    type: str = "custom"
    name: str = "smollm2_transformer"
    tokenizer_name: str = "HuggingFaceTB/SmolLM2-135M"
    vocab_size: int = 49152
    hidden_size: int = 576
    num_attention_heads: int = 9
    num_key_value_heads: int = 3
    num_hidden_layers: int = 30
    intermediate_size: int = 1536
    hidden_act: str = "gelu"
    max_position_embeddings: int = 512
    initializer_range: float = 0.02
    rms_norm_eps: float = 1e-5
    use_cache: bool = True
    pad_token_id: Optional[int] = None
    max_length: int = 512

    def __post_init__(self):
        # Ensure numeric values are proper types
        self.vocab_size = int(self.vocab_size)
        self.hidden_size = int(self.hidden_size)
        self.num_attention_heads = int(self.num_attention_heads)
        self.num_key_value_heads = int(self.num_key_value_heads)
        self.num_hidden_layers = int(self.num_hidden_layers)
        self.intermediate_size = int(self.intermediate_size)
        self.max_position_embeddings = int(self.max_position_embeddings)
        self.initializer_range = float(self.initializer_range)
        self.rms_norm_eps = float(self.rms_norm_eps)
        self.max_length = int(self.max_length)

@dataclass
class OptimizerConfig:
    type: str = "adamW"
    weight_decay: float = 0.01
    adam_beta1: float = 0.9
    adam_beta2: float = 0.95
    adam_eps: float = 1e-8
    torch_adam_is_fused: bool = True
    clip_grad: float = 1.0
    accumulate_grad_in_fp32: bool = True

    def __post_init__(self):
        # Ensure numeric values are proper floats
        self.weight_decay = float(self.weight_decay)
        self.adam_beta1 = float(self.adam_beta1)
        self.adam_beta2 = float(self.adam_beta2)
        self.adam_eps = float(self.adam_eps)
        self.clip_grad = float(self.clip_grad)

@dataclass
class SchedulerConfig:
    type: str = "one_cycle"
    learning_rate: float = 0.003
    warmup_steps: int = 100
    max_lr: float = 0.003
    pct_start: float = 0.02
    anneal_strategy: str = "cos"
    cycle_momentum: bool = False
    div_factor: float = 25.0
    final_div_factor: float = 1000.0

@dataclass
class TrainingConfig:
    output_dir: str = "./results"
    batch_size: int = 2
    micro_batch_size: int = 1
    gradient_accumulation_steps: int = 4
    sequence_length: int = 512
    learning_rate: float = 0.003
    max_steps: int = 5050
    first_phase_steps: int = 5000
    second_phase_steps: int = 50
    sample_frequency: int = 500
    second_phase_sample_frequency: int = 10
    logging_dir: str = "./logs"
    logging_steps: int = 1
    save_steps: int = 500
    checkpoint_dir: str = "checkpoints"
    sample_prompt: str = "Explain what machine learning is:"
    max_generate_length: int = 100

@dataclass
class HardwareConfig:
    precision: str = "16-mixed"
    accelerator: str = "gpu"
    devices: int = 1
    strategy: str = "auto"
    gradient_clip: float = 1.0

@dataclass
class DatasetConfig:
    name: str
    path: str
    subset: str
    weight: float
    split_ratio: float = 1.0  # Default to using full dataset

@dataclass
class DataLoadingConfig:
    num_workers: int = 2
    batch_size: int = 32
    pin_memory: bool = True
    prefetch_factor: int = 2
    persistent_workers: bool = True

@dataclass
class DataConfig:
    datasets: List[DatasetConfig] = field(default_factory=list)
    loading: DataLoadingConfig = field(default_factory=DataLoadingConfig)

class SmolLM2Config:
    def __init__(self, config_path: str = None):
        self.model = ModelConfig()
        self.optimizer = OptimizerConfig()
        self.scheduler = SchedulerConfig()
        self.training = TrainingConfig()
        self.hardware = HardwareConfig()
        self.data = DataConfig()

        # Default dataset configuration
        self.data.datasets = [
            DatasetConfig(
                name="wikitext",
                path="wikitext",
                subset="wikitext-2-raw-v1",
                weight=1.0
            )
        ]

        if config_path and os.path.exists(config_path):
            self.load_from_yaml(config_path)

    def load_from_yaml(self, config_path: str):
        with open(config_path, 'r') as f:
            config_dict = yaml.safe_load(f)

        # Update configurations from yaml
        if 'model' in config_dict:
            for k, v in config_dict['model'].items():
                setattr(self.model, k, v)

        if 'optimizer' in config_dict:
            for k, v in config_dict['optimizer'].items():
                setattr(self.optimizer, k, v)

        if 'scheduler' in config_dict:
            for k, v in config_dict['scheduler'].items():
                setattr(self.scheduler, k, v)

        if 'training' in config_dict:
            for k, v in config_dict['training'].items():
                setattr(self.training, k, v)

        if 'hardware' in config_dict:
            for k, v in config_dict['hardware'].items():
                setattr(self.hardware, k, v)

        if 'data' in config_dict:
            for k, v in config_dict['data'].items():
                if k == 'datasets':
                    # Datasets listed in the YAML replace the defaults,
                    # since downstream code only reads datasets[0]
                    self.data.datasets = [DatasetConfig(**dataset) for dataset in v]
                elif k == 'loading':
                    for lk, lv in config_dict['data']['loading'].items():
                        setattr(self.data.loading, lk, lv)

    def save_to_yaml(self, config_path: str):
        config_dict = {
            'model': self.model.__dict__,
            'optimizer': self.optimizer.__dict__,
            'scheduler': self.scheduler.__dict__,
            'training': self.training.__dict__,
            'hardware': self.hardware.__dict__,
            # Serialize the nested data config as plain dicts so yaml.dump
            # does not choke on the dataclass instances
            'data': {
                'datasets': [d.__dict__ for d in self.data.datasets],
                'loading': self.data.loading.__dict__,
            },
        }

        with open(config_path, 'w') as f:
            yaml.dump(config_dict, f, default_flow_style=False)

    def __repr__(self):
        return f"SmolLM2Config(\n" \
               f" model={self.model}\n" \
               f" optimizer={self.optimizer}\n" \
               f" scheduler={self.scheduler}\n" \
               f" training={self.training}\n" \
               f" hardware={self.hardware}\n" \
               f" data={self.data}\n" \
               f")"
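
A minimal usage sketch for the config loader above, assuming config.py and config.yaml sit in the working directory; the derived names (head_dim, effective_batch) and the output filename are illustrative, not part of the upload:

    from config import SmolLM2Config

    # Load the dataclass defaults, then apply YAML overrides if the file exists
    cfg = SmolLM2Config("config.yaml")

    # Quantities the rest of the repo relies on
    head_dim = cfg.model.hidden_size // cfg.model.num_attention_heads
    effective_batch = cfg.training.batch_size * cfg.training.gradient_accumulation_steps
    print(cfg.model.tokenizer_name, head_dim, effective_batch)

    # Round-trip the merged configuration back to disk
    cfg.save_to_yaml("config_resolved.yaml")
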
config.yaml ADDED
@@ -0,0 +1,81 @@
model:
  type: "custom"
  name: "smollm2_transformer"
  tokenizer_name: "gpt2"
  vocab_size: 50257
  hidden_size: 256
  num_attention_heads: 4
  num_key_value_heads: 2
  num_hidden_layers: 6
  intermediate_size: 512
  hidden_act: "gelu"
  max_position_embeddings: 256
  initializer_range: 0.02
  rms_norm_eps: 1.0e-5
  use_cache: true
  pad_token_id: null

optimizer:
  type: "adamW"
  weight_decay: 0.01
  adam_beta1: 0.9
  adam_beta2: 0.95
  adam_eps: 1.0e-8
  torch_adam_is_fused: true
  clip_grad: 1.0
  accumulate_grad_in_fp32: true

scheduler:
  type: "one_cycle"
  learning_rate: 0.001
  warmup_steps: 50
  max_lr: 0.001
  pct_start: 0.02
  anneal_strategy: "cos"
  cycle_momentum: false
  div_factor: 25.0
  final_div_factor: 1000.0

training:
  output_dir: "./results"
  batch_size: 4
  micro_batch_size: 2
  gradient_accumulation_steps: 2
  sequence_length: 256
  learning_rate: 0.001
  max_steps: 5050                    # Total steps (5000 + 50)
  first_phase_steps: 5000            # Initial training phase
  second_phase_steps: 50             # Fine-tuning phase
  sample_frequency: 100              # Sample every 100 steps in first phase
  second_phase_sample_frequency: 5   # Sample more frequently in second phase
  logging_dir: "./logs"
  logging_steps: 1
  save_steps: 100
  checkpoint_dir: "checkpoints"
  sample_prompt: "Explain what machine learning is:"
  max_generate_length: 50

hardware:
  precision: "16-mixed"
  accelerator: "gpu"
  devices: 1
  strategy: "auto"
  gradient_clip: 1.0
  cuda_memory_fraction: 0.9
  allow_tf32: true
  benchmark: true
  deterministic: false

data:
  datasets:
    - name: "wikitext"
      path: "wikitext"
      subset: "wikitext-103-raw-v1"
      split_ratio: 0.01              # Use only 1% of the dataset
      weight: 1.0
  loading:
    num_workers: 2
    batch_size: 16
    pin_memory: true
    prefetch_factor: 2
    persistent_workers: true
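
A quick sanity check of the values above, assuming the file is loaded through SmolLM2Config from config.py; the assertions only restate arithmetic implied by the model and training sections (hidden_size must split evenly across heads, and key/value heads must divide the query heads for the grouped attention in model.py):

    from config import SmolLM2Config

    cfg = SmolLM2Config("config.yaml")
    m, t = cfg.model, cfg.training

    # 256 hidden dims over 4 heads -> 64-dim heads; 4 query heads share 2 kv heads
    assert m.hidden_size % m.num_attention_heads == 0           # head_dim = 64
    assert m.num_attention_heads % m.num_key_value_heads == 0   # GQA group size = 2

    # Each optimizer step sees batch_size * gradient_accumulation_steps sequences: 4 * 2 = 8
    print("effective batch:", t.batch_size * t.gradient_accumulation_steps)
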
model.py ADDED
@@ -0,0 +1,522 @@
import pytorch_lightning as pl
import torch
import torch.nn.functional as F
from torch.optim import AdamW
from torch.optim.lr_scheduler import OneCycleLR
from transformers import AutoTokenizer
import torch.nn as nn
import math
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
import os

def _init_weights(module, std=0.02):
    if isinstance(module, nn.Linear):
        module.weight.data.normal_(mean=0.0, std=std)
    elif isinstance(module, nn.Embedding):
        module.weight.data.normal_(mean=0.0, std=std)

class RMSNorm(nn.Module):
    def __init__(self, dim, eps=1e-5):
        super().__init__()
        self.eps = float(eps)  # Ensure eps is a float
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        norm = torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
        return x * norm * self.weight

class RotaryEmbedding(nn.Module):
    def __init__(self, dim, max_position_embeddings=2048, base=10000):
        super().__init__()
        self.dim = dim
        self.max_position_embeddings = int(max_position_embeddings)  # Convert to int
        self.base = base

        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer("inv_freq", inv_freq)

        t = torch.arange(self.max_position_embeddings).type_as(self.inv_freq)
        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos()[None, None, :, :])
        self.register_buffer("sin_cached", emb.sin()[None, None, :, :])

    def forward(self, x, seq_len=None):
        # Convert seq_len to int and ensure it's a valid value
        seq_len = int(seq_len) if seq_len is not None else x.size(1)
        if seq_len > self.max_position_embeddings:
            seq_len = self.max_position_embeddings

        return (
            self.cos_cached[:, :, :seq_len, :],
            self.sin_cached[:, :, :seq_len, :]
        )

def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

def apply_rotary_pos_emb(q, k, cos, sin):
    # Ensure proper broadcasting
    cos = cos[:, :, :q.size(2), :]  # [batch, 1, seq_len, dim]
    sin = sin[:, :, :q.size(2), :]  # [batch, 1, seq_len, dim]

    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed

class Attention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.num_attention_heads = config.num_attention_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.head_dim = self.hidden_size // self.num_attention_heads

        self.q_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False)

    def forward(self, hidden_states, cos, sin, attention_mask=None):
        batch_size, seq_length, _ = hidden_states.shape

        q = self.q_proj(hidden_states)
        k = self.k_proj(hidden_states)
        v = self.v_proj(hidden_states)

        # Reshape for attention computation
        q = q.view(batch_size, seq_length, self.num_attention_heads, self.head_dim)
        k = k.view(batch_size, seq_length, self.num_key_value_heads, self.head_dim)
        v = v.view(batch_size, seq_length, self.num_key_value_heads, self.head_dim)

        # Transpose for attention computation
        q = q.transpose(1, 2)  # [batch, num_heads, seq_len, head_dim]
        k = k.transpose(1, 2)  # [batch, num_kv_heads, seq_len, head_dim]
        v = v.transpose(1, 2)  # [batch, num_kv_heads, seq_len, head_dim]

        # Apply rotary embeddings
        q, k = apply_rotary_pos_emb(q, k, cos, sin)

        # Repeat k/v heads if num_key_value_heads < num_attention_heads
        if self.num_key_value_heads != self.num_attention_heads:
            k = k.repeat_interleave(self.num_attention_heads // self.num_key_value_heads, dim=1)
            v = v.repeat_interleave(self.num_attention_heads // self.num_key_value_heads, dim=1)

        # Compute attention
        attn_weights = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)

        if attention_mask is not None:
            attn_weights = attn_weights + attention_mask

        attn_weights = F.softmax(attn_weights, dim=-1)

        # Compute output
        output = torch.matmul(attn_weights, v)
        output = output.transpose(1, 2).contiguous()  # [batch, seq_len, num_heads, head_dim]
        output = output.view(batch_size, seq_length, -1)

        return self.o_proj(output)

class MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.gate_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
        self.up_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
        self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)
        self.act_fn = nn.SiLU()

    def forward(self, x):
        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))

class DecoderLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.self_attn = Attention(config)
        self.mlp = MLP(config)
        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(self, hidden_states, cos, sin, attention_mask=None):
        # Self attention
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
        hidden_states = self.self_attn(hidden_states, cos, sin, attention_mask)
        hidden_states = residual + hidden_states

        # MLP
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        return hidden_states

class SmolLM2(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config

        # Token embeddings
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)

        # Initialize transformer layers
        self.layers = nn.ModuleList([
            DecoderLayer(config) for _ in range(config.num_hidden_layers)
        ])

        # Final layer norm
        self.norm = RMSNorm(config.hidden_size, eps=float(config.rms_norm_eps))

        # Initialize rotary embeddings
        self.rotary_emb = RotaryEmbedding(
            config.hidden_size // config.num_attention_heads,
            max_position_embeddings=config.max_position_embeddings
        )

        # Initialize weights
        self.apply(lambda p: _init_weights(p, std=config.initializer_range))

    def forward(self, input_ids, attention_mask=None):
        try:
            # Ensure inputs are on the correct device
            device = input_ids.device
            batch_size, seq_length = input_ids.shape

            # Input validation
            if seq_length > self.config.max_position_embeddings:
                raise ValueError(f"Input sequence length {seq_length} exceeds maximum position embeddings {self.config.max_position_embeddings}")

            # Get embeddings
            hidden_states = self.embed_tokens(input_ids)

            # Get position embeddings
            cos, sin = self.rotary_emb(hidden_states, seq_length)

            # Generate attention mask if none provided
            if attention_mask is None:
                attention_mask = torch.ones(
                    (batch_size, seq_length),
                    dtype=torch.bool,
                    device=device
                )
            else:
                # Convert to boolean if it's not already and ensure contiguous memory
                attention_mask = attention_mask.bool().contiguous()

            # Create causal mask
            causal_mask = torch.triu(
                torch.ones((seq_length, seq_length), device=device),
                diagonal=1
            ).bool()

            # Create attention mask [batch_size, 1, seq_length, seq_length]
            attention_mask = attention_mask.view(batch_size, 1, 1, seq_length)
            attention_mask = attention_mask.expand(batch_size, 1, seq_length, seq_length)

            # Prepare causal mask
            causal_mask = causal_mask.view(1, 1, seq_length, seq_length)

            # Combine masks
            mask = attention_mask & ~causal_mask

            # Convert boolean mask to float mask
            mask = mask.to(dtype=hidden_states.dtype)
            mask = (1.0 - mask) * torch.finfo(hidden_states.dtype).min

            # Apply transformer layers
            for layer in self.layers:
                hidden_states = layer(hidden_states, cos, sin, mask)

            # Apply final normalization
            hidden_states = self.norm(hidden_states)

            # Project back to vocabulary
            logits = F.linear(hidden_states, self.embed_tokens.weight)

            return logits

        except Exception as e:
            print(f"\nForward pass error:")
            print(f"Input shape: {input_ids.shape}")
            print(f"Device: {input_ids.device}")
            print(f"CUDA memory allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
            print(f"Error: {str(e)}")
            raise

    def generate(
        self,
        input_ids,
        attention_mask=None,
        max_length=100,
        temperature=0.7,
        top_p=0.9,
        top_k=50,
        num_return_sequences=1,
        do_sample=True,
        pad_token_id=None,
        bos_token_id=None,
        eos_token_id=None
    ):
        try:
            batch_size = input_ids.shape[0]
            current_length = input_ids.shape[1]
            device = input_ids.device

            # Input validation
            if current_length >= self.config.max_position_embeddings:
                raise ValueError(f"Input sequence length {current_length} exceeds maximum position embeddings {self.config.max_position_embeddings}")

            # Ensure we don't exceed maximum position embeddings
            max_length = min(max_length, self.config.max_position_embeddings)

            # Initialize attention mask if None
            if attention_mask is None:
                attention_mask = torch.ones_like(input_ids, dtype=torch.bool, device=device)

            for _ in range(max_length - current_length):
                # Forward pass
                outputs = self(input_ids, attention_mask)
                next_token_logits = outputs[:, -1, :] / temperature

                # Apply top-k filtering
                if top_k > 0:
                    indices_to_remove = next_token_logits < torch.topk(next_token_logits, top_k)[0][..., -1, None]
                    next_token_logits[indices_to_remove] = float('-inf')

                # Apply top-p filtering
                if top_p < 1.0:
                    sorted_logits, sorted_indices = torch.sort(next_token_logits, descending=True)
                    cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
                    sorted_indices_to_remove = cumulative_probs > top_p
                    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
                    sorted_indices_to_remove[..., 0] = 0
                    indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
                    next_token_logits[indices_to_remove] = float('-inf')

                # Sample from the filtered distribution
                if do_sample:
                    probs = F.softmax(next_token_logits, dim=-1)
                    next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
                else:
                    next_tokens = torch.argmax(next_token_logits, dim=-1)

                # Append new tokens
                input_ids = torch.cat([input_ids, next_tokens.unsqueeze(-1)], dim=-1)
                attention_mask = torch.cat([attention_mask, torch.ones_like(next_tokens.unsqueeze(-1))], dim=-1)

                # Stop if we've hit special tokens
                if (pad_token_id is not None and (next_tokens == pad_token_id).all()) or \
                   (eos_token_id is not None and (next_tokens == eos_token_id).all()):
                    break

            return input_ids

        except Exception as e:
            print(f"\nGeneration error:")
            print(f"Input shape: {input_ids.shape}")
            print(f"Device: {input_ids.device}")
            print(f"Error: {str(e)}")
            raise

class TextDataset(Dataset):
    def __init__(self, config, split="train"):
        self.config = config

        # Load dataset from HuggingFace
        full_dataset = load_dataset(
            config.data.datasets[0].path,
            config.data.datasets[0].subset,
            split=split
        )

        # Apply split ratio if less than 1
        if config.data.datasets[0].split_ratio < 1.0:
            num_samples = int(len(full_dataset) * config.data.datasets[0].split_ratio)
            self.dataset = full_dataset.select(range(num_samples))
        else:
            self.dataset = full_dataset

        # Initialize tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(config.model.tokenizer_name)
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        # Get text from dataset
        text = self.dataset[idx]["text"]

        # Tokenize
        encodings = self.tokenizer(
            text,
            truncation=True,
            max_length=self.config.model.max_position_embeddings,
            padding="max_length",
            return_tensors="pt"
        )

        return {
            "input_ids": encodings.input_ids.squeeze(),
            "attention_mask": encodings.attention_mask.squeeze(),
            "labels": encodings.input_ids.squeeze()
        }

class SmolLM2Lightning(pl.LightningModule):
    def __init__(self, config):
        super().__init__()
        self.save_hyperparameters()
        self.config = config

        # Initialize tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(config.model.tokenizer_name)
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # Initialize the base model
        self.model = SmolLM2(config.model)

    def forward(self, input_ids, attention_mask=None):
        return self.model(input_ids, attention_mask)

    def training_step(self, batch, batch_idx):
        try:
            input_ids = batch["input_ids"]
            labels = batch["labels"]
            attention_mask = batch.get("attention_mask", None)

            # Ensure tensors are contiguous and on the correct device
            inputs = input_ids[..., :-1].contiguous()
            labels = input_ids[..., 1:].contiguous()

            if attention_mask is not None:
                attention_mask = attention_mask[..., :-1].contiguous()

            # Forward pass
            logits = self(inputs, attention_mask)

            # Calculate loss
            loss = F.cross_entropy(
                logits.view(-1, self.config.model.vocab_size),
                labels.view(-1),
                ignore_index=self.config.model.pad_token_id if self.config.model.pad_token_id is not None else -100,
                reduction='mean'
            )

            # Detach loss for logging
            loss_value = loss.detach().float()

            # Log metrics
            self.log('train_loss', loss_value, prog_bar=True, on_step=True, sync_dist=True)

            return loss

        except Exception as e:
            print(f"\nTraining step error:")
            print(f"Input shape: {input_ids.shape if input_ids is not None else 'None'}")
            print(f"Device: {input_ids.device if input_ids is not None else 'None'}")
            print(f"Error: {str(e)}")
            raise

    def validation_step(self, batch, batch_idx):
        try:
            input_ids = batch["input_ids"]
            labels = batch["labels"]
            attention_mask = batch.get("attention_mask", None)

            # Ensure tensors are contiguous and on the correct device
            inputs = input_ids[..., :-1].contiguous()
            labels = input_ids[..., 1:].contiguous()

            if attention_mask is not None:
                attention_mask = attention_mask[..., :-1].contiguous()

            # Forward pass
            logits = self(inputs, attention_mask)

            # Calculate loss
            loss = F.cross_entropy(
                logits.view(-1, self.config.model.vocab_size),
                labels.view(-1),
                ignore_index=self.config.model.pad_token_id if self.config.model.pad_token_id is not None else -100,
                reduction='mean'
            )

            # Detach loss for logging
            loss_value = loss.detach().float()

            # Log metrics
            self.log('val_loss', loss_value, prog_bar=True, on_epoch=True, sync_dist=True)

            return loss

        except Exception as e:
            print(f"\nValidation step error:")
            print(f"Input shape: {input_ids.shape if input_ids is not None else 'None'}")
            print(f"Device: {input_ids.device if input_ids is not None else 'None'}")
            print(f"Error: {str(e)}")
            raise

    def configure_optimizers(self):
        # Create optimizer with explicit type conversion
        optimizer = AdamW(
            self.parameters(),
            lr=float(self.config.scheduler.learning_rate),
            weight_decay=float(self.config.optimizer.weight_decay),
            betas=(float(self.config.optimizer.adam_beta1),
                   float(self.config.optimizer.adam_beta2)),
            eps=float(self.config.optimizer.adam_eps),
        )

        # Create scheduler
        scheduler = OneCycleLR(
            optimizer,
            max_lr=float(self.config.scheduler.max_lr),
            total_steps=int(self.config.training.max_steps),
            pct_start=float(self.config.scheduler.pct_start),
            anneal_strategy=self.config.scheduler.anneal_strategy,
            cycle_momentum=bool(self.config.scheduler.cycle_momentum),
            div_factor=float(self.config.scheduler.div_factor),
            final_div_factor=float(self.config.scheduler.final_div_factor),
        )

        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": scheduler,
                "interval": "step",
                "frequency": 1
            }
        }

    def generate(self, *args, **kwargs):
        return self.model.generate(*args, **kwargs)

    def train_dataloader(self):
        dataset = TextDataset(self.config, split="train")
        return DataLoader(
            dataset,
            batch_size=self.config.training.batch_size,
            shuffle=True,
            num_workers=self.config.data.loading.num_workers,
            pin_memory=self.config.data.loading.pin_memory,
            persistent_workers=True,
            prefetch_factor=self.config.data.loading.prefetch_factor,
            drop_last=True  # Drop incomplete batches
        )

    def val_dataloader(self):
        dataset = TextDataset(self.config, split="validation")
        return DataLoader(
            dataset,
            batch_size=self.config.training.batch_size,
            shuffle=False,
            num_workers=self.config.data.loading.num_workers,
            pin_memory=self.config.data.loading.pin_memory,
            persistent_workers=True,
            prefetch_factor=self.config.data.loading.prefetch_factor
        )
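
A minimal smoke-test sketch for the modules above, using a deliberately tiny ModelConfig so the forward pass runs on CPU in a few seconds; it only checks that the logits come back as [batch, seq_len, vocab_size] from the weight-tied output projection:

    import torch
    from config import ModelConfig
    from model import SmolLM2

    # Tiny, illustrative dimensions; only the field names match the real config
    tiny = ModelConfig(vocab_size=128, hidden_size=32, num_attention_heads=4,
                       num_key_value_heads=2, num_hidden_layers=2,
                       intermediate_size=64, max_position_embeddings=64)

    model = SmolLM2(tiny).eval()
    input_ids = torch.randint(0, tiny.vocab_size, (2, 16))   # [batch=2, seq_len=16]

    with torch.no_grad():
        logits = model(input_ids)                            # causal mask built internally

    assert logits.shape == (2, 16, tiny.vocab_size)
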
smol-lm2-final.ckpt ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7992a6cb4ad6ca593be88b64f9e4359f771afaeabf6da719bd6aab480461fb08
size 197102570
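
The .ckpt entry above is a Git LFS pointer, so the ~197 MB of weights live on the Hub rather than in the diff. One way to pull and restore them, sketched under the assumption that "user/smol-lm2" is a placeholder repository id and that config.yaml matches the checkpoint's architecture:

    from huggingface_hub import hf_hub_download
    from config import SmolLM2Config
    from model import SmolLM2Lightning

    # Placeholder repo id; substitute the actual repository this commit belongs to
    ckpt_path = hf_hub_download(repo_id="user/smol-lm2", filename="smol-lm2-final.ckpt")

    # Mirrors how train_script.py reloads a phase-1 checkpoint
    config = SmolLM2Config("config.yaml")
    model = SmolLM2Lightning.load_from_checkpoint(ckpt_path, config=config)
    model.eval()
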
train_script.py ADDED
@@ -0,0 +1,264 @@
import os
import torch
import wandb
import shutil
from config import SmolLM2Config
from model import SmolLM2Lightning
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, Callback
from pytorch_lightning.loggers import WandbLogger

# Set CUDA environment variables before any other CUDA operations
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ['TORCH_USE_CUDA_DSA'] = '1'

def setup_training():
    """Setup training environment"""
    try:
        if torch.cuda.is_available():
            # Configure CUDA settings
            torch.backends.cuda.matmul.allow_tf32 = True
            torch.backends.cudnn.allow_tf32 = True
            torch.backends.cudnn.benchmark = True
            torch.set_float32_matmul_precision('high')

            # Set default device
            device = torch.device('cuda:0')
            torch.cuda.set_device(device)

            # Print GPU info
            print(f"Using GPU: {torch.cuda.get_device_name()}")
            print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
            return device
    except Exception as e:
        print(f"CUDA setup error: {str(e)}")

    print("Using CPU")
    return torch.device('cpu')

def cleanup_training():
    """Cleanup training resources"""
    try:
        # Move model to CPU before cleanup
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        # Clean up wandb
        try:
            wandb.finish()
        except Exception:
            pass

    except Exception as e:
        print(f"Cleanup error: {str(e)}")

# Setup CUDA at module level
device = setup_training()

class GenerationMonitorCallback(Callback):
    def __init__(self, prompt="Explain what machine learning is:", sample_every_n_steps=500):
        super().__init__()
        self.prompt = prompt
        self.sample_every_n_steps = sample_every_n_steps

    def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
        try:
            if (trainer.global_step + 1) % self.sample_every_n_steps == 0:
                # Switch to eval mode
                pl_module.eval()

                with torch.no_grad():
                    # Tokenize prompt
                    inputs = pl_module.tokenizer(
                        self.prompt,
                        return_tensors="pt",
                        truncation=True,
                        max_length=pl_module.config.model.max_position_embeddings,
                        padding=True
                    ).to(pl_module.device)

                    try:
                        # Generate text with error handling
                        outputs = pl_module.generate(
                            input_ids=inputs.input_ids,
                            attention_mask=inputs.attention_mask,
                            max_length=100,
                            temperature=0.7,
                            top_p=0.9,
                            top_k=50,
                            do_sample=True,
                            pad_token_id=pl_module.tokenizer.pad_token_id,
                            bos_token_id=pl_module.tokenizer.bos_token_id,
                            eos_token_id=pl_module.tokenizer.eos_token_id
                        )

                        # Decode generated text
                        generated_text = pl_module.tokenizer.decode(outputs[0], skip_special_tokens=True)

                        # Print results
                        print(f"\n=== Generation at step {trainer.global_step + 1} ===")
                        print(f"Prompt: {self.prompt}")
                        print(f"Generated: {generated_text}\n")

                    except RuntimeError as e:
                        print(f"\nError during generation at step {trainer.global_step + 1}: {str(e)}")
                        print(f"Input shape: {inputs.input_ids.shape}")
                        print(f"Input device: {inputs.input_ids.device}")

                # Switch back to train mode
                pl_module.train()

        except Exception as e:
            print(f"\nCallback error at step {trainer.global_step + 1}: {str(e)}")

def init_wandb(project_name, run_name):
    """Initialize WandB with error handling and cleanup"""
    try:
        # Try to clean up any existing wandb directory
        wandb_dir = os.path.join(os.getcwd(), "wandb")
        if os.path.exists(wandb_dir):
            try:
                shutil.rmtree(wandb_dir)
                print("Cleaned up existing wandb directory")
            except Exception as e:
                print(f"Warning: Could not clean up wandb directory: {str(e)}")

        # Create fresh wandb directory with proper permissions
        os.makedirs(wandb_dir, exist_ok=True)

        # Initialize WandB logger
        logger = WandbLogger(
            project=project_name,
            name=run_name,
            save_dir=os.getcwd(),
            settings=wandb.Settings(start_method="thread")
        )
        return logger

    except Exception as e:
        print(f"Error initializing WandB: {str(e)}")
        print("Continuing without WandB logging...")
        return None

def main():
    device = setup_training()

    try:
        # Load configuration
        config = SmolLM2Config("config.yaml")

        # Initialize model
        model = SmolLM2Lightning(config)

        # Phase 1: Initial Training
        print("\n=== Starting Phase 1 Training ===")

        # Initialize wandb logger for phase 1 with error handling
        wandb_logger = init_wandb("smol-lm2", "training_run_phase1")

        # Setup checkpoint callback for phase 1
        checkpoint_callback = ModelCheckpoint(
            dirpath=config.training.checkpoint_dir,
            filename="smol-lm2-phase1-{epoch:02d}-{train_loss:.2f}",
            save_top_k=3,
            monitor="train_loss",
            mode="min",
            every_n_train_steps=config.training.save_steps
        )

        # Setup generation monitoring callback for phase 1
        generation_callback = GenerationMonitorCallback(
            prompt=config.training.sample_prompt,
            sample_every_n_steps=config.training.sample_frequency
        )

        # Initialize trainer for phase 1
        trainer_phase1 = pl.Trainer(
            max_steps=config.training.first_phase_steps,
            accelerator=config.hardware.accelerator,
            devices=config.hardware.devices,
            precision=config.hardware.precision,
            logger=wandb_logger,
            callbacks=[checkpoint_callback, generation_callback],
            gradient_clip_val=config.hardware.gradient_clip,
            accumulate_grad_batches=config.training.gradient_accumulation_steps,
            log_every_n_steps=config.training.logging_steps,
            deterministic=False,
            benchmark=True,
            strategy='auto',  # Let PyTorch Lightning handle device strategy
        )

        # Train phase 1 with error handling
        try:
            trainer_phase1.fit(model)
        except Exception as e:
            print(f"Error during phase 1 training: {str(e)}")
            raise

        # Save phase 1 checkpoint
        phase1_checkpoint_path = os.path.join(config.training.checkpoint_dir, "smol-lm2-phase1-final.ckpt")
        trainer_phase1.save_checkpoint(phase1_checkpoint_path)
        print(f"Phase 1 completed. Model saved to {phase1_checkpoint_path}")

        # Clear GPU memory between phases
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        # Phase 2: Fine-tuning
        print("\n=== Starting Phase 2 Training ===")

        # Load the model from phase 1 checkpoint with error handling
        try:
            model = SmolLM2Lightning.load_from_checkpoint(phase1_checkpoint_path, config=config)
        except Exception as e:
            print(f"Error loading checkpoint for phase 2: {str(e)}")
            raise

        # Initialize wandb logger for phase 2 with error handling
        wandb_logger = init_wandb("smol-lm2", "training_run_phase2")

        # Setup generation monitoring callback with higher frequency for phase 2
        generation_callback = GenerationMonitorCallback(
            prompt=config.training.sample_prompt,
            sample_every_n_steps=config.training.second_phase_sample_frequency
        )

        # Initialize trainer for phase 2
        trainer_phase2 = pl.Trainer(
            max_steps=config.training.second_phase_steps,
            accelerator=config.hardware.accelerator,
            devices=config.hardware.devices,
            precision=config.hardware.precision,
            logger=wandb_logger,
            callbacks=[generation_callback],
            gradient_clip_val=config.hardware.gradient_clip,
            accumulate_grad_batches=config.training.gradient_accumulation_steps,
            log_every_n_steps=config.training.logging_steps,
            deterministic=False,
            benchmark=True,
        )

        # Train phase 2 with error handling
        try:
            trainer_phase2.fit(model)
        except Exception as e:
            print(f"Error during phase 2 training: {str(e)}")
            raise

        # Save final model
        final_checkpoint_path = os.path.join(config.training.checkpoint_dir, "smol-lm2-final.ckpt")
        trainer_phase2.save_checkpoint(final_checkpoint_path)
        print(f"Phase 2 completed. Final model saved to {final_checkpoint_path}")

    except Exception as e:
        print(f"\nTraining failed with error: {str(e)}")
        if torch.cuda.is_available():
            print(f"CUDA memory allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
            print(f"CUDA memory cached: {torch.cuda.memory_reserved() / 1e9:.2f} GB")
        raise
    finally:
        cleanup_training()

if __name__ == "__main__":
    main()
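
After the two training phases finish, offline sampling can mirror what GenerationMonitorCallback does during training. A hedged sketch, assuming the run has produced checkpoints/smol-lm2-final.ckpt (the path main() saves to) and that config.yaml is unchanged:

    import torch
    from config import SmolLM2Config
    from model import SmolLM2Lightning

    config = SmolLM2Config("config.yaml")
    model = SmolLM2Lightning.load_from_checkpoint("checkpoints/smol-lm2-final.ckpt", config=config)
    model.eval()

    # Same prompt and sampling parameters the training-time callback uses
    inputs = model.tokenizer("Explain what machine learning is:", return_tensors="pt")
    with torch.no_grad():
        output_ids = model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_length=100,
            temperature=0.7,
            top_p=0.9,
            top_k=50,
            do_sample=True,
            eos_token_id=model.tokenizer.eos_token_id,
        )
    print(model.tokenizer.decode(output_ids[0], skip_special_tokens=True))
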