from SmolLm3 import LlamaModel
import torch
import yaml
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
import numpy as np
from datasets import load_dataset
import logging
import math
from utils import upload_file_to_s3

logger = logging.getLogger(__name__)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
file_handler = logging.FileHandler('training.log')
file_handler.setFormatter(formatter)  # Set formatter on the handler, not the logger
logger.addHandler(file_handler)
logger.setLevel(logging.INFO)


def encode_text(examples, tokenizer, seq_length):
    """Tokenize and prepare text examples for training."""
    tokens = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=seq_length + 1,
        return_tensors="pt",
    )
    # clone().detach() so the tensors do not share storage with the tokenizer output
    input_ids = tokens["input_ids"].squeeze(0).clone().detach()
    input_ids = torch.clamp(input_ids, min=0, max=tokenizer.vocab_size - 1)
    labels = input_ids.clone().detach()
    # Shift by one position: the model predicts token t+1 from tokens up to t
    labels = labels[1:].to(torch.int64)
    input_ids = input_ids[:-1].to(torch.int64)
    return {"input_ids": input_ids, "labels": labels}


def load_cosmopedia_dataset(batch_size=8, seq_length=1024, tokenizer=None):
    """Return a torch DataLoader for the cosmopedia-v2 dataset."""
    # Set tokenizer parallelism explicitly
    import os
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    logger.info("tokenizer parallelism set to false")
    try:
        # Increase timeout and retries for dataset loading
        from datasets import config
        config.HF_DATASETS_TIMEOUT = 300  # 5 minutes timeout
        config.MAX_RETRIES = 10  # Increase retry attempts
        logger.info("dataset loading config set")

        train_dataset = load_dataset(
            "HuggingFaceTB/smollm-corpus",
            name="cosmopedia-v2",
            split="train",
            streaming=True,
        )
        logger.info("dataset loaded")

        # Use partial to bind tokenizer and seq_length to the encode function
        from functools import partial
        encode_fn = partial(encode_text, tokenizer=tokenizer, seq_length=seq_length)

        train_dataset = train_dataset.map(
            encode_fn,
            remove_columns=["text"],
            batched=False,
        )
        train_dataset = train_dataset.with_format("torch")
        train_dataloader = DataLoader(
            train_dataset,
            batch_size=batch_size,
            num_workers=2,
            pin_memory=True,
            prefetch_factor=4,
            persistent_workers=True,
        )
        return train_dataloader
    except Exception as e:
        logger.error(f"Error loading dataset: {str(e)}")
        return None


def generate(model, idx, max_new_tokens, context_length, temperature=1.0,
             top_k=None, eos_token=None, device=None):
    logger.info(f"Generating on device {device}")
    model = model.to(device)
    idx = idx.to(device)
    model.eval()
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_length:]
        with torch.no_grad():
            logits, _ = model(idx_cond)  # Unpack both logits and loss (ignore loss)
            # Reshape to [batch, seq, vocab]
            logits = logits.view(idx_cond.shape[0], -1, model.config['vocab_size'])

        # Keep only the logits for the last position: [batch_size, vocab_size]
        logits = logits[:, -1, :]

        if top_k is not None:
            # Top-k sampling: mask out everything below the k-th largest logit
            top_logits, top_pos = torch.topk(logits, top_k)
            min_logit = top_logits[:, -1].unsqueeze(-1)
            logits = torch.where(
                logits < min_logit,
                torch.tensor(float('-inf')).to(logits.device),
                logits,
            )

        if temperature > 0.0:
            # Temperature scaling followed by multinomial sampling
            logits /= temperature
            probs = torch.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
        else:
            # Greedy decoding
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)

        # Guard against eos_token=None before comparing
        if eos_token is not None and idx_next.item() == eos_token:
            break
        idx = torch.cat((idx, idx_next), dim=1)

    model.train()
    return idx
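
# Illustrative standalone use of generate() (a sketch: the prompt string and
# sampling values below are arbitrary examples, not taken from the training run):
#
#   prompt = tokenizer.encode("The solar system", return_tensors="pt")
#   out = generate(model, idx=prompt, max_new_tokens=64, context_length=256,
#                  temperature=0.8, top_k=5,
#                  eos_token=tokenizer.eos_token_id, device="cuda")
#   print(tokenizer.decode(out.squeeze(0)))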


def sync_device(device):
    if device.startswith('cuda'):
        torch.cuda.synchronize()
    elif device == 'cpu':
        if hasattr(torch.cpu, 'synchronize'):
            torch.cpu.synchronize()
    elif device.startswith('mps'):
        # For Apple Silicon
        torch.mps.synchronize()


def print_gpu_memory(step_name=""):
    """Print GPU memory statistics with a specified step name."""
    if torch.cuda.is_available():
        logger.info(f"\nGPU Memory Stats {step_name}:")
        logger.info(f"GPU Memory allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
        logger.info(f"GPU Memory reserved: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")
        logger.info(f"Max GPU Memory allocated: {torch.cuda.max_memory_allocated() / 1024**2:.2f} MB")


# Learning rate scheduler
def get_lr_lambda(current_step, warmup_steps, max_steps, max_lr):
    """
    Learning rate schedule, returned as a multiplier for LambdaLR:
    1. Linear warmup for the first `warmup_steps` steps (3000 here)
    2. Cosine decay from `warmup_steps` to `max_steps` (3000 to 60000)
    3. Floor at 5% of max_lr (1.5e-5 for max_lr = 3e-4)
    """
    # Note: LambdaLR multiplies the optimizer's base lr by this return value,
    # so it must be a dimensionless factor, not an absolute learning rate.
    # max_lr is kept in the signature for clarity but is not needed here.
    min_ratio = 0.05  # Minimum LR as a fraction of max_lr
    if current_step < warmup_steps:
        # Linear warmup from 0 to max_lr
        return float(current_step) / float(max(1, warmup_steps))
    # Cosine decay from max_lr down to min_ratio * max_lr
    progress = float(current_step - warmup_steps) / float(max(1, max_steps - warmup_steps))
    progress = min(progress, 1.0)  # Clamp so the LR never dips below the floor
    return min_ratio + 0.5 * (1.0 - min_ratio) * (1.0 + math.cos(math.pi * progress))
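
# Sanity check of the schedule factors (computed from the formula above,
# assuming warmup_steps=3000, max_steps=60000, base lr 3e-4):
#   step     0 -> 0.000  (warmup start)
#   step  1500 -> 0.500  (mid-warmup)
#   step  3000 -> 1.000  (peak: lr = 3e-4)
#   step 31500 -> 0.525  (midpoint of cosine decay)
#   step 60000 -> 0.050  (floor: lr = 1.5e-5)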


def train_model(config, model, train_loader, test_loader, optimizer, device,
                num_epochs, eval_freq, eval_iter,
                start_context="Jack Gisburn rather a cheap genius- ", tokenizer=None):
    total_loss = 0
    tokens_seen, global_step = 0, -1

    # Gradient accumulation setup
    actual_batch_size = config['tokens']['micro_batch_size']
    effective_batch_size_multiplier = 2  # Kept small to maintain reasonable memory usage
    target_batch_size = effective_batch_size_multiplier * config['tokens']['micro_batch_size']
    gradient_accumulation_steps = target_batch_size // actual_batch_size

    # Learning rate parameters
    max_lr = 3e-4
    warmup_steps = 3000   # Longer warmup for longer training
    max_steps = 60000     # Sized to roughly 10 hours of training
    min_lr = max_lr * 0.05  # Minimum LR is 5% of max

    # Create LambdaLR scheduler with the schedule defined above
    lr_lambda = lambda step: get_lr_lambda(step, warmup_steps, max_steps, max_lr)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

    logger.info("Training with learning rate schedule:")
    logger.info(f"Max LR: {max_lr}")
    logger.info(f"Warmup Steps: {warmup_steps}")
    logger.info(f"Max Steps: {max_steps}")
    logger.info(f"Min LR: {min_lr}")
    logger.info(f"Gradient Accumulation Steps: {gradient_accumulation_steps}")
    logger.info(f"Effective Batch Size: {actual_batch_size * gradient_accumulation_steps}")

    print_gpu_memory("at start of training")
    torch.cuda.empty_cache()
    torch.backends.cudnn.benchmark = True

    for epoch in range(num_epochs):
        model.train()
        optimizer.zero_grad()  # Zero gradients at start of epoch
        for batch_idx, batch in enumerate(train_loader):
            input_batch = batch['input_ids'].to(device)
            target_batch = batch['labels'].to(device)

            # Forward pass in bfloat16 autocast
            with torch.autocast(device_type=device, dtype=torch.bfloat16):
                logits, original_loss = model(input_batch, target_batch)

            # Scale loss for gradient accumulation
            scaled_loss = original_loss / gradient_accumulation_steps
            scaled_loss.backward()

            # Log the unscaled loss
            total_loss += original_loss.item()
            tokens_seen += input_batch.numel()

            # Running average loss
            total_batches = batch_idx + 1
            avg_loss = total_loss / total_batches

            if batch_idx % 25 == 0:
                logger.info(f"Batch {batch_idx + 1}, Running Avg Loss: {avg_loss:.5f}")

            # Only update weights after accumulating gradients
            if (batch_idx + 1) % gradient_accumulation_steps == 0:
                # Gradient clipping
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()
                scheduler.step()  # Update learning rate
                optimizer.zero_grad()
                global_step += 1

                # Evaluation block
                if global_step % eval_freq == 0 and global_step > 0:
                    current_lr = scheduler.get_last_lr()[0]
                    optimizer_lr = optimizer.param_groups[0]['lr']
                    print_gpu_memory(f"at step {global_step}")
                    logger.info(f"learning rate: {current_lr:.8f}")
                    logger.info(f"Ep {epoch+1} (Step {global_step:06d}): "
                                f"Avg loss {avg_loss:.3f} | {tokens_seen} tokens seen")
                    logger.info(f"optimizer lr: {optimizer_lr:.8f}")
                    logger.info(f"scheduler lr: {current_lr:.8f}")

                    # Generate sample text with randomized sampling parameters
                    encoded_text = tokenizer.encode(start_context, return_tensors="pt")
                    random_topk = np.random.randint(1, 10)
                    logger.info(f"random_topk: {random_topk}")
                    random_temperature = np.random.uniform(0.7, 0.9)
                    logger.info(f"random_temperature: {random_temperature}")
                    logger.info(f"global step {global_step}, batch_idx {batch_idx} => generating text")
                    generated_text = generate(model,
                                              idx=encoded_text,
                                              max_new_tokens=256,
                                              context_length=256,
                                              temperature=random_temperature,
                                              top_k=random_topk,
                                              eos_token=tokenizer.eos_token_id,
                                              device=device)
                    logger.info("+++" * 30)
                    logger.info(tokenizer.decode(generated_text.squeeze(0)))
                    logger.info("+++" * 30)

                    # Save checkpoint locally, then upload checkpoint and log to S3
                    model_file_name = (f"model_{global_step}_steps_avg_loss_{avg_loss:.5f}"
                                       f"_optimizer_lr_{optimizer_lr:.8f}.pth")
                    torch.save({
                        'step': global_step,
                        'model_state_dict': model.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'scheduler_state_dict': scheduler.state_dict(),
                        'loss': avg_loss,
                    }, model_file_name)
                    s3_path = upload_file_to_s3(model_file_name,
                                                config['model']['model_config']['s3_bucket'],
                                                config['model']['model_config']['s3_checkpoint_folder'])
                    logger.info(f"Model saved to S3: {s3_path}")
                    log_path = upload_file_to_s3(config['model']['model_config']['s3_log_file_name'],
                                                 config['model']['model_config']['s3_bucket'],
                                                 config['model']['model_config']['s3_log_folder'])
                    logger.info(f"Log saved to S3: {log_path}")

            if batch_idx % 100 == 0:
                logger.info(f"Batch {batch_idx} finished")
                logger.info("+++" * 30)

    logger.info("Training complete")
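
# Resuming from a checkpoint written by train_model (a sketch: the filename is
# hypothetical, and `scheduler` would need to be rebuilt the same way
# train_model builds it before its state can be restored):
#
#   ckpt = torch.load("model_1000_steps_....pth", map_location=device)
#   model.load_state_dict(ckpt['model_state_dict'])
#   optimizer.load_state_dict(ckpt['optimizer_state_dict'])
#   scheduler.load_state_dict(ckpt['scheduler_state_dict'])
#   global_step, avg_loss = ckpt['step'], ckpt['loss']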


if __name__ == "__main__":
    config = yaml.load(open("config_smollm2_135M.yaml", "r"), Loader=yaml.FullLoader)
    logger.info(config)

    # Set the allocator config before any CUDA allocations so it takes effect
    import os
    os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:512'

    # Memory-efficient settings
    torch.set_float32_matmul_precision('high')
    torch.backends.cudnn.benchmark = True
    torch.backends.cuda.matmul.allow_tf32 = True

    # Empty cache before model creation
    torch.cuda.empty_cache()

    model = LlamaModel(config['model'])
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Enable gradient checkpointing for memory efficiency
    # model.gradient_checkpointing_enable()
    model.to(device)
    model = torch.compile(model)
    logger.info(model)
    logger.info("++" * 30)

    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=3e-4,
        weight_decay=0.15,
        betas=(0.9, 0.95)
    )

    tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/cosmo2-tokenizer")
    tokenizer.pad_token = tokenizer.eos_token
    vocab_size = tokenizer.vocab_size

    train_loader = load_cosmopedia_dataset(
        batch_size=16,
        seq_length=1024,
        tokenizer=tokenizer
    )

    import time
    t1 = time.time()
    # test_loader is unused inside train_model, so the train loader is passed twice
    train_model(
        config,
        model,
        train_loader,
        train_loader,
        optimizer=optimizer,
        device=device,
        num_epochs=1,
        eval_freq=1000,  # Evaluate (and checkpoint) every 1000 optimizer steps
        eval_iter=1000,
        start_context="Once Upon a Time far far away in a galaxy",
        tokenizer=tokenizer
    )
    t2 = time.time()
    logger.info(f"Time taken for training: {t2 - t1:.2f} seconds")
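
    # The in-training S3 uploads only fire at eval steps; a final upload of the
    # completed training.log could reuse upload_file_to_s3 with the same config
    # keys (sketch, not part of the original run):
    #
    #   upload_file_to_s3(config['model']['model_config']['s3_log_file_name'],
    #                     config['model']['model_config']['s3_bucket'],
    #                     config['model']['model_config']['s3_log_folder'])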