Spaces:
Sleeping
Sleeping
File size: 14,396 Bytes
fb26382 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 |
from SmolLm3 import LlamaModel
import torch
import yaml
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
import numpy as np
from datasets import load_dataset
import logging
import math
from utils import upload_file_to_s3
# At the start of training loop
# print(f"GPU Memory allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
# print(f"GPU Memory reserved: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")
# Module-level logger: records at INFO and above are appended to
# ``training.log`` using a "timestamp - level - message" layout.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

file_handler = logging.FileHandler('training.log')
# Formatters are attached to handlers; a Logger has no formatter of its own.
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)
def encode_text(examples, tokenizer, seq_length):
    """Turn one text example into next-token-prediction inputs and labels.

    The tokenizer is asked for ``seq_length + 1`` tokens (truncating or
    padding to that length) so that, after shifting by one position, both
    returned tensors hold ``seq_length`` entries.

    Args:
        examples: Mapping with a ``"text"`` entry to tokenize.
        tokenizer: Callable tokenizer exposing ``vocab_size``.
        seq_length: Number of tokens in each returned tensor.

    Returns:
        dict with ``"input_ids"`` (all tokens but the last) and ``"labels"``
        (all tokens but the first), both ``torch.int64``.
    """
    encoded = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=seq_length + 1,
        return_tensors="pt",
    )

    # squeeze(0) drops the leading axis when it has size 1; clone().detach()
    # gives us a private copy to slice.
    token_ids = encoded["input_ids"].squeeze(0).clone().detach()

    # Force every id into the range [0, vocab_size - 1].
    highest_id = tokenizer.vocab_size - 1
    token_ids = token_ids.clamp(min=0, max=highest_id)

    shifted_targets = token_ids.clone().detach()[1:]
    return {
        "input_ids": token_ids[:-1].to(torch.int64),
        "labels": shifted_targets.to(torch.int64),
    }
def load_cosmopedia_dataset(batch_size=8, seq_length=1024, tokenizer=None):
    """Build a streaming DataLoader over the cosmopedia-v2 split of smollm-corpus.

    Each example is tokenized on the fly by ``encode_text`` and exposed as
    torch tensors with ``input_ids`` and ``labels`` entries.

    Args:
        batch_size: Number of examples per batch.
        seq_length: Tokens per example after the input/label shift.
        tokenizer: Tokenizer forwarded to ``encode_text``.

    Returns:
        A ``torch.utils.data.DataLoader``, or ``None`` if anything went wrong
        while setting the pipeline up (the failure is logged with its
        traceback). Callers must check for ``None`` before iterating.
    """
    import os
    from functools import partial

    from datasets import config

    # Set tokenizer parallelism explicitly
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    logger.info("tokenizer parallelism set to false")

    try:
        # Increase timeout and retries for dataset loading.
        # NOTE(review): confirm that the installed `datasets` version actually
        # reads these two attributes; assigning an unknown name is a no-op.
        config.HF_DATASETS_TIMEOUT = 300  # 5 minutes timeout
        config.MAX_RETRIES = 10  # Increase retry attempts
        logger.info("dataset loading config set")

        train_dataset = load_dataset(
            "HuggingFaceTB/smollm-corpus",
            name="cosmopedia-v2",
            split="train",
            streaming=True,
        )
        logger.info("dataset loaded")

        # Bind tokenizer and seq_length so map() only has to pass the example.
        encode_fn = partial(encode_text, tokenizer=tokenizer, seq_length=seq_length)
        train_dataset = train_dataset.map(
            encode_fn,
            remove_columns=["text"],
            batched=False,
        )
        train_dataset = train_dataset.with_format("torch")

        train_dataloader = DataLoader(
            train_dataset,
            batch_size=batch_size,
            num_workers=2,
            pin_memory=True,
            prefetch_factor=4,
            persistent_workers=True,
        )
        return train_dataloader
    except Exception as e:
        # Best-effort contract is kept (return None), but the traceback is now
        # written to the log so the root cause is not lost.
        logger.exception(f"Error loading dataset: {str(e)}")
        return None
def generate(model, idx, max_new_tokens, context_length, temperature=1.0, top_k=None, eos_token=None, device=None):
    """Autoregressively extend ``idx`` by up to ``max_new_tokens`` tokens.

    Args:
        model: Module whose call returns ``(logits, loss)`` and which exposes
            ``config['vocab_size']``.
        idx: Integer tensor of token ids, indexed as ``[batch, sequence]``.
        max_new_tokens: Upper bound on the number of tokens appended.
        context_length: Only the last ``context_length`` tokens are fed to the
            model at each step.
        temperature: Values > 0 sample from the temperature-scaled softmax;
            0 (or less) selects the arg-max token.
        top_k: If given, only the ``top_k`` highest logits stay eligible.
            Values larger than the vocabulary are capped.
        eos_token: If given, generation stops (without appending) as soon as
            every row of the batch produced this id in the same step.
        device: Target device for model and input; ``None`` leaves both where
            they are.

    Returns:
        ``idx`` with the generated tokens concatenated along dimension 1.

    The model's train/eval mode is restored to what it was on entry, even if
    generation raises.
    """
    logger.info(f"Generating on device {device}")
    if device is not None:
        model = model.to(device)
        idx = idx.to(device)

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for _ in range(max_new_tokens):
                idx_cond = idx[:, -context_length:]
                logits, _ = model(idx_cond)  # the loss half of the pair is unused
                # Reshape to [batch, seq, vocab] and keep only the last position.
                logits = logits.view(idx_cond.shape[0], -1, model.config['vocab_size'])
                logits = logits[:, -1, :]

                if top_k is not None:
                    # Cap k so torch.topk cannot be asked for more entries
                    # than the vocabulary holds; int() also accepts numpy ints.
                    k = min(int(top_k), logits.size(-1))
                    top_logits, _ = torch.topk(logits, k)
                    min_logit = top_logits[:, -1].unsqueeze(-1)
                    logits = logits.masked_fill(logits < min_logit, float('-inf'))

                if temperature > 0.0:
                    probs = torch.softmax(logits / temperature, dim=-1)
                    idx_next = torch.multinomial(probs, num_samples=1)
                else:
                    idx_next = torch.argmax(logits, dim=-1, keepdim=True)

                # `.item()` only works for one-element tensors, so compare the
                # whole batch; for batch size 1 this is the original check.
                if eos_token is not None and bool((idx_next == eos_token).all()):
                    break
                idx = torch.cat((idx, idx_next), dim=1)
    finally:
        model.train(was_training)
    return idx
def sync_device(device):
    """Wait for pending work on ``device`` to finish.

    Args:
        device: Device string (``"cuda"``, ``"cuda:1"``, ``"cpu"``, ``"mps"``)
            or a ``torch.device``.

    CUDA devices are synchronized by name, so an indexed device such as
    ``"cuda:1"`` synchronizes that device rather than the current one.
    Unrecognized device types are ignored.
    """
    # str() lets torch.device objects go through the same string checks.
    device_name = str(device)
    if device_name.startswith('cuda'):
        torch.cuda.synchronize(device_name)
    elif device_name == 'cpu':
        # Older torch builds have no torch.cpu.synchronize; nothing to do then.
        if hasattr(torch.cpu, 'synchronize'):
            torch.cpu.synchronize()
    elif device_name.startswith('mps'):  # For Apple Silicon
        torch.mps.synchronize()
def print_gpu_memory(step_name=""):
    """Log current, reserved and peak CUDA memory in MB, tagged with ``step_name``.

    Returns immediately without logging when CUDA is not available.
    """
    if not torch.cuda.is_available():
        return

    bytes_per_mb = 1024**2
    logger.info(f"\nGPU Memory Stats {step_name}:")

    allocated_mb = torch.cuda.memory_allocated() / bytes_per_mb
    logger.info(f"GPU Memory allocated: {allocated_mb:.2f} MB")

    reserved_mb = torch.cuda.memory_reserved() / bytes_per_mb
    logger.info(f"GPU Memory reserved: {reserved_mb:.2f} MB")

    peak_mb = torch.cuda.max_memory_allocated() / bytes_per_mb
    logger.info(f"Max GPU Memory allocated: {peak_mb:.2f} MB")
# Learning rate scheduler
def get_lr_lambda(current_step, warmup_steps, max_steps, max_lr):
    """Return the learning-rate *multiplier* for ``current_step``.

    Intended as the ``lr_lambda`` of ``torch.optim.lr_scheduler.LambdaLR``,
    which multiplies the optimizer's base learning rate by the returned
    value. The result is therefore a fraction of the base (peak) rate, not an
    absolute learning rate:

    1. Linear warmup from 0 to 1 over the first ``warmup_steps`` steps.
    2. Cosine decay from 1 down to 0.05 between ``warmup_steps`` and
       ``max_steps``.
    3. Constant 0.05 (5% of the peak rate) once ``max_steps`` is passed.

    Args:
        current_step: Scheduler step counter.
        warmup_steps: Length of the linear warmup.
        max_steps: Step at which the cosine decay reaches its floor.
        max_lr: Peak learning rate. Kept for interface compatibility; the
            multiplier is relative, so the value does not enter the result.

    Returns:
        float multiplier in ``[0, 1]``.
    """
    min_ratio = 0.05  # floor as a fraction of the peak learning rate

    if current_step < warmup_steps:
        # Linear warmup from 0 to the peak rate.
        return float(current_step) / float(max(1, warmup_steps))

    decay_steps = max(1, max_steps - warmup_steps)
    # Clamp so the cosine cannot swing back up after max_steps.
    progress = min(1.0, float(current_step - warmup_steps) / float(decay_steps))
    cosine = 0.5 * (1.0 + math.cos(math.pi * progress))
    return min_ratio + (1.0 - min_ratio) * cosine
def _log_generated_sample(model, tokenizer, start_context, device, global_step, batch_idx):
    """Generate a continuation of ``start_context`` and write it to the log."""
    encoded_text = tokenizer.encode(start_context, return_tensors="pt")
    random_topk = np.random.randint(1, 10)
    logger.info(f"random_topk: {random_topk}")
    random_temperature = np.random.uniform(0.7, 0.9)
    logger.info(f"random_temperature: {random_temperature}")
    logger.info(f"global step {global_step} , batch_idx {batch_idx} => generating text")
    generated_text = generate(
        model,
        idx=encoded_text,
        max_new_tokens=256,
        context_length=256,
        temperature=random_temperature,
        top_k=random_topk,
        eos_token=tokenizer.eos_token_id,
        device=device,
    )
    logger.info("+++" * 30)
    logger.info(tokenizer.decode(generated_text.squeeze(0)))
    logger.info("+++" * 30)


def _save_and_upload_checkpoint(config, model, optimizer, scheduler, global_step, avg_loss, optimizer_lr):
    """Write a checkpoint to disk, then upload it and the log file to S3."""
    s3_config = config['model']['model_config']
    model_file_name = f"model_{global_step}_steps_avg_loss_{avg_loss:.5f}_optimizer_lr_{optimizer_lr:.8f}.pth"
    torch.save({
        'step': global_step,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        'loss': avg_loss,
    }, model_file_name)
    s3_path = upload_file_to_s3(model_file_name, s3_config['s3_bucket'],
                                s3_config['s3_checkpoint_folder'])
    logger.info(f"Model saved to S3: {s3_path}")
    log_path = upload_file_to_s3(s3_config['s3_log_file_name'], s3_config['s3_bucket'],
                                 s3_config['s3_log_folder'])
    logger.info(f"Log saved to S3: {log_path}")


def train_model(config, model, train_loader, test_loader, optimizer, device, num_epochs, eval_freq, eval_iter, start_context="Jack Gisburn rather a cheap genius- ", tokenizer=None):
    """Train ``model`` with gradient accumulation, LR scheduling and periodic checkpoints.

    Args:
        config: Parsed configuration; reads ``config['tokens']['micro_batch_size']``
            and the S3 settings under ``config['model']['model_config']``.
        model: Module called as ``model(inputs, targets)`` returning ``(logits, loss)``.
        train_loader: Iterable of batches with ``'input_ids'`` and ``'labels'``.
        test_loader: Accepted for interface compatibility; not used.
        optimizer: Optimizer whose base learning rate is scaled by the scheduler.
        device: Device string (or ``torch.device``) the batches are moved to.
        num_epochs: Number of passes over ``train_loader``.
        eval_freq: Every ``eval_freq`` optimizer steps a text sample is logged
            and a checkpoint is saved and uploaded.
        eval_iter: Accepted for interface compatibility; not used.
        start_context: Prompt for the logged text sample.
        tokenizer: Tokenizer used to encode the prompt and decode the sample.

    Raises:
        ValueError: If ``train_loader`` is ``None`` (dataset loading failed).
    """
    if train_loader is None:
        raise ValueError("train_loader is None; the dataset must load successfully before training")

    total_loss = 0.0
    batches_seen = 0  # counted across epochs so the running average stays correct
    tokens_seen, global_step = 0, -1

    # Adjusted gradient accumulation setup
    actual_batch_size = config['tokens']['micro_batch_size']
    effective_batch_size_multiplier = 2
    target_batch_size = effective_batch_size_multiplier * actual_batch_size
    gradient_accumulation_steps = target_batch_size // actual_batch_size

    # Learning-rate schedule parameters
    max_lr = 3e-4
    warmup_steps = 3000
    max_steps = 60000

    lr_lambda = lambda step: get_lr_lambda(step, warmup_steps, max_steps, max_lr)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

    logger.info("Training with learning rate schedule:")
    logger.info(f"Max LR: {max_lr}")
    logger.info(f"Warmup Steps: {warmup_steps}")
    logger.info(f"Max Steps: {max_steps}")
    logger.info(f"Min LR: {max_lr * 0.05}")
    logger.info(f"Gradient Accumulation Steps: {gradient_accumulation_steps}")
    logger.info(f"Effective Batch Size: {actual_batch_size * gradient_accumulation_steps}")
    print_gpu_memory("at start of training")

    torch.cuda.empty_cache()
    torch.backends.cudnn.benchmark = True

    # autocast wants the bare device type ("cuda"), not an indexed name ("cuda:0").
    autocast_device_type = torch.device(device).type

    for epoch in range(num_epochs):
        model.train()
        optimizer.zero_grad()  # Zero gradients at start of epoch

        for batch_idx, batch in enumerate(train_loader):
            input_batch = batch['input_ids'].to(device)
            target_batch = batch['labels'].to(device)

            # Forward pass
            with torch.autocast(device_type=autocast_device_type, dtype=torch.bfloat16):
                logits, original_loss = model(input_batch, target_batch)

            # Scale loss so accumulated gradients average over the micro-batches.
            scaled_loss = original_loss / gradient_accumulation_steps
            scaled_loss.backward()

            # Log the unscaled loss.
            total_loss += original_loss.item()
            tokens_seen += input_batch.numel()
            batches_seen += 1
            avg_loss = total_loss / batches_seen

            if batch_idx % 25 == 0:
                logger.info(f"Batch {batch_idx + 1}, Running Avg Loss: {avg_loss:.5f}")

            # Only update weights after accumulating gradients
            if (batch_idx + 1) % gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()
                scheduler.step()  # Update learning rate
                optimizer.zero_grad()
                global_step += 1

                # Evaluated right after an optimizer step so that each
                # global_step is sampled/checkpointed exactly once, even when
                # several micro-batches share the same step.
                if global_step % eval_freq == 0 and global_step > 0:
                    current_lr = scheduler.get_last_lr()[0]
                    optimizer_lr = optimizer.param_groups[0]['lr']
                    print_gpu_memory(f"at step {global_step}")
                    logger.info(f"learning rate: {current_lr:.8f}")
                    logger.info(f"Ep {epoch+1} (Step {global_step:06d}): "
                                f"Avg loss {avg_loss:.3f} | {tokens_seen} tokens seen")
                    logger.info(f"optimizer lr: {optimizer_lr:.8f}")
                    logger.info(f"scheduler lr: {current_lr:.8f}")

                    _log_generated_sample(model, tokenizer, start_context, device,
                                          global_step, batch_idx)
                    # Make sure training continues in train mode after sampling.
                    model.train()

                    _save_and_upload_checkpoint(config, model, optimizer, scheduler,
                                                global_step, avg_loss, optimizer_lr)

            if batch_idx % 100 == 0:
                logger.info(f"Batch {batch_idx} finished")
                logger.info("+++" * 30)

    logger.info("Training complete")
if __name__ == "__main__":
    import os
    import time

    # Set the allocator option before the first CUDA call; it was previously
    # assigned only after the model had already been moved to the GPU.
    os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:512'

    # NOTE(review): yaml.FullLoader is acceptable for a trusted local file;
    # switch to yaml.safe_load if the config can ever come from outside.
    with open("config_smollm2_135M.yaml", "r") as config_file:
        config = yaml.load(config_file, Loader=yaml.FullLoader)
    logger.info(config)

    # Set memory efficient settings
    torch.set_float32_matmul_precision('high')
    torch.backends.cudnn.benchmark = True
    torch.backends.cuda.matmul.allow_tf32 = True

    # Empty cache before model creation
    torch.cuda.empty_cache()

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = LlamaModel(config['model'])
    # Enable gradient checkpointing for memory efficiency
    # model.gradient_checkpointing_enable()
    model.to(device)
    model = torch.compile(model)
    logger.info(model)
    logger.info("++" * 30)

    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=3e-4,
        weight_decay=0.15,
        betas=(0.9, 0.95),
    )

    tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/cosmo2-tokenizer")
    tokenizer.pad_token = tokenizer.eos_token

    # NOTE(review): batch_size is hard-coded here, while train_model reports
    # config['tokens']['micro_batch_size']; confirm that the two agree.
    train_loader = load_cosmopedia_dataset(
        batch_size=16,
        seq_length=1024,
        tokenizer=tokenizer,
    )
    if train_loader is None:
        # load_cosmopedia_dataset logs the failure and returns None.
        raise SystemExit("Dataset could not be loaded; see training.log for details")

    t1 = time.time()
    train_model(
        config,
        model,
        train_loader,
        train_loader,  # no separate evaluation loader; the training loader is passed twice
        optimizer=optimizer,
        device=device,
        num_epochs=1,
        eval_freq=1000,  # sample and checkpoint every 1000 optimizer steps
        eval_iter=1000,
        start_context="Once Upon a Time far far away in a galaxy",
        tokenizer=tokenizer,
    )
    t2 = time.time()
    logger.info(f"Time taken for training: {t2 - t1:.2f} seconds")