model:
  type: "custom"
  name: "smollm2_transformer"
  tokenizer_name: "gpt2"
  vocab_size: 50257
  hidden_size: 256
  num_attention_heads: 4
  num_key_value_heads: 2
  num_hidden_layers: 6
  intermediate_size: 512
  hidden_act: "gelu"
  max_position_embeddings: 256
  initializer_range: 0.02
  rms_norm_eps: 1.0e-5
  use_cache: true
  pad_token_id: null

optimizer:
  type: "adamW"
  weight_decay: 0.01
  adam_beta1: 0.9
  adam_beta2: 0.95
  adam_eps: 1.0e-8
  torch_adam_is_fused: true
  clip_grad: 1.0
  accumulate_grad_in_fp32: true

scheduler:
  type: "one_cycle"
  learning_rate: 0.001
  warmup_steps: 50
  max_lr: 0.001
  pct_start: 0.02
  anneal_strategy: "cos"
  cycle_momentum: false
  div_factor: 25.0
  final_div_factor: 1000.0

training:
  output_dir: "./results"
  batch_size: 4
  micro_batch_size: 2
  gradient_accumulation_steps: 2
  sequence_length: 256
  learning_rate: 0.001
  max_steps: 5050                        # Total steps (5000 + 50)
  first_phase_steps: 5000                # Initial training phase
  second_phase_steps: 50                 # Fine-tuning phase
  sample_frequency: 100                  # Sample every 100 steps in first phase
  second_phase_sample_frequency: 5       # Sample more frequently in second phase
  logging_dir: "./logs"
  logging_steps: 1
  save_steps: 100
  checkpoint_dir: "checkpoints"
  sample_prompt: "Explain what machine learning is:"
  max_generate_length: 50

hardware:
  precision: "16-mixed"
  accelerator: "gpu"
  devices: 1
  strategy: "auto"
  gradient_clip: 1.0
  cuda_memory_fraction: 0.9
  allow_tf32: true
  benchmark: true
  deterministic: false

data:
  datasets:
    - name: "wikitext"
      path: "wikitext"
      subset: "wikitext-103-raw-v1"
      split_ratio: 0.01                  # Use only 1% of the dataset
      weight: 1.0
  loading:
    num_workers: 2
    batch_size: 16
    pin_memory: true
    prefetch_factor: 2
    persistent_workers: true
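
# The following is a minimal, illustrative Python sketch (not part of the training code)
# for loading this file with PyYAML and sanity-checking the derived quantities:
# effective batch size (micro_batch_size * gradient_accumulation_steps), the two-phase
# step budget (5000 + 50 = 5050), and the GQA head layout (4 query heads / 2 KV heads).
# The file name "config.yaml" is an assumption for the example.
#
#   import yaml  # PyYAML
#
#   with open("config.yaml") as f:
#       cfg = yaml.safe_load(f)
#
#   train = cfg["training"]
#   model = cfg["model"]
#
#   # Effective batch size should match the declared batch_size (2 * 2 = 4 here).
#   effective_batch = train["micro_batch_size"] * train["gradient_accumulation_steps"]
#   assert effective_batch == train["batch_size"]
#
#   # Total steps should cover both training phases.
#   assert train["max_steps"] == train["first_phase_steps"] + train["second_phase_steps"]
#
#   # Grouped-query attention: query heads must be divisible by key/value heads.
#   assert model["num_attention_heads"] % model["num_key_value_heads"] == 0
#
#   print(f"effective batch size: {effective_batch}, "
#         f"tokens per optimizer step: {effective_batch * train['sequence_length']}")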