---
# Training configuration for a small custom SmolLM2-style transformer.
# Reconstructed from a garbled extraction: stray "|" artifact lines removed
# and section nesting re-indented (2 spaces, block style).

# Architecture of the model to train.
model:
  type: "custom"
  name: "smollm2_transformer"
  tokenizer_name: "gpt2"
  vocab_size: 50257              # matches the GPT-2 tokenizer vocabulary
  hidden_size: 256
  num_attention_heads: 4
  num_key_value_heads: 2         # < num_attention_heads, i.e. grouped-query attention
  num_hidden_layers: 6
  intermediate_size: 512
  hidden_act: "gelu"
  max_position_embeddings: 256   # must be >= training.sequence_length
  initializer_range: 0.02
  rms_norm_eps: 1.0e-5
  use_cache: true
  pad_token_id: null             # GPT-2 tokenizer has no pad token by default

optimizer:
  type: "adamW"
  weight_decay: 0.01
  adam_beta1: 0.9
  adam_beta2: 0.95
  adam_eps: 1.0e-8
  torch_adam_is_fused: true
  # NOTE(review): duplicated with hardware.gradient_clip (both 1.0) —
  # confirm which one the trainer actually reads and keep them in sync.
  clip_grad: 1.0
  accumulate_grad_in_fp32: true

scheduler:
  type: "one_cycle"
  # NOTE(review): learning_rate is also set under training: (same value);
  # keep both in sync until one is removed.
  learning_rate: 0.001
  warmup_steps: 50
  max_lr: 0.001                  # peak LR of the one-cycle schedule
  pct_start: 0.02                # fraction of the cycle spent increasing the LR
  anneal_strategy: "cos"
  cycle_momentum: false          # AdamW has no momentum to cycle
  div_factor: 25.0               # initial_lr = max_lr / div_factor
  final_div_factor: 1000.0       # min_lr = initial_lr / final_div_factor

training:
  output_dir: "./results"
  # Effective batch: micro_batch_size x gradient_accumulation_steps = 4.
  batch_size: 4
  micro_batch_size: 2
  gradient_accumulation_steps: 2
  sequence_length: 256
  learning_rate: 0.001
  # Two-phase run: max_steps = first_phase_steps + second_phase_steps.
  max_steps: 5050
  first_phase_steps: 5000
  second_phase_steps: 50
  sample_frequency: 100              # generate a sample every N steps (phase 1)
  second_phase_sample_frequency: 5   # denser sampling in the short second phase
  logging_dir: "./logs"
  logging_steps: 1
  save_steps: 100
  checkpoint_dir: "checkpoints"
  sample_prompt: "Explain what machine learning is:"
  max_generate_length: 50

# Runtime/accelerator settings (Lightning-style trainer keys).
hardware:
  precision: "16-mixed"
  accelerator: "gpu"
  devices: 1
  strategy: "auto"
  gradient_clip: 1.0
  cuda_memory_fraction: 0.9
  allow_tf32: true
  benchmark: true              # enables cudnn autotuner; non-deterministic
  deterministic: false

data:
  datasets:
    - name: "wikitext"
      path: "wikitext"
      subset: "wikitext-103-raw-v1"
      split_ratio: 0.01        # use 1% of the corpus
      weight: 1.0
  loading:
    num_workers: 2
    # NOTE(review): this loader-level batch_size (16) differs from
    # training.batch_size (4) — presumably they serve different stages;
    # confirm against the dataloader code.
    batch_size: 16
    pin_memory: true
    prefetch_factor: 2
    persistent_workers: true