# VerySmolTextGen / config.yaml
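# A minimal sketch for loading this file in Python with PyYAML (the repo's
# actual training entry point is not shown here, so treat this as illustrative):
#
#   import yaml
#
#   with open("config.yaml") as f:
#       cfg = yaml.safe_load(f)
#
#   assert cfg["model"]["hidden_size"] == 256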
model:
  type: "custom"
  name: "smollm2_transformer"
  tokenizer_name: "gpt2"
  vocab_size: 50257
  hidden_size: 256
  num_attention_heads: 4
  num_key_value_heads: 2
  num_hidden_layers: 6
  intermediate_size: 512
  hidden_act: "gelu"
  max_position_embeddings: 256
  initializer_range: 0.02
  rms_norm_eps: 1.0e-5
  use_cache: true
  pad_token_id: null
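# Derived shapes: head_dim = hidden_size / num_attention_heads = 256 / 4 = 64.
# With num_key_value_heads = 2, attention is grouped-query: every 2 query heads
# share one KV head. pad_token_id is null because the GPT-2 tokenizer defines
# no pad token by default.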
optimizer:
  type: "adamW"
  weight_decay: 0.01
  adam_beta1: 0.9
  adam_beta2: 0.95
  adam_eps: 1.0e-8
  torch_adam_is_fused: true
  clip_grad: 1.0
  accumulate_grad_in_fp32: true
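# Assuming the training script maps these fields onto torch.optim.AdamW
# (an illustrative sketch, not necessarily the repo's exact wiring):
#
#   import torch
#
#   opt = torch.optim.AdamW(
#       model.parameters(),
#       lr=1.0e-3,            # training.learning_rate below
#       betas=(0.9, 0.95),    # adam_beta1 / adam_beta2
#       eps=1.0e-8,           # adam_eps
#       weight_decay=0.01,
#       fused=True,           # torch_adam_is_fused
#   )
#
# clip_grad: 1.0 matches hardware.gradient_clip below and would typically be
# applied via torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0).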
scheduler:
  type: "one_cycle"
  learning_rate: 0.001
  warmup_steps: 50
  max_lr: 0.001
  pct_start: 0.02
  anneal_strategy: "cos"
  cycle_momentum: false
  div_factor: 25.0
  final_div_factor: 1000.0
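# Derived schedule values, assuming PyTorch OneCycleLR semantics:
#   initial_lr = max_lr / div_factor           = 0.001 / 25.0    = 4.0e-5
#   min_lr     = initial_lr / final_div_factor = 4.0e-5 / 1000.0 = 4.0e-8
# Note: OneCycleLR has no warmup_steps argument; with total_steps = 5050,
# pct_start = 0.02 implies ~101 ramp-up steps, so warmup_steps: 50 is
# presumably consumed by custom code rather than passed to the scheduler.
# An illustrative construction, reusing opt from the optimizer sketch above:
#
#   from torch.optim.lr_scheduler import OneCycleLR
#
#   sched = OneCycleLR(
#       opt,
#       max_lr=1.0e-3,
#       total_steps=5050,     # training.max_steps below
#       pct_start=0.02,
#       anneal_strategy="cos",
#       cycle_momentum=False,
#       div_factor=25.0,
#       final_div_factor=1000.0,
#   )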
training:
  output_dir: "./results"
  batch_size: 4
  micro_batch_size: 2
  gradient_accumulation_steps: 2
  sequence_length: 256
  learning_rate: 0.001
  max_steps: 5050  # Total steps (first_phase_steps + second_phase_steps)
  first_phase_steps: 5000  # Initial training phase
  second_phase_steps: 50  # Fine-tuning phase
  sample_frequency: 100  # Sample every 100 steps in the first phase
  second_phase_sample_frequency: 5  # Sample more frequently in the second phase
  logging_dir: "./logs"
  logging_steps: 1
  save_steps: 100
  checkpoint_dir: "checkpoints"
  sample_prompt: "Explain what machine learning is:"
  max_generate_length: 50
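# Consistency checks on the values above:
#   micro_batch_size x gradient_accumulation_steps = 2 x 2 = 4 = batch_size
#   first_phase_steps + second_phase_steps = 5000 + 50 = 5050 = max_steps
# At batch_size 4 and sequence_length 256, each optimizer step sees
# 4 x 256 = 1024 tokens, or roughly 5.2M tokens over the full run
# (assuming every batch is packed to full length).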
hardware:
  precision: "16-mixed"
  accelerator: "gpu"
  devices: 1
  strategy: "auto"
  gradient_clip: 1.0
  cuda_memory_fraction: 0.9
  allow_tf32: true
  benchmark: true
  deterministic: false
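# These keys line up with PyTorch Lightning Trainer arguments
# (precision="16-mixed", accelerator="gpu", devices=1, strategy="auto");
# whether this repo actually uses Lightning is an assumption. The remaining
# flags map onto plain PyTorch, roughly:
#
#   import torch
#
#   torch.cuda.set_per_process_memory_fraction(0.9)  # cuda_memory_fraction
#   torch.backends.cuda.matmul.allow_tf32 = True     # allow_tf32
#   torch.backends.cudnn.allow_tf32 = True
#   torch.backends.cudnn.benchmark = True            # benchmark
#   torch.use_deterministic_algorithms(False)        # deterministic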
data:
  datasets:
    - name: "wikitext"
      path: "wikitext"
      subset: "wikitext-103-raw-v1"
      split_ratio: 0.01  # Use only 1% of the dataset
      weight: 1.0
  loading:
    num_workers: 2
    batch_size: 16
    pin_memory: true
    prefetch_factor: 2
    persistent_workers: true
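# A sketch of how these fields could map onto Hugging Face datasets and a
# PyTorch DataLoader (illustrative; the repo's loader code is not shown here):
#
#   from datasets import load_dataset
#   from torch.utils.data import DataLoader
#
#   ds = load_dataset("wikitext", "wikitext-103-raw-v1", split="train[:1%]")
#   loader = DataLoader(
#       ds,                   # after tokenization / collation
#       batch_size=16,
#       num_workers=2,
#       pin_memory=True,
#       prefetch_factor=2,
#       persistent_workers=True,
#   )
#
# Note: loading.batch_size (16) differs from training.batch_size (4); the
# loader batch presumably feeds tokenization/packing rather than the
# optimizer step directly.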