crpatel committed (verified)
Commit c31f4d9
1 Parent(s): df34a30

Upload config_smollm2_135M.yaml with huggingface_hub
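The commit message indicates the file was pushed programmatically. Below is a minimal sketch of that kind of upload using huggingface_hub's upload_file helper; the repo id is an assumption, since the commit page only shows the file name and the hash, not the target repository or local path.

from huggingface_hub import upload_file

# Sketch of the upload step; repo_id is a placeholder, the commit page
# does not show which repository or local path was actually used.
upload_file(
    path_or_fileobj="config_smollm2_135M.yaml",
    path_in_repo="config_smollm2_135M.yaml",
    repo_id="crpatel/smollm2-135M",  # assumed repo id
    commit_message="Upload config_smollm2_135M.yaml with huggingface_hub",
)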

Files changed (1)
  1. config_smollm2_135M.yaml +103 -0
config_smollm2_135M.yaml ADDED
@@ -0,0 +1,103 @@
+ checkpoints:
+   checkpoint_interval: 2000
+   checkpoints_path: checkpoints
+   checkpoints_path_is_shared_file_system: false
+   resume_checkpoint_path: null
+   save_final_state: false
+   save_initial_state: false
+ data_stages:
+ - data:
+     dataset:
+       dataset_folder:
+       - datasets/smollm2-corpus
+       dataset_weights:
+       - 1.0
+     num_loading_workers: 0
+     seed: 8
+   name: stable phase
+   start_training_step: 1
+ general:
+   benchmark_csv_path: null
+   consumed_train_samples: null
+   ignore_sanity_checks: true
+   project: smollm2
+   run: smollm2-135M
+   seed: 8
+   step: null
+ logging:
+   iteration_step_info_interval: 1
+   log_level: info
+   log_level_replica: info
+ model:
+   ddp_bucket_cap_mb: 25
+   dtype: bfloat16
+   init_method:
+     std: 0.041666666666666664
+   make_vocab_size_divisible_by: 1
+   model_config:
+     bos_token_id: 0
+     eos_token_id: 0
+     hidden_act: silu
+     hidden_size: 576
+     initializer_range: 0.041666666666666664
+     intermediate_size: 1536
+     is_llama_config: true
+     max_position_embeddings: 2048
+     num_attention_heads: 9
+     num_hidden_layers: 30
+     num_key_value_heads: 3
+     pad_token_id: null
+     pretraining_tp: 1
+     rms_norm_eps: 1.0e-05
+     rope_interleaved: false
+     rope_scaling: null
+     rope_theta: 10000.0
+     tie_word_embeddings: true
+     use_cache: true
+     vocab_size: 49152
+ s3_bucket: smollm2-train-jan-25-era3
+ s3_checkpoint_folder: checkpoints
+ s3_log_folder: logs
+ s3_log_file_name: training.log
+ optimizer:
+   accumulate_grad_in_fp32: true
+   clip_grad: 1.0
+   learning_rate_scheduler:
+     learning_rate: 0.003
+     lr_decay_starting_step: 1600000
+     lr_decay_steps: 400000
+     lr_decay_style: linear
+     lr_warmup_steps: 2000
+     lr_warmup_style: linear
+     min_decay_lr: 0
+   optimizer_factory:
+     adam_beta1: 0.9
+     adam_beta2: 0.95
+     adam_eps: 1.0e-08
+     name: adamW
+     torch_adam_is_fused: true
+   weight_decay: 0.01
+   zero_stage: 0
+ parallelism:
+   dp: 64
+   expert_parallel_size: 1
+   pp: 1
+   pp_engine: 1f1b
+   recompute_layer: false
+   tp: 1
+   tp_linear_async_communication: true
+   tp_mode: REDUCE_SCATTER
+   tp_recompute_allgather: true
+ profiler: null
+ tokenizer:
+   tokenizer_max_length: null
+   tokenizer_name_or_path: HuggingFaceTB/cosmo2-tokenizer
+   tokenizer_revision: null
+ tokens:
+   batch_accumulation_per_replica: 1
+   limit_test_batches: 0
+   limit_val_batches: 0
+   micro_batch_size: 16 #16
+   sequence_length: 1024 #2048
+   train_steps: 2000000
+   val_check_interval: 1000
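For reference, a minimal sketch of loading the uploaded config and deriving the effective batch size from the values shown in the diff above; it assumes only PyYAML and that the file sits in the working directory.

import yaml

# Load the uploaded config and compute the global batch per optimizer step.
with open("config_smollm2_135M.yaml") as f:
    cfg = yaml.safe_load(f)

tok = cfg["tokens"]
dp = cfg["parallelism"]["dp"]

# Global batch per step: dp replicas x micro batch x gradient accumulation.
sequences_per_step = dp * tok["micro_batch_size"] * tok["batch_accumulation_per_replica"]
tokens_per_step = sequences_per_step * tok["sequence_length"]

print(sequences_per_step)  # 64 * 16 * 1 = 1024 sequences per step
print(tokens_per_step)     # 1024 * 1024 = 1,048,576 tokens per step

With dp: 64, micro_batch_size: 16, batch_accumulation_per_replica: 1 and sequence_length: 1024, each optimizer step consumes roughly one million tokens.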