checkpoints:
  checkpoint_interval: 2000
  checkpoints_path: checkpoints
  checkpoints_path_is_shared_file_system: false
  resume_checkpoint_path: null
  save_final_state: false
  save_initial_state: false
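# Data stages let a run switch datasets at given steps; this run uses a single
# stage ("stable phase") that starts at step 1 and samples smollm2-corpus with
# weight 1.0.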
data_stages:
- data:
    dataset:
      dataset_folder:
      - datasets/smollm2-corpus
      dataset_weights:
      - 1.0
    num_loading_workers: 0
    seed: 8
  name: stable phase
  start_training_step: 1
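# Run metadata; the seed here matches the data-stage seed above.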
general:
  benchmark_csv_path: null
  consumed_train_samples: null
  ignore_sanity_checks: true
  project: smollm2
  run: smollm2-135M
  seed: 8
  step: null
logging:
  iteration_step_info_interval: 1
  log_level: info
  log_level_replica: info
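# Llama-style decoder in the 135M class: 30 layers, hidden size 576, 9 attention
# heads with 3 KV heads (grouped-query attention, 3 query heads per KV head,
# head_dim = 576 / 9 = 64), tied input/output embeddings. The init std
# 0.041666... is 1/24 = 1/sqrt(576), i.e. 1/sqrt(hidden_size).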
model:
  ddp_bucket_cap_mb: 25
  dtype: bfloat16
  init_method:
    std: 0.041666666666666664
  make_vocab_size_divisible_by: 1
  model_config:
    bos_token_id: 0
    eos_token_id: 0
    hidden_act: silu
    hidden_size: 576
    initializer_range: 0.041666666666666664
    intermediate_size: 1536
    is_llama_config: true
    max_position_embeddings: 2048
    num_attention_heads: 9
    num_hidden_layers: 30
    num_key_value_heads: 3
    pad_token_id: null
    pretraining_tp: 1
    rms_norm_eps: 1.0e-05
    rope_interleaved: false
    rope_scaling: null
    rope_theta: 10000.0
    tie_word_embeddings: true
    use_cache: true
    vocab_size: 49152
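# S3 upload destination for checkpoints and training logs; bucket and folder
# names are specific to this run.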
s3_bucket: smollm2-train-jan-25-era3
s3_checkpoint_folder: checkpoints
s3_log_folder: logs
s3_log_file_name: training.log
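# AdamW with gradient clipping at 1.0. The schedule warms up linearly for 2000
# steps to 3e-3, holds there until step 1.6M, then decays linearly to 0 over
# the final 400k steps (1.6M + 400k = train_steps), a warmup-stable-decay
# shape consistent with the "stable phase" stage name above.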
optimizer:
  accumulate_grad_in_fp32: true
  clip_grad: 1.0
  learning_rate_scheduler:
    learning_rate: 0.003
    lr_decay_starting_step: 1600000
    lr_decay_steps: 400000
    lr_decay_style: linear
    lr_warmup_steps: 2000
    lr_warmup_style: linear
    min_decay_lr: 0
  optimizer_factory:
    adam_beta1: 0.9
    adam_beta2: 0.95
    adam_eps: 1.0e-08
    name: adamW
    torch_adam_is_fused: true
  weight_decay: 0.01
  zero_stage: 0
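# 64-way data parallelism only (tp = pp = 1), i.e. 64 GPU ranks; with tp: 1
# the tensor-parallel options below should have no effect.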
parallelism:
  dp: 64
  expert_parallel_size: 1
  pp: 1
  pp_engine: 1f1b
  recompute_layer: false
  tp: 1
  tp_linear_async_communication: true
  tp_mode: REDUCE_SCATTER
  tp_recompute_allgather: true
profiler: null
tokenizer:
  tokenizer_max_length: null
  tokenizer_name_or_path: HuggingFaceTB/cosmo2-tokenizer
  tokenizer_revision: null
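# Global batch = dp (64) x micro_batch_size (16) x batch_accumulation (1)
# = 1024 sequences of 1024 tokens, roughly 1.05M tokens per step; over 2M
# train steps that works out to about 2T tokens.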
tokens:
  batch_accumulation_per_replica: 1
  limit_test_batches: 0
  limit_val_batches: 0
  micro_batch_size: 16
  sequence_length: 1024
  train_steps: 2000000
  val_check_interval: 1000
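# Launch sketch (an assumption, not part of this config): with the stock
# nanotron trainer, something like
#   torchrun --nproc_per_node=8 run_train.py --config-file smollm2_135M.yaml
# per node, plus the usual multi-node rendezvous flags, since dp: 64 implies
# 64 GPU ranks in total.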