---
# Training configuration for a small custom SmolLM2-style transformer.
# Reconstructed from a garbled extraction: stray "|" artifact lines removed
# and section nesting re-indented (2 spaces, block style).

# Architecture of the model to train.
model:
  type: "custom"
  name: "smollm2_transformer"
  tokenizer_name: "gpt2"
  vocab_size: 50257              # matches the GPT-2 tokenizer vocabulary
  hidden_size: 256
  num_attention_heads: 4
  num_key_value_heads: 2         # < num_attention_heads, i.e. grouped-query attention
  num_hidden_layers: 6
  intermediate_size: 512
  hidden_act: "gelu"
  max_position_embeddings: 256   # must be >= training.sequence_length
  initializer_range: 0.02
  rms_norm_eps: 1.0e-5
  use_cache: true
  pad_token_id: null             # GPT-2 tokenizer has no pad token by default

optimizer:
  type: "adamW"
  weight_decay: 0.01
  adam_beta1: 0.9
  adam_beta2: 0.95
  adam_eps: 1.0e-8
  torch_adam_is_fused: true
  # NOTE(review): duplicated with hardware.gradient_clip (both 1.0) —
  # confirm which one the trainer actually reads and keep them in sync.
  clip_grad: 1.0
  accumulate_grad_in_fp32: true

scheduler:
  type: "one_cycle"
  # NOTE(review): learning_rate is also set under training: (same value);
  # keep both in sync until one is removed.
  learning_rate: 0.001
  warmup_steps: 50
  max_lr: 0.001                  # peak LR of the one-cycle schedule
  pct_start: 0.02                # fraction of the cycle spent increasing the LR
  anneal_strategy: "cos"
  cycle_momentum: false          # AdamW has no momentum to cycle
  div_factor: 25.0               # initial_lr = max_lr / div_factor
  final_div_factor: 1000.0       # min_lr = initial_lr / final_div_factor

training:
  output_dir: "./results"
  # Effective batch: micro_batch_size x gradient_accumulation_steps = 4.
  batch_size: 4
  micro_batch_size: 2
  gradient_accumulation_steps: 2
  sequence_length: 256
  learning_rate: 0.001
  # Two-phase run: max_steps = first_phase_steps + second_phase_steps.
  max_steps: 5050
  first_phase_steps: 5000
  second_phase_steps: 50
  sample_frequency: 100              # generate a sample every N steps (phase 1)
  second_phase_sample_frequency: 5   # denser sampling in the short second phase
  logging_dir: "./logs"
  logging_steps: 1
  save_steps: 100
  checkpoint_dir: "checkpoints"
  sample_prompt: "Explain what machine learning is:"
  max_generate_length: 50

# Runtime/accelerator settings (Lightning-style trainer keys).
hardware:
  precision: "16-mixed"
  accelerator: "gpu"
  devices: 1
  strategy: "auto"
  gradient_clip: 1.0
  cuda_memory_fraction: 0.9
  allow_tf32: true
  benchmark: true              # enables cudnn autotuner; non-deterministic
  deterministic: false

data:
  datasets:
    - name: "wikitext"
      path: "wikitext"
      subset: "wikitext-103-raw-v1"
      split_ratio: 0.01        # use 1% of the corpus
      weight: 1.0
  loading:
    num_workers: 2
    # NOTE(review): this loader-level batch_size (16) differs from
    # training.batch_size (4) — presumably they serve different stages;
    # confirm against the dataloader code.
    batch_size: 16
    pin_memory: true
    prefetch_factor: 2
    persistent_workers: true