gsmyrnis committed (verified)
Commit: 7325891
Parent(s): 74e48dd

Upload configs.yaml with huggingface_hub

Files changed (1)
configs.yaml  +13 -21
configs.yaml CHANGED
@@ -1,34 +1,27 @@
-adam_beta1: 0.9
-adam_beta2: 0.999
 bf16: true
-cutoff_len: 2048
+cutoff_len: 16384
 dataset: mlfoundations-dev/r1_annotated_math
 dataset_dir: ONLINE
 ddp_timeout: 180000000
 deepspeed: dcft/train/zero3.json
 do_train: true
-enable_liger_kernel: false
-eval_strategy: epoch
+eval_strategy: 'no'
 finetuning_type: full
 formatting: sharegpt
-global_batch_size: 512
-gradient_accumulation_steps: 1
-gradient_checkpointing: true
+global_batch_size: 96
+gradient_accumulation_steps: 3
 hub_model_id: mlfoundations-dev/llama3-1_8b_r1_annotated_math
-include_hp: dcft/train/hp_settings/hritik.yaml
-learning_rate: 5.0e-06
-logging_steps: 10
-lr_scheduler_type: constant
-max_grad_norm: 1
+include_hp: dcft/train/hp_settings/reasoning.yaml
+learning_rate: 1.0e-05
+logging_steps: 1
+lr_scheduler_type: cosine
+max_samples: 1000000
 messages: conversations
-model_name_or_path: meta-llama/Meta-Llama-3.1-8B
-neat_packing: true
+model_name_or_path: Qwen/Qwen2.5-7B-Instruct
 num_train_epochs: 3.0
 output_dir: /tmp/dcft_checkpoints/llama3-1_8b_r1_annotated_math
 overwrite_cache: true
-overwrite_output_dir: true
-packing: true
-per_device_train_batch_size: 16
+per_device_train_batch_size: 1
 plot_loss: true
 preprocessing_num_workers: 16
 push_to_db: true
@@ -37,6 +30,5 @@ report_to: wandb
 run_name: llama3-1_8b_r1_annotated_math
 save_strategy: epoch
 stage: sft
-template: llama3
-val_size: 0.05
-weight_decay: 0.1
+template: qwen25
+warmup_ratio: 0.1
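
For context on the revised batch settings: in LLaMA-Factory-style configs the effective global batch is the product of per_device_train_batch_size, gradient_accumulation_steps, and the number of data-parallel workers. Below is a minimal Python sketch of that arithmetic; the world size of 32 is an assumption (it is not recorded in configs.yaml), chosen because it makes the product match the declared global_batch_size of 96.

per_device_train_batch_size = 1   # new value in configs.yaml
gradient_accumulation_steps = 3   # new value in configs.yaml
assumed_world_size = 32           # assumption only, e.g. 4 nodes x 8 GPUs; not stored in the config

# Effective global batch = per-device batch x accumulation steps x data-parallel workers
effective_global_batch = (per_device_train_batch_size
                          * gradient_accumulation_steps
                          * assumed_world_size)
print(effective_global_batch)  # 96, matching global_batch_size in the new config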