new evals_per_epoch and saves_per_epoch to make things cleaner (#944)
* new evals_per_epoch and saves_per_epoch to make things cleaner
* update per PR feedback
- README.md +3 -1
- examples/cerebras/btlm-ft.yml +2 -2
- examples/cerebras/qlora.yml +2 -2
- examples/code-llama/13b/lora.yml +2 -2
- examples/code-llama/13b/qlora.yml +2 -2
- examples/code-llama/34b/lora.yml +2 -2
- examples/code-llama/34b/qlora.yml +2 -2
- examples/code-llama/7b/lora.yml +2 -2
- examples/code-llama/7b/qlora.yml +2 -2
- examples/falcon/config-7b-lora.yml +2 -2
- examples/falcon/config-7b-qlora.yml +2 -2
- examples/falcon/config-7b.yml +2 -2
- examples/gptj/qlora.yml +2 -2
- examples/jeopardy-bot/config.yml +2 -2
- examples/llama-2/fft_optimized.yml +2 -2
- examples/llama-2/gptq-lora.yml +2 -2
- examples/llama-2/lora.yml +2 -2
- examples/llama-2/qlora.yml +2 -2
- examples/llama-2/relora.yml +2 -2
- examples/llama-2/tiny-llama.yml +2 -2
- examples/mamba/config.yml +2 -2
- examples/mistral/config.yml +2 -2
- examples/mistral/mixtral.yml +2 -2
- examples/mistral/qlora.yml +2 -2
- examples/mpt-7b/config.yml +2 -2
- examples/openllama-3b/config.yml +2 -2
- examples/openllama-3b/lora.yml +2 -2
- examples/openllama-3b/qlora.yml +2 -2
- examples/phi/phi-ft.yml +2 -2
- examples/phi/phi-qlora.yml +2 -2
- examples/pythia/lora.yml +1 -1
- examples/qwen/lora.yml +2 -2
- examples/qwen/qlora.yml +2 -2
- examples/redpajama/config-3b.yml +2 -2
- examples/replit-3b/config-lora.yml +2 -2
- examples/xgen-7b/xgen-7b-8k-qlora.yml +2 -2
- src/axolotl/utils/config.py +30 -0
README.md
CHANGED
|
@@ -691,9 +691,11 @@ warmup_ratio: 0.05 # cannot use with warmup_steps
|
|
| 691 |
learning_rate: 0.00003
|
| 692 |
lr_quadratic_warmup:
|
| 693 |
logging_steps:
|
|
|
|
|
|
|
| 694 |
save_strategy: # Set to `no` to skip checkpoint saves
|
| 695 |
save_steps: # Leave empty to save at each epoch
|
| 696 |
-
|
| 697 |
save_total_limit: # Checkpoints saved at a time
|
| 698 |
# Maximum number of iterations to train for. It precedes num_epochs which means that
|
| 699 |
# if both are set, num_epochs will not be guaranteed.
|
|
|
|
| 691 |
learning_rate: 0.00003
|
| 692 |
lr_quadratic_warmup:
|
| 693 |
logging_steps:
|
| 694 |
+
eval_steps: # Leave empty to eval at each epoch, integers for every N steps. decimal for fraction of total steps
|
| 695 |
+
evals_per_epoch: # number of times per epoch to run evals, mutually exclusive with eval_steps
|
| 696 |
save_strategy: # Set to `no` to skip checkpoint saves
|
| 697 |
save_steps: # Leave empty to save at each epoch
|
| 698 |
+
saves_per_epoch: # number of times per epoch to save a checkpoint, mutually exclusive with save_steps
|
| 699 |
save_total_limit: # Checkpoints saved at a time
|
| 700 |
# Maximum number of iterations to train for. It precedes num_epochs which means that
|
| 701 |
# if both are set, num_epochs will not be guaranteed.
|
examples/cerebras/btlm-ft.yml
CHANGED
|
@@ -72,8 +72,8 @@ gptq_groupsize:
|
|
| 72 |
gptq_model_v1:
|
| 73 |
|
| 74 |
warmup_steps: 32
|
| 75 |
-
|
| 76 |
-
|
| 77 |
save_total_limit:
|
| 78 |
|
| 79 |
debug:
|
|
|
|
| 72 |
gptq_model_v1:
|
| 73 |
|
| 74 |
warmup_steps: 32
|
| 75 |
+
evals_per_epoch: 4
|
| 76 |
+
saves_per_epoch: 1
|
| 77 |
save_total_limit:
|
| 78 |
|
| 79 |
debug:
|
examples/cerebras/qlora.yml
CHANGED
|
@@ -49,8 +49,8 @@ flash_attention:
|
|
| 49 |
gptq_groupsize:
|
| 50 |
gptq_model_v1:
|
| 51 |
warmup_steps: 10
|
| 52 |
-
|
| 53 |
-
|
| 54 |
debug:
|
| 55 |
deepspeed:
|
| 56 |
weight_decay: 0.1
|
|
|
|
| 49 |
gptq_groupsize:
|
| 50 |
gptq_model_v1:
|
| 51 |
warmup_steps: 10
|
| 52 |
+
evals_per_epoch: 4
|
| 53 |
+
saves_per_epoch: 1
|
| 54 |
debug:
|
| 55 |
deepspeed:
|
| 56 |
weight_decay: 0.1
|
examples/code-llama/13b/lora.yml
CHANGED
|
@@ -54,8 +54,8 @@ xformers_attention:
|
|
| 54 |
flash_attention: true
|
| 55 |
|
| 56 |
warmup_steps: 10
|
| 57 |
-
|
| 58 |
-
|
| 59 |
debug:
|
| 60 |
deepspeed:
|
| 61 |
weight_decay: 0.0
|
|
|
|
| 54 |
flash_attention: true
|
| 55 |
|
| 56 |
warmup_steps: 10
|
| 57 |
+
evals_per_epoch: 4
|
| 58 |
+
saves_per_epoch: 1
|
| 59 |
debug:
|
| 60 |
deepspeed:
|
| 61 |
weight_decay: 0.0
|
examples/code-llama/13b/qlora.yml
CHANGED
|
@@ -56,8 +56,8 @@ xformers_attention:
|
|
| 56 |
flash_attention: true
|
| 57 |
|
| 58 |
warmup_steps: 10
|
| 59 |
-
|
| 60 |
-
|
| 61 |
debug:
|
| 62 |
deepspeed:
|
| 63 |
weight_decay: 0.0
|
|
|
|
| 56 |
flash_attention: true
|
| 57 |
|
| 58 |
warmup_steps: 10
|
| 59 |
+
evals_per_epoch: 4
|
| 60 |
+
saves_per_epoch: 1
|
| 61 |
debug:
|
| 62 |
deepspeed:
|
| 63 |
weight_decay: 0.0
|
examples/code-llama/34b/lora.yml
CHANGED
|
@@ -54,8 +54,8 @@ xformers_attention:
|
|
| 54 |
flash_attention: true
|
| 55 |
|
| 56 |
warmup_steps: 10
|
| 57 |
-
|
| 58 |
-
|
| 59 |
debug:
|
| 60 |
deepspeed:
|
| 61 |
weight_decay: 0.0
|
|
|
|
| 54 |
flash_attention: true
|
| 55 |
|
| 56 |
warmup_steps: 10
|
| 57 |
+
evals_per_epoch: 4
|
| 58 |
+
saves_per_epoch: 1
|
| 59 |
debug:
|
| 60 |
deepspeed:
|
| 61 |
weight_decay: 0.0
|
examples/code-llama/34b/qlora.yml
CHANGED
|
@@ -56,8 +56,8 @@ xformers_attention:
|
|
| 56 |
flash_attention: true
|
| 57 |
|
| 58 |
warmup_steps: 10
|
| 59 |
-
|
| 60 |
-
|
| 61 |
debug:
|
| 62 |
deepspeed:
|
| 63 |
weight_decay: 0.0
|
|
|
|
| 56 |
flash_attention: true
|
| 57 |
|
| 58 |
warmup_steps: 10
|
| 59 |
+
evals_per_epoch: 4
|
| 60 |
+
saves_per_epoch: 1
|
| 61 |
debug:
|
| 62 |
deepspeed:
|
| 63 |
weight_decay: 0.0
|
examples/code-llama/7b/lora.yml
CHANGED
|
@@ -54,8 +54,8 @@ xformers_attention:
|
|
| 54 |
flash_attention: true
|
| 55 |
|
| 56 |
warmup_steps: 10
|
| 57 |
-
|
| 58 |
-
|
| 59 |
debug:
|
| 60 |
deepspeed:
|
| 61 |
weight_decay: 0.0
|
|
|
|
| 54 |
flash_attention: true
|
| 55 |
|
| 56 |
warmup_steps: 10
|
| 57 |
+
evals_per_epoch: 4
|
| 58 |
+
saves_per_epoch: 1
|
| 59 |
debug:
|
| 60 |
deepspeed:
|
| 61 |
weight_decay: 0.0
|
examples/code-llama/7b/qlora.yml
CHANGED
|
@@ -56,8 +56,8 @@ xformers_attention:
|
|
| 56 |
flash_attention: true
|
| 57 |
|
| 58 |
warmup_steps: 10
|
| 59 |
-
|
| 60 |
-
|
| 61 |
debug:
|
| 62 |
deepspeed:
|
| 63 |
weight_decay: 0.0
|
|
|
|
| 56 |
flash_attention: true
|
| 57 |
|
| 58 |
warmup_steps: 10
|
| 59 |
+
evals_per_epoch: 4
|
| 60 |
+
saves_per_epoch: 1
|
| 61 |
debug:
|
| 62 |
deepspeed:
|
| 63 |
weight_decay: 0.0
|
examples/falcon/config-7b-lora.yml
CHANGED
|
@@ -51,8 +51,8 @@ flash_attention:
|
|
| 51 |
gptq_groupsize:
|
| 52 |
gptq_model_v1:
|
| 53 |
warmup_steps: 40
|
| 54 |
-
|
| 55 |
-
|
| 56 |
debug:
|
| 57 |
deepspeed:
|
| 58 |
weight_decay: 0.0
|
|
|
|
| 51 |
gptq_groupsize:
|
| 52 |
gptq_model_v1:
|
| 53 |
warmup_steps: 40
|
| 54 |
+
evals_per_epoch: 4
|
| 55 |
+
saves_per_epoch: 1
|
| 56 |
debug:
|
| 57 |
deepspeed:
|
| 58 |
weight_decay: 0.0
|
examples/falcon/config-7b-qlora.yml
CHANGED
|
@@ -80,8 +80,8 @@ flash_attention:
|
|
| 80 |
gptq_groupsize:
|
| 81 |
gptq_model_v1:
|
| 82 |
warmup_steps: 10
|
| 83 |
-
|
| 84 |
-
|
| 85 |
debug:
|
| 86 |
deepspeed:
|
| 87 |
weight_decay: 0.000001
|
|
|
|
| 80 |
gptq_groupsize:
|
| 81 |
gptq_model_v1:
|
| 82 |
warmup_steps: 10
|
| 83 |
+
evals_per_epoch: 4
|
| 84 |
+
saves_per_epoch: 1
|
| 85 |
debug:
|
| 86 |
deepspeed:
|
| 87 |
weight_decay: 0.000001
|
examples/falcon/config-7b.yml
CHANGED
|
@@ -51,8 +51,8 @@ flash_attention:
|
|
| 51 |
gptq_groupsize:
|
| 52 |
gptq_model_v1:
|
| 53 |
warmup_steps: 40
|
| 54 |
-
|
| 55 |
-
|
| 56 |
debug:
|
| 57 |
deepspeed:
|
| 58 |
weight_decay: 0.0
|
|
|
|
| 51 |
gptq_groupsize:
|
| 52 |
gptq_model_v1:
|
| 53 |
warmup_steps: 40
|
| 54 |
+
evals_per_epoch: 4
|
| 55 |
+
saves_per_epoch: 1
|
| 56 |
debug:
|
| 57 |
deepspeed:
|
| 58 |
weight_decay: 0.0
|
examples/gptj/qlora.yml
CHANGED
|
@@ -46,8 +46,8 @@ flash_attention:
|
|
| 46 |
gptq_groupsize:
|
| 47 |
gptq_model_v1:
|
| 48 |
warmup_steps: 10
|
| 49 |
-
|
| 50 |
-
|
| 51 |
debug:
|
| 52 |
deepspeed:
|
| 53 |
weight_decay: 0.1
|
|
|
|
| 46 |
gptq_groupsize:
|
| 47 |
gptq_model_v1:
|
| 48 |
warmup_steps: 10
|
| 49 |
+
evals_per_epoch: 4
|
| 50 |
+
saves_per_epoch: 1
|
| 51 |
debug:
|
| 52 |
deepspeed:
|
| 53 |
weight_decay: 0.1
|
examples/jeopardy-bot/config.yml
CHANGED
|
@@ -42,8 +42,8 @@ flash_attention:
|
|
| 42 |
gptq_groupsize:
|
| 43 |
gptq_model_v1:
|
| 44 |
warmup_steps: 20
|
| 45 |
-
|
| 46 |
-
|
| 47 |
debug:
|
| 48 |
deepspeed:
|
| 49 |
weight_decay: 0.1
|
|
|
|
| 42 |
gptq_groupsize:
|
| 43 |
gptq_model_v1:
|
| 44 |
warmup_steps: 20
|
| 45 |
+
evals_per_epoch: 4
|
| 46 |
+
saves_per_epoch: 1
|
| 47 |
debug:
|
| 48 |
deepspeed:
|
| 49 |
weight_decay: 0.1
|
examples/llama-2/fft_optimized.yml
CHANGED
|
@@ -58,9 +58,9 @@ flash_attn_fuse_qkv: false
|
|
| 58 |
flash_attn_fuse_mlp: true
|
| 59 |
|
| 60 |
warmup_steps: 100
|
| 61 |
-
|
| 62 |
eval_table_size:
|
| 63 |
-
|
| 64 |
debug:
|
| 65 |
deepspeed: #deepspeed/zero2.json # multi-gpu only
|
| 66 |
weight_decay: 0.1
|
|
|
|
| 58 |
flash_attn_fuse_mlp: true
|
| 59 |
|
| 60 |
warmup_steps: 100
|
| 61 |
+
evals_per_epoch: 4
|
| 62 |
eval_table_size:
|
| 63 |
+
saves_per_epoch: 1
|
| 64 |
debug:
|
| 65 |
deepspeed: #deepspeed/zero2.json # multi-gpu only
|
| 66 |
weight_decay: 0.1
|
examples/llama-2/gptq-lora.yml
CHANGED
|
@@ -62,8 +62,8 @@ flash_attention:
|
|
| 62 |
sdp_attention:
|
| 63 |
flash_optimum:
|
| 64 |
warmup_steps: 100
|
| 65 |
-
|
| 66 |
-
|
| 67 |
debug:
|
| 68 |
deepspeed:
|
| 69 |
weight_decay: 0.1
|
|
|
|
| 62 |
sdp_attention:
|
| 63 |
flash_optimum:
|
| 64 |
warmup_steps: 100
|
| 65 |
+
evals_per_epoch: 4
|
| 66 |
+
saves_per_epoch: 1
|
| 67 |
debug:
|
| 68 |
deepspeed:
|
| 69 |
weight_decay: 0.1
|
examples/llama-2/lora.yml
CHANGED
|
@@ -54,10 +54,10 @@ xformers_attention:
|
|
| 54 |
flash_attention: true
|
| 55 |
|
| 56 |
warmup_steps: 10
|
| 57 |
-
|
| 58 |
eval_table_size:
|
| 59 |
eval_table_max_new_tokens: 128
|
| 60 |
-
|
| 61 |
debug:
|
| 62 |
deepspeed:
|
| 63 |
weight_decay: 0.0
|
|
|
|
| 54 |
flash_attention: true
|
| 55 |
|
| 56 |
warmup_steps: 10
|
| 57 |
+
evals_per_epoch: 4
|
| 58 |
eval_table_size:
|
| 59 |
eval_table_max_new_tokens: 128
|
| 60 |
+
saves_per_epoch: 1
|
| 61 |
debug:
|
| 62 |
deepspeed:
|
| 63 |
weight_decay: 0.0
|
examples/llama-2/qlora.yml
CHANGED
|
@@ -56,9 +56,9 @@ xformers_attention:
|
|
| 56 |
flash_attention: true
|
| 57 |
|
| 58 |
warmup_steps: 10
|
| 59 |
-
|
| 60 |
eval_table_size:
|
| 61 |
-
|
| 62 |
debug:
|
| 63 |
deepspeed:
|
| 64 |
weight_decay: 0.0
|
|
|
|
| 56 |
flash_attention: true
|
| 57 |
|
| 58 |
warmup_steps: 10
|
| 59 |
+
evals_per_epoch: 4
|
| 60 |
eval_table_size:
|
| 61 |
+
saves_per_epoch: 1
|
| 62 |
debug:
|
| 63 |
deepspeed:
|
| 64 |
weight_decay: 0.0
|
examples/llama-2/relora.yml
CHANGED
|
@@ -60,8 +60,8 @@ xformers_attention:
|
|
| 60 |
flash_attention: true
|
| 61 |
|
| 62 |
warmup_steps: 10
|
| 63 |
-
|
| 64 |
-
|
| 65 |
debug:
|
| 66 |
deepspeed:
|
| 67 |
weight_decay: 0.0
|
|
|
|
| 60 |
flash_attention: true
|
| 61 |
|
| 62 |
warmup_steps: 10
|
| 63 |
+
evals_per_epoch: 4
|
| 64 |
+
saves_per_epoch: 1
|
| 65 |
debug:
|
| 66 |
deepspeed:
|
| 67 |
weight_decay: 0.0
|
examples/llama-2/tiny-llama.yml
CHANGED
|
@@ -54,9 +54,9 @@ xformers_attention:
|
|
| 54 |
flash_attention: true
|
| 55 |
|
| 56 |
warmup_steps: 10
|
| 57 |
-
|
| 58 |
eval_table_size:
|
| 59 |
-
|
| 60 |
debug:
|
| 61 |
deepspeed:
|
| 62 |
weight_decay: 0.0
|
|
|
|
| 54 |
flash_attention: true
|
| 55 |
|
| 56 |
warmup_steps: 10
|
| 57 |
+
evals_per_epoch: 4
|
| 58 |
eval_table_size:
|
| 59 |
+
saves_per_epoch: 1
|
| 60 |
debug:
|
| 61 |
deepspeed:
|
| 62 |
weight_decay: 0.0
|
examples/mamba/config.yml
CHANGED
|
@@ -47,10 +47,10 @@ xformers_attention:
|
|
| 47 |
flash_attention:
|
| 48 |
|
| 49 |
warmup_steps: 10
|
| 50 |
-
|
| 51 |
eval_table_size:
|
| 52 |
eval_table_max_new_tokens: 128
|
| 53 |
-
|
| 54 |
debug:
|
| 55 |
deepspeed:
|
| 56 |
weight_decay: 0.0
|
|
|
|
| 47 |
flash_attention:
|
| 48 |
|
| 49 |
warmup_steps: 10
|
| 50 |
+
evals_per_epoch: 4
|
| 51 |
eval_table_size:
|
| 52 |
eval_table_max_new_tokens: 128
|
| 53 |
+
saves_per_epoch: 1
|
| 54 |
debug:
|
| 55 |
deepspeed:
|
| 56 |
weight_decay: 0.0
|
examples/mistral/config.yml
CHANGED
|
@@ -46,10 +46,10 @@ xformers_attention:
|
|
| 46 |
flash_attention: true
|
| 47 |
|
| 48 |
warmup_steps: 10
|
| 49 |
-
|
| 50 |
eval_table_size:
|
| 51 |
eval_table_max_new_tokens: 128
|
| 52 |
-
|
| 53 |
debug:
|
| 54 |
deepspeed:
|
| 55 |
weight_decay: 0.0
|
|
|
|
| 46 |
flash_attention: true
|
| 47 |
|
| 48 |
warmup_steps: 10
|
| 49 |
+
evals_per_epoch: 4
|
| 50 |
eval_table_size:
|
| 51 |
eval_table_max_new_tokens: 128
|
| 52 |
+
saves_per_epoch: 1
|
| 53 |
debug:
|
| 54 |
deepspeed:
|
| 55 |
weight_decay: 0.0
|
examples/mistral/mixtral.yml
CHANGED
|
@@ -67,10 +67,10 @@ loss_watchdog_threshold: 5.0
|
|
| 67 |
loss_watchdog_patience: 3
|
| 68 |
|
| 69 |
warmup_steps: 10
|
| 70 |
-
|
| 71 |
eval_table_size:
|
| 72 |
eval_table_max_new_tokens: 128
|
| 73 |
-
|
| 74 |
debug:
|
| 75 |
deepspeed: deepspeed/zero2.json
|
| 76 |
weight_decay: 0.0
|
|
|
|
| 67 |
loss_watchdog_patience: 3
|
| 68 |
|
| 69 |
warmup_steps: 10
|
| 70 |
+
evals_per_epoch: 4
|
| 71 |
eval_table_size:
|
| 72 |
eval_table_max_new_tokens: 128
|
| 73 |
+
saves_per_epoch: 1
|
| 74 |
debug:
|
| 75 |
deepspeed: deepspeed/zero2.json
|
| 76 |
weight_decay: 0.0
|
examples/mistral/qlora.yml
CHANGED
|
@@ -66,10 +66,10 @@ loss_watchdog_threshold: 5.0
|
|
| 66 |
loss_watchdog_patience: 3
|
| 67 |
|
| 68 |
warmup_steps: 10
|
| 69 |
-
|
| 70 |
eval_table_size:
|
| 71 |
eval_table_max_new_tokens: 128
|
| 72 |
-
|
| 73 |
debug:
|
| 74 |
deepspeed:
|
| 75 |
weight_decay: 0.0
|
|
|
|
| 66 |
loss_watchdog_patience: 3
|
| 67 |
|
| 68 |
warmup_steps: 10
|
| 69 |
+
evals_per_epoch: 4
|
| 70 |
eval_table_size:
|
| 71 |
eval_table_max_new_tokens: 128
|
| 72 |
+
saves_per_epoch: 1
|
| 73 |
debug:
|
| 74 |
deepspeed:
|
| 75 |
weight_decay: 0.0
|
examples/mpt-7b/config.yml
CHANGED
|
@@ -44,8 +44,8 @@ flash_attention:
|
|
| 44 |
gptq_groupsize:
|
| 45 |
gptq_model_v1:
|
| 46 |
warmup_steps: 20
|
| 47 |
-
|
| 48 |
-
|
| 49 |
debug:
|
| 50 |
deepspeed:
|
| 51 |
weight_decay: 0.0001
|
|
|
|
| 44 |
gptq_groupsize:
|
| 45 |
gptq_model_v1:
|
| 46 |
warmup_steps: 20
|
| 47 |
+
evals_per_epoch: 4
|
| 48 |
+
saves_per_epoch: 1
|
| 49 |
debug:
|
| 50 |
deepspeed:
|
| 51 |
weight_decay: 0.0001
|
examples/openllama-3b/config.yml
CHANGED
|
@@ -49,8 +49,8 @@ flash_attention: true
|
|
| 49 |
gptq_groupsize:
|
| 50 |
gptq_model_v1:
|
| 51 |
warmup_steps: 20
|
| 52 |
-
|
| 53 |
-
|
| 54 |
debug:
|
| 55 |
deepspeed:
|
| 56 |
weight_decay: 0.1
|
|
|
|
| 49 |
gptq_groupsize:
|
| 50 |
gptq_model_v1:
|
| 51 |
warmup_steps: 20
|
| 52 |
+
evals_per_epoch: 4
|
| 53 |
+
saves_per_epoch: 1
|
| 54 |
debug:
|
| 55 |
deepspeed:
|
| 56 |
weight_decay: 0.1
|
examples/openllama-3b/lora.yml
CHANGED
|
@@ -54,8 +54,8 @@ flash_attention: true
|
|
| 54 |
gptq_groupsize:
|
| 55 |
gptq_model_v1:
|
| 56 |
warmup_steps: 20
|
| 57 |
-
|
| 58 |
-
|
| 59 |
debug:
|
| 60 |
deepspeed:
|
| 61 |
weight_decay: 0.1
|
|
|
|
| 54 |
gptq_groupsize:
|
| 55 |
gptq_model_v1:
|
| 56 |
warmup_steps: 20
|
| 57 |
+
evals_per_epoch: 4
|
| 58 |
+
saves_per_epoch: 1
|
| 59 |
debug:
|
| 60 |
deepspeed:
|
| 61 |
weight_decay: 0.1
|
examples/openllama-3b/qlora.yml
CHANGED
|
@@ -48,8 +48,8 @@ flash_attention: true
|
|
| 48 |
gptq_groupsize:
|
| 49 |
gptq_model_v1:
|
| 50 |
warmup_steps: 20
|
| 51 |
-
|
| 52 |
-
|
| 53 |
debug:
|
| 54 |
deepspeed:
|
| 55 |
weight_decay: 0.1
|
|
|
|
| 48 |
gptq_groupsize:
|
| 49 |
gptq_model_v1:
|
| 50 |
warmup_steps: 20
|
| 51 |
+
evals_per_epoch: 4
|
| 52 |
+
saves_per_epoch: 1
|
| 53 |
debug:
|
| 54 |
deepspeed:
|
| 55 |
weight_decay: 0.1
|
examples/phi/phi-ft.yml
CHANGED
|
@@ -59,8 +59,8 @@ xformers_attention:
|
|
| 59 |
flash_attention:
|
| 60 |
|
| 61 |
warmup_steps: 100
|
| 62 |
-
|
| 63 |
-
|
| 64 |
debug:
|
| 65 |
deepspeed:
|
| 66 |
weight_decay: 0.1
|
|
|
|
| 59 |
flash_attention:
|
| 60 |
|
| 61 |
warmup_steps: 100
|
| 62 |
+
evals_per_epoch: 4
|
| 63 |
+
saves_per_epoch: 1
|
| 64 |
debug:
|
| 65 |
deepspeed:
|
| 66 |
weight_decay: 0.1
|
examples/phi/phi-qlora.yml
CHANGED
|
@@ -59,8 +59,8 @@ xformers_attention:
|
|
| 59 |
flash_attention:
|
| 60 |
|
| 61 |
warmup_steps: 100
|
| 62 |
-
|
| 63 |
-
|
| 64 |
debug:
|
| 65 |
deepspeed:
|
| 66 |
weight_decay: 0.1
|
|
|
|
| 59 |
flash_attention:
|
| 60 |
|
| 61 |
warmup_steps: 100
|
| 62 |
+
evals_per_epoch: 4
|
| 63 |
+
saves_per_epoch: 1
|
| 64 |
debug:
|
| 65 |
deepspeed:
|
| 66 |
weight_decay: 0.1
|
examples/pythia/lora.yml
CHANGED
|
@@ -33,5 +33,5 @@ early_stopping_patience:
|
|
| 33 |
resume_from_checkpoint:
|
| 34 |
local_rank:
|
| 35 |
weight_decay: 0.1
|
| 36 |
-
|
| 37 |
logging_steps: 1
|
|
|
|
| 33 |
resume_from_checkpoint:
|
| 34 |
local_rank:
|
| 35 |
weight_decay: 0.1
|
| 36 |
+
evals_per_epoch: 4
|
| 37 |
logging_steps: 1
|
examples/qwen/lora.yml
CHANGED
|
@@ -56,10 +56,10 @@ xformers_attention:
|
|
| 56 |
flash_attention:
|
| 57 |
|
| 58 |
warmup_steps: 10
|
| 59 |
-
|
| 60 |
eval_table_size:
|
| 61 |
eval_table_max_new_tokens: 128
|
| 62 |
-
|
| 63 |
debug:
|
| 64 |
deepspeed:
|
| 65 |
weight_decay: 0.0
|
|
|
|
| 56 |
flash_attention:
|
| 57 |
|
| 58 |
warmup_steps: 10
|
| 59 |
+
evals_per_epoch: 4
|
| 60 |
eval_table_size:
|
| 61 |
eval_table_max_new_tokens: 128
|
| 62 |
+
saves_per_epoch: 1
|
| 63 |
debug:
|
| 64 |
deepspeed:
|
| 65 |
weight_decay: 0.0
|
examples/qwen/qlora.yml
CHANGED
|
@@ -56,10 +56,10 @@ xformers_attention:
|
|
| 56 |
flash_attention:
|
| 57 |
|
| 58 |
warmup_steps: 10
|
| 59 |
-
|
| 60 |
eval_table_size:
|
| 61 |
eval_table_max_new_tokens: 128
|
| 62 |
-
|
| 63 |
debug:
|
| 64 |
deepspeed:
|
| 65 |
weight_decay: 0.0
|
|
|
|
| 56 |
flash_attention:
|
| 57 |
|
| 58 |
warmup_steps: 10
|
| 59 |
+
evals_per_epoch: 4
|
| 60 |
eval_table_size:
|
| 61 |
eval_table_max_new_tokens: 128
|
| 62 |
+
saves_per_epoch: 1
|
| 63 |
debug:
|
| 64 |
deepspeed:
|
| 65 |
weight_decay: 0.0
|
examples/redpajama/config-3b.yml
CHANGED
|
@@ -45,8 +45,8 @@ flash_attention:
|
|
| 45 |
gptq_groupsize:
|
| 46 |
gptq_model_v1:
|
| 47 |
warmup_steps: 20
|
| 48 |
-
|
| 49 |
-
|
| 50 |
debug:
|
| 51 |
deepspeed:
|
| 52 |
weight_decay: 0.0001
|
|
|
|
| 45 |
gptq_groupsize:
|
| 46 |
gptq_model_v1:
|
| 47 |
warmup_steps: 20
|
| 48 |
+
evals_per_epoch: 4
|
| 49 |
+
saves_per_epoch: 1
|
| 50 |
debug:
|
| 51 |
deepspeed:
|
| 52 |
weight_decay: 0.0001
|
examples/replit-3b/config-lora.yml
CHANGED
|
@@ -45,8 +45,8 @@ flash_attention:
|
|
| 45 |
gptq_groupsize:
|
| 46 |
gptq_model_v1:
|
| 47 |
warmup_steps: 20
|
| 48 |
-
|
| 49 |
-
|
| 50 |
debug:
|
| 51 |
deepspeed:
|
| 52 |
weight_decay: 0
|
|
|
|
| 45 |
gptq_groupsize:
|
| 46 |
gptq_model_v1:
|
| 47 |
warmup_steps: 20
|
| 48 |
+
evals_per_epoch: 4
|
| 49 |
+
saves_per_epoch: 1
|
| 50 |
debug:
|
| 51 |
deepspeed:
|
| 52 |
weight_decay: 0
|
examples/xgen-7b/xgen-7b-8k-qlora.yml
CHANGED
|
@@ -78,8 +78,8 @@ flash_attention:
|
|
| 78 |
gptq_groupsize:
|
| 79 |
gptq_model_v1:
|
| 80 |
warmup_steps: 10
|
| 81 |
-
|
| 82 |
-
|
| 83 |
debug:
|
| 84 |
deepspeed:
|
| 85 |
weight_decay: 0.0
|
|
|
|
| 78 |
gptq_groupsize:
|
| 79 |
gptq_model_v1:
|
| 80 |
warmup_steps: 10
|
| 81 |
+
evals_per_epoch: 4
|
| 82 |
+
saves_per_epoch: 1
|
| 83 |
debug:
|
| 84 |
deepspeed:
|
| 85 |
weight_decay: 0.0
|
src/axolotl/utils/config.py
CHANGED
|
@@ -77,6 +77,15 @@ def normalize_config(cfg):
|
|
| 77 |
else:
|
| 78 |
cfg.torch_dtype = torch.float32
|
| 79 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
cfg.dataset_processes = cfg.dataset_processes or os.cpu_count()
|
| 81 |
|
| 82 |
if not cfg.base_model_config:
|
|
@@ -352,6 +361,27 @@ def validate_config(cfg):
|
|
| 352 |
cfg.datasets[idx].type = cfg.datasets[idx].type.replace(
|
| 353 |
"sharegpt_simple", "sharegpt"
|
| 354 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 355 |
if cfg.save_strategy and cfg.save_steps and cfg.save_strategy != "steps":
|
| 356 |
raise ValueError(
|
| 357 |
"save_strategy and save_steps mismatch. Please set save_strategy to 'steps' or remove save_steps."
|
|
|
|
| 77 |
else:
|
| 78 |
cfg.torch_dtype = torch.float32
|
| 79 |
|
| 80 |
+
if cfg.saves_per_epoch:
|
| 81 |
+
save_steps = 1.0 / (cfg.saves_per_epoch * cfg.num_epochs)
|
| 82 |
+
if save_steps < 1.0: # prevent saves on every step
|
| 83 |
+
cfg.save_steps = save_steps
|
| 84 |
+
if cfg.evals_per_epoch:
|
| 85 |
+
eval_steps = 1.0 / (cfg.evals_per_epoch * cfg.num_epochs)
|
| 86 |
+
if eval_steps < 1.0: # prevent evals on every step
|
| 87 |
+
cfg.eval_steps = eval_steps
|
| 88 |
+
|
| 89 |
cfg.dataset_processes = cfg.dataset_processes or os.cpu_count()
|
| 90 |
|
| 91 |
if not cfg.base_model_config:
|
|
|
|
| 361 |
cfg.datasets[idx].type = cfg.datasets[idx].type.replace(
|
| 362 |
"sharegpt_simple", "sharegpt"
|
| 363 |
)
|
| 364 |
+
|
| 365 |
+
if cfg.saves_per_epoch and cfg.save_steps:
|
| 366 |
+
raise ValueError(
|
| 367 |
+
"save_steps and saves_per_epoch are mutually exclusive and cannot be used together."
|
| 368 |
+
)
|
| 369 |
+
if cfg.saves_per_epoch and cfg.save_strategy and cfg.save_strategy != "steps":
|
| 370 |
+
raise ValueError(
|
| 371 |
+
"save_strategy must be empty or set to `steps` when used with saves_per_epoch."
|
| 372 |
+
)
|
| 373 |
+
if cfg.evals_per_epoch and cfg.eval_steps:
|
| 374 |
+
raise ValueError(
|
| 375 |
+
"eval_steps and evals_per_epoch are mutually exclusive and cannot be used together."
|
| 376 |
+
)
|
| 377 |
+
if (
|
| 378 |
+
cfg.evals_per_epoch
|
| 379 |
+
and cfg.evaluation_strategy
|
| 380 |
+
and cfg.evaluation_strategy != "steps"
|
| 381 |
+
):
|
| 382 |
+
raise ValueError(
|
| 383 |
+
"evaluation_strategy must be empty or set to `steps` when used with evals_per_epoch."
|
| 384 |
+
)
|
| 385 |
if cfg.save_strategy and cfg.save_steps and cfg.save_strategy != "steps":
|
| 386 |
raise ValueError(
|
| 387 |
"save_strategy and save_steps mismatch. Please set save_strategy to 'steps' or remove save_steps."
|