fix eval_steps to be a sane default (#797)

* fix eval_steps to be a sane default
* update docs for fractional eval_steps
- README.md +2 -2
- examples/cerebras/qlora.yml +1 -1
- examples/code-llama/13b/lora.yml +2 -2
- examples/code-llama/13b/qlora.yml +2 -2
- examples/code-llama/34b/lora.yml +2 -2
- examples/code-llama/34b/qlora.yml +2 -2
- examples/code-llama/7b/lora.yml +2 -2
- examples/code-llama/7b/qlora.yml +2 -2
- examples/falcon/config-7b-qlora.yml +1 -1
- examples/gptj/qlora.yml +1 -1
- examples/jeopardy-bot/config.yml +1 -1
- examples/llama-2/gptq-lora.yml +1 -1
- examples/llama-2/lora.yml +2 -2
- examples/llama-2/qlora.yml +2 -2
- examples/llama-2/relora.yml +2 -2
- examples/llama-2/tiny-llama.yml +2 -2
- examples/mistral/config.yml +2 -2
- examples/mistral/qlora.yml +1 -1
- examples/mpt-7b/config.yml +1 -1
- examples/pythia/lora.yml +2 -2
- examples/redpajama/config-3b.yml +1 -1
- examples/replit-3b/config-lora.yml +1 -1
- examples/xgen-7b/xgen-7b-8k-qlora.yml +1 -1
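
Aside from the README doc tweak, the changes above are the same small edit applied across the example configs; a minimal before/after sketch (values taken from the diffs below; some files touch only one of the two keys):

    # before: the keys were left unset
    num_epochs:
    eval_steps:

    # after: train a fixed number of epochs and evaluate every 5% of total training steps
    num_epochs: 4
    eval_steps: 0.05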
    	
README.md CHANGED

@@ -618,14 +618,14 @@ gradient_accumulation_steps: 1
 # The number of samples to include in each batch. This is the number of samples sent to each GPU.
 micro_batch_size: 2
 eval_batch_size:
-num_epochs:
+num_epochs: 4
 warmup_steps: 100
 learning_rate: 0.00003
 lr_quadratic_warmup:
 logging_steps:
 save_strategy: # Set to `no` to skip checkpoint saves
 save_steps: # Leave empty to save at each epoch
-eval_steps: # Leave empty to eval at each epoch
+eval_steps: # Leave empty to eval at each epoch, integers for every N steps. decimal for fraction of total steps
 save_total_limit: # Checkpoints saved at a time
 # Maximum number of iterations to train for. It precedes num_epochs which means that
 # if both are set, num_epochs will not be guaranteed.
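
The updated comment describes three ways to set `eval_steps`; a short sketch of the options (0.05 is the value used by the example configs in this PR, the integer 50 is purely illustrative):

    # eval_steps:         # empty: evaluate once per epoch
    # eval_steps: 50      # integer: evaluate every 50 training steps
    eval_steps: 0.05      # decimal: evaluate every 5% of the total training steps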
    	
examples/cerebras/qlora.yml CHANGED

@@ -49,7 +49,7 @@ flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 10
-eval_steps:
+eval_steps: 0.05
 save_steps:
 debug:
 deepspeed:
    	
examples/code-llama/13b/lora.yml CHANGED

@@ -34,7 +34,7 @@ wandb_log_model:
 
 gradient_accumulation_steps: 4
 micro_batch_size: 2
-num_epochs:
+num_epochs: 4
 optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002
@@ -54,7 +54,7 @@ xformers_attention:
 flash_attention: true
 
 warmup_steps: 10
-eval_steps:
+eval_steps: 0.05
 save_steps:
 debug:
 deepspeed:
    	
examples/code-llama/13b/qlora.yml CHANGED

@@ -36,7 +36,7 @@ wandb_log_model:
 
 gradient_accumulation_steps: 4
 micro_batch_size: 2
-num_epochs:
+num_epochs: 4
 optimizer: paged_adamw_32bit
 lr_scheduler: cosine
 learning_rate: 0.0002
@@ -56,7 +56,7 @@ xformers_attention:
 flash_attention: true
 
 warmup_steps: 10
-eval_steps:
+eval_steps: 0.05
 save_steps:
 debug:
 deepspeed:
    	
examples/code-llama/34b/lora.yml CHANGED

@@ -34,7 +34,7 @@ wandb_log_model:
 
 gradient_accumulation_steps: 4
 micro_batch_size: 2
-num_epochs:
+num_epochs: 4
 optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002
@@ -54,7 +54,7 @@ xformers_attention:
 flash_attention: true
 
 warmup_steps: 10
-eval_steps:
+eval_steps: 0.05
 save_steps:
 debug:
 deepspeed:
    	
examples/code-llama/34b/qlora.yml CHANGED

@@ -36,7 +36,7 @@ wandb_log_model:
 
 gradient_accumulation_steps: 4
 micro_batch_size: 2
-num_epochs:
+num_epochs: 4
 optimizer: paged_adamw_32bit
 lr_scheduler: cosine
 learning_rate: 0.0002
@@ -56,7 +56,7 @@ xformers_attention:
 flash_attention: true
 
 warmup_steps: 10
-eval_steps:
+eval_steps: 0.05
 save_steps:
 debug:
 deepspeed:
    	
examples/code-llama/7b/lora.yml CHANGED

@@ -34,7 +34,7 @@ wandb_log_model:
 
 gradient_accumulation_steps: 4
 micro_batch_size: 2
-num_epochs:
+num_epochs: 4
 optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002
@@ -54,7 +54,7 @@ xformers_attention:
 flash_attention: true
 
 warmup_steps: 10
-eval_steps:
+eval_steps: 0.05
 save_steps:
 debug:
 deepspeed:
    	
examples/code-llama/7b/qlora.yml CHANGED

@@ -36,7 +36,7 @@ wandb_log_model:
 
 gradient_accumulation_steps: 4
 micro_batch_size: 2
-num_epochs:
+num_epochs: 4
 optimizer: paged_adamw_32bit
 lr_scheduler: cosine
 learning_rate: 0.0002
@@ -56,7 +56,7 @@ xformers_attention:
 flash_attention: true
 
 warmup_steps: 10
-eval_steps:
+eval_steps: 0.05
 save_steps:
 debug:
 deepspeed:
    	
examples/falcon/config-7b-qlora.yml CHANGED

@@ -53,7 +53,7 @@ output_dir: ./qlora-out
 # decrease if OOM, increase for max VRAM utilization
 micro_batch_size: 1
 gradient_accumulation_steps: 2
-num_epochs:
+num_epochs: 4
 # Optimizer for QLoRA
 optimizer: paged_adamw_32bit
 torchdistx_path:
    	
examples/gptj/qlora.yml CHANGED

@@ -46,7 +46,7 @@ flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 10
-eval_steps:
+eval_steps: 0.05
 save_steps:
 debug:
 deepspeed:
    	
examples/jeopardy-bot/config.yml CHANGED

@@ -24,7 +24,7 @@ wandb_log_model:
 output_dir: ./jeopardy-bot-7b
 gradient_accumulation_steps: 1
 micro_batch_size: 1
-num_epochs:
+num_epochs: 4
 optimizer: adamw_bnb_8bit
 torchdistx_path:
 lr_scheduler: cosine
    	
examples/llama-2/gptq-lora.yml CHANGED

@@ -37,7 +37,7 @@ wandb_log_model:
 output_dir: ./model-out
 gradient_accumulation_steps: 1
 micro_batch_size: 1
-num_epochs:
+num_epochs: 4
 optimizer: adamw_torch
 adam_beta2: 0.95
 adam_eps: 0.00001
    	
examples/llama-2/lora.yml CHANGED

@@ -34,7 +34,7 @@ wandb_log_model:
 
 gradient_accumulation_steps: 4
 micro_batch_size: 2
-num_epochs:
+num_epochs: 4
 optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002
@@ -54,7 +54,7 @@ xformers_attention:
 flash_attention: true
 
 warmup_steps: 10
-eval_steps:
+eval_steps: 0.05
 eval_table_size:
 eval_table_max_new_tokens: 128
 save_steps:
    	
examples/llama-2/qlora.yml CHANGED

@@ -36,7 +36,7 @@ wandb_log_model:
 
 gradient_accumulation_steps: 4
 micro_batch_size: 2
-num_epochs:
+num_epochs: 4
 optimizer: paged_adamw_32bit
 lr_scheduler: cosine
 learning_rate: 0.0002
@@ -56,7 +56,7 @@ xformers_attention:
 flash_attention: true
 
 warmup_steps: 10
-eval_steps:
+eval_steps: 0.05
 eval_table_size:
 save_steps:
 debug:
    	
examples/llama-2/relora.yml CHANGED

@@ -40,7 +40,7 @@ wandb_log_model:
 
 gradient_accumulation_steps: 4
 micro_batch_size: 4
-num_epochs:
+num_epochs: 4
 optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002
@@ -60,7 +60,7 @@ xformers_attention:
 flash_attention: true
 
 warmup_steps: 10
-eval_steps:
+eval_steps: 0.05
 save_steps: 50
 debug:
 deepspeed:
    	
examples/llama-2/tiny-llama.yml CHANGED

@@ -34,7 +34,7 @@ wandb_log_model:
 
 gradient_accumulation_steps: 4
 micro_batch_size: 2
-num_epochs:
+num_epochs: 4
 optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002
@@ -54,7 +54,7 @@ xformers_attention:
 flash_attention: true
 
 warmup_steps: 10
-eval_steps:
+eval_steps: 0.05
 eval_table_size:
 save_steps:
 debug:
    	
examples/mistral/config.yml CHANGED

@@ -26,7 +26,7 @@ wandb_log_model:
 
 gradient_accumulation_steps: 4
 micro_batch_size: 2
-num_epochs:
+num_epochs: 4
 optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.000005
@@ -46,7 +46,7 @@ xformers_attention:
 flash_attention: true
 
 warmup_steps: 10
-eval_steps:
+eval_steps: 0.05
 eval_table_size:
 eval_table_max_new_tokens: 128
 save_steps:
    	
examples/mistral/qlora.yml CHANGED

@@ -63,7 +63,7 @@ xformers_attention:
 flash_attention: true
 
 warmup_steps: 10
-eval_steps:
+eval_steps: 0.05
 eval_table_size:
 eval_table_max_new_tokens: 128
 save_steps:
    	
examples/mpt-7b/config.yml CHANGED

@@ -26,7 +26,7 @@ wandb_log_model:
 output_dir: ./mpt-alpaca-7b
 gradient_accumulation_steps: 1
 micro_batch_size: 1
-num_epochs:
+num_epochs: 4
 optimizer: adamw_bnb_8bit
 torchdistx_path:
 lr_scheduler: cosine
    	
examples/pythia/lora.yml CHANGED

@@ -23,7 +23,7 @@ wandb_log_model:
 output_dir: ./lora-alpaca-pythia
 gradient_accumulation_steps: 1
 micro_batch_size: 4
-num_epochs:
+num_epochs: 4
 learning_rate: 0.00001
 train_on_inputs: false
 group_by_length: false
@@ -33,5 +33,5 @@ early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
 weight_decay: 0.1
-eval_steps:
+eval_steps: 0.05
 logging_steps: 1
    	
examples/redpajama/config-3b.yml CHANGED

@@ -27,7 +27,7 @@ wandb_log_model:
 output_dir: ./redpajama-alpaca-3b
 batch_size: 4
 micro_batch_size: 1
-num_epochs:
+num_epochs: 4
 optimizer: adamw_bnb_8bit
 torchdistx_path:
 lr_scheduler: cosine
    	
examples/replit-3b/config-lora.yml CHANGED

@@ -26,7 +26,7 @@ wandb_log_model:
 output_dir: ./lora-replit
 batch_size: 8
 micro_batch_size: 1
-num_epochs:
+num_epochs: 4
 optimizer:
 torchdistx_path:
 lr_scheduler:
    	
examples/xgen-7b/xgen-7b-8k-qlora.yml CHANGED

@@ -51,7 +51,7 @@ output_dir: ./qlora-out
 # decrease if OOM, increase for max VRAM utilization
 micro_batch_size: 1
 gradient_accumulation_steps: 1
-num_epochs:
+num_epochs: 4
 # Optimizer for QLoRA
 optimizer: paged_adamw_32bit
 torchdistx_path:
