more fixes 20240228 (#1342) [skip ci]

* add missing evals_per_epoch setting
* more pydantic fixes
* more fixes
* move test from normalization to validation
* increase eval size for sample packing tests
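
As a rough illustration of what the updated schema accepts, below is a minimal config fragment written as the Python dict the validator sees after the YAML is loaded. Only the field names (test_datasets, evals_per_epoch, load_best_model_at_end, and the string-form learning_rate) come from this change; the model and dataset values are hypothetical placeholders.

# Hypothetical config fragment; values are illustrative only.
cfg = {
    "base_model": "some-org/some-base-model",        # placeholder, not from this PR
    "learning_rate": "5e-5",                         # string form, coerced to float by the new validator
    "evals_per_epoch": 4,                            # newly accepted field
    "load_best_model_at_end": True,                  # newly accepted field
    "test_datasets": [                               # newly accepted, mirrors the datasets field
        {"path": "org/eval-set", "type": "alpaca"},  # hypothetical dataset entry
    ],
}

Run through validate_config (as the new test in tests/test_validation.py does for learning_rate), such a dict should come back with learning_rate as the float 0.00005.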
    	
src/axolotl/cli/__init__.py
CHANGED

@@ -13,7 +13,6 @@ from threading import Thread
 from typing import Any, Dict, List, Optional, Union
 from urllib.parse import urlparse
 
-import gradio as gr
 import requests
 import torch
 import yaml
@@ -215,6 +214,8 @@ def do_inference_gradio(
     cfg: DictDefault,
     cli_args: TrainerCliArgs,
 ):
+    import gradio as gr
+
     model, tokenizer = load_model_and_tokenizer(cfg=cfg, cli_args=cli_args)
     prompter = cli_args.prompter
     default_tokens = {"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>"}
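
The gradio import moves from module scope into do_inference_gradio, so importing the CLI module no longer requires gradio to be installed; the package is only loaded when the gradio inference path actually runs. A minimal sketch of the same deferred-import pattern, using a made-up function rather than axolotl's API:

def launch_demo(echo_fn=lambda text: text):
    # Deferred import: gradio is loaded only when this function is called,
    # so merely importing the surrounding module does not need gradio.
    import gradio as gr

    demo = gr.Interface(fn=echo_fn, inputs="text", outputs="text")
    demo.launch()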
    	
src/axolotl/utils/config/__init__.py
CHANGED

@@ -164,9 +164,6 @@ def normalize_config(cfg):
         ]
     ) or cfg.is_qwen_derived_model
 
-    if isinstance(cfg.learning_rate, str):
-        cfg.learning_rate = float(cfg.learning_rate)
-
     if isinstance(cfg.pretraining_dataset, dict):
         cfg.pretraining_dataset = [cfg.pretraining_dataset]
 
    	
src/axolotl/utils/config/models/input/v0_4_1/__init__.py
CHANGED

@@ -302,6 +302,13 @@ class HyperparametersConfig(BaseModel):
             )
         return batch_size
 
+    @field_validator("learning_rate")
+    @classmethod
+    def convert_learning_rate(cls, learning_rate):
+        if learning_rate and isinstance(learning_rate, str):
+            learning_rate = float(learning_rate)
+        return learning_rate
+
 
 class ModelOutputConfig(BaseModel):
     """model save configuration subset"""
@@ -368,6 +375,7 @@ class AxolotlInputConfig(
     rl: Optional[RLType] = None
 
     datasets: Optional[conlist(Union[SFTDataset, DPODataset], min_length=1)] = None  # type: ignore
+    test_datasets: Optional[conlist(Union[SFTDataset, DPODataset], min_length=1)] = None  # type: ignore
     dataset_prepared_path: Optional[str] = None
     dataset_shard_num: Optional[int] = None
     dataset_shard_idx: Optional[int] = None
@@ -456,6 +464,7 @@ class AxolotlInputConfig(
     warmup_steps: Optional[int] = None
     warmup_ratio: Optional[float] = None
     eval_steps: Optional[Union[int, float]] = None
+    evals_per_epoch: Optional[Union[int]] = None
    evaluation_strategy: Optional[str] = None
     save_steps: Optional[Union[int, float]] = None
     saves_per_epoch: Optional[int] = None
@@ -463,6 +472,7 @@ class AxolotlInputConfig(
     save_total_limit: Optional[int] = None
     logging_steps: Optional[int] = None
     early_stopping_patience: Optional[int] = None
+    load_best_model_at_end: Optional[bool] = False
 
     neftune_noise_alpha: Optional[float] = None
 
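
The string-to-float learning-rate coercion that used to live in normalize_config is now a pydantic field_validator on the hyperparameters model. A self-contained sketch of the same pattern outside axolotl (the class name and field annotation here are illustrative, not the project's actual model):

from typing import Optional, Union

from pydantic import BaseModel, field_validator


class HyperparametersSketch(BaseModel):
    # YAML configs may carry the learning rate as a string such as "5e-5".
    learning_rate: Optional[Union[str, float]] = None

    @field_validator("learning_rate")
    @classmethod
    def convert_learning_rate(cls, learning_rate):
        # Coerce string values to float so downstream code always sees a number.
        if learning_rate and isinstance(learning_rate, str):
            learning_rate = float(learning_rate)
        return learning_rate


assert HyperparametersSketch(learning_rate="5e-5").learning_rate == 0.00005

Doing the conversion in the schema means every entry point that builds the config through validation gets a float, instead of relying on normalize_config being called first.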
    	
src/axolotl/utils/trainer.py
CHANGED

@@ -255,7 +255,7 @@ def calculate_total_num_steps(cfg, train_dataset, update=True):
                 train_dataset.remove_columns(["length"]),
                 batch_sampler=sampler,
             )
-            data_loader_len = len(data_loader) // batch_size
+            data_loader_len = len(data_loader) // cfg.batch_size
             actual_eff = sampler.efficiency()
             LOG.debug(f"data_loader_len: {data_loader_len}", main_process_only=True)
             # FIXME: is there a bug here somewhere? the total num steps depends
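
The step-estimation fix simply takes the divisor from cfg.batch_size rather than a bare batch_size name. A tiny arithmetic illustration of the expression being computed, with made-up numbers:

# Made-up numbers: a packed data loader reporting 128 entries and a
# configured batch size of 4.
packed_loader_len = 128
cfg_batch_size = 4

# Mirrors `data_loader_len = len(data_loader) // cfg.batch_size` from the diff.
data_loader_len = packed_loader_len // cfg_batch_size
print(data_loader_len)  # 32, the value fed into the total-step estimate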
    	
tests/e2e/patched/test_lora_llama_multipack.py
CHANGED

@@ -43,7 +43,7 @@ class TestLoraLlama(unittest.TestCase):
                 "lora_alpha": 64,
                 "lora_dropout": 0.05,
                 "lora_target_linear": True,
-                "val_set_size": 0.
+                "val_set_size": 0.2,
                 "special_tokens": {
                     "unk_token": "<unk>",
                     "bos_token": "<s>",
    	
tests/test_normalize_config.py
CHANGED

@@ -25,20 +25,6 @@ class NormalizeConfigTestCase(unittest.TestCase):
             }
         )
 
-    def test_lr_as_float(self):
-        cfg = (
-            self._get_base_cfg()
-            | DictDefault(  # pylint: disable=unsupported-binary-operation
-                {
-                    "learning_rate": "5e-5",
-                }
-            )
-        )
-
-        normalize_config(cfg)
-
-        assert cfg.learning_rate == 0.00005
-
     def test_base_model_config_set_when_empty(self):
         cfg = self._get_base_cfg()
         del cfg.base_model_config
    	
tests/test_validation.py
CHANGED

@@ -176,6 +176,20 @@ class TestValidation(BaseValidation):
         with pytest.raises(ValueError, match=r".*At least two of*"):
             validate_config(cfg)
 
+    def test_lr_as_float(self, minimal_cfg):
+        cfg = (
+            DictDefault(  # pylint: disable=unsupported-binary-operation
+                {
+                    "learning_rate": "5e-5",
+                }
+            )
+            | minimal_cfg
+        )
+
+        new_cfg = validate_config(cfg)
+
+        assert new_cfg.learning_rate == 0.00005
+
     def test_qlora(self, minimal_cfg):
         base_cfg = (
             DictDefault(