Deprecate max packed sequence len (#1141)

Files changed:

- README.md +0 -4
- src/axolotl/utils/config.py +4 -11
- src/axolotl/utils/data.py +16 -121
- src/axolotl/utils/models.py +1 -5
- src/axolotl/utils/trainer.py +10 -11
- tests/test_validation.py +7 -18
README.md

@@ -642,10 +642,6 @@ sequence_len: 2048
 # Pad inputs so each step uses constant sized buffers
 # This will reduce memory fragmentation and may prevent OOMs, by re-using memory more efficiently
 pad_to_sequence_len:
-# Max sequence length to concatenate training samples together up to
-# Inspired by StackLLaMA. see https://huggingface.co/blog/stackllama#supervised-fine-tuning
-# FutureWarning: This will soon be DEPRECATED
-max_packed_sequence_len: 1024
 # Use efficient multi-packing with block diagonal attention and per sequence position_ids. Recommend set to 'true'
 sample_packing:
 # Set to 'false' if getting errors during eval with sample_packing on.

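Note on migrating configs: with the `max_packed_sequence_len` entry dropped from the README, packing is driven entirely by `sample_packing` (ideally together with `pad_to_sequence_len`). A minimal sketch of what that looks like through axolotl's config validation, assuming the `DictDefault` and `validate_config` helpers referenced in the test changes below; the config values themselves are illustrative only:

```python
# Sketch only: old-style vs. new-style packing config, with illustrative values.
from axolotl.utils.config import validate_config
from axolotl.utils.dict import DictDefault

# Old style: rejected outright after this change.
legacy_cfg = DictDefault({"sequence_len": 2048, "max_packed_sequence_len": 1024})

# New style: multipacking via sample_packing, with constant-size padded buffers.
packed_cfg = DictDefault(
    {
        "sequence_len": 2048,
        "sample_packing": True,
        "pad_to_sequence_len": True,
    }
)

try:
    validate_config(legacy_cfg)
except DeprecationWarning as err:
    print(f"legacy config rejected: {err}")

validate_config(packed_cfg)  # expected to pass, as in the unit tests below
```
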
src/axolotl/utils/config.py

@@ -157,6 +157,9 @@ def normalize_config(cfg):
     if isinstance(cfg.learning_rate, str):
         cfg.learning_rate = float(cfg.learning_rate)

+    if isinstance(cfg.pretraining_dataset, dict):
+        cfg.pretraining_dataset = [cfg.pretraining_dataset]
+
     log_gpu_memory_usage(LOG, "baseline", cfg.device)

@@ -192,18 +195,8 @@ def validate_config(cfg):
         raise ValueError(
             "bf16 requested, but AMP is not supported on this GPU. Requires Ampere series or above."
         )
-    if cfg.max_packed_sequence_len and cfg.sample_packing:
-        raise ValueError(
-            "please set only one of max_packed_sequence_len (deprecated soon) or sample_packing"
-        )
     if cfg.max_packed_sequence_len:
-        LOG.warning(
-            str(
-                PendingDeprecationWarning(
-                    "max_packed_sequence_len will be deprecated in favor of sample_packing"
-                )
-            )
-        )
+        raise DeprecationWarning("`max_packed_sequence_len` is no longer supported")

     if cfg.sample_packing and not cfg.pad_to_sequence_len:
         LOG.warning(

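The new `normalize_config` branch wraps a bare `pretraining_dataset` mapping in a single-element list, which is the shape the updated `prepare_dataset` code in data.py below indexes into. A small standalone sketch of that interaction; the dataset path and name are placeholders, not values from this PR:

```python
# Mirrors the normalization added to normalize_config() and the lookup added to
# prepare_dataset(); shown in isolation with placeholder values.
from axolotl.utils.dict import DictDefault

cfg = DictDefault(
    {"pretraining_dataset": {"path": "some/streaming-corpus", "name": "en"}}
)

# normalize_config(): a single mapping becomes a one-element list
if isinstance(cfg.pretraining_dataset, dict):
    cfg.pretraining_dataset = [cfg.pretraining_dataset]

# prepare_dataset(): downstream code can now uniformly read the first entry
if isinstance(cfg.pretraining_dataset, list) and isinstance(
    cfg.pretraining_dataset[0], dict
):
    path = cfg.pretraining_dataset[0]["path"]
    name = cfg.pretraining_dataset[0]["name"]

print(path, name)  # some/streaming-corpus en
```
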
src/axolotl/utils/data.py

@@ -19,7 +19,7 @@ from torch.utils.data import RandomSampler
 from transformers import PreTrainedTokenizerBase

 from axolotl.common.const import DEFAULT_DATASET_PREPARED_PATH
-from axolotl.datasets import ConstantLengthDataset, TokenizedPromptDataset
+from axolotl.datasets import TokenizedPromptDataset
 from axolotl.prompt_strategies import load
 from axolotl.prompt_tokenizers import (
     AlpacaMultipleChoicePromptTokenizingStrategy,

@@ -71,9 +71,11 @@ def prepare_dataset(cfg, tokenizer):
     else:
         path = cfg.pretraining_dataset
         name = None
-        if isinstance(cfg.pretraining_dataset, dict):
-            path = cfg.pretraining_dataset["path"]
-            name = cfg.pretraining_dataset["name"]
+        if isinstance(cfg.pretraining_dataset, list) and isinstance(
+            cfg.pretraining_dataset[0], dict
+        ):
+            path = cfg.pretraining_dataset[0]["path"]
+            name = cfg.pretraining_dataset[0]["name"]

         train_dataset = load_pretraining_dataset(
             path,

@@ -88,11 +90,6 @@ def prepare_dataset(cfg, tokenizer):
         eval_dataset = None
         return train_dataset, eval_dataset, cfg.max_steps, prompters

-    with zero_first(is_main_process()):
-        train_dataset, eval_dataset = process_datasets_for_packing(
-            cfg, train_dataset, eval_dataset, tokenizer
-        )
-
     if eval_dataset and cfg.sample_packing and cfg.eval_sample_packing is not False:
         total_eval_steps = calculate_total_num_steps(cfg, eval_dataset, update=False)
         if total_eval_steps == 0:

@@ -163,6 +160,10 @@ def load_tokenized_prepared_datasets(
     else:
         LOG.info(f"Unable to find prepared dataset in {prepared_ds_path}")
         LOG.info("Loading raw datasets...")
+        if not cfg.is_preprocess:
+            LOG.warning(
+                "Processing datasets during training can lead to VRAM instability. Please pre-process your dataset"
+            )

         if cfg.seed:
             seed = cfg.seed

@@ -382,6 +383,9 @@ def load_tokenized_prepared_datasets(
         if len(datasets) > 1:
             LOG.info("shuffle merged datasets")
             dataset = dataset.shuffle(seed=seed)
+
+        dataset, _ = process_datasets_for_packing(cfg, dataset, None, tokenizer)
+
         if cfg.local_rank == 0:
             LOG.info(f"Saving merged prepared dataset to disk... {prepared_ds_path}")
             dataset.save_to_disk(prepared_ds_path)

@@ -419,119 +423,9 @@ def load_prepare_datasets(
     cfg,
     default_dataset_prepared_path,
 ) -> Tuple[Dataset, Dataset, List[Prompter]]:
-    max_packed_sequence_len = (
-        cfg.max_packed_sequence_len if cfg.max_packed_sequence_len else cfg.sequence_len
+    dataset, prompters = load_tokenized_prepared_datasets(
+        tokenizer, cfg, default_dataset_prepared_path
     )
-    max_packed_sequence_len = min(
-        max_packed_sequence_len, cfg.sequence_len
-    )  # make sure we don't accidentally set it larger than sequence_len
-
-    tokenizer_name = tokenizer.__class__.__name__
-    prompters: List[Prompter] = []
-    if cfg.max_packed_sequence_len is not None:
-        # see if we can go ahead and load the stacked dataset
-        seed = f"@{str(cfg.seed)}" if cfg.seed else ""
-        ds_hash = str(
-            md5(
-                (
-                    str(cfg.sequence_len)
-                    + "@"
-                    + str(max_packed_sequence_len)
-                    + seed
-                    + "|".join(
-                        sorted([f"{d.path}:{d.type}:{d.shards}" for d in cfg.datasets])
-                    )
-                    + "|"
-                    + tokenizer_name
-                )
-            )
-        )
-        prepared_ds_path = (
-            Path(cfg.dataset_prepared_path) / ds_hash
-            if cfg.dataset_prepared_path
-            else Path(default_dataset_prepared_path) / ds_hash
-        )
-
-        dataset = None
-        use_auth_token = cfg.hf_use_auth_token
-        try:
-            if cfg.push_dataset_to_hub:
-                LOG.info(
-                    f"Checking for packed prepared dataset from hub... {cfg.push_dataset_to_hub}/{ds_hash}"
-                )
-                dataset = load_dataset(
-                    f"{cfg.push_dataset_to_hub}/{ds_hash}",
-                    token=use_auth_token,
-                )
-                dataset = dataset["train"]
-        except Exception:  # pylint: disable=broad-except # nosec
-            pass
-
-        if dataset:
-            ...
-        elif (
-            cfg.dataset_prepared_path
-            and any(prepared_ds_path.glob("*"))
-            and not cfg.is_preprocess
-        ):
-            LOG.info(
-                f"Loading prepared packed dataset from disk at {prepared_ds_path}..."
-            )
-            dataset = load_from_disk(str(prepared_ds_path))
-            LOG.info("Prepared packed dataset loaded from disk...")
-            if cfg.push_dataset_to_hub:
-                LOG.info(
-                    f"Saving packed prepared dataset with push_to_hub... {cfg.push_dataset_to_hub}/{ds_hash}"
-                )
-                dataset.push_to_hub(
-                    f"{cfg.push_dataset_to_hub}/{ds_hash}", private=True
-                )
-        else:
-            dataset, prompters = load_tokenized_prepared_datasets(
-                tokenizer, cfg, default_dataset_prepared_path
-            )
-
-            if cfg.seed:
-                dataset = dataset.shuffle(seed=cfg.seed)
-
-            constant_len_dataset = ConstantLengthDataset(
-                tokenizer,
-                [dataset],
-                seq_length=max_packed_sequence_len,
-            )
-            LOG.info(f"packing master dataset to len: {cfg.max_packed_sequence_len}")
-            dataset = Dataset.from_list(list(constant_len_dataset))
-
-            # filter out bad data
-            # TODO convert to dataset.filter(...)
-            dataset = Dataset.from_list(
-                [
-                    d
-                    for d in dataset
-                    if len(d["input_ids"]) <= cfg.sequence_len
-                    and len(d["input_ids"]) > 0
-                    and len(d["input_ids"]) == len(d["attention_mask"])
-                    and len(d["input_ids"]) == len(d["labels"])
-                ]
-            )
-
-            if cfg.local_rank == 0:
-                LOG.info(
-                    f"Saving packed prepared dataset to disk... {prepared_ds_path}"
-                )
-                dataset.save_to_disk(prepared_ds_path)
-                if cfg.push_dataset_to_hub:
-                    LOG.info(
-                        f"Saving packed prepared dataset with push_to_hub... {cfg.push_dataset_to_hub}/{ds_hash}"
-                    )
-                    dataset.push_to_hub(
-                        f"{cfg.push_dataset_to_hub}/{ds_hash}",
-                        private=True,
-                    )
-    else:
-        dataset, prompters = load_tokenized_prepared_datasets(
-            tokenizer, cfg, default_dataset_prepared_path
-        )

     if cfg.dataset_shard_num and cfg.dataset_shard_idx is not None:
         LOG.info(

@@ -877,6 +771,7 @@ def load_pretraining_dataset(path, tokenizer, cfg, name=None, max_tokens=2048, s
     dataset = dataset.map(
         encode,
         batched=True,
+        batch_size=10_000,
         input_columns="text",
         # remove all the existing columns after mapping since they end up having
         # a different length than the encoded/tokenized column

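The deleted packing path in `load_prepare_datasets` carried a `# TODO convert to dataset.filter(...)` next to the list comprehension that dropped malformed samples. For reference, a hedged sketch of that same cleanup written with `datasets.Dataset.filter`; the toy in-memory dataset below is an assumption, not data from the repo:

```python
# Sketch of the removed "filter out bad data" step, expressed with Dataset.filter
# as the old TODO suggested.
from datasets import Dataset

def is_well_formed(sample, sequence_len=2048):
    n = len(sample["input_ids"])
    return (
        0 < n <= sequence_len
        and n == len(sample["attention_mask"])
        and n == len(sample["labels"])
    )

dataset = Dataset.from_dict(
    {
        "input_ids": [[1, 2, 3], []],
        "attention_mask": [[1, 1, 1], []],
        "labels": [[1, 2, 3], []],
    }
)

dataset = dataset.filter(is_well_formed, fn_kwargs={"sequence_len": 2048})
print(len(dataset))  # 1 -- the empty sample is dropped
```
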
src/axolotl/utils/models.py

@@ -329,11 +329,7 @@ def load_model(
         LOG.info("patching mixtral with flash attention")
         replace_mixtral_attn_with_multipack_flash_attn()

-    if (
-        cfg.is_llama_derived_model
-        and (cfg.max_packed_sequence_len or cfg.sample_packing)
-        and not inference
-    ):
+    if cfg.is_llama_derived_model and cfg.sample_packing and not inference:
         from axolotl.monkeypatch.llama_expand_mask import hijack_expand_mask

         LOG.info("patching _expand_mask")

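The `_expand_mask` monkeypatch for llama-derived models is now gated on `sample_packing` alone. A hypothetical helper that mirrors the new condition (not part of the PR), mainly to spell out that a lingering `max_packed_sequence_len` value no longer triggers the patch:

```python
# Hypothetical mirror of the simplified gate in load_model(); cfg is assumed to
# be an attribute-style config such as DictDefault.
def should_hijack_expand_mask(cfg, inference: bool) -> bool:
    # Previously `cfg.max_packed_sequence_len or cfg.sample_packing` enabled the
    # llama _expand_mask patch; now only sample_packing does, and never at inference.
    return bool(cfg.is_llama_derived_model and cfg.sample_packing and not inference)
```
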
src/axolotl/utils/trainer.py

@@ -81,6 +81,15 @@ def trainer_weighted_loss(model_output, labels, shift_labels=True):
     return weighted_cross_entropy(logits, labels, weights)


+@contextmanager
+def disable_datasets_caching():
+    try:
+        set_caching_enabled(False)
+        yield
+    finally:
+        set_caching_enabled(True)
+
+
 def add_position_ids(sample):
     sample_len = len(sample["input_ids"])
     sample["position_ids"] = torch.arange(len(sample["input_ids"]))

@@ -97,15 +106,6 @@ def drop_long_seq(sample, sequence_len=2048):
     return len(sample["input_ids"]) <= sequence_len and len(sample["input_ids"]) > 0


-@contextmanager
-def disable_datasets_caching():
-    try:
-        set_caching_enabled(False)
-        yield
-    finally:
-        set_caching_enabled(True)
-
-
 def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer):
     drop_long = partial(drop_long_seq, sequence_len=cfg.sequence_len)
     with zero_first(is_main_process()):

@@ -227,8 +227,7 @@ def calculate_total_num_steps(cfg, train_dataset, update=True):
             sampler=RandomSampler(train_dataset),
             batch_size=cfg.micro_batch_size,
             drop_last=True,
-            batch_max_len=cfg.micro_batch_size
-            * (cfg.max_packed_sequence_len or cfg.sequence_len),
+            batch_max_len=cfg.micro_batch_size * cfg.sequence_len,
             lengths=get_dataset_lengths(train_dataset),
         )

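`disable_datasets_caching` is unchanged apart from being moved above `add_position_ids`; it temporarily disables the HF `datasets` cache so intermediate packing transforms are not persisted to the cache directory. A hedged usage sketch with a placeholder dataset and transform, not code from this diff. Note also that with the legacy override gone, `calculate_total_num_steps` now sizes each packed batch as `micro_batch_size * sequence_len` tokens.

```python
# Illustrative use of disable_datasets_caching() from src/axolotl/utils/trainer.py;
# the dataset and mapping function are placeholders.
from datasets import Dataset
from axolotl.utils.trainer import disable_datasets_caching

dataset = Dataset.from_dict({"input_ids": [[1, 2, 3], [4, 5]]})

def with_position_ids(sample):
    # same idea as trainer.add_position_ids: one position id per token
    sample["position_ids"] = list(range(len(sample["input_ids"])))
    return sample

with disable_datasets_caching():
    # while caching is off, map() results stay in memory / temp files instead of
    # being written to the datasets cache
    dataset = dataset.map(with_position_ids)

print(dataset[0]["position_ids"])  # [0, 1, 2]
```
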
tests/test_validation.py

@@ -324,20 +324,19 @@ class ValidationTest(BaseValidation):

         validate_config(cfg)

-    def test_packing(self):
+    def test_deprecated_packing(self):
         cfg = DictDefault(
             {
-                "max_packed_sequence_len": 2048,
+                "max_packed_sequence_len": 1024,
             }
         )
-        with self._caplog.at_level(logging.WARNING):
+        with pytest.raises(
+            DeprecationWarning,
+            match=r"`max_packed_sequence_len` is no longer supported",
+        ):
             validate_config(cfg)
-            assert any(
-                "max_packed_sequence_len will be deprecated in favor of sample_packing"
-                in record.message
-                for record in self._caplog.records
-            )

+    def test_packing(self):
         cfg = DictDefault(
             {
                 "sample_packing": True,

@@ -352,16 +351,6 @@ class ValidationTest(BaseValidation):
             for record in self._caplog.records
         )

-        cfg = DictDefault(
-            {
-                "max_packed_sequence_len": 2048,
-                "sample_packing": True,
-            }
-        )
-        regex_exp = r".*set only one of max_packed_sequence_len \(deprecated soon\) or sample_packing.*"
-        with pytest.raises(ValueError, match=regex_exp):
-            validate_config(cfg)
-
     @pytest.mark.skipif(
         is_torch_bf16_gpu_available(),
         reason="test should only run on gpus w/o bf16 support",

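To exercise just the validation tests touched here, something along these lines should work; the `-k` expression is an assumption based on the test names above and will also match any other tests whose names contain "packing":

```python
# Equivalent to running `pytest -k packing tests/test_validation.py` from a shell.
import pytest

pytest.main(["-k", "packing", "tests/test_validation.py"])
```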