additional logging to get maximum token length of a sequence in the dataset (#1066) [skip ci]
* additional logging to get maximum token length of a sequence in the dataset
* fix ordering to properly determine the max_len of tokens before dropping anything longer (see the sketch after the diff)
- src/axolotl/utils/trainer.py +10 -6
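Before the file diff, a quick note on the moving pieces: `drop_long` is `functools.partial(drop_long_seq, sequence_len=cfg.sequence_len)`, used as a predicate for `datasets.Dataset.filter`. A minimal sketch of that pattern follows; the predicate body is an assumption about what `drop_long_seq` checks (drop anything longer than the configured context window), not the repository's exact implementation.

```python
# Sketch only: the predicate body is assumed, not axolotl's exact drop_long_seq.
from functools import partial

from datasets import Dataset


def drop_long_seq(sample, sequence_len=2048):
    # Keep samples whose tokenized length fits the configured context window.
    return 0 < len(sample["input_ids"]) <= sequence_len


ds = Dataset.from_dict({"input_ids": [[1] * 10, [1] * 3000, [1] * 512]})

drop_long = partial(drop_long_seq, sequence_len=2048)
ds = ds.filter(drop_long)  # the real call also passes num_proc=cfg.dataset_processes
print(len(ds))  # 2 -- the 3000-token sample is dropped
```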
src/axolotl/utils/trainer.py
CHANGED
@@ -109,12 +109,6 @@ def disable_datasets_caching():
 def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer):
     drop_long = partial(drop_long_seq, sequence_len=cfg.sequence_len)
     with zero_first(is_main_process()):
-        train_dataset = train_dataset.filter(drop_long, num_proc=cfg.dataset_processes)
-        if eval_dataset:
-            eval_dataset = eval_dataset.filter(
-                drop_long, num_proc=cfg.dataset_processes
-            )
-
         if cfg.group_by_length:
             train_dataset = train_dataset.map(
                 add_length, num_proc=cfg.dataset_processes
@@ -130,6 +124,16 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer):
                         add_position_ids, num_proc=cfg.dataset_processes
                     )
 
+        if cfg.group_by_length or cfg.sample_packing:
+            max_input_len = np.max(get_dataset_lengths(train_dataset))
+            LOG.debug(f"max_input_len: {max_input_len}", main_process_only=True)
+
+        train_dataset = train_dataset.filter(drop_long, num_proc=cfg.dataset_processes)
+        if eval_dataset:
+            eval_dataset = eval_dataset.filter(
+                drop_long, num_proc=cfg.dataset_processes
+            )
+
         # Phi doesn't want the attention_mask feature when training
         if (
             "CodeGenTokenizer" in tokenizer.__class__.__name__
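The ordering fix in the second bullet is visible above: the `max_input_len` logging now runs before `drop_long` filters the dataset. If the max were computed after the filter, it could never exceed `cfg.sequence_len`, so the log would hide how long the longest raw sample really is. A self-contained sketch of that effect, using made-up data and a simplified stand-in for axolotl's `get_dataset_lengths` helper:

```python
# Illustration only: get_dataset_lengths below is a simplified stand-in,
# and the data is made up.
import numpy as np
from datasets import Dataset

sequence_len = 2048
ds = Dataset.from_dict({"input_ids": [[1] * 100, [1] * 4096, [1] * 1500]})


def get_dataset_lengths(dataset):
    return np.array([len(ids) for ids in dataset["input_ids"]])


# New ordering: measure and log first, then drop long sequences.
max_input_len = np.max(get_dataset_lengths(ds))
print(f"max_input_len: {max_input_len}")  # 4096 -- reflects the raw data

ds = ds.filter(lambda sample: len(sample["input_ids"]) <= sequence_len)

# Old ordering measured after the filter, where the 4096-token outlier is
# already gone and the maximum is capped at sequence_len.
print(np.max(get_dataset_lengths(ds)))  # 1500
```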