winglian committed on
Commit 2f2582e · unverified · 1 Parent(s): 0ce1a65

additional logging to get maximum token length of a sequence in the dataset (#1066) [skip ci]


* additional logging to get maximum token length of a sequence in the dataset

* fix ordering to properly determine the max_len of tokens before dropping anything longer
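
A minimal sketch of why the ordering matters (illustrative values only; the real code derives per-sample lengths from the tokenized dataset): measuring the maximum after dropping over-length rows can never report more than sequence_len, so the logging has to run before the filter.

import numpy as np

# Hypothetical token counts for four dataset rows (illustration only)
lengths = np.array([512, 2048, 4096, 1024])
sequence_len = 2048

np.max(lengths)                            # 4096 -- true maximum, visible only before filtering
np.max(lengths[lengths <= sequence_len])   # 2048 -- what would be reported after dropping long rows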

Files changed (1)
  1. src/axolotl/utils/trainer.py +10 -6
src/axolotl/utils/trainer.py CHANGED
@@ -109,12 +109,6 @@ def disable_datasets_caching():
 def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer):
     drop_long = partial(drop_long_seq, sequence_len=cfg.sequence_len)
     with zero_first(is_main_process()):
-        train_dataset = train_dataset.filter(drop_long, num_proc=cfg.dataset_processes)
-        if eval_dataset:
-            eval_dataset = eval_dataset.filter(
-                drop_long, num_proc=cfg.dataset_processes
-            )
-
         if cfg.group_by_length:
             train_dataset = train_dataset.map(
                 add_length, num_proc=cfg.dataset_processes
@@ -130,6 +124,16 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer):
                 add_position_ids, num_proc=cfg.dataset_processes
             )
 
+        if cfg.group_by_length or cfg.sample_packing:
+            max_input_len = np.max(get_dataset_lengths(train_dataset))
+            LOG.debug(f"max_input_len: {max_input_len}", main_process_only=True)
+
+        train_dataset = train_dataset.filter(drop_long, num_proc=cfg.dataset_processes)
+        if eval_dataset:
+            eval_dataset = eval_dataset.filter(
+                drop_long, num_proc=cfg.dataset_processes
+            )
+
         # Phi doesn't want the attention_mask feature when training
         if (
             "CodeGenTokenizer" in tokenizer.__class__.__name__