winglian committed
Commit 1157950 · unverified · 1 Parent(s): 3b18c96

remove columns after tokenizing for pretraining (#571)

Files changed (1)
  1. src/axolotl/utils/data.py +3 -3
src/axolotl/utils/data.py CHANGED
@@ -644,8 +644,8 @@ def load_pretraining_dataset(path, tokenizer, max_tokens=2048, seed=42):
         encode,
         batched=True,
         input_columns="text",
-        remove_columns=[
-            "text",
-        ],
+        # remove all the existing columns after mapping since they end up having
+        # a different length than the encoded/tokenized column
+        remove_columns=dataset.features.keys(),
     )
     return dataset
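
The sketch below is a minimal, self-contained illustration of the failure mode this patch addresses; it is not the axolotl `encode_pretraining` implementation. The gpt2 tokenizer, the toy "meta" column, and the chunking `encode` function are assumptions for demonstration only. Because the batched encode packs raw texts into fixed-size chunks, it returns a different number of rows than it received, so any leftover source column (like "meta", which the old `remove_columns=["text"]` never dropped) would have a mismatched length and make `map` raise an error.

from datasets import Dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # tokenizer chosen only for illustration

raw = Dataset.from_dict(
    {
        "text": ["hello world, this is a longer pretraining sample", "another document"],
        "meta": ["source-a", "source-b"],  # extra column the old code never removed
    }
)

def encode(texts, max_tokens=4):
    # Tokenize everything, concatenate, and split into fixed-size chunks;
    # the number of chunks is unrelated to the number of input rows.
    ids = []
    for text in texts:
        ids.extend(tokenizer(text)["input_ids"])
    chunks = [ids[i : i + max_tokens] for i in range(0, len(ids), max_tokens)]
    return {"input_ids": chunks, "attention_mask": [[1] * len(c) for c in chunks]}

packed = raw.map(
    encode,
    batched=True,
    input_columns="text",
    # mirror the patch: drop *all* source columns, not just "text"
    remove_columns=list(raw.features.keys()),
)
print(packed.column_names)    # ['input_ids', 'attention_mask']
print(len(raw), len(packed))  # packed row count is independent of the raw row count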