remove columns after tokenizing for pretraining (#571)
src/axolotl/utils/data.py
@@ -644,8 +644,8 @@ def load_pretraining_dataset(path, tokenizer, max_tokens=2048, seed=42):
         encode,
         batched=True,
         input_columns="text",
-
-
-
+        # remove all the existing columns after mapping since they end up having
+        # a different length than the encoded/tokenized column
+        remove_columns=dataset.features.keys(),
     )
     return dataset
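Why the removal is needed: Dataset.map carries every source column over to the mapped rows unless told otherwise, and a batched pretraining encoder packs its input into fixed-length chunks, so it returns a different number of rows than it receives; the leftover columns can then no longer be aligned with the output. Below is a minimal standalone sketch of the failure mode and the fix, not axolotl's code: the toy dataset, the gpt2 tokenizer, the stand-in encode function, and the 8-token chunk size are all arbitrary choices for illustration.

    from datasets import Dataset
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")  # any tokenizer works here

    def encode(texts):
        # Concatenate the whole batch and re-split it into fixed-length chunks,
        # so the number of output rows differs from the number of input rows.
        ids = tokenizer(texts)["input_ids"]
        flat = [tok for seq in ids for tok in seq]
        return {"input_ids": [flat[i : i + 8] for i in range(0, len(flat), 8)]}

    dataset = Dataset.from_dict(
        {"text": ["hello world", "the quick brown fox"], "meta": ["a", "b"]}
    )

    # Without remove_columns, map() would try to carry "text" and "meta" over
    # to the chunked rows and fail, because the row counts no longer match.
    dataset = dataset.map(
        encode,
        batched=True,
        input_columns="text",
        remove_columns=list(dataset.features.keys()),  # drops "text" and "meta"
    )
    print(dataset.column_names)  # ['input_ids']

The same mismatch applies to the streaming dataset that load_pretraining_dataset builds; dropping every pre-existing column via dataset.features.keys() leaves only the tokenized output.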