ich
committed on
Fix bug when using pretokenized datasets (#652)
Browse files
* fix pretokenized datasets readme
* check if dataset type is not set to handle pretokenized datasets
- README.md +1 -1
- src/axolotl/utils/config.py +2 -0
README.md
CHANGED
|
@@ -317,7 +317,7 @@ Using file:
|
|
| 317 |
#### How to use your custom pretokenized dataset
|
| 318 |
|
| 319 |
- Do not pass a `type:`
|
| 320 |
-
- Dataset must
|
| 321 |
|
| 322 |
|
| 323 |
### Config
|
|
|
|
| 317 |
#### How to use your custom pretokenized dataset
|
| 318 |
|
| 319 |
- Do not pass a `type:`
|
| 320 |
+
- Columns in Dataset must be exactly `input_ids`, `attention_mask`, `labels`
|
| 321 |
|
| 322 |
|
| 323 |
### Config
|
src/axolotl/utils/config.py
CHANGED
|
@@ -293,6 +293,8 @@ def validate_config(cfg):
|
|
| 293 |
|
| 294 |
if cfg.datasets:
|
| 295 |
for idx, ds_cfg in enumerate(cfg.datasets):
|
|
|
|
|
|
|
| 296 |
if ds_cfg.type == "sharegpt:chat":
|
| 297 |
LOG.warning(
|
| 298 |
PendingDeprecationWarning(
|
|
|
|
| 293 |
|
| 294 |
if cfg.datasets:
|
| 295 |
for idx, ds_cfg in enumerate(cfg.datasets):
|
| 296 |
+
if not ds_cfg.type:
|
| 297 |
+
continue
|
| 298 |
if ds_cfg.type == "sharegpt:chat":
|
| 299 |
LOG.warning(
|
| 300 |
PendingDeprecationWarning(
|