Merge pull request #277 from cg123/dataset-name
- README.md +7 -0
- src/axolotl/utils/data.py +13 -14
README.md
CHANGED
@@ -262,6 +262,12 @@ See sample configs in [configs](configs) folder or [examples](examples) for quic
     - path: vicgalle/alpaca-gpt4
       type: alpaca # format from earlier
 
+  # huggingface repo with specific configuration/subset
+  datasets:
+    - path: EleutherAI/pile
+      name: enron_emails
+      type: completion # format from earlier
+
   # local
   datasets:
     - path: json
@@ -344,6 +350,7 @@ datasets:
     type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
     data_files: # path to source data files
     shards: # number of shards to split data into
+    name: # name of dataset configuration to load
 
 # axolotl attempts to save the dataset as an arrow after packing the data together so
 # subsequent training attempts load faster, relative path
src/axolotl/utils/data.py
CHANGED
@@ -94,6 +94,7 @@ def load_tokenized_prepared_datasets(
             try:
                 load_dataset(
                     d.path,
+                    name=d.name,
                     streaming=True,
                     use_auth_token=use_auth_token,
                 )
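The first hunk is the hub-existence probe: before committing to a full download, axolotl attempts a streaming load, which only needs to resolve metadata, and treats failure as "not on the hub". With this PR the probe forwards the configuration name as well. A minimal sketch of the pattern (`dataset_exists_on_hub` is a hypothetical helper, not part of this PR):

from typing import Optional

from datasets import load_dataset


def dataset_exists_on_hub(path: str, name: Optional[str] = None) -> bool:
    """Cheap existence check: streaming=True resolves metadata without
    downloading the data, and load_dataset raises FileNotFoundError
    when the repo cannot be found on the Hugging Face Hub."""
    try:
        load_dataset(path, name=name, streaming=True)
        return True
    except FileNotFoundError:
        return False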
@@ -107,6 +108,7 @@ def load_tokenized_prepared_datasets(
                 if local_path.is_dir():
                     ds = load_dataset(
                         d.path,
+                        name=d.name,
                         data_files=d.data_files,
                         streaming=False,
                         split=None,
@@ -114,6 +116,7 @@ def load_tokenized_prepared_datasets(
                 elif local_path.is_file():
                     ds = load_dataset(
                         "json",
+                        name=d.name,
                         data_files=d.path,
                         streaming=False,
                         split=None,
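The two local-path hunks follow the same pattern: a directory is loaded through its own loading script, a single file through the generic `json` builder, and both now forward `d.name`. A sketch of the single-file case (`data.jsonl` is a hypothetical local file):

from datasets import load_dataset

# Local single-file case from the elif branch above: the generic "json"
# builder reads the file directly; split=None returns a DatasetDict.
ds = load_dataset(
    "json",
    data_files="data.jsonl",  # hypothetical local file
    streaming=False,
    split=None,
)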
@@ -123,26 +126,22 @@ def load_tokenized_prepared_datasets(
                     "unhandled dataset load: local path exists, but is neither a directory or a file"
                 )
             elif ds_from_hub:
-                if d.data_files:
-                    ds = load_dataset(
-                        d.path,
-                        streaming=False,
-                        data_files=d.data_files,
-                        use_auth_token=use_auth_token,
-                    )
-                else:
-                    ds = load_dataset(
-                        d.path,
-                        streaming=False,
-                        use_auth_token=use_auth_token,
-                    )
+                ds = load_dataset(
+                    d.path,
+                    name=d.name,
+                    streaming=False,
+                    data_files=d.data_files,
+                    use_auth_token=use_auth_token,
+                )
             else:
                 fp = hf_hub_download(
                     repo_id=d.path,
                     repo_type="dataset",
                     filename=d.data_files,
                 )
-                ds = load_dataset("json", data_files=fp, streaming=False, split=None)
+                ds = load_dataset(
+                    "json", name=d.name, data_files=fp, streaming=False, split=None
+                )
             if not ds:
                 raise ValueError("unhandled dataset load")
             # support for using a subset of the data
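The last hunk is also the small cleanup behind the net +13/-14: `load_dataset` treats `data_files=None` the same as omitting the argument, so the old two-branch hub load collapses into a single call that always passes both `name` and `data_files`. A sketch of the consolidated behavior (`d` here is a hypothetical stand-in for one parsed `datasets:` entry):

from types import SimpleNamespace

from datasets import load_dataset

# Hypothetical parsed config entry, mirroring the README example.
d = SimpleNamespace(path="EleutherAI/pile", name="enron_emails", data_files=None)

# One call covers both former branches: name=None and data_files=None
# are no-ops for load_dataset, so missing config keys simply fall through.
ds = load_dataset(
    d.path,
    name=d.name,
    streaming=False,
    data_files=d.data_files,
)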
|