support for datasets with multiple names (#480)
Browse files
* support for datasets with multiple names
* update docs
- README.md +9 -0
- src/axolotl/utils/data.py +10 -1
README.md
CHANGED
|
@@ -328,6 +328,15 @@ See [examples](examples) for quick start. It is recommended to duplicate and mod
|
|
| 328 |
name: enron_emails
|
| 329 |
type: completion # format from earlier
|
| 330 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 331 |
# local
|
| 332 |
datasets:
|
| 333 |
- path: data.jsonl # or json
|
|
|
|
| 328 |
name: enron_emails
|
| 329 |
type: completion # format from earlier
|
| 330 |
|
| 331 |
+
# huggingface repo with multiple named configurations/subsets
|
| 332 |
+
datasets:
|
| 333 |
+
- path: bigcode/commitpackft
|
| 334 |
+
name:
|
| 335 |
+
- ruby
|
| 336 |
+
- python
|
| 337 |
+
- typescript
|
| 338 |
+
type: ... # unimplemented custom format
|
| 339 |
+
|
| 340 |
# local
|
| 341 |
datasets:
|
| 342 |
- path: data.jsonl # or json
|
src/axolotl/utils/data.py
CHANGED
|
@@ -134,8 +134,17 @@ def load_tokenized_prepared_datasets(
|
|
| 134 |
seed = 42
|
| 135 |
|
| 136 |
datasets = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
# pylint: disable=invalid-name
|
| 138 |
-
for d in cfg.datasets:
|
| 139 |
ds: Union[Dataset, DatasetDict] = None
|
| 140 |
ds_from_hub = False
|
| 141 |
try:
|
|
|
|
| 134 |
seed = 42
|
| 135 |
|
| 136 |
datasets = []
|
| 137 |
+
|
| 138 |
+
def for_d_in_datasets(dataset_configs):
|
| 139 |
+
for dataset in dataset_configs:
|
| 140 |
+
if dataset.name and isinstance(dataset.name, list):
|
| 141 |
+
for name in dataset.name:
|
| 142 |
+
yield DictDefault({**dataset, "name": name})
|
| 143 |
+
else:
|
| 144 |
+
yield dataset
|
| 145 |
+
|
| 146 |
# pylint: disable=invalid-name
|
| 147 |
+
for d in for_d_in_datasets(cfg.datasets):
|
| 148 |
ds: Union[Dataset, DatasetDict] = None
|
| 149 |
ds_from_hub = False
|
| 150 |
try:
|