winglian committed
Commit 334af62
2 Parent(s): 168a7a0 3cdd8e4

Merge pull request #277 from cg123/dataset-name

Files changed (2)
  1. README.md +7 -0
  2. src/axolotl/utils/data.py +13 -14
README.md CHANGED
@@ -262,6 +262,12 @@ See sample configs in [configs](configs) folder or [examples](examples) for quic
   - path: vicgalle/alpaca-gpt4
     type: alpaca # format from earlier
 
+# huggingface repo with specific configuration/subset
+datasets:
+  - path: EleutherAI/pile
+    name: enron_emails
+    type: completion # format from earlier
+
 # local
 datasets:
   - path: json
@@ -344,6 +350,7 @@ datasets:
     type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
     data_files: # path to source data files
     shards: # number of shards to split data into
+    name: # name of dataset configuration to load
 
 # axolotl attempts to save the dataset as an arrow after packing the data together so
 # subsequent training attempts load faster, relative path
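
For context, the `name:` key documented above is passed through to the `name` argument of Hugging Face `datasets.load_dataset`, which selects a named configuration/subset of a dataset repo (here, the `enron_emails` subset of `EleutherAI/pile`). A minimal standalone sketch of that call; the streaming and split choices are illustrative and not part of this commit:

    from datasets import load_dataset

    ds = load_dataset(
        "EleutherAI/pile",    # path: from the YAML example above
        name="enron_emails",  # name: the dataset configuration/subset to load
        streaming=True,       # illustrative: stream instead of downloading everything up front
    )
    print(next(iter(ds["train"])))  # assumes the subset exposes a "train" split
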
src/axolotl/utils/data.py CHANGED
@@ -94,6 +94,7 @@ def load_tokenized_prepared_datasets(
             try:
                 load_dataset(
                     d.path,
+                    name=d.name,
                     streaming=True,
                     use_auth_token=use_auth_token,
                 )
@@ -107,6 +108,7 @@ def load_tokenized_prepared_datasets(
                 if local_path.is_dir():
                     ds = load_dataset(
                         d.path,
+                        name=d.name,
                         data_files=d.data_files,
                         streaming=False,
                         split=None,
@@ -114,6 +116,7 @@ def load_tokenized_prepared_datasets(
                 elif local_path.is_file():
                     ds = load_dataset(
                         "json",
+                        name=d.name,
                         data_files=d.path,
                         streaming=False,
                         split=None,
@@ -123,26 +126,22 @@ def load_tokenized_prepared_datasets(
                         "unhandled dataset load: local path exists, but is neither a directory or a file"
                     )
             elif ds_from_hub:
-                if d.data_files:
-                    ds = load_dataset(
-                        d.path,
-                        streaming=False,
-                        data_files=d.data_files,
-                        use_auth_token=use_auth_token,
-                    )
-                else:
-                    ds = load_dataset(
-                        d.path,
-                        streaming=False,
-                        use_auth_token=use_auth_token,
-                    )
+                ds = load_dataset(
+                    d.path,
+                    name=d.name,
+                    streaming=False,
+                    data_files=d.data_files,
+                    use_auth_token=use_auth_token,
+                )
             else:
                 fp = hf_hub_download(
                     repo_id=d.path,
                     repo_type="dataset",
                     filename=d.data_files,
                 )
-                ds = load_dataset("json", data_files=fp, streaming=False, split=None)
+                ds = load_dataset(
+                    "json", name=d.name, data_files=fp, streaming=False, split=None
+                )
             if not ds:
                 raise ValueError("unhandled dataset load")
             # support for using a subset of the data
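
The refactor above also drops the `if d.data_files:` branching for hub datasets: `datasets.load_dataset` accepts `data_files=None` (and `name=None`) as equivalent to omitting those arguments, so a single call with `name=d.name` threaded through covers both cases. A minimal sketch of the collapsed call path, using a hypothetical helper name and the dataset paths from the README example purely as illustrations:

    from datasets import load_dataset

    def load_hub_dataset(path, name=None, data_files=None, use_auth_token=None):
        # Mirrors the post-merge `elif ds_from_hub:` branch: one call, with the
        # optional arguments simply left as None when the config does not set them.
        return load_dataset(
            path,
            name=name,
            streaming=False,
            data_files=data_files,
            use_auth_token=use_auth_token,
        )

    # Covers both branches of the pre-merge code with a single code path:
    ds_plain = load_hub_dataset("vicgalle/alpaca-gpt4")                  # no name, no data_files
    ds_named = load_hub_dataset("EleutherAI/pile", name="enron_emails")  # named configuration/subset
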