Nanobit committed on
Commit 00dce35
1 Parent(s): b15b19e

Feat(data): Allow loading local csv and text (#594)


* Feat(data): Allow loading local csv and text

* chore: update readme for loading data

Files changed (2):
  1. README.md +4 -4
  2. src/axolotl/utils/data.py +4 -0
README.md CHANGED
@@ -434,10 +434,10 @@ datasets:
   - path: vicgalle/alpaca-gpt4
     # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
     type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
-    ds_type: # Optional[str] (json|arrow|parquet) defines the datatype when path is a file
-    data_files: # path to source data files
-    shards: # number of shards to split data into
-    name: # name of dataset configuration to load
+    ds_type: # Optional[str] (json|arrow|parquet|text|csv) defines the datatype when path is a file
+    data_files: # Optional[str] path to source data files
+    shards: # Optional[int] number of shards to split data into
+    name: # Optional[str] name of dataset configuration to load
 
     # custom user prompt
   - path: repo
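These README options map onto the Hugging Face datasets.load_dataset call made in the data.py hunk below. A minimal sketch of loading a local CSV the way the config describes (the file name train.csv is hypothetical):

from datasets import load_dataset

# "csv" plays the role of ds_type; data_files points at the local source file.
ds = load_dataset("csv", data_files="train.csv")

# shards corresponds to splitting the loaded data, e.g. keeping 1 of 4 shards:
subset = ds["train"].shard(num_shards=4, index=0)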
src/axolotl/utils/data.py CHANGED
@@ -183,6 +183,10 @@ def load_tokenized_prepared_datasets(
                 ds_type = "parquet"
             elif ".arrow" in d.path:
                 ds_type = "arrow"
+            elif ".csv" in d.path:
+                ds_type = "csv"
+            elif ".txt" in d.path:
+                ds_type = "text"
             ds = load_dataset(
                 ds_type,
                 name=d.name,
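Taken together, the extension checks only decide which datasets builder string is passed as the first argument of load_dataset. A standalone sketch of the same selection logic (the helper name infer_ds_type and the example path are hypothetical, and the JSON fallback is an assumption about the surrounding code, not part of this patch):

from datasets import load_dataset

def infer_ds_type(path: str) -> str:
    # Mirrors the extension checks above; falls back to "json" (assumed default).
    if ".parquet" in path:
        return "parquet"
    if ".arrow" in path:
        return "arrow"
    if ".csv" in path:
        return "csv"
    if ".txt" in path:
        return "text"
    return "json"

path = "data/corpus.txt"  # hypothetical local text file
ds = load_dataset(infer_ds_type(path), data_files=path)  # loads via the "text" builder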