winglian committed on
Commit e50a64e
1 Parent(s): f4868d7

prepared dataset caching, other misc fixes (#665)


* prepared dataset caching, other misc fixes

* also don't load from disk cache unless explicit

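With this change, prepared dataset caching is opt-in: the example configs now leave dataset_prepared_path empty, which skips both loading from and saving to the on-disk cache, and setting any path re-enables it. A minimal sketch (the path shown is just the old default; any writable directory should work):

    # opt in: load/save the tokenized dataset under this directory
    dataset_prepared_path: last_run_prepared

    # opt out (the new default in the examples): always re-tokenize
    dataset_prepared_path: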
examples/cerebras/qlora.yml CHANGED
@@ -7,7 +7,7 @@ push_dataset_to_hub:
 datasets:
   - path: teknium/GPT4-LLM-Cleaned
     type: alpaca
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.01
 adapter: qlora
 lora_model_dir:
examples/code-llama/13b/lora.yml CHANGED
@@ -11,7 +11,7 @@ strict: false
 datasets:
   - path: mhenrichsen/alpaca_2k_test
     type: alpaca
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.01
 output_dir: ./lora-out
 
examples/code-llama/13b/qlora.yml CHANGED
@@ -11,7 +11,7 @@ strict: false
 datasets:
   - path: mhenrichsen/alpaca_2k_test
     type: alpaca
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.01
 output_dir: ./qlora-out
 
examples/code-llama/34b/lora.yml CHANGED
@@ -11,7 +11,7 @@ strict: false
 datasets:
   - path: mhenrichsen/alpaca_2k_test
     type: alpaca
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.01
 output_dir: ./lora-out
 
examples/code-llama/34b/qlora.yml CHANGED
@@ -11,7 +11,7 @@ strict: false
 datasets:
   - path: mhenrichsen/alpaca_2k_test
     type: alpaca
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.01
 output_dir: ./qlora-out
 
examples/code-llama/7b/lora.yml CHANGED
@@ -11,7 +11,7 @@ strict: false
 datasets:
   - path: mhenrichsen/alpaca_2k_test
     type: alpaca
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.01
 output_dir: ./lora-out
 
examples/code-llama/7b/qlora.yml CHANGED
@@ -11,7 +11,7 @@ strict: false
 datasets:
   - path: mhenrichsen/alpaca_2k_test
     type: alpaca
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.01
 output_dir: ./qlora-out
 
examples/falcon/config-7b-lora.yml CHANGED
@@ -12,7 +12,7 @@ push_dataset_to_hub:
 datasets:
   - path: teknium/GPT4-LLM-Cleaned
     type: alpaca:chat
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.01
 adapter: lora
 lora_model_dir:
examples/falcon/config-7b-qlora.yml CHANGED
@@ -18,7 +18,7 @@ datasets:
     data_files:
       - Chain-of-Thought/formatted_cot_data/gsm8k_train.json
     type: "alpaca:chat"
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.01
 # enable QLoRA
 adapter: qlora
examples/falcon/config-7b.yml CHANGED
@@ -12,7 +12,7 @@ push_dataset_to_hub:
 datasets:
   - path: teknium/GPT4-LLM-Cleaned
     type: alpaca:chat
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.01
 adapter:
 lora_model_dir:
examples/gptj/qlora.yml CHANGED
@@ -7,7 +7,7 @@ push_dataset_to_hub:
 datasets:
   - path: teknium/GPT4-LLM-Cleaned
     type: alpaca
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.01
 adapter: qlora
 lora_model_dir:
examples/jeopardy-bot/config.yml CHANGED
@@ -6,7 +6,7 @@ load_in_8bit: false
 datasets:
   - path: openaccess-ai-collective/jeopardy
     type: jeopardy
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.02
 adapter:
 lora_model_dir:
examples/llama-2/gptq-lora.yml CHANGED
@@ -15,7 +15,7 @@ hf_use_auth_token: true
 datasets:
   - path: mhenrichsen/alpaca_2k_test
     type: alpaca
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.01
 adapter: lora
 lora_model_dir:
examples/llama-2/lora.yml CHANGED
@@ -11,7 +11,7 @@ strict: false
 datasets:
   - path: mhenrichsen/alpaca_2k_test
     type: alpaca
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.01
 output_dir: ./lora-out
 
examples/llama-2/qlora.yml CHANGED
@@ -11,7 +11,7 @@ strict: false
 datasets:
   - path: mhenrichsen/alpaca_2k_test
     type: alpaca
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.01
 output_dir: ./qlora-out
 
examples/llama-2/relora.yml CHANGED
@@ -11,7 +11,7 @@ strict: false
 datasets:
   - path: teknium/GPT4-LLM-Cleaned
     type: alpaca
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.01
 output_dir: ./relora-out
 
examples/llama-2/tiny-llama.yml CHANGED
@@ -12,7 +12,7 @@ strict: false
 datasets:
   - path: mhenrichsen/alpaca_2k_test
     type: alpaca
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.01
 output_dir: ./lora-out
 
examples/mistral/config.yml CHANGED
@@ -11,7 +11,7 @@ strict: false
 datasets:
   - path: mhenrichsen/alpaca_2k_test
     type: alpaca
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.01
 output_dir: ./out
 
examples/mpt-7b/config.yml CHANGED
@@ -6,7 +6,7 @@ load_in_8bit: false
 datasets:
   - path: vicgalle/alpaca-gpt4
     type: alpaca
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.02
 adapter:
 lora_model_dir:
examples/openllama-3b/config.yml CHANGED
@@ -9,7 +9,7 @@ push_dataset_to_hub:
 datasets:
   - path: teknium/GPT4-LLM-Cleaned
     type: alpaca
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.02
 adapter:
 lora_model_dir:
examples/openllama-3b/lora.yml CHANGED
@@ -9,7 +9,7 @@ push_dataset_to_hub:
 datasets:
   - path: teknium/GPT4-LLM-Cleaned
     type: alpaca
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.02
 adapter: lora
 lora_model_dir:
examples/openllama-3b/qlora.yml CHANGED
@@ -9,7 +9,7 @@ push_dataset_to_hub:
 datasets:
   - path: teknium/GPT4-LLM-Cleaned
     type: alpaca
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.01
 adapter: qlora
 lora_model_dir:
examples/phi/phi-ft.yml CHANGED
@@ -13,7 +13,7 @@ datasets:
   - path: garage-bAInd/Open-Platypus
     type: alpaca
 
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.05
 output_dir: ./phi-sft-out
 
examples/phi/phi-qlora.yml CHANGED
@@ -13,7 +13,7 @@ datasets:
   - path: garage-bAInd/Open-Platypus
     type: alpaca
 
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.05
 output_dir: ./phi-sft-out
 
examples/pythia-12b/config.yml CHANGED
@@ -10,7 +10,7 @@ device_map: auto
 datasets:
   - path: vicgalle/alpaca-gpt4
     type: alpaca
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.05
 adapter:
 lora_model_dir:
examples/pythia/lora.yml CHANGED
@@ -4,7 +4,7 @@ load_in_8bit: true
 datasets:
   - path: teknium/GPT4-LLM-Cleaned
     type: alpaca
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.05
 adapter: lora
 lora_model_dir:
examples/redpajama/config-3b.yml CHANGED
@@ -7,7 +7,7 @@ load_in_8bit: false
 datasets:
   - path: vicgalle/alpaca-gpt4
     type: alpaca
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.02
 adapter:
 lora_model_dir:
examples/replit-3b/config-lora.yml CHANGED
@@ -5,7 +5,7 @@ load_in_8bit: false
 datasets:
   - path: vicgalle/alpaca-gpt4
     type: alpaca
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.05
 adapter: lora
 lora_model_dir:
examples/xgen-7b/xgen-7b-8k-qlora.yml CHANGED
@@ -16,7 +16,7 @@ datasets:
     data_files:
       - openassistant_best_replies_train.jsonl
     type: "completion"
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path:
 val_set_size: 0.01
 # enable QLoRA
 adapter: qlora
src/axolotl/cli/__init__.py CHANGED
@@ -51,7 +51,7 @@ def print_axolotl_text_art(suffix=None):
 
 
 def get_multi_line_input() -> Optional[str]:
-    print("Give me an instruction (Ctrl + D to finish): ")
+    print("Give me an instruction (Ctrl + D to submit): ")
     instruction = ""
     for line in sys.stdin:
         instruction += line  # pylint: disable=consider-using-join
src/axolotl/utils/data.py CHANGED
@@ -122,7 +122,7 @@ def load_tokenized_prepared_datasets(
 
     if dataset:
         ...
-    elif any(prepared_ds_path.glob("*")):
+    elif cfg.dataset_prepared_path and any(prepared_ds_path.glob("*")):
         LOG.info(f"Loading prepared dataset from disk at {prepared_ds_path}...")
         dataset = load_from_disk(str(prepared_ds_path))
         LOG.info("Prepared dataset loaded from disk...")
@@ -357,7 +357,7 @@ def load_tokenized_prepared_datasets(
     if len(datasets) > 1:
         LOG.info("shuffle merged datasets")
         dataset = dataset.shuffle(seed=seed)
-    if cfg.local_rank == 0:
+    if cfg.local_rank == 0 and cfg.dataset_prepared_path:
         LOG.info(f"Saving merged prepared dataset to disk... {prepared_ds_path}")
         dataset.save_to_disk(prepared_ds_path)
     if cfg.push_dataset_to_hub:
@@ -425,7 +425,7 @@ def load_prepare_datasets(
 
     if dataset:
         ...
-    elif any(prepared_ds_path.glob("*")):
+    elif cfg.dataset_prepared_path and any(prepared_ds_path.glob("*")):
         LOG.info(
             f"Loading prepared packed dataset from disk at {prepared_ds_path}..."
         )
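Taken together, these three hunks gate the disk cache on cfg.dataset_prepared_path: a previously prepared dataset is loaded, and a freshly merged one saved (on rank 0 only), solely when the user has set the path explicitly; with an empty value the dataset is re-tokenized on every run.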
src/axolotl/utils/tokenization.py CHANGED
@@ -31,7 +31,8 @@ def check_example_labels(example, tokenizer, text_only=False):
         )
         colored_tokens.append(colored_token)
 
-    LOG.info(" ".join(colored_tokens))
+    delimiter = "" if text_only else " "
+    LOG.info(delimiter.join(colored_tokens))
     LOG.info("\n\n\n")
     print(" ".join(colored_tokens))
 
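A likely reading of the tokenization change: in text_only mode the decoded strings already carry their own spacing, so joining with an empty delimiter avoids doubled spaces in the log, while the colorized print at the end keeps the single-space separator.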