winglian committed
Commit 5f79b82
1 Parent(s): f1de29d

new evals_per_epoch and saves_per_epoch to make things cleaner (#944)


* new evals_per_epoch and saves_per_epoch to make things cleaner

* update per PR feedback

README.md CHANGED
@@ -691,9 +691,11 @@ warmup_ratio: 0.05 # cannot use with warmup_steps
 learning_rate: 0.00003
 lr_quadratic_warmup:
 logging_steps:
+eval_steps: # Leave empty to eval at each epoch, integers for every N steps. decimal for fraction of total steps
+evals_per_epoch: # number of times per epoch to run evals, mutually exclusive with eval_steps
 save_strategy: # Set to `no` to skip checkpoint saves
 save_steps: # Leave empty to save at each epoch
-eval_steps: # Leave empty to eval at each epoch, integers for every N steps. decimal for fraction of total steps
+saves_per_epoch: # number of times per epoch to save a checkpoint, mutually exclusive with save_steps
 save_total_limit: # Checkpoints saved at a time
 # Maximum number of iterations to train for. It precedes num_epochs which means that
 # if both are set, num_epochs will not be guaranteed.
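
For a concrete sense of what the two new keys do, here is a small sanity check (plain Python, with values picked only for illustration) of how a per-epoch count composes with `num_epochs`. It relies on the behavior stated in the README comment above: a decimal `eval_steps`/`save_steps` is treated as a fraction of total training steps.

```python
# Quick sanity check of what the new per-epoch keys translate to.
# A value below 1.0 is what transformers' TrainingArguments treats as
# "fraction of total training steps" for eval_steps / save_steps.
num_epochs = 3
evals_per_epoch = 4
saves_per_epoch = 1

eval_steps = 1.0 / (evals_per_epoch * num_epochs)  # ~0.083 -> ~12 evals total, 4 per epoch
save_steps = 1.0 / (saves_per_epoch * num_epochs)  # ~0.333 -> 3 saves total, 1 per epoch
print(eval_steps, save_steps)
```
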
examples/cerebras/btlm-ft.yml CHANGED
@@ -72,8 +72,8 @@ gptq_groupsize:
 gptq_model_v1:
 
 warmup_steps: 32
-eval_steps:
-save_steps:
+evals_per_epoch: 4
+saves_per_epoch: 1
 save_total_limit:
 
 debug:
examples/cerebras/qlora.yml CHANGED
@@ -49,8 +49,8 @@ flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 10
-eval_steps: 0.05
-save_steps:
+evals_per_epoch: 4
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.1
examples/code-llama/13b/lora.yml CHANGED
@@ -54,8 +54,8 @@ xformers_attention:
 flash_attention: true
 
 warmup_steps: 10
-eval_steps: 0.05
-save_steps:
+evals_per_epoch: 4
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
examples/code-llama/13b/qlora.yml CHANGED
@@ -56,8 +56,8 @@ xformers_attention:
 flash_attention: true
 
 warmup_steps: 10
-eval_steps: 0.05
-save_steps:
+evals_per_epoch: 4
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
examples/code-llama/34b/lora.yml CHANGED
@@ -54,8 +54,8 @@ xformers_attention:
 flash_attention: true
 
 warmup_steps: 10
-eval_steps: 0.05
-save_steps:
+evals_per_epoch: 4
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
examples/code-llama/34b/qlora.yml CHANGED
@@ -56,8 +56,8 @@ xformers_attention:
 flash_attention: true
 
 warmup_steps: 10
-eval_steps: 0.05
-save_steps:
+evals_per_epoch: 4
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
examples/code-llama/7b/lora.yml CHANGED
@@ -54,8 +54,8 @@ xformers_attention:
 flash_attention: true
 
 warmup_steps: 10
-eval_steps: 0.05
-save_steps:
+evals_per_epoch: 4
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
examples/code-llama/7b/qlora.yml CHANGED
@@ -56,8 +56,8 @@ xformers_attention:
 flash_attention: true
 
 warmup_steps: 10
-eval_steps: 0.05
-save_steps:
+evals_per_epoch: 4
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
examples/falcon/config-7b-lora.yml CHANGED
@@ -51,8 +51,8 @@ flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 40
-eval_steps: 5
-save_steps: 43
+evals_per_epoch: 4
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
examples/falcon/config-7b-qlora.yml CHANGED
@@ -80,8 +80,8 @@ flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 10
-eval_steps: 5
-save_steps: 10
+evals_per_epoch: 4
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.000001
examples/falcon/config-7b.yml CHANGED
@@ -51,8 +51,8 @@ flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 40
-eval_steps: 5
-save_steps: 43
+evals_per_epoch: 4
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
examples/gptj/qlora.yml CHANGED
@@ -46,8 +46,8 @@ flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 10
-eval_steps: 0.05
-save_steps:
+evals_per_epoch: 4
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.1
examples/jeopardy-bot/config.yml CHANGED
@@ -42,8 +42,8 @@ flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 20
-eval_steps: 110
-save_steps: 660
+evals_per_epoch: 4
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.1
examples/llama-2/fft_optimized.yml CHANGED
@@ -58,9 +58,9 @@ flash_attn_fuse_qkv: false
 flash_attn_fuse_mlp: true
 
 warmup_steps: 100
-eval_steps: 0.05
+evals_per_epoch: 4
 eval_table_size:
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed: #deepspeed/zero2.json # multi-gpu only
 weight_decay: 0.1
examples/llama-2/gptq-lora.yml CHANGED
@@ -62,8 +62,8 @@ flash_attention:
 sdp_attention:
 flash_optimum:
 warmup_steps: 100
-eval_steps:
-save_steps:
+evals_per_epoch: 4
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.1
examples/llama-2/lora.yml CHANGED
@@ -54,10 +54,10 @@ xformers_attention:
 flash_attention: true
 
 warmup_steps: 10
-eval_steps: 0.05
+evals_per_epoch: 4
 eval_table_size:
 eval_table_max_new_tokens: 128
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
examples/llama-2/qlora.yml CHANGED
@@ -56,9 +56,9 @@ xformers_attention:
 flash_attention: true
 
 warmup_steps: 10
-eval_steps: 0.05
+evals_per_epoch: 4
 eval_table_size:
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
examples/llama-2/relora.yml CHANGED
@@ -60,8 +60,8 @@ xformers_attention:
 flash_attention: true
 
 warmup_steps: 10
-eval_steps: 0.05
-save_steps: 50
+evals_per_epoch: 4
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
examples/llama-2/tiny-llama.yml CHANGED
@@ -54,9 +54,9 @@ xformers_attention:
 flash_attention: true
 
 warmup_steps: 10
-eval_steps: 0.05
+evals_per_epoch: 4
 eval_table_size:
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
examples/mamba/config.yml CHANGED
@@ -47,10 +47,10 @@ xformers_attention:
 flash_attention:
 
 warmup_steps: 10
-eval_steps:
+evals_per_epoch: 4
 eval_table_size:
 eval_table_max_new_tokens: 128
-save_steps: 0.25
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
examples/mistral/config.yml CHANGED
@@ -46,10 +46,10 @@ xformers_attention:
 flash_attention: true
 
 warmup_steps: 10
-eval_steps: 0.05
+evals_per_epoch: 4
 eval_table_size:
 eval_table_max_new_tokens: 128
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
examples/mistral/mixtral.yml CHANGED
@@ -67,10 +67,10 @@ loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
 
 warmup_steps: 10
-eval_steps:
+evals_per_epoch: 4
 eval_table_size:
 eval_table_max_new_tokens: 128
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed: deepspeed/zero2.json
 weight_decay: 0.0
examples/mistral/qlora.yml CHANGED
@@ -66,10 +66,10 @@ loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
 
 warmup_steps: 10
-eval_steps: 0.05
+evals_per_epoch: 4
 eval_table_size:
 eval_table_max_new_tokens: 128
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
examples/mpt-7b/config.yml CHANGED
@@ -44,8 +44,8 @@ flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 20
-eval_steps: 110
-save_steps: 660
+evals_per_epoch: 4
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0001
examples/openllama-3b/config.yml CHANGED
@@ -49,8 +49,8 @@ flash_attention: true
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 20
-eval_steps: 0.05
-save_steps:
+evals_per_epoch: 4
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.1
examples/openllama-3b/lora.yml CHANGED
@@ -54,8 +54,8 @@ flash_attention: true
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 20
-eval_steps: 0.05
-save_steps:
+evals_per_epoch: 4
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.1
examples/openllama-3b/qlora.yml CHANGED
@@ -48,8 +48,8 @@ flash_attention: true
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 20
-eval_steps: 0.05
-save_steps:
+evals_per_epoch: 4
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.1
examples/phi/phi-ft.yml CHANGED
@@ -59,8 +59,8 @@ xformers_attention:
 flash_attention:
 
 warmup_steps: 100
-eval_steps: 0.05
-save_steps:
+evals_per_epoch: 4
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.1
examples/phi/phi-qlora.yml CHANGED
@@ -59,8 +59,8 @@ xformers_attention:
 flash_attention:
 
 warmup_steps: 100
-eval_steps: 0.05
-save_steps:
+evals_per_epoch: 4
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.1
examples/pythia/lora.yml CHANGED
@@ -33,5 +33,5 @@ early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
 weight_decay: 0.1
-eval_steps: 0.05
+evals_per_epoch: 4
 logging_steps: 1
examples/qwen/lora.yml CHANGED
@@ -56,10 +56,10 @@ xformers_attention:
 flash_attention:
 
 warmup_steps: 10
-eval_steps: 0.05
+evals_per_epoch: 4
 eval_table_size:
 eval_table_max_new_tokens: 128
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
examples/qwen/qlora.yml CHANGED
@@ -56,10 +56,10 @@ xformers_attention:
 flash_attention:
 
 warmup_steps: 10
-eval_steps: 0.05
+evals_per_epoch: 4
 eval_table_size:
 eval_table_max_new_tokens: 128
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
examples/redpajama/config-3b.yml CHANGED
@@ -45,8 +45,8 @@ flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 20
-eval_steps: 110
-save_steps: 660
+evals_per_epoch: 4
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0001
examples/replit-3b/config-lora.yml CHANGED
@@ -45,8 +45,8 @@ flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 20
-eval_steps: 50
-save_steps:
+evals_per_epoch: 4
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0
examples/xgen-7b/xgen-7b-8k-qlora.yml CHANGED
@@ -78,8 +78,8 @@ flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 10
-eval_steps: 50
-save_steps: 50
+evals_per_epoch: 4
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
src/axolotl/utils/config.py CHANGED
@@ -77,6 +77,15 @@ def normalize_config(cfg):
     else:
         cfg.torch_dtype = torch.float32
 
+    if cfg.saves_per_epoch:
+        save_steps = 1.0 / (cfg.saves_per_epoch * cfg.num_epochs)
+        if save_steps < 1.0:  # prevent saves on every step
+            cfg.save_steps = save_steps
+    if cfg.evals_per_epoch:
+        eval_steps = 1.0 / (cfg.evals_per_epoch * cfg.num_epochs)
+        if eval_steps < 1.0:  # prevent evals on every step
+            cfg.eval_steps = eval_steps
+
     cfg.dataset_processes = cfg.dataset_processes or os.cpu_count()
 
     if not cfg.base_model_config:
@@ -352,6 +361,27 @@ def validate_config(cfg):
                 cfg.datasets[idx].type = cfg.datasets[idx].type.replace(
                     "sharegpt_simple", "sharegpt"
                 )
+
+    if cfg.saves_per_epoch and cfg.save_steps:
+        raise ValueError(
+            "save_steps and saves_per_epoch are mutually exclusive and cannot be used together."
+        )
+    if cfg.saves_per_epoch and cfg.save_strategy and cfg.save_strategy != "steps":
+        raise ValueError(
+            "save_strategy must be empty or set to `steps` when used with saves_per_epoch."
+        )
+    if cfg.evals_per_epoch and cfg.eval_steps:
+        raise ValueError(
+            "eval_steps and evals_per_epoch are mutually exclusive and cannot be used together."
+        )
+    if (
+        cfg.evals_per_epoch
+        and cfg.evaluation_strategy
+        and cfg.evaluation_strategy != "steps"
+    ):
+        raise ValueError(
+            "evaluation_strategy must be empty or set to `steps` when used with evals_per_epoch."
+        )
     if cfg.save_strategy and cfg.save_steps and cfg.save_strategy != "steps":
         raise ValueError(
             "save_strategy and save_steps mismatch. Please set save_strategy to 'steps' or remove save_steps."