Commit f5c11f8, committed by tmm1
Parents: 176b888 b4d1d22

Merge pull request #350 from tmm1/group-len-false-examples

README.md CHANGED
@@ -426,7 +426,9 @@ save_safetensors:
 
 # whether to mask out or include the human's prompt from the training labels
 train_on_inputs: false
-# don't use this, leads to wonky training (according to someone on the internet)
+# group similarly sized data to minimize padding
+# may be slower to start, as it must download and sort the entire dataset
+# note that training loss may have an oscillating pattern with this enabled
 group_by_length: false
 
 # Whether to use gradient checkpointing https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
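
For context on the new README wording: `group_by_length` maps to the standard Hugging Face `transformers` `TrainingArguments` option of the same name, which buckets similarly sized samples into the same batch so fewer tokens are spent on padding. Below is a minimal sketch of the idea with made-up lengths; it is an illustration, not axolotl's or transformers' actual sampler.

```python
# A minimal sketch of what group_by_length does: batch similarly sized
# samples together so fewer tokens are wasted on padding. This is an
# illustration, not axolotl's or transformers' actual sampler.
import random

random.seed(0)
# Hypothetical per-sample token lengths for a small dataset.
lengths = [random.randint(16, 512) for _ in range(32)]
batch_size = 8

def padding_tokens(batches):
    # Each sample is padded to the longest sample in its batch.
    return sum(max(b) * len(b) - sum(b) for b in batches)

# group_by_length: false -- batches follow the shuffled dataset order.
order = list(range(len(lengths)))
random.shuffle(order)
plain = [[lengths[i] for i in order[s:s + batch_size]]
         for s in range(0, len(order), batch_size)]

# group_by_length: true -- sort by length before batching. (Real
# trainers sort within shuffled megabatches to keep some randomness;
# a global sort is shown here for simplicity.)
by_len = sorted(range(len(lengths)), key=lambda i: lengths[i])
grouped = [[lengths[i] for i in by_len[s:s + batch_size]]
           for s in range(0, len(by_len), batch_size)]

print("padding tokens, ungrouped:", padding_tokens(plain))
print("padding tokens, grouped:  ", padding_tokens(grouped))
```

Because grouped batches share a length regime, batch difficulty drifts from step to step instead of averaging out, which is the oscillating-loss pattern the README note warns about.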
examples/cerebras/qlora.yml CHANGED
@@ -35,7 +35,7 @@ torchdistx_path:
 lr_scheduler: cosine
 learning_rate: 0.0002
 train_on_inputs: false
-group_by_length: true
+group_by_length: false
 bf16: true
 fp16: false
 tf32: true
examples/gptj/qlora.yml CHANGED
@@ -32,7 +32,7 @@ torchdistx_path:
 lr_scheduler: cosine
 learning_rate: 0.0001
 train_on_inputs: false
-group_by_length: true
+group_by_length: false
 bf16: true
 fp16: false
 tf32: true
examples/llama-2/lora.yml CHANGED
@@ -38,7 +38,7 @@ lr_scheduler: cosine
 learning_rate: 0.0002
 
 train_on_inputs: false
-group_by_length: true
+group_by_length: false
 bf16: true
 fp16: false
 tf32: false
examples/llama-2/qlora.yml CHANGED
@@ -39,7 +39,7 @@ lr_scheduler: cosine
 learning_rate: 0.0002
 
 train_on_inputs: false
-group_by_length: true
+group_by_length: false
 bf16: true
 fp16: false
 tf32: false
examples/openllama-3b/qlora.yml CHANGED
@@ -34,7 +34,7 @@ torchdistx_path:
 lr_scheduler: cosine
 learning_rate: 0.0002
 train_on_inputs: false
-group_by_length: true
+group_by_length: false
 bf16: true
 fp16: false
 tf32: true