utensil committed
Commit: ca11ae9
Parent(s): fb3d40f

Add comments/alternatives for falcon-qlora configs

Files changed (1):
  examples/falcon/config-7b-qlora.yml (+25 -1)
examples/falcon/config-7b-qlora.yml CHANGED
@@ -1,9 +1,13 @@
+# 1b: tiiuae/falcon-rw-1b
+# 40b: tiiuae/falcon-40b
 base_model: tiiuae/falcon-7b
 base_model_config: tiiuae/falcon-7b
+# required by falcon custom model code: https://huggingface.co/tiiuae/falcon-7b/tree/main
 trust_remote_code: true
 model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer
 load_in_8bit: false
+# enable 4bit for QLoRA
 load_in_4bit: true
 gptq: false
 strict: false
@@ -15,27 +19,47 @@ datasets:
   type: "alpaca:chat"
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.01
+# enable QLoRA
 adapter: qlora
 lora_model_dir:
 sequence_len: 2048
 max_packed_sequence_len:
+
+# hyperparameters from QLoRA paper Appendix B.2
+# "We find hyperparameters to be largely robust across datasets"
 lora_r: 64
 lora_alpha: 16
+# 0.1 for models up to 13B
+# 0.05 for 33B and 65B models
 lora_dropout: 0.05
+# add LoRA modules on all linear layers of the base model
 lora_target_modules:
 lora_target_linear: true
 lora_fan_in_fan_out:
+
 wandb_project: falcon-qlora
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./qlora-out
-micro_batch_size: 40
+
+# QLoRA paper Table 9
+# - 16 for 7b & 13b
+# - 32 for 33b, 64 for 64b
+# Max size tested on A6000
+# - 7b: 40
+# - 40b: 4
+# decrease if OOM, increase for max VRAM utilization
+micro_batch_size: 30
 gradient_accumulation_steps: 2
 num_epochs: 3
+# Optimizer for QLoRA
 optimizer: paged_adamw_32bit
 torchdistx_path:
 lr_scheduler: cosine
+# QLoRA paper Table 9
+# - 2e-4 for 7b & 13b
+# - 1e-4 for 33b & 64b
 learning_rate: 0.0002
 train_on_inputs: false
 group_by_length: false
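
With the new values, each optimizer step accumulates micro_batch_size × gradient_accumulation_steps = 30 × 2 = 60 examples per GPU (multiplied again by the number of GPUs in a distributed launch). The removed value of 40 was the largest micro batch size the comments report testing for 7b on an A6000, so 30 leaves some VRAM headroom.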
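
The commented alternatives can also be assembled into a rough 40b variant of this config. The sketch below is an illustration only and is not part of this commit: the file name, output_dir, and the 40b learning rate and dropout (taken from the nearest rows quoted in the comments) are assumptions, and the datasets and wandb settings are left as in the 7b config.

# hypothetical examples/falcon/config-40b-qlora.yml (illustration, not in this commit)
base_model: tiiuae/falcon-40b
base_model_config: tiiuae/falcon-40b
trust_remote_code: true
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
load_in_8bit: false
load_in_4bit: true
gptq: false
strict: false
# datasets / dataset_prepared_path / val_set_size / wandb settings: same as the 7b config above
adapter: qlora
lora_model_dir:
sequence_len: 2048
max_packed_sequence_len:
lora_r: 64
lora_alpha: 16
lora_dropout: 0.05            # per the "0.05 for 33B and 65B models" comment; 40b assumed to follow the larger-model row
lora_target_modules:
lora_target_linear: true
lora_fan_in_fan_out:
output_dir: ./qlora-40b-out   # assumption; any path works
micro_batch_size: 4           # "Max size tested on A6000 - 40b: 4"; decrease if OOM
gradient_accumulation_steps: 2
num_epochs: 3
optimizer: paged_adamw_32bit
torchdistx_path:
lr_scheduler: cosine
learning_rate: 0.0001         # assumption, following the "1e-4 for 33b & 64b" comment
train_on_inputs: false
group_by_length: false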