Nanobit committed
Commit c8242de • Parents (2): afaa0d2, 79a8f52

Merge pull request #132 from utensil/falcon-7b-qlora

Files changed (2):
  1. README.md +1 -1
  2. examples/falcon/config-7b-qlora.yml +92 -0
README.md CHANGED
@@ -22,7 +22,7 @@
  | Pythia | ✅ | ✅ | ❓ | ❌ | ❌ | ❌ | ❓ |
  | cerebras | ✅ | ✅ | ❓ | ❌ | ❌ | ❌ | ❓ |
  | mpt | ✅ | ❌ | ❓ | ❌ | ❌ | ❌ | ❓ |
- | falcon | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❓ |
+ | falcon | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❓ |


  ## Quickstart ⚡
examples/falcon/config-7b-qlora.yml ADDED
@@ -0,0 +1,92 @@
+ # 1b: tiiuae/falcon-rw-1b
+ # 40b: tiiuae/falcon-40b
+ base_model: tiiuae/falcon-7b
+ base_model_config: tiiuae/falcon-7b
+ # required by falcon custom model code: https://huggingface.co/tiiuae/falcon-7b/tree/main
+ trust_remote_code: true
+ model_type: AutoModelForCausalLM
+ tokenizer_type: AutoTokenizer
+ load_in_8bit: false
+ # enable 4bit for QLoRA
+ load_in_4bit: true
+ gptq: false
+ strict: false
+ push_dataset_to_hub:
+ datasets:
+   - path: QingyiSi/Alpaca-CoT
+     data_files:
+       - Chain-of-Thought/formatted_cot_data/gsm8k_train.json
+     type: "alpaca:chat"
+ dataset_prepared_path: last_run_prepared
+ val_set_size: 0.01
+ # enable QLoRA
+ adapter: qlora
+ lora_model_dir:
+ sequence_len: 2048
+ max_packed_sequence_len:
+
+ # hyperparameters from QLoRA paper Appendix B.2
+ # "We find hyperparameters to be largely robust across datasets"
+ lora_r: 64
+ lora_alpha: 16
+ # 0.1 for models up to 13B
+ # 0.05 for 33B and 65B models
+ lora_dropout: 0.05
+ # add LoRA modules on all linear layers of the base model
+ lora_target_modules:
+ lora_target_linear: true
+ lora_fan_in_fan_out:
+
+ wandb_project:
+ wandb_watch:
+ wandb_run_id:
+ wandb_log_model:
+ output_dir: ./qlora-out
+
+ # QLoRA paper Table 9
+ # - 16 for 7b & 13b
+ # - 32 for 33b, 64 for 65b
+ # Max size tested on A6000
+ # - 7b: 40
+ # - 40b: 4
+ # decrease if OOM, increase for max VRAM utilization
+ micro_batch_size: 1
+ gradient_accumulation_steps: 2
+ num_epochs: 3
+ # Optimizer for QLoRA
+ optimizer: paged_adamw_32bit
+ torchdistx_path:
+ lr_scheduler: cosine
+ # QLoRA paper Table 9
+ # - 2e-4 for 7b & 13b
+ # - 1e-4 for 33b & 65b
+ learning_rate: 0.0002
+ train_on_inputs: false
+ group_by_length: false
+ bf16: true
+ fp16: false
+ tf32: true
+ gradient_checkpointing: true
+ # stop training after this many evaluation losses have increased in a row
+ # https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
+ early_stopping_patience: 3
+ resume_from_checkpoint:
+ auto_resume_from_checkpoints: true
+ local_rank:
+ logging_steps: 1
+ xformers_attention: true
+ flash_attention:
+ gptq_groupsize:
+ gptq_model_v1:
+ warmup_steps: 10
+ eval_steps: 5
+ save_steps: 10
+ debug:
+ deepspeed:
+ weight_decay: 0.000001
+ fsdp:
+ fsdp_config:
+ special_tokens:
+   pad_token: "<|endoftext|>"
+   bos_token: ">>ABSTRACT<<"
+   eos_token: "<|endoftext|>"
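
Note: the keys above are axolotl config fields, not transformers arguments. As a rough orientation only, the following hypothetical sketch shows approximately what the quantization and adapter settings correspond to in plain transformers + peft + bitsandbytes terms. It is not the repository's actual loading code, and the falcon target-module names listed are an assumption; in the config, lora_target_linear: true lets the trainer find all linear layers automatically.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# load_in_4bit: true -> 4-bit NF4 quantization (QLoRA), computing in bf16 (bf16: true)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# base_model / trust_remote_code: falcon-7b ships custom modeling code on the Hub
model = AutoModelForCausalLM.from_pretrained(
    "tiiuae/falcon-7b",
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b", trust_remote_code=True)

# prepare the quantized model for training (casts norms, enables input gradients)
model = prepare_model_for_kbit_training(model)

# lora_r / lora_alpha / lora_dropout, applied to the base model's linear layers
# (the module names below are an assumption for falcon's architecture)
lora_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"],
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In practice the config is consumed by the repository's own entry point rather than code like the above; at the time that looked something like accelerate launch scripts/finetune.py examples/falcon/config-7b-qlora.yml, though the exact command may differ by version.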