winglian committed
Commit dcdec44 • 2 parents: 3ffb018 3881143

Merge pull request #306 from ethanhs/xgen


Add XGen info to README and example config

Files changed (2)
  1. README.md +1 -0
  2. examples/xgen-7b/xgen-7b-8k-qlora.yml +90 -0
README.md CHANGED
@@ -24,6 +24,7 @@
  | mpt | ✅ | ❌ | ❓ | ❌ | ❓ | ❌ | ❌ | ❓ |
  | falcon | ✅ | ✅ | ✅ | ❌ | ❓ | ❌ | ❌ | ✅ |
  | gpt-j | ✅ | ✅ | ✅ | ❌ | ❓ | ❌ | ❓ | ✅ |
+ | XGen | ✅ | ❓ | ✅ | ❓ | ❓ | ❓ | ❓ | ✅ |


  ## Quickstart ⚡
examples/xgen-7b/xgen-7b-8k-qlora.yml ADDED
@@ -0,0 +1,90 @@
+ # An example of finetuning Salesforce's XGen-7B model with 8k context using QLoRA
+ # on Tim Dettmers' Guanaco dataset.
+ base_model: Salesforce/xgen-7b-8k-base
+ base_model_config: Salesforce/xgen-7b-8k-base
+ trust_remote_code: true
+ model_type: AutoModelForCausalLM
+ tokenizer_type: AutoTokenizer
+ load_in_8bit: false
+ # enable 4bit for QLoRA
+ load_in_4bit: true
+ gptq: false
+ strict: false
+ push_dataset_to_hub:
+ datasets:
+   - path: timdettmers/openassistant-guanaco
+     data_files:
+       - openassistant_best_replies_train.jsonl
+     type: "completion"
+ dataset_prepared_path: last_run_prepared
+ val_set_size: 0.01
+ # enable QLoRA
+ adapter: qlora
+ lora_model_dir:
+ sequence_len: 8192
+ max_packed_sequence_len:
+
+ # hyperparameters from QLoRA paper Appendix B.2
+ # "We find hyperparameters to be largely robust across datasets"
+ lora_r: 64
+ lora_alpha: 16
+ # 0.1 for models up to 13B
+ # 0.05 for 33B and 65B models
+ lora_dropout: 0.05
+ # add LoRA modules on all linear layers of the base model
+ lora_target_modules:
+ lora_target_linear: true
+ lora_fan_in_fan_out:
+
+ wandb_project:
+ wandb_watch:
+ wandb_run_id:
+ wandb_log_model:
+ output_dir: ./qlora-out
+
+ # QLoRA paper Table 9
+ # - 16 for 7b & 13b
+ # - 32 for 33b, 64 for 65b
+ # Max size tested on A6000
+ # - 7b: 40
+ # - 40b: 4
+ # decrease if OOM, increase for max VRAM utilization
+ micro_batch_size: 1
+ gradient_accumulation_steps: 1
+ num_epochs: 3
+ # Optimizer for QLoRA
+ optimizer: paged_adamw_32bit
+ torchdistx_path:
+ lr_scheduler: cosine
+ # QLoRA paper Table 9
+ # - 2e-4 for 7b & 13b
+ # - 1e-4 for 33b & 65b
+ learning_rate: 0.00002
+ train_on_inputs: false
+ group_by_length: false
+ bf16: true
+ fp16: false
+ tf32: false
+ gradient_checkpointing: true
+ # stop training after this many evaluation losses have increased in a row
+ # https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
+ early_stopping_patience: 3
+ resume_from_checkpoint:
+ auto_resume_from_checkpoints: true
+ local_rank:
+ logging_steps: 1
+ xformers_attention: true
+ flash_attention:
+ gptq_groupsize:
+ gptq_model_v1:
+ warmup_steps: 10
+ eval_steps: 50
+ save_steps: 50
+ debug:
+ deepspeed:
+ weight_decay: 0.0
+ special_tokens:
+   eos_token: "<|endoftext|>"
+   bos_token: "<|endoftext|>"
+   unk_token: "<|endoftext|>"
+   pad_token: "<|endoftext|>"
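
For readers who want to see what the key fields above amount to outside of axolotl, here is a minimal, hand-written Python sketch of roughly the same setup built directly on transformers, peft, and bitsandbytes. It is not how axolotl consumes this YAML; in particular, `device_map="auto"`, the `prepare_model_for_kbit_training` call, and the way LoRA target modules are discovered are assumptions made for illustration, not values taken from the config.

```python
# Rough sketch only: what load_in_4bit / lora_* / special_tokens in
# xgen-7b-8k-qlora.yml roughly correspond to when wired up by hand.
# This is NOT axolotl's internal loading code.
import torch
import bitsandbytes as bnb
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

BASE = "Salesforce/xgen-7b-8k-base"  # base_model / base_model_config

# load_in_4bit: true -> keep the frozen base weights in 4-bit (QLoRA);
# bf16: true -> do the compute in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# XGen ships custom tokenizer/model code, hence trust_remote_code: true.
tokenizer = AutoTokenizer.from_pretrained(BASE, trust_remote_code=True)
# special_tokens: the config points eos/bos/unk/pad all at "<|endoftext|>",
# which is expected to already be in XGen's vocabulary (no embedding resize).
tokenizer.add_special_tokens({
    "eos_token": "<|endoftext|>",
    "bos_token": "<|endoftext|>",
    "unk_token": "<|endoftext|>",
    "pad_token": "<|endoftext|>",
})

model = AutoModelForCausalLM.from_pretrained(
    BASE,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map="auto",  # assumption; not part of the YAML
)
# Casts norms/head to full precision and enables gradient checkpointing,
# mirroring gradient_checkpointing: true.
model = prepare_model_for_kbit_training(model)

# lora_target_linear: true -> attach LoRA to every quantized linear layer
# of the base model (lm_head is left unquantized and therefore skipped).
target_modules = sorted({
    name.split(".")[-1]
    for name, module in model.named_modules()
    if isinstance(module, bnb.nn.Linear4bit)
})

lora_config = LoraConfig(
    r=64,               # lora_r
    lora_alpha=16,      # lora_alpha
    lora_dropout=0.05,  # lora_dropout
    target_modules=target_modules,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
```

In practice the YAML is consumed by axolotl itself rather than by code like the above; the intended workflow is to pass `examples/xgen-7b/xgen-7b-8k-qlora.yml` to the repository's usual `accelerate launch` training entry point (the exact script path is not shown in this diff).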