winglian committed
Commit effbbf6 • 1 Parent(s): c530e4b

more pruning

configs/cerebras_1_3B_alpaca.yml DELETED
@@ -1,40 +0,0 @@
-base_model: cerebras/Cerebras-GPT-1.3B
-model_type: AutoModelForCausalLM
-tokenizer_type: AutoTokenizer
-load_in_8bit: true
-datasets:
-  - path: data/alpaca_data_gpt4.jsonl
-    type: alpaca
-  - path: data/vicuna_cleaned.jsonl
-    type: sharegpt
-  - path: data/gpt4-instruct-similarity-0.6-dataset.jsonl
-    type: gpteacher
-  - path: data/roleplay-similarity_0.6-instruct-dataset.jsonl
-    type: gpteacher
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.05
-adapter: lora
-sequence_len: 2048
-lora_r: 8
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_modules:
-  - c_attn
-lora_fan_in_fan_out: false
-wandb_project: pythia-1.4b-lora
-wandb_watch:
-wandb_run_id:
-wandb_log_model:
-output_dir: ./lora-alpaca
-gradient_accumulation_steps: 1
-micro_batch_size: 4
-num_epochs: 5
-learning_rate: 0.0003
-train_on_inputs: false
-group_by_length: false
-bf16: True
-tf32: True
-gradient_checkpointing:
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
 
configs/galactica_1_3B.yml DELETED
@@ -1,41 +0,0 @@
-base_model: facebook/galactica-1.3b
-model_type: AutoModelForCausalLM
-tokenizer_type: AutoTokenizer
-load_in_8bit: false
-datasets:
-  - path: tatsu-lab/alpaca
-    type: alpaca
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.1
-adapter:
-lora_model_dir:
-sequence_len: 1024
-max_packed_sequence_len: 1024
-lora_r: 8
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_modules:
-  - q_proj
-  - v_proj
-lora_fan_in_fan_out: false
-wandb_project:
-wandb_watch:
-wandb_run_id:
-wandb_log_model:
-output_dir: ./lora-llama-alpaca
-gradient_accumulation_steps: 1
-micro_batch_size: 16
-num_epochs: 3
-learning_rate: 0.00003
-train_on_inputs: false
-group_by_length: false
-bf16: false
-tf32: false
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
-tokens:
-  pad_token: "[PAD]"
-  bos_token: "<s>"
-  eos_token: "</s>"
-  unk_token: "<unk>"
 
configs/gpt_neox_20b.yml DELETED
@@ -1,39 +0,0 @@
-base_model: EleutherAI/gpt-neox-20b
-base_model_ignore_patterns: pytorch* # prefer safetensors
-model_type: GPTNeoXForCausalLM
-tokenizer_type: AutoTokenizer
-load_in_8bit: true
-datasets:
-  - path: nomic-ai/gpt4all-j-prompt-generations
-    type: alpaca
-    shards: 4
-    shards_index: 0
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.05
-adapter: lora
-lora_model_dir:
-sequence_len: 2048
-max_packed_sequence_len: 2048
-lora_r: 8
-lora_alpha: 32
-lora_dropout: 0.05
-lora_target_modules:
-  - query_key_value
-lora_fan_in_fan_out: true # pythia/GPTNeoX lora specific
-wandb_project: gpt4all-neox-20b
-wandb_watch:
-wandb_run_id:
-wandb_log_model:
-output_dir: ./gpt4all-neox-20b
-gradient_accumulation_steps: 1
-micro_batch_size: 4
-num_epochs: 5
-learning_rate: 0.00003
-lr_scheduler: one_cycle
-train_on_inputs: false
-group_by_length: false
-bf16: True
-tf32: True
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
 
configs/stability_3b.yml → examples/cerebras/qlora.yml RENAMED
@@ -1,38 +1,45 @@
-base_model: stabilityai/stablelm-base-alpha-3b
-base_model_config: stabilityai/stablelm-base-alpha-3b
+base_model: cerebras/Cerebras-GPT-1.3B
+base_model_config: cerebras/Cerebras-GPT-1.3B
 load_in_8bit: false
+load_in_4bit: true
+strict: false
+push_dataset_to_hub:
 datasets:
-  - path: vicgalle/alpaca-gpt4
+  - path: teknium/GPT4-LLM-Cleaned
     type: alpaca
 dataset_prepared_path: last_run_prepared
-val_set_size: 0.04
-adapter:
+val_set_size: 0.01
+adapter: qlora
 lora_model_dir:
-sequence_len: 4096
-max_packed_sequence_len: 4096
-lora_r: 8
-lora_alpha: 16
+sequence_len: 2048
+max_packed_sequence_len: 2048
+lora_r: 16
+lora_alpha: 32
 lora_dropout: 0.05
 lora_target_modules:
-  - q_proj
-  - v_proj
-lora_fan_in_fan_out: false
-wandb_project: stable-alpaca-3b
+  - c_fc
+  - c_attn
+  - c_proj
+lora_target_linear:
+lora_fan_in_fan_out:
+wandb_project:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
-output_dir: ./stable-alpaca-3b
-gradient_accumulation_steps: 1
-micro_batch_size: 1
-num_epochs: 1
-optimizer: adamw_bnb_8bit
+output_dir: ./qlora-out
+batch_size: 4
+micro_batch_size: 4
+num_epochs: 2
+optimizer: paged_adamw_8bit
 torchdistx_path:
 lr_scheduler: cosine
-learning_rate: 0.0000002
+learning_rate: 0.0002
 train_on_inputs: false
-group_by_length: false
+group_by_length: true
 bf16: true
+fp16: false
 tf32: true
+gradient_checkpointing: true
 early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
@@ -41,16 +48,13 @@ xformers_attention: true
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_steps: 100
-eval_steps: 50
-save_steps: 200
+warmup_steps: 10
+eval_steps: 20
+save_steps:
 debug:
 deepspeed:
-weight_decay: 0.01
+weight_decay: 0.1
 fsdp:
 fsdp_config:
-#tokens:
-# pad_token: "[PAD]"
-# bos_token: "<s>"
-# eos_token: "</s>"
-# unk_token: "<unk>"
+special_tokens:
+  pad_token: "<|endoftext|>"
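
For reference, a minimal sketch of reading the relocated QLoRA example config and printing the fields this rename changes. This is illustrative only, not part of the commit; it assumes PyYAML is installed and that it is run from the repository root where examples/cerebras/qlora.yml now lives.

```python
import yaml  # assumes PyYAML is available in the environment

# Load the renamed example config from the path introduced by this commit.
with open("examples/cerebras/qlora.yml") as f:
    cfg = yaml.safe_load(f)

# Fields that distinguish the new QLoRA setup (values should match the diff above).
for key in ("base_model", "adapter", "load_in_4bit", "lora_r", "lora_alpha",
            "lora_target_modules", "optimizer", "learning_rate"):
    print(f"{key}: {cfg.get(key)}")
```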