winglian committed
Commit 62eaee7 (1 parent: be75668)

make phi training work with Loras (#588)


* validation for phi loras

* fix model config class check

* update readme for phi training

examples/phi/README.md CHANGED
@@ -1,7 +1,11 @@
  # Phi
  
- Due to some nuances with the phi code, please use deepspeed when training phi.
+ Due to some nuances with the phi code, please use deepspeed when training phi for full finetune.
  
  ```shell
- accelerate launch scripts/finetune.py examples/phi/phi-ft.yml --deepspeed deepspeed/zero1.json
+ accelerate launch -m axolotl.cli.train examples/phi/phi-ft.yml --deepspeed deepspeed/zero1.json
+
+ # OR
+
+ python -m axolotl.cli.train examples/phi/phi-qlora.yml
  ```
examples/phi/phi-qlora.yml ADDED
@@ -0,0 +1,75 @@
+ base_model: microsoft/phi-1_5
+ base_model_config: microsoft/phi-1_5
+ model_type: AutoModelForCausalLM
+ tokenizer_type: AutoTokenizer
+ is_llama_derived_model: false
+ trust_remote_code: true
+
+ load_in_8bit: false
+ load_in_4bit: true
+ strict: false
+
+ datasets:
+   - path: garage-bAInd/Open-Platypus
+     type: alpaca
+
+ dataset_prepared_path: last_run_prepared
+ val_set_size: 0.05
+ output_dir: ./phi-sft-out
+
+ sequence_len: 1024
+ sample_packing: false  # not CURRENTLY compatible with LoRAs
+ pad_to_sequence_len:
+
+ adapter: qlora
+ lora_model_dir:
+ lora_r: 64
+ lora_alpha: 32
+ lora_dropout: 0.05
+ lora_target_linear: true
+ lora_fan_in_fan_out:
+
+ wandb_project:
+ wandb_entity:
+ wandb_watch:
+ wandb_run_id:
+ wandb_log_model:
+
+ gradient_accumulation_steps: 1
+ micro_batch_size: 1
+ num_epochs: 4
+ optimizer: adamw_torch
+ adam_beta2: 0.95
+ adam_epsilon: 0.00001
+ max_grad_norm: 1.0
+ lr_scheduler: cosine
+ learning_rate: 0.000003
+
+ train_on_inputs: false
+ group_by_length: true
+ bf16: true
+ fp16: false
+ tf32: true
+
+ gradient_checkpointing:
+ early_stopping_patience:
+ resume_from_checkpoint:
+ local_rank:
+ logging_steps: 1
+ xformers_attention:
+ flash_attention:
+
+ warmup_steps: 100
+ eval_steps: 0.05
+ save_steps:
+ debug:
+ deepspeed:
+ weight_decay: 0.1
+ fsdp:
+ fsdp_config:
+ resize_token_embeddings_to_32x: true
+ special_tokens:
+   bos_token: "<|endoftext|>"
+   eos_token: "<|endoftext|>"
+   unk_token: "<|endoftext|>"
+   pad_token: "<|endoftext|>"
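
For readers who want to see what this QLoRA config roughly corresponds to under the hood, the sketch below approximates it with transformers + peft directly. It is a hedged, standalone illustration that mirrors the yml values (r=64, alpha=32, dropout=0.05, 4-bit load, bf16 compute, all linear layers targeted); axolotl builds this from the config itself, so none of the code below is part of the commit.

```python
import torch
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Load phi-1_5 in 4-bit (load_in_4bit: true) with bf16 compute (bf16: true).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-1_5",
    trust_remote_code=True,          # trust_remote_code: true
    quantization_config=bnb_config,
)
model = prepare_model_for_kbit_training(model)

# Attach a LoRA adapter to every linear layer (lora_target_linear: true),
# with the same rank / alpha / dropout as the yml above.
lora_config = LoraConfig(
    r=64,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules="all-linear",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
```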
src/axolotl/utils/config.py CHANGED
@@ -75,6 +75,7 @@ def normalize_config(cfg):
          cfg.torch_dtype = torch.float32
  
      model_config = load_model_config(cfg)
+     cfg.model_config_type = model_config.model_type
  
      # figure out if the model is llama
      cfg.is_llama_derived_model = (
@@ -237,6 +238,21 @@ def validate_config(cfg):
              raise ValueError(
                  "`early_stopping_patience` requires that eval_steps should evenly divide save_steps."
              )
+
+     if cfg.model_type == "MixFormerSequentialForCausalLM" and cfg.adapter is not None:
+         LOG.warning("Use AutoModelForCausalLM for phi/MixFormer models with qLoRA")
+
+     if cfg.model_config_type == "mixformer-sequential":
+         if cfg.sample_packing:
+             if cfg.adapter is not None:
+                 LOG.warning(
+                     "phi/MixFormer models are not currently compatible with LoRA and sample_packing"
+                 )
+             if cfg.model_type == "AutoModelForCausalLM":
+                 raise ValueError(
+                     "`model_type: MixFormerSequentialForCausalLM` required for sample_packing"
+                 )
+
      # TODO
      # MPT 7b
      # https://github.com/facebookresearch/bitsandbytes/issues/25
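
The validation above compares two different strings: `cfg.model_type` (what the user sets in the yml) and the new `cfg.model_config_type` (the `model_type` reported by the model's config). A quick sanity check of what phi-1_5's remote config reports, which is what the `mixformer-sequential` branch matches on, might look like the following; this is not part of the commit, just a hedged illustration.

```python
from transformers import AutoConfig

# Load phi-1_5's (remote) config and inspect the fields the new
# validation logic compares against.
config = AutoConfig.from_pretrained("microsoft/phi-1_5", trust_remote_code=True)

print(config.model_type)           # "mixformer-sequential" at the time of this commit
print(config.__class__.__name__)   # "MixFormerSequentialConfig"
```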
src/axolotl/utils/models.py CHANGED
@@ -1,6 +1,5 @@
  """Module for models and model loading"""
-
-
+ import importlib
  import logging
  import math
  import os
@@ -155,11 +154,26 @@ def load_model(
          LOG.info("patching _expand_mask")
          hijack_expand_mask()
  
+     model_config = load_model_config(cfg)
+
+     # special handling b/c remote MixFormers code doesn't have _no_split_modules set
+     if (
+         "MixFormerSequentialConfig" in model_config.__class__.__name__
+         and cfg.model_type == "AutoModelForCausalLM"
+     ):
+         module_name = model_config.__class__.__module__.replace(
+             ".configuration_mixformer_sequential", ".modeling_mixformer_sequential"
+         )
+         modeling_phi = importlib.import_module(module_name)
+         # pylint:disable=protected-access
+         modeling_phi.MixFormerSequentialForCausalLM._no_split_modules = [
+             "ParallelBlock"
+         ]
+
      model_kwargs = {}
      if cfg.model_revision:
          model_kwargs["revision"] = cfg.model_revision
      if cfg.gptq:
-         model_config = load_model_config(cfg)
          if not hasattr(model_config, "quantization_config"):
              LOG.warning("model config does not contain quantization_config information")
          else:
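
`_no_split_modules` is what accelerate consults when sharding a model with a `device_map`; marking `ParallelBlock` as non-splittable keeps each phi block on a single device, which the remote MixFormer code does not declare on its own. Below is a standalone sketch of the same monkey-patch outside axolotl, assuming the phi-1_5 remote code layout referenced in the hunk above; it mirrors the commit's logic rather than adding anything new.

```python
import importlib

from transformers import AutoConfig, AutoModelForCausalLM

# Loading the config with trust_remote_code downloads and registers the
# remote MixFormer modules, so they can be resolved via importlib below.
config = AutoConfig.from_pretrained("microsoft/phi-1_5", trust_remote_code=True)

# Swap the configuration module path for the modeling module path,
# the same way load_model() does above.
module_name = config.__class__.__module__.replace(
    ".configuration_mixformer_sequential", ".modeling_mixformer_sequential"
)
modeling_phi = importlib.import_module(module_name)

# Mark ParallelBlock as non-splittable before accelerate builds a device_map.
modeling_phi.MixFormerSequentialForCausalLM._no_split_modules = ["ParallelBlock"]

model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-1_5", trust_remote_code=True, device_map="auto"
)
```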