winglian committed
Commit ab5cd28
1 Parent(s): 1a82082

more gpt-neox long ctx fixes

src/axolotl/utils/callbacks.py CHANGED
@@ -61,6 +61,7 @@ class SaveBetterTransformerModelCallback(

            model = BetterTransformer.reverse(kwargs["model"])
            model.save_pretrained(checkpoint_folder)
+            # FIXME - need to cleanup old checkpoints

            # since we're saving here, we don't need the trainer loop to attempt to save too b/c
            # the trainer will raise an exception since it can't save a BetterTransformer wrapped model
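The new FIXME flags that, because the callback now saves the reversed model itself, stale `checkpoint-*` folders are never pruned. As a purely illustrative sketch (not part of this commit), a cleanup helper could look like the following, assuming the Trainer's default `checkpoint-<global_step>` directory naming and a hypothetical `keep_last` cap:

```python
import os
import re
import shutil


def prune_old_checkpoints(output_dir: str, keep_last: int = 2) -> None:
    """Hypothetical helper: delete all but the newest `keep_last` checkpoints.

    Assumes the usual "checkpoint-<global_step>" folder naming; keep_last must be >= 1.
    """
    pattern = re.compile(r"^checkpoint-(\d+)$")
    found = []
    for name in os.listdir(output_dir):
        match = pattern.match(name)
        if match and os.path.isdir(os.path.join(output_dir, name)):
            found.append((int(match.group(1)), name))

    # oldest first; everything before the last `keep_last` entries gets removed
    found.sort()
    for _step, name in found[:-keep_last]:
        shutil.rmtree(os.path.join(output_dir, name))
```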
src/axolotl/utils/data.py CHANGED
@@ -388,9 +388,13 @@ def load_prepare_datasets(
            index=cfg.dataset_shard_idx,
        )

-    dataset = dataset.train_test_split(test_size=cfg.val_set_size, shuffle=False)
-    train_dataset = dataset["train"]
-    eval_dataset = dataset["test"]
+    if cfg.val_set_size:
+        dataset = dataset.train_test_split(test_size=cfg.val_set_size, shuffle=False)
+        train_dataset = dataset["train"]
+        eval_dataset = dataset["test"]
+    else:
+        train_dataset = dataset
+        eval_dataset = None

    return train_dataset, eval_dataset

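To see the two behaviours of the new branch in isolation, here is a small self-contained sketch built on a toy `datasets.Dataset` (the `split_for_eval` helper is invented for illustration, not axolotl's API): a non-zero `val_set_size` yields an unshuffled train/test split, while a falsy value trains on everything and returns `eval_dataset = None` so evaluation is skipped.

```python
from typing import Optional, Tuple

from datasets import Dataset


def split_for_eval(
    dataset: Dataset, val_set_size: float
) -> Tuple[Dataset, Optional[Dataset]]:
    """Mirror of the new logic: only split when a validation size is configured."""
    if val_set_size:
        split = dataset.train_test_split(test_size=val_set_size, shuffle=False)
        return split["train"], split["test"]
    # no validation requested: train on everything, disable eval
    return dataset, None


if __name__ == "__main__":
    toy = Dataset.from_dict({"input_ids": [[i] for i in range(10)]})

    train, evals = split_for_eval(toy, val_set_size=0.2)
    print(len(train), len(evals))  # 8 2

    train, evals = split_for_eval(toy, val_set_size=0)
    print(len(train), evals)  # 10 None
```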
src/axolotl/utils/models.py CHANGED
@@ -300,6 +300,12 @@ def load_model(
    embeddings_len = math.ceil(len(tokenizer) / 32) * 32
    model.resize_token_embeddings(embeddings_len)

+    if cfg.sequence_len >= model.config.max_position_embeddings:
+        logging.warning(
+            f"increasing model.config.max_position_embeddings to {cfg.sequence_len}"
+        )
+        model.config.max_position_embeddings = cfg.sequence_len
+
    if not cfg.gptq and (
        (cfg.adapter == "lora" and load_in_8bit)
        or (cfg.adapter == "qlora" and cfg.load_in_4bit)
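The added block widens the configured context window whenever the requested `sequence_len` is at least `max_position_embeddings`. A minimal sketch of the same idea against a bare `transformers` config (the model name and target length are assumptions for illustration; only the config value changes, the weights are untouched):

```python
import logging

from transformers import AutoConfig

# assumptions for illustration: a GPT-NeoX checkpoint and a target sequence length
model_name = "EleutherAI/pythia-410m"
sequence_len = 8192

config = AutoConfig.from_pretrained(model_name)

if sequence_len >= config.max_position_embeddings:
    logging.warning(
        "increasing max_position_embeddings from %d to %d",
        config.max_position_embeddings,
        sequence_len,
    )
    # note: this only changes the config value; the architecture's position
    # handling (e.g. rotary embeddings) still has to cope with longer
    # sequences at runtime
    config.max_position_embeddings = sequence_len

print(config.max_position_embeddings)
```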
src/axolotl/utils/validation.py CHANGED
@@ -80,4 +80,11 @@ def validate_config(cfg):
    # TODO
    # MPT 7b
    # https://github.com/facebookresearch/bitsandbytes/issues/25
    # no 8bit adamw w bf16
+
+    # GPT-NeoX
+    # evals broken when extending context len
+    # File "/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py", line 162, in forward attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)
+    # File "/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/optimum/bettertransformer/models/attention.py", line 74, in gpt2_wrapped_scaled_dot_product
+    # attention_mask = causal_mask + attention_mask
+    # RuntimeError: The size of tensor a (2048) must match the size of tensor b (8132) at non-singleton dimension 3
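The traceback boils down to a broadcast failure: the causal mask is still sized for the original 2048-token context, while the padded `attention_mask` follows the extended 8132-token sequence. A tiny PyTorch sketch with made-up tensors (not the optimum/transformers code itself) reproduces the same error:

```python
import torch

orig_ctx = 2048      # causal mask baked in at the original context length
extended_ctx = 8132  # sequence length after extending the context window

# lower-triangular causal mask, shaped like a GPT-2 style attention bias buffer
causal_mask = torch.tril(torch.ones(1, 1, orig_ctx, orig_ctx))

# padding mask for the longer sequence, broadcast over the last dimension
attention_mask = torch.zeros(1, 1, 1, extended_ctx)

try:
    combined = causal_mask + attention_mask
except RuntimeError as err:
    # RuntimeError: The size of tensor a (2048) must match the size of
    # tensor b (8132) at non-singleton dimension 3
    print(err)
```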