winglian committed
Commit b2430ce
1 Parent(s): 4c834bf

use accelerate logging for zero/main logging only

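The change swaps the stdlib logger for accelerate's logging adapter: get_logger() wraps a standard logging logger, and each call can pass main_process_only so the record is emitted only on the main (rank-zero) process, which is what replaces the zero_only() context manager removed below. A minimal, self-contained sketch of the pattern (the logger name and messages are illustrative, not taken from axolotl):

import logging

from accelerate import Accelerator
from accelerate.logging import get_logger

# Normal logging configuration still applies: get_logger() wraps
# logging.getLogger(name) in a multi-process-aware adapter.
logging.basicConfig(level=logging.DEBUG)

accelerator = Accelerator()  # initializes the process state the adapter consults
LOG = get_logger("demo")

# Emitted only by the main (rank-zero) process in a distributed run;
# in a single-process run it behaves like an ordinary logger call.
LOG.debug("loading tokenizer...", main_process_only=True)

# Emitted by every process, e.g. for per-rank diagnostics.
LOG.debug("per-rank message", main_process_only=False)

Launched with accelerate launch --num_processes 2, the first message should appear once and the second once per rank.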
src/axolotl/train.py CHANGED
@@ -1,6 +1,5 @@
 """Prepare and train a model on a dataset. Can also infer from a model or merge lora"""
 
-import logging
 import os
 import signal
 import sys
@@ -10,6 +9,7 @@ from typing import Optional
 
 import torch
 import transformers.modelcard
+from accelerate.logging import get_logger
 from datasets import Dataset
 from optimum.bettertransformer import BetterTransformer
 from transformers.deepspeed import is_deepspeed_zero3_enabled
@@ -18,7 +18,6 @@ from axolotl.common.cli import TrainerCliArgs
 from axolotl.logging_config import configure_logging
 from axolotl.monkeypatch import neft_embeddings
 from axolotl.utils.dict import DictDefault
-from axolotl.utils.distributed import zero_only
 from axolotl.utils.models import load_model, load_tokenizer
 from axolotl.utils.trainer import setup_trainer
 
@@ -27,7 +26,7 @@ src_dir = os.path.join(project_root, "src")
 sys.path.insert(0, src_dir)
 
 configure_logging()
-LOG = logging.getLogger("axolotl.train")
+LOG = get_logger("axolotl.train")
 
 
 @dataclass
@@ -45,10 +44,10 @@ def train(
     *, cfg: DictDefault, cli_args: TrainerCliArgs, dataset_meta: TrainDatasetMeta
 ):
     # load the tokenizer first
-    with zero_only():
-        LOG.debug(
-            f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}"
-        )
+    LOG.debug(
+        f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}",
+        main_process_only=True,
+    )
     tokenizer = load_tokenizer(cfg)
 
     train_dataset = dataset_meta.train_dataset
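Since the accelerate adapter forwards records to the underlying standard logger, the existing configure_logging() call above still controls handlers and levels; passing main_process_only=True explicitly appears to restate the adapter's default behavior, presumably for readability.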
src/axolotl/utils/trainer.py CHANGED
@@ -1,5 +1,4 @@
 """Module containing the Trainer class and related functions"""
-import logging
 import math
 import os
 from contextlib import contextmanager
@@ -10,6 +9,7 @@ import numpy as np
 import torch
 import torch.cuda
 import torch.distributed as dist
+from accelerate.logging import get_logger
 from datasets import set_caching_enabled
 from torch.utils.data import DistributedSampler, RandomSampler
 
@@ -21,10 +21,9 @@ from axolotl.utils.distributed import (
     is_main_process,
     reduce_and_broadcast,
     zero_first,
-    zero_only,
 )
 
-LOG = logging.getLogger("axolotl")
+LOG = get_logger("axolotl")
 
 
 @torch.jit.script
@@ -160,8 +159,7 @@ def calculate_total_num_steps(cfg, train_dataset, tokenizer):
             .apply(lambda x: len(x))  # pylint: disable=unnecessary-lambda
             .values
         )
-        with zero_only():
-            LOG.debug(f"total_num_tokens: {total_num_tokens}")
+        LOG.debug(f"total_num_tokens: {total_num_tokens}", main_process_only=True)
         cfg.total_num_tokens = total_num_tokens
 
     if not cfg.total_supervised_tokens:
@@ -171,8 +169,10 @@ def calculate_total_num_steps(cfg, train_dataset, tokenizer):
             .apply(lambda x: np.sum(np.array(x) != -100))
             .sum()
         )
-        with zero_only():
-            LOG.debug(f"`total_supervised_tokens: {total_supervised_tokens}`")
+        LOG.debug(
+            f"`total_supervised_tokens: {total_supervised_tokens}`",
+            main_process_only=True,
+        )
         cfg.total_supervised_tokens = total_supervised_tokens
 
     if cfg.sample_packing_eff_est:
@@ -191,10 +191,10 @@ def calculate_total_num_steps(cfg, train_dataset, tokenizer):
             )
             * cfg.num_epochs
         )
-        with zero_only():
-            LOG.debug(
-                f"total_num_tokens: {cfg.total_num_tokens}, total_num_steps: {total_num_steps}"
-            )
+        LOG.debug(
+            f"total_num_tokens: {cfg.total_num_tokens}, total_num_steps: {total_num_steps}",
+            main_process_only=True,
+        )
     else:
         if cfg.world_size > 1 and is_distributed():
             sampler = DistributedSampler(
@@ -223,8 +223,7 @@ def calculate_total_num_steps(cfg, train_dataset, tokenizer):
             )
             data_loader_len = data_loader.len_w_stats()
             actual_eff = data_loader.efficiency()
-            with zero_only():
-                LOG.debug(f"data_loader_len: {data_loader_len}")
+            LOG.debug(f"data_loader_len: {data_loader_len}", main_process_only=True)
             # FIXME: is there a bug here somewhere? the total num steps depends
             # on the agreed on value for sample_packing_eff_est
             total_num_steps = int(math.floor(data_loader_len * cfg.num_epochs))
@@ -241,14 +240,15 @@ def calculate_total_num_steps(cfg, train_dataset, tokenizer):
                 math.ceil(sample_packing_actual_eff_all * 100.0) / 100.0
             )
             cfg.sample_packing_eff_est = sample_packing_eff_est
-            with zero_only():
-                LOG.debug(f"sample_packing_eff_est: {cfg.sample_packing_eff_est}")
+            LOG.debug(
+                f"sample_packing_eff_est: {cfg.sample_packing_eff_est}",
+                main_process_only=True,
+            )
         else:
             total_num_steps = int(
                 math.ceil(len(train_dataset) * cfg.num_epochs / cfg.batch_size)
            )
-            with zero_only():
-                LOG.debug(f"total_num_steps: {total_num_steps}")
+            LOG.debug(f"total_num_steps: {total_num_steps}", main_process_only=True)
     return total_num_steps
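Worth noting: the quantities logged here (total_num_tokens, data_loader_len, and so on) are still computed on every rank; main_process_only=True only suppresses the duplicate log lines. If a per-rank value ever needs inspecting, the same adapter can log from all processes by passing main_process_only=False, and accelerate versions that support it also accept in_order=True to serialize that output by rank.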