winglian committed on
Commit
31b9e0c
•
1 Parent(s): 6b9b229

minor tweaks to simplify (#597)

src/axolotl/utils/tokenization.py CHANGED
@@ -18,21 +18,16 @@ def check_example_labels(example, tokenizer, text_only=False):
     # Get the input_ids, labels, and attention_mask from the dataset
     input_ids = example["input_ids"]
     labels = example["labels"]
-    attention_mask = example["attention_mask"]
 
     # You can compare the input_ids and labels element-wise
     # Remember to ignore positions with IGNORE_TOKEN_ID (if you use it) or attention_mask equal to 0
     colored_tokens = []
-    for _, (input_id, label_id, mask) in enumerate(
-        zip(input_ids, labels, attention_mask)
-    ):
+    for _, (input_id, label_id) in enumerate(zip(input_ids, labels)):
         decoded_input_token = tokenizer.decode(input_id)
         # Choose the color based on whether the label has the ignore value or not
         color = "red" if label_id == -100 else ("yellow" if label_id == 0 else "green")
         colored_token = colored(decoded_input_token, color) + (
-            not text_only
-            and colored(f"({label_id}, {mask}, {input_id})", "white")
-            or ""
+            not text_only and colored(f"({label_id}, {input_id})", "white") or ""
         )
         colored_tokens.append(colored_token)
 
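After this change, check_example_labels colors each token from input_ids and labels alone, with attention_mask dropped from both the loop and the printed tuple. As a rough illustration of the resulting behavior, here is a self-contained sketch; show_labels and ToyTokenizer are hypothetical stand-ins (not part of the commit), and termcolor's colored is assumed as in the file above:

    # Hypothetical stand-alone sketch of the simplified loop above.
    from termcolor import colored


    class ToyTokenizer:
        """Minimal stand-in for a HF tokenizer; only decode() is used here."""

        vocab = {1: "Hello", 2: "world", 3: "!"}

        def decode(self, input_id):
            return self.vocab.get(input_id, "<unk>")


    def show_labels(example, tokenizer, text_only=False):
        colored_tokens = []
        for input_id, label_id in zip(example["input_ids"], example["labels"]):
            decoded = tokenizer.decode(input_id)
            # red = ignored (-100), yellow = label 0, green = supervised token
            color = "red" if label_id == -100 else ("yellow" if label_id == 0 else "green")
            suffix = "" if text_only else colored(f"({label_id}, {input_id})", "white")
            colored_tokens.append(colored(decoded, color) + suffix)
        print(" ".join(colored_tokens))


    # "Hello" prints red (masked out), "world" green (trained on), "!" yellow (label 0)
    show_labels({"input_ids": [1, 2, 3], "labels": [-100, 2, 0]}, ToyTokenizer())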
src/axolotl/utils/trainer.py CHANGED
@@ -429,7 +429,7 @@ def calculate_total_num_steps(cfg, train_dataset, tokenizer):
             .apply(lambda x: len(x))  # pylint: disable=unnecessary-lambda
             .values
         )
-        LOG.info(f"📝 UPDATE CONFIG WITH: `total_num_tokens: {total_num_tokens}`")
+        LOG.info(f"total_num_tokens: {total_num_tokens}")
         cfg.total_num_tokens = total_num_tokens
 
     if not cfg.total_supervised_tokens:
@@ -489,6 +489,8 @@ def calculate_total_num_steps(cfg, train_dataset, tokenizer):
             data_loader_len = data_loader.len_w_stats()
             actual_eff = data_loader.efficiency()
             LOG.info(f"data_loader_len: {data_loader_len}")
+            # FIXME: is there a bug here somewhere? the total num steps depends
+            # on the agreed on value for sample_packing_eff_est
             total_num_steps = int(math.floor(data_loader_len * cfg.num_epochs))
 
             def calc_sample_packing_eff_est(estimates: List[float]):
@@ -502,10 +504,8 @@ def calculate_total_num_steps(cfg, train_dataset, tokenizer):
             sample_packing_eff_est = (
                 math.ceil(sample_packing_actual_eff_all * 100.0) / 100.0
             )
-            LOG.info(
-                f"📝 UPDATE CONFIG WITH: `sample_packing_eff_est: {sample_packing_eff_est}`"
-            )
             cfg.sample_packing_eff_est = sample_packing_eff_est
+            LOG.info(f"sample_packing_eff_est: {cfg.sample_packing_eff_est}")
     else:
         total_num_steps = int(
             math.ceil(len(train_dataset) * cfg.num_epochs / cfg.batch_size)
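The FIXME added above concerns only the sample-packing branch; the arithmetic in the two total_num_steps branches is otherwise straightforward. A standalone sketch with made-up numbers (the values are illustrative, not from the commit):

    import math

    # Without sample packing (the else branch): one optimizer step per batch,
    # rounded up so a partial final batch still counts as a step.
    num_examples, num_epochs, batch_size = 10_000, 3, 8
    total_num_steps = int(math.ceil(num_examples * num_epochs / batch_size))
    print(total_num_steps)  # 3750

    # With sample packing: the packed data loader already reports its own
    # per-epoch length (len_w_stats() above), so steps scale with epochs directly.
    data_loader_len = 1_234  # hypothetical number of packed batches per epoch
    total_num_steps = int(math.floor(data_loader_len * num_epochs))
    print(total_num_steps)  # 3702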