winglian committed
Commit 1eebbd0
1 Parent(s): 62a7741

improve handling for empty text on the tokenization step (#502)

Files changed (1)
  1. src/axolotl/prompt_tokenizers.py +16 -9
src/axolotl/prompt_tokenizers.py CHANGED
@@ -6,7 +6,7 @@ import functools
 import logging
 from typing import Dict, List, Tuple, Union
 
-from transformers import PreTrainedTokenizer
+from transformers import BatchEncoding, PreTrainedTokenizer
 
 from axolotl.prompters import IGNORE_TOKEN_ID
 
@@ -66,14 +66,21 @@ class PromptTokenizingStrategy(abc.ABC):
             pass
         return False
 
-    def _tokenize(self, prompt: str, add_eos_token=True, strip_bos_token=False):
-        result = self.tokenizer(
-            prompt,
-            truncation=True,
-            max_length=self.sequence_len,
-            padding=False,
-            return_tensors=None,
-        )
+    def _tokenize(
+        self, prompt: str, add_eos_token: bool = True, strip_bos_token: bool = False
+    ) -> BatchEncoding:
+        result: BatchEncoding
+        if not prompt.strip():
+            LOG.warning("Empty text requested for tokenization.")
+            result = BatchEncoding(data={"input_ids": [], "attention_mask": []})
+        else:
+            result = self.tokenizer(
+                prompt,
+                truncation=True,
+                max_length=self.sequence_len,
+                padding=False,
+                return_tensors=None,
+            )
         if len(result["input_ids"]) == 0:
             LOG.warning("Tokenizer result is empty. You may want to audit your dataset")
         if (
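Below is a minimal, self-contained sketch of the behaviour this commit introduces, written as a standalone helper rather than the class method; the helper name, the default sequence length, and the example checkpoint are assumptions for illustration, not part of the commit. The point of the change is that a blank prompt is short-circuited to an empty BatchEncoding instead of being handed to the tokenizer, so any downstream check such as len(result["input_ids"]) == 0 still works and no BOS/EOS-only samples slip into the dataset.

import logging

from transformers import AutoTokenizer, BatchEncoding

LOG = logging.getLogger(__name__)


def tokenize_prompt(tokenizer, prompt: str, sequence_len: int = 2048) -> BatchEncoding:
    # Mirrors the patched _tokenize: blank prompts never reach the tokenizer.
    if not prompt.strip():
        LOG.warning("Empty text requested for tokenization.")
        return BatchEncoding(data={"input_ids": [], "attention_mask": []})
    return tokenizer(
        prompt,
        truncation=True,
        max_length=sequence_len,
        padding=False,
        return_tensors=None,
    )


# Hypothetical usage; the checkpoint name is an assumption for illustration.
tok = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
print(tokenize_prompt(tok, "   ")["input_ids"])          # -> []
print(tokenize_prompt(tok, "hello world")["input_ids"])  # -> non-empty list of token ids

Returning a BatchEncoding built from empty lists, rather than a bare dict, keeps the return type consistent with what the tokenizer itself produces, which is why the new signature is annotated as -> BatchEncoding and why BatchEncoding is added to the transformers import.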