Nanobit committed
Commit a1f9850
1 Parent(s): 83d2920

Fix security issue or ignore false positives

scripts/finetune.py CHANGED
@@ -136,7 +136,7 @@ def train(
 
     # load the config from the yaml file
     with open(config, encoding="utf-8") as file:
-        cfg: DictDefault = DictDefault(yaml.load(file, Loader=yaml.Loader))
+        cfg: DictDefault = DictDefault(yaml.safe_load(file))
     # if there are any options passed in the cli, if it is something that seems valid from the yaml,
     # then overwrite the value
     cfg_keys = cfg.keys()
@@ -185,7 +185,7 @@ def train(
     logging.info("check_dataset_labels...")
     check_dataset_labels(
         train_dataset.select(
-            [random.randrange(0, len(train_dataset) - 1) for i in range(5)]
+            [random.randrange(0, len(train_dataset) - 1) for _ in range(5)]  # nosec
         ),
         tokenizer,
     )
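
Two bandit findings are resolved here. yaml.load with yaml.Loader can instantiate arbitrary Python objects from tags embedded in the config file (bandit B506); yaml.safe_load only builds plain scalars, lists, and maps, which is all a training config needs. The random.randrange call merely samples a few rows to print for inspection, so the pseudo-random-generator warning (B311) is a false positive and # nosec suppresses it (the i -> _ rename just quiets an unused-variable lint). A minimal sketch of the loader difference, assuming nothing beyond a stock PyYAML install:

import yaml

# A benign config parses identically under either loader.
cfg = yaml.safe_load("sequence_len: 2048\nbase_model: llama")
assert cfg["sequence_len"] == 2048

# A malicious document abuses python/object tags; yaml.load(..., Loader=yaml.Loader)
# would execute os.system here, while safe_load refuses to construct the object.
evil = '!!python/object/apply:os.system ["echo pwned"]'
try:
    yaml.safe_load(evil)
except yaml.constructor.ConstructorError:
    pass  # safe_load rejects the tag instead of running the payload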
src/axolotl/prompt_tokenizers.py CHANGED
@@ -11,10 +11,10 @@ from transformers import PreTrainedTokenizer
 from axolotl.prompters import IGNORE_TOKEN_ID
 
 IGNORE_INDEX = -100
-LLAMA_DEFAULT_PAD_TOKEN = "[PAD]"
-LLAMA_DEFAULT_EOS_TOKEN = "</s>"
-LLAMA_DEFAULT_BOS_TOKEN = "<s>"
-LLAMA_DEFAULT_UNK_TOKEN = "<unk>"
+LLAMA_DEFAULT_PAD_TOKEN = "[PAD]"  # nosec
+LLAMA_DEFAULT_EOS_TOKEN = "</s>"  # nosec
+LLAMA_DEFAULT_BOS_TOKEN = "<s>"  # nosec
+LLAMA_DEFAULT_UNK_TOKEN = "<unk>"  # nosec
 
 
 class InvalidDataException(Exception):
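
Nothing changes functionally in this file: bandit's hardcoded-password check (B105) keys on names containing words like "token" and flags these tokenizer special-token constants, which are vocabulary strings rather than credentials, so each line gets a # nosec marker. For context, a hedged sketch of how such constants are typically consumed; the ensure_special_tokens helper is hypothetical, not code from this repo:

import transformers

LLAMA_DEFAULT_PAD_TOKEN = "[PAD]"  # nosec - a vocab string, not a secret
LLAMA_DEFAULT_EOS_TOKEN = "</s>"  # nosec

def ensure_special_tokens(tokenizer: transformers.PreTrainedTokenizer):
    # Llama tokenizers ship without a pad token; add the defaults if missing.
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({"pad_token": LLAMA_DEFAULT_PAD_TOKEN})
    if tokenizer.eos_token is None:
        tokenizer.add_special_tokens({"eos_token": LLAMA_DEFAULT_EOS_TOKEN})
    return tokenizer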
src/axolotl/utils/data.py CHANGED
@@ -40,7 +40,7 @@ def load_tokenized_prepared_datasets(
 ) -> DatasetDict:
     tokenizer_name = tokenizer.__class__.__name__
     ds_hash = str(
-        md5(
+        md5(  # nosec
             (
                 str(cfg.sequence_len)
                 + "@"
@@ -66,7 +66,7 @@ def load_tokenized_prepared_datasets(
             use_auth_token=use_auth_token,
         )
         dataset = dataset["train"]
-    except Exception:  # pylint: disable=broad-except
+    except Exception:  # pylint: disable=broad-except  # nosec
         pass
 
     if dataset:
@@ -272,7 +272,7 @@ def load_prepare_datasets(
     # see if we can go ahead and load the stacked dataset
     seed = f"@{str(cfg.seed)}" if cfg.seed else ""
     ds_hash = str(
-        md5(
+        md5(  # nosec
             (
                 str(cfg.sequence_len)
                 + "@"
@@ -304,7 +304,7 @@ def load_prepare_datasets(
             use_auth_token=use_auth_token,
         )
         dataset = dataset["train"]
-    except Exception:  # pylint: disable=broad-except
+    except Exception:  # pylint: disable=broad-except  # nosec
         pass
 
     if dataset:
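
Both annotations in this file mark false positives rather than fixes. md5 trips bandit's weak-hash check (B324), but the digest only derives a cache key for prepared datasets, so collision resistance is irrelevant; on Python 3.9+ one could alternatively pass usedforsecurity=False. The except Exception: pass blocks trip the try-except-pass check (B110), but silently falling through when no prepared dataset exists on the hub is the intended control flow. A minimal sketch of the cache-key pattern, with made-up field names standing in for the commit's actual hash input:

from hashlib import md5

def dataset_cache_key(sequence_len: int, dataset_names: list[str], seed: str = "") -> str:
    raw = str(sequence_len) + "@" + "|".join(sorted(dataset_names)) + seed
    # The digest only names a cache path; it protects nothing, so md5 is fine.
    return md5(raw.encode("utf-8")).hexdigest()  # nosec

print(dataset_cache_key(2048, ["alpaca", "sharegpt"]))  # hex string used as a cache dir name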