Fix security issue or ignore false positives
Browse files
- scripts/finetune.py +2 -2
- src/axolotl/prompt_tokenizers.py +4 -4
- src/axolotl/utils/data.py +4 -4
scripts/finetune.py
CHANGED
|
@@ -136,7 +136,7 @@ def train(
|
|
| 136 |
|
| 137 |
# load the config from the yaml file
|
| 138 |
with open(config, encoding="utf-8") as file:
|
| 139 |
-
cfg: DictDefault = DictDefault(yaml.load(file, Loader=yaml.Loader))  [line truncated in extraction; reconstructed — the unsafe yaml.load call flagged by the scanner, replaced below by yaml.safe_load]
|
| 140 |
# if there are any options passed in the cli, if it is something that seems valid from the yaml,
|
| 141 |
# then overwrite the value
|
| 142 |
cfg_keys = cfg.keys()
|
|
@@ -185,7 +185,7 @@ def train(
|
|
| 185 |
logging.info("check_dataset_labels...")
|
| 186 |
check_dataset_labels(
|
| 187 |
train_dataset.select(
|
| 188 |
-
[random.randrange(0, len(train_dataset) - 1) for _ in range(5)]
|
| 189 |
),
|
| 190 |
tokenizer,
|
| 191 |
)
|
|
|
|
| 136 |
|
| 137 |
# load the config from the yaml file
|
| 138 |
with open(config, encoding="utf-8") as file:
|
| 139 |
+
cfg: DictDefault = DictDefault(yaml.safe_load(file))
|
| 140 |
# if there are any options passed in the cli, if it is something that seems valid from the yaml,
|
| 141 |
# then overwrite the value
|
| 142 |
cfg_keys = cfg.keys()
|
|
|
|
| 185 |
logging.info("check_dataset_labels...")
|
| 186 |
check_dataset_labels(
|
| 187 |
train_dataset.select(
|
| 188 |
+
[random.randrange(0, len(train_dataset) - 1) for _ in range(5)] # nosec
|
| 189 |
),
|
| 190 |
tokenizer,
|
| 191 |
)
|
src/axolotl/prompt_tokenizers.py
CHANGED
|
@@ -11,10 +11,10 @@ from transformers import PreTrainedTokenizer
|
|
| 11 |
from axolotl.prompters import IGNORE_TOKEN_ID
|
| 12 |
|
| 13 |
IGNORE_INDEX = -100
|
| 14 |
-
LLAMA_DEFAULT_PAD_TOKEN = "[PAD]"
|
| 15 |
-
LLAMA_DEFAULT_EOS_TOKEN = "</s>"
|
| 16 |
-
LLAMA_DEFAULT_BOS_TOKEN = "<s>"
|
| 17 |
-
LLAMA_DEFAULT_UNK_TOKEN = "<unk>"
|
| 18 |
|
| 19 |
|
| 20 |
class InvalidDataException(Exception):
|
|
|
|
| 11 |
from axolotl.prompters import IGNORE_TOKEN_ID
|
| 12 |
|
| 13 |
IGNORE_INDEX = -100
|
| 14 |
+
LLAMA_DEFAULT_PAD_TOKEN = "[PAD]" # nosec
|
| 15 |
+
LLAMA_DEFAULT_EOS_TOKEN = "</s>" # nosec
|
| 16 |
+
LLAMA_DEFAULT_BOS_TOKEN = "<s>" # nosec
|
| 17 |
+
LLAMA_DEFAULT_UNK_TOKEN = "<unk>" # nosec
|
| 18 |
|
| 19 |
|
| 20 |
class InvalidDataException(Exception):
|
src/axolotl/utils/data.py
CHANGED
|
@@ -40,7 +40,7 @@ def load_tokenized_prepared_datasets(
|
|
| 40 |
) -> DatasetDict:
|
| 41 |
tokenizer_name = tokenizer.__class__.__name__
|
| 42 |
ds_hash = str(
|
| 43 |
-
md5(
|
| 44 |
(
|
| 45 |
str(cfg.sequence_len)
|
| 46 |
+ "@"
|
|
@@ -66,7 +66,7 @@ def load_tokenized_prepared_datasets(
|
|
| 66 |
use_auth_token=use_auth_token,
|
| 67 |
)
|
| 68 |
dataset = dataset["train"]
|
| 69 |
-
except Exception: # pylint: disable=broad-except
|
| 70 |
pass
|
| 71 |
|
| 72 |
if dataset:
|
|
@@ -272,7 +272,7 @@ def load_prepare_datasets(
|
|
| 272 |
# see if we can go ahead and load the stacked dataset
|
| 273 |
seed = f"@{str(cfg.seed)}" if cfg.seed else ""
|
| 274 |
ds_hash = str(
|
| 275 |
-
md5(
|
| 276 |
(
|
| 277 |
str(cfg.sequence_len)
|
| 278 |
+ "@"
|
|
@@ -304,7 +304,7 @@ def load_prepare_datasets(
|
|
| 304 |
use_auth_token=use_auth_token,
|
| 305 |
)
|
| 306 |
dataset = dataset["train"]
|
| 307 |
-
except Exception: # pylint: disable=broad-except
|
| 308 |
pass
|
| 309 |
|
| 310 |
if dataset:
|
|
|
|
| 40 |
) -> DatasetDict:
|
| 41 |
tokenizer_name = tokenizer.__class__.__name__
|
| 42 |
ds_hash = str(
|
| 43 |
+
md5( # nosec
|
| 44 |
(
|
| 45 |
str(cfg.sequence_len)
|
| 46 |
+ "@"
|
|
|
|
| 66 |
use_auth_token=use_auth_token,
|
| 67 |
)
|
| 68 |
dataset = dataset["train"]
|
| 69 |
+
except Exception: # pylint: disable=broad-except # nosec
|
| 70 |
pass
|
| 71 |
|
| 72 |
if dataset:
|
|
|
|
| 272 |
# see if we can go ahead and load the stacked dataset
|
| 273 |
seed = f"@{str(cfg.seed)}" if cfg.seed else ""
|
| 274 |
ds_hash = str(
|
| 275 |
+
md5( # nosec
|
| 276 |
(
|
| 277 |
str(cfg.sequence_len)
|
| 278 |
+ "@"
|
|
|
|
| 304 |
use_auth_token=use_auth_token,
|
| 305 |
)
|
| 306 |
dataset = dataset["train"]
|
| 307 |
+
except Exception: # pylint: disable=broad-except # nosec
|
| 308 |
pass
|
| 309 |
|
| 310 |
if dataset:
|