---
# Flat mapping of run hyper-parameters and data/checkpoint paths.
# Keys are kept in alphabetical order; every value is a plain YAML scalar
# (or an empty flow mapping) so that numbers, booleans and nulls load with
# their native types.

activation: softmax
adam_beta1: 0.9
adam_beta2: 0.99
adam_epsilon: 1.0e-06
alpha: 0.1
attn_implementation: null
beta: 0.125
bf16: true
block_size: 512
checkpoint_dir: mlruns/896390784617014591/892b97fa0aa6499288906c463545ae00/checkpoints
compile: false
config_path: configs/JZ/NRJ_base-wiki-original.yaml
dataloader_num_workers: 8
dataset_path: /lustre/fswork/projects/rech/oou/uqh26ve/data/pre_training/en/en_wiki/wiki_20220301-cleaned-valid001/data-bin/wiki_20220301-cleaned-valid001-BPE30K/
ddp_find_unused_parameters: false
disable_tqdm: true
do_eval: true
dropout: 0.1
embedding_dim: 768
eval_steps: 25000
evaluation_strategy: steps
forward_memories: 3072
fp16: false
gradient_accumulation_steps: 1
ignore_lines: false
layer_norm: 1.0e-12
learning_rate: 0.0007
log_on_each_node: false
logging_steps: 1000
logging_strategy: steps
lr_scheduler_kwargs: {}
lr_scheduler_type: cosine
max_steps: 500000
model_name: NRJ-V_30000K_bpe-NL12-NH12-EMB768-FFN3072
model_type: energyBERT
n_run: 51
num_heads: 12
num_layers: 12
num_params: 50638896
optimizer: adamw_torch
# NOTE(review): output_dir is null while checkpoint_dir is set above;
# presumably checkpoints go to checkpoint_dir only - confirm with the consumer.
output_dir: null
per_device_eval_batch_size: 8
per_device_train_batch_size: 64
remove_unused_columns: false
report_to: mlflow
save_steps: 25000
save_strategy: steps
seed: 42
share_layers: false
test_file: /lustre/fswork/projects/rech/oou/uqh26ve/data/pre_training/en/en_wiki/wiki_20220301-cleaned-valid001/wikipedia.test.txt
tie_weights: false
tokenizer_path: /lustre/fswork/projects/rech/oou/uqh26ve/data/pre_training/en/en_wiki/wiki_20220301-cleaned-valid001/data-bin/wiki_20220301-cleaned-valid001-BPE30K/tokenizer
tokenizer_type: bpe
# NOTE(review): 4096 is not per_device_train_batch_size (64) times
# gradient_accumulation_steps (1); presumably the remaining factor of 64 is
# the device count - TODO confirm how the consumer derives or uses this value.
total_batch_size: 4096
training_file: /lustre/fswork/projects/rech/oou/uqh26ve/data/pre_training/en/en_wiki/wiki_20220301-cleaned-valid001/wikipedia.train.txt
valid_file: /lustre/fswork/projects/rech/oou/uqh26ve/data/pre_training/en/en_wiki/wiki_20220301-cleaned-valid001/wikipedia.valid.txt
vocabulary_size: 30000
# NOTE(review): warmup_ratio and warmup_steps are both set; confirm which one
# the consuming trainer honours when they disagree.
warmup_ratio: 0.0
warmup_steps: 24000
weight_decay: 0.01