# NRJ-DEBUG / config.yaml
# (Hub page artifacts preserved as comments: uploader avatar alt-text,
#  commit message, and short commit SHA)
# TCMVince's picture
# commit files to HF hub
# 2b5b2f3
---
activation: softmax
adam_beta1: 0.9
adam_beta2: 0.99
adam_epsilon: 1.0e-06
alpha: 0.1
attn_implementation: null
beta: 0.125
bf16: true
block_size: 512
checkpoint_dir: mlruns/896390784617014591/892b97fa0aa6499288906c463545ae00/checkpoints
compile: false
config_path: configs/JZ/NRJ_base-wiki-original.yaml
dataloader_num_workers: 8
dataset_path: /lustre/fswork/projects/rech/oou/uqh26ve/data/pre_training/en/en_wiki/wiki_20220301-cleaned-valid001/data-bin/wiki_20220301-cleaned-valid001-BPE30K/
ddp_find_unused_parameters: false
disable_tqdm: true
do_eval: true
dropout: 0.1
embedding_dim: 768
eval_steps: 25000
evaluation_strategy: steps
forward_memories: 3072
fp16: false
gradient_accumulation_steps: 1
ignore_lines: false
layer_norm: 1.0e-12
learning_rate: 0.0007
log_on_each_node: false
logging_steps: 1000
logging_strategy: steps
lr_scheduler_kwargs: {}
lr_scheduler_type: cosine
max_steps: 500000
model_name: NRJ-V_30000K_bpe-NL12-NH12-EMB768-FFN3072
model_type: energyBERT
n_run: 51
num_heads: 12
num_layers: 12
num_params: 50638896
optimizer: adamw_torch
output_dir: null
per_device_eval_batch_size: 8
per_device_train_batch_size: 64
remove_unused_columns: false
report_to: mlflow
save_steps: 25000
save_strategy: steps
seed: 42
share_layers: false
test_file: /lustre/fswork/projects/rech/oou/uqh26ve/data/pre_training/en/en_wiki/wiki_20220301-cleaned-valid001/wikipedia.test.txt
tie_weights: false
tokenizer_path: /lustre/fswork/projects/rech/oou/uqh26ve/data/pre_training/en/en_wiki/wiki_20220301-cleaned-valid001/data-bin/wiki_20220301-cleaned-valid001-BPE30K/tokenizer
tokenizer_type: bpe
total_batch_size: 4096
training_file: /lustre/fswork/projects/rech/oou/uqh26ve/data/pre_training/en/en_wiki/wiki_20220301-cleaned-valid001/wikipedia.train.txt
valid_file: /lustre/fswork/projects/rech/oou/uqh26ve/data/pre_training/en/en_wiki/wiki_20220301-cleaned-valid001/wikipedia.valid.txt
vocabulary_size: 30000
warmup_ratio: 0.0
warmup_steps: 24000
weight_decay: 0.01