---
# Run configuration snapshot. Every top-level key holds a single `value`
# entry carrying the recorded setting. The presence of `_wandb` and
# `report_to: wandb` suggests this was written by Weights & Biases for a
# Hugging Face Trainer run -- TODO confirm against the producing tool.
#
# Formatting conventions used below:
#   * 2-space block indentation, sequences indented under their key.
#   * Version-like strings are quoted so they can never be read as numbers.
#   * Scientific-notation floats carry an explicit ".0" so that YAML 1.1 and
#     YAML 1.2 loaders both resolve them as floats.
#   * Nulls are always written as `null`, never as a bare empty value.
_attn_implementation_autoset:
  value: true
_name_or_path:
  value: belisards/congretimbau
# Tool metadata. The quoted numeric keys under `m` and `t` are opaque codes;
# their meaning is not visible in this file.
_wandb:
  value:
    cli_version: "0.19.0"
    m:
      - "1": train/global_step
        "6":
          - 3
        "7": []
    python_version: "3.10.12"
    t:
      "1":
        - 1
        - 2
        - 3
        - 5
        - 11
        - 12
        - 41
        - 49
        - 51
        - 53
        - 55
        - 71
        - 98
        - 100
        - 105
      "2":
        - 1
        - 2
        - 3
        - 5
        - 11
        - 12
        - 41
        - 49
        - 51
        - 53
        - 55
        - 71
        - 98
        - 100
        - 105
      "3":
        - 7
        - 13
        - 19
        - 23
        - 55
        - 66
      "4": "3.10.12"
      "5": "0.19.0"
      "6": "4.47.0"
      "8":
        - 1
        - 5
        - 12
      "9":
        "1": transformers_trainer
      "12": "0.19.0"
      "13": linux-x86_64
accelerator_config:
  value:
    dispatch_batches: null
    even_batches: true
    gradient_accumulation_kwargs: null
    non_blocking: false
    split_batches: false
    use_seedable_sampler: true
adafactor:
  value: false
adam_beta1:
  value: 0.9
adam_beta2:
  value: 0.999
adam_epsilon:
  # Written with a mantissa dot so 1.1 loaders do not read it as a string.
  value: 1.0e-08
add_cross_attention:
  value: false
architectures:
  value:
    - BertForMaskedLM
attention_probs_dropout_prob:
  value: 0.05
auto_find_batch_size:
  value: false
average_tokens_across_devices:
  value: false
bad_words_ids:
  value: null
batch_eval_metrics:
  value: false
begin_suppress_tokens:
  value: null
bf16:
  value: false
bf16_full_eval:
  value: false
bos_token_id:
  value: null
chunk_size_feed_forward:
  value: 0
classifier_dropout:
  value: null
cross_attention_hidden_size:
  value: null
data_seed:
  value: null
dataloader_drop_last:
  value: false
dataloader_num_workers:
  value: 0
dataloader_persistent_workers:
  value: false
dataloader_pin_memory:
  value: true
dataloader_prefetch_factor:
  value: null
ddp_backend:
  value: null
ddp_broadcast_buffers:
  value: null
ddp_bucket_cap_mb:
  value: null
ddp_find_unused_parameters:
  value: null
ddp_timeout:
  value: 1800
debug:
  value: []
decoder_start_token_id:
  value: null
deepspeed:
  value: null
directionality:
  value: bidi
disable_tqdm:
  value: false
dispatch_batches:
  value: null
diversity_penalty:
  value: 0
do_eval:
  value: true
do_predict:
  value: false
do_sample:
  value: false
do_train:
  value: false
early_stopping:
  value: false
encoder_no_repeat_ngram_size:
  value: 0
eos_token_id:
  value: null
eval_accumulation_steps:
  value: null
eval_delay:
  value: 0
eval_do_concat_batches:
  value: true
eval_on_start:
  value: false
eval_steps:
  value: 24
eval_strategy:
  value: epoch
eval_use_gather_object:
  value: false
evaluation_strategy:
  value: null
exponential_decay_length_penalty:
  value: null
finetuning_task:
  value: null
forced_bos_token_id:
  value: null
forced_eos_token_id:
  value: null
fp16:
  value: false
fp16_backend:
  value: auto
fp16_full_eval:
  value: false
fp16_opt_level:
  value: O1
fsdp:
  value: []
fsdp_config:
  value:
    min_num_params: 0
    xla: false
    xla_fsdp_grad_ckpt: false
    xla_fsdp_v2: false
fsdp_min_num_params:
  value: 0
fsdp_transformer_layer_cls_to_wrap:
  value: null
full_determinism:
  value: false
gradient_accumulation_steps:
  value: 1
gradient_checkpointing:
  value: false
gradient_checkpointing_kwargs:
  value: null
greater_is_better:
  value: true
group_by_length:
  value: false
half_precision_backend:
  value: auto
hidden_act:
  value: gelu
hidden_dropout_prob:
  value: 0.05
hidden_size:
  value: 1024
hub_always_push:
  value: false
hub_model_id:
  value: null
hub_private_repo:
  value: null
hub_strategy:
  value: every_save
hub_token:
  # Empty in the recorded file (possibly redacted -- TODO confirm); kept as
  # an explicit null and must never hold a real credential.
  value: null
id2label:
  value:
    "0": "0"
    "1": "1"
ignore_data_skip:
  value: false
include_for_metrics:
  value: []
include_inputs_for_metrics:
  value: false
include_num_input_tokens_seen:
  value: false
include_tokens_per_second:
  value: false
initializer_range:
  value: 0.02
intermediate_size:
  value: 4096
is_decoder:
  value: false
is_encoder_decoder:
  value: false
jit_mode_eval:
  value: false
label_names:
  value: null
label_smoothing_factor:
  value: 0
label2id:
  value:
    "0": 0
    "1": 1
layer_norm_eps:
  value: 1.0e-12
learning_rate:
  value: 1.0e-05
length_column_name:
  value: length
length_penalty:
  value: 1
load_best_model_at_end:
  value: true
local_rank:
  value: 0
log_level:
  value: passive
log_level_replica:
  value: warning
log_on_each_node:
  value: true
logging_dir:
  value: ./runs/Dec11_04-28-19_fd55a770be24
logging_first_step:
  value: false
logging_nan_inf_filter:
  value: true
logging_steps:
  value: 10
logging_strategy:
  value: steps
lr_scheduler_type:
  value: linear
max_grad_norm:
  value: 1
max_length:
  value: 20
max_position_embeddings:
  value: 512
max_steps:
  value: -1
metric_for_best_model:
  value: f1
min_length:
  value: 0
model/num_parameters:
  value: 334398466
model_type:
  value: bert
mp_parameters:
  value: ""
neftune_noise_alpha:
  value: null
no_cuda:
  value: false
no_repeat_ngram_size:
  value: 0
num_attention_heads:
  value: 16
num_beam_groups:
  value: 1
num_beams:
  value: 1
num_hidden_layers:
  value: 24
num_return_sequences:
  value: 1
num_train_epochs:
  value: 15
optim:
  value: adamw_torch
optim_args:
  value: null
optim_target_modules:
  value: null
output_attentions:
  value: false
output_dir:
  value: "."
output_hidden_states:
  value: false
output_past:
  value: true
output_scores:
  value: false
overwrite_output_dir:
  value: false
pad_token_id:
  value: 0
past_index:
  value: -1
per_device_eval_batch_size:
  value: 128
per_device_train_batch_size:
  value: 128
per_gpu_eval_batch_size:
  value: null
per_gpu_train_batch_size:
  value: null
pooler_fc_size:
  value: 768
pooler_num_attention_heads:
  value: 12
pooler_num_fc_layers:
  value: 3
pooler_size_per_head:
  value: 128
pooler_type:
  value: first_token_transform
position_embedding_type:
  value: absolute
prediction_loss_only:
  value: false
prefix:
  value: null
problem_type:
  value: null
push_to_hub:
  value: false
push_to_hub_model_id:
  value: null
push_to_hub_organization:
  value: null
push_to_hub_token:
  # Empty in the recorded file (possibly redacted -- TODO confirm); kept as
  # an explicit null and must never hold a real credential.
  value: null
ray_scope:
  value: last
remove_invalid_values:
  value: false
remove_unused_columns:
  value: true
repetition_penalty:
  value: 1
report_to:
  value:
    - wandb
restore_callback_states_from_checkpoint:
  value: false
resume_from_checkpoint:
  value: null
return_dict:
  value: true
return_dict_in_generate:
  value: false
run_name:
  value: m2-congretimbau_tunado
save_on_each_node:
  value: false
save_only_model:
  value: false
save_safetensors:
  value: true
save_steps:
  value: 48
save_strategy:
  value: epoch
save_total_limit:
  value: 1
seed:
  value: 5151
sep_token_id:
  value: null
skip_memory_metrics:
  value: true
split_batches:
  value: null
suppress_tokens:
  value: null
task_specific_params:
  value: null
temperature:
  value: 1
tf_legacy_loss:
  value: false
tf32:
  value: null
tie_encoder_decoder:
  value: false
tie_word_embeddings:
  value: true
tokenizer_class:
  value: null
top_k:
  value: 50
top_p:
  value: 1
torch_compile:
  value: false
torch_compile_backend:
  value: null
torch_compile_mode:
  value: null
torch_dtype:
  value: float32
torch_empty_cache_steps:
  value: null
torchdynamo:
  value: null
torchscript:
  value: false
tpu_metrics_debug:
  value: false
tpu_num_cores:
  value: null
transformers_version:
  value: "4.47.0"
type_vocab_size:
  value: 2
typical_p:
  value: 1
use_bfloat16:
  value: false
use_cache:
  value: true
use_cpu:
  value: false
use_ipex:
  value: false
use_legacy_prediction_loop:
  value: false
use_liger_kernel:
  value: false
use_mps_device:
  value: false
vocab_size:
  value: 29794
warmup_ratio:
  value: 0
warmup_steps:
  value: 200
weight_decay:
  value: 0.02