{ "best_metric": 74.67606866807955, "best_model_checkpoint": "/root/turkic_qa/ru_kaz_models/ru_kaz_xlm_roberta_base_model/checkpoint-4416", "epoch": 10.0, "eval_steps": 500, "global_step": 5520, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "step": 552, "train_exact_match": 24.275724275724276, "train_f1": 40.76875114599373, "train_runtime": 11.4713, "train_samples_per_second": 88.569, "train_steps_per_second": 3.225 }, { "epoch": 1.0, "grad_norm": 50.65106201171875, "learning_rate": 5e-06, "loss": 4.6936, "step": 552 }, { "epoch": 1.0, "eval_exact_match": 24.4375, "eval_f1": 40.89175449073484, "eval_runtime": 36.7427, "eval_samples_per_second": 89.297, "eval_steps_per_second": 3.212, "step": 552 }, { "epoch": 2.0, "step": 1104, "train_exact_match": 56.54345654345654, "train_f1": 72.7903191757748, "train_runtime": 11.4595, "train_samples_per_second": 89.707, "train_steps_per_second": 3.229 }, { "epoch": 2.0, "grad_norm": 61.147613525390625, "learning_rate": 1e-05, "loss": 2.162, "step": 1104 }, { "epoch": 2.0, "eval_exact_match": 48.28125, "eval_f1": 67.10891985034017, "eval_runtime": 36.689, "eval_samples_per_second": 89.427, "eval_steps_per_second": 3.216, "step": 1104 }, { "epoch": 3.0, "step": 1656, "train_exact_match": 60.33966033966034, "train_f1": 77.53847505132588, "train_runtime": 11.4178, "train_samples_per_second": 89.072, "train_steps_per_second": 3.241 }, { "epoch": 3.0, "grad_norm": 43.21469497680664, "learning_rate": 8.750000000000001e-06, "loss": 1.4851, "step": 1656 }, { "epoch": 3.0, "eval_exact_match": 54.59375, "eval_f1": 72.32243072865627, "eval_runtime": 36.5153, "eval_samples_per_second": 89.853, "eval_steps_per_second": 3.232, "step": 1656 }, { "epoch": 4.0, "step": 2208, "train_exact_match": 68.13186813186813, "train_f1": 83.71981586843339, "train_runtime": 11.3106, "train_samples_per_second": 89.65, "train_steps_per_second": 3.271 }, { "epoch": 4.0, "grad_norm": 39.521087646484375, "learning_rate": 7.500000000000001e-06, "loss": 1.1864, "step": 2208 }, { "epoch": 4.0, "eval_exact_match": 55.6875, "eval_f1": 73.33848461420202, "eval_runtime": 36.4825, "eval_samples_per_second": 89.934, "eval_steps_per_second": 3.234, "step": 2208 }, { "epoch": 5.0, "step": 2760, "train_exact_match": 72.82717282717283, "train_f1": 87.11695605040637, "train_runtime": 11.478, "train_samples_per_second": 89.04, "train_steps_per_second": 3.224 }, { "epoch": 5.0, "grad_norm": 14.2109956741333, "learning_rate": 6.25e-06, "loss": 1.0004, "step": 2760 }, { "epoch": 5.0, "eval_exact_match": 57.5, "eval_f1": 74.39778409830554, "eval_runtime": 36.7472, "eval_samples_per_second": 89.286, "eval_steps_per_second": 3.211, "step": 2760 }, { "epoch": 6.0, "step": 3312, "train_exact_match": 76.62337662337663, "train_f1": 89.20109871672176, "train_runtime": 11.3687, "train_samples_per_second": 89.896, "train_steps_per_second": 3.255 }, { "epoch": 6.0, "grad_norm": 37.63686752319336, "learning_rate": 5e-06, "loss": 0.8721, "step": 3312 }, { "epoch": 6.0, "eval_exact_match": 57.84375, "eval_f1": 74.55139589840982, "eval_runtime": 36.3591, "eval_samples_per_second": 90.239, "eval_steps_per_second": 3.245, "step": 3312 }, { "epoch": 7.0, "step": 3864, "train_exact_match": 78.42157842157842, "train_f1": 91.01806696135736, "train_runtime": 11.4573, "train_samples_per_second": 89.725, "train_steps_per_second": 3.229 }, { "epoch": 7.0, "grad_norm": 21.571279525756836, "learning_rate": 3.7500000000000005e-06, "loss": 0.7755, "step": 3864 }, { "epoch": 7.0, "eval_exact_match": 57.6875, "eval_f1": 74.44520627477633, "eval_runtime": 36.4488, "eval_samples_per_second": 90.017, "eval_steps_per_second": 3.237, "step": 3864 }, { "epoch": 8.0, "step": 4416, "train_exact_match": 79.12087912087912, "train_f1": 91.65370167331513, "train_runtime": 11.3648, "train_samples_per_second": 90.103, "train_steps_per_second": 3.256 }, { "epoch": 8.0, "grad_norm": 57.543540954589844, "learning_rate": 2.5e-06, "loss": 0.7089, "step": 4416 }, { "epoch": 8.0, "eval_exact_match": 58.09375, "eval_f1": 74.67606866807955, "eval_runtime": 36.2701, "eval_samples_per_second": 90.46, "eval_steps_per_second": 3.253, "step": 4416 }, { "epoch": 9.0, "step": 4968, "train_exact_match": 80.21978021978022, "train_f1": 92.17230025305071, "train_runtime": 11.629, "train_samples_per_second": 88.916, "train_steps_per_second": 3.182 }, { "epoch": 9.0, "grad_norm": 21.385902404785156, "learning_rate": 1.25e-06, "loss": 0.6522, "step": 4968 }, { "epoch": 9.0, "eval_exact_match": 58.3125, "eval_f1": 74.6689953301795, "eval_runtime": 36.9735, "eval_samples_per_second": 88.739, "eval_steps_per_second": 3.191, "step": 4968 }, { "epoch": 10.0, "step": 5520, "train_exact_match": 80.21978021978022, "train_f1": 91.90698139768105, "train_runtime": 11.5232, "train_samples_per_second": 89.472, "train_steps_per_second": 3.211 }, { "epoch": 10.0, "grad_norm": 32.45027160644531, "learning_rate": 0.0, "loss": 0.6146, "step": 5520 }, { "epoch": 10.0, "eval_exact_match": 57.75, "eval_f1": 74.50735271348769, "eval_runtime": 36.529, "eval_samples_per_second": 89.819, "eval_steps_per_second": 3.23, "step": 5520 }, { "epoch": 10.0, "step": 5520, "total_flos": 3.025228525300224e+16, "train_loss": 1.4150862375895181, "train_runtime": 3533.8403, "train_samples_per_second": 43.683, "train_steps_per_second": 1.562 } ], "logging_steps": 500, "max_steps": 5520, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 3.025228525300224e+16, "train_batch_size": 28, "trial_name": null, "trial_params": null }