{ "best_metric": 0.8346379647749511, "best_model_checkpoint": "tiny-bert-sst2-distilled/run-38/checkpoint-864", "epoch": 9.0, "eval_steps": 500, "global_step": 864, "is_hyper_param_search": true, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "grad_norm": 1.8633686304092407, "learning_rate": 0.00011164310281576651, "loss": 0.584, "step": 96 }, { "epoch": 1.0, "eval_accuracy": 0.7690802348336595, "eval_f1": 0.7993197278911565, "eval_loss": 0.48832422494888306, "eval_precision": 0.706766917293233, "eval_recall": 0.9197651663405088, "eval_runtime": 28.3969, "eval_samples_per_second": 35.99, "eval_steps_per_second": 1.127, "step": 96 }, { "epoch": 2.0, "grad_norm": 1.9557557106018066, "learning_rate": 9.76877149637957e-05, "loss": 0.4827, "step": 192 }, { "epoch": 2.0, "eval_accuracy": 0.8003913894324853, "eval_f1": 0.8222996515679443, "eval_loss": 0.44451940059661865, "eval_precision": 0.7409733124018838, "eval_recall": 0.923679060665362, "eval_runtime": 28.5294, "eval_samples_per_second": 35.823, "eval_steps_per_second": 1.122, "step": 192 }, { "epoch": 3.0, "grad_norm": 3.0629470348358154, "learning_rate": 8.373232711182488e-05, "loss": 0.448, "step": 288 }, { "epoch": 3.0, "eval_accuracy": 0.824853228962818, "eval_f1": 0.8353265869365226, "eval_loss": 0.4216720759868622, "eval_precision": 0.7881944444444444, "eval_recall": 0.8884540117416829, "eval_runtime": 28.7944, "eval_samples_per_second": 35.493, "eval_steps_per_second": 1.111, "step": 288 }, { "epoch": 4.0, "grad_norm": 5.688210964202881, "learning_rate": 6.977693925985407e-05, "loss": 0.4341, "step": 384 }, { "epoch": 4.0, "eval_accuracy": 0.8209393346379648, "eval_f1": 0.825214899713467, "eval_loss": 0.42298367619514465, "eval_precision": 0.8059701492537313, "eval_recall": 0.8454011741682974, "eval_runtime": 28.2399, "eval_samples_per_second": 36.19, "eval_steps_per_second": 1.133, "step": 384 }, { "epoch": 5.0, "grad_norm": 2.334526300430298, "learning_rate": 5.5821551407883254e-05, "loss": 0.4202, "step": 480 }, { "epoch": 5.0, "eval_accuracy": 0.8287671232876712, "eval_f1": 0.8436103663985702, "eval_loss": 0.4076910614967346, "eval_precision": 0.7763157894736842, "eval_recall": 0.923679060665362, "eval_runtime": 28.534, "eval_samples_per_second": 35.817, "eval_steps_per_second": 1.121, "step": 480 }, { "epoch": 6.0, "grad_norm": 4.195997714996338, "learning_rate": 4.186616355591244e-05, "loss": 0.4109, "step": 576 }, { "epoch": 6.0, "eval_accuracy": 0.8258317025440313, "eval_f1": 0.842756183745583, "eval_loss": 0.4115942418575287, "eval_precision": 0.7681159420289855, "eval_recall": 0.9334637964774951, "eval_runtime": 28.3073, "eval_samples_per_second": 36.104, "eval_steps_per_second": 1.13, "step": 576 }, { "epoch": 7.0, "grad_norm": 2.727288007736206, "learning_rate": 2.7910775703941627e-05, "loss": 0.4017, "step": 672 }, { "epoch": 7.0, "eval_accuracy": 0.8307240704500979, "eval_f1": 0.8467670504871568, "eval_loss": 0.41019657254219055, "eval_precision": 0.7734627831715211, "eval_recall": 0.9354207436399217, "eval_runtime": 28.5087, "eval_samples_per_second": 35.849, "eval_steps_per_second": 1.122, "step": 672 }, { "epoch": 8.0, "grad_norm": 3.721353530883789, "learning_rate": 1.3955387851970814e-05, "loss": 0.4014, "step": 768 }, { "epoch": 8.0, "eval_accuracy": 0.8258317025440313, "eval_f1": 0.8405017921146953, "eval_loss": 0.40405774116516113, "eval_precision": 0.775206611570248, "eval_recall": 0.9178082191780822, "eval_runtime": 28.5925, "eval_samples_per_second": 35.744, 
"eval_steps_per_second": 1.119, "step": 768 }, { "epoch": 9.0, "grad_norm": 2.3781871795654297, "learning_rate": 0.0, "loss": 0.3968, "step": 864 }, { "epoch": 9.0, "eval_accuracy": 0.8346379647749511, "eval_f1": 0.8456621004566209, "eval_loss": 0.4000749886035919, "eval_precision": 0.7928082191780822, "eval_recall": 0.9060665362035225, "eval_runtime": 28.358, "eval_samples_per_second": 36.039, "eval_steps_per_second": 1.128, "step": 864 } ], "logging_steps": 500, "max_steps": 864, "num_input_tokens_seen": 0, "num_train_epochs": 9, "save_steps": 500, "total_flos": 2121256775520.0, "train_batch_size": 32, "trial_name": null, "trial_params": { "alpha": 0.8898885181069172, "learning_rate": 0.00012559849066773733, "num_train_epochs": 9, "temperature": 3 } }