{ "best_metric": 0.8256880733944955, "best_model_checkpoint": "tiny-bert-sst2-distilled/run-11/checkpoint-527", "epoch": 8.0, "eval_steps": 500, "global_step": 4216, "is_hyper_param_search": true, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "grad_norm": 6.5331573486328125, "learning_rate": 0.00014562243502675204, "loss": 0.547, "step": 527 }, { "epoch": 1.0, "eval_accuracy": 0.8256880733944955, "eval_loss": 0.5717546343803406, "eval_runtime": 2.8071, "eval_samples_per_second": 310.639, "eval_steps_per_second": 2.494, "step": 527 }, { "epoch": 2.0, "grad_norm": 4.755884647369385, "learning_rate": 0.0001248192300229303, "loss": 0.3165, "step": 1054 }, { "epoch": 2.0, "eval_accuracy": 0.823394495412844, "eval_loss": 0.6189394593238831, "eval_runtime": 2.8231, "eval_samples_per_second": 308.882, "eval_steps_per_second": 2.48, "step": 1054 }, { "epoch": 3.0, "grad_norm": 13.318252563476562, "learning_rate": 0.0001040160250191086, "loss": 0.2438, "step": 1581 }, { "epoch": 3.0, "eval_accuracy": 0.8256880733944955, "eval_loss": 0.6849867701530457, "eval_runtime": 2.8211, "eval_samples_per_second": 309.105, "eval_steps_per_second": 2.481, "step": 1581 }, { "epoch": 4.0, "grad_norm": 6.025650501251221, "learning_rate": 8.321282001528688e-05, "loss": 0.2015, "step": 2108 }, { "epoch": 4.0, "eval_accuracy": 0.8256880733944955, "eval_loss": 0.7157025337219238, "eval_runtime": 2.8159, "eval_samples_per_second": 309.671, "eval_steps_per_second": 2.486, "step": 2108 }, { "epoch": 5.0, "grad_norm": 6.3858113288879395, "learning_rate": 6.240961501146515e-05, "loss": 0.1725, "step": 2635 }, { "epoch": 5.0, "eval_accuracy": 0.8153669724770642, "eval_loss": 0.7607192397117615, "eval_runtime": 2.8278, "eval_samples_per_second": 308.365, "eval_steps_per_second": 2.475, "step": 2635 }, { "epoch": 6.0, "grad_norm": 12.167182922363281, "learning_rate": 4.160641000764344e-05, "loss": 0.1517, "step": 3162 }, { "epoch": 6.0, "eval_accuracy": 0.8119266055045872, "eval_loss": 0.8524329662322998, "eval_runtime": 2.8071, "eval_samples_per_second": 310.643, "eval_steps_per_second": 2.494, "step": 3162 }, { "epoch": 7.0, "grad_norm": 6.675292015075684, "learning_rate": 2.080320500382172e-05, "loss": 0.1413, "step": 3689 }, { "epoch": 7.0, "eval_accuracy": 0.8107798165137615, "eval_loss": 0.862307608127594, "eval_runtime": 2.8162, "eval_samples_per_second": 309.641, "eval_steps_per_second": 2.486, "step": 3689 }, { "epoch": 8.0, "grad_norm": 2.5445616245269775, "learning_rate": 0.0, "loss": 0.1299, "step": 4216 }, { "epoch": 8.0, "eval_accuracy": 0.8107798165137615, "eval_loss": 0.8725943565368652, "eval_runtime": 2.8212, "eval_samples_per_second": 309.086, "eval_steps_per_second": 2.481, "step": 4216 } ], "logging_steps": 500, "max_steps": 4216, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 500, "total_flos": 64687167744780.0, "train_batch_size": 128, "trial_name": null, "trial_params": { "alpha": 0.9318516048675823, "learning_rate": 0.00016642564003057375, "num_train_epochs": 8, "temperature": 23 } }