{ "best_metric": 0.8326810176125244, "best_model_checkpoint": "tiny-bert-sst2-distilled/run-32/checkpoint-672", "epoch": 9.0, "eval_steps": 500, "global_step": 864, "is_hyper_param_search": true, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "grad_norm": 1.7263739109039307, "learning_rate": 0.00011867778606566929, "loss": 0.5622, "step": 96 }, { "epoch": 1.0, "eval_accuracy": 0.7710371819960861, "eval_f1": 0.8006814310051107, "eval_loss": 0.4762067198753357, "eval_precision": 0.7088989441930619, "eval_recall": 0.9197651663405088, "eval_runtime": 25.5548, "eval_samples_per_second": 39.993, "eval_steps_per_second": 1.252, "step": 96 }, { "epoch": 2.0, "grad_norm": 1.8613439798355103, "learning_rate": 0.00010384306280746064, "loss": 0.4724, "step": 192 }, { "epoch": 2.0, "eval_accuracy": 0.8072407045009785, "eval_f1": 0.8273444347063978, "eval_loss": 0.43931350111961365, "eval_precision": 0.7492063492063492, "eval_recall": 0.923679060665362, "eval_runtime": 25.0105, "eval_samples_per_second": 40.863, "eval_steps_per_second": 1.279, "step": 192 }, { "epoch": 3.0, "grad_norm": 2.853325366973877, "learning_rate": 8.900833954925197e-05, "loss": 0.4412, "step": 288 }, { "epoch": 3.0, "eval_accuracy": 0.8258317025440313, "eval_f1": 0.8363970588235294, "eval_loss": 0.42110058665275574, "eval_precision": 0.7885615251299827, "eval_recall": 0.8904109589041096, "eval_runtime": 25.7114, "eval_samples_per_second": 39.749, "eval_steps_per_second": 1.245, "step": 288 }, { "epoch": 4.0, "grad_norm": 5.9698286056518555, "learning_rate": 7.417361629104331e-05, "loss": 0.4294, "step": 384 }, { "epoch": 4.0, "eval_accuracy": 0.8160469667318982, "eval_f1": 0.8192307692307692, "eval_loss": 0.41999757289886475, "eval_precision": 0.8052930056710775, "eval_recall": 0.8336594911937377, "eval_runtime": 25.0795, "eval_samples_per_second": 40.75, "eval_steps_per_second": 1.276, "step": 384 }, { "epoch": 5.0, "grad_norm": 1.9139764308929443, "learning_rate": 5.9338893032834646e-05, "loss": 0.4153, "step": 480 }, { "epoch": 5.0, "eval_accuracy": 0.8297455968688845, "eval_f1": 0.843806104129264, "eval_loss": 0.4074234664440155, "eval_precision": 0.7794361525704809, "eval_recall": 0.9197651663405088, "eval_runtime": 25.2807, "eval_samples_per_second": 40.426, "eval_steps_per_second": 1.266, "step": 480 }, { "epoch": 6.0, "grad_norm": 3.3249385356903076, "learning_rate": 4.4504169774625984e-05, "loss": 0.4067, "step": 576 }, { "epoch": 6.0, "eval_accuracy": 0.824853228962818, "eval_f1": 0.8397493285586393, "eval_loss": 0.4091223180294037, "eval_precision": 0.7739273927392739, "eval_recall": 0.9178082191780822, "eval_runtime": 25.2618, "eval_samples_per_second": 40.456, "eval_steps_per_second": 1.267, "step": 576 }, { "epoch": 7.0, "grad_norm": 2.5627498626708984, "learning_rate": 2.9669446516417323e-05, "loss": 0.3993, "step": 672 }, { "epoch": 7.0, "eval_accuracy": 0.8326810176125244, "eval_f1": 0.848, "eval_loss": 0.4094063341617584, "eval_precision": 0.7768729641693811, "eval_recall": 0.9334637964774951, "eval_runtime": 26.0578, "eval_samples_per_second": 39.221, "eval_steps_per_second": 1.228, "step": 672 }, { "epoch": 8.0, "grad_norm": 3.446059465408325, "learning_rate": 1.4834723258208661e-05, "loss": 0.4005, "step": 768 }, { "epoch": 8.0, "eval_accuracy": 0.8287671232876712, "eval_f1": 0.8424842484248425, "eval_loss": 0.4037325978279114, "eval_precision": 0.78, "eval_recall": 0.9158512720156555, "eval_runtime": 25.5767, "eval_samples_per_second": 39.958, "eval_steps_per_second": 1.251, "step": 768 }, { "epoch": 9.0, "grad_norm": 2.3571696281433105, "learning_rate": 0.0, "loss": 0.3936, "step": 864 }, { "epoch": 9.0, "eval_accuracy": 0.8326810176125244, "eval_f1": 0.8423963133640553, "eval_loss": 0.40162914991378784, "eval_precision": 0.7961672473867596, "eval_recall": 0.8943248532289628, "eval_runtime": 25.5021, "eval_samples_per_second": 40.075, "eval_steps_per_second": 1.255, "step": 864 } ], "logging_steps": 500, "max_steps": 864, "num_input_tokens_seen": 0, "num_train_epochs": 9, "save_steps": 500, "total_flos": 2121256775520.0, "train_batch_size": 32, "trial_name": null, "trial_params": { "alpha": 0.8475925399213161, "learning_rate": 0.00013351250932387796, "num_train_epochs": 9, "temperature": 10 } }