{ "best_metric": 0.6666666666666666, "best_model_checkpoint": "tiny-bert-sst2-distilled/run-3/checkpoint-214", "epoch": 18.0, "eval_steps": 500, "global_step": 3852, "is_hyper_param_search": true, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "grad_norm": 0.38364019989967346, "learning_rate": 3.063781435548824e-05, "loss": 0.0796, "step": 214 }, { "epoch": 1.0, "eval_accuracy": 0.6666666666666666, "eval_f1": 0.0, "eval_loss": 0.054877836257219315, "eval_mcc": 0.0, "eval_precision": 0.0, "eval_recall": 0.0, "eval_runtime": 3.1144, "eval_samples_per_second": 548.1, "eval_steps_per_second": 17.339, "step": 214 }, { "epoch": 2.0, "grad_norm": 0.44893428683280945, "learning_rate": 2.883558998163599e-05, "loss": 0.054, "step": 428 }, { "epoch": 2.0, "eval_accuracy": 0.6666666666666666, "eval_f1": 0.0, "eval_loss": 0.050605349242687225, "eval_mcc": 0.0, "eval_precision": 0.0, "eval_recall": 0.0, "eval_runtime": 3.2339, "eval_samples_per_second": 527.845, "eval_steps_per_second": 16.698, "step": 428 }, { "epoch": 3.0, "grad_norm": 0.47303804755210876, "learning_rate": 2.7033365607783743e-05, "loss": 0.0517, "step": 642 }, { "epoch": 3.0, "eval_accuracy": 0.6666666666666666, "eval_f1": 0.0, "eval_loss": 0.04936650022864342, "eval_mcc": 0.0, "eval_precision": 0.0, "eval_recall": 0.0, "eval_runtime": 3.1605, "eval_samples_per_second": 540.104, "eval_steps_per_second": 17.086, "step": 642 }, { "epoch": 4.0, "grad_norm": 0.5809402465820312, "learning_rate": 2.523114123393149e-05, "loss": 0.0508, "step": 856 }, { "epoch": 4.0, "eval_accuracy": 0.6666666666666666, "eval_f1": 0.013864818024263433, "eval_loss": 0.04913894087076187, "eval_mcc": 0.024260699053001704, "eval_precision": 0.5, "eval_recall": 0.007029876977152899, "eval_runtime": 3.8427, "eval_samples_per_second": 444.219, "eval_steps_per_second": 14.053, "step": 856 }, { "epoch": 5.0, "grad_norm": 0.7497197389602661, "learning_rate": 2.3428916860079242e-05, "loss": 0.0505, "step": 1070 }, { "epoch": 5.0, "eval_accuracy": 0.6666666666666666, "eval_f1": 0.017271157167530225, "eval_loss": 0.04886631295084953, "eval_mcc": 0.027140265094376777, "eval_precision": 0.5, "eval_recall": 0.008787346221441126, "eval_runtime": 3.1197, "eval_samples_per_second": 547.164, "eval_steps_per_second": 17.309, "step": 1070 }, { "epoch": 6.0, "grad_norm": 0.3793525993824005, "learning_rate": 2.162669248622699e-05, "loss": 0.0503, "step": 1284 }, { "epoch": 6.0, "eval_accuracy": 0.664323374340949, "eval_f1": 0.017152658662092625, "eval_loss": 0.048080265522003174, "eval_mcc": 0.004592958330124466, "eval_precision": 0.35714285714285715, "eval_recall": 0.008787346221441126, "eval_runtime": 3.3992, "eval_samples_per_second": 502.172, "eval_steps_per_second": 15.886, "step": 1284 }, { "epoch": 7.0, "grad_norm": 0.525617778301239, "learning_rate": 1.9824468112374745e-05, "loss": 0.05, "step": 1498 }, { "epoch": 7.0, "eval_accuracy": 0.664323374340949, "eval_f1": 0.017152658662092625, "eval_loss": 0.04796423017978668, "eval_mcc": 0.004592958330124466, "eval_precision": 0.35714285714285715, "eval_recall": 0.008787346221441126, "eval_runtime": 3.1569, "eval_samples_per_second": 540.727, "eval_steps_per_second": 17.106, "step": 1498 }, { "epoch": 8.0, "grad_norm": 0.456257700920105, "learning_rate": 1.8022243738522493e-05, "loss": 0.0497, "step": 1712 }, { "epoch": 8.0, "eval_accuracy": 0.6654950205038078, "eval_f1": 0.017211703958691912, "eval_loss": 0.04803091287612915, "eval_mcc": 0.01487410293271824, "eval_precision": 0.4166666666666667, "eval_recall": 0.008787346221441126, "eval_runtime": 3.822, "eval_samples_per_second": 446.626, "eval_steps_per_second": 14.129, "step": 1712 }, { "epoch": 9.0, "grad_norm": 0.4184946119785309, "learning_rate": 1.6220019364670245e-05, "loss": 0.0498, "step": 1926 }, { "epoch": 9.0, "eval_accuracy": 0.6666666666666666, "eval_f1": 0.017271157167530225, "eval_loss": 0.04849984124302864, "eval_mcc": 0.027140265094376777, "eval_precision": 0.5, "eval_recall": 0.008787346221441126, "eval_runtime": 3.1433, "eval_samples_per_second": 543.052, "eval_steps_per_second": 17.179, "step": 1926 }, { "epoch": 10.0, "grad_norm": 0.32953304052352905, "learning_rate": 1.4417794990817994e-05, "loss": 0.0494, "step": 2140 }, { "epoch": 10.0, "eval_accuracy": 0.664323374340949, "eval_f1": 0.017152658662092625, "eval_loss": 0.04773561656475067, "eval_mcc": 0.004592958330124466, "eval_precision": 0.35714285714285715, "eval_recall": 0.008787346221441126, "eval_runtime": 3.2081, "eval_samples_per_second": 532.094, "eval_steps_per_second": 16.833, "step": 2140 }, { "epoch": 11.0, "grad_norm": 0.2775495946407318, "learning_rate": 1.2615570616965746e-05, "loss": 0.0494, "step": 2354 }, { "epoch": 11.0, "eval_accuracy": 0.664323374340949, "eval_f1": 0.017152658662092625, "eval_loss": 0.04799096658825874, "eval_mcc": 0.004592958330124466, "eval_precision": 0.35714285714285715, "eval_recall": 0.008787346221441126, "eval_runtime": 3.142, "eval_samples_per_second": 543.29, "eval_steps_per_second": 17.187, "step": 2354 }, { "epoch": 12.0, "grad_norm": 0.2784470319747925, "learning_rate": 1.0813346243113495e-05, "loss": 0.0494, "step": 2568 }, { "epoch": 12.0, "eval_accuracy": 0.6660808435852372, "eval_f1": 0.01724137931034483, "eval_loss": 0.04788310080766678, "eval_mcc": 0.020707884164064556, "eval_precision": 0.45454545454545453, "eval_recall": 0.008787346221441126, "eval_runtime": 3.8964, "eval_samples_per_second": 438.099, "eval_steps_per_second": 13.859, "step": 2568 }, { "epoch": 13.0, "grad_norm": 0.7122122049331665, "learning_rate": 9.011121869261247e-06, "loss": 0.0493, "step": 2782 }, { "epoch": 13.0, "eval_accuracy": 0.6654950205038078, "eval_f1": 0.017211703958691912, "eval_loss": 0.04763857275247574, "eval_mcc": 0.01487410293271824, "eval_precision": 0.4166666666666667, "eval_recall": 0.008787346221441126, "eval_runtime": 3.1513, "eval_samples_per_second": 541.682, "eval_steps_per_second": 17.136, "step": 2782 }, { "epoch": 14.0, "grad_norm": 0.3367031216621399, "learning_rate": 7.208897495408997e-06, "loss": 0.0491, "step": 2996 }, { "epoch": 14.0, "eval_accuracy": 0.6649091974223784, "eval_f1": 0.01718213058419244, "eval_loss": 0.0474877767264843, "eval_mcc": 0.009529862152017439, "eval_precision": 0.38461538461538464, "eval_recall": 0.008787346221441126, "eval_runtime": 3.2218, "eval_samples_per_second": 529.831, "eval_steps_per_second": 16.761, "step": 2996 }, { "epoch": 15.0, "grad_norm": 0.6209991574287415, "learning_rate": 5.406673121556748e-06, "loss": 0.049, "step": 3210 }, { "epoch": 15.0, "eval_accuracy": 0.6649091974223784, "eval_f1": 0.01718213058419244, "eval_loss": 0.04748029261827469, "eval_mcc": 0.009529862152017439, "eval_precision": 0.38461538461538464, "eval_recall": 0.008787346221441126, "eval_runtime": 3.1605, "eval_samples_per_second": 540.101, "eval_steps_per_second": 17.086, "step": 3210 }, { "epoch": 16.0, "grad_norm": 0.8616418838500977, "learning_rate": 3.6044487477044986e-06, "loss": 0.0491, "step": 3424 }, { "epoch": 16.0, "eval_accuracy": 0.664323374340949, "eval_f1": 0.017152658662092625, "eval_loss": 0.04751123487949371, "eval_mcc": 0.004592958330124466, "eval_precision": 0.35714285714285715, "eval_recall": 0.008787346221441126, "eval_runtime": 3.9087, "eval_samples_per_second": 436.72, "eval_steps_per_second": 13.815, "step": 3424 }, { "epoch": 17.0, "grad_norm": 0.38889187574386597, "learning_rate": 1.8022243738522493e-06, "loss": 0.0493, "step": 3638 }, { "epoch": 17.0, "eval_accuracy": 0.6649091974223784, "eval_f1": 0.01718213058419244, "eval_loss": 0.0475541353225708, "eval_mcc": 0.009529862152017439, "eval_precision": 0.38461538461538464, "eval_recall": 0.008787346221441126, "eval_runtime": 3.1517, "eval_samples_per_second": 541.62, "eval_steps_per_second": 17.134, "step": 3638 }, { "epoch": 18.0, "grad_norm": 0.3475455343723297, "learning_rate": 0.0, "loss": 0.0489, "step": 3852 }, { "epoch": 18.0, "eval_accuracy": 0.6649091974223784, "eval_f1": 0.01718213058419244, "eval_loss": 0.047414712607860565, "eval_mcc": 0.009529862152017439, "eval_precision": 0.38461538461538464, "eval_recall": 0.008787346221441126, "eval_runtime": 3.2036, "eval_samples_per_second": 532.83, "eval_steps_per_second": 16.856, "step": 3852 } ], "logging_steps": 500, "max_steps": 3852, "num_input_tokens_seen": 0, "num_train_epochs": 18, "save_steps": 500, "total_flos": 9445961959920.0, "train_batch_size": 32, "trial_name": null, "trial_params": { "alpha": 0.06448750556013427, "learning_rate": 3.244003872934049e-05, "num_train_epochs": 18, "temperature": 34 } }