|
{
  "best_metric": 1.3073620796203613,
  "best_model_checkpoint": "mobilebert_sa_pre-training-complete/checkpoint-300000",
  "epoch": 41.98740377886634,
  "global_step": 300000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 1.0,
      "learning_rate": 4.882544181393798e-05,
      "loss": 1.6028,
      "step": 7145
    },
    {
      "epoch": 1.0,
      "eval_accuracy": 0.6935334549025108,
      "eval_loss": 1.4525387287139893,
      "eval_runtime": 1.4716,
      "eval_samples_per_second": 325.49,
      "eval_steps_per_second": 10.193,
      "step": 7145
    },
    {
      "epoch": 2.0,
      "learning_rate": 4.763421140380127e-05,
      "loss": 1.5524,
      "step": 14290
    },
    {
      "epoch": 2.0,
      "eval_accuracy": 0.6992782005371531,
      "eval_loss": 1.437490463256836,
      "eval_runtime": 1.5211,
      "eval_samples_per_second": 314.9,
      "eval_steps_per_second": 9.861,
      "step": 14290
    },
    {
      "epoch": 3.0,
      "learning_rate": 4.6442980993664556e-05,
      "loss": 1.5323,
      "step": 21435
    },
    {
      "epoch": 3.0,
      "eval_accuracy": 0.6993441976976554,
      "eval_loss": 1.4193694591522217,
      "eval_runtime": 1.4759,
      "eval_samples_per_second": 324.542,
      "eval_steps_per_second": 10.163,
      "step": 21435
    },
    {
      "epoch": 4.0,
      "learning_rate": 4.5251750583527844e-05,
      "loss": 1.5191,
      "step": 28580
    },
    {
      "epoch": 4.0,
      "eval_accuracy": 0.7026513032777716,
      "eval_loss": 1.4109910726547241,
      "eval_runtime": 1.4968,
      "eval_samples_per_second": 320.019,
      "eval_steps_per_second": 10.021,
      "step": 28580
    },
    {
      "epoch": 5.0,
      "learning_rate": 4.406052017339113e-05,
      "loss": 1.5025,
      "step": 35725
    },
    {
      "epoch": 5.0,
      "eval_accuracy": 0.7013675690761931,
      "eval_loss": 1.4167572259902954,
      "eval_runtime": 1.4782,
      "eval_samples_per_second": 324.039,
      "eval_steps_per_second": 10.147,
      "step": 35725
    },
    {
      "epoch": 6.0,
      "learning_rate": 4.286928976325442e-05,
      "loss": 1.4902,
      "step": 42870
    },
    {
      "epoch": 6.0,
      "eval_accuracy": 0.7011720396863318,
      "eval_loss": 1.3931331634521484,
      "eval_runtime": 1.4734,
      "eval_samples_per_second": 325.107,
      "eval_steps_per_second": 10.181,
      "step": 42870
    },
    {
      "epoch": 7.0,
      "learning_rate": 4.167805935311771e-05,
      "loss": 1.4813,
      "step": 50015
    },
    {
      "epoch": 7.0,
      "eval_accuracy": 0.7056545531078995,
      "eval_loss": 1.3738043308258057,
      "eval_runtime": 1.4644,
      "eval_samples_per_second": 327.106,
      "eval_steps_per_second": 10.243,
      "step": 50015
    },
    {
      "epoch": 8.0,
      "learning_rate": 4.0486828942981e-05,
      "loss": 1.4751,
      "step": 57160
    },
    {
      "epoch": 8.0,
      "eval_accuracy": 0.6995995407320283,
      "eval_loss": 1.4237422943115234,
      "eval_runtime": 1.459,
      "eval_samples_per_second": 328.317,
      "eval_steps_per_second": 10.281,
      "step": 57160
    },
    {
      "epoch": 9.0,
      "learning_rate": 3.929559853284429e-05,
      "loss": 1.4689,
      "step": 64305
    },
    {
      "epoch": 9.0,
      "eval_accuracy": 0.704691011235955,
      "eval_loss": 1.3969331979751587,
      "eval_runtime": 1.6056,
      "eval_samples_per_second": 298.322,
      "eval_steps_per_second": 9.342,
      "step": 64305
    },
    {
      "epoch": 10.0,
      "learning_rate": 3.8104368122707576e-05,
      "loss": 1.4626,
      "step": 71450
    },
    {
      "epoch": 10.0,
      "eval_accuracy": 0.7067709060449532,
      "eval_loss": 1.391621470451355,
      "eval_runtime": 1.4719,
      "eval_samples_per_second": 325.421,
      "eval_steps_per_second": 10.191,
      "step": 71450
    },
    {
      "epoch": 11.0,
      "learning_rate": 3.691313771257086e-05,
      "loss": 1.4566,
      "step": 78595
    },
    {
      "epoch": 11.0,
      "eval_accuracy": 0.7071985535088711,
      "eval_loss": 1.3686023950576782,
      "eval_runtime": 1.4629,
      "eval_samples_per_second": 327.432,
      "eval_steps_per_second": 10.254,
      "step": 78595
    },
    {
      "epoch": 12.0,
      "learning_rate": 3.572190730243415e-05,
      "loss": 1.451,
      "step": 85740
    },
    {
      "epoch": 12.0,
      "eval_accuracy": 0.7060222091689743,
      "eval_loss": 1.3811498880386353,
      "eval_runtime": 1.4641,
      "eval_samples_per_second": 327.173,
      "eval_steps_per_second": 10.246,
      "step": 85740
    },
    {
      "epoch": 13.0,
      "learning_rate": 3.453067689229744e-05,
      "loss": 1.4478,
      "step": 92885
    },
    {
      "epoch": 13.0,
      "eval_accuracy": 0.7091579355840124,
      "eval_loss": 1.3597520589828491,
      "eval_runtime": 1.4632,
      "eval_samples_per_second": 327.355,
      "eval_steps_per_second": 10.251,
      "step": 92885
    },
    {
      "epoch": 14.0,
      "learning_rate": 3.3339446482160726e-05,
      "loss": 1.4441,
      "step": 100030
    },
    {
      "epoch": 14.0,
      "eval_accuracy": 0.7054075191330094,
      "eval_loss": 1.3789618015289307,
      "eval_runtime": 1.4621,
      "eval_samples_per_second": 327.608,
      "eval_steps_per_second": 10.259,
      "step": 100030
    },
    {
      "epoch": 15.0,
      "learning_rate": 3.214821607202401e-05,
      "loss": 1.4379,
      "step": 107175
    },
    {
      "epoch": 15.0,
      "eval_accuracy": 0.7065809145017066,
      "eval_loss": 1.379388451576233,
      "eval_runtime": 1.5875,
      "eval_samples_per_second": 301.725,
      "eval_steps_per_second": 9.449,
      "step": 107175
    },
    {
      "epoch": 16.0,
      "learning_rate": 3.09569856618873e-05,
      "loss": 1.4353,
      "step": 114320
    },
    {
      "epoch": 16.0,
      "eval_accuracy": 0.710198236648509,
      "eval_loss": 1.3609341382980347,
      "eval_runtime": 1.4593,
      "eval_samples_per_second": 328.244,
      "eval_steps_per_second": 10.279,
      "step": 114320
    },
    {
      "epoch": 17.0,
      "learning_rate": 2.976575525175058e-05,
      "loss": 1.43,
      "step": 121465
    },
    {
      "epoch": 17.0,
      "eval_accuracy": 0.7083252258512857,
      "eval_loss": 1.3685261011123657,
      "eval_runtime": 1.4875,
      "eval_samples_per_second": 322.019,
      "eval_steps_per_second": 10.084,
      "step": 121465
    },
    {
      "epoch": 18.0,
      "learning_rate": 2.857452484161387e-05,
      "loss": 1.4278,
      "step": 128610
    },
    {
      "epoch": 18.0,
      "eval_accuracy": 0.7036037555518075,
      "eval_loss": 1.3953258991241455,
      "eval_runtime": 1.4616,
      "eval_samples_per_second": 327.715,
      "eval_steps_per_second": 10.262,
      "step": 128610
    },
    {
      "epoch": 19.0,
      "learning_rate": 2.7383294431477156e-05,
      "loss": 1.4219,
      "step": 135755
    },
    {
      "epoch": 19.0,
      "eval_accuracy": 0.7085320020194088,
      "eval_loss": 1.3756214380264282,
      "eval_runtime": 1.4616,
      "eval_samples_per_second": 327.73,
      "eval_steps_per_second": 10.263,
      "step": 135755
    },
    {
      "epoch": 20.0,
      "learning_rate": 2.6192064021340444e-05,
      "loss": 1.4197,
      "step": 142900
    },
    {
      "epoch": 20.0,
      "eval_accuracy": 0.7089573167311684,
      "eval_loss": 1.3597127199172974,
      "eval_runtime": 1.4718,
      "eval_samples_per_second": 325.445,
      "eval_steps_per_second": 10.191,
      "step": 142900
    },
    {
      "epoch": 21.0,
      "learning_rate": 2.5000833611203735e-05,
      "loss": 1.4169,
      "step": 150045
    },
    {
      "epoch": 21.0,
      "eval_accuracy": 0.7060544426179265,
      "eval_loss": 1.367296576499939,
      "eval_runtime": 1.4625,
      "eval_samples_per_second": 327.518,
      "eval_steps_per_second": 10.256,
      "step": 150045
    },
    {
      "epoch": 22.0,
      "learning_rate": 2.3809603201067022e-05,
      "loss": 1.4146,
      "step": 157190
    },
    {
      "epoch": 22.0,
      "eval_accuracy": 0.707288269036104,
      "eval_loss": 1.3753403425216675,
      "eval_runtime": 1.4573,
      "eval_samples_per_second": 328.688,
      "eval_steps_per_second": 10.293,
      "step": 157190
    },
    {
      "epoch": 23.0,
      "learning_rate": 2.2618372790930313e-05,
      "loss": 1.4109,
      "step": 164335
    },
    {
      "epoch": 23.0,
      "eval_accuracy": 0.7081938623386121,
      "eval_loss": 1.3696134090423584,
      "eval_runtime": 1.4581,
      "eval_samples_per_second": 328.502,
      "eval_steps_per_second": 10.287,
      "step": 164335
    },
    {
      "epoch": 24.0,
      "learning_rate": 2.14271423807936e-05,
      "loss": 1.4073,
      "step": 171480
    },
    {
      "epoch": 24.0,
      "eval_accuracy": 0.7092472511981956,
      "eval_loss": 1.356264352798462,
      "eval_runtime": 1.4561,
      "eval_samples_per_second": 328.957,
      "eval_steps_per_second": 10.301,
      "step": 171480
    },
    {
      "epoch": 25.0,
      "learning_rate": 2.0235911970656888e-05,
      "loss": 1.4054,
      "step": 178625
    },
    {
      "epoch": 25.0,
      "eval_accuracy": 0.7103286516069584,
      "eval_loss": 1.371171474456787,
      "eval_runtime": 1.475,
      "eval_samples_per_second": 324.736,
      "eval_steps_per_second": 10.169,
      "step": 178625
    },
    {
      "epoch": 26.0,
      "learning_rate": 1.9044681560520176e-05,
      "loss": 1.402,
      "step": 185770
    },
    {
      "epoch": 26.0,
      "eval_accuracy": 0.7112762628520339,
      "eval_loss": 1.3528329133987427,
      "eval_runtime": 1.467,
      "eval_samples_per_second": 326.525,
      "eval_steps_per_second": 10.225,
      "step": 185770
    },
    {
      "epoch": 27.0,
      "learning_rate": 1.7853451150383463e-05,
      "loss": 1.4001,
      "step": 192915
    },
    {
      "epoch": 27.0,
      "eval_accuracy": 0.712307605886979,
      "eval_loss": 1.336666226387024,
      "eval_runtime": 1.4596,
      "eval_samples_per_second": 328.179,
      "eval_steps_per_second": 10.277,
      "step": 192915
    },
    {
      "epoch": 28.0,
      "learning_rate": 1.666222074024675e-05,
      "loss": 1.397,
      "step": 200060
    },
    {
      "epoch": 28.0,
      "eval_accuracy": 0.7117655307810966,
      "eval_loss": 1.3508223295211792,
      "eval_runtime": 1.458,
      "eval_samples_per_second": 328.539,
      "eval_steps_per_second": 10.288,
      "step": 200060
    },
    {
      "epoch": 29.0,
      "learning_rate": 1.5470990330110038e-05,
      "loss": 1.3955,
      "step": 207205
    },
    {
      "epoch": 29.0,
      "eval_accuracy": 0.7116529947185077,
      "eval_loss": 1.3571882247924805,
      "eval_runtime": 1.6349,
      "eval_samples_per_second": 292.987,
      "eval_steps_per_second": 9.175,
      "step": 207205
    },
    {
      "epoch": 30.0,
      "learning_rate": 1.4279759919973326e-05,
      "loss": 1.3937,
      "step": 214350
    },
    {
      "epoch": 30.0,
      "eval_accuracy": 0.7095319458838688,
      "eval_loss": 1.356575846672058,
      "eval_runtime": 1.4657,
      "eval_samples_per_second": 326.804,
      "eval_steps_per_second": 10.234,
      "step": 214350
    },
    {
      "epoch": 31.0,
      "learning_rate": 1.3088529509836615e-05,
      "loss": 1.3901,
      "step": 221495
    },
    {
      "epoch": 31.0,
      "eval_accuracy": 0.7116992819935238,
      "eval_loss": 1.3515229225158691,
      "eval_runtime": 1.461,
      "eval_samples_per_second": 327.859,
      "eval_steps_per_second": 10.267,
      "step": 221495
    },
    {
      "epoch": 32.0,
      "learning_rate": 1.18972990996999e-05,
      "loss": 1.3874,
      "step": 228640
    },
    {
      "epoch": 32.0,
      "eval_accuracy": 0.7118393529493795,
      "eval_loss": 1.3445274829864502,
      "eval_runtime": 1.4728,
      "eval_samples_per_second": 325.229,
      "eval_steps_per_second": 10.185,
      "step": 228640
    },
    {
      "epoch": 33.0,
      "learning_rate": 1.0706068689563188e-05,
      "loss": 1.386,
      "step": 235785
    },
    {
      "epoch": 33.0,
      "eval_accuracy": 0.7097090095131505,
      "eval_loss": 1.361108660697937,
      "eval_runtime": 1.4621,
      "eval_samples_per_second": 327.607,
      "eval_steps_per_second": 10.259,
      "step": 235785
    },
    {
      "epoch": 34.0,
      "learning_rate": 9.514838279426476e-06,
      "loss": 1.3833,
      "step": 242930
    },
    {
      "epoch": 34.0,
      "eval_accuracy": 0.7086746246959827,
      "eval_loss": 1.350243091583252,
      "eval_runtime": 1.4812,
      "eval_samples_per_second": 323.387,
      "eval_steps_per_second": 10.127,
      "step": 242930
    },
    {
      "epoch": 35.0,
      "learning_rate": 8.323607869289763e-06,
      "loss": 1.3822,
      "step": 250075
    },
    {
      "epoch": 35.0,
      "eval_accuracy": 0.7108018854610629,
      "eval_loss": 1.3657063245773315,
      "eval_runtime": 1.4712,
      "eval_samples_per_second": 325.58,
      "eval_steps_per_second": 10.196,
      "step": 250075
    },
    {
      "epoch": 36.0,
      "learning_rate": 7.132377459153051e-06,
      "loss": 1.3797,
      "step": 257220
    },
    {
      "epoch": 36.0,
      "eval_accuracy": 0.7107789319595755,
      "eval_loss": 1.3575541973114014,
      "eval_runtime": 1.4667,
      "eval_samples_per_second": 326.589,
      "eval_steps_per_second": 10.227,
      "step": 257220
    },
    {
      "epoch": 37.0,
      "learning_rate": 5.941147049016339e-06,
      "loss": 1.3793,
      "step": 264365
    },
    {
      "epoch": 37.0,
      "eval_accuracy": 0.710604865960802,
      "eval_loss": 1.3471879959106445,
      "eval_runtime": 1.4747,
      "eval_samples_per_second": 324.802,
      "eval_steps_per_second": 10.171,
      "step": 264365
    },
    {
      "epoch": 38.0,
      "learning_rate": 4.749916638879627e-06,
      "loss": 1.3763,
      "step": 271510
    },
    {
      "epoch": 38.0,
      "eval_accuracy": 0.7155870445344129,
      "eval_loss": 1.3322880268096924,
      "eval_runtime": 1.4923,
      "eval_samples_per_second": 320.979,
      "eval_steps_per_second": 10.052,
      "step": 271510
    },
    {
      "epoch": 39.0,
      "learning_rate": 3.5586862287429143e-06,
      "loss": 1.3762,
      "step": 278655
    },
    {
      "epoch": 39.0,
      "eval_accuracy": 0.7144579664629017,
      "eval_loss": 1.3325406312942505,
      "eval_runtime": 1.6301,
      "eval_samples_per_second": 293.852,
      "eval_steps_per_second": 9.202,
      "step": 278655
    },
    {
      "epoch": 40.0,
      "learning_rate": 2.3674558186062022e-06,
      "loss": 1.3748,
      "step": 285800
    },
    {
      "epoch": 40.0,
      "eval_accuracy": 0.7138002117109589,
      "eval_loss": 1.3242748975753784,
      "eval_runtime": 1.4707,
      "eval_samples_per_second": 325.685,
      "eval_steps_per_second": 10.199,
      "step": 285800
    },
    {
      "epoch": 41.0,
      "learning_rate": 1.17622540846949e-06,
      "loss": 1.3733,
      "step": 292945
    },
    {
      "epoch": 41.0,
      "eval_accuracy": 0.7170023313951855,
      "eval_loss": 1.3217717409133911,
      "eval_runtime": 1.459,
      "eval_samples_per_second": 328.301,
      "eval_steps_per_second": 10.281,
      "step": 292945
    },
    {
      "epoch": 41.99,
      "learning_rate": 0.0,
      "loss": 1.3722,
      "step": 300000
    },
    {
      "epoch": 41.99,
      "eval_accuracy": 0.7186174960946218,
      "eval_loss": 1.3073620796203613,
      "eval_runtime": 1.4662,
      "eval_samples_per_second": 326.688,
      "eval_steps_per_second": 10.23,
      "step": 300000
    },
    {
      "epoch": 41.99,
      "step": 300000,
      "total_flos": 9.562938924439962e+17,
      "train_loss": 1.4300982942708333,
      "train_runtime": 103608.4476,
      "train_samples_per_second": 92.657,
      "train_steps_per_second": 2.896
    }
  ],
  "max_steps": 300000,
  "num_train_epochs": 42,
  "total_flos": 9.562938924439962e+17,
  "trial_name": null,
  "trial_params": null
}
|
|