|
{ |
|
"best_metric": 0.996875, |
|
"best_model_checkpoint": "vit-msn-small-finetuned-alzheimers/checkpoint-765", |
|
"epoch": 48.888888888888886, |
|
"eval_steps": 500, |
|
"global_step": 1100, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.4444444444444444, |
|
"grad_norm": 8.449820518493652, |
|
"learning_rate": 4.5454545454545455e-06, |
|
"loss": 0.2587, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.8888888888888888, |
|
"grad_norm": 13.680850982666016, |
|
"learning_rate": 9.090909090909091e-06, |
|
"loss": 0.2996, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.9777777777777777, |
|
"eval_accuracy": 0.84375, |
|
"eval_loss": 0.38971763849258423, |
|
"eval_runtime": 3.5179, |
|
"eval_samples_per_second": 181.926, |
|
"eval_steps_per_second": 2.843, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 1.3333333333333333, |
|
"grad_norm": 9.488574981689453, |
|
"learning_rate": 1.3636363636363637e-05, |
|
"loss": 0.4023, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 1.7777777777777777, |
|
"grad_norm": 18.977561950683594, |
|
"learning_rate": 1.8181818181818182e-05, |
|
"loss": 0.3703, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.859375, |
|
"eval_loss": 0.3594878911972046, |
|
"eval_runtime": 3.9024, |
|
"eval_samples_per_second": 164.001, |
|
"eval_steps_per_second": 2.563, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 2.2222222222222223, |
|
"grad_norm": 11.33133602142334, |
|
"learning_rate": 2.272727272727273e-05, |
|
"loss": 0.3541, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 2.6666666666666665, |
|
"grad_norm": 16.366662979125977, |
|
"learning_rate": 2.7272727272727273e-05, |
|
"loss": 0.3087, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 2.977777777777778, |
|
"eval_accuracy": 0.8625, |
|
"eval_loss": 0.3777163326740265, |
|
"eval_runtime": 3.8599, |
|
"eval_samples_per_second": 165.808, |
|
"eval_steps_per_second": 2.591, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 3.111111111111111, |
|
"grad_norm": 18.307331085205078, |
|
"learning_rate": 3.181818181818182e-05, |
|
"loss": 0.3195, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 3.5555555555555554, |
|
"grad_norm": 42.80950164794922, |
|
"learning_rate": 3.6363636363636364e-05, |
|
"loss": 0.3483, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 18.051124572753906, |
|
"learning_rate": 4.0909090909090915e-05, |
|
"loss": 0.486, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.81875, |
|
"eval_loss": 0.4530211389064789, |
|
"eval_runtime": 3.6057, |
|
"eval_samples_per_second": 177.495, |
|
"eval_steps_per_second": 2.773, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 4.444444444444445, |
|
"grad_norm": 15.844127655029297, |
|
"learning_rate": 4.545454545454546e-05, |
|
"loss": 0.3521, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 4.888888888888889, |
|
"grad_norm": 11.87112808227539, |
|
"learning_rate": 5e-05, |
|
"loss": 0.3307, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 4.977777777777778, |
|
"eval_accuracy": 0.8234375, |
|
"eval_loss": 0.45600825548171997, |
|
"eval_runtime": 3.572, |
|
"eval_samples_per_second": 179.171, |
|
"eval_steps_per_second": 2.8, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 5.333333333333333, |
|
"grad_norm": 17.418668746948242, |
|
"learning_rate": 4.94949494949495e-05, |
|
"loss": 0.3404, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 5.777777777777778, |
|
"grad_norm": 20.148906707763672, |
|
"learning_rate": 4.898989898989899e-05, |
|
"loss": 0.306, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.8671875, |
|
"eval_loss": 0.3470742106437683, |
|
"eval_runtime": 3.5697, |
|
"eval_samples_per_second": 179.289, |
|
"eval_steps_per_second": 2.801, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 6.222222222222222, |
|
"grad_norm": 11.451733589172363, |
|
"learning_rate": 4.848484848484849e-05, |
|
"loss": 0.2873, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 6.666666666666667, |
|
"grad_norm": 15.43708610534668, |
|
"learning_rate": 4.797979797979798e-05, |
|
"loss": 0.3005, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 6.977777777777778, |
|
"eval_accuracy": 0.8859375, |
|
"eval_loss": 0.3024638891220093, |
|
"eval_runtime": 3.8788, |
|
"eval_samples_per_second": 164.998, |
|
"eval_steps_per_second": 2.578, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 7.111111111111111, |
|
"grad_norm": 20.696516036987305, |
|
"learning_rate": 4.7474747474747476e-05, |
|
"loss": 0.3206, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 7.555555555555555, |
|
"grad_norm": 12.01241397857666, |
|
"learning_rate": 4.696969696969697e-05, |
|
"loss": 0.2851, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 17.638214111328125, |
|
"learning_rate": 4.6464646464646464e-05, |
|
"loss": 0.319, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.8984375, |
|
"eval_loss": 0.24509796500205994, |
|
"eval_runtime": 3.9737, |
|
"eval_samples_per_second": 161.061, |
|
"eval_steps_per_second": 2.517, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 8.444444444444445, |
|
"grad_norm": 21.502132415771484, |
|
"learning_rate": 4.595959595959596e-05, |
|
"loss": 0.2968, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 8.88888888888889, |
|
"grad_norm": 20.09746742248535, |
|
"learning_rate": 4.545454545454546e-05, |
|
"loss": 0.3489, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 8.977777777777778, |
|
"eval_accuracy": 0.928125, |
|
"eval_loss": 0.18142804503440857, |
|
"eval_runtime": 3.7455, |
|
"eval_samples_per_second": 170.872, |
|
"eval_steps_per_second": 2.67, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 9.333333333333334, |
|
"grad_norm": 16.205760955810547, |
|
"learning_rate": 4.494949494949495e-05, |
|
"loss": 0.2915, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 9.777777777777779, |
|
"grad_norm": 27.511030197143555, |
|
"learning_rate": 4.4444444444444447e-05, |
|
"loss": 0.3251, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.915625, |
|
"eval_loss": 0.24511559307575226, |
|
"eval_runtime": 3.6361, |
|
"eval_samples_per_second": 176.011, |
|
"eval_steps_per_second": 2.75, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 10.222222222222221, |
|
"grad_norm": 11.171629905700684, |
|
"learning_rate": 4.3939393939393944e-05, |
|
"loss": 0.308, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 10.666666666666666, |
|
"grad_norm": 12.315302848815918, |
|
"learning_rate": 4.343434343434344e-05, |
|
"loss": 0.3034, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 10.977777777777778, |
|
"eval_accuracy": 0.940625, |
|
"eval_loss": 0.15658709406852722, |
|
"eval_runtime": 3.5876, |
|
"eval_samples_per_second": 178.392, |
|
"eval_steps_per_second": 2.787, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 11.11111111111111, |
|
"grad_norm": 14.539319038391113, |
|
"learning_rate": 4.292929292929293e-05, |
|
"loss": 0.2847, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 11.555555555555555, |
|
"grad_norm": 17.26177215576172, |
|
"learning_rate": 4.242424242424243e-05, |
|
"loss": 0.2754, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"grad_norm": 12.163039207458496, |
|
"learning_rate": 4.191919191919192e-05, |
|
"loss": 0.2746, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.8921875, |
|
"eval_loss": 0.24925951659679413, |
|
"eval_runtime": 3.5808, |
|
"eval_samples_per_second": 178.73, |
|
"eval_steps_per_second": 2.793, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 12.444444444444445, |
|
"grad_norm": 11.92519474029541, |
|
"learning_rate": 4.141414141414142e-05, |
|
"loss": 0.2889, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 12.88888888888889, |
|
"grad_norm": 13.212408065795898, |
|
"learning_rate": 4.0909090909090915e-05, |
|
"loss": 0.2369, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 12.977777777777778, |
|
"eval_accuracy": 0.9375, |
|
"eval_loss": 0.1622403860092163, |
|
"eval_runtime": 3.8343, |
|
"eval_samples_per_second": 166.914, |
|
"eval_steps_per_second": 2.608, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 13.333333333333334, |
|
"grad_norm": 11.68896770477295, |
|
"learning_rate": 4.0404040404040405e-05, |
|
"loss": 0.2465, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 13.777777777777779, |
|
"grad_norm": 14.610076904296875, |
|
"learning_rate": 3.98989898989899e-05, |
|
"loss": 0.2231, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 0.9359375, |
|
"eval_loss": 0.17805945873260498, |
|
"eval_runtime": 3.9695, |
|
"eval_samples_per_second": 161.231, |
|
"eval_steps_per_second": 2.519, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 14.222222222222221, |
|
"grad_norm": 12.407272338867188, |
|
"learning_rate": 3.939393939393939e-05, |
|
"loss": 0.2177, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 14.666666666666666, |
|
"grad_norm": 7.3430256843566895, |
|
"learning_rate": 3.888888888888889e-05, |
|
"loss": 0.2281, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 14.977777777777778, |
|
"eval_accuracy": 0.953125, |
|
"eval_loss": 0.12681424617767334, |
|
"eval_runtime": 3.6554, |
|
"eval_samples_per_second": 175.082, |
|
"eval_steps_per_second": 2.736, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 15.11111111111111, |
|
"grad_norm": 10.262022018432617, |
|
"learning_rate": 3.838383838383838e-05, |
|
"loss": 0.209, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 15.555555555555555, |
|
"grad_norm": 9.078124046325684, |
|
"learning_rate": 3.787878787878788e-05, |
|
"loss": 0.2134, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"grad_norm": 14.094355583190918, |
|
"learning_rate": 3.7373737373737376e-05, |
|
"loss": 0.2001, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.9140625, |
|
"eval_loss": 0.24309130012989044, |
|
"eval_runtime": 3.5892, |
|
"eval_samples_per_second": 178.311, |
|
"eval_steps_per_second": 2.786, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 16.444444444444443, |
|
"grad_norm": 12.868298530578613, |
|
"learning_rate": 3.686868686868687e-05, |
|
"loss": 0.2312, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 16.88888888888889, |
|
"grad_norm": 7.863047122955322, |
|
"learning_rate": 3.6363636363636364e-05, |
|
"loss": 0.183, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 16.977777777777778, |
|
"eval_accuracy": 0.9625, |
|
"eval_loss": 0.10167054831981659, |
|
"eval_runtime": 3.6007, |
|
"eval_samples_per_second": 177.743, |
|
"eval_steps_per_second": 2.777, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 17.333333333333332, |
|
"grad_norm": 8.919840812683105, |
|
"learning_rate": 3.5858585858585855e-05, |
|
"loss": 0.1997, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 17.77777777777778, |
|
"grad_norm": 11.91215705871582, |
|
"learning_rate": 3.535353535353535e-05, |
|
"loss": 0.1891, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_accuracy": 0.9390625, |
|
"eval_loss": 0.180230051279068, |
|
"eval_runtime": 3.5491, |
|
"eval_samples_per_second": 180.325, |
|
"eval_steps_per_second": 2.818, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 18.22222222222222, |
|
"grad_norm": 9.109786033630371, |
|
"learning_rate": 3.484848484848485e-05, |
|
"loss": 0.213, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 18.666666666666668, |
|
"grad_norm": 5.232081413269043, |
|
"learning_rate": 3.434343434343435e-05, |
|
"loss": 0.1862, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 18.977777777777778, |
|
"eval_accuracy": 0.9765625, |
|
"eval_loss": 0.08689282089471817, |
|
"eval_runtime": 3.9268, |
|
"eval_samples_per_second": 162.982, |
|
"eval_steps_per_second": 2.547, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 19.11111111111111, |
|
"grad_norm": 13.340733528137207, |
|
"learning_rate": 3.3838383838383844e-05, |
|
"loss": 0.1664, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 19.555555555555557, |
|
"grad_norm": 7.66475772857666, |
|
"learning_rate": 3.3333333333333335e-05, |
|
"loss": 0.1727, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"grad_norm": 13.612215995788574, |
|
"learning_rate": 3.282828282828283e-05, |
|
"loss": 0.1935, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_accuracy": 0.96875, |
|
"eval_loss": 0.10791148245334625, |
|
"eval_runtime": 3.9472, |
|
"eval_samples_per_second": 162.141, |
|
"eval_steps_per_second": 2.533, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 20.444444444444443, |
|
"grad_norm": 9.189305305480957, |
|
"learning_rate": 3.232323232323233e-05, |
|
"loss": 0.1945, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 20.88888888888889, |
|
"grad_norm": 9.650483131408691, |
|
"learning_rate": 3.181818181818182e-05, |
|
"loss": 0.1797, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 20.977777777777778, |
|
"eval_accuracy": 0.95625, |
|
"eval_loss": 0.12502644956111908, |
|
"eval_runtime": 3.6171, |
|
"eval_samples_per_second": 176.939, |
|
"eval_steps_per_second": 2.765, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 21.333333333333332, |
|
"grad_norm": 7.25011682510376, |
|
"learning_rate": 3.131313131313132e-05, |
|
"loss": 0.1767, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 21.77777777777778, |
|
"grad_norm": 9.036290168762207, |
|
"learning_rate": 3.080808080808081e-05, |
|
"loss": 0.1605, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"eval_accuracy": 0.971875, |
|
"eval_loss": 0.06545940786600113, |
|
"eval_runtime": 3.5923, |
|
"eval_samples_per_second": 178.16, |
|
"eval_steps_per_second": 2.784, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 22.22222222222222, |
|
"grad_norm": 5.982744216918945, |
|
"learning_rate": 3.0303030303030306e-05, |
|
"loss": 0.1493, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 22.666666666666668, |
|
"grad_norm": 13.987672805786133, |
|
"learning_rate": 2.9797979797979796e-05, |
|
"loss": 0.1848, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 22.977777777777778, |
|
"eval_accuracy": 0.9765625, |
|
"eval_loss": 0.08063917607069016, |
|
"eval_runtime": 3.5801, |
|
"eval_samples_per_second": 178.764, |
|
"eval_steps_per_second": 2.793, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 23.11111111111111, |
|
"grad_norm": 15.488668441772461, |
|
"learning_rate": 2.9292929292929294e-05, |
|
"loss": 0.1426, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 23.555555555555557, |
|
"grad_norm": 11.646829605102539, |
|
"learning_rate": 2.878787878787879e-05, |
|
"loss": 0.1667, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"grad_norm": 6.619264602661133, |
|
"learning_rate": 2.8282828282828282e-05, |
|
"loss": 0.1498, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"eval_accuracy": 0.9578125, |
|
"eval_loss": 0.11159060150384903, |
|
"eval_runtime": 3.6016, |
|
"eval_samples_per_second": 177.701, |
|
"eval_steps_per_second": 2.777, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 24.444444444444443, |
|
"grad_norm": 7.8661627769470215, |
|
"learning_rate": 2.777777777777778e-05, |
|
"loss": 0.1287, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 24.88888888888889, |
|
"grad_norm": 7.934934616088867, |
|
"learning_rate": 2.7272727272727273e-05, |
|
"loss": 0.1394, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 24.977777777777778, |
|
"eval_accuracy": 0.9671875, |
|
"eval_loss": 0.0806862860918045, |
|
"eval_runtime": 3.8749, |
|
"eval_samples_per_second": 165.167, |
|
"eval_steps_per_second": 2.581, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 25.333333333333332, |
|
"grad_norm": 10.52723217010498, |
|
"learning_rate": 2.676767676767677e-05, |
|
"loss": 0.1524, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 25.77777777777778, |
|
"grad_norm": 7.490493297576904, |
|
"learning_rate": 2.6262626262626268e-05, |
|
"loss": 0.1584, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"eval_accuracy": 0.9796875, |
|
"eval_loss": 0.05252554267644882, |
|
"eval_runtime": 3.9703, |
|
"eval_samples_per_second": 161.197, |
|
"eval_steps_per_second": 2.519, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 26.22222222222222, |
|
"grad_norm": 7.178821563720703, |
|
"learning_rate": 2.575757575757576e-05, |
|
"loss": 0.153, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 26.666666666666668, |
|
"grad_norm": 7.702730178833008, |
|
"learning_rate": 2.5252525252525256e-05, |
|
"loss": 0.1302, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 26.977777777777778, |
|
"eval_accuracy": 0.9828125, |
|
"eval_loss": 0.05131406709551811, |
|
"eval_runtime": 3.7014, |
|
"eval_samples_per_second": 172.905, |
|
"eval_steps_per_second": 2.702, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 27.11111111111111, |
|
"grad_norm": 9.482915878295898, |
|
"learning_rate": 2.474747474747475e-05, |
|
"loss": 0.1577, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 27.555555555555557, |
|
"grad_norm": 10.196369171142578, |
|
"learning_rate": 2.4242424242424244e-05, |
|
"loss": 0.1543, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"grad_norm": 10.799006462097168, |
|
"learning_rate": 2.3737373737373738e-05, |
|
"loss": 0.1356, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"eval_accuracy": 0.9875, |
|
"eval_loss": 0.04204293340444565, |
|
"eval_runtime": 3.9774, |
|
"eval_samples_per_second": 160.91, |
|
"eval_steps_per_second": 2.514, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 28.444444444444443, |
|
"grad_norm": 7.129752159118652, |
|
"learning_rate": 2.3232323232323232e-05, |
|
"loss": 0.1291, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 28.88888888888889, |
|
"grad_norm": 6.642085552215576, |
|
"learning_rate": 2.272727272727273e-05, |
|
"loss": 0.1101, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 28.977777777777778, |
|
"eval_accuracy": 0.9875, |
|
"eval_loss": 0.03539272025227547, |
|
"eval_runtime": 4.0166, |
|
"eval_samples_per_second": 159.34, |
|
"eval_steps_per_second": 2.49, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 29.333333333333332, |
|
"grad_norm": 8.5753755569458, |
|
"learning_rate": 2.2222222222222223e-05, |
|
"loss": 0.1445, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 29.77777777777778, |
|
"grad_norm": 13.183974266052246, |
|
"learning_rate": 2.171717171717172e-05, |
|
"loss": 0.1227, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"eval_accuracy": 0.9765625, |
|
"eval_loss": 0.0582769513130188, |
|
"eval_runtime": 3.9913, |
|
"eval_samples_per_second": 160.35, |
|
"eval_steps_per_second": 2.505, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 30.22222222222222, |
|
"grad_norm": 9.026564598083496, |
|
"learning_rate": 2.1212121212121215e-05, |
|
"loss": 0.1209, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 30.666666666666668, |
|
"grad_norm": 12.504347801208496, |
|
"learning_rate": 2.070707070707071e-05, |
|
"loss": 0.1158, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 30.977777777777778, |
|
"eval_accuracy": 0.990625, |
|
"eval_loss": 0.025325458496809006, |
|
"eval_runtime": 3.9678, |
|
"eval_samples_per_second": 161.299, |
|
"eval_steps_per_second": 2.52, |
|
"step": 697 |
|
}, |
|
{ |
|
"epoch": 31.11111111111111, |
|
"grad_norm": 5.276214599609375, |
|
"learning_rate": 2.0202020202020203e-05, |
|
"loss": 0.119, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 31.555555555555557, |
|
"grad_norm": 8.732769012451172, |
|
"learning_rate": 1.9696969696969697e-05, |
|
"loss": 0.1156, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"grad_norm": 5.604591369628906, |
|
"learning_rate": 1.919191919191919e-05, |
|
"loss": 0.117, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"eval_accuracy": 0.990625, |
|
"eval_loss": 0.023098567500710487, |
|
"eval_runtime": 3.8505, |
|
"eval_samples_per_second": 166.213, |
|
"eval_steps_per_second": 2.597, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 32.44444444444444, |
|
"grad_norm": 7.332610607147217, |
|
"learning_rate": 1.8686868686868688e-05, |
|
"loss": 0.1213, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 32.888888888888886, |
|
"grad_norm": 12.890093803405762, |
|
"learning_rate": 1.8181818181818182e-05, |
|
"loss": 0.1022, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 32.977777777777774, |
|
"eval_accuracy": 0.9796875, |
|
"eval_loss": 0.0725882276892662, |
|
"eval_runtime": 3.8065, |
|
"eval_samples_per_second": 168.135, |
|
"eval_steps_per_second": 2.627, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 33.333333333333336, |
|
"grad_norm": 13.247682571411133, |
|
"learning_rate": 1.7676767676767676e-05, |
|
"loss": 0.1257, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 33.77777777777778, |
|
"grad_norm": 6.758236885070801, |
|
"learning_rate": 1.7171717171717173e-05, |
|
"loss": 0.1221, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 34.0, |
|
"eval_accuracy": 0.996875, |
|
"eval_loss": 0.015964530408382416, |
|
"eval_runtime": 3.7585, |
|
"eval_samples_per_second": 170.283, |
|
"eval_steps_per_second": 2.661, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 34.22222222222222, |
|
"grad_norm": 8.521262168884277, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 0.1014, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 34.666666666666664, |
|
"grad_norm": 5.949100971221924, |
|
"learning_rate": 1.6161616161616165e-05, |
|
"loss": 0.0956, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 34.977777777777774, |
|
"eval_accuracy": 0.984375, |
|
"eval_loss": 0.048214979469776154, |
|
"eval_runtime": 3.6909, |
|
"eval_samples_per_second": 173.399, |
|
"eval_steps_per_second": 2.709, |
|
"step": 787 |
|
}, |
|
{ |
|
"epoch": 35.111111111111114, |
|
"grad_norm": 10.151766777038574, |
|
"learning_rate": 1.565656565656566e-05, |
|
"loss": 0.1135, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 35.55555555555556, |
|
"grad_norm": 9.514137268066406, |
|
"learning_rate": 1.5151515151515153e-05, |
|
"loss": 0.1109, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 36.0, |
|
"grad_norm": 4.4278717041015625, |
|
"learning_rate": 1.4646464646464647e-05, |
|
"loss": 0.0856, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 36.0, |
|
"eval_accuracy": 0.9875, |
|
"eval_loss": 0.025567293167114258, |
|
"eval_runtime": 3.6537, |
|
"eval_samples_per_second": 175.165, |
|
"eval_steps_per_second": 2.737, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 36.44444444444444, |
|
"grad_norm": 8.582184791564941, |
|
"learning_rate": 1.4141414141414141e-05, |
|
"loss": 0.0994, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 36.888888888888886, |
|
"grad_norm": 9.628859519958496, |
|
"learning_rate": 1.3636363636363637e-05, |
|
"loss": 0.0996, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 36.977777777777774, |
|
"eval_accuracy": 0.990625, |
|
"eval_loss": 0.021057253703475, |
|
"eval_runtime": 3.6772, |
|
"eval_samples_per_second": 174.046, |
|
"eval_steps_per_second": 2.719, |
|
"step": 832 |
|
}, |
|
{ |
|
"epoch": 37.333333333333336, |
|
"grad_norm": 5.165952682495117, |
|
"learning_rate": 1.3131313131313134e-05, |
|
"loss": 0.0915, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 37.77777777777778, |
|
"grad_norm": 5.831385135650635, |
|
"learning_rate": 1.2626262626262628e-05, |
|
"loss": 0.0848, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 38.0, |
|
"eval_accuracy": 0.9796875, |
|
"eval_loss": 0.04457371309399605, |
|
"eval_runtime": 3.6584, |
|
"eval_samples_per_second": 174.938, |
|
"eval_steps_per_second": 2.733, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 38.22222222222222, |
|
"grad_norm": 9.629181861877441, |
|
"learning_rate": 1.2121212121212122e-05, |
|
"loss": 0.0972, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 38.666666666666664, |
|
"grad_norm": 6.214244365692139, |
|
"learning_rate": 1.1616161616161616e-05, |
|
"loss": 0.1001, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 38.977777777777774, |
|
"eval_accuracy": 0.9875, |
|
"eval_loss": 0.02742326818406582, |
|
"eval_runtime": 3.668, |
|
"eval_samples_per_second": 174.481, |
|
"eval_steps_per_second": 2.726, |
|
"step": 877 |
|
}, |
|
{ |
|
"epoch": 39.111111111111114, |
|
"grad_norm": 11.1734619140625, |
|
"learning_rate": 1.1111111111111112e-05, |
|
"loss": 0.0919, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 39.55555555555556, |
|
"grad_norm": 6.438005447387695, |
|
"learning_rate": 1.0606060606060607e-05, |
|
"loss": 0.0988, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"grad_norm": 5.9803643226623535, |
|
"learning_rate": 1.0101010101010101e-05, |
|
"loss": 0.0976, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"eval_accuracy": 0.9921875, |
|
"eval_loss": 0.022529248148202896, |
|
"eval_runtime": 3.7092, |
|
"eval_samples_per_second": 172.543, |
|
"eval_steps_per_second": 2.696, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 40.44444444444444, |
|
"grad_norm": 7.562661647796631, |
|
"learning_rate": 9.595959595959595e-06, |
|
"loss": 0.085, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 40.888888888888886, |
|
"grad_norm": 7.695030212402344, |
|
"learning_rate": 9.090909090909091e-06, |
|
"loss": 0.0864, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 40.977777777777774, |
|
"eval_accuracy": 0.9921875, |
|
"eval_loss": 0.0207191314548254, |
|
"eval_runtime": 4.0024, |
|
"eval_samples_per_second": 159.904, |
|
"eval_steps_per_second": 2.499, |
|
"step": 922 |
|
}, |
|
{ |
|
"epoch": 41.333333333333336, |
|
"grad_norm": 8.4052734375, |
|
"learning_rate": 8.585858585858587e-06, |
|
"loss": 0.088, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 41.77777777777778, |
|
"grad_norm": 8.705794334411621, |
|
"learning_rate": 8.080808080808082e-06, |
|
"loss": 0.0865, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 42.0, |
|
"eval_accuracy": 0.996875, |
|
"eval_loss": 0.01933131366968155, |
|
"eval_runtime": 3.9909, |
|
"eval_samples_per_second": 160.365, |
|
"eval_steps_per_second": 2.506, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 42.22222222222222, |
|
"grad_norm": 7.478874683380127, |
|
"learning_rate": 7.5757575757575764e-06, |
|
"loss": 0.0815, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 42.666666666666664, |
|
"grad_norm": 5.25657320022583, |
|
"learning_rate": 7.0707070707070704e-06, |
|
"loss": 0.0773, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 42.977777777777774, |
|
"eval_accuracy": 0.9921875, |
|
"eval_loss": 0.020288193598389626, |
|
"eval_runtime": 3.6594, |
|
"eval_samples_per_second": 174.89, |
|
"eval_steps_per_second": 2.733, |
|
"step": 967 |
|
}, |
|
{ |
|
"epoch": 43.111111111111114, |
|
"grad_norm": 4.1972246170043945, |
|
"learning_rate": 6.565656565656567e-06, |
|
"loss": 0.0799, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 43.55555555555556, |
|
"grad_norm": 6.9554972648620605, |
|
"learning_rate": 6.060606060606061e-06, |
|
"loss": 0.0772, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 44.0, |
|
"grad_norm": 6.343081951141357, |
|
"learning_rate": 5.555555555555556e-06, |
|
"loss": 0.075, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 44.0, |
|
"eval_accuracy": 0.996875, |
|
"eval_loss": 0.013058523647487164, |
|
"eval_runtime": 3.5774, |
|
"eval_samples_per_second": 178.899, |
|
"eval_steps_per_second": 2.795, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 44.44444444444444, |
|
"grad_norm": 4.900812149047852, |
|
"learning_rate": 5.050505050505051e-06, |
|
"loss": 0.0736, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 44.888888888888886, |
|
"grad_norm": 5.955135345458984, |
|
"learning_rate": 4.5454545454545455e-06, |
|
"loss": 0.0761, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 44.977777777777774, |
|
"eval_accuracy": 0.99375, |
|
"eval_loss": 0.012860281392931938, |
|
"eval_runtime": 3.5897, |
|
"eval_samples_per_second": 178.288, |
|
"eval_steps_per_second": 2.786, |
|
"step": 1012 |
|
}, |
|
{ |
|
"epoch": 45.333333333333336, |
|
"grad_norm": 4.250102996826172, |
|
"learning_rate": 4.040404040404041e-06, |
|
"loss": 0.0707, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 45.77777777777778, |
|
"grad_norm": 6.8997931480407715, |
|
"learning_rate": 3.5353535353535352e-06, |
|
"loss": 0.0624, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 46.0, |
|
"eval_accuracy": 0.996875, |
|
"eval_loss": 0.011364495381712914, |
|
"eval_runtime": 3.6128, |
|
"eval_samples_per_second": 177.146, |
|
"eval_steps_per_second": 2.768, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 46.22222222222222, |
|
"grad_norm": 10.210082054138184, |
|
"learning_rate": 3.0303030303030305e-06, |
|
"loss": 0.0762, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 46.666666666666664, |
|
"grad_norm": 4.9201788902282715, |
|
"learning_rate": 2.5252525252525253e-06, |
|
"loss": 0.0557, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 46.977777777777774, |
|
"eval_accuracy": 0.9953125, |
|
"eval_loss": 0.010208332911133766, |
|
"eval_runtime": 3.8474, |
|
"eval_samples_per_second": 166.347, |
|
"eval_steps_per_second": 2.599, |
|
"step": 1057 |
|
}, |
|
{ |
|
"epoch": 47.111111111111114, |
|
"grad_norm": 3.725327491760254, |
|
"learning_rate": 2.0202020202020206e-06, |
|
"loss": 0.0613, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 47.55555555555556, |
|
"grad_norm": 3.1549530029296875, |
|
"learning_rate": 1.5151515151515152e-06, |
|
"loss": 0.0601, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 48.0, |
|
"grad_norm": 6.280518054962158, |
|
"learning_rate": 1.0101010101010103e-06, |
|
"loss": 0.0708, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 48.0, |
|
"eval_accuracy": 0.9953125, |
|
"eval_loss": 0.01160599384456873, |
|
"eval_runtime": 3.9922, |
|
"eval_samples_per_second": 160.314, |
|
"eval_steps_per_second": 2.505, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 48.44444444444444, |
|
"grad_norm": 6.1849260330200195, |
|
"learning_rate": 5.050505050505052e-07, |
|
"loss": 0.0699, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 48.888888888888886, |
|
"grad_norm": 7.637501239776611, |
|
"learning_rate": 0.0, |
|
"loss": 0.0667, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 48.888888888888886, |
|
"eval_accuracy": 0.9953125, |
|
"eval_loss": 0.013088616542518139, |
|
"eval_runtime": 3.601, |
|
"eval_samples_per_second": 177.727, |
|
"eval_steps_per_second": 2.777, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 48.888888888888886, |
|
"step": 1100, |
|
"total_flos": 5.510665685119795e+18, |
|
"train_loss": 0.17931611462072894, |
|
"train_runtime": 3528.3547, |
|
"train_samples_per_second": 81.624, |
|
"train_steps_per_second": 0.312 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1100, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 50, |
|
"save_steps": 500, |
|
"total_flos": 5.510665685119795e+18, |
|
"train_batch_size": 64, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|