{ "best_metric": 0.996875, "best_model_checkpoint": "vit-msn-small-finetuned-alzheimers/checkpoint-765", "epoch": 48.888888888888886, "eval_steps": 500, "global_step": 1100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.4444444444444444, "grad_norm": 8.449820518493652, "learning_rate": 4.5454545454545455e-06, "loss": 0.2587, "step": 10 }, { "epoch": 0.8888888888888888, "grad_norm": 13.680850982666016, "learning_rate": 9.090909090909091e-06, "loss": 0.2996, "step": 20 }, { "epoch": 0.9777777777777777, "eval_accuracy": 0.84375, "eval_loss": 0.38971763849258423, "eval_runtime": 3.5179, "eval_samples_per_second": 181.926, "eval_steps_per_second": 2.843, "step": 22 }, { "epoch": 1.3333333333333333, "grad_norm": 9.488574981689453, "learning_rate": 1.3636363636363637e-05, "loss": 0.4023, "step": 30 }, { "epoch": 1.7777777777777777, "grad_norm": 18.977561950683594, "learning_rate": 1.8181818181818182e-05, "loss": 0.3703, "step": 40 }, { "epoch": 2.0, "eval_accuracy": 0.859375, "eval_loss": 0.3594878911972046, "eval_runtime": 3.9024, "eval_samples_per_second": 164.001, "eval_steps_per_second": 2.563, "step": 45 }, { "epoch": 2.2222222222222223, "grad_norm": 11.33133602142334, "learning_rate": 2.272727272727273e-05, "loss": 0.3541, "step": 50 }, { "epoch": 2.6666666666666665, "grad_norm": 16.366662979125977, "learning_rate": 2.7272727272727273e-05, "loss": 0.3087, "step": 60 }, { "epoch": 2.977777777777778, "eval_accuracy": 0.8625, "eval_loss": 0.3777163326740265, "eval_runtime": 3.8599, "eval_samples_per_second": 165.808, "eval_steps_per_second": 2.591, "step": 67 }, { "epoch": 3.111111111111111, "grad_norm": 18.307331085205078, "learning_rate": 3.181818181818182e-05, "loss": 0.3195, "step": 70 }, { "epoch": 3.5555555555555554, "grad_norm": 42.80950164794922, "learning_rate": 3.6363636363636364e-05, "loss": 0.3483, "step": 80 }, { "epoch": 4.0, "grad_norm": 18.051124572753906, "learning_rate": 4.0909090909090915e-05, "loss": 0.486, "step": 90 }, { "epoch": 4.0, "eval_accuracy": 0.81875, "eval_loss": 0.4530211389064789, "eval_runtime": 3.6057, "eval_samples_per_second": 177.495, "eval_steps_per_second": 2.773, "step": 90 }, { "epoch": 4.444444444444445, "grad_norm": 15.844127655029297, "learning_rate": 4.545454545454546e-05, "loss": 0.3521, "step": 100 }, { "epoch": 4.888888888888889, "grad_norm": 11.87112808227539, "learning_rate": 5e-05, "loss": 0.3307, "step": 110 }, { "epoch": 4.977777777777778, "eval_accuracy": 0.8234375, "eval_loss": 0.45600825548171997, "eval_runtime": 3.572, "eval_samples_per_second": 179.171, "eval_steps_per_second": 2.8, "step": 112 }, { "epoch": 5.333333333333333, "grad_norm": 17.418668746948242, "learning_rate": 4.94949494949495e-05, "loss": 0.3404, "step": 120 }, { "epoch": 5.777777777777778, "grad_norm": 20.148906707763672, "learning_rate": 4.898989898989899e-05, "loss": 0.306, "step": 130 }, { "epoch": 6.0, "eval_accuracy": 0.8671875, "eval_loss": 0.3470742106437683, "eval_runtime": 3.5697, "eval_samples_per_second": 179.289, "eval_steps_per_second": 2.801, "step": 135 }, { "epoch": 6.222222222222222, "grad_norm": 11.451733589172363, "learning_rate": 4.848484848484849e-05, "loss": 0.2873, "step": 140 }, { "epoch": 6.666666666666667, "grad_norm": 15.43708610534668, "learning_rate": 4.797979797979798e-05, "loss": 0.3005, "step": 150 }, { "epoch": 6.977777777777778, "eval_accuracy": 0.8859375, "eval_loss": 0.3024638891220093, "eval_runtime": 3.8788, "eval_samples_per_second": 164.998, 
"eval_steps_per_second": 2.578, "step": 157 }, { "epoch": 7.111111111111111, "grad_norm": 20.696516036987305, "learning_rate": 4.7474747474747476e-05, "loss": 0.3206, "step": 160 }, { "epoch": 7.555555555555555, "grad_norm": 12.01241397857666, "learning_rate": 4.696969696969697e-05, "loss": 0.2851, "step": 170 }, { "epoch": 8.0, "grad_norm": 17.638214111328125, "learning_rate": 4.6464646464646464e-05, "loss": 0.319, "step": 180 }, { "epoch": 8.0, "eval_accuracy": 0.8984375, "eval_loss": 0.24509796500205994, "eval_runtime": 3.9737, "eval_samples_per_second": 161.061, "eval_steps_per_second": 2.517, "step": 180 }, { "epoch": 8.444444444444445, "grad_norm": 21.502132415771484, "learning_rate": 4.595959595959596e-05, "loss": 0.2968, "step": 190 }, { "epoch": 8.88888888888889, "grad_norm": 20.09746742248535, "learning_rate": 4.545454545454546e-05, "loss": 0.3489, "step": 200 }, { "epoch": 8.977777777777778, "eval_accuracy": 0.928125, "eval_loss": 0.18142804503440857, "eval_runtime": 3.7455, "eval_samples_per_second": 170.872, "eval_steps_per_second": 2.67, "step": 202 }, { "epoch": 9.333333333333334, "grad_norm": 16.205760955810547, "learning_rate": 4.494949494949495e-05, "loss": 0.2915, "step": 210 }, { "epoch": 9.777777777777779, "grad_norm": 27.511030197143555, "learning_rate": 4.4444444444444447e-05, "loss": 0.3251, "step": 220 }, { "epoch": 10.0, "eval_accuracy": 0.915625, "eval_loss": 0.24511559307575226, "eval_runtime": 3.6361, "eval_samples_per_second": 176.011, "eval_steps_per_second": 2.75, "step": 225 }, { "epoch": 10.222222222222221, "grad_norm": 11.171629905700684, "learning_rate": 4.3939393939393944e-05, "loss": 0.308, "step": 230 }, { "epoch": 10.666666666666666, "grad_norm": 12.315302848815918, "learning_rate": 4.343434343434344e-05, "loss": 0.3034, "step": 240 }, { "epoch": 10.977777777777778, "eval_accuracy": 0.940625, "eval_loss": 0.15658709406852722, "eval_runtime": 3.5876, "eval_samples_per_second": 178.392, "eval_steps_per_second": 2.787, "step": 247 }, { "epoch": 11.11111111111111, "grad_norm": 14.539319038391113, "learning_rate": 4.292929292929293e-05, "loss": 0.2847, "step": 250 }, { "epoch": 11.555555555555555, "grad_norm": 17.26177215576172, "learning_rate": 4.242424242424243e-05, "loss": 0.2754, "step": 260 }, { "epoch": 12.0, "grad_norm": 12.163039207458496, "learning_rate": 4.191919191919192e-05, "loss": 0.2746, "step": 270 }, { "epoch": 12.0, "eval_accuracy": 0.8921875, "eval_loss": 0.24925951659679413, "eval_runtime": 3.5808, "eval_samples_per_second": 178.73, "eval_steps_per_second": 2.793, "step": 270 }, { "epoch": 12.444444444444445, "grad_norm": 11.92519474029541, "learning_rate": 4.141414141414142e-05, "loss": 0.2889, "step": 280 }, { "epoch": 12.88888888888889, "grad_norm": 13.212408065795898, "learning_rate": 4.0909090909090915e-05, "loss": 0.2369, "step": 290 }, { "epoch": 12.977777777777778, "eval_accuracy": 0.9375, "eval_loss": 0.1622403860092163, "eval_runtime": 3.8343, "eval_samples_per_second": 166.914, "eval_steps_per_second": 2.608, "step": 292 }, { "epoch": 13.333333333333334, "grad_norm": 11.68896770477295, "learning_rate": 4.0404040404040405e-05, "loss": 0.2465, "step": 300 }, { "epoch": 13.777777777777779, "grad_norm": 14.610076904296875, "learning_rate": 3.98989898989899e-05, "loss": 0.2231, "step": 310 }, { "epoch": 14.0, "eval_accuracy": 0.9359375, "eval_loss": 0.17805945873260498, "eval_runtime": 3.9695, "eval_samples_per_second": 161.231, "eval_steps_per_second": 2.519, "step": 315 }, { "epoch": 14.222222222222221, "grad_norm": 
12.407272338867188, "learning_rate": 3.939393939393939e-05, "loss": 0.2177, "step": 320 }, { "epoch": 14.666666666666666, "grad_norm": 7.3430256843566895, "learning_rate": 3.888888888888889e-05, "loss": 0.2281, "step": 330 }, { "epoch": 14.977777777777778, "eval_accuracy": 0.953125, "eval_loss": 0.12681424617767334, "eval_runtime": 3.6554, "eval_samples_per_second": 175.082, "eval_steps_per_second": 2.736, "step": 337 }, { "epoch": 15.11111111111111, "grad_norm": 10.262022018432617, "learning_rate": 3.838383838383838e-05, "loss": 0.209, "step": 340 }, { "epoch": 15.555555555555555, "grad_norm": 9.078124046325684, "learning_rate": 3.787878787878788e-05, "loss": 0.2134, "step": 350 }, { "epoch": 16.0, "grad_norm": 14.094355583190918, "learning_rate": 3.7373737373737376e-05, "loss": 0.2001, "step": 360 }, { "epoch": 16.0, "eval_accuracy": 0.9140625, "eval_loss": 0.24309130012989044, "eval_runtime": 3.5892, "eval_samples_per_second": 178.311, "eval_steps_per_second": 2.786, "step": 360 }, { "epoch": 16.444444444444443, "grad_norm": 12.868298530578613, "learning_rate": 3.686868686868687e-05, "loss": 0.2312, "step": 370 }, { "epoch": 16.88888888888889, "grad_norm": 7.863047122955322, "learning_rate": 3.6363636363636364e-05, "loss": 0.183, "step": 380 }, { "epoch": 16.977777777777778, "eval_accuracy": 0.9625, "eval_loss": 0.10167054831981659, "eval_runtime": 3.6007, "eval_samples_per_second": 177.743, "eval_steps_per_second": 2.777, "step": 382 }, { "epoch": 17.333333333333332, "grad_norm": 8.919840812683105, "learning_rate": 3.5858585858585855e-05, "loss": 0.1997, "step": 390 }, { "epoch": 17.77777777777778, "grad_norm": 11.91215705871582, "learning_rate": 3.535353535353535e-05, "loss": 0.1891, "step": 400 }, { "epoch": 18.0, "eval_accuracy": 0.9390625, "eval_loss": 0.180230051279068, "eval_runtime": 3.5491, "eval_samples_per_second": 180.325, "eval_steps_per_second": 2.818, "step": 405 }, { "epoch": 18.22222222222222, "grad_norm": 9.109786033630371, "learning_rate": 3.484848484848485e-05, "loss": 0.213, "step": 410 }, { "epoch": 18.666666666666668, "grad_norm": 5.232081413269043, "learning_rate": 3.434343434343435e-05, "loss": 0.1862, "step": 420 }, { "epoch": 18.977777777777778, "eval_accuracy": 0.9765625, "eval_loss": 0.08689282089471817, "eval_runtime": 3.9268, "eval_samples_per_second": 162.982, "eval_steps_per_second": 2.547, "step": 427 }, { "epoch": 19.11111111111111, "grad_norm": 13.340733528137207, "learning_rate": 3.3838383838383844e-05, "loss": 0.1664, "step": 430 }, { "epoch": 19.555555555555557, "grad_norm": 7.66475772857666, "learning_rate": 3.3333333333333335e-05, "loss": 0.1727, "step": 440 }, { "epoch": 20.0, "grad_norm": 13.612215995788574, "learning_rate": 3.282828282828283e-05, "loss": 0.1935, "step": 450 }, { "epoch": 20.0, "eval_accuracy": 0.96875, "eval_loss": 0.10791148245334625, "eval_runtime": 3.9472, "eval_samples_per_second": 162.141, "eval_steps_per_second": 2.533, "step": 450 }, { "epoch": 20.444444444444443, "grad_norm": 9.189305305480957, "learning_rate": 3.232323232323233e-05, "loss": 0.1945, "step": 460 }, { "epoch": 20.88888888888889, "grad_norm": 9.650483131408691, "learning_rate": 3.181818181818182e-05, "loss": 0.1797, "step": 470 }, { "epoch": 20.977777777777778, "eval_accuracy": 0.95625, "eval_loss": 0.12502644956111908, "eval_runtime": 3.6171, "eval_samples_per_second": 176.939, "eval_steps_per_second": 2.765, "step": 472 }, { "epoch": 21.333333333333332, "grad_norm": 7.25011682510376, "learning_rate": 3.131313131313132e-05, "loss": 0.1767, "step": 480 }, 
{ "epoch": 21.77777777777778, "grad_norm": 9.036290168762207, "learning_rate": 3.080808080808081e-05, "loss": 0.1605, "step": 490 }, { "epoch": 22.0, "eval_accuracy": 0.971875, "eval_loss": 0.06545940786600113, "eval_runtime": 3.5923, "eval_samples_per_second": 178.16, "eval_steps_per_second": 2.784, "step": 495 }, { "epoch": 22.22222222222222, "grad_norm": 5.982744216918945, "learning_rate": 3.0303030303030306e-05, "loss": 0.1493, "step": 500 }, { "epoch": 22.666666666666668, "grad_norm": 13.987672805786133, "learning_rate": 2.9797979797979796e-05, "loss": 0.1848, "step": 510 }, { "epoch": 22.977777777777778, "eval_accuracy": 0.9765625, "eval_loss": 0.08063917607069016, "eval_runtime": 3.5801, "eval_samples_per_second": 178.764, "eval_steps_per_second": 2.793, "step": 517 }, { "epoch": 23.11111111111111, "grad_norm": 15.488668441772461, "learning_rate": 2.9292929292929294e-05, "loss": 0.1426, "step": 520 }, { "epoch": 23.555555555555557, "grad_norm": 11.646829605102539, "learning_rate": 2.878787878787879e-05, "loss": 0.1667, "step": 530 }, { "epoch": 24.0, "grad_norm": 6.619264602661133, "learning_rate": 2.8282828282828282e-05, "loss": 0.1498, "step": 540 }, { "epoch": 24.0, "eval_accuracy": 0.9578125, "eval_loss": 0.11159060150384903, "eval_runtime": 3.6016, "eval_samples_per_second": 177.701, "eval_steps_per_second": 2.777, "step": 540 }, { "epoch": 24.444444444444443, "grad_norm": 7.8661627769470215, "learning_rate": 2.777777777777778e-05, "loss": 0.1287, "step": 550 }, { "epoch": 24.88888888888889, "grad_norm": 7.934934616088867, "learning_rate": 2.7272727272727273e-05, "loss": 0.1394, "step": 560 }, { "epoch": 24.977777777777778, "eval_accuracy": 0.9671875, "eval_loss": 0.0806862860918045, "eval_runtime": 3.8749, "eval_samples_per_second": 165.167, "eval_steps_per_second": 2.581, "step": 562 }, { "epoch": 25.333333333333332, "grad_norm": 10.52723217010498, "learning_rate": 2.676767676767677e-05, "loss": 0.1524, "step": 570 }, { "epoch": 25.77777777777778, "grad_norm": 7.490493297576904, "learning_rate": 2.6262626262626268e-05, "loss": 0.1584, "step": 580 }, { "epoch": 26.0, "eval_accuracy": 0.9796875, "eval_loss": 0.05252554267644882, "eval_runtime": 3.9703, "eval_samples_per_second": 161.197, "eval_steps_per_second": 2.519, "step": 585 }, { "epoch": 26.22222222222222, "grad_norm": 7.178821563720703, "learning_rate": 2.575757575757576e-05, "loss": 0.153, "step": 590 }, { "epoch": 26.666666666666668, "grad_norm": 7.702730178833008, "learning_rate": 2.5252525252525256e-05, "loss": 0.1302, "step": 600 }, { "epoch": 26.977777777777778, "eval_accuracy": 0.9828125, "eval_loss": 0.05131406709551811, "eval_runtime": 3.7014, "eval_samples_per_second": 172.905, "eval_steps_per_second": 2.702, "step": 607 }, { "epoch": 27.11111111111111, "grad_norm": 9.482915878295898, "learning_rate": 2.474747474747475e-05, "loss": 0.1577, "step": 610 }, { "epoch": 27.555555555555557, "grad_norm": 10.196369171142578, "learning_rate": 2.4242424242424244e-05, "loss": 0.1543, "step": 620 }, { "epoch": 28.0, "grad_norm": 10.799006462097168, "learning_rate": 2.3737373737373738e-05, "loss": 0.1356, "step": 630 }, { "epoch": 28.0, "eval_accuracy": 0.9875, "eval_loss": 0.04204293340444565, "eval_runtime": 3.9774, "eval_samples_per_second": 160.91, "eval_steps_per_second": 2.514, "step": 630 }, { "epoch": 28.444444444444443, "grad_norm": 7.129752159118652, "learning_rate": 2.3232323232323232e-05, "loss": 0.1291, "step": 640 }, { "epoch": 28.88888888888889, "grad_norm": 6.642085552215576, "learning_rate": 
2.272727272727273e-05, "loss": 0.1101, "step": 650 }, { "epoch": 28.977777777777778, "eval_accuracy": 0.9875, "eval_loss": 0.03539272025227547, "eval_runtime": 4.0166, "eval_samples_per_second": 159.34, "eval_steps_per_second": 2.49, "step": 652 }, { "epoch": 29.333333333333332, "grad_norm": 8.5753755569458, "learning_rate": 2.2222222222222223e-05, "loss": 0.1445, "step": 660 }, { "epoch": 29.77777777777778, "grad_norm": 13.183974266052246, "learning_rate": 2.171717171717172e-05, "loss": 0.1227, "step": 670 }, { "epoch": 30.0, "eval_accuracy": 0.9765625, "eval_loss": 0.0582769513130188, "eval_runtime": 3.9913, "eval_samples_per_second": 160.35, "eval_steps_per_second": 2.505, "step": 675 }, { "epoch": 30.22222222222222, "grad_norm": 9.026564598083496, "learning_rate": 2.1212121212121215e-05, "loss": 0.1209, "step": 680 }, { "epoch": 30.666666666666668, "grad_norm": 12.504347801208496, "learning_rate": 2.070707070707071e-05, "loss": 0.1158, "step": 690 }, { "epoch": 30.977777777777778, "eval_accuracy": 0.990625, "eval_loss": 0.025325458496809006, "eval_runtime": 3.9678, "eval_samples_per_second": 161.299, "eval_steps_per_second": 2.52, "step": 697 }, { "epoch": 31.11111111111111, "grad_norm": 5.276214599609375, "learning_rate": 2.0202020202020203e-05, "loss": 0.119, "step": 700 }, { "epoch": 31.555555555555557, "grad_norm": 8.732769012451172, "learning_rate": 1.9696969696969697e-05, "loss": 0.1156, "step": 710 }, { "epoch": 32.0, "grad_norm": 5.604591369628906, "learning_rate": 1.919191919191919e-05, "loss": 0.117, "step": 720 }, { "epoch": 32.0, "eval_accuracy": 0.990625, "eval_loss": 0.023098567500710487, "eval_runtime": 3.8505, "eval_samples_per_second": 166.213, "eval_steps_per_second": 2.597, "step": 720 }, { "epoch": 32.44444444444444, "grad_norm": 7.332610607147217, "learning_rate": 1.8686868686868688e-05, "loss": 0.1213, "step": 730 }, { "epoch": 32.888888888888886, "grad_norm": 12.890093803405762, "learning_rate": 1.8181818181818182e-05, "loss": 0.1022, "step": 740 }, { "epoch": 32.977777777777774, "eval_accuracy": 0.9796875, "eval_loss": 0.0725882276892662, "eval_runtime": 3.8065, "eval_samples_per_second": 168.135, "eval_steps_per_second": 2.627, "step": 742 }, { "epoch": 33.333333333333336, "grad_norm": 13.247682571411133, "learning_rate": 1.7676767676767676e-05, "loss": 0.1257, "step": 750 }, { "epoch": 33.77777777777778, "grad_norm": 6.758236885070801, "learning_rate": 1.7171717171717173e-05, "loss": 0.1221, "step": 760 }, { "epoch": 34.0, "eval_accuracy": 0.996875, "eval_loss": 0.015964530408382416, "eval_runtime": 3.7585, "eval_samples_per_second": 170.283, "eval_steps_per_second": 2.661, "step": 765 }, { "epoch": 34.22222222222222, "grad_norm": 8.521262168884277, "learning_rate": 1.6666666666666667e-05, "loss": 0.1014, "step": 770 }, { "epoch": 34.666666666666664, "grad_norm": 5.949100971221924, "learning_rate": 1.6161616161616165e-05, "loss": 0.0956, "step": 780 }, { "epoch": 34.977777777777774, "eval_accuracy": 0.984375, "eval_loss": 0.048214979469776154, "eval_runtime": 3.6909, "eval_samples_per_second": 173.399, "eval_steps_per_second": 2.709, "step": 787 }, { "epoch": 35.111111111111114, "grad_norm": 10.151766777038574, "learning_rate": 1.565656565656566e-05, "loss": 0.1135, "step": 790 }, { "epoch": 35.55555555555556, "grad_norm": 9.514137268066406, "learning_rate": 1.5151515151515153e-05, "loss": 0.1109, "step": 800 }, { "epoch": 36.0, "grad_norm": 4.4278717041015625, "learning_rate": 1.4646464646464647e-05, "loss": 0.0856, "step": 810 }, { "epoch": 36.0, 
"eval_accuracy": 0.9875, "eval_loss": 0.025567293167114258, "eval_runtime": 3.6537, "eval_samples_per_second": 175.165, "eval_steps_per_second": 2.737, "step": 810 }, { "epoch": 36.44444444444444, "grad_norm": 8.582184791564941, "learning_rate": 1.4141414141414141e-05, "loss": 0.0994, "step": 820 }, { "epoch": 36.888888888888886, "grad_norm": 9.628859519958496, "learning_rate": 1.3636363636363637e-05, "loss": 0.0996, "step": 830 }, { "epoch": 36.977777777777774, "eval_accuracy": 0.990625, "eval_loss": 0.021057253703475, "eval_runtime": 3.6772, "eval_samples_per_second": 174.046, "eval_steps_per_second": 2.719, "step": 832 }, { "epoch": 37.333333333333336, "grad_norm": 5.165952682495117, "learning_rate": 1.3131313131313134e-05, "loss": 0.0915, "step": 840 }, { "epoch": 37.77777777777778, "grad_norm": 5.831385135650635, "learning_rate": 1.2626262626262628e-05, "loss": 0.0848, "step": 850 }, { "epoch": 38.0, "eval_accuracy": 0.9796875, "eval_loss": 0.04457371309399605, "eval_runtime": 3.6584, "eval_samples_per_second": 174.938, "eval_steps_per_second": 2.733, "step": 855 }, { "epoch": 38.22222222222222, "grad_norm": 9.629181861877441, "learning_rate": 1.2121212121212122e-05, "loss": 0.0972, "step": 860 }, { "epoch": 38.666666666666664, "grad_norm": 6.214244365692139, "learning_rate": 1.1616161616161616e-05, "loss": 0.1001, "step": 870 }, { "epoch": 38.977777777777774, "eval_accuracy": 0.9875, "eval_loss": 0.02742326818406582, "eval_runtime": 3.668, "eval_samples_per_second": 174.481, "eval_steps_per_second": 2.726, "step": 877 }, { "epoch": 39.111111111111114, "grad_norm": 11.1734619140625, "learning_rate": 1.1111111111111112e-05, "loss": 0.0919, "step": 880 }, { "epoch": 39.55555555555556, "grad_norm": 6.438005447387695, "learning_rate": 1.0606060606060607e-05, "loss": 0.0988, "step": 890 }, { "epoch": 40.0, "grad_norm": 5.9803643226623535, "learning_rate": 1.0101010101010101e-05, "loss": 0.0976, "step": 900 }, { "epoch": 40.0, "eval_accuracy": 0.9921875, "eval_loss": 0.022529248148202896, "eval_runtime": 3.7092, "eval_samples_per_second": 172.543, "eval_steps_per_second": 2.696, "step": 900 }, { "epoch": 40.44444444444444, "grad_norm": 7.562661647796631, "learning_rate": 9.595959595959595e-06, "loss": 0.085, "step": 910 }, { "epoch": 40.888888888888886, "grad_norm": 7.695030212402344, "learning_rate": 9.090909090909091e-06, "loss": 0.0864, "step": 920 }, { "epoch": 40.977777777777774, "eval_accuracy": 0.9921875, "eval_loss": 0.0207191314548254, "eval_runtime": 4.0024, "eval_samples_per_second": 159.904, "eval_steps_per_second": 2.499, "step": 922 }, { "epoch": 41.333333333333336, "grad_norm": 8.4052734375, "learning_rate": 8.585858585858587e-06, "loss": 0.088, "step": 930 }, { "epoch": 41.77777777777778, "grad_norm": 8.705794334411621, "learning_rate": 8.080808080808082e-06, "loss": 0.0865, "step": 940 }, { "epoch": 42.0, "eval_accuracy": 0.996875, "eval_loss": 0.01933131366968155, "eval_runtime": 3.9909, "eval_samples_per_second": 160.365, "eval_steps_per_second": 2.506, "step": 945 }, { "epoch": 42.22222222222222, "grad_norm": 7.478874683380127, "learning_rate": 7.5757575757575764e-06, "loss": 0.0815, "step": 950 }, { "epoch": 42.666666666666664, "grad_norm": 5.25657320022583, "learning_rate": 7.0707070707070704e-06, "loss": 0.0773, "step": 960 }, { "epoch": 42.977777777777774, "eval_accuracy": 0.9921875, "eval_loss": 0.020288193598389626, "eval_runtime": 3.6594, "eval_samples_per_second": 174.89, "eval_steps_per_second": 2.733, "step": 967 }, { "epoch": 43.111111111111114, "grad_norm": 
4.1972246170043945, "learning_rate": 6.565656565656567e-06, "loss": 0.0799, "step": 970 }, { "epoch": 43.55555555555556, "grad_norm": 6.9554972648620605, "learning_rate": 6.060606060606061e-06, "loss": 0.0772, "step": 980 }, { "epoch": 44.0, "grad_norm": 6.343081951141357, "learning_rate": 5.555555555555556e-06, "loss": 0.075, "step": 990 }, { "epoch": 44.0, "eval_accuracy": 0.996875, "eval_loss": 0.013058523647487164, "eval_runtime": 3.5774, "eval_samples_per_second": 178.899, "eval_steps_per_second": 2.795, "step": 990 }, { "epoch": 44.44444444444444, "grad_norm": 4.900812149047852, "learning_rate": 5.050505050505051e-06, "loss": 0.0736, "step": 1000 }, { "epoch": 44.888888888888886, "grad_norm": 5.955135345458984, "learning_rate": 4.5454545454545455e-06, "loss": 0.0761, "step": 1010 }, { "epoch": 44.977777777777774, "eval_accuracy": 0.99375, "eval_loss": 0.012860281392931938, "eval_runtime": 3.5897, "eval_samples_per_second": 178.288, "eval_steps_per_second": 2.786, "step": 1012 }, { "epoch": 45.333333333333336, "grad_norm": 4.250102996826172, "learning_rate": 4.040404040404041e-06, "loss": 0.0707, "step": 1020 }, { "epoch": 45.77777777777778, "grad_norm": 6.8997931480407715, "learning_rate": 3.5353535353535352e-06, "loss": 0.0624, "step": 1030 }, { "epoch": 46.0, "eval_accuracy": 0.996875, "eval_loss": 0.011364495381712914, "eval_runtime": 3.6128, "eval_samples_per_second": 177.146, "eval_steps_per_second": 2.768, "step": 1035 }, { "epoch": 46.22222222222222, "grad_norm": 10.210082054138184, "learning_rate": 3.0303030303030305e-06, "loss": 0.0762, "step": 1040 }, { "epoch": 46.666666666666664, "grad_norm": 4.9201788902282715, "learning_rate": 2.5252525252525253e-06, "loss": 0.0557, "step": 1050 }, { "epoch": 46.977777777777774, "eval_accuracy": 0.9953125, "eval_loss": 0.010208332911133766, "eval_runtime": 3.8474, "eval_samples_per_second": 166.347, "eval_steps_per_second": 2.599, "step": 1057 }, { "epoch": 47.111111111111114, "grad_norm": 3.725327491760254, "learning_rate": 2.0202020202020206e-06, "loss": 0.0613, "step": 1060 }, { "epoch": 47.55555555555556, "grad_norm": 3.1549530029296875, "learning_rate": 1.5151515151515152e-06, "loss": 0.0601, "step": 1070 }, { "epoch": 48.0, "grad_norm": 6.280518054962158, "learning_rate": 1.0101010101010103e-06, "loss": 0.0708, "step": 1080 }, { "epoch": 48.0, "eval_accuracy": 0.9953125, "eval_loss": 0.01160599384456873, "eval_runtime": 3.9922, "eval_samples_per_second": 160.314, "eval_steps_per_second": 2.505, "step": 1080 }, { "epoch": 48.44444444444444, "grad_norm": 6.1849260330200195, "learning_rate": 5.050505050505052e-07, "loss": 0.0699, "step": 1090 }, { "epoch": 48.888888888888886, "grad_norm": 7.637501239776611, "learning_rate": 0.0, "loss": 0.0667, "step": 1100 }, { "epoch": 48.888888888888886, "eval_accuracy": 0.9953125, "eval_loss": 0.013088616542518139, "eval_runtime": 3.601, "eval_samples_per_second": 177.727, "eval_steps_per_second": 2.777, "step": 1100 }, { "epoch": 48.888888888888886, "step": 1100, "total_flos": 5.510665685119795e+18, "train_loss": 0.17931611462072894, "train_runtime": 3528.3547, "train_samples_per_second": 81.624, "train_steps_per_second": 0.312 } ], "logging_steps": 10, "max_steps": 1100, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500, "total_flos": 5.510665685119795e+18, "train_batch_size": 64, "trial_name": null, "trial_params": null }
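
A minimal companion sketch (an assumption, not part of the original file): if the state above is saved as "trainer_state.json" next to the checkpoint, as the Trainer normally writes it, the snippet below reads the log_history, prints the per-epoch evaluation metrics, and recovers the best run, which should line up with best_metric 0.996875 at checkpoint-765.

import json

# Load the Trainer state shown above (path is an assumption).
with open("trainer_state.json") as f:
    state = json.load(f)

# log_history mixes training logs (which carry "loss"/"grad_norm") and
# evaluation logs (which carry "eval_accuracy"/"eval_loss"); keep the latter.
eval_logs = [entry for entry in state["log_history"] if "eval_accuracy" in entry]

for entry in eval_logs:
    print(f"epoch {entry['epoch']:6.2f}  step {entry['step']:4d}  "
          f"acc {entry['eval_accuracy']:.4f}  loss {entry['eval_loss']:.4f}")

# max() returns the first entry with the highest accuracy, i.e. step 765 here,
# which matches the recorded best_model_checkpoint.
best = max(eval_logs, key=lambda entry: entry["eval_accuracy"])
print("best eval:", best["step"], best["eval_accuracy"], "->", state["best_model_checkpoint"])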