{
"best_metric": 0.996875,
"best_model_checkpoint": "vit-msn-small-finetuned-alzheimers/checkpoint-765",
"epoch": 48.888888888888886,
"eval_steps": 500,
"global_step": 1100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.4444444444444444,
"grad_norm": 8.449820518493652,
"learning_rate": 4.5454545454545455e-06,
"loss": 0.2587,
"step": 10
},
{
"epoch": 0.8888888888888888,
"grad_norm": 13.680850982666016,
"learning_rate": 9.090909090909091e-06,
"loss": 0.2996,
"step": 20
},
{
"epoch": 0.9777777777777777,
"eval_accuracy": 0.84375,
"eval_loss": 0.38971763849258423,
"eval_runtime": 3.5179,
"eval_samples_per_second": 181.926,
"eval_steps_per_second": 2.843,
"step": 22
},
{
"epoch": 1.3333333333333333,
"grad_norm": 9.488574981689453,
"learning_rate": 1.3636363636363637e-05,
"loss": 0.4023,
"step": 30
},
{
"epoch": 1.7777777777777777,
"grad_norm": 18.977561950683594,
"learning_rate": 1.8181818181818182e-05,
"loss": 0.3703,
"step": 40
},
{
"epoch": 2.0,
"eval_accuracy": 0.859375,
"eval_loss": 0.3594878911972046,
"eval_runtime": 3.9024,
"eval_samples_per_second": 164.001,
"eval_steps_per_second": 2.563,
"step": 45
},
{
"epoch": 2.2222222222222223,
"grad_norm": 11.33133602142334,
"learning_rate": 2.272727272727273e-05,
"loss": 0.3541,
"step": 50
},
{
"epoch": 2.6666666666666665,
"grad_norm": 16.366662979125977,
"learning_rate": 2.7272727272727273e-05,
"loss": 0.3087,
"step": 60
},
{
"epoch": 2.977777777777778,
"eval_accuracy": 0.8625,
"eval_loss": 0.3777163326740265,
"eval_runtime": 3.8599,
"eval_samples_per_second": 165.808,
"eval_steps_per_second": 2.591,
"step": 67
},
{
"epoch": 3.111111111111111,
"grad_norm": 18.307331085205078,
"learning_rate": 3.181818181818182e-05,
"loss": 0.3195,
"step": 70
},
{
"epoch": 3.5555555555555554,
"grad_norm": 42.80950164794922,
"learning_rate": 3.6363636363636364e-05,
"loss": 0.3483,
"step": 80
},
{
"epoch": 4.0,
"grad_norm": 18.051124572753906,
"learning_rate": 4.0909090909090915e-05,
"loss": 0.486,
"step": 90
},
{
"epoch": 4.0,
"eval_accuracy": 0.81875,
"eval_loss": 0.4530211389064789,
"eval_runtime": 3.6057,
"eval_samples_per_second": 177.495,
"eval_steps_per_second": 2.773,
"step": 90
},
{
"epoch": 4.444444444444445,
"grad_norm": 15.844127655029297,
"learning_rate": 4.545454545454546e-05,
"loss": 0.3521,
"step": 100
},
{
"epoch": 4.888888888888889,
"grad_norm": 11.87112808227539,
"learning_rate": 5e-05,
"loss": 0.3307,
"step": 110
},
{
"epoch": 4.977777777777778,
"eval_accuracy": 0.8234375,
"eval_loss": 0.45600825548171997,
"eval_runtime": 3.572,
"eval_samples_per_second": 179.171,
"eval_steps_per_second": 2.8,
"step": 112
},
{
"epoch": 5.333333333333333,
"grad_norm": 17.418668746948242,
"learning_rate": 4.94949494949495e-05,
"loss": 0.3404,
"step": 120
},
{
"epoch": 5.777777777777778,
"grad_norm": 20.148906707763672,
"learning_rate": 4.898989898989899e-05,
"loss": 0.306,
"step": 130
},
{
"epoch": 6.0,
"eval_accuracy": 0.8671875,
"eval_loss": 0.3470742106437683,
"eval_runtime": 3.5697,
"eval_samples_per_second": 179.289,
"eval_steps_per_second": 2.801,
"step": 135
},
{
"epoch": 6.222222222222222,
"grad_norm": 11.451733589172363,
"learning_rate": 4.848484848484849e-05,
"loss": 0.2873,
"step": 140
},
{
"epoch": 6.666666666666667,
"grad_norm": 15.43708610534668,
"learning_rate": 4.797979797979798e-05,
"loss": 0.3005,
"step": 150
},
{
"epoch": 6.977777777777778,
"eval_accuracy": 0.8859375,
"eval_loss": 0.3024638891220093,
"eval_runtime": 3.8788,
"eval_samples_per_second": 164.998,
"eval_steps_per_second": 2.578,
"step": 157
},
{
"epoch": 7.111111111111111,
"grad_norm": 20.696516036987305,
"learning_rate": 4.7474747474747476e-05,
"loss": 0.3206,
"step": 160
},
{
"epoch": 7.555555555555555,
"grad_norm": 12.01241397857666,
"learning_rate": 4.696969696969697e-05,
"loss": 0.2851,
"step": 170
},
{
"epoch": 8.0,
"grad_norm": 17.638214111328125,
"learning_rate": 4.6464646464646464e-05,
"loss": 0.319,
"step": 180
},
{
"epoch": 8.0,
"eval_accuracy": 0.8984375,
"eval_loss": 0.24509796500205994,
"eval_runtime": 3.9737,
"eval_samples_per_second": 161.061,
"eval_steps_per_second": 2.517,
"step": 180
},
{
"epoch": 8.444444444444445,
"grad_norm": 21.502132415771484,
"learning_rate": 4.595959595959596e-05,
"loss": 0.2968,
"step": 190
},
{
"epoch": 8.88888888888889,
"grad_norm": 20.09746742248535,
"learning_rate": 4.545454545454546e-05,
"loss": 0.3489,
"step": 200
},
{
"epoch": 8.977777777777778,
"eval_accuracy": 0.928125,
"eval_loss": 0.18142804503440857,
"eval_runtime": 3.7455,
"eval_samples_per_second": 170.872,
"eval_steps_per_second": 2.67,
"step": 202
},
{
"epoch": 9.333333333333334,
"grad_norm": 16.205760955810547,
"learning_rate": 4.494949494949495e-05,
"loss": 0.2915,
"step": 210
},
{
"epoch": 9.777777777777779,
"grad_norm": 27.511030197143555,
"learning_rate": 4.4444444444444447e-05,
"loss": 0.3251,
"step": 220
},
{
"epoch": 10.0,
"eval_accuracy": 0.915625,
"eval_loss": 0.24511559307575226,
"eval_runtime": 3.6361,
"eval_samples_per_second": 176.011,
"eval_steps_per_second": 2.75,
"step": 225
},
{
"epoch": 10.222222222222221,
"grad_norm": 11.171629905700684,
"learning_rate": 4.3939393939393944e-05,
"loss": 0.308,
"step": 230
},
{
"epoch": 10.666666666666666,
"grad_norm": 12.315302848815918,
"learning_rate": 4.343434343434344e-05,
"loss": 0.3034,
"step": 240
},
{
"epoch": 10.977777777777778,
"eval_accuracy": 0.940625,
"eval_loss": 0.15658709406852722,
"eval_runtime": 3.5876,
"eval_samples_per_second": 178.392,
"eval_steps_per_second": 2.787,
"step": 247
},
{
"epoch": 11.11111111111111,
"grad_norm": 14.539319038391113,
"learning_rate": 4.292929292929293e-05,
"loss": 0.2847,
"step": 250
},
{
"epoch": 11.555555555555555,
"grad_norm": 17.26177215576172,
"learning_rate": 4.242424242424243e-05,
"loss": 0.2754,
"step": 260
},
{
"epoch": 12.0,
"grad_norm": 12.163039207458496,
"learning_rate": 4.191919191919192e-05,
"loss": 0.2746,
"step": 270
},
{
"epoch": 12.0,
"eval_accuracy": 0.8921875,
"eval_loss": 0.24925951659679413,
"eval_runtime": 3.5808,
"eval_samples_per_second": 178.73,
"eval_steps_per_second": 2.793,
"step": 270
},
{
"epoch": 12.444444444444445,
"grad_norm": 11.92519474029541,
"learning_rate": 4.141414141414142e-05,
"loss": 0.2889,
"step": 280
},
{
"epoch": 12.88888888888889,
"grad_norm": 13.212408065795898,
"learning_rate": 4.0909090909090915e-05,
"loss": 0.2369,
"step": 290
},
{
"epoch": 12.977777777777778,
"eval_accuracy": 0.9375,
"eval_loss": 0.1622403860092163,
"eval_runtime": 3.8343,
"eval_samples_per_second": 166.914,
"eval_steps_per_second": 2.608,
"step": 292
},
{
"epoch": 13.333333333333334,
"grad_norm": 11.68896770477295,
"learning_rate": 4.0404040404040405e-05,
"loss": 0.2465,
"step": 300
},
{
"epoch": 13.777777777777779,
"grad_norm": 14.610076904296875,
"learning_rate": 3.98989898989899e-05,
"loss": 0.2231,
"step": 310
},
{
"epoch": 14.0,
"eval_accuracy": 0.9359375,
"eval_loss": 0.17805945873260498,
"eval_runtime": 3.9695,
"eval_samples_per_second": 161.231,
"eval_steps_per_second": 2.519,
"step": 315
},
{
"epoch": 14.222222222222221,
"grad_norm": 12.407272338867188,
"learning_rate": 3.939393939393939e-05,
"loss": 0.2177,
"step": 320
},
{
"epoch": 14.666666666666666,
"grad_norm": 7.3430256843566895,
"learning_rate": 3.888888888888889e-05,
"loss": 0.2281,
"step": 330
},
{
"epoch": 14.977777777777778,
"eval_accuracy": 0.953125,
"eval_loss": 0.12681424617767334,
"eval_runtime": 3.6554,
"eval_samples_per_second": 175.082,
"eval_steps_per_second": 2.736,
"step": 337
},
{
"epoch": 15.11111111111111,
"grad_norm": 10.262022018432617,
"learning_rate": 3.838383838383838e-05,
"loss": 0.209,
"step": 340
},
{
"epoch": 15.555555555555555,
"grad_norm": 9.078124046325684,
"learning_rate": 3.787878787878788e-05,
"loss": 0.2134,
"step": 350
},
{
"epoch": 16.0,
"grad_norm": 14.094355583190918,
"learning_rate": 3.7373737373737376e-05,
"loss": 0.2001,
"step": 360
},
{
"epoch": 16.0,
"eval_accuracy": 0.9140625,
"eval_loss": 0.24309130012989044,
"eval_runtime": 3.5892,
"eval_samples_per_second": 178.311,
"eval_steps_per_second": 2.786,
"step": 360
},
{
"epoch": 16.444444444444443,
"grad_norm": 12.868298530578613,
"learning_rate": 3.686868686868687e-05,
"loss": 0.2312,
"step": 370
},
{
"epoch": 16.88888888888889,
"grad_norm": 7.863047122955322,
"learning_rate": 3.6363636363636364e-05,
"loss": 0.183,
"step": 380
},
{
"epoch": 16.977777777777778,
"eval_accuracy": 0.9625,
"eval_loss": 0.10167054831981659,
"eval_runtime": 3.6007,
"eval_samples_per_second": 177.743,
"eval_steps_per_second": 2.777,
"step": 382
},
{
"epoch": 17.333333333333332,
"grad_norm": 8.919840812683105,
"learning_rate": 3.5858585858585855e-05,
"loss": 0.1997,
"step": 390
},
{
"epoch": 17.77777777777778,
"grad_norm": 11.91215705871582,
"learning_rate": 3.535353535353535e-05,
"loss": 0.1891,
"step": 400
},
{
"epoch": 18.0,
"eval_accuracy": 0.9390625,
"eval_loss": 0.180230051279068,
"eval_runtime": 3.5491,
"eval_samples_per_second": 180.325,
"eval_steps_per_second": 2.818,
"step": 405
},
{
"epoch": 18.22222222222222,
"grad_norm": 9.109786033630371,
"learning_rate": 3.484848484848485e-05,
"loss": 0.213,
"step": 410
},
{
"epoch": 18.666666666666668,
"grad_norm": 5.232081413269043,
"learning_rate": 3.434343434343435e-05,
"loss": 0.1862,
"step": 420
},
{
"epoch": 18.977777777777778,
"eval_accuracy": 0.9765625,
"eval_loss": 0.08689282089471817,
"eval_runtime": 3.9268,
"eval_samples_per_second": 162.982,
"eval_steps_per_second": 2.547,
"step": 427
},
{
"epoch": 19.11111111111111,
"grad_norm": 13.340733528137207,
"learning_rate": 3.3838383838383844e-05,
"loss": 0.1664,
"step": 430
},
{
"epoch": 19.555555555555557,
"grad_norm": 7.66475772857666,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.1727,
"step": 440
},
{
"epoch": 20.0,
"grad_norm": 13.612215995788574,
"learning_rate": 3.282828282828283e-05,
"loss": 0.1935,
"step": 450
},
{
"epoch": 20.0,
"eval_accuracy": 0.96875,
"eval_loss": 0.10791148245334625,
"eval_runtime": 3.9472,
"eval_samples_per_second": 162.141,
"eval_steps_per_second": 2.533,
"step": 450
},
{
"epoch": 20.444444444444443,
"grad_norm": 9.189305305480957,
"learning_rate": 3.232323232323233e-05,
"loss": 0.1945,
"step": 460
},
{
"epoch": 20.88888888888889,
"grad_norm": 9.650483131408691,
"learning_rate": 3.181818181818182e-05,
"loss": 0.1797,
"step": 470
},
{
"epoch": 20.977777777777778,
"eval_accuracy": 0.95625,
"eval_loss": 0.12502644956111908,
"eval_runtime": 3.6171,
"eval_samples_per_second": 176.939,
"eval_steps_per_second": 2.765,
"step": 472
},
{
"epoch": 21.333333333333332,
"grad_norm": 7.25011682510376,
"learning_rate": 3.131313131313132e-05,
"loss": 0.1767,
"step": 480
},
{
"epoch": 21.77777777777778,
"grad_norm": 9.036290168762207,
"learning_rate": 3.080808080808081e-05,
"loss": 0.1605,
"step": 490
},
{
"epoch": 22.0,
"eval_accuracy": 0.971875,
"eval_loss": 0.06545940786600113,
"eval_runtime": 3.5923,
"eval_samples_per_second": 178.16,
"eval_steps_per_second": 2.784,
"step": 495
},
{
"epoch": 22.22222222222222,
"grad_norm": 5.982744216918945,
"learning_rate": 3.0303030303030306e-05,
"loss": 0.1493,
"step": 500
},
{
"epoch": 22.666666666666668,
"grad_norm": 13.987672805786133,
"learning_rate": 2.9797979797979796e-05,
"loss": 0.1848,
"step": 510
},
{
"epoch": 22.977777777777778,
"eval_accuracy": 0.9765625,
"eval_loss": 0.08063917607069016,
"eval_runtime": 3.5801,
"eval_samples_per_second": 178.764,
"eval_steps_per_second": 2.793,
"step": 517
},
{
"epoch": 23.11111111111111,
"grad_norm": 15.488668441772461,
"learning_rate": 2.9292929292929294e-05,
"loss": 0.1426,
"step": 520
},
{
"epoch": 23.555555555555557,
"grad_norm": 11.646829605102539,
"learning_rate": 2.878787878787879e-05,
"loss": 0.1667,
"step": 530
},
{
"epoch": 24.0,
"grad_norm": 6.619264602661133,
"learning_rate": 2.8282828282828282e-05,
"loss": 0.1498,
"step": 540
},
{
"epoch": 24.0,
"eval_accuracy": 0.9578125,
"eval_loss": 0.11159060150384903,
"eval_runtime": 3.6016,
"eval_samples_per_second": 177.701,
"eval_steps_per_second": 2.777,
"step": 540
},
{
"epoch": 24.444444444444443,
"grad_norm": 7.8661627769470215,
"learning_rate": 2.777777777777778e-05,
"loss": 0.1287,
"step": 550
},
{
"epoch": 24.88888888888889,
"grad_norm": 7.934934616088867,
"learning_rate": 2.7272727272727273e-05,
"loss": 0.1394,
"step": 560
},
{
"epoch": 24.977777777777778,
"eval_accuracy": 0.9671875,
"eval_loss": 0.0806862860918045,
"eval_runtime": 3.8749,
"eval_samples_per_second": 165.167,
"eval_steps_per_second": 2.581,
"step": 562
},
{
"epoch": 25.333333333333332,
"grad_norm": 10.52723217010498,
"learning_rate": 2.676767676767677e-05,
"loss": 0.1524,
"step": 570
},
{
"epoch": 25.77777777777778,
"grad_norm": 7.490493297576904,
"learning_rate": 2.6262626262626268e-05,
"loss": 0.1584,
"step": 580
},
{
"epoch": 26.0,
"eval_accuracy": 0.9796875,
"eval_loss": 0.05252554267644882,
"eval_runtime": 3.9703,
"eval_samples_per_second": 161.197,
"eval_steps_per_second": 2.519,
"step": 585
},
{
"epoch": 26.22222222222222,
"grad_norm": 7.178821563720703,
"learning_rate": 2.575757575757576e-05,
"loss": 0.153,
"step": 590
},
{
"epoch": 26.666666666666668,
"grad_norm": 7.702730178833008,
"learning_rate": 2.5252525252525256e-05,
"loss": 0.1302,
"step": 600
},
{
"epoch": 26.977777777777778,
"eval_accuracy": 0.9828125,
"eval_loss": 0.05131406709551811,
"eval_runtime": 3.7014,
"eval_samples_per_second": 172.905,
"eval_steps_per_second": 2.702,
"step": 607
},
{
"epoch": 27.11111111111111,
"grad_norm": 9.482915878295898,
"learning_rate": 2.474747474747475e-05,
"loss": 0.1577,
"step": 610
},
{
"epoch": 27.555555555555557,
"grad_norm": 10.196369171142578,
"learning_rate": 2.4242424242424244e-05,
"loss": 0.1543,
"step": 620
},
{
"epoch": 28.0,
"grad_norm": 10.799006462097168,
"learning_rate": 2.3737373737373738e-05,
"loss": 0.1356,
"step": 630
},
{
"epoch": 28.0,
"eval_accuracy": 0.9875,
"eval_loss": 0.04204293340444565,
"eval_runtime": 3.9774,
"eval_samples_per_second": 160.91,
"eval_steps_per_second": 2.514,
"step": 630
},
{
"epoch": 28.444444444444443,
"grad_norm": 7.129752159118652,
"learning_rate": 2.3232323232323232e-05,
"loss": 0.1291,
"step": 640
},
{
"epoch": 28.88888888888889,
"grad_norm": 6.642085552215576,
"learning_rate": 2.272727272727273e-05,
"loss": 0.1101,
"step": 650
},
{
"epoch": 28.977777777777778,
"eval_accuracy": 0.9875,
"eval_loss": 0.03539272025227547,
"eval_runtime": 4.0166,
"eval_samples_per_second": 159.34,
"eval_steps_per_second": 2.49,
"step": 652
},
{
"epoch": 29.333333333333332,
"grad_norm": 8.5753755569458,
"learning_rate": 2.2222222222222223e-05,
"loss": 0.1445,
"step": 660
},
{
"epoch": 29.77777777777778,
"grad_norm": 13.183974266052246,
"learning_rate": 2.171717171717172e-05,
"loss": 0.1227,
"step": 670
},
{
"epoch": 30.0,
"eval_accuracy": 0.9765625,
"eval_loss": 0.0582769513130188,
"eval_runtime": 3.9913,
"eval_samples_per_second": 160.35,
"eval_steps_per_second": 2.505,
"step": 675
},
{
"epoch": 30.22222222222222,
"grad_norm": 9.026564598083496,
"learning_rate": 2.1212121212121215e-05,
"loss": 0.1209,
"step": 680
},
{
"epoch": 30.666666666666668,
"grad_norm": 12.504347801208496,
"learning_rate": 2.070707070707071e-05,
"loss": 0.1158,
"step": 690
},
{
"epoch": 30.977777777777778,
"eval_accuracy": 0.990625,
"eval_loss": 0.025325458496809006,
"eval_runtime": 3.9678,
"eval_samples_per_second": 161.299,
"eval_steps_per_second": 2.52,
"step": 697
},
{
"epoch": 31.11111111111111,
"grad_norm": 5.276214599609375,
"learning_rate": 2.0202020202020203e-05,
"loss": 0.119,
"step": 700
},
{
"epoch": 31.555555555555557,
"grad_norm": 8.732769012451172,
"learning_rate": 1.9696969696969697e-05,
"loss": 0.1156,
"step": 710
},
{
"epoch": 32.0,
"grad_norm": 5.604591369628906,
"learning_rate": 1.919191919191919e-05,
"loss": 0.117,
"step": 720
},
{
"epoch": 32.0,
"eval_accuracy": 0.990625,
"eval_loss": 0.023098567500710487,
"eval_runtime": 3.8505,
"eval_samples_per_second": 166.213,
"eval_steps_per_second": 2.597,
"step": 720
},
{
"epoch": 32.44444444444444,
"grad_norm": 7.332610607147217,
"learning_rate": 1.8686868686868688e-05,
"loss": 0.1213,
"step": 730
},
{
"epoch": 32.888888888888886,
"grad_norm": 12.890093803405762,
"learning_rate": 1.8181818181818182e-05,
"loss": 0.1022,
"step": 740
},
{
"epoch": 32.977777777777774,
"eval_accuracy": 0.9796875,
"eval_loss": 0.0725882276892662,
"eval_runtime": 3.8065,
"eval_samples_per_second": 168.135,
"eval_steps_per_second": 2.627,
"step": 742
},
{
"epoch": 33.333333333333336,
"grad_norm": 13.247682571411133,
"learning_rate": 1.7676767676767676e-05,
"loss": 0.1257,
"step": 750
},
{
"epoch": 33.77777777777778,
"grad_norm": 6.758236885070801,
"learning_rate": 1.7171717171717173e-05,
"loss": 0.1221,
"step": 760
},
{
"epoch": 34.0,
"eval_accuracy": 0.996875,
"eval_loss": 0.015964530408382416,
"eval_runtime": 3.7585,
"eval_samples_per_second": 170.283,
"eval_steps_per_second": 2.661,
"step": 765
},
{
"epoch": 34.22222222222222,
"grad_norm": 8.521262168884277,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.1014,
"step": 770
},
{
"epoch": 34.666666666666664,
"grad_norm": 5.949100971221924,
"learning_rate": 1.6161616161616165e-05,
"loss": 0.0956,
"step": 780
},
{
"epoch": 34.977777777777774,
"eval_accuracy": 0.984375,
"eval_loss": 0.048214979469776154,
"eval_runtime": 3.6909,
"eval_samples_per_second": 173.399,
"eval_steps_per_second": 2.709,
"step": 787
},
{
"epoch": 35.111111111111114,
"grad_norm": 10.151766777038574,
"learning_rate": 1.565656565656566e-05,
"loss": 0.1135,
"step": 790
},
{
"epoch": 35.55555555555556,
"grad_norm": 9.514137268066406,
"learning_rate": 1.5151515151515153e-05,
"loss": 0.1109,
"step": 800
},
{
"epoch": 36.0,
"grad_norm": 4.4278717041015625,
"learning_rate": 1.4646464646464647e-05,
"loss": 0.0856,
"step": 810
},
{
"epoch": 36.0,
"eval_accuracy": 0.9875,
"eval_loss": 0.025567293167114258,
"eval_runtime": 3.6537,
"eval_samples_per_second": 175.165,
"eval_steps_per_second": 2.737,
"step": 810
},
{
"epoch": 36.44444444444444,
"grad_norm": 8.582184791564941,
"learning_rate": 1.4141414141414141e-05,
"loss": 0.0994,
"step": 820
},
{
"epoch": 36.888888888888886,
"grad_norm": 9.628859519958496,
"learning_rate": 1.3636363636363637e-05,
"loss": 0.0996,
"step": 830
},
{
"epoch": 36.977777777777774,
"eval_accuracy": 0.990625,
"eval_loss": 0.021057253703475,
"eval_runtime": 3.6772,
"eval_samples_per_second": 174.046,
"eval_steps_per_second": 2.719,
"step": 832
},
{
"epoch": 37.333333333333336,
"grad_norm": 5.165952682495117,
"learning_rate": 1.3131313131313134e-05,
"loss": 0.0915,
"step": 840
},
{
"epoch": 37.77777777777778,
"grad_norm": 5.831385135650635,
"learning_rate": 1.2626262626262628e-05,
"loss": 0.0848,
"step": 850
},
{
"epoch": 38.0,
"eval_accuracy": 0.9796875,
"eval_loss": 0.04457371309399605,
"eval_runtime": 3.6584,
"eval_samples_per_second": 174.938,
"eval_steps_per_second": 2.733,
"step": 855
},
{
"epoch": 38.22222222222222,
"grad_norm": 9.629181861877441,
"learning_rate": 1.2121212121212122e-05,
"loss": 0.0972,
"step": 860
},
{
"epoch": 38.666666666666664,
"grad_norm": 6.214244365692139,
"learning_rate": 1.1616161616161616e-05,
"loss": 0.1001,
"step": 870
},
{
"epoch": 38.977777777777774,
"eval_accuracy": 0.9875,
"eval_loss": 0.02742326818406582,
"eval_runtime": 3.668,
"eval_samples_per_second": 174.481,
"eval_steps_per_second": 2.726,
"step": 877
},
{
"epoch": 39.111111111111114,
"grad_norm": 11.1734619140625,
"learning_rate": 1.1111111111111112e-05,
"loss": 0.0919,
"step": 880
},
{
"epoch": 39.55555555555556,
"grad_norm": 6.438005447387695,
"learning_rate": 1.0606060606060607e-05,
"loss": 0.0988,
"step": 890
},
{
"epoch": 40.0,
"grad_norm": 5.9803643226623535,
"learning_rate": 1.0101010101010101e-05,
"loss": 0.0976,
"step": 900
},
{
"epoch": 40.0,
"eval_accuracy": 0.9921875,
"eval_loss": 0.022529248148202896,
"eval_runtime": 3.7092,
"eval_samples_per_second": 172.543,
"eval_steps_per_second": 2.696,
"step": 900
},
{
"epoch": 40.44444444444444,
"grad_norm": 7.562661647796631,
"learning_rate": 9.595959595959595e-06,
"loss": 0.085,
"step": 910
},
{
"epoch": 40.888888888888886,
"grad_norm": 7.695030212402344,
"learning_rate": 9.090909090909091e-06,
"loss": 0.0864,
"step": 920
},
{
"epoch": 40.977777777777774,
"eval_accuracy": 0.9921875,
"eval_loss": 0.0207191314548254,
"eval_runtime": 4.0024,
"eval_samples_per_second": 159.904,
"eval_steps_per_second": 2.499,
"step": 922
},
{
"epoch": 41.333333333333336,
"grad_norm": 8.4052734375,
"learning_rate": 8.585858585858587e-06,
"loss": 0.088,
"step": 930
},
{
"epoch": 41.77777777777778,
"grad_norm": 8.705794334411621,
"learning_rate": 8.080808080808082e-06,
"loss": 0.0865,
"step": 940
},
{
"epoch": 42.0,
"eval_accuracy": 0.996875,
"eval_loss": 0.01933131366968155,
"eval_runtime": 3.9909,
"eval_samples_per_second": 160.365,
"eval_steps_per_second": 2.506,
"step": 945
},
{
"epoch": 42.22222222222222,
"grad_norm": 7.478874683380127,
"learning_rate": 7.5757575757575764e-06,
"loss": 0.0815,
"step": 950
},
{
"epoch": 42.666666666666664,
"grad_norm": 5.25657320022583,
"learning_rate": 7.0707070707070704e-06,
"loss": 0.0773,
"step": 960
},
{
"epoch": 42.977777777777774,
"eval_accuracy": 0.9921875,
"eval_loss": 0.020288193598389626,
"eval_runtime": 3.6594,
"eval_samples_per_second": 174.89,
"eval_steps_per_second": 2.733,
"step": 967
},
{
"epoch": 43.111111111111114,
"grad_norm": 4.1972246170043945,
"learning_rate": 6.565656565656567e-06,
"loss": 0.0799,
"step": 970
},
{
"epoch": 43.55555555555556,
"grad_norm": 6.9554972648620605,
"learning_rate": 6.060606060606061e-06,
"loss": 0.0772,
"step": 980
},
{
"epoch": 44.0,
"grad_norm": 6.343081951141357,
"learning_rate": 5.555555555555556e-06,
"loss": 0.075,
"step": 990
},
{
"epoch": 44.0,
"eval_accuracy": 0.996875,
"eval_loss": 0.013058523647487164,
"eval_runtime": 3.5774,
"eval_samples_per_second": 178.899,
"eval_steps_per_second": 2.795,
"step": 990
},
{
"epoch": 44.44444444444444,
"grad_norm": 4.900812149047852,
"learning_rate": 5.050505050505051e-06,
"loss": 0.0736,
"step": 1000
},
{
"epoch": 44.888888888888886,
"grad_norm": 5.955135345458984,
"learning_rate": 4.5454545454545455e-06,
"loss": 0.0761,
"step": 1010
},
{
"epoch": 44.977777777777774,
"eval_accuracy": 0.99375,
"eval_loss": 0.012860281392931938,
"eval_runtime": 3.5897,
"eval_samples_per_second": 178.288,
"eval_steps_per_second": 2.786,
"step": 1012
},
{
"epoch": 45.333333333333336,
"grad_norm": 4.250102996826172,
"learning_rate": 4.040404040404041e-06,
"loss": 0.0707,
"step": 1020
},
{
"epoch": 45.77777777777778,
"grad_norm": 6.8997931480407715,
"learning_rate": 3.5353535353535352e-06,
"loss": 0.0624,
"step": 1030
},
{
"epoch": 46.0,
"eval_accuracy": 0.996875,
"eval_loss": 0.011364495381712914,
"eval_runtime": 3.6128,
"eval_samples_per_second": 177.146,
"eval_steps_per_second": 2.768,
"step": 1035
},
{
"epoch": 46.22222222222222,
"grad_norm": 10.210082054138184,
"learning_rate": 3.0303030303030305e-06,
"loss": 0.0762,
"step": 1040
},
{
"epoch": 46.666666666666664,
"grad_norm": 4.9201788902282715,
"learning_rate": 2.5252525252525253e-06,
"loss": 0.0557,
"step": 1050
},
{
"epoch": 46.977777777777774,
"eval_accuracy": 0.9953125,
"eval_loss": 0.010208332911133766,
"eval_runtime": 3.8474,
"eval_samples_per_second": 166.347,
"eval_steps_per_second": 2.599,
"step": 1057
},
{
"epoch": 47.111111111111114,
"grad_norm": 3.725327491760254,
"learning_rate": 2.0202020202020206e-06,
"loss": 0.0613,
"step": 1060
},
{
"epoch": 47.55555555555556,
"grad_norm": 3.1549530029296875,
"learning_rate": 1.5151515151515152e-06,
"loss": 0.0601,
"step": 1070
},
{
"epoch": 48.0,
"grad_norm": 6.280518054962158,
"learning_rate": 1.0101010101010103e-06,
"loss": 0.0708,
"step": 1080
},
{
"epoch": 48.0,
"eval_accuracy": 0.9953125,
"eval_loss": 0.01160599384456873,
"eval_runtime": 3.9922,
"eval_samples_per_second": 160.314,
"eval_steps_per_second": 2.505,
"step": 1080
},
{
"epoch": 48.44444444444444,
"grad_norm": 6.1849260330200195,
"learning_rate": 5.050505050505052e-07,
"loss": 0.0699,
"step": 1090
},
{
"epoch": 48.888888888888886,
"grad_norm": 7.637501239776611,
"learning_rate": 0.0,
"loss": 0.0667,
"step": 1100
},
{
"epoch": 48.888888888888886,
"eval_accuracy": 0.9953125,
"eval_loss": 0.013088616542518139,
"eval_runtime": 3.601,
"eval_samples_per_second": 177.727,
"eval_steps_per_second": 2.777,
"step": 1100
},
{
"epoch": 48.888888888888886,
"step": 1100,
"total_flos": 5.510665685119795e+18,
"train_loss": 0.17931611462072894,
"train_runtime": 3528.3547,
"train_samples_per_second": 81.624,
"train_steps_per_second": 0.312
}
],
"logging_steps": 10,
"max_steps": 1100,
"num_input_tokens_seen": 0,
"num_train_epochs": 50,
"save_steps": 500,
"total_flos": 5.510665685119795e+18,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}