{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 2514,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002386634844868735,
"grad_norm": 79.25410461425781,
"learning_rate": 2.3809523809523808e-06,
"loss": 9.018,
"step": 3
},
{
"epoch": 0.00477326968973747,
"grad_norm": 69.27806854248047,
"learning_rate": 4.7619047619047615e-06,
"loss": 8.1397,
"step": 6
},
{
"epoch": 0.007159904534606206,
"grad_norm": 44.50214767456055,
"learning_rate": 7.142857142857143e-06,
"loss": 6.5055,
"step": 9
},
{
"epoch": 0.00954653937947494,
"grad_norm": 22.070268630981445,
"learning_rate": 9.523809523809523e-06,
"loss": 5.3274,
"step": 12
},
{
"epoch": 0.011933174224343675,
"grad_norm": 17.811525344848633,
"learning_rate": 1.1904761904761905e-05,
"loss": 5.0629,
"step": 15
},
{
"epoch": 0.014319809069212411,
"grad_norm": 13.21521282196045,
"learning_rate": 1.4285714285714285e-05,
"loss": 4.8887,
"step": 18
},
{
"epoch": 0.016706443914081145,
"grad_norm": 85.0751724243164,
"learning_rate": 1.6666666666666667e-05,
"loss": 4.6865,
"step": 21
},
{
"epoch": 0.01909307875894988,
"grad_norm": 11.005032539367676,
"learning_rate": 1.9047619047619046e-05,
"loss": 4.4609,
"step": 24
},
{
"epoch": 0.021479713603818614,
"grad_norm": 9.891936302185059,
"learning_rate": 2.1428571428571428e-05,
"loss": 4.4208,
"step": 27
},
{
"epoch": 0.02386634844868735,
"grad_norm": 6.92100715637207,
"learning_rate": 2.380952380952381e-05,
"loss": 4.3251,
"step": 30
},
{
"epoch": 0.026252983293556086,
"grad_norm": 4.971158504486084,
"learning_rate": 2.6190476190476192e-05,
"loss": 3.9813,
"step": 33
},
{
"epoch": 0.028639618138424822,
"grad_norm": 4.140079021453857,
"learning_rate": 2.857142857142857e-05,
"loss": 4.0783,
"step": 36
},
{
"epoch": 0.031026252983293555,
"grad_norm": 4.761113166809082,
"learning_rate": 3.095238095238095e-05,
"loss": 4.1418,
"step": 39
},
{
"epoch": 0.03341288782816229,
"grad_norm": 5.746503829956055,
"learning_rate": 3.3333333333333335e-05,
"loss": 3.8944,
"step": 42
},
{
"epoch": 0.03579952267303103,
"grad_norm": 4.554972171783447,
"learning_rate": 3.571428571428572e-05,
"loss": 3.7807,
"step": 45
},
{
"epoch": 0.03818615751789976,
"grad_norm": 3.873955726623535,
"learning_rate": 3.809523809523809e-05,
"loss": 3.607,
"step": 48
},
{
"epoch": 0.0405727923627685,
"grad_norm": 3.376633644104004,
"learning_rate": 4.047619047619048e-05,
"loss": 3.4685,
"step": 51
},
{
"epoch": 0.04295942720763723,
"grad_norm": 5.888181686401367,
"learning_rate": 4.2857142857142856e-05,
"loss": 3.4832,
"step": 54
},
{
"epoch": 0.045346062052505964,
"grad_norm": 5.094118595123291,
"learning_rate": 4.523809523809524e-05,
"loss": 3.3647,
"step": 57
},
{
"epoch": 0.0477326968973747,
"grad_norm": 4.940067291259766,
"learning_rate": 4.761904761904762e-05,
"loss": 3.2268,
"step": 60
},
{
"epoch": 0.050119331742243436,
"grad_norm": 3.514155149459839,
"learning_rate": 5e-05,
"loss": 3.1229,
"step": 63
},
{
"epoch": 0.05250596658711217,
"grad_norm": 3.922811985015869,
"learning_rate": 5.2380952380952384e-05,
"loss": 3.0612,
"step": 66
},
{
"epoch": 0.05489260143198091,
"grad_norm": 3.203274726867676,
"learning_rate": 5.4761904761904766e-05,
"loss": 2.8537,
"step": 69
},
{
"epoch": 0.057279236276849645,
"grad_norm": 2.8945529460906982,
"learning_rate": 5.714285714285714e-05,
"loss": 2.9128,
"step": 72
},
{
"epoch": 0.059665871121718374,
"grad_norm": 3.287409543991089,
"learning_rate": 5.9523809523809524e-05,
"loss": 2.8958,
"step": 75
},
{
"epoch": 0.06205250596658711,
"grad_norm": 3.5249319076538086,
"learning_rate": 6.19047619047619e-05,
"loss": 2.8925,
"step": 78
},
{
"epoch": 0.06443914081145585,
"grad_norm": 3.4028635025024414,
"learning_rate": 6.428571428571429e-05,
"loss": 2.8153,
"step": 81
},
{
"epoch": 0.06682577565632458,
"grad_norm": 3.1509742736816406,
"learning_rate": 6.666666666666667e-05,
"loss": 2.7562,
"step": 84
},
{
"epoch": 0.06921241050119331,
"grad_norm": 2.745244264602661,
"learning_rate": 6.904761904761905e-05,
"loss": 2.6552,
"step": 87
},
{
"epoch": 0.07159904534606205,
"grad_norm": 2.562229633331299,
"learning_rate": 7.142857142857143e-05,
"loss": 2.6686,
"step": 90
},
{
"epoch": 0.07398568019093078,
"grad_norm": 3.0172386169433594,
"learning_rate": 7.380952380952382e-05,
"loss": 2.782,
"step": 93
},
{
"epoch": 0.07637231503579953,
"grad_norm": 3.038167953491211,
"learning_rate": 7.619047619047618e-05,
"loss": 2.6845,
"step": 96
},
{
"epoch": 0.07875894988066826,
"grad_norm": 3.3530476093292236,
"learning_rate": 7.857142857142858e-05,
"loss": 2.5982,
"step": 99
},
{
"epoch": 0.081145584725537,
"grad_norm": 2.9540939331054688,
"learning_rate": 8.095238095238096e-05,
"loss": 2.5005,
"step": 102
},
{
"epoch": 0.08353221957040573,
"grad_norm": 2.9407269954681396,
"learning_rate": 8.333333333333334e-05,
"loss": 2.5421,
"step": 105
},
{
"epoch": 0.08591885441527446,
"grad_norm": 2.8846545219421387,
"learning_rate": 8.571428571428571e-05,
"loss": 2.5134,
"step": 108
},
{
"epoch": 0.0883054892601432,
"grad_norm": 3.2429230213165283,
"learning_rate": 8.80952380952381e-05,
"loss": 2.4542,
"step": 111
},
{
"epoch": 0.09069212410501193,
"grad_norm": 2.5776901245117188,
"learning_rate": 9.047619047619048e-05,
"loss": 2.5908,
"step": 114
},
{
"epoch": 0.09307875894988067,
"grad_norm": 2.4036340713500977,
"learning_rate": 9.285714285714286e-05,
"loss": 2.4376,
"step": 117
},
{
"epoch": 0.0954653937947494,
"grad_norm": 3.9573254585266113,
"learning_rate": 9.523809523809524e-05,
"loss": 2.4042,
"step": 120
},
{
"epoch": 0.09785202863961814,
"grad_norm": 3.6064600944519043,
"learning_rate": 9.761904761904762e-05,
"loss": 2.5074,
"step": 123
},
{
"epoch": 0.10023866348448687,
"grad_norm": 3.3975508213043213,
"learning_rate": 0.0001,
"loss": 2.2322,
"step": 126
},
{
"epoch": 0.1026252983293556,
"grad_norm": 2.3529391288757324,
"learning_rate": 9.999961058466053e-05,
"loss": 2.4629,
"step": 129
},
{
"epoch": 0.10501193317422435,
"grad_norm": 2.434084892272949,
"learning_rate": 9.999844234470782e-05,
"loss": 2.3296,
"step": 132
},
{
"epoch": 0.10739856801909307,
"grad_norm": 2.450005054473877,
"learning_rate": 9.999649529833915e-05,
"loss": 2.3464,
"step": 135
},
{
"epoch": 0.10978520286396182,
"grad_norm": 3.4513914585113525,
"learning_rate": 9.999376947588288e-05,
"loss": 2.4644,
"step": 138
},
{
"epoch": 0.11217183770883055,
"grad_norm": 3.2405099868774414,
"learning_rate": 9.999026491979808e-05,
"loss": 2.3977,
"step": 141
},
{
"epoch": 0.11455847255369929,
"grad_norm": 1.8948777914047241,
"learning_rate": 9.99859816846739e-05,
"loss": 2.3372,
"step": 144
},
{
"epoch": 0.11694510739856802,
"grad_norm": 2.3878233432769775,
"learning_rate": 9.998091983722863e-05,
"loss": 2.3408,
"step": 147
},
{
"epoch": 0.11933174224343675,
"grad_norm": 2.373782157897949,
"learning_rate": 9.99750794563087e-05,
"loss": 2.1927,
"step": 150
},
{
"epoch": 0.12171837708830549,
"grad_norm": 2.473146677017212,
"learning_rate": 9.996846063288747e-05,
"loss": 2.2674,
"step": 153
},
{
"epoch": 0.12410501193317422,
"grad_norm": 2.17854905128479,
"learning_rate": 9.996106347006379e-05,
"loss": 2.4093,
"step": 156
},
{
"epoch": 0.12649164677804295,
"grad_norm": 2.668506622314453,
"learning_rate": 9.99528880830604e-05,
"loss": 2.234,
"step": 159
},
{
"epoch": 0.1288782816229117,
"grad_norm": 2.30295991897583,
"learning_rate": 9.994393459922218e-05,
"loss": 2.174,
"step": 162
},
{
"epoch": 0.13126491646778043,
"grad_norm": 2.299704074859619,
"learning_rate": 9.993420315801406e-05,
"loss": 2.1369,
"step": 165
},
{
"epoch": 0.13365155131264916,
"grad_norm": 4.072019100189209,
"learning_rate": 9.992369391101895e-05,
"loss": 2.2051,
"step": 168
},
{
"epoch": 0.1360381861575179,
"grad_norm": 2.203756809234619,
"learning_rate": 9.991240702193532e-05,
"loss": 2.3608,
"step": 171
},
{
"epoch": 0.13842482100238662,
"grad_norm": 2.2594192028045654,
"learning_rate": 9.990034266657467e-05,
"loss": 2.2503,
"step": 174
},
{
"epoch": 0.14081145584725538,
"grad_norm": 2.214170217514038,
"learning_rate": 9.988750103285883e-05,
"loss": 2.1698,
"step": 177
},
{
"epoch": 0.1431980906921241,
"grad_norm": 2.1277706623077393,
"learning_rate": 9.987388232081694e-05,
"loss": 2.2199,
"step": 180
},
{
"epoch": 0.14558472553699284,
"grad_norm": 2.1861696243286133,
"learning_rate": 9.985948674258243e-05,
"loss": 2.1487,
"step": 183
},
{
"epoch": 0.14797136038186157,
"grad_norm": 2.062450647354126,
"learning_rate": 9.984431452238967e-05,
"loss": 2.2716,
"step": 186
},
{
"epoch": 0.15035799522673032,
"grad_norm": 2.335073471069336,
"learning_rate": 9.982836589657043e-05,
"loss": 2.1853,
"step": 189
},
{
"epoch": 0.15274463007159905,
"grad_norm": 1.7519389390945435,
"learning_rate": 9.981164111355035e-05,
"loss": 2.2452,
"step": 192
},
{
"epoch": 0.15513126491646778,
"grad_norm": 4.789851665496826,
"learning_rate": 9.979414043384485e-05,
"loss": 2.224,
"step": 195
},
{
"epoch": 0.1575178997613365,
"grad_norm": 2.454585552215576,
"learning_rate": 9.977586413005531e-05,
"loss": 2.2943,
"step": 198
},
{
"epoch": 0.15990453460620524,
"grad_norm": 2.2278664112091064,
"learning_rate": 9.975681248686461e-05,
"loss": 2.2947,
"step": 201
},
{
"epoch": 0.162291169451074,
"grad_norm": 2.118945837020874,
"learning_rate": 9.973698580103285e-05,
"loss": 2.2039,
"step": 204
},
{
"epoch": 0.16467780429594273,
"grad_norm": 2.1315999031066895,
"learning_rate": 9.971638438139266e-05,
"loss": 2.3493,
"step": 207
},
{
"epoch": 0.16706443914081145,
"grad_norm": 2.902245283126831,
"learning_rate": 9.96950085488444e-05,
"loss": 2.0854,
"step": 210
},
{
"epoch": 0.16945107398568018,
"grad_norm": 2.160647392272949,
"learning_rate": 9.967285863635112e-05,
"loss": 2.3386,
"step": 213
},
{
"epoch": 0.1718377088305489,
"grad_norm": 3.592740774154663,
"learning_rate": 9.964993498893349e-05,
"loss": 2.2387,
"step": 216
},
{
"epoch": 0.17422434367541767,
"grad_norm": 3.9422338008880615,
"learning_rate": 9.962623796366429e-05,
"loss": 2.2304,
"step": 219
},
{
"epoch": 0.1766109785202864,
"grad_norm": 3.213841438293457,
"learning_rate": 9.960176792966289e-05,
"loss": 2.1835,
"step": 222
},
{
"epoch": 0.17899761336515513,
"grad_norm": 2.4797472953796387,
"learning_rate": 9.95765252680896e-05,
"loss": 2.2491,
"step": 225
},
{
"epoch": 0.18138424821002386,
"grad_norm": 2.0377020835876465,
"learning_rate": 9.95505103721396e-05,
"loss": 2.1921,
"step": 228
},
{
"epoch": 0.18377088305489261,
"grad_norm": 2.2581214904785156,
"learning_rate": 9.952372364703687e-05,
"loss": 2.2655,
"step": 231
},
{
"epoch": 0.18615751789976134,
"grad_norm": 2.89260196685791,
"learning_rate": 9.949616551002787e-05,
"loss": 2.1804,
"step": 234
},
{
"epoch": 0.18854415274463007,
"grad_norm": 2.4222378730773926,
"learning_rate": 9.946783639037504e-05,
"loss": 2.1525,
"step": 237
},
{
"epoch": 0.1909307875894988,
"grad_norm": 2.1491341590881348,
"learning_rate": 9.943873672935014e-05,
"loss": 2.1605,
"step": 240
},
{
"epoch": 0.19331742243436753,
"grad_norm": 1.9782716035842896,
"learning_rate": 9.940886698022734e-05,
"loss": 2.0402,
"step": 243
},
{
"epoch": 0.1957040572792363,
"grad_norm": 1.9801812171936035,
"learning_rate": 9.93782276082762e-05,
"loss": 2.1493,
"step": 246
},
{
"epoch": 0.19809069212410502,
"grad_norm": 1.7495440244674683,
"learning_rate": 9.934681909075434e-05,
"loss": 2.1087,
"step": 249
},
{
"epoch": 0.20047732696897375,
"grad_norm": 1.9998902082443237,
"learning_rate": 9.931464191690015e-05,
"loss": 2.1841,
"step": 252
},
{
"epoch": 0.20286396181384247,
"grad_norm": 2.2117197513580322,
"learning_rate": 9.928169658792498e-05,
"loss": 2.1848,
"step": 255
},
{
"epoch": 0.2052505966587112,
"grad_norm": 2.210057497024536,
"learning_rate": 9.924798361700553e-05,
"loss": 2.1324,
"step": 258
},
{
"epoch": 0.20763723150357996,
"grad_norm": 1.9545291662216187,
"learning_rate": 9.92135035292757e-05,
"loss": 2.1454,
"step": 261
},
{
"epoch": 0.2100238663484487,
"grad_norm": 2.3102903366088867,
"learning_rate": 9.91782568618185e-05,
"loss": 2.183,
"step": 264
},
{
"epoch": 0.21241050119331742,
"grad_norm": 1.937774419784546,
"learning_rate": 9.914224416365764e-05,
"loss": 2.2033,
"step": 267
},
{
"epoch": 0.21479713603818615,
"grad_norm": 1.8689621686935425,
"learning_rate": 9.910546599574902e-05,
"loss": 2.3252,
"step": 270
},
{
"epoch": 0.2171837708830549,
"grad_norm": 2.4020369052886963,
"learning_rate": 9.906792293097194e-05,
"loss": 2.1485,
"step": 273
},
{
"epoch": 0.21957040572792363,
"grad_norm": 4.170408248901367,
"learning_rate": 9.90296155541202e-05,
"loss": 2.1413,
"step": 276
},
{
"epoch": 0.22195704057279236,
"grad_norm": 1.9683239459991455,
"learning_rate": 9.899054446189304e-05,
"loss": 1.9998,
"step": 279
},
{
"epoch": 0.2243436754176611,
"grad_norm": 1.9885108470916748,
"learning_rate": 9.895071026288574e-05,
"loss": 2.0373,
"step": 282
},
{
"epoch": 0.22673031026252982,
"grad_norm": 2.297595977783203,
"learning_rate": 9.891011357758022e-05,
"loss": 2.0459,
"step": 285
},
{
"epoch": 0.22911694510739858,
"grad_norm": 2.0850205421447754,
"learning_rate": 9.886875503833536e-05,
"loss": 2.2041,
"step": 288
},
{
"epoch": 0.2315035799522673,
"grad_norm": 2.3734419345855713,
"learning_rate": 9.882663528937717e-05,
"loss": 2.1135,
"step": 291
},
{
"epoch": 0.23389021479713604,
"grad_norm": 2.0763649940490723,
"learning_rate": 9.87837549867887e-05,
"loss": 1.955,
"step": 294
},
{
"epoch": 0.23627684964200477,
"grad_norm": 2.171724557876587,
"learning_rate": 9.87401147984998e-05,
"loss": 2.2409,
"step": 297
},
{
"epoch": 0.2386634844868735,
"grad_norm": 2.6933743953704834,
"learning_rate": 9.869571540427689e-05,
"loss": 2.1241,
"step": 300
},
{
"epoch": 0.24105011933174225,
"grad_norm": 1.917708396911621,
"learning_rate": 9.865055749571215e-05,
"loss": 2.1745,
"step": 303
},
{
"epoch": 0.24343675417661098,
"grad_norm": 2.3014838695526123,
"learning_rate": 9.860464177621284e-05,
"loss": 1.9313,
"step": 306
},
{
"epoch": 0.2458233890214797,
"grad_norm": 2.2263333797454834,
"learning_rate": 9.855796896099045e-05,
"loss": 2.1881,
"step": 309
},
{
"epoch": 0.24821002386634844,
"grad_norm": 2.161557197570801,
"learning_rate": 9.851053977704931e-05,
"loss": 1.9702,
"step": 312
},
{
"epoch": 0.25059665871121717,
"grad_norm": 2.1806318759918213,
"learning_rate": 9.846235496317555e-05,
"loss": 1.9893,
"step": 315
},
{
"epoch": 0.2529832935560859,
"grad_norm": 2.226500988006592,
"learning_rate": 9.841341526992536e-05,
"loss": 2.0987,
"step": 318
},
{
"epoch": 0.2553699284009546,
"grad_norm": 1.8919053077697754,
"learning_rate": 9.836372145961345e-05,
"loss": 2.1568,
"step": 321
},
{
"epoch": 0.2577565632458234,
"grad_norm": 1.9861871004104614,
"learning_rate": 9.83132743063011e-05,
"loss": 2.1504,
"step": 324
},
{
"epoch": 0.26014319809069214,
"grad_norm": 2.438837766647339,
"learning_rate": 9.826207459578411e-05,
"loss": 1.9779,
"step": 327
},
{
"epoch": 0.26252983293556087,
"grad_norm": 2.6724672317504883,
"learning_rate": 9.821012312558058e-05,
"loss": 1.9427,
"step": 330
},
{
"epoch": 0.2649164677804296,
"grad_norm": 1.813621163368225,
"learning_rate": 9.815742070491852e-05,
"loss": 2.1153,
"step": 333
},
{
"epoch": 0.26730310262529833,
"grad_norm": 2.0258703231811523,
"learning_rate": 9.810396815472314e-05,
"loss": 2.02,
"step": 336
},
{
"epoch": 0.26968973747016706,
"grad_norm": 1.9743555784225464,
"learning_rate": 9.804976630760419e-05,
"loss": 2.0681,
"step": 339
},
{
"epoch": 0.2720763723150358,
"grad_norm": 1.8202836513519287,
"learning_rate": 9.799481600784286e-05,
"loss": 1.9733,
"step": 342
},
{
"epoch": 0.2744630071599045,
"grad_norm": 2.296698808670044,
"learning_rate": 9.793911811137875e-05,
"loss": 2.0407,
"step": 345
},
{
"epoch": 0.27684964200477324,
"grad_norm": 2.002511501312256,
"learning_rate": 9.788267348579648e-05,
"loss": 2.0666,
"step": 348
},
{
"epoch": 0.27923627684964203,
"grad_norm": 2.08190655708313,
"learning_rate": 9.782548301031217e-05,
"loss": 2.0803,
"step": 351
},
{
"epoch": 0.28162291169451076,
"grad_norm": 2.150238513946533,
"learning_rate": 9.776754757575975e-05,
"loss": 2.0132,
"step": 354
},
{
"epoch": 0.2840095465393795,
"grad_norm": 2.2527501583099365,
"learning_rate": 9.770886808457709e-05,
"loss": 2.0905,
"step": 357
},
{
"epoch": 0.2863961813842482,
"grad_norm": 2.284032106399536,
"learning_rate": 9.764944545079196e-05,
"loss": 2.1845,
"step": 360
},
{
"epoch": 0.28878281622911695,
"grad_norm": 1.89547598361969,
"learning_rate": 9.758928060000778e-05,
"loss": 2.1135,
"step": 363
},
{
"epoch": 0.2911694510739857,
"grad_norm": 2.5930140018463135,
"learning_rate": 9.752837446938915e-05,
"loss": 1.9517,
"step": 366
},
{
"epoch": 0.2935560859188544,
"grad_norm": 1.86557137966156,
"learning_rate": 9.746672800764735e-05,
"loss": 1.9579,
"step": 369
},
{
"epoch": 0.29594272076372313,
"grad_norm": 1.8821232318878174,
"learning_rate": 9.740434217502547e-05,
"loss": 1.9665,
"step": 372
},
{
"epoch": 0.29832935560859186,
"grad_norm": 2.0216808319091797,
"learning_rate": 9.734121794328357e-05,
"loss": 2.0612,
"step": 375
},
{
"epoch": 0.30071599045346065,
"grad_norm": 1.8245267868041992,
"learning_rate": 9.727735629568336e-05,
"loss": 2.0857,
"step": 378
},
{
"epoch": 0.3031026252983294,
"grad_norm": 1.9001407623291016,
"learning_rate": 9.721275822697306e-05,
"loss": 1.9954,
"step": 381
},
{
"epoch": 0.3054892601431981,
"grad_norm": 1.7628698348999023,
"learning_rate": 9.714742474337186e-05,
"loss": 2.1095,
"step": 384
},
{
"epoch": 0.30787589498806683,
"grad_norm": 1.687436819076538,
"learning_rate": 9.708135686255416e-05,
"loss": 2.1919,
"step": 387
},
{
"epoch": 0.31026252983293556,
"grad_norm": 2.219071388244629,
"learning_rate": 9.701455561363379e-05,
"loss": 1.9392,
"step": 390
},
{
"epoch": 0.3126491646778043,
"grad_norm": 1.9545831680297852,
"learning_rate": 9.6947022037148e-05,
"loss": 1.9879,
"step": 393
},
{
"epoch": 0.315035799522673,
"grad_norm": 2.1863789558410645,
"learning_rate": 9.687875718504126e-05,
"loss": 1.9631,
"step": 396
},
{
"epoch": 0.31742243436754175,
"grad_norm": 3.1137101650238037,
"learning_rate": 9.680976212064874e-05,
"loss": 2.0387,
"step": 399
},
{
"epoch": 0.3198090692124105,
"grad_norm": 1.9021557569503784,
"learning_rate": 9.674003791867991e-05,
"loss": 2.0447,
"step": 402
},
{
"epoch": 0.3221957040572792,
"grad_norm": 1.8783704042434692,
"learning_rate": 9.666958566520174e-05,
"loss": 2.0777,
"step": 405
},
{
"epoch": 0.324582338902148,
"grad_norm": 1.910521388053894,
"learning_rate": 9.659840645762175e-05,
"loss": 2.1084,
"step": 408
},
{
"epoch": 0.3269689737470167,
"grad_norm": 1.9456645250320435,
"learning_rate": 9.652650140467093e-05,
"loss": 2.0317,
"step": 411
},
{
"epoch": 0.32935560859188545,
"grad_norm": 1.8211055994033813,
"learning_rate": 9.645387162638652e-05,
"loss": 2.0386,
"step": 414
},
{
"epoch": 0.3317422434367542,
"grad_norm": 2.032345771789551,
"learning_rate": 9.638051825409453e-05,
"loss": 2.2154,
"step": 417
},
{
"epoch": 0.3341288782816229,
"grad_norm": 1.869388461112976,
"learning_rate": 9.630644243039207e-05,
"loss": 1.9595,
"step": 420
},
{
"epoch": 0.33651551312649164,
"grad_norm": 1.9663021564483643,
"learning_rate": 9.623164530912963e-05,
"loss": 2.1678,
"step": 423
},
{
"epoch": 0.33890214797136037,
"grad_norm": 1.684556484222412,
"learning_rate": 9.615612805539305e-05,
"loss": 1.9458,
"step": 426
},
{
"epoch": 0.3412887828162291,
"grad_norm": 2.050321102142334,
"learning_rate": 9.607989184548543e-05,
"loss": 2.0412,
"step": 429
},
{
"epoch": 0.3436754176610978,
"grad_norm": 1.753670334815979,
"learning_rate": 9.600293786690872e-05,
"loss": 2.037,
"step": 432
},
{
"epoch": 0.3460620525059666,
"grad_norm": 1.8368828296661377,
"learning_rate": 9.592526731834537e-05,
"loss": 2.1845,
"step": 435
},
{
"epoch": 0.34844868735083534,
"grad_norm": 1.8000643253326416,
"learning_rate": 9.584688140963944e-05,
"loss": 1.8592,
"step": 438
},
{
"epoch": 0.35083532219570407,
"grad_norm": 1.7274373769760132,
"learning_rate": 9.576778136177798e-05,
"loss": 2.0366,
"step": 441
},
{
"epoch": 0.3532219570405728,
"grad_norm": 1.8421188592910767,
"learning_rate": 9.568796840687184e-05,
"loss": 1.9281,
"step": 444
},
{
"epoch": 0.3556085918854415,
"grad_norm": 1.8605895042419434,
"learning_rate": 9.560744378813659e-05,
"loss": 2.0214,
"step": 447
},
{
"epoch": 0.35799522673031026,
"grad_norm": 2.4788737297058105,
"learning_rate": 9.552620875987311e-05,
"loss": 1.9457,
"step": 450
},
{
"epoch": 0.360381861575179,
"grad_norm": 3.01055645942688,
"learning_rate": 9.544426458744804e-05,
"loss": 1.879,
"step": 453
},
{
"epoch": 0.3627684964200477,
"grad_norm": 2.6994807720184326,
"learning_rate": 9.536161254727408e-05,
"loss": 1.9495,
"step": 456
},
{
"epoch": 0.36515513126491644,
"grad_norm": 1.736722469329834,
"learning_rate": 9.527825392679012e-05,
"loss": 1.942,
"step": 459
},
{
"epoch": 0.36754176610978523,
"grad_norm": 2.2707650661468506,
"learning_rate": 9.51941900244412e-05,
"loss": 1.9525,
"step": 462
},
{
"epoch": 0.36992840095465396,
"grad_norm": 1.7479629516601562,
"learning_rate": 9.51094221496582e-05,
"loss": 1.9385,
"step": 465
},
{
"epoch": 0.3723150357995227,
"grad_norm": 2.1110448837280273,
"learning_rate": 9.502395162283759e-05,
"loss": 1.8335,
"step": 468
},
{
"epoch": 0.3747016706443914,
"grad_norm": 1.8206532001495361,
"learning_rate": 9.493777977532072e-05,
"loss": 1.9642,
"step": 471
},
{
"epoch": 0.37708830548926014,
"grad_norm": 2.622044801712036,
"learning_rate": 9.485090794937319e-05,
"loss": 1.9383,
"step": 474
},
{
"epoch": 0.3794749403341289,
"grad_norm": 2.016352653503418,
"learning_rate": 9.476333749816382e-05,
"loss": 1.8639,
"step": 477
},
{
"epoch": 0.3818615751789976,
"grad_norm": 2.027357816696167,
"learning_rate": 9.467506978574371e-05,
"loss": 1.862,
"step": 480
},
{
"epoch": 0.38424821002386633,
"grad_norm": 1.91681969165802,
"learning_rate": 9.45861061870249e-05,
"loss": 2.0105,
"step": 483
},
{
"epoch": 0.38663484486873506,
"grad_norm": 1.8423808813095093,
"learning_rate": 9.449644808775902e-05,
"loss": 1.9303,
"step": 486
},
{
"epoch": 0.38902147971360385,
"grad_norm": 1.5539026260375977,
"learning_rate": 9.44060968845156e-05,
"loss": 1.9071,
"step": 489
},
{
"epoch": 0.3914081145584726,
"grad_norm": 1.8300362825393677,
"learning_rate": 9.431505398466045e-05,
"loss": 2.0194,
"step": 492
},
{
"epoch": 0.3937947494033413,
"grad_norm": 1.6660507917404175,
"learning_rate": 9.42233208063336e-05,
"loss": 1.8226,
"step": 495
},
{
"epoch": 0.39618138424821003,
"grad_norm": 1.9865005016326904,
"learning_rate": 9.413089877842736e-05,
"loss": 2.0689,
"step": 498
},
{
"epoch": 0.39856801909307876,
"grad_norm": 1.9386086463928223,
"learning_rate": 9.403778934056391e-05,
"loss": 2.0289,
"step": 501
},
{
"epoch": 0.4009546539379475,
"grad_norm": 1.9398423433303833,
"learning_rate": 9.394399394307303e-05,
"loss": 2.2213,
"step": 504
},
{
"epoch": 0.4033412887828162,
"grad_norm": 1.864970326423645,
"learning_rate": 9.384951404696933e-05,
"loss": 1.8574,
"step": 507
},
{
"epoch": 0.40572792362768495,
"grad_norm": 1.9465175867080688,
"learning_rate": 9.375435112392969e-05,
"loss": 2.0628,
"step": 510
},
{
"epoch": 0.4081145584725537,
"grad_norm": 1.808294415473938,
"learning_rate": 9.365850665627016e-05,
"loss": 1.9223,
"step": 513
},
{
"epoch": 0.4105011933174224,
"grad_norm": 2.4403979778289795,
"learning_rate": 9.356198213692297e-05,
"loss": 1.8865,
"step": 516
},
{
"epoch": 0.4128878281622912,
"grad_norm": 1.8101935386657715,
"learning_rate": 9.346477906941331e-05,
"loss": 1.8335,
"step": 519
},
{
"epoch": 0.4152744630071599,
"grad_norm": 1.718172311782837,
"learning_rate": 9.336689896783573e-05,
"loss": 1.8691,
"step": 522
},
{
"epoch": 0.41766109785202865,
"grad_norm": 1.8754642009735107,
"learning_rate": 9.32683433568308e-05,
"loss": 1.9212,
"step": 525
},
{
"epoch": 0.4200477326968974,
"grad_norm": 2.471646785736084,
"learning_rate": 9.316911377156117e-05,
"loss": 2.0121,
"step": 528
},
{
"epoch": 0.4224343675417661,
"grad_norm": 2.1250839233398438,
"learning_rate": 9.306921175768775e-05,
"loss": 1.9309,
"step": 531
},
{
"epoch": 0.42482100238663484,
"grad_norm": 2.0609402656555176,
"learning_rate": 9.29686388713456e-05,
"loss": 2.1438,
"step": 534
},
{
"epoch": 0.42720763723150357,
"grad_norm": 1.7126508951187134,
"learning_rate": 9.286739667911972e-05,
"loss": 1.9621,
"step": 537
},
{
"epoch": 0.4295942720763723,
"grad_norm": 1.7493348121643066,
"learning_rate": 9.276548675802059e-05,
"loss": 2.0063,
"step": 540
},
{
"epoch": 0.431980906921241,
"grad_norm": 1.8076331615447998,
"learning_rate": 9.266291069545972e-05,
"loss": 1.9256,
"step": 543
},
{
"epoch": 0.4343675417661098,
"grad_norm": 1.6762983798980713,
"learning_rate": 9.255967008922474e-05,
"loss": 2.0052,
"step": 546
},
{
"epoch": 0.43675417661097854,
"grad_norm": 1.4335881471633911,
"learning_rate": 9.245576654745471e-05,
"loss": 2.0899,
"step": 549
},
{
"epoch": 0.43914081145584727,
"grad_norm": 1.6479798555374146,
"learning_rate": 9.235120168861496e-05,
"loss": 1.7962,
"step": 552
},
{
"epoch": 0.441527446300716,
"grad_norm": 2.1050121784210205,
"learning_rate": 9.224597714147186e-05,
"loss": 2.0109,
"step": 555
},
{
"epoch": 0.4439140811455847,
"grad_norm": 1.6112617254257202,
"learning_rate": 9.214009454506753e-05,
"loss": 1.8432,
"step": 558
},
{
"epoch": 0.44630071599045346,
"grad_norm": 1.8361741304397583,
"learning_rate": 9.203355554869428e-05,
"loss": 1.9433,
"step": 561
},
{
"epoch": 0.4486873508353222,
"grad_norm": 3.137519121170044,
"learning_rate": 9.192636181186888e-05,
"loss": 1.7776,
"step": 564
},
{
"epoch": 0.4510739856801909,
"grad_norm": 1.79214346408844,
"learning_rate": 9.181851500430673e-05,
"loss": 1.8203,
"step": 567
},
{
"epoch": 0.45346062052505964,
"grad_norm": 2.010784149169922,
"learning_rate": 9.171001680589588e-05,
"loss": 1.8505,
"step": 570
},
{
"epoch": 0.45584725536992843,
"grad_norm": 2.2128775119781494,
"learning_rate": 9.160086890667086e-05,
"loss": 1.9007,
"step": 573
},
{
"epoch": 0.45823389021479716,
"grad_norm": 1.6658575534820557,
"learning_rate": 9.14910730067863e-05,
"loss": 1.9516,
"step": 576
},
{
"epoch": 0.4606205250596659,
"grad_norm": 1.6554961204528809,
"learning_rate": 9.138063081649051e-05,
"loss": 2.0685,
"step": 579
},
{
"epoch": 0.4630071599045346,
"grad_norm": 1.6959861516952515,
"learning_rate": 9.126954405609882e-05,
"loss": 1.9156,
"step": 582
},
{
"epoch": 0.46539379474940334,
"grad_norm": 1.795530915260315,
"learning_rate": 9.115781445596676e-05,
"loss": 1.7886,
"step": 585
},
{
"epoch": 0.4677804295942721,
"grad_norm": 1.9987218379974365,
"learning_rate": 9.104544375646313e-05,
"loss": 2.1443,
"step": 588
},
{
"epoch": 0.4701670644391408,
"grad_norm": 1.8751397132873535,
"learning_rate": 9.093243370794291e-05,
"loss": 1.8975,
"step": 591
},
{
"epoch": 0.47255369928400953,
"grad_norm": 1.921032428741455,
"learning_rate": 9.081878607071996e-05,
"loss": 1.9969,
"step": 594
},
{
"epoch": 0.47494033412887826,
"grad_norm": 4.745877742767334,
"learning_rate": 9.07045026150396e-05,
"loss": 1.9351,
"step": 597
},
{
"epoch": 0.477326968973747,
"grad_norm": 1.8512818813323975,
"learning_rate": 9.058958512105104e-05,
"loss": 1.8773,
"step": 600
},
{
"epoch": 0.4797136038186158,
"grad_norm": 2.7076666355133057,
"learning_rate": 9.047403537877971e-05,
"loss": 1.8907,
"step": 603
},
{
"epoch": 0.4821002386634845,
"grad_norm": 1.9602298736572266,
"learning_rate": 9.035785518809927e-05,
"loss": 1.955,
"step": 606
},
{
"epoch": 0.48448687350835323,
"grad_norm": 1.8718470335006714,
"learning_rate": 9.024104635870368e-05,
"loss": 1.9148,
"step": 609
},
{
"epoch": 0.48687350835322196,
"grad_norm": 1.708706259727478,
"learning_rate": 9.012361071007891e-05,
"loss": 1.9202,
"step": 612
},
{
"epoch": 0.4892601431980907,
"grad_norm": 2.0447752475738525,
"learning_rate": 9.000555007147469e-05,
"loss": 1.9347,
"step": 615
},
{
"epoch": 0.4916467780429594,
"grad_norm": 2.0071353912353516,
"learning_rate": 8.988686628187597e-05,
"loss": 1.9392,
"step": 618
},
{
"epoch": 0.49403341288782815,
"grad_norm": 1.9587384462356567,
"learning_rate": 8.976756118997427e-05,
"loss": 1.9572,
"step": 621
},
{
"epoch": 0.4964200477326969,
"grad_norm": 2.829688549041748,
"learning_rate": 8.964763665413893e-05,
"loss": 1.7861,
"step": 624
},
{
"epoch": 0.4988066825775656,
"grad_norm": 1.878251075744629,
"learning_rate": 8.952709454238808e-05,
"loss": 2.0488,
"step": 627
},
{
"epoch": 0.5011933174224343,
"grad_norm": 1.6959253549575806,
"learning_rate": 8.940593673235962e-05,
"loss": 2.0164,
"step": 630
},
{
"epoch": 0.5035799522673031,
"grad_norm": 1.8719433546066284,
"learning_rate": 8.928416511128195e-05,
"loss": 1.8288,
"step": 633
},
{
"epoch": 0.5059665871121718,
"grad_norm": 1.9740793704986572,
"learning_rate": 8.916178157594453e-05,
"loss": 1.8924,
"step": 636
},
{
"epoch": 0.5083532219570406,
"grad_norm": 2.177084445953369,
"learning_rate": 8.903878803266841e-05,
"loss": 2.0888,
"step": 639
},
{
"epoch": 0.5107398568019093,
"grad_norm": 1.6153382062911987,
"learning_rate": 8.891518639727649e-05,
"loss": 1.7814,
"step": 642
},
{
"epoch": 0.513126491646778,
"grad_norm": 2.0001626014709473,
"learning_rate": 8.879097859506372e-05,
"loss": 1.9432,
"step": 645
},
{
"epoch": 0.5155131264916468,
"grad_norm": 2.0495731830596924,
"learning_rate": 8.866616656076696e-05,
"loss": 1.8339,
"step": 648
},
{
"epoch": 0.5178997613365155,
"grad_norm": 1.7960389852523804,
"learning_rate": 8.854075223853508e-05,
"loss": 1.9401,
"step": 651
},
{
"epoch": 0.5202863961813843,
"grad_norm": 1.9565211534500122,
"learning_rate": 8.841473758189854e-05,
"loss": 1.8112,
"step": 654
},
{
"epoch": 0.522673031026253,
"grad_norm": 2.022819995880127,
"learning_rate": 8.828812455373891e-05,
"loss": 2.0094,
"step": 657
},
{
"epoch": 0.5250596658711217,
"grad_norm": 1.8336989879608154,
"learning_rate": 8.816091512625843e-05,
"loss": 1.9463,
"step": 660
},
{
"epoch": 0.5274463007159904,
"grad_norm": 2.031402111053467,
"learning_rate": 8.803311128094918e-05,
"loss": 1.9657,
"step": 663
},
{
"epoch": 0.5298329355608592,
"grad_norm": 2.0874717235565186,
"learning_rate": 8.790471500856228e-05,
"loss": 2.0375,
"step": 666
},
{
"epoch": 0.5322195704057279,
"grad_norm": 1.7618159055709839,
"learning_rate": 8.777572830907684e-05,
"loss": 2.1104,
"step": 669
},
{
"epoch": 0.5346062052505967,
"grad_norm": 1.6236835718154907,
"learning_rate": 8.764615319166886e-05,
"loss": 2.0333,
"step": 672
},
{
"epoch": 0.5369928400954654,
"grad_norm": 1.613146424293518,
"learning_rate": 8.751599167467985e-05,
"loss": 1.8055,
"step": 675
},
{
"epoch": 0.5393794749403341,
"grad_norm": 1.5570297241210938,
"learning_rate": 8.738524578558547e-05,
"loss": 1.8801,
"step": 678
},
{
"epoch": 0.5417661097852029,
"grad_norm": 1.7564152479171753,
"learning_rate": 8.72539175609639e-05,
"loss": 1.8072,
"step": 681
},
{
"epoch": 0.5441527446300716,
"grad_norm": 1.7274627685546875,
"learning_rate": 8.712200904646416e-05,
"loss": 1.787,
"step": 684
},
{
"epoch": 0.5465393794749404,
"grad_norm": 2.1072866916656494,
"learning_rate": 8.698952229677422e-05,
"loss": 1.8179,
"step": 687
},
{
"epoch": 0.548926014319809,
"grad_norm": 2.1879334449768066,
"learning_rate": 8.685645937558896e-05,
"loss": 2.0755,
"step": 690
},
{
"epoch": 0.5513126491646778,
"grad_norm": 1.7754478454589844,
"learning_rate": 8.67228223555781e-05,
"loss": 1.8438,
"step": 693
},
{
"epoch": 0.5536992840095465,
"grad_norm": 2.0312633514404297,
"learning_rate": 8.658861331835385e-05,
"loss": 1.9058,
"step": 696
},
{
"epoch": 0.5560859188544153,
"grad_norm": 1.9634501934051514,
"learning_rate": 8.645383435443852e-05,
"loss": 1.8278,
"step": 699
},
{
"epoch": 0.5584725536992841,
"grad_norm": 1.915229082107544,
"learning_rate": 8.631848756323197e-05,
"loss": 1.9127,
"step": 702
},
{
"epoch": 0.5608591885441527,
"grad_norm": 1.7499381303787231,
"learning_rate": 8.618257505297886e-05,
"loss": 1.9196,
"step": 705
},
{
"epoch": 0.5632458233890215,
"grad_norm": 2.883967161178589,
"learning_rate": 8.604609894073584e-05,
"loss": 1.7157,
"step": 708
},
{
"epoch": 0.5656324582338902,
"grad_norm": 1.8204519748687744,
"learning_rate": 8.590906135233854e-05,
"loss": 1.845,
"step": 711
},
{
"epoch": 0.568019093078759,
"grad_norm": 3.1434381008148193,
"learning_rate": 8.577146442236857e-05,
"loss": 1.9142,
"step": 714
},
{
"epoch": 0.5704057279236276,
"grad_norm": 1.7107082605361938,
"learning_rate": 8.563331029412012e-05,
"loss": 1.9358,
"step": 717
},
{
"epoch": 0.5727923627684964,
"grad_norm": 2.010882616043091,
"learning_rate": 8.549460111956664e-05,
"loss": 1.8745,
"step": 720
},
{
"epoch": 0.5751789976133651,
"grad_norm": 2.873108148574829,
"learning_rate": 8.535533905932738e-05,
"loss": 1.8725,
"step": 723
},
{
"epoch": 0.5775656324582339,
"grad_norm": 2.0034258365631104,
"learning_rate": 8.521552628263362e-05,
"loss": 1.8666,
"step": 726
},
{
"epoch": 0.5799522673031027,
"grad_norm": 1.7691940069198608,
"learning_rate": 8.507516496729495e-05,
"loss": 1.8096,
"step": 729
},
{
"epoch": 0.5823389021479713,
"grad_norm": 1.77200186252594,
"learning_rate": 8.493425729966534e-05,
"loss": 1.9294,
"step": 732
},
{
"epoch": 0.5847255369928401,
"grad_norm": 1.6291214227676392,
"learning_rate": 8.479280547460907e-05,
"loss": 1.8242,
"step": 735
},
{
"epoch": 0.5871121718377088,
"grad_norm": 1.5914812088012695,
"learning_rate": 8.465081169546659e-05,
"loss": 1.9836,
"step": 738
},
{
"epoch": 0.5894988066825776,
"grad_norm": 1.6103566884994507,
"learning_rate": 8.450827817402011e-05,
"loss": 1.9699,
"step": 741
},
{
"epoch": 0.5918854415274463,
"grad_norm": 2.9773311614990234,
"learning_rate": 8.436520713045922e-05,
"loss": 1.7708,
"step": 744
},
{
"epoch": 0.594272076372315,
"grad_norm": 1.5203982591629028,
"learning_rate": 8.422160079334628e-05,
"loss": 1.8533,
"step": 747
},
{
"epoch": 0.5966587112171837,
"grad_norm": 3.8291232585906982,
"learning_rate": 8.40774613995817e-05,
"loss": 1.8964,
"step": 750
},
{
"epoch": 0.5990453460620525,
"grad_norm": 1.650585412979126,
"learning_rate": 8.393279119436912e-05,
"loss": 1.9163,
"step": 753
},
{
"epoch": 0.6014319809069213,
"grad_norm": 1.588335633277893,
"learning_rate": 8.378759243118044e-05,
"loss": 2.0618,
"step": 756
},
{
"epoch": 0.60381861575179,
"grad_norm": 1.9786241054534912,
"learning_rate": 8.364186737172068e-05,
"loss": 1.8235,
"step": 759
},
{
"epoch": 0.6062052505966588,
"grad_norm": 1.7208276987075806,
"learning_rate": 8.349561828589277e-05,
"loss": 2.0045,
"step": 762
},
{
"epoch": 0.6085918854415274,
"grad_norm": 1.713976502418518,
"learning_rate": 8.33488474517622e-05,
"loss": 1.8602,
"step": 765
},
{
"epoch": 0.6109785202863962,
"grad_norm": 1.655760407447815,
"learning_rate": 8.320155715552155e-05,
"loss": 1.8096,
"step": 768
},
{
"epoch": 0.6133651551312649,
"grad_norm": 1.82340669631958,
"learning_rate": 8.305374969145488e-05,
"loss": 1.9755,
"step": 771
},
{
"epoch": 0.6157517899761337,
"grad_norm": 1.6505107879638672,
"learning_rate": 8.290542736190188e-05,
"loss": 1.7543,
"step": 774
},
{
"epoch": 0.6181384248210023,
"grad_norm": 1.6107587814331055,
"learning_rate": 8.275659247722222e-05,
"loss": 1.7788,
"step": 777
},
{
"epoch": 0.6205250596658711,
"grad_norm": 1.7392557859420776,
"learning_rate": 8.260724735575933e-05,
"loss": 1.8713,
"step": 780
},
{
"epoch": 0.6229116945107399,
"grad_norm": 1.8423359394073486,
"learning_rate": 8.24573943238045e-05,
"loss": 1.9501,
"step": 783
},
{
"epoch": 0.6252983293556086,
"grad_norm": 1.596801996231079,
"learning_rate": 8.230703571556048e-05,
"loss": 1.7561,
"step": 786
},
{
"epoch": 0.6276849642004774,
"grad_norm": 1.6264513731002808,
"learning_rate": 8.215617387310524e-05,
"loss": 1.812,
"step": 789
},
{
"epoch": 0.630071599045346,
"grad_norm": 1.8065801858901978,
"learning_rate": 8.200481114635536e-05,
"loss": 1.8587,
"step": 792
},
{
"epoch": 0.6324582338902148,
"grad_norm": 1.6027936935424805,
"learning_rate": 8.185294989302958e-05,
"loss": 1.7951,
"step": 795
},
{
"epoch": 0.6348448687350835,
"grad_norm": 1.863053560256958,
"learning_rate": 8.170059247861194e-05,
"loss": 1.791,
"step": 798
},
{
"epoch": 0.6372315035799523,
"grad_norm": 1.7930762767791748,
"learning_rate": 8.154774127631501e-05,
"loss": 1.7575,
"step": 801
},
{
"epoch": 0.639618138424821,
"grad_norm": 2.0538759231567383,
"learning_rate": 8.139439866704293e-05,
"loss": 1.8417,
"step": 804
},
{
"epoch": 0.6420047732696897,
"grad_norm": 2.5710806846618652,
"learning_rate": 8.124056703935423e-05,
"loss": 1.8187,
"step": 807
},
{
"epoch": 0.6443914081145584,
"grad_norm": 1.6980230808258057,
"learning_rate": 8.108624878942477e-05,
"loss": 1.8364,
"step": 810
},
{
"epoch": 0.6467780429594272,
"grad_norm": 1.7313123941421509,
"learning_rate": 8.093144632101026e-05,
"loss": 1.7538,
"step": 813
},
{
"epoch": 0.649164677804296,
"grad_norm": 1.6911081075668335,
"learning_rate": 8.077616204540897e-05,
"loss": 1.8258,
"step": 816
},
{
"epoch": 0.6515513126491647,
"grad_norm": 1.9907560348510742,
"learning_rate": 8.062039838142402e-05,
"loss": 1.7978,
"step": 819
},
{
"epoch": 0.6539379474940334,
"grad_norm": 1.9501363039016724,
"learning_rate": 8.046415775532585e-05,
"loss": 1.8116,
"step": 822
},
{
"epoch": 0.6563245823389021,
"grad_norm": 1.7937824726104736,
"learning_rate": 8.030744260081426e-05,
"loss": 1.8347,
"step": 825
},
{
"epoch": 0.6587112171837709,
"grad_norm": 1.682985782623291,
"learning_rate": 8.015025535898073e-05,
"loss": 1.8879,
"step": 828
},
{
"epoch": 0.6610978520286396,
"grad_norm": 1.7274394035339355,
"learning_rate": 7.999259847827015e-05,
"loss": 1.8931,
"step": 831
},
{
"epoch": 0.6634844868735084,
"grad_norm": 1.7429416179656982,
"learning_rate": 7.983447441444281e-05,
"loss": 1.7171,
"step": 834
},
{
"epoch": 0.665871121718377,
"grad_norm": 1.949879765510559,
"learning_rate": 7.967588563053616e-05,
"loss": 1.7779,
"step": 837
},
{
"epoch": 0.6682577565632458,
"grad_norm": 1.5538753271102905,
"learning_rate": 7.951683459682641e-05,
"loss": 1.8087,
"step": 840
},
{
"epoch": 0.6706443914081146,
"grad_norm": 1.7967875003814697,
"learning_rate": 7.935732379079008e-05,
"loss": 2.0304,
"step": 843
},
{
"epoch": 0.6730310262529833,
"grad_norm": 1.7717353105545044,
"learning_rate": 7.919735569706533e-05,
"loss": 1.904,
"step": 846
},
{
"epoch": 0.6754176610978521,
"grad_norm": 1.6083266735076904,
"learning_rate": 7.903693280741331e-05,
"loss": 1.8501,
"step": 849
},
{
"epoch": 0.6778042959427207,
"grad_norm": 1.6468119621276855,
"learning_rate": 7.887605762067945e-05,
"loss": 1.7535,
"step": 852
},
{
"epoch": 0.6801909307875895,
"grad_norm": 1.6218470335006714,
"learning_rate": 7.871473264275429e-05,
"loss": 1.7495,
"step": 855
},
{
"epoch": 0.6825775656324582,
"grad_norm": 1.6359236240386963,
"learning_rate": 7.855296038653475e-05,
"loss": 2.0507,
"step": 858
},
{
"epoch": 0.684964200477327,
"grad_norm": 1.4922749996185303,
"learning_rate": 7.83907433718847e-05,
"loss": 1.7931,
"step": 861
},
{
"epoch": 0.6873508353221957,
"grad_norm": 1.5041239261627197,
"learning_rate": 7.82280841255959e-05,
"loss": 1.7704,
"step": 864
},
{
"epoch": 0.6897374701670644,
"grad_norm": 2.032655954360962,
"learning_rate": 7.80649851813486e-05,
"loss": 1.9174,
"step": 867
},
{
"epoch": 0.6921241050119332,
"grad_norm": 1.7632269859313965,
"learning_rate": 7.790144907967201e-05,
"loss": 1.7885,
"step": 870
},
{
"epoch": 0.6945107398568019,
"grad_norm": 1.7323729991912842,
"learning_rate": 7.773747836790481e-05,
"loss": 1.9919,
"step": 873
},
{
"epoch": 0.6968973747016707,
"grad_norm": 2.3218891620635986,
"learning_rate": 7.757307560015538e-05,
"loss": 1.7896,
"step": 876
},
{
"epoch": 0.6992840095465394,
"grad_norm": 1.620492935180664,
"learning_rate": 7.740824333726213e-05,
"loss": 1.748,
"step": 879
},
{
"epoch": 0.7016706443914081,
"grad_norm": 1.6885743141174316,
"learning_rate": 7.724298414675353e-05,
"loss": 1.7732,
"step": 882
},
{
"epoch": 0.7040572792362768,
"grad_norm": 1.8093699216842651,
"learning_rate": 7.707730060280812e-05,
"loss": 1.9147,
"step": 885
},
{
"epoch": 0.7064439140811456,
"grad_norm": 1.7837680578231812,
"learning_rate": 7.691119528621444e-05,
"loss": 1.8391,
"step": 888
},
{
"epoch": 0.7088305489260143,
"grad_norm": 1.75551176071167,
"learning_rate": 7.674467078433081e-05,
"loss": 1.9519,
"step": 891
},
{
"epoch": 0.711217183770883,
"grad_norm": 1.791812777519226,
"learning_rate": 7.657772969104508e-05,
"loss": 1.6442,
"step": 894
},
{
"epoch": 0.7136038186157518,
"grad_norm": 1.6638917922973633,
"learning_rate": 7.641037460673412e-05,
"loss": 1.6225,
"step": 897
},
{
"epoch": 0.7159904534606205,
"grad_norm": 1.479506015777588,
"learning_rate": 7.624260813822342e-05,
"loss": 1.7162,
"step": 900
},
{
"epoch": 0.7183770883054893,
"grad_norm": 1.9166977405548096,
"learning_rate": 7.607443289874642e-05,
"loss": 1.8657,
"step": 903
},
{
"epoch": 0.720763723150358,
"grad_norm": 1.4827370643615723,
"learning_rate": 7.590585150790389e-05,
"loss": 1.9136,
"step": 906
},
{
"epoch": 0.7231503579952268,
"grad_norm": 1.5117080211639404,
"learning_rate": 7.573686659162293e-05,
"loss": 1.8548,
"step": 909
},
{
"epoch": 0.7255369928400954,
"grad_norm": 1.8482357263565063,
"learning_rate": 7.556748078211635e-05,
"loss": 1.9555,
"step": 912
},
{
"epoch": 0.7279236276849642,
"grad_norm": 1.6880775690078735,
"learning_rate": 7.53976967178414e-05,
"loss": 1.8775,
"step": 915
},
{
"epoch": 0.7303102625298329,
"grad_norm": 1.5047417879104614,
"learning_rate": 7.522751704345887e-05,
"loss": 1.8849,
"step": 918
},
{
"epoch": 0.7326968973747017,
"grad_norm": 1.5307697057724,
"learning_rate": 7.505694440979178e-05,
"loss": 1.8404,
"step": 921
},
{
"epoch": 0.7350835322195705,
"grad_norm": 1.508344054222107,
"learning_rate": 7.488598147378416e-05,
"loss": 1.666,
"step": 924
},
{
"epoch": 0.7374701670644391,
"grad_norm": 1.7632466554641724,
"learning_rate": 7.471463089845956e-05,
"loss": 1.7466,
"step": 927
},
{
"epoch": 0.7398568019093079,
"grad_norm": 1.6543248891830444,
"learning_rate": 7.454289535287968e-05,
"loss": 1.7259,
"step": 930
},
{
"epoch": 0.7422434367541766,
"grad_norm": 2.329713821411133,
"learning_rate": 7.437077751210279e-05,
"loss": 1.9443,
"step": 933
},
{
"epoch": 0.7446300715990454,
"grad_norm": 1.585302472114563,
"learning_rate": 7.419828005714194e-05,
"loss": 1.8221,
"step": 936
},
{
"epoch": 0.747016706443914,
"grad_norm": 1.523518681526184,
"learning_rate": 7.402540567492337e-05,
"loss": 1.7825,
"step": 939
},
{
"epoch": 0.7494033412887828,
"grad_norm": 1.5896198749542236,
"learning_rate": 7.385215705824449e-05,
"loss": 1.9895,
"step": 942
},
{
"epoch": 0.7517899761336515,
"grad_norm": 1.7857471704483032,
"learning_rate": 7.367853690573208e-05,
"loss": 1.6531,
"step": 945
},
{
"epoch": 0.7541766109785203,
"grad_norm": 1.8661036491394043,
"learning_rate": 7.350454792180016e-05,
"loss": 1.7411,
"step": 948
},
{
"epoch": 0.7565632458233891,
"grad_norm": 1.7312443256378174,
"learning_rate": 7.333019281660789e-05,
"loss": 1.9503,
"step": 951
},
{
"epoch": 0.7589498806682577,
"grad_norm": 1.8219163417816162,
"learning_rate": 7.31554743060174e-05,
"loss": 1.7237,
"step": 954
},
{
"epoch": 0.7613365155131265,
"grad_norm": 1.7200877666473389,
"learning_rate": 7.298039511155138e-05,
"loss": 1.8042,
"step": 957
},
{
"epoch": 0.7637231503579952,
"grad_norm": 1.5407986640930176,
"learning_rate": 7.280495796035079e-05,
"loss": 1.8225,
"step": 960
},
{
"epoch": 0.766109785202864,
"grad_norm": 1.5745642185211182,
"learning_rate": 7.262916558513237e-05,
"loss": 1.6478,
"step": 963
},
{
"epoch": 0.7684964200477327,
"grad_norm": 1.8857331275939941,
"learning_rate": 7.245302072414601e-05,
"loss": 1.8026,
"step": 966
},
{
"epoch": 0.7708830548926014,
"grad_norm": 1.4270589351654053,
"learning_rate": 7.227652612113213e-05,
"loss": 1.6531,
"step": 969
},
{
"epoch": 0.7732696897374701,
"grad_norm": 1.530493140220642,
"learning_rate": 7.209968452527896e-05,
"loss": 1.7553,
"step": 972
},
{
"epoch": 0.7756563245823389,
"grad_norm": 1.6771613359451294,
"learning_rate": 7.192249869117971e-05,
"loss": 1.8374,
"step": 975
},
{
"epoch": 0.7780429594272077,
"grad_norm": 1.7160065174102783,
"learning_rate": 7.174497137878966e-05,
"loss": 1.7429,
"step": 978
},
{
"epoch": 0.7804295942720764,
"grad_norm": 1.518904685974121,
"learning_rate": 7.156710535338312e-05,
"loss": 1.8843,
"step": 981
},
{
"epoch": 0.7828162291169452,
"grad_norm": 1.7113001346588135,
"learning_rate": 7.138890338551048e-05,
"loss": 1.8249,
"step": 984
},
{
"epoch": 0.7852028639618138,
"grad_norm": 1.6391565799713135,
"learning_rate": 7.121036825095492e-05,
"loss": 1.6807,
"step": 987
},
{
"epoch": 0.7875894988066826,
"grad_norm": 1.6043463945388794,
"learning_rate": 7.103150273068921e-05,
"loss": 1.7299,
"step": 990
},
{
"epoch": 0.7899761336515513,
"grad_norm": 1.5004619359970093,
"learning_rate": 7.085230961083249e-05,
"loss": 1.8501,
"step": 993
},
{
"epoch": 0.7923627684964201,
"grad_norm": 1.893096685409546,
"learning_rate": 7.067279168260671e-05,
"loss": 1.8326,
"step": 996
},
{
"epoch": 0.7947494033412887,
"grad_norm": 1.5315916538238525,
"learning_rate": 7.04929517422933e-05,
"loss": 1.7833,
"step": 999
},
{
"epoch": 0.7971360381861575,
"grad_norm": 1.5632952451705933,
"learning_rate": 7.031279259118946e-05,
"loss": 1.6346,
"step": 1002
},
{
"epoch": 0.7995226730310262,
"grad_norm": 1.9762367010116577,
"learning_rate": 7.013231703556471e-05,
"loss": 1.8849,
"step": 1005
},
{
"epoch": 0.801909307875895,
"grad_norm": 1.5724525451660156,
"learning_rate": 6.995152788661705e-05,
"loss": 1.7792,
"step": 1008
},
{
"epoch": 0.8042959427207638,
"grad_norm": 1.4417201280593872,
"learning_rate": 6.977042796042917e-05,
"loss": 1.7516,
"step": 1011
},
{
"epoch": 0.8066825775656324,
"grad_norm": 1.6131057739257812,
"learning_rate": 6.958902007792466e-05,
"loss": 1.7614,
"step": 1014
},
{
"epoch": 0.8090692124105012,
"grad_norm": 2.5175702571868896,
"learning_rate": 6.940730706482399e-05,
"loss": 1.8208,
"step": 1017
},
{
"epoch": 0.8114558472553699,
"grad_norm": 1.5602608919143677,
"learning_rate": 6.922529175160054e-05,
"loss": 1.7046,
"step": 1020
},
{
"epoch": 0.8138424821002387,
"grad_norm": 1.5071675777435303,
"learning_rate": 6.904297697343655e-05,
"loss": 1.9148,
"step": 1023
},
{
"epoch": 0.8162291169451074,
"grad_norm": 1.4578834772109985,
"learning_rate": 6.886036557017881e-05,
"loss": 1.8546,
"step": 1026
},
{
"epoch": 0.8186157517899761,
"grad_norm": 1.7864019870758057,
"learning_rate": 6.867746038629462e-05,
"loss": 1.9423,
"step": 1029
},
{
"epoch": 0.8210023866348448,
"grad_norm": 1.4294066429138184,
"learning_rate": 6.849426427082735e-05,
"loss": 1.7797,
"step": 1032
},
{
"epoch": 0.8233890214797136,
"grad_norm": 2.900899648666382,
"learning_rate": 6.83107800773521e-05,
"loss": 1.8395,
"step": 1035
},
{
"epoch": 0.8257756563245824,
"grad_norm": 1.5912294387817383,
"learning_rate": 6.812701066393124e-05,
"loss": 1.7345,
"step": 1038
},
{
"epoch": 0.8281622911694511,
"grad_norm": 1.6191726922988892,
"learning_rate": 6.79429588930699e-05,
"loss": 1.7563,
"step": 1041
},
{
"epoch": 0.8305489260143198,
"grad_norm": 1.926048755645752,
"learning_rate": 6.775862763167142e-05,
"loss": 1.7473,
"step": 1044
},
{
"epoch": 0.8329355608591885,
"grad_norm": 1.5704717636108398,
"learning_rate": 6.757401975099262e-05,
"loss": 1.6788,
"step": 1047
},
{
"epoch": 0.8353221957040573,
"grad_norm": 2.986739158630371,
"learning_rate": 6.738913812659912e-05,
"loss": 1.9091,
"step": 1050
},
{
"epoch": 0.837708830548926,
"grad_norm": 2.8111684322357178,
"learning_rate": 6.720398563832055e-05,
"loss": 1.7738,
"step": 1053
},
{
"epoch": 0.8400954653937948,
"grad_norm": 1.5233489274978638,
"learning_rate": 6.701856517020565e-05,
"loss": 1.869,
"step": 1056
},
{
"epoch": 0.8424821002386634,
"grad_norm": 1.679288387298584,
"learning_rate": 6.683287961047742e-05,
"loss": 1.977,
"step": 1059
},
{
"epoch": 0.8448687350835322,
"grad_norm": 1.5280253887176514,
"learning_rate": 6.664693185148807e-05,
"loss": 1.7278,
"step": 1062
},
{
"epoch": 0.847255369928401,
"grad_norm": 1.5906157493591309,
"learning_rate": 6.646072478967397e-05,
"loss": 1.8965,
"step": 1065
},
{
"epoch": 0.8496420047732697,
"grad_norm": 1.6531226634979248,
"learning_rate": 6.627426132551058e-05,
"loss": 1.77,
"step": 1068
},
{
"epoch": 0.8520286396181385,
"grad_norm": 1.5413316488265991,
"learning_rate": 6.608754436346725e-05,
"loss": 1.7051,
"step": 1071
},
{
"epoch": 0.8544152744630071,
"grad_norm": 1.7978452444076538,
"learning_rate": 6.590057681196191e-05,
"loss": 1.6797,
"step": 1074
},
{
"epoch": 0.8568019093078759,
"grad_norm": 1.6512346267700195,
"learning_rate": 6.571336158331589e-05,
"loss": 1.9775,
"step": 1077
},
{
"epoch": 0.8591885441527446,
"grad_norm": 1.7628110647201538,
"learning_rate": 6.552590159370844e-05,
"loss": 1.6468,
"step": 1080
},
{
"epoch": 0.8615751789976134,
"grad_norm": 1.659655213356018,
"learning_rate": 6.53381997631314e-05,
"loss": 1.8676,
"step": 1083
},
{
"epoch": 0.863961813842482,
"grad_norm": 1.5994445085525513,
"learning_rate": 6.515025901534364e-05,
"loss": 1.7799,
"step": 1086
},
{
"epoch": 0.8663484486873508,
"grad_norm": 4.107300758361816,
"learning_rate": 6.496208227782556e-05,
"loss": 1.7622,
"step": 1089
},
{
"epoch": 0.8687350835322196,
"grad_norm": 1.6633572578430176,
"learning_rate": 6.477367248173352e-05,
"loss": 1.7943,
"step": 1092
},
{
"epoch": 0.8711217183770883,
"grad_norm": 1.5715115070343018,
"learning_rate": 6.458503256185404e-05,
"loss": 1.791,
"step": 1095
},
{
"epoch": 0.8735083532219571,
"grad_norm": 1.4537943601608276,
"learning_rate": 6.439616545655834e-05,
"loss": 1.6835,
"step": 1098
},
{
"epoch": 0.8758949880668258,
"grad_norm": 1.4551641941070557,
"learning_rate": 6.420707410775626e-05,
"loss": 1.8273,
"step": 1101
},
{
"epoch": 0.8782816229116945,
"grad_norm": 1.7414036989212036,
"learning_rate": 6.401776146085072e-05,
"loss": 2.0934,
"step": 1104
},
{
"epoch": 0.8806682577565632,
"grad_norm": 1.602457046508789,
"learning_rate": 6.382823046469167e-05,
"loss": 1.8388,
"step": 1107
},
{
"epoch": 0.883054892601432,
"grad_norm": 1.549912929534912,
"learning_rate": 6.363848407153016e-05,
"loss": 1.6429,
"step": 1110
},
{
"epoch": 0.8854415274463007,
"grad_norm": 1.4284569025039673,
"learning_rate": 6.344852523697247e-05,
"loss": 1.8545,
"step": 1113
},
{
"epoch": 0.8878281622911695,
"grad_norm": 1.83491849899292,
"learning_rate": 6.325835691993394e-05,
"loss": 1.574,
"step": 1116
},
{
"epoch": 0.8902147971360382,
"grad_norm": 1.6155121326446533,
"learning_rate": 6.306798208259297e-05,
"loss": 1.7415,
"step": 1119
},
{
"epoch": 0.8926014319809069,
"grad_norm": 1.5498522520065308,
"learning_rate": 6.287740369034485e-05,
"loss": 1.5622,
"step": 1122
},
{
"epoch": 0.8949880668257757,
"grad_norm": 1.6388617753982544,
"learning_rate": 6.26866247117555e-05,
"loss": 1.6144,
"step": 1125
},
{
"epoch": 0.8973747016706444,
"grad_norm": 1.4823899269104004,
"learning_rate": 6.249564811851543e-05,
"loss": 1.8225,
"step": 1128
},
{
"epoch": 0.8997613365155132,
"grad_norm": 1.4698792695999146,
"learning_rate": 6.230447688539316e-05,
"loss": 1.6339,
"step": 1131
},
{
"epoch": 0.9021479713603818,
"grad_norm": 1.6909016370773315,
"learning_rate": 6.211311399018916e-05,
"loss": 1.7918,
"step": 1134
},
{
"epoch": 0.9045346062052506,
"grad_norm": 1.4534494876861572,
"learning_rate": 6.192156241368929e-05,
"loss": 1.7715,
"step": 1137
},
{
"epoch": 0.9069212410501193,
"grad_norm": 1.719106912612915,
"learning_rate": 6.172982513961845e-05,
"loss": 1.7261,
"step": 1140
},
{
"epoch": 0.9093078758949881,
"grad_norm": 1.6923831701278687,
"learning_rate": 6.153790515459404e-05,
"loss": 1.6554,
"step": 1143
},
{
"epoch": 0.9116945107398569,
"grad_norm": 1.6443957090377808,
"learning_rate": 6.13458054480795e-05,
"loss": 1.8556,
"step": 1146
},
{
"epoch": 0.9140811455847255,
"grad_norm": 1.5689623355865479,
"learning_rate": 6.115352901233779e-05,
"loss": 1.8041,
"step": 1149
},
{
"epoch": 0.9164677804295943,
"grad_norm": 1.46951425075531,
"learning_rate": 6.096107884238458e-05,
"loss": 1.6515,
"step": 1152
},
{
"epoch": 0.918854415274463,
"grad_norm": 1.5866256952285767,
"learning_rate": 6.0768457935941817e-05,
"loss": 1.8007,
"step": 1155
},
{
"epoch": 0.9212410501193318,
"grad_norm": 1.9523183107376099,
"learning_rate": 6.0575669293390954e-05,
"loss": 1.8313,
"step": 1158
},
{
"epoch": 0.9236276849642004,
"grad_norm": 1.6216835975646973,
"learning_rate": 6.038271591772615e-05,
"loss": 1.8399,
"step": 1161
},
{
"epoch": 0.9260143198090692,
"grad_norm": 1.8278216123580933,
"learning_rate": 6.0189600814507604e-05,
"loss": 1.9067,
"step": 1164
},
{
"epoch": 0.9284009546539379,
"grad_norm": 1.2732576131820679,
"learning_rate": 5.9996326991814654e-05,
"loss": 1.6042,
"step": 1167
},
{
"epoch": 0.9307875894988067,
"grad_norm": 1.75897216796875,
"learning_rate": 5.980289746019892e-05,
"loss": 1.8796,
"step": 1170
},
{
"epoch": 0.9331742243436754,
"grad_norm": 1.5754450559616089,
"learning_rate": 5.9609315232637483e-05,
"loss": 1.5981,
"step": 1173
},
{
"epoch": 0.9355608591885441,
"grad_norm": 1.7975653409957886,
"learning_rate": 5.941558332448589e-05,
"loss": 1.638,
"step": 1176
},
{
"epoch": 0.9379474940334129,
"grad_norm": 1.6017708778381348,
"learning_rate": 5.922170475343125e-05,
"loss": 1.6904,
"step": 1179
},
{
"epoch": 0.9403341288782816,
"grad_norm": 1.6121450662612915,
"learning_rate": 5.9027682539445104e-05,
"loss": 1.69,
"step": 1182
},
{
"epoch": 0.9427207637231504,
"grad_norm": 1.5319181680679321,
"learning_rate": 5.883351970473654e-05,
"loss": 1.8468,
"step": 1185
},
{
"epoch": 0.9451073985680191,
"grad_norm": 1.7490177154541016,
"learning_rate": 5.863921927370498e-05,
"loss": 1.7477,
"step": 1188
},
{
"epoch": 0.9474940334128878,
"grad_norm": 1.546705722808838,
"learning_rate": 5.8444784272893175e-05,
"loss": 1.6837,
"step": 1191
},
{
"epoch": 0.9498806682577565,
"grad_norm": 1.576828956604004,
"learning_rate": 5.8250217730939973e-05,
"loss": 1.6976,
"step": 1194
},
{
"epoch": 0.9522673031026253,
"grad_norm": 1.443121075630188,
"learning_rate": 5.8055522678533225e-05,
"loss": 1.6661,
"step": 1197
},
{
"epoch": 0.954653937947494,
"grad_norm": 1.619332194328308,
"learning_rate": 5.786070214836254e-05,
"loss": 1.6282,
"step": 1200
},
{
"epoch": 0.9570405727923628,
"grad_norm": 1.516404628753662,
"learning_rate": 5.7665759175072034e-05,
"loss": 1.8743,
"step": 1203
},
{
"epoch": 0.9594272076372315,
"grad_norm": 1.842561960220337,
"learning_rate": 5.747069679521305e-05,
"loss": 1.7742,
"step": 1206
},
{
"epoch": 0.9618138424821002,
"grad_norm": 1.5098801851272583,
"learning_rate": 5.727551804719693e-05,
"loss": 1.7087,
"step": 1209
},
{
"epoch": 0.964200477326969,
"grad_norm": 1.611047387123108,
"learning_rate": 5.708022597124758e-05,
"loss": 1.6934,
"step": 1212
},
{
"epoch": 0.9665871121718377,
"grad_norm": 1.5745174884796143,
"learning_rate": 5.688482360935423e-05,
"loss": 1.8729,
"step": 1215
},
{
"epoch": 0.9689737470167065,
"grad_norm": 1.4478429555892944,
"learning_rate": 5.668931400522396e-05,
"loss": 1.801,
"step": 1218
},
{
"epoch": 0.9713603818615751,
"grad_norm": 2.9802427291870117,
"learning_rate": 5.649370020423431e-05,
"loss": 1.6937,
"step": 1221
},
{
"epoch": 0.9737470167064439,
"grad_norm": 1.6159980297088623,
"learning_rate": 5.629798525338589e-05,
"loss": 1.712,
"step": 1224
},
{
"epoch": 0.9761336515513126,
"grad_norm": 1.6687465906143188,
"learning_rate": 5.6102172201254835e-05,
"loss": 1.7582,
"step": 1227
},
{
"epoch": 0.9785202863961814,
"grad_norm": 1.318992018699646,
"learning_rate": 5.5906264097945407e-05,
"loss": 1.7913,
"step": 1230
},
{
"epoch": 0.9809069212410502,
"grad_norm": 1.6671638488769531,
"learning_rate": 5.5710263995042434e-05,
"loss": 1.8547,
"step": 1233
},
{
"epoch": 0.9832935560859188,
"grad_norm": 1.460523247718811,
"learning_rate": 5.551417494556376e-05,
"loss": 1.7699,
"step": 1236
},
{
"epoch": 0.9856801909307876,
"grad_norm": 1.4564532041549683,
"learning_rate": 5.531800000391275e-05,
"loss": 1.7636,
"step": 1239
},
{
"epoch": 0.9880668257756563,
"grad_norm": 1.7387609481811523,
"learning_rate": 5.5121742225830665e-05,
"loss": 1.8602,
"step": 1242
},
{
"epoch": 0.9904534606205251,
"grad_norm": 1.8153102397918701,
"learning_rate": 5.4925404668349076e-05,
"loss": 1.797,
"step": 1245
},
{
"epoch": 0.9928400954653938,
"grad_norm": 1.4723916053771973,
"learning_rate": 5.472899038974225e-05,
"loss": 1.7051,
"step": 1248
},
{
"epoch": 0.9952267303102625,
"grad_norm": 2.961282968521118,
"learning_rate": 5.45325024494795e-05,
"loss": 1.7661,
"step": 1251
},
{
"epoch": 0.9976133651551312,
"grad_norm": 1.6182713508605957,
"learning_rate": 5.433594390817756e-05,
"loss": 1.875,
"step": 1254
},
{
"epoch": 1.0,
"grad_norm": 2.34993839263916,
"learning_rate": 5.413931782755283e-05,
"loss": 1.8514,
"step": 1257
},
{
"epoch": 1.0023866348448687,
"grad_norm": 1.439794659614563,
"learning_rate": 5.3942627270373826e-05,
"loss": 1.5045,
"step": 1260
},
{
"epoch": 1.0047732696897376,
"grad_norm": 1.7519071102142334,
"learning_rate": 5.374587530041335e-05,
"loss": 1.5372,
"step": 1263
},
{
"epoch": 1.0071599045346062,
"grad_norm": 1.4893503189086914,
"learning_rate": 5.35490649824008e-05,
"loss": 1.5061,
"step": 1266
},
{
"epoch": 1.009546539379475,
"grad_norm": 1.4957739114761353,
"learning_rate": 5.335219938197445e-05,
"loss": 1.4709,
"step": 1269
},
{
"epoch": 1.0119331742243436,
"grad_norm": 1.5801351070404053,
"learning_rate": 5.315528156563367e-05,
"loss": 1.522,
"step": 1272
},
{
"epoch": 1.0143198090692125,
"grad_norm": 1.5916804075241089,
"learning_rate": 5.295831460069124e-05,
"loss": 1.429,
"step": 1275
},
{
"epoch": 1.0167064439140812,
"grad_norm": 1.598103404045105,
"learning_rate": 5.276130155522541e-05,
"loss": 1.5911,
"step": 1278
},
{
"epoch": 1.0190930787589498,
"grad_norm": 1.575002670288086,
"learning_rate": 5.256424549803228e-05,
"loss": 1.5334,
"step": 1281
},
{
"epoch": 1.0214797136038185,
"grad_norm": 1.5843360424041748,
"learning_rate": 5.236714949857791e-05,
"loss": 1.4366,
"step": 1284
},
{
"epoch": 1.0238663484486874,
"grad_norm": 1.5547505617141724,
"learning_rate": 5.2170016626950505e-05,
"loss": 1.5106,
"step": 1287
},
{
"epoch": 1.026252983293556,
"grad_norm": 1.4847965240478516,
"learning_rate": 5.1972849953812644e-05,
"loss": 1.4397,
"step": 1290
},
{
"epoch": 1.0286396181384247,
"grad_norm": 2.0349180698394775,
"learning_rate": 5.1775652550353405e-05,
"loss": 1.5765,
"step": 1293
},
{
"epoch": 1.0310262529832936,
"grad_norm": 1.6574598550796509,
"learning_rate": 5.157842748824053e-05,
"loss": 1.433,
"step": 1296
},
{
"epoch": 1.0334128878281623,
"grad_norm": 1.5641635656356812,
"learning_rate": 5.138117783957261e-05,
"loss": 1.5666,
"step": 1299
},
{
"epoch": 1.035799522673031,
"grad_norm": 1.6239501237869263,
"learning_rate": 5.1183906676831197e-05,
"loss": 1.6223,
"step": 1302
},
{
"epoch": 1.0381861575178997,
"grad_norm": 1.5002102851867676,
"learning_rate": 5.098661707283298e-05,
"loss": 1.4733,
"step": 1305
},
{
"epoch": 1.0405727923627686,
"grad_norm": 1.5246349573135376,
"learning_rate": 5.078931210068185e-05,
"loss": 1.4459,
"step": 1308
},
{
"epoch": 1.0429594272076372,
"grad_norm": 1.4616323709487915,
"learning_rate": 5.059199483372114e-05,
"loss": 1.4595,
"step": 1311
},
{
"epoch": 1.045346062052506,
"grad_norm": 1.4437859058380127,
"learning_rate": 5.039466834548568e-05,
"loss": 1.594,
"step": 1314
},
{
"epoch": 1.0477326968973748,
"grad_norm": 1.609554409980774,
"learning_rate": 5.0197335709653883e-05,
"loss": 1.3251,
"step": 1317
},
{
"epoch": 1.0501193317422435,
"grad_norm": 1.502581238746643,
"learning_rate": 5e-05,
"loss": 1.5505,
"step": 1320
},
{
"epoch": 1.0525059665871122,
"grad_norm": 1.5816670656204224,
"learning_rate": 4.980266429034613e-05,
"loss": 1.4974,
"step": 1323
},
{
"epoch": 1.0548926014319808,
"grad_norm": 1.714355230331421,
"learning_rate": 4.960533165451435e-05,
"loss": 1.387,
"step": 1326
},
{
"epoch": 1.0572792362768497,
"grad_norm": 1.506137490272522,
"learning_rate": 4.9408005166278855e-05,
"loss": 1.4963,
"step": 1329
},
{
"epoch": 1.0596658711217184,
"grad_norm": 1.5406689643859863,
"learning_rate": 4.921068789931816e-05,
"loss": 1.4865,
"step": 1332
},
{
"epoch": 1.062052505966587,
"grad_norm": 1.6917808055877686,
"learning_rate": 4.901338292716704e-05,
"loss": 1.3597,
"step": 1335
},
{
"epoch": 1.064439140811456,
"grad_norm": 1.4597982168197632,
"learning_rate": 4.8816093323168815e-05,
"loss": 1.4529,
"step": 1338
},
{
"epoch": 1.0668257756563246,
"grad_norm": 1.3898138999938965,
"learning_rate": 4.8618822160427406e-05,
"loss": 1.4942,
"step": 1341
},
{
"epoch": 1.0692124105011933,
"grad_norm": 1.3161375522613525,
"learning_rate": 4.842157251175947e-05,
"loss": 1.4459,
"step": 1344
},
{
"epoch": 1.071599045346062,
"grad_norm": 1.6536246538162231,
"learning_rate": 4.822434744964661e-05,
"loss": 1.4766,
"step": 1347
},
{
"epoch": 1.0739856801909309,
"grad_norm": 1.7592108249664307,
"learning_rate": 4.802715004618737e-05,
"loss": 1.5014,
"step": 1350
},
{
"epoch": 1.0763723150357996,
"grad_norm": 1.4345988035202026,
"learning_rate": 4.7829983373049507e-05,
"loss": 1.4605,
"step": 1353
},
{
"epoch": 1.0787589498806682,
"grad_norm": 1.2879067659378052,
"learning_rate": 4.763285050142211e-05,
"loss": 1.3331,
"step": 1356
},
{
"epoch": 1.081145584725537,
"grad_norm": 1.392535924911499,
"learning_rate": 4.743575450196773e-05,
"loss": 1.4701,
"step": 1359
},
{
"epoch": 1.0835322195704058,
"grad_norm": 1.4662644863128662,
"learning_rate": 4.7238698444774595e-05,
"loss": 1.4321,
"step": 1362
},
{
"epoch": 1.0859188544152745,
"grad_norm": 1.4287410974502563,
"learning_rate": 4.704168539930878e-05,
"loss": 1.3805,
"step": 1365
},
{
"epoch": 1.0883054892601431,
"grad_norm": 1.4799420833587646,
"learning_rate": 4.6844718434366334e-05,
"loss": 1.6115,
"step": 1368
},
{
"epoch": 1.0906921241050118,
"grad_norm": 1.6349695920944214,
"learning_rate": 4.664780061802557e-05,
"loss": 1.4725,
"step": 1371
},
{
"epoch": 1.0930787589498807,
"grad_norm": 1.5499435663223267,
"learning_rate": 4.64509350175992e-05,
"loss": 1.4677,
"step": 1374
},
{
"epoch": 1.0954653937947494,
"grad_norm": 1.5121248960494995,
"learning_rate": 4.6254124699586656e-05,
"loss": 1.5272,
"step": 1377
},
{
"epoch": 1.097852028639618,
"grad_norm": 1.491830825805664,
"learning_rate": 4.605737272962618e-05,
"loss": 1.4693,
"step": 1380
},
{
"epoch": 1.100238663484487,
"grad_norm": 1.4596081972122192,
"learning_rate": 4.5860682172447184e-05,
"loss": 1.491,
"step": 1383
},
{
"epoch": 1.1026252983293556,
"grad_norm": 1.7243481874465942,
"learning_rate": 4.566405609182247e-05,
"loss": 1.5289,
"step": 1386
},
{
"epoch": 1.1050119331742243,
"grad_norm": 1.5089308023452759,
"learning_rate": 4.546749755052051e-05,
"loss": 1.5428,
"step": 1389
},
{
"epoch": 1.107398568019093,
"grad_norm": 1.5854674577713013,
"learning_rate": 4.527100961025776e-05,
"loss": 1.5129,
"step": 1392
},
{
"epoch": 1.1097852028639619,
"grad_norm": 1.55560302734375,
"learning_rate": 4.507459533165093e-05,
"loss": 1.482,
"step": 1395
},
{
"epoch": 1.1121718377088305,
"grad_norm": 1.4460105895996094,
"learning_rate": 4.4878257774169346e-05,
"loss": 1.4073,
"step": 1398
},
{
"epoch": 1.1145584725536992,
"grad_norm": 1.5131725072860718,
"learning_rate": 4.4681999996087274e-05,
"loss": 1.4992,
"step": 1401
},
{
"epoch": 1.1169451073985681,
"grad_norm": 1.5037047863006592,
"learning_rate": 4.448582505443625e-05,
"loss": 1.5421,
"step": 1404
},
{
"epoch": 1.1193317422434368,
"grad_norm": 1.4700491428375244,
"learning_rate": 4.4289736004957585e-05,
"loss": 1.4587,
"step": 1407
},
{
"epoch": 1.1217183770883055,
"grad_norm": 1.3639819622039795,
"learning_rate": 4.4093735902054605e-05,
"loss": 1.4711,
"step": 1410
},
{
"epoch": 1.1241050119331741,
"grad_norm": 1.565079689025879,
"learning_rate": 4.3897827798745183e-05,
"loss": 1.4546,
"step": 1413
},
{
"epoch": 1.126491646778043,
"grad_norm": 1.5039900541305542,
"learning_rate": 4.3702014746614136e-05,
"loss": 1.4998,
"step": 1416
},
{
"epoch": 1.1288782816229117,
"grad_norm": 1.6283437013626099,
"learning_rate": 4.350629979576569e-05,
"loss": 1.4458,
"step": 1419
},
{
"epoch": 1.1312649164677804,
"grad_norm": 1.4039489030838013,
"learning_rate": 4.331068599477605e-05,
"loss": 1.3474,
"step": 1422
},
{
"epoch": 1.1336515513126493,
"grad_norm": 1.4971529245376587,
"learning_rate": 4.311517639064578e-05,
"loss": 1.3097,
"step": 1425
},
{
"epoch": 1.136038186157518,
"grad_norm": 1.9998141527175903,
"learning_rate": 4.2919774028752436e-05,
"loss": 1.4228,
"step": 1428
},
{
"epoch": 1.1384248210023866,
"grad_norm": 1.3852521181106567,
"learning_rate": 4.27244819528031e-05,
"loss": 1.4883,
"step": 1431
},
{
"epoch": 1.1408114558472553,
"grad_norm": 1.5615485906600952,
"learning_rate": 4.2529303204786953e-05,
"loss": 1.5153,
"step": 1434
},
{
"epoch": 1.1431980906921242,
"grad_norm": 1.5283887386322021,
"learning_rate": 4.233424082492797e-05,
"loss": 1.527,
"step": 1437
},
{
"epoch": 1.1455847255369929,
"grad_norm": 1.385176420211792,
"learning_rate": 4.213929785163747e-05,
"loss": 1.4805,
"step": 1440
},
{
"epoch": 1.1479713603818615,
"grad_norm": 1.3835763931274414,
"learning_rate": 4.1944477321466786e-05,
"loss": 1.4868,
"step": 1443
},
{
"epoch": 1.1503579952267304,
"grad_norm": 1.4247292280197144,
"learning_rate": 4.1749782269060045e-05,
"loss": 1.498,
"step": 1446
},
{
"epoch": 1.152744630071599,
"grad_norm": 1.5267618894577026,
"learning_rate": 4.1555215727106844e-05,
"loss": 1.4659,
"step": 1449
},
{
"epoch": 1.1551312649164678,
"grad_norm": 1.5051010847091675,
"learning_rate": 4.136078072629503e-05,
"loss": 1.4474,
"step": 1452
},
{
"epoch": 1.1575178997613365,
"grad_norm": 2.2243285179138184,
"learning_rate": 4.116648029526347e-05,
"loss": 1.453,
"step": 1455
},
{
"epoch": 1.1599045346062051,
"grad_norm": 1.4427586793899536,
"learning_rate": 4.097231746055491e-05,
"loss": 1.532,
"step": 1458
},
{
"epoch": 1.162291169451074,
"grad_norm": 1.9031730890274048,
"learning_rate": 4.077829524656877e-05,
"loss": 1.3974,
"step": 1461
},
{
"epoch": 1.1646778042959427,
"grad_norm": 1.4072078466415405,
"learning_rate": 4.05844166755141e-05,
"loss": 1.4742,
"step": 1464
},
{
"epoch": 1.1670644391408114,
"grad_norm": 1.4678648710250854,
"learning_rate": 4.039068476736253e-05,
"loss": 1.4408,
"step": 1467
},
{
"epoch": 1.1694510739856803,
"grad_norm": 7.8452582359313965,
"learning_rate": 4.01971025398011e-05,
"loss": 1.3489,
"step": 1470
},
{
"epoch": 1.171837708830549,
"grad_norm": 1.6050971746444702,
"learning_rate": 4.000367300818537e-05,
"loss": 1.6608,
"step": 1473
},
{
"epoch": 1.1742243436754176,
"grad_norm": 1.71634840965271,
"learning_rate": 3.98103991854924e-05,
"loss": 1.4679,
"step": 1476
},
{
"epoch": 1.1766109785202863,
"grad_norm": 1.4481014013290405,
"learning_rate": 3.961728408227384e-05,
"loss": 1.5657,
"step": 1479
},
{
"epoch": 1.1789976133651552,
"grad_norm": 1.6103709936141968,
"learning_rate": 3.942433070660905e-05,
"loss": 1.4409,
"step": 1482
},
{
"epoch": 1.1813842482100239,
"grad_norm": 1.411056399345398,
"learning_rate": 3.923154206405819e-05,
"loss": 1.4865,
"step": 1485
},
{
"epoch": 1.1837708830548925,
"grad_norm": 1.497053623199463,
"learning_rate": 3.9038921157615444e-05,
"loss": 1.4072,
"step": 1488
},
{
"epoch": 1.1861575178997614,
"grad_norm": 1.6520947217941284,
"learning_rate": 3.884647098766224e-05,
"loss": 1.4393,
"step": 1491
},
{
"epoch": 1.18854415274463,
"grad_norm": 1.442406177520752,
"learning_rate": 3.8654194551920485e-05,
"loss": 1.4458,
"step": 1494
},
{
"epoch": 1.1909307875894988,
"grad_norm": 1.3878071308135986,
"learning_rate": 3.846209484540597e-05,
"loss": 1.4374,
"step": 1497
},
{
"epoch": 1.1933174224343674,
"grad_norm": 1.401583194732666,
"learning_rate": 3.827017486038157e-05,
"loss": 1.3965,
"step": 1500
},
{
"epoch": 1.1957040572792363,
"grad_norm": 1.4490373134613037,
"learning_rate": 3.8078437586310716e-05,
"loss": 1.6232,
"step": 1503
},
{
"epoch": 1.198090692124105,
"grad_norm": 1.2965726852416992,
"learning_rate": 3.788688600981085e-05,
"loss": 1.4681,
"step": 1506
},
{
"epoch": 1.2004773269689737,
"grad_norm": 1.355893850326538,
"learning_rate": 3.769552311460684e-05,
"loss": 1.488,
"step": 1509
},
{
"epoch": 1.2028639618138426,
"grad_norm": 1.4382792711257935,
"learning_rate": 3.750435188148459e-05,
"loss": 1.5472,
"step": 1512
},
{
"epoch": 1.2052505966587113,
"grad_norm": 1.3497835397720337,
"learning_rate": 3.73133752882445e-05,
"loss": 1.3225,
"step": 1515
},
{
"epoch": 1.20763723150358,
"grad_norm": 1.448410153388977,
"learning_rate": 3.712259630965518e-05,
"loss": 1.4645,
"step": 1518
},
{
"epoch": 1.2100238663484486,
"grad_norm": 1.470038890838623,
"learning_rate": 3.6932017917407045e-05,
"loss": 1.4681,
"step": 1521
},
{
"epoch": 1.2124105011933175,
"grad_norm": 1.2767425775527954,
"learning_rate": 3.6741643080066065e-05,
"loss": 1.4428,
"step": 1524
},
{
"epoch": 1.2147971360381862,
"grad_norm": 1.5355818271636963,
"learning_rate": 3.655147476302754e-05,
"loss": 1.528,
"step": 1527
},
{
"epoch": 1.2171837708830548,
"grad_norm": 1.5351886749267578,
"learning_rate": 3.636151592846985e-05,
"loss": 1.3914,
"step": 1530
},
{
"epoch": 1.2195704057279237,
"grad_norm": 1.580076813697815,
"learning_rate": 3.617176953530835e-05,
"loss": 1.3934,
"step": 1533
},
{
"epoch": 1.2219570405727924,
"grad_norm": 1.2656506299972534,
"learning_rate": 3.5982238539149285e-05,
"loss": 1.305,
"step": 1536
},
{
"epoch": 1.224343675417661,
"grad_norm": 2.035010814666748,
"learning_rate": 3.579292589224375e-05,
"loss": 1.4774,
"step": 1539
},
{
"epoch": 1.2267303102625298,
"grad_norm": 1.4457292556762695,
"learning_rate": 3.560383454344168e-05,
"loss": 1.5794,
"step": 1542
},
{
"epoch": 1.2291169451073987,
"grad_norm": 1.2905712127685547,
"learning_rate": 3.541496743814596e-05,
"loss": 1.4821,
"step": 1545
},
{
"epoch": 1.2315035799522673,
"grad_norm": 1.505958914756775,
"learning_rate": 3.522632751826651e-05,
"loss": 1.3573,
"step": 1548
},
{
"epoch": 1.233890214797136,
"grad_norm": 1.4743762016296387,
"learning_rate": 3.503791772217445e-05,
"loss": 1.5104,
"step": 1551
},
{
"epoch": 1.2362768496420047,
"grad_norm": 1.4788120985031128,
"learning_rate": 3.484974098465636e-05,
"loss": 1.3989,
"step": 1554
},
{
"epoch": 1.2386634844868736,
"grad_norm": 1.3256484270095825,
"learning_rate": 3.4661800236868604e-05,
"loss": 1.4617,
"step": 1557
},
{
"epoch": 1.2410501193317423,
"grad_norm": 1.5638628005981445,
"learning_rate": 3.447409840629156e-05,
"loss": 1.402,
"step": 1560
},
{
"epoch": 1.243436754176611,
"grad_norm": 1.7883927822113037,
"learning_rate": 3.428663841668412e-05,
"loss": 1.5829,
"step": 1563
},
{
"epoch": 1.2458233890214796,
"grad_norm": 1.4335147142410278,
"learning_rate": 3.409942318803809e-05,
"loss": 1.4597,
"step": 1566
},
{
"epoch": 1.2482100238663485,
"grad_norm": 1.4358636140823364,
"learning_rate": 3.391245563653276e-05,
"loss": 1.5638,
"step": 1569
},
{
"epoch": 1.2505966587112172,
"grad_norm": 1.3342747688293457,
"learning_rate": 3.3725738674489414e-05,
"loss": 1.447,
"step": 1572
},
{
"epoch": 1.2529832935560858,
"grad_norm": 1.3842703104019165,
"learning_rate": 3.3539275210326044e-05,
"loss": 1.4634,
"step": 1575
},
{
"epoch": 1.2553699284009547,
"grad_norm": 1.5342031717300415,
"learning_rate": 3.335306814851196e-05,
"loss": 1.458,
"step": 1578
},
{
"epoch": 1.2577565632458234,
"grad_norm": 1.4148904085159302,
"learning_rate": 3.31671203895226e-05,
"loss": 1.3945,
"step": 1581
},
{
"epoch": 1.260143198090692,
"grad_norm": 1.3674441576004028,
"learning_rate": 3.298143482979436e-05,
"loss": 1.3962,
"step": 1584
},
{
"epoch": 1.2625298329355608,
"grad_norm": 1.42054283618927,
"learning_rate": 3.2796014361679464e-05,
"loss": 1.5179,
"step": 1587
},
{
"epoch": 1.2649164677804297,
"grad_norm": 1.5487267971038818,
"learning_rate": 3.261086187340088e-05,
"loss": 1.4281,
"step": 1590
},
{
"epoch": 1.2673031026252983,
"grad_norm": 1.5014619827270508,
"learning_rate": 3.242598024900738e-05,
"loss": 1.4789,
"step": 1593
},
{
"epoch": 1.269689737470167,
"grad_norm": 1.533458948135376,
"learning_rate": 3.224137236832859e-05,
"loss": 1.3491,
"step": 1596
},
{
"epoch": 1.272076372315036,
"grad_norm": 1.3559014797210693,
"learning_rate": 3.2057041106930104e-05,
"loss": 1.3915,
"step": 1599
},
{
"epoch": 1.2744630071599046,
"grad_norm": 1.3522697687149048,
"learning_rate": 3.187298933606878e-05,
"loss": 1.496,
"step": 1602
},
{
"epoch": 1.2768496420047732,
"grad_norm": 1.3634637594223022,
"learning_rate": 3.1689219922647924e-05,
"loss": 1.3662,
"step": 1605
},
{
"epoch": 1.279236276849642,
"grad_norm": 1.4531528949737549,
"learning_rate": 3.150573572917267e-05,
"loss": 1.5501,
"step": 1608
},
{
"epoch": 1.2816229116945108,
"grad_norm": 1.39664626121521,
"learning_rate": 3.13225396137054e-05,
"loss": 1.42,
"step": 1611
},
{
"epoch": 1.2840095465393795,
"grad_norm": 1.4954712390899658,
"learning_rate": 3.11396344298212e-05,
"loss": 1.5471,
"step": 1614
},
{
"epoch": 1.2863961813842482,
"grad_norm": 1.3956115245819092,
"learning_rate": 3.095702302656347e-05,
"loss": 1.4936,
"step": 1617
},
{
"epoch": 1.288782816229117,
"grad_norm": 1.4721133708953857,
"learning_rate": 3.077470824839947e-05,
"loss": 1.4429,
"step": 1620
},
{
"epoch": 1.2911694510739857,
"grad_norm": 6.485837936401367,
"learning_rate": 3.059269293517603e-05,
"loss": 1.4545,
"step": 1623
},
{
"epoch": 1.2935560859188544,
"grad_norm": 1.7930669784545898,
"learning_rate": 3.0410979922075343e-05,
"loss": 1.3401,
"step": 1626
},
{
"epoch": 1.295942720763723,
"grad_norm": 1.4317888021469116,
"learning_rate": 3.022957203957083e-05,
"loss": 1.5389,
"step": 1629
},
{
"epoch": 1.2983293556085918,
"grad_norm": 1.3846065998077393,
"learning_rate": 3.004847211338295e-05,
"loss": 1.355,
"step": 1632
},
{
"epoch": 1.3007159904534606,
"grad_norm": 1.3602261543273926,
"learning_rate": 2.9867682964435294e-05,
"loss": 1.4359,
"step": 1635
},
{
"epoch": 1.3031026252983293,
"grad_norm": 1.3559253215789795,
"learning_rate": 2.9687207408810557e-05,
"loss": 1.493,
"step": 1638
},
{
"epoch": 1.3054892601431982,
"grad_norm": 1.6385109424591064,
"learning_rate": 2.9507048257706727e-05,
"loss": 1.5694,
"step": 1641
},
{
"epoch": 1.307875894988067,
"grad_norm": 1.4390101432800293,
"learning_rate": 2.9327208317393303e-05,
"loss": 1.5722,
"step": 1644
},
{
"epoch": 1.3102625298329356,
"grad_norm": 1.4474018812179565,
"learning_rate": 2.9147690389167514e-05,
"loss": 1.4355,
"step": 1647
},
{
"epoch": 1.3126491646778042,
"grad_norm": 1.3009108304977417,
"learning_rate": 2.8968497269310803e-05,
"loss": 1.3249,
"step": 1650
},
{
"epoch": 1.315035799522673,
"grad_norm": 1.4271756410598755,
"learning_rate": 2.8789631749045097e-05,
"loss": 1.3821,
"step": 1653
},
{
"epoch": 1.3174224343675418,
"grad_norm": 1.4093137979507446,
"learning_rate": 2.8611096614489518e-05,
"loss": 1.3932,
"step": 1656
},
{
"epoch": 1.3198090692124105,
"grad_norm": 1.5373637676239014,
"learning_rate": 2.8432894646616885e-05,
"loss": 1.3887,
"step": 1659
},
{
"epoch": 1.3221957040572792,
"grad_norm": 1.3347370624542236,
"learning_rate": 2.8255028621210355e-05,
"loss": 1.4542,
"step": 1662
},
{
"epoch": 1.324582338902148,
"grad_norm": 1.5229403972625732,
"learning_rate": 2.8077501308820308e-05,
"loss": 1.5258,
"step": 1665
},
{
"epoch": 1.3269689737470167,
"grad_norm": 1.38504958152771,
"learning_rate": 2.790031547472105e-05,
"loss": 1.4561,
"step": 1668
},
{
"epoch": 1.3293556085918854,
"grad_norm": 1.4247602224349976,
"learning_rate": 2.7723473878867877e-05,
"loss": 1.5028,
"step": 1671
},
{
"epoch": 1.331742243436754,
"grad_norm": 1.437752604484558,
"learning_rate": 2.754697927585399e-05,
"loss": 1.4035,
"step": 1674
},
{
"epoch": 1.334128878281623,
"grad_norm": 1.3439934253692627,
"learning_rate": 2.737083441486763e-05,
"loss": 1.3615,
"step": 1677
},
{
"epoch": 1.3365155131264916,
"grad_norm": 1.3507174253463745,
"learning_rate": 2.71950420396492e-05,
"loss": 1.416,
"step": 1680
},
{
"epoch": 1.3389021479713603,
"grad_norm": 1.3720000982284546,
"learning_rate": 2.7019604888448642e-05,
"loss": 1.4143,
"step": 1683
},
{
"epoch": 1.3412887828162292,
"grad_norm": 1.4216835498809814,
"learning_rate": 2.6844525693982613e-05,
"loss": 1.4053,
"step": 1686
},
{
"epoch": 1.3436754176610979,
"grad_norm": 1.3386591672897339,
"learning_rate": 2.666980718339211e-05,
"loss": 1.4513,
"step": 1689
},
{
"epoch": 1.3460620525059666,
"grad_norm": 1.423043966293335,
"learning_rate": 2.6495452078199863e-05,
"loss": 1.4137,
"step": 1692
},
{
"epoch": 1.3484486873508352,
"grad_norm": 1.4139893054962158,
"learning_rate": 2.6321463094267934e-05,
"loss": 1.395,
"step": 1695
},
{
"epoch": 1.3508353221957041,
"grad_norm": 1.7168787717819214,
"learning_rate": 2.614784294175554e-05,
"loss": 1.5379,
"step": 1698
},
{
"epoch": 1.3532219570405728,
"grad_norm": 1.528499722480774,
"learning_rate": 2.597459432507664e-05,
"loss": 1.4597,
"step": 1701
},
{
"epoch": 1.3556085918854415,
"grad_norm": 1.4057003259658813,
"learning_rate": 2.5801719942858065e-05,
"loss": 1.4797,
"step": 1704
},
{
"epoch": 1.3579952267303104,
"grad_norm": 1.324141025543213,
"learning_rate": 2.562922248789722e-05,
"loss": 1.4355,
"step": 1707
},
{
"epoch": 1.360381861575179,
"grad_norm": 1.3396581411361694,
"learning_rate": 2.5457104647120322e-05,
"loss": 1.5498,
"step": 1710
},
{
"epoch": 1.3627684964200477,
"grad_norm": 1.3867429494857788,
"learning_rate": 2.5285369101540445e-05,
"loss": 1.4706,
"step": 1713
},
{
"epoch": 1.3651551312649164,
"grad_norm": 1.4631327390670776,
"learning_rate": 2.5114018526215844e-05,
"loss": 1.4652,
"step": 1716
},
{
"epoch": 1.3675417661097853,
"grad_norm": 1.4228324890136719,
"learning_rate": 2.494305559020822e-05,
"loss": 1.5147,
"step": 1719
},
{
"epoch": 1.369928400954654,
"grad_norm": 1.3669848442077637,
"learning_rate": 2.4772482956541132e-05,
"loss": 1.3945,
"step": 1722
},
{
"epoch": 1.3723150357995226,
"grad_norm": 1.466894268989563,
"learning_rate": 2.4602303282158616e-05,
"loss": 1.3822,
"step": 1725
},
{
"epoch": 1.3747016706443915,
"grad_norm": 1.5458747148513794,
"learning_rate": 2.4432519217883676e-05,
"loss": 1.46,
"step": 1728
},
{
"epoch": 1.3770883054892602,
"grad_norm": 1.4828535318374634,
"learning_rate": 2.4263133408377076e-05,
"loss": 1.5053,
"step": 1731
},
{
"epoch": 1.3794749403341289,
"grad_norm": 1.3600937128067017,
"learning_rate": 2.4094148492096125e-05,
"loss": 1.5814,
"step": 1734
},
{
"epoch": 1.3818615751789975,
"grad_norm": 1.5162534713745117,
"learning_rate": 2.3925567101253576e-05,
"loss": 1.5373,
"step": 1737
},
{
"epoch": 1.3842482100238662,
"grad_norm": 1.4318009614944458,
"learning_rate": 2.3757391861776585e-05,
"loss": 1.4308,
"step": 1740
},
{
"epoch": 1.3866348448687351,
"grad_norm": 1.285593032836914,
"learning_rate": 2.3589625393265895e-05,
"loss": 1.4115,
"step": 1743
},
{
"epoch": 1.3890214797136038,
"grad_norm": 1.41255521774292,
"learning_rate": 2.3422270308954934e-05,
"loss": 1.4724,
"step": 1746
},
{
"epoch": 1.3914081145584727,
"grad_norm": 1.221402883529663,
"learning_rate": 2.3255329215669185e-05,
"loss": 1.3923,
"step": 1749
},
{
"epoch": 1.3937947494033414,
"grad_norm": 1.7999006509780884,
"learning_rate": 2.3088804713785584e-05,
"loss": 1.5016,
"step": 1752
},
{
"epoch": 1.39618138424821,
"grad_norm": 1.2943382263183594,
"learning_rate": 2.2922699397191893e-05,
"loss": 1.4305,
"step": 1755
},
{
"epoch": 1.3985680190930787,
"grad_norm": 1.470240831375122,
"learning_rate": 2.2757015853246493e-05,
"loss": 1.3706,
"step": 1758
},
{
"epoch": 1.4009546539379474,
"grad_norm": 1.269652247428894,
"learning_rate": 2.2591756662737862e-05,
"loss": 1.4425,
"step": 1761
},
{
"epoch": 1.4033412887828163,
"grad_norm": 1.3773661851882935,
"learning_rate": 2.242692439984463e-05,
"loss": 1.4063,
"step": 1764
},
{
"epoch": 1.405727923627685,
"grad_norm": 1.4183642864227295,
"learning_rate": 2.2262521632095203e-05,
"loss": 1.5086,
"step": 1767
},
{
"epoch": 1.4081145584725536,
"grad_norm": 1.3887887001037598,
"learning_rate": 2.2098550920327998e-05,
"loss": 1.311,
"step": 1770
},
{
"epoch": 1.4105011933174225,
"grad_norm": 1.5611753463745117,
"learning_rate": 2.1935014818651405e-05,
"loss": 1.3815,
"step": 1773
},
{
"epoch": 1.4128878281622912,
"grad_norm": 1.2606338262557983,
"learning_rate": 2.177191587440409e-05,
"loss": 1.4286,
"step": 1776
},
{
"epoch": 1.4152744630071599,
"grad_norm": 1.3460246324539185,
"learning_rate": 2.1609256628115316e-05,
"loss": 1.541,
"step": 1779
},
{
"epoch": 1.4176610978520285,
"grad_norm": 1.4279568195343018,
"learning_rate": 2.1447039613465265e-05,
"loss": 1.4517,
"step": 1782
},
{
"epoch": 1.4200477326968974,
"grad_norm": 1.4444867372512817,
"learning_rate": 2.128526735724572e-05,
"loss": 1.4325,
"step": 1785
},
{
"epoch": 1.422434367541766,
"grad_norm": 2.7751872539520264,
"learning_rate": 2.1123942379320576e-05,
"loss": 1.4161,
"step": 1788
},
{
"epoch": 1.4248210023866348,
"grad_norm": 1.5731838941574097,
"learning_rate": 2.096306719258669e-05,
"loss": 1.3889,
"step": 1791
},
{
"epoch": 1.4272076372315037,
"grad_norm": 1.5036033391952515,
"learning_rate": 2.0802644302934683e-05,
"loss": 1.4823,
"step": 1794
},
{
"epoch": 1.4295942720763724,
"grad_norm": 1.5555312633514404,
"learning_rate": 2.0642676209209934e-05,
"loss": 1.5452,
"step": 1797
},
{
"epoch": 1.431980906921241,
"grad_norm": 1.653014898300171,
"learning_rate": 2.0483165403173583e-05,
"loss": 1.4651,
"step": 1800
},
{
"epoch": 1.4343675417661097,
"grad_norm": 1.4742029905319214,
"learning_rate": 2.0324114369463855e-05,
"loss": 1.4215,
"step": 1803
},
{
"epoch": 1.4367541766109786,
"grad_norm": 1.3069920539855957,
"learning_rate": 2.0165525585557204e-05,
"loss": 1.352,
"step": 1806
},
{
"epoch": 1.4391408114558473,
"grad_norm": 1.3675094842910767,
"learning_rate": 2.0007401521729863e-05,
"loss": 1.3925,
"step": 1809
},
{
"epoch": 1.441527446300716,
"grad_norm": 1.5048547983169556,
"learning_rate": 1.984974464101928e-05,
"loss": 1.4392,
"step": 1812
},
{
"epoch": 1.4439140811455848,
"grad_norm": 1.3629268407821655,
"learning_rate": 1.9692557399185734e-05,
"loss": 1.6123,
"step": 1815
},
{
"epoch": 1.4463007159904535,
"grad_norm": 1.4186464548110962,
"learning_rate": 1.953584224467418e-05,
"loss": 1.4375,
"step": 1818
},
{
"epoch": 1.4486873508353222,
"grad_norm": 1.264143943786621,
"learning_rate": 1.9379601618575977e-05,
"loss": 1.3714,
"step": 1821
},
{
"epoch": 1.4510739856801909,
"grad_norm": 1.2267049551010132,
"learning_rate": 1.9223837954591046e-05,
"loss": 1.442,
"step": 1824
},
{
"epoch": 1.4534606205250595,
"grad_norm": 1.4130151271820068,
"learning_rate": 1.9068553678989736e-05,
"loss": 1.5417,
"step": 1827
},
{
"epoch": 1.4558472553699284,
"grad_norm": 1.3902508020401,
"learning_rate": 1.8913751210575248e-05,
"loss": 1.4484,
"step": 1830
},
{
"epoch": 1.458233890214797,
"grad_norm": 1.3493434190750122,
"learning_rate": 1.8759432960645774e-05,
"loss": 1.4089,
"step": 1833
},
{
"epoch": 1.460620525059666,
"grad_norm": 1.3696714639663696,
"learning_rate": 1.8605601332957077e-05,
"loss": 1.3673,
"step": 1836
},
{
"epoch": 1.4630071599045347,
"grad_norm": 1.3794286251068115,
"learning_rate": 1.8452258723684995e-05,
"loss": 1.3348,
"step": 1839
},
{
"epoch": 1.4653937947494033,
"grad_norm": 1.4056599140167236,
"learning_rate": 1.8299407521388067e-05,
"loss": 1.3715,
"step": 1842
},
{
"epoch": 1.467780429594272,
"grad_norm": 1.3033989667892456,
"learning_rate": 1.8147050106970437e-05,
"loss": 1.4756,
"step": 1845
},
{
"epoch": 1.4701670644391407,
"grad_norm": 1.2725988626480103,
"learning_rate": 1.7995188853644646e-05,
"loss": 1.4429,
"step": 1848
},
{
"epoch": 1.4725536992840096,
"grad_norm": 1.2975175380706787,
"learning_rate": 1.784382612689477e-05,
"loss": 1.4233,
"step": 1851
},
{
"epoch": 1.4749403341288783,
"grad_norm": 1.3403589725494385,
"learning_rate": 1.7692964284439505e-05,
"loss": 1.3441,
"step": 1854
},
{
"epoch": 1.477326968973747,
"grad_norm": 1.395530343055725,
"learning_rate": 1.7542605676195506e-05,
"loss": 1.3968,
"step": 1857
},
{
"epoch": 1.4797136038186158,
"grad_norm": 1.2565813064575195,
"learning_rate": 1.739275264424067e-05,
"loss": 1.4858,
"step": 1860
},
{
"epoch": 1.4821002386634845,
"grad_norm": 1.4718109369277954,
"learning_rate": 1.7243407522777806e-05,
"loss": 1.4901,
"step": 1863
},
{
"epoch": 1.4844868735083532,
"grad_norm": 1.2175111770629883,
"learning_rate": 1.7094572638098123e-05,
"loss": 1.4708,
"step": 1866
},
{
"epoch": 1.4868735083532219,
"grad_norm": 1.3653630018234253,
"learning_rate": 1.6946250308545125e-05,
"loss": 1.3542,
"step": 1869
},
{
"epoch": 1.4892601431980907,
"grad_norm": 1.325291633605957,
"learning_rate": 1.6798442844478445e-05,
"loss": 1.3565,
"step": 1872
},
{
"epoch": 1.4916467780429594,
"grad_norm": 1.3131422996520996,
"learning_rate": 1.6651152548237802e-05,
"loss": 1.3708,
"step": 1875
},
{
"epoch": 1.494033412887828,
"grad_norm": 1.3751031160354614,
"learning_rate": 1.6504381714107252e-05,
"loss": 1.4521,
"step": 1878
},
{
"epoch": 1.496420047732697,
"grad_norm": 1.3174288272857666,
"learning_rate": 1.6358132628279322e-05,
"loss": 1.3748,
"step": 1881
},
{
"epoch": 1.4988066825775657,
"grad_norm": 1.3484026193618774,
"learning_rate": 1.6212407568819565e-05,
"loss": 1.3542,
"step": 1884
},
{
"epoch": 1.5011933174224343,
"grad_norm": 1.417079210281372,
"learning_rate": 1.6067208805630877e-05,
"loss": 1.4029,
"step": 1887
},
{
"epoch": 1.503579952267303,
"grad_norm": 2.4393136501312256,
"learning_rate": 1.5922538600418318e-05,
"loss": 1.3775,
"step": 1890
},
{
"epoch": 1.5059665871121717,
"grad_norm": 1.4242373704910278,
"learning_rate": 1.5778399206653734e-05,
"loss": 1.3828,
"step": 1893
},
{
"epoch": 1.5083532219570406,
"grad_norm": 1.3555493354797363,
"learning_rate": 1.563479286954078e-05,
"loss": 1.4257,
"step": 1896
},
{
"epoch": 1.5107398568019093,
"grad_norm": 1.3086305856704712,
"learning_rate": 1.54917218259799e-05,
"loss": 1.366,
"step": 1899
},
{
"epoch": 1.5131264916467781,
"grad_norm": 1.3633023500442505,
"learning_rate": 1.5349188304533413e-05,
"loss": 1.4599,
"step": 1902
},
{
"epoch": 1.5155131264916468,
"grad_norm": 1.3130401372909546,
"learning_rate": 1.5207194525390938e-05,
"loss": 1.4543,
"step": 1905
},
{
"epoch": 1.5178997613365155,
"grad_norm": 1.3946834802627563,
"learning_rate": 1.5065742700334678e-05,
"loss": 1.4115,
"step": 1908
},
{
"epoch": 1.5202863961813842,
"grad_norm": 1.3722360134124756,
"learning_rate": 1.4924835032705064e-05,
"loss": 1.4059,
"step": 1911
},
{
"epoch": 1.5226730310262528,
"grad_norm": 1.297034740447998,
"learning_rate": 1.4784473717366387e-05,
"loss": 1.5423,
"step": 1914
},
{
"epoch": 1.5250596658711217,
"grad_norm": 1.2313867807388306,
"learning_rate": 1.4644660940672627e-05,
"loss": 1.4202,
"step": 1917
},
{
"epoch": 1.5274463007159904,
"grad_norm": 1.2203267812728882,
"learning_rate": 1.4505398880433369e-05,
"loss": 1.3289,
"step": 1920
},
{
"epoch": 1.5298329355608593,
"grad_norm": 2.824936628341675,
"learning_rate": 1.4366689705879898e-05,
"loss": 1.4151,
"step": 1923
},
{
"epoch": 1.532219570405728,
"grad_norm": 1.2896000146865845,
"learning_rate": 1.4228535577631442e-05,
"loss": 1.3452,
"step": 1926
},
{
"epoch": 1.5346062052505967,
"grad_norm": 1.2921373844146729,
"learning_rate": 1.4090938647661461e-05,
"loss": 1.4469,
"step": 1929
},
{
"epoch": 1.5369928400954653,
"grad_norm": 1.4541573524475098,
"learning_rate": 1.3953901059264191e-05,
"loss": 1.5048,
"step": 1932
},
{
"epoch": 1.539379474940334,
"grad_norm": 1.5273715257644653,
"learning_rate": 1.3817424947021151e-05,
"loss": 1.425,
"step": 1935
},
{
"epoch": 1.541766109785203,
"grad_norm": 1.4470243453979492,
"learning_rate": 1.3681512436768045e-05,
"loss": 1.6023,
"step": 1938
},
{
"epoch": 1.5441527446300716,
"grad_norm": 1.274420976638794,
"learning_rate": 1.3546165645561487e-05,
"loss": 1.3682,
"step": 1941
},
{
"epoch": 1.5465393794749405,
"grad_norm": 1.537539005279541,
"learning_rate": 1.3411386681646164e-05,
"loss": 1.3933,
"step": 1944
},
{
"epoch": 1.5489260143198091,
"grad_norm": 1.2480096817016602,
"learning_rate": 1.3277177644421924e-05,
"loss": 1.3532,
"step": 1947
},
{
"epoch": 1.5513126491646778,
"grad_norm": 1.4736180305480957,
"learning_rate": 1.314354062441106e-05,
"loss": 1.5258,
"step": 1950
},
{
"epoch": 1.5536992840095465,
"grad_norm": 1.2729185819625854,
"learning_rate": 1.301047770322581e-05,
"loss": 1.3904,
"step": 1953
},
{
"epoch": 1.5560859188544152,
"grad_norm": 1.2870765924453735,
"learning_rate": 1.287799095353584e-05,
"loss": 1.3343,
"step": 1956
},
{
"epoch": 1.558472553699284,
"grad_norm": 1.385016679763794,
"learning_rate": 1.2746082439036117e-05,
"loss": 1.4185,
"step": 1959
},
{
"epoch": 1.5608591885441527,
"grad_norm": 1.3921257257461548,
"learning_rate": 1.2614754214414548e-05,
"loss": 1.3932,
"step": 1962
},
{
"epoch": 1.5632458233890216,
"grad_norm": 1.4554414749145508,
"learning_rate": 1.2484008325320174e-05,
"loss": 1.4237,
"step": 1965
},
{
"epoch": 1.5656324582338903,
"grad_norm": 1.4019906520843506,
"learning_rate": 1.2353846808331154e-05,
"loss": 1.3849,
"step": 1968
},
{
"epoch": 1.568019093078759,
"grad_norm": 1.3316291570663452,
"learning_rate": 1.2224271690923155e-05,
"loss": 1.3343,
"step": 1971
},
{
"epoch": 1.5704057279236276,
"grad_norm": 1.251207709312439,
"learning_rate": 1.2095284991437733e-05,
"loss": 1.333,
"step": 1974
},
{
"epoch": 1.5727923627684963,
"grad_norm": 1.3229224681854248,
"learning_rate": 1.1966888719050829e-05,
"loss": 1.4419,
"step": 1977
},
{
"epoch": 1.575178997613365,
"grad_norm": 1.3271231651306152,
"learning_rate": 1.1839084873741584e-05,
"loss": 1.421,
"step": 1980
},
{
"epoch": 1.577565632458234,
"grad_norm": 1.4479427337646484,
"learning_rate": 1.1711875446261094e-05,
"loss": 1.4322,
"step": 1983
},
{
"epoch": 1.5799522673031028,
"grad_norm": 1.4587756395339966,
"learning_rate": 1.1585262418101467e-05,
"loss": 1.4832,
"step": 1986
},
{
"epoch": 1.5823389021479715,
"grad_norm": 1.3749325275421143,
"learning_rate": 1.1459247761464909e-05,
"loss": 1.423,
"step": 1989
},
{
"epoch": 1.5847255369928401,
"grad_norm": 1.3663976192474365,
"learning_rate": 1.1333833439233055e-05,
"loss": 1.4133,
"step": 1992
},
{
"epoch": 1.5871121718377088,
"grad_norm": 1.3849143981933594,
"learning_rate": 1.1209021404936304e-05,
"loss": 1.3823,
"step": 1995
},
{
"epoch": 1.5894988066825775,
"grad_norm": 1.3283625841140747,
"learning_rate": 1.1084813602723515e-05,
"loss": 1.4437,
"step": 1998
},
{
"epoch": 1.5918854415274462,
"grad_norm": 1.3943214416503906,
"learning_rate": 1.0961211967331597e-05,
"loss": 1.4566,
"step": 2001
},
{
"epoch": 1.594272076372315,
"grad_norm": 1.340326189994812,
"learning_rate": 1.083821842405548e-05,
"loss": 1.3319,
"step": 2004
},
{
"epoch": 1.5966587112171837,
"grad_norm": 1.458629846572876,
"learning_rate": 1.0715834888718074e-05,
"loss": 1.307,
"step": 2007
},
{
"epoch": 1.5990453460620526,
"grad_norm": 1.327406883239746,
"learning_rate": 1.0594063267640386e-05,
"loss": 1.3367,
"step": 2010
},
{
"epoch": 1.6014319809069213,
"grad_norm": 1.3545573949813843,
"learning_rate": 1.0472905457611936e-05,
"loss": 1.43,
"step": 2013
},
{
"epoch": 1.60381861575179,
"grad_norm": 1.3321373462677002,
"learning_rate": 1.0352363345861065e-05,
"loss": 1.3416,
"step": 2016
},
{
"epoch": 1.6062052505966586,
"grad_norm": 1.3040752410888672,
"learning_rate": 1.023243881002573e-05,
"loss": 1.6122,
"step": 2019
},
{
"epoch": 1.6085918854415273,
"grad_norm": 1.7471809387207031,
"learning_rate": 1.0113133718124035e-05,
"loss": 1.5219,
"step": 2022
},
{
"epoch": 1.6109785202863962,
"grad_norm": 1.362330436706543,
"learning_rate": 9.994449928525324e-06,
"loss": 1.4859,
"step": 2025
},
{
"epoch": 1.6133651551312649,
"grad_norm": 1.3588142395019531,
"learning_rate": 9.876389289921106e-06,
"loss": 1.5388,
"step": 2028
},
{
"epoch": 1.6157517899761338,
"grad_norm": 1.2795350551605225,
"learning_rate": 9.758953641296331e-06,
"loss": 1.4129,
"step": 2031
},
{
"epoch": 1.6181384248210025,
"grad_norm": 1.3939927816390991,
"learning_rate": 9.642144811900739e-06,
"loss": 1.407,
"step": 2034
},
{
"epoch": 1.6205250596658711,
"grad_norm": 1.3605296611785889,
"learning_rate": 9.5259646212203e-06,
"loss": 1.3686,
"step": 2037
},
{
"epoch": 1.6229116945107398,
"grad_norm": 3.3502025604248047,
"learning_rate": 9.410414878948975e-06,
"loss": 1.3942,
"step": 2040
},
{
"epoch": 1.6252983293556085,
"grad_norm": 1.2541710138320923,
"learning_rate": 9.295497384960416e-06,
"loss": 1.4175,
"step": 2043
},
{
"epoch": 1.6276849642004774,
"grad_norm": 1.4015976190567017,
"learning_rate": 9.181213929280046e-06,
"loss": 1.4867,
"step": 2046
},
{
"epoch": 1.630071599045346,
"grad_norm": 1.2918376922607422,
"learning_rate": 9.067566292057084e-06,
"loss": 1.4243,
"step": 2049
},
{
"epoch": 1.632458233890215,
"grad_norm": 1.341584324836731,
"learning_rate": 8.954556243536877e-06,
"loss": 1.309,
"step": 2052
},
{
"epoch": 1.6348448687350836,
"grad_norm": 1.364698052406311,
"learning_rate": 8.842185544033255e-06,
"loss": 1.4609,
"step": 2055
},
{
"epoch": 1.6372315035799523,
"grad_norm": 1.3149210214614868,
"learning_rate": 8.7304559439012e-06,
"loss": 1.4338,
"step": 2058
},
{
"epoch": 1.639618138424821,
"grad_norm": 1.2939684391021729,
"learning_rate": 8.619369183509501e-06,
"loss": 1.3857,
"step": 2061
},
{
"epoch": 1.6420047732696896,
"grad_norm": 1.364255428314209,
"learning_rate": 8.508926993213712e-06,
"loss": 1.4484,
"step": 2064
},
{
"epoch": 1.6443914081145583,
"grad_norm": 1.3580390214920044,
"learning_rate": 8.39913109332916e-06,
"loss": 1.377,
"step": 2067
},
{
"epoch": 1.6467780429594272,
"grad_norm": 1.354833960533142,
"learning_rate": 8.28998319410413e-06,
"loss": 1.3848,
"step": 2070
},
{
"epoch": 1.649164677804296,
"grad_norm": 1.2995808124542236,
"learning_rate": 8.181484995693295e-06,
"loss": 1.369,
"step": 2073
},
{
"epoch": 1.6515513126491648,
"grad_norm": 1.387732982635498,
"learning_rate": 8.073638188131128e-06,
"loss": 1.3963,
"step": 2076
},
{
"epoch": 1.6539379474940334,
"grad_norm": 1.349213719367981,
"learning_rate": 7.966444451305726e-06,
"loss": 1.4368,
"step": 2079
},
{
"epoch": 1.6563245823389021,
"grad_norm": 1.3115229606628418,
"learning_rate": 7.859905454932471e-06,
"loss": 1.3239,
"step": 2082
},
{
"epoch": 1.6587112171837708,
"grad_norm": 1.2257990837097168,
"learning_rate": 7.75402285852816e-06,
"loss": 1.3398,
"step": 2085
},
{
"epoch": 1.6610978520286395,
"grad_norm": 1.3116490840911865,
"learning_rate": 7.648798311385058e-06,
"loss": 1.3408,
"step": 2088
},
{
"epoch": 1.6634844868735084,
"grad_norm": 1.3405076265335083,
"learning_rate": 7.5442334525452964e-06,
"loss": 1.3239,
"step": 2091
},
{
"epoch": 1.665871121718377,
"grad_norm": 1.328359842300415,
"learning_rate": 7.440329910775273e-06,
"loss": 1.3864,
"step": 2094
},
{
"epoch": 1.668257756563246,
"grad_norm": 1.4465019702911377,
"learning_rate": 7.337089304540301e-06,
"loss": 1.3507,
"step": 2097
},
{
"epoch": 1.6706443914081146,
"grad_norm": 1.334693431854248,
"learning_rate": 7.234513241979418e-06,
"loss": 1.41,
"step": 2100
},
{
"epoch": 1.6730310262529833,
"grad_norm": 1.2191022634506226,
"learning_rate": 7.132603320880294e-06,
"loss": 1.3517,
"step": 2103
},
{
"epoch": 1.675417661097852,
"grad_norm": 1.286347508430481,
"learning_rate": 7.031361128654401e-06,
"loss": 1.4724,
"step": 2106
},
{
"epoch": 1.6778042959427206,
"grad_norm": 1.2741565704345703,
"learning_rate": 6.930788242312253e-06,
"loss": 1.3599,
"step": 2109
},
{
"epoch": 1.6801909307875895,
"grad_norm": 1.331213116645813,
"learning_rate": 6.830886228438837e-06,
"loss": 1.448,
"step": 2112
},
{
"epoch": 1.6825775656324582,
"grad_norm": 1.2380222082138062,
"learning_rate": 6.731656643169204e-06,
"loss": 1.362,
"step": 2115
},
{
"epoch": 1.684964200477327,
"grad_norm": 1.3948215246200562,
"learning_rate": 6.633101032164274e-06,
"loss": 1.4777,
"step": 2118
},
{
"epoch": 1.6873508353221958,
"grad_norm": 1.2766612768173218,
"learning_rate": 6.535220930586705e-06,
"loss": 1.5417,
"step": 2121
},
{
"epoch": 1.6897374701670644,
"grad_norm": 1.5349231958389282,
"learning_rate": 6.4380178630770225e-06,
"loss": 1.4201,
"step": 2124
},
{
"epoch": 1.692124105011933,
"grad_norm": 1.3303968906402588,
"learning_rate": 6.341493343729854e-06,
"loss": 1.4746,
"step": 2127
},
{
"epoch": 1.6945107398568018,
"grad_norm": 1.2858116626739502,
"learning_rate": 6.2456488760703205e-06,
"loss": 1.4834,
"step": 2130
},
{
"epoch": 1.6968973747016707,
"grad_norm": 1.3412748575210571,
"learning_rate": 6.150485953030677e-06,
"loss": 1.2398,
"step": 2133
},
{
"epoch": 1.6992840095465394,
"grad_norm": 1.3060575723648071,
"learning_rate": 6.056006056926977e-06,
"loss": 1.5145,
"step": 2136
},
{
"epoch": 1.7016706443914082,
"grad_norm": 1.4394913911819458,
"learning_rate": 5.962210659436091e-06,
"loss": 1.3623,
"step": 2139
},
{
"epoch": 1.704057279236277,
"grad_norm": 1.2894078493118286,
"learning_rate": 5.869101221572654e-06,
"loss": 1.327,
"step": 2142
},
{
"epoch": 1.7064439140811456,
"grad_norm": 1.6368987560272217,
"learning_rate": 5.776679193666412e-06,
"loss": 1.5371,
"step": 2145
},
{
"epoch": 1.7088305489260143,
"grad_norm": 1.4706789255142212,
"learning_rate": 5.6849460153395706e-06,
"loss": 1.3617,
"step": 2148
},
{
"epoch": 1.711217183770883,
"grad_norm": 1.358765721321106,
"learning_rate": 5.5939031154844e-06,
"loss": 1.3666,
"step": 2151
},
{
"epoch": 1.7136038186157518,
"grad_norm": 1.3404169082641602,
"learning_rate": 5.5035519122409895e-06,
"loss": 1.4213,
"step": 2154
},
{
"epoch": 1.7159904534606205,
"grad_norm": 1.9059109687805176,
"learning_rate": 5.413893812975096e-06,
"loss": 1.4891,
"step": 2157
},
{
"epoch": 1.7183770883054894,
"grad_norm": 1.5331205129623413,
"learning_rate": 5.324930214256302e-06,
"loss": 1.4278,
"step": 2160
},
{
"epoch": 1.720763723150358,
"grad_norm": 1.3307032585144043,
"learning_rate": 5.236662501836192e-06,
"loss": 1.389,
"step": 2163
},
{
"epoch": 1.7231503579952268,
"grad_norm": 1.5163909196853638,
"learning_rate": 5.149092050626825e-06,
"loss": 1.5462,
"step": 2166
},
{
"epoch": 1.7255369928400954,
"grad_norm": 1.3799465894699097,
"learning_rate": 5.062220224679276e-06,
"loss": 1.3583,
"step": 2169
},
{
"epoch": 1.727923627684964,
"grad_norm": 1.288385272026062,
"learning_rate": 4.9760483771624236e-06,
"loss": 1.401,
"step": 2172
},
{
"epoch": 1.7303102625298328,
"grad_norm": 1.3018258810043335,
"learning_rate": 4.89057785034181e-06,
"loss": 1.3998,
"step": 2175
},
{
"epoch": 1.7326968973747017,
"grad_norm": 1.4465045928955078,
"learning_rate": 4.805809975558828e-06,
"loss": 1.4118,
"step": 2178
},
{
"epoch": 1.7350835322195706,
"grad_norm": 1.3118780851364136,
"learning_rate": 4.721746073209893e-06,
"loss": 1.3574,
"step": 2181
},
{
"epoch": 1.7374701670644392,
"grad_norm": 1.653915524482727,
"learning_rate": 4.6383874527259345e-06,
"loss": 1.5086,
"step": 2184
},
{
"epoch": 1.739856801909308,
"grad_norm": 1.3008549213409424,
"learning_rate": 4.555735412551975e-06,
"loss": 1.4131,
"step": 2187
},
{
"epoch": 1.7422434367541766,
"grad_norm": 1.714281678199768,
"learning_rate": 4.47379124012689e-06,
"loss": 1.4335,
"step": 2190
},
{
"epoch": 1.7446300715990453,
"grad_norm": 1.379380702972412,
"learning_rate": 4.3925562118634135e-06,
"loss": 1.4987,
"step": 2193
},
{
"epoch": 1.747016706443914,
"grad_norm": 1.3546345233917236,
"learning_rate": 4.312031593128163e-06,
"loss": 1.5424,
"step": 2196
},
{
"epoch": 1.7494033412887828,
"grad_norm": 1.4139165878295898,
"learning_rate": 4.232218638222029e-06,
"loss": 1.3599,
"step": 2199
},
{
"epoch": 1.7517899761336515,
"grad_norm": 1.326416015625,
"learning_rate": 4.153118590360561e-06,
"loss": 1.3698,
"step": 2202
},
{
"epoch": 1.7541766109785204,
"grad_norm": 1.2726656198501587,
"learning_rate": 4.074732681654647e-06,
"loss": 1.3478,
"step": 2205
},
{
"epoch": 1.756563245823389,
"grad_norm": 1.295027494430542,
"learning_rate": 3.997062133091284e-06,
"loss": 1.3318,
"step": 2208
},
{
"epoch": 1.7589498806682577,
"grad_norm": 1.240146279335022,
"learning_rate": 3.920108154514585e-06,
"loss": 1.2902,
"step": 2211
},
{
"epoch": 1.7613365155131264,
"grad_norm": 1.2674936056137085,
"learning_rate": 3.843871944606969e-06,
"loss": 1.3331,
"step": 2214
},
{
"epoch": 1.763723150357995,
"grad_norm": 1.4124763011932373,
"learning_rate": 3.7683546908703903e-06,
"loss": 1.432,
"step": 2217
},
{
"epoch": 1.766109785202864,
"grad_norm": 1.3157274723052979,
"learning_rate": 3.693557569607947e-06,
"loss": 1.3372,
"step": 2220
},
{
"epoch": 1.7684964200477327,
"grad_norm": 1.4040486812591553,
"learning_rate": 3.6194817459054676e-06,
"loss": 1.4255,
"step": 2223
},
{
"epoch": 1.7708830548926016,
"grad_norm": 1.2398067712783813,
"learning_rate": 3.5461283736134722e-06,
"loss": 1.3448,
"step": 2226
},
{
"epoch": 1.7732696897374702,
"grad_norm": 1.2771934270858765,
"learning_rate": 3.4734985953290778e-06,
"loss": 1.4079,
"step": 2229
},
{
"epoch": 1.775656324582339,
"grad_norm": 1.1697748899459839,
"learning_rate": 3.401593542378262e-06,
"loss": 1.4184,
"step": 2232
},
{
"epoch": 1.7780429594272076,
"grad_norm": 1.355383276939392,
"learning_rate": 3.330414334798265e-06,
"loss": 1.31,
"step": 2235
},
{
"epoch": 1.7804295942720763,
"grad_norm": 1.2719684839248657,
"learning_rate": 3.2599620813200837e-06,
"loss": 1.4189,
"step": 2238
},
{
"epoch": 1.7828162291169452,
"grad_norm": 1.4642714262008667,
"learning_rate": 3.1902378793512657e-06,
"loss": 1.4552,
"step": 2241
},
{
"epoch": 1.7852028639618138,
"grad_norm": 1.2354283332824707,
"learning_rate": 3.121242814958747e-06,
"loss": 1.3951,
"step": 2244
},
{
"epoch": 1.7875894988066827,
"grad_norm": 1.3236439228057861,
"learning_rate": 3.0529779628519992e-06,
"loss": 1.4105,
"step": 2247
},
{
"epoch": 1.7899761336515514,
"grad_norm": 1.3168965578079224,
"learning_rate": 2.9854443863662262e-06,
"loss": 1.4434,
"step": 2250
},
{
"epoch": 1.79236276849642,
"grad_norm": 2.0770509243011475,
"learning_rate": 2.918643137445859e-06,
"loss": 1.4209,
"step": 2253
},
{
"epoch": 1.7947494033412887,
"grad_norm": 1.3210294246673584,
"learning_rate": 2.8525752566281482e-06,
"loss": 1.4219,
"step": 2256
},
{
"epoch": 1.7971360381861574,
"grad_norm": 1.3929189443588257,
"learning_rate": 2.787241773026933e-06,
"loss": 1.4382,
"step": 2259
},
{
"epoch": 1.799522673031026,
"grad_norm": 1.3046364784240723,
"learning_rate": 2.722643704316652e-06,
"loss": 1.4634,
"step": 2262
},
{
"epoch": 1.801909307875895,
"grad_norm": 1.324135422706604,
"learning_rate": 2.658782056716441e-06,
"loss": 1.4041,
"step": 2265
},
{
"epoch": 1.8042959427207639,
"grad_norm": 1.3277792930603027,
"learning_rate": 2.5956578249745236e-06,
"loss": 1.3838,
"step": 2268
},
{
"epoch": 1.8066825775656326,
"grad_norm": 1.5090585947036743,
"learning_rate": 2.533271992352659e-06,
"loss": 1.4224,
"step": 2271
},
{
"epoch": 1.8090692124105012,
"grad_norm": 1.2901520729064941,
"learning_rate": 2.4716255306108605e-06,
"loss": 1.3893,
"step": 2274
},
{
"epoch": 1.81145584725537,
"grad_norm": 1.3586432933807373,
"learning_rate": 2.4107193999922286e-06,
"loss": 1.3146,
"step": 2277
},
{
"epoch": 1.8138424821002386,
"grad_norm": 1.2898615598678589,
"learning_rate": 2.3505545492080395e-06,
"loss": 1.2849,
"step": 2280
},
{
"epoch": 1.8162291169451072,
"grad_norm": 1.3389803171157837,
"learning_rate": 2.291131915422917e-06,
"loss": 1.3749,
"step": 2283
},
{
"epoch": 1.8186157517899761,
"grad_norm": 1.3909133672714233,
"learning_rate": 2.2324524242402613e-06,
"loss": 1.4045,
"step": 2286
},
{
"epoch": 1.8210023866348448,
"grad_norm": 1.992772102355957,
"learning_rate": 2.1745169896878414e-06,
"loss": 1.3947,
"step": 2289
},
{
"epoch": 1.8233890214797137,
"grad_norm": 1.3467578887939453,
"learning_rate": 2.117326514203527e-06,
"loss": 1.4358,
"step": 2292
},
{
"epoch": 1.8257756563245824,
"grad_norm": 1.316362738609314,
"learning_rate": 2.0608818886212576e-06,
"loss": 1.3924,
"step": 2295
},
{
"epoch": 1.828162291169451,
"grad_norm": 1.303235411643982,
"learning_rate": 2.0051839921571448e-06,
"loss": 1.4439,
"step": 2298
},
{
"epoch": 1.8305489260143197,
"grad_norm": 1.4706928730010986,
"learning_rate": 1.9502336923958255e-06,
"loss": 1.2834,
"step": 2301
},
{
"epoch": 1.8329355608591884,
"grad_norm": 1.2729629278182983,
"learning_rate": 1.8960318452768577e-06,
"loss": 1.3582,
"step": 2304
},
{
"epoch": 1.8353221957040573,
"grad_norm": 1.5145806074142456,
"learning_rate": 1.8425792950814868e-06,
"loss": 1.492,
"step": 2307
},
{
"epoch": 1.837708830548926,
"grad_norm": 1.286137342453003,
"learning_rate": 1.7898768744194162e-06,
"loss": 1.4031,
"step": 2310
},
{
"epoch": 1.8400954653937949,
"grad_norm": 1.309260368347168,
"learning_rate": 1.7379254042158955e-06,
"loss": 1.3831,
"step": 2313
},
{
"epoch": 1.8424821002386635,
"grad_norm": 1.5175321102142334,
"learning_rate": 1.6867256936989096e-06,
"loss": 1.4786,
"step": 2316
},
{
"epoch": 1.8448687350835322,
"grad_norm": 1.2964720726013184,
"learning_rate": 1.6362785403865488e-06,
"loss": 1.398,
"step": 2319
},
{
"epoch": 1.847255369928401,
"grad_norm": 1.2901564836502075,
"learning_rate": 1.5865847300746417e-06,
"loss": 1.3338,
"step": 2322
},
{
"epoch": 1.8496420047732696,
"grad_norm": 1.3462107181549072,
"learning_rate": 1.5376450368244589e-06,
"loss": 1.3809,
"step": 2325
},
{
"epoch": 1.8520286396181385,
"grad_norm": 1.3782678842544556,
"learning_rate": 1.4894602229506892e-06,
"loss": 1.3993,
"step": 2328
},
{
"epoch": 1.8544152744630071,
"grad_norm": 1.1769006252288818,
"learning_rate": 1.4420310390095615e-06,
"loss": 1.291,
"step": 2331
},
{
"epoch": 1.856801909307876,
"grad_norm": 1.279466152191162,
"learning_rate": 1.3953582237871521e-06,
"loss": 1.4328,
"step": 2334
},
{
"epoch": 1.8591885441527447,
"grad_norm": 1.3210725784301758,
"learning_rate": 1.3494425042878622e-06,
"loss": 1.4165,
"step": 2337
},
{
"epoch": 1.8615751789976134,
"grad_norm": 1.393333911895752,
"learning_rate": 1.3042845957231153e-06,
"loss": 1.3581,
"step": 2340
},
{
"epoch": 1.863961813842482,
"grad_norm": 1.3860208988189697,
"learning_rate": 1.2598852015001994e-06,
"loss": 1.3974,
"step": 2343
},
{
"epoch": 1.8663484486873507,
"grad_norm": 1.3636202812194824,
"learning_rate": 1.2162450132113201e-06,
"loss": 1.5009,
"step": 2346
},
{
"epoch": 1.8687350835322196,
"grad_norm": 1.2739925384521484,
"learning_rate": 1.1733647106228375e-06,
"loss": 1.4206,
"step": 2349
},
{
"epoch": 1.8711217183770883,
"grad_norm": 2.2390100955963135,
"learning_rate": 1.1312449616646403e-06,
"loss": 1.3318,
"step": 2352
},
{
"epoch": 1.8735083532219572,
"grad_norm": 1.1843377351760864,
"learning_rate": 1.0898864224197946e-06,
"loss": 1.289,
"step": 2355
},
{
"epoch": 1.8758949880668259,
"grad_norm": 1.2790746688842773,
"learning_rate": 1.049289737114273e-06,
"loss": 1.4026,
"step": 2358
},
{
"epoch": 1.8782816229116945,
"grad_norm": 1.1583595275878906,
"learning_rate": 1.009455538106968e-06,
"loss": 1.2771,
"step": 2361
},
{
"epoch": 1.8806682577565632,
"grad_norm": 4.7106781005859375,
"learning_rate": 9.703844458797962e-07,
"loss": 1.4771,
"step": 2364
},
{
"epoch": 1.8830548926014319,
"grad_norm": 1.3764762878417969,
"learning_rate": 9.320770690280645e-07,
"loss": 1.4295,
"step": 2367
},
{
"epoch": 1.8854415274463006,
"grad_norm": 1.3544838428497314,
"learning_rate": 8.945340042509797e-07,
"loss": 1.405,
"step": 2370
},
{
"epoch": 1.8878281622911695,
"grad_norm": 1.3262426853179932,
"learning_rate": 8.577558363423554e-07,
"loss": 1.4135,
"step": 2373
},
{
"epoch": 1.8902147971360383,
"grad_norm": 1.2534743547439575,
"learning_rate": 8.217431381815077e-07,
"loss": 1.4168,
"step": 2376
},
{
"epoch": 1.892601431980907,
"grad_norm": 1.4368257522583008,
"learning_rate": 7.864964707243072e-07,
"loss": 1.3518,
"step": 2379
},
{
"epoch": 1.8949880668257757,
"grad_norm": 1.3582005500793457,
"learning_rate": 7.520163829944804e-07,
"loss": 1.3315,
"step": 2382
},
{
"epoch": 1.8973747016706444,
"grad_norm": 1.4341644048690796,
"learning_rate": 7.183034120750221e-07,
"loss": 1.3689,
"step": 2385
},
{
"epoch": 1.899761336515513,
"grad_norm": 1.368857741355896,
"learning_rate": 6.85358083099863e-07,
"loss": 1.384,
"step": 2388
},
{
"epoch": 1.9021479713603817,
"grad_norm": 1.307237148284912,
"learning_rate": 6.531809092456598e-07,
"loss": 1.3101,
"step": 2391
},
{
"epoch": 1.9045346062052506,
"grad_norm": 1.274276614189148,
"learning_rate": 6.217723917238128e-07,
"loss": 1.4943,
"step": 2394
},
{
"epoch": 1.9069212410501193,
"grad_norm": 1.3081694841384888,
"learning_rate": 5.911330197726661e-07,
"loss": 1.3365,
"step": 2397
},
{
"epoch": 1.9093078758949882,
"grad_norm": 1.2196581363677979,
"learning_rate": 5.612632706498755e-07,
"loss": 1.3927,
"step": 2400
},
{
"epoch": 1.9116945107398569,
"grad_norm": 1.33291494846344,
"learning_rate": 5.321636096249749e-07,
"loss": 1.4337,
"step": 2403
},
{
"epoch": 1.9140811455847255,
"grad_norm": 1.355838418006897,
"learning_rate": 5.038344899721436e-07,
"loss": 1.3511,
"step": 2406
},
{
"epoch": 1.9164677804295942,
"grad_norm": 1.2884796857833862,
"learning_rate": 4.762763529631342e-07,
"loss": 1.3787,
"step": 2409
},
{
"epoch": 1.9188544152744629,
"grad_norm": 1.4186152219772339,
"learning_rate": 4.4948962786039437e-07,
"loss": 1.4141,
"step": 2412
},
{
"epoch": 1.9212410501193318,
"grad_norm": 1.1370840072631836,
"learning_rate": 4.234747319103949e-07,
"loss": 1.2792,
"step": 2415
},
{
"epoch": 1.9236276849642004,
"grad_norm": 1.3362590074539185,
"learning_rate": 3.9823207033710676e-07,
"loss": 1.43,
"step": 2418
},
{
"epoch": 1.9260143198090693,
"grad_norm": 1.2540643215179443,
"learning_rate": 3.737620363357286e-07,
"loss": 1.2947,
"step": 2421
},
{
"epoch": 1.928400954653938,
"grad_norm": 1.3183681964874268,
"learning_rate": 3.5006501106651937e-07,
"loss": 1.3768,
"step": 2424
},
{
"epoch": 1.9307875894988067,
"grad_norm": 1.2523298263549805,
"learning_rate": 3.2714136364888073e-07,
"loss": 1.3564,
"step": 2427
},
{
"epoch": 1.9331742243436754,
"grad_norm": 1.7795356512069702,
"learning_rate": 3.0499145115561176e-07,
"loss": 1.518,
"step": 2430
},
{
"epoch": 1.935560859188544,
"grad_norm": 1.2850757837295532,
"learning_rate": 2.836156186073413e-07,
"loss": 1.3588,
"step": 2433
},
{
"epoch": 1.937947494033413,
"grad_norm": 1.30345618724823,
"learning_rate": 2.630141989671542e-07,
"loss": 1.3733,
"step": 2436
},
{
"epoch": 1.9403341288782816,
"grad_norm": 1.3604350090026855,
"learning_rate": 2.431875131354011e-07,
"loss": 1.4814,
"step": 2439
},
{
"epoch": 1.9427207637231505,
"grad_norm": 1.4250744581222534,
"learning_rate": 2.2413586994470825e-07,
"loss": 1.3531,
"step": 2442
},
{
"epoch": 1.9451073985680192,
"grad_norm": 1.2653945684432983,
"learning_rate": 2.0585956615515323e-07,
"loss": 1.3951,
"step": 2445
},
{
"epoch": 1.9474940334128878,
"grad_norm": 1.3960016965866089,
"learning_rate": 1.8835888644966325e-07,
"loss": 1.3927,
"step": 2448
},
{
"epoch": 1.9498806682577565,
"grad_norm": 1.4156994819641113,
"learning_rate": 1.7163410342956875e-07,
"loss": 1.4211,
"step": 2451
},
{
"epoch": 1.9522673031026252,
"grad_norm": 1.5609006881713867,
"learning_rate": 1.5568547761034004e-07,
"loss": 1.3577,
"step": 2454
},
{
"epoch": 1.9546539379474939,
"grad_norm": 1.5074419975280762,
"learning_rate": 1.4051325741756828e-07,
"loss": 1.4627,
"step": 2457
},
{
"epoch": 1.9570405727923628,
"grad_norm": 1.231116771697998,
"learning_rate": 1.2611767918306316e-07,
"loss": 1.3873,
"step": 2460
},
{
"epoch": 1.9594272076372317,
"grad_norm": 1.2997921705245972,
"learning_rate": 1.1249896714117802e-07,
"loss": 1.3963,
"step": 2463
},
{
"epoch": 1.9618138424821003,
"grad_norm": 1.2703529596328735,
"learning_rate": 9.965733342532924e-08,
"loss": 1.371,
"step": 2466
},
{
"epoch": 1.964200477326969,
"grad_norm": 1.2716647386550903,
"learning_rate": 8.759297806469335e-08,
"loss": 1.3068,
"step": 2469
},
{
"epoch": 1.9665871121718377,
"grad_norm": 1.36246657371521,
"learning_rate": 7.630608898105962e-08,
"loss": 1.3863,
"step": 2472
},
{
"epoch": 1.9689737470167064,
"grad_norm": 1.224776029586792,
"learning_rate": 6.579684198594338e-08,
"loss": 1.3254,
"step": 2475
},
{
"epoch": 1.971360381861575,
"grad_norm": 1.215903878211975,
"learning_rate": 5.606540077782163e-08,
"loss": 1.3006,
"step": 2478
},
{
"epoch": 1.973747016706444,
"grad_norm": 1.5066276788711548,
"learning_rate": 4.711191693959616e-08,
"loss": 1.3676,
"step": 2481
},
{
"epoch": 1.9761336515513126,
"grad_norm": 1.3745481967926025,
"learning_rate": 3.893652993621766e-08,
"loss": 1.372,
"step": 2484
},
{
"epoch": 1.9785202863961815,
"grad_norm": 1.4172685146331787,
"learning_rate": 3.1539367112543014e-08,
"loss": 1.482,
"step": 2487
},
{
"epoch": 1.9809069212410502,
"grad_norm": 1.3960031270980835,
"learning_rate": 2.4920543691309138e-08,
"loss": 1.3987,
"step": 2490
},
{
"epoch": 1.9832935560859188,
"grad_norm": 1.2992031574249268,
"learning_rate": 1.9080162771378808e-08,
"loss": 1.3605,
"step": 2493
},
{
"epoch": 1.9856801909307875,
"grad_norm": 1.2885679006576538,
"learning_rate": 1.4018315326103094e-08,
"loss": 1.3816,
"step": 2496
},
{
"epoch": 1.9880668257756562,
"grad_norm": 1.2640992403030396,
"learning_rate": 9.735080201922487e-09,
"loss": 1.3142,
"step": 2499
},
{
"epoch": 1.990453460620525,
"grad_norm": 1.2508050203323364,
"learning_rate": 6.2305241171345395e-09,
"loss": 1.4008,
"step": 2502
},
{
"epoch": 1.9928400954653938,
"grad_norm": 1.2504699230194092,
"learning_rate": 3.5047016608613647e-09,
"loss": 1.3362,
"step": 2505
},
{
"epoch": 1.9952267303102627,
"grad_norm": 1.4421279430389404,
"learning_rate": 1.5576552921836574e-09,
"loss": 1.4037,
"step": 2508
},
{
"epoch": 1.9976133651551313,
"grad_norm": 1.2950758934020996,
"learning_rate": 3.89415339491217e-10,
"loss": 1.3817,
"step": 2511
},
{
"epoch": 2.0,
"grad_norm": 1.6945881843566895,
"learning_rate": 0.0,
"loss": 1.4222,
"step": 2514
}
],
"logging_steps": 3,
"max_steps": 2514,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 1257,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.254235526619464e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}