{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 2514, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002386634844868735, "grad_norm": 79.25410461425781, "learning_rate": 2.3809523809523808e-06, "loss": 9.018, "step": 3 }, { "epoch": 0.00477326968973747, "grad_norm": 69.27806854248047, "learning_rate": 4.7619047619047615e-06, "loss": 8.1397, "step": 6 }, { "epoch": 0.007159904534606206, "grad_norm": 44.50214767456055, "learning_rate": 7.142857142857143e-06, "loss": 6.5055, "step": 9 }, { "epoch": 0.00954653937947494, "grad_norm": 22.070268630981445, "learning_rate": 9.523809523809523e-06, "loss": 5.3274, "step": 12 }, { "epoch": 0.011933174224343675, "grad_norm": 17.811525344848633, "learning_rate": 1.1904761904761905e-05, "loss": 5.0629, "step": 15 }, { "epoch": 0.014319809069212411, "grad_norm": 13.21521282196045, "learning_rate": 1.4285714285714285e-05, "loss": 4.8887, "step": 18 }, { "epoch": 0.016706443914081145, "grad_norm": 85.0751724243164, "learning_rate": 1.6666666666666667e-05, "loss": 4.6865, "step": 21 }, { "epoch": 0.01909307875894988, "grad_norm": 11.005032539367676, "learning_rate": 1.9047619047619046e-05, "loss": 4.4609, "step": 24 }, { "epoch": 0.021479713603818614, "grad_norm": 9.891936302185059, "learning_rate": 2.1428571428571428e-05, "loss": 4.4208, "step": 27 }, { "epoch": 0.02386634844868735, "grad_norm": 6.92100715637207, "learning_rate": 2.380952380952381e-05, "loss": 4.3251, "step": 30 }, { "epoch": 0.026252983293556086, "grad_norm": 4.971158504486084, "learning_rate": 2.6190476190476192e-05, "loss": 3.9813, "step": 33 }, { "epoch": 0.028639618138424822, "grad_norm": 4.140079021453857, "learning_rate": 2.857142857142857e-05, "loss": 4.0783, "step": 36 }, { "epoch": 0.031026252983293555, "grad_norm": 4.761113166809082, "learning_rate": 3.095238095238095e-05, "loss": 4.1418, "step": 39 }, { "epoch": 0.03341288782816229, "grad_norm": 5.746503829956055, "learning_rate": 3.3333333333333335e-05, "loss": 3.8944, "step": 42 }, { "epoch": 0.03579952267303103, "grad_norm": 4.554972171783447, "learning_rate": 3.571428571428572e-05, "loss": 3.7807, "step": 45 }, { "epoch": 0.03818615751789976, "grad_norm": 3.873955726623535, "learning_rate": 3.809523809523809e-05, "loss": 3.607, "step": 48 }, { "epoch": 0.0405727923627685, "grad_norm": 3.376633644104004, "learning_rate": 4.047619047619048e-05, "loss": 3.4685, "step": 51 }, { "epoch": 0.04295942720763723, "grad_norm": 5.888181686401367, "learning_rate": 4.2857142857142856e-05, "loss": 3.4832, "step": 54 }, { "epoch": 0.045346062052505964, "grad_norm": 5.094118595123291, "learning_rate": 4.523809523809524e-05, "loss": 3.3647, "step": 57 }, { "epoch": 0.0477326968973747, "grad_norm": 4.940067291259766, "learning_rate": 4.761904761904762e-05, "loss": 3.2268, "step": 60 }, { "epoch": 0.050119331742243436, "grad_norm": 3.514155149459839, "learning_rate": 5e-05, "loss": 3.1229, "step": 63 }, { "epoch": 0.05250596658711217, "grad_norm": 3.922811985015869, "learning_rate": 5.2380952380952384e-05, "loss": 3.0612, "step": 66 }, { "epoch": 0.05489260143198091, "grad_norm": 3.203274726867676, "learning_rate": 5.4761904761904766e-05, "loss": 2.8537, "step": 69 }, { "epoch": 0.057279236276849645, "grad_norm": 2.8945529460906982, "learning_rate": 5.714285714285714e-05, "loss": 2.9128, "step": 72 }, { "epoch": 0.059665871121718374, "grad_norm": 3.287409543991089, "learning_rate": 5.9523809523809524e-05, "loss": 2.8958, "step": 75 }, { "epoch": 0.06205250596658711, "grad_norm": 3.5249319076538086, "learning_rate": 6.19047619047619e-05, "loss": 2.8925, "step": 78 }, { "epoch": 0.06443914081145585, "grad_norm": 3.4028635025024414, "learning_rate": 6.428571428571429e-05, "loss": 2.8153, "step": 81 }, { "epoch": 0.06682577565632458, "grad_norm": 3.1509742736816406, "learning_rate": 6.666666666666667e-05, "loss": 2.7562, "step": 84 }, { "epoch": 0.06921241050119331, "grad_norm": 2.745244264602661, "learning_rate": 6.904761904761905e-05, "loss": 2.6552, "step": 87 }, { "epoch": 0.07159904534606205, "grad_norm": 2.562229633331299, "learning_rate": 7.142857142857143e-05, "loss": 2.6686, "step": 90 }, { "epoch": 0.07398568019093078, "grad_norm": 3.0172386169433594, "learning_rate": 7.380952380952382e-05, "loss": 2.782, "step": 93 }, { "epoch": 0.07637231503579953, "grad_norm": 3.038167953491211, "learning_rate": 7.619047619047618e-05, "loss": 2.6845, "step": 96 }, { "epoch": 0.07875894988066826, "grad_norm": 3.3530476093292236, "learning_rate": 7.857142857142858e-05, "loss": 2.5982, "step": 99 }, { "epoch": 0.081145584725537, "grad_norm": 2.9540939331054688, "learning_rate": 8.095238095238096e-05, "loss": 2.5005, "step": 102 }, { "epoch": 0.08353221957040573, "grad_norm": 2.9407269954681396, "learning_rate": 8.333333333333334e-05, "loss": 2.5421, "step": 105 }, { "epoch": 0.08591885441527446, "grad_norm": 2.8846545219421387, "learning_rate": 8.571428571428571e-05, "loss": 2.5134, "step": 108 }, { "epoch": 0.0883054892601432, "grad_norm": 3.2429230213165283, "learning_rate": 8.80952380952381e-05, "loss": 2.4542, "step": 111 }, { "epoch": 0.09069212410501193, "grad_norm": 2.5776901245117188, "learning_rate": 9.047619047619048e-05, "loss": 2.5908, "step": 114 }, { "epoch": 0.09307875894988067, "grad_norm": 2.4036340713500977, "learning_rate": 9.285714285714286e-05, "loss": 2.4376, "step": 117 }, { "epoch": 0.0954653937947494, "grad_norm": 3.9573254585266113, "learning_rate": 9.523809523809524e-05, "loss": 2.4042, "step": 120 }, { "epoch": 0.09785202863961814, "grad_norm": 3.6064600944519043, "learning_rate": 9.761904761904762e-05, "loss": 2.5074, "step": 123 }, { "epoch": 0.10023866348448687, "grad_norm": 3.3975508213043213, "learning_rate": 0.0001, "loss": 2.2322, "step": 126 }, { "epoch": 0.1026252983293556, "grad_norm": 2.3529391288757324, "learning_rate": 9.999961058466053e-05, "loss": 2.4629, "step": 129 }, { "epoch": 0.10501193317422435, "grad_norm": 2.434084892272949, "learning_rate": 9.999844234470782e-05, "loss": 2.3296, "step": 132 }, { "epoch": 0.10739856801909307, "grad_norm": 2.450005054473877, "learning_rate": 9.999649529833915e-05, "loss": 2.3464, "step": 135 }, { "epoch": 0.10978520286396182, "grad_norm": 3.4513914585113525, "learning_rate": 9.999376947588288e-05, "loss": 2.4644, "step": 138 }, { "epoch": 0.11217183770883055, "grad_norm": 3.2405099868774414, "learning_rate": 9.999026491979808e-05, "loss": 2.3977, "step": 141 }, { "epoch": 0.11455847255369929, "grad_norm": 1.8948777914047241, "learning_rate": 9.99859816846739e-05, "loss": 2.3372, "step": 144 }, { "epoch": 0.11694510739856802, "grad_norm": 2.3878233432769775, "learning_rate": 9.998091983722863e-05, "loss": 2.3408, "step": 147 }, { "epoch": 0.11933174224343675, "grad_norm": 2.373782157897949, "learning_rate": 9.99750794563087e-05, "loss": 2.1927, "step": 150 }, { "epoch": 0.12171837708830549, "grad_norm": 2.473146677017212, "learning_rate": 9.996846063288747e-05, "loss": 2.2674, "step": 153 }, { "epoch": 0.12410501193317422, "grad_norm": 2.17854905128479, "learning_rate": 9.996106347006379e-05, "loss": 2.4093, "step": 156 }, { "epoch": 0.12649164677804295, "grad_norm": 2.668506622314453, "learning_rate": 9.99528880830604e-05, "loss": 2.234, "step": 159 }, { "epoch": 0.1288782816229117, "grad_norm": 2.30295991897583, "learning_rate": 9.994393459922218e-05, "loss": 2.174, "step": 162 }, { "epoch": 0.13126491646778043, "grad_norm": 2.299704074859619, "learning_rate": 9.993420315801406e-05, "loss": 2.1369, "step": 165 }, { "epoch": 0.13365155131264916, "grad_norm": 4.072019100189209, "learning_rate": 9.992369391101895e-05, "loss": 2.2051, "step": 168 }, { "epoch": 0.1360381861575179, "grad_norm": 2.203756809234619, "learning_rate": 9.991240702193532e-05, "loss": 2.3608, "step": 171 }, { "epoch": 0.13842482100238662, "grad_norm": 2.2594192028045654, "learning_rate": 9.990034266657467e-05, "loss": 2.2503, "step": 174 }, { "epoch": 0.14081145584725538, "grad_norm": 2.214170217514038, "learning_rate": 9.988750103285883e-05, "loss": 2.1698, "step": 177 }, { "epoch": 0.1431980906921241, "grad_norm": 2.1277706623077393, "learning_rate": 9.987388232081694e-05, "loss": 2.2199, "step": 180 }, { "epoch": 0.14558472553699284, "grad_norm": 2.1861696243286133, "learning_rate": 9.985948674258243e-05, "loss": 2.1487, "step": 183 }, { "epoch": 0.14797136038186157, "grad_norm": 2.062450647354126, "learning_rate": 9.984431452238967e-05, "loss": 2.2716, "step": 186 }, { "epoch": 0.15035799522673032, "grad_norm": 2.335073471069336, "learning_rate": 9.982836589657043e-05, "loss": 2.1853, "step": 189 }, { "epoch": 0.15274463007159905, "grad_norm": 1.7519389390945435, "learning_rate": 9.981164111355035e-05, "loss": 2.2452, "step": 192 }, { "epoch": 0.15513126491646778, "grad_norm": 4.789851665496826, "learning_rate": 9.979414043384485e-05, "loss": 2.224, "step": 195 }, { "epoch": 0.1575178997613365, "grad_norm": 2.454585552215576, "learning_rate": 9.977586413005531e-05, "loss": 2.2943, "step": 198 }, { "epoch": 0.15990453460620524, "grad_norm": 2.2278664112091064, "learning_rate": 9.975681248686461e-05, "loss": 2.2947, "step": 201 }, { "epoch": 0.162291169451074, "grad_norm": 2.118945837020874, "learning_rate": 9.973698580103285e-05, "loss": 2.2039, "step": 204 }, { "epoch": 0.16467780429594273, "grad_norm": 2.1315999031066895, "learning_rate": 9.971638438139266e-05, "loss": 2.3493, "step": 207 }, { "epoch": 0.16706443914081145, "grad_norm": 2.902245283126831, "learning_rate": 9.96950085488444e-05, "loss": 2.0854, "step": 210 }, { "epoch": 0.16945107398568018, "grad_norm": 2.160647392272949, "learning_rate": 9.967285863635112e-05, "loss": 2.3386, "step": 213 }, { "epoch": 0.1718377088305489, "grad_norm": 3.592740774154663, "learning_rate": 9.964993498893349e-05, "loss": 2.2387, "step": 216 }, { "epoch": 0.17422434367541767, "grad_norm": 3.9422338008880615, "learning_rate": 9.962623796366429e-05, "loss": 2.2304, "step": 219 }, { "epoch": 0.1766109785202864, "grad_norm": 3.213841438293457, "learning_rate": 9.960176792966289e-05, "loss": 2.1835, "step": 222 }, { "epoch": 0.17899761336515513, "grad_norm": 2.4797472953796387, "learning_rate": 9.95765252680896e-05, "loss": 2.2491, "step": 225 }, { "epoch": 0.18138424821002386, "grad_norm": 2.0377020835876465, "learning_rate": 9.95505103721396e-05, "loss": 2.1921, "step": 228 }, { "epoch": 0.18377088305489261, "grad_norm": 2.2581214904785156, "learning_rate": 9.952372364703687e-05, "loss": 2.2655, "step": 231 }, { "epoch": 0.18615751789976134, "grad_norm": 2.89260196685791, "learning_rate": 9.949616551002787e-05, "loss": 2.1804, "step": 234 }, { "epoch": 0.18854415274463007, "grad_norm": 2.4222378730773926, "learning_rate": 9.946783639037504e-05, "loss": 2.1525, "step": 237 }, { "epoch": 0.1909307875894988, "grad_norm": 2.1491341590881348, "learning_rate": 9.943873672935014e-05, "loss": 2.1605, "step": 240 }, { "epoch": 0.19331742243436753, "grad_norm": 1.9782716035842896, "learning_rate": 9.940886698022734e-05, "loss": 2.0402, "step": 243 }, { "epoch": 0.1957040572792363, "grad_norm": 1.9801812171936035, "learning_rate": 9.93782276082762e-05, "loss": 2.1493, "step": 246 }, { "epoch": 0.19809069212410502, "grad_norm": 1.7495440244674683, "learning_rate": 9.934681909075434e-05, "loss": 2.1087, "step": 249 }, { "epoch": 0.20047732696897375, "grad_norm": 1.9998902082443237, "learning_rate": 9.931464191690015e-05, "loss": 2.1841, "step": 252 }, { "epoch": 0.20286396181384247, "grad_norm": 2.2117197513580322, "learning_rate": 9.928169658792498e-05, "loss": 2.1848, "step": 255 }, { "epoch": 0.2052505966587112, "grad_norm": 2.210057497024536, "learning_rate": 9.924798361700553e-05, "loss": 2.1324, "step": 258 }, { "epoch": 0.20763723150357996, "grad_norm": 1.9545291662216187, "learning_rate": 9.92135035292757e-05, "loss": 2.1454, "step": 261 }, { "epoch": 0.2100238663484487, "grad_norm": 2.3102903366088867, "learning_rate": 9.91782568618185e-05, "loss": 2.183, "step": 264 }, { "epoch": 0.21241050119331742, "grad_norm": 1.937774419784546, "learning_rate": 9.914224416365764e-05, "loss": 2.2033, "step": 267 }, { "epoch": 0.21479713603818615, "grad_norm": 1.8689621686935425, "learning_rate": 9.910546599574902e-05, "loss": 2.3252, "step": 270 }, { "epoch": 0.2171837708830549, "grad_norm": 2.4020369052886963, "learning_rate": 9.906792293097194e-05, "loss": 2.1485, "step": 273 }, { "epoch": 0.21957040572792363, "grad_norm": 4.170408248901367, "learning_rate": 9.90296155541202e-05, "loss": 2.1413, "step": 276 }, { "epoch": 0.22195704057279236, "grad_norm": 1.9683239459991455, "learning_rate": 9.899054446189304e-05, "loss": 1.9998, "step": 279 }, { "epoch": 0.2243436754176611, "grad_norm": 1.9885108470916748, "learning_rate": 9.895071026288574e-05, "loss": 2.0373, "step": 282 }, { "epoch": 0.22673031026252982, "grad_norm": 2.297595977783203, "learning_rate": 9.891011357758022e-05, "loss": 2.0459, "step": 285 }, { "epoch": 0.22911694510739858, "grad_norm": 2.0850205421447754, "learning_rate": 9.886875503833536e-05, "loss": 2.2041, "step": 288 }, { "epoch": 0.2315035799522673, "grad_norm": 2.3734419345855713, "learning_rate": 9.882663528937717e-05, "loss": 2.1135, "step": 291 }, { "epoch": 0.23389021479713604, "grad_norm": 2.0763649940490723, "learning_rate": 9.87837549867887e-05, "loss": 1.955, "step": 294 }, { "epoch": 0.23627684964200477, "grad_norm": 2.171724557876587, "learning_rate": 9.87401147984998e-05, "loss": 2.2409, "step": 297 }, { "epoch": 0.2386634844868735, "grad_norm": 2.6933743953704834, "learning_rate": 9.869571540427689e-05, "loss": 2.1241, "step": 300 }, { "epoch": 0.24105011933174225, "grad_norm": 1.917708396911621, "learning_rate": 9.865055749571215e-05, "loss": 2.1745, "step": 303 }, { "epoch": 0.24343675417661098, "grad_norm": 2.3014838695526123, "learning_rate": 9.860464177621284e-05, "loss": 1.9313, "step": 306 }, { "epoch": 0.2458233890214797, "grad_norm": 2.2263333797454834, "learning_rate": 9.855796896099045e-05, "loss": 2.1881, "step": 309 }, { "epoch": 0.24821002386634844, "grad_norm": 2.161557197570801, "learning_rate": 9.851053977704931e-05, "loss": 1.9702, "step": 312 }, { "epoch": 0.25059665871121717, "grad_norm": 2.1806318759918213, "learning_rate": 9.846235496317555e-05, "loss": 1.9893, "step": 315 }, { "epoch": 0.2529832935560859, "grad_norm": 2.226500988006592, "learning_rate": 9.841341526992536e-05, "loss": 2.0987, "step": 318 }, { "epoch": 0.2553699284009546, "grad_norm": 1.8919053077697754, "learning_rate": 9.836372145961345e-05, "loss": 2.1568, "step": 321 }, { "epoch": 0.2577565632458234, "grad_norm": 1.9861871004104614, "learning_rate": 9.83132743063011e-05, "loss": 2.1504, "step": 324 }, { "epoch": 0.26014319809069214, "grad_norm": 2.438837766647339, "learning_rate": 9.826207459578411e-05, "loss": 1.9779, "step": 327 }, { "epoch": 0.26252983293556087, "grad_norm": 2.6724672317504883, "learning_rate": 9.821012312558058e-05, "loss": 1.9427, "step": 330 }, { "epoch": 0.2649164677804296, "grad_norm": 1.813621163368225, "learning_rate": 9.815742070491852e-05, "loss": 2.1153, "step": 333 }, { "epoch": 0.26730310262529833, "grad_norm": 2.0258703231811523, "learning_rate": 9.810396815472314e-05, "loss": 2.02, "step": 336 }, { "epoch": 0.26968973747016706, "grad_norm": 1.9743555784225464, "learning_rate": 9.804976630760419e-05, "loss": 2.0681, "step": 339 }, { "epoch": 0.2720763723150358, "grad_norm": 1.8202836513519287, "learning_rate": 9.799481600784286e-05, "loss": 1.9733, "step": 342 }, { "epoch": 0.2744630071599045, "grad_norm": 2.296698808670044, "learning_rate": 9.793911811137875e-05, "loss": 2.0407, "step": 345 }, { "epoch": 0.27684964200477324, "grad_norm": 2.002511501312256, "learning_rate": 9.788267348579648e-05, "loss": 2.0666, "step": 348 }, { "epoch": 0.27923627684964203, "grad_norm": 2.08190655708313, "learning_rate": 9.782548301031217e-05, "loss": 2.0803, "step": 351 }, { "epoch": 0.28162291169451076, "grad_norm": 2.150238513946533, "learning_rate": 9.776754757575975e-05, "loss": 2.0132, "step": 354 }, { "epoch": 0.2840095465393795, "grad_norm": 2.2527501583099365, "learning_rate": 9.770886808457709e-05, "loss": 2.0905, "step": 357 }, { "epoch": 0.2863961813842482, "grad_norm": 2.284032106399536, "learning_rate": 9.764944545079196e-05, "loss": 2.1845, "step": 360 }, { "epoch": 0.28878281622911695, "grad_norm": 1.89547598361969, "learning_rate": 9.758928060000778e-05, "loss": 2.1135, "step": 363 }, { "epoch": 0.2911694510739857, "grad_norm": 2.5930140018463135, "learning_rate": 9.752837446938915e-05, "loss": 1.9517, "step": 366 }, { "epoch": 0.2935560859188544, "grad_norm": 1.86557137966156, "learning_rate": 9.746672800764735e-05, "loss": 1.9579, "step": 369 }, { "epoch": 0.29594272076372313, "grad_norm": 1.8821232318878174, "learning_rate": 9.740434217502547e-05, "loss": 1.9665, "step": 372 }, { "epoch": 0.29832935560859186, "grad_norm": 2.0216808319091797, "learning_rate": 9.734121794328357e-05, "loss": 2.0612, "step": 375 }, { "epoch": 0.30071599045346065, "grad_norm": 1.8245267868041992, "learning_rate": 9.727735629568336e-05, "loss": 2.0857, "step": 378 }, { "epoch": 0.3031026252983294, "grad_norm": 1.9001407623291016, "learning_rate": 9.721275822697306e-05, "loss": 1.9954, "step": 381 }, { "epoch": 0.3054892601431981, "grad_norm": 1.7628698348999023, "learning_rate": 9.714742474337186e-05, "loss": 2.1095, "step": 384 }, { "epoch": 0.30787589498806683, "grad_norm": 1.687436819076538, "learning_rate": 9.708135686255416e-05, "loss": 2.1919, "step": 387 }, { "epoch": 0.31026252983293556, "grad_norm": 2.219071388244629, "learning_rate": 9.701455561363379e-05, "loss": 1.9392, "step": 390 }, { "epoch": 0.3126491646778043, "grad_norm": 1.9545831680297852, "learning_rate": 9.6947022037148e-05, "loss": 1.9879, "step": 393 }, { "epoch": 0.315035799522673, "grad_norm": 2.1863789558410645, "learning_rate": 9.687875718504126e-05, "loss": 1.9631, "step": 396 }, { "epoch": 0.31742243436754175, "grad_norm": 3.1137101650238037, "learning_rate": 9.680976212064874e-05, "loss": 2.0387, "step": 399 }, { "epoch": 0.3198090692124105, "grad_norm": 1.9021557569503784, "learning_rate": 9.674003791867991e-05, "loss": 2.0447, "step": 402 }, { "epoch": 0.3221957040572792, "grad_norm": 1.8783704042434692, "learning_rate": 9.666958566520174e-05, "loss": 2.0777, "step": 405 }, { "epoch": 0.324582338902148, "grad_norm": 1.910521388053894, "learning_rate": 9.659840645762175e-05, "loss": 2.1084, "step": 408 }, { "epoch": 0.3269689737470167, "grad_norm": 1.9456645250320435, "learning_rate": 9.652650140467093e-05, "loss": 2.0317, "step": 411 }, { "epoch": 0.32935560859188545, "grad_norm": 1.8211055994033813, "learning_rate": 9.645387162638652e-05, "loss": 2.0386, "step": 414 }, { "epoch": 0.3317422434367542, "grad_norm": 2.032345771789551, "learning_rate": 9.638051825409453e-05, "loss": 2.2154, "step": 417 }, { "epoch": 0.3341288782816229, "grad_norm": 1.869388461112976, "learning_rate": 9.630644243039207e-05, "loss": 1.9595, "step": 420 }, { "epoch": 0.33651551312649164, "grad_norm": 1.9663021564483643, "learning_rate": 9.623164530912963e-05, "loss": 2.1678, "step": 423 }, { "epoch": 0.33890214797136037, "grad_norm": 1.684556484222412, "learning_rate": 9.615612805539305e-05, "loss": 1.9458, "step": 426 }, { "epoch": 0.3412887828162291, "grad_norm": 2.050321102142334, "learning_rate": 9.607989184548543e-05, "loss": 2.0412, "step": 429 }, { "epoch": 0.3436754176610978, "grad_norm": 1.753670334815979, "learning_rate": 9.600293786690872e-05, "loss": 2.037, "step": 432 }, { "epoch": 0.3460620525059666, "grad_norm": 1.8368828296661377, "learning_rate": 9.592526731834537e-05, "loss": 2.1845, "step": 435 }, { "epoch": 0.34844868735083534, "grad_norm": 1.8000643253326416, "learning_rate": 9.584688140963944e-05, "loss": 1.8592, "step": 438 }, { "epoch": 0.35083532219570407, "grad_norm": 1.7274373769760132, "learning_rate": 9.576778136177798e-05, "loss": 2.0366, "step": 441 }, { "epoch": 0.3532219570405728, "grad_norm": 1.8421188592910767, "learning_rate": 9.568796840687184e-05, "loss": 1.9281, "step": 444 }, { "epoch": 0.3556085918854415, "grad_norm": 1.8605895042419434, "learning_rate": 9.560744378813659e-05, "loss": 2.0214, "step": 447 }, { "epoch": 0.35799522673031026, "grad_norm": 2.4788737297058105, "learning_rate": 9.552620875987311e-05, "loss": 1.9457, "step": 450 }, { "epoch": 0.360381861575179, "grad_norm": 3.01055645942688, "learning_rate": 9.544426458744804e-05, "loss": 1.879, "step": 453 }, { "epoch": 0.3627684964200477, "grad_norm": 2.6994807720184326, "learning_rate": 9.536161254727408e-05, "loss": 1.9495, "step": 456 }, { "epoch": 0.36515513126491644, "grad_norm": 1.736722469329834, "learning_rate": 9.527825392679012e-05, "loss": 1.942, "step": 459 }, { "epoch": 0.36754176610978523, "grad_norm": 2.2707650661468506, "learning_rate": 9.51941900244412e-05, "loss": 1.9525, "step": 462 }, { "epoch": 0.36992840095465396, "grad_norm": 1.7479629516601562, "learning_rate": 9.51094221496582e-05, "loss": 1.9385, "step": 465 }, { "epoch": 0.3723150357995227, "grad_norm": 2.1110448837280273, "learning_rate": 9.502395162283759e-05, "loss": 1.8335, "step": 468 }, { "epoch": 0.3747016706443914, "grad_norm": 1.8206532001495361, "learning_rate": 9.493777977532072e-05, "loss": 1.9642, "step": 471 }, { "epoch": 0.37708830548926014, "grad_norm": 2.622044801712036, "learning_rate": 9.485090794937319e-05, "loss": 1.9383, "step": 474 }, { "epoch": 0.3794749403341289, "grad_norm": 2.016352653503418, "learning_rate": 9.476333749816382e-05, "loss": 1.8639, "step": 477 }, { "epoch": 0.3818615751789976, "grad_norm": 2.027357816696167, "learning_rate": 9.467506978574371e-05, "loss": 1.862, "step": 480 }, { "epoch": 0.38424821002386633, "grad_norm": 1.91681969165802, "learning_rate": 9.45861061870249e-05, "loss": 2.0105, "step": 483 }, { "epoch": 0.38663484486873506, "grad_norm": 1.8423808813095093, "learning_rate": 9.449644808775902e-05, "loss": 1.9303, "step": 486 }, { "epoch": 0.38902147971360385, "grad_norm": 1.5539026260375977, "learning_rate": 9.44060968845156e-05, "loss": 1.9071, "step": 489 }, { "epoch": 0.3914081145584726, "grad_norm": 1.8300362825393677, "learning_rate": 9.431505398466045e-05, "loss": 2.0194, "step": 492 }, { "epoch": 0.3937947494033413, "grad_norm": 1.6660507917404175, "learning_rate": 9.42233208063336e-05, "loss": 1.8226, "step": 495 }, { "epoch": 0.39618138424821003, "grad_norm": 1.9865005016326904, "learning_rate": 9.413089877842736e-05, "loss": 2.0689, "step": 498 }, { "epoch": 0.39856801909307876, "grad_norm": 1.9386086463928223, "learning_rate": 9.403778934056391e-05, "loss": 2.0289, "step": 501 }, { "epoch": 0.4009546539379475, "grad_norm": 1.9398423433303833, "learning_rate": 9.394399394307303e-05, "loss": 2.2213, "step": 504 }, { "epoch": 0.4033412887828162, "grad_norm": 1.864970326423645, "learning_rate": 9.384951404696933e-05, "loss": 1.8574, "step": 507 }, { "epoch": 0.40572792362768495, "grad_norm": 1.9465175867080688, "learning_rate": 9.375435112392969e-05, "loss": 2.0628, "step": 510 }, { "epoch": 0.4081145584725537, "grad_norm": 1.808294415473938, "learning_rate": 9.365850665627016e-05, "loss": 1.9223, "step": 513 }, { "epoch": 0.4105011933174224, "grad_norm": 2.4403979778289795, "learning_rate": 9.356198213692297e-05, "loss": 1.8865, "step": 516 }, { "epoch": 0.4128878281622912, "grad_norm": 1.8101935386657715, "learning_rate": 9.346477906941331e-05, "loss": 1.8335, "step": 519 }, { "epoch": 0.4152744630071599, "grad_norm": 1.718172311782837, "learning_rate": 9.336689896783573e-05, "loss": 1.8691, "step": 522 }, { "epoch": 0.41766109785202865, "grad_norm": 1.8754642009735107, "learning_rate": 9.32683433568308e-05, "loss": 1.9212, "step": 525 }, { "epoch": 0.4200477326968974, "grad_norm": 2.471646785736084, "learning_rate": 9.316911377156117e-05, "loss": 2.0121, "step": 528 }, { "epoch": 0.4224343675417661, "grad_norm": 2.1250839233398438, "learning_rate": 9.306921175768775e-05, "loss": 1.9309, "step": 531 }, { "epoch": 0.42482100238663484, "grad_norm": 2.0609402656555176, "learning_rate": 9.29686388713456e-05, "loss": 2.1438, "step": 534 }, { "epoch": 0.42720763723150357, "grad_norm": 1.7126508951187134, "learning_rate": 9.286739667911972e-05, "loss": 1.9621, "step": 537 }, { "epoch": 0.4295942720763723, "grad_norm": 1.7493348121643066, "learning_rate": 9.276548675802059e-05, "loss": 2.0063, "step": 540 }, { "epoch": 0.431980906921241, "grad_norm": 1.8076331615447998, "learning_rate": 9.266291069545972e-05, "loss": 1.9256, "step": 543 }, { "epoch": 0.4343675417661098, "grad_norm": 1.6762983798980713, "learning_rate": 9.255967008922474e-05, "loss": 2.0052, "step": 546 }, { "epoch": 0.43675417661097854, "grad_norm": 1.4335881471633911, "learning_rate": 9.245576654745471e-05, "loss": 2.0899, "step": 549 }, { "epoch": 0.43914081145584727, "grad_norm": 1.6479798555374146, "learning_rate": 9.235120168861496e-05, "loss": 1.7962, "step": 552 }, { "epoch": 0.441527446300716, "grad_norm": 2.1050121784210205, "learning_rate": 9.224597714147186e-05, "loss": 2.0109, "step": 555 }, { "epoch": 0.4439140811455847, "grad_norm": 1.6112617254257202, "learning_rate": 9.214009454506753e-05, "loss": 1.8432, "step": 558 }, { "epoch": 0.44630071599045346, "grad_norm": 1.8361741304397583, "learning_rate": 9.203355554869428e-05, "loss": 1.9433, "step": 561 }, { "epoch": 0.4486873508353222, "grad_norm": 3.137519121170044, "learning_rate": 9.192636181186888e-05, "loss": 1.7776, "step": 564 }, { "epoch": 0.4510739856801909, "grad_norm": 1.79214346408844, "learning_rate": 9.181851500430673e-05, "loss": 1.8203, "step": 567 }, { "epoch": 0.45346062052505964, "grad_norm": 2.010784149169922, "learning_rate": 9.171001680589588e-05, "loss": 1.8505, "step": 570 }, { "epoch": 0.45584725536992843, "grad_norm": 2.2128775119781494, "learning_rate": 9.160086890667086e-05, "loss": 1.9007, "step": 573 }, { "epoch": 0.45823389021479716, "grad_norm": 1.6658575534820557, "learning_rate": 9.14910730067863e-05, "loss": 1.9516, "step": 576 }, { "epoch": 0.4606205250596659, "grad_norm": 1.6554961204528809, "learning_rate": 9.138063081649051e-05, "loss": 2.0685, "step": 579 }, { "epoch": 0.4630071599045346, "grad_norm": 1.6959861516952515, "learning_rate": 9.126954405609882e-05, "loss": 1.9156, "step": 582 }, { "epoch": 0.46539379474940334, "grad_norm": 1.795530915260315, "learning_rate": 9.115781445596676e-05, "loss": 1.7886, "step": 585 }, { "epoch": 0.4677804295942721, "grad_norm": 1.9987218379974365, "learning_rate": 9.104544375646313e-05, "loss": 2.1443, "step": 588 }, { "epoch": 0.4701670644391408, "grad_norm": 1.8751397132873535, "learning_rate": 9.093243370794291e-05, "loss": 1.8975, "step": 591 }, { "epoch": 0.47255369928400953, "grad_norm": 1.921032428741455, "learning_rate": 9.081878607071996e-05, "loss": 1.9969, "step": 594 }, { "epoch": 0.47494033412887826, "grad_norm": 4.745877742767334, "learning_rate": 9.07045026150396e-05, "loss": 1.9351, "step": 597 }, { "epoch": 0.477326968973747, "grad_norm": 1.8512818813323975, "learning_rate": 9.058958512105104e-05, "loss": 1.8773, "step": 600 }, { "epoch": 0.4797136038186158, "grad_norm": 2.7076666355133057, "learning_rate": 9.047403537877971e-05, "loss": 1.8907, "step": 603 }, { "epoch": 0.4821002386634845, "grad_norm": 1.9602298736572266, "learning_rate": 9.035785518809927e-05, "loss": 1.955, "step": 606 }, { "epoch": 0.48448687350835323, "grad_norm": 1.8718470335006714, "learning_rate": 9.024104635870368e-05, "loss": 1.9148, "step": 609 }, { "epoch": 0.48687350835322196, "grad_norm": 1.708706259727478, "learning_rate": 9.012361071007891e-05, "loss": 1.9202, "step": 612 }, { "epoch": 0.4892601431980907, "grad_norm": 2.0447752475738525, "learning_rate": 9.000555007147469e-05, "loss": 1.9347, "step": 615 }, { "epoch": 0.4916467780429594, "grad_norm": 2.0071353912353516, "learning_rate": 8.988686628187597e-05, "loss": 1.9392, "step": 618 }, { "epoch": 0.49403341288782815, "grad_norm": 1.9587384462356567, "learning_rate": 8.976756118997427e-05, "loss": 1.9572, "step": 621 }, { "epoch": 0.4964200477326969, "grad_norm": 2.829688549041748, "learning_rate": 8.964763665413893e-05, "loss": 1.7861, "step": 624 }, { "epoch": 0.4988066825775656, "grad_norm": 1.878251075744629, "learning_rate": 8.952709454238808e-05, "loss": 2.0488, "step": 627 }, { "epoch": 0.5011933174224343, "grad_norm": 1.6959253549575806, "learning_rate": 8.940593673235962e-05, "loss": 2.0164, "step": 630 }, { "epoch": 0.5035799522673031, "grad_norm": 1.8719433546066284, "learning_rate": 8.928416511128195e-05, "loss": 1.8288, "step": 633 }, { "epoch": 0.5059665871121718, "grad_norm": 1.9740793704986572, "learning_rate": 8.916178157594453e-05, "loss": 1.8924, "step": 636 }, { "epoch": 0.5083532219570406, "grad_norm": 2.177084445953369, "learning_rate": 8.903878803266841e-05, "loss": 2.0888, "step": 639 }, { "epoch": 0.5107398568019093, "grad_norm": 1.6153382062911987, "learning_rate": 8.891518639727649e-05, "loss": 1.7814, "step": 642 }, { "epoch": 0.513126491646778, "grad_norm": 2.0001626014709473, "learning_rate": 8.879097859506372e-05, "loss": 1.9432, "step": 645 }, { "epoch": 0.5155131264916468, "grad_norm": 2.0495731830596924, "learning_rate": 8.866616656076696e-05, "loss": 1.8339, "step": 648 }, { "epoch": 0.5178997613365155, "grad_norm": 1.7960389852523804, "learning_rate": 8.854075223853508e-05, "loss": 1.9401, "step": 651 }, { "epoch": 0.5202863961813843, "grad_norm": 1.9565211534500122, "learning_rate": 8.841473758189854e-05, "loss": 1.8112, "step": 654 }, { "epoch": 0.522673031026253, "grad_norm": 2.022819995880127, "learning_rate": 8.828812455373891e-05, "loss": 2.0094, "step": 657 }, { "epoch": 0.5250596658711217, "grad_norm": 1.8336989879608154, "learning_rate": 8.816091512625843e-05, "loss": 1.9463, "step": 660 }, { "epoch": 0.5274463007159904, "grad_norm": 2.031402111053467, "learning_rate": 8.803311128094918e-05, "loss": 1.9657, "step": 663 }, { "epoch": 0.5298329355608592, "grad_norm": 2.0874717235565186, "learning_rate": 8.790471500856228e-05, "loss": 2.0375, "step": 666 }, { "epoch": 0.5322195704057279, "grad_norm": 1.7618159055709839, "learning_rate": 8.777572830907684e-05, "loss": 2.1104, "step": 669 }, { "epoch": 0.5346062052505967, "grad_norm": 1.6236835718154907, "learning_rate": 8.764615319166886e-05, "loss": 2.0333, "step": 672 }, { "epoch": 0.5369928400954654, "grad_norm": 1.613146424293518, "learning_rate": 8.751599167467985e-05, "loss": 1.8055, "step": 675 }, { "epoch": 0.5393794749403341, "grad_norm": 1.5570297241210938, "learning_rate": 8.738524578558547e-05, "loss": 1.8801, "step": 678 }, { "epoch": 0.5417661097852029, "grad_norm": 1.7564152479171753, "learning_rate": 8.72539175609639e-05, "loss": 1.8072, "step": 681 }, { "epoch": 0.5441527446300716, "grad_norm": 1.7274627685546875, "learning_rate": 8.712200904646416e-05, "loss": 1.787, "step": 684 }, { "epoch": 0.5465393794749404, "grad_norm": 2.1072866916656494, "learning_rate": 8.698952229677422e-05, "loss": 1.8179, "step": 687 }, { "epoch": 0.548926014319809, "grad_norm": 2.1879334449768066, "learning_rate": 8.685645937558896e-05, "loss": 2.0755, "step": 690 }, { "epoch": 0.5513126491646778, "grad_norm": 1.7754478454589844, "learning_rate": 8.67228223555781e-05, "loss": 1.8438, "step": 693 }, { "epoch": 0.5536992840095465, "grad_norm": 2.0312633514404297, "learning_rate": 8.658861331835385e-05, "loss": 1.9058, "step": 696 }, { "epoch": 0.5560859188544153, "grad_norm": 1.9634501934051514, "learning_rate": 8.645383435443852e-05, "loss": 1.8278, "step": 699 }, { "epoch": 0.5584725536992841, "grad_norm": 1.915229082107544, "learning_rate": 8.631848756323197e-05, "loss": 1.9127, "step": 702 }, { "epoch": 0.5608591885441527, "grad_norm": 1.7499381303787231, "learning_rate": 8.618257505297886e-05, "loss": 1.9196, "step": 705 }, { "epoch": 0.5632458233890215, "grad_norm": 2.883967161178589, "learning_rate": 8.604609894073584e-05, "loss": 1.7157, "step": 708 }, { "epoch": 0.5656324582338902, "grad_norm": 1.8204519748687744, "learning_rate": 8.590906135233854e-05, "loss": 1.845, "step": 711 }, { "epoch": 0.568019093078759, "grad_norm": 3.1434381008148193, "learning_rate": 8.577146442236857e-05, "loss": 1.9142, "step": 714 }, { "epoch": 0.5704057279236276, "grad_norm": 1.7107082605361938, "learning_rate": 8.563331029412012e-05, "loss": 1.9358, "step": 717 }, { "epoch": 0.5727923627684964, "grad_norm": 2.010882616043091, "learning_rate": 8.549460111956664e-05, "loss": 1.8745, "step": 720 }, { "epoch": 0.5751789976133651, "grad_norm": 2.873108148574829, "learning_rate": 8.535533905932738e-05, "loss": 1.8725, "step": 723 }, { "epoch": 0.5775656324582339, "grad_norm": 2.0034258365631104, "learning_rate": 8.521552628263362e-05, "loss": 1.8666, "step": 726 }, { "epoch": 0.5799522673031027, "grad_norm": 1.7691940069198608, "learning_rate": 8.507516496729495e-05, "loss": 1.8096, "step": 729 }, { "epoch": 0.5823389021479713, "grad_norm": 1.77200186252594, "learning_rate": 8.493425729966534e-05, "loss": 1.9294, "step": 732 }, { "epoch": 0.5847255369928401, "grad_norm": 1.6291214227676392, "learning_rate": 8.479280547460907e-05, "loss": 1.8242, "step": 735 }, { "epoch": 0.5871121718377088, "grad_norm": 1.5914812088012695, "learning_rate": 8.465081169546659e-05, "loss": 1.9836, "step": 738 }, { "epoch": 0.5894988066825776, "grad_norm": 1.6103566884994507, "learning_rate": 8.450827817402011e-05, "loss": 1.9699, "step": 741 }, { "epoch": 0.5918854415274463, "grad_norm": 2.9773311614990234, "learning_rate": 8.436520713045922e-05, "loss": 1.7708, "step": 744 }, { "epoch": 0.594272076372315, "grad_norm": 1.5203982591629028, "learning_rate": 8.422160079334628e-05, "loss": 1.8533, "step": 747 }, { "epoch": 0.5966587112171837, "grad_norm": 3.8291232585906982, "learning_rate": 8.40774613995817e-05, "loss": 1.8964, "step": 750 }, { "epoch": 0.5990453460620525, "grad_norm": 1.650585412979126, "learning_rate": 8.393279119436912e-05, "loss": 1.9163, "step": 753 }, { "epoch": 0.6014319809069213, "grad_norm": 1.588335633277893, "learning_rate": 8.378759243118044e-05, "loss": 2.0618, "step": 756 }, { "epoch": 0.60381861575179, "grad_norm": 1.9786241054534912, "learning_rate": 8.364186737172068e-05, "loss": 1.8235, "step": 759 }, { "epoch": 0.6062052505966588, "grad_norm": 1.7208276987075806, "learning_rate": 8.349561828589277e-05, "loss": 2.0045, "step": 762 }, { "epoch": 0.6085918854415274, "grad_norm": 1.713976502418518, "learning_rate": 8.33488474517622e-05, "loss": 1.8602, "step": 765 }, { "epoch": 0.6109785202863962, "grad_norm": 1.655760407447815, "learning_rate": 8.320155715552155e-05, "loss": 1.8096, "step": 768 }, { "epoch": 0.6133651551312649, "grad_norm": 1.82340669631958, "learning_rate": 8.305374969145488e-05, "loss": 1.9755, "step": 771 }, { "epoch": 0.6157517899761337, "grad_norm": 1.6505107879638672, "learning_rate": 8.290542736190188e-05, "loss": 1.7543, "step": 774 }, { "epoch": 0.6181384248210023, "grad_norm": 1.6107587814331055, "learning_rate": 8.275659247722222e-05, "loss": 1.7788, "step": 777 }, { "epoch": 0.6205250596658711, "grad_norm": 1.7392557859420776, "learning_rate": 8.260724735575933e-05, "loss": 1.8713, "step": 780 }, { "epoch": 0.6229116945107399, "grad_norm": 1.8423359394073486, "learning_rate": 8.24573943238045e-05, "loss": 1.9501, "step": 783 }, { "epoch": 0.6252983293556086, "grad_norm": 1.596801996231079, "learning_rate": 8.230703571556048e-05, "loss": 1.7561, "step": 786 }, { "epoch": 0.6276849642004774, "grad_norm": 1.6264513731002808, "learning_rate": 8.215617387310524e-05, "loss": 1.812, "step": 789 }, { "epoch": 0.630071599045346, "grad_norm": 1.8065801858901978, "learning_rate": 8.200481114635536e-05, "loss": 1.8587, "step": 792 }, { "epoch": 0.6324582338902148, "grad_norm": 1.6027936935424805, "learning_rate": 8.185294989302958e-05, "loss": 1.7951, "step": 795 }, { "epoch": 0.6348448687350835, "grad_norm": 1.863053560256958, "learning_rate": 8.170059247861194e-05, "loss": 1.791, "step": 798 }, { "epoch": 0.6372315035799523, "grad_norm": 1.7930762767791748, "learning_rate": 8.154774127631501e-05, "loss": 1.7575, "step": 801 }, { "epoch": 0.639618138424821, "grad_norm": 2.0538759231567383, "learning_rate": 8.139439866704293e-05, "loss": 1.8417, "step": 804 }, { "epoch": 0.6420047732696897, "grad_norm": 2.5710806846618652, "learning_rate": 8.124056703935423e-05, "loss": 1.8187, "step": 807 }, { "epoch": 0.6443914081145584, "grad_norm": 1.6980230808258057, "learning_rate": 8.108624878942477e-05, "loss": 1.8364, "step": 810 }, { "epoch": 0.6467780429594272, "grad_norm": 1.7313123941421509, "learning_rate": 8.093144632101026e-05, "loss": 1.7538, "step": 813 }, { "epoch": 0.649164677804296, "grad_norm": 1.6911081075668335, "learning_rate": 8.077616204540897e-05, "loss": 1.8258, "step": 816 }, { "epoch": 0.6515513126491647, "grad_norm": 1.9907560348510742, "learning_rate": 8.062039838142402e-05, "loss": 1.7978, "step": 819 }, { "epoch": 0.6539379474940334, "grad_norm": 1.9501363039016724, "learning_rate": 8.046415775532585e-05, "loss": 1.8116, "step": 822 }, { "epoch": 0.6563245823389021, "grad_norm": 1.7937824726104736, "learning_rate": 8.030744260081426e-05, "loss": 1.8347, "step": 825 }, { "epoch": 0.6587112171837709, "grad_norm": 1.682985782623291, "learning_rate": 8.015025535898073e-05, "loss": 1.8879, "step": 828 }, { "epoch": 0.6610978520286396, "grad_norm": 1.7274394035339355, "learning_rate": 7.999259847827015e-05, "loss": 1.8931, "step": 831 }, { "epoch": 0.6634844868735084, "grad_norm": 1.7429416179656982, "learning_rate": 7.983447441444281e-05, "loss": 1.7171, "step": 834 }, { "epoch": 0.665871121718377, "grad_norm": 1.949879765510559, "learning_rate": 7.967588563053616e-05, "loss": 1.7779, "step": 837 }, { "epoch": 0.6682577565632458, "grad_norm": 1.5538753271102905, "learning_rate": 7.951683459682641e-05, "loss": 1.8087, "step": 840 }, { "epoch": 0.6706443914081146, "grad_norm": 1.7967875003814697, "learning_rate": 7.935732379079008e-05, "loss": 2.0304, "step": 843 }, { "epoch": 0.6730310262529833, "grad_norm": 1.7717353105545044, "learning_rate": 7.919735569706533e-05, "loss": 1.904, "step": 846 }, { "epoch": 0.6754176610978521, "grad_norm": 1.6083266735076904, "learning_rate": 7.903693280741331e-05, "loss": 1.8501, "step": 849 }, { "epoch": 0.6778042959427207, "grad_norm": 1.6468119621276855, "learning_rate": 7.887605762067945e-05, "loss": 1.7535, "step": 852 }, { "epoch": 0.6801909307875895, "grad_norm": 1.6218470335006714, "learning_rate": 7.871473264275429e-05, "loss": 1.7495, "step": 855 }, { "epoch": 0.6825775656324582, "grad_norm": 1.6359236240386963, "learning_rate": 7.855296038653475e-05, "loss": 2.0507, "step": 858 }, { "epoch": 0.684964200477327, "grad_norm": 1.4922749996185303, "learning_rate": 7.83907433718847e-05, "loss": 1.7931, "step": 861 }, { "epoch": 0.6873508353221957, "grad_norm": 1.5041239261627197, "learning_rate": 7.82280841255959e-05, "loss": 1.7704, "step": 864 }, { "epoch": 0.6897374701670644, "grad_norm": 2.032655954360962, "learning_rate": 7.80649851813486e-05, "loss": 1.9174, "step": 867 }, { "epoch": 0.6921241050119332, "grad_norm": 1.7632269859313965, "learning_rate": 7.790144907967201e-05, "loss": 1.7885, "step": 870 }, { "epoch": 0.6945107398568019, "grad_norm": 1.7323729991912842, "learning_rate": 7.773747836790481e-05, "loss": 1.9919, "step": 873 }, { "epoch": 0.6968973747016707, "grad_norm": 2.3218891620635986, "learning_rate": 7.757307560015538e-05, "loss": 1.7896, "step": 876 }, { "epoch": 0.6992840095465394, "grad_norm": 1.620492935180664, "learning_rate": 7.740824333726213e-05, "loss": 1.748, "step": 879 }, { "epoch": 0.7016706443914081, "grad_norm": 1.6885743141174316, "learning_rate": 7.724298414675353e-05, "loss": 1.7732, "step": 882 }, { "epoch": 0.7040572792362768, "grad_norm": 1.8093699216842651, "learning_rate": 7.707730060280812e-05, "loss": 1.9147, "step": 885 }, { "epoch": 0.7064439140811456, "grad_norm": 1.7837680578231812, "learning_rate": 7.691119528621444e-05, "loss": 1.8391, "step": 888 }, { "epoch": 0.7088305489260143, "grad_norm": 1.75551176071167, "learning_rate": 7.674467078433081e-05, "loss": 1.9519, "step": 891 }, { "epoch": 0.711217183770883, "grad_norm": 1.791812777519226, "learning_rate": 7.657772969104508e-05, "loss": 1.6442, "step": 894 }, { "epoch": 0.7136038186157518, "grad_norm": 1.6638917922973633, "learning_rate": 7.641037460673412e-05, "loss": 1.6225, "step": 897 }, { "epoch": 0.7159904534606205, "grad_norm": 1.479506015777588, "learning_rate": 7.624260813822342e-05, "loss": 1.7162, "step": 900 }, { "epoch": 0.7183770883054893, "grad_norm": 1.9166977405548096, "learning_rate": 7.607443289874642e-05, "loss": 1.8657, "step": 903 }, { "epoch": 0.720763723150358, "grad_norm": 1.4827370643615723, "learning_rate": 7.590585150790389e-05, "loss": 1.9136, "step": 906 }, { "epoch": 0.7231503579952268, "grad_norm": 1.5117080211639404, "learning_rate": 7.573686659162293e-05, "loss": 1.8548, "step": 909 }, { "epoch": 0.7255369928400954, "grad_norm": 1.8482357263565063, "learning_rate": 7.556748078211635e-05, "loss": 1.9555, "step": 912 }, { "epoch": 0.7279236276849642, "grad_norm": 1.6880775690078735, "learning_rate": 7.53976967178414e-05, "loss": 1.8775, "step": 915 }, { "epoch": 0.7303102625298329, "grad_norm": 1.5047417879104614, "learning_rate": 7.522751704345887e-05, "loss": 1.8849, "step": 918 }, { "epoch": 0.7326968973747017, "grad_norm": 1.5307697057724, "learning_rate": 7.505694440979178e-05, "loss": 1.8404, "step": 921 }, { "epoch": 0.7350835322195705, "grad_norm": 1.508344054222107, "learning_rate": 7.488598147378416e-05, "loss": 1.666, "step": 924 }, { "epoch": 0.7374701670644391, "grad_norm": 1.7632466554641724, "learning_rate": 7.471463089845956e-05, "loss": 1.7466, "step": 927 }, { "epoch": 0.7398568019093079, "grad_norm": 1.6543248891830444, "learning_rate": 7.454289535287968e-05, "loss": 1.7259, "step": 930 }, { "epoch": 0.7422434367541766, "grad_norm": 2.329713821411133, "learning_rate": 7.437077751210279e-05, "loss": 1.9443, "step": 933 }, { "epoch": 0.7446300715990454, "grad_norm": 1.585302472114563, "learning_rate": 7.419828005714194e-05, "loss": 1.8221, "step": 936 }, { "epoch": 0.747016706443914, "grad_norm": 1.523518681526184, "learning_rate": 7.402540567492337e-05, "loss": 1.7825, "step": 939 }, { "epoch": 0.7494033412887828, "grad_norm": 1.5896198749542236, "learning_rate": 7.385215705824449e-05, "loss": 1.9895, "step": 942 }, { "epoch": 0.7517899761336515, "grad_norm": 1.7857471704483032, "learning_rate": 7.367853690573208e-05, "loss": 1.6531, "step": 945 }, { "epoch": 0.7541766109785203, "grad_norm": 1.8661036491394043, "learning_rate": 7.350454792180016e-05, "loss": 1.7411, "step": 948 }, { "epoch": 0.7565632458233891, "grad_norm": 1.7312443256378174, "learning_rate": 7.333019281660789e-05, "loss": 1.9503, "step": 951 }, { "epoch": 0.7589498806682577, "grad_norm": 1.8219163417816162, "learning_rate": 7.31554743060174e-05, "loss": 1.7237, "step": 954 }, { "epoch": 0.7613365155131265, "grad_norm": 1.7200877666473389, "learning_rate": 7.298039511155138e-05, "loss": 1.8042, "step": 957 }, { "epoch": 0.7637231503579952, "grad_norm": 1.5407986640930176, "learning_rate": 7.280495796035079e-05, "loss": 1.8225, "step": 960 }, { "epoch": 0.766109785202864, "grad_norm": 1.5745642185211182, "learning_rate": 7.262916558513237e-05, "loss": 1.6478, "step": 963 }, { "epoch": 0.7684964200477327, "grad_norm": 1.8857331275939941, "learning_rate": 7.245302072414601e-05, "loss": 1.8026, "step": 966 }, { "epoch": 0.7708830548926014, "grad_norm": 1.4270589351654053, "learning_rate": 7.227652612113213e-05, "loss": 1.6531, "step": 969 }, { "epoch": 0.7732696897374701, "grad_norm": 1.530493140220642, "learning_rate": 7.209968452527896e-05, "loss": 1.7553, "step": 972 }, { "epoch": 0.7756563245823389, "grad_norm": 1.6771613359451294, "learning_rate": 7.192249869117971e-05, "loss": 1.8374, "step": 975 }, { "epoch": 0.7780429594272077, "grad_norm": 1.7160065174102783, "learning_rate": 7.174497137878966e-05, "loss": 1.7429, "step": 978 }, { "epoch": 0.7804295942720764, "grad_norm": 1.518904685974121, "learning_rate": 7.156710535338312e-05, "loss": 1.8843, "step": 981 }, { "epoch": 0.7828162291169452, "grad_norm": 1.7113001346588135, "learning_rate": 7.138890338551048e-05, "loss": 1.8249, "step": 984 }, { "epoch": 0.7852028639618138, "grad_norm": 1.6391565799713135, "learning_rate": 7.121036825095492e-05, "loss": 1.6807, "step": 987 }, { "epoch": 0.7875894988066826, "grad_norm": 1.6043463945388794, "learning_rate": 7.103150273068921e-05, "loss": 1.7299, "step": 990 }, { "epoch": 0.7899761336515513, "grad_norm": 1.5004619359970093, "learning_rate": 7.085230961083249e-05, "loss": 1.8501, "step": 993 }, { "epoch": 0.7923627684964201, "grad_norm": 1.893096685409546, "learning_rate": 7.067279168260671e-05, "loss": 1.8326, "step": 996 }, { "epoch": 0.7947494033412887, "grad_norm": 1.5315916538238525, "learning_rate": 7.04929517422933e-05, "loss": 1.7833, "step": 999 }, { "epoch": 0.7971360381861575, "grad_norm": 1.5632952451705933, "learning_rate": 7.031279259118946e-05, "loss": 1.6346, "step": 1002 }, { "epoch": 0.7995226730310262, "grad_norm": 1.9762367010116577, "learning_rate": 7.013231703556471e-05, "loss": 1.8849, "step": 1005 }, { "epoch": 0.801909307875895, "grad_norm": 1.5724525451660156, "learning_rate": 6.995152788661705e-05, "loss": 1.7792, "step": 1008 }, { "epoch": 0.8042959427207638, "grad_norm": 1.4417201280593872, "learning_rate": 6.977042796042917e-05, "loss": 1.7516, "step": 1011 }, { "epoch": 0.8066825775656324, "grad_norm": 1.6131057739257812, "learning_rate": 6.958902007792466e-05, "loss": 1.7614, "step": 1014 }, { "epoch": 0.8090692124105012, "grad_norm": 2.5175702571868896, "learning_rate": 6.940730706482399e-05, "loss": 1.8208, "step": 1017 }, { "epoch": 0.8114558472553699, "grad_norm": 1.5602608919143677, "learning_rate": 6.922529175160054e-05, "loss": 1.7046, "step": 1020 }, { "epoch": 0.8138424821002387, "grad_norm": 1.5071675777435303, "learning_rate": 6.904297697343655e-05, "loss": 1.9148, "step": 1023 }, { "epoch": 0.8162291169451074, "grad_norm": 1.4578834772109985, "learning_rate": 6.886036557017881e-05, "loss": 1.8546, "step": 1026 }, { "epoch": 0.8186157517899761, "grad_norm": 1.7864019870758057, "learning_rate": 6.867746038629462e-05, "loss": 1.9423, "step": 1029 }, { "epoch": 0.8210023866348448, "grad_norm": 1.4294066429138184, "learning_rate": 6.849426427082735e-05, "loss": 1.7797, "step": 1032 }, { "epoch": 0.8233890214797136, "grad_norm": 2.900899648666382, "learning_rate": 6.83107800773521e-05, "loss": 1.8395, "step": 1035 }, { "epoch": 0.8257756563245824, "grad_norm": 1.5912294387817383, "learning_rate": 6.812701066393124e-05, "loss": 1.7345, "step": 1038 }, { "epoch": 0.8281622911694511, "grad_norm": 1.6191726922988892, "learning_rate": 6.79429588930699e-05, "loss": 1.7563, "step": 1041 }, { "epoch": 0.8305489260143198, "grad_norm": 1.926048755645752, "learning_rate": 6.775862763167142e-05, "loss": 1.7473, "step": 1044 }, { "epoch": 0.8329355608591885, "grad_norm": 1.5704717636108398, "learning_rate": 6.757401975099262e-05, "loss": 1.6788, "step": 1047 }, { "epoch": 0.8353221957040573, "grad_norm": 2.986739158630371, "learning_rate": 6.738913812659912e-05, "loss": 1.9091, "step": 1050 }, { "epoch": 0.837708830548926, "grad_norm": 2.8111684322357178, "learning_rate": 6.720398563832055e-05, "loss": 1.7738, "step": 1053 }, { "epoch": 0.8400954653937948, "grad_norm": 1.5233489274978638, "learning_rate": 6.701856517020565e-05, "loss": 1.869, "step": 1056 }, { "epoch": 0.8424821002386634, "grad_norm": 1.679288387298584, "learning_rate": 6.683287961047742e-05, "loss": 1.977, "step": 1059 }, { "epoch": 0.8448687350835322, "grad_norm": 1.5280253887176514, "learning_rate": 6.664693185148807e-05, "loss": 1.7278, "step": 1062 }, { "epoch": 0.847255369928401, "grad_norm": 1.5906157493591309, "learning_rate": 6.646072478967397e-05, "loss": 1.8965, "step": 1065 }, { "epoch": 0.8496420047732697, "grad_norm": 1.6531226634979248, "learning_rate": 6.627426132551058e-05, "loss": 1.77, "step": 1068 }, { "epoch": 0.8520286396181385, "grad_norm": 1.5413316488265991, "learning_rate": 6.608754436346725e-05, "loss": 1.7051, "step": 1071 }, { "epoch": 0.8544152744630071, "grad_norm": 1.7978452444076538, "learning_rate": 6.590057681196191e-05, "loss": 1.6797, "step": 1074 }, { "epoch": 0.8568019093078759, "grad_norm": 1.6512346267700195, "learning_rate": 6.571336158331589e-05, "loss": 1.9775, "step": 1077 }, { "epoch": 0.8591885441527446, "grad_norm": 1.7628110647201538, "learning_rate": 6.552590159370844e-05, "loss": 1.6468, "step": 1080 }, { "epoch": 0.8615751789976134, "grad_norm": 1.659655213356018, "learning_rate": 6.53381997631314e-05, "loss": 1.8676, "step": 1083 }, { "epoch": 0.863961813842482, "grad_norm": 1.5994445085525513, "learning_rate": 6.515025901534364e-05, "loss": 1.7799, "step": 1086 }, { "epoch": 0.8663484486873508, "grad_norm": 4.107300758361816, "learning_rate": 6.496208227782556e-05, "loss": 1.7622, "step": 1089 }, { "epoch": 0.8687350835322196, "grad_norm": 1.6633572578430176, "learning_rate": 6.477367248173352e-05, "loss": 1.7943, "step": 1092 }, { "epoch": 0.8711217183770883, "grad_norm": 1.5715115070343018, "learning_rate": 6.458503256185404e-05, "loss": 1.791, "step": 1095 }, { "epoch": 0.8735083532219571, "grad_norm": 1.4537943601608276, "learning_rate": 6.439616545655834e-05, "loss": 1.6835, "step": 1098 }, { "epoch": 0.8758949880668258, "grad_norm": 1.4551641941070557, "learning_rate": 6.420707410775626e-05, "loss": 1.8273, "step": 1101 }, { "epoch": 0.8782816229116945, "grad_norm": 1.7414036989212036, "learning_rate": 6.401776146085072e-05, "loss": 2.0934, "step": 1104 }, { "epoch": 0.8806682577565632, "grad_norm": 1.602457046508789, "learning_rate": 6.382823046469167e-05, "loss": 1.8388, "step": 1107 }, { "epoch": 0.883054892601432, "grad_norm": 1.549912929534912, "learning_rate": 6.363848407153016e-05, "loss": 1.6429, "step": 1110 }, { "epoch": 0.8854415274463007, "grad_norm": 1.4284569025039673, "learning_rate": 6.344852523697247e-05, "loss": 1.8545, "step": 1113 }, { "epoch": 0.8878281622911695, "grad_norm": 1.83491849899292, "learning_rate": 6.325835691993394e-05, "loss": 1.574, "step": 1116 }, { "epoch": 0.8902147971360382, "grad_norm": 1.6155121326446533, "learning_rate": 6.306798208259297e-05, "loss": 1.7415, "step": 1119 }, { "epoch": 0.8926014319809069, "grad_norm": 1.5498522520065308, "learning_rate": 6.287740369034485e-05, "loss": 1.5622, "step": 1122 }, { "epoch": 0.8949880668257757, "grad_norm": 1.6388617753982544, "learning_rate": 6.26866247117555e-05, "loss": 1.6144, "step": 1125 }, { "epoch": 0.8973747016706444, "grad_norm": 1.4823899269104004, "learning_rate": 6.249564811851543e-05, "loss": 1.8225, "step": 1128 }, { "epoch": 0.8997613365155132, "grad_norm": 1.4698792695999146, "learning_rate": 6.230447688539316e-05, "loss": 1.6339, "step": 1131 }, { "epoch": 0.9021479713603818, "grad_norm": 1.6909016370773315, "learning_rate": 6.211311399018916e-05, "loss": 1.7918, "step": 1134 }, { "epoch": 0.9045346062052506, "grad_norm": 1.4534494876861572, "learning_rate": 6.192156241368929e-05, "loss": 1.7715, "step": 1137 }, { "epoch": 0.9069212410501193, "grad_norm": 1.719106912612915, "learning_rate": 6.172982513961845e-05, "loss": 1.7261, "step": 1140 }, { "epoch": 0.9093078758949881, "grad_norm": 1.6923831701278687, "learning_rate": 6.153790515459404e-05, "loss": 1.6554, "step": 1143 }, { "epoch": 0.9116945107398569, "grad_norm": 1.6443957090377808, "learning_rate": 6.13458054480795e-05, "loss": 1.8556, "step": 1146 }, { "epoch": 0.9140811455847255, "grad_norm": 1.5689623355865479, "learning_rate": 6.115352901233779e-05, "loss": 1.8041, "step": 1149 }, { "epoch": 0.9164677804295943, "grad_norm": 1.46951425075531, "learning_rate": 6.096107884238458e-05, "loss": 1.6515, "step": 1152 }, { "epoch": 0.918854415274463, "grad_norm": 1.5866256952285767, "learning_rate": 6.0768457935941817e-05, "loss": 1.8007, "step": 1155 }, { "epoch": 0.9212410501193318, "grad_norm": 1.9523183107376099, "learning_rate": 6.0575669293390954e-05, "loss": 1.8313, "step": 1158 }, { "epoch": 0.9236276849642004, "grad_norm": 1.6216835975646973, "learning_rate": 6.038271591772615e-05, "loss": 1.8399, "step": 1161 }, { "epoch": 0.9260143198090692, "grad_norm": 1.8278216123580933, "learning_rate": 6.0189600814507604e-05, "loss": 1.9067, "step": 1164 }, { "epoch": 0.9284009546539379, "grad_norm": 1.2732576131820679, "learning_rate": 5.9996326991814654e-05, "loss": 1.6042, "step": 1167 }, { "epoch": 0.9307875894988067, "grad_norm": 1.75897216796875, "learning_rate": 5.980289746019892e-05, "loss": 1.8796, "step": 1170 }, { "epoch": 0.9331742243436754, "grad_norm": 1.5754450559616089, "learning_rate": 5.9609315232637483e-05, "loss": 1.5981, "step": 1173 }, { "epoch": 0.9355608591885441, "grad_norm": 1.7975653409957886, "learning_rate": 5.941558332448589e-05, "loss": 1.638, "step": 1176 }, { "epoch": 0.9379474940334129, "grad_norm": 1.6017708778381348, "learning_rate": 5.922170475343125e-05, "loss": 1.6904, "step": 1179 }, { "epoch": 0.9403341288782816, "grad_norm": 1.6121450662612915, "learning_rate": 5.9027682539445104e-05, "loss": 1.69, "step": 1182 }, { "epoch": 0.9427207637231504, "grad_norm": 1.5319181680679321, "learning_rate": 5.883351970473654e-05, "loss": 1.8468, "step": 1185 }, { "epoch": 0.9451073985680191, "grad_norm": 1.7490177154541016, "learning_rate": 5.863921927370498e-05, "loss": 1.7477, "step": 1188 }, { "epoch": 0.9474940334128878, "grad_norm": 1.546705722808838, "learning_rate": 5.8444784272893175e-05, "loss": 1.6837, "step": 1191 }, { "epoch": 0.9498806682577565, "grad_norm": 1.576828956604004, "learning_rate": 5.8250217730939973e-05, "loss": 1.6976, "step": 1194 }, { "epoch": 0.9522673031026253, "grad_norm": 1.443121075630188, "learning_rate": 5.8055522678533225e-05, "loss": 1.6661, "step": 1197 }, { "epoch": 0.954653937947494, "grad_norm": 1.619332194328308, "learning_rate": 5.786070214836254e-05, "loss": 1.6282, "step": 1200 }, { "epoch": 0.9570405727923628, "grad_norm": 1.516404628753662, "learning_rate": 5.7665759175072034e-05, "loss": 1.8743, "step": 1203 }, { "epoch": 0.9594272076372315, "grad_norm": 1.842561960220337, "learning_rate": 5.747069679521305e-05, "loss": 1.7742, "step": 1206 }, { "epoch": 0.9618138424821002, "grad_norm": 1.5098801851272583, "learning_rate": 5.727551804719693e-05, "loss": 1.7087, "step": 1209 }, { "epoch": 0.964200477326969, "grad_norm": 1.611047387123108, "learning_rate": 5.708022597124758e-05, "loss": 1.6934, "step": 1212 }, { "epoch": 0.9665871121718377, "grad_norm": 1.5745174884796143, "learning_rate": 5.688482360935423e-05, "loss": 1.8729, "step": 1215 }, { "epoch": 0.9689737470167065, "grad_norm": 1.4478429555892944, "learning_rate": 5.668931400522396e-05, "loss": 1.801, "step": 1218 }, { "epoch": 0.9713603818615751, "grad_norm": 2.9802427291870117, "learning_rate": 5.649370020423431e-05, "loss": 1.6937, "step": 1221 }, { "epoch": 0.9737470167064439, "grad_norm": 1.6159980297088623, "learning_rate": 5.629798525338589e-05, "loss": 1.712, "step": 1224 }, { "epoch": 0.9761336515513126, "grad_norm": 1.6687465906143188, "learning_rate": 5.6102172201254835e-05, "loss": 1.7582, "step": 1227 }, { "epoch": 0.9785202863961814, "grad_norm": 1.318992018699646, "learning_rate": 5.5906264097945407e-05, "loss": 1.7913, "step": 1230 }, { "epoch": 0.9809069212410502, "grad_norm": 1.6671638488769531, "learning_rate": 5.5710263995042434e-05, "loss": 1.8547, "step": 1233 }, { "epoch": 0.9832935560859188, "grad_norm": 1.460523247718811, "learning_rate": 5.551417494556376e-05, "loss": 1.7699, "step": 1236 }, { "epoch": 0.9856801909307876, "grad_norm": 1.4564532041549683, "learning_rate": 5.531800000391275e-05, "loss": 1.7636, "step": 1239 }, { "epoch": 0.9880668257756563, "grad_norm": 1.7387609481811523, "learning_rate": 5.5121742225830665e-05, "loss": 1.8602, "step": 1242 }, { "epoch": 0.9904534606205251, "grad_norm": 1.8153102397918701, "learning_rate": 5.4925404668349076e-05, "loss": 1.797, "step": 1245 }, { "epoch": 0.9928400954653938, "grad_norm": 1.4723916053771973, "learning_rate": 5.472899038974225e-05, "loss": 1.7051, "step": 1248 }, { "epoch": 0.9952267303102625, "grad_norm": 2.961282968521118, "learning_rate": 5.45325024494795e-05, "loss": 1.7661, "step": 1251 }, { "epoch": 0.9976133651551312, "grad_norm": 1.6182713508605957, "learning_rate": 5.433594390817756e-05, "loss": 1.875, "step": 1254 }, { "epoch": 1.0, "grad_norm": 2.34993839263916, "learning_rate": 5.413931782755283e-05, "loss": 1.8514, "step": 1257 }, { "epoch": 1.0023866348448687, "grad_norm": 1.439794659614563, "learning_rate": 5.3942627270373826e-05, "loss": 1.5045, "step": 1260 }, { "epoch": 1.0047732696897376, "grad_norm": 1.7519071102142334, "learning_rate": 5.374587530041335e-05, "loss": 1.5372, "step": 1263 }, { "epoch": 1.0071599045346062, "grad_norm": 1.4893503189086914, "learning_rate": 5.35490649824008e-05, "loss": 1.5061, "step": 1266 }, { "epoch": 1.009546539379475, "grad_norm": 1.4957739114761353, "learning_rate": 5.335219938197445e-05, "loss": 1.4709, "step": 1269 }, { "epoch": 1.0119331742243436, "grad_norm": 1.5801351070404053, "learning_rate": 5.315528156563367e-05, "loss": 1.522, "step": 1272 }, { "epoch": 1.0143198090692125, "grad_norm": 1.5916804075241089, "learning_rate": 5.295831460069124e-05, "loss": 1.429, "step": 1275 }, { "epoch": 1.0167064439140812, "grad_norm": 1.598103404045105, "learning_rate": 5.276130155522541e-05, "loss": 1.5911, "step": 1278 }, { "epoch": 1.0190930787589498, "grad_norm": 1.575002670288086, "learning_rate": 5.256424549803228e-05, "loss": 1.5334, "step": 1281 }, { "epoch": 1.0214797136038185, "grad_norm": 1.5843360424041748, "learning_rate": 5.236714949857791e-05, "loss": 1.4366, "step": 1284 }, { "epoch": 1.0238663484486874, "grad_norm": 1.5547505617141724, "learning_rate": 5.2170016626950505e-05, "loss": 1.5106, "step": 1287 }, { "epoch": 1.026252983293556, "grad_norm": 1.4847965240478516, "learning_rate": 5.1972849953812644e-05, "loss": 1.4397, "step": 1290 }, { "epoch": 1.0286396181384247, "grad_norm": 2.0349180698394775, "learning_rate": 5.1775652550353405e-05, "loss": 1.5765, "step": 1293 }, { "epoch": 1.0310262529832936, "grad_norm": 1.6574598550796509, "learning_rate": 5.157842748824053e-05, "loss": 1.433, "step": 1296 }, { "epoch": 1.0334128878281623, "grad_norm": 1.5641635656356812, "learning_rate": 5.138117783957261e-05, "loss": 1.5666, "step": 1299 }, { "epoch": 1.035799522673031, "grad_norm": 1.6239501237869263, "learning_rate": 5.1183906676831197e-05, "loss": 1.6223, "step": 1302 }, { "epoch": 1.0381861575178997, "grad_norm": 1.5002102851867676, "learning_rate": 5.098661707283298e-05, "loss": 1.4733, "step": 1305 }, { "epoch": 1.0405727923627686, "grad_norm": 1.5246349573135376, "learning_rate": 5.078931210068185e-05, "loss": 1.4459, "step": 1308 }, { "epoch": 1.0429594272076372, "grad_norm": 1.4616323709487915, "learning_rate": 5.059199483372114e-05, "loss": 1.4595, "step": 1311 }, { "epoch": 1.045346062052506, "grad_norm": 1.4437859058380127, "learning_rate": 5.039466834548568e-05, "loss": 1.594, "step": 1314 }, { "epoch": 1.0477326968973748, "grad_norm": 1.609554409980774, "learning_rate": 5.0197335709653883e-05, "loss": 1.3251, "step": 1317 }, { "epoch": 1.0501193317422435, "grad_norm": 1.502581238746643, "learning_rate": 5e-05, "loss": 1.5505, "step": 1320 }, { "epoch": 1.0525059665871122, "grad_norm": 1.5816670656204224, "learning_rate": 4.980266429034613e-05, "loss": 1.4974, "step": 1323 }, { "epoch": 1.0548926014319808, "grad_norm": 1.714355230331421, "learning_rate": 4.960533165451435e-05, "loss": 1.387, "step": 1326 }, { "epoch": 1.0572792362768497, "grad_norm": 1.506137490272522, "learning_rate": 4.9408005166278855e-05, "loss": 1.4963, "step": 1329 }, { "epoch": 1.0596658711217184, "grad_norm": 1.5406689643859863, "learning_rate": 4.921068789931816e-05, "loss": 1.4865, "step": 1332 }, { "epoch": 1.062052505966587, "grad_norm": 1.6917808055877686, "learning_rate": 4.901338292716704e-05, "loss": 1.3597, "step": 1335 }, { "epoch": 1.064439140811456, "grad_norm": 1.4597982168197632, "learning_rate": 4.8816093323168815e-05, "loss": 1.4529, "step": 1338 }, { "epoch": 1.0668257756563246, "grad_norm": 1.3898138999938965, "learning_rate": 4.8618822160427406e-05, "loss": 1.4942, "step": 1341 }, { "epoch": 1.0692124105011933, "grad_norm": 1.3161375522613525, "learning_rate": 4.842157251175947e-05, "loss": 1.4459, "step": 1344 }, { "epoch": 1.071599045346062, "grad_norm": 1.6536246538162231, "learning_rate": 4.822434744964661e-05, "loss": 1.4766, "step": 1347 }, { "epoch": 1.0739856801909309, "grad_norm": 1.7592108249664307, "learning_rate": 4.802715004618737e-05, "loss": 1.5014, "step": 1350 }, { "epoch": 1.0763723150357996, "grad_norm": 1.4345988035202026, "learning_rate": 4.7829983373049507e-05, "loss": 1.4605, "step": 1353 }, { "epoch": 1.0787589498806682, "grad_norm": 1.2879067659378052, "learning_rate": 4.763285050142211e-05, "loss": 1.3331, "step": 1356 }, { "epoch": 1.081145584725537, "grad_norm": 1.392535924911499, "learning_rate": 4.743575450196773e-05, "loss": 1.4701, "step": 1359 }, { "epoch": 1.0835322195704058, "grad_norm": 1.4662644863128662, "learning_rate": 4.7238698444774595e-05, "loss": 1.4321, "step": 1362 }, { "epoch": 1.0859188544152745, "grad_norm": 1.4287410974502563, "learning_rate": 4.704168539930878e-05, "loss": 1.3805, "step": 1365 }, { "epoch": 1.0883054892601431, "grad_norm": 1.4799420833587646, "learning_rate": 4.6844718434366334e-05, "loss": 1.6115, "step": 1368 }, { "epoch": 1.0906921241050118, "grad_norm": 1.6349695920944214, "learning_rate": 4.664780061802557e-05, "loss": 1.4725, "step": 1371 }, { "epoch": 1.0930787589498807, "grad_norm": 1.5499435663223267, "learning_rate": 4.64509350175992e-05, "loss": 1.4677, "step": 1374 }, { "epoch": 1.0954653937947494, "grad_norm": 1.5121248960494995, "learning_rate": 4.6254124699586656e-05, "loss": 1.5272, "step": 1377 }, { "epoch": 1.097852028639618, "grad_norm": 1.491830825805664, "learning_rate": 4.605737272962618e-05, "loss": 1.4693, "step": 1380 }, { "epoch": 1.100238663484487, "grad_norm": 1.4596081972122192, "learning_rate": 4.5860682172447184e-05, "loss": 1.491, "step": 1383 }, { "epoch": 1.1026252983293556, "grad_norm": 1.7243481874465942, "learning_rate": 4.566405609182247e-05, "loss": 1.5289, "step": 1386 }, { "epoch": 1.1050119331742243, "grad_norm": 1.5089308023452759, "learning_rate": 4.546749755052051e-05, "loss": 1.5428, "step": 1389 }, { "epoch": 1.107398568019093, "grad_norm": 1.5854674577713013, "learning_rate": 4.527100961025776e-05, "loss": 1.5129, "step": 1392 }, { "epoch": 1.1097852028639619, "grad_norm": 1.55560302734375, "learning_rate": 4.507459533165093e-05, "loss": 1.482, "step": 1395 }, { "epoch": 1.1121718377088305, "grad_norm": 1.4460105895996094, "learning_rate": 4.4878257774169346e-05, "loss": 1.4073, "step": 1398 }, { "epoch": 1.1145584725536992, "grad_norm": 1.5131725072860718, "learning_rate": 4.4681999996087274e-05, "loss": 1.4992, "step": 1401 }, { "epoch": 1.1169451073985681, "grad_norm": 1.5037047863006592, "learning_rate": 4.448582505443625e-05, "loss": 1.5421, "step": 1404 }, { "epoch": 1.1193317422434368, "grad_norm": 1.4700491428375244, "learning_rate": 4.4289736004957585e-05, "loss": 1.4587, "step": 1407 }, { "epoch": 1.1217183770883055, "grad_norm": 1.3639819622039795, "learning_rate": 4.4093735902054605e-05, "loss": 1.4711, "step": 1410 }, { "epoch": 1.1241050119331741, "grad_norm": 1.565079689025879, "learning_rate": 4.3897827798745183e-05, "loss": 1.4546, "step": 1413 }, { "epoch": 1.126491646778043, "grad_norm": 1.5039900541305542, "learning_rate": 4.3702014746614136e-05, "loss": 1.4998, "step": 1416 }, { "epoch": 1.1288782816229117, "grad_norm": 1.6283437013626099, "learning_rate": 4.350629979576569e-05, "loss": 1.4458, "step": 1419 }, { "epoch": 1.1312649164677804, "grad_norm": 1.4039489030838013, "learning_rate": 4.331068599477605e-05, "loss": 1.3474, "step": 1422 }, { "epoch": 1.1336515513126493, "grad_norm": 1.4971529245376587, "learning_rate": 4.311517639064578e-05, "loss": 1.3097, "step": 1425 }, { "epoch": 1.136038186157518, "grad_norm": 1.9998141527175903, "learning_rate": 4.2919774028752436e-05, "loss": 1.4228, "step": 1428 }, { "epoch": 1.1384248210023866, "grad_norm": 1.3852521181106567, "learning_rate": 4.27244819528031e-05, "loss": 1.4883, "step": 1431 }, { "epoch": 1.1408114558472553, "grad_norm": 1.5615485906600952, "learning_rate": 4.2529303204786953e-05, "loss": 1.5153, "step": 1434 }, { "epoch": 1.1431980906921242, "grad_norm": 1.5283887386322021, "learning_rate": 4.233424082492797e-05, "loss": 1.527, "step": 1437 }, { "epoch": 1.1455847255369929, "grad_norm": 1.385176420211792, "learning_rate": 4.213929785163747e-05, "loss": 1.4805, "step": 1440 }, { "epoch": 1.1479713603818615, "grad_norm": 1.3835763931274414, "learning_rate": 4.1944477321466786e-05, "loss": 1.4868, "step": 1443 }, { "epoch": 1.1503579952267304, "grad_norm": 1.4247292280197144, "learning_rate": 4.1749782269060045e-05, "loss": 1.498, "step": 1446 }, { "epoch": 1.152744630071599, "grad_norm": 1.5267618894577026, "learning_rate": 4.1555215727106844e-05, "loss": 1.4659, "step": 1449 }, { "epoch": 1.1551312649164678, "grad_norm": 1.5051010847091675, "learning_rate": 4.136078072629503e-05, "loss": 1.4474, "step": 1452 }, { "epoch": 1.1575178997613365, "grad_norm": 2.2243285179138184, "learning_rate": 4.116648029526347e-05, "loss": 1.453, "step": 1455 }, { "epoch": 1.1599045346062051, "grad_norm": 1.4427586793899536, "learning_rate": 4.097231746055491e-05, "loss": 1.532, "step": 1458 }, { "epoch": 1.162291169451074, "grad_norm": 1.9031730890274048, "learning_rate": 4.077829524656877e-05, "loss": 1.3974, "step": 1461 }, { "epoch": 1.1646778042959427, "grad_norm": 1.4072078466415405, "learning_rate": 4.05844166755141e-05, "loss": 1.4742, "step": 1464 }, { "epoch": 1.1670644391408114, "grad_norm": 1.4678648710250854, "learning_rate": 4.039068476736253e-05, "loss": 1.4408, "step": 1467 }, { "epoch": 1.1694510739856803, "grad_norm": 7.8452582359313965, "learning_rate": 4.01971025398011e-05, "loss": 1.3489, "step": 1470 }, { "epoch": 1.171837708830549, "grad_norm": 1.6050971746444702, "learning_rate": 4.000367300818537e-05, "loss": 1.6608, "step": 1473 }, { "epoch": 1.1742243436754176, "grad_norm": 1.71634840965271, "learning_rate": 3.98103991854924e-05, "loss": 1.4679, "step": 1476 }, { "epoch": 1.1766109785202863, "grad_norm": 1.4481014013290405, "learning_rate": 3.961728408227384e-05, "loss": 1.5657, "step": 1479 }, { "epoch": 1.1789976133651552, "grad_norm": 1.6103709936141968, "learning_rate": 3.942433070660905e-05, "loss": 1.4409, "step": 1482 }, { "epoch": 1.1813842482100239, "grad_norm": 1.411056399345398, "learning_rate": 3.923154206405819e-05, "loss": 1.4865, "step": 1485 }, { "epoch": 1.1837708830548925, "grad_norm": 1.497053623199463, "learning_rate": 3.9038921157615444e-05, "loss": 1.4072, "step": 1488 }, { "epoch": 1.1861575178997614, "grad_norm": 1.6520947217941284, "learning_rate": 3.884647098766224e-05, "loss": 1.4393, "step": 1491 }, { "epoch": 1.18854415274463, "grad_norm": 1.442406177520752, "learning_rate": 3.8654194551920485e-05, "loss": 1.4458, "step": 1494 }, { "epoch": 1.1909307875894988, "grad_norm": 1.3878071308135986, "learning_rate": 3.846209484540597e-05, "loss": 1.4374, "step": 1497 }, { "epoch": 1.1933174224343674, "grad_norm": 1.401583194732666, "learning_rate": 3.827017486038157e-05, "loss": 1.3965, "step": 1500 }, { "epoch": 1.1957040572792363, "grad_norm": 1.4490373134613037, "learning_rate": 3.8078437586310716e-05, "loss": 1.6232, "step": 1503 }, { "epoch": 1.198090692124105, "grad_norm": 1.2965726852416992, "learning_rate": 3.788688600981085e-05, "loss": 1.4681, "step": 1506 }, { "epoch": 1.2004773269689737, "grad_norm": 1.355893850326538, "learning_rate": 3.769552311460684e-05, "loss": 1.488, "step": 1509 }, { "epoch": 1.2028639618138426, "grad_norm": 1.4382792711257935, "learning_rate": 3.750435188148459e-05, "loss": 1.5472, "step": 1512 }, { "epoch": 1.2052505966587113, "grad_norm": 1.3497835397720337, "learning_rate": 3.73133752882445e-05, "loss": 1.3225, "step": 1515 }, { "epoch": 1.20763723150358, "grad_norm": 1.448410153388977, "learning_rate": 3.712259630965518e-05, "loss": 1.4645, "step": 1518 }, { "epoch": 1.2100238663484486, "grad_norm": 1.470038890838623, "learning_rate": 3.6932017917407045e-05, "loss": 1.4681, "step": 1521 }, { "epoch": 1.2124105011933175, "grad_norm": 1.2767425775527954, "learning_rate": 3.6741643080066065e-05, "loss": 1.4428, "step": 1524 }, { "epoch": 1.2147971360381862, "grad_norm": 1.5355818271636963, "learning_rate": 3.655147476302754e-05, "loss": 1.528, "step": 1527 }, { "epoch": 1.2171837708830548, "grad_norm": 1.5351886749267578, "learning_rate": 3.636151592846985e-05, "loss": 1.3914, "step": 1530 }, { "epoch": 1.2195704057279237, "grad_norm": 1.580076813697815, "learning_rate": 3.617176953530835e-05, "loss": 1.3934, "step": 1533 }, { "epoch": 1.2219570405727924, "grad_norm": 1.2656506299972534, "learning_rate": 3.5982238539149285e-05, "loss": 1.305, "step": 1536 }, { "epoch": 1.224343675417661, "grad_norm": 2.035010814666748, "learning_rate": 3.579292589224375e-05, "loss": 1.4774, "step": 1539 }, { "epoch": 1.2267303102625298, "grad_norm": 1.4457292556762695, "learning_rate": 3.560383454344168e-05, "loss": 1.5794, "step": 1542 }, { "epoch": 1.2291169451073987, "grad_norm": 1.2905712127685547, "learning_rate": 3.541496743814596e-05, "loss": 1.4821, "step": 1545 }, { "epoch": 1.2315035799522673, "grad_norm": 1.505958914756775, "learning_rate": 3.522632751826651e-05, "loss": 1.3573, "step": 1548 }, { "epoch": 1.233890214797136, "grad_norm": 1.4743762016296387, "learning_rate": 3.503791772217445e-05, "loss": 1.5104, "step": 1551 }, { "epoch": 1.2362768496420047, "grad_norm": 1.4788120985031128, "learning_rate": 3.484974098465636e-05, "loss": 1.3989, "step": 1554 }, { "epoch": 1.2386634844868736, "grad_norm": 1.3256484270095825, "learning_rate": 3.4661800236868604e-05, "loss": 1.4617, "step": 1557 }, { "epoch": 1.2410501193317423, "grad_norm": 1.5638628005981445, "learning_rate": 3.447409840629156e-05, "loss": 1.402, "step": 1560 }, { "epoch": 1.243436754176611, "grad_norm": 1.7883927822113037, "learning_rate": 3.428663841668412e-05, "loss": 1.5829, "step": 1563 }, { "epoch": 1.2458233890214796, "grad_norm": 1.4335147142410278, "learning_rate": 3.409942318803809e-05, "loss": 1.4597, "step": 1566 }, { "epoch": 1.2482100238663485, "grad_norm": 1.4358636140823364, "learning_rate": 3.391245563653276e-05, "loss": 1.5638, "step": 1569 }, { "epoch": 1.2505966587112172, "grad_norm": 1.3342747688293457, "learning_rate": 3.3725738674489414e-05, "loss": 1.447, "step": 1572 }, { "epoch": 1.2529832935560858, "grad_norm": 1.3842703104019165, "learning_rate": 3.3539275210326044e-05, "loss": 1.4634, "step": 1575 }, { "epoch": 1.2553699284009547, "grad_norm": 1.5342031717300415, "learning_rate": 3.335306814851196e-05, "loss": 1.458, "step": 1578 }, { "epoch": 1.2577565632458234, "grad_norm": 1.4148904085159302, "learning_rate": 3.31671203895226e-05, "loss": 1.3945, "step": 1581 }, { "epoch": 1.260143198090692, "grad_norm": 1.3674441576004028, "learning_rate": 3.298143482979436e-05, "loss": 1.3962, "step": 1584 }, { "epoch": 1.2625298329355608, "grad_norm": 1.42054283618927, "learning_rate": 3.2796014361679464e-05, "loss": 1.5179, "step": 1587 }, { "epoch": 1.2649164677804297, "grad_norm": 1.5487267971038818, "learning_rate": 3.261086187340088e-05, "loss": 1.4281, "step": 1590 }, { "epoch": 1.2673031026252983, "grad_norm": 1.5014619827270508, "learning_rate": 3.242598024900738e-05, "loss": 1.4789, "step": 1593 }, { "epoch": 1.269689737470167, "grad_norm": 1.533458948135376, "learning_rate": 3.224137236832859e-05, "loss": 1.3491, "step": 1596 }, { "epoch": 1.272076372315036, "grad_norm": 1.3559014797210693, "learning_rate": 3.2057041106930104e-05, "loss": 1.3915, "step": 1599 }, { "epoch": 1.2744630071599046, "grad_norm": 1.3522697687149048, "learning_rate": 3.187298933606878e-05, "loss": 1.496, "step": 1602 }, { "epoch": 1.2768496420047732, "grad_norm": 1.3634637594223022, "learning_rate": 3.1689219922647924e-05, "loss": 1.3662, "step": 1605 }, { "epoch": 1.279236276849642, "grad_norm": 1.4531528949737549, "learning_rate": 3.150573572917267e-05, "loss": 1.5501, "step": 1608 }, { "epoch": 1.2816229116945108, "grad_norm": 1.39664626121521, "learning_rate": 3.13225396137054e-05, "loss": 1.42, "step": 1611 }, { "epoch": 1.2840095465393795, "grad_norm": 1.4954712390899658, "learning_rate": 3.11396344298212e-05, "loss": 1.5471, "step": 1614 }, { "epoch": 1.2863961813842482, "grad_norm": 1.3956115245819092, "learning_rate": 3.095702302656347e-05, "loss": 1.4936, "step": 1617 }, { "epoch": 1.288782816229117, "grad_norm": 1.4721133708953857, "learning_rate": 3.077470824839947e-05, "loss": 1.4429, "step": 1620 }, { "epoch": 1.2911694510739857, "grad_norm": 6.485837936401367, "learning_rate": 3.059269293517603e-05, "loss": 1.4545, "step": 1623 }, { "epoch": 1.2935560859188544, "grad_norm": 1.7930669784545898, "learning_rate": 3.0410979922075343e-05, "loss": 1.3401, "step": 1626 }, { "epoch": 1.295942720763723, "grad_norm": 1.4317888021469116, "learning_rate": 3.022957203957083e-05, "loss": 1.5389, "step": 1629 }, { "epoch": 1.2983293556085918, "grad_norm": 1.3846065998077393, "learning_rate": 3.004847211338295e-05, "loss": 1.355, "step": 1632 }, { "epoch": 1.3007159904534606, "grad_norm": 1.3602261543273926, "learning_rate": 2.9867682964435294e-05, "loss": 1.4359, "step": 1635 }, { "epoch": 1.3031026252983293, "grad_norm": 1.3559253215789795, "learning_rate": 2.9687207408810557e-05, "loss": 1.493, "step": 1638 }, { "epoch": 1.3054892601431982, "grad_norm": 1.6385109424591064, "learning_rate": 2.9507048257706727e-05, "loss": 1.5694, "step": 1641 }, { "epoch": 1.307875894988067, "grad_norm": 1.4390101432800293, "learning_rate": 2.9327208317393303e-05, "loss": 1.5722, "step": 1644 }, { "epoch": 1.3102625298329356, "grad_norm": 1.4474018812179565, "learning_rate": 2.9147690389167514e-05, "loss": 1.4355, "step": 1647 }, { "epoch": 1.3126491646778042, "grad_norm": 1.3009108304977417, "learning_rate": 2.8968497269310803e-05, "loss": 1.3249, "step": 1650 }, { "epoch": 1.315035799522673, "grad_norm": 1.4271756410598755, "learning_rate": 2.8789631749045097e-05, "loss": 1.3821, "step": 1653 }, { "epoch": 1.3174224343675418, "grad_norm": 1.4093137979507446, "learning_rate": 2.8611096614489518e-05, "loss": 1.3932, "step": 1656 }, { "epoch": 1.3198090692124105, "grad_norm": 1.5373637676239014, "learning_rate": 2.8432894646616885e-05, "loss": 1.3887, "step": 1659 }, { "epoch": 1.3221957040572792, "grad_norm": 1.3347370624542236, "learning_rate": 2.8255028621210355e-05, "loss": 1.4542, "step": 1662 }, { "epoch": 1.324582338902148, "grad_norm": 1.5229403972625732, "learning_rate": 2.8077501308820308e-05, "loss": 1.5258, "step": 1665 }, { "epoch": 1.3269689737470167, "grad_norm": 1.38504958152771, "learning_rate": 2.790031547472105e-05, "loss": 1.4561, "step": 1668 }, { "epoch": 1.3293556085918854, "grad_norm": 1.4247602224349976, "learning_rate": 2.7723473878867877e-05, "loss": 1.5028, "step": 1671 }, { "epoch": 1.331742243436754, "grad_norm": 1.437752604484558, "learning_rate": 2.754697927585399e-05, "loss": 1.4035, "step": 1674 }, { "epoch": 1.334128878281623, "grad_norm": 1.3439934253692627, "learning_rate": 2.737083441486763e-05, "loss": 1.3615, "step": 1677 }, { "epoch": 1.3365155131264916, "grad_norm": 1.3507174253463745, "learning_rate": 2.71950420396492e-05, "loss": 1.416, "step": 1680 }, { "epoch": 1.3389021479713603, "grad_norm": 1.3720000982284546, "learning_rate": 2.7019604888448642e-05, "loss": 1.4143, "step": 1683 }, { "epoch": 1.3412887828162292, "grad_norm": 1.4216835498809814, "learning_rate": 2.6844525693982613e-05, "loss": 1.4053, "step": 1686 }, { "epoch": 1.3436754176610979, "grad_norm": 1.3386591672897339, "learning_rate": 2.666980718339211e-05, "loss": 1.4513, "step": 1689 }, { "epoch": 1.3460620525059666, "grad_norm": 1.423043966293335, "learning_rate": 2.6495452078199863e-05, "loss": 1.4137, "step": 1692 }, { "epoch": 1.3484486873508352, "grad_norm": 1.4139893054962158, "learning_rate": 2.6321463094267934e-05, "loss": 1.395, "step": 1695 }, { "epoch": 1.3508353221957041, "grad_norm": 1.7168787717819214, "learning_rate": 2.614784294175554e-05, "loss": 1.5379, "step": 1698 }, { "epoch": 1.3532219570405728, "grad_norm": 1.528499722480774, "learning_rate": 2.597459432507664e-05, "loss": 1.4597, "step": 1701 }, { "epoch": 1.3556085918854415, "grad_norm": 1.4057003259658813, "learning_rate": 2.5801719942858065e-05, "loss": 1.4797, "step": 1704 }, { "epoch": 1.3579952267303104, "grad_norm": 1.324141025543213, "learning_rate": 2.562922248789722e-05, "loss": 1.4355, "step": 1707 }, { "epoch": 1.360381861575179, "grad_norm": 1.3396581411361694, "learning_rate": 2.5457104647120322e-05, "loss": 1.5498, "step": 1710 }, { "epoch": 1.3627684964200477, "grad_norm": 1.3867429494857788, "learning_rate": 2.5285369101540445e-05, "loss": 1.4706, "step": 1713 }, { "epoch": 1.3651551312649164, "grad_norm": 1.4631327390670776, "learning_rate": 2.5114018526215844e-05, "loss": 1.4652, "step": 1716 }, { "epoch": 1.3675417661097853, "grad_norm": 1.4228324890136719, "learning_rate": 2.494305559020822e-05, "loss": 1.5147, "step": 1719 }, { "epoch": 1.369928400954654, "grad_norm": 1.3669848442077637, "learning_rate": 2.4772482956541132e-05, "loss": 1.3945, "step": 1722 }, { "epoch": 1.3723150357995226, "grad_norm": 1.466894268989563, "learning_rate": 2.4602303282158616e-05, "loss": 1.3822, "step": 1725 }, { "epoch": 1.3747016706443915, "grad_norm": 1.5458747148513794, "learning_rate": 2.4432519217883676e-05, "loss": 1.46, "step": 1728 }, { "epoch": 1.3770883054892602, "grad_norm": 1.4828535318374634, "learning_rate": 2.4263133408377076e-05, "loss": 1.5053, "step": 1731 }, { "epoch": 1.3794749403341289, "grad_norm": 1.3600937128067017, "learning_rate": 2.4094148492096125e-05, "loss": 1.5814, "step": 1734 }, { "epoch": 1.3818615751789975, "grad_norm": 1.5162534713745117, "learning_rate": 2.3925567101253576e-05, "loss": 1.5373, "step": 1737 }, { "epoch": 1.3842482100238662, "grad_norm": 1.4318009614944458, "learning_rate": 2.3757391861776585e-05, "loss": 1.4308, "step": 1740 }, { "epoch": 1.3866348448687351, "grad_norm": 1.285593032836914, "learning_rate": 2.3589625393265895e-05, "loss": 1.4115, "step": 1743 }, { "epoch": 1.3890214797136038, "grad_norm": 1.41255521774292, "learning_rate": 2.3422270308954934e-05, "loss": 1.4724, "step": 1746 }, { "epoch": 1.3914081145584727, "grad_norm": 1.221402883529663, "learning_rate": 2.3255329215669185e-05, "loss": 1.3923, "step": 1749 }, { "epoch": 1.3937947494033414, "grad_norm": 1.7999006509780884, "learning_rate": 2.3088804713785584e-05, "loss": 1.5016, "step": 1752 }, { "epoch": 1.39618138424821, "grad_norm": 1.2943382263183594, "learning_rate": 2.2922699397191893e-05, "loss": 1.4305, "step": 1755 }, { "epoch": 1.3985680190930787, "grad_norm": 1.470240831375122, "learning_rate": 2.2757015853246493e-05, "loss": 1.3706, "step": 1758 }, { "epoch": 1.4009546539379474, "grad_norm": 1.269652247428894, "learning_rate": 2.2591756662737862e-05, "loss": 1.4425, "step": 1761 }, { "epoch": 1.4033412887828163, "grad_norm": 1.3773661851882935, "learning_rate": 2.242692439984463e-05, "loss": 1.4063, "step": 1764 }, { "epoch": 1.405727923627685, "grad_norm": 1.4183642864227295, "learning_rate": 2.2262521632095203e-05, "loss": 1.5086, "step": 1767 }, { "epoch": 1.4081145584725536, "grad_norm": 1.3887887001037598, "learning_rate": 2.2098550920327998e-05, "loss": 1.311, "step": 1770 }, { "epoch": 1.4105011933174225, "grad_norm": 1.5611753463745117, "learning_rate": 2.1935014818651405e-05, "loss": 1.3815, "step": 1773 }, { "epoch": 1.4128878281622912, "grad_norm": 1.2606338262557983, "learning_rate": 2.177191587440409e-05, "loss": 1.4286, "step": 1776 }, { "epoch": 1.4152744630071599, "grad_norm": 1.3460246324539185, "learning_rate": 2.1609256628115316e-05, "loss": 1.541, "step": 1779 }, { "epoch": 1.4176610978520285, "grad_norm": 1.4279568195343018, "learning_rate": 2.1447039613465265e-05, "loss": 1.4517, "step": 1782 }, { "epoch": 1.4200477326968974, "grad_norm": 1.4444867372512817, "learning_rate": 2.128526735724572e-05, "loss": 1.4325, "step": 1785 }, { "epoch": 1.422434367541766, "grad_norm": 2.7751872539520264, "learning_rate": 2.1123942379320576e-05, "loss": 1.4161, "step": 1788 }, { "epoch": 1.4248210023866348, "grad_norm": 1.5731838941574097, "learning_rate": 2.096306719258669e-05, "loss": 1.3889, "step": 1791 }, { "epoch": 1.4272076372315037, "grad_norm": 1.5036033391952515, "learning_rate": 2.0802644302934683e-05, "loss": 1.4823, "step": 1794 }, { "epoch": 1.4295942720763724, "grad_norm": 1.5555312633514404, "learning_rate": 2.0642676209209934e-05, "loss": 1.5452, "step": 1797 }, { "epoch": 1.431980906921241, "grad_norm": 1.653014898300171, "learning_rate": 2.0483165403173583e-05, "loss": 1.4651, "step": 1800 }, { "epoch": 1.4343675417661097, "grad_norm": 1.4742029905319214, "learning_rate": 2.0324114369463855e-05, "loss": 1.4215, "step": 1803 }, { "epoch": 1.4367541766109786, "grad_norm": 1.3069920539855957, "learning_rate": 2.0165525585557204e-05, "loss": 1.352, "step": 1806 }, { "epoch": 1.4391408114558473, "grad_norm": 1.3675094842910767, "learning_rate": 2.0007401521729863e-05, "loss": 1.3925, "step": 1809 }, { "epoch": 1.441527446300716, "grad_norm": 1.5048547983169556, "learning_rate": 1.984974464101928e-05, "loss": 1.4392, "step": 1812 }, { "epoch": 1.4439140811455848, "grad_norm": 1.3629268407821655, "learning_rate": 1.9692557399185734e-05, "loss": 1.6123, "step": 1815 }, { "epoch": 1.4463007159904535, "grad_norm": 1.4186464548110962, "learning_rate": 1.953584224467418e-05, "loss": 1.4375, "step": 1818 }, { "epoch": 1.4486873508353222, "grad_norm": 1.264143943786621, "learning_rate": 1.9379601618575977e-05, "loss": 1.3714, "step": 1821 }, { "epoch": 1.4510739856801909, "grad_norm": 1.2267049551010132, "learning_rate": 1.9223837954591046e-05, "loss": 1.442, "step": 1824 }, { "epoch": 1.4534606205250595, "grad_norm": 1.4130151271820068, "learning_rate": 1.9068553678989736e-05, "loss": 1.5417, "step": 1827 }, { "epoch": 1.4558472553699284, "grad_norm": 1.3902508020401, "learning_rate": 1.8913751210575248e-05, "loss": 1.4484, "step": 1830 }, { "epoch": 1.458233890214797, "grad_norm": 1.3493434190750122, "learning_rate": 1.8759432960645774e-05, "loss": 1.4089, "step": 1833 }, { "epoch": 1.460620525059666, "grad_norm": 1.3696714639663696, "learning_rate": 1.8605601332957077e-05, "loss": 1.3673, "step": 1836 }, { "epoch": 1.4630071599045347, "grad_norm": 1.3794286251068115, "learning_rate": 1.8452258723684995e-05, "loss": 1.3348, "step": 1839 }, { "epoch": 1.4653937947494033, "grad_norm": 1.4056599140167236, "learning_rate": 1.8299407521388067e-05, "loss": 1.3715, "step": 1842 }, { "epoch": 1.467780429594272, "grad_norm": 1.3033989667892456, "learning_rate": 1.8147050106970437e-05, "loss": 1.4756, "step": 1845 }, { "epoch": 1.4701670644391407, "grad_norm": 1.2725988626480103, "learning_rate": 1.7995188853644646e-05, "loss": 1.4429, "step": 1848 }, { "epoch": 1.4725536992840096, "grad_norm": 1.2975175380706787, "learning_rate": 1.784382612689477e-05, "loss": 1.4233, "step": 1851 }, { "epoch": 1.4749403341288783, "grad_norm": 1.3403589725494385, "learning_rate": 1.7692964284439505e-05, "loss": 1.3441, "step": 1854 }, { "epoch": 1.477326968973747, "grad_norm": 1.395530343055725, "learning_rate": 1.7542605676195506e-05, "loss": 1.3968, "step": 1857 }, { "epoch": 1.4797136038186158, "grad_norm": 1.2565813064575195, "learning_rate": 1.739275264424067e-05, "loss": 1.4858, "step": 1860 }, { "epoch": 1.4821002386634845, "grad_norm": 1.4718109369277954, "learning_rate": 1.7243407522777806e-05, "loss": 1.4901, "step": 1863 }, { "epoch": 1.4844868735083532, "grad_norm": 1.2175111770629883, "learning_rate": 1.7094572638098123e-05, "loss": 1.4708, "step": 1866 }, { "epoch": 1.4868735083532219, "grad_norm": 1.3653630018234253, "learning_rate": 1.6946250308545125e-05, "loss": 1.3542, "step": 1869 }, { "epoch": 1.4892601431980907, "grad_norm": 1.325291633605957, "learning_rate": 1.6798442844478445e-05, "loss": 1.3565, "step": 1872 }, { "epoch": 1.4916467780429594, "grad_norm": 1.3131422996520996, "learning_rate": 1.6651152548237802e-05, "loss": 1.3708, "step": 1875 }, { "epoch": 1.494033412887828, "grad_norm": 1.3751031160354614, "learning_rate": 1.6504381714107252e-05, "loss": 1.4521, "step": 1878 }, { "epoch": 1.496420047732697, "grad_norm": 1.3174288272857666, "learning_rate": 1.6358132628279322e-05, "loss": 1.3748, "step": 1881 }, { "epoch": 1.4988066825775657, "grad_norm": 1.3484026193618774, "learning_rate": 1.6212407568819565e-05, "loss": 1.3542, "step": 1884 }, { "epoch": 1.5011933174224343, "grad_norm": 1.417079210281372, "learning_rate": 1.6067208805630877e-05, "loss": 1.4029, "step": 1887 }, { "epoch": 1.503579952267303, "grad_norm": 2.4393136501312256, "learning_rate": 1.5922538600418318e-05, "loss": 1.3775, "step": 1890 }, { "epoch": 1.5059665871121717, "grad_norm": 1.4242373704910278, "learning_rate": 1.5778399206653734e-05, "loss": 1.3828, "step": 1893 }, { "epoch": 1.5083532219570406, "grad_norm": 1.3555493354797363, "learning_rate": 1.563479286954078e-05, "loss": 1.4257, "step": 1896 }, { "epoch": 1.5107398568019093, "grad_norm": 1.3086305856704712, "learning_rate": 1.54917218259799e-05, "loss": 1.366, "step": 1899 }, { "epoch": 1.5131264916467781, "grad_norm": 1.3633023500442505, "learning_rate": 1.5349188304533413e-05, "loss": 1.4599, "step": 1902 }, { "epoch": 1.5155131264916468, "grad_norm": 1.3130401372909546, "learning_rate": 1.5207194525390938e-05, "loss": 1.4543, "step": 1905 }, { "epoch": 1.5178997613365155, "grad_norm": 1.3946834802627563, "learning_rate": 1.5065742700334678e-05, "loss": 1.4115, "step": 1908 }, { "epoch": 1.5202863961813842, "grad_norm": 1.3722360134124756, "learning_rate": 1.4924835032705064e-05, "loss": 1.4059, "step": 1911 }, { "epoch": 1.5226730310262528, "grad_norm": 1.297034740447998, "learning_rate": 1.4784473717366387e-05, "loss": 1.5423, "step": 1914 }, { "epoch": 1.5250596658711217, "grad_norm": 1.2313867807388306, "learning_rate": 1.4644660940672627e-05, "loss": 1.4202, "step": 1917 }, { "epoch": 1.5274463007159904, "grad_norm": 1.2203267812728882, "learning_rate": 1.4505398880433369e-05, "loss": 1.3289, "step": 1920 }, { "epoch": 1.5298329355608593, "grad_norm": 2.824936628341675, "learning_rate": 1.4366689705879898e-05, "loss": 1.4151, "step": 1923 }, { "epoch": 1.532219570405728, "grad_norm": 1.2896000146865845, "learning_rate": 1.4228535577631442e-05, "loss": 1.3452, "step": 1926 }, { "epoch": 1.5346062052505967, "grad_norm": 1.2921373844146729, "learning_rate": 1.4090938647661461e-05, "loss": 1.4469, "step": 1929 }, { "epoch": 1.5369928400954653, "grad_norm": 1.4541573524475098, "learning_rate": 1.3953901059264191e-05, "loss": 1.5048, "step": 1932 }, { "epoch": 1.539379474940334, "grad_norm": 1.5273715257644653, "learning_rate": 1.3817424947021151e-05, "loss": 1.425, "step": 1935 }, { "epoch": 1.541766109785203, "grad_norm": 1.4470243453979492, "learning_rate": 1.3681512436768045e-05, "loss": 1.6023, "step": 1938 }, { "epoch": 1.5441527446300716, "grad_norm": 1.274420976638794, "learning_rate": 1.3546165645561487e-05, "loss": 1.3682, "step": 1941 }, { "epoch": 1.5465393794749405, "grad_norm": 1.537539005279541, "learning_rate": 1.3411386681646164e-05, "loss": 1.3933, "step": 1944 }, { "epoch": 1.5489260143198091, "grad_norm": 1.2480096817016602, "learning_rate": 1.3277177644421924e-05, "loss": 1.3532, "step": 1947 }, { "epoch": 1.5513126491646778, "grad_norm": 1.4736180305480957, "learning_rate": 1.314354062441106e-05, "loss": 1.5258, "step": 1950 }, { "epoch": 1.5536992840095465, "grad_norm": 1.2729185819625854, "learning_rate": 1.301047770322581e-05, "loss": 1.3904, "step": 1953 }, { "epoch": 1.5560859188544152, "grad_norm": 1.2870765924453735, "learning_rate": 1.287799095353584e-05, "loss": 1.3343, "step": 1956 }, { "epoch": 1.558472553699284, "grad_norm": 1.385016679763794, "learning_rate": 1.2746082439036117e-05, "loss": 1.4185, "step": 1959 }, { "epoch": 1.5608591885441527, "grad_norm": 1.3921257257461548, "learning_rate": 1.2614754214414548e-05, "loss": 1.3932, "step": 1962 }, { "epoch": 1.5632458233890216, "grad_norm": 1.4554414749145508, "learning_rate": 1.2484008325320174e-05, "loss": 1.4237, "step": 1965 }, { "epoch": 1.5656324582338903, "grad_norm": 1.4019906520843506, "learning_rate": 1.2353846808331154e-05, "loss": 1.3849, "step": 1968 }, { "epoch": 1.568019093078759, "grad_norm": 1.3316291570663452, "learning_rate": 1.2224271690923155e-05, "loss": 1.3343, "step": 1971 }, { "epoch": 1.5704057279236276, "grad_norm": 1.251207709312439, "learning_rate": 1.2095284991437733e-05, "loss": 1.333, "step": 1974 }, { "epoch": 1.5727923627684963, "grad_norm": 1.3229224681854248, "learning_rate": 1.1966888719050829e-05, "loss": 1.4419, "step": 1977 }, { "epoch": 1.575178997613365, "grad_norm": 1.3271231651306152, "learning_rate": 1.1839084873741584e-05, "loss": 1.421, "step": 1980 }, { "epoch": 1.577565632458234, "grad_norm": 1.4479427337646484, "learning_rate": 1.1711875446261094e-05, "loss": 1.4322, "step": 1983 }, { "epoch": 1.5799522673031028, "grad_norm": 1.4587756395339966, "learning_rate": 1.1585262418101467e-05, "loss": 1.4832, "step": 1986 }, { "epoch": 1.5823389021479715, "grad_norm": 1.3749325275421143, "learning_rate": 1.1459247761464909e-05, "loss": 1.423, "step": 1989 }, { "epoch": 1.5847255369928401, "grad_norm": 1.3663976192474365, "learning_rate": 1.1333833439233055e-05, "loss": 1.4133, "step": 1992 }, { "epoch": 1.5871121718377088, "grad_norm": 1.3849143981933594, "learning_rate": 1.1209021404936304e-05, "loss": 1.3823, "step": 1995 }, { "epoch": 1.5894988066825775, "grad_norm": 1.3283625841140747, "learning_rate": 1.1084813602723515e-05, "loss": 1.4437, "step": 1998 }, { "epoch": 1.5918854415274462, "grad_norm": 1.3943214416503906, "learning_rate": 1.0961211967331597e-05, "loss": 1.4566, "step": 2001 }, { "epoch": 1.594272076372315, "grad_norm": 1.340326189994812, "learning_rate": 1.083821842405548e-05, "loss": 1.3319, "step": 2004 }, { "epoch": 1.5966587112171837, "grad_norm": 1.458629846572876, "learning_rate": 1.0715834888718074e-05, "loss": 1.307, "step": 2007 }, { "epoch": 1.5990453460620526, "grad_norm": 1.327406883239746, "learning_rate": 1.0594063267640386e-05, "loss": 1.3367, "step": 2010 }, { "epoch": 1.6014319809069213, "grad_norm": 1.3545573949813843, "learning_rate": 1.0472905457611936e-05, "loss": 1.43, "step": 2013 }, { "epoch": 1.60381861575179, "grad_norm": 1.3321373462677002, "learning_rate": 1.0352363345861065e-05, "loss": 1.3416, "step": 2016 }, { "epoch": 1.6062052505966586, "grad_norm": 1.3040752410888672, "learning_rate": 1.023243881002573e-05, "loss": 1.6122, "step": 2019 }, { "epoch": 1.6085918854415273, "grad_norm": 1.7471809387207031, "learning_rate": 1.0113133718124035e-05, "loss": 1.5219, "step": 2022 }, { "epoch": 1.6109785202863962, "grad_norm": 1.362330436706543, "learning_rate": 9.994449928525324e-06, "loss": 1.4859, "step": 2025 }, { "epoch": 1.6133651551312649, "grad_norm": 1.3588142395019531, "learning_rate": 9.876389289921106e-06, "loss": 1.5388, "step": 2028 }, { "epoch": 1.6157517899761338, "grad_norm": 1.2795350551605225, "learning_rate": 9.758953641296331e-06, "loss": 1.4129, "step": 2031 }, { "epoch": 1.6181384248210025, "grad_norm": 1.3939927816390991, "learning_rate": 9.642144811900739e-06, "loss": 1.407, "step": 2034 }, { "epoch": 1.6205250596658711, "grad_norm": 1.3605296611785889, "learning_rate": 9.5259646212203e-06, "loss": 1.3686, "step": 2037 }, { "epoch": 1.6229116945107398, "grad_norm": 3.3502025604248047, "learning_rate": 9.410414878948975e-06, "loss": 1.3942, "step": 2040 }, { "epoch": 1.6252983293556085, "grad_norm": 1.2541710138320923, "learning_rate": 9.295497384960416e-06, "loss": 1.4175, "step": 2043 }, { "epoch": 1.6276849642004774, "grad_norm": 1.4015976190567017, "learning_rate": 9.181213929280046e-06, "loss": 1.4867, "step": 2046 }, { "epoch": 1.630071599045346, "grad_norm": 1.2918376922607422, "learning_rate": 9.067566292057084e-06, "loss": 1.4243, "step": 2049 }, { "epoch": 1.632458233890215, "grad_norm": 1.341584324836731, "learning_rate": 8.954556243536877e-06, "loss": 1.309, "step": 2052 }, { "epoch": 1.6348448687350836, "grad_norm": 1.364698052406311, "learning_rate": 8.842185544033255e-06, "loss": 1.4609, "step": 2055 }, { "epoch": 1.6372315035799523, "grad_norm": 1.3149210214614868, "learning_rate": 8.7304559439012e-06, "loss": 1.4338, "step": 2058 }, { "epoch": 1.639618138424821, "grad_norm": 1.2939684391021729, "learning_rate": 8.619369183509501e-06, "loss": 1.3857, "step": 2061 }, { "epoch": 1.6420047732696896, "grad_norm": 1.364255428314209, "learning_rate": 8.508926993213712e-06, "loss": 1.4484, "step": 2064 }, { "epoch": 1.6443914081145583, "grad_norm": 1.3580390214920044, "learning_rate": 8.39913109332916e-06, "loss": 1.377, "step": 2067 }, { "epoch": 1.6467780429594272, "grad_norm": 1.354833960533142, "learning_rate": 8.28998319410413e-06, "loss": 1.3848, "step": 2070 }, { "epoch": 1.649164677804296, "grad_norm": 1.2995808124542236, "learning_rate": 8.181484995693295e-06, "loss": 1.369, "step": 2073 }, { "epoch": 1.6515513126491648, "grad_norm": 1.387732982635498, "learning_rate": 8.073638188131128e-06, "loss": 1.3963, "step": 2076 }, { "epoch": 1.6539379474940334, "grad_norm": 1.349213719367981, "learning_rate": 7.966444451305726e-06, "loss": 1.4368, "step": 2079 }, { "epoch": 1.6563245823389021, "grad_norm": 1.3115229606628418, "learning_rate": 7.859905454932471e-06, "loss": 1.3239, "step": 2082 }, { "epoch": 1.6587112171837708, "grad_norm": 1.2257990837097168, "learning_rate": 7.75402285852816e-06, "loss": 1.3398, "step": 2085 }, { "epoch": 1.6610978520286395, "grad_norm": 1.3116490840911865, "learning_rate": 7.648798311385058e-06, "loss": 1.3408, "step": 2088 }, { "epoch": 1.6634844868735084, "grad_norm": 1.3405076265335083, "learning_rate": 7.5442334525452964e-06, "loss": 1.3239, "step": 2091 }, { "epoch": 1.665871121718377, "grad_norm": 1.328359842300415, "learning_rate": 7.440329910775273e-06, "loss": 1.3864, "step": 2094 }, { "epoch": 1.668257756563246, "grad_norm": 1.4465019702911377, "learning_rate": 7.337089304540301e-06, "loss": 1.3507, "step": 2097 }, { "epoch": 1.6706443914081146, "grad_norm": 1.334693431854248, "learning_rate": 7.234513241979418e-06, "loss": 1.41, "step": 2100 }, { "epoch": 1.6730310262529833, "grad_norm": 1.2191022634506226, "learning_rate": 7.132603320880294e-06, "loss": 1.3517, "step": 2103 }, { "epoch": 1.675417661097852, "grad_norm": 1.286347508430481, "learning_rate": 7.031361128654401e-06, "loss": 1.4724, "step": 2106 }, { "epoch": 1.6778042959427206, "grad_norm": 1.2741565704345703, "learning_rate": 6.930788242312253e-06, "loss": 1.3599, "step": 2109 }, { "epoch": 1.6801909307875895, "grad_norm": 1.331213116645813, "learning_rate": 6.830886228438837e-06, "loss": 1.448, "step": 2112 }, { "epoch": 1.6825775656324582, "grad_norm": 1.2380222082138062, "learning_rate": 6.731656643169204e-06, "loss": 1.362, "step": 2115 }, { "epoch": 1.684964200477327, "grad_norm": 1.3948215246200562, "learning_rate": 6.633101032164274e-06, "loss": 1.4777, "step": 2118 }, { "epoch": 1.6873508353221958, "grad_norm": 1.2766612768173218, "learning_rate": 6.535220930586705e-06, "loss": 1.5417, "step": 2121 }, { "epoch": 1.6897374701670644, "grad_norm": 1.5349231958389282, "learning_rate": 6.4380178630770225e-06, "loss": 1.4201, "step": 2124 }, { "epoch": 1.692124105011933, "grad_norm": 1.3303968906402588, "learning_rate": 6.341493343729854e-06, "loss": 1.4746, "step": 2127 }, { "epoch": 1.6945107398568018, "grad_norm": 1.2858116626739502, "learning_rate": 6.2456488760703205e-06, "loss": 1.4834, "step": 2130 }, { "epoch": 1.6968973747016707, "grad_norm": 1.3412748575210571, "learning_rate": 6.150485953030677e-06, "loss": 1.2398, "step": 2133 }, { "epoch": 1.6992840095465394, "grad_norm": 1.3060575723648071, "learning_rate": 6.056006056926977e-06, "loss": 1.5145, "step": 2136 }, { "epoch": 1.7016706443914082, "grad_norm": 1.4394913911819458, "learning_rate": 5.962210659436091e-06, "loss": 1.3623, "step": 2139 }, { "epoch": 1.704057279236277, "grad_norm": 1.2894078493118286, "learning_rate": 5.869101221572654e-06, "loss": 1.327, "step": 2142 }, { "epoch": 1.7064439140811456, "grad_norm": 1.6368987560272217, "learning_rate": 5.776679193666412e-06, "loss": 1.5371, "step": 2145 }, { "epoch": 1.7088305489260143, "grad_norm": 1.4706789255142212, "learning_rate": 5.6849460153395706e-06, "loss": 1.3617, "step": 2148 }, { "epoch": 1.711217183770883, "grad_norm": 1.358765721321106, "learning_rate": 5.5939031154844e-06, "loss": 1.3666, "step": 2151 }, { "epoch": 1.7136038186157518, "grad_norm": 1.3404169082641602, "learning_rate": 5.5035519122409895e-06, "loss": 1.4213, "step": 2154 }, { "epoch": 1.7159904534606205, "grad_norm": 1.9059109687805176, "learning_rate": 5.413893812975096e-06, "loss": 1.4891, "step": 2157 }, { "epoch": 1.7183770883054894, "grad_norm": 1.5331205129623413, "learning_rate": 5.324930214256302e-06, "loss": 1.4278, "step": 2160 }, { "epoch": 1.720763723150358, "grad_norm": 1.3307032585144043, "learning_rate": 5.236662501836192e-06, "loss": 1.389, "step": 2163 }, { "epoch": 1.7231503579952268, "grad_norm": 1.5163909196853638, "learning_rate": 5.149092050626825e-06, "loss": 1.5462, "step": 2166 }, { "epoch": 1.7255369928400954, "grad_norm": 1.3799465894699097, "learning_rate": 5.062220224679276e-06, "loss": 1.3583, "step": 2169 }, { "epoch": 1.727923627684964, "grad_norm": 1.288385272026062, "learning_rate": 4.9760483771624236e-06, "loss": 1.401, "step": 2172 }, { "epoch": 1.7303102625298328, "grad_norm": 1.3018258810043335, "learning_rate": 4.89057785034181e-06, "loss": 1.3998, "step": 2175 }, { "epoch": 1.7326968973747017, "grad_norm": 1.4465045928955078, "learning_rate": 4.805809975558828e-06, "loss": 1.4118, "step": 2178 }, { "epoch": 1.7350835322195706, "grad_norm": 1.3118780851364136, "learning_rate": 4.721746073209893e-06, "loss": 1.3574, "step": 2181 }, { "epoch": 1.7374701670644392, "grad_norm": 1.653915524482727, "learning_rate": 4.6383874527259345e-06, "loss": 1.5086, "step": 2184 }, { "epoch": 1.739856801909308, "grad_norm": 1.3008549213409424, "learning_rate": 4.555735412551975e-06, "loss": 1.4131, "step": 2187 }, { "epoch": 1.7422434367541766, "grad_norm": 1.714281678199768, "learning_rate": 4.47379124012689e-06, "loss": 1.4335, "step": 2190 }, { "epoch": 1.7446300715990453, "grad_norm": 1.379380702972412, "learning_rate": 4.3925562118634135e-06, "loss": 1.4987, "step": 2193 }, { "epoch": 1.747016706443914, "grad_norm": 1.3546345233917236, "learning_rate": 4.312031593128163e-06, "loss": 1.5424, "step": 2196 }, { "epoch": 1.7494033412887828, "grad_norm": 1.4139165878295898, "learning_rate": 4.232218638222029e-06, "loss": 1.3599, "step": 2199 }, { "epoch": 1.7517899761336515, "grad_norm": 1.326416015625, "learning_rate": 4.153118590360561e-06, "loss": 1.3698, "step": 2202 }, { "epoch": 1.7541766109785204, "grad_norm": 1.2726656198501587, "learning_rate": 4.074732681654647e-06, "loss": 1.3478, "step": 2205 }, { "epoch": 1.756563245823389, "grad_norm": 1.295027494430542, "learning_rate": 3.997062133091284e-06, "loss": 1.3318, "step": 2208 }, { "epoch": 1.7589498806682577, "grad_norm": 1.240146279335022, "learning_rate": 3.920108154514585e-06, "loss": 1.2902, "step": 2211 }, { "epoch": 1.7613365155131264, "grad_norm": 1.2674936056137085, "learning_rate": 3.843871944606969e-06, "loss": 1.3331, "step": 2214 }, { "epoch": 1.763723150357995, "grad_norm": 1.4124763011932373, "learning_rate": 3.7683546908703903e-06, "loss": 1.432, "step": 2217 }, { "epoch": 1.766109785202864, "grad_norm": 1.3157274723052979, "learning_rate": 3.693557569607947e-06, "loss": 1.3372, "step": 2220 }, { "epoch": 1.7684964200477327, "grad_norm": 1.4040486812591553, "learning_rate": 3.6194817459054676e-06, "loss": 1.4255, "step": 2223 }, { "epoch": 1.7708830548926016, "grad_norm": 1.2398067712783813, "learning_rate": 3.5461283736134722e-06, "loss": 1.3448, "step": 2226 }, { "epoch": 1.7732696897374702, "grad_norm": 1.2771934270858765, "learning_rate": 3.4734985953290778e-06, "loss": 1.4079, "step": 2229 }, { "epoch": 1.775656324582339, "grad_norm": 1.1697748899459839, "learning_rate": 3.401593542378262e-06, "loss": 1.4184, "step": 2232 }, { "epoch": 1.7780429594272076, "grad_norm": 1.355383276939392, "learning_rate": 3.330414334798265e-06, "loss": 1.31, "step": 2235 }, { "epoch": 1.7804295942720763, "grad_norm": 1.2719684839248657, "learning_rate": 3.2599620813200837e-06, "loss": 1.4189, "step": 2238 }, { "epoch": 1.7828162291169452, "grad_norm": 1.4642714262008667, "learning_rate": 3.1902378793512657e-06, "loss": 1.4552, "step": 2241 }, { "epoch": 1.7852028639618138, "grad_norm": 1.2354283332824707, "learning_rate": 3.121242814958747e-06, "loss": 1.3951, "step": 2244 }, { "epoch": 1.7875894988066827, "grad_norm": 1.3236439228057861, "learning_rate": 3.0529779628519992e-06, "loss": 1.4105, "step": 2247 }, { "epoch": 1.7899761336515514, "grad_norm": 1.3168965578079224, "learning_rate": 2.9854443863662262e-06, "loss": 1.4434, "step": 2250 }, { "epoch": 1.79236276849642, "grad_norm": 2.0770509243011475, "learning_rate": 2.918643137445859e-06, "loss": 1.4209, "step": 2253 }, { "epoch": 1.7947494033412887, "grad_norm": 1.3210294246673584, "learning_rate": 2.8525752566281482e-06, "loss": 1.4219, "step": 2256 }, { "epoch": 1.7971360381861574, "grad_norm": 1.3929189443588257, "learning_rate": 2.787241773026933e-06, "loss": 1.4382, "step": 2259 }, { "epoch": 1.799522673031026, "grad_norm": 1.3046364784240723, "learning_rate": 2.722643704316652e-06, "loss": 1.4634, "step": 2262 }, { "epoch": 1.801909307875895, "grad_norm": 1.324135422706604, "learning_rate": 2.658782056716441e-06, "loss": 1.4041, "step": 2265 }, { "epoch": 1.8042959427207639, "grad_norm": 1.3277792930603027, "learning_rate": 2.5956578249745236e-06, "loss": 1.3838, "step": 2268 }, { "epoch": 1.8066825775656326, "grad_norm": 1.5090585947036743, "learning_rate": 2.533271992352659e-06, "loss": 1.4224, "step": 2271 }, { "epoch": 1.8090692124105012, "grad_norm": 1.2901520729064941, "learning_rate": 2.4716255306108605e-06, "loss": 1.3893, "step": 2274 }, { "epoch": 1.81145584725537, "grad_norm": 1.3586432933807373, "learning_rate": 2.4107193999922286e-06, "loss": 1.3146, "step": 2277 }, { "epoch": 1.8138424821002386, "grad_norm": 1.2898615598678589, "learning_rate": 2.3505545492080395e-06, "loss": 1.2849, "step": 2280 }, { "epoch": 1.8162291169451072, "grad_norm": 1.3389803171157837, "learning_rate": 2.291131915422917e-06, "loss": 1.3749, "step": 2283 }, { "epoch": 1.8186157517899761, "grad_norm": 1.3909133672714233, "learning_rate": 2.2324524242402613e-06, "loss": 1.4045, "step": 2286 }, { "epoch": 1.8210023866348448, "grad_norm": 1.992772102355957, "learning_rate": 2.1745169896878414e-06, "loss": 1.3947, "step": 2289 }, { "epoch": 1.8233890214797137, "grad_norm": 1.3467578887939453, "learning_rate": 2.117326514203527e-06, "loss": 1.4358, "step": 2292 }, { "epoch": 1.8257756563245824, "grad_norm": 1.316362738609314, "learning_rate": 2.0608818886212576e-06, "loss": 1.3924, "step": 2295 }, { "epoch": 1.828162291169451, "grad_norm": 1.303235411643982, "learning_rate": 2.0051839921571448e-06, "loss": 1.4439, "step": 2298 }, { "epoch": 1.8305489260143197, "grad_norm": 1.4706928730010986, "learning_rate": 1.9502336923958255e-06, "loss": 1.2834, "step": 2301 }, { "epoch": 1.8329355608591884, "grad_norm": 1.2729629278182983, "learning_rate": 1.8960318452768577e-06, "loss": 1.3582, "step": 2304 }, { "epoch": 1.8353221957040573, "grad_norm": 1.5145806074142456, "learning_rate": 1.8425792950814868e-06, "loss": 1.492, "step": 2307 }, { "epoch": 1.837708830548926, "grad_norm": 1.286137342453003, "learning_rate": 1.7898768744194162e-06, "loss": 1.4031, "step": 2310 }, { "epoch": 1.8400954653937949, "grad_norm": 1.309260368347168, "learning_rate": 1.7379254042158955e-06, "loss": 1.3831, "step": 2313 }, { "epoch": 1.8424821002386635, "grad_norm": 1.5175321102142334, "learning_rate": 1.6867256936989096e-06, "loss": 1.4786, "step": 2316 }, { "epoch": 1.8448687350835322, "grad_norm": 1.2964720726013184, "learning_rate": 1.6362785403865488e-06, "loss": 1.398, "step": 2319 }, { "epoch": 1.847255369928401, "grad_norm": 1.2901564836502075, "learning_rate": 1.5865847300746417e-06, "loss": 1.3338, "step": 2322 }, { "epoch": 1.8496420047732696, "grad_norm": 1.3462107181549072, "learning_rate": 1.5376450368244589e-06, "loss": 1.3809, "step": 2325 }, { "epoch": 1.8520286396181385, "grad_norm": 1.3782678842544556, "learning_rate": 1.4894602229506892e-06, "loss": 1.3993, "step": 2328 }, { "epoch": 1.8544152744630071, "grad_norm": 1.1769006252288818, "learning_rate": 1.4420310390095615e-06, "loss": 1.291, "step": 2331 }, { "epoch": 1.856801909307876, "grad_norm": 1.279466152191162, "learning_rate": 1.3953582237871521e-06, "loss": 1.4328, "step": 2334 }, { "epoch": 1.8591885441527447, "grad_norm": 1.3210725784301758, "learning_rate": 1.3494425042878622e-06, "loss": 1.4165, "step": 2337 }, { "epoch": 1.8615751789976134, "grad_norm": 1.393333911895752, "learning_rate": 1.3042845957231153e-06, "loss": 1.3581, "step": 2340 }, { "epoch": 1.863961813842482, "grad_norm": 1.3860208988189697, "learning_rate": 1.2598852015001994e-06, "loss": 1.3974, "step": 2343 }, { "epoch": 1.8663484486873507, "grad_norm": 1.3636202812194824, "learning_rate": 1.2162450132113201e-06, "loss": 1.5009, "step": 2346 }, { "epoch": 1.8687350835322196, "grad_norm": 1.2739925384521484, "learning_rate": 1.1733647106228375e-06, "loss": 1.4206, "step": 2349 }, { "epoch": 1.8711217183770883, "grad_norm": 2.2390100955963135, "learning_rate": 1.1312449616646403e-06, "loss": 1.3318, "step": 2352 }, { "epoch": 1.8735083532219572, "grad_norm": 1.1843377351760864, "learning_rate": 1.0898864224197946e-06, "loss": 1.289, "step": 2355 }, { "epoch": 1.8758949880668259, "grad_norm": 1.2790746688842773, "learning_rate": 1.049289737114273e-06, "loss": 1.4026, "step": 2358 }, { "epoch": 1.8782816229116945, "grad_norm": 1.1583595275878906, "learning_rate": 1.009455538106968e-06, "loss": 1.2771, "step": 2361 }, { "epoch": 1.8806682577565632, "grad_norm": 4.7106781005859375, "learning_rate": 9.703844458797962e-07, "loss": 1.4771, "step": 2364 }, { "epoch": 1.8830548926014319, "grad_norm": 1.3764762878417969, "learning_rate": 9.320770690280645e-07, "loss": 1.4295, "step": 2367 }, { "epoch": 1.8854415274463006, "grad_norm": 1.3544838428497314, "learning_rate": 8.945340042509797e-07, "loss": 1.405, "step": 2370 }, { "epoch": 1.8878281622911695, "grad_norm": 1.3262426853179932, "learning_rate": 8.577558363423554e-07, "loss": 1.4135, "step": 2373 }, { "epoch": 1.8902147971360383, "grad_norm": 1.2534743547439575, "learning_rate": 8.217431381815077e-07, "loss": 1.4168, "step": 2376 }, { "epoch": 1.892601431980907, "grad_norm": 1.4368257522583008, "learning_rate": 7.864964707243072e-07, "loss": 1.3518, "step": 2379 }, { "epoch": 1.8949880668257757, "grad_norm": 1.3582005500793457, "learning_rate": 7.520163829944804e-07, "loss": 1.3315, "step": 2382 }, { "epoch": 1.8973747016706444, "grad_norm": 1.4341644048690796, "learning_rate": 7.183034120750221e-07, "loss": 1.3689, "step": 2385 }, { "epoch": 1.899761336515513, "grad_norm": 1.368857741355896, "learning_rate": 6.85358083099863e-07, "loss": 1.384, "step": 2388 }, { "epoch": 1.9021479713603817, "grad_norm": 1.307237148284912, "learning_rate": 6.531809092456598e-07, "loss": 1.3101, "step": 2391 }, { "epoch": 1.9045346062052506, "grad_norm": 1.274276614189148, "learning_rate": 6.217723917238128e-07, "loss": 1.4943, "step": 2394 }, { "epoch": 1.9069212410501193, "grad_norm": 1.3081694841384888, "learning_rate": 5.911330197726661e-07, "loss": 1.3365, "step": 2397 }, { "epoch": 1.9093078758949882, "grad_norm": 1.2196581363677979, "learning_rate": 5.612632706498755e-07, "loss": 1.3927, "step": 2400 }, { "epoch": 1.9116945107398569, "grad_norm": 1.33291494846344, "learning_rate": 5.321636096249749e-07, "loss": 1.4337, "step": 2403 }, { "epoch": 1.9140811455847255, "grad_norm": 1.355838418006897, "learning_rate": 5.038344899721436e-07, "loss": 1.3511, "step": 2406 }, { "epoch": 1.9164677804295942, "grad_norm": 1.2884796857833862, "learning_rate": 4.762763529631342e-07, "loss": 1.3787, "step": 2409 }, { "epoch": 1.9188544152744629, "grad_norm": 1.4186152219772339, "learning_rate": 4.4948962786039437e-07, "loss": 1.4141, "step": 2412 }, { "epoch": 1.9212410501193318, "grad_norm": 1.1370840072631836, "learning_rate": 4.234747319103949e-07, "loss": 1.2792, "step": 2415 }, { "epoch": 1.9236276849642004, "grad_norm": 1.3362590074539185, "learning_rate": 3.9823207033710676e-07, "loss": 1.43, "step": 2418 }, { "epoch": 1.9260143198090693, "grad_norm": 1.2540643215179443, "learning_rate": 3.737620363357286e-07, "loss": 1.2947, "step": 2421 }, { "epoch": 1.928400954653938, "grad_norm": 1.3183681964874268, "learning_rate": 3.5006501106651937e-07, "loss": 1.3768, "step": 2424 }, { "epoch": 1.9307875894988067, "grad_norm": 1.2523298263549805, "learning_rate": 3.2714136364888073e-07, "loss": 1.3564, "step": 2427 }, { "epoch": 1.9331742243436754, "grad_norm": 1.7795356512069702, "learning_rate": 3.0499145115561176e-07, "loss": 1.518, "step": 2430 }, { "epoch": 1.935560859188544, "grad_norm": 1.2850757837295532, "learning_rate": 2.836156186073413e-07, "loss": 1.3588, "step": 2433 }, { "epoch": 1.937947494033413, "grad_norm": 1.30345618724823, "learning_rate": 2.630141989671542e-07, "loss": 1.3733, "step": 2436 }, { "epoch": 1.9403341288782816, "grad_norm": 1.3604350090026855, "learning_rate": 2.431875131354011e-07, "loss": 1.4814, "step": 2439 }, { "epoch": 1.9427207637231505, "grad_norm": 1.4250744581222534, "learning_rate": 2.2413586994470825e-07, "loss": 1.3531, "step": 2442 }, { "epoch": 1.9451073985680192, "grad_norm": 1.2653945684432983, "learning_rate": 2.0585956615515323e-07, "loss": 1.3951, "step": 2445 }, { "epoch": 1.9474940334128878, "grad_norm": 1.3960016965866089, "learning_rate": 1.8835888644966325e-07, "loss": 1.3927, "step": 2448 }, { "epoch": 1.9498806682577565, "grad_norm": 1.4156994819641113, "learning_rate": 1.7163410342956875e-07, "loss": 1.4211, "step": 2451 }, { "epoch": 1.9522673031026252, "grad_norm": 1.5609006881713867, "learning_rate": 1.5568547761034004e-07, "loss": 1.3577, "step": 2454 }, { "epoch": 1.9546539379474939, "grad_norm": 1.5074419975280762, "learning_rate": 1.4051325741756828e-07, "loss": 1.4627, "step": 2457 }, { "epoch": 1.9570405727923628, "grad_norm": 1.231116771697998, "learning_rate": 1.2611767918306316e-07, "loss": 1.3873, "step": 2460 }, { "epoch": 1.9594272076372317, "grad_norm": 1.2997921705245972, "learning_rate": 1.1249896714117802e-07, "loss": 1.3963, "step": 2463 }, { "epoch": 1.9618138424821003, "grad_norm": 1.2703529596328735, "learning_rate": 9.965733342532924e-08, "loss": 1.371, "step": 2466 }, { "epoch": 1.964200477326969, "grad_norm": 1.2716647386550903, "learning_rate": 8.759297806469335e-08, "loss": 1.3068, "step": 2469 }, { "epoch": 1.9665871121718377, "grad_norm": 1.36246657371521, "learning_rate": 7.630608898105962e-08, "loss": 1.3863, "step": 2472 }, { "epoch": 1.9689737470167064, "grad_norm": 1.224776029586792, "learning_rate": 6.579684198594338e-08, "loss": 1.3254, "step": 2475 }, { "epoch": 1.971360381861575, "grad_norm": 1.215903878211975, "learning_rate": 5.606540077782163e-08, "loss": 1.3006, "step": 2478 }, { "epoch": 1.973747016706444, "grad_norm": 1.5066276788711548, "learning_rate": 4.711191693959616e-08, "loss": 1.3676, "step": 2481 }, { "epoch": 1.9761336515513126, "grad_norm": 1.3745481967926025, "learning_rate": 3.893652993621766e-08, "loss": 1.372, "step": 2484 }, { "epoch": 1.9785202863961815, "grad_norm": 1.4172685146331787, "learning_rate": 3.1539367112543014e-08, "loss": 1.482, "step": 2487 }, { "epoch": 1.9809069212410502, "grad_norm": 1.3960031270980835, "learning_rate": 2.4920543691309138e-08, "loss": 1.3987, "step": 2490 }, { "epoch": 1.9832935560859188, "grad_norm": 1.2992031574249268, "learning_rate": 1.9080162771378808e-08, "loss": 1.3605, "step": 2493 }, { "epoch": 1.9856801909307875, "grad_norm": 1.2885679006576538, "learning_rate": 1.4018315326103094e-08, "loss": 1.3816, "step": 2496 }, { "epoch": 1.9880668257756562, "grad_norm": 1.2640992403030396, "learning_rate": 9.735080201922487e-09, "loss": 1.3142, "step": 2499 }, { "epoch": 1.990453460620525, "grad_norm": 1.2508050203323364, "learning_rate": 6.2305241171345395e-09, "loss": 1.4008, "step": 2502 }, { "epoch": 1.9928400954653938, "grad_norm": 1.2504699230194092, "learning_rate": 3.5047016608613647e-09, "loss": 1.3362, "step": 2505 }, { "epoch": 1.9952267303102627, "grad_norm": 1.4421279430389404, "learning_rate": 1.5576552921836574e-09, "loss": 1.4037, "step": 2508 }, { "epoch": 1.9976133651551313, "grad_norm": 1.2950758934020996, "learning_rate": 3.89415339491217e-10, "loss": 1.3817, "step": 2511 }, { "epoch": 2.0, "grad_norm": 1.6945881843566895, "learning_rate": 0.0, "loss": 1.4222, "step": 2514 } ], "logging_steps": 3, "max_steps": 2514, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1257, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.254235526619464e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }