diff --git "a/checkpoint-2514/trainer_state.json" "b/checkpoint-2514/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-2514/trainer_state.json" @@ -0,0 +1,5899 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 2514, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.002386634844868735, + "grad_norm": 87.45082092285156, + "learning_rate": 2.3809523809523808e-06, + "loss": 8.3691, + "step": 3 + }, + { + "epoch": 0.00477326968973747, + "grad_norm": 42.669490814208984, + "learning_rate": 4.7619047619047615e-06, + "loss": 7.0093, + "step": 6 + }, + { + "epoch": 0.007159904534606206, + "grad_norm": 24.51582145690918, + "learning_rate": 7.142857142857143e-06, + "loss": 5.8097, + "step": 9 + }, + { + "epoch": 0.00954653937947494, + "grad_norm": 16.242338180541992, + "learning_rate": 9.523809523809523e-06, + "loss": 4.978, + "step": 12 + }, + { + "epoch": 0.011933174224343675, + "grad_norm": 11.578532218933105, + "learning_rate": 1.1904761904761905e-05, + "loss": 4.8608, + "step": 15 + }, + { + "epoch": 0.014319809069212411, + "grad_norm": 14.094877243041992, + "learning_rate": 1.4285714285714285e-05, + "loss": 4.6646, + "step": 18 + }, + { + "epoch": 0.016706443914081145, + "grad_norm": 8.234806060791016, + "learning_rate": 1.6666666666666667e-05, + "loss": 4.428, + "step": 21 + }, + { + "epoch": 0.01909307875894988, + "grad_norm": 9.00922966003418, + "learning_rate": 1.9047619047619046e-05, + "loss": 4.2484, + "step": 24 + }, + { + "epoch": 0.021479713603818614, + "grad_norm": 8.009718894958496, + "learning_rate": 2.1428571428571428e-05, + "loss": 4.1826, + "step": 27 + }, + { + "epoch": 0.02386634844868735, + "grad_norm": 5.844778537750244, + "learning_rate": 2.380952380952381e-05, + "loss": 4.0247, + "step": 30 + }, + { + "epoch": 0.026252983293556086, + "grad_norm": 5.393738746643066, + "learning_rate": 2.6190476190476192e-05, + "loss": 3.6734, + "step": 33 + }, + { + "epoch": 0.028639618138424822, + "grad_norm": 5.676444053649902, + "learning_rate": 2.857142857142857e-05, + "loss": 3.621, + "step": 36 + }, + { + "epoch": 0.031026252983293555, + "grad_norm": 5.530320644378662, + "learning_rate": 3.095238095238095e-05, + "loss": 3.4785, + "step": 39 + }, + { + "epoch": 0.03341288782816229, + "grad_norm": 3.998183488845825, + "learning_rate": 3.3333333333333335e-05, + "loss": 3.2492, + "step": 42 + }, + { + "epoch": 0.03579952267303103, + "grad_norm": 5.9084553718566895, + "learning_rate": 3.571428571428572e-05, + "loss": 3.2145, + "step": 45 + }, + { + "epoch": 0.03818615751789976, + "grad_norm": 4.704463958740234, + "learning_rate": 3.809523809523809e-05, + "loss": 3.1231, + "step": 48 + }, + { + "epoch": 0.0405727923627685, + "grad_norm": 4.222476482391357, + "learning_rate": 4.047619047619048e-05, + "loss": 2.9818, + "step": 51 + }, + { + "epoch": 0.04295942720763723, + "grad_norm": 6.013700008392334, + "learning_rate": 4.2857142857142856e-05, + "loss": 3.0594, + "step": 54 + }, + { + "epoch": 0.045346062052505964, + "grad_norm": 5.095407009124756, + "learning_rate": 4.523809523809524e-05, + "loss": 2.9516, + "step": 57 + }, + { + "epoch": 0.0477326968973747, + "grad_norm": 4.415056228637695, + "learning_rate": 4.761904761904762e-05, + "loss": 2.8278, + "step": 60 + }, + { + "epoch": 0.050119331742243436, + "grad_norm": 4.036295413970947, + "learning_rate": 5e-05, + "loss": 2.7948, + "step": 63 + }, + { + "epoch": 0.05250596658711217, + "grad_norm": 4.796054840087891, + "learning_rate": 5.2380952380952384e-05, + "loss": 2.7323, + "step": 66 + }, + { + "epoch": 0.05489260143198091, + "grad_norm": 3.858863592147827, + "learning_rate": 5.4761904761904766e-05, + "loss": 2.5518, + "step": 69 + }, + { + "epoch": 0.057279236276849645, + "grad_norm": 2.7978243827819824, + "learning_rate": 5.714285714285714e-05, + "loss": 2.5852, + "step": 72 + }, + { + "epoch": 0.059665871121718374, + "grad_norm": 3.5701358318328857, + "learning_rate": 5.9523809523809524e-05, + "loss": 2.6322, + "step": 75 + }, + { + "epoch": 0.06205250596658711, + "grad_norm": 3.843608856201172, + "learning_rate": 6.19047619047619e-05, + "loss": 2.6322, + "step": 78 + }, + { + "epoch": 0.06443914081145585, + "grad_norm": 4.596922874450684, + "learning_rate": 6.428571428571429e-05, + "loss": 2.5773, + "step": 81 + }, + { + "epoch": 0.06682577565632458, + "grad_norm": 3.2659518718719482, + "learning_rate": 6.666666666666667e-05, + "loss": 2.5062, + "step": 84 + }, + { + "epoch": 0.06921241050119331, + "grad_norm": 2.8752944469451904, + "learning_rate": 6.904761904761905e-05, + "loss": 2.5073, + "step": 87 + }, + { + "epoch": 0.07159904534606205, + "grad_norm": 3.333749294281006, + "learning_rate": 7.142857142857143e-05, + "loss": 2.5105, + "step": 90 + }, + { + "epoch": 0.07398568019093078, + "grad_norm": 2.5306403636932373, + "learning_rate": 7.380952380952382e-05, + "loss": 2.5395, + "step": 93 + }, + { + "epoch": 0.07637231503579953, + "grad_norm": 2.989917278289795, + "learning_rate": 7.619047619047618e-05, + "loss": 2.4063, + "step": 96 + }, + { + "epoch": 0.07875894988066826, + "grad_norm": 3.6913046836853027, + "learning_rate": 7.857142857142858e-05, + "loss": 2.385, + "step": 99 + }, + { + "epoch": 0.081145584725537, + "grad_norm": 3.3200862407684326, + "learning_rate": 8.095238095238096e-05, + "loss": 2.3451, + "step": 102 + }, + { + "epoch": 0.08353221957040573, + "grad_norm": 3.1073644161224365, + "learning_rate": 8.333333333333334e-05, + "loss": 2.3661, + "step": 105 + }, + { + "epoch": 0.08591885441527446, + "grad_norm": 2.79823637008667, + "learning_rate": 8.571428571428571e-05, + "loss": 2.3642, + "step": 108 + }, + { + "epoch": 0.0883054892601432, + "grad_norm": 2.8037662506103516, + "learning_rate": 8.80952380952381e-05, + "loss": 2.3049, + "step": 111 + }, + { + "epoch": 0.09069212410501193, + "grad_norm": 2.33240008354187, + "learning_rate": 9.047619047619048e-05, + "loss": 2.4671, + "step": 114 + }, + { + "epoch": 0.09307875894988067, + "grad_norm": 2.2528834342956543, + "learning_rate": 9.285714285714286e-05, + "loss": 2.3363, + "step": 117 + }, + { + "epoch": 0.0954653937947494, + "grad_norm": 3.4800667762756348, + "learning_rate": 9.523809523809524e-05, + "loss": 2.3207, + "step": 120 + }, + { + "epoch": 0.09785202863961814, + "grad_norm": 2.395854949951172, + "learning_rate": 9.761904761904762e-05, + "loss": 2.3903, + "step": 123 + }, + { + "epoch": 0.10023866348448687, + "grad_norm": 3.5940475463867188, + "learning_rate": 0.0001, + "loss": 2.1336, + "step": 126 + }, + { + "epoch": 0.1026252983293556, + "grad_norm": 3.4392244815826416, + "learning_rate": 9.999961058466053e-05, + "loss": 2.3802, + "step": 129 + }, + { + "epoch": 0.10501193317422435, + "grad_norm": 2.595665216445923, + "learning_rate": 9.999844234470782e-05, + "loss": 2.2468, + "step": 132 + }, + { + "epoch": 0.10739856801909307, + "grad_norm": 2.5754427909851074, + "learning_rate": 9.999649529833915e-05, + "loss": 2.2207, + "step": 135 + }, + { + "epoch": 0.10978520286396182, + "grad_norm": 2.359457492828369, + "learning_rate": 9.999376947588288e-05, + "loss": 2.3501, + "step": 138 + }, + { + "epoch": 0.11217183770883055, + "grad_norm": 2.7424278259277344, + "learning_rate": 9.999026491979808e-05, + "loss": 2.3205, + "step": 141 + }, + { + "epoch": 0.11455847255369929, + "grad_norm": 2.8538436889648438, + "learning_rate": 9.99859816846739e-05, + "loss": 2.2671, + "step": 144 + }, + { + "epoch": 0.11694510739856802, + "grad_norm": 2.7669293880462646, + "learning_rate": 9.998091983722863e-05, + "loss": 2.2559, + "step": 147 + }, + { + "epoch": 0.11933174224343675, + "grad_norm": 2.62872314453125, + "learning_rate": 9.99750794563087e-05, + "loss": 2.1225, + "step": 150 + }, + { + "epoch": 0.12171837708830549, + "grad_norm": 3.0914721488952637, + "learning_rate": 9.996846063288747e-05, + "loss": 2.2146, + "step": 153 + }, + { + "epoch": 0.12410501193317422, + "grad_norm": 2.880335569381714, + "learning_rate": 9.996106347006379e-05, + "loss": 2.3611, + "step": 156 + }, + { + "epoch": 0.12649164677804295, + "grad_norm": 4.729184150695801, + "learning_rate": 9.99528880830604e-05, + "loss": 2.1814, + "step": 159 + }, + { + "epoch": 0.1288782816229117, + "grad_norm": 2.658954381942749, + "learning_rate": 9.994393459922218e-05, + "loss": 2.1278, + "step": 162 + }, + { + "epoch": 0.13126491646778043, + "grad_norm": 2.626692056655884, + "learning_rate": 9.993420315801406e-05, + "loss": 2.0775, + "step": 165 + }, + { + "epoch": 0.13365155131264916, + "grad_norm": 3.7431132793426514, + "learning_rate": 9.992369391101895e-05, + "loss": 2.1602, + "step": 168 + }, + { + "epoch": 0.1360381861575179, + "grad_norm": 2.7827000617980957, + "learning_rate": 9.991240702193532e-05, + "loss": 2.291, + "step": 171 + }, + { + "epoch": 0.13842482100238662, + "grad_norm": 2.846406936645508, + "learning_rate": 9.990034266657467e-05, + "loss": 2.1872, + "step": 174 + }, + { + "epoch": 0.14081145584725538, + "grad_norm": 3.0862011909484863, + "learning_rate": 9.988750103285883e-05, + "loss": 2.0943, + "step": 177 + }, + { + "epoch": 0.1431980906921241, + "grad_norm": 2.312624931335449, + "learning_rate": 9.987388232081694e-05, + "loss": 2.1579, + "step": 180 + }, + { + "epoch": 0.14558472553699284, + "grad_norm": 2.634638786315918, + "learning_rate": 9.985948674258243e-05, + "loss": 2.081, + "step": 183 + }, + { + "epoch": 0.14797136038186157, + "grad_norm": 3.84676194190979, + "learning_rate": 9.984431452238967e-05, + "loss": 2.2108, + "step": 186 + }, + { + "epoch": 0.15035799522673032, + "grad_norm": 3.062570810317993, + "learning_rate": 9.982836589657043e-05, + "loss": 2.1521, + "step": 189 + }, + { + "epoch": 0.15274463007159905, + "grad_norm": 2.0936121940612793, + "learning_rate": 9.981164111355035e-05, + "loss": 2.1847, + "step": 192 + }, + { + "epoch": 0.15513126491646778, + "grad_norm": 2.978933811187744, + "learning_rate": 9.979414043384485e-05, + "loss": 2.1839, + "step": 195 + }, + { + "epoch": 0.1575178997613365, + "grad_norm": 2.353105306625366, + "learning_rate": 9.977586413005531e-05, + "loss": 2.2415, + "step": 198 + }, + { + "epoch": 0.15990453460620524, + "grad_norm": 2.6958158016204834, + "learning_rate": 9.975681248686461e-05, + "loss": 2.2367, + "step": 201 + }, + { + "epoch": 0.162291169451074, + "grad_norm": 2.262739658355713, + "learning_rate": 9.973698580103285e-05, + "loss": 2.1569, + "step": 204 + }, + { + "epoch": 0.16467780429594273, + "grad_norm": 2.56048583984375, + "learning_rate": 9.971638438139266e-05, + "loss": 2.3021, + "step": 207 + }, + { + "epoch": 0.16706443914081145, + "grad_norm": 4.17548942565918, + "learning_rate": 9.96950085488444e-05, + "loss": 2.028, + "step": 210 + }, + { + "epoch": 0.16945107398568018, + "grad_norm": 4.712838649749756, + "learning_rate": 9.967285863635112e-05, + "loss": 2.2895, + "step": 213 + }, + { + "epoch": 0.1718377088305489, + "grad_norm": 2.642404556274414, + "learning_rate": 9.964993498893349e-05, + "loss": 2.1517, + "step": 216 + }, + { + "epoch": 0.17422434367541767, + "grad_norm": 3.9263668060302734, + "learning_rate": 9.962623796366429e-05, + "loss": 2.1385, + "step": 219 + }, + { + "epoch": 0.1766109785202864, + "grad_norm": 2.9646389484405518, + "learning_rate": 9.960176792966289e-05, + "loss": 2.1171, + "step": 222 + }, + { + "epoch": 0.17899761336515513, + "grad_norm": 2.0947139263153076, + "learning_rate": 9.95765252680896e-05, + "loss": 2.174, + "step": 225 + }, + { + "epoch": 0.18138424821002386, + "grad_norm": 2.083189010620117, + "learning_rate": 9.95505103721396e-05, + "loss": 2.1498, + "step": 228 + }, + { + "epoch": 0.18377088305489261, + "grad_norm": 2.691925048828125, + "learning_rate": 9.952372364703687e-05, + "loss": 2.2291, + "step": 231 + }, + { + "epoch": 0.18615751789976134, + "grad_norm": 3.0350098609924316, + "learning_rate": 9.949616551002787e-05, + "loss": 2.1164, + "step": 234 + }, + { + "epoch": 0.18854415274463007, + "grad_norm": 2.8906643390655518, + "learning_rate": 9.946783639037504e-05, + "loss": 2.0806, + "step": 237 + }, + { + "epoch": 0.1909307875894988, + "grad_norm": 3.903940439224243, + "learning_rate": 9.943873672935014e-05, + "loss": 2.1149, + "step": 240 + }, + { + "epoch": 0.19331742243436753, + "grad_norm": 2.354952096939087, + "learning_rate": 9.940886698022734e-05, + "loss": 2.0019, + "step": 243 + }, + { + "epoch": 0.1957040572792363, + "grad_norm": 2.914287805557251, + "learning_rate": 9.93782276082762e-05, + "loss": 2.1007, + "step": 246 + }, + { + "epoch": 0.19809069212410502, + "grad_norm": 2.2259702682495117, + "learning_rate": 9.934681909075434e-05, + "loss": 2.0461, + "step": 249 + }, + { + "epoch": 0.20047732696897375, + "grad_norm": 2.6093738079071045, + "learning_rate": 9.931464191690015e-05, + "loss": 2.125, + "step": 252 + }, + { + "epoch": 0.20286396181384247, + "grad_norm": 2.4364943504333496, + "learning_rate": 9.928169658792498e-05, + "loss": 2.1239, + "step": 255 + }, + { + "epoch": 0.2052505966587112, + "grad_norm": 4.371973514556885, + "learning_rate": 9.924798361700553e-05, + "loss": 2.0904, + "step": 258 + }, + { + "epoch": 0.20763723150357996, + "grad_norm": 2.1466689109802246, + "learning_rate": 9.92135035292757e-05, + "loss": 2.1023, + "step": 261 + }, + { + "epoch": 0.2100238663484487, + "grad_norm": 2.1940388679504395, + "learning_rate": 9.91782568618185e-05, + "loss": 2.1517, + "step": 264 + }, + { + "epoch": 0.21241050119331742, + "grad_norm": 2.100590229034424, + "learning_rate": 9.914224416365764e-05, + "loss": 2.1644, + "step": 267 + }, + { + "epoch": 0.21479713603818615, + "grad_norm": 2.8541860580444336, + "learning_rate": 9.910546599574902e-05, + "loss": 2.3204, + "step": 270 + }, + { + "epoch": 0.2171837708830549, + "grad_norm": 4.037013530731201, + "learning_rate": 9.906792293097194e-05, + "loss": 2.1066, + "step": 273 + }, + { + "epoch": 0.21957040572792363, + "grad_norm": 2.3364968299865723, + "learning_rate": 9.90296155541202e-05, + "loss": 2.1277, + "step": 276 + }, + { + "epoch": 0.22195704057279236, + "grad_norm": 2.3627822399139404, + "learning_rate": 9.899054446189304e-05, + "loss": 1.954, + "step": 279 + }, + { + "epoch": 0.2243436754176611, + "grad_norm": 2.593341588973999, + "learning_rate": 9.895071026288574e-05, + "loss": 2.0014, + "step": 282 + }, + { + "epoch": 0.22673031026252982, + "grad_norm": 2.349030017852783, + "learning_rate": 9.891011357758022e-05, + "loss": 1.9998, + "step": 285 + }, + { + "epoch": 0.22911694510739858, + "grad_norm": 2.243206024169922, + "learning_rate": 9.886875503833536e-05, + "loss": 2.1566, + "step": 288 + }, + { + "epoch": 0.2315035799522673, + "grad_norm": 2.4595718383789062, + "learning_rate": 9.882663528937717e-05, + "loss": 2.0575, + "step": 291 + }, + { + "epoch": 0.23389021479713604, + "grad_norm": 2.3169078826904297, + "learning_rate": 9.87837549867887e-05, + "loss": 1.9049, + "step": 294 + }, + { + "epoch": 0.23627684964200477, + "grad_norm": 2.022441864013672, + "learning_rate": 9.87401147984998e-05, + "loss": 2.1902, + "step": 297 + }, + { + "epoch": 0.2386634844868735, + "grad_norm": 2.343163251876831, + "learning_rate": 9.869571540427689e-05, + "loss": 2.0831, + "step": 300 + }, + { + "epoch": 0.24105011933174225, + "grad_norm": 2.565887212753296, + "learning_rate": 9.865055749571215e-05, + "loss": 2.1372, + "step": 303 + }, + { + "epoch": 0.24343675417661098, + "grad_norm": 2.249260902404785, + "learning_rate": 9.860464177621284e-05, + "loss": 1.8908, + "step": 306 + }, + { + "epoch": 0.2458233890214797, + "grad_norm": 2.274188280105591, + "learning_rate": 9.855796896099045e-05, + "loss": 2.156, + "step": 309 + }, + { + "epoch": 0.24821002386634844, + "grad_norm": 2.4920082092285156, + "learning_rate": 9.851053977704931e-05, + "loss": 1.9275, + "step": 312 + }, + { + "epoch": 0.25059665871121717, + "grad_norm": 2.18017578125, + "learning_rate": 9.846235496317555e-05, + "loss": 1.9534, + "step": 315 + }, + { + "epoch": 0.2529832935560859, + "grad_norm": 2.2087161540985107, + "learning_rate": 9.841341526992536e-05, + "loss": 2.0569, + "step": 318 + }, + { + "epoch": 0.2553699284009546, + "grad_norm": 2.050931215286255, + "learning_rate": 9.836372145961345e-05, + "loss": 2.117, + "step": 321 + }, + { + "epoch": 0.2577565632458234, + "grad_norm": 2.36757755279541, + "learning_rate": 9.83132743063011e-05, + "loss": 2.1293, + "step": 324 + }, + { + "epoch": 0.26014319809069214, + "grad_norm": 2.284879207611084, + "learning_rate": 9.826207459578411e-05, + "loss": 1.9431, + "step": 327 + }, + { + "epoch": 0.26252983293556087, + "grad_norm": 2.449686050415039, + "learning_rate": 9.821012312558058e-05, + "loss": 1.8996, + "step": 330 + }, + { + "epoch": 0.2649164677804296, + "grad_norm": 1.8972541093826294, + "learning_rate": 9.815742070491852e-05, + "loss": 2.0876, + "step": 333 + }, + { + "epoch": 0.26730310262529833, + "grad_norm": 2.3827104568481445, + "learning_rate": 9.810396815472314e-05, + "loss": 1.9801, + "step": 336 + }, + { + "epoch": 0.26968973747016706, + "grad_norm": 2.1279213428497314, + "learning_rate": 9.804976630760419e-05, + "loss": 2.0433, + "step": 339 + }, + { + "epoch": 0.2720763723150358, + "grad_norm": 2.0066232681274414, + "learning_rate": 9.799481600784286e-05, + "loss": 1.9296, + "step": 342 + }, + { + "epoch": 0.2744630071599045, + "grad_norm": 2.5718936920166016, + "learning_rate": 9.793911811137875e-05, + "loss": 2.0074, + "step": 345 + }, + { + "epoch": 0.27684964200477324, + "grad_norm": 2.3425660133361816, + "learning_rate": 9.788267348579648e-05, + "loss": 2.0321, + "step": 348 + }, + { + "epoch": 0.27923627684964203, + "grad_norm": 2.5972917079925537, + "learning_rate": 9.782548301031217e-05, + "loss": 2.0536, + "step": 351 + }, + { + "epoch": 0.28162291169451076, + "grad_norm": 2.15671706199646, + "learning_rate": 9.776754757575975e-05, + "loss": 1.9679, + "step": 354 + }, + { + "epoch": 0.2840095465393795, + "grad_norm": 2.41680645942688, + "learning_rate": 9.770886808457709e-05, + "loss": 2.029, + "step": 357 + }, + { + "epoch": 0.2863961813842482, + "grad_norm": 2.8900387287139893, + "learning_rate": 9.764944545079196e-05, + "loss": 2.1488, + "step": 360 + }, + { + "epoch": 0.28878281622911695, + "grad_norm": 2.2235772609710693, + "learning_rate": 9.758928060000778e-05, + "loss": 2.0734, + "step": 363 + }, + { + "epoch": 0.2911694510739857, + "grad_norm": 2.1928718090057373, + "learning_rate": 9.752837446938915e-05, + "loss": 1.907, + "step": 366 + }, + { + "epoch": 0.2935560859188544, + "grad_norm": 2.063851833343506, + "learning_rate": 9.746672800764735e-05, + "loss": 1.9107, + "step": 369 + }, + { + "epoch": 0.29594272076372313, + "grad_norm": 2.3695101737976074, + "learning_rate": 9.740434217502547e-05, + "loss": 1.9469, + "step": 372 + }, + { + "epoch": 0.29832935560859186, + "grad_norm": 2.556959867477417, + "learning_rate": 9.734121794328357e-05, + "loss": 2.0305, + "step": 375 + }, + { + "epoch": 0.30071599045346065, + "grad_norm": 2.1321029663085938, + "learning_rate": 9.727735629568336e-05, + "loss": 2.0459, + "step": 378 + }, + { + "epoch": 0.3031026252983294, + "grad_norm": 1.8276643753051758, + "learning_rate": 9.721275822697306e-05, + "loss": 1.9669, + "step": 381 + }, + { + "epoch": 0.3054892601431981, + "grad_norm": 2.0436830520629883, + "learning_rate": 9.714742474337186e-05, + "loss": 2.0737, + "step": 384 + }, + { + "epoch": 0.30787589498806683, + "grad_norm": 2.2903330326080322, + "learning_rate": 9.708135686255416e-05, + "loss": 2.1781, + "step": 387 + }, + { + "epoch": 0.31026252983293556, + "grad_norm": 2.438894510269165, + "learning_rate": 9.701455561363379e-05, + "loss": 1.9024, + "step": 390 + }, + { + "epoch": 0.3126491646778043, + "grad_norm": 2.9679908752441406, + "learning_rate": 9.6947022037148e-05, + "loss": 1.9496, + "step": 393 + }, + { + "epoch": 0.315035799522673, + "grad_norm": 2.693708896636963, + "learning_rate": 9.687875718504126e-05, + "loss": 1.9314, + "step": 396 + }, + { + "epoch": 0.31742243436754175, + "grad_norm": 3.0519704818725586, + "learning_rate": 9.680976212064874e-05, + "loss": 2.0026, + "step": 399 + }, + { + "epoch": 0.3198090692124105, + "grad_norm": 1.8532629013061523, + "learning_rate": 9.674003791867991e-05, + "loss": 2.0183, + "step": 402 + }, + { + "epoch": 0.3221957040572792, + "grad_norm": 2.272416353225708, + "learning_rate": 9.666958566520174e-05, + "loss": 2.0488, + "step": 405 + }, + { + "epoch": 0.324582338902148, + "grad_norm": 2.514479637145996, + "learning_rate": 9.659840645762175e-05, + "loss": 2.0881, + "step": 408 + }, + { + "epoch": 0.3269689737470167, + "grad_norm": 2.127908945083618, + "learning_rate": 9.652650140467093e-05, + "loss": 2.0058, + "step": 411 + }, + { + "epoch": 0.32935560859188545, + "grad_norm": 2.1128835678100586, + "learning_rate": 9.645387162638652e-05, + "loss": 2.0027, + "step": 414 + }, + { + "epoch": 0.3317422434367542, + "grad_norm": 2.1104748249053955, + "learning_rate": 9.638051825409453e-05, + "loss": 2.1834, + "step": 417 + }, + { + "epoch": 0.3341288782816229, + "grad_norm": 1.9978591203689575, + "learning_rate": 9.630644243039207e-05, + "loss": 1.9216, + "step": 420 + }, + { + "epoch": 0.33651551312649164, + "grad_norm": 2.1656672954559326, + "learning_rate": 9.623164530912963e-05, + "loss": 2.1389, + "step": 423 + }, + { + "epoch": 0.33890214797136037, + "grad_norm": 1.7347074747085571, + "learning_rate": 9.615612805539305e-05, + "loss": 1.8929, + "step": 426 + }, + { + "epoch": 0.3412887828162291, + "grad_norm": 2.3430702686309814, + "learning_rate": 9.607989184548543e-05, + "loss": 2.0085, + "step": 429 + }, + { + "epoch": 0.3436754176610978, + "grad_norm": 1.9454632997512817, + "learning_rate": 9.600293786690872e-05, + "loss": 2.0007, + "step": 432 + }, + { + "epoch": 0.3460620525059666, + "grad_norm": 1.9286153316497803, + "learning_rate": 9.592526731834537e-05, + "loss": 2.1495, + "step": 435 + }, + { + "epoch": 0.34844868735083534, + "grad_norm": 2.3506131172180176, + "learning_rate": 9.584688140963944e-05, + "loss": 1.8264, + "step": 438 + }, + { + "epoch": 0.35083532219570407, + "grad_norm": 2.0399367809295654, + "learning_rate": 9.576778136177798e-05, + "loss": 2.0056, + "step": 441 + }, + { + "epoch": 0.3532219570405728, + "grad_norm": 2.3488686084747314, + "learning_rate": 9.568796840687184e-05, + "loss": 1.8853, + "step": 444 + }, + { + "epoch": 0.3556085918854415, + "grad_norm": 2.3103203773498535, + "learning_rate": 9.560744378813659e-05, + "loss": 1.9849, + "step": 447 + }, + { + "epoch": 0.35799522673031026, + "grad_norm": 2.4852659702301025, + "learning_rate": 9.552620875987311e-05, + "loss": 1.9174, + "step": 450 + }, + { + "epoch": 0.360381861575179, + "grad_norm": 2.327434778213501, + "learning_rate": 9.544426458744804e-05, + "loss": 1.8321, + "step": 453 + }, + { + "epoch": 0.3627684964200477, + "grad_norm": 2.1217284202575684, + "learning_rate": 9.536161254727408e-05, + "loss": 1.8995, + "step": 456 + }, + { + "epoch": 0.36515513126491644, + "grad_norm": 2.4897334575653076, + "learning_rate": 9.527825392679012e-05, + "loss": 1.8949, + "step": 459 + }, + { + "epoch": 0.36754176610978523, + "grad_norm": 2.132058620452881, + "learning_rate": 9.51941900244412e-05, + "loss": 1.8962, + "step": 462 + }, + { + "epoch": 0.36992840095465396, + "grad_norm": 1.8956185579299927, + "learning_rate": 9.51094221496582e-05, + "loss": 1.9098, + "step": 465 + }, + { + "epoch": 0.3723150357995227, + "grad_norm": 2.0302867889404297, + "learning_rate": 9.502395162283759e-05, + "loss": 1.7664, + "step": 468 + }, + { + "epoch": 0.3747016706443914, + "grad_norm": 2.364084243774414, + "learning_rate": 9.493777977532072e-05, + "loss": 1.932, + "step": 471 + }, + { + "epoch": 0.37708830548926014, + "grad_norm": 2.0411782264709473, + "learning_rate": 9.485090794937319e-05, + "loss": 1.887, + "step": 474 + }, + { + "epoch": 0.3794749403341289, + "grad_norm": 2.052417755126953, + "learning_rate": 9.476333749816382e-05, + "loss": 1.8125, + "step": 477 + }, + { + "epoch": 0.3818615751789976, + "grad_norm": 2.2700257301330566, + "learning_rate": 9.467506978574371e-05, + "loss": 1.8167, + "step": 480 + }, + { + "epoch": 0.38424821002386633, + "grad_norm": 1.9266332387924194, + "learning_rate": 9.45861061870249e-05, + "loss": 1.9767, + "step": 483 + }, + { + "epoch": 0.38663484486873506, + "grad_norm": 2.934418201446533, + "learning_rate": 9.449644808775902e-05, + "loss": 1.8826, + "step": 486 + }, + { + "epoch": 0.38902147971360385, + "grad_norm": 1.8543293476104736, + "learning_rate": 9.44060968845156e-05, + "loss": 1.874, + "step": 489 + }, + { + "epoch": 0.3914081145584726, + "grad_norm": 2.2420761585235596, + "learning_rate": 9.431505398466045e-05, + "loss": 1.9835, + "step": 492 + }, + { + "epoch": 0.3937947494033413, + "grad_norm": 2.448664665222168, + "learning_rate": 9.42233208063336e-05, + "loss": 1.7932, + "step": 495 + }, + { + "epoch": 0.39618138424821003, + "grad_norm": 2.271794557571411, + "learning_rate": 9.413089877842736e-05, + "loss": 2.0198, + "step": 498 + }, + { + "epoch": 0.39856801909307876, + "grad_norm": 2.7900123596191406, + "learning_rate": 9.403778934056391e-05, + "loss": 1.9902, + "step": 501 + }, + { + "epoch": 0.4009546539379475, + "grad_norm": 1.9538331031799316, + "learning_rate": 9.394399394307303e-05, + "loss": 2.1558, + "step": 504 + }, + { + "epoch": 0.4033412887828162, + "grad_norm": 4.123261451721191, + "learning_rate": 9.384951404696933e-05, + "loss": 1.8325, + "step": 507 + }, + { + "epoch": 0.40572792362768495, + "grad_norm": 2.156297206878662, + "learning_rate": 9.375435112392969e-05, + "loss": 2.0274, + "step": 510 + }, + { + "epoch": 0.4081145584725537, + "grad_norm": 2.066129207611084, + "learning_rate": 9.365850665627016e-05, + "loss": 1.8711, + "step": 513 + }, + { + "epoch": 0.4105011933174224, + "grad_norm": 1.8385334014892578, + "learning_rate": 9.356198213692297e-05, + "loss": 1.8337, + "step": 516 + }, + { + "epoch": 0.4128878281622912, + "grad_norm": 2.1893041133880615, + "learning_rate": 9.346477906941331e-05, + "loss": 1.784, + "step": 519 + }, + { + "epoch": 0.4152744630071599, + "grad_norm": 2.0097122192382812, + "learning_rate": 9.336689896783573e-05, + "loss": 1.8381, + "step": 522 + }, + { + "epoch": 0.41766109785202865, + "grad_norm": 2.2761220932006836, + "learning_rate": 9.32683433568308e-05, + "loss": 1.876, + "step": 525 + }, + { + "epoch": 0.4200477326968974, + "grad_norm": 2.7852184772491455, + "learning_rate": 9.316911377156117e-05, + "loss": 1.9546, + "step": 528 + }, + { + "epoch": 0.4224343675417661, + "grad_norm": 1.915184497833252, + "learning_rate": 9.306921175768775e-05, + "loss": 1.8832, + "step": 531 + }, + { + "epoch": 0.42482100238663484, + "grad_norm": 2.2405636310577393, + "learning_rate": 9.29686388713456e-05, + "loss": 2.097, + "step": 534 + }, + { + "epoch": 0.42720763723150357, + "grad_norm": 1.9388247728347778, + "learning_rate": 9.286739667911972e-05, + "loss": 1.923, + "step": 537 + }, + { + "epoch": 0.4295942720763723, + "grad_norm": 1.7430307865142822, + "learning_rate": 9.276548675802059e-05, + "loss": 1.962, + "step": 540 + }, + { + "epoch": 0.431980906921241, + "grad_norm": 1.7537423372268677, + "learning_rate": 9.266291069545972e-05, + "loss": 1.8785, + "step": 543 + }, + { + "epoch": 0.4343675417661098, + "grad_norm": 1.791382074356079, + "learning_rate": 9.255967008922474e-05, + "loss": 1.9461, + "step": 546 + }, + { + "epoch": 0.43675417661097854, + "grad_norm": 1.7727302312850952, + "learning_rate": 9.245576654745471e-05, + "loss": 2.0444, + "step": 549 + }, + { + "epoch": 0.43914081145584727, + "grad_norm": 2.0288734436035156, + "learning_rate": 9.235120168861496e-05, + "loss": 1.7547, + "step": 552 + }, + { + "epoch": 0.441527446300716, + "grad_norm": 2.7705018520355225, + "learning_rate": 9.224597714147186e-05, + "loss": 1.9798, + "step": 555 + }, + { + "epoch": 0.4439140811455847, + "grad_norm": 1.8357696533203125, + "learning_rate": 9.214009454506753e-05, + "loss": 1.8111, + "step": 558 + }, + { + "epoch": 0.44630071599045346, + "grad_norm": 2.0281825065612793, + "learning_rate": 9.203355554869428e-05, + "loss": 1.8906, + "step": 561 + }, + { + "epoch": 0.4486873508353222, + "grad_norm": 2.804478406906128, + "learning_rate": 9.192636181186888e-05, + "loss": 1.722, + "step": 564 + }, + { + "epoch": 0.4510739856801909, + "grad_norm": 1.8126853704452515, + "learning_rate": 9.181851500430673e-05, + "loss": 1.7653, + "step": 567 + }, + { + "epoch": 0.45346062052505964, + "grad_norm": 1.9144084453582764, + "learning_rate": 9.171001680589588e-05, + "loss": 1.8009, + "step": 570 + }, + { + "epoch": 0.45584725536992843, + "grad_norm": 2.006549835205078, + "learning_rate": 9.160086890667086e-05, + "loss": 1.8403, + "step": 573 + }, + { + "epoch": 0.45823389021479716, + "grad_norm": 2.879286766052246, + "learning_rate": 9.14910730067863e-05, + "loss": 1.9097, + "step": 576 + }, + { + "epoch": 0.4606205250596659, + "grad_norm": 1.752193808555603, + "learning_rate": 9.138063081649051e-05, + "loss": 2.0254, + "step": 579 + }, + { + "epoch": 0.4630071599045346, + "grad_norm": 1.8021249771118164, + "learning_rate": 9.126954405609882e-05, + "loss": 1.8794, + "step": 582 + }, + { + "epoch": 0.46539379474940334, + "grad_norm": 1.8375767469406128, + "learning_rate": 9.115781445596676e-05, + "loss": 1.7489, + "step": 585 + }, + { + "epoch": 0.4677804295942721, + "grad_norm": 2.183716297149658, + "learning_rate": 9.104544375646313e-05, + "loss": 2.1149, + "step": 588 + }, + { + "epoch": 0.4701670644391408, + "grad_norm": 2.1002678871154785, + "learning_rate": 9.093243370794291e-05, + "loss": 1.8637, + "step": 591 + }, + { + "epoch": 0.47255369928400953, + "grad_norm": 2.2214958667755127, + "learning_rate": 9.081878607071996e-05, + "loss": 1.9605, + "step": 594 + }, + { + "epoch": 0.47494033412887826, + "grad_norm": 1.9720337390899658, + "learning_rate": 9.07045026150396e-05, + "loss": 1.8876, + "step": 597 + }, + { + "epoch": 0.477326968973747, + "grad_norm": 1.9128150939941406, + "learning_rate": 9.058958512105104e-05, + "loss": 1.8217, + "step": 600 + }, + { + "epoch": 0.4797136038186158, + "grad_norm": 2.2358174324035645, + "learning_rate": 9.047403537877971e-05, + "loss": 1.8511, + "step": 603 + }, + { + "epoch": 0.4821002386634845, + "grad_norm": 1.7367171049118042, + "learning_rate": 9.035785518809927e-05, + "loss": 1.9224, + "step": 606 + }, + { + "epoch": 0.48448687350835323, + "grad_norm": 2.0542192459106445, + "learning_rate": 9.024104635870368e-05, + "loss": 1.8781, + "step": 609 + }, + { + "epoch": 0.48687350835322196, + "grad_norm": 5.130634784698486, + "learning_rate": 9.012361071007891e-05, + "loss": 1.8786, + "step": 612 + }, + { + "epoch": 0.4892601431980907, + "grad_norm": 2.5925798416137695, + "learning_rate": 9.000555007147469e-05, + "loss": 1.893, + "step": 615 + }, + { + "epoch": 0.4916467780429594, + "grad_norm": 2.019935369491577, + "learning_rate": 8.988686628187597e-05, + "loss": 1.8782, + "step": 618 + }, + { + "epoch": 0.49403341288782815, + "grad_norm": 1.9143917560577393, + "learning_rate": 8.976756118997427e-05, + "loss": 1.9076, + "step": 621 + }, + { + "epoch": 0.4964200477326969, + "grad_norm": 2.5640761852264404, + "learning_rate": 8.964763665413893e-05, + "loss": 1.7351, + "step": 624 + }, + { + "epoch": 0.4988066825775656, + "grad_norm": 2.209772825241089, + "learning_rate": 8.952709454238808e-05, + "loss": 1.9958, + "step": 627 + }, + { + "epoch": 0.5011933174224343, + "grad_norm": 2.1098861694335938, + "learning_rate": 8.940593673235962e-05, + "loss": 1.9609, + "step": 630 + }, + { + "epoch": 0.5035799522673031, + "grad_norm": 2.109833002090454, + "learning_rate": 8.928416511128195e-05, + "loss": 1.7695, + "step": 633 + }, + { + "epoch": 0.5059665871121718, + "grad_norm": 1.9139786958694458, + "learning_rate": 8.916178157594453e-05, + "loss": 1.8483, + "step": 636 + }, + { + "epoch": 0.5083532219570406, + "grad_norm": 1.9831109046936035, + "learning_rate": 8.903878803266841e-05, + "loss": 2.0499, + "step": 639 + }, + { + "epoch": 0.5107398568019093, + "grad_norm": 1.6918339729309082, + "learning_rate": 8.891518639727649e-05, + "loss": 1.7375, + "step": 642 + }, + { + "epoch": 0.513126491646778, + "grad_norm": 2.150010585784912, + "learning_rate": 8.879097859506372e-05, + "loss": 1.9052, + "step": 645 + }, + { + "epoch": 0.5155131264916468, + "grad_norm": 1.9660903215408325, + "learning_rate": 8.866616656076696e-05, + "loss": 1.787, + "step": 648 + }, + { + "epoch": 0.5178997613365155, + "grad_norm": 2.0688490867614746, + "learning_rate": 8.854075223853508e-05, + "loss": 1.9195, + "step": 651 + }, + { + "epoch": 0.5202863961813843, + "grad_norm": 1.7813605070114136, + "learning_rate": 8.841473758189854e-05, + "loss": 1.779, + "step": 654 + }, + { + "epoch": 0.522673031026253, + "grad_norm": 2.305001735687256, + "learning_rate": 8.828812455373891e-05, + "loss": 1.9889, + "step": 657 + }, + { + "epoch": 0.5250596658711217, + "grad_norm": 1.8771156072616577, + "learning_rate": 8.816091512625843e-05, + "loss": 1.8912, + "step": 660 + }, + { + "epoch": 0.5274463007159904, + "grad_norm": 2.458179473876953, + "learning_rate": 8.803311128094918e-05, + "loss": 1.9203, + "step": 663 + }, + { + "epoch": 0.5298329355608592, + "grad_norm": 2.151304244995117, + "learning_rate": 8.790471500856228e-05, + "loss": 2.0082, + "step": 666 + }, + { + "epoch": 0.5322195704057279, + "grad_norm": 1.7993594408035278, + "learning_rate": 8.777572830907684e-05, + "loss": 2.0773, + "step": 669 + }, + { + "epoch": 0.5346062052505967, + "grad_norm": 1.6973276138305664, + "learning_rate": 8.764615319166886e-05, + "loss": 2.0038, + "step": 672 + }, + { + "epoch": 0.5369928400954654, + "grad_norm": 1.8245609998703003, + "learning_rate": 8.751599167467985e-05, + "loss": 1.7664, + "step": 675 + }, + { + "epoch": 0.5393794749403341, + "grad_norm": 1.7999241352081299, + "learning_rate": 8.738524578558547e-05, + "loss": 1.8544, + "step": 678 + }, + { + "epoch": 0.5417661097852029, + "grad_norm": 1.9764189720153809, + "learning_rate": 8.72539175609639e-05, + "loss": 1.7731, + "step": 681 + }, + { + "epoch": 0.5441527446300716, + "grad_norm": 1.9319300651550293, + "learning_rate": 8.712200904646416e-05, + "loss": 1.7558, + "step": 684 + }, + { + "epoch": 0.5465393794749404, + "grad_norm": 1.6623660326004028, + "learning_rate": 8.698952229677422e-05, + "loss": 1.7731, + "step": 687 + }, + { + "epoch": 0.548926014319809, + "grad_norm": 2.21167254447937, + "learning_rate": 8.685645937558896e-05, + "loss": 2.0473, + "step": 690 + }, + { + "epoch": 0.5513126491646778, + "grad_norm": 7.886738300323486, + "learning_rate": 8.67228223555781e-05, + "loss": 1.7793, + "step": 693 + }, + { + "epoch": 0.5536992840095465, + "grad_norm": 1.9568718671798706, + "learning_rate": 8.658861331835385e-05, + "loss": 1.8811, + "step": 696 + }, + { + "epoch": 0.5560859188544153, + "grad_norm": 1.9484925270080566, + "learning_rate": 8.645383435443852e-05, + "loss": 1.7875, + "step": 699 + }, + { + "epoch": 0.5584725536992841, + "grad_norm": 1.946845293045044, + "learning_rate": 8.631848756323197e-05, + "loss": 1.8602, + "step": 702 + }, + { + "epoch": 0.5608591885441527, + "grad_norm": 1.8822824954986572, + "learning_rate": 8.618257505297886e-05, + "loss": 1.873, + "step": 705 + }, + { + "epoch": 0.5632458233890215, + "grad_norm": 1.9031224250793457, + "learning_rate": 8.604609894073584e-05, + "loss": 1.6809, + "step": 708 + }, + { + "epoch": 0.5656324582338902, + "grad_norm": 2.2142996788024902, + "learning_rate": 8.590906135233854e-05, + "loss": 1.8147, + "step": 711 + }, + { + "epoch": 0.568019093078759, + "grad_norm": 1.9311624765396118, + "learning_rate": 8.577146442236857e-05, + "loss": 1.8671, + "step": 714 + }, + { + "epoch": 0.5704057279236276, + "grad_norm": 1.874782919883728, + "learning_rate": 8.563331029412012e-05, + "loss": 1.9027, + "step": 717 + }, + { + "epoch": 0.5727923627684964, + "grad_norm": 2.6065828800201416, + "learning_rate": 8.549460111956664e-05, + "loss": 1.8351, + "step": 720 + }, + { + "epoch": 0.5751789976133651, + "grad_norm": 2.043619394302368, + "learning_rate": 8.535533905932738e-05, + "loss": 1.8218, + "step": 723 + }, + { + "epoch": 0.5775656324582339, + "grad_norm": 2.091768264770508, + "learning_rate": 8.521552628263362e-05, + "loss": 1.8289, + "step": 726 + }, + { + "epoch": 0.5799522673031027, + "grad_norm": 2.1494524478912354, + "learning_rate": 8.507516496729495e-05, + "loss": 1.7669, + "step": 729 + }, + { + "epoch": 0.5823389021479713, + "grad_norm": 1.9479185342788696, + "learning_rate": 8.493425729966534e-05, + "loss": 1.8909, + "step": 732 + }, + { + "epoch": 0.5847255369928401, + "grad_norm": 1.799233317375183, + "learning_rate": 8.479280547460907e-05, + "loss": 1.7795, + "step": 735 + }, + { + "epoch": 0.5871121718377088, + "grad_norm": 1.768520474433899, + "learning_rate": 8.465081169546659e-05, + "loss": 1.9436, + "step": 738 + }, + { + "epoch": 0.5894988066825776, + "grad_norm": 1.9335992336273193, + "learning_rate": 8.450827817402011e-05, + "loss": 1.9402, + "step": 741 + }, + { + "epoch": 0.5918854415274463, + "grad_norm": 2.3245134353637695, + "learning_rate": 8.436520713045922e-05, + "loss": 1.7351, + "step": 744 + }, + { + "epoch": 0.594272076372315, + "grad_norm": 1.6517058610916138, + "learning_rate": 8.422160079334628e-05, + "loss": 1.8117, + "step": 747 + }, + { + "epoch": 0.5966587112171837, + "grad_norm": 2.4537134170532227, + "learning_rate": 8.40774613995817e-05, + "loss": 1.8694, + "step": 750 + }, + { + "epoch": 0.5990453460620525, + "grad_norm": 1.9698394536972046, + "learning_rate": 8.393279119436912e-05, + "loss": 1.8867, + "step": 753 + }, + { + "epoch": 0.6014319809069213, + "grad_norm": 1.7775943279266357, + "learning_rate": 8.378759243118044e-05, + "loss": 2.0282, + "step": 756 + }, + { + "epoch": 0.60381861575179, + "grad_norm": 2.286463737487793, + "learning_rate": 8.364186737172068e-05, + "loss": 1.7754, + "step": 759 + }, + { + "epoch": 0.6062052505966588, + "grad_norm": 1.9389877319335938, + "learning_rate": 8.349561828589277e-05, + "loss": 1.9778, + "step": 762 + }, + { + "epoch": 0.6085918854415274, + "grad_norm": 1.778572916984558, + "learning_rate": 8.33488474517622e-05, + "loss": 1.8265, + "step": 765 + }, + { + "epoch": 0.6109785202863962, + "grad_norm": 1.8738365173339844, + "learning_rate": 8.320155715552155e-05, + "loss": 1.7726, + "step": 768 + }, + { + "epoch": 0.6133651551312649, + "grad_norm": 2.0440497398376465, + "learning_rate": 8.305374969145488e-05, + "loss": 1.9478, + "step": 771 + }, + { + "epoch": 0.6157517899761337, + "grad_norm": 1.7448711395263672, + "learning_rate": 8.290542736190188e-05, + "loss": 1.7166, + "step": 774 + }, + { + "epoch": 0.6181384248210023, + "grad_norm": 1.9907166957855225, + "learning_rate": 8.275659247722222e-05, + "loss": 1.7377, + "step": 777 + }, + { + "epoch": 0.6205250596658711, + "grad_norm": 1.9221429824829102, + "learning_rate": 8.260724735575933e-05, + "loss": 1.8257, + "step": 780 + }, + { + "epoch": 0.6229116945107399, + "grad_norm": 2.3536133766174316, + "learning_rate": 8.24573943238045e-05, + "loss": 1.913, + "step": 783 + }, + { + "epoch": 0.6252983293556086, + "grad_norm": 1.7165111303329468, + "learning_rate": 8.230703571556048e-05, + "loss": 1.7241, + "step": 786 + }, + { + "epoch": 0.6276849642004774, + "grad_norm": 1.8131250143051147, + "learning_rate": 8.215617387310524e-05, + "loss": 1.7696, + "step": 789 + }, + { + "epoch": 0.630071599045346, + "grad_norm": 1.9642695188522339, + "learning_rate": 8.200481114635536e-05, + "loss": 1.8094, + "step": 792 + }, + { + "epoch": 0.6324582338902148, + "grad_norm": 1.9827426671981812, + "learning_rate": 8.185294989302958e-05, + "loss": 1.7453, + "step": 795 + }, + { + "epoch": 0.6348448687350835, + "grad_norm": 1.7909404039382935, + "learning_rate": 8.170059247861194e-05, + "loss": 1.7516, + "step": 798 + }, + { + "epoch": 0.6372315035799523, + "grad_norm": 1.903298020362854, + "learning_rate": 8.154774127631501e-05, + "loss": 1.7161, + "step": 801 + }, + { + "epoch": 0.639618138424821, + "grad_norm": 1.9327062368392944, + "learning_rate": 8.139439866704293e-05, + "loss": 1.7925, + "step": 804 + }, + { + "epoch": 0.6420047732696897, + "grad_norm": 1.9829552173614502, + "learning_rate": 8.124056703935423e-05, + "loss": 1.7784, + "step": 807 + }, + { + "epoch": 0.6443914081145584, + "grad_norm": 1.7598705291748047, + "learning_rate": 8.108624878942477e-05, + "loss": 1.786, + "step": 810 + }, + { + "epoch": 0.6467780429594272, + "grad_norm": 1.963447093963623, + "learning_rate": 8.093144632101026e-05, + "loss": 1.7174, + "step": 813 + }, + { + "epoch": 0.649164677804296, + "grad_norm": 1.7478758096694946, + "learning_rate": 8.077616204540897e-05, + "loss": 1.7849, + "step": 816 + }, + { + "epoch": 0.6515513126491647, + "grad_norm": 1.8316524028778076, + "learning_rate": 8.062039838142402e-05, + "loss": 1.7664, + "step": 819 + }, + { + "epoch": 0.6539379474940334, + "grad_norm": 1.9206571578979492, + "learning_rate": 8.046415775532585e-05, + "loss": 1.7882, + "step": 822 + }, + { + "epoch": 0.6563245823389021, + "grad_norm": 2.1145057678222656, + "learning_rate": 8.030744260081426e-05, + "loss": 1.8059, + "step": 825 + }, + { + "epoch": 0.6587112171837709, + "grad_norm": 1.711238145828247, + "learning_rate": 8.015025535898073e-05, + "loss": 1.8649, + "step": 828 + }, + { + "epoch": 0.6610978520286396, + "grad_norm": 1.5813544988632202, + "learning_rate": 7.999259847827015e-05, + "loss": 1.8448, + "step": 831 + }, + { + "epoch": 0.6634844868735084, + "grad_norm": 1.8383519649505615, + "learning_rate": 7.983447441444281e-05, + "loss": 1.6869, + "step": 834 + }, + { + "epoch": 0.665871121718377, + "grad_norm": 1.7440314292907715, + "learning_rate": 7.967588563053616e-05, + "loss": 1.735, + "step": 837 + }, + { + "epoch": 0.6682577565632458, + "grad_norm": 1.9489431381225586, + "learning_rate": 7.951683459682641e-05, + "loss": 1.7624, + "step": 840 + }, + { + "epoch": 0.6706443914081146, + "grad_norm": 1.744482159614563, + "learning_rate": 7.935732379079008e-05, + "loss": 1.9888, + "step": 843 + }, + { + "epoch": 0.6730310262529833, + "grad_norm": 2.0963144302368164, + "learning_rate": 7.919735569706533e-05, + "loss": 1.8716, + "step": 846 + }, + { + "epoch": 0.6754176610978521, + "grad_norm": 1.85282301902771, + "learning_rate": 7.903693280741331e-05, + "loss": 1.8274, + "step": 849 + }, + { + "epoch": 0.6778042959427207, + "grad_norm": 1.7610821723937988, + "learning_rate": 7.887605762067945e-05, + "loss": 1.7237, + "step": 852 + }, + { + "epoch": 0.6801909307875895, + "grad_norm": 1.7411203384399414, + "learning_rate": 7.871473264275429e-05, + "loss": 1.723, + "step": 855 + }, + { + "epoch": 0.6825775656324582, + "grad_norm": 1.8666094541549683, + "learning_rate": 7.855296038653475e-05, + "loss": 2.0054, + "step": 858 + }, + { + "epoch": 0.684964200477327, + "grad_norm": 1.56552255153656, + "learning_rate": 7.83907433718847e-05, + "loss": 1.7596, + "step": 861 + }, + { + "epoch": 0.6873508353221957, + "grad_norm": 1.7352533340454102, + "learning_rate": 7.82280841255959e-05, + "loss": 1.7279, + "step": 864 + }, + { + "epoch": 0.6897374701670644, + "grad_norm": 1.7743481397628784, + "learning_rate": 7.80649851813486e-05, + "loss": 1.8844, + "step": 867 + }, + { + "epoch": 0.6921241050119332, + "grad_norm": 3.154662847518921, + "learning_rate": 7.790144907967201e-05, + "loss": 1.743, + "step": 870 + }, + { + "epoch": 0.6945107398568019, + "grad_norm": 1.835939884185791, + "learning_rate": 7.773747836790481e-05, + "loss": 1.9497, + "step": 873 + }, + { + "epoch": 0.6968973747016707, + "grad_norm": 2.23503041267395, + "learning_rate": 7.757307560015538e-05, + "loss": 1.7462, + "step": 876 + }, + { + "epoch": 0.6992840095465394, + "grad_norm": 1.6978174448013306, + "learning_rate": 7.740824333726213e-05, + "loss": 1.6822, + "step": 879 + }, + { + "epoch": 0.7016706443914081, + "grad_norm": 1.7274194955825806, + "learning_rate": 7.724298414675353e-05, + "loss": 1.7253, + "step": 882 + }, + { + "epoch": 0.7040572792362768, + "grad_norm": 1.969519019126892, + "learning_rate": 7.707730060280812e-05, + "loss": 1.892, + "step": 885 + }, + { + "epoch": 0.7064439140811456, + "grad_norm": 1.7271039485931396, + "learning_rate": 7.691119528621444e-05, + "loss": 1.7954, + "step": 888 + }, + { + "epoch": 0.7088305489260143, + "grad_norm": 1.786372184753418, + "learning_rate": 7.674467078433081e-05, + "loss": 1.9196, + "step": 891 + }, + { + "epoch": 0.711217183770883, + "grad_norm": 1.8364696502685547, + "learning_rate": 7.657772969104508e-05, + "loss": 1.5962, + "step": 894 + }, + { + "epoch": 0.7136038186157518, + "grad_norm": 1.772505760192871, + "learning_rate": 7.641037460673412e-05, + "loss": 1.576, + "step": 897 + }, + { + "epoch": 0.7159904534606205, + "grad_norm": 1.4997633695602417, + "learning_rate": 7.624260813822342e-05, + "loss": 1.6787, + "step": 900 + }, + { + "epoch": 0.7183770883054893, + "grad_norm": 1.8389488458633423, + "learning_rate": 7.607443289874642e-05, + "loss": 1.832, + "step": 903 + }, + { + "epoch": 0.720763723150358, + "grad_norm": 1.6315606832504272, + "learning_rate": 7.590585150790389e-05, + "loss": 1.8814, + "step": 906 + }, + { + "epoch": 0.7231503579952268, + "grad_norm": 1.5416710376739502, + "learning_rate": 7.573686659162293e-05, + "loss": 1.8169, + "step": 909 + }, + { + "epoch": 0.7255369928400954, + "grad_norm": 1.736853003501892, + "learning_rate": 7.556748078211635e-05, + "loss": 1.9037, + "step": 912 + }, + { + "epoch": 0.7279236276849642, + "grad_norm": 1.7387202978134155, + "learning_rate": 7.53976967178414e-05, + "loss": 1.8332, + "step": 915 + }, + { + "epoch": 0.7303102625298329, + "grad_norm": 1.4991620779037476, + "learning_rate": 7.522751704345887e-05, + "loss": 1.8655, + "step": 918 + }, + { + "epoch": 0.7326968973747017, + "grad_norm": 1.6497949361801147, + "learning_rate": 7.505694440979178e-05, + "loss": 1.7951, + "step": 921 + }, + { + "epoch": 0.7350835322195705, + "grad_norm": 1.6602137088775635, + "learning_rate": 7.488598147378416e-05, + "loss": 1.6238, + "step": 924 + }, + { + "epoch": 0.7374701670644391, + "grad_norm": 1.6851720809936523, + "learning_rate": 7.471463089845956e-05, + "loss": 1.7139, + "step": 927 + }, + { + "epoch": 0.7398568019093079, + "grad_norm": 1.911044716835022, + "learning_rate": 7.454289535287968e-05, + "loss": 1.6739, + "step": 930 + }, + { + "epoch": 0.7422434367541766, + "grad_norm": 1.9571938514709473, + "learning_rate": 7.437077751210279e-05, + "loss": 1.896, + "step": 933 + }, + { + "epoch": 0.7446300715990454, + "grad_norm": 5.357728481292725, + "learning_rate": 7.419828005714194e-05, + "loss": 1.7826, + "step": 936 + }, + { + "epoch": 0.747016706443914, + "grad_norm": 1.8948403596878052, + "learning_rate": 7.402540567492337e-05, + "loss": 1.7544, + "step": 939 + }, + { + "epoch": 0.7494033412887828, + "grad_norm": 1.749838948249817, + "learning_rate": 7.385215705824449e-05, + "loss": 1.9523, + "step": 942 + }, + { + "epoch": 0.7517899761336515, + "grad_norm": 2.02341890335083, + "learning_rate": 7.367853690573208e-05, + "loss": 1.5975, + "step": 945 + }, + { + "epoch": 0.7541766109785203, + "grad_norm": 1.999101161956787, + "learning_rate": 7.350454792180016e-05, + "loss": 1.6717, + "step": 948 + }, + { + "epoch": 0.7565632458233891, + "grad_norm": 1.809001088142395, + "learning_rate": 7.333019281660789e-05, + "loss": 1.9019, + "step": 951 + }, + { + "epoch": 0.7589498806682577, + "grad_norm": 1.5597479343414307, + "learning_rate": 7.31554743060174e-05, + "loss": 1.6719, + "step": 954 + }, + { + "epoch": 0.7613365155131265, + "grad_norm": 1.871012806892395, + "learning_rate": 7.298039511155138e-05, + "loss": 1.7596, + "step": 957 + }, + { + "epoch": 0.7637231503579952, + "grad_norm": 1.623950719833374, + "learning_rate": 7.280495796035079e-05, + "loss": 1.7874, + "step": 960 + }, + { + "epoch": 0.766109785202864, + "grad_norm": 1.9036149978637695, + "learning_rate": 7.262916558513237e-05, + "loss": 1.5974, + "step": 963 + }, + { + "epoch": 0.7684964200477327, + "grad_norm": 2.126539468765259, + "learning_rate": 7.245302072414601e-05, + "loss": 1.747, + "step": 966 + }, + { + "epoch": 0.7708830548926014, + "grad_norm": 1.4484821557998657, + "learning_rate": 7.227652612113213e-05, + "loss": 1.5952, + "step": 969 + }, + { + "epoch": 0.7732696897374701, + "grad_norm": 1.6610711812973022, + "learning_rate": 7.209968452527896e-05, + "loss": 1.7045, + "step": 972 + }, + { + "epoch": 0.7756563245823389, + "grad_norm": 1.651370882987976, + "learning_rate": 7.192249869117971e-05, + "loss": 1.7833, + "step": 975 + }, + { + "epoch": 0.7780429594272077, + "grad_norm": 1.8085836172103882, + "learning_rate": 7.174497137878966e-05, + "loss": 1.6937, + "step": 978 + }, + { + "epoch": 0.7804295942720764, + "grad_norm": 1.75294029712677, + "learning_rate": 7.156710535338312e-05, + "loss": 1.8561, + "step": 981 + }, + { + "epoch": 0.7828162291169452, + "grad_norm": 1.712416172027588, + "learning_rate": 7.138890338551048e-05, + "loss": 1.7812, + "step": 984 + }, + { + "epoch": 0.7852028639618138, + "grad_norm": 1.8154973983764648, + "learning_rate": 7.121036825095492e-05, + "loss": 1.6314, + "step": 987 + }, + { + "epoch": 0.7875894988066826, + "grad_norm": 1.8881547451019287, + "learning_rate": 7.103150273068921e-05, + "loss": 1.6936, + "step": 990 + }, + { + "epoch": 0.7899761336515513, + "grad_norm": 1.6982760429382324, + "learning_rate": 7.085230961083249e-05, + "loss": 1.8004, + "step": 993 + }, + { + "epoch": 0.7923627684964201, + "grad_norm": 2.0271944999694824, + "learning_rate": 7.067279168260671e-05, + "loss": 1.7864, + "step": 996 + }, + { + "epoch": 0.7947494033412887, + "grad_norm": 1.6574324369430542, + "learning_rate": 7.04929517422933e-05, + "loss": 1.7387, + "step": 999 + }, + { + "epoch": 0.7971360381861575, + "grad_norm": 1.5255002975463867, + "learning_rate": 7.031279259118946e-05, + "loss": 1.5828, + "step": 1002 + }, + { + "epoch": 0.7995226730310262, + "grad_norm": 1.9319933652877808, + "learning_rate": 7.013231703556471e-05, + "loss": 1.8482, + "step": 1005 + }, + { + "epoch": 0.801909307875895, + "grad_norm": 1.7250359058380127, + "learning_rate": 6.995152788661705e-05, + "loss": 1.7401, + "step": 1008 + }, + { + "epoch": 0.8042959427207638, + "grad_norm": 1.6982582807540894, + "learning_rate": 6.977042796042917e-05, + "loss": 1.7117, + "step": 1011 + }, + { + "epoch": 0.8066825775656324, + "grad_norm": 1.8661295175552368, + "learning_rate": 6.958902007792466e-05, + "loss": 1.7261, + "step": 1014 + }, + { + "epoch": 0.8090692124105012, + "grad_norm": 2.4003262519836426, + "learning_rate": 6.940730706482399e-05, + "loss": 1.7594, + "step": 1017 + }, + { + "epoch": 0.8114558472553699, + "grad_norm": 1.6023939847946167, + "learning_rate": 6.922529175160054e-05, + "loss": 1.6506, + "step": 1020 + }, + { + "epoch": 0.8138424821002387, + "grad_norm": 1.6980756521224976, + "learning_rate": 6.904297697343655e-05, + "loss": 1.8821, + "step": 1023 + }, + { + "epoch": 0.8162291169451074, + "grad_norm": 1.5860844850540161, + "learning_rate": 6.886036557017881e-05, + "loss": 1.8247, + "step": 1026 + }, + { + "epoch": 0.8186157517899761, + "grad_norm": 1.8691662549972534, + "learning_rate": 6.867746038629462e-05, + "loss": 1.9143, + "step": 1029 + }, + { + "epoch": 0.8210023866348448, + "grad_norm": 1.6922295093536377, + "learning_rate": 6.849426427082735e-05, + "loss": 1.736, + "step": 1032 + }, + { + "epoch": 0.8233890214797136, + "grad_norm": 13.19885540008545, + "learning_rate": 6.83107800773521e-05, + "loss": 1.8041, + "step": 1035 + }, + { + "epoch": 0.8257756563245824, + "grad_norm": 1.7809809446334839, + "learning_rate": 6.812701066393124e-05, + "loss": 1.6853, + "step": 1038 + }, + { + "epoch": 0.8281622911694511, + "grad_norm": 1.6743443012237549, + "learning_rate": 6.79429588930699e-05, + "loss": 1.7091, + "step": 1041 + }, + { + "epoch": 0.8305489260143198, + "grad_norm": 1.9790890216827393, + "learning_rate": 6.775862763167142e-05, + "loss": 1.6965, + "step": 1044 + }, + { + "epoch": 0.8329355608591885, + "grad_norm": 1.5133233070373535, + "learning_rate": 6.757401975099262e-05, + "loss": 1.6387, + "step": 1047 + }, + { + "epoch": 0.8353221957040573, + "grad_norm": 3.9208717346191406, + "learning_rate": 6.738913812659912e-05, + "loss": 1.886, + "step": 1050 + }, + { + "epoch": 0.837708830548926, + "grad_norm": 1.884238362312317, + "learning_rate": 6.720398563832055e-05, + "loss": 1.7375, + "step": 1053 + }, + { + "epoch": 0.8400954653937948, + "grad_norm": 1.478226900100708, + "learning_rate": 6.701856517020565e-05, + "loss": 1.8226, + "step": 1056 + }, + { + "epoch": 0.8424821002386634, + "grad_norm": 1.8399808406829834, + "learning_rate": 6.683287961047742e-05, + "loss": 1.9471, + "step": 1059 + }, + { + "epoch": 0.8448687350835322, + "grad_norm": 1.7839453220367432, + "learning_rate": 6.664693185148807e-05, + "loss": 1.7139, + "step": 1062 + }, + { + "epoch": 0.847255369928401, + "grad_norm": 1.7865978479385376, + "learning_rate": 6.646072478967397e-05, + "loss": 1.8702, + "step": 1065 + }, + { + "epoch": 0.8496420047732697, + "grad_norm": 1.572658896446228, + "learning_rate": 6.627426132551058e-05, + "loss": 1.7129, + "step": 1068 + }, + { + "epoch": 0.8520286396181385, + "grad_norm": 1.555977463722229, + "learning_rate": 6.608754436346725e-05, + "loss": 1.6579, + "step": 1071 + }, + { + "epoch": 0.8544152744630071, + "grad_norm": 1.9152740240097046, + "learning_rate": 6.590057681196191e-05, + "loss": 1.6404, + "step": 1074 + }, + { + "epoch": 0.8568019093078759, + "grad_norm": 1.787007451057434, + "learning_rate": 6.571336158331589e-05, + "loss": 1.9347, + "step": 1077 + }, + { + "epoch": 0.8591885441527446, + "grad_norm": 1.757942795753479, + "learning_rate": 6.552590159370844e-05, + "loss": 1.5946, + "step": 1080 + }, + { + "epoch": 0.8615751789976134, + "grad_norm": 1.8404314517974854, + "learning_rate": 6.53381997631314e-05, + "loss": 1.8198, + "step": 1083 + }, + { + "epoch": 0.863961813842482, + "grad_norm": 1.9459943771362305, + "learning_rate": 6.515025901534364e-05, + "loss": 1.725, + "step": 1086 + }, + { + "epoch": 0.8663484486873508, + "grad_norm": 1.6057350635528564, + "learning_rate": 6.496208227782556e-05, + "loss": 1.7184, + "step": 1089 + }, + { + "epoch": 0.8687350835322196, + "grad_norm": 1.80686616897583, + "learning_rate": 6.477367248173352e-05, + "loss": 1.7365, + "step": 1092 + }, + { + "epoch": 0.8711217183770883, + "grad_norm": 1.8622928857803345, + "learning_rate": 6.458503256185404e-05, + "loss": 1.7449, + "step": 1095 + }, + { + "epoch": 0.8735083532219571, + "grad_norm": 1.521681547164917, + "learning_rate": 6.439616545655834e-05, + "loss": 1.6331, + "step": 1098 + }, + { + "epoch": 0.8758949880668258, + "grad_norm": 1.5368080139160156, + "learning_rate": 6.420707410775626e-05, + "loss": 1.7707, + "step": 1101 + }, + { + "epoch": 0.8782816229116945, + "grad_norm": 1.7633475065231323, + "learning_rate": 6.401776146085072e-05, + "loss": 2.0569, + "step": 1104 + }, + { + "epoch": 0.8806682577565632, + "grad_norm": 3.083674430847168, + "learning_rate": 6.382823046469167e-05, + "loss": 1.7942, + "step": 1107 + }, + { + "epoch": 0.883054892601432, + "grad_norm": 1.6440787315368652, + "learning_rate": 6.363848407153016e-05, + "loss": 1.604, + "step": 1110 + }, + { + "epoch": 0.8854415274463007, + "grad_norm": 1.640752911567688, + "learning_rate": 6.344852523697247e-05, + "loss": 1.8185, + "step": 1113 + }, + { + "epoch": 0.8878281622911695, + "grad_norm": 1.959386944770813, + "learning_rate": 6.325835691993394e-05, + "loss": 1.5247, + "step": 1116 + }, + { + "epoch": 0.8902147971360382, + "grad_norm": 1.8059946298599243, + "learning_rate": 6.306798208259297e-05, + "loss": 1.7049, + "step": 1119 + }, + { + "epoch": 0.8926014319809069, + "grad_norm": 1.619410753250122, + "learning_rate": 6.287740369034485e-05, + "loss": 1.5096, + "step": 1122 + }, + { + "epoch": 0.8949880668257757, + "grad_norm": 1.6625523567199707, + "learning_rate": 6.26866247117555e-05, + "loss": 1.5899, + "step": 1125 + }, + { + "epoch": 0.8973747016706444, + "grad_norm": 1.6419795751571655, + "learning_rate": 6.249564811851543e-05, + "loss": 1.7902, + "step": 1128 + }, + { + "epoch": 0.8997613365155132, + "grad_norm": 1.6195034980773926, + "learning_rate": 6.230447688539316e-05, + "loss": 1.592, + "step": 1131 + }, + { + "epoch": 0.9021479713603818, + "grad_norm": 1.6635634899139404, + "learning_rate": 6.211311399018916e-05, + "loss": 1.7621, + "step": 1134 + }, + { + "epoch": 0.9045346062052506, + "grad_norm": 1.5676401853561401, + "learning_rate": 6.192156241368929e-05, + "loss": 1.7352, + "step": 1137 + }, + { + "epoch": 0.9069212410501193, + "grad_norm": 1.879977822303772, + "learning_rate": 6.172982513961845e-05, + "loss": 1.6849, + "step": 1140 + }, + { + "epoch": 0.9093078758949881, + "grad_norm": 2.4494450092315674, + "learning_rate": 6.153790515459404e-05, + "loss": 1.6082, + "step": 1143 + }, + { + "epoch": 0.9116945107398569, + "grad_norm": 1.6648575067520142, + "learning_rate": 6.13458054480795e-05, + "loss": 1.8233, + "step": 1146 + }, + { + "epoch": 0.9140811455847255, + "grad_norm": 1.6422799825668335, + "learning_rate": 6.115352901233779e-05, + "loss": 1.7669, + "step": 1149 + }, + { + "epoch": 0.9164677804295943, + "grad_norm": 1.483912706375122, + "learning_rate": 6.096107884238458e-05, + "loss": 1.6037, + "step": 1152 + }, + { + "epoch": 0.918854415274463, + "grad_norm": 1.664514422416687, + "learning_rate": 6.0768457935941817e-05, + "loss": 1.7514, + "step": 1155 + }, + { + "epoch": 0.9212410501193318, + "grad_norm": 2.0139923095703125, + "learning_rate": 6.0575669293390954e-05, + "loss": 1.7803, + "step": 1158 + }, + { + "epoch": 0.9236276849642004, + "grad_norm": 1.7868553400039673, + "learning_rate": 6.038271591772615e-05, + "loss": 1.7884, + "step": 1161 + }, + { + "epoch": 0.9260143198090692, + "grad_norm": 2.1342079639434814, + "learning_rate": 6.0189600814507604e-05, + "loss": 1.8653, + "step": 1164 + }, + { + "epoch": 0.9284009546539379, + "grad_norm": 1.3909924030303955, + "learning_rate": 5.9996326991814654e-05, + "loss": 1.5607, + "step": 1167 + }, + { + "epoch": 0.9307875894988067, + "grad_norm": 2.2413270473480225, + "learning_rate": 5.980289746019892e-05, + "loss": 1.8347, + "step": 1170 + }, + { + "epoch": 0.9331742243436754, + "grad_norm": 1.802080512046814, + "learning_rate": 5.9609315232637483e-05, + "loss": 1.5535, + "step": 1173 + }, + { + "epoch": 0.9355608591885441, + "grad_norm": 1.6270626783370972, + "learning_rate": 5.941558332448589e-05, + "loss": 1.5801, + "step": 1176 + }, + { + "epoch": 0.9379474940334129, + "grad_norm": 1.567086935043335, + "learning_rate": 5.922170475343125e-05, + "loss": 1.6446, + "step": 1179 + }, + { + "epoch": 0.9403341288782816, + "grad_norm": 1.7809765338897705, + "learning_rate": 5.9027682539445104e-05, + "loss": 1.6499, + "step": 1182 + }, + { + "epoch": 0.9427207637231504, + "grad_norm": 1.5927331447601318, + "learning_rate": 5.883351970473654e-05, + "loss": 1.8008, + "step": 1185 + }, + { + "epoch": 0.9451073985680191, + "grad_norm": 1.7844749689102173, + "learning_rate": 5.863921927370498e-05, + "loss": 1.7045, + "step": 1188 + }, + { + "epoch": 0.9474940334128878, + "grad_norm": 1.6692689657211304, + "learning_rate": 5.8444784272893175e-05, + "loss": 1.6428, + "step": 1191 + }, + { + "epoch": 0.9498806682577565, + "grad_norm": 1.7680476903915405, + "learning_rate": 5.8250217730939973e-05, + "loss": 1.6511, + "step": 1194 + }, + { + "epoch": 0.9522673031026253, + "grad_norm": 1.5252869129180908, + "learning_rate": 5.8055522678533225e-05, + "loss": 1.6309, + "step": 1197 + }, + { + "epoch": 0.954653937947494, + "grad_norm": 1.713062047958374, + "learning_rate": 5.786070214836254e-05, + "loss": 1.5895, + "step": 1200 + }, + { + "epoch": 0.9570405727923628, + "grad_norm": 1.681307315826416, + "learning_rate": 5.7665759175072034e-05, + "loss": 1.8348, + "step": 1203 + }, + { + "epoch": 0.9594272076372315, + "grad_norm": 11.129051208496094, + "learning_rate": 5.747069679521305e-05, + "loss": 1.7288, + "step": 1206 + }, + { + "epoch": 0.9618138424821002, + "grad_norm": 2.107663869857788, + "learning_rate": 5.727551804719693e-05, + "loss": 1.6863, + "step": 1209 + }, + { + "epoch": 0.964200477326969, + "grad_norm": 1.520448088645935, + "learning_rate": 5.708022597124758e-05, + "loss": 1.6418, + "step": 1212 + }, + { + "epoch": 0.9665871121718377, + "grad_norm": 1.7622371912002563, + "learning_rate": 5.688482360935423e-05, + "loss": 1.8378, + "step": 1215 + }, + { + "epoch": 0.9689737470167065, + "grad_norm": 1.4517844915390015, + "learning_rate": 5.668931400522396e-05, + "loss": 1.7499, + "step": 1218 + }, + { + "epoch": 0.9713603818615751, + "grad_norm": 1.7153959274291992, + "learning_rate": 5.649370020423431e-05, + "loss": 1.6391, + "step": 1221 + }, + { + "epoch": 0.9737470167064439, + "grad_norm": 1.6043071746826172, + "learning_rate": 5.629798525338589e-05, + "loss": 1.6558, + "step": 1224 + }, + { + "epoch": 0.9761336515513126, + "grad_norm": 1.5428578853607178, + "learning_rate": 5.6102172201254835e-05, + "loss": 1.7245, + "step": 1227 + }, + { + "epoch": 0.9785202863961814, + "grad_norm": 3.9565799236297607, + "learning_rate": 5.5906264097945407e-05, + "loss": 1.7375, + "step": 1230 + }, + { + "epoch": 0.9809069212410502, + "grad_norm": 1.5014811754226685, + "learning_rate": 5.5710263995042434e-05, + "loss": 1.8023, + "step": 1233 + }, + { + "epoch": 0.9832935560859188, + "grad_norm": 1.6702003479003906, + "learning_rate": 5.551417494556376e-05, + "loss": 1.7092, + "step": 1236 + }, + { + "epoch": 0.9856801909307876, + "grad_norm": 1.5324056148529053, + "learning_rate": 5.531800000391275e-05, + "loss": 1.7092, + "step": 1239 + }, + { + "epoch": 0.9880668257756563, + "grad_norm": 1.6837244033813477, + "learning_rate": 5.5121742225830665e-05, + "loss": 1.8185, + "step": 1242 + }, + { + "epoch": 0.9904534606205251, + "grad_norm": 2.0750982761383057, + "learning_rate": 5.4925404668349076e-05, + "loss": 1.7451, + "step": 1245 + }, + { + "epoch": 0.9928400954653938, + "grad_norm": 1.609738826751709, + "learning_rate": 5.472899038974225e-05, + "loss": 1.6469, + "step": 1248 + }, + { + "epoch": 0.9952267303102625, + "grad_norm": 3.454913377761841, + "learning_rate": 5.45325024494795e-05, + "loss": 1.7291, + "step": 1251 + }, + { + "epoch": 0.9976133651551312, + "grad_norm": 1.7975199222564697, + "learning_rate": 5.433594390817756e-05, + "loss": 1.8493, + "step": 1254 + }, + { + "epoch": 1.0, + "grad_norm": 2.325455904006958, + "learning_rate": 5.413931782755283e-05, + "loss": 1.8083, + "step": 1257 + }, + { + "epoch": 1.0023866348448687, + "grad_norm": 1.4781057834625244, + "learning_rate": 5.3942627270373826e-05, + "loss": 1.4756, + "step": 1260 + }, + { + "epoch": 1.0047732696897376, + "grad_norm": 1.787947177886963, + "learning_rate": 5.374587530041335e-05, + "loss": 1.5096, + "step": 1263 + }, + { + "epoch": 1.0071599045346062, + "grad_norm": 1.7651340961456299, + "learning_rate": 5.35490649824008e-05, + "loss": 1.4784, + "step": 1266 + }, + { + "epoch": 1.009546539379475, + "grad_norm": 1.4127767086029053, + "learning_rate": 5.335219938197445e-05, + "loss": 1.4218, + "step": 1269 + }, + { + "epoch": 1.0119331742243436, + "grad_norm": 1.8526569604873657, + "learning_rate": 5.315528156563367e-05, + "loss": 1.4832, + "step": 1272 + }, + { + "epoch": 1.0143198090692125, + "grad_norm": 1.7442090511322021, + "learning_rate": 5.295831460069124e-05, + "loss": 1.3938, + "step": 1275 + }, + { + "epoch": 1.0167064439140812, + "grad_norm": 1.656014323234558, + "learning_rate": 5.276130155522541e-05, + "loss": 1.5448, + "step": 1278 + }, + { + "epoch": 1.0190930787589498, + "grad_norm": 1.4529969692230225, + "learning_rate": 5.256424549803228e-05, + "loss": 1.5037, + "step": 1281 + }, + { + "epoch": 1.0214797136038185, + "grad_norm": 1.4293503761291504, + "learning_rate": 5.236714949857791e-05, + "loss": 1.3965, + "step": 1284 + }, + { + "epoch": 1.0238663484486874, + "grad_norm": 1.8054838180541992, + "learning_rate": 5.2170016626950505e-05, + "loss": 1.4733, + "step": 1287 + }, + { + "epoch": 1.026252983293556, + "grad_norm": 1.5657786130905151, + "learning_rate": 5.1972849953812644e-05, + "loss": 1.4205, + "step": 1290 + }, + { + "epoch": 1.0286396181384247, + "grad_norm": 1.5996116399765015, + "learning_rate": 5.1775652550353405e-05, + "loss": 1.5207, + "step": 1293 + }, + { + "epoch": 1.0310262529832936, + "grad_norm": 1.7841379642486572, + "learning_rate": 5.157842748824053e-05, + "loss": 1.4094, + "step": 1296 + }, + { + "epoch": 1.0334128878281623, + "grad_norm": 1.8023128509521484, + "learning_rate": 5.138117783957261e-05, + "loss": 1.5327, + "step": 1299 + }, + { + "epoch": 1.035799522673031, + "grad_norm": 1.6026707887649536, + "learning_rate": 5.1183906676831197e-05, + "loss": 1.5903, + "step": 1302 + }, + { + "epoch": 1.0381861575178997, + "grad_norm": 1.5639821290969849, + "learning_rate": 5.098661707283298e-05, + "loss": 1.4613, + "step": 1305 + }, + { + "epoch": 1.0405727923627686, + "grad_norm": 1.6041333675384521, + "learning_rate": 5.078931210068185e-05, + "loss": 1.4202, + "step": 1308 + }, + { + "epoch": 1.0429594272076372, + "grad_norm": 1.774864673614502, + "learning_rate": 5.059199483372114e-05, + "loss": 1.4175, + "step": 1311 + }, + { + "epoch": 1.045346062052506, + "grad_norm": 1.5444409847259521, + "learning_rate": 5.039466834548568e-05, + "loss": 1.5621, + "step": 1314 + }, + { + "epoch": 1.0477326968973748, + "grad_norm": 1.456629753112793, + "learning_rate": 5.0197335709653883e-05, + "loss": 1.2813, + "step": 1317 + }, + { + "epoch": 1.0501193317422435, + "grad_norm": 1.5410020351409912, + "learning_rate": 5e-05, + "loss": 1.5317, + "step": 1320 + }, + { + "epoch": 1.0525059665871122, + "grad_norm": 1.7866257429122925, + "learning_rate": 4.980266429034613e-05, + "loss": 1.4627, + "step": 1323 + }, + { + "epoch": 1.0548926014319808, + "grad_norm": 1.5053359270095825, + "learning_rate": 4.960533165451435e-05, + "loss": 1.3497, + "step": 1326 + }, + { + "epoch": 1.0572792362768497, + "grad_norm": 1.4978351593017578, + "learning_rate": 4.9408005166278855e-05, + "loss": 1.4533, + "step": 1329 + }, + { + "epoch": 1.0596658711217184, + "grad_norm": 1.6775259971618652, + "learning_rate": 4.921068789931816e-05, + "loss": 1.4523, + "step": 1332 + }, + { + "epoch": 1.062052505966587, + "grad_norm": 1.4441739320755005, + "learning_rate": 4.901338292716704e-05, + "loss": 1.302, + "step": 1335 + }, + { + "epoch": 1.064439140811456, + "grad_norm": 1.6126224994659424, + "learning_rate": 4.8816093323168815e-05, + "loss": 1.4091, + "step": 1338 + }, + { + "epoch": 1.0668257756563246, + "grad_norm": 1.5988109111785889, + "learning_rate": 4.8618822160427406e-05, + "loss": 1.4568, + "step": 1341 + }, + { + "epoch": 1.0692124105011933, + "grad_norm": 1.4735040664672852, + "learning_rate": 4.842157251175947e-05, + "loss": 1.4034, + "step": 1344 + }, + { + "epoch": 1.071599045346062, + "grad_norm": 1.6535234451293945, + "learning_rate": 4.822434744964661e-05, + "loss": 1.4543, + "step": 1347 + }, + { + "epoch": 1.0739856801909309, + "grad_norm": 1.4830214977264404, + "learning_rate": 4.802715004618737e-05, + "loss": 1.4614, + "step": 1350 + }, + { + "epoch": 1.0763723150357996, + "grad_norm": 1.60805344581604, + "learning_rate": 4.7829983373049507e-05, + "loss": 1.4318, + "step": 1353 + }, + { + "epoch": 1.0787589498806682, + "grad_norm": 1.2208465337753296, + "learning_rate": 4.763285050142211e-05, + "loss": 1.3005, + "step": 1356 + }, + { + "epoch": 1.081145584725537, + "grad_norm": 1.3961493968963623, + "learning_rate": 4.743575450196773e-05, + "loss": 1.4343, + "step": 1359 + }, + { + "epoch": 1.0835322195704058, + "grad_norm": 1.7156232595443726, + "learning_rate": 4.7238698444774595e-05, + "loss": 1.3852, + "step": 1362 + }, + { + "epoch": 1.0859188544152745, + "grad_norm": 1.3676952123641968, + "learning_rate": 4.704168539930878e-05, + "loss": 1.3402, + "step": 1365 + }, + { + "epoch": 1.0883054892601431, + "grad_norm": 1.734222173690796, + "learning_rate": 4.6844718434366334e-05, + "loss": 1.5666, + "step": 1368 + }, + { + "epoch": 1.0906921241050118, + "grad_norm": 1.791410207748413, + "learning_rate": 4.664780061802557e-05, + "loss": 1.4359, + "step": 1371 + }, + { + "epoch": 1.0930787589498807, + "grad_norm": 1.553160309791565, + "learning_rate": 4.64509350175992e-05, + "loss": 1.4339, + "step": 1374 + }, + { + "epoch": 1.0954653937947494, + "grad_norm": 1.6250778436660767, + "learning_rate": 4.6254124699586656e-05, + "loss": 1.4916, + "step": 1377 + }, + { + "epoch": 1.097852028639618, + "grad_norm": 1.6796406507492065, + "learning_rate": 4.605737272962618e-05, + "loss": 1.4177, + "step": 1380 + }, + { + "epoch": 1.100238663484487, + "grad_norm": 1.5058690309524536, + "learning_rate": 4.5860682172447184e-05, + "loss": 1.455, + "step": 1383 + }, + { + "epoch": 1.1026252983293556, + "grad_norm": 3.526346206665039, + "learning_rate": 4.566405609182247e-05, + "loss": 1.5, + "step": 1386 + }, + { + "epoch": 1.1050119331742243, + "grad_norm": 1.591094970703125, + "learning_rate": 4.546749755052051e-05, + "loss": 1.5146, + "step": 1389 + }, + { + "epoch": 1.107398568019093, + "grad_norm": 1.7309263944625854, + "learning_rate": 4.527100961025776e-05, + "loss": 1.4662, + "step": 1392 + }, + { + "epoch": 1.1097852028639619, + "grad_norm": 1.7857303619384766, + "learning_rate": 4.507459533165093e-05, + "loss": 1.4219, + "step": 1395 + }, + { + "epoch": 1.1121718377088305, + "grad_norm": 1.5359363555908203, + "learning_rate": 4.4878257774169346e-05, + "loss": 1.3648, + "step": 1398 + }, + { + "epoch": 1.1145584725536992, + "grad_norm": 1.6269012689590454, + "learning_rate": 4.4681999996087274e-05, + "loss": 1.4435, + "step": 1401 + }, + { + "epoch": 1.1169451073985681, + "grad_norm": 1.8540282249450684, + "learning_rate": 4.448582505443625e-05, + "loss": 1.515, + "step": 1404 + }, + { + "epoch": 1.1193317422434368, + "grad_norm": 1.5260019302368164, + "learning_rate": 4.4289736004957585e-05, + "loss": 1.4334, + "step": 1407 + }, + { + "epoch": 1.1217183770883055, + "grad_norm": 1.5051707029342651, + "learning_rate": 4.4093735902054605e-05, + "loss": 1.4261, + "step": 1410 + }, + { + "epoch": 1.1241050119331741, + "grad_norm": 1.4636119604110718, + "learning_rate": 4.3897827798745183e-05, + "loss": 1.4085, + "step": 1413 + }, + { + "epoch": 1.126491646778043, + "grad_norm": 1.660888910293579, + "learning_rate": 4.3702014746614136e-05, + "loss": 1.4554, + "step": 1416 + }, + { + "epoch": 1.1288782816229117, + "grad_norm": 1.8002493381500244, + "learning_rate": 4.350629979576569e-05, + "loss": 1.4019, + "step": 1419 + }, + { + "epoch": 1.1312649164677804, + "grad_norm": 1.442589282989502, + "learning_rate": 4.331068599477605e-05, + "loss": 1.3116, + "step": 1422 + }, + { + "epoch": 1.1336515513126493, + "grad_norm": 1.5035159587860107, + "learning_rate": 4.311517639064578e-05, + "loss": 1.2794, + "step": 1425 + }, + { + "epoch": 1.136038186157518, + "grad_norm": 1.8796526193618774, + "learning_rate": 4.2919774028752436e-05, + "loss": 1.3611, + "step": 1428 + }, + { + "epoch": 1.1384248210023866, + "grad_norm": 1.4292360544204712, + "learning_rate": 4.27244819528031e-05, + "loss": 1.4402, + "step": 1431 + }, + { + "epoch": 1.1408114558472553, + "grad_norm": 1.6761499643325806, + "learning_rate": 4.2529303204786953e-05, + "loss": 1.4959, + "step": 1434 + }, + { + "epoch": 1.1431980906921242, + "grad_norm": 1.7346595525741577, + "learning_rate": 4.233424082492797e-05, + "loss": 1.4828, + "step": 1437 + }, + { + "epoch": 1.1455847255369929, + "grad_norm": 1.394885778427124, + "learning_rate": 4.213929785163747e-05, + "loss": 1.4321, + "step": 1440 + }, + { + "epoch": 1.1479713603818615, + "grad_norm": 1.510941743850708, + "learning_rate": 4.1944477321466786e-05, + "loss": 1.4397, + "step": 1443 + }, + { + "epoch": 1.1503579952267304, + "grad_norm": 1.5493265390396118, + "learning_rate": 4.1749782269060045e-05, + "loss": 1.457, + "step": 1446 + }, + { + "epoch": 1.152744630071599, + "grad_norm": 1.6018649339675903, + "learning_rate": 4.1555215727106844e-05, + "loss": 1.4258, + "step": 1449 + }, + { + "epoch": 1.1551312649164678, + "grad_norm": 1.5491352081298828, + "learning_rate": 4.136078072629503e-05, + "loss": 1.422, + "step": 1452 + }, + { + "epoch": 1.1575178997613365, + "grad_norm": 1.7042955160140991, + "learning_rate": 4.116648029526347e-05, + "loss": 1.406, + "step": 1455 + }, + { + "epoch": 1.1599045346062051, + "grad_norm": 1.5230995416641235, + "learning_rate": 4.097231746055491e-05, + "loss": 1.5026, + "step": 1458 + }, + { + "epoch": 1.162291169451074, + "grad_norm": 1.8555903434753418, + "learning_rate": 4.077829524656877e-05, + "loss": 1.3636, + "step": 1461 + }, + { + "epoch": 1.1646778042959427, + "grad_norm": 1.4330073595046997, + "learning_rate": 4.05844166755141e-05, + "loss": 1.4293, + "step": 1464 + }, + { + "epoch": 1.1670644391408114, + "grad_norm": 1.5035028457641602, + "learning_rate": 4.039068476736253e-05, + "loss": 1.406, + "step": 1467 + }, + { + "epoch": 1.1694510739856803, + "grad_norm": 1.7410966157913208, + "learning_rate": 4.01971025398011e-05, + "loss": 1.3049, + "step": 1470 + }, + { + "epoch": 1.171837708830549, + "grad_norm": 1.6594077348709106, + "learning_rate": 4.000367300818537e-05, + "loss": 1.6195, + "step": 1473 + }, + { + "epoch": 1.1742243436754176, + "grad_norm": 1.6386613845825195, + "learning_rate": 3.98103991854924e-05, + "loss": 1.4577, + "step": 1476 + }, + { + "epoch": 1.1766109785202863, + "grad_norm": 1.6970839500427246, + "learning_rate": 3.961728408227384e-05, + "loss": 1.5535, + "step": 1479 + }, + { + "epoch": 1.1789976133651552, + "grad_norm": 1.6995643377304077, + "learning_rate": 3.942433070660905e-05, + "loss": 1.4154, + "step": 1482 + }, + { + "epoch": 1.1813842482100239, + "grad_norm": 1.4957417249679565, + "learning_rate": 3.923154206405819e-05, + "loss": 1.4574, + "step": 1485 + }, + { + "epoch": 1.1837708830548925, + "grad_norm": 1.7634819746017456, + "learning_rate": 3.9038921157615444e-05, + "loss": 1.3863, + "step": 1488 + }, + { + "epoch": 1.1861575178997614, + "grad_norm": 1.64838707447052, + "learning_rate": 3.884647098766224e-05, + "loss": 1.4176, + "step": 1491 + }, + { + "epoch": 1.18854415274463, + "grad_norm": 1.379270315170288, + "learning_rate": 3.8654194551920485e-05, + "loss": 1.4031, + "step": 1494 + }, + { + "epoch": 1.1909307875894988, + "grad_norm": 1.4653505086898804, + "learning_rate": 3.846209484540597e-05, + "loss": 1.407, + "step": 1497 + }, + { + "epoch": 1.1933174224343674, + "grad_norm": 1.425595760345459, + "learning_rate": 3.827017486038157e-05, + "loss": 1.3342, + "step": 1500 + }, + { + "epoch": 1.1957040572792363, + "grad_norm": 1.3729442358016968, + "learning_rate": 3.8078437586310716e-05, + "loss": 1.5717, + "step": 1503 + }, + { + "epoch": 1.198090692124105, + "grad_norm": 1.317692756652832, + "learning_rate": 3.788688600981085e-05, + "loss": 1.4324, + "step": 1506 + }, + { + "epoch": 1.2004773269689737, + "grad_norm": 1.3449257612228394, + "learning_rate": 3.769552311460684e-05, + "loss": 1.454, + "step": 1509 + }, + { + "epoch": 1.2028639618138426, + "grad_norm": 1.6187500953674316, + "learning_rate": 3.750435188148459e-05, + "loss": 1.5128, + "step": 1512 + }, + { + "epoch": 1.2052505966587113, + "grad_norm": 2.184224843978882, + "learning_rate": 3.73133752882445e-05, + "loss": 1.2855, + "step": 1515 + }, + { + "epoch": 1.20763723150358, + "grad_norm": 1.5003141164779663, + "learning_rate": 3.712259630965518e-05, + "loss": 1.4485, + "step": 1518 + }, + { + "epoch": 1.2100238663484486, + "grad_norm": 1.4491519927978516, + "learning_rate": 3.6932017917407045e-05, + "loss": 1.4343, + "step": 1521 + }, + { + "epoch": 1.2124105011933175, + "grad_norm": 1.332411289215088, + "learning_rate": 3.6741643080066065e-05, + "loss": 1.3984, + "step": 1524 + }, + { + "epoch": 1.2147971360381862, + "grad_norm": 1.6156560182571411, + "learning_rate": 3.655147476302754e-05, + "loss": 1.4771, + "step": 1527 + }, + { + "epoch": 1.2171837708830548, + "grad_norm": 1.6296911239624023, + "learning_rate": 3.636151592846985e-05, + "loss": 1.3419, + "step": 1530 + }, + { + "epoch": 1.2195704057279237, + "grad_norm": 1.556209921836853, + "learning_rate": 3.617176953530835e-05, + "loss": 1.3619, + "step": 1533 + }, + { + "epoch": 1.2219570405727924, + "grad_norm": 1.383895754814148, + "learning_rate": 3.5982238539149285e-05, + "loss": 1.2554, + "step": 1536 + }, + { + "epoch": 1.224343675417661, + "grad_norm": 2.2628183364868164, + "learning_rate": 3.579292589224375e-05, + "loss": 1.4491, + "step": 1539 + }, + { + "epoch": 1.2267303102625298, + "grad_norm": 1.618979573249817, + "learning_rate": 3.560383454344168e-05, + "loss": 1.5344, + "step": 1542 + }, + { + "epoch": 1.2291169451073987, + "grad_norm": 1.386203646659851, + "learning_rate": 3.541496743814596e-05, + "loss": 1.4364, + "step": 1545 + }, + { + "epoch": 1.2315035799522673, + "grad_norm": 1.5511690378189087, + "learning_rate": 3.522632751826651e-05, + "loss": 1.3568, + "step": 1548 + }, + { + "epoch": 1.233890214797136, + "grad_norm": 1.6482512950897217, + "learning_rate": 3.503791772217445e-05, + "loss": 1.4858, + "step": 1551 + }, + { + "epoch": 1.2362768496420047, + "grad_norm": 1.5876716375350952, + "learning_rate": 3.484974098465636e-05, + "loss": 1.3507, + "step": 1554 + }, + { + "epoch": 1.2386634844868736, + "grad_norm": 1.7436355352401733, + "learning_rate": 3.4661800236868604e-05, + "loss": 1.4313, + "step": 1557 + }, + { + "epoch": 1.2410501193317423, + "grad_norm": 1.5434948205947876, + "learning_rate": 3.447409840629156e-05, + "loss": 1.3697, + "step": 1560 + }, + { + "epoch": 1.243436754176611, + "grad_norm": 1.7537883520126343, + "learning_rate": 3.428663841668412e-05, + "loss": 1.5541, + "step": 1563 + }, + { + "epoch": 1.2458233890214796, + "grad_norm": 1.4465534687042236, + "learning_rate": 3.409942318803809e-05, + "loss": 1.4101, + "step": 1566 + }, + { + "epoch": 1.2482100238663485, + "grad_norm": 1.4425692558288574, + "learning_rate": 3.391245563653276e-05, + "loss": 1.5231, + "step": 1569 + }, + { + "epoch": 1.2505966587112172, + "grad_norm": 1.571582317352295, + "learning_rate": 3.3725738674489414e-05, + "loss": 1.4215, + "step": 1572 + }, + { + "epoch": 1.2529832935560858, + "grad_norm": 1.4353744983673096, + "learning_rate": 3.3539275210326044e-05, + "loss": 1.4282, + "step": 1575 + }, + { + "epoch": 1.2553699284009547, + "grad_norm": 1.4851089715957642, + "learning_rate": 3.335306814851196e-05, + "loss": 1.4177, + "step": 1578 + }, + { + "epoch": 1.2577565632458234, + "grad_norm": 1.3933429718017578, + "learning_rate": 3.31671203895226e-05, + "loss": 1.3531, + "step": 1581 + }, + { + "epoch": 1.260143198090692, + "grad_norm": 3.346571207046509, + "learning_rate": 3.298143482979436e-05, + "loss": 1.3556, + "step": 1584 + }, + { + "epoch": 1.2625298329355608, + "grad_norm": 1.9931849241256714, + "learning_rate": 3.2796014361679464e-05, + "loss": 1.4935, + "step": 1587 + }, + { + "epoch": 1.2649164677804297, + "grad_norm": 1.5960787534713745, + "learning_rate": 3.261086187340088e-05, + "loss": 1.3861, + "step": 1590 + }, + { + "epoch": 1.2673031026252983, + "grad_norm": 1.4517109394073486, + "learning_rate": 3.242598024900738e-05, + "loss": 1.4512, + "step": 1593 + }, + { + "epoch": 1.269689737470167, + "grad_norm": 1.7831088304519653, + "learning_rate": 3.224137236832859e-05, + "loss": 1.336, + "step": 1596 + }, + { + "epoch": 1.272076372315036, + "grad_norm": 1.3997858762741089, + "learning_rate": 3.2057041106930104e-05, + "loss": 1.3483, + "step": 1599 + }, + { + "epoch": 1.2744630071599046, + "grad_norm": 1.4272023439407349, + "learning_rate": 3.187298933606878e-05, + "loss": 1.4455, + "step": 1602 + }, + { + "epoch": 1.2768496420047732, + "grad_norm": 1.44106924533844, + "learning_rate": 3.1689219922647924e-05, + "loss": 1.3062, + "step": 1605 + }, + { + "epoch": 1.279236276849642, + "grad_norm": 1.472388744354248, + "learning_rate": 3.150573572917267e-05, + "loss": 1.5172, + "step": 1608 + }, + { + "epoch": 1.2816229116945108, + "grad_norm": 1.4416998624801636, + "learning_rate": 3.13225396137054e-05, + "loss": 1.3624, + "step": 1611 + }, + { + "epoch": 1.2840095465393795, + "grad_norm": 1.5669586658477783, + "learning_rate": 3.11396344298212e-05, + "loss": 1.4928, + "step": 1614 + }, + { + "epoch": 1.2863961813842482, + "grad_norm": 1.4020856618881226, + "learning_rate": 3.095702302656347e-05, + "loss": 1.4451, + "step": 1617 + }, + { + "epoch": 1.288782816229117, + "grad_norm": 1.58262038230896, + "learning_rate": 3.077470824839947e-05, + "loss": 1.4129, + "step": 1620 + }, + { + "epoch": 1.2911694510739857, + "grad_norm": 1.3566120862960815, + "learning_rate": 3.059269293517603e-05, + "loss": 1.4103, + "step": 1623 + }, + { + "epoch": 1.2935560859188544, + "grad_norm": 1.4482066631317139, + "learning_rate": 3.0410979922075343e-05, + "loss": 1.3193, + "step": 1626 + }, + { + "epoch": 1.295942720763723, + "grad_norm": 1.5326952934265137, + "learning_rate": 3.022957203957083e-05, + "loss": 1.508, + "step": 1629 + }, + { + "epoch": 1.2983293556085918, + "grad_norm": 1.4580435752868652, + "learning_rate": 3.004847211338295e-05, + "loss": 1.3046, + "step": 1632 + }, + { + "epoch": 1.3007159904534606, + "grad_norm": 1.4401758909225464, + "learning_rate": 2.9867682964435294e-05, + "loss": 1.403, + "step": 1635 + }, + { + "epoch": 1.3031026252983293, + "grad_norm": 1.3278849124908447, + "learning_rate": 2.9687207408810557e-05, + "loss": 1.4574, + "step": 1638 + }, + { + "epoch": 1.3054892601431982, + "grad_norm": 1.5741738080978394, + "learning_rate": 2.9507048257706727e-05, + "loss": 1.5322, + "step": 1641 + }, + { + "epoch": 1.307875894988067, + "grad_norm": 1.517532229423523, + "learning_rate": 2.9327208317393303e-05, + "loss": 1.5201, + "step": 1644 + }, + { + "epoch": 1.3102625298329356, + "grad_norm": 1.5097103118896484, + "learning_rate": 2.9147690389167514e-05, + "loss": 1.3938, + "step": 1647 + }, + { + "epoch": 1.3126491646778042, + "grad_norm": 1.369953989982605, + "learning_rate": 2.8968497269310803e-05, + "loss": 1.2942, + "step": 1650 + }, + { + "epoch": 1.315035799522673, + "grad_norm": 1.4780120849609375, + "learning_rate": 2.8789631749045097e-05, + "loss": 1.3554, + "step": 1653 + }, + { + "epoch": 1.3174224343675418, + "grad_norm": 2.5901074409484863, + "learning_rate": 2.8611096614489518e-05, + "loss": 1.3647, + "step": 1656 + }, + { + "epoch": 1.3198090692124105, + "grad_norm": 1.4900747537612915, + "learning_rate": 2.8432894646616885e-05, + "loss": 1.3546, + "step": 1659 + }, + { + "epoch": 1.3221957040572792, + "grad_norm": 1.3987863063812256, + "learning_rate": 2.8255028621210355e-05, + "loss": 1.4177, + "step": 1662 + }, + { + "epoch": 1.324582338902148, + "grad_norm": 11.975431442260742, + "learning_rate": 2.8077501308820308e-05, + "loss": 1.4814, + "step": 1665 + }, + { + "epoch": 1.3269689737470167, + "grad_norm": 1.503467082977295, + "learning_rate": 2.790031547472105e-05, + "loss": 1.4222, + "step": 1668 + }, + { + "epoch": 1.3293556085918854, + "grad_norm": 1.5251504182815552, + "learning_rate": 2.7723473878867877e-05, + "loss": 1.4811, + "step": 1671 + }, + { + "epoch": 1.331742243436754, + "grad_norm": 1.567043423652649, + "learning_rate": 2.754697927585399e-05, + "loss": 1.3752, + "step": 1674 + }, + { + "epoch": 1.334128878281623, + "grad_norm": 1.3771165609359741, + "learning_rate": 2.737083441486763e-05, + "loss": 1.3422, + "step": 1677 + }, + { + "epoch": 1.3365155131264916, + "grad_norm": 1.4927712678909302, + "learning_rate": 2.71950420396492e-05, + "loss": 1.3736, + "step": 1680 + }, + { + "epoch": 1.3389021479713603, + "grad_norm": 1.3801411390304565, + "learning_rate": 2.7019604888448642e-05, + "loss": 1.3837, + "step": 1683 + }, + { + "epoch": 1.3412887828162292, + "grad_norm": 1.4260993003845215, + "learning_rate": 2.6844525693982613e-05, + "loss": 1.3627, + "step": 1686 + }, + { + "epoch": 1.3436754176610979, + "grad_norm": 1.3967187404632568, + "learning_rate": 2.666980718339211e-05, + "loss": 1.4247, + "step": 1689 + }, + { + "epoch": 1.3460620525059666, + "grad_norm": 1.6355633735656738, + "learning_rate": 2.6495452078199863e-05, + "loss": 1.3911, + "step": 1692 + }, + { + "epoch": 1.3484486873508352, + "grad_norm": 1.4555933475494385, + "learning_rate": 2.6321463094267934e-05, + "loss": 1.3659, + "step": 1695 + }, + { + "epoch": 1.3508353221957041, + "grad_norm": 1.8073235750198364, + "learning_rate": 2.614784294175554e-05, + "loss": 1.4982, + "step": 1698 + }, + { + "epoch": 1.3532219570405728, + "grad_norm": 1.790961503982544, + "learning_rate": 2.597459432507664e-05, + "loss": 1.427, + "step": 1701 + }, + { + "epoch": 1.3556085918854415, + "grad_norm": 1.4583306312561035, + "learning_rate": 2.5801719942858065e-05, + "loss": 1.4358, + "step": 1704 + }, + { + "epoch": 1.3579952267303104, + "grad_norm": 1.3691617250442505, + "learning_rate": 2.562922248789722e-05, + "loss": 1.4026, + "step": 1707 + }, + { + "epoch": 1.360381861575179, + "grad_norm": 1.4829280376434326, + "learning_rate": 2.5457104647120322e-05, + "loss": 1.5299, + "step": 1710 + }, + { + "epoch": 1.3627684964200477, + "grad_norm": 1.4290732145309448, + "learning_rate": 2.5285369101540445e-05, + "loss": 1.4366, + "step": 1713 + }, + { + "epoch": 1.3651551312649164, + "grad_norm": 1.5401760339736938, + "learning_rate": 2.5114018526215844e-05, + "loss": 1.4262, + "step": 1716 + }, + { + "epoch": 1.3675417661097853, + "grad_norm": 1.3975614309310913, + "learning_rate": 2.494305559020822e-05, + "loss": 1.4702, + "step": 1719 + }, + { + "epoch": 1.369928400954654, + "grad_norm": 1.3241435289382935, + "learning_rate": 2.4772482956541132e-05, + "loss": 1.3565, + "step": 1722 + }, + { + "epoch": 1.3723150357995226, + "grad_norm": 1.4867254495620728, + "learning_rate": 2.4602303282158616e-05, + "loss": 1.3253, + "step": 1725 + }, + { + "epoch": 1.3747016706443915, + "grad_norm": 1.5833885669708252, + "learning_rate": 2.4432519217883676e-05, + "loss": 1.421, + "step": 1728 + }, + { + "epoch": 1.3770883054892602, + "grad_norm": 1.5261849164962769, + "learning_rate": 2.4263133408377076e-05, + "loss": 1.4693, + "step": 1731 + }, + { + "epoch": 1.3794749403341289, + "grad_norm": 1.4552416801452637, + "learning_rate": 2.4094148492096125e-05, + "loss": 1.5436, + "step": 1734 + }, + { + "epoch": 1.3818615751789975, + "grad_norm": 1.5196255445480347, + "learning_rate": 2.3925567101253576e-05, + "loss": 1.5093, + "step": 1737 + }, + { + "epoch": 1.3842482100238662, + "grad_norm": 1.3654688596725464, + "learning_rate": 2.3757391861776585e-05, + "loss": 1.3681, + "step": 1740 + }, + { + "epoch": 1.3866348448687351, + "grad_norm": 1.4244242906570435, + "learning_rate": 2.3589625393265895e-05, + "loss": 1.3754, + "step": 1743 + }, + { + "epoch": 1.3890214797136038, + "grad_norm": 1.6873931884765625, + "learning_rate": 2.3422270308954934e-05, + "loss": 1.4434, + "step": 1746 + }, + { + "epoch": 1.3914081145584727, + "grad_norm": 1.2441445589065552, + "learning_rate": 2.3255329215669185e-05, + "loss": 1.3577, + "step": 1749 + }, + { + "epoch": 1.3937947494033414, + "grad_norm": 1.4804401397705078, + "learning_rate": 2.3088804713785584e-05, + "loss": 1.4486, + "step": 1752 + }, + { + "epoch": 1.39618138424821, + "grad_norm": 1.3242276906967163, + "learning_rate": 2.2922699397191893e-05, + "loss": 1.3782, + "step": 1755 + }, + { + "epoch": 1.3985680190930787, + "grad_norm": 1.4735126495361328, + "learning_rate": 2.2757015853246493e-05, + "loss": 1.3395, + "step": 1758 + }, + { + "epoch": 1.4009546539379474, + "grad_norm": 1.2929046154022217, + "learning_rate": 2.2591756662737862e-05, + "loss": 1.4035, + "step": 1761 + }, + { + "epoch": 1.4033412887828163, + "grad_norm": 1.3094271421432495, + "learning_rate": 2.242692439984463e-05, + "loss": 1.3874, + "step": 1764 + }, + { + "epoch": 1.405727923627685, + "grad_norm": 1.598767876625061, + "learning_rate": 2.2262521632095203e-05, + "loss": 1.4688, + "step": 1767 + }, + { + "epoch": 1.4081145584725536, + "grad_norm": 1.4594007730484009, + "learning_rate": 2.2098550920327998e-05, + "loss": 1.2801, + "step": 1770 + }, + { + "epoch": 1.4105011933174225, + "grad_norm": 1.6119073629379272, + "learning_rate": 2.1935014818651405e-05, + "loss": 1.3331, + "step": 1773 + }, + { + "epoch": 1.4128878281622912, + "grad_norm": 1.351380705833435, + "learning_rate": 2.177191587440409e-05, + "loss": 1.3735, + "step": 1776 + }, + { + "epoch": 1.4152744630071599, + "grad_norm": 1.4811229705810547, + "learning_rate": 2.1609256628115316e-05, + "loss": 1.4969, + "step": 1779 + }, + { + "epoch": 1.4176610978520285, + "grad_norm": 1.5260910987854004, + "learning_rate": 2.1447039613465265e-05, + "loss": 1.4194, + "step": 1782 + }, + { + "epoch": 1.4200477326968974, + "grad_norm": 1.8757227659225464, + "learning_rate": 2.128526735724572e-05, + "loss": 1.4043, + "step": 1785 + }, + { + "epoch": 1.422434367541766, + "grad_norm": 1.8287010192871094, + "learning_rate": 2.1123942379320576e-05, + "loss": 1.3959, + "step": 1788 + }, + { + "epoch": 1.4248210023866348, + "grad_norm": 1.6383695602416992, + "learning_rate": 2.096306719258669e-05, + "loss": 1.3586, + "step": 1791 + }, + { + "epoch": 1.4272076372315037, + "grad_norm": 1.5422853231430054, + "learning_rate": 2.0802644302934683e-05, + "loss": 1.4287, + "step": 1794 + }, + { + "epoch": 1.4295942720763724, + "grad_norm": 1.5485094785690308, + "learning_rate": 2.0642676209209934e-05, + "loss": 1.5216, + "step": 1797 + }, + { + "epoch": 1.431980906921241, + "grad_norm": 1.9971394538879395, + "learning_rate": 2.0483165403173583e-05, + "loss": 1.4238, + "step": 1800 + }, + { + "epoch": 1.4343675417661097, + "grad_norm": 1.4561408758163452, + "learning_rate": 2.0324114369463855e-05, + "loss": 1.3847, + "step": 1803 + }, + { + "epoch": 1.4367541766109786, + "grad_norm": 1.280886173248291, + "learning_rate": 2.0165525585557204e-05, + "loss": 1.3258, + "step": 1806 + }, + { + "epoch": 1.4391408114558473, + "grad_norm": 1.3426673412322998, + "learning_rate": 2.0007401521729863e-05, + "loss": 1.3456, + "step": 1809 + }, + { + "epoch": 1.441527446300716, + "grad_norm": 1.6823667287826538, + "learning_rate": 1.984974464101928e-05, + "loss": 1.406, + "step": 1812 + }, + { + "epoch": 1.4439140811455848, + "grad_norm": 1.64955735206604, + "learning_rate": 1.9692557399185734e-05, + "loss": 1.5698, + "step": 1815 + }, + { + "epoch": 1.4463007159904535, + "grad_norm": 1.390384316444397, + "learning_rate": 1.953584224467418e-05, + "loss": 1.3851, + "step": 1818 + }, + { + "epoch": 1.4486873508353222, + "grad_norm": 1.543288230895996, + "learning_rate": 1.9379601618575977e-05, + "loss": 1.3457, + "step": 1821 + }, + { + "epoch": 1.4510739856801909, + "grad_norm": 1.235751986503601, + "learning_rate": 1.9223837954591046e-05, + "loss": 1.398, + "step": 1824 + }, + { + "epoch": 1.4534606205250595, + "grad_norm": 1.6137385368347168, + "learning_rate": 1.9068553678989736e-05, + "loss": 1.513, + "step": 1827 + }, + { + "epoch": 1.4558472553699284, + "grad_norm": 1.5480167865753174, + "learning_rate": 1.8913751210575248e-05, + "loss": 1.4052, + "step": 1830 + }, + { + "epoch": 1.458233890214797, + "grad_norm": 1.3694156408309937, + "learning_rate": 1.8759432960645774e-05, + "loss": 1.4116, + "step": 1833 + }, + { + "epoch": 1.460620525059666, + "grad_norm": 1.3757750988006592, + "learning_rate": 1.8605601332957077e-05, + "loss": 1.3357, + "step": 1836 + }, + { + "epoch": 1.4630071599045347, + "grad_norm": 1.315748691558838, + "learning_rate": 1.8452258723684995e-05, + "loss": 1.3027, + "step": 1839 + }, + { + "epoch": 1.4653937947494033, + "grad_norm": 1.4307796955108643, + "learning_rate": 1.8299407521388067e-05, + "loss": 1.323, + "step": 1842 + }, + { + "epoch": 1.467780429594272, + "grad_norm": 1.3611763715744019, + "learning_rate": 1.8147050106970437e-05, + "loss": 1.4422, + "step": 1845 + }, + { + "epoch": 1.4701670644391407, + "grad_norm": 1.3486278057098389, + "learning_rate": 1.7995188853644646e-05, + "loss": 1.4126, + "step": 1848 + }, + { + "epoch": 1.4725536992840096, + "grad_norm": 1.381899118423462, + "learning_rate": 1.784382612689477e-05, + "loss": 1.3985, + "step": 1851 + }, + { + "epoch": 1.4749403341288783, + "grad_norm": 1.3606630563735962, + "learning_rate": 1.7692964284439505e-05, + "loss": 1.3024, + "step": 1854 + }, + { + "epoch": 1.477326968973747, + "grad_norm": 1.542244791984558, + "learning_rate": 1.7542605676195506e-05, + "loss": 1.3591, + "step": 1857 + }, + { + "epoch": 1.4797136038186158, + "grad_norm": 1.285721778869629, + "learning_rate": 1.739275264424067e-05, + "loss": 1.4591, + "step": 1860 + }, + { + "epoch": 1.4821002386634845, + "grad_norm": 1.584008812904358, + "learning_rate": 1.7243407522777806e-05, + "loss": 1.4387, + "step": 1863 + }, + { + "epoch": 1.4844868735083532, + "grad_norm": 1.2456049919128418, + "learning_rate": 1.7094572638098123e-05, + "loss": 1.4347, + "step": 1866 + }, + { + "epoch": 1.4868735083532219, + "grad_norm": 1.5531784296035767, + "learning_rate": 1.6946250308545125e-05, + "loss": 1.3157, + "step": 1869 + }, + { + "epoch": 1.4892601431980907, + "grad_norm": 1.3614214658737183, + "learning_rate": 1.6798442844478445e-05, + "loss": 1.3473, + "step": 1872 + }, + { + "epoch": 1.4916467780429594, + "grad_norm": 1.2862516641616821, + "learning_rate": 1.6651152548237802e-05, + "loss": 1.3371, + "step": 1875 + }, + { + "epoch": 1.494033412887828, + "grad_norm": 1.3549131155014038, + "learning_rate": 1.6504381714107252e-05, + "loss": 1.4438, + "step": 1878 + }, + { + "epoch": 1.496420047732697, + "grad_norm": 1.3501994609832764, + "learning_rate": 1.6358132628279322e-05, + "loss": 1.3332, + "step": 1881 + }, + { + "epoch": 1.4988066825775657, + "grad_norm": 1.3975999355316162, + "learning_rate": 1.6212407568819565e-05, + "loss": 1.3167, + "step": 1884 + }, + { + "epoch": 1.5011933174224343, + "grad_norm": 1.443943738937378, + "learning_rate": 1.6067208805630877e-05, + "loss": 1.3842, + "step": 1887 + }, + { + "epoch": 1.503579952267303, + "grad_norm": 2.483057975769043, + "learning_rate": 1.5922538600418318e-05, + "loss": 1.351, + "step": 1890 + }, + { + "epoch": 1.5059665871121717, + "grad_norm": 1.4036768674850464, + "learning_rate": 1.5778399206653734e-05, + "loss": 1.3499, + "step": 1893 + }, + { + "epoch": 1.5083532219570406, + "grad_norm": 1.3263362646102905, + "learning_rate": 1.563479286954078e-05, + "loss": 1.3777, + "step": 1896 + }, + { + "epoch": 1.5107398568019093, + "grad_norm": 1.3591517210006714, + "learning_rate": 1.54917218259799e-05, + "loss": 1.293, + "step": 1899 + }, + { + "epoch": 1.5131264916467781, + "grad_norm": 1.310631513595581, + "learning_rate": 1.5349188304533413e-05, + "loss": 1.4139, + "step": 1902 + }, + { + "epoch": 1.5155131264916468, + "grad_norm": 1.3131390810012817, + "learning_rate": 1.5207194525390938e-05, + "loss": 1.4123, + "step": 1905 + }, + { + "epoch": 1.5178997613365155, + "grad_norm": 1.4222484827041626, + "learning_rate": 1.5065742700334678e-05, + "loss": 1.3474, + "step": 1908 + }, + { + "epoch": 1.5202863961813842, + "grad_norm": 1.4119162559509277, + "learning_rate": 1.4924835032705064e-05, + "loss": 1.3558, + "step": 1911 + }, + { + "epoch": 1.5226730310262528, + "grad_norm": 1.356094479560852, + "learning_rate": 1.4784473717366387e-05, + "loss": 1.4808, + "step": 1914 + }, + { + "epoch": 1.5250596658711217, + "grad_norm": 1.249845266342163, + "learning_rate": 1.4644660940672627e-05, + "loss": 1.3971, + "step": 1917 + }, + { + "epoch": 1.5274463007159904, + "grad_norm": 1.2531503438949585, + "learning_rate": 1.4505398880433369e-05, + "loss": 1.3037, + "step": 1920 + }, + { + "epoch": 1.5298329355608593, + "grad_norm": 1.3742297887802124, + "learning_rate": 1.4366689705879898e-05, + "loss": 1.3794, + "step": 1923 + }, + { + "epoch": 1.532219570405728, + "grad_norm": 1.2931220531463623, + "learning_rate": 1.4228535577631442e-05, + "loss": 1.2969, + "step": 1926 + }, + { + "epoch": 1.5346062052505967, + "grad_norm": 1.314113974571228, + "learning_rate": 1.4090938647661461e-05, + "loss": 1.4102, + "step": 1929 + }, + { + "epoch": 1.5369928400954653, + "grad_norm": 1.6142003536224365, + "learning_rate": 1.3953901059264191e-05, + "loss": 1.5013, + "step": 1932 + }, + { + "epoch": 1.539379474940334, + "grad_norm": 1.374854564666748, + "learning_rate": 1.3817424947021151e-05, + "loss": 1.3812, + "step": 1935 + }, + { + "epoch": 1.541766109785203, + "grad_norm": 1.574981689453125, + "learning_rate": 1.3681512436768045e-05, + "loss": 1.5476, + "step": 1938 + }, + { + "epoch": 1.5441527446300716, + "grad_norm": 1.3789339065551758, + "learning_rate": 1.3546165645561487e-05, + "loss": 1.3535, + "step": 1941 + }, + { + "epoch": 1.5465393794749405, + "grad_norm": 1.4727178812026978, + "learning_rate": 1.3411386681646164e-05, + "loss": 1.3763, + "step": 1944 + }, + { + "epoch": 1.5489260143198091, + "grad_norm": 1.3274617195129395, + "learning_rate": 1.3277177644421924e-05, + "loss": 1.3276, + "step": 1947 + }, + { + "epoch": 1.5513126491646778, + "grad_norm": 1.5898061990737915, + "learning_rate": 1.314354062441106e-05, + "loss": 1.4907, + "step": 1950 + }, + { + "epoch": 1.5536992840095465, + "grad_norm": 1.401552677154541, + "learning_rate": 1.301047770322581e-05, + "loss": 1.3592, + "step": 1953 + }, + { + "epoch": 1.5560859188544152, + "grad_norm": 1.2774077653884888, + "learning_rate": 1.287799095353584e-05, + "loss": 1.2775, + "step": 1956 + }, + { + "epoch": 1.558472553699284, + "grad_norm": 1.458093523979187, + "learning_rate": 1.2746082439036117e-05, + "loss": 1.3795, + "step": 1959 + }, + { + "epoch": 1.5608591885441527, + "grad_norm": 1.4670827388763428, + "learning_rate": 1.2614754214414548e-05, + "loss": 1.3595, + "step": 1962 + }, + { + "epoch": 1.5632458233890216, + "grad_norm": 1.629981279373169, + "learning_rate": 1.2484008325320174e-05, + "loss": 1.3965, + "step": 1965 + }, + { + "epoch": 1.5656324582338903, + "grad_norm": 1.4332212209701538, + "learning_rate": 1.2353846808331154e-05, + "loss": 1.3406, + "step": 1968 + }, + { + "epoch": 1.568019093078759, + "grad_norm": 1.4571059942245483, + "learning_rate": 1.2224271690923155e-05, + "loss": 1.3083, + "step": 1971 + }, + { + "epoch": 1.5704057279236276, + "grad_norm": 1.2997206449508667, + "learning_rate": 1.2095284991437733e-05, + "loss": 1.3003, + "step": 1974 + }, + { + "epoch": 1.5727923627684963, + "grad_norm": 1.315752625465393, + "learning_rate": 1.1966888719050829e-05, + "loss": 1.4088, + "step": 1977 + }, + { + "epoch": 1.575178997613365, + "grad_norm": 1.3280858993530273, + "learning_rate": 1.1839084873741584e-05, + "loss": 1.3781, + "step": 1980 + }, + { + "epoch": 1.577565632458234, + "grad_norm": 1.4649864435195923, + "learning_rate": 1.1711875446261094e-05, + "loss": 1.4019, + "step": 1983 + }, + { + "epoch": 1.5799522673031028, + "grad_norm": 1.6478016376495361, + "learning_rate": 1.1585262418101467e-05, + "loss": 1.4548, + "step": 1986 + }, + { + "epoch": 1.5823389021479715, + "grad_norm": 1.4969959259033203, + "learning_rate": 1.1459247761464909e-05, + "loss": 1.3712, + "step": 1989 + }, + { + "epoch": 1.5847255369928401, + "grad_norm": 1.4021399021148682, + "learning_rate": 1.1333833439233055e-05, + "loss": 1.3716, + "step": 1992 + }, + { + "epoch": 1.5871121718377088, + "grad_norm": 1.4327634572982788, + "learning_rate": 1.1209021404936304e-05, + "loss": 1.3505, + "step": 1995 + }, + { + "epoch": 1.5894988066825775, + "grad_norm": 1.3530958890914917, + "learning_rate": 1.1084813602723515e-05, + "loss": 1.4087, + "step": 1998 + }, + { + "epoch": 1.5918854415274462, + "grad_norm": 1.4223620891571045, + "learning_rate": 1.0961211967331597e-05, + "loss": 1.4233, + "step": 2001 + }, + { + "epoch": 1.594272076372315, + "grad_norm": 1.3832675218582153, + "learning_rate": 1.083821842405548e-05, + "loss": 1.3184, + "step": 2004 + }, + { + "epoch": 1.5966587112171837, + "grad_norm": 4.159378528594971, + "learning_rate": 1.0715834888718074e-05, + "loss": 1.2636, + "step": 2007 + }, + { + "epoch": 1.5990453460620526, + "grad_norm": 1.4075279235839844, + "learning_rate": 1.0594063267640386e-05, + "loss": 1.2941, + "step": 2010 + }, + { + "epoch": 1.6014319809069213, + "grad_norm": 1.3836835622787476, + "learning_rate": 1.0472905457611936e-05, + "loss": 1.3858, + "step": 2013 + }, + { + "epoch": 1.60381861575179, + "grad_norm": 1.3855738639831543, + "learning_rate": 1.0352363345861065e-05, + "loss": 1.3013, + "step": 2016 + }, + { + "epoch": 1.6062052505966586, + "grad_norm": 1.3453712463378906, + "learning_rate": 1.023243881002573e-05, + "loss": 1.5934, + "step": 2019 + }, + { + "epoch": 1.6085918854415273, + "grad_norm": 14.255722999572754, + "learning_rate": 1.0113133718124035e-05, + "loss": 1.4809, + "step": 2022 + }, + { + "epoch": 1.6109785202863962, + "grad_norm": 1.4346468448638916, + "learning_rate": 9.994449928525324e-06, + "loss": 1.4255, + "step": 2025 + }, + { + "epoch": 1.6133651551312649, + "grad_norm": 1.797566533088684, + "learning_rate": 9.876389289921106e-06, + "loss": 1.4909, + "step": 2028 + }, + { + "epoch": 1.6157517899761338, + "grad_norm": 1.2779227495193481, + "learning_rate": 9.758953641296331e-06, + "loss": 1.3708, + "step": 2031 + }, + { + "epoch": 1.6181384248210025, + "grad_norm": 1.439191460609436, + "learning_rate": 9.642144811900739e-06, + "loss": 1.3745, + "step": 2034 + }, + { + "epoch": 1.6205250596658711, + "grad_norm": 1.3853205442428589, + "learning_rate": 9.5259646212203e-06, + "loss": 1.3369, + "step": 2037 + }, + { + "epoch": 1.6229116945107398, + "grad_norm": 1.397948145866394, + "learning_rate": 9.410414878948975e-06, + "loss": 1.365, + "step": 2040 + }, + { + "epoch": 1.6252983293556085, + "grad_norm": 1.2659891843795776, + "learning_rate": 9.295497384960416e-06, + "loss": 1.3817, + "step": 2043 + }, + { + "epoch": 1.6276849642004774, + "grad_norm": 1.4338490962982178, + "learning_rate": 9.181213929280046e-06, + "loss": 1.4503, + "step": 2046 + }, + { + "epoch": 1.630071599045346, + "grad_norm": 1.3001736402511597, + "learning_rate": 9.067566292057084e-06, + "loss": 1.3814, + "step": 2049 + }, + { + "epoch": 1.632458233890215, + "grad_norm": 1.4515080451965332, + "learning_rate": 8.954556243536877e-06, + "loss": 1.2801, + "step": 2052 + }, + { + "epoch": 1.6348448687350836, + "grad_norm": 1.4361463785171509, + "learning_rate": 8.842185544033255e-06, + "loss": 1.4327, + "step": 2055 + }, + { + "epoch": 1.6372315035799523, + "grad_norm": 1.36533522605896, + "learning_rate": 8.7304559439012e-06, + "loss": 1.4052, + "step": 2058 + }, + { + "epoch": 1.639618138424821, + "grad_norm": 1.3295538425445557, + "learning_rate": 8.619369183509501e-06, + "loss": 1.3336, + "step": 2061 + }, + { + "epoch": 1.6420047732696896, + "grad_norm": 1.3914989233016968, + "learning_rate": 8.508926993213712e-06, + "loss": 1.4033, + "step": 2064 + }, + { + "epoch": 1.6443914081145583, + "grad_norm": 1.3706380128860474, + "learning_rate": 8.39913109332916e-06, + "loss": 1.3468, + "step": 2067 + }, + { + "epoch": 1.6467780429594272, + "grad_norm": 1.386931300163269, + "learning_rate": 8.28998319410413e-06, + "loss": 1.3398, + "step": 2070 + }, + { + "epoch": 1.649164677804296, + "grad_norm": 1.307201623916626, + "learning_rate": 8.181484995693295e-06, + "loss": 1.3215, + "step": 2073 + }, + { + "epoch": 1.6515513126491648, + "grad_norm": 1.7024941444396973, + "learning_rate": 8.073638188131128e-06, + "loss": 1.3774, + "step": 2076 + }, + { + "epoch": 1.6539379474940334, + "grad_norm": 1.461051106452942, + "learning_rate": 7.966444451305726e-06, + "loss": 1.4113, + "step": 2079 + }, + { + "epoch": 1.6563245823389021, + "grad_norm": 1.3585187196731567, + "learning_rate": 7.859905454932471e-06, + "loss": 1.2784, + "step": 2082 + }, + { + "epoch": 1.6587112171837708, + "grad_norm": 1.2371797561645508, + "learning_rate": 7.75402285852816e-06, + "loss": 1.3123, + "step": 2085 + }, + { + "epoch": 1.6610978520286395, + "grad_norm": 1.3656738996505737, + "learning_rate": 7.648798311385058e-06, + "loss": 1.3187, + "step": 2088 + }, + { + "epoch": 1.6634844868735084, + "grad_norm": 1.3773971796035767, + "learning_rate": 7.5442334525452964e-06, + "loss": 1.2809, + "step": 2091 + }, + { + "epoch": 1.665871121718377, + "grad_norm": 1.370139718055725, + "learning_rate": 7.440329910775273e-06, + "loss": 1.3349, + "step": 2094 + }, + { + "epoch": 1.668257756563246, + "grad_norm": 1.5281294584274292, + "learning_rate": 7.337089304540301e-06, + "loss": 1.3332, + "step": 2097 + }, + { + "epoch": 1.6706443914081146, + "grad_norm": 1.3767362833023071, + "learning_rate": 7.234513241979418e-06, + "loss": 1.3754, + "step": 2100 + }, + { + "epoch": 1.6730310262529833, + "grad_norm": 1.2722810506820679, + "learning_rate": 7.132603320880294e-06, + "loss": 1.3081, + "step": 2103 + }, + { + "epoch": 1.675417661097852, + "grad_norm": 1.3534489870071411, + "learning_rate": 7.031361128654401e-06, + "loss": 1.4329, + "step": 2106 + }, + { + "epoch": 1.6778042959427206, + "grad_norm": 1.2815319299697876, + "learning_rate": 6.930788242312253e-06, + "loss": 1.3198, + "step": 2109 + }, + { + "epoch": 1.6801909307875895, + "grad_norm": 1.3864092826843262, + "learning_rate": 6.830886228438837e-06, + "loss": 1.4143, + "step": 2112 + }, + { + "epoch": 1.6825775656324582, + "grad_norm": 1.338248372077942, + "learning_rate": 6.731656643169204e-06, + "loss": 1.3364, + "step": 2115 + }, + { + "epoch": 1.684964200477327, + "grad_norm": 1.4621409177780151, + "learning_rate": 6.633101032164274e-06, + "loss": 1.4438, + "step": 2118 + }, + { + "epoch": 1.6873508353221958, + "grad_norm": 1.3858606815338135, + "learning_rate": 6.535220930586705e-06, + "loss": 1.5138, + "step": 2121 + }, + { + "epoch": 1.6897374701670644, + "grad_norm": 1.2535791397094727, + "learning_rate": 6.4380178630770225e-06, + "loss": 1.3784, + "step": 2124 + }, + { + "epoch": 1.692124105011933, + "grad_norm": 1.3751507997512817, + "learning_rate": 6.341493343729854e-06, + "loss": 1.4508, + "step": 2127 + }, + { + "epoch": 1.6945107398568018, + "grad_norm": 1.2520653009414673, + "learning_rate": 6.2456488760703205e-06, + "loss": 1.4547, + "step": 2130 + }, + { + "epoch": 1.6968973747016707, + "grad_norm": 1.4060708284378052, + "learning_rate": 6.150485953030677e-06, + "loss": 1.223, + "step": 2133 + }, + { + "epoch": 1.6992840095465394, + "grad_norm": 1.4133458137512207, + "learning_rate": 6.056006056926977e-06, + "loss": 1.4631, + "step": 2136 + }, + { + "epoch": 1.7016706443914082, + "grad_norm": 1.3407944440841675, + "learning_rate": 5.962210659436091e-06, + "loss": 1.3376, + "step": 2139 + }, + { + "epoch": 1.704057279236277, + "grad_norm": 1.3000420331954956, + "learning_rate": 5.869101221572654e-06, + "loss": 1.2783, + "step": 2142 + }, + { + "epoch": 1.7064439140811456, + "grad_norm": 1.3916162252426147, + "learning_rate": 5.776679193666412e-06, + "loss": 1.4991, + "step": 2145 + }, + { + "epoch": 1.7088305489260143, + "grad_norm": 1.5495882034301758, + "learning_rate": 5.6849460153395706e-06, + "loss": 1.3435, + "step": 2148 + }, + { + "epoch": 1.711217183770883, + "grad_norm": 1.365742564201355, + "learning_rate": 5.5939031154844e-06, + "loss": 1.3268, + "step": 2151 + }, + { + "epoch": 1.7136038186157518, + "grad_norm": 1.3886197805404663, + "learning_rate": 5.5035519122409895e-06, + "loss": 1.3701, + "step": 2154 + }, + { + "epoch": 1.7159904534606205, + "grad_norm": 1.5498794317245483, + "learning_rate": 5.413893812975096e-06, + "loss": 1.4669, + "step": 2157 + }, + { + "epoch": 1.7183770883054894, + "grad_norm": 1.51931893825531, + "learning_rate": 5.324930214256302e-06, + "loss": 1.4072, + "step": 2160 + }, + { + "epoch": 1.720763723150358, + "grad_norm": 1.441819667816162, + "learning_rate": 5.236662501836192e-06, + "loss": 1.3644, + "step": 2163 + }, + { + "epoch": 1.7231503579952268, + "grad_norm": 1.5425219535827637, + "learning_rate": 5.149092050626825e-06, + "loss": 1.5182, + "step": 2166 + }, + { + "epoch": 1.7255369928400954, + "grad_norm": 1.4178110361099243, + "learning_rate": 5.062220224679276e-06, + "loss": 1.3155, + "step": 2169 + }, + { + "epoch": 1.727923627684964, + "grad_norm": 1.401777982711792, + "learning_rate": 4.9760483771624236e-06, + "loss": 1.3718, + "step": 2172 + }, + { + "epoch": 1.7303102625298328, + "grad_norm": 1.3606517314910889, + "learning_rate": 4.89057785034181e-06, + "loss": 1.3611, + "step": 2175 + }, + { + "epoch": 1.7326968973747017, + "grad_norm": 1.4071412086486816, + "learning_rate": 4.805809975558828e-06, + "loss": 1.3619, + "step": 2178 + }, + { + "epoch": 1.7350835322195706, + "grad_norm": 1.3548985719680786, + "learning_rate": 4.721746073209893e-06, + "loss": 1.3098, + "step": 2181 + }, + { + "epoch": 1.7374701670644392, + "grad_norm": 1.4642560482025146, + "learning_rate": 4.6383874527259345e-06, + "loss": 1.4429, + "step": 2184 + }, + { + "epoch": 1.739856801909308, + "grad_norm": 1.3343088626861572, + "learning_rate": 4.555735412551975e-06, + "loss": 1.3698, + "step": 2187 + }, + { + "epoch": 1.7422434367541766, + "grad_norm": 1.6669230461120605, + "learning_rate": 4.47379124012689e-06, + "loss": 1.3676, + "step": 2190 + }, + { + "epoch": 1.7446300715990453, + "grad_norm": 1.513681173324585, + "learning_rate": 4.3925562118634135e-06, + "loss": 1.4566, + "step": 2193 + }, + { + "epoch": 1.747016706443914, + "grad_norm": 1.3339098691940308, + "learning_rate": 4.312031593128163e-06, + "loss": 1.513, + "step": 2196 + }, + { + "epoch": 1.7494033412887828, + "grad_norm": 1.3722683191299438, + "learning_rate": 4.232218638222029e-06, + "loss": 1.3236, + "step": 2199 + }, + { + "epoch": 1.7517899761336515, + "grad_norm": 1.3475229740142822, + "learning_rate": 4.153118590360561e-06, + "loss": 1.3351, + "step": 2202 + }, + { + "epoch": 1.7541766109785204, + "grad_norm": 1.2870583534240723, + "learning_rate": 4.074732681654647e-06, + "loss": 1.291, + "step": 2205 + }, + { + "epoch": 1.756563245823389, + "grad_norm": 1.3666460514068604, + "learning_rate": 3.997062133091284e-06, + "loss": 1.274, + "step": 2208 + }, + { + "epoch": 1.7589498806682577, + "grad_norm": 1.243913173675537, + "learning_rate": 3.920108154514585e-06, + "loss": 1.2515, + "step": 2211 + }, + { + "epoch": 1.7613365155131264, + "grad_norm": 1.3077210187911987, + "learning_rate": 3.843871944606969e-06, + "loss": 1.2814, + "step": 2214 + }, + { + "epoch": 1.763723150357995, + "grad_norm": 1.4350789785385132, + "learning_rate": 3.7683546908703903e-06, + "loss": 1.4259, + "step": 2217 + }, + { + "epoch": 1.766109785202864, + "grad_norm": 1.3633619546890259, + "learning_rate": 3.693557569607947e-06, + "loss": 1.3094, + "step": 2220 + }, + { + "epoch": 1.7684964200477327, + "grad_norm": 1.5014336109161377, + "learning_rate": 3.6194817459054676e-06, + "loss": 1.4016, + "step": 2223 + }, + { + "epoch": 1.7708830548926016, + "grad_norm": 1.2430241107940674, + "learning_rate": 3.5461283736134722e-06, + "loss": 1.3035, + "step": 2226 + }, + { + "epoch": 1.7732696897374702, + "grad_norm": 1.3036683797836304, + "learning_rate": 3.4734985953290778e-06, + "loss": 1.3666, + "step": 2229 + }, + { + "epoch": 1.775656324582339, + "grad_norm": 1.1916844844818115, + "learning_rate": 3.401593542378262e-06, + "loss": 1.364, + "step": 2232 + }, + { + "epoch": 1.7780429594272076, + "grad_norm": 1.4589468240737915, + "learning_rate": 3.330414334798265e-06, + "loss": 1.2668, + "step": 2235 + }, + { + "epoch": 1.7804295942720763, + "grad_norm": 1.426949143409729, + "learning_rate": 3.2599620813200837e-06, + "loss": 1.3789, + "step": 2238 + }, + { + "epoch": 1.7828162291169452, + "grad_norm": 1.4803996086120605, + "learning_rate": 3.1902378793512657e-06, + "loss": 1.4121, + "step": 2241 + }, + { + "epoch": 1.7852028639618138, + "grad_norm": 1.3442944288253784, + "learning_rate": 3.121242814958747e-06, + "loss": 1.3518, + "step": 2244 + }, + { + "epoch": 1.7875894988066827, + "grad_norm": 1.3079568147659302, + "learning_rate": 3.0529779628519992e-06, + "loss": 1.368, + "step": 2247 + }, + { + "epoch": 1.7899761336515514, + "grad_norm": 1.3837271928787231, + "learning_rate": 2.9854443863662262e-06, + "loss": 1.403, + "step": 2250 + }, + { + "epoch": 1.79236276849642, + "grad_norm": 2.337009906768799, + "learning_rate": 2.918643137445859e-06, + "loss": 1.3971, + "step": 2253 + }, + { + "epoch": 1.7947494033412887, + "grad_norm": 1.3745216131210327, + "learning_rate": 2.8525752566281482e-06, + "loss": 1.3739, + "step": 2256 + }, + { + "epoch": 1.7971360381861574, + "grad_norm": 1.4141366481781006, + "learning_rate": 2.787241773026933e-06, + "loss": 1.3846, + "step": 2259 + }, + { + "epoch": 1.799522673031026, + "grad_norm": 1.413260579109192, + "learning_rate": 2.722643704316652e-06, + "loss": 1.4082, + "step": 2262 + }, + { + "epoch": 1.801909307875895, + "grad_norm": 1.37075936794281, + "learning_rate": 2.658782056716441e-06, + "loss": 1.3568, + "step": 2265 + }, + { + "epoch": 1.8042959427207639, + "grad_norm": 1.3348731994628906, + "learning_rate": 2.5956578249745236e-06, + "loss": 1.3443, + "step": 2268 + }, + { + "epoch": 1.8066825775656326, + "grad_norm": 1.3601338863372803, + "learning_rate": 2.533271992352659e-06, + "loss": 1.3713, + "step": 2271 + }, + { + "epoch": 1.8090692124105012, + "grad_norm": 1.3218231201171875, + "learning_rate": 2.4716255306108605e-06, + "loss": 1.347, + "step": 2274 + }, + { + "epoch": 1.81145584725537, + "grad_norm": 1.3518648147583008, + "learning_rate": 2.4107193999922286e-06, + "loss": 1.2742, + "step": 2277 + }, + { + "epoch": 1.8138424821002386, + "grad_norm": 1.2940058708190918, + "learning_rate": 2.3505545492080395e-06, + "loss": 1.246, + "step": 2280 + }, + { + "epoch": 1.8162291169451072, + "grad_norm": 1.4600387811660767, + "learning_rate": 2.291131915422917e-06, + "loss": 1.3453, + "step": 2283 + }, + { + "epoch": 1.8186157517899761, + "grad_norm": 1.476802945137024, + "learning_rate": 2.2324524242402613e-06, + "loss": 1.3603, + "step": 2286 + }, + { + "epoch": 1.8210023866348448, + "grad_norm": 2.0956149101257324, + "learning_rate": 2.1745169896878414e-06, + "loss": 1.3493, + "step": 2289 + }, + { + "epoch": 1.8233890214797137, + "grad_norm": 1.4859145879745483, + "learning_rate": 2.117326514203527e-06, + "loss": 1.4109, + "step": 2292 + }, + { + "epoch": 1.8257756563245824, + "grad_norm": 1.400531530380249, + "learning_rate": 2.0608818886212576e-06, + "loss": 1.3619, + "step": 2295 + }, + { + "epoch": 1.828162291169451, + "grad_norm": 1.3414305448532104, + "learning_rate": 2.0051839921571448e-06, + "loss": 1.404, + "step": 2298 + }, + { + "epoch": 1.8305489260143197, + "grad_norm": 1.4340054988861084, + "learning_rate": 1.9502336923958255e-06, + "loss": 1.2401, + "step": 2301 + }, + { + "epoch": 1.8329355608591884, + "grad_norm": 1.320109486579895, + "learning_rate": 1.8960318452768577e-06, + "loss": 1.326, + "step": 2304 + }, + { + "epoch": 1.8353221957040573, + "grad_norm": 1.440313458442688, + "learning_rate": 1.8425792950814868e-06, + "loss": 1.4369, + "step": 2307 + }, + { + "epoch": 1.837708830548926, + "grad_norm": 1.357653260231018, + "learning_rate": 1.7898768744194162e-06, + "loss": 1.3528, + "step": 2310 + }, + { + "epoch": 1.8400954653937949, + "grad_norm": 1.28446364402771, + "learning_rate": 1.7379254042158955e-06, + "loss": 1.3513, + "step": 2313 + }, + { + "epoch": 1.8424821002386635, + "grad_norm": 1.5899930000305176, + "learning_rate": 1.6867256936989096e-06, + "loss": 1.4262, + "step": 2316 + }, + { + "epoch": 1.8448687350835322, + "grad_norm": 1.353194236755371, + "learning_rate": 1.6362785403865488e-06, + "loss": 1.345, + "step": 2319 + }, + { + "epoch": 1.847255369928401, + "grad_norm": 1.2713377475738525, + "learning_rate": 1.5865847300746417e-06, + "loss": 1.284, + "step": 2322 + }, + { + "epoch": 1.8496420047732696, + "grad_norm": 1.2895587682724, + "learning_rate": 1.5376450368244589e-06, + "loss": 1.3386, + "step": 2325 + }, + { + "epoch": 1.8520286396181385, + "grad_norm": 1.3775376081466675, + "learning_rate": 1.4894602229506892e-06, + "loss": 1.3587, + "step": 2328 + }, + { + "epoch": 1.8544152744630071, + "grad_norm": 1.153599739074707, + "learning_rate": 1.4420310390095615e-06, + "loss": 1.25, + "step": 2331 + }, + { + "epoch": 1.856801909307876, + "grad_norm": 1.2969733476638794, + "learning_rate": 1.3953582237871521e-06, + "loss": 1.3874, + "step": 2334 + }, + { + "epoch": 1.8591885441527447, + "grad_norm": 1.3307387828826904, + "learning_rate": 1.3494425042878622e-06, + "loss": 1.389, + "step": 2337 + }, + { + "epoch": 1.8615751789976134, + "grad_norm": 1.3975698947906494, + "learning_rate": 1.3042845957231153e-06, + "loss": 1.3198, + "step": 2340 + }, + { + "epoch": 1.863961813842482, + "grad_norm": 1.4592026472091675, + "learning_rate": 1.2598852015001994e-06, + "loss": 1.354, + "step": 2343 + }, + { + "epoch": 1.8663484486873507, + "grad_norm": 1.3790894746780396, + "learning_rate": 1.2162450132113201e-06, + "loss": 1.46, + "step": 2346 + }, + { + "epoch": 1.8687350835322196, + "grad_norm": 1.3110777139663696, + "learning_rate": 1.1733647106228375e-06, + "loss": 1.3819, + "step": 2349 + }, + { + "epoch": 1.8711217183770883, + "grad_norm": 1.447709083557129, + "learning_rate": 1.1312449616646403e-06, + "loss": 1.293, + "step": 2352 + }, + { + "epoch": 1.8735083532219572, + "grad_norm": 1.2239938974380493, + "learning_rate": 1.0898864224197946e-06, + "loss": 1.2388, + "step": 2355 + }, + { + "epoch": 1.8758949880668259, + "grad_norm": 1.3444265127182007, + "learning_rate": 1.049289737114273e-06, + "loss": 1.3817, + "step": 2358 + }, + { + "epoch": 1.8782816229116945, + "grad_norm": 1.226277232170105, + "learning_rate": 1.009455538106968e-06, + "loss": 1.2544, + "step": 2361 + }, + { + "epoch": 1.8806682577565632, + "grad_norm": 1.3254185914993286, + "learning_rate": 9.703844458797962e-07, + "loss": 1.4172, + "step": 2364 + }, + { + "epoch": 1.8830548926014319, + "grad_norm": 1.290475606918335, + "learning_rate": 9.320770690280645e-07, + "loss": 1.3941, + "step": 2367 + }, + { + "epoch": 1.8854415274463006, + "grad_norm": 1.4123435020446777, + "learning_rate": 8.945340042509797e-07, + "loss": 1.3697, + "step": 2370 + }, + { + "epoch": 1.8878281622911695, + "grad_norm": 1.3699328899383545, + "learning_rate": 8.577558363423554e-07, + "loss": 1.3808, + "step": 2373 + }, + { + "epoch": 1.8902147971360383, + "grad_norm": 1.2443058490753174, + "learning_rate": 8.217431381815077e-07, + "loss": 1.3819, + "step": 2376 + }, + { + "epoch": 1.892601431980907, + "grad_norm": 1.4468942880630493, + "learning_rate": 7.864964707243072e-07, + "loss": 1.3101, + "step": 2379 + }, + { + "epoch": 1.8949880668257757, + "grad_norm": 1.3872753381729126, + "learning_rate": 7.520163829944804e-07, + "loss": 1.2876, + "step": 2382 + }, + { + "epoch": 1.8973747016706444, + "grad_norm": 1.4598524570465088, + "learning_rate": 7.183034120750221e-07, + "loss": 1.3302, + "step": 2385 + }, + { + "epoch": 1.899761336515513, + "grad_norm": 1.4307209253311157, + "learning_rate": 6.85358083099863e-07, + "loss": 1.3474, + "step": 2388 + }, + { + "epoch": 1.9021479713603817, + "grad_norm": 1.347608208656311, + "learning_rate": 6.531809092456598e-07, + "loss": 1.2955, + "step": 2391 + }, + { + "epoch": 1.9045346062052506, + "grad_norm": 1.2689610719680786, + "learning_rate": 6.217723917238128e-07, + "loss": 1.4583, + "step": 2394 + }, + { + "epoch": 1.9069212410501193, + "grad_norm": 1.3200253248214722, + "learning_rate": 5.911330197726661e-07, + "loss": 1.2874, + "step": 2397 + }, + { + "epoch": 1.9093078758949882, + "grad_norm": 1.2589712142944336, + "learning_rate": 5.612632706498755e-07, + "loss": 1.3658, + "step": 2400 + }, + { + "epoch": 1.9116945107398569, + "grad_norm": 1.2950568199157715, + "learning_rate": 5.321636096249749e-07, + "loss": 1.3954, + "step": 2403 + }, + { + "epoch": 1.9140811455847255, + "grad_norm": 1.563720703125, + "learning_rate": 5.038344899721436e-07, + "loss": 1.326, + "step": 2406 + }, + { + "epoch": 1.9164677804295942, + "grad_norm": 1.2961969375610352, + "learning_rate": 4.762763529631342e-07, + "loss": 1.3369, + "step": 2409 + }, + { + "epoch": 1.9188544152744629, + "grad_norm": 1.4480239152908325, + "learning_rate": 4.4948962786039437e-07, + "loss": 1.3879, + "step": 2412 + }, + { + "epoch": 1.9212410501193318, + "grad_norm": 1.3828881978988647, + "learning_rate": 4.234747319103949e-07, + "loss": 1.2412, + "step": 2415 + }, + { + "epoch": 1.9236276849642004, + "grad_norm": 1.4093812704086304, + "learning_rate": 3.9823207033710676e-07, + "loss": 1.3996, + "step": 2418 + }, + { + "epoch": 1.9260143198090693, + "grad_norm": 1.438379168510437, + "learning_rate": 3.737620363357286e-07, + "loss": 1.2387, + "step": 2421 + }, + { + "epoch": 1.928400954653938, + "grad_norm": 1.32201087474823, + "learning_rate": 3.5006501106651937e-07, + "loss": 1.3328, + "step": 2424 + }, + { + "epoch": 1.9307875894988067, + "grad_norm": 1.276695966720581, + "learning_rate": 3.2714136364888073e-07, + "loss": 1.3076, + "step": 2427 + }, + { + "epoch": 1.9331742243436754, + "grad_norm": 1.8650705814361572, + "learning_rate": 3.0499145115561176e-07, + "loss": 1.4659, + "step": 2430 + }, + { + "epoch": 1.935560859188544, + "grad_norm": 1.3024852275848389, + "learning_rate": 2.836156186073413e-07, + "loss": 1.3261, + "step": 2433 + }, + { + "epoch": 1.937947494033413, + "grad_norm": 1.2863266468048096, + "learning_rate": 2.630141989671542e-07, + "loss": 1.3533, + "step": 2436 + }, + { + "epoch": 1.9403341288782816, + "grad_norm": 1.443809151649475, + "learning_rate": 2.431875131354011e-07, + "loss": 1.444, + "step": 2439 + }, + { + "epoch": 1.9427207637231505, + "grad_norm": 1.2889124155044556, + "learning_rate": 2.2413586994470825e-07, + "loss": 1.3069, + "step": 2442 + }, + { + "epoch": 1.9451073985680192, + "grad_norm": 1.3247694969177246, + "learning_rate": 2.0585956615515323e-07, + "loss": 1.3889, + "step": 2445 + }, + { + "epoch": 1.9474940334128878, + "grad_norm": 1.3477109670639038, + "learning_rate": 1.8835888644966325e-07, + "loss": 1.3446, + "step": 2448 + }, + { + "epoch": 1.9498806682577565, + "grad_norm": 1.4671467542648315, + "learning_rate": 1.7163410342956875e-07, + "loss": 1.3727, + "step": 2451 + }, + { + "epoch": 1.9522673031026252, + "grad_norm": 1.4846844673156738, + "learning_rate": 1.5568547761034004e-07, + "loss": 1.3132, + "step": 2454 + }, + { + "epoch": 1.9546539379474939, + "grad_norm": 1.4392292499542236, + "learning_rate": 1.4051325741756828e-07, + "loss": 1.4372, + "step": 2457 + }, + { + "epoch": 1.9570405727923628, + "grad_norm": 1.2357834577560425, + "learning_rate": 1.2611767918306316e-07, + "loss": 1.357, + "step": 2460 + }, + { + "epoch": 1.9594272076372317, + "grad_norm": 1.7262612581253052, + "learning_rate": 1.1249896714117802e-07, + "loss": 1.3613, + "step": 2463 + }, + { + "epoch": 1.9618138424821003, + "grad_norm": 1.2283878326416016, + "learning_rate": 9.965733342532924e-08, + "loss": 1.3145, + "step": 2466 + }, + { + "epoch": 1.964200477326969, + "grad_norm": 1.2315729856491089, + "learning_rate": 8.759297806469335e-08, + "loss": 1.254, + "step": 2469 + }, + { + "epoch": 1.9665871121718377, + "grad_norm": 1.3991883993148804, + "learning_rate": 7.630608898105962e-08, + "loss": 1.3403, + "step": 2472 + }, + { + "epoch": 1.9689737470167064, + "grad_norm": 1.2701016664505005, + "learning_rate": 6.579684198594338e-08, + "loss": 1.2927, + "step": 2475 + }, + { + "epoch": 1.971360381861575, + "grad_norm": 1.2588876485824585, + "learning_rate": 5.606540077782163e-08, + "loss": 1.2546, + "step": 2478 + }, + { + "epoch": 1.973747016706444, + "grad_norm": 1.5006442070007324, + "learning_rate": 4.711191693959616e-08, + "loss": 1.3456, + "step": 2481 + }, + { + "epoch": 1.9761336515513126, + "grad_norm": 1.3775635957717896, + "learning_rate": 3.893652993621766e-08, + "loss": 1.3456, + "step": 2484 + }, + { + "epoch": 1.9785202863961815, + "grad_norm": 1.5189213752746582, + "learning_rate": 3.1539367112543014e-08, + "loss": 1.4398, + "step": 2487 + }, + { + "epoch": 1.9809069212410502, + "grad_norm": 1.4015928506851196, + "learning_rate": 2.4920543691309138e-08, + "loss": 1.3546, + "step": 2490 + }, + { + "epoch": 1.9832935560859188, + "grad_norm": 1.2725491523742676, + "learning_rate": 1.9080162771378808e-08, + "loss": 1.3043, + "step": 2493 + }, + { + "epoch": 1.9856801909307875, + "grad_norm": 1.3217906951904297, + "learning_rate": 1.4018315326103094e-08, + "loss": 1.3286, + "step": 2496 + }, + { + "epoch": 1.9880668257756562, + "grad_norm": 1.2626640796661377, + "learning_rate": 9.735080201922487e-09, + "loss": 1.281, + "step": 2499 + }, + { + "epoch": 1.990453460620525, + "grad_norm": 1.3296139240264893, + "learning_rate": 6.2305241171345395e-09, + "loss": 1.3501, + "step": 2502 + }, + { + "epoch": 1.9928400954653938, + "grad_norm": 1.2684460878372192, + "learning_rate": 3.5047016608613647e-09, + "loss": 1.2971, + "step": 2505 + }, + { + "epoch": 1.9952267303102627, + "grad_norm": 1.4350745677947998, + "learning_rate": 1.5576552921836574e-09, + "loss": 1.3621, + "step": 2508 + }, + { + "epoch": 1.9976133651551313, + "grad_norm": 1.4221293926239014, + "learning_rate": 3.89415339491217e-10, + "loss": 1.3512, + "step": 2511 + }, + { + "epoch": 2.0, + "grad_norm": 1.6571133136749268, + "learning_rate": 0.0, + "loss": 1.3492, + "step": 2514 + } + ], + "logging_steps": 3, + "max_steps": 2514, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 1257, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.254235526619464e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}