{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1639, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0024405125076266015, "grad_norm": 10.934680938720703, "learning_rate": 2.4390243902439027e-06, "loss": 7.449, "step": 4 }, { "epoch": 0.004881025015253203, "grad_norm": 8.081192016601562, "learning_rate": 4.8780487804878055e-06, "loss": 6.8982, "step": 8 }, { "epoch": 0.007321537522879805, "grad_norm": 22.203201293945312, "learning_rate": 7.317073170731707e-06, "loss": 6.0248, "step": 12 }, { "epoch": 0.009762050030506406, "grad_norm": 5.285924911499023, "learning_rate": 9.756097560975611e-06, "loss": 5.9044, "step": 16 }, { "epoch": 0.012202562538133009, "grad_norm": 5.250211715698242, "learning_rate": 1.2195121951219513e-05, "loss": 5.5814, "step": 20 }, { "epoch": 0.01464307504575961, "grad_norm": 5.4697065353393555, "learning_rate": 1.4634146341463415e-05, "loss": 5.2881, "step": 24 }, { "epoch": 0.017083587553386213, "grad_norm": 3.608083486557007, "learning_rate": 1.707317073170732e-05, "loss": 5.1105, "step": 28 }, { "epoch": 0.019524100061012812, "grad_norm": 2.7470335960388184, "learning_rate": 1.9512195121951222e-05, "loss": 4.9689, "step": 32 }, { "epoch": 0.021964612568639415, "grad_norm": 3.315990447998047, "learning_rate": 2.1951219512195124e-05, "loss": 4.7263, "step": 36 }, { "epoch": 0.024405125076266018, "grad_norm": 4.4223103523254395, "learning_rate": 2.4390243902439026e-05, "loss": 4.4958, "step": 40 }, { "epoch": 0.026845637583892617, "grad_norm": 3.828432321548462, "learning_rate": 2.682926829268293e-05, "loss": 4.7925, "step": 44 }, { "epoch": 0.02928615009151922, "grad_norm": 2.336132287979126, "learning_rate": 2.926829268292683e-05, "loss": 4.3522, "step": 48 }, { "epoch": 0.03172666259914582, "grad_norm": 2.70143723487854, "learning_rate": 3.170731707317073e-05, "loss": 4.2123, "step": 52 }, { "epoch": 0.034167175106772425, "grad_norm": 2.4450480937957764, "learning_rate": 3.414634146341464e-05, "loss": 4.2956, "step": 56 }, { "epoch": 0.036607687614399025, "grad_norm": 2.598154306411743, "learning_rate": 3.6585365853658535e-05, "loss": 4.2567, "step": 60 }, { "epoch": 0.039048200122025624, "grad_norm": 2.658902168273926, "learning_rate": 3.9024390243902444e-05, "loss": 4.1263, "step": 64 }, { "epoch": 0.04148871262965223, "grad_norm": 2.6544759273529053, "learning_rate": 4.146341463414634e-05, "loss": 4.1412, "step": 68 }, { "epoch": 0.04392922513727883, "grad_norm": 2.9632568359375, "learning_rate": 4.390243902439025e-05, "loss": 4.0603, "step": 72 }, { "epoch": 0.04636973764490543, "grad_norm": 2.799863576889038, "learning_rate": 4.634146341463415e-05, "loss": 3.8496, "step": 76 }, { "epoch": 0.048810250152532035, "grad_norm": 2.582296371459961, "learning_rate": 4.878048780487805e-05, "loss": 3.9928, "step": 80 }, { "epoch": 0.051250762660158634, "grad_norm": 2.8325679302215576, "learning_rate": 5.121951219512195e-05, "loss": 3.9543, "step": 84 }, { "epoch": 0.053691275167785234, "grad_norm": 2.8199257850646973, "learning_rate": 5.365853658536586e-05, "loss": 3.7576, "step": 88 }, { "epoch": 0.05613178767541183, "grad_norm": 2.9311118125915527, "learning_rate": 5.6097560975609764e-05, "loss": 3.823, "step": 92 }, { "epoch": 0.05857230018303844, "grad_norm": 2.590095281600952, "learning_rate": 5.853658536585366e-05, "loss": 3.761, "step": 96 }, { "epoch": 0.06101281269066504, "grad_norm": 3.5961081981658936, "learning_rate": 6.097560975609756e-05, "loss": 3.6561, "step": 100 }, { "epoch": 0.06345332519829164, "grad_norm": 2.8117892742156982, "learning_rate": 6.341463414634146e-05, "loss": 3.7837, "step": 104 }, { "epoch": 0.06589383770591824, "grad_norm": 2.7620346546173096, "learning_rate": 6.585365853658538e-05, "loss": 3.7377, "step": 108 }, { "epoch": 0.06833435021354485, "grad_norm": 2.102983236312866, "learning_rate": 6.829268292682928e-05, "loss": 3.4906, "step": 112 }, { "epoch": 0.07077486272117145, "grad_norm": 2.688347101211548, "learning_rate": 7.073170731707317e-05, "loss": 3.563, "step": 116 }, { "epoch": 0.07321537522879805, "grad_norm": 3.4205071926116943, "learning_rate": 7.317073170731707e-05, "loss": 3.6558, "step": 120 }, { "epoch": 0.07565588773642465, "grad_norm": 2.4502980709075928, "learning_rate": 7.560975609756099e-05, "loss": 3.5056, "step": 124 }, { "epoch": 0.07809640024405125, "grad_norm": 2.422513008117676, "learning_rate": 7.804878048780489e-05, "loss": 3.5075, "step": 128 }, { "epoch": 0.08053691275167785, "grad_norm": 2.7142081260681152, "learning_rate": 8.048780487804879e-05, "loss": 3.3782, "step": 132 }, { "epoch": 0.08297742525930446, "grad_norm": 2.683786630630493, "learning_rate": 8.292682926829268e-05, "loss": 3.4138, "step": 136 }, { "epoch": 0.08541793776693106, "grad_norm": 3.0129106044769287, "learning_rate": 8.53658536585366e-05, "loss": 3.4168, "step": 140 }, { "epoch": 0.08785845027455766, "grad_norm": 2.9522879123687744, "learning_rate": 8.78048780487805e-05, "loss": 3.4613, "step": 144 }, { "epoch": 0.09029896278218426, "grad_norm": 2.464475154876709, "learning_rate": 9.02439024390244e-05, "loss": 3.2865, "step": 148 }, { "epoch": 0.09273947528981086, "grad_norm": 2.657923698425293, "learning_rate": 9.26829268292683e-05, "loss": 3.3316, "step": 152 }, { "epoch": 0.09517998779743746, "grad_norm": 2.762465476989746, "learning_rate": 9.51219512195122e-05, "loss": 3.4121, "step": 156 }, { "epoch": 0.09762050030506407, "grad_norm": 2.570235252380371, "learning_rate": 9.75609756097561e-05, "loss": 3.482, "step": 160 }, { "epoch": 0.10006101281269067, "grad_norm": 2.932487726211548, "learning_rate": 0.0001, "loss": 3.3538, "step": 164 }, { "epoch": 0.10250152532031727, "grad_norm": 6.148873805999756, "learning_rate": 9.999959288047527e-05, "loss": 3.2391, "step": 168 }, { "epoch": 0.10494203782794387, "grad_norm": 2.3334765434265137, "learning_rate": 9.999837152853088e-05, "loss": 3.3952, "step": 172 }, { "epoch": 0.10738255033557047, "grad_norm": 2.640165090560913, "learning_rate": 9.999633596405633e-05, "loss": 3.0588, "step": 176 }, { "epoch": 0.10982306284319707, "grad_norm": 2.6932566165924072, "learning_rate": 9.999348622020031e-05, "loss": 3.2899, "step": 180 }, { "epoch": 0.11226357535082367, "grad_norm": 2.199061870574951, "learning_rate": 9.998982234337026e-05, "loss": 3.3005, "step": 184 }, { "epoch": 0.11470408785845028, "grad_norm": 2.457505464553833, "learning_rate": 9.998534439323166e-05, "loss": 3.1601, "step": 188 }, { "epoch": 0.11714460036607688, "grad_norm": 2.4901139736175537, "learning_rate": 9.998005244270692e-05, "loss": 3.1873, "step": 192 }, { "epoch": 0.11958511287370348, "grad_norm": 2.2997779846191406, "learning_rate": 9.997394657797427e-05, "loss": 3.1023, "step": 196 }, { "epoch": 0.12202562538133008, "grad_norm": 2.224750518798828, "learning_rate": 9.996702689846645e-05, "loss": 3.1437, "step": 200 }, { "epoch": 0.12446613788895668, "grad_norm": 2.3547255992889404, "learning_rate": 9.995929351686885e-05, "loss": 3.0756, "step": 204 }, { "epoch": 0.12690665039658328, "grad_norm": 2.385662078857422, "learning_rate": 9.995074655911794e-05, "loss": 3.1756, "step": 208 }, { "epoch": 0.1293471629042099, "grad_norm": 2.3929145336151123, "learning_rate": 9.994138616439903e-05, "loss": 3.2254, "step": 212 }, { "epoch": 0.13178767541183647, "grad_norm": 2.0844318866729736, "learning_rate": 9.99312124851441e-05, "loss": 3.0249, "step": 216 }, { "epoch": 0.1342281879194631, "grad_norm": 2.3276314735412598, "learning_rate": 9.992022568702932e-05, "loss": 3.037, "step": 220 }, { "epoch": 0.1366687004270897, "grad_norm": 2.497742176055908, "learning_rate": 9.990842594897227e-05, "loss": 3.1051, "step": 224 }, { "epoch": 0.1391092129347163, "grad_norm": 2.3867313861846924, "learning_rate": 9.989581346312908e-05, "loss": 3.0532, "step": 228 }, { "epoch": 0.1415497254423429, "grad_norm": 2.4126064777374268, "learning_rate": 9.988238843489137e-05, "loss": 3.0547, "step": 232 }, { "epoch": 0.14399023794996949, "grad_norm": 2.293853998184204, "learning_rate": 9.986815108288272e-05, "loss": 3.0587, "step": 236 }, { "epoch": 0.1464307504575961, "grad_norm": 2.3181204795837402, "learning_rate": 9.985310163895534e-05, "loss": 3.0367, "step": 240 }, { "epoch": 0.14887126296522268, "grad_norm": 2.2385687828063965, "learning_rate": 9.983724034818611e-05, "loss": 3.1358, "step": 244 }, { "epoch": 0.1513117754728493, "grad_norm": 2.0997090339660645, "learning_rate": 9.982056746887269e-05, "loss": 2.9741, "step": 248 }, { "epoch": 0.1537522879804759, "grad_norm": 1.9703699350357056, "learning_rate": 9.980308327252925e-05, "loss": 2.9997, "step": 252 }, { "epoch": 0.1561928004881025, "grad_norm": 2.2509689331054688, "learning_rate": 9.97847880438821e-05, "loss": 3.0575, "step": 256 }, { "epoch": 0.1586333129957291, "grad_norm": 2.2377076148986816, "learning_rate": 9.976568208086503e-05, "loss": 2.9686, "step": 260 }, { "epoch": 0.1610738255033557, "grad_norm": 2.2603845596313477, "learning_rate": 9.974576569461449e-05, "loss": 3.0948, "step": 264 }, { "epoch": 0.1635143380109823, "grad_norm": 2.100558042526245, "learning_rate": 9.972503920946442e-05, "loss": 2.8875, "step": 268 }, { "epoch": 0.16595485051860892, "grad_norm": 2.139305830001831, "learning_rate": 9.970350296294113e-05, "loss": 3.0919, "step": 272 }, { "epoch": 0.1683953630262355, "grad_norm": 2.180773973464966, "learning_rate": 9.968115730575766e-05, "loss": 2.9352, "step": 276 }, { "epoch": 0.17083587553386212, "grad_norm": 2.0909817218780518, "learning_rate": 9.965800260180817e-05, "loss": 2.8328, "step": 280 }, { "epoch": 0.1732763880414887, "grad_norm": 2.2757225036621094, "learning_rate": 9.963403922816191e-05, "loss": 2.7534, "step": 284 }, { "epoch": 0.17571690054911532, "grad_norm": 2.206963539123535, "learning_rate": 9.960926757505719e-05, "loss": 2.9265, "step": 288 }, { "epoch": 0.1781574130567419, "grad_norm": 1.9953128099441528, "learning_rate": 9.958368804589496e-05, "loss": 2.8852, "step": 292 }, { "epoch": 0.18059792556436852, "grad_norm": 2.228975772857666, "learning_rate": 9.955730105723223e-05, "loss": 2.7656, "step": 296 }, { "epoch": 0.18303843807199513, "grad_norm": 2.3731560707092285, "learning_rate": 9.953010703877532e-05, "loss": 2.8817, "step": 300 }, { "epoch": 0.18547895057962172, "grad_norm": 2.4438838958740234, "learning_rate": 9.950210643337291e-05, "loss": 2.7615, "step": 304 }, { "epoch": 0.18791946308724833, "grad_norm": 1.8998314142227173, "learning_rate": 9.94732996970087e-05, "loss": 2.8328, "step": 308 }, { "epoch": 0.19035997559487491, "grad_norm": 2.18756103515625, "learning_rate": 9.944368729879408e-05, "loss": 2.8206, "step": 312 }, { "epoch": 0.19280048810250153, "grad_norm": 2.114089250564575, "learning_rate": 9.941326972096048e-05, "loss": 2.8317, "step": 316 }, { "epoch": 0.19524100061012814, "grad_norm": 2.013594150543213, "learning_rate": 9.93820474588515e-05, "loss": 2.8399, "step": 320 }, { "epoch": 0.19768151311775473, "grad_norm": 2.080949306488037, "learning_rate": 9.93500210209148e-05, "loss": 2.7811, "step": 324 }, { "epoch": 0.20012202562538134, "grad_norm": 1.8897756338119507, "learning_rate": 9.931719092869395e-05, "loss": 2.6913, "step": 328 }, { "epoch": 0.20256253813300792, "grad_norm": 2.150343179702759, "learning_rate": 9.928355771681981e-05, "loss": 2.877, "step": 332 }, { "epoch": 0.20500305064063454, "grad_norm": 2.010016441345215, "learning_rate": 9.924912193300186e-05, "loss": 2.793, "step": 336 }, { "epoch": 0.20744356314826112, "grad_norm": 2.0901033878326416, "learning_rate": 9.921388413801928e-05, "loss": 2.7149, "step": 340 }, { "epoch": 0.20988407565588774, "grad_norm": 2.0860214233398438, "learning_rate": 9.917784490571187e-05, "loss": 2.8542, "step": 344 }, { "epoch": 0.21232458816351435, "grad_norm": 2.0296056270599365, "learning_rate": 9.914100482297061e-05, "loss": 2.7846, "step": 348 }, { "epoch": 0.21476510067114093, "grad_norm": 1.8985519409179688, "learning_rate": 9.91033644897282e-05, "loss": 2.6664, "step": 352 }, { "epoch": 0.21720561317876755, "grad_norm": 2.012681007385254, "learning_rate": 9.906492451894922e-05, "loss": 2.8291, "step": 356 }, { "epoch": 0.21964612568639413, "grad_norm": 2.3786346912384033, "learning_rate": 9.902568553662017e-05, "loss": 2.9325, "step": 360 }, { "epoch": 0.22208663819402075, "grad_norm": 2.115422487258911, "learning_rate": 9.898564818173928e-05, "loss": 2.8192, "step": 364 }, { "epoch": 0.22452715070164733, "grad_norm": 2.1743714809417725, "learning_rate": 9.894481310630613e-05, "loss": 2.6733, "step": 368 }, { "epoch": 0.22696766320927395, "grad_norm": 2.0607974529266357, "learning_rate": 9.890318097531095e-05, "loss": 2.7947, "step": 372 }, { "epoch": 0.22940817571690056, "grad_norm": 2.0158376693725586, "learning_rate": 9.886075246672388e-05, "loss": 2.6874, "step": 376 }, { "epoch": 0.23184868822452714, "grad_norm": 2.3356993198394775, "learning_rate": 9.88175282714839e-05, "loss": 2.8465, "step": 380 }, { "epoch": 0.23428920073215376, "grad_norm": 1.8665406703948975, "learning_rate": 9.877350909348757e-05, "loss": 2.7436, "step": 384 }, { "epoch": 0.23672971323978034, "grad_norm": 2.3228230476379395, "learning_rate": 9.872869564957754e-05, "loss": 2.7462, "step": 388 }, { "epoch": 0.23917022574740696, "grad_norm": 2.0364367961883545, "learning_rate": 9.868308866953095e-05, "loss": 2.8538, "step": 392 }, { "epoch": 0.24161073825503357, "grad_norm": 1.8105356693267822, "learning_rate": 9.863668889604748e-05, "loss": 2.6953, "step": 396 }, { "epoch": 0.24405125076266015, "grad_norm": 2.336512565612793, "learning_rate": 9.858949708473726e-05, "loss": 2.7345, "step": 400 }, { "epoch": 0.24649176327028677, "grad_norm": 2.2437095642089844, "learning_rate": 9.854151400410862e-05, "loss": 2.6568, "step": 404 }, { "epoch": 0.24893227577791335, "grad_norm": 2.0617661476135254, "learning_rate": 9.849274043555551e-05, "loss": 2.6908, "step": 408 }, { "epoch": 0.25137278828553994, "grad_norm": 2.3168981075286865, "learning_rate": 9.844317717334481e-05, "loss": 2.6827, "step": 412 }, { "epoch": 0.25381330079316655, "grad_norm": 2.063445568084717, "learning_rate": 9.83928250246034e-05, "loss": 2.6308, "step": 416 }, { "epoch": 0.25625381330079317, "grad_norm": 2.0091001987457275, "learning_rate": 9.834168480930502e-05, "loss": 2.8215, "step": 420 }, { "epoch": 0.2586943258084198, "grad_norm": 2.083251476287842, "learning_rate": 9.82897573602568e-05, "loss": 2.6374, "step": 424 }, { "epoch": 0.2611348383160464, "grad_norm": 2.0492255687713623, "learning_rate": 9.823704352308596e-05, "loss": 2.5664, "step": 428 }, { "epoch": 0.26357535082367295, "grad_norm": 1.9190467596054077, "learning_rate": 9.818354415622574e-05, "loss": 2.7301, "step": 432 }, { "epoch": 0.26601586333129956, "grad_norm": 2.1036882400512695, "learning_rate": 9.812926013090162e-05, "loss": 2.6856, "step": 436 }, { "epoch": 0.2684563758389262, "grad_norm": 2.014232873916626, "learning_rate": 9.807419233111708e-05, "loss": 2.6082, "step": 440 }, { "epoch": 0.2708968883465528, "grad_norm": 2.1601650714874268, "learning_rate": 9.801834165363917e-05, "loss": 2.6928, "step": 444 }, { "epoch": 0.2733374008541794, "grad_norm": 1.9628103971481323, "learning_rate": 9.796170900798393e-05, "loss": 2.7258, "step": 448 }, { "epoch": 0.27577791336180596, "grad_norm": 1.7634297609329224, "learning_rate": 9.790429531640161e-05, "loss": 2.6662, "step": 452 }, { "epoch": 0.2782184258694326, "grad_norm": 2.149310350418091, "learning_rate": 9.784610151386162e-05, "loss": 2.6701, "step": 456 }, { "epoch": 0.2806589383770592, "grad_norm": 1.9656424522399902, "learning_rate": 9.778712854803725e-05, "loss": 2.669, "step": 460 }, { "epoch": 0.2830994508846858, "grad_norm": 2.0106678009033203, "learning_rate": 9.772737737929033e-05, "loss": 2.6633, "step": 464 }, { "epoch": 0.2855399633923124, "grad_norm": 1.9313619136810303, "learning_rate": 9.766684898065559e-05, "loss": 2.7275, "step": 468 }, { "epoch": 0.28798047589993897, "grad_norm": 1.8865876197814941, "learning_rate": 9.760554433782472e-05, "loss": 2.6618, "step": 472 }, { "epoch": 0.2904209884075656, "grad_norm": 1.9713722467422485, "learning_rate": 9.754346444913042e-05, "loss": 2.7185, "step": 476 }, { "epoch": 0.2928615009151922, "grad_norm": 2.02203631401062, "learning_rate": 9.748061032553007e-05, "loss": 2.5664, "step": 480 }, { "epoch": 0.2953020134228188, "grad_norm": 1.957247018814087, "learning_rate": 9.741698299058932e-05, "loss": 2.5477, "step": 484 }, { "epoch": 0.29774252593044537, "grad_norm": 1.9796209335327148, "learning_rate": 9.735258348046536e-05, "loss": 2.7561, "step": 488 }, { "epoch": 0.300183038438072, "grad_norm": 1.9826083183288574, "learning_rate": 9.728741284389013e-05, "loss": 2.6152, "step": 492 }, { "epoch": 0.3026235509456986, "grad_norm": 2.025087833404541, "learning_rate": 9.722147214215318e-05, "loss": 2.5433, "step": 496 }, { "epoch": 0.3050640634533252, "grad_norm": 1.8469005823135376, "learning_rate": 9.715476244908436e-05, "loss": 2.6317, "step": 500 }, { "epoch": 0.3075045759609518, "grad_norm": 1.918837547302246, "learning_rate": 9.708728485103646e-05, "loss": 2.598, "step": 504 }, { "epoch": 0.3099450884685784, "grad_norm": 1.9943981170654297, "learning_rate": 9.701904044686736e-05, "loss": 2.5639, "step": 508 }, { "epoch": 0.312385600976205, "grad_norm": 1.9431275129318237, "learning_rate": 9.695003034792221e-05, "loss": 2.5784, "step": 512 }, { "epoch": 0.3148261134838316, "grad_norm": 1.9503458738327026, "learning_rate": 9.68802556780154e-05, "loss": 2.6698, "step": 516 }, { "epoch": 0.3172666259914582, "grad_norm": 2.1501636505126953, "learning_rate": 9.680971757341212e-05, "loss": 2.6022, "step": 520 }, { "epoch": 0.31970713849908483, "grad_norm": 1.9228835105895996, "learning_rate": 9.673841718280999e-05, "loss": 2.6008, "step": 524 }, { "epoch": 0.3221476510067114, "grad_norm": 1.9820727109909058, "learning_rate": 9.666635566732023e-05, "loss": 2.4778, "step": 528 }, { "epoch": 0.324588163514338, "grad_norm": 1.8800560235977173, "learning_rate": 9.659353420044882e-05, "loss": 2.4971, "step": 532 }, { "epoch": 0.3270286760219646, "grad_norm": 1.9813599586486816, "learning_rate": 9.651995396807743e-05, "loss": 2.6343, "step": 536 }, { "epoch": 0.32946918852959123, "grad_norm": 1.9326095581054688, "learning_rate": 9.644561616844402e-05, "loss": 2.6344, "step": 540 }, { "epoch": 0.33190970103721784, "grad_norm": 1.922208547592163, "learning_rate": 9.637052201212336e-05, "loss": 2.5182, "step": 544 }, { "epoch": 0.3343502135448444, "grad_norm": 2.059067487716675, "learning_rate": 9.629467272200736e-05, "loss": 2.5843, "step": 548 }, { "epoch": 0.336790726052471, "grad_norm": 2.1328108310699463, "learning_rate": 9.62180695332851e-05, "loss": 2.6726, "step": 552 }, { "epoch": 0.3392312385600976, "grad_norm": 2.028780460357666, "learning_rate": 9.614071369342272e-05, "loss": 2.5376, "step": 556 }, { "epoch": 0.34167175106772424, "grad_norm": 1.952534556388855, "learning_rate": 9.606260646214313e-05, "loss": 2.6095, "step": 560 }, { "epoch": 0.3441122635753508, "grad_norm": 2.0387320518493652, "learning_rate": 9.59837491114055e-05, "loss": 2.6315, "step": 564 }, { "epoch": 0.3465527760829774, "grad_norm": 1.8844285011291504, "learning_rate": 9.59041429253845e-05, "loss": 2.4956, "step": 568 }, { "epoch": 0.348993288590604, "grad_norm": 1.8574570417404175, "learning_rate": 9.582378920044944e-05, "loss": 2.6009, "step": 572 }, { "epoch": 0.35143380109823064, "grad_norm": 1.9576845169067383, "learning_rate": 9.574268924514312e-05, "loss": 2.6683, "step": 576 }, { "epoch": 0.35387431360585725, "grad_norm": 1.771797776222229, "learning_rate": 9.566084438016056e-05, "loss": 2.5735, "step": 580 }, { "epoch": 0.3563148261134838, "grad_norm": 1.9233019351959229, "learning_rate": 9.557825593832748e-05, "loss": 2.5864, "step": 584 }, { "epoch": 0.3587553386211104, "grad_norm": 2.0586273670196533, "learning_rate": 9.549492526457856e-05, "loss": 2.5936, "step": 588 }, { "epoch": 0.36119585112873703, "grad_norm": 1.8952735662460327, "learning_rate": 9.541085371593553e-05, "loss": 2.5679, "step": 592 }, { "epoch": 0.36363636363636365, "grad_norm": 1.9247933626174927, "learning_rate": 9.53260426614852e-05, "loss": 2.6742, "step": 596 }, { "epoch": 0.36607687614399026, "grad_norm": 1.907720685005188, "learning_rate": 9.524049348235699e-05, "loss": 2.7457, "step": 600 }, { "epoch": 0.3685173886516168, "grad_norm": 2.0835776329040527, "learning_rate": 9.515420757170057e-05, "loss": 2.4971, "step": 604 }, { "epoch": 0.37095790115924343, "grad_norm": 2.132695198059082, "learning_rate": 9.506718633466308e-05, "loss": 2.6758, "step": 608 }, { "epoch": 0.37339841366687004, "grad_norm": 1.8507214784622192, "learning_rate": 9.497943118836628e-05, "loss": 2.5093, "step": 612 }, { "epoch": 0.37583892617449666, "grad_norm": 1.840934157371521, "learning_rate": 9.489094356188356e-05, "loss": 2.5388, "step": 616 }, { "epoch": 0.37827943868212327, "grad_norm": 1.9286271333694458, "learning_rate": 9.48017248962165e-05, "loss": 2.5616, "step": 620 }, { "epoch": 0.38071995118974983, "grad_norm": 1.8787904977798462, "learning_rate": 9.471177664427155e-05, "loss": 2.4194, "step": 624 }, { "epoch": 0.38316046369737644, "grad_norm": 2.0228466987609863, "learning_rate": 9.462110027083629e-05, "loss": 2.5924, "step": 628 }, { "epoch": 0.38560097620500305, "grad_norm": 1.9403541088104248, "learning_rate": 9.452969725255558e-05, "loss": 2.5228, "step": 632 }, { "epoch": 0.38804148871262967, "grad_norm": 1.8352724313735962, "learning_rate": 9.443756907790759e-05, "loss": 2.512, "step": 636 }, { "epoch": 0.3904820012202563, "grad_norm": 1.7737221717834473, "learning_rate": 9.434471724717946e-05, "loss": 2.6457, "step": 640 }, { "epoch": 0.39292251372788284, "grad_norm": 2.031172037124634, "learning_rate": 9.42511432724429e-05, "loss": 2.4233, "step": 644 }, { "epoch": 0.39536302623550945, "grad_norm": 2.0770301818847656, "learning_rate": 9.41568486775296e-05, "loss": 2.701, "step": 648 }, { "epoch": 0.39780353874313606, "grad_norm": 2.026745319366455, "learning_rate": 9.406183499800639e-05, "loss": 2.5561, "step": 652 }, { "epoch": 0.4002440512507627, "grad_norm": 1.7436802387237549, "learning_rate": 9.396610378115025e-05, "loss": 2.4574, "step": 656 }, { "epoch": 0.40268456375838924, "grad_norm": 1.9237080812454224, "learning_rate": 9.386965658592303e-05, "loss": 2.5846, "step": 660 }, { "epoch": 0.40512507626601585, "grad_norm": 2.117783546447754, "learning_rate": 9.377249498294624e-05, "loss": 2.4709, "step": 664 }, { "epoch": 0.40756558877364246, "grad_norm": 1.8249421119689941, "learning_rate": 9.367462055447528e-05, "loss": 2.4538, "step": 668 }, { "epoch": 0.4100061012812691, "grad_norm": 1.7343837022781372, "learning_rate": 9.357603489437377e-05, "loss": 2.6097, "step": 672 }, { "epoch": 0.4124466137888957, "grad_norm": 1.737115502357483, "learning_rate": 9.347673960808761e-05, "loss": 2.4277, "step": 676 }, { "epoch": 0.41488712629652225, "grad_norm": 1.8485581874847412, "learning_rate": 9.337673631261878e-05, "loss": 2.5958, "step": 680 }, { "epoch": 0.41732763880414886, "grad_norm": 1.7287710905075073, "learning_rate": 9.327602663649906e-05, "loss": 2.4185, "step": 684 }, { "epoch": 0.4197681513117755, "grad_norm": 2.0095677375793457, "learning_rate": 9.317461221976346e-05, "loss": 2.4634, "step": 688 }, { "epoch": 0.4222086638194021, "grad_norm": 1.982871651649475, "learning_rate": 9.307249471392355e-05, "loss": 2.5262, "step": 692 }, { "epoch": 0.4246491763270287, "grad_norm": 1.8937561511993408, "learning_rate": 9.296967578194051e-05, "loss": 2.4084, "step": 696 }, { "epoch": 0.42708968883465526, "grad_norm": 1.7690891027450562, "learning_rate": 9.28661570981982e-05, "loss": 2.5927, "step": 700 }, { "epoch": 0.42953020134228187, "grad_norm": 1.7315630912780762, "learning_rate": 9.276194034847566e-05, "loss": 2.549, "step": 704 }, { "epoch": 0.4319707138499085, "grad_norm": 1.9079010486602783, "learning_rate": 9.265702722991985e-05, "loss": 2.5563, "step": 708 }, { "epoch": 0.4344112263575351, "grad_norm": 1.8171732425689697, "learning_rate": 9.255141945101793e-05, "loss": 2.4908, "step": 712 }, { "epoch": 0.4368517388651617, "grad_norm": 1.6784181594848633, "learning_rate": 9.244511873156944e-05, "loss": 2.4592, "step": 716 }, { "epoch": 0.43929225137278827, "grad_norm": 3.376328468322754, "learning_rate": 9.233812680265835e-05, "loss": 2.5316, "step": 720 }, { "epoch": 0.4417327638804149, "grad_norm": 1.8734562397003174, "learning_rate": 9.223044540662473e-05, "loss": 2.6612, "step": 724 }, { "epoch": 0.4441732763880415, "grad_norm": 1.7755532264709473, "learning_rate": 9.212207629703658e-05, "loss": 2.5749, "step": 728 }, { "epoch": 0.4466137888956681, "grad_norm": 1.8399274349212646, "learning_rate": 9.201302123866111e-05, "loss": 2.5068, "step": 732 }, { "epoch": 0.44905430140329466, "grad_norm": 1.8102937936782837, "learning_rate": 9.190328200743604e-05, "loss": 2.5197, "step": 736 }, { "epoch": 0.4514948139109213, "grad_norm": 1.9041603803634644, "learning_rate": 9.179286039044073e-05, "loss": 2.5599, "step": 740 }, { "epoch": 0.4539353264185479, "grad_norm": 1.9416977167129517, "learning_rate": 9.168175818586703e-05, "loss": 2.4402, "step": 744 }, { "epoch": 0.4563758389261745, "grad_norm": 1.756083607673645, "learning_rate": 9.156997720299001e-05, "loss": 2.4459, "step": 748 }, { "epoch": 0.4588163514338011, "grad_norm": 1.9078497886657715, "learning_rate": 9.14575192621385e-05, "loss": 2.3221, "step": 752 }, { "epoch": 0.4612568639414277, "grad_norm": 1.8347623348236084, "learning_rate": 9.134438619466541e-05, "loss": 2.5118, "step": 756 }, { "epoch": 0.4636973764490543, "grad_norm": 2.0183730125427246, "learning_rate": 9.123057984291799e-05, "loss": 2.5196, "step": 760 }, { "epoch": 0.4661378889566809, "grad_norm": 2.1509206295013428, "learning_rate": 9.111610206020775e-05, "loss": 2.489, "step": 764 }, { "epoch": 0.4685784014643075, "grad_norm": 1.8190091848373413, "learning_rate": 9.100095471078031e-05, "loss": 2.5117, "step": 768 }, { "epoch": 0.47101891397193413, "grad_norm": 1.974328875541687, "learning_rate": 9.088513966978505e-05, "loss": 2.4627, "step": 772 }, { "epoch": 0.4734594264795607, "grad_norm": 1.8939768075942993, "learning_rate": 9.076865882324452e-05, "loss": 2.486, "step": 776 }, { "epoch": 0.4758999389871873, "grad_norm": 1.8601619005203247, "learning_rate": 9.065151406802381e-05, "loss": 2.5048, "step": 780 }, { "epoch": 0.4783404514948139, "grad_norm": 1.7344801425933838, "learning_rate": 9.05337073117996e-05, "loss": 2.4448, "step": 784 }, { "epoch": 0.4807809640024405, "grad_norm": 1.7854970693588257, "learning_rate": 9.041524047302912e-05, "loss": 2.5274, "step": 788 }, { "epoch": 0.48322147651006714, "grad_norm": 1.9368374347686768, "learning_rate": 9.02961154809189e-05, "loss": 2.5239, "step": 792 }, { "epoch": 0.4856619890176937, "grad_norm": 1.8803982734680176, "learning_rate": 9.017633427539332e-05, "loss": 2.431, "step": 796 }, { "epoch": 0.4881025015253203, "grad_norm": 2.184255361557007, "learning_rate": 9.00558988070631e-05, "loss": 2.5069, "step": 800 }, { "epoch": 0.4905430140329469, "grad_norm": 2.7057807445526123, "learning_rate": 8.993481103719347e-05, "loss": 2.5053, "step": 804 }, { "epoch": 0.49298352654057354, "grad_norm": 1.8198022842407227, "learning_rate": 8.981307293767222e-05, "loss": 2.5678, "step": 808 }, { "epoch": 0.49542403904820015, "grad_norm": 1.7807421684265137, "learning_rate": 8.969068649097766e-05, "loss": 2.4017, "step": 812 }, { "epoch": 0.4978645515558267, "grad_norm": 1.7634155750274658, "learning_rate": 8.956765369014626e-05, "loss": 2.4174, "step": 816 }, { "epoch": 0.5003050640634533, "grad_norm": 1.7116047143936157, "learning_rate": 8.944397653874024e-05, "loss": 2.4296, "step": 820 }, { "epoch": 0.5027455765710799, "grad_norm": 1.8272420167922974, "learning_rate": 8.931965705081494e-05, "loss": 2.3184, "step": 824 }, { "epoch": 0.5051860890787065, "grad_norm": 1.8915187120437622, "learning_rate": 8.919469725088595e-05, "loss": 2.4825, "step": 828 }, { "epoch": 0.5076266015863331, "grad_norm": 2.0008482933044434, "learning_rate": 8.906909917389629e-05, "loss": 2.468, "step": 832 }, { "epoch": 0.5100671140939598, "grad_norm": 1.98212468624115, "learning_rate": 8.894286486518311e-05, "loss": 2.543, "step": 836 }, { "epoch": 0.5125076266015863, "grad_norm": 1.8104711771011353, "learning_rate": 8.881599638044448e-05, "loss": 2.4523, "step": 840 }, { "epoch": 0.5149481391092129, "grad_norm": 1.9585717916488647, "learning_rate": 8.868849578570591e-05, "loss": 2.4314, "step": 844 }, { "epoch": 0.5173886516168396, "grad_norm": 1.788904070854187, "learning_rate": 8.856036515728666e-05, "loss": 2.53, "step": 848 }, { "epoch": 0.5198291641244661, "grad_norm": 1.7570775747299194, "learning_rate": 8.84316065817659e-05, "loss": 2.3706, "step": 852 }, { "epoch": 0.5222696766320928, "grad_norm": 1.7272508144378662, "learning_rate": 8.83022221559489e-05, "loss": 2.3785, "step": 856 }, { "epoch": 0.5247101891397193, "grad_norm": 1.7609403133392334, "learning_rate": 8.817221398683267e-05, "loss": 2.4508, "step": 860 }, { "epoch": 0.5271507016473459, "grad_norm": 1.672697901725769, "learning_rate": 8.804158419157178e-05, "loss": 2.4297, "step": 864 }, { "epoch": 0.5295912141549726, "grad_norm": 1.754487156867981, "learning_rate": 8.791033489744382e-05, "loss": 2.552, "step": 868 }, { "epoch": 0.5320317266625991, "grad_norm": 1.630885124206543, "learning_rate": 8.77784682418148e-05, "loss": 2.364, "step": 872 }, { "epoch": 0.5344722391702258, "grad_norm": 1.678991675376892, "learning_rate": 8.764598637210435e-05, "loss": 2.4063, "step": 876 }, { "epoch": 0.5369127516778524, "grad_norm": 1.6513984203338623, "learning_rate": 8.751289144575068e-05, "loss": 2.3931, "step": 880 }, { "epoch": 0.5393532641854789, "grad_norm": 1.7914862632751465, "learning_rate": 8.737918563017553e-05, "loss": 2.4849, "step": 884 }, { "epoch": 0.5417937766931056, "grad_norm": 1.7029664516448975, "learning_rate": 8.724487110274882e-05, "loss": 2.386, "step": 888 }, { "epoch": 0.5442342892007321, "grad_norm": 1.8063247203826904, "learning_rate": 8.710995005075319e-05, "loss": 2.3094, "step": 892 }, { "epoch": 0.5466748017083588, "grad_norm": 1.7636981010437012, "learning_rate": 8.697442467134846e-05, "loss": 2.5278, "step": 896 }, { "epoch": 0.5491153142159854, "grad_norm": 1.792641282081604, "learning_rate": 8.683829717153575e-05, "loss": 2.3687, "step": 900 }, { "epoch": 0.5515558267236119, "grad_norm": 1.6481139659881592, "learning_rate": 8.670156976812155e-05, "loss": 2.3078, "step": 904 }, { "epoch": 0.5539963392312386, "grad_norm": 1.5917445421218872, "learning_rate": 8.656424468768169e-05, "loss": 2.4052, "step": 908 }, { "epoch": 0.5564368517388651, "grad_norm": 1.8670578002929688, "learning_rate": 8.642632416652505e-05, "loss": 2.5449, "step": 912 }, { "epoch": 0.5588773642464918, "grad_norm": 1.954473853111267, "learning_rate": 8.62878104506571e-05, "loss": 2.3901, "step": 916 }, { "epoch": 0.5613178767541184, "grad_norm": 1.7728747129440308, "learning_rate": 8.614870579574337e-05, "loss": 2.4777, "step": 920 }, { "epoch": 0.5637583892617449, "grad_norm": 1.746490478515625, "learning_rate": 8.600901246707269e-05, "loss": 2.4241, "step": 924 }, { "epoch": 0.5661989017693716, "grad_norm": 1.7648823261260986, "learning_rate": 8.586873273952032e-05, "loss": 2.5084, "step": 928 }, { "epoch": 0.5686394142769982, "grad_norm": 2.0134596824645996, "learning_rate": 8.572786889751092e-05, "loss": 2.5037, "step": 932 }, { "epoch": 0.5710799267846248, "grad_norm": 1.658882737159729, "learning_rate": 8.558642323498129e-05, "loss": 2.3737, "step": 936 }, { "epoch": 0.5735204392922514, "grad_norm": 1.7220959663391113, "learning_rate": 8.544439805534306e-05, "loss": 2.3038, "step": 940 }, { "epoch": 0.5759609517998779, "grad_norm": 1.5689938068389893, "learning_rate": 8.53017956714452e-05, "loss": 2.3851, "step": 944 }, { "epoch": 0.5784014643075046, "grad_norm": 1.5180913209915161, "learning_rate": 8.515861840553628e-05, "loss": 2.3179, "step": 948 }, { "epoch": 0.5808419768151312, "grad_norm": 1.7135009765625, "learning_rate": 8.501486858922673e-05, "loss": 2.3275, "step": 952 }, { "epoch": 0.5832824893227577, "grad_norm": 1.7951072454452515, "learning_rate": 8.487054856345081e-05, "loss": 2.4855, "step": 956 }, { "epoch": 0.5857230018303844, "grad_norm": 1.9952635765075684, "learning_rate": 8.472566067842855e-05, "loss": 2.5149, "step": 960 }, { "epoch": 0.588163514338011, "grad_norm": 1.721585988998413, "learning_rate": 8.458020729362742e-05, "loss": 2.454, "step": 964 }, { "epoch": 0.5906040268456376, "grad_norm": 1.59372878074646, "learning_rate": 8.443419077772393e-05, "loss": 2.2734, "step": 968 }, { "epoch": 0.5930445393532642, "grad_norm": 1.7532519102096558, "learning_rate": 8.428761350856508e-05, "loss": 2.4066, "step": 972 }, { "epoch": 0.5954850518608907, "grad_norm": 1.776670217514038, "learning_rate": 8.414047787312958e-05, "loss": 2.3946, "step": 976 }, { "epoch": 0.5979255643685174, "grad_norm": 1.7801623344421387, "learning_rate": 8.399278626748902e-05, "loss": 2.4061, "step": 980 }, { "epoch": 0.600366076876144, "grad_norm": 1.781724214553833, "learning_rate": 8.384454109676886e-05, "loss": 2.4804, "step": 984 }, { "epoch": 0.6028065893837706, "grad_norm": 1.8400248289108276, "learning_rate": 8.369574477510926e-05, "loss": 2.3099, "step": 988 }, { "epoch": 0.6052471018913972, "grad_norm": 1.874883770942688, "learning_rate": 8.35463997256257e-05, "loss": 2.3951, "step": 992 }, { "epoch": 0.6076876143990237, "grad_norm": 1.8574483394622803, "learning_rate": 8.33965083803696e-05, "loss": 2.3203, "step": 996 }, { "epoch": 0.6101281269066504, "grad_norm": 1.6467291116714478, "learning_rate": 8.324607318028874e-05, "loss": 2.3252, "step": 1000 }, { "epoch": 0.612568639414277, "grad_norm": 2.4194834232330322, "learning_rate": 8.309509657518736e-05, "loss": 2.4004, "step": 1004 }, { "epoch": 0.6150091519219036, "grad_norm": 1.6403262615203857, "learning_rate": 8.294358102368641e-05, "loss": 2.4677, "step": 1008 }, { "epoch": 0.6174496644295302, "grad_norm": 1.582318902015686, "learning_rate": 8.279152899318347e-05, "loss": 2.4513, "step": 1012 }, { "epoch": 0.6198901769371568, "grad_norm": 1.7139570713043213, "learning_rate": 8.263894295981257e-05, "loss": 2.4391, "step": 1016 }, { "epoch": 0.6223306894447834, "grad_norm": 1.8283212184906006, "learning_rate": 8.248582540840382e-05, "loss": 2.3997, "step": 1020 }, { "epoch": 0.62477120195241, "grad_norm": 1.7493493556976318, "learning_rate": 8.233217883244305e-05, "loss": 2.4504, "step": 1024 }, { "epoch": 0.6272117144600367, "grad_norm": 1.9771623611450195, "learning_rate": 8.217800573403105e-05, "loss": 2.3603, "step": 1028 }, { "epoch": 0.6296522269676632, "grad_norm": 1.7687060832977295, "learning_rate": 8.202330862384298e-05, "loss": 2.3803, "step": 1032 }, { "epoch": 0.6320927394752898, "grad_norm": 1.7797812223434448, "learning_rate": 8.186809002108742e-05, "loss": 2.3582, "step": 1036 }, { "epoch": 0.6345332519829164, "grad_norm": 1.6900354623794556, "learning_rate": 8.17123524534653e-05, "loss": 2.2908, "step": 1040 }, { "epoch": 0.636973764490543, "grad_norm": 1.778268575668335, "learning_rate": 8.155609845712882e-05, "loss": 2.4061, "step": 1044 }, { "epoch": 0.6394142769981697, "grad_norm": 1.9298450946807861, "learning_rate": 8.139933057664006e-05, "loss": 2.3872, "step": 1048 }, { "epoch": 0.6418547895057962, "grad_norm": 1.6466952562332153, "learning_rate": 8.124205136492964e-05, "loss": 2.2126, "step": 1052 }, { "epoch": 0.6442953020134228, "grad_norm": 1.7138975858688354, "learning_rate": 8.108426338325508e-05, "loss": 2.5358, "step": 1056 }, { "epoch": 0.6467358145210494, "grad_norm": 1.9166319370269775, "learning_rate": 8.092596920115908e-05, "loss": 2.3274, "step": 1060 }, { "epoch": 0.649176327028676, "grad_norm": 1.64811372756958, "learning_rate": 8.076717139642775e-05, "loss": 2.2893, "step": 1064 }, { "epoch": 0.6516168395363027, "grad_norm": 1.8892488479614258, "learning_rate": 8.060787255504856e-05, "loss": 2.3852, "step": 1068 }, { "epoch": 0.6540573520439292, "grad_norm": 1.6665705442428589, "learning_rate": 8.044807527116824e-05, "loss": 2.3249, "step": 1072 }, { "epoch": 0.6564978645515558, "grad_norm": 1.811578631401062, "learning_rate": 8.028778214705059e-05, "loss": 2.2834, "step": 1076 }, { "epoch": 0.6589383770591825, "grad_norm": 1.6264970302581787, "learning_rate": 8.0126995793034e-05, "loss": 2.2388, "step": 1080 }, { "epoch": 0.661378889566809, "grad_norm": 1.6090588569641113, "learning_rate": 7.996571882748905e-05, "loss": 2.1855, "step": 1084 }, { "epoch": 0.6638194020744357, "grad_norm": 1.7412513494491577, "learning_rate": 7.980395387677579e-05, "loss": 2.3041, "step": 1088 }, { "epoch": 0.6662599145820622, "grad_norm": 1.6816790103912354, "learning_rate": 7.964170357520102e-05, "loss": 2.3832, "step": 1092 }, { "epoch": 0.6687004270896888, "grad_norm": 1.553139567375183, "learning_rate": 7.947897056497537e-05, "loss": 2.2524, "step": 1096 }, { "epoch": 0.6711409395973155, "grad_norm": 1.5886545181274414, "learning_rate": 7.931575749617026e-05, "loss": 2.2555, "step": 1100 }, { "epoch": 0.673581452104942, "grad_norm": 1.6401042938232422, "learning_rate": 7.915206702667477e-05, "loss": 2.3817, "step": 1104 }, { "epoch": 0.6760219646125687, "grad_norm": 1.7411017417907715, "learning_rate": 7.898790182215236e-05, "loss": 2.3398, "step": 1108 }, { "epoch": 0.6784624771201953, "grad_norm": 1.6779972314834595, "learning_rate": 7.882326455599744e-05, "loss": 2.4326, "step": 1112 }, { "epoch": 0.6809029896278218, "grad_norm": 1.777983546257019, "learning_rate": 7.865815790929181e-05, "loss": 2.2516, "step": 1116 }, { "epoch": 0.6833435021354485, "grad_norm": 1.9200791120529175, "learning_rate": 7.849258457076105e-05, "loss": 2.2811, "step": 1120 }, { "epoch": 0.685784014643075, "grad_norm": 1.7898070812225342, "learning_rate": 7.832654723673072e-05, "loss": 2.4561, "step": 1124 }, { "epoch": 0.6882245271507016, "grad_norm": 1.721513271331787, "learning_rate": 7.816004861108246e-05, "loss": 2.316, "step": 1128 }, { "epoch": 0.6906650396583283, "grad_norm": 1.5932090282440186, "learning_rate": 7.79930914052099e-05, "loss": 2.3827, "step": 1132 }, { "epoch": 0.6931055521659548, "grad_norm": 1.6721924543380737, "learning_rate": 7.782567833797457e-05, "loss": 2.2727, "step": 1136 }, { "epoch": 0.6955460646735815, "grad_norm": 1.7587766647338867, "learning_rate": 7.765781213566164e-05, "loss": 2.2387, "step": 1140 }, { "epoch": 0.697986577181208, "grad_norm": 1.676706314086914, "learning_rate": 7.748949553193541e-05, "loss": 2.2, "step": 1144 }, { "epoch": 0.7004270896888346, "grad_norm": 1.6946930885314941, "learning_rate": 7.732073126779493e-05, "loss": 2.3423, "step": 1148 }, { "epoch": 0.7028676021964613, "grad_norm": 1.7776798009872437, "learning_rate": 7.715152209152928e-05, "loss": 2.3296, "step": 1152 }, { "epoch": 0.7053081147040878, "grad_norm": 1.602172613143921, "learning_rate": 7.698187075867282e-05, "loss": 2.37, "step": 1156 }, { "epoch": 0.7077486272117145, "grad_norm": 1.7191290855407715, "learning_rate": 7.681178003196036e-05, "loss": 2.4166, "step": 1160 }, { "epoch": 0.7101891397193411, "grad_norm": 1.4953582286834717, "learning_rate": 7.664125268128215e-05, "loss": 2.2193, "step": 1164 }, { "epoch": 0.7126296522269676, "grad_norm": 1.6568121910095215, "learning_rate": 7.647029148363873e-05, "loss": 2.3921, "step": 1168 }, { "epoch": 0.7150701647345943, "grad_norm": 1.5020556449890137, "learning_rate": 7.629889922309577e-05, "loss": 2.2285, "step": 1172 }, { "epoch": 0.7175106772422208, "grad_norm": 1.7414259910583496, "learning_rate": 7.612707869073867e-05, "loss": 2.2694, "step": 1176 }, { "epoch": 0.7199511897498475, "grad_norm": 1.6852495670318604, "learning_rate": 7.595483268462722e-05, "loss": 2.2089, "step": 1180 }, { "epoch": 0.7223917022574741, "grad_norm": 1.624004602432251, "learning_rate": 7.578216400974986e-05, "loss": 2.2708, "step": 1184 }, { "epoch": 0.7248322147651006, "grad_norm": 1.7594397068023682, "learning_rate": 7.560907547797816e-05, "loss": 2.3268, "step": 1188 }, { "epoch": 0.7272727272727273, "grad_norm": 1.7249298095703125, "learning_rate": 7.543556990802095e-05, "loss": 2.2962, "step": 1192 }, { "epoch": 0.7297132397803539, "grad_norm": 1.7642275094985962, "learning_rate": 7.526165012537844e-05, "loss": 2.3354, "step": 1196 }, { "epoch": 0.7321537522879805, "grad_norm": 1.6708530187606812, "learning_rate": 7.50873189622962e-05, "loss": 2.3596, "step": 1200 }, { "epoch": 0.7345942647956071, "grad_norm": 1.6663364171981812, "learning_rate": 7.491257925771904e-05, "loss": 2.42, "step": 1204 }, { "epoch": 0.7370347773032336, "grad_norm": 1.6076205968856812, "learning_rate": 7.473743385724478e-05, "loss": 2.3137, "step": 1208 }, { "epoch": 0.7394752898108603, "grad_norm": 1.729025959968567, "learning_rate": 7.456188561307791e-05, "loss": 2.2779, "step": 1212 }, { "epoch": 0.7419158023184869, "grad_norm": 1.5809768438339233, "learning_rate": 7.438593738398313e-05, "loss": 2.3453, "step": 1216 }, { "epoch": 0.7443563148261135, "grad_norm": 1.6153522729873657, "learning_rate": 7.420959203523883e-05, "loss": 2.2915, "step": 1220 }, { "epoch": 0.7467968273337401, "grad_norm": 1.6490085124969482, "learning_rate": 7.403285243859038e-05, "loss": 2.3402, "step": 1224 }, { "epoch": 0.7492373398413666, "grad_norm": 1.6399990320205688, "learning_rate": 7.385572147220341e-05, "loss": 2.3134, "step": 1228 }, { "epoch": 0.7516778523489933, "grad_norm": 1.8725767135620117, "learning_rate": 7.367820202061692e-05, "loss": 2.2752, "step": 1232 }, { "epoch": 0.7541183648566199, "grad_norm": 1.5729907751083374, "learning_rate": 7.35002969746963e-05, "loss": 2.4031, "step": 1236 }, { "epoch": 0.7565588773642465, "grad_norm": 1.604311227798462, "learning_rate": 7.332200923158623e-05, "loss": 2.2086, "step": 1240 }, { "epoch": 0.7589993898718731, "grad_norm": 1.619836449623108, "learning_rate": 7.31433416946636e-05, "loss": 2.3613, "step": 1244 }, { "epoch": 0.7614399023794997, "grad_norm": 1.6590549945831299, "learning_rate": 7.29642972734901e-05, "loss": 2.2867, "step": 1248 }, { "epoch": 0.7638804148871263, "grad_norm": 1.778241753578186, "learning_rate": 7.278487888376493e-05, "loss": 2.3533, "step": 1252 }, { "epoch": 0.7663209273947529, "grad_norm": 1.6422300338745117, "learning_rate": 7.260508944727724e-05, "loss": 2.3247, "step": 1256 }, { "epoch": 0.7687614399023796, "grad_norm": 1.7021371126174927, "learning_rate": 7.242493189185867e-05, "loss": 2.4069, "step": 1260 }, { "epoch": 0.7712019524100061, "grad_norm": 1.6313260793685913, "learning_rate": 7.22444091513355e-05, "loss": 2.2165, "step": 1264 }, { "epoch": 0.7736424649176327, "grad_norm": 1.6499704122543335, "learning_rate": 7.206352416548108e-05, "loss": 2.1848, "step": 1268 }, { "epoch": 0.7760829774252593, "grad_norm": 3.755737543106079, "learning_rate": 7.188227987996777e-05, "loss": 2.1791, "step": 1272 }, { "epoch": 0.7785234899328859, "grad_norm": 1.735226035118103, "learning_rate": 7.170067924631903e-05, "loss": 2.3761, "step": 1276 }, { "epoch": 0.7809640024405126, "grad_norm": 1.7428323030471802, "learning_rate": 7.151872522186146e-05, "loss": 2.302, "step": 1280 }, { "epoch": 0.7834045149481391, "grad_norm": 1.5832102298736572, "learning_rate": 7.133642076967648e-05, "loss": 2.2192, "step": 1284 }, { "epoch": 0.7858450274557657, "grad_norm": 1.7065541744232178, "learning_rate": 7.115376885855214e-05, "loss": 2.3773, "step": 1288 }, { "epoch": 0.7882855399633923, "grad_norm": 1.5590834617614746, "learning_rate": 7.097077246293484e-05, "loss": 2.3552, "step": 1292 }, { "epoch": 0.7907260524710189, "grad_norm": 1.7039079666137695, "learning_rate": 7.07874345628808e-05, "loss": 2.2384, "step": 1296 }, { "epoch": 0.7931665649786455, "grad_norm": 1.7430140972137451, "learning_rate": 7.060375814400756e-05, "loss": 2.2023, "step": 1300 }, { "epoch": 0.7956070774862721, "grad_norm": 1.7291806936264038, "learning_rate": 7.041974619744538e-05, "loss": 2.4436, "step": 1304 }, { "epoch": 0.7980475899938987, "grad_norm": 1.743966817855835, "learning_rate": 7.02354017197885e-05, "loss": 2.27, "step": 1308 }, { "epoch": 0.8004881025015254, "grad_norm": 1.9459971189498901, "learning_rate": 7.005072771304638e-05, "loss": 2.272, "step": 1312 }, { "epoch": 0.8029286150091519, "grad_norm": 1.5706496238708496, "learning_rate": 6.986572718459479e-05, "loss": 2.2395, "step": 1316 }, { "epoch": 0.8053691275167785, "grad_norm": 1.6363030672073364, "learning_rate": 6.968040314712678e-05, "loss": 2.3157, "step": 1320 }, { "epoch": 0.8078096400244051, "grad_norm": 1.6379305124282837, "learning_rate": 6.949475861860372e-05, "loss": 2.3739, "step": 1324 }, { "epoch": 0.8102501525320317, "grad_norm": 1.5823924541473389, "learning_rate": 6.930879662220612e-05, "loss": 2.3456, "step": 1328 }, { "epoch": 0.8126906650396584, "grad_norm": 3.3123624324798584, "learning_rate": 6.912252018628435e-05, "loss": 2.2965, "step": 1332 }, { "epoch": 0.8151311775472849, "grad_norm": 1.7302931547164917, "learning_rate": 6.893593234430937e-05, "loss": 2.252, "step": 1336 }, { "epoch": 0.8175716900549115, "grad_norm": 1.570993423461914, "learning_rate": 6.874903613482334e-05, "loss": 2.4197, "step": 1340 }, { "epoch": 0.8200122025625382, "grad_norm": 1.5884159803390503, "learning_rate": 6.856183460139008e-05, "loss": 2.2392, "step": 1344 }, { "epoch": 0.8224527150701647, "grad_norm": 1.6081788539886475, "learning_rate": 6.837433079254558e-05, "loss": 2.3623, "step": 1348 }, { "epoch": 0.8248932275777914, "grad_norm": 1.5602836608886719, "learning_rate": 6.818652776174827e-05, "loss": 2.2222, "step": 1352 }, { "epoch": 0.8273337400854179, "grad_norm": 1.5885746479034424, "learning_rate": 6.79984285673294e-05, "loss": 2.3442, "step": 1356 }, { "epoch": 0.8297742525930445, "grad_norm": 1.7103272676467896, "learning_rate": 6.781003627244317e-05, "loss": 2.3187, "step": 1360 }, { "epoch": 0.8322147651006712, "grad_norm": 1.7549841403961182, "learning_rate": 6.76213539450168e-05, "loss": 2.3711, "step": 1364 }, { "epoch": 0.8346552776082977, "grad_norm": 1.640334129333496, "learning_rate": 6.743238465770073e-05, "loss": 2.1647, "step": 1368 }, { "epoch": 0.8370957901159244, "grad_norm": 1.5788451433181763, "learning_rate": 6.724313148781835e-05, "loss": 2.2223, "step": 1372 }, { "epoch": 0.839536302623551, "grad_norm": 1.6184700727462769, "learning_rate": 6.705359751731613e-05, "loss": 2.2667, "step": 1376 }, { "epoch": 0.8419768151311775, "grad_norm": 1.6971137523651123, "learning_rate": 6.686378583271323e-05, "loss": 2.1033, "step": 1380 }, { "epoch": 0.8444173276388042, "grad_norm": 1.603644847869873, "learning_rate": 6.66736995250514e-05, "loss": 2.3768, "step": 1384 }, { "epoch": 0.8468578401464307, "grad_norm": 1.7039309740066528, "learning_rate": 6.648334168984452e-05, "loss": 2.3628, "step": 1388 }, { "epoch": 0.8492983526540574, "grad_norm": 1.5587040185928345, "learning_rate": 6.629271542702824e-05, "loss": 2.2151, "step": 1392 }, { "epoch": 0.851738865161684, "grad_norm": 1.630582332611084, "learning_rate": 6.610182384090951e-05, "loss": 2.3218, "step": 1396 }, { "epoch": 0.8541793776693105, "grad_norm": 1.5783443450927734, "learning_rate": 6.591067004011599e-05, "loss": 2.186, "step": 1400 }, { "epoch": 0.8566198901769372, "grad_norm": 1.7726662158966064, "learning_rate": 6.571925713754547e-05, "loss": 2.2432, "step": 1404 }, { "epoch": 0.8590604026845637, "grad_norm": 1.6053755283355713, "learning_rate": 6.552758825031513e-05, "loss": 2.2529, "step": 1408 }, { "epoch": 0.8615009151921904, "grad_norm": 1.5767484903335571, "learning_rate": 6.533566649971086e-05, "loss": 2.3182, "step": 1412 }, { "epoch": 0.863941427699817, "grad_norm": 1.6744613647460938, "learning_rate": 6.514349501113628e-05, "loss": 2.273, "step": 1416 }, { "epoch": 0.8663819402074435, "grad_norm": 1.5842111110687256, "learning_rate": 6.495107691406204e-05, "loss": 2.1535, "step": 1420 }, { "epoch": 0.8688224527150702, "grad_norm": 1.5760037899017334, "learning_rate": 6.47584153419747e-05, "loss": 2.3132, "step": 1424 }, { "epoch": 0.8712629652226968, "grad_norm": 1.560583233833313, "learning_rate": 6.456551343232575e-05, "loss": 2.2572, "step": 1428 }, { "epoch": 0.8737034777303234, "grad_norm": 1.6090435981750488, "learning_rate": 6.437237432648055e-05, "loss": 2.2026, "step": 1432 }, { "epoch": 0.87614399023795, "grad_norm": 1.609047770500183, "learning_rate": 6.417900116966716e-05, "loss": 2.299, "step": 1436 }, { "epoch": 0.8785845027455765, "grad_norm": 1.5959935188293457, "learning_rate": 6.398539711092507e-05, "loss": 2.2031, "step": 1440 }, { "epoch": 0.8810250152532032, "grad_norm": 1.743567943572998, "learning_rate": 6.379156530305395e-05, "loss": 2.2001, "step": 1444 }, { "epoch": 0.8834655277608298, "grad_norm": 1.571115255355835, "learning_rate": 6.359750890256238e-05, "loss": 2.1274, "step": 1448 }, { "epoch": 0.8859060402684564, "grad_norm": 1.7434707880020142, "learning_rate": 6.340323106961632e-05, "loss": 2.1872, "step": 1452 }, { "epoch": 0.888346552776083, "grad_norm": 1.5418583154678345, "learning_rate": 6.320873496798774e-05, "loss": 2.1464, "step": 1456 }, { "epoch": 0.8907870652837095, "grad_norm": 1.7159123420715332, "learning_rate": 6.301402376500304e-05, "loss": 2.3295, "step": 1460 }, { "epoch": 0.8932275777913362, "grad_norm": 1.5622751712799072, "learning_rate": 6.281910063149155e-05, "loss": 2.2156, "step": 1464 }, { "epoch": 0.8956680902989628, "grad_norm": 1.6685218811035156, "learning_rate": 6.262396874173379e-05, "loss": 2.0905, "step": 1468 }, { "epoch": 0.8981086028065893, "grad_norm": 1.6046509742736816, "learning_rate": 6.242863127340985e-05, "loss": 2.2891, "step": 1472 }, { "epoch": 0.900549115314216, "grad_norm": 1.6661593914031982, "learning_rate": 6.223309140754763e-05, "loss": 2.4733, "step": 1476 }, { "epoch": 0.9029896278218426, "grad_norm": 1.580785870552063, "learning_rate": 6.2037352328471e-05, "loss": 2.1956, "step": 1480 }, { "epoch": 0.9054301403294692, "grad_norm": 1.8454831838607788, "learning_rate": 6.184141722374802e-05, "loss": 2.2056, "step": 1484 }, { "epoch": 0.9078706528370958, "grad_norm": 1.6183425188064575, "learning_rate": 6.16452892841389e-05, "loss": 2.1647, "step": 1488 }, { "epoch": 0.9103111653447223, "grad_norm": 1.6347066164016724, "learning_rate": 6.144897170354427e-05, "loss": 2.2844, "step": 1492 }, { "epoch": 0.912751677852349, "grad_norm": 1.5810872316360474, "learning_rate": 6.125246767895286e-05, "loss": 2.1497, "step": 1496 }, { "epoch": 0.9151921903599756, "grad_norm": 1.5986207723617554, "learning_rate": 6.105578041038973e-05, "loss": 2.1887, "step": 1500 }, { "epoch": 0.9176327028676022, "grad_norm": 1.507636547088623, "learning_rate": 6.0858913100863934e-05, "loss": 2.2456, "step": 1504 }, { "epoch": 0.9200732153752288, "grad_norm": 1.5669258832931519, "learning_rate": 6.06618689563165e-05, "loss": 2.1861, "step": 1508 }, { "epoch": 0.9225137278828553, "grad_norm": 1.6023259162902832, "learning_rate": 6.046465118556819e-05, "loss": 2.1911, "step": 1512 }, { "epoch": 0.924954240390482, "grad_norm": 1.5018157958984375, "learning_rate": 6.026726300026717e-05, "loss": 2.2887, "step": 1516 }, { "epoch": 0.9273947528981086, "grad_norm": 1.6610755920410156, "learning_rate": 6.0069707614836826e-05, "loss": 2.2053, "step": 1520 }, { "epoch": 0.9298352654057352, "grad_norm": 1.6853493452072144, "learning_rate": 5.987198824642335e-05, "loss": 2.2406, "step": 1524 }, { "epoch": 0.9322757779133618, "grad_norm": 1.6544604301452637, "learning_rate": 5.967410811484335e-05, "loss": 2.1305, "step": 1528 }, { "epoch": 0.9347162904209884, "grad_norm": 1.5510609149932861, "learning_rate": 5.947607044253142e-05, "loss": 2.2702, "step": 1532 }, { "epoch": 0.937156802928615, "grad_norm": 1.6373018026351929, "learning_rate": 5.92778784544877e-05, "loss": 2.1894, "step": 1536 }, { "epoch": 0.9395973154362416, "grad_norm": 1.8243416547775269, "learning_rate": 5.90795353782253e-05, "loss": 2.2655, "step": 1540 }, { "epoch": 0.9420378279438683, "grad_norm": 1.515629768371582, "learning_rate": 5.888104444371777e-05, "loss": 2.2185, "step": 1544 }, { "epoch": 0.9444783404514948, "grad_norm": 1.6385153532028198, "learning_rate": 5.868240888334653e-05, "loss": 2.3042, "step": 1548 }, { "epoch": 0.9469188529591214, "grad_norm": 1.5588388442993164, "learning_rate": 5.848363193184814e-05, "loss": 2.1669, "step": 1552 }, { "epoch": 0.949359365466748, "grad_norm": 1.6179107427597046, "learning_rate": 5.828471682626175e-05, "loss": 2.123, "step": 1556 }, { "epoch": 0.9517998779743746, "grad_norm": 1.7374824285507202, "learning_rate": 5.808566680587628e-05, "loss": 2.261, "step": 1560 }, { "epoch": 0.9542403904820013, "grad_norm": 1.5545965433120728, "learning_rate": 5.788648511217774e-05, "loss": 2.2205, "step": 1564 }, { "epoch": 0.9566809029896278, "grad_norm": 1.9451733827590942, "learning_rate": 5.768717498879635e-05, "loss": 2.3301, "step": 1568 }, { "epoch": 0.9591214154972544, "grad_norm": 1.5943657159805298, "learning_rate": 5.7487739681453844e-05, "loss": 2.21, "step": 1572 }, { "epoch": 0.961561928004881, "grad_norm": 1.628563642501831, "learning_rate": 5.728818243791052e-05, "loss": 2.2007, "step": 1576 }, { "epoch": 0.9640024405125076, "grad_norm": 2.344428539276123, "learning_rate": 5.7088506507912374e-05, "loss": 2.1962, "step": 1580 }, { "epoch": 0.9664429530201343, "grad_norm": 2.715648889541626, "learning_rate": 5.6888715143138204e-05, "loss": 2.4361, "step": 1584 }, { "epoch": 0.9688834655277608, "grad_norm": 1.6450954675674438, "learning_rate": 5.668881159714663e-05, "loss": 2.2138, "step": 1588 }, { "epoch": 0.9713239780353874, "grad_norm": 1.59503972530365, "learning_rate": 5.648879912532312e-05, "loss": 2.179, "step": 1592 }, { "epoch": 0.9737644905430141, "grad_norm": 1.636630892753601, "learning_rate": 5.628868098482696e-05, "loss": 2.2481, "step": 1596 }, { "epoch": 0.9762050030506406, "grad_norm": 1.5790376663208008, "learning_rate": 5.608846043453826e-05, "loss": 2.1175, "step": 1600 }, { "epoch": 0.9786455155582673, "grad_norm": 1.60578191280365, "learning_rate": 5.5888140735004804e-05, "loss": 2.249, "step": 1604 }, { "epoch": 0.9810860280658938, "grad_norm": 1.5261207818984985, "learning_rate": 5.5687725148389046e-05, "loss": 2.06, "step": 1608 }, { "epoch": 0.9835265405735204, "grad_norm": 1.7263493537902832, "learning_rate": 5.5487216938414924e-05, "loss": 2.2045, "step": 1612 }, { "epoch": 0.9859670530811471, "grad_norm": 1.5825196504592896, "learning_rate": 5.5286619370314706e-05, "loss": 2.1236, "step": 1616 }, { "epoch": 0.9884075655887736, "grad_norm": 2.4041831493377686, "learning_rate": 5.508593571077587e-05, "loss": 2.2125, "step": 1620 }, { "epoch": 0.9908480780964003, "grad_norm": 1.5313869714736938, "learning_rate": 5.488516922788787e-05, "loss": 2.1998, "step": 1624 }, { "epoch": 0.9932885906040269, "grad_norm": 1.69304358959198, "learning_rate": 5.468432319108888e-05, "loss": 2.2207, "step": 1628 }, { "epoch": 0.9957291031116534, "grad_norm": 1.5605350732803345, "learning_rate": 5.4483400871112645e-05, "loss": 2.2412, "step": 1632 }, { "epoch": 0.9981696156192801, "grad_norm": 1.5765243768692017, "learning_rate": 5.428240553993512e-05, "loss": 2.2266, "step": 1636 } ], "logging_steps": 4, "max_steps": 3278, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1639, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.7728915038470144e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }