{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.3849855630413859, "eval_steps": 500, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00019249278152069297, "grad_norm": 1.9932657480239868, "learning_rate": 0.00016, "loss": 2.962, "step": 5 }, { "epoch": 0.00038498556304138594, "grad_norm": 1.2827370166778564, "learning_rate": 0.000199999997073812, "loss": 2.2983, "step": 10 }, { "epoch": 0.0005774783445620789, "grad_norm": 1.3585917949676514, "learning_rate": 0.0001999999851861734, "loss": 1.9389, "step": 15 }, { "epoch": 0.0007699711260827719, "grad_norm": 2.117544412612915, "learning_rate": 0.00019999996415419864, "loss": 1.6659, "step": 20 }, { "epoch": 0.0009624639076034649, "grad_norm": 0.8802940249443054, "learning_rate": 0.0001999999339778896, "loss": 1.6015, "step": 25 }, { "epoch": 0.0011549566891241579, "grad_norm": 1.256873369216919, "learning_rate": 0.000199999894657249, "loss": 1.7428, "step": 30 }, { "epoch": 0.001347449470644851, "grad_norm": 1.9709804058074951, "learning_rate": 0.0001999998461922805, "loss": 1.4316, "step": 35 }, { "epoch": 0.0015399422521655437, "grad_norm": 1.2085392475128174, "learning_rate": 0.00019999978858298848, "loss": 1.8392, "step": 40 }, { "epoch": 0.0017324350336862368, "grad_norm": 0.9966161847114563, "learning_rate": 0.00019999972182937827, "loss": 1.6381, "step": 45 }, { "epoch": 0.0019249278152069298, "grad_norm": 1.5572378635406494, "learning_rate": 0.0001999996459314559, "loss": 1.6214, "step": 50 }, { "epoch": 0.0021174205967276227, "grad_norm": 0.9813450574874878, "learning_rate": 0.00019999956088922837, "loss": 1.5337, "step": 55 }, { "epoch": 0.0023099133782483157, "grad_norm": 1.140754222869873, "learning_rate": 0.00019999946670270341, "loss": 1.5865, "step": 60 }, { "epoch": 0.0025024061597690088, "grad_norm": 1.7033613920211792, "learning_rate": 0.0001999993633718897, "loss": 1.5483, "step": 65 }, { "epoch": 0.002694898941289702, "grad_norm": 0.8782416582107544, "learning_rate": 0.00019999925089679658, "loss": 1.7574, "step": 70 }, { "epoch": 0.0028873917228103944, "grad_norm": 0.94110506772995, "learning_rate": 0.00019999912927743445, "loss": 1.747, "step": 75 }, { "epoch": 0.0030798845043310875, "grad_norm": 2.9130144119262695, "learning_rate": 0.00019999899851381436, "loss": 1.5482, "step": 80 }, { "epoch": 0.0032723772858517805, "grad_norm": 1.444981336593628, "learning_rate": 0.00019999885860594828, "loss": 1.7935, "step": 85 }, { "epoch": 0.0034648700673724736, "grad_norm": 0.8361923098564148, "learning_rate": 0.00019999870955384906, "loss": 1.5566, "step": 90 }, { "epoch": 0.0036573628488931666, "grad_norm": 1.0198391675949097, "learning_rate": 0.00019999855135753025, "loss": 1.6608, "step": 95 }, { "epoch": 0.0038498556304138597, "grad_norm": 0.9720978736877441, "learning_rate": 0.00019999838401700632, "loss": 1.4217, "step": 100 }, { "epoch": 0.004042348411934553, "grad_norm": 0.7735599279403687, "learning_rate": 0.00019999820753229263, "loss": 1.4195, "step": 105 }, { "epoch": 0.004234841193455245, "grad_norm": 1.1776920557022095, "learning_rate": 0.0001999980219034053, "loss": 1.7147, "step": 110 }, { "epoch": 0.004427333974975939, "grad_norm": 1.156069278717041, "learning_rate": 0.0001999978271303613, "loss": 1.7, "step": 115 }, { "epoch": 0.0046198267564966315, "grad_norm": 1.2335503101348877, "learning_rate": 0.0001999976232131784, "loss": 1.3309, "step": 120 }, { "epoch": 0.004812319538017324, "grad_norm": 
1.0332967042922974, "learning_rate": 0.0001999974101518753, "loss": 1.7515, "step": 125 }, { "epoch": 0.0050048123195380175, "grad_norm": 1.561087727546692, "learning_rate": 0.00019999718794647145, "loss": 1.5517, "step": 130 }, { "epoch": 0.00519730510105871, "grad_norm": 1.3611408472061157, "learning_rate": 0.00019999695659698717, "loss": 1.5771, "step": 135 }, { "epoch": 0.005389797882579404, "grad_norm": 1.5531154870986938, "learning_rate": 0.0001999967161034437, "loss": 1.4217, "step": 140 }, { "epoch": 0.005582290664100096, "grad_norm": 1.5827676057815552, "learning_rate": 0.00019999646646586287, "loss": 1.611, "step": 145 }, { "epoch": 0.005774783445620789, "grad_norm": 1.1693483591079712, "learning_rate": 0.00019999620768426763, "loss": 1.3961, "step": 150 }, { "epoch": 0.005967276227141482, "grad_norm": 1.4277936220169067, "learning_rate": 0.00019999593975868164, "loss": 1.638, "step": 155 }, { "epoch": 0.006159769008662175, "grad_norm": 1.2951083183288574, "learning_rate": 0.00019999566268912933, "loss": 1.6187, "step": 160 }, { "epoch": 0.0063522617901828685, "grad_norm": 2.4885995388031006, "learning_rate": 0.0001999953764756361, "loss": 1.5669, "step": 165 }, { "epoch": 0.006544754571703561, "grad_norm": 1.3352105617523193, "learning_rate": 0.00019999508111822811, "loss": 1.3157, "step": 170 }, { "epoch": 0.006737247353224254, "grad_norm": 1.2560889720916748, "learning_rate": 0.00019999477661693233, "loss": 1.7011, "step": 175 }, { "epoch": 0.006929740134744947, "grad_norm": 2.4167582988739014, "learning_rate": 0.00019999446297177666, "loss": 1.4827, "step": 180 }, { "epoch": 0.00712223291626564, "grad_norm": 1.0598788261413574, "learning_rate": 0.00019999414018278974, "loss": 1.5718, "step": 185 }, { "epoch": 0.007314725697786333, "grad_norm": 1.5576567649841309, "learning_rate": 0.00019999380825000111, "loss": 1.7717, "step": 190 }, { "epoch": 0.007507218479307026, "grad_norm": 1.005711317062378, "learning_rate": 0.0001999934671734411, "loss": 1.5085, "step": 195 }, { "epoch": 0.007699711260827719, "grad_norm": 1.7211413383483887, "learning_rate": 0.00019999311695314095, "loss": 1.623, "step": 200 }, { "epoch": 0.007892204042348411, "grad_norm": 1.5765767097473145, "learning_rate": 0.00019999275758913261, "loss": 1.5982, "step": 205 }, { "epoch": 0.008084696823869105, "grad_norm": 1.0989298820495605, "learning_rate": 0.00019999238908144896, "loss": 1.3306, "step": 210 }, { "epoch": 0.008277189605389798, "grad_norm": 1.0234464406967163, "learning_rate": 0.0001999920114301238, "loss": 1.5856, "step": 215 }, { "epoch": 0.00846968238691049, "grad_norm": 1.6681355237960815, "learning_rate": 0.0001999916246351915, "loss": 1.4777, "step": 220 }, { "epoch": 0.008662175168431183, "grad_norm": 0.9723508358001709, "learning_rate": 0.00019999122869668754, "loss": 1.5357, "step": 225 }, { "epoch": 0.008854667949951878, "grad_norm": 0.8840959072113037, "learning_rate": 0.0001999908236146481, "loss": 1.5296, "step": 230 }, { "epoch": 0.00904716073147257, "grad_norm": 0.9913238883018494, "learning_rate": 0.0001999904093891102, "loss": 1.5846, "step": 235 }, { "epoch": 0.009239653512993263, "grad_norm": 1.129952073097229, "learning_rate": 0.00019998998602011178, "loss": 1.4455, "step": 240 }, { "epoch": 0.009432146294513956, "grad_norm": 1.0377521514892578, "learning_rate": 0.00019998955350769148, "loss": 1.4212, "step": 245 }, { "epoch": 0.009624639076034648, "grad_norm": 2.2103137969970703, "learning_rate": 0.00019998911185188886, "loss": 1.5812, "step": 250 }, { "epoch": 
0.009817131857555342, "grad_norm": 0.8716953992843628, "learning_rate": 0.00019998866105274437, "loss": 1.5326, "step": 255 }, { "epoch": 0.010009624639076035, "grad_norm": 1.1956042051315308, "learning_rate": 0.00019998820111029916, "loss": 1.7183, "step": 260 }, { "epoch": 0.010202117420596728, "grad_norm": 2.747600555419922, "learning_rate": 0.00019998773202459534, "loss": 1.7952, "step": 265 }, { "epoch": 0.01039461020211742, "grad_norm": 1.3412338495254517, "learning_rate": 0.00019998725379567577, "loss": 1.3538, "step": 270 }, { "epoch": 0.010587102983638113, "grad_norm": 1.651822805404663, "learning_rate": 0.00019998676642358422, "loss": 1.5458, "step": 275 }, { "epoch": 0.010779595765158807, "grad_norm": 1.3036198616027832, "learning_rate": 0.00019998626990836522, "loss": 1.7305, "step": 280 }, { "epoch": 0.0109720885466795, "grad_norm": 0.8263657093048096, "learning_rate": 0.00019998576425006416, "loss": 1.3767, "step": 285 }, { "epoch": 0.011164581328200193, "grad_norm": 2.022136926651001, "learning_rate": 0.00019998524944872737, "loss": 1.5823, "step": 290 }, { "epoch": 0.011357074109720885, "grad_norm": 1.1224019527435303, "learning_rate": 0.00019998472550440178, "loss": 1.5723, "step": 295 }, { "epoch": 0.011549566891241578, "grad_norm": 1.375664234161377, "learning_rate": 0.00019998419241713542, "loss": 1.5224, "step": 300 }, { "epoch": 0.011742059672762272, "grad_norm": 1.2721813917160034, "learning_rate": 0.000199983650186977, "loss": 1.7217, "step": 305 }, { "epoch": 0.011934552454282965, "grad_norm": 1.4723321199417114, "learning_rate": 0.0001999830988139761, "loss": 1.4666, "step": 310 }, { "epoch": 0.012127045235803657, "grad_norm": 0.695198118686676, "learning_rate": 0.00019998253829818315, "loss": 1.2672, "step": 315 }, { "epoch": 0.01231953801732435, "grad_norm": 1.716638207435608, "learning_rate": 0.00019998196863964937, "loss": 1.3461, "step": 320 }, { "epoch": 0.012512030798845043, "grad_norm": 1.1060154438018799, "learning_rate": 0.0001999813898384269, "loss": 1.3816, "step": 325 }, { "epoch": 0.012704523580365737, "grad_norm": 1.6124354600906372, "learning_rate": 0.00019998080189456862, "loss": 1.5232, "step": 330 }, { "epoch": 0.01289701636188643, "grad_norm": 1.5060306787490845, "learning_rate": 0.00019998020480812832, "loss": 1.5767, "step": 335 }, { "epoch": 0.013089509143407122, "grad_norm": 1.1920175552368164, "learning_rate": 0.00019997959857916063, "loss": 1.6112, "step": 340 }, { "epoch": 0.013282001924927815, "grad_norm": 1.1669896841049194, "learning_rate": 0.00019997898320772096, "loss": 1.3679, "step": 345 }, { "epoch": 0.013474494706448507, "grad_norm": 1.1692086458206177, "learning_rate": 0.00019997835869386553, "loss": 1.4147, "step": 350 }, { "epoch": 0.013666987487969202, "grad_norm": 2.0466034412384033, "learning_rate": 0.00019997772503765153, "loss": 1.5261, "step": 355 }, { "epoch": 0.013859480269489894, "grad_norm": 1.1581529378890991, "learning_rate": 0.00019997708223913686, "loss": 1.5441, "step": 360 }, { "epoch": 0.014051973051010587, "grad_norm": 1.4370143413543701, "learning_rate": 0.0001999764302983803, "loss": 1.651, "step": 365 }, { "epoch": 0.01424446583253128, "grad_norm": 0.998635470867157, "learning_rate": 0.00019997576921544147, "loss": 1.4311, "step": 370 }, { "epoch": 0.014436958614051972, "grad_norm": 1.2625153064727783, "learning_rate": 0.00019997509899038086, "loss": 1.4634, "step": 375 }, { "epoch": 0.014629451395572667, "grad_norm": 1.171949863433838, "learning_rate": 0.00019997441962325968, "loss": 1.2474, "step": 380 
}, { "epoch": 0.01482194417709336, "grad_norm": 1.4312052726745605, "learning_rate": 0.00019997373111414009, "loss": 1.4814, "step": 385 }, { "epoch": 0.015014436958614052, "grad_norm": 1.1508846282958984, "learning_rate": 0.00019997303346308508, "loss": 1.6291, "step": 390 }, { "epoch": 0.015206929740134744, "grad_norm": 1.2096014022827148, "learning_rate": 0.0001999723266701584, "loss": 1.5507, "step": 395 }, { "epoch": 0.015399422521655439, "grad_norm": 0.996391773223877, "learning_rate": 0.00019997161073542473, "loss": 1.6402, "step": 400 }, { "epoch": 0.015591915303176131, "grad_norm": 1.6977828741073608, "learning_rate": 0.00019997088565894947, "loss": 1.5706, "step": 405 }, { "epoch": 0.015784408084696822, "grad_norm": 1.4707343578338623, "learning_rate": 0.000199970151440799, "loss": 1.6348, "step": 410 }, { "epoch": 0.015976900866217517, "grad_norm": 1.5461647510528564, "learning_rate": 0.0001999694080810404, "loss": 1.4836, "step": 415 }, { "epoch": 0.01616939364773821, "grad_norm": 1.6253695487976074, "learning_rate": 0.00019996865557974166, "loss": 1.5834, "step": 420 }, { "epoch": 0.016361886429258902, "grad_norm": 1.671321988105774, "learning_rate": 0.00019996789393697165, "loss": 1.3816, "step": 425 }, { "epoch": 0.016554379210779596, "grad_norm": 0.9412807822227478, "learning_rate": 0.00019996712315279992, "loss": 1.443, "step": 430 }, { "epoch": 0.016746871992300287, "grad_norm": 0.8705793023109436, "learning_rate": 0.000199966343227297, "loss": 1.4938, "step": 435 }, { "epoch": 0.01693936477382098, "grad_norm": 1.6019854545593262, "learning_rate": 0.00019996555416053422, "loss": 1.3622, "step": 440 }, { "epoch": 0.017131857555341676, "grad_norm": 1.0340136289596558, "learning_rate": 0.00019996475595258372, "loss": 1.5803, "step": 445 }, { "epoch": 0.017324350336862367, "grad_norm": 1.4469108581542969, "learning_rate": 0.0001999639486035185, "loss": 1.6074, "step": 450 }, { "epoch": 0.01751684311838306, "grad_norm": 1.3311457633972168, "learning_rate": 0.00019996313211341238, "loss": 1.5337, "step": 455 }, { "epoch": 0.017709335899903755, "grad_norm": 0.9691542387008667, "learning_rate": 0.00019996230648234003, "loss": 1.3835, "step": 460 }, { "epoch": 0.017901828681424446, "grad_norm": 1.0229564905166626, "learning_rate": 0.00019996147171037691, "loss": 1.4925, "step": 465 }, { "epoch": 0.01809432146294514, "grad_norm": 1.0120052099227905, "learning_rate": 0.00019996062779759942, "loss": 1.4781, "step": 470 }, { "epoch": 0.01828681424446583, "grad_norm": 0.8471246361732483, "learning_rate": 0.00019995977474408468, "loss": 1.4961, "step": 475 }, { "epoch": 0.018479307025986526, "grad_norm": 2.020277261734009, "learning_rate": 0.00019995891254991072, "loss": 1.5299, "step": 480 }, { "epoch": 0.01867179980750722, "grad_norm": 1.2169212102890015, "learning_rate": 0.00019995804121515637, "loss": 1.4626, "step": 485 }, { "epoch": 0.01886429258902791, "grad_norm": 2.31048321723938, "learning_rate": 0.00019995716073990133, "loss": 1.3653, "step": 490 }, { "epoch": 0.019056785370548605, "grad_norm": 1.8170429468154907, "learning_rate": 0.0001999562711242261, "loss": 1.3537, "step": 495 }, { "epoch": 0.019249278152069296, "grad_norm": 1.1187188625335693, "learning_rate": 0.00019995537236821198, "loss": 1.6358, "step": 500 }, { "epoch": 0.01944177093358999, "grad_norm": 1.2112963199615479, "learning_rate": 0.0001999544644719412, "loss": 1.4565, "step": 505 }, { "epoch": 0.019634263715110685, "grad_norm": 1.3345009088516235, "learning_rate": 0.0001999535474354968, "loss": 1.647, 
"step": 510 }, { "epoch": 0.019826756496631376, "grad_norm": 1.3109021186828613, "learning_rate": 0.00019995262125896266, "loss": 1.5462, "step": 515 }, { "epoch": 0.02001924927815207, "grad_norm": 1.1681957244873047, "learning_rate": 0.00019995168594242338, "loss": 1.5292, "step": 520 }, { "epoch": 0.02021174205967276, "grad_norm": 0.9509350657463074, "learning_rate": 0.00019995074148596457, "loss": 1.5566, "step": 525 }, { "epoch": 0.020404234841193455, "grad_norm": 0.6594029664993286, "learning_rate": 0.00019994978788967255, "loss": 1.3693, "step": 530 }, { "epoch": 0.02059672762271415, "grad_norm": 0.8029458522796631, "learning_rate": 0.00019994882515363452, "loss": 1.4664, "step": 535 }, { "epoch": 0.02078922040423484, "grad_norm": 1.1551908254623413, "learning_rate": 0.00019994785327793856, "loss": 1.5342, "step": 540 }, { "epoch": 0.020981713185755535, "grad_norm": 1.3600980043411255, "learning_rate": 0.0001999468722626735, "loss": 1.5262, "step": 545 }, { "epoch": 0.021174205967276226, "grad_norm": 1.0333319902420044, "learning_rate": 0.00019994588210792906, "loss": 1.5079, "step": 550 }, { "epoch": 0.02136669874879692, "grad_norm": 1.2757694721221924, "learning_rate": 0.00019994488281379578, "loss": 1.7721, "step": 555 }, { "epoch": 0.021559191530317615, "grad_norm": 1.1292661428451538, "learning_rate": 0.00019994387438036505, "loss": 1.5077, "step": 560 }, { "epoch": 0.021751684311838305, "grad_norm": 1.105522871017456, "learning_rate": 0.00019994285680772906, "loss": 1.6468, "step": 565 }, { "epoch": 0.021944177093359, "grad_norm": 1.6378583908081055, "learning_rate": 0.00019994183009598086, "loss": 1.5432, "step": 570 }, { "epoch": 0.02213666987487969, "grad_norm": 0.931384801864624, "learning_rate": 0.0001999407942452144, "loss": 1.3818, "step": 575 }, { "epoch": 0.022329162656400385, "grad_norm": 1.0986119508743286, "learning_rate": 0.0001999397492555243, "loss": 1.552, "step": 580 }, { "epoch": 0.02252165543792108, "grad_norm": 1.121957540512085, "learning_rate": 0.00019993869512700623, "loss": 1.5241, "step": 585 }, { "epoch": 0.02271414821944177, "grad_norm": 1.2508270740509033, "learning_rate": 0.00019993763185975646, "loss": 1.6431, "step": 590 }, { "epoch": 0.022906641000962465, "grad_norm": 1.293603777885437, "learning_rate": 0.00019993655945387234, "loss": 1.3788, "step": 595 }, { "epoch": 0.023099133782483156, "grad_norm": 1.3218696117401123, "learning_rate": 0.00019993547790945183, "loss": 1.398, "step": 600 }, { "epoch": 0.02329162656400385, "grad_norm": 0.8816308975219727, "learning_rate": 0.0001999343872265939, "loss": 1.4239, "step": 605 }, { "epoch": 0.023484119345524544, "grad_norm": 1.9127452373504639, "learning_rate": 0.00019993328740539824, "loss": 1.549, "step": 610 }, { "epoch": 0.023676612127045235, "grad_norm": 2.071992874145508, "learning_rate": 0.0001999321784459655, "loss": 1.6769, "step": 615 }, { "epoch": 0.02386910490856593, "grad_norm": 1.335153579711914, "learning_rate": 0.000199931060348397, "loss": 1.6157, "step": 620 }, { "epoch": 0.02406159769008662, "grad_norm": 1.1237496137619019, "learning_rate": 0.000199929933112795, "loss": 1.4733, "step": 625 }, { "epoch": 0.024254090471607315, "grad_norm": 1.2557927370071411, "learning_rate": 0.00019992879673926258, "loss": 1.3888, "step": 630 }, { "epoch": 0.02444658325312801, "grad_norm": 1.0877735614776611, "learning_rate": 0.00019992765122790371, "loss": 1.4241, "step": 635 }, { "epoch": 0.0246390760346487, "grad_norm": 1.0029325485229492, "learning_rate": 0.00019992649657882307, "loss": 1.6504, 
"step": 640 }, { "epoch": 0.024831568816169394, "grad_norm": 1.5832372903823853, "learning_rate": 0.00019992533279212626, "loss": 1.4662, "step": 645 }, { "epoch": 0.025024061597690085, "grad_norm": 1.1658433675765991, "learning_rate": 0.00019992415986791974, "loss": 1.3723, "step": 650 }, { "epoch": 0.02521655437921078, "grad_norm": 1.8895657062530518, "learning_rate": 0.00019992297780631072, "loss": 1.457, "step": 655 }, { "epoch": 0.025409047160731474, "grad_norm": 1.193961501121521, "learning_rate": 0.00019992178660740732, "loss": 1.623, "step": 660 }, { "epoch": 0.025601539942252165, "grad_norm": 0.9851275086402893, "learning_rate": 0.00019992058627131844, "loss": 1.6884, "step": 665 }, { "epoch": 0.02579403272377286, "grad_norm": 1.5353829860687256, "learning_rate": 0.00019991937679815386, "loss": 1.3246, "step": 670 }, { "epoch": 0.02598652550529355, "grad_norm": 1.2476325035095215, "learning_rate": 0.0001999181581880242, "loss": 1.596, "step": 675 }, { "epoch": 0.026179018286814244, "grad_norm": 1.1163430213928223, "learning_rate": 0.00019991693044104083, "loss": 1.5077, "step": 680 }, { "epoch": 0.02637151106833494, "grad_norm": 1.1388076543807983, "learning_rate": 0.0001999156935573161, "loss": 1.4827, "step": 685 }, { "epoch": 0.02656400384985563, "grad_norm": 0.9100907444953918, "learning_rate": 0.00019991444753696304, "loss": 1.3429, "step": 690 }, { "epoch": 0.026756496631376324, "grad_norm": 2.032510995864868, "learning_rate": 0.00019991319238009565, "loss": 1.5473, "step": 695 }, { "epoch": 0.026948989412897015, "grad_norm": 1.0866800546646118, "learning_rate": 0.00019991192808682868, "loss": 1.5552, "step": 700 }, { "epoch": 0.02714148219441771, "grad_norm": 1.3941971063613892, "learning_rate": 0.00019991065465727774, "loss": 1.4103, "step": 705 }, { "epoch": 0.027333974975938403, "grad_norm": 1.721247911453247, "learning_rate": 0.0001999093720915593, "loss": 1.4965, "step": 710 }, { "epoch": 0.027526467757459094, "grad_norm": 1.4090749025344849, "learning_rate": 0.00019990808038979058, "loss": 1.3159, "step": 715 }, { "epoch": 0.02771896053897979, "grad_norm": 1.731886625289917, "learning_rate": 0.00019990677955208973, "loss": 1.4392, "step": 720 }, { "epoch": 0.02791145332050048, "grad_norm": 1.9695488214492798, "learning_rate": 0.00019990546957857576, "loss": 1.6206, "step": 725 }, { "epoch": 0.028103946102021174, "grad_norm": 0.7977893352508545, "learning_rate": 0.0001999041504693684, "loss": 1.5764, "step": 730 }, { "epoch": 0.02829643888354187, "grad_norm": 0.9448668360710144, "learning_rate": 0.00019990282222458826, "loss": 1.3149, "step": 735 }, { "epoch": 0.02848893166506256, "grad_norm": 1.0612679719924927, "learning_rate": 0.00019990148484435682, "loss": 1.4942, "step": 740 }, { "epoch": 0.028681424446583254, "grad_norm": 1.4038052558898926, "learning_rate": 0.0001999001383287964, "loss": 1.5184, "step": 745 }, { "epoch": 0.028873917228103944, "grad_norm": 1.0545177459716797, "learning_rate": 0.0001998987826780301, "loss": 1.5617, "step": 750 }, { "epoch": 0.02906641000962464, "grad_norm": 2.392878532409668, "learning_rate": 0.0001998974178921819, "loss": 1.3638, "step": 755 }, { "epoch": 0.029258902791145333, "grad_norm": 1.1004624366760254, "learning_rate": 0.0001998960439713766, "loss": 1.5162, "step": 760 }, { "epoch": 0.029451395572666024, "grad_norm": 1.2530279159545898, "learning_rate": 0.0001998946609157398, "loss": 1.5422, "step": 765 }, { "epoch": 0.02964388835418672, "grad_norm": 0.8240470290184021, "learning_rate": 0.00019989326872539803, "loss": 
1.3828, "step": 770 }, { "epoch": 0.029836381135707413, "grad_norm": 0.9734111428260803, "learning_rate": 0.00019989186740047857, "loss": 1.7041, "step": 775 }, { "epoch": 0.030028873917228104, "grad_norm": 0.9785217642784119, "learning_rate": 0.00019989045694110953, "loss": 1.6267, "step": 780 }, { "epoch": 0.030221366698748798, "grad_norm": 1.3278164863586426, "learning_rate": 0.00019988903734741994, "loss": 1.5041, "step": 785 }, { "epoch": 0.03041385948026949, "grad_norm": 1.9143437147140503, "learning_rate": 0.00019988760861953958, "loss": 1.4728, "step": 790 }, { "epoch": 0.030606352261790183, "grad_norm": 1.5717315673828125, "learning_rate": 0.0001998861707575991, "loss": 1.3824, "step": 795 }, { "epoch": 0.030798845043310877, "grad_norm": 1.0486010313034058, "learning_rate": 0.00019988472376173, "loss": 1.6186, "step": 800 }, { "epoch": 0.03099133782483157, "grad_norm": 1.1566083431243896, "learning_rate": 0.00019988326763206458, "loss": 1.3773, "step": 805 }, { "epoch": 0.031183830606352263, "grad_norm": 1.6336543560028076, "learning_rate": 0.00019988180236873602, "loss": 1.2998, "step": 810 }, { "epoch": 0.031376323387872954, "grad_norm": 1.4655206203460693, "learning_rate": 0.00019988032797187824, "loss": 1.3966, "step": 815 }, { "epoch": 0.031568816169393644, "grad_norm": 2.0325050354003906, "learning_rate": 0.00019987884444162618, "loss": 1.3464, "step": 820 }, { "epoch": 0.03176130895091434, "grad_norm": 1.254342794418335, "learning_rate": 0.0001998773517781154, "loss": 1.5236, "step": 825 }, { "epoch": 0.03195380173243503, "grad_norm": 0.8909908533096313, "learning_rate": 0.00019987584998148244, "loss": 1.4838, "step": 830 }, { "epoch": 0.032146294513955724, "grad_norm": 1.1440258026123047, "learning_rate": 0.00019987433905186458, "loss": 1.3952, "step": 835 }, { "epoch": 0.03233878729547642, "grad_norm": 1.2138668298721313, "learning_rate": 0.00019987281898940003, "loss": 1.5982, "step": 840 }, { "epoch": 0.03253128007699711, "grad_norm": 1.1847470998764038, "learning_rate": 0.00019987128979422782, "loss": 1.4313, "step": 845 }, { "epoch": 0.032723772858517804, "grad_norm": 1.4961762428283691, "learning_rate": 0.0001998697514664877, "loss": 1.5187, "step": 850 }, { "epoch": 0.0329162656400385, "grad_norm": 1.4735344648361206, "learning_rate": 0.00019986820400632043, "loss": 1.5443, "step": 855 }, { "epoch": 0.03310875842155919, "grad_norm": 1.1350771188735962, "learning_rate": 0.00019986664741386743, "loss": 1.5219, "step": 860 }, { "epoch": 0.03330125120307988, "grad_norm": 1.098781943321228, "learning_rate": 0.0001998650816892711, "loss": 1.6074, "step": 865 }, { "epoch": 0.033493743984600574, "grad_norm": 1.9639078378677368, "learning_rate": 0.0001998635068326746, "loss": 1.342, "step": 870 }, { "epoch": 0.03368623676612127, "grad_norm": 1.1193336248397827, "learning_rate": 0.00019986192284422193, "loss": 1.5647, "step": 875 }, { "epoch": 0.03387872954764196, "grad_norm": 1.0558106899261475, "learning_rate": 0.00019986032972405793, "loss": 1.2448, "step": 880 }, { "epoch": 0.034071222329162654, "grad_norm": 1.1178051233291626, "learning_rate": 0.0001998587274723283, "loss": 1.3455, "step": 885 }, { "epoch": 0.03426371511068335, "grad_norm": 1.728400468826294, "learning_rate": 0.0001998571160891795, "loss": 1.44, "step": 890 }, { "epoch": 0.03445620789220404, "grad_norm": 1.158931016921997, "learning_rate": 0.000199855495574759, "loss": 1.4247, "step": 895 }, { "epoch": 0.03464870067372473, "grad_norm": 1.8745627403259277, "learning_rate": 0.0001998538659292149, "loss": 
1.4036, "step": 900 }, { "epoch": 0.03484119345524543, "grad_norm": 1.4273000955581665, "learning_rate": 0.0001998522271526962, "loss": 1.4857, "step": 905 }, { "epoch": 0.03503368623676612, "grad_norm": 1.1671931743621826, "learning_rate": 0.0001998505792453528, "loss": 1.7199, "step": 910 }, { "epoch": 0.03522617901828681, "grad_norm": 1.1703475713729858, "learning_rate": 0.00019984892220733537, "loss": 1.5659, "step": 915 }, { "epoch": 0.03541867179980751, "grad_norm": 0.8550274968147278, "learning_rate": 0.00019984725603879546, "loss": 1.3608, "step": 920 }, { "epoch": 0.0356111645813282, "grad_norm": 1.676072359085083, "learning_rate": 0.0001998455807398854, "loss": 1.4841, "step": 925 }, { "epoch": 0.03580365736284889, "grad_norm": 1.362423062324524, "learning_rate": 0.00019984389631075842, "loss": 1.5501, "step": 930 }, { "epoch": 0.03599615014436958, "grad_norm": 1.1643259525299072, "learning_rate": 0.0001998422027515685, "loss": 1.4954, "step": 935 }, { "epoch": 0.03618864292589028, "grad_norm": 1.4984415769577026, "learning_rate": 0.00019984050006247053, "loss": 1.337, "step": 940 }, { "epoch": 0.03638113570741097, "grad_norm": 1.399708867073059, "learning_rate": 0.00019983878824362023, "loss": 1.5546, "step": 945 }, { "epoch": 0.03657362848893166, "grad_norm": 1.8458516597747803, "learning_rate": 0.00019983706729517412, "loss": 1.5268, "step": 950 }, { "epoch": 0.03676612127045236, "grad_norm": 1.1428085565567017, "learning_rate": 0.00019983533721728956, "loss": 1.4454, "step": 955 }, { "epoch": 0.03695861405197305, "grad_norm": 1.2200374603271484, "learning_rate": 0.00019983359801012475, "loss": 1.5586, "step": 960 }, { "epoch": 0.03715110683349374, "grad_norm": 1.3679723739624023, "learning_rate": 0.00019983184967383875, "loss": 1.3948, "step": 965 }, { "epoch": 0.03734359961501444, "grad_norm": 1.489397644996643, "learning_rate": 0.00019983009220859142, "loss": 1.5154, "step": 970 }, { "epoch": 0.03753609239653513, "grad_norm": 1.0442456007003784, "learning_rate": 0.00019982832561454345, "loss": 1.5704, "step": 975 }, { "epoch": 0.03772858517805582, "grad_norm": 1.7480882406234741, "learning_rate": 0.00019982654989185642, "loss": 1.5235, "step": 980 }, { "epoch": 0.03792107795957651, "grad_norm": 1.0078760385513306, "learning_rate": 0.00019982476504069272, "loss": 1.3936, "step": 985 }, { "epoch": 0.03811357074109721, "grad_norm": 1.0461446046829224, "learning_rate": 0.0001998229710612155, "loss": 1.6994, "step": 990 }, { "epoch": 0.0383060635226179, "grad_norm": 2.1919922828674316, "learning_rate": 0.00019982116795358885, "loss": 1.5739, "step": 995 }, { "epoch": 0.03849855630413859, "grad_norm": 1.7092692852020264, "learning_rate": 0.00019981935571797768, "loss": 1.2746, "step": 1000 }, { "epoch": 0.03869104908565929, "grad_norm": 1.3044835329055786, "learning_rate": 0.00019981753435454764, "loss": 1.5254, "step": 1005 }, { "epoch": 0.03888354186717998, "grad_norm": 1.1550064086914062, "learning_rate": 0.0001998157038634653, "loss": 1.6154, "step": 1010 }, { "epoch": 0.03907603464870067, "grad_norm": 2.0250370502471924, "learning_rate": 0.00019981386424489808, "loss": 1.4807, "step": 1015 }, { "epoch": 0.03926852743022137, "grad_norm": 1.036095380783081, "learning_rate": 0.00019981201549901419, "loss": 1.4124, "step": 1020 }, { "epoch": 0.03946102021174206, "grad_norm": 1.126434564590454, "learning_rate": 0.0001998101576259827, "loss": 1.4959, "step": 1025 }, { "epoch": 0.03965351299326275, "grad_norm": 1.2912375926971436, "learning_rate": 0.00019980829062597342, "loss": 
1.5006, "step": 1030 }, { "epoch": 0.03984600577478344, "grad_norm": 1.5378974676132202, "learning_rate": 0.00019980641449915713, "loss": 1.3073, "step": 1035 }, { "epoch": 0.04003849855630414, "grad_norm": 1.52741277217865, "learning_rate": 0.0001998045292457054, "loss": 1.3709, "step": 1040 }, { "epoch": 0.04023099133782483, "grad_norm": 1.6989667415618896, "learning_rate": 0.00019980263486579064, "loss": 1.4784, "step": 1045 }, { "epoch": 0.04042348411934552, "grad_norm": 1.0623974800109863, "learning_rate": 0.00019980073135958607, "loss": 1.5163, "step": 1050 }, { "epoch": 0.04061597690086622, "grad_norm": 1.323283314704895, "learning_rate": 0.0001997988187272657, "loss": 1.4793, "step": 1055 }, { "epoch": 0.04080846968238691, "grad_norm": 1.4508922100067139, "learning_rate": 0.00019979689696900447, "loss": 1.4746, "step": 1060 }, { "epoch": 0.0410009624639076, "grad_norm": 1.159579873085022, "learning_rate": 0.0001997949660849781, "loss": 1.2928, "step": 1065 }, { "epoch": 0.0411934552454283, "grad_norm": 1.5187591314315796, "learning_rate": 0.0001997930260753632, "loss": 1.5116, "step": 1070 }, { "epoch": 0.04138594802694899, "grad_norm": 1.7137175798416138, "learning_rate": 0.0001997910769403371, "loss": 1.6406, "step": 1075 }, { "epoch": 0.04157844080846968, "grad_norm": 1.221326470375061, "learning_rate": 0.00019978911868007807, "loss": 1.418, "step": 1080 }, { "epoch": 0.04177093358999037, "grad_norm": 1.0666981935501099, "learning_rate": 0.0001997871512947652, "loss": 1.3768, "step": 1085 }, { "epoch": 0.04196342637151107, "grad_norm": 0.9577809572219849, "learning_rate": 0.00019978517478457834, "loss": 1.4915, "step": 1090 }, { "epoch": 0.04215591915303176, "grad_norm": 2.3966264724731445, "learning_rate": 0.00019978318914969827, "loss": 1.7057, "step": 1095 }, { "epoch": 0.04234841193455245, "grad_norm": 1.0523775815963745, "learning_rate": 0.0001997811943903066, "loss": 1.3887, "step": 1100 }, { "epoch": 0.04254090471607315, "grad_norm": 1.3975977897644043, "learning_rate": 0.00019977919050658566, "loss": 1.5335, "step": 1105 }, { "epoch": 0.04273339749759384, "grad_norm": 1.5198701620101929, "learning_rate": 0.0001997771774987187, "loss": 1.3939, "step": 1110 }, { "epoch": 0.04292589027911453, "grad_norm": 0.7943345308303833, "learning_rate": 0.00019977515536688984, "loss": 1.5908, "step": 1115 }, { "epoch": 0.04311838306063523, "grad_norm": 0.9602519869804382, "learning_rate": 0.00019977312411128398, "loss": 1.3225, "step": 1120 }, { "epoch": 0.04331087584215592, "grad_norm": 1.0204732418060303, "learning_rate": 0.00019977108373208687, "loss": 1.518, "step": 1125 }, { "epoch": 0.04350336862367661, "grad_norm": 1.2130141258239746, "learning_rate": 0.00019976903422948503, "loss": 1.3693, "step": 1130 }, { "epoch": 0.0436958614051973, "grad_norm": 0.854958176612854, "learning_rate": 0.00019976697560366598, "loss": 1.4907, "step": 1135 }, { "epoch": 0.043888354186718, "grad_norm": 1.3699367046356201, "learning_rate": 0.00019976490785481789, "loss": 1.4448, "step": 1140 }, { "epoch": 0.04408084696823869, "grad_norm": 1.1766821146011353, "learning_rate": 0.00019976283098312983, "loss": 1.5171, "step": 1145 }, { "epoch": 0.04427333974975938, "grad_norm": 1.6543035507202148, "learning_rate": 0.00019976074498879174, "loss": 1.2751, "step": 1150 }, { "epoch": 0.04446583253128008, "grad_norm": 1.2228333950042725, "learning_rate": 0.0001997586498719944, "loss": 1.4522, "step": 1155 }, { "epoch": 0.04465832531280077, "grad_norm": 1.2733262777328491, "learning_rate": 
0.00019975654563292937, "loss": 1.6292, "step": 1160 }, { "epoch": 0.04485081809432146, "grad_norm": 1.3934366703033447, "learning_rate": 0.00019975443227178904, "loss": 1.433, "step": 1165 }, { "epoch": 0.04504331087584216, "grad_norm": 1.5495753288269043, "learning_rate": 0.00019975230978876672, "loss": 1.5803, "step": 1170 }, { "epoch": 0.04523580365736285, "grad_norm": 1.0099114179611206, "learning_rate": 0.00019975017818405646, "loss": 1.3434, "step": 1175 }, { "epoch": 0.04542829643888354, "grad_norm": 0.9009067416191101, "learning_rate": 0.0001997480374578532, "loss": 1.2312, "step": 1180 }, { "epoch": 0.04562078922040423, "grad_norm": 1.8678425550460815, "learning_rate": 0.00019974588761035266, "loss": 1.6331, "step": 1185 }, { "epoch": 0.04581328200192493, "grad_norm": 0.8258862495422363, "learning_rate": 0.00019974372864175148, "loss": 1.4584, "step": 1190 }, { "epoch": 0.04600577478344562, "grad_norm": 1.44557523727417, "learning_rate": 0.00019974156055224706, "loss": 1.4866, "step": 1195 }, { "epoch": 0.04619826756496631, "grad_norm": 1.7249491214752197, "learning_rate": 0.00019973938334203763, "loss": 1.3704, "step": 1200 }, { "epoch": 0.04639076034648701, "grad_norm": 1.005623698234558, "learning_rate": 0.0001997371970113223, "loss": 1.1993, "step": 1205 }, { "epoch": 0.0465832531280077, "grad_norm": 1.4596670866012573, "learning_rate": 0.00019973500156030105, "loss": 1.4996, "step": 1210 }, { "epoch": 0.04677574590952839, "grad_norm": 1.3085503578186035, "learning_rate": 0.00019973279698917454, "loss": 1.441, "step": 1215 }, { "epoch": 0.04696823869104909, "grad_norm": 0.9477142691612244, "learning_rate": 0.00019973058329814445, "loss": 1.5278, "step": 1220 }, { "epoch": 0.04716073147256978, "grad_norm": 0.9040088653564453, "learning_rate": 0.00019972836048741318, "loss": 1.5374, "step": 1225 }, { "epoch": 0.04735322425409047, "grad_norm": 1.7435801029205322, "learning_rate": 0.00019972612855718395, "loss": 1.3884, "step": 1230 }, { "epoch": 0.04754571703561117, "grad_norm": 1.180665135383606, "learning_rate": 0.00019972388750766088, "loss": 1.2097, "step": 1235 }, { "epoch": 0.04773820981713186, "grad_norm": 1.066064715385437, "learning_rate": 0.00019972163733904895, "loss": 1.4299, "step": 1240 }, { "epoch": 0.04793070259865255, "grad_norm": 1.1051660776138306, "learning_rate": 0.00019971937805155382, "loss": 1.5055, "step": 1245 }, { "epoch": 0.04812319538017324, "grad_norm": 1.2021822929382324, "learning_rate": 0.0001997171096453822, "loss": 1.5842, "step": 1250 }, { "epoch": 0.04831568816169394, "grad_norm": 2.1715807914733887, "learning_rate": 0.00019971483212074146, "loss": 1.4096, "step": 1255 }, { "epoch": 0.04850818094321463, "grad_norm": 1.1615819931030273, "learning_rate": 0.00019971254547783987, "loss": 1.2554, "step": 1260 }, { "epoch": 0.04870067372473532, "grad_norm": 1.5363492965698242, "learning_rate": 0.00019971024971688652, "loss": 1.5773, "step": 1265 }, { "epoch": 0.04889316650625602, "grad_norm": 1.3774447441101074, "learning_rate": 0.00019970794483809137, "loss": 1.3441, "step": 1270 }, { "epoch": 0.04908565928777671, "grad_norm": 2.065901041030884, "learning_rate": 0.00019970563084166515, "loss": 1.6342, "step": 1275 }, { "epoch": 0.0492781520692974, "grad_norm": 1.3221025466918945, "learning_rate": 0.0001997033077278195, "loss": 1.4967, "step": 1280 }, { "epoch": 0.0494706448508181, "grad_norm": 1.6636276245117188, "learning_rate": 0.00019970097549676684, "loss": 1.4936, "step": 1285 }, { "epoch": 0.04966313763233879, "grad_norm": 
1.4630615711212158, "learning_rate": 0.0001996986341487204, "loss": 1.4096, "step": 1290 }, { "epoch": 0.04985563041385948, "grad_norm": 1.9586588144302368, "learning_rate": 0.00019969628368389432, "loss": 1.5956, "step": 1295 }, { "epoch": 0.05004812319538017, "grad_norm": 1.0234311819076538, "learning_rate": 0.00019969392410250353, "loss": 1.247, "step": 1300 }, { "epoch": 0.05024061597690087, "grad_norm": 1.7005319595336914, "learning_rate": 0.0001996915554047638, "loss": 1.4179, "step": 1305 }, { "epoch": 0.05043310875842156, "grad_norm": 1.3052936792373657, "learning_rate": 0.0001996891775908917, "loss": 1.4002, "step": 1310 }, { "epoch": 0.05062560153994225, "grad_norm": 1.0146903991699219, "learning_rate": 0.00019968679066110473, "loss": 1.5062, "step": 1315 }, { "epoch": 0.05081809432146295, "grad_norm": 0.9611810445785522, "learning_rate": 0.00019968439461562104, "loss": 1.5303, "step": 1320 }, { "epoch": 0.05101058710298364, "grad_norm": 0.8518236875534058, "learning_rate": 0.0001996819894546599, "loss": 1.3589, "step": 1325 }, { "epoch": 0.05120307988450433, "grad_norm": 1.6918632984161377, "learning_rate": 0.00019967957517844111, "loss": 1.4589, "step": 1330 }, { "epoch": 0.05139557266602503, "grad_norm": 1.4838560819625854, "learning_rate": 0.00019967715178718551, "loss": 1.2714, "step": 1335 }, { "epoch": 0.05158806544754572, "grad_norm": 1.291231632232666, "learning_rate": 0.00019967471928111465, "loss": 1.6378, "step": 1340 }, { "epoch": 0.05178055822906641, "grad_norm": 1.2091941833496094, "learning_rate": 0.00019967227766045102, "loss": 1.3985, "step": 1345 }, { "epoch": 0.0519730510105871, "grad_norm": 1.2294058799743652, "learning_rate": 0.00019966982692541785, "loss": 1.498, "step": 1350 }, { "epoch": 0.0521655437921078, "grad_norm": 1.1644397974014282, "learning_rate": 0.00019966736707623928, "loss": 1.4185, "step": 1355 }, { "epoch": 0.05235803657362849, "grad_norm": 1.7669397592544556, "learning_rate": 0.0001996648981131402, "loss": 1.3564, "step": 1360 }, { "epoch": 0.05255052935514918, "grad_norm": 0.7178487777709961, "learning_rate": 0.00019966242003634644, "loss": 1.2015, "step": 1365 }, { "epoch": 0.05274302213666988, "grad_norm": 0.8149698376655579, "learning_rate": 0.00019965993284608457, "loss": 1.4046, "step": 1370 }, { "epoch": 0.05293551491819057, "grad_norm": 1.3934742212295532, "learning_rate": 0.00019965743654258198, "loss": 1.5289, "step": 1375 }, { "epoch": 0.05312800769971126, "grad_norm": 1.060002326965332, "learning_rate": 0.00019965493112606702, "loss": 1.391, "step": 1380 }, { "epoch": 0.05332050048123196, "grad_norm": 1.1154258251190186, "learning_rate": 0.00019965241659676875, "loss": 1.3004, "step": 1385 }, { "epoch": 0.05351299326275265, "grad_norm": 1.8101186752319336, "learning_rate": 0.00019964989295491713, "loss": 1.4968, "step": 1390 }, { "epoch": 0.05370548604427334, "grad_norm": 1.075211524963379, "learning_rate": 0.00019964736020074294, "loss": 1.5198, "step": 1395 }, { "epoch": 0.05389797882579403, "grad_norm": 2.0130980014801025, "learning_rate": 0.00019964481833447775, "loss": 1.5495, "step": 1400 }, { "epoch": 0.05409047160731473, "grad_norm": 1.214570164680481, "learning_rate": 0.000199642267356354, "loss": 1.5886, "step": 1405 }, { "epoch": 0.05428296438883542, "grad_norm": 1.6430037021636963, "learning_rate": 0.00019963970726660497, "loss": 1.5293, "step": 1410 }, { "epoch": 0.05447545717035611, "grad_norm": 0.94575035572052, "learning_rate": 0.00019963713806546478, "loss": 1.276, "step": 1415 }, { "epoch": 
0.05466794995187681, "grad_norm": 1.1988322734832764, "learning_rate": 0.00019963455975316832, "loss": 1.3151, "step": 1420 }, { "epoch": 0.0548604427333975, "grad_norm": 1.2768787145614624, "learning_rate": 0.00019963197232995142, "loss": 1.5559, "step": 1425 }, { "epoch": 0.05505293551491819, "grad_norm": 1.5184259414672852, "learning_rate": 0.0001996293757960506, "loss": 1.2998, "step": 1430 }, { "epoch": 0.055245428296438887, "grad_norm": 6.240184783935547, "learning_rate": 0.0001996267701517034, "loss": 1.4497, "step": 1435 }, { "epoch": 0.05543792107795958, "grad_norm": 1.4356882572174072, "learning_rate": 0.00019962415539714803, "loss": 1.6364, "step": 1440 }, { "epoch": 0.05563041385948027, "grad_norm": 0.9310120940208435, "learning_rate": 0.00019962153153262358, "loss": 1.417, "step": 1445 }, { "epoch": 0.05582290664100096, "grad_norm": 1.2131333351135254, "learning_rate": 0.00019961889855837, "loss": 1.4059, "step": 1450 }, { "epoch": 0.05601539942252166, "grad_norm": 1.2134804725646973, "learning_rate": 0.00019961625647462808, "loss": 1.458, "step": 1455 }, { "epoch": 0.05620789220404235, "grad_norm": 1.5725634098052979, "learning_rate": 0.0001996136052816394, "loss": 1.352, "step": 1460 }, { "epoch": 0.05640038498556304, "grad_norm": 0.9882212281227112, "learning_rate": 0.00019961094497964642, "loss": 1.1665, "step": 1465 }, { "epoch": 0.05659287776708374, "grad_norm": 1.055966854095459, "learning_rate": 0.00019960827556889235, "loss": 1.388, "step": 1470 }, { "epoch": 0.05678537054860443, "grad_norm": 1.0809309482574463, "learning_rate": 0.00019960559704962133, "loss": 1.4287, "step": 1475 }, { "epoch": 0.05697786333012512, "grad_norm": 1.0014935731887817, "learning_rate": 0.00019960290942207828, "loss": 1.5539, "step": 1480 }, { "epoch": 0.057170356111645816, "grad_norm": 1.1717151403427124, "learning_rate": 0.000199600212686509, "loss": 1.3619, "step": 1485 }, { "epoch": 0.05736284889316651, "grad_norm": 1.3981553316116333, "learning_rate": 0.00019959750684316, "loss": 1.3303, "step": 1490 }, { "epoch": 0.0575553416746872, "grad_norm": 0.7471413016319275, "learning_rate": 0.00019959479189227884, "loss": 1.4048, "step": 1495 }, { "epoch": 0.05774783445620789, "grad_norm": 1.1570223569869995, "learning_rate": 0.00019959206783411372, "loss": 1.6713, "step": 1500 }, { "epoch": 0.05794032723772859, "grad_norm": 1.4656585454940796, "learning_rate": 0.00019958933466891366, "loss": 1.3911, "step": 1505 }, { "epoch": 0.05813282001924928, "grad_norm": 1.5338329076766968, "learning_rate": 0.0001995865923969287, "loss": 1.578, "step": 1510 }, { "epoch": 0.05832531280076997, "grad_norm": 0.9481655955314636, "learning_rate": 0.0001995838410184096, "loss": 1.2903, "step": 1515 }, { "epoch": 0.058517805582290666, "grad_norm": 1.4928970336914062, "learning_rate": 0.00019958108053360788, "loss": 1.4139, "step": 1520 }, { "epoch": 0.05871029836381136, "grad_norm": 1.015381932258606, "learning_rate": 0.00019957831094277604, "loss": 1.5427, "step": 1525 }, { "epoch": 0.05890279114533205, "grad_norm": 1.3471331596374512, "learning_rate": 0.0001995755322461673, "loss": 1.3763, "step": 1530 }, { "epoch": 0.059095283926852746, "grad_norm": 2.0942165851593018, "learning_rate": 0.00019957274444403576, "loss": 1.4669, "step": 1535 }, { "epoch": 0.05928777670837344, "grad_norm": 1.4853599071502686, "learning_rate": 0.00019956994753663634, "loss": 1.4259, "step": 1540 }, { "epoch": 0.05948026948989413, "grad_norm": 1.3337596654891968, "learning_rate": 0.0001995671415242248, "loss": 1.4169, "step": 1545 
}, { "epoch": 0.059672762271414825, "grad_norm": 1.3816536664962769, "learning_rate": 0.00019956432640705777, "loss": 1.3679, "step": 1550 }, { "epoch": 0.059865255052935516, "grad_norm": 1.1726235151290894, "learning_rate": 0.00019956150218539262, "loss": 1.4076, "step": 1555 }, { "epoch": 0.06005774783445621, "grad_norm": 1.419520378112793, "learning_rate": 0.00019955866885948764, "loss": 1.3621, "step": 1560 }, { "epoch": 0.0602502406159769, "grad_norm": 1.4154486656188965, "learning_rate": 0.0001995558264296019, "loss": 1.4221, "step": 1565 }, { "epoch": 0.060442733397497596, "grad_norm": 1.4721988439559937, "learning_rate": 0.00019955297489599537, "loss": 1.3641, "step": 1570 }, { "epoch": 0.06063522617901829, "grad_norm": 1.1087952852249146, "learning_rate": 0.0001995501142589287, "loss": 1.3734, "step": 1575 }, { "epoch": 0.06082771896053898, "grad_norm": 1.4815518856048584, "learning_rate": 0.00019954724451866357, "loss": 1.4042, "step": 1580 }, { "epoch": 0.061020211742059675, "grad_norm": 1.835754632949829, "learning_rate": 0.00019954436567546236, "loss": 1.2457, "step": 1585 }, { "epoch": 0.061212704523580366, "grad_norm": 1.3139601945877075, "learning_rate": 0.00019954147772958836, "loss": 1.4457, "step": 1590 }, { "epoch": 0.06140519730510106, "grad_norm": 1.155369758605957, "learning_rate": 0.0001995385806813056, "loss": 1.3483, "step": 1595 }, { "epoch": 0.061597690086621755, "grad_norm": 1.1897907257080078, "learning_rate": 0.00019953567453087902, "loss": 1.467, "step": 1600 }, { "epoch": 0.061790182868142446, "grad_norm": 1.0794181823730469, "learning_rate": 0.00019953275927857438, "loss": 1.5171, "step": 1605 }, { "epoch": 0.06198267564966314, "grad_norm": 0.9538444876670837, "learning_rate": 0.00019952983492465824, "loss": 1.2643, "step": 1610 }, { "epoch": 0.06217516843118383, "grad_norm": 1.1179461479187012, "learning_rate": 0.00019952690146939804, "loss": 1.408, "step": 1615 }, { "epoch": 0.062367661212704525, "grad_norm": 1.8034144639968872, "learning_rate": 0.00019952395891306197, "loss": 1.3685, "step": 1620 }, { "epoch": 0.06256015399422522, "grad_norm": 1.04547119140625, "learning_rate": 0.00019952100725591912, "loss": 1.4271, "step": 1625 }, { "epoch": 0.06275264677574591, "grad_norm": 1.3097724914550781, "learning_rate": 0.00019951804649823949, "loss": 1.3303, "step": 1630 }, { "epoch": 0.0629451395572666, "grad_norm": 1.8794469833374023, "learning_rate": 0.00019951507664029374, "loss": 1.5223, "step": 1635 }, { "epoch": 0.06313763233878729, "grad_norm": 1.4077703952789307, "learning_rate": 0.00019951209768235344, "loss": 1.5582, "step": 1640 }, { "epoch": 0.06333012512030799, "grad_norm": 1.2244471311569214, "learning_rate": 0.000199509109624691, "loss": 1.3437, "step": 1645 }, { "epoch": 0.06352261790182868, "grad_norm": 1.4610791206359863, "learning_rate": 0.00019950611246757972, "loss": 1.6944, "step": 1650 }, { "epoch": 0.06371511068334937, "grad_norm": 1.544989824295044, "learning_rate": 0.00019950310621129358, "loss": 1.3288, "step": 1655 }, { "epoch": 0.06390760346487007, "grad_norm": 1.4837945699691772, "learning_rate": 0.00019950009085610755, "loss": 1.1296, "step": 1660 }, { "epoch": 0.06410009624639076, "grad_norm": 2.2527410984039307, "learning_rate": 0.0001994970664022973, "loss": 1.3105, "step": 1665 }, { "epoch": 0.06429258902791145, "grad_norm": 1.3723945617675781, "learning_rate": 0.00019949403285013948, "loss": 1.3976, "step": 1670 }, { "epoch": 0.06448508180943215, "grad_norm": 1.571265459060669, "learning_rate": 0.0001994909901999114, 
"loss": 1.4603, "step": 1675 }, { "epoch": 0.06467757459095284, "grad_norm": 1.2445194721221924, "learning_rate": 0.00019948793845189137, "loss": 1.3072, "step": 1680 }, { "epoch": 0.06487006737247353, "grad_norm": 2.068112373352051, "learning_rate": 0.00019948487760635842, "loss": 1.4638, "step": 1685 }, { "epoch": 0.06506256015399423, "grad_norm": 1.0896637439727783, "learning_rate": 0.00019948180766359244, "loss": 1.3184, "step": 1690 }, { "epoch": 0.06525505293551492, "grad_norm": 2.0666351318359375, "learning_rate": 0.00019947872862387413, "loss": 1.3944, "step": 1695 }, { "epoch": 0.06544754571703561, "grad_norm": 1.5204085111618042, "learning_rate": 0.00019947564048748508, "loss": 1.3795, "step": 1700 }, { "epoch": 0.0656400384985563, "grad_norm": 0.9768043160438538, "learning_rate": 0.00019947254325470768, "loss": 1.3329, "step": 1705 }, { "epoch": 0.065832531280077, "grad_norm": 1.3453469276428223, "learning_rate": 0.00019946943692582516, "loss": 1.304, "step": 1710 }, { "epoch": 0.06602502406159769, "grad_norm": 1.0725489854812622, "learning_rate": 0.00019946632150112152, "loss": 1.5547, "step": 1715 }, { "epoch": 0.06621751684311838, "grad_norm": 1.5973418951034546, "learning_rate": 0.0001994631969808817, "loss": 1.3263, "step": 1720 }, { "epoch": 0.06641000962463908, "grad_norm": 1.2451751232147217, "learning_rate": 0.0001994600633653914, "loss": 1.4935, "step": 1725 }, { "epoch": 0.06660250240615977, "grad_norm": 1.3474830389022827, "learning_rate": 0.00019945692065493717, "loss": 1.6282, "step": 1730 }, { "epoch": 0.06679499518768046, "grad_norm": 1.7913939952850342, "learning_rate": 0.00019945376884980643, "loss": 1.2935, "step": 1735 }, { "epoch": 0.06698748796920115, "grad_norm": 1.0764446258544922, "learning_rate": 0.00019945060795028728, "loss": 1.6034, "step": 1740 }, { "epoch": 0.06717998075072185, "grad_norm": 1.0572975873947144, "learning_rate": 0.00019944743795666887, "loss": 1.3997, "step": 1745 }, { "epoch": 0.06737247353224254, "grad_norm": 1.3195079565048218, "learning_rate": 0.00019944425886924102, "loss": 1.4838, "step": 1750 }, { "epoch": 0.06756496631376323, "grad_norm": 1.0044989585876465, "learning_rate": 0.00019944107068829448, "loss": 1.388, "step": 1755 }, { "epoch": 0.06775745909528393, "grad_norm": 1.8276032209396362, "learning_rate": 0.0001994378734141207, "loss": 1.447, "step": 1760 }, { "epoch": 0.06794995187680462, "grad_norm": 1.5056366920471191, "learning_rate": 0.00019943466704701218, "loss": 1.5153, "step": 1765 }, { "epoch": 0.06814244465832531, "grad_norm": 1.6947304010391235, "learning_rate": 0.00019943145158726205, "loss": 1.5551, "step": 1770 }, { "epoch": 0.068334937439846, "grad_norm": 0.9702686667442322, "learning_rate": 0.00019942822703516433, "loss": 1.3168, "step": 1775 }, { "epoch": 0.0685274302213667, "grad_norm": 1.6755216121673584, "learning_rate": 0.0001994249933910139, "loss": 1.6223, "step": 1780 }, { "epoch": 0.06871992300288739, "grad_norm": 1.3666303157806396, "learning_rate": 0.00019942175065510643, "loss": 1.5748, "step": 1785 }, { "epoch": 0.06891241578440808, "grad_norm": 1.3785196542739868, "learning_rate": 0.0001994184988277385, "loss": 1.4033, "step": 1790 }, { "epoch": 0.06910490856592878, "grad_norm": 1.081828236579895, "learning_rate": 0.00019941523790920743, "loss": 1.4, "step": 1795 }, { "epoch": 0.06929740134744947, "grad_norm": 1.1024401187896729, "learning_rate": 0.0001994119678998114, "loss": 1.4751, "step": 1800 }, { "epoch": 0.06948989412897016, "grad_norm": 3.584055185317993, "learning_rate": 
0.0001994086887998495, "loss": 1.3449, "step": 1805 }, { "epoch": 0.06968238691049086, "grad_norm": 0.9418397545814514, "learning_rate": 0.0001994054006096215, "loss": 1.3217, "step": 1810 }, { "epoch": 0.06987487969201155, "grad_norm": 1.6071193218231201, "learning_rate": 0.00019940210332942813, "loss": 1.3636, "step": 1815 }, { "epoch": 0.07006737247353224, "grad_norm": 2.0080580711364746, "learning_rate": 0.00019939879695957084, "loss": 1.4779, "step": 1820 }, { "epoch": 0.07025986525505294, "grad_norm": 1.169058918952942, "learning_rate": 0.00019939548150035207, "loss": 1.4031, "step": 1825 }, { "epoch": 0.07045235803657363, "grad_norm": 0.9863006472587585, "learning_rate": 0.00019939215695207496, "loss": 1.3832, "step": 1830 }, { "epoch": 0.07064485081809432, "grad_norm": 1.2257460355758667, "learning_rate": 0.00019938882331504347, "loss": 1.4967, "step": 1835 }, { "epoch": 0.07083734359961502, "grad_norm": 1.0062893629074097, "learning_rate": 0.00019938548058956253, "loss": 1.2637, "step": 1840 }, { "epoch": 0.0710298363811357, "grad_norm": 1.4179530143737793, "learning_rate": 0.0001993821287759377, "loss": 1.2961, "step": 1845 }, { "epoch": 0.0712223291626564, "grad_norm": 1.2181779146194458, "learning_rate": 0.00019937876787447557, "loss": 1.4104, "step": 1850 }, { "epoch": 0.07141482194417709, "grad_norm": 1.6110061407089233, "learning_rate": 0.00019937539788548344, "loss": 1.4045, "step": 1855 }, { "epoch": 0.07160731472569778, "grad_norm": 1.2814903259277344, "learning_rate": 0.0001993720188092695, "loss": 1.4194, "step": 1860 }, { "epoch": 0.07179980750721848, "grad_norm": 1.382265329360962, "learning_rate": 0.00019936863064614268, "loss": 1.5848, "step": 1865 }, { "epoch": 0.07199230028873917, "grad_norm": 1.4708553552627563, "learning_rate": 0.00019936523339641286, "loss": 1.6196, "step": 1870 }, { "epoch": 0.07218479307025986, "grad_norm": 1.0691862106323242, "learning_rate": 0.0001993618270603907, "loss": 1.4939, "step": 1875 }, { "epoch": 0.07237728585178056, "grad_norm": 0.9476374387741089, "learning_rate": 0.0001993584116383876, "loss": 1.5043, "step": 1880 }, { "epoch": 0.07256977863330125, "grad_norm": 1.37090003490448, "learning_rate": 0.000199354987130716, "loss": 1.4371, "step": 1885 }, { "epoch": 0.07276227141482194, "grad_norm": 1.2001820802688599, "learning_rate": 0.000199351553537689, "loss": 1.3048, "step": 1890 }, { "epoch": 0.07295476419634264, "grad_norm": 1.1123398542404175, "learning_rate": 0.00019934811085962055, "loss": 1.4398, "step": 1895 }, { "epoch": 0.07314725697786333, "grad_norm": 1.638574242591858, "learning_rate": 0.0001993446590968255, "loss": 1.3563, "step": 1900 }, { "epoch": 0.07333974975938402, "grad_norm": 1.9532630443572998, "learning_rate": 0.00019934119824961948, "loss": 1.3723, "step": 1905 }, { "epoch": 0.07353224254090472, "grad_norm": 1.3247241973876953, "learning_rate": 0.0001993377283183189, "loss": 1.4474, "step": 1910 }, { "epoch": 0.0737247353224254, "grad_norm": 1.203049659729004, "learning_rate": 0.00019933424930324118, "loss": 1.3347, "step": 1915 }, { "epoch": 0.0739172281039461, "grad_norm": 1.8858312368392944, "learning_rate": 0.00019933076120470436, "loss": 1.4754, "step": 1920 }, { "epoch": 0.0741097208854668, "grad_norm": 1.117814540863037, "learning_rate": 0.00019932726402302744, "loss": 1.4828, "step": 1925 }, { "epoch": 0.07430221366698748, "grad_norm": 1.0317554473876953, "learning_rate": 0.00019932375775853021, "loss": 1.5034, "step": 1930 }, { "epoch": 0.07449470644850818, "grad_norm": 2.315903902053833, 
"learning_rate": 0.00019932024241153332, "loss": 1.4311, "step": 1935 }, { "epoch": 0.07468719923002888, "grad_norm": 1.5780115127563477, "learning_rate": 0.00019931671798235817, "loss": 1.3917, "step": 1940 }, { "epoch": 0.07487969201154956, "grad_norm": 1.3360038995742798, "learning_rate": 0.00019931318447132706, "loss": 1.3634, "step": 1945 }, { "epoch": 0.07507218479307026, "grad_norm": 2.275620937347412, "learning_rate": 0.00019930964187876314, "loss": 1.414, "step": 1950 }, { "epoch": 0.07526467757459095, "grad_norm": 1.7956300973892212, "learning_rate": 0.00019930609020499032, "loss": 1.5117, "step": 1955 }, { "epoch": 0.07545717035611164, "grad_norm": 1.6429657936096191, "learning_rate": 0.0001993025294503334, "loss": 1.4436, "step": 1960 }, { "epoch": 0.07564966313763234, "grad_norm": 1.432246446609497, "learning_rate": 0.000199298959615118, "loss": 1.3952, "step": 1965 }, { "epoch": 0.07584215591915303, "grad_norm": 1.0579869747161865, "learning_rate": 0.00019929538069967051, "loss": 1.4369, "step": 1970 }, { "epoch": 0.07603464870067372, "grad_norm": 1.766543984413147, "learning_rate": 0.00019929179270431824, "loss": 1.5033, "step": 1975 }, { "epoch": 0.07622714148219442, "grad_norm": 1.0774848461151123, "learning_rate": 0.00019928819562938928, "loss": 1.3399, "step": 1980 }, { "epoch": 0.0764196342637151, "grad_norm": 1.0951963663101196, "learning_rate": 0.00019928458947521252, "loss": 1.3656, "step": 1985 }, { "epoch": 0.0766121270452358, "grad_norm": 1.278283953666687, "learning_rate": 0.0001992809742421178, "loss": 1.3467, "step": 1990 }, { "epoch": 0.0768046198267565, "grad_norm": 1.139508605003357, "learning_rate": 0.00019927734993043566, "loss": 1.4316, "step": 1995 }, { "epoch": 0.07699711260827719, "grad_norm": 1.39482581615448, "learning_rate": 0.00019927371654049748, "loss": 1.2032, "step": 2000 }, { "epoch": 0.07718960538979788, "grad_norm": 0.9154567718505859, "learning_rate": 0.0001992700740726356, "loss": 1.5053, "step": 2005 }, { "epoch": 0.07738209817131858, "grad_norm": 1.5105671882629395, "learning_rate": 0.00019926642252718303, "loss": 1.5059, "step": 2010 }, { "epoch": 0.07757459095283926, "grad_norm": 1.4019540548324585, "learning_rate": 0.00019926276190447367, "loss": 1.4051, "step": 2015 }, { "epoch": 0.07776708373435996, "grad_norm": 1.619841456413269, "learning_rate": 0.00019925909220484234, "loss": 1.1784, "step": 2020 }, { "epoch": 0.07795957651588066, "grad_norm": 1.6128195524215698, "learning_rate": 0.0001992554134286245, "loss": 1.4623, "step": 2025 }, { "epoch": 0.07815206929740134, "grad_norm": 1.2766104936599731, "learning_rate": 0.00019925172557615665, "loss": 1.3162, "step": 2030 }, { "epoch": 0.07834456207892204, "grad_norm": 1.2187426090240479, "learning_rate": 0.00019924802864777598, "loss": 1.2874, "step": 2035 }, { "epoch": 0.07853705486044274, "grad_norm": 1.1050268411636353, "learning_rate": 0.00019924432264382055, "loss": 1.433, "step": 2040 }, { "epoch": 0.07872954764196342, "grad_norm": 1.6128287315368652, "learning_rate": 0.00019924060756462925, "loss": 1.4698, "step": 2045 }, { "epoch": 0.07892204042348412, "grad_norm": 1.6588749885559082, "learning_rate": 0.00019923688341054176, "loss": 1.4972, "step": 2050 }, { "epoch": 0.0791145332050048, "grad_norm": 1.135289192199707, "learning_rate": 0.0001992331501818987, "loss": 1.3991, "step": 2055 }, { "epoch": 0.0793070259865255, "grad_norm": 1.757759928703308, "learning_rate": 0.00019922940787904137, "loss": 1.3736, "step": 2060 }, { "epoch": 0.0794995187680462, "grad_norm": 
0.9943239092826843, "learning_rate": 0.00019922565650231207, "loss": 1.4476, "step": 2065 }, { "epoch": 0.07969201154956689, "grad_norm": 0.9459586143493652, "learning_rate": 0.00019922189605205379, "loss": 1.3913, "step": 2070 }, { "epoch": 0.07988450433108758, "grad_norm": 1.2325133085250854, "learning_rate": 0.00019921812652861037, "loss": 1.4658, "step": 2075 }, { "epoch": 0.08007699711260828, "grad_norm": 1.2397321462631226, "learning_rate": 0.00019921434793232658, "loss": 1.2552, "step": 2080 }, { "epoch": 0.08026948989412896, "grad_norm": 0.9636020660400391, "learning_rate": 0.0001992105602635479, "loss": 1.3296, "step": 2085 }, { "epoch": 0.08046198267564966, "grad_norm": 0.900841474533081, "learning_rate": 0.00019920676352262067, "loss": 1.2329, "step": 2090 }, { "epoch": 0.08065447545717036, "grad_norm": 1.0425807237625122, "learning_rate": 0.00019920295770989213, "loss": 1.1604, "step": 2095 }, { "epoch": 0.08084696823869104, "grad_norm": 1.1449722051620483, "learning_rate": 0.00019919914282571024, "loss": 1.3233, "step": 2100 }, { "epoch": 0.08103946102021174, "grad_norm": 1.2076728343963623, "learning_rate": 0.00019919531887042387, "loss": 1.3449, "step": 2105 }, { "epoch": 0.08123195380173244, "grad_norm": 0.968323826789856, "learning_rate": 0.00019919148584438272, "loss": 1.4273, "step": 2110 }, { "epoch": 0.08142444658325312, "grad_norm": 1.7322039604187012, "learning_rate": 0.00019918764374793726, "loss": 1.4994, "step": 2115 }, { "epoch": 0.08161693936477382, "grad_norm": 1.4216794967651367, "learning_rate": 0.00019918379258143884, "loss": 1.4071, "step": 2120 }, { "epoch": 0.08180943214629452, "grad_norm": 1.2262970209121704, "learning_rate": 0.00019917993234523963, "loss": 1.3528, "step": 2125 }, { "epoch": 0.0820019249278152, "grad_norm": 1.3137859106063843, "learning_rate": 0.0001991760630396926, "loss": 1.4367, "step": 2130 }, { "epoch": 0.0821944177093359, "grad_norm": 1.364478588104248, "learning_rate": 0.00019917218466515156, "loss": 1.6896, "step": 2135 }, { "epoch": 0.0823869104908566, "grad_norm": 1.2037614583969116, "learning_rate": 0.00019916829722197124, "loss": 1.5371, "step": 2140 }, { "epoch": 0.08257940327237728, "grad_norm": 1.7590453624725342, "learning_rate": 0.00019916440071050706, "loss": 1.6331, "step": 2145 }, { "epoch": 0.08277189605389798, "grad_norm": 1.6112565994262695, "learning_rate": 0.00019916049513111532, "loss": 1.5066, "step": 2150 }, { "epoch": 0.08296438883541868, "grad_norm": 0.937174916267395, "learning_rate": 0.00019915658048415318, "loss": 1.4698, "step": 2155 }, { "epoch": 0.08315688161693936, "grad_norm": 1.8568309545516968, "learning_rate": 0.00019915265676997862, "loss": 1.3197, "step": 2160 }, { "epoch": 0.08334937439846006, "grad_norm": 1.9865350723266602, "learning_rate": 0.00019914872398895043, "loss": 1.4883, "step": 2165 }, { "epoch": 0.08354186717998074, "grad_norm": 1.0227729082107544, "learning_rate": 0.0001991447821414282, "loss": 1.3967, "step": 2170 }, { "epoch": 0.08373435996150144, "grad_norm": 1.3028923273086548, "learning_rate": 0.00019914083122777245, "loss": 1.4296, "step": 2175 }, { "epoch": 0.08392685274302214, "grad_norm": 1.6131690740585327, "learning_rate": 0.00019913687124834442, "loss": 1.2983, "step": 2180 }, { "epoch": 0.08411934552454282, "grad_norm": 1.1791858673095703, "learning_rate": 0.00019913290220350622, "loss": 1.4632, "step": 2185 }, { "epoch": 0.08431183830606352, "grad_norm": 1.8457857370376587, "learning_rate": 0.00019912892409362085, "loss": 1.3623, "step": 2190 }, { "epoch": 
0.08450433108758422, "grad_norm": 1.525680422782898, "learning_rate": 0.00019912493691905198, "loss": 1.2729, "step": 2195 }, { "epoch": 0.0846968238691049, "grad_norm": 1.3267451524734497, "learning_rate": 0.0001991209406801643, "loss": 1.3808, "step": 2200 }, { "epoch": 0.0848893166506256, "grad_norm": 1.37312912940979, "learning_rate": 0.00019911693537732323, "loss": 1.6072, "step": 2205 }, { "epoch": 0.0850818094321463, "grad_norm": 1.3433706760406494, "learning_rate": 0.000199112921010895, "loss": 1.4956, "step": 2210 }, { "epoch": 0.08527430221366698, "grad_norm": 1.220732569694519, "learning_rate": 0.00019910889758124672, "loss": 1.4875, "step": 2215 }, { "epoch": 0.08546679499518768, "grad_norm": 0.9385544657707214, "learning_rate": 0.00019910486508874627, "loss": 1.4202, "step": 2220 }, { "epoch": 0.08565928777670838, "grad_norm": 0.8727134466171265, "learning_rate": 0.0001991008235337624, "loss": 1.2268, "step": 2225 }, { "epoch": 0.08585178055822906, "grad_norm": 2.276063919067383, "learning_rate": 0.00019909677291666473, "loss": 1.3911, "step": 2230 }, { "epoch": 0.08604427333974976, "grad_norm": 1.2023353576660156, "learning_rate": 0.00019909271323782364, "loss": 1.4754, "step": 2235 }, { "epoch": 0.08623676612127046, "grad_norm": 0.9018556475639343, "learning_rate": 0.00019908864449761033, "loss": 1.4073, "step": 2240 }, { "epoch": 0.08642925890279114, "grad_norm": 1.2011221647262573, "learning_rate": 0.00019908456669639687, "loss": 1.3213, "step": 2245 }, { "epoch": 0.08662175168431184, "grad_norm": 1.9858746528625488, "learning_rate": 0.0001990804798345562, "loss": 1.3403, "step": 2250 }, { "epoch": 0.08681424446583254, "grad_norm": 1.0072557926177979, "learning_rate": 0.000199076383912462, "loss": 1.3387, "step": 2255 }, { "epoch": 0.08700673724735322, "grad_norm": 1.4516913890838623, "learning_rate": 0.00019907227893048877, "loss": 1.3755, "step": 2260 }, { "epoch": 0.08719923002887392, "grad_norm": 1.0636364221572876, "learning_rate": 0.00019906816488901195, "loss": 1.2495, "step": 2265 }, { "epoch": 0.0873917228103946, "grad_norm": 1.8495078086853027, "learning_rate": 0.0001990640417884077, "loss": 1.4166, "step": 2270 }, { "epoch": 0.0875842155919153, "grad_norm": 2.327951431274414, "learning_rate": 0.00019905990962905312, "loss": 1.3934, "step": 2275 }, { "epoch": 0.087776708373436, "grad_norm": 1.5719425678253174, "learning_rate": 0.00019905576841132595, "loss": 1.3932, "step": 2280 }, { "epoch": 0.08796920115495668, "grad_norm": 1.5799787044525146, "learning_rate": 0.000199051618135605, "loss": 1.5148, "step": 2285 }, { "epoch": 0.08816169393647738, "grad_norm": 0.7972100377082825, "learning_rate": 0.00019904745880226966, "loss": 1.2456, "step": 2290 }, { "epoch": 0.08835418671799808, "grad_norm": 1.4252464771270752, "learning_rate": 0.00019904329041170042, "loss": 1.4287, "step": 2295 }, { "epoch": 0.08854667949951876, "grad_norm": 1.5532910823822021, "learning_rate": 0.00019903911296427834, "loss": 1.3685, "step": 2300 }, { "epoch": 0.08873917228103946, "grad_norm": 1.3019160032272339, "learning_rate": 0.00019903492646038544, "loss": 1.3928, "step": 2305 }, { "epoch": 0.08893166506256016, "grad_norm": 1.7292853593826294, "learning_rate": 0.00019903073090040457, "loss": 1.369, "step": 2310 }, { "epoch": 0.08912415784408084, "grad_norm": 1.1780908107757568, "learning_rate": 0.00019902652628471938, "loss": 1.2541, "step": 2315 }, { "epoch": 0.08931665062560154, "grad_norm": 1.353721261024475, "learning_rate": 0.00019902231261371433, "loss": 1.2658, "step": 2320 }, { 
"epoch": 0.08950914340712224, "grad_norm": 1.0020657777786255, "learning_rate": 0.0001990180898877748, "loss": 1.3319, "step": 2325 }, { "epoch": 0.08970163618864292, "grad_norm": 1.1655325889587402, "learning_rate": 0.00019901385810728686, "loss": 1.3783, "step": 2330 }, { "epoch": 0.08989412897016362, "grad_norm": 1.2237039804458618, "learning_rate": 0.00019900961727263748, "loss": 1.2919, "step": 2335 }, { "epoch": 0.09008662175168432, "grad_norm": 1.6417179107666016, "learning_rate": 0.0001990053673842145, "loss": 1.471, "step": 2340 }, { "epoch": 0.090279114533205, "grad_norm": 1.2170498371124268, "learning_rate": 0.00019900110844240653, "loss": 1.3889, "step": 2345 }, { "epoch": 0.0904716073147257, "grad_norm": 1.1462334394454956, "learning_rate": 0.00019899684044760304, "loss": 1.4191, "step": 2350 }, { "epoch": 0.0906641000962464, "grad_norm": 0.961063802242279, "learning_rate": 0.00019899256340019425, "loss": 1.5019, "step": 2355 }, { "epoch": 0.09085659287776708, "grad_norm": 0.9323278069496155, "learning_rate": 0.0001989882773005713, "loss": 1.3988, "step": 2360 }, { "epoch": 0.09104908565928778, "grad_norm": 1.8326833248138428, "learning_rate": 0.00019898398214912612, "loss": 1.4211, "step": 2365 }, { "epoch": 0.09124157844080846, "grad_norm": 1.2725722789764404, "learning_rate": 0.00019897967794625153, "loss": 1.3274, "step": 2370 }, { "epoch": 0.09143407122232916, "grad_norm": 0.9105005860328674, "learning_rate": 0.00019897536469234102, "loss": 1.3309, "step": 2375 }, { "epoch": 0.09162656400384986, "grad_norm": 1.3157737255096436, "learning_rate": 0.00019897104238778907, "loss": 1.4086, "step": 2380 }, { "epoch": 0.09181905678537054, "grad_norm": 1.9295995235443115, "learning_rate": 0.00019896671103299094, "loss": 1.3849, "step": 2385 }, { "epoch": 0.09201154956689124, "grad_norm": 1.0183601379394531, "learning_rate": 0.00019896237062834267, "loss": 1.4397, "step": 2390 }, { "epoch": 0.09220404234841194, "grad_norm": 1.118998646736145, "learning_rate": 0.00019895802117424118, "loss": 1.568, "step": 2395 }, { "epoch": 0.09239653512993262, "grad_norm": 1.6463871002197266, "learning_rate": 0.00019895366267108416, "loss": 1.2755, "step": 2400 }, { "epoch": 0.09258902791145332, "grad_norm": 1.3326902389526367, "learning_rate": 0.00019894929511927022, "loss": 1.4369, "step": 2405 }, { "epoch": 0.09278152069297402, "grad_norm": 1.4168566465377808, "learning_rate": 0.00019894491851919871, "loss": 1.4323, "step": 2410 }, { "epoch": 0.0929740134744947, "grad_norm": 1.3266388177871704, "learning_rate": 0.00019894053287126986, "loss": 1.17, "step": 2415 }, { "epoch": 0.0931665062560154, "grad_norm": 1.7362377643585205, "learning_rate": 0.0001989361381758847, "loss": 1.5996, "step": 2420 }, { "epoch": 0.0933589990375361, "grad_norm": 1.1684424877166748, "learning_rate": 0.00019893173443344511, "loss": 1.3486, "step": 2425 }, { "epoch": 0.09355149181905678, "grad_norm": 1.3784310817718506, "learning_rate": 0.00019892732164435376, "loss": 1.2775, "step": 2430 }, { "epoch": 0.09374398460057748, "grad_norm": 1.1288561820983887, "learning_rate": 0.00019892289980901414, "loss": 1.2044, "step": 2435 }, { "epoch": 0.09393647738209818, "grad_norm": 1.1601535081863403, "learning_rate": 0.00019891846892783067, "loss": 1.4937, "step": 2440 }, { "epoch": 0.09412897016361886, "grad_norm": 1.3866316080093384, "learning_rate": 0.0001989140290012085, "loss": 1.913, "step": 2445 }, { "epoch": 0.09432146294513956, "grad_norm": 1.4638808965682983, "learning_rate": 0.00019890958002955362, "loss": 1.4114, 
"step": 2450 }, { "epoch": 0.09451395572666026, "grad_norm": 1.4660701751708984, "learning_rate": 0.00019890512201327284, "loss": 1.3607, "step": 2455 }, { "epoch": 0.09470644850818094, "grad_norm": 0.9787619113922119, "learning_rate": 0.00019890065495277388, "loss": 1.3729, "step": 2460 }, { "epoch": 0.09489894128970164, "grad_norm": 1.4845494031906128, "learning_rate": 0.00019889617884846517, "loss": 1.3326, "step": 2465 }, { "epoch": 0.09509143407122234, "grad_norm": 1.2955145835876465, "learning_rate": 0.000198891693700756, "loss": 1.3738, "step": 2470 }, { "epoch": 0.09528392685274302, "grad_norm": 1.7431209087371826, "learning_rate": 0.00019888719951005656, "loss": 1.3676, "step": 2475 }, { "epoch": 0.09547641963426372, "grad_norm": 0.923613965511322, "learning_rate": 0.00019888269627677777, "loss": 1.4142, "step": 2480 }, { "epoch": 0.0956689124157844, "grad_norm": 1.0258625745773315, "learning_rate": 0.0001988781840013315, "loss": 1.3868, "step": 2485 }, { "epoch": 0.0958614051973051, "grad_norm": 1.1365761756896973, "learning_rate": 0.00019887366268413025, "loss": 1.2871, "step": 2490 }, { "epoch": 0.0960538979788258, "grad_norm": 2.3250112533569336, "learning_rate": 0.00019886913232558754, "loss": 1.4345, "step": 2495 }, { "epoch": 0.09624639076034648, "grad_norm": 1.1625771522521973, "learning_rate": 0.00019886459292611767, "loss": 1.5796, "step": 2500 }, { "epoch": 0.09643888354186718, "grad_norm": 1.7454233169555664, "learning_rate": 0.00019886004448613562, "loss": 1.6151, "step": 2505 }, { "epoch": 0.09663137632338788, "grad_norm": 1.3514907360076904, "learning_rate": 0.00019885548700605745, "loss": 1.4529, "step": 2510 }, { "epoch": 0.09682386910490856, "grad_norm": 1.9735958576202393, "learning_rate": 0.00019885092048629982, "loss": 1.4945, "step": 2515 }, { "epoch": 0.09701636188642926, "grad_norm": 1.190207600593567, "learning_rate": 0.00019884634492728037, "loss": 1.473, "step": 2520 }, { "epoch": 0.09720885466794996, "grad_norm": 1.1596134901046753, "learning_rate": 0.00019884176032941743, "loss": 1.3745, "step": 2525 }, { "epoch": 0.09740134744947064, "grad_norm": 1.0496324300765991, "learning_rate": 0.0001988371666931303, "loss": 1.3853, "step": 2530 }, { "epoch": 0.09759384023099134, "grad_norm": 1.2820552587509155, "learning_rate": 0.000198832564018839, "loss": 1.4205, "step": 2535 }, { "epoch": 0.09778633301251204, "grad_norm": 0.9559310674667358, "learning_rate": 0.00019882795230696446, "loss": 1.2517, "step": 2540 }, { "epoch": 0.09797882579403272, "grad_norm": 1.026782751083374, "learning_rate": 0.00019882333155792835, "loss": 1.335, "step": 2545 }, { "epoch": 0.09817131857555342, "grad_norm": 1.3378793001174927, "learning_rate": 0.00019881870177215319, "loss": 1.3419, "step": 2550 }, { "epoch": 0.09836381135707412, "grad_norm": 1.0646761655807495, "learning_rate": 0.00019881406295006238, "loss": 1.3793, "step": 2555 }, { "epoch": 0.0985563041385948, "grad_norm": 1.3302899599075317, "learning_rate": 0.00019880941509208005, "loss": 1.3056, "step": 2560 }, { "epoch": 0.0987487969201155, "grad_norm": 1.3029305934906006, "learning_rate": 0.00019880475819863134, "loss": 1.3028, "step": 2565 }, { "epoch": 0.0989412897016362, "grad_norm": 1.6653764247894287, "learning_rate": 0.00019880009227014197, "loss": 1.4698, "step": 2570 }, { "epoch": 0.09913378248315688, "grad_norm": 1.5575610399246216, "learning_rate": 0.00019879541730703865, "loss": 1.2843, "step": 2575 }, { "epoch": 0.09932627526467758, "grad_norm": 1.1219451427459717, "learning_rate": 0.0001987907333097489, 
"loss": 1.2824, "step": 2580 }, { "epoch": 0.09951876804619826, "grad_norm": 1.680050253868103, "learning_rate": 0.000198786040278701, "loss": 1.431, "step": 2585 }, { "epoch": 0.09971126082771896, "grad_norm": 2.5341451168060303, "learning_rate": 0.00019878133821432412, "loss": 1.3925, "step": 2590 }, { "epoch": 0.09990375360923966, "grad_norm": 1.132542610168457, "learning_rate": 0.00019877662711704824, "loss": 1.4082, "step": 2595 }, { "epoch": 0.10009624639076034, "grad_norm": 1.0605584383010864, "learning_rate": 0.0001987719069873041, "loss": 1.2904, "step": 2600 }, { "epoch": 0.10028873917228104, "grad_norm": 1.161116361618042, "learning_rate": 0.0001987671778255234, "loss": 1.2922, "step": 2605 }, { "epoch": 0.10048123195380174, "grad_norm": 2.2763168811798096, "learning_rate": 0.0001987624396321386, "loss": 1.4692, "step": 2610 }, { "epoch": 0.10067372473532242, "grad_norm": 1.547316312789917, "learning_rate": 0.00019875769240758286, "loss": 1.458, "step": 2615 }, { "epoch": 0.10086621751684312, "grad_norm": 1.0679529905319214, "learning_rate": 0.0001987529361522904, "loss": 1.3075, "step": 2620 }, { "epoch": 0.10105871029836382, "grad_norm": 1.9426227807998657, "learning_rate": 0.0001987481708666961, "loss": 1.4985, "step": 2625 }, { "epoch": 0.1012512030798845, "grad_norm": 1.1619765758514404, "learning_rate": 0.00019874339655123575, "loss": 1.329, "step": 2630 }, { "epoch": 0.1014436958614052, "grad_norm": 0.8115332722663879, "learning_rate": 0.00019873861320634587, "loss": 1.218, "step": 2635 }, { "epoch": 0.1016361886429259, "grad_norm": 1.2575538158416748, "learning_rate": 0.0001987338208324639, "loss": 1.3133, "step": 2640 }, { "epoch": 0.10182868142444658, "grad_norm": 0.9605635404586792, "learning_rate": 0.00019872901943002806, "loss": 1.4462, "step": 2645 }, { "epoch": 0.10202117420596728, "grad_norm": 1.7909116744995117, "learning_rate": 0.00019872420899947742, "loss": 1.257, "step": 2650 }, { "epoch": 0.10221366698748797, "grad_norm": 1.5501129627227783, "learning_rate": 0.00019871938954125185, "loss": 1.2825, "step": 2655 }, { "epoch": 0.10240615976900866, "grad_norm": 1.4636069536209106, "learning_rate": 0.00019871456105579208, "loss": 1.3909, "step": 2660 }, { "epoch": 0.10259865255052936, "grad_norm": 1.4283297061920166, "learning_rate": 0.0001987097235435396, "loss": 1.2148, "step": 2665 }, { "epoch": 0.10279114533205005, "grad_norm": 1.316149115562439, "learning_rate": 0.00019870487700493684, "loss": 1.393, "step": 2670 }, { "epoch": 0.10298363811357074, "grad_norm": 0.8449459671974182, "learning_rate": 0.00019870002144042689, "loss": 1.4969, "step": 2675 }, { "epoch": 0.10317613089509144, "grad_norm": 1.3309835195541382, "learning_rate": 0.00019869515685045383, "loss": 1.4927, "step": 2680 }, { "epoch": 0.10336862367661212, "grad_norm": 0.9159907102584839, "learning_rate": 0.00019869028323546246, "loss": 1.3526, "step": 2685 }, { "epoch": 0.10356111645813282, "grad_norm": 2.2842464447021484, "learning_rate": 0.00019868540059589845, "loss": 1.3646, "step": 2690 }, { "epoch": 0.10375360923965352, "grad_norm": 0.9444146156311035, "learning_rate": 0.00019868050893220832, "loss": 1.349, "step": 2695 }, { "epoch": 0.1039461020211742, "grad_norm": 1.8546898365020752, "learning_rate": 0.0001986756082448393, "loss": 1.3195, "step": 2700 }, { "epoch": 0.1041385948026949, "grad_norm": 1.310783863067627, "learning_rate": 0.00019867069853423961, "loss": 1.6065, "step": 2705 }, { "epoch": 0.1043310875842156, "grad_norm": 1.248542308807373, "learning_rate": 
0.00019866577980085813, "loss": 1.1987, "step": 2710 }, { "epoch": 0.10452358036573628, "grad_norm": 1.421844482421875, "learning_rate": 0.00019866085204514472, "loss": 1.3576, "step": 2715 }, { "epoch": 0.10471607314725698, "grad_norm": 1.1641993522644043, "learning_rate": 0.00019865591526754996, "loss": 1.436, "step": 2720 }, { "epoch": 0.10490856592877768, "grad_norm": 1.1122993230819702, "learning_rate": 0.0001986509694685253, "loss": 1.4218, "step": 2725 }, { "epoch": 0.10510105871029836, "grad_norm": 1.222016453742981, "learning_rate": 0.00019864601464852295, "loss": 1.2965, "step": 2730 }, { "epoch": 0.10529355149181906, "grad_norm": 1.6765378713607788, "learning_rate": 0.00019864105080799602, "loss": 1.3908, "step": 2735 }, { "epoch": 0.10548604427333975, "grad_norm": 1.8405592441558838, "learning_rate": 0.00019863607794739845, "loss": 1.2583, "step": 2740 }, { "epoch": 0.10567853705486044, "grad_norm": 1.3908604383468628, "learning_rate": 0.00019863109606718497, "loss": 1.2726, "step": 2745 }, { "epoch": 0.10587102983638114, "grad_norm": 1.3825894594192505, "learning_rate": 0.0001986261051678111, "loss": 1.3234, "step": 2750 }, { "epoch": 0.10606352261790183, "grad_norm": 1.5409029722213745, "learning_rate": 0.00019862110524973328, "loss": 1.4151, "step": 2755 }, { "epoch": 0.10625601539942252, "grad_norm": 2.1902191638946533, "learning_rate": 0.00019861609631340868, "loss": 1.3865, "step": 2760 }, { "epoch": 0.10644850818094322, "grad_norm": 0.9851712584495544, "learning_rate": 0.00019861107835929533, "loss": 1.4799, "step": 2765 }, { "epoch": 0.10664100096246391, "grad_norm": 1.2206732034683228, "learning_rate": 0.0001986060513878521, "loss": 1.3456, "step": 2770 }, { "epoch": 0.1068334937439846, "grad_norm": 1.3443645238876343, "learning_rate": 0.0001986010153995387, "loss": 1.2586, "step": 2775 }, { "epoch": 0.1070259865255053, "grad_norm": 1.1602864265441895, "learning_rate": 0.00019859597039481561, "loss": 1.1789, "step": 2780 }, { "epoch": 0.107218479307026, "grad_norm": 0.8068190813064575, "learning_rate": 0.00019859091637414414, "loss": 1.4228, "step": 2785 }, { "epoch": 0.10741097208854668, "grad_norm": 1.4439321756362915, "learning_rate": 0.0001985858533379865, "loss": 1.4365, "step": 2790 }, { "epoch": 0.10760346487006738, "grad_norm": 1.0814299583435059, "learning_rate": 0.00019858078128680564, "loss": 1.2755, "step": 2795 }, { "epoch": 0.10779595765158806, "grad_norm": 1.7848068475723267, "learning_rate": 0.00019857570022106536, "loss": 1.4061, "step": 2800 }, { "epoch": 0.10798845043310876, "grad_norm": 1.3163549900054932, "learning_rate": 0.0001985706101412303, "loss": 1.3599, "step": 2805 }, { "epoch": 0.10818094321462945, "grad_norm": 1.439104437828064, "learning_rate": 0.0001985655110477659, "loss": 1.3054, "step": 2810 }, { "epoch": 0.10837343599615014, "grad_norm": 0.892706036567688, "learning_rate": 0.0001985604029411385, "loss": 1.3504, "step": 2815 }, { "epoch": 0.10856592877767084, "grad_norm": 1.102704405784607, "learning_rate": 0.0001985552858218151, "loss": 1.3902, "step": 2820 }, { "epoch": 0.10875842155919153, "grad_norm": 1.21804678440094, "learning_rate": 0.0001985501596902637, "loss": 1.36, "step": 2825 }, { "epoch": 0.10895091434071222, "grad_norm": 1.6015477180480957, "learning_rate": 0.00019854502454695302, "loss": 1.6163, "step": 2830 }, { "epoch": 0.10914340712223292, "grad_norm": 1.3947224617004395, "learning_rate": 0.00019853988039235265, "loss": 1.2207, "step": 2835 }, { "epoch": 0.10933589990375361, "grad_norm": 1.616458535194397, 
"learning_rate": 0.00019853472722693302, "loss": 1.2081, "step": 2840 }, { "epoch": 0.1095283926852743, "grad_norm": 2.1588330268859863, "learning_rate": 0.00019852956505116528, "loss": 1.4428, "step": 2845 }, { "epoch": 0.109720885466795, "grad_norm": 1.2287509441375732, "learning_rate": 0.00019852439386552152, "loss": 1.4548, "step": 2850 }, { "epoch": 0.1099133782483157, "grad_norm": 1.7198657989501953, "learning_rate": 0.00019851921367047463, "loss": 1.2034, "step": 2855 }, { "epoch": 0.11010587102983638, "grad_norm": 1.4924067258834839, "learning_rate": 0.00019851402446649825, "loss": 1.3635, "step": 2860 }, { "epoch": 0.11029836381135708, "grad_norm": 1.3675332069396973, "learning_rate": 0.00019850882625406695, "loss": 1.29, "step": 2865 }, { "epoch": 0.11049085659287777, "grad_norm": 1.2170599699020386, "learning_rate": 0.00019850361903365603, "loss": 1.3495, "step": 2870 }, { "epoch": 0.11068334937439846, "grad_norm": 1.6067026853561401, "learning_rate": 0.00019849840280574167, "loss": 1.4679, "step": 2875 }, { "epoch": 0.11087584215591915, "grad_norm": 1.0457261800765991, "learning_rate": 0.00019849317757080092, "loss": 1.3289, "step": 2880 }, { "epoch": 0.11106833493743985, "grad_norm": 0.6958736181259155, "learning_rate": 0.00019848794332931146, "loss": 0.9412, "step": 2885 }, { "epoch": 0.11126082771896054, "grad_norm": 0.9687005281448364, "learning_rate": 0.00019848270008175205, "loss": 1.2777, "step": 2890 }, { "epoch": 0.11145332050048123, "grad_norm": 0.8073298931121826, "learning_rate": 0.00019847744782860213, "loss": 1.4295, "step": 2895 }, { "epoch": 0.11164581328200192, "grad_norm": 0.8794350624084473, "learning_rate": 0.00019847218657034193, "loss": 1.2199, "step": 2900 }, { "epoch": 0.11183830606352262, "grad_norm": 1.644554853439331, "learning_rate": 0.00019846691630745258, "loss": 1.3076, "step": 2905 }, { "epoch": 0.11203079884504331, "grad_norm": 1.0819231271743774, "learning_rate": 0.00019846163704041603, "loss": 1.385, "step": 2910 }, { "epoch": 0.112223291626564, "grad_norm": 1.4424269199371338, "learning_rate": 0.000198456348769715, "loss": 1.4287, "step": 2915 }, { "epoch": 0.1124157844080847, "grad_norm": 1.289413332939148, "learning_rate": 0.00019845105149583308, "loss": 1.25, "step": 2920 }, { "epoch": 0.1126082771896054, "grad_norm": 1.4669229984283447, "learning_rate": 0.00019844574521925474, "loss": 1.5371, "step": 2925 }, { "epoch": 0.11280076997112608, "grad_norm": 2.102736473083496, "learning_rate": 0.0001984404299404651, "loss": 1.5017, "step": 2930 }, { "epoch": 0.11299326275264678, "grad_norm": 1.1487330198287964, "learning_rate": 0.00019843510565995025, "loss": 1.3164, "step": 2935 }, { "epoch": 0.11318575553416747, "grad_norm": 1.259538173675537, "learning_rate": 0.00019842977237819707, "loss": 1.2946, "step": 2940 }, { "epoch": 0.11337824831568816, "grad_norm": 2.3158466815948486, "learning_rate": 0.00019842443009569324, "loss": 1.4614, "step": 2945 }, { "epoch": 0.11357074109720885, "grad_norm": 1.5077046155929565, "learning_rate": 0.0001984190788129273, "loss": 1.3478, "step": 2950 }, { "epoch": 0.11376323387872955, "grad_norm": 1.2548809051513672, "learning_rate": 0.00019841371853038852, "loss": 1.3351, "step": 2955 }, { "epoch": 0.11395572666025024, "grad_norm": 1.4622430801391602, "learning_rate": 0.00019840834924856715, "loss": 1.2788, "step": 2960 }, { "epoch": 0.11414821944177093, "grad_norm": 0.9759154319763184, "learning_rate": 0.00019840297096795415, "loss": 1.2793, "step": 2965 }, { "epoch": 0.11434071222329163, "grad_norm": 
1.2217987775802612, "learning_rate": 0.00019839758368904128, "loss": 1.284, "step": 2970 }, { "epoch": 0.11453320500481232, "grad_norm": 2.180697441101074, "learning_rate": 0.00019839326738746614, "loss": 1.4163, "step": 2975 }, { "epoch": 0.11472569778633301, "grad_norm": 1.156293511390686, "learning_rate": 0.00019838786391285554, "loss": 1.3045, "step": 2980 }, { "epoch": 0.11491819056785371, "grad_norm": 1.1444417238235474, "learning_rate": 0.00019838245144132658, "loss": 1.4522, "step": 2985 }, { "epoch": 0.1151106833493744, "grad_norm": 1.3959949016571045, "learning_rate": 0.00019837702997337414, "loss": 1.3959, "step": 2990 }, { "epoch": 0.1153031761308951, "grad_norm": 1.2789435386657715, "learning_rate": 0.00019837159950949402, "loss": 1.2951, "step": 2995 }, { "epoch": 0.11549566891241578, "grad_norm": 1.0902299880981445, "learning_rate": 0.00019836616005018275, "loss": 1.4573, "step": 3000 }, { "epoch": 0.11568816169393648, "grad_norm": 1.452920913696289, "learning_rate": 0.0001983607115959378, "loss": 1.4688, "step": 3005 }, { "epoch": 0.11588065447545717, "grad_norm": 2.192514419555664, "learning_rate": 0.0001983552541472573, "loss": 1.4282, "step": 3010 }, { "epoch": 0.11607314725697786, "grad_norm": 1.938883900642395, "learning_rate": 0.0001983497877046404, "loss": 1.6123, "step": 3015 }, { "epoch": 0.11626564003849855, "grad_norm": 2.4365732669830322, "learning_rate": 0.0001983443122685869, "loss": 1.4987, "step": 3020 }, { "epoch": 0.11645813282001925, "grad_norm": 1.827972173690796, "learning_rate": 0.0001983388278395975, "loss": 1.2196, "step": 3025 }, { "epoch": 0.11665062560153994, "grad_norm": 1.6184618473052979, "learning_rate": 0.00019833333441817374, "loss": 1.5257, "step": 3030 }, { "epoch": 0.11684311838306063, "grad_norm": 1.0191036462783813, "learning_rate": 0.00019832783200481797, "loss": 1.4799, "step": 3035 }, { "epoch": 0.11703561116458133, "grad_norm": 1.1552925109863281, "learning_rate": 0.0001983223206000333, "loss": 1.2014, "step": 3040 }, { "epoch": 0.11722810394610202, "grad_norm": 0.9793531894683838, "learning_rate": 0.00019831680020432376, "loss": 1.2092, "step": 3045 }, { "epoch": 0.11742059672762271, "grad_norm": 1.480634331703186, "learning_rate": 0.0001983112708181941, "loss": 1.3238, "step": 3050 }, { "epoch": 0.11761308950914341, "grad_norm": 1.5112073421478271, "learning_rate": 0.00019830573244215, "loss": 1.5513, "step": 3055 }, { "epoch": 0.1178055822906641, "grad_norm": 1.4130852222442627, "learning_rate": 0.00019830018507669786, "loss": 1.4368, "step": 3060 }, { "epoch": 0.1179980750721848, "grad_norm": 1.401934027671814, "learning_rate": 0.000198294628722345, "loss": 1.243, "step": 3065 }, { "epoch": 0.11819056785370549, "grad_norm": 1.8309379816055298, "learning_rate": 0.00019828906337959946, "loss": 1.1656, "step": 3070 }, { "epoch": 0.11838306063522618, "grad_norm": 0.8511875867843628, "learning_rate": 0.0001982834890489702, "loss": 1.406, "step": 3075 }, { "epoch": 0.11857555341674687, "grad_norm": 1.4291598796844482, "learning_rate": 0.00019827790573096694, "loss": 1.3963, "step": 3080 }, { "epoch": 0.11876804619826757, "grad_norm": 0.6835631132125854, "learning_rate": 0.0001982723134261002, "loss": 1.1238, "step": 3085 }, { "epoch": 0.11896053897978826, "grad_norm": 1.6569236516952515, "learning_rate": 0.00019826671213488145, "loss": 1.3335, "step": 3090 }, { "epoch": 0.11915303176130895, "grad_norm": 1.0488132238388062, "learning_rate": 0.00019826110185782277, "loss": 1.3009, "step": 3095 }, { "epoch": 0.11934552454282965, 
"grad_norm": 1.3253639936447144, "learning_rate": 0.00019825548259543726, "loss": 1.3863, "step": 3100 }, { "epoch": 0.11953801732435033, "grad_norm": 0.9408076405525208, "learning_rate": 0.00019824985434823878, "loss": 1.3184, "step": 3105 }, { "epoch": 0.11973051010587103, "grad_norm": 0.9649772644042969, "learning_rate": 0.00019824421711674194, "loss": 1.2427, "step": 3110 }, { "epoch": 0.11992300288739172, "grad_norm": 1.7673052549362183, "learning_rate": 0.00019823857090146225, "loss": 1.2804, "step": 3115 }, { "epoch": 0.12011549566891241, "grad_norm": 1.230724811553955, "learning_rate": 0.00019823291570291604, "loss": 1.3527, "step": 3120 }, { "epoch": 0.12030798845043311, "grad_norm": 2.382617473602295, "learning_rate": 0.0001982272515216204, "loss": 1.4123, "step": 3125 }, { "epoch": 0.1205004812319538, "grad_norm": 1.2811720371246338, "learning_rate": 0.00019822157835809332, "loss": 1.3935, "step": 3130 }, { "epoch": 0.1206929740134745, "grad_norm": 1.9592630863189697, "learning_rate": 0.00019821589621285356, "loss": 1.2387, "step": 3135 }, { "epoch": 0.12088546679499519, "grad_norm": 1.659197449684143, "learning_rate": 0.0001982102050864207, "loss": 1.4228, "step": 3140 }, { "epoch": 0.12107795957651588, "grad_norm": 1.2591451406478882, "learning_rate": 0.00019820450497931517, "loss": 1.3192, "step": 3145 }, { "epoch": 0.12127045235803657, "grad_norm": 1.1670453548431396, "learning_rate": 0.00019819879589205822, "loss": 1.2593, "step": 3150 }, { "epoch": 0.12146294513955727, "grad_norm": 1.680776834487915, "learning_rate": 0.0001981930778251719, "loss": 1.5809, "step": 3155 }, { "epoch": 0.12165543792107796, "grad_norm": 1.388492226600647, "learning_rate": 0.00019818735077917904, "loss": 1.5646, "step": 3160 }, { "epoch": 0.12184793070259865, "grad_norm": 1.3851470947265625, "learning_rate": 0.00019818161475460342, "loss": 1.3282, "step": 3165 }, { "epoch": 0.12204042348411935, "grad_norm": 1.252103567123413, "learning_rate": 0.0001981758697519695, "loss": 1.3326, "step": 3170 }, { "epoch": 0.12223291626564003, "grad_norm": 2.6637227535247803, "learning_rate": 0.0001981701157718027, "loss": 1.4247, "step": 3175 }, { "epoch": 0.12242540904716073, "grad_norm": 1.4228829145431519, "learning_rate": 0.00019816435281462907, "loss": 1.3287, "step": 3180 }, { "epoch": 0.12261790182868143, "grad_norm": 1.0654631853103638, "learning_rate": 0.00019815858088097565, "loss": 1.3651, "step": 3185 }, { "epoch": 0.12281039461020211, "grad_norm": 1.1779879331588745, "learning_rate": 0.00019815279997137028, "loss": 1.2699, "step": 3190 }, { "epoch": 0.12300288739172281, "grad_norm": 0.966482937335968, "learning_rate": 0.0001981470100863416, "loss": 1.3029, "step": 3195 }, { "epoch": 0.12319538017324351, "grad_norm": 1.13119375705719, "learning_rate": 0.00019814121122641894, "loss": 1.3431, "step": 3200 }, { "epoch": 0.1233878729547642, "grad_norm": 1.0690468549728394, "learning_rate": 0.00019813540339213263, "loss": 1.237, "step": 3205 }, { "epoch": 0.12358036573628489, "grad_norm": 1.169592022895813, "learning_rate": 0.00019812958658401382, "loss": 1.3341, "step": 3210 }, { "epoch": 0.12377285851780558, "grad_norm": 0.9310591816902161, "learning_rate": 0.00019812376080259435, "loss": 1.3168, "step": 3215 }, { "epoch": 0.12396535129932627, "grad_norm": 1.1262513399124146, "learning_rate": 0.00019811792604840694, "loss": 1.322, "step": 3220 }, { "epoch": 0.12415784408084697, "grad_norm": 1.0723376274108887, "learning_rate": 0.00019811208232198518, "loss": 1.2814, "step": 3225 }, { "epoch": 
0.12435033686236766, "grad_norm": 1.5084266662597656, "learning_rate": 0.00019810622962386344, "loss": 1.3136, "step": 3230 }, { "epoch": 0.12454282964388835, "grad_norm": 1.5219266414642334, "learning_rate": 0.0001981003679545769, "loss": 1.2971, "step": 3235 }, { "epoch": 0.12473532242540905, "grad_norm": 1.8135708570480347, "learning_rate": 0.00019809449731466154, "loss": 1.3987, "step": 3240 }, { "epoch": 0.12492781520692973, "grad_norm": 1.9838290214538574, "learning_rate": 0.00019808861770465424, "loss": 1.4063, "step": 3245 }, { "epoch": 0.12512030798845045, "grad_norm": 0.9821895956993103, "learning_rate": 0.00019808272912509258, "loss": 1.4336, "step": 3250 }, { "epoch": 0.12531280076997112, "grad_norm": 1.0371532440185547, "learning_rate": 0.00019807683157651513, "loss": 1.4659, "step": 3255 }, { "epoch": 0.12550529355149181, "grad_norm": 1.2441003322601318, "learning_rate": 0.0001980709250594611, "loss": 1.3807, "step": 3260 }, { "epoch": 0.1256977863330125, "grad_norm": 1.6097456216812134, "learning_rate": 0.00019806500957447067, "loss": 1.4115, "step": 3265 }, { "epoch": 0.1258902791145332, "grad_norm": 1.4005634784698486, "learning_rate": 0.0001980590851220847, "loss": 1.6008, "step": 3270 }, { "epoch": 0.1260827718960539, "grad_norm": 1.1883544921875, "learning_rate": 0.00019805315170284498, "loss": 1.3768, "step": 3275 }, { "epoch": 0.12627526467757458, "grad_norm": 1.2404242753982544, "learning_rate": 0.00019804720931729413, "loss": 1.463, "step": 3280 }, { "epoch": 0.12646775745909528, "grad_norm": 0.625027596950531, "learning_rate": 0.00019804125796597544, "loss": 1.3286, "step": 3285 }, { "epoch": 0.12666025024061597, "grad_norm": 1.5616633892059326, "learning_rate": 0.0001980352976494332, "loss": 1.4161, "step": 3290 }, { "epoch": 0.12685274302213667, "grad_norm": 0.8003360629081726, "learning_rate": 0.0001980293283682124, "loss": 1.4117, "step": 3295 }, { "epoch": 0.12704523580365737, "grad_norm": 1.0671011209487915, "learning_rate": 0.0001980233501228589, "loss": 1.4192, "step": 3300 }, { "epoch": 0.12723772858517807, "grad_norm": 1.4135669469833374, "learning_rate": 0.0001980173629139194, "loss": 1.3046, "step": 3305 }, { "epoch": 0.12743022136669874, "grad_norm": 1.0450470447540283, "learning_rate": 0.00019801136674194134, "loss": 1.4156, "step": 3310 }, { "epoch": 0.12762271414821943, "grad_norm": 1.1435261964797974, "learning_rate": 0.00019800536160747306, "loss": 1.2311, "step": 3315 }, { "epoch": 0.12781520692974013, "grad_norm": 1.5508229732513428, "learning_rate": 0.0001979993475110637, "loss": 1.4224, "step": 3320 }, { "epoch": 0.12800769971126083, "grad_norm": 0.9542085528373718, "learning_rate": 0.0001979933244532632, "loss": 1.2423, "step": 3325 }, { "epoch": 0.12820019249278153, "grad_norm": 1.5797593593597412, "learning_rate": 0.0001979872924346223, "loss": 1.3357, "step": 3330 }, { "epoch": 0.12839268527430223, "grad_norm": 1.0982688665390015, "learning_rate": 0.00019798125145569263, "loss": 1.2404, "step": 3335 }, { "epoch": 0.1285851780558229, "grad_norm": 1.5471248626708984, "learning_rate": 0.0001979752015170266, "loss": 1.3556, "step": 3340 }, { "epoch": 0.1287776708373436, "grad_norm": 1.64442777633667, "learning_rate": 0.0001979691426191774, "loss": 1.3407, "step": 3345 }, { "epoch": 0.1289701636188643, "grad_norm": 1.494186520576477, "learning_rate": 0.0001979630747626991, "loss": 1.4509, "step": 3350 }, { "epoch": 0.129162656400385, "grad_norm": 0.9598186612129211, "learning_rate": 0.00019795699794814654, "loss": 1.3221, "step": 3355 }, { 
"epoch": 0.1293551491819057, "grad_norm": 1.1328315734863281, "learning_rate": 0.00019795091217607544, "loss": 1.5129, "step": 3360 }, { "epoch": 0.12954764196342639, "grad_norm": 1.0476043224334717, "learning_rate": 0.00019794481744704227, "loss": 1.3448, "step": 3365 }, { "epoch": 0.12974013474494706, "grad_norm": 1.2570463418960571, "learning_rate": 0.0001979387137616044, "loss": 1.2726, "step": 3370 }, { "epoch": 0.12993262752646775, "grad_norm": 1.395627498626709, "learning_rate": 0.00019793260112031992, "loss": 1.1469, "step": 3375 }, { "epoch": 0.13012512030798845, "grad_norm": 2.2382960319519043, "learning_rate": 0.00019792647952374782, "loss": 1.3375, "step": 3380 }, { "epoch": 0.13031761308950915, "grad_norm": 1.4930087327957153, "learning_rate": 0.00019792034897244784, "loss": 1.3684, "step": 3385 }, { "epoch": 0.13051010587102985, "grad_norm": 0.9732452034950256, "learning_rate": 0.00019791420946698064, "loss": 1.0792, "step": 3390 }, { "epoch": 0.13070259865255052, "grad_norm": 1.9484987258911133, "learning_rate": 0.0001979080610079076, "loss": 1.4284, "step": 3395 }, { "epoch": 0.13089509143407121, "grad_norm": 1.3746837377548218, "learning_rate": 0.00019790190359579097, "loss": 1.4393, "step": 3400 }, { "epoch": 0.1310875842155919, "grad_norm": 1.2191319465637207, "learning_rate": 0.0001978957372311938, "loss": 1.2184, "step": 3405 }, { "epoch": 0.1312800769971126, "grad_norm": 1.0825196504592896, "learning_rate": 0.00019788956191467994, "loss": 1.3891, "step": 3410 }, { "epoch": 0.1314725697786333, "grad_norm": 1.9972898960113525, "learning_rate": 0.00019788337764681412, "loss": 1.3207, "step": 3415 }, { "epoch": 0.131665062560154, "grad_norm": 1.3864003419876099, "learning_rate": 0.00019787718442816182, "loss": 1.3791, "step": 3420 }, { "epoch": 0.13185755534167468, "grad_norm": 1.3315006494522095, "learning_rate": 0.0001978709822592894, "loss": 1.4253, "step": 3425 }, { "epoch": 0.13205004812319537, "grad_norm": 1.0171843767166138, "learning_rate": 0.00019786477114076397, "loss": 1.2974, "step": 3430 }, { "epoch": 0.13224254090471607, "grad_norm": 1.293380618095398, "learning_rate": 0.00019785855107315353, "loss": 1.3616, "step": 3435 }, { "epoch": 0.13243503368623677, "grad_norm": 2.0498528480529785, "learning_rate": 0.00019785232205702681, "loss": 1.3431, "step": 3440 }, { "epoch": 0.13262752646775747, "grad_norm": 0.8635803461074829, "learning_rate": 0.0001978460840929535, "loss": 1.3672, "step": 3445 }, { "epoch": 0.13282001924927817, "grad_norm": 0.9983857274055481, "learning_rate": 0.00019783983718150392, "loss": 1.4856, "step": 3450 }, { "epoch": 0.13301251203079884, "grad_norm": 4.542407989501953, "learning_rate": 0.00019783358132324937, "loss": 1.4599, "step": 3455 }, { "epoch": 0.13320500481231953, "grad_norm": 1.5495860576629639, "learning_rate": 0.00019782731651876194, "loss": 1.3641, "step": 3460 }, { "epoch": 0.13339749759384023, "grad_norm": 1.2070780992507935, "learning_rate": 0.00019782104276861443, "loss": 1.3596, "step": 3465 }, { "epoch": 0.13358999037536093, "grad_norm": 1.1749752759933472, "learning_rate": 0.00019781476007338058, "loss": 1.2387, "step": 3470 }, { "epoch": 0.13378248315688163, "grad_norm": 1.8580079078674316, "learning_rate": 0.00019780846843363485, "loss": 1.3966, "step": 3475 }, { "epoch": 0.1339749759384023, "grad_norm": 1.9713795185089111, "learning_rate": 0.00019780216784995265, "loss": 1.2541, "step": 3480 }, { "epoch": 0.134167468719923, "grad_norm": 1.4017597436904907, "learning_rate": 0.00019779585832291002, "loss": 1.4827, 
"step": 3485 }, { "epoch": 0.1343599615014437, "grad_norm": 1.188761591911316, "learning_rate": 0.00019778953985308406, "loss": 1.3972, "step": 3490 }, { "epoch": 0.1345524542829644, "grad_norm": 1.0930372476577759, "learning_rate": 0.00019778321244105242, "loss": 1.4706, "step": 3495 }, { "epoch": 0.1347449470644851, "grad_norm": 1.3041532039642334, "learning_rate": 0.0001977768760873938, "loss": 1.1929, "step": 3500 }, { "epoch": 0.13493743984600579, "grad_norm": 2.6741833686828613, "learning_rate": 0.00019777053079268753, "loss": 1.268, "step": 3505 }, { "epoch": 0.13512993262752646, "grad_norm": 1.091823935508728, "learning_rate": 0.0001977641765575139, "loss": 1.2776, "step": 3510 }, { "epoch": 0.13532242540904715, "grad_norm": 0.9205764532089233, "learning_rate": 0.00019775781338245398, "loss": 1.3007, "step": 3515 }, { "epoch": 0.13551491819056785, "grad_norm": 1.6321576833724976, "learning_rate": 0.00019775144126808958, "loss": 1.4214, "step": 3520 }, { "epoch": 0.13570741097208855, "grad_norm": 1.7947146892547607, "learning_rate": 0.00019774506021500343, "loss": 1.3895, "step": 3525 }, { "epoch": 0.13589990375360925, "grad_norm": 1.6696717739105225, "learning_rate": 0.00019773867022377902, "loss": 1.3968, "step": 3530 }, { "epoch": 0.13609239653512994, "grad_norm": 1.1003444194793701, "learning_rate": 0.0001977322712950007, "loss": 1.4084, "step": 3535 }, { "epoch": 0.13628488931665061, "grad_norm": 1.0268352031707764, "learning_rate": 0.00019772586342925357, "loss": 1.254, "step": 3540 }, { "epoch": 0.1364773820981713, "grad_norm": 1.3906810283660889, "learning_rate": 0.0001977194466271236, "loss": 1.3266, "step": 3545 }, { "epoch": 0.136669874879692, "grad_norm": 1.1786664724349976, "learning_rate": 0.00019771302088919757, "loss": 1.3114, "step": 3550 }, { "epoch": 0.1368623676612127, "grad_norm": 1.0252714157104492, "learning_rate": 0.00019770658621606307, "loss": 1.2089, "step": 3555 }, { "epoch": 0.1370548604427334, "grad_norm": 0.8099033236503601, "learning_rate": 0.00019770014260830853, "loss": 1.2607, "step": 3560 }, { "epoch": 0.1372473532242541, "grad_norm": 1.3679542541503906, "learning_rate": 0.0001976936900665231, "loss": 1.376, "step": 3565 }, { "epoch": 0.13743984600577477, "grad_norm": 1.7685283422470093, "learning_rate": 0.00019768722859129693, "loss": 1.4522, "step": 3570 }, { "epoch": 0.13763233878729547, "grad_norm": 1.0158277750015259, "learning_rate": 0.00019768075818322081, "loss": 1.2714, "step": 3575 }, { "epoch": 0.13782483156881617, "grad_norm": 1.7043020725250244, "learning_rate": 0.00019767427884288642, "loss": 1.5669, "step": 3580 }, { "epoch": 0.13801732435033687, "grad_norm": 1.8171344995498657, "learning_rate": 0.00019766779057088627, "loss": 1.4186, "step": 3585 }, { "epoch": 0.13820981713185757, "grad_norm": 1.0524088144302368, "learning_rate": 0.00019766129336781365, "loss": 1.167, "step": 3590 }, { "epoch": 0.13840230991337824, "grad_norm": 1.558383584022522, "learning_rate": 0.0001976547872342627, "loss": 1.5015, "step": 3595 }, { "epoch": 0.13859480269489893, "grad_norm": 1.9925919771194458, "learning_rate": 0.00019764827217082838, "loss": 1.3661, "step": 3600 }, { "epoch": 0.13878729547641963, "grad_norm": 1.5693559646606445, "learning_rate": 0.0001976417481781064, "loss": 1.3389, "step": 3605 }, { "epoch": 0.13897978825794033, "grad_norm": 1.2609871625900269, "learning_rate": 0.00019763521525669343, "loss": 1.2883, "step": 3610 }, { "epoch": 0.13917228103946103, "grad_norm": 1.4910306930541992, "learning_rate": 0.00019762867340718674, 
"loss": 1.4237, "step": 3615 }, { "epoch": 0.13936477382098172, "grad_norm": 0.9409481287002563, "learning_rate": 0.0001976221226301846, "loss": 1.4289, "step": 3620 }, { "epoch": 0.1395572666025024, "grad_norm": 0.9263445138931274, "learning_rate": 0.00019761556292628604, "loss": 1.2987, "step": 3625 }, { "epoch": 0.1397497593840231, "grad_norm": 0.9329832792282104, "learning_rate": 0.0001976089942960909, "loss": 1.3709, "step": 3630 }, { "epoch": 0.1399422521655438, "grad_norm": 1.7852829694747925, "learning_rate": 0.00019760241674019984, "loss": 1.2282, "step": 3635 }, { "epoch": 0.1401347449470645, "grad_norm": 1.0068609714508057, "learning_rate": 0.0001975958302592143, "loss": 1.3143, "step": 3640 }, { "epoch": 0.14032723772858519, "grad_norm": 2.1680188179016113, "learning_rate": 0.0001975892348537366, "loss": 1.4447, "step": 3645 }, { "epoch": 0.14051973051010588, "grad_norm": 1.633169412612915, "learning_rate": 0.00019758263052436988, "loss": 1.2633, "step": 3650 }, { "epoch": 0.14071222329162655, "grad_norm": 1.3609623908996582, "learning_rate": 0.000197576017271718, "loss": 1.3352, "step": 3655 }, { "epoch": 0.14090471607314725, "grad_norm": 1.50294828414917, "learning_rate": 0.00019756939509638573, "loss": 1.3557, "step": 3660 }, { "epoch": 0.14109720885466795, "grad_norm": 0.9931232333183289, "learning_rate": 0.0001975627639989786, "loss": 1.4719, "step": 3665 }, { "epoch": 0.14128970163618865, "grad_norm": 1.3870011568069458, "learning_rate": 0.000197556123980103, "loss": 1.5173, "step": 3670 }, { "epoch": 0.14148219441770934, "grad_norm": 1.274064540863037, "learning_rate": 0.00019754947504036608, "loss": 1.3951, "step": 3675 }, { "epoch": 0.14167468719923004, "grad_norm": 1.6096014976501465, "learning_rate": 0.00019754281718037593, "loss": 1.4478, "step": 3680 }, { "epoch": 0.1418671799807507, "grad_norm": 1.155772089958191, "learning_rate": 0.00019753615040074131, "loss": 1.229, "step": 3685 }, { "epoch": 0.1420596727622714, "grad_norm": 1.123856544494629, "learning_rate": 0.0001975294747020718, "loss": 1.5036, "step": 3690 }, { "epoch": 0.1422521655437921, "grad_norm": 1.541308879852295, "learning_rate": 0.00019752279008497796, "loss": 1.1174, "step": 3695 }, { "epoch": 0.1424446583253128, "grad_norm": 1.8912441730499268, "learning_rate": 0.00019751609655007098, "loss": 1.3753, "step": 3700 }, { "epoch": 0.1426371511068335, "grad_norm": 1.7746648788452148, "learning_rate": 0.00019750939409796293, "loss": 1.3115, "step": 3705 }, { "epoch": 0.14282964388835417, "grad_norm": 1.2228045463562012, "learning_rate": 0.00019750268272926676, "loss": 1.3477, "step": 3710 }, { "epoch": 0.14302213666987487, "grad_norm": 1.5031695365905762, "learning_rate": 0.00019749596244459614, "loss": 1.1905, "step": 3715 }, { "epoch": 0.14321462945139557, "grad_norm": 2.871879816055298, "learning_rate": 0.0001974892332445656, "loss": 1.3334, "step": 3720 }, { "epoch": 0.14340712223291627, "grad_norm": 1.1911511421203613, "learning_rate": 0.00019748249512979048, "loss": 1.2528, "step": 3725 }, { "epoch": 0.14359961501443697, "grad_norm": 1.2722115516662598, "learning_rate": 0.00019747574810088697, "loss": 1.3314, "step": 3730 }, { "epoch": 0.14379210779595766, "grad_norm": 1.0464539527893066, "learning_rate": 0.00019746899215847198, "loss": 1.1621, "step": 3735 }, { "epoch": 0.14398460057747833, "grad_norm": 1.8877158164978027, "learning_rate": 0.00019746222730316338, "loss": 1.2534, "step": 3740 }, { "epoch": 0.14417709335899903, "grad_norm": 1.5137780904769897, "learning_rate": 
0.00019745545353557967, "loss": 1.1738, "step": 3745 }, { "epoch": 0.14436958614051973, "grad_norm": 1.7104227542877197, "learning_rate": 0.00019744867085634034, "loss": 1.2868, "step": 3750 }, { "epoch": 0.14456207892204043, "grad_norm": 1.2920212745666504, "learning_rate": 0.00019744187926606558, "loss": 1.3054, "step": 3755 }, { "epoch": 0.14475457170356112, "grad_norm": 2.3661959171295166, "learning_rate": 0.00019743507876537647, "loss": 1.3187, "step": 3760 }, { "epoch": 0.14494706448508182, "grad_norm": 1.4622807502746582, "learning_rate": 0.00019742826935489487, "loss": 1.1548, "step": 3765 }, { "epoch": 0.1451395572666025, "grad_norm": 1.7818437814712524, "learning_rate": 0.00019742145103524342, "loss": 1.4081, "step": 3770 }, { "epoch": 0.1453320500481232, "grad_norm": 1.023716926574707, "learning_rate": 0.00019741462380704566, "loss": 1.3367, "step": 3775 }, { "epoch": 0.1455245428296439, "grad_norm": 1.4382961988449097, "learning_rate": 0.00019740778767092585, "loss": 1.3498, "step": 3780 }, { "epoch": 0.14571703561116459, "grad_norm": 1.5282870531082153, "learning_rate": 0.0001974009426275091, "loss": 1.2685, "step": 3785 }, { "epoch": 0.14590952839268528, "grad_norm": 1.2222365140914917, "learning_rate": 0.0001973940886774214, "loss": 1.2273, "step": 3790 }, { "epoch": 0.14610202117420595, "grad_norm": 1.3231360912322998, "learning_rate": 0.00019738722582128944, "loss": 1.5449, "step": 3795 }, { "epoch": 0.14629451395572665, "grad_norm": 1.2198995351791382, "learning_rate": 0.00019738035405974085, "loss": 1.4927, "step": 3800 }, { "epoch": 0.14648700673724735, "grad_norm": 1.1108288764953613, "learning_rate": 0.00019737347339340394, "loss": 1.3894, "step": 3805 }, { "epoch": 0.14667949951876805, "grad_norm": 1.1478091478347778, "learning_rate": 0.0001973665838229079, "loss": 1.342, "step": 3810 }, { "epoch": 0.14687199230028875, "grad_norm": 1.555680751800537, "learning_rate": 0.0001973596853488828, "loss": 1.269, "step": 3815 }, { "epoch": 0.14706448508180944, "grad_norm": 1.2819339036941528, "learning_rate": 0.0001973527779719594, "loss": 1.3462, "step": 3820 }, { "epoch": 0.1472569778633301, "grad_norm": 1.6733057498931885, "learning_rate": 0.00019734586169276939, "loss": 1.3179, "step": 3825 }, { "epoch": 0.1474494706448508, "grad_norm": 1.8622225522994995, "learning_rate": 0.00019733893651194517, "loss": 1.452, "step": 3830 }, { "epoch": 0.1476419634263715, "grad_norm": 1.2225052118301392, "learning_rate": 0.00019733200243012006, "loss": 1.2925, "step": 3835 }, { "epoch": 0.1478344562078922, "grad_norm": 0.7980884313583374, "learning_rate": 0.00019732505944792804, "loss": 1.1505, "step": 3840 }, { "epoch": 0.1480269489894129, "grad_norm": 1.3874131441116333, "learning_rate": 0.00019731810756600405, "loss": 1.2989, "step": 3845 }, { "epoch": 0.1482194417709336, "grad_norm": 1.4387590885162354, "learning_rate": 0.00019731114678498378, "loss": 1.3295, "step": 3850 }, { "epoch": 0.14841193455245427, "grad_norm": 1.8189646005630493, "learning_rate": 0.00019730417710550383, "loss": 1.2926, "step": 3855 }, { "epoch": 0.14860442733397497, "grad_norm": 0.9577664732933044, "learning_rate": 0.0001972971985282014, "loss": 1.2375, "step": 3860 }, { "epoch": 0.14879692011549567, "grad_norm": 1.7154825925827026, "learning_rate": 0.00019729021105371474, "loss": 1.2853, "step": 3865 }, { "epoch": 0.14898941289701637, "grad_norm": 2.1061089038848877, "learning_rate": 0.00019728321468268277, "loss": 1.3391, "step": 3870 }, { "epoch": 0.14918190567853706, "grad_norm": 1.0177017450332642, 
"learning_rate": 0.00019727620941574524, "loss": 1.2801, "step": 3875 }, { "epoch": 0.14937439846005776, "grad_norm": 1.0773547887802124, "learning_rate": 0.00019726919525354277, "loss": 1.3063, "step": 3880 }, { "epoch": 0.14956689124157843, "grad_norm": 0.9082854986190796, "learning_rate": 0.00019726217219671673, "loss": 1.3601, "step": 3885 }, { "epoch": 0.14975938402309913, "grad_norm": 1.341280221939087, "learning_rate": 0.00019725514024590934, "loss": 1.4052, "step": 3890 }, { "epoch": 0.14995187680461983, "grad_norm": 2.240399122238159, "learning_rate": 0.00019724809940176364, "loss": 1.1955, "step": 3895 }, { "epoch": 0.15014436958614052, "grad_norm": 1.549137830734253, "learning_rate": 0.00019724104966492348, "loss": 1.3089, "step": 3900 }, { "epoch": 0.15033686236766122, "grad_norm": 1.6887294054031372, "learning_rate": 0.00019723399103603346, "loss": 1.4147, "step": 3905 }, { "epoch": 0.1505293551491819, "grad_norm": 1.793087363243103, "learning_rate": 0.0001972269235157391, "loss": 1.2674, "step": 3910 }, { "epoch": 0.1507218479307026, "grad_norm": 1.718336820602417, "learning_rate": 0.00019721984710468663, "loss": 1.2716, "step": 3915 }, { "epoch": 0.1509143407122233, "grad_norm": 2.2342288494110107, "learning_rate": 0.0001972127618035232, "loss": 0.965, "step": 3920 }, { "epoch": 0.15110683349374399, "grad_norm": 1.5450822114944458, "learning_rate": 0.00019720566761289665, "loss": 1.3461, "step": 3925 }, { "epoch": 0.15129932627526468, "grad_norm": 1.4395346641540527, "learning_rate": 0.0001971985645334557, "loss": 1.3462, "step": 3930 }, { "epoch": 0.15149181905678538, "grad_norm": 1.1160500049591064, "learning_rate": 0.00019719145256584994, "loss": 1.3334, "step": 3935 }, { "epoch": 0.15168431183830605, "grad_norm": 1.0270999670028687, "learning_rate": 0.00019718433171072967, "loss": 1.2737, "step": 3940 }, { "epoch": 0.15187680461982675, "grad_norm": 1.4266023635864258, "learning_rate": 0.00019717720196874608, "loss": 1.3639, "step": 3945 }, { "epoch": 0.15206929740134745, "grad_norm": 1.552283525466919, "learning_rate": 0.00019717006334055108, "loss": 1.301, "step": 3950 }, { "epoch": 0.15226179018286815, "grad_norm": 1.5459437370300293, "learning_rate": 0.0001971629158267975, "loss": 1.265, "step": 3955 }, { "epoch": 0.15245428296438884, "grad_norm": 1.4866915941238403, "learning_rate": 0.00019715575942813888, "loss": 1.5694, "step": 3960 }, { "epoch": 0.15264677574590954, "grad_norm": 1.1116254329681396, "learning_rate": 0.00019714859414522967, "loss": 1.4858, "step": 3965 }, { "epoch": 0.1528392685274302, "grad_norm": 1.1708245277404785, "learning_rate": 0.0001971414199787251, "loss": 1.3582, "step": 3970 }, { "epoch": 0.1530317613089509, "grad_norm": 1.1672711372375488, "learning_rate": 0.00019713423692928114, "loss": 1.3393, "step": 3975 }, { "epoch": 0.1532242540904716, "grad_norm": 1.4800153970718384, "learning_rate": 0.0001971270449975547, "loss": 1.22, "step": 3980 }, { "epoch": 0.1534167468719923, "grad_norm": 1.92826509475708, "learning_rate": 0.00019711984418420338, "loss": 1.3902, "step": 3985 }, { "epoch": 0.153609239653513, "grad_norm": 1.2292252779006958, "learning_rate": 0.00019711263448988567, "loss": 1.2327, "step": 3990 }, { "epoch": 0.1538017324350337, "grad_norm": 1.1007169485092163, "learning_rate": 0.00019710541591526085, "loss": 1.4284, "step": 3995 }, { "epoch": 0.15399422521655437, "grad_norm": 0.9456301927566528, "learning_rate": 0.00019709818846098905, "loss": 1.1589, "step": 4000 }, { "epoch": 0.15418671799807507, "grad_norm": 
1.518704891204834, "learning_rate": 0.0001970909521277311, "loss": 1.3976, "step": 4005 }, { "epoch": 0.15437921077959577, "grad_norm": 1.3318589925765991, "learning_rate": 0.00019708370691614872, "loss": 1.3635, "step": 4010 }, { "epoch": 0.15457170356111646, "grad_norm": 1.752626657485962, "learning_rate": 0.0001970764528269045, "loss": 1.3175, "step": 4015 }, { "epoch": 0.15476419634263716, "grad_norm": 2.055469512939453, "learning_rate": 0.00019706918986066172, "loss": 1.2873, "step": 4020 }, { "epoch": 0.15495668912415783, "grad_norm": 2.1063289642333984, "learning_rate": 0.00019706191801808457, "loss": 1.3208, "step": 4025 }, { "epoch": 0.15514918190567853, "grad_norm": 1.2449209690093994, "learning_rate": 0.00019705463729983798, "loss": 1.2863, "step": 4030 }, { "epoch": 0.15534167468719923, "grad_norm": 1.4950852394104004, "learning_rate": 0.00019704734770658778, "loss": 1.2338, "step": 4035 }, { "epoch": 0.15553416746871992, "grad_norm": 0.9372254014015198, "learning_rate": 0.00019704004923900046, "loss": 1.2105, "step": 4040 }, { "epoch": 0.15572666025024062, "grad_norm": 1.2273038625717163, "learning_rate": 0.00019703274189774347, "loss": 1.3584, "step": 4045 }, { "epoch": 0.15591915303176132, "grad_norm": 1.1560612916946411, "learning_rate": 0.00019702542568348502, "loss": 1.432, "step": 4050 }, { "epoch": 0.156111645813282, "grad_norm": 1.2214939594268799, "learning_rate": 0.00019701810059689415, "loss": 1.3237, "step": 4055 }, { "epoch": 0.1563041385948027, "grad_norm": 1.255182147026062, "learning_rate": 0.00019701076663864066, "loss": 1.5111, "step": 4060 }, { "epoch": 0.1564966313763234, "grad_norm": 1.2496423721313477, "learning_rate": 0.0001970034238093952, "loss": 1.3917, "step": 4065 }, { "epoch": 0.15668912415784408, "grad_norm": 2.773935556411743, "learning_rate": 0.00019699607210982918, "loss": 1.3072, "step": 4070 }, { "epoch": 0.15688161693936478, "grad_norm": 2.5853006839752197, "learning_rate": 0.00019698871154061497, "loss": 1.2737, "step": 4075 }, { "epoch": 0.15707410972088548, "grad_norm": 0.9573465585708618, "learning_rate": 0.00019698134210242553, "loss": 1.411, "step": 4080 }, { "epoch": 0.15726660250240615, "grad_norm": 2.204242467880249, "learning_rate": 0.00019697396379593482, "loss": 1.2493, "step": 4085 }, { "epoch": 0.15745909528392685, "grad_norm": 1.4688855409622192, "learning_rate": 0.0001969665766218175, "loss": 1.273, "step": 4090 }, { "epoch": 0.15765158806544755, "grad_norm": 2.1439919471740723, "learning_rate": 0.0001969591805807491, "loss": 1.4691, "step": 4095 }, { "epoch": 0.15784408084696824, "grad_norm": 1.4877434968948364, "learning_rate": 0.00019695177567340594, "loss": 1.4427, "step": 4100 }, { "epoch": 0.15803657362848894, "grad_norm": 1.3709458112716675, "learning_rate": 0.00019694436190046514, "loss": 1.2713, "step": 4105 }, { "epoch": 0.1582290664100096, "grad_norm": 2.1676931381225586, "learning_rate": 0.00019693693926260464, "loss": 1.1888, "step": 4110 }, { "epoch": 0.1584215591915303, "grad_norm": 1.1726205348968506, "learning_rate": 0.0001969295077605032, "loss": 1.3544, "step": 4115 }, { "epoch": 0.158614051973051, "grad_norm": 1.2441811561584473, "learning_rate": 0.00019692206739484037, "loss": 1.4796, "step": 4120 }, { "epoch": 0.1588065447545717, "grad_norm": 1.4889960289001465, "learning_rate": 0.00019691461816629652, "loss": 1.418, "step": 4125 }, { "epoch": 0.1589990375360924, "grad_norm": 1.3810794353485107, "learning_rate": 0.00019690716007555282, "loss": 1.6398, "step": 4130 }, { "epoch": 0.1591915303176131, 
"grad_norm": 1.589390754699707, "learning_rate": 0.00019689969312329132, "loss": 1.3203, "step": 4135 }, { "epoch": 0.15938402309913377, "grad_norm": 0.8731974959373474, "learning_rate": 0.00019689221731019477, "loss": 1.2408, "step": 4140 }, { "epoch": 0.15957651588065447, "grad_norm": 1.046852707862854, "learning_rate": 0.00019688473263694678, "loss": 1.1249, "step": 4145 }, { "epoch": 0.15976900866217517, "grad_norm": 0.8767102360725403, "learning_rate": 0.0001968772391042318, "loss": 1.2611, "step": 4150 }, { "epoch": 0.15996150144369586, "grad_norm": 1.1452685594558716, "learning_rate": 0.0001968697367127351, "loss": 1.2992, "step": 4155 }, { "epoch": 0.16015399422521656, "grad_norm": 0.9254185557365417, "learning_rate": 0.00019686222546314266, "loss": 1.3894, "step": 4160 }, { "epoch": 0.16034648700673726, "grad_norm": 0.9607768654823303, "learning_rate": 0.00019685470535614133, "loss": 1.3076, "step": 4165 }, { "epoch": 0.16053897978825793, "grad_norm": 1.2880384922027588, "learning_rate": 0.0001968471763924188, "loss": 1.3868, "step": 4170 }, { "epoch": 0.16073147256977863, "grad_norm": 1.1116464138031006, "learning_rate": 0.00019683963857266356, "loss": 1.2489, "step": 4175 }, { "epoch": 0.16092396535129933, "grad_norm": 0.9132522940635681, "learning_rate": 0.0001968320918975649, "loss": 1.3788, "step": 4180 }, { "epoch": 0.16111645813282002, "grad_norm": 1.1793001890182495, "learning_rate": 0.00019682453636781283, "loss": 1.4742, "step": 4185 }, { "epoch": 0.16130895091434072, "grad_norm": 1.1624877452850342, "learning_rate": 0.00019681697198409835, "loss": 1.3547, "step": 4190 }, { "epoch": 0.16150144369586142, "grad_norm": 1.1367181539535522, "learning_rate": 0.00019680939874711312, "loss": 1.3692, "step": 4195 }, { "epoch": 0.1616939364773821, "grad_norm": 1.0168886184692383, "learning_rate": 0.00019680181665754972, "loss": 1.4148, "step": 4200 }, { "epoch": 0.1618864292589028, "grad_norm": 1.3179705142974854, "learning_rate": 0.0001967942257161014, "loss": 1.2674, "step": 4205 }, { "epoch": 0.16207892204042348, "grad_norm": 0.8679062724113464, "learning_rate": 0.00019678662592346235, "loss": 1.4001, "step": 4210 }, { "epoch": 0.16227141482194418, "grad_norm": 0.8477693200111389, "learning_rate": 0.00019677901728032754, "loss": 1.3527, "step": 4215 }, { "epoch": 0.16246390760346488, "grad_norm": 1.280357003211975, "learning_rate": 0.00019677139978739266, "loss": 1.2576, "step": 4220 }, { "epoch": 0.16265640038498555, "grad_norm": 3.5572381019592285, "learning_rate": 0.00019676377344535434, "loss": 1.3059, "step": 4225 }, { "epoch": 0.16284889316650625, "grad_norm": 0.9162838459014893, "learning_rate": 0.0001967561382549099, "loss": 1.3655, "step": 4230 }, { "epoch": 0.16304138594802695, "grad_norm": 1.0635076761245728, "learning_rate": 0.00019674849421675764, "loss": 1.2356, "step": 4235 }, { "epoch": 0.16323387872954764, "grad_norm": 2.3638720512390137, "learning_rate": 0.00019674084133159642, "loss": 1.3598, "step": 4240 }, { "epoch": 0.16342637151106834, "grad_norm": 1.013108730316162, "learning_rate": 0.00019673317960012615, "loss": 1.6119, "step": 4245 }, { "epoch": 0.16361886429258904, "grad_norm": 1.391450047492981, "learning_rate": 0.00019672550902304737, "loss": 1.2481, "step": 4250 }, { "epoch": 0.1638113570741097, "grad_norm": 1.5574865341186523, "learning_rate": 0.00019671782960106157, "loss": 1.345, "step": 4255 }, { "epoch": 0.1640038498556304, "grad_norm": 1.8456825017929077, "learning_rate": 0.00019671014133487095, "loss": 1.3582, "step": 4260 }, { "epoch": 
0.1641963426371511, "grad_norm": 1.4087297916412354, "learning_rate": 0.00019670244422517855, "loss": 1.3162, "step": 4265 }, { "epoch": 0.1643888354186718, "grad_norm": 1.167403221130371, "learning_rate": 0.0001966947382726882, "loss": 1.3841, "step": 4270 }, { "epoch": 0.1645813282001925, "grad_norm": 1.3395906686782837, "learning_rate": 0.0001966870234781046, "loss": 1.1306, "step": 4275 }, { "epoch": 0.1647738209817132, "grad_norm": 0.8549813628196716, "learning_rate": 0.00019667929984213317, "loss": 1.3017, "step": 4280 }, { "epoch": 0.16496631376323387, "grad_norm": 0.8681890368461609, "learning_rate": 0.00019667156736548021, "loss": 1.2152, "step": 4285 }, { "epoch": 0.16515880654475457, "grad_norm": 1.8476097583770752, "learning_rate": 0.00019666382604885283, "loss": 1.2571, "step": 4290 }, { "epoch": 0.16535129932627526, "grad_norm": 1.6583194732666016, "learning_rate": 0.00019665607589295888, "loss": 1.3866, "step": 4295 }, { "epoch": 0.16554379210779596, "grad_norm": 1.6784121990203857, "learning_rate": 0.00019664831689850712, "loss": 1.2966, "step": 4300 }, { "epoch": 0.16573628488931666, "grad_norm": 1.5268521308898926, "learning_rate": 0.00019664054906620696, "loss": 1.3086, "step": 4305 }, { "epoch": 0.16592877767083736, "grad_norm": 2.0114951133728027, "learning_rate": 0.00019663277239676877, "loss": 1.2137, "step": 4310 }, { "epoch": 0.16612127045235803, "grad_norm": 1.4572757482528687, "learning_rate": 0.00019662498689090372, "loss": 1.2505, "step": 4315 }, { "epoch": 0.16631376323387873, "grad_norm": 1.4267566204071045, "learning_rate": 0.00019661719254932369, "loss": 1.1485, "step": 4320 }, { "epoch": 0.16650625601539942, "grad_norm": 0.9921162128448486, "learning_rate": 0.00019660938937274142, "loss": 1.304, "step": 4325 }, { "epoch": 0.16669874879692012, "grad_norm": 1.3901869058609009, "learning_rate": 0.00019660157736187047, "loss": 1.4347, "step": 4330 }, { "epoch": 0.16689124157844082, "grad_norm": 1.5446443557739258, "learning_rate": 0.0001965937565174252, "loss": 1.3157, "step": 4335 }, { "epoch": 0.1670837343599615, "grad_norm": 1.2553350925445557, "learning_rate": 0.0001965859268401208, "loss": 1.1882, "step": 4340 }, { "epoch": 0.1672762271414822, "grad_norm": 1.9385195970535278, "learning_rate": 0.0001965780883306732, "loss": 1.4522, "step": 4345 }, { "epoch": 0.16746871992300288, "grad_norm": 1.426032543182373, "learning_rate": 0.00019657024098979916, "loss": 1.1029, "step": 4350 }, { "epoch": 0.16766121270452358, "grad_norm": 1.5562461614608765, "learning_rate": 0.0001965623848182163, "loss": 1.4837, "step": 4355 }, { "epoch": 0.16785370548604428, "grad_norm": 1.0057613849639893, "learning_rate": 0.00019655451981664306, "loss": 1.3095, "step": 4360 }, { "epoch": 0.16804619826756498, "grad_norm": 1.447845697402954, "learning_rate": 0.00019654664598579857, "loss": 1.4002, "step": 4365 }, { "epoch": 0.16823869104908565, "grad_norm": 0.9452415108680725, "learning_rate": 0.00019653876332640288, "loss": 1.3324, "step": 4370 }, { "epoch": 0.16843118383060635, "grad_norm": 1.7831186056137085, "learning_rate": 0.00019653087183917677, "loss": 1.3004, "step": 4375 }, { "epoch": 0.16862367661212704, "grad_norm": 1.0656229257583618, "learning_rate": 0.0001965229715248419, "loss": 1.5165, "step": 4380 }, { "epoch": 0.16881616939364774, "grad_norm": 1.0360915660858154, "learning_rate": 0.0001965150623841207, "loss": 1.2842, "step": 4385 }, { "epoch": 0.16900866217516844, "grad_norm": 1.286447525024414, "learning_rate": 0.00019650714441773643, "loss": 1.2902, "step": 4390 
}, { "epoch": 0.16920115495668914, "grad_norm": 1.2435790300369263, "learning_rate": 0.00019649921762641306, "loss": 1.3049, "step": 4395 }, { "epoch": 0.1693936477382098, "grad_norm": 1.9299678802490234, "learning_rate": 0.0001964912820108755, "loss": 1.3057, "step": 4400 }, { "epoch": 0.1695861405197305, "grad_norm": 1.7493208646774292, "learning_rate": 0.0001964833375718494, "loss": 1.3225, "step": 4405 }, { "epoch": 0.1697786333012512, "grad_norm": 1.3697878122329712, "learning_rate": 0.0001964753843100612, "loss": 1.3518, "step": 4410 }, { "epoch": 0.1699711260827719, "grad_norm": 1.343985676765442, "learning_rate": 0.0001964674222262382, "loss": 1.3195, "step": 4415 }, { "epoch": 0.1701636188642926, "grad_norm": 1.0094975233078003, "learning_rate": 0.00019645945132110853, "loss": 1.3184, "step": 4420 }, { "epoch": 0.17035611164581327, "grad_norm": 1.6048771142959595, "learning_rate": 0.00019645147159540096, "loss": 1.3307, "step": 4425 }, { "epoch": 0.17054860442733397, "grad_norm": 2.14099383354187, "learning_rate": 0.00019644348304984524, "loss": 1.3221, "step": 4430 }, { "epoch": 0.17074109720885466, "grad_norm": 2.5571303367614746, "learning_rate": 0.00019643548568517192, "loss": 1.3092, "step": 4435 }, { "epoch": 0.17093358999037536, "grad_norm": 1.1076972484588623, "learning_rate": 0.00019642747950211225, "loss": 1.1981, "step": 4440 }, { "epoch": 0.17112608277189606, "grad_norm": 1.1315946578979492, "learning_rate": 0.00019641946450139831, "loss": 1.335, "step": 4445 }, { "epoch": 0.17131857555341676, "grad_norm": 1.33171808719635, "learning_rate": 0.00019641144068376312, "loss": 1.4677, "step": 4450 }, { "epoch": 0.17151106833493743, "grad_norm": 0.87531977891922, "learning_rate": 0.0001964034080499403, "loss": 1.1795, "step": 4455 }, { "epoch": 0.17170356111645813, "grad_norm": 1.6923136711120605, "learning_rate": 0.00019639536660066446, "loss": 1.2491, "step": 4460 }, { "epoch": 0.17189605389797882, "grad_norm": 1.481703519821167, "learning_rate": 0.0001963873163366709, "loss": 1.2894, "step": 4465 }, { "epoch": 0.17208854667949952, "grad_norm": 3.3689515590667725, "learning_rate": 0.00019637925725869576, "loss": 1.3785, "step": 4470 }, { "epoch": 0.17228103946102022, "grad_norm": 2.498059034347534, "learning_rate": 0.000196371189367476, "loss": 1.2854, "step": 4475 }, { "epoch": 0.17247353224254092, "grad_norm": 1.2852959632873535, "learning_rate": 0.00019636311266374939, "loss": 1.2272, "step": 4480 }, { "epoch": 0.1726660250240616, "grad_norm": 0.9257192015647888, "learning_rate": 0.00019635502714825446, "loss": 1.1707, "step": 4485 }, { "epoch": 0.17285851780558228, "grad_norm": 0.989142656326294, "learning_rate": 0.00019634693282173058, "loss": 1.3174, "step": 4490 }, { "epoch": 0.17305101058710298, "grad_norm": 1.4923882484436035, "learning_rate": 0.00019633882968491794, "loss": 1.2334, "step": 4495 }, { "epoch": 0.17324350336862368, "grad_norm": 1.2684218883514404, "learning_rate": 0.0001963307177385575, "loss": 1.2468, "step": 4500 }, { "epoch": 0.17343599615014438, "grad_norm": 0.9474775791168213, "learning_rate": 0.0001963225969833911, "loss": 1.2767, "step": 4505 }, { "epoch": 0.17362848893166508, "grad_norm": 2.477541446685791, "learning_rate": 0.00019631446742016126, "loss": 1.4144, "step": 4510 }, { "epoch": 0.17382098171318575, "grad_norm": 1.040477991104126, "learning_rate": 0.00019630632904961138, "loss": 1.5665, "step": 4515 }, { "epoch": 0.17401347449470644, "grad_norm": 1.3127304315567017, "learning_rate": 0.0001962981818724857, "loss": 1.3511, "step": 
4520 }, { "epoch": 0.17420596727622714, "grad_norm": 1.6968106031417847, "learning_rate": 0.0001962900258895292, "loss": 1.3202, "step": 4525 }, { "epoch": 0.17439846005774784, "grad_norm": 2.2431318759918213, "learning_rate": 0.0001962818611014877, "loss": 1.351, "step": 4530 }, { "epoch": 0.17459095283926854, "grad_norm": 1.2938642501831055, "learning_rate": 0.00019627368750910779, "loss": 1.276, "step": 4535 }, { "epoch": 0.1747834456207892, "grad_norm": 1.1331931352615356, "learning_rate": 0.00019626550511313694, "loss": 1.4734, "step": 4540 }, { "epoch": 0.1749759384023099, "grad_norm": 1.4755507707595825, "learning_rate": 0.00019625731391432333, "loss": 1.24, "step": 4545 }, { "epoch": 0.1751684311838306, "grad_norm": 1.5442554950714111, "learning_rate": 0.00019624911391341604, "loss": 1.0894, "step": 4550 }, { "epoch": 0.1753609239653513, "grad_norm": 1.2970473766326904, "learning_rate": 0.00019624090511116481, "loss": 1.3262, "step": 4555 }, { "epoch": 0.175553416746872, "grad_norm": 2.1946523189544678, "learning_rate": 0.0001962326875083204, "loss": 1.4652, "step": 4560 }, { "epoch": 0.1757459095283927, "grad_norm": 1.1216411590576172, "learning_rate": 0.00019622446110563417, "loss": 1.1608, "step": 4565 }, { "epoch": 0.17593840230991337, "grad_norm": 1.996535301208496, "learning_rate": 0.00019621622590385842, "loss": 1.2568, "step": 4570 }, { "epoch": 0.17613089509143406, "grad_norm": 1.9742660522460938, "learning_rate": 0.0001962079819037462, "loss": 1.3335, "step": 4575 }, { "epoch": 0.17632338787295476, "grad_norm": 1.985192060470581, "learning_rate": 0.00019619972910605134, "loss": 1.3529, "step": 4580 }, { "epoch": 0.17651588065447546, "grad_norm": 0.8765020966529846, "learning_rate": 0.00019619146751152848, "loss": 1.3956, "step": 4585 }, { "epoch": 0.17670837343599616, "grad_norm": 1.483407974243164, "learning_rate": 0.00019618319712093319, "loss": 1.4396, "step": 4590 }, { "epoch": 0.17690086621751686, "grad_norm": 1.5663124322891235, "learning_rate": 0.00019617491793502164, "loss": 1.3896, "step": 4595 }, { "epoch": 0.17709335899903753, "grad_norm": 1.3831099271774292, "learning_rate": 0.00019616662995455096, "loss": 1.2669, "step": 4600 }, { "epoch": 0.17728585178055822, "grad_norm": 0.8688403964042664, "learning_rate": 0.00019615833318027898, "loss": 1.2098, "step": 4605 }, { "epoch": 0.17747834456207892, "grad_norm": 1.9218660593032837, "learning_rate": 0.00019615002761296446, "loss": 1.1568, "step": 4610 }, { "epoch": 0.17767083734359962, "grad_norm": 1.5095698833465576, "learning_rate": 0.00019614171325336684, "loss": 1.0516, "step": 4615 }, { "epoch": 0.17786333012512032, "grad_norm": 0.9288404583930969, "learning_rate": 0.00019613339010224646, "loss": 1.075, "step": 4620 }, { "epoch": 0.17805582290664101, "grad_norm": 1.414787769317627, "learning_rate": 0.00019612505816036434, "loss": 1.2158, "step": 4625 }, { "epoch": 0.17824831568816168, "grad_norm": 1.3182802200317383, "learning_rate": 0.0001961167174284824, "loss": 1.3719, "step": 4630 }, { "epoch": 0.17844080846968238, "grad_norm": 1.1671231985092163, "learning_rate": 0.0001961083679073634, "loss": 1.3067, "step": 4635 }, { "epoch": 0.17863330125120308, "grad_norm": 1.11225163936615, "learning_rate": 0.0001961000095977708, "loss": 1.1593, "step": 4640 }, { "epoch": 0.17882579403272378, "grad_norm": 1.235335111618042, "learning_rate": 0.00019609164250046894, "loss": 1.2232, "step": 4645 }, { "epoch": 0.17901828681424448, "grad_norm": 1.0023348331451416, "learning_rate": 0.00019608326661622291, "loss": 1.2926, 
"step": 4650 }, { "epoch": 0.17921077959576515, "grad_norm": 1.7143383026123047, "learning_rate": 0.00019607488194579867, "loss": 1.3149, "step": 4655 }, { "epoch": 0.17940327237728584, "grad_norm": 1.135324478149414, "learning_rate": 0.00019606648848996287, "loss": 1.4155, "step": 4660 }, { "epoch": 0.17959576515880654, "grad_norm": 0.7830592393875122, "learning_rate": 0.0001960580862494831, "loss": 1.2632, "step": 4665 }, { "epoch": 0.17978825794032724, "grad_norm": 1.546481966972351, "learning_rate": 0.0001960496752251277, "loss": 1.4674, "step": 4670 }, { "epoch": 0.17998075072184794, "grad_norm": 1.5377360582351685, "learning_rate": 0.00019604125541766574, "loss": 1.0782, "step": 4675 }, { "epoch": 0.18017324350336864, "grad_norm": 2.1382510662078857, "learning_rate": 0.0001960328268278672, "loss": 1.3008, "step": 4680 }, { "epoch": 0.1803657362848893, "grad_norm": 1.4963937997817993, "learning_rate": 0.00019602438945650277, "loss": 1.2601, "step": 4685 }, { "epoch": 0.18055822906641, "grad_norm": 1.4736862182617188, "learning_rate": 0.00019601594330434405, "loss": 1.163, "step": 4690 }, { "epoch": 0.1807507218479307, "grad_norm": 0.9905889630317688, "learning_rate": 0.00019600748837216337, "loss": 1.3675, "step": 4695 }, { "epoch": 0.1809432146294514, "grad_norm": 1.1800122261047363, "learning_rate": 0.00019599902466073385, "loss": 1.3252, "step": 4700 }, { "epoch": 0.1811357074109721, "grad_norm": 1.1933966875076294, "learning_rate": 0.00019599055217082949, "loss": 1.2163, "step": 4705 }, { "epoch": 0.1813282001924928, "grad_norm": 1.3980772495269775, "learning_rate": 0.000195982070903225, "loss": 1.2807, "step": 4710 }, { "epoch": 0.18152069297401346, "grad_norm": 2.541808605194092, "learning_rate": 0.00019597358085869594, "loss": 1.1333, "step": 4715 }, { "epoch": 0.18171318575553416, "grad_norm": 1.616479516029358, "learning_rate": 0.0001959650820380187, "loss": 1.2991, "step": 4720 }, { "epoch": 0.18190567853705486, "grad_norm": 0.9473749399185181, "learning_rate": 0.00019595657444197037, "loss": 1.2273, "step": 4725 }, { "epoch": 0.18209817131857556, "grad_norm": 1.3119609355926514, "learning_rate": 0.000195948058071329, "loss": 1.2754, "step": 4730 }, { "epoch": 0.18229066410009626, "grad_norm": 1.0062682628631592, "learning_rate": 0.00019593953292687332, "loss": 1.2494, "step": 4735 }, { "epoch": 0.18248315688161693, "grad_norm": 1.2124086618423462, "learning_rate": 0.0001959309990093829, "loss": 1.3725, "step": 4740 }, { "epoch": 0.18267564966313762, "grad_norm": 1.2050824165344238, "learning_rate": 0.0001959224563196381, "loss": 1.5103, "step": 4745 }, { "epoch": 0.18286814244465832, "grad_norm": 0.9262427091598511, "learning_rate": 0.00019591390485842008, "loss": 1.4155, "step": 4750 }, { "epoch": 0.18306063522617902, "grad_norm": 1.5612881183624268, "learning_rate": 0.00019590534462651086, "loss": 1.2289, "step": 4755 }, { "epoch": 0.18325312800769972, "grad_norm": 1.5384646654129028, "learning_rate": 0.00019589677562469312, "loss": 1.2474, "step": 4760 }, { "epoch": 0.18344562078922041, "grad_norm": 1.397716999053955, "learning_rate": 0.00019588819785375057, "loss": 1.4273, "step": 4765 }, { "epoch": 0.18363811357074108, "grad_norm": 1.169207215309143, "learning_rate": 0.00019587961131446754, "loss": 1.3963, "step": 4770 }, { "epoch": 0.18383060635226178, "grad_norm": 1.5064833164215088, "learning_rate": 0.00019587101600762916, "loss": 1.5192, "step": 4775 }, { "epoch": 0.18402309913378248, "grad_norm": 0.9700071811676025, "learning_rate": 0.00019586241193402147, "loss": 
1.2697, "step": 4780 }, { "epoch": 0.18421559191530318, "grad_norm": 1.2304507493972778, "learning_rate": 0.00019585379909443123, "loss": 1.3025, "step": 4785 }, { "epoch": 0.18440808469682388, "grad_norm": 1.3768020868301392, "learning_rate": 0.00019584517748964605, "loss": 1.3785, "step": 4790 }, { "epoch": 0.18460057747834457, "grad_norm": 1.062251091003418, "learning_rate": 0.0001958365471204543, "loss": 1.5416, "step": 4795 }, { "epoch": 0.18479307025986524, "grad_norm": 0.9126803874969482, "learning_rate": 0.00019582790798764518, "loss": 1.1479, "step": 4800 }, { "epoch": 0.18498556304138594, "grad_norm": 1.579830288887024, "learning_rate": 0.00019581926009200866, "loss": 1.3315, "step": 4805 }, { "epoch": 0.18517805582290664, "grad_norm": 2.351717710494995, "learning_rate": 0.00019581060343433555, "loss": 1.2503, "step": 4810 }, { "epoch": 0.18537054860442734, "grad_norm": 1.1480222940444946, "learning_rate": 0.00019580193801541746, "loss": 1.2048, "step": 4815 }, { "epoch": 0.18556304138594804, "grad_norm": 1.606439471244812, "learning_rate": 0.00019579326383604675, "loss": 1.5204, "step": 4820 }, { "epoch": 0.18575553416746873, "grad_norm": 1.520969271659851, "learning_rate": 0.00019578458089701664, "loss": 1.2584, "step": 4825 }, { "epoch": 0.1859480269489894, "grad_norm": 1.9096931219100952, "learning_rate": 0.00019577588919912113, "loss": 1.5508, "step": 4830 }, { "epoch": 0.1861405197305101, "grad_norm": 1.004654884338379, "learning_rate": 0.00019576718874315501, "loss": 1.2249, "step": 4835 }, { "epoch": 0.1863330125120308, "grad_norm": 1.0160667896270752, "learning_rate": 0.00019575847952991388, "loss": 1.0782, "step": 4840 }, { "epoch": 0.1865255052935515, "grad_norm": 1.4719328880310059, "learning_rate": 0.0001957497615601941, "loss": 1.4679, "step": 4845 }, { "epoch": 0.1867179980750722, "grad_norm": 1.229625940322876, "learning_rate": 0.00019574103483479296, "loss": 1.347, "step": 4850 }, { "epoch": 0.18691049085659286, "grad_norm": 3.0996217727661133, "learning_rate": 0.00019573229935450842, "loss": 1.3325, "step": 4855 }, { "epoch": 0.18710298363811356, "grad_norm": 1.59645676612854, "learning_rate": 0.00019572355512013922, "loss": 1.2983, "step": 4860 }, { "epoch": 0.18729547641963426, "grad_norm": 1.373542070388794, "learning_rate": 0.00019571480213248504, "loss": 1.3285, "step": 4865 }, { "epoch": 0.18748796920115496, "grad_norm": 0.9625198245048523, "learning_rate": 0.00019570604039234626, "loss": 1.2823, "step": 4870 }, { "epoch": 0.18768046198267566, "grad_norm": 1.1096363067626953, "learning_rate": 0.00019569726990052407, "loss": 1.2508, "step": 4875 }, { "epoch": 0.18787295476419635, "grad_norm": 1.2040042877197266, "learning_rate": 0.0001956884906578205, "loss": 1.3767, "step": 4880 }, { "epoch": 0.18806544754571702, "grad_norm": 1.103530764579773, "learning_rate": 0.00019567970266503833, "loss": 1.4559, "step": 4885 }, { "epoch": 0.18825794032723772, "grad_norm": 1.1266409158706665, "learning_rate": 0.0001956709059229812, "loss": 1.0687, "step": 4890 }, { "epoch": 0.18845043310875842, "grad_norm": 1.2266972064971924, "learning_rate": 0.00019566210043245344, "loss": 1.1801, "step": 4895 }, { "epoch": 0.18864292589027912, "grad_norm": 1.416676640510559, "learning_rate": 0.0001956532861942603, "loss": 1.346, "step": 4900 }, { "epoch": 0.18883541867179982, "grad_norm": 1.5538910627365112, "learning_rate": 0.0001956444632092078, "loss": 1.3498, "step": 4905 }, { "epoch": 0.1890279114533205, "grad_norm": 1.1525146961212158, "learning_rate": 0.00019563563147810274, 
"loss": 1.39, "step": 4910 }, { "epoch": 0.18922040423484118, "grad_norm": 1.6796061992645264, "learning_rate": 0.00019562679100175266, "loss": 1.3377, "step": 4915 }, { "epoch": 0.18941289701636188, "grad_norm": 1.6094450950622559, "learning_rate": 0.00019561794178096607, "loss": 1.3057, "step": 4920 }, { "epoch": 0.18960538979788258, "grad_norm": 1.8123548030853271, "learning_rate": 0.00019560908381655208, "loss": 1.1257, "step": 4925 }, { "epoch": 0.18979788257940328, "grad_norm": 1.5495673418045044, "learning_rate": 0.00019560021710932074, "loss": 1.303, "step": 4930 }, { "epoch": 0.18999037536092397, "grad_norm": 1.623429298400879, "learning_rate": 0.00019559134166008283, "loss": 1.1491, "step": 4935 }, { "epoch": 0.19018286814244467, "grad_norm": 1.2682925462722778, "learning_rate": 0.00019558245746964997, "loss": 1.3774, "step": 4940 }, { "epoch": 0.19037536092396534, "grad_norm": 0.9362719058990479, "learning_rate": 0.00019557356453883456, "loss": 1.2936, "step": 4945 }, { "epoch": 0.19056785370548604, "grad_norm": 1.4271594285964966, "learning_rate": 0.00019556466286844976, "loss": 1.3865, "step": 4950 }, { "epoch": 0.19076034648700674, "grad_norm": 1.4094691276550293, "learning_rate": 0.00019555575245930963, "loss": 1.2941, "step": 4955 }, { "epoch": 0.19095283926852744, "grad_norm": 0.9695935249328613, "learning_rate": 0.00019554683331222893, "loss": 1.1724, "step": 4960 }, { "epoch": 0.19114533205004813, "grad_norm": 1.110616683959961, "learning_rate": 0.00019553790542802327, "loss": 1.3999, "step": 4965 }, { "epoch": 0.1913378248315688, "grad_norm": 1.5389796495437622, "learning_rate": 0.000195528968807509, "loss": 1.2693, "step": 4970 }, { "epoch": 0.1915303176130895, "grad_norm": 1.921168565750122, "learning_rate": 0.00019552002345150338, "loss": 1.2392, "step": 4975 }, { "epoch": 0.1917228103946102, "grad_norm": 1.3342314958572388, "learning_rate": 0.00019551106936082437, "loss": 1.2477, "step": 4980 }, { "epoch": 0.1919153031761309, "grad_norm": 1.745754361152649, "learning_rate": 0.0001955021065362908, "loss": 1.7169, "step": 4985 }, { "epoch": 0.1921077959576516, "grad_norm": 1.090145468711853, "learning_rate": 0.0001954931349787222, "loss": 1.1156, "step": 4990 }, { "epoch": 0.1923002887391723, "grad_norm": 1.5357612371444702, "learning_rate": 0.00019548415468893899, "loss": 1.5436, "step": 4995 }, { "epoch": 0.19249278152069296, "grad_norm": 1.0309633016586304, "learning_rate": 0.00019547516566776238, "loss": 1.3212, "step": 5000 }, { "epoch": 0.19268527430221366, "grad_norm": 1.000688076019287, "learning_rate": 0.0001954661679160143, "loss": 1.2821, "step": 5005 }, { "epoch": 0.19287776708373436, "grad_norm": 1.268754243850708, "learning_rate": 0.0001954571614345176, "loss": 1.2168, "step": 5010 }, { "epoch": 0.19307025986525506, "grad_norm": 1.3859111070632935, "learning_rate": 0.00019544814622409582, "loss": 1.0701, "step": 5015 }, { "epoch": 0.19326275264677575, "grad_norm": 2.248309850692749, "learning_rate": 0.00019543912228557337, "loss": 1.3548, "step": 5020 }, { "epoch": 0.19345524542829645, "grad_norm": 1.0269944667816162, "learning_rate": 0.00019543008961977538, "loss": 1.213, "step": 5025 }, { "epoch": 0.19364773820981712, "grad_norm": 1.0082924365997314, "learning_rate": 0.00019542104822752789, "loss": 1.2395, "step": 5030 }, { "epoch": 0.19384023099133782, "grad_norm": 2.1287014484405518, "learning_rate": 0.00019541199810965766, "loss": 1.3794, "step": 5035 }, { "epoch": 0.19403272377285852, "grad_norm": 1.230859637260437, "learning_rate": 
0.0001954029392669922, "loss": 1.3985, "step": 5040 }, { "epoch": 0.19422521655437922, "grad_norm": 1.0987460613250732, "learning_rate": 0.00019539387170035996, "loss": 1.2637, "step": 5045 }, { "epoch": 0.1944177093358999, "grad_norm": 1.2570157051086426, "learning_rate": 0.00019538479541059007, "loss": 1.2752, "step": 5050 }, { "epoch": 0.19461020211742058, "grad_norm": 0.5122241377830505, "learning_rate": 0.00019537571039851252, "loss": 1.1927, "step": 5055 }, { "epoch": 0.19480269489894128, "grad_norm": 1.7925124168395996, "learning_rate": 0.00019536661666495807, "loss": 1.1414, "step": 5060 }, { "epoch": 0.19499518768046198, "grad_norm": 0.8517950773239136, "learning_rate": 0.00019535751421075826, "loss": 1.2359, "step": 5065 }, { "epoch": 0.19518768046198268, "grad_norm": 0.582260012626648, "learning_rate": 0.00019534840303674544, "loss": 1.3528, "step": 5070 }, { "epoch": 0.19538017324350337, "grad_norm": 1.3547414541244507, "learning_rate": 0.0001953392831437528, "loss": 1.296, "step": 5075 }, { "epoch": 0.19557266602502407, "grad_norm": Infinity, "learning_rate": 0.0001953319809522536, "loss": 1.4074, "step": 5080 }, { "epoch": 0.19576515880654474, "grad_norm": 2.2984917163848877, "learning_rate": 0.00019532284536719936, "loss": 1.2002, "step": 5085 }, { "epoch": 0.19595765158806544, "grad_norm": 1.4113095998764038, "learning_rate": 0.0001953137010655024, "loss": 1.2755, "step": 5090 }, { "epoch": 0.19615014436958614, "grad_norm": 1.921242594718933, "learning_rate": 0.00019530454804799881, "loss": 1.2431, "step": 5095 }, { "epoch": 0.19634263715110684, "grad_norm": 1.3097113370895386, "learning_rate": 0.0001952953863155257, "loss": 1.415, "step": 5100 }, { "epoch": 0.19653512993262753, "grad_norm": 2.1493217945098877, "learning_rate": 0.00019528621586892072, "loss": 1.4282, "step": 5105 }, { "epoch": 0.19672762271414823, "grad_norm": 1.2487257719039917, "learning_rate": 0.0001952770367090226, "loss": 1.3512, "step": 5110 }, { "epoch": 0.1969201154956689, "grad_norm": 0.9984391331672668, "learning_rate": 0.00019526784883667055, "loss": 1.5437, "step": 5115 }, { "epoch": 0.1971126082771896, "grad_norm": 1.241417646408081, "learning_rate": 0.00019525865225270486, "loss": 1.2399, "step": 5120 }, { "epoch": 0.1973051010587103, "grad_norm": 1.5192227363586426, "learning_rate": 0.00019524944695796642, "loss": 1.3236, "step": 5125 }, { "epoch": 0.197497593840231, "grad_norm": 1.7465555667877197, "learning_rate": 0.00019524023295329704, "loss": 1.4247, "step": 5130 }, { "epoch": 0.1976900866217517, "grad_norm": 1.455175757408142, "learning_rate": 0.00019523101023953925, "loss": 1.5053, "step": 5135 }, { "epoch": 0.1978825794032724, "grad_norm": 2.164982318878174, "learning_rate": 0.00019522177881753643, "loss": 1.2796, "step": 5140 }, { "epoch": 0.19807507218479306, "grad_norm": 1.58863365650177, "learning_rate": 0.00019521253868813273, "loss": 1.349, "step": 5145 }, { "epoch": 0.19826756496631376, "grad_norm": 1.5380641222000122, "learning_rate": 0.0001952032898521731, "loss": 1.3107, "step": 5150 }, { "epoch": 0.19846005774783446, "grad_norm": 1.1790603399276733, "learning_rate": 0.00019519403231050327, "loss": 1.2178, "step": 5155 }, { "epoch": 0.19865255052935515, "grad_norm": 1.7905482053756714, "learning_rate": 0.0001951847660639698, "loss": 1.3579, "step": 5160 }, { "epoch": 0.19884504331087585, "grad_norm": 1.1262041330337524, "learning_rate": 0.00019517549111342, "loss": 1.2988, "step": 5165 }, { "epoch": 0.19903753609239652, "grad_norm": 1.6370010375976562, "learning_rate": 
0.00019516620745970199, "loss": 1.2326, "step": 5170 }, { "epoch": 0.19923002887391722, "grad_norm": 1.1789335012435913, "learning_rate": 0.00019515691510366476, "loss": 1.1357, "step": 5175 }, { "epoch": 0.19942252165543792, "grad_norm": 1.167226791381836, "learning_rate": 0.000195147614046158, "loss": 1.4007, "step": 5180 }, { "epoch": 0.19961501443695862, "grad_norm": 1.3708933591842651, "learning_rate": 0.00019513830428803225, "loss": 1.3029, "step": 5185 }, { "epoch": 0.1998075072184793, "grad_norm": 1.6595165729522705, "learning_rate": 0.00019512898583013875, "loss": 1.3159, "step": 5190 }, { "epoch": 0.2, "grad_norm": 1.1252923011779785, "learning_rate": 0.00019511965867332972, "loss": 1.1894, "step": 5195 }, { "epoch": 0.20019249278152068, "grad_norm": 0.8440331816673279, "learning_rate": 0.00019511032281845797, "loss": 1.2108, "step": 5200 }, { "epoch": 0.20038498556304138, "grad_norm": 1.427147626876831, "learning_rate": 0.0001951009782663773, "loss": 1.197, "step": 5205 }, { "epoch": 0.20057747834456208, "grad_norm": 1.3509503602981567, "learning_rate": 0.00019509162501794213, "loss": 1.3348, "step": 5210 }, { "epoch": 0.20076997112608277, "grad_norm": 1.533103108406067, "learning_rate": 0.00019508226307400777, "loss": 1.1919, "step": 5215 }, { "epoch": 0.20096246390760347, "grad_norm": 1.1347332000732422, "learning_rate": 0.0001950728924354303, "loss": 1.2954, "step": 5220 }, { "epoch": 0.20115495668912417, "grad_norm": 1.65277099609375, "learning_rate": 0.00019506351310306664, "loss": 1.2686, "step": 5225 }, { "epoch": 0.20134744947064484, "grad_norm": 1.0601050853729248, "learning_rate": 0.00019505412507777442, "loss": 1.4066, "step": 5230 }, { "epoch": 0.20153994225216554, "grad_norm": 0.9429787397384644, "learning_rate": 0.00019504472836041217, "loss": 1.208, "step": 5235 }, { "epoch": 0.20173243503368624, "grad_norm": 0.9101033806800842, "learning_rate": 0.00019503532295183908, "loss": 1.3172, "step": 5240 }, { "epoch": 0.20192492781520693, "grad_norm": 1.1404805183410645, "learning_rate": 0.0001950259088529153, "loss": 1.1539, "step": 5245 }, { "epoch": 0.20211742059672763, "grad_norm": 1.1555522680282593, "learning_rate": 0.00019501648606450161, "loss": 1.3754, "step": 5250 }, { "epoch": 0.20230991337824833, "grad_norm": 1.5473912954330444, "learning_rate": 0.00019500705458745974, "loss": 1.1878, "step": 5255 }, { "epoch": 0.202502406159769, "grad_norm": 1.8766716718673706, "learning_rate": 0.00019499761442265208, "loss": 1.2445, "step": 5260 }, { "epoch": 0.2026948989412897, "grad_norm": 1.7951183319091797, "learning_rate": 0.00019498816557094188, "loss": 1.3496, "step": 5265 }, { "epoch": 0.2028873917228104, "grad_norm": 1.6615973711013794, "learning_rate": 0.00019497870803319317, "loss": 1.2919, "step": 5270 }, { "epoch": 0.2030798845043311, "grad_norm": 1.2885236740112305, "learning_rate": 0.00019496924181027078, "loss": 1.1807, "step": 5275 }, { "epoch": 0.2032723772858518, "grad_norm": 0.9546861052513123, "learning_rate": 0.00019495976690304034, "loss": 1.309, "step": 5280 }, { "epoch": 0.20346487006737246, "grad_norm": 1.6904189586639404, "learning_rate": 0.0001949502833123683, "loss": 1.2244, "step": 5285 }, { "epoch": 0.20365736284889316, "grad_norm": 1.394254446029663, "learning_rate": 0.0001949407910391218, "loss": 1.2877, "step": 5290 }, { "epoch": 0.20384985563041386, "grad_norm": 0.8937919735908508, "learning_rate": 0.0001949312900841689, "loss": 1.2389, "step": 5295 }, { "epoch": 0.20404234841193455, "grad_norm": 1.1096867322921753, "learning_rate": 
0.00019492178044837837, "loss": 1.3766, "step": 5300 }, { "epoch": 0.20423484119345525, "grad_norm": 1.009758472442627, "learning_rate": 0.00019491226213261983, "loss": 1.2281, "step": 5305 }, { "epoch": 0.20442733397497595, "grad_norm": 1.4888296127319336, "learning_rate": 0.00019490273513776365, "loss": 1.0624, "step": 5310 }, { "epoch": 0.20461982675649662, "grad_norm": 1.4901612997055054, "learning_rate": 0.00019489319946468104, "loss": 1.1554, "step": 5315 }, { "epoch": 0.20481231953801732, "grad_norm": 1.2920863628387451, "learning_rate": 0.0001948836551142439, "loss": 1.2103, "step": 5320 }, { "epoch": 0.20500481231953802, "grad_norm": 1.3616580963134766, "learning_rate": 0.00019487410208732508, "loss": 1.3246, "step": 5325 }, { "epoch": 0.2051973051010587, "grad_norm": 1.0202921628952026, "learning_rate": 0.0001948645403847981, "loss": 1.3046, "step": 5330 }, { "epoch": 0.2053897978825794, "grad_norm": 1.0083186626434326, "learning_rate": 0.00019485497000753735, "loss": 1.2541, "step": 5335 }, { "epoch": 0.2055822906641001, "grad_norm": 1.137617588043213, "learning_rate": 0.0001948453909564179, "loss": 1.3143, "step": 5340 }, { "epoch": 0.20577478344562078, "grad_norm": 1.6331067085266113, "learning_rate": 0.00019483580323231578, "loss": 1.1129, "step": 5345 }, { "epoch": 0.20596727622714148, "grad_norm": 1.4032361507415771, "learning_rate": 0.00019482620683610767, "loss": 1.3412, "step": 5350 }, { "epoch": 0.20615976900866217, "grad_norm": 1.3207452297210693, "learning_rate": 0.00019481660176867108, "loss": 1.4614, "step": 5355 }, { "epoch": 0.20635226179018287, "grad_norm": 0.9236577749252319, "learning_rate": 0.0001948069880308844, "loss": 1.3131, "step": 5360 }, { "epoch": 0.20654475457170357, "grad_norm": 2.2021703720092773, "learning_rate": 0.0001947973656236267, "loss": 1.2434, "step": 5365 }, { "epoch": 0.20673724735322424, "grad_norm": 1.5074305534362793, "learning_rate": 0.00019478773454777789, "loss": 1.4204, "step": 5370 }, { "epoch": 0.20692974013474494, "grad_norm": 1.5073877573013306, "learning_rate": 0.00019477809480421865, "loss": 1.4193, "step": 5375 }, { "epoch": 0.20712223291626564, "grad_norm": 1.0522600412368774, "learning_rate": 0.00019476844639383049, "loss": 1.228, "step": 5380 }, { "epoch": 0.20731472569778633, "grad_norm": 1.1478843688964844, "learning_rate": 0.0001947587893174957, "loss": 1.2315, "step": 5385 }, { "epoch": 0.20750721847930703, "grad_norm": 0.922837495803833, "learning_rate": 0.00019474912357609733, "loss": 1.2567, "step": 5390 }, { "epoch": 0.20769971126082773, "grad_norm": 1.156615972518921, "learning_rate": 0.0001947394491705193, "loss": 1.443, "step": 5395 }, { "epoch": 0.2078922040423484, "grad_norm": 1.909555435180664, "learning_rate": 0.0001947297661016462, "loss": 1.1625, "step": 5400 }, { "epoch": 0.2080846968238691, "grad_norm": 1.8379411697387695, "learning_rate": 0.00019472007437036352, "loss": 1.3015, "step": 5405 }, { "epoch": 0.2082771896053898, "grad_norm": 1.188402771949768, "learning_rate": 0.00019471037397755754, "loss": 1.3294, "step": 5410 }, { "epoch": 0.2084696823869105, "grad_norm": 1.597538948059082, "learning_rate": 0.00019470066492411521, "loss": 1.3824, "step": 5415 }, { "epoch": 0.2086621751684312, "grad_norm": 1.0081026554107666, "learning_rate": 0.00019469094721092444, "loss": 1.2914, "step": 5420 }, { "epoch": 0.2088546679499519, "grad_norm": 1.3790476322174072, "learning_rate": 0.0001946812208388738, "loss": 1.2817, "step": 5425 }, { "epoch": 0.20904716073147256, "grad_norm": 1.777570128440857, 
"learning_rate": 0.00019467148580885272, "loss": 1.2253, "step": 5430 }, { "epoch": 0.20923965351299326, "grad_norm": 1.1196024417877197, "learning_rate": 0.00019466174212175142, "loss": 1.2956, "step": 5435 }, { "epoch": 0.20943214629451395, "grad_norm": 2.940906524658203, "learning_rate": 0.00019465198977846086, "loss": 1.3912, "step": 5440 }, { "epoch": 0.20962463907603465, "grad_norm": 1.9075424671173096, "learning_rate": 0.00019464222877987286, "loss": 1.2518, "step": 5445 }, { "epoch": 0.20981713185755535, "grad_norm": 1.0282469987869263, "learning_rate": 0.00019463245912687996, "loss": 1.2569, "step": 5450 }, { "epoch": 0.21000962463907605, "grad_norm": 1.1651009321212769, "learning_rate": 0.0001946226808203756, "loss": 1.4676, "step": 5455 }, { "epoch": 0.21020211742059672, "grad_norm": 1.1911680698394775, "learning_rate": 0.00019461289386125388, "loss": 1.3822, "step": 5460 }, { "epoch": 0.21039461020211742, "grad_norm": 0.7187578082084656, "learning_rate": 0.00019460309825040974, "loss": 1.1462, "step": 5465 }, { "epoch": 0.2105871029836381, "grad_norm": 2.401764154434204, "learning_rate": 0.000194593293988739, "loss": 1.3187, "step": 5470 }, { "epoch": 0.2107795957651588, "grad_norm": 1.783333659172058, "learning_rate": 0.0001945834810771381, "loss": 1.3539, "step": 5475 }, { "epoch": 0.2109720885466795, "grad_norm": 0.9923986196517944, "learning_rate": 0.00019457365951650445, "loss": 1.4837, "step": 5480 }, { "epoch": 0.21116458132820018, "grad_norm": 1.0704642534255981, "learning_rate": 0.00019456382930773612, "loss": 1.2345, "step": 5485 }, { "epoch": 0.21135707410972088, "grad_norm": 1.5242959260940552, "learning_rate": 0.000194553990451732, "loss": 1.2113, "step": 5490 }, { "epoch": 0.21154956689124157, "grad_norm": 1.3185608386993408, "learning_rate": 0.00019454414294939185, "loss": 1.4083, "step": 5495 }, { "epoch": 0.21174205967276227, "grad_norm": 1.1448662281036377, "learning_rate": 0.00019453428680161615, "loss": 1.4091, "step": 5500 }, { "epoch": 0.21193455245428297, "grad_norm": 1.172396183013916, "learning_rate": 0.0001945244220093061, "loss": 1.1414, "step": 5505 }, { "epoch": 0.21212704523580367, "grad_norm": 2.988346576690674, "learning_rate": 0.00019451454857336383, "loss": 1.3968, "step": 5510 }, { "epoch": 0.21231953801732434, "grad_norm": 0.8824801445007324, "learning_rate": 0.00019450466649469222, "loss": 1.2229, "step": 5515 }, { "epoch": 0.21251203079884504, "grad_norm": 1.7703745365142822, "learning_rate": 0.00019449477577419488, "loss": 1.3073, "step": 5520 }, { "epoch": 0.21270452358036573, "grad_norm": 1.3374749422073364, "learning_rate": 0.00019448487641277629, "loss": 1.3908, "step": 5525 }, { "epoch": 0.21289701636188643, "grad_norm": 1.2366503477096558, "learning_rate": 0.00019447496841134163, "loss": 1.2764, "step": 5530 }, { "epoch": 0.21308950914340713, "grad_norm": 1.242353081703186, "learning_rate": 0.00019446505177079696, "loss": 1.3136, "step": 5535 }, { "epoch": 0.21328200192492783, "grad_norm": 1.046583652496338, "learning_rate": 0.00019445512649204907, "loss": 1.1483, "step": 5540 }, { "epoch": 0.2134744947064485, "grad_norm": 1.6280517578125, "learning_rate": 0.00019444519257600558, "loss": 1.4076, "step": 5545 }, { "epoch": 0.2136669874879692, "grad_norm": 1.7472679615020752, "learning_rate": 0.00019443525002357486, "loss": 1.2842, "step": 5550 }, { "epoch": 0.2138594802694899, "grad_norm": 1.101185917854309, "learning_rate": 0.00019442529883566612, "loss": 1.3037, "step": 5555 }, { "epoch": 0.2140519730510106, "grad_norm": 
1.8548834323883057, "learning_rate": 0.0001944153390131893, "loss": 1.4081, "step": 5560 }, { "epoch": 0.2142444658325313, "grad_norm": 1.4205219745635986, "learning_rate": 0.00019440537055705515, "loss": 1.3419, "step": 5565 }, { "epoch": 0.214436958614052, "grad_norm": 1.135933756828308, "learning_rate": 0.0001943953934681753, "loss": 0.9906, "step": 5570 }, { "epoch": 0.21462945139557266, "grad_norm": 1.7350742816925049, "learning_rate": 0.00019438540774746198, "loss": 1.1193, "step": 5575 }, { "epoch": 0.21482194417709335, "grad_norm": 1.891998291015625, "learning_rate": 0.00019437541339582836, "loss": 1.2271, "step": 5580 }, { "epoch": 0.21501443695861405, "grad_norm": 1.2564722299575806, "learning_rate": 0.0001943654104141884, "loss": 1.5134, "step": 5585 }, { "epoch": 0.21520692974013475, "grad_norm": 1.3632197380065918, "learning_rate": 0.00019435539880345673, "loss": 1.1772, "step": 5590 }, { "epoch": 0.21539942252165545, "grad_norm": 1.8670414686203003, "learning_rate": 0.00019434537856454894, "loss": 1.2685, "step": 5595 }, { "epoch": 0.21559191530317612, "grad_norm": 2.5948314666748047, "learning_rate": 0.00019433534969838122, "loss": 1.487, "step": 5600 }, { "epoch": 0.21578440808469682, "grad_norm": 1.2312328815460205, "learning_rate": 0.00019432531220587071, "loss": 1.3394, "step": 5605 }, { "epoch": 0.2159769008662175, "grad_norm": 0.9402896165847778, "learning_rate": 0.0001943152660879352, "loss": 1.1471, "step": 5610 }, { "epoch": 0.2161693936477382, "grad_norm": 0.3871050477027893, "learning_rate": 0.00019430521134549346, "loss": 0.9597, "step": 5615 }, { "epoch": 0.2163618864292589, "grad_norm": 0.9395222067832947, "learning_rate": 0.0001942951479794648, "loss": 1.3055, "step": 5620 }, { "epoch": 0.2165543792107796, "grad_norm": 0.8928638696670532, "learning_rate": 0.00019428507599076955, "loss": 1.4099, "step": 5625 }, { "epoch": 0.21674687199230028, "grad_norm": 1.8891551494598389, "learning_rate": 0.00019427499538032865, "loss": 1.5009, "step": 5630 }, { "epoch": 0.21693936477382098, "grad_norm": 0.6684243679046631, "learning_rate": 0.00019426490614906394, "loss": 1.2251, "step": 5635 }, { "epoch": 0.21713185755534167, "grad_norm": 1.5765355825424194, "learning_rate": 0.00019425480829789803, "loss": 1.1114, "step": 5640 }, { "epoch": 0.21732435033686237, "grad_norm": 0.9966096878051758, "learning_rate": 0.00019424470182775427, "loss": 1.2907, "step": 5645 }, { "epoch": 0.21751684311838307, "grad_norm": 1.263469934463501, "learning_rate": 0.00019423458673955684, "loss": 1.1443, "step": 5650 }, { "epoch": 0.21770933589990377, "grad_norm": 1.5138813257217407, "learning_rate": 0.0001942244630342307, "loss": 1.2699, "step": 5655 }, { "epoch": 0.21790182868142444, "grad_norm": 1.0215526819229126, "learning_rate": 0.00019421433071270156, "loss": 1.4265, "step": 5660 }, { "epoch": 0.21809432146294513, "grad_norm": 0.7587301731109619, "learning_rate": 0.00019420418977589605, "loss": 1.1706, "step": 5665 }, { "epoch": 0.21828681424446583, "grad_norm": 0.9531148672103882, "learning_rate": 0.0001941940402247414, "loss": 1.4041, "step": 5670 }, { "epoch": 0.21847930702598653, "grad_norm": 1.098739743232727, "learning_rate": 0.00019418388206016575, "loss": 1.3476, "step": 5675 }, { "epoch": 0.21867179980750723, "grad_norm": 1.0307271480560303, "learning_rate": 0.000194173715283098, "loss": 1.2333, "step": 5680 }, { "epoch": 0.2188642925890279, "grad_norm": 1.538256049156189, "learning_rate": 0.00019416353989446785, "loss": 1.4489, "step": 5685 }, { "epoch": 0.2190567853705486, 
"grad_norm": 1.5411714315414429, "learning_rate": 0.00019415335589520574, "loss": 1.2597, "step": 5690 }, { "epoch": 0.2192492781520693, "grad_norm": 1.3543205261230469, "learning_rate": 0.00019414316328624293, "loss": 1.265, "step": 5695 }, { "epoch": 0.21944177093359, "grad_norm": 0.7644770741462708, "learning_rate": 0.0001941329620685115, "loss": 1.1888, "step": 5700 }, { "epoch": 0.2196342637151107, "grad_norm": 2.1122093200683594, "learning_rate": 0.00019412275224294423, "loss": 1.1301, "step": 5705 }, { "epoch": 0.2198267564966314, "grad_norm": 1.4159448146820068, "learning_rate": 0.00019411253381047477, "loss": 1.209, "step": 5710 }, { "epoch": 0.22001924927815206, "grad_norm": 1.4212615489959717, "learning_rate": 0.00019410230677203755, "loss": 1.3268, "step": 5715 }, { "epoch": 0.22021174205967275, "grad_norm": 1.2042075395584106, "learning_rate": 0.00019409207112856778, "loss": 1.1976, "step": 5720 }, { "epoch": 0.22040423484119345, "grad_norm": 1.5765044689178467, "learning_rate": 0.00019408182688100136, "loss": 1.3631, "step": 5725 }, { "epoch": 0.22059672762271415, "grad_norm": 2.197000026702881, "learning_rate": 0.00019407157403027514, "loss": 1.2964, "step": 5730 }, { "epoch": 0.22078922040423485, "grad_norm": 1.3434042930603027, "learning_rate": 0.00019406131257732664, "loss": 1.244, "step": 5735 }, { "epoch": 0.22098171318575555, "grad_norm": 1.2889900207519531, "learning_rate": 0.0001940510425230942, "loss": 1.1333, "step": 5740 }, { "epoch": 0.22117420596727622, "grad_norm": 0.8795220851898193, "learning_rate": 0.00019404076386851692, "loss": 1.2635, "step": 5745 }, { "epoch": 0.22136669874879691, "grad_norm": 1.0312747955322266, "learning_rate": 0.00019403047661453477, "loss": 1.3195, "step": 5750 }, { "epoch": 0.2215591915303176, "grad_norm": 1.5083264112472534, "learning_rate": 0.00019402018076208845, "loss": 1.3417, "step": 5755 }, { "epoch": 0.2217516843118383, "grad_norm": 1.1538232564926147, "learning_rate": 0.00019400987631211936, "loss": 1.2956, "step": 5760 }, { "epoch": 0.221944177093359, "grad_norm": 1.975381851196289, "learning_rate": 0.0001939995632655699, "loss": 1.4641, "step": 5765 }, { "epoch": 0.2221366698748797, "grad_norm": 1.3251721858978271, "learning_rate": 0.00019398924162338305, "loss": 1.3429, "step": 5770 }, { "epoch": 0.22232916265640038, "grad_norm": 1.1281229257583618, "learning_rate": 0.0001939789113865027, "loss": 1.2155, "step": 5775 }, { "epoch": 0.22252165543792107, "grad_norm": 2.6070075035095215, "learning_rate": 0.00019396857255587344, "loss": 1.2634, "step": 5780 }, { "epoch": 0.22271414821944177, "grad_norm": 1.0815184116363525, "learning_rate": 0.00019395822513244067, "loss": 1.1176, "step": 5785 }, { "epoch": 0.22290664100096247, "grad_norm": 2.819180965423584, "learning_rate": 0.0001939478691171507, "loss": 1.2624, "step": 5790 }, { "epoch": 0.22309913378248317, "grad_norm": 1.180055022239685, "learning_rate": 0.0001939375045109504, "loss": 1.3433, "step": 5795 }, { "epoch": 0.22329162656400384, "grad_norm": 1.1582396030426025, "learning_rate": 0.0001939271313147876, "loss": 1.2815, "step": 5800 }, { "epoch": 0.22348411934552453, "grad_norm": 2.32379412651062, "learning_rate": 0.00019391674952961085, "loss": 1.4095, "step": 5805 }, { "epoch": 0.22367661212704523, "grad_norm": 1.5146657228469849, "learning_rate": 0.0001939063591563695, "loss": 1.2434, "step": 5810 }, { "epoch": 0.22386910490856593, "grad_norm": 1.6434500217437744, "learning_rate": 0.00019389596019601365, "loss": 1.1739, "step": 5815 }, { "epoch": 
0.22406159769008663, "grad_norm": 1.7917993068695068, "learning_rate": 0.0001938855526494943, "loss": 1.5106, "step": 5820 }, { "epoch": 0.22425409047160733, "grad_norm": 1.10679030418396, "learning_rate": 0.00019387513651776303, "loss": 1.284, "step": 5825 }, { "epoch": 0.224446583253128, "grad_norm": 1.521506905555725, "learning_rate": 0.00019386471180177247, "loss": 1.4129, "step": 5830 }, { "epoch": 0.2246390760346487, "grad_norm": 1.4055581092834473, "learning_rate": 0.00019385427850247572, "loss": 1.2476, "step": 5835 }, { "epoch": 0.2248315688161694, "grad_norm": 0.9506363868713379, "learning_rate": 0.00019384383662082703, "loss": 1.3105, "step": 5840 }, { "epoch": 0.2250240615976901, "grad_norm": 1.354658842086792, "learning_rate": 0.00019383338615778107, "loss": 1.29, "step": 5845 }, { "epoch": 0.2252165543792108, "grad_norm": 0.8972203135490417, "learning_rate": 0.00019382292711429353, "loss": 1.3407, "step": 5850 }, { "epoch": 0.22540904716073148, "grad_norm": 0.9989115595817566, "learning_rate": 0.00019381245949132085, "loss": 1.1662, "step": 5855 }, { "epoch": 0.22560153994225216, "grad_norm": 1.1133052110671997, "learning_rate": 0.0001938019832898202, "loss": 1.2674, "step": 5860 }, { "epoch": 0.22579403272377285, "grad_norm": 1.3640556335449219, "learning_rate": 0.00019379149851074957, "loss": 1.1989, "step": 5865 }, { "epoch": 0.22598652550529355, "grad_norm": 1.2812589406967163, "learning_rate": 0.0001937810051550677, "loss": 1.4749, "step": 5870 }, { "epoch": 0.22617901828681425, "grad_norm": 1.223944902420044, "learning_rate": 0.00019377050322373412, "loss": 1.305, "step": 5875 }, { "epoch": 0.22637151106833495, "grad_norm": 1.3493690490722656, "learning_rate": 0.00019375999271770925, "loss": 1.458, "step": 5880 }, { "epoch": 0.22656400384985564, "grad_norm": 1.4042202234268188, "learning_rate": 0.0001937494736379541, "loss": 1.1714, "step": 5885 }, { "epoch": 0.22675649663137631, "grad_norm": 1.6239880323410034, "learning_rate": 0.00019373894598543066, "loss": 1.3224, "step": 5890 }, { "epoch": 0.226948989412897, "grad_norm": 1.096960425376892, "learning_rate": 0.00019372840976110154, "loss": 1.128, "step": 5895 }, { "epoch": 0.2271414821944177, "grad_norm": 1.6740233898162842, "learning_rate": 0.00019371786496593028, "loss": 1.195, "step": 5900 }, { "epoch": 0.2273339749759384, "grad_norm": 1.454030156135559, "learning_rate": 0.00019370731160088105, "loss": 1.2641, "step": 5905 }, { "epoch": 0.2275264677574591, "grad_norm": 1.4465221166610718, "learning_rate": 0.00019369674966691897, "loss": 1.331, "step": 5910 }, { "epoch": 0.22771896053897978, "grad_norm": 1.6115851402282715, "learning_rate": 0.00019368617916500978, "loss": 1.4061, "step": 5915 }, { "epoch": 0.22791145332050047, "grad_norm": 1.0165706872940063, "learning_rate": 0.00019367560009612013, "loss": 1.177, "step": 5920 }, { "epoch": 0.22810394610202117, "grad_norm": 1.5200728178024292, "learning_rate": 0.00019366501246121737, "loss": 1.1323, "step": 5925 }, { "epoch": 0.22829643888354187, "grad_norm": 1.4613386392593384, "learning_rate": 0.00019365441626126976, "loss": 1.4626, "step": 5930 }, { "epoch": 0.22848893166506257, "grad_norm": 1.2502466440200806, "learning_rate": 0.00019364381149724613, "loss": 1.2797, "step": 5935 }, { "epoch": 0.22868142444658326, "grad_norm": 1.2946960926055908, "learning_rate": 0.0001936331981701163, "loss": 1.3844, "step": 5940 }, { "epoch": 0.22887391722810393, "grad_norm": 1.2478231191635132, "learning_rate": 0.00019362257628085074, "loss": 1.2855, "step": 5945 }, { 
"epoch": 0.22906641000962463, "grad_norm": 1.0097830295562744, "learning_rate": 0.0001936119458304208, "loss": 1.1223, "step": 5950 }, { "epoch": 0.22925890279114533, "grad_norm": 1.3235141038894653, "learning_rate": 0.00019360130681979852, "loss": 1.284, "step": 5955 }, { "epoch": 0.22945139557266603, "grad_norm": 1.6869986057281494, "learning_rate": 0.00019359065924995678, "loss": 1.517, "step": 5960 }, { "epoch": 0.22964388835418673, "grad_norm": 0.9644334316253662, "learning_rate": 0.00019358000312186925, "loss": 1.0607, "step": 5965 }, { "epoch": 0.22983638113570742, "grad_norm": 1.063192367553711, "learning_rate": 0.0001935693384365103, "loss": 0.9187, "step": 5970 }, { "epoch": 0.2300288739172281, "grad_norm": 1.0339081287384033, "learning_rate": 0.00019355866519485523, "loss": 1.2946, "step": 5975 }, { "epoch": 0.2302213666987488, "grad_norm": 1.3194791078567505, "learning_rate": 0.00019354798339788, "loss": 1.4293, "step": 5980 }, { "epoch": 0.2304138594802695, "grad_norm": 1.8870794773101807, "learning_rate": 0.00019353729304656136, "loss": 1.4124, "step": 5985 }, { "epoch": 0.2306063522617902, "grad_norm": 1.132385015487671, "learning_rate": 0.00019352659414187694, "loss": 1.1949, "step": 5990 }, { "epoch": 0.23079884504331089, "grad_norm": 2.763613700866699, "learning_rate": 0.000193515886684805, "loss": 1.2341, "step": 5995 }, { "epoch": 0.23099133782483156, "grad_norm": 1.6793404817581177, "learning_rate": 0.00019350517067632473, "loss": 1.3597, "step": 6000 }, { "epoch": 0.23118383060635225, "grad_norm": 1.1538963317871094, "learning_rate": 0.000193494446117416, "loss": 1.1981, "step": 6005 }, { "epoch": 0.23137632338787295, "grad_norm": 1.0233584642410278, "learning_rate": 0.00019348371300905955, "loss": 1.2821, "step": 6010 }, { "epoch": 0.23156881616939365, "grad_norm": 1.3905096054077148, "learning_rate": 0.0001934729713522368, "loss": 1.3471, "step": 6015 }, { "epoch": 0.23176130895091435, "grad_norm": 1.345563292503357, "learning_rate": 0.00019346222114793, "loss": 1.0454, "step": 6020 }, { "epoch": 0.23195380173243504, "grad_norm": 0.739811897277832, "learning_rate": 0.00019345146239712225, "loss": 1.3125, "step": 6025 }, { "epoch": 0.23214629451395571, "grad_norm": 1.977918028831482, "learning_rate": 0.0001934406951007973, "loss": 1.3328, "step": 6030 }, { "epoch": 0.2323387872954764, "grad_norm": 0.9505223035812378, "learning_rate": 0.00019342991925993977, "loss": 1.1388, "step": 6035 }, { "epoch": 0.2325312800769971, "grad_norm": 1.257755160331726, "learning_rate": 0.00019341913487553502, "loss": 1.3064, "step": 6040 }, { "epoch": 0.2327237728585178, "grad_norm": 1.2003203630447388, "learning_rate": 0.00019340834194856926, "loss": 1.4369, "step": 6045 }, { "epoch": 0.2329162656400385, "grad_norm": 1.2289738655090332, "learning_rate": 0.0001933975404800294, "loss": 1.1462, "step": 6050 }, { "epoch": 0.2331087584215592, "grad_norm": 1.227171540260315, "learning_rate": 0.00019338673047090317, "loss": 1.1829, "step": 6055 }, { "epoch": 0.23330125120307987, "grad_norm": 1.2766560316085815, "learning_rate": 0.00019337591192217904, "loss": 1.2572, "step": 6060 }, { "epoch": 0.23349374398460057, "grad_norm": 2.6716904640197754, "learning_rate": 0.00019336508483484634, "loss": 1.0195, "step": 6065 }, { "epoch": 0.23368623676612127, "grad_norm": 1.1586931943893433, "learning_rate": 0.00019335424920989512, "loss": 1.4932, "step": 6070 }, { "epoch": 0.23387872954764197, "grad_norm": 1.0196670293807983, "learning_rate": 0.00019334340504831624, "loss": 1.3497, "step": 6075 }, { 
"epoch": 0.23407122232916266, "grad_norm": 1.6527109146118164, "learning_rate": 0.00019333255235110127, "loss": 1.1239, "step": 6080 }, { "epoch": 0.23426371511068336, "grad_norm": 0.9913870096206665, "learning_rate": 0.00019332169111924271, "loss": 1.2757, "step": 6085 }, { "epoch": 0.23445620789220403, "grad_norm": 1.1027697324752808, "learning_rate": 0.00019331082135373367, "loss": 1.2512, "step": 6090 }, { "epoch": 0.23464870067372473, "grad_norm": 1.9269218444824219, "learning_rate": 0.00019329994305556815, "loss": 1.4698, "step": 6095 }, { "epoch": 0.23484119345524543, "grad_norm": 1.1504942178726196, "learning_rate": 0.00019328905622574086, "loss": 1.4844, "step": 6100 }, { "epoch": 0.23503368623676613, "grad_norm": 1.1164321899414062, "learning_rate": 0.0001932781608652474, "loss": 1.2972, "step": 6105 }, { "epoch": 0.23522617901828682, "grad_norm": 1.283000111579895, "learning_rate": 0.00019326725697508407, "loss": 1.3117, "step": 6110 }, { "epoch": 0.2354186717998075, "grad_norm": 1.3553595542907715, "learning_rate": 0.00019325634455624787, "loss": 1.027, "step": 6115 }, { "epoch": 0.2356111645813282, "grad_norm": 2.1605517864227295, "learning_rate": 0.00019324542360973674, "loss": 1.2211, "step": 6120 }, { "epoch": 0.2358036573628489, "grad_norm": 1.1028283834457397, "learning_rate": 0.00019323449413654933, "loss": 1.3034, "step": 6125 }, { "epoch": 0.2359961501443696, "grad_norm": 1.1728841066360474, "learning_rate": 0.00019322355613768505, "loss": 1.3135, "step": 6130 }, { "epoch": 0.23618864292589029, "grad_norm": 1.7304178476333618, "learning_rate": 0.0001932126096141441, "loss": 1.3516, "step": 6135 }, { "epoch": 0.23638113570741098, "grad_norm": 1.3326451778411865, "learning_rate": 0.00019320165456692748, "loss": 1.3371, "step": 6140 }, { "epoch": 0.23657362848893165, "grad_norm": 1.6894330978393555, "learning_rate": 0.00019319069099703697, "loss": 1.2126, "step": 6145 }, { "epoch": 0.23676612127045235, "grad_norm": 1.7248213291168213, "learning_rate": 0.0001931797189054751, "loss": 1.193, "step": 6150 }, { "epoch": 0.23695861405197305, "grad_norm": 1.1517174243927002, "learning_rate": 0.0001931687382932452, "loss": 1.1472, "step": 6155 }, { "epoch": 0.23715110683349375, "grad_norm": 2.4606590270996094, "learning_rate": 0.00019315774916135134, "loss": 1.524, "step": 6160 }, { "epoch": 0.23734359961501444, "grad_norm": 1.6130386590957642, "learning_rate": 0.00019314675151079844, "loss": 1.052, "step": 6165 }, { "epoch": 0.23753609239653514, "grad_norm": 1.3845412731170654, "learning_rate": 0.00019313574534259216, "loss": 1.2557, "step": 6170 }, { "epoch": 0.2377285851780558, "grad_norm": 1.3509567975997925, "learning_rate": 0.00019312473065773893, "loss": 1.3083, "step": 6175 }, { "epoch": 0.2379210779595765, "grad_norm": 1.358113408088684, "learning_rate": 0.000193113707457246, "loss": 1.2226, "step": 6180 }, { "epoch": 0.2381135707410972, "grad_norm": 0.9598337411880493, "learning_rate": 0.00019310267574212134, "loss": 1.1861, "step": 6185 }, { "epoch": 0.2383060635226179, "grad_norm": 1.347159743309021, "learning_rate": 0.0001930916355133737, "loss": 1.2782, "step": 6190 }, { "epoch": 0.2384985563041386, "grad_norm": 1.0227164030075073, "learning_rate": 0.0001930805867720127, "loss": 1.2909, "step": 6195 }, { "epoch": 0.2386910490856593, "grad_norm": 1.8373135328292847, "learning_rate": 0.00019306952951904865, "loss": 1.3371, "step": 6200 }, { "epoch": 0.23888354186717997, "grad_norm": 2.130218267440796, "learning_rate": 0.00019305846375549263, "loss": 1.3275, "step": 
6205 }, { "epoch": 0.23907603464870067, "grad_norm": 1.3699109554290771, "learning_rate": 0.00019304738948235656, "loss": 1.172, "step": 6210 }, { "epoch": 0.23926852743022137, "grad_norm": 1.8254964351654053, "learning_rate": 0.0001930363067006531, "loss": 1.166, "step": 6215 }, { "epoch": 0.23946102021174206, "grad_norm": 2.6475026607513428, "learning_rate": 0.00019302521541139571, "loss": 1.3168, "step": 6220 }, { "epoch": 0.23965351299326276, "grad_norm": 1.4869440793991089, "learning_rate": 0.0001930141156155986, "loss": 1.1112, "step": 6225 }, { "epoch": 0.23984600577478343, "grad_norm": 1.0316526889801025, "learning_rate": 0.00019300300731427678, "loss": 1.3845, "step": 6230 }, { "epoch": 0.24003849855630413, "grad_norm": 1.1549556255340576, "learning_rate": 0.00019299189050844603, "loss": 1.378, "step": 6235 }, { "epoch": 0.24023099133782483, "grad_norm": 1.9833987951278687, "learning_rate": 0.00019298076519912294, "loss": 1.2631, "step": 6240 }, { "epoch": 0.24042348411934553, "grad_norm": 1.1354988813400269, "learning_rate": 0.00019296963138732478, "loss": 1.6525, "step": 6245 }, { "epoch": 0.24061597690086622, "grad_norm": 1.6483670473098755, "learning_rate": 0.0001929584890740697, "loss": 0.9828, "step": 6250 }, { "epoch": 0.24080846968238692, "grad_norm": 1.537610650062561, "learning_rate": 0.00019294733826037659, "loss": 1.3566, "step": 6255 }, { "epoch": 0.2410009624639076, "grad_norm": 1.207406759262085, "learning_rate": 0.0001929361789472651, "loss": 1.3306, "step": 6260 }, { "epoch": 0.2411934552454283, "grad_norm": 1.4772666692733765, "learning_rate": 0.00019292501113575572, "loss": 1.3117, "step": 6265 }, { "epoch": 0.241385948026949, "grad_norm": 1.8285613059997559, "learning_rate": 0.00019291383482686962, "loss": 1.3711, "step": 6270 }, { "epoch": 0.24157844080846969, "grad_norm": 0.9223503470420837, "learning_rate": 0.00019290265002162884, "loss": 1.1712, "step": 6275 }, { "epoch": 0.24177093358999038, "grad_norm": 2.1818087100982666, "learning_rate": 0.00019289145672105612, "loss": 1.1596, "step": 6280 }, { "epoch": 0.24196342637151108, "grad_norm": 0.8749092817306519, "learning_rate": 0.00019288025492617504, "loss": 1.0726, "step": 6285 }, { "epoch": 0.24215591915303175, "grad_norm": 1.1598855257034302, "learning_rate": 0.00019286904463800995, "loss": 1.2931, "step": 6290 }, { "epoch": 0.24234841193455245, "grad_norm": 1.4357101917266846, "learning_rate": 0.0001928578258575859, "loss": 1.2612, "step": 6295 }, { "epoch": 0.24254090471607315, "grad_norm": 0.9731203317642212, "learning_rate": 0.0001928465985859288, "loss": 1.178, "step": 6300 }, { "epoch": 0.24273339749759384, "grad_norm": 1.1217381954193115, "learning_rate": 0.00019283536282406534, "loss": 1.285, "step": 6305 }, { "epoch": 0.24292589027911454, "grad_norm": 1.415860891342163, "learning_rate": 0.0001928241185730229, "loss": 1.399, "step": 6310 }, { "epoch": 0.2431183830606352, "grad_norm": 0.9067175388336182, "learning_rate": 0.00019281286583382973, "loss": 1.2336, "step": 6315 }, { "epoch": 0.2433108758421559, "grad_norm": 1.6320233345031738, "learning_rate": 0.0001928016046075148, "loss": 1.4348, "step": 6320 }, { "epoch": 0.2435033686236766, "grad_norm": 1.3945854902267456, "learning_rate": 0.0001927903348951079, "loss": 1.1614, "step": 6325 }, { "epoch": 0.2436958614051973, "grad_norm": 1.37948477268219, "learning_rate": 0.00019277905669763952, "loss": 1.2058, "step": 6330 }, { "epoch": 0.243888354186718, "grad_norm": 1.3325083255767822, "learning_rate": 0.00019276777001614104, "loss": 1.2737, 
"step": 6335 }, { "epoch": 0.2440808469682387, "grad_norm": 1.5902581214904785, "learning_rate": 0.00019275647485164453, "loss": 1.3706, "step": 6340 }, { "epoch": 0.24427333974975937, "grad_norm": 1.1309142112731934, "learning_rate": 0.00019274517120518284, "loss": 1.2408, "step": 6345 }, { "epoch": 0.24446583253128007, "grad_norm": 1.9998489618301392, "learning_rate": 0.0001927338590777896, "loss": 1.3079, "step": 6350 }, { "epoch": 0.24465832531280077, "grad_norm": 1.569667100906372, "learning_rate": 0.00019272253847049927, "loss": 1.2365, "step": 6355 }, { "epoch": 0.24485081809432147, "grad_norm": 1.2294694185256958, "learning_rate": 0.00019271120938434702, "loss": 1.3544, "step": 6360 }, { "epoch": 0.24504331087584216, "grad_norm": 1.9876806735992432, "learning_rate": 0.00019269987182036883, "loss": 1.3675, "step": 6365 }, { "epoch": 0.24523580365736286, "grad_norm": 1.3317819833755493, "learning_rate": 0.0001926885257796015, "loss": 1.0949, "step": 6370 }, { "epoch": 0.24542829643888353, "grad_norm": 1.7602546215057373, "learning_rate": 0.00019267717126308242, "loss": 1.3168, "step": 6375 }, { "epoch": 0.24562078922040423, "grad_norm": 1.5651274919509888, "learning_rate": 0.00019266580827184996, "loss": 1.2802, "step": 6380 }, { "epoch": 0.24581328200192493, "grad_norm": 0.9537544846534729, "learning_rate": 0.0001926544368069432, "loss": 1.1876, "step": 6385 }, { "epoch": 0.24600577478344562, "grad_norm": 0.9649773240089417, "learning_rate": 0.000192643056869402, "loss": 1.1378, "step": 6390 }, { "epoch": 0.24619826756496632, "grad_norm": 1.6363686323165894, "learning_rate": 0.00019263166846026692, "loss": 1.3284, "step": 6395 }, { "epoch": 0.24639076034648702, "grad_norm": 1.748897910118103, "learning_rate": 0.00019262027158057943, "loss": 1.4314, "step": 6400 }, { "epoch": 0.2465832531280077, "grad_norm": 2.138967990875244, "learning_rate": 0.00019260886623138164, "loss": 1.2244, "step": 6405 }, { "epoch": 0.2467757459095284, "grad_norm": 2.517312526702881, "learning_rate": 0.0001925974524137165, "loss": 1.3394, "step": 6410 }, { "epoch": 0.24696823869104909, "grad_norm": 1.7510714530944824, "learning_rate": 0.00019258603012862772, "loss": 1.3369, "step": 6415 }, { "epoch": 0.24716073147256978, "grad_norm": 1.1651504039764404, "learning_rate": 0.00019257459937715985, "loss": 1.2953, "step": 6420 }, { "epoch": 0.24735322425409048, "grad_norm": 1.325554609298706, "learning_rate": 0.0001925631601603581, "loss": 1.3062, "step": 6425 }, { "epoch": 0.24754571703561115, "grad_norm": 1.0340043306350708, "learning_rate": 0.00019255171247926852, "loss": 1.337, "step": 6430 }, { "epoch": 0.24773820981713185, "grad_norm": 1.677131175994873, "learning_rate": 0.00019254025633493792, "loss": 1.3179, "step": 6435 }, { "epoch": 0.24793070259865255, "grad_norm": 2.475339651107788, "learning_rate": 0.00019252879172841395, "loss": 1.4765, "step": 6440 }, { "epoch": 0.24812319538017324, "grad_norm": 1.1302917003631592, "learning_rate": 0.00019251731866074486, "loss": 1.3029, "step": 6445 }, { "epoch": 0.24831568816169394, "grad_norm": 1.3425379991531372, "learning_rate": 0.0001925058371329799, "loss": 1.1263, "step": 6450 }, { "epoch": 0.24850818094321464, "grad_norm": 1.0058633089065552, "learning_rate": 0.0001924943471461689, "loss": 1.1059, "step": 6455 }, { "epoch": 0.2487006737247353, "grad_norm": 1.9793190956115723, "learning_rate": 0.0001924828487013626, "loss": 1.5268, "step": 6460 }, { "epoch": 0.248893166506256, "grad_norm": 1.0673744678497314, "learning_rate": 0.00019247134179961242, 
"loss": 1.2199, "step": 6465 }, { "epoch": 0.2490856592877767, "grad_norm": 1.1182838678359985, "learning_rate": 0.00019245982644197057, "loss": 1.5456, "step": 6470 }, { "epoch": 0.2492781520692974, "grad_norm": 0.9264312982559204, "learning_rate": 0.00019244830262949014, "loss": 1.2367, "step": 6475 }, { "epoch": 0.2494706448508181, "grad_norm": 1.2094528675079346, "learning_rate": 0.00019243677036322478, "loss": 1.2026, "step": 6480 }, { "epoch": 0.2496631376323388, "grad_norm": 1.275902509689331, "learning_rate": 0.00019242522964422917, "loss": 1.206, "step": 6485 }, { "epoch": 0.24985563041385947, "grad_norm": 1.515559434890747, "learning_rate": 0.00019241368047355853, "loss": 1.2222, "step": 6490 }, { "epoch": 0.25004812319538017, "grad_norm": 0.9974495768547058, "learning_rate": 0.000192402122852269, "loss": 1.5274, "step": 6495 }, { "epoch": 0.2502406159769009, "grad_norm": 1.8940407037734985, "learning_rate": 0.00019239055678141746, "loss": 1.3639, "step": 6500 }, { "epoch": 0.25043310875842156, "grad_norm": 1.7484371662139893, "learning_rate": 0.00019237898226206153, "loss": 1.3517, "step": 6505 }, { "epoch": 0.25062560153994223, "grad_norm": 1.004660725593567, "learning_rate": 0.00019236739929525963, "loss": 1.0603, "step": 6510 }, { "epoch": 0.25081809432146296, "grad_norm": 0.9729489684104919, "learning_rate": 0.00019235580788207093, "loss": 1.3252, "step": 6515 }, { "epoch": 0.25101058710298363, "grad_norm": 0.4645654857158661, "learning_rate": 0.00019234420802355539, "loss": 1.1804, "step": 6520 }, { "epoch": 0.25120307988450435, "grad_norm": 1.0810743570327759, "learning_rate": 0.00019233259972077378, "loss": 1.3045, "step": 6525 }, { "epoch": 0.251395572666025, "grad_norm": 1.1666224002838135, "learning_rate": 0.00019232098297478756, "loss": 1.324, "step": 6530 }, { "epoch": 0.2515880654475457, "grad_norm": 1.06947660446167, "learning_rate": 0.000192309357786659, "loss": 1.3131, "step": 6535 }, { "epoch": 0.2517805582290664, "grad_norm": 1.1774028539657593, "learning_rate": 0.0001922977241574512, "loss": 1.301, "step": 6540 }, { "epoch": 0.2519730510105871, "grad_norm": 1.528041958808899, "learning_rate": 0.0001922860820882279, "loss": 1.2542, "step": 6545 }, { "epoch": 0.2521655437921078, "grad_norm": 1.1932915449142456, "learning_rate": 0.00019227443158005377, "loss": 1.125, "step": 6550 }, { "epoch": 0.2523580365736285, "grad_norm": 1.3258370161056519, "learning_rate": 0.0001922627726339941, "loss": 1.3776, "step": 6555 }, { "epoch": 0.25255052935514916, "grad_norm": 0.994076132774353, "learning_rate": 0.0001922511052511151, "loss": 1.0908, "step": 6560 }, { "epoch": 0.2527430221366699, "grad_norm": 1.0820032358169556, "learning_rate": 0.00019223942943248358, "loss": 1.215, "step": 6565 }, { "epoch": 0.25293551491819055, "grad_norm": 0.9792138338088989, "learning_rate": 0.00019222774517916734, "loss": 1.2413, "step": 6570 }, { "epoch": 0.2531280076997113, "grad_norm": 1.1704801321029663, "learning_rate": 0.0001922160524922347, "loss": 1.5203, "step": 6575 }, { "epoch": 0.25332050048123195, "grad_norm": 1.6249198913574219, "learning_rate": 0.00019220435137275494, "loss": 1.2771, "step": 6580 }, { "epoch": 0.2535129932627527, "grad_norm": 1.3218034505844116, "learning_rate": 0.00019219264182179804, "loss": 1.4433, "step": 6585 }, { "epoch": 0.25370548604427334, "grad_norm": 1.7230724096298218, "learning_rate": 0.0001921809238404348, "loss": 1.1069, "step": 6590 }, { "epoch": 0.253897978825794, "grad_norm": 1.3148738145828247, "learning_rate": 0.00019216919742973669, 
"loss": 1.2386, "step": 6595 }, { "epoch": 0.25409047160731474, "grad_norm": 1.257513403892517, "learning_rate": 0.00019215746259077605, "loss": 1.3476, "step": 6600 }, { "epoch": 0.2542829643888354, "grad_norm": 0.965403139591217, "learning_rate": 0.00019214571932462592, "loss": 1.1045, "step": 6605 }, { "epoch": 0.25447545717035613, "grad_norm": 0.8903887867927551, "learning_rate": 0.0001921339676323602, "loss": 1.1481, "step": 6610 }, { "epoch": 0.2546679499518768, "grad_norm": 1.284529209136963, "learning_rate": 0.00019212220751505345, "loss": 1.3179, "step": 6615 }, { "epoch": 0.2548604427333975, "grad_norm": 2.3491082191467285, "learning_rate": 0.0001921104389737811, "loss": 1.3042, "step": 6620 }, { "epoch": 0.2550529355149182, "grad_norm": 1.4170057773590088, "learning_rate": 0.00019209866200961927, "loss": 1.3775, "step": 6625 }, { "epoch": 0.25524542829643887, "grad_norm": 1.4182847738265991, "learning_rate": 0.00019208687662364488, "loss": 1.3895, "step": 6630 }, { "epoch": 0.2554379210779596, "grad_norm": 1.2162110805511475, "learning_rate": 0.00019207508281693568, "loss": 1.0754, "step": 6635 }, { "epoch": 0.25563041385948027, "grad_norm": 1.473873257637024, "learning_rate": 0.00019206328059057006, "loss": 1.3323, "step": 6640 }, { "epoch": 0.25582290664100094, "grad_norm": 1.2990386486053467, "learning_rate": 0.0001920514699456273, "loss": 1.2304, "step": 6645 }, { "epoch": 0.25601539942252166, "grad_norm": 1.2828303575515747, "learning_rate": 0.00019203965088318743, "loss": 1.2566, "step": 6650 }, { "epoch": 0.25620789220404233, "grad_norm": 0.9165570735931396, "learning_rate": 0.00019202782340433115, "loss": 1.2186, "step": 6655 }, { "epoch": 0.25640038498556306, "grad_norm": 2.0381886959075928, "learning_rate": 0.00019201598751014006, "loss": 1.114, "step": 6660 }, { "epoch": 0.2565928777670837, "grad_norm": 1.252790093421936, "learning_rate": 0.00019200414320169647, "loss": 1.2354, "step": 6665 }, { "epoch": 0.25678537054860445, "grad_norm": 1.1557594537734985, "learning_rate": 0.00019199229048008347, "loss": 1.3652, "step": 6670 }, { "epoch": 0.2569778633301251, "grad_norm": 1.356181025505066, "learning_rate": 0.0001919804293463849, "loss": 1.1026, "step": 6675 }, { "epoch": 0.2571703561116458, "grad_norm": 1.2493314743041992, "learning_rate": 0.00019196855980168536, "loss": 1.2225, "step": 6680 }, { "epoch": 0.2573628488931665, "grad_norm": 1.7480677366256714, "learning_rate": 0.00019195668184707025, "loss": 1.2898, "step": 6685 }, { "epoch": 0.2575553416746872, "grad_norm": 1.0522620677947998, "learning_rate": 0.00019194479548362577, "loss": 1.1404, "step": 6690 }, { "epoch": 0.2577478344562079, "grad_norm": 1.4085676670074463, "learning_rate": 0.00019193290071243882, "loss": 1.5024, "step": 6695 }, { "epoch": 0.2579403272377286, "grad_norm": 1.393096923828125, "learning_rate": 0.0001919209975345971, "loss": 1.2555, "step": 6700 }, { "epoch": 0.25813282001924925, "grad_norm": 1.5740808248519897, "learning_rate": 0.00019190908595118907, "loss": 1.2362, "step": 6705 }, { "epoch": 0.25832531280077, "grad_norm": 1.3243273496627808, "learning_rate": 0.00019189716596330395, "loss": 1.2517, "step": 6710 }, { "epoch": 0.25851780558229065, "grad_norm": 2.5867626667022705, "learning_rate": 0.00019188523757203177, "loss": 1.3509, "step": 6715 }, { "epoch": 0.2587102983638114, "grad_norm": 1.450181484222412, "learning_rate": 0.00019187330077846334, "loss": 1.3451, "step": 6720 }, { "epoch": 0.25890279114533205, "grad_norm": 1.4387754201889038, "learning_rate": 
0.0001918613555836901, "loss": 1.2518, "step": 6725 }, { "epoch": 0.25909528392685277, "grad_norm": 1.427882432937622, "learning_rate": 0.00019184940198880448, "loss": 1.235, "step": 6730 }, { "epoch": 0.25928777670837344, "grad_norm": 1.060436487197876, "learning_rate": 0.00019183743999489947, "loss": 1.4583, "step": 6735 }, { "epoch": 0.2594802694898941, "grad_norm": 1.0780494213104248, "learning_rate": 0.00019182546960306893, "loss": 1.1134, "step": 6740 }, { "epoch": 0.25967276227141484, "grad_norm": 1.3795710802078247, "learning_rate": 0.0001918134908144075, "loss": 1.2979, "step": 6745 }, { "epoch": 0.2598652550529355, "grad_norm": 2.0972957611083984, "learning_rate": 0.00019180150363001051, "loss": 1.6512, "step": 6750 }, { "epoch": 0.26005774783445623, "grad_norm": 1.129204273223877, "learning_rate": 0.00019178950805097416, "loss": 1.2263, "step": 6755 }, { "epoch": 0.2602502406159769, "grad_norm": 0.8816843628883362, "learning_rate": 0.00019177750407839536, "loss": 1.2265, "step": 6760 }, { "epoch": 0.26044273339749757, "grad_norm": 1.5167860984802246, "learning_rate": 0.00019176549171337178, "loss": 1.226, "step": 6765 }, { "epoch": 0.2606352261790183, "grad_norm": 1.329172968864441, "learning_rate": 0.00019175347095700188, "loss": 1.3375, "step": 6770 }, { "epoch": 0.26082771896053897, "grad_norm": 1.8215051889419556, "learning_rate": 0.00019174144181038485, "loss": 1.2453, "step": 6775 }, { "epoch": 0.2610202117420597, "grad_norm": 1.147878646850586, "learning_rate": 0.00019172940427462072, "loss": 1.3137, "step": 6780 }, { "epoch": 0.26121270452358036, "grad_norm": 1.5783206224441528, "learning_rate": 0.0001917173583508102, "loss": 1.1803, "step": 6785 }, { "epoch": 0.26140519730510103, "grad_norm": 1.7433182001113892, "learning_rate": 0.00019170530404005485, "loss": 1.171, "step": 6790 }, { "epoch": 0.26159769008662176, "grad_norm": 1.5278960466384888, "learning_rate": 0.0001916932413434569, "loss": 1.2274, "step": 6795 }, { "epoch": 0.26179018286814243, "grad_norm": 1.375710368156433, "learning_rate": 0.00019168117026211948, "loss": 1.241, "step": 6800 }, { "epoch": 0.26198267564966315, "grad_norm": 2.146165370941162, "learning_rate": 0.00019166909079714636, "loss": 1.2778, "step": 6805 }, { "epoch": 0.2621751684311838, "grad_norm": 1.7670506238937378, "learning_rate": 0.00019165700294964216, "loss": 1.3293, "step": 6810 }, { "epoch": 0.26236766121270455, "grad_norm": 1.5492186546325684, "learning_rate": 0.00019164490672071217, "loss": 1.2808, "step": 6815 }, { "epoch": 0.2625601539942252, "grad_norm": 1.4138727188110352, "learning_rate": 0.00019163280211146257, "loss": 1.2352, "step": 6820 }, { "epoch": 0.2627526467757459, "grad_norm": 1.185674786567688, "learning_rate": 0.00019162068912300024, "loss": 1.1883, "step": 6825 }, { "epoch": 0.2629451395572666, "grad_norm": 1.717349886894226, "learning_rate": 0.0001916085677564328, "loss": 1.1329, "step": 6830 }, { "epoch": 0.2631376323387873, "grad_norm": 1.1391080617904663, "learning_rate": 0.00019159643801286872, "loss": 1.4104, "step": 6835 }, { "epoch": 0.263330125120308, "grad_norm": 1.0915690660476685, "learning_rate": 0.00019158429989341716, "loss": 1.2813, "step": 6840 }, { "epoch": 0.2635226179018287, "grad_norm": 1.120492696762085, "learning_rate": 0.000191572153399188, "loss": 1.2669, "step": 6845 }, { "epoch": 0.26371511068334935, "grad_norm": 1.0648150444030762, "learning_rate": 0.0001915599985312921, "loss": 1.2581, "step": 6850 }, { "epoch": 0.2639076034648701, "grad_norm": 1.7173513174057007, "learning_rate": 
0.0001915478352908408, "loss": 1.2081, "step": 6855 }, { "epoch": 0.26410009624639075, "grad_norm": 1.3801002502441406, "learning_rate": 0.00019153566367894644, "loss": 1.4625, "step": 6860 }, { "epoch": 0.2642925890279115, "grad_norm": 2.5863940715789795, "learning_rate": 0.00019152348369672203, "loss": 1.4777, "step": 6865 }, { "epoch": 0.26448508180943214, "grad_norm": 1.5995707511901855, "learning_rate": 0.0001915112953452813, "loss": 1.2089, "step": 6870 }, { "epoch": 0.2646775745909528, "grad_norm": 1.2661023139953613, "learning_rate": 0.0001914990986257388, "loss": 1.1937, "step": 6875 }, { "epoch": 0.26487006737247354, "grad_norm": 1.4782702922821045, "learning_rate": 0.00019148689353920987, "loss": 1.2462, "step": 6880 }, { "epoch": 0.2650625601539942, "grad_norm": 1.8557063341140747, "learning_rate": 0.0001914746800868106, "loss": 1.425, "step": 6885 }, { "epoch": 0.26525505293551493, "grad_norm": 2.825359582901001, "learning_rate": 0.00019146245826965775, "loss": 1.3628, "step": 6890 }, { "epoch": 0.2654475457170356, "grad_norm": 1.7262654304504395, "learning_rate": 0.00019145022808886902, "loss": 1.2902, "step": 6895 }, { "epoch": 0.26564003849855633, "grad_norm": 0.9676236510276794, "learning_rate": 0.00019143798954556268, "loss": 1.3342, "step": 6900 }, { "epoch": 0.265832531280077, "grad_norm": 1.4607850313186646, "learning_rate": 0.00019142574264085797, "loss": 1.3084, "step": 6905 }, { "epoch": 0.26602502406159767, "grad_norm": 2.181511878967285, "learning_rate": 0.0001914134873758747, "loss": 1.1746, "step": 6910 }, { "epoch": 0.2662175168431184, "grad_norm": 1.4534579515457153, "learning_rate": 0.00019140122375173362, "loss": 1.3071, "step": 6915 }, { "epoch": 0.26641000962463907, "grad_norm": 1.607039213180542, "learning_rate": 0.00019138895176955604, "loss": 1.2883, "step": 6920 }, { "epoch": 0.2666025024061598, "grad_norm": 0.9929762482643127, "learning_rate": 0.00019137667143046425, "loss": 1.1122, "step": 6925 }, { "epoch": 0.26679499518768046, "grad_norm": 1.6732393503189087, "learning_rate": 0.0001913643827355812, "loss": 1.149, "step": 6930 }, { "epoch": 0.26698748796920113, "grad_norm": 1.3785120248794556, "learning_rate": 0.0001913520856860305, "loss": 1.3759, "step": 6935 }, { "epoch": 0.26717998075072186, "grad_norm": 1.8252770900726318, "learning_rate": 0.0001913397802829368, "loss": 1.2633, "step": 6940 }, { "epoch": 0.2673724735322425, "grad_norm": 1.6789536476135254, "learning_rate": 0.0001913274665274252, "loss": 1.2741, "step": 6945 }, { "epoch": 0.26756496631376325, "grad_norm": 2.0153861045837402, "learning_rate": 0.00019131514442062184, "loss": 1.196, "step": 6950 }, { "epoch": 0.2677574590952839, "grad_norm": 1.0000704526901245, "learning_rate": 0.0001913028139636534, "loss": 1.1872, "step": 6955 }, { "epoch": 0.2679499518768046, "grad_norm": 1.2803142070770264, "learning_rate": 0.00019129047515764743, "loss": 1.2655, "step": 6960 }, { "epoch": 0.2681424446583253, "grad_norm": 0.9827659130096436, "learning_rate": 0.00019127812800373225, "loss": 1.3503, "step": 6965 }, { "epoch": 0.268334937439846, "grad_norm": 1.3766348361968994, "learning_rate": 0.00019126577250303697, "loss": 1.2851, "step": 6970 }, { "epoch": 0.2685274302213667, "grad_norm": 2.285708427429199, "learning_rate": 0.00019125340865669134, "loss": 1.3247, "step": 6975 }, { "epoch": 0.2687199230028874, "grad_norm": 1.79937744140625, "learning_rate": 0.000191241036465826, "loss": 1.0306, "step": 6980 }, { "epoch": 0.2689124157844081, "grad_norm": 1.6062885522842407, "learning_rate": 
0.0001912286559315723, "loss": 1.2068, "step": 6985 }, { "epoch": 0.2691049085659288, "grad_norm": 1.9590744972229004, "learning_rate": 0.00019121626705506233, "loss": 1.2195, "step": 6990 }, { "epoch": 0.26929740134744945, "grad_norm": 1.366186261177063, "learning_rate": 0.000191203869837429, "loss": 1.1627, "step": 6995 }, { "epoch": 0.2694898941289702, "grad_norm": 0.9655261635780334, "learning_rate": 0.00019119146427980593, "loss": 1.053, "step": 7000 }, { "epoch": 0.26968238691049085, "grad_norm": 1.4636151790618896, "learning_rate": 0.00019117905038332756, "loss": 1.0954, "step": 7005 }, { "epoch": 0.26987487969201157, "grad_norm": 1.4435783624649048, "learning_rate": 0.00019116662814912903, "loss": 1.2102, "step": 7010 }, { "epoch": 0.27006737247353224, "grad_norm": 0.9880768060684204, "learning_rate": 0.00019115419757834628, "loss": 1.0698, "step": 7015 }, { "epoch": 0.2702598652550529, "grad_norm": 1.516515851020813, "learning_rate": 0.000191141758672116, "loss": 1.3894, "step": 7020 }, { "epoch": 0.27045235803657364, "grad_norm": 2.1763806343078613, "learning_rate": 0.00019112931143157563, "loss": 1.3794, "step": 7025 }, { "epoch": 0.2706448508180943, "grad_norm": 1.2275705337524414, "learning_rate": 0.00019111685585786344, "loss": 1.2897, "step": 7030 }, { "epoch": 0.27083734359961503, "grad_norm": 0.966526985168457, "learning_rate": 0.00019110439195211835, "loss": 1.2112, "step": 7035 }, { "epoch": 0.2710298363811357, "grad_norm": 1.251911997795105, "learning_rate": 0.00019109191971548016, "loss": 1.2481, "step": 7040 }, { "epoch": 0.27122232916265643, "grad_norm": 2.3555140495300293, "learning_rate": 0.0001910794391490893, "loss": 1.3372, "step": 7045 }, { "epoch": 0.2714148219441771, "grad_norm": 1.229268193244934, "learning_rate": 0.0001910669502540871, "loss": 1.4362, "step": 7050 }, { "epoch": 0.27160731472569777, "grad_norm": 1.2356593608856201, "learning_rate": 0.00019105445303161555, "loss": 1.379, "step": 7055 }, { "epoch": 0.2717998075072185, "grad_norm": 1.910232424736023, "learning_rate": 0.00019104194748281747, "loss": 1.2902, "step": 7060 }, { "epoch": 0.27199230028873916, "grad_norm": 1.9058904647827148, "learning_rate": 0.0001910294336088364, "loss": 1.3313, "step": 7065 }, { "epoch": 0.2721847930702599, "grad_norm": 0.8631892800331116, "learning_rate": 0.0001910169114108166, "loss": 1.2843, "step": 7070 }, { "epoch": 0.27237728585178056, "grad_norm": 1.2212119102478027, "learning_rate": 0.0001910043808899032, "loss": 1.2588, "step": 7075 }, { "epoch": 0.27256977863330123, "grad_norm": 2.3140738010406494, "learning_rate": 0.00019099184204724202, "loss": 1.1781, "step": 7080 }, { "epoch": 0.27276227141482196, "grad_norm": 1.0162906646728516, "learning_rate": 0.00019097929488397965, "loss": 1.3433, "step": 7085 }, { "epoch": 0.2729547641963426, "grad_norm": 1.719766616821289, "learning_rate": 0.00019096673940126343, "loss": 1.1469, "step": 7090 }, { "epoch": 0.27314725697786335, "grad_norm": 1.5173147916793823, "learning_rate": 0.00019095417560024153, "loss": 1.1663, "step": 7095 }, { "epoch": 0.273339749759384, "grad_norm": 2.1228654384613037, "learning_rate": 0.00019094160348206277, "loss": 1.3433, "step": 7100 }, { "epoch": 0.2735322425409047, "grad_norm": 1.3896198272705078, "learning_rate": 0.00019092902304787679, "loss": 1.1782, "step": 7105 }, { "epoch": 0.2737247353224254, "grad_norm": 1.6935322284698486, "learning_rate": 0.00019091643429883402, "loss": 1.1867, "step": 7110 }, { "epoch": 0.2739172281039461, "grad_norm": 1.5454139709472656, "learning_rate": 
0.00019090383723608558, "loss": 1.3938, "step": 7115 }, { "epoch": 0.2741097208854668, "grad_norm": 1.1493245363235474, "learning_rate": 0.00019089123186078342, "loss": 1.2127, "step": 7120 }, { "epoch": 0.2743022136669875, "grad_norm": 1.7321335077285767, "learning_rate": 0.00019087861817408021, "loss": 1.3068, "step": 7125 }, { "epoch": 0.2744947064485082, "grad_norm": 1.7654987573623657, "learning_rate": 0.00019086599617712936, "loss": 1.3236, "step": 7130 }, { "epoch": 0.2746871992300289, "grad_norm": 1.0047959089279175, "learning_rate": 0.0001908533658710851, "loss": 1.404, "step": 7135 }, { "epoch": 0.27487969201154955, "grad_norm": 1.9708582162857056, "learning_rate": 0.0001908407272571024, "loss": 1.2387, "step": 7140 }, { "epoch": 0.2750721847930703, "grad_norm": 2.097369432449341, "learning_rate": 0.00019082808033633696, "loss": 1.189, "step": 7145 }, { "epoch": 0.27526467757459094, "grad_norm": 1.1789932250976562, "learning_rate": 0.00019081542510994523, "loss": 1.4815, "step": 7150 }, { "epoch": 0.27545717035611167, "grad_norm": 1.7205069065093994, "learning_rate": 0.00019080276157908447, "loss": 1.2906, "step": 7155 }, { "epoch": 0.27564966313763234, "grad_norm": 1.7320606708526611, "learning_rate": 0.0001907900897449127, "loss": 1.339, "step": 7160 }, { "epoch": 0.275842155919153, "grad_norm": 2.100649356842041, "learning_rate": 0.00019077740960858863, "loss": 1.3145, "step": 7165 }, { "epoch": 0.27603464870067373, "grad_norm": 1.9302312135696411, "learning_rate": 0.00019076472117127182, "loss": 1.3082, "step": 7170 }, { "epoch": 0.2762271414821944, "grad_norm": 0.5863549113273621, "learning_rate": 0.0001907520244341225, "loss": 1.0183, "step": 7175 }, { "epoch": 0.27641963426371513, "grad_norm": 1.0428977012634277, "learning_rate": 0.00019073931939830174, "loss": 1.2488, "step": 7180 }, { "epoch": 0.2766121270452358, "grad_norm": 1.1643081903457642, "learning_rate": 0.0001907266060649713, "loss": 1.476, "step": 7185 }, { "epoch": 0.27680461982675647, "grad_norm": 1.0771207809448242, "learning_rate": 0.00019071388443529376, "loss": 1.3134, "step": 7190 }, { "epoch": 0.2769971126082772, "grad_norm": 1.9787309169769287, "learning_rate": 0.00019070115451043238, "loss": 1.3884, "step": 7195 }, { "epoch": 0.27718960538979787, "grad_norm": 2.095546245574951, "learning_rate": 0.0001906884162915513, "loss": 1.1221, "step": 7200 }, { "epoch": 0.2773820981713186, "grad_norm": 2.0389225482940674, "learning_rate": 0.00019067566977981528, "loss": 1.0463, "step": 7205 }, { "epoch": 0.27757459095283926, "grad_norm": 0.9991855621337891, "learning_rate": 0.00019066291497638993, "loss": 1.341, "step": 7210 }, { "epoch": 0.27776708373436, "grad_norm": 1.411401391029358, "learning_rate": 0.0001906501518824416, "loss": 1.434, "step": 7215 }, { "epoch": 0.27795957651588066, "grad_norm": 1.61775803565979, "learning_rate": 0.0001906373804991374, "loss": 1.1553, "step": 7220 }, { "epoch": 0.2781520692974013, "grad_norm": 2.546022653579712, "learning_rate": 0.00019062460082764515, "loss": 1.2496, "step": 7225 }, { "epoch": 0.27834456207892205, "grad_norm": 1.2731270790100098, "learning_rate": 0.00019061181286913348, "loss": 1.3236, "step": 7230 }, { "epoch": 0.2785370548604427, "grad_norm": 1.0163904428482056, "learning_rate": 0.00019059901662477177, "loss": 1.2854, "step": 7235 }, { "epoch": 0.27872954764196345, "grad_norm": 1.0653849840164185, "learning_rate": 0.0001905862120957302, "loss": 1.6351, "step": 7240 }, { "epoch": 0.2789220404234841, "grad_norm": 1.081264853477478, "learning_rate": 
0.00019057339928317958, "loss": 1.2466, "step": 7245 }, { "epoch": 0.2791145332050048, "grad_norm": 1.3285462856292725, "learning_rate": 0.00019056057818829156, "loss": 1.2087, "step": 7250 }, { "epoch": 0.2793070259865255, "grad_norm": 1.067254900932312, "learning_rate": 0.0001905477488122386, "loss": 1.3877, "step": 7255 }, { "epoch": 0.2794995187680462, "grad_norm": 0.9383085370063782, "learning_rate": 0.0001905349111561938, "loss": 1.0643, "step": 7260 }, { "epoch": 0.2796920115495669, "grad_norm": 2.7797493934631348, "learning_rate": 0.00019052206522133117, "loss": 1.3828, "step": 7265 }, { "epoch": 0.2798845043310876, "grad_norm": 1.410261631011963, "learning_rate": 0.0001905092110088253, "loss": 1.3019, "step": 7270 }, { "epoch": 0.28007699711260825, "grad_norm": 2.313541889190674, "learning_rate": 0.0001904963485198517, "loss": 1.2058, "step": 7275 }, { "epoch": 0.280269489894129, "grad_norm": 1.4474842548370361, "learning_rate": 0.00019048347775558645, "loss": 1.2187, "step": 7280 }, { "epoch": 0.28046198267564965, "grad_norm": 1.5846171379089355, "learning_rate": 0.00019047059871720657, "loss": 1.0326, "step": 7285 }, { "epoch": 0.28065447545717037, "grad_norm": 1.1118413209915161, "learning_rate": 0.00019045771140588976, "loss": 1.2881, "step": 7290 }, { "epoch": 0.28084696823869104, "grad_norm": 2.5894134044647217, "learning_rate": 0.00019044481582281448, "loss": 1.3885, "step": 7295 }, { "epoch": 0.28103946102021177, "grad_norm": 1.6019679307937622, "learning_rate": 0.00019043191196915993, "loss": 1.3247, "step": 7300 }, { "epoch": 0.28123195380173244, "grad_norm": 1.3384417295455933, "learning_rate": 0.00019041899984610606, "loss": 1.346, "step": 7305 }, { "epoch": 0.2814244465832531, "grad_norm": 1.3584142923355103, "learning_rate": 0.00019040607945483367, "loss": 1.3418, "step": 7310 }, { "epoch": 0.28161693936477383, "grad_norm": 1.379162073135376, "learning_rate": 0.00019039315079652416, "loss": 1.293, "step": 7315 }, { "epoch": 0.2818094321462945, "grad_norm": 1.499841570854187, "learning_rate": 0.00019038021387235982, "loss": 1.2131, "step": 7320 }, { "epoch": 0.28200192492781523, "grad_norm": 1.9813991785049438, "learning_rate": 0.00019036726868352366, "loss": 1.3282, "step": 7325 }, { "epoch": 0.2821944177093359, "grad_norm": 1.404096245765686, "learning_rate": 0.00019035431523119938, "loss": 1.2238, "step": 7330 }, { "epoch": 0.28238691049085657, "grad_norm": 1.1089609861373901, "learning_rate": 0.00019034135351657152, "loss": 1.1705, "step": 7335 }, { "epoch": 0.2825794032723773, "grad_norm": 1.0567266941070557, "learning_rate": 0.00019032838354082535, "loss": 1.1228, "step": 7340 }, { "epoch": 0.28277189605389796, "grad_norm": 1.2407151460647583, "learning_rate": 0.00019031540530514685, "loss": 1.1154, "step": 7345 }, { "epoch": 0.2829643888354187, "grad_norm": 1.3094842433929443, "learning_rate": 0.00019030241881072283, "loss": 1.2251, "step": 7350 }, { "epoch": 0.28315688161693936, "grad_norm": 0.9434831142425537, "learning_rate": 0.00019028942405874082, "loss": 1.0644, "step": 7355 }, { "epoch": 0.2833493743984601, "grad_norm": 1.107958197593689, "learning_rate": 0.0001902764210503891, "loss": 1.295, "step": 7360 }, { "epoch": 0.28354186717998076, "grad_norm": 1.4402803182601929, "learning_rate": 0.00019026340978685666, "loss": 1.3339, "step": 7365 }, { "epoch": 0.2837343599615014, "grad_norm": 1.1564158201217651, "learning_rate": 0.0001902503902693334, "loss": 1.252, "step": 7370 }, { "epoch": 0.28392685274302215, "grad_norm": 1.8258494138717651, 
"learning_rate": 0.00019023736249900973, "loss": 1.3495, "step": 7375 }, { "epoch": 0.2841193455245428, "grad_norm": 1.1436362266540527, "learning_rate": 0.00019022432647707708, "loss": 1.4295, "step": 7380 }, { "epoch": 0.28431183830606355, "grad_norm": 1.1649361848831177, "learning_rate": 0.00019021128220472747, "loss": 1.3438, "step": 7385 }, { "epoch": 0.2845043310875842, "grad_norm": 1.7044711112976074, "learning_rate": 0.00019019822968315364, "loss": 1.2735, "step": 7390 }, { "epoch": 0.2846968238691049, "grad_norm": 0.8998376727104187, "learning_rate": 0.00019018516891354924, "loss": 1.1817, "step": 7395 }, { "epoch": 0.2848893166506256, "grad_norm": 1.8617538213729858, "learning_rate": 0.00019017209989710855, "loss": 1.3235, "step": 7400 }, { "epoch": 0.2850818094321463, "grad_norm": 0.9981639981269836, "learning_rate": 0.00019015902263502669, "loss": 1.1171, "step": 7405 }, { "epoch": 0.285274302213667, "grad_norm": 0.935457170009613, "learning_rate": 0.00019014593712849944, "loss": 1.1926, "step": 7410 }, { "epoch": 0.2854667949951877, "grad_norm": 1.3465532064437866, "learning_rate": 0.00019013284337872341, "loss": 1.5102, "step": 7415 }, { "epoch": 0.28565928777670835, "grad_norm": 1.3213337659835815, "learning_rate": 0.00019011974138689595, "loss": 1.2597, "step": 7420 }, { "epoch": 0.2858517805582291, "grad_norm": 1.655229091644287, "learning_rate": 0.0001901066311542151, "loss": 1.0345, "step": 7425 }, { "epoch": 0.28604427333974974, "grad_norm": 1.0165207386016846, "learning_rate": 0.00019009351268187974, "loss": 1.2854, "step": 7430 }, { "epoch": 0.28623676612127047, "grad_norm": 1.3425116539001465, "learning_rate": 0.00019008038597108945, "loss": 1.381, "step": 7435 }, { "epoch": 0.28642925890279114, "grad_norm": 1.2017732858657837, "learning_rate": 0.0001900672510230446, "loss": 1.2171, "step": 7440 }, { "epoch": 0.28662175168431187, "grad_norm": 1.4958349466323853, "learning_rate": 0.00019005410783894626, "loss": 1.3524, "step": 7445 }, { "epoch": 0.28681424446583254, "grad_norm": 1.1109000444412231, "learning_rate": 0.00019004095641999636, "loss": 1.2046, "step": 7450 }, { "epoch": 0.2870067372473532, "grad_norm": 1.5347834825515747, "learning_rate": 0.00019002779676739745, "loss": 1.2295, "step": 7455 }, { "epoch": 0.28719923002887393, "grad_norm": 1.5204600095748901, "learning_rate": 0.00019001462888235286, "loss": 1.0319, "step": 7460 }, { "epoch": 0.2873917228103946, "grad_norm": 2.0644850730895996, "learning_rate": 0.00019000145276606677, "loss": 1.2371, "step": 7465 }, { "epoch": 0.2875842155919153, "grad_norm": 1.5903024673461914, "learning_rate": 0.00018998826841974407, "loss": 1.3781, "step": 7470 }, { "epoch": 0.287776708373436, "grad_norm": 1.045086145401001, "learning_rate": 0.00018997507584459032, "loss": 1.0918, "step": 7475 }, { "epoch": 0.28796920115495667, "grad_norm": 1.499211311340332, "learning_rate": 0.0001899618750418119, "loss": 1.2377, "step": 7480 }, { "epoch": 0.2881616939364774, "grad_norm": 1.2885223627090454, "learning_rate": 0.00018994866601261597, "loss": 1.2936, "step": 7485 }, { "epoch": 0.28835418671799806, "grad_norm": 1.9687073230743408, "learning_rate": 0.00018993544875821035, "loss": 1.2043, "step": 7490 }, { "epoch": 0.2885466794995188, "grad_norm": 0.9758608937263489, "learning_rate": 0.00018992222327980375, "loss": 1.0775, "step": 7495 }, { "epoch": 0.28873917228103946, "grad_norm": 1.4256442785263062, "learning_rate": 0.00018990898957860547, "loss": 1.2608, "step": 7500 }, { "epoch": 0.28893166506256013, "grad_norm": 
1.267991304397583, "learning_rate": 0.00018989574765582572, "loss": 1.3826, "step": 7505 }, { "epoch": 0.28912415784408085, "grad_norm": 1.4104158878326416, "learning_rate": 0.00018988249751267534, "loss": 1.1589, "step": 7510 }, { "epoch": 0.2893166506256015, "grad_norm": 0.9540778994560242, "learning_rate": 0.000189869239150366, "loss": 1.196, "step": 7515 }, { "epoch": 0.28950914340712225, "grad_norm": 4.175881385803223, "learning_rate": 0.00018985597257011006, "loss": 1.3408, "step": 7520 }, { "epoch": 0.2897016361886429, "grad_norm": 1.79558527469635, "learning_rate": 0.00018984269777312066, "loss": 1.0596, "step": 7525 }, { "epoch": 0.28989412897016364, "grad_norm": 1.5449460744857788, "learning_rate": 0.0001898294147606117, "loss": 1.2628, "step": 7530 }, { "epoch": 0.2900866217516843, "grad_norm": 1.5056041479110718, "learning_rate": 0.00018981612353379784, "loss": 1.132, "step": 7535 }, { "epoch": 0.290279114533205, "grad_norm": 1.7045507431030273, "learning_rate": 0.00018980282409389445, "loss": 1.1663, "step": 7540 }, { "epoch": 0.2904716073147257, "grad_norm": 1.203892469406128, "learning_rate": 0.00018978951644211766, "loss": 1.1168, "step": 7545 }, { "epoch": 0.2906641000962464, "grad_norm": 0.9239038228988647, "learning_rate": 0.0001897762005796844, "loss": 1.3328, "step": 7550 }, { "epoch": 0.2908565928777671, "grad_norm": 1.3521167039871216, "learning_rate": 0.00018976287650781238, "loss": 1.2766, "step": 7555 }, { "epoch": 0.2910490856592878, "grad_norm": 1.3824992179870605, "learning_rate": 0.00018974954422771987, "loss": 1.0153, "step": 7560 }, { "epoch": 0.29124157844080845, "grad_norm": 0.9183006286621094, "learning_rate": 0.00018973620374062607, "loss": 1.0558, "step": 7565 }, { "epoch": 0.29143407122232917, "grad_norm": 1.7128045558929443, "learning_rate": 0.0001897228550477509, "loss": 1.316, "step": 7570 }, { "epoch": 0.29162656400384984, "grad_norm": 1.3998011350631714, "learning_rate": 0.000189709498150315, "loss": 1.2359, "step": 7575 }, { "epoch": 0.29181905678537057, "grad_norm": 1.2251836061477661, "learning_rate": 0.00018969613304953975, "loss": 1.2464, "step": 7580 }, { "epoch": 0.29201154956689124, "grad_norm": 1.3014954328536987, "learning_rate": 0.00018968275974664734, "loss": 1.0624, "step": 7585 }, { "epoch": 0.2922040423484119, "grad_norm": 1.8785862922668457, "learning_rate": 0.00018966937824286062, "loss": 1.3491, "step": 7590 }, { "epoch": 0.29239653512993263, "grad_norm": 1.0634154081344604, "learning_rate": 0.00018965598853940327, "loss": 1.1012, "step": 7595 }, { "epoch": 0.2925890279114533, "grad_norm": 0.9114715456962585, "learning_rate": 0.00018964259063749967, "loss": 1.3738, "step": 7600 }, { "epoch": 0.29278152069297403, "grad_norm": 1.9063506126403809, "learning_rate": 0.00018962918453837503, "loss": 1.1161, "step": 7605 }, { "epoch": 0.2929740134744947, "grad_norm": 1.12264084815979, "learning_rate": 0.00018961577024325516, "loss": 1.4191, "step": 7610 }, { "epoch": 0.2931665062560154, "grad_norm": 1.4751306772232056, "learning_rate": 0.00018960234775336677, "loss": 1.2153, "step": 7615 }, { "epoch": 0.2933589990375361, "grad_norm": 1.4374860525131226, "learning_rate": 0.00018958891706993724, "loss": 1.1999, "step": 7620 }, { "epoch": 0.29355149181905676, "grad_norm": 1.5792250633239746, "learning_rate": 0.0001895754781941947, "loss": 1.266, "step": 7625 }, { "epoch": 0.2937439846005775, "grad_norm": 1.3390734195709229, "learning_rate": 0.00018956203112736807, "loss": 1.2703, "step": 7630 }, { "epoch": 0.29393647738209816, "grad_norm": 
1.2470978498458862, "learning_rate": 0.00018954857587068701, "loss": 1.0415, "step": 7635 }, { "epoch": 0.2941289701636189, "grad_norm": 1.6102235317230225, "learning_rate": 0.00018953511242538186, "loss": 1.2707, "step": 7640 }, { "epoch": 0.29432146294513956, "grad_norm": 1.334554672241211, "learning_rate": 0.0001895216407926838, "loss": 1.2672, "step": 7645 }, { "epoch": 0.2945139557266602, "grad_norm": 1.2881218194961548, "learning_rate": 0.00018950816097382475, "loss": 1.1641, "step": 7650 }, { "epoch": 0.29470644850818095, "grad_norm": 1.2150179147720337, "learning_rate": 0.00018949467297003732, "loss": 1.2636, "step": 7655 }, { "epoch": 0.2948989412897016, "grad_norm": 1.1388130187988281, "learning_rate": 0.00018948117678255485, "loss": 1.2354, "step": 7660 }, { "epoch": 0.29509143407122235, "grad_norm": 0.785776674747467, "learning_rate": 0.0001894676724126115, "loss": 1.2621, "step": 7665 }, { "epoch": 0.295283926852743, "grad_norm": 1.005819320678711, "learning_rate": 0.00018945415986144223, "loss": 1.1175, "step": 7670 }, { "epoch": 0.29547641963426374, "grad_norm": 2.2892065048217773, "learning_rate": 0.00018944063913028264, "loss": 1.148, "step": 7675 }, { "epoch": 0.2956689124157844, "grad_norm": 2.0920302867889404, "learning_rate": 0.00018942711022036903, "loss": 1.178, "step": 7680 }, { "epoch": 0.2958614051973051, "grad_norm": 1.228538155555725, "learning_rate": 0.00018941357313293863, "loss": 1.2499, "step": 7685 }, { "epoch": 0.2960538979788258, "grad_norm": 1.8671079874038696, "learning_rate": 0.00018940002786922925, "loss": 1.2361, "step": 7690 }, { "epoch": 0.2962463907603465, "grad_norm": 1.7283247709274292, "learning_rate": 0.00018938647443047957, "loss": 1.2695, "step": 7695 }, { "epoch": 0.2964388835418672, "grad_norm": 1.9629713296890259, "learning_rate": 0.0001893729128179289, "loss": 1.5226, "step": 7700 }, { "epoch": 0.2966313763233879, "grad_norm": 1.2868784666061401, "learning_rate": 0.00018935934303281743, "loss": 1.3237, "step": 7705 }, { "epoch": 0.29682386910490854, "grad_norm": 1.3925827741622925, "learning_rate": 0.000189345765076386, "loss": 1.4075, "step": 7710 }, { "epoch": 0.29701636188642927, "grad_norm": 1.1560002565383911, "learning_rate": 0.0001893321789498762, "loss": 1.3212, "step": 7715 }, { "epoch": 0.29720885466794994, "grad_norm": 1.207263708114624, "learning_rate": 0.0001893185846545304, "loss": 1.3106, "step": 7720 }, { "epoch": 0.29740134744947067, "grad_norm": "Infinity", "learning_rate": 0.00018930770333752716, "loss": 1.5499, "step": 7725 }, { "epoch": 0.29759384023099134, "grad_norm": 1.2437909841537476, "learning_rate": 0.0001892940943414097, "loss": 1.2797, "step": 7730 }, { "epoch": 0.297786333012512, "grad_norm": 0.8919286131858826, "learning_rate": 0.00018928047717993885, "loss": 1.1074, "step": 7735 }, { "epoch": 0.29797882579403273, "grad_norm": 1.219995379447937, "learning_rate": 0.00018926685185435978, "loss": 1.0856, "step": 7740 }, { "epoch": 0.2981713185755534, "grad_norm": 0.8819857835769653, "learning_rate": 0.00018925321836591846, "loss": 1.3518, "step": 7745 }, { "epoch": 0.2983638113570741, "grad_norm": 1.2268033027648926, "learning_rate": 0.00018923957671586154, "loss": 1.3786, "step": 7750 }, { "epoch": 0.2985563041385948, "grad_norm": 0.9456066489219666, "learning_rate": 0.0001892259269054365, "loss": 1.3424, "step": 7755 }, { "epoch": 0.2987487969201155, "grad_norm": 1.5397047996520996, "learning_rate": 0.0001892122689358915, "loss": 1.3618, "step": 7760 }, { "epoch": 0.2989412897016362, "grad_norm": 
1.3874872922897339, "learning_rate": 0.0001891986028084755, "loss": 1.2717, "step": 7765 }, { "epoch": 0.29913378248315686, "grad_norm": 1.1725342273712158, "learning_rate": 0.00018918492852443817, "loss": 1.4347, "step": 7770 }, { "epoch": 0.2993262752646776, "grad_norm": 1.2135777473449707, "learning_rate": 0.0001891712460850299, "loss": 1.1892, "step": 7775 }, { "epoch": 0.29951876804619826, "grad_norm": 1.549715280532837, "learning_rate": 0.00018915755549150188, "loss": 1.2041, "step": 7780 }, { "epoch": 0.299711260827719, "grad_norm": 0.9927541613578796, "learning_rate": 0.00018914385674510605, "loss": 1.2198, "step": 7785 }, { "epoch": 0.29990375360923965, "grad_norm": 1.3314557075500488, "learning_rate": 0.00018913014984709502, "loss": 1.1805, "step": 7790 }, { "epoch": 0.3000962463907603, "grad_norm": 1.4021222591400146, "learning_rate": 0.00018911643479872225, "loss": 1.3375, "step": 7795 }, { "epoch": 0.30028873917228105, "grad_norm": 1.0226534605026245, "learning_rate": 0.00018910271160124182, "loss": 1.329, "step": 7800 }, { "epoch": 0.3004812319538017, "grad_norm": 0.8493847846984863, "learning_rate": 0.0001890889802559087, "loss": 1.4581, "step": 7805 }, { "epoch": 0.30067372473532245, "grad_norm": 1.0437967777252197, "learning_rate": 0.00018907524076397847, "loss": 1.409, "step": 7810 }, { "epoch": 0.3008662175168431, "grad_norm": 2.574695110321045, "learning_rate": 0.00018906149312670754, "loss": 1.3962, "step": 7815 }, { "epoch": 0.3010587102983638, "grad_norm": 1.3757768869400024, "learning_rate": 0.00018904773734535306, "loss": 1.4098, "step": 7820 }, { "epoch": 0.3012512030798845, "grad_norm": 1.2249635457992554, "learning_rate": 0.0001890339734211729, "loss": 1.1643, "step": 7825 }, { "epoch": 0.3014436958614052, "grad_norm": 1.6329936981201172, "learning_rate": 0.00018902020135542564, "loss": 1.1914, "step": 7830 }, { "epoch": 0.3016361886429259, "grad_norm": 1.0217385292053223, "learning_rate": 0.0001890064211493707, "loss": 1.043, "step": 7835 }, { "epoch": 0.3018286814244466, "grad_norm": 1.448754072189331, "learning_rate": 0.0001889926328042681, "loss": 1.0953, "step": 7840 }, { "epoch": 0.3020211742059673, "grad_norm": 0.9284221529960632, "learning_rate": 0.00018897883632137881, "loss": 1.321, "step": 7845 }, { "epoch": 0.30221366698748797, "grad_norm": 1.4679608345031738, "learning_rate": 0.00018896503170196435, "loss": 1.2266, "step": 7850 }, { "epoch": 0.30240615976900864, "grad_norm": 1.1148631572723389, "learning_rate": 0.00018895121894728709, "loss": 1.1666, "step": 7855 }, { "epoch": 0.30259865255052937, "grad_norm": 1.0431932210922241, "learning_rate": 0.00018893739805861008, "loss": 1.2986, "step": 7860 }, { "epoch": 0.30279114533205004, "grad_norm": 1.5691524744033813, "learning_rate": 0.00018892356903719718, "loss": 1.3928, "step": 7865 }, { "epoch": 0.30298363811357076, "grad_norm": 1.6849128007888794, "learning_rate": 0.000188909731884313, "loss": 1.3569, "step": 7870 }, { "epoch": 0.30317613089509143, "grad_norm": 1.1832456588745117, "learning_rate": 0.00018889588660122276, "loss": 1.2984, "step": 7875 }, { "epoch": 0.3033686236766121, "grad_norm": 1.3270272016525269, "learning_rate": 0.0001888820331891926, "loss": 1.1498, "step": 7880 }, { "epoch": 0.30356111645813283, "grad_norm": 1.6383373737335205, "learning_rate": 0.0001888681716494893, "loss": 1.4725, "step": 7885 }, { "epoch": 0.3037536092396535, "grad_norm": 1.1068469285964966, "learning_rate": 0.00018885430198338038, "loss": 1.3326, "step": 7890 }, { "epoch": 0.3039461020211742, 
"grad_norm": 1.8454192876815796, "learning_rate": 0.00018884042419213412, "loss": 1.2307, "step": 7895 }, { "epoch": 0.3041385948026949, "grad_norm": 1.160762906074524, "learning_rate": 0.00018882653827701965, "loss": 1.6025, "step": 7900 }, { "epoch": 0.30433108758421556, "grad_norm": 1.9325065612792969, "learning_rate": 0.00018881264423930663, "loss": 1.3071, "step": 7905 }, { "epoch": 0.3045235803657363, "grad_norm": 0.9047966003417969, "learning_rate": 0.00018879874208026562, "loss": 1.3166, "step": 7910 }, { "epoch": 0.30471607314725696, "grad_norm": 0.9753623008728027, "learning_rate": 0.00018878483180116793, "loss": 1.3702, "step": 7915 }, { "epoch": 0.3049085659287777, "grad_norm": 1.210321307182312, "learning_rate": 0.00018877091340328549, "loss": 1.3775, "step": 7920 }, { "epoch": 0.30510105871029836, "grad_norm": 1.287484049797058, "learning_rate": 0.00018875698688789106, "loss": 1.3534, "step": 7925 }, { "epoch": 0.3052935514918191, "grad_norm": 1.1604797840118408, "learning_rate": 0.00018874305225625814, "loss": 1.2154, "step": 7930 }, { "epoch": 0.30548604427333975, "grad_norm": 1.4771429300308228, "learning_rate": 0.00018872910950966097, "loss": 1.2438, "step": 7935 }, { "epoch": 0.3056785370548604, "grad_norm": 1.1472980976104736, "learning_rate": 0.00018871515864937453, "loss": 1.0805, "step": 7940 }, { "epoch": 0.30587102983638115, "grad_norm": 1.1015262603759766, "learning_rate": 0.0001887011996766745, "loss": 1.0594, "step": 7945 }, { "epoch": 0.3060635226179018, "grad_norm": 1.5410771369934082, "learning_rate": 0.00018868723259283737, "loss": 1.2624, "step": 7950 }, { "epoch": 0.30625601539942254, "grad_norm": 1.2014496326446533, "learning_rate": 0.0001886732573991403, "loss": 1.2259, "step": 7955 }, { "epoch": 0.3064485081809432, "grad_norm": 2.0007143020629883, "learning_rate": 0.0001886592740968612, "loss": 1.3877, "step": 7960 }, { "epoch": 0.3066410009624639, "grad_norm": 1.2455111742019653, "learning_rate": 0.00018864528268727887, "loss": 1.3254, "step": 7965 }, { "epoch": 0.3068334937439846, "grad_norm": 1.2766424417495728, "learning_rate": 0.00018863128317167264, "loss": 1.2663, "step": 7970 }, { "epoch": 0.3070259865255053, "grad_norm": 1.2151165008544922, "learning_rate": 0.0001886172755513227, "loss": 1.3597, "step": 7975 }, { "epoch": 0.307218479307026, "grad_norm": 1.1774568557739258, "learning_rate": 0.0001886032598275099, "loss": 1.1311, "step": 7980 }, { "epoch": 0.3074109720885467, "grad_norm": 1.43276846408844, "learning_rate": 0.00018858923600151596, "loss": 1.1123, "step": 7985 }, { "epoch": 0.3076034648700674, "grad_norm": 1.691684603691101, "learning_rate": 0.00018857520407462326, "loss": 1.4089, "step": 7990 }, { "epoch": 0.30779595765158807, "grad_norm": 1.7944872379302979, "learning_rate": 0.00018856116404811487, "loss": 1.3098, "step": 7995 }, { "epoch": 0.30798845043310874, "grad_norm": 1.2894377708435059, "learning_rate": 0.00018854711592327473, "loss": 1.2128, "step": 8000 }, { "epoch": 0.30818094321462947, "grad_norm": 2.52504301071167, "learning_rate": 0.00018853305970138737, "loss": 1.4214, "step": 8005 }, { "epoch": 0.30837343599615014, "grad_norm": 1.0757540464401245, "learning_rate": 0.0001885189953837382, "loss": 1.1836, "step": 8010 }, { "epoch": 0.30856592877767086, "grad_norm": 0.9253488183021545, "learning_rate": 0.0001885049229716133, "loss": 1.0756, "step": 8015 }, { "epoch": 0.30875842155919153, "grad_norm": 2.042194366455078, "learning_rate": 0.00018849084246629945, "loss": 1.4017, "step": 8020 }, { "epoch": 
0.3089509143407122, "grad_norm": 1.750023603439331, "learning_rate": 0.00018847675386908427, "loss": 1.2352, "step": 8025 }, { "epoch": 0.3091434071222329, "grad_norm": 1.5334408283233643, "learning_rate": 0.00018846265718125605, "loss": 1.3053, "step": 8030 }, { "epoch": 0.3093358999037536, "grad_norm": 1.262428641319275, "learning_rate": 0.00018844855240410387, "loss": 1.28, "step": 8035 }, { "epoch": 0.3095283926852743, "grad_norm": 1.1430000066757202, "learning_rate": 0.0001884344395389175, "loss": 1.2133, "step": 8040 }, { "epoch": 0.309720885466795, "grad_norm": 1.792740821838379, "learning_rate": 0.0001884203185869874, "loss": 1.3004, "step": 8045 }, { "epoch": 0.30991337824831566, "grad_norm": 1.7067112922668457, "learning_rate": 0.00018840618954960495, "loss": 1.4131, "step": 8050 }, { "epoch": 0.3101058710298364, "grad_norm": 1.5428810119628906, "learning_rate": 0.00018839205242806206, "loss": 1.2361, "step": 8055 }, { "epoch": 0.31029836381135706, "grad_norm": 1.078902244567871, "learning_rate": 0.00018837790722365152, "loss": 1.2126, "step": 8060 }, { "epoch": 0.3104908565928778, "grad_norm": 1.5348985195159912, "learning_rate": 0.00018836375393766684, "loss": 1.2591, "step": 8065 }, { "epoch": 0.31068334937439845, "grad_norm": 1.2026286125183105, "learning_rate": 0.00018834959257140222, "loss": 1.3059, "step": 8070 }, { "epoch": 0.3108758421559192, "grad_norm": 1.3559043407440186, "learning_rate": 0.0001883354231261526, "loss": 1.2006, "step": 8075 }, { "epoch": 0.31106833493743985, "grad_norm": 1.2358171939849854, "learning_rate": 0.00018832124560321374, "loss": 1.2656, "step": 8080 }, { "epoch": 0.3112608277189605, "grad_norm": 1.720358967781067, "learning_rate": 0.00018830706000388202, "loss": 1.3493, "step": 8085 }, { "epoch": 0.31145332050048125, "grad_norm": 1.4281798601150513, "learning_rate": 0.00018829286632945463, "loss": 1.1485, "step": 8090 }, { "epoch": 0.3116458132820019, "grad_norm": 1.6174485683441162, "learning_rate": 0.00018827866458122951, "loss": 1.4384, "step": 8095 }, { "epoch": 0.31183830606352264, "grad_norm": 1.0020065307617188, "learning_rate": 0.00018826445476050532, "loss": 1.0489, "step": 8100 }, { "epoch": 0.3120307988450433, "grad_norm": 1.8663140535354614, "learning_rate": 0.0001882502368685814, "loss": 1.3252, "step": 8105 }, { "epoch": 0.312223291626564, "grad_norm": 1.4404470920562744, "learning_rate": 0.00018823601090675796, "loss": 1.1452, "step": 8110 }, { "epoch": 0.3124157844080847, "grad_norm": 1.3358442783355713, "learning_rate": 0.00018822177687633583, "loss": 1.1581, "step": 8115 }, { "epoch": 0.3126082771896054, "grad_norm": 1.6938860416412354, "learning_rate": 0.00018820753477861662, "loss": 1.5378, "step": 8120 }, { "epoch": 0.3128007699711261, "grad_norm": 1.1914762258529663, "learning_rate": 0.00018819328461490268, "loss": 1.172, "step": 8125 }, { "epoch": 0.3129932627526468, "grad_norm": 2.0504634380340576, "learning_rate": 0.0001881790263864971, "loss": 1.2462, "step": 8130 }, { "epoch": 0.31318575553416744, "grad_norm": 1.548021912574768, "learning_rate": 0.00018816476009470367, "loss": 1.271, "step": 8135 }, { "epoch": 0.31337824831568817, "grad_norm": 1.2875434160232544, "learning_rate": 0.00018815048574082698, "loss": 1.2484, "step": 8140 }, { "epoch": 0.31357074109720884, "grad_norm": 0.936850905418396, "learning_rate": 0.00018813620332617227, "loss": 1.2765, "step": 8145 }, { "epoch": 0.31376323387872956, "grad_norm": 1.2823413610458374, "learning_rate": 0.00018812191285204566, "loss": 1.1859, "step": 8150 }, { "epoch": 
0.31395572666025023, "grad_norm": 2.052490472793579, "learning_rate": 0.00018810761431975386, "loss": 1.2033, "step": 8155 }, { "epoch": 0.31414821944177096, "grad_norm": 2.4439830780029297, "learning_rate": 0.00018809330773060442, "loss": 1.3678, "step": 8160 }, { "epoch": 0.31434071222329163, "grad_norm": 1.9978455305099487, "learning_rate": 0.0001880789930859055, "loss": 1.25, "step": 8165 }, { "epoch": 0.3145332050048123, "grad_norm": 1.2606321573257446, "learning_rate": 0.00018806467038696615, "loss": 1.4966, "step": 8170 }, { "epoch": 0.314725697786333, "grad_norm": 1.4588353633880615, "learning_rate": 0.00018805033963509605, "loss": 1.1843, "step": 8175 }, { "epoch": 0.3149181905678537, "grad_norm": 2.8686156272888184, "learning_rate": 0.00018803600083160574, "loss": 1.3017, "step": 8180 }, { "epoch": 0.3151106833493744, "grad_norm": 1.812328815460205, "learning_rate": 0.00018802165397780626, "loss": 1.4141, "step": 8185 }, { "epoch": 0.3153031761308951, "grad_norm": 1.4686119556427002, "learning_rate": 0.00018800729907500968, "loss": 1.4522, "step": 8190 }, { "epoch": 0.31549566891241576, "grad_norm": 1.766160249710083, "learning_rate": 0.00018799293612452856, "loss": 1.1501, "step": 8195 }, { "epoch": 0.3156881616939365, "grad_norm": 1.5843030214309692, "learning_rate": 0.00018797856512767634, "loss": 1.2997, "step": 8200 }, { "epoch": 0.31588065447545716, "grad_norm": 1.2028679847717285, "learning_rate": 0.00018796418608576712, "loss": 1.108, "step": 8205 }, { "epoch": 0.3160731472569779, "grad_norm": 1.4626559019088745, "learning_rate": 0.0001879497990001158, "loss": 1.116, "step": 8210 }, { "epoch": 0.31626564003849855, "grad_norm": 1.956745982170105, "learning_rate": 0.000187935403872038, "loss": 1.2741, "step": 8215 }, { "epoch": 0.3164581328200192, "grad_norm": 1.1932622194290161, "learning_rate": 0.00018792100070285002, "loss": 1.1966, "step": 8220 }, { "epoch": 0.31665062560153995, "grad_norm": 2.212184429168701, "learning_rate": 0.00018790658949386892, "loss": 1.1485, "step": 8225 }, { "epoch": 0.3168431183830606, "grad_norm": 0.867708146572113, "learning_rate": 0.00018789217024641256, "loss": 1.2457, "step": 8230 }, { "epoch": 0.31703561116458134, "grad_norm": 2.4929304122924805, "learning_rate": 0.0001878777429617995, "loss": 1.1819, "step": 8235 }, { "epoch": 0.317228103946102, "grad_norm": 1.4232670068740845, "learning_rate": 0.00018786330764134897, "loss": 1.2189, "step": 8240 }, { "epoch": 0.31742059672762274, "grad_norm": 1.8306447267532349, "learning_rate": 0.00018784886428638094, "loss": 1.2939, "step": 8245 }, { "epoch": 0.3176130895091434, "grad_norm": 0.9103988409042358, "learning_rate": 0.00018783441289821627, "loss": 1.2982, "step": 8250 }, { "epoch": 0.3178055822906641, "grad_norm": 1.08035409450531, "learning_rate": 0.0001878199534781764, "loss": 1.2777, "step": 8255 }, { "epoch": 0.3179980750721848, "grad_norm": 1.1342133283615112, "learning_rate": 0.0001878054860275835, "loss": 1.1476, "step": 8260 }, { "epoch": 0.3181905678537055, "grad_norm": 1.7727190256118774, "learning_rate": 0.0001877910105477606, "loss": 1.1887, "step": 8265 }, { "epoch": 0.3183830606352262, "grad_norm": 2.5168001651763916, "learning_rate": 0.0001877765270400313, "loss": 1.0494, "step": 8270 }, { "epoch": 0.31857555341674687, "grad_norm": 1.2397305965423584, "learning_rate": 0.0001877620355057201, "loss": 1.321, "step": 8275 }, { "epoch": 0.31876804619826754, "grad_norm": 1.3002814054489136, "learning_rate": 0.0001877475359461521, "loss": 1.1543, "step": 8280 }, { "epoch": 
0.31896053897978827, "grad_norm": 1.5683960914611816, "learning_rate": 0.00018773302836265322, "loss": 1.1987, "step": 8285 }, { "epoch": 0.31915303176130894, "grad_norm": 1.6934245824813843, "learning_rate": 0.00018771851275655008, "loss": 1.2946, "step": 8290 }, { "epoch": 0.31934552454282966, "grad_norm": 1.4387637376785278, "learning_rate": 0.00018770398912917004, "loss": 1.2151, "step": 8295 }, { "epoch": 0.31953801732435033, "grad_norm": 1.3155730962753296, "learning_rate": 0.00018768945748184117, "loss": 1.1692, "step": 8300 }, { "epoch": 0.31973051010587106, "grad_norm": 1.039670467376709, "learning_rate": 0.0001876749178158923, "loss": 1.2783, "step": 8305 }, { "epoch": 0.3199230028873917, "grad_norm": 1.1988794803619385, "learning_rate": 0.00018766037013265302, "loss": 1.1775, "step": 8310 }, { "epoch": 0.3201154956689124, "grad_norm": 1.39814031124115, "learning_rate": 0.00018764581443345355, "loss": 1.2256, "step": 8315 }, { "epoch": 0.3203079884504331, "grad_norm": 1.7934690713882446, "learning_rate": 0.00018763125071962495, "loss": 1.3505, "step": 8320 }, { "epoch": 0.3205004812319538, "grad_norm": 1.5974578857421875, "learning_rate": 0.00018761667899249899, "loss": 1.1725, "step": 8325 }, { "epoch": 0.3206929740134745, "grad_norm": 0.9480400085449219, "learning_rate": 0.00018760209925340818, "loss": 1.2059, "step": 8330 }, { "epoch": 0.3208854667949952, "grad_norm": 1.9734187126159668, "learning_rate": 0.00018758751150368564, "loss": 1.2116, "step": 8335 }, { "epoch": 0.32107795957651586, "grad_norm": 0.9984979033470154, "learning_rate": 0.00018757291574466543, "loss": 1.1347, "step": 8340 }, { "epoch": 0.3212704523580366, "grad_norm": 0.96681147813797, "learning_rate": 0.00018755831197768215, "loss": 1.2824, "step": 8345 }, { "epoch": 0.32146294513955725, "grad_norm": 1.5365724563598633, "learning_rate": 0.00018754370020407127, "loss": 1.3718, "step": 8350 }, { "epoch": 0.321655437921078, "grad_norm": 1.6202696561813354, "learning_rate": 0.00018752908042516897, "loss": 1.3233, "step": 8355 }, { "epoch": 0.32184793070259865, "grad_norm": 2.0272514820098877, "learning_rate": 0.00018751445264231207, "loss": 1.3406, "step": 8360 }, { "epoch": 0.3220404234841193, "grad_norm": 1.1724604368209839, "learning_rate": 0.0001874998168568382, "loss": 1.2649, "step": 8365 }, { "epoch": 0.32223291626564005, "grad_norm": 1.0908805131912231, "learning_rate": 0.00018748517307008573, "loss": 1.2924, "step": 8370 }, { "epoch": 0.3224254090471607, "grad_norm": 1.0658169984817505, "learning_rate": 0.0001874705212833937, "loss": 1.1266, "step": 8375 }, { "epoch": 0.32261790182868144, "grad_norm": 1.2267755270004272, "learning_rate": 0.00018745586149810194, "loss": 1.172, "step": 8380 }, { "epoch": 0.3228103946102021, "grad_norm": 0.9808927178382874, "learning_rate": 0.000187441193715551, "loss": 1.1241, "step": 8385 }, { "epoch": 0.32300288739172284, "grad_norm": 1.2251529693603516, "learning_rate": 0.00018742651793708212, "loss": 1.1649, "step": 8390 }, { "epoch": 0.3231953801732435, "grad_norm": 1.7396290302276611, "learning_rate": 0.00018741183416403734, "loss": 1.173, "step": 8395 }, { "epoch": 0.3233878729547642, "grad_norm": 1.1498087644577026, "learning_rate": 0.00018739714239775936, "loss": 1.266, "step": 8400 }, { "epoch": 0.3235803657362849, "grad_norm": 0.9458256959915161, "learning_rate": 0.0001873824426395917, "loss": 1.1651, "step": 8405 }, { "epoch": 0.3237728585178056, "grad_norm": 1.701441764831543, "learning_rate": 0.00018736773489087845, "loss": 1.4314, "step": 8410 }, { 
"epoch": 0.3239653512993263, "grad_norm": 1.3168058395385742, "learning_rate": 0.00018735301915296466, "loss": 1.3837, "step": 8415 }, { "epoch": 0.32415784408084697, "grad_norm": 1.2277673482894897, "learning_rate": 0.0001873382954271959, "loss": 1.2433, "step": 8420 }, { "epoch": 0.32435033686236764, "grad_norm": 1.3443776369094849, "learning_rate": 0.00018732356371491858, "loss": 1.1514, "step": 8425 }, { "epoch": 0.32454282964388836, "grad_norm": 1.3421462774276733, "learning_rate": 0.00018730882401747984, "loss": 1.2908, "step": 8430 }, { "epoch": 0.32473532242540903, "grad_norm": 2.7043700218200684, "learning_rate": 0.0001872940763362275, "loss": 1.426, "step": 8435 }, { "epoch": 0.32492781520692976, "grad_norm": 1.2363086938858032, "learning_rate": 0.00018727932067251016, "loss": 1.2172, "step": 8440 }, { "epoch": 0.32512030798845043, "grad_norm": 1.7551484107971191, "learning_rate": 0.00018726455702767713, "loss": 1.2379, "step": 8445 }, { "epoch": 0.3253128007699711, "grad_norm": 1.2935433387756348, "learning_rate": 0.00018724978540307844, "loss": 1.2109, "step": 8450 }, { "epoch": 0.3255052935514918, "grad_norm": 1.723219871520996, "learning_rate": 0.00018723500580006483, "loss": 1.3996, "step": 8455 }, { "epoch": 0.3256977863330125, "grad_norm": 1.1455639600753784, "learning_rate": 0.0001872202182199878, "loss": 1.1223, "step": 8460 }, { "epoch": 0.3258902791145332, "grad_norm": 1.194926381111145, "learning_rate": 0.0001872054226641996, "loss": 1.3301, "step": 8465 }, { "epoch": 0.3260827718960539, "grad_norm": 1.9672341346740723, "learning_rate": 0.00018719061913405322, "loss": 1.3884, "step": 8470 }, { "epoch": 0.3262752646775746, "grad_norm": 1.5594457387924194, "learning_rate": 0.0001871758076309023, "loss": 1.1862, "step": 8475 }, { "epoch": 0.3264677574590953, "grad_norm": 1.141787052154541, "learning_rate": 0.0001871609881561012, "loss": 1.2375, "step": 8480 }, { "epoch": 0.32666025024061596, "grad_norm": 1.1914411783218384, "learning_rate": 0.0001871461607110052, "loss": 1.397, "step": 8485 }, { "epoch": 0.3268527430221367, "grad_norm": 1.2841687202453613, "learning_rate": 0.00018713132529697007, "loss": 1.3052, "step": 8490 }, { "epoch": 0.32704523580365735, "grad_norm": 2.2977144718170166, "learning_rate": 0.0001871164819153524, "loss": 1.2819, "step": 8495 }, { "epoch": 0.3272377285851781, "grad_norm": 1.62446928024292, "learning_rate": 0.00018710163056750957, "loss": 1.1739, "step": 8500 }, { "epoch": 0.32743022136669875, "grad_norm": 1.471348524093628, "learning_rate": 0.00018708677125479963, "loss": 1.0684, "step": 8505 }, { "epoch": 0.3276227141482194, "grad_norm": 1.0703455209732056, "learning_rate": 0.00018707190397858133, "loss": 1.0832, "step": 8510 }, { "epoch": 0.32781520692974014, "grad_norm": 1.3942466974258423, "learning_rate": 0.00018705702874021425, "loss": 1.1855, "step": 8515 }, { "epoch": 0.3280076997112608, "grad_norm": 1.1790398359298706, "learning_rate": 0.00018704214554105856, "loss": 1.1459, "step": 8520 }, { "epoch": 0.32820019249278154, "grad_norm": 1.2982394695281982, "learning_rate": 0.00018702725438247527, "loss": 1.2642, "step": 8525 }, { "epoch": 0.3283926852743022, "grad_norm": 1.4757968187332153, "learning_rate": 0.00018701235526582608, "loss": 1.291, "step": 8530 }, { "epoch": 0.3285851780558229, "grad_norm": 1.6837409734725952, "learning_rate": 0.0001870004302436148, "loss": 1.3796, "step": 8535 }, { "epoch": 0.3287776708373436, "grad_norm": 1.1914480924606323, "learning_rate": 0.00018698551680588075, "loss": 1.2608, "step": 8540 }, { 
"epoch": 0.3289701636188643, "grad_norm": 1.2581427097320557, "learning_rate": 0.00018697059541389742, "loss": 1.3011, "step": 8545 }, { "epoch": 0.329162656400385, "grad_norm": 1.5642743110656738, "learning_rate": 0.0001869556660690293, "loss": 1.2273, "step": 8550 }, { "epoch": 0.32935514918190567, "grad_norm": 1.621721863746643, "learning_rate": 0.0001869407287726415, "loss": 1.1648, "step": 8555 }, { "epoch": 0.3295476419634264, "grad_norm": 0.9840386509895325, "learning_rate": 0.00018692578352610002, "loss": 1.2741, "step": 8560 }, { "epoch": 0.32974013474494707, "grad_norm": 1.5852268934249878, "learning_rate": 0.00018691083033077144, "loss": 1.2913, "step": 8565 }, { "epoch": 0.32993262752646774, "grad_norm": 1.280247688293457, "learning_rate": 0.00018689586918802314, "loss": 1.172, "step": 8570 }, { "epoch": 0.33012512030798846, "grad_norm": 1.3940321207046509, "learning_rate": 0.0001868809000992233, "loss": 1.175, "step": 8575 }, { "epoch": 0.33031761308950913, "grad_norm": 1.0753341913223267, "learning_rate": 0.00018686592306574063, "loss": 1.3922, "step": 8580 }, { "epoch": 0.33051010587102986, "grad_norm": 1.5959515571594238, "learning_rate": 0.00018685093808894476, "loss": 1.2741, "step": 8585 }, { "epoch": 0.33070259865255053, "grad_norm": 1.1567896604537964, "learning_rate": 0.00018683594517020593, "loss": 1.1325, "step": 8590 }, { "epoch": 0.3308950914340712, "grad_norm": 1.202486276626587, "learning_rate": 0.0001868209443108951, "loss": 1.1915, "step": 8595 }, { "epoch": 0.3310875842155919, "grad_norm": 1.6866669654846191, "learning_rate": 0.00018680593551238412, "loss": 1.2806, "step": 8600 }, { "epoch": 0.3312800769971126, "grad_norm": 1.1932209730148315, "learning_rate": 0.00018679091877604536, "loss": 1.2254, "step": 8605 }, { "epoch": 0.3314725697786333, "grad_norm": 1.5348761081695557, "learning_rate": 0.000186775894103252, "loss": 1.1519, "step": 8610 }, { "epoch": 0.331665062560154, "grad_norm": 1.908500075340271, "learning_rate": 0.00018676086149537792, "loss": 1.3105, "step": 8615 }, { "epoch": 0.3318575553416747, "grad_norm": 2.0427961349487305, "learning_rate": 0.00018674582095379788, "loss": 1.1415, "step": 8620 }, { "epoch": 0.3320500481231954, "grad_norm": 1.0964915752410889, "learning_rate": 0.00018673077247988707, "loss": 1.2041, "step": 8625 }, { "epoch": 0.33224254090471605, "grad_norm": 1.2229498624801636, "learning_rate": 0.00018671571607502168, "loss": 1.2975, "step": 8630 }, { "epoch": 0.3324350336862368, "grad_norm": 1.3551470041275024, "learning_rate": 0.00018670065174057854, "loss": 1.1592, "step": 8635 }, { "epoch": 0.33262752646775745, "grad_norm": 0.8810299634933472, "learning_rate": 0.0001866855794779351, "loss": 1.1414, "step": 8640 }, { "epoch": 0.3328200192492782, "grad_norm": 1.5907199382781982, "learning_rate": 0.00018667049928846967, "loss": 1.2191, "step": 8645 }, { "epoch": 0.33301251203079885, "grad_norm": 2.042478561401367, "learning_rate": 0.0001866554111735612, "loss": 1.1619, "step": 8650 }, { "epoch": 0.3332050048123195, "grad_norm": 1.6686564683914185, "learning_rate": 0.00018664031513458942, "loss": 1.2534, "step": 8655 }, { "epoch": 0.33339749759384024, "grad_norm": 1.7643070220947266, "learning_rate": 0.0001866252111729348, "loss": 1.2631, "step": 8660 }, { "epoch": 0.3335899903753609, "grad_norm": 1.4883722066879272, "learning_rate": 0.0001866100992899784, "loss": 1.1786, "step": 8665 }, { "epoch": 0.33378248315688164, "grad_norm": 0.9850770235061646, "learning_rate": 0.00018659497948710218, "loss": 1.4181, "step": 8670 }, { 
"epoch": 0.3339749759384023, "grad_norm": 0.9056932926177979, "learning_rate": 0.00018657985176568875, "loss": 1.0365, "step": 8675 }, { "epoch": 0.334167468719923, "grad_norm": 1.9456449747085571, "learning_rate": 0.00018656471612712137, "loss": 1.227, "step": 8680 }, { "epoch": 0.3343599615014437, "grad_norm": 1.289870262145996, "learning_rate": 0.00018654957257278415, "loss": 1.32, "step": 8685 }, { "epoch": 0.3345524542829644, "grad_norm": 1.048143744468689, "learning_rate": 0.00018653442110406189, "loss": 1.2123, "step": 8690 }, { "epoch": 0.3347449470644851, "grad_norm": 1.1696733236312866, "learning_rate": 0.00018651926172234004, "loss": 1.0226, "step": 8695 }, { "epoch": 0.33493743984600577, "grad_norm": 1.4806257486343384, "learning_rate": 0.00018650409442900486, "loss": 1.1715, "step": 8700 }, { "epoch": 0.3351299326275265, "grad_norm": 1.525719404220581, "learning_rate": 0.00018648891922544325, "loss": 1.2037, "step": 8705 }, { "epoch": 0.33532242540904716, "grad_norm": 1.3378442525863647, "learning_rate": 0.00018647373611304293, "loss": 1.2188, "step": 8710 }, { "epoch": 0.33551491819056783, "grad_norm": 0.870988130569458, "learning_rate": 0.00018645854509319226, "loss": 1.2153, "step": 8715 }, { "epoch": 0.33570741097208856, "grad_norm": 1.5496007204055786, "learning_rate": 0.00018644334616728042, "loss": 1.1974, "step": 8720 }, { "epoch": 0.33589990375360923, "grad_norm": 1.0248416662216187, "learning_rate": 0.00018642813933669717, "loss": 1.2845, "step": 8725 }, { "epoch": 0.33609239653512996, "grad_norm": 1.9984816312789917, "learning_rate": 0.00018641292460283313, "loss": 1.3144, "step": 8730 }, { "epoch": 0.3362848893166506, "grad_norm": 1.3114112615585327, "learning_rate": 0.00018639770196707955, "loss": 1.209, "step": 8735 }, { "epoch": 0.3364773820981713, "grad_norm": 1.1683485507965088, "learning_rate": 0.00018638247143082848, "loss": 1.2688, "step": 8740 }, { "epoch": 0.336669874879692, "grad_norm": 1.507900595664978, "learning_rate": 0.0001863672329954726, "loss": 1.1325, "step": 8745 }, { "epoch": 0.3368623676612127, "grad_norm": 1.3393852710723877, "learning_rate": 0.00018635198666240542, "loss": 1.1573, "step": 8750 }, { "epoch": 0.3370548604427334, "grad_norm": 1.0203709602355957, "learning_rate": 0.00018633673243302108, "loss": 1.2922, "step": 8755 }, { "epoch": 0.3372473532242541, "grad_norm": 0.8483877778053284, "learning_rate": 0.00018632147030871448, "loss": 1.2252, "step": 8760 }, { "epoch": 0.33743984600577476, "grad_norm": 0.983748197555542, "learning_rate": 0.00018630620029088125, "loss": 1.2027, "step": 8765 }, { "epoch": 0.3376323387872955, "grad_norm": 1.2489101886749268, "learning_rate": 0.00018629092238091775, "loss": 1.1962, "step": 8770 }, { "epoch": 0.33782483156881615, "grad_norm": 1.4553676843643188, "learning_rate": 0.000186275636580221, "loss": 1.3698, "step": 8775 }, { "epoch": 0.3380173243503369, "grad_norm": 0.9494854807853699, "learning_rate": 0.0001862603428901888, "loss": 1.25, "step": 8780 }, { "epoch": 0.33820981713185755, "grad_norm": 0.8667522072792053, "learning_rate": 0.00018624504131221968, "loss": 1.222, "step": 8785 }, { "epoch": 0.3384023099133783, "grad_norm": 1.4215630292892456, "learning_rate": 0.00018622973184771285, "loss": 1.2592, "step": 8790 }, { "epoch": 0.33859480269489894, "grad_norm": 0.9913888573646545, "learning_rate": 0.00018621441449806828, "loss": 1.2904, "step": 8795 }, { "epoch": 0.3387872954764196, "grad_norm": 0.9612273573875427, "learning_rate": 0.00018619908926468664, "loss": 1.24, "step": 8800 }, { 
"epoch": 0.33897978825794034, "grad_norm": 1.656568169593811, "learning_rate": 0.00018618375614896926, "loss": 1.1763, "step": 8805 }, { "epoch": 0.339172281039461, "grad_norm": 1.4496088027954102, "learning_rate": 0.0001861684151523183, "loss": 1.2045, "step": 8810 }, { "epoch": 0.33936477382098174, "grad_norm": 1.3886058330535889, "learning_rate": 0.0001861530662761366, "loss": 1.3111, "step": 8815 }, { "epoch": 0.3395572666025024, "grad_norm": 1.644887089729309, "learning_rate": 0.0001861377095218277, "loss": 1.3172, "step": 8820 }, { "epoch": 0.3397497593840231, "grad_norm": 1.1925910711288452, "learning_rate": 0.00018612234489079587, "loss": 1.3268, "step": 8825 }, { "epoch": 0.3399422521655438, "grad_norm": 1.1367309093475342, "learning_rate": 0.0001861069723844461, "loss": 1.1209, "step": 8830 }, { "epoch": 0.34013474494706447, "grad_norm": 1.0649480819702148, "learning_rate": 0.00018609159200418414, "loss": 1.1514, "step": 8835 }, { "epoch": 0.3403272377285852, "grad_norm": 1.1887884140014648, "learning_rate": 0.00018607620375141637, "loss": 1.1026, "step": 8840 }, { "epoch": 0.34051973051010587, "grad_norm": 1.9125694036483765, "learning_rate": 0.00018606080762754995, "loss": 1.4718, "step": 8845 }, { "epoch": 0.34071222329162654, "grad_norm": 1.1742594242095947, "learning_rate": 0.00018604540363399282, "loss": 1.3206, "step": 8850 }, { "epoch": 0.34090471607314726, "grad_norm": 1.504146695137024, "learning_rate": 0.0001860299917721535, "loss": 1.1639, "step": 8855 }, { "epoch": 0.34109720885466793, "grad_norm": 0.8869237899780273, "learning_rate": 0.00018601457204344131, "loss": 1.2674, "step": 8860 }, { "epoch": 0.34128970163618866, "grad_norm": 0.8492304682731628, "learning_rate": 0.00018599914444926636, "loss": 1.2732, "step": 8865 }, { "epoch": 0.34148219441770933, "grad_norm": 1.1681571006774902, "learning_rate": 0.00018598370899103932, "loss": 1.2995, "step": 8870 }, { "epoch": 0.34167468719923005, "grad_norm": 1.6912837028503418, "learning_rate": 0.00018596826567017166, "loss": 1.3217, "step": 8875 }, { "epoch": 0.3418671799807507, "grad_norm": 1.0427602529525757, "learning_rate": 0.0001859528144880756, "loss": 1.05, "step": 8880 }, { "epoch": 0.3420596727622714, "grad_norm": 1.9644991159439087, "learning_rate": 0.00018593735544616404, "loss": 1.1087, "step": 8885 }, { "epoch": 0.3422521655437921, "grad_norm": 1.966264247894287, "learning_rate": 0.0001859218885458506, "loss": 1.2221, "step": 8890 }, { "epoch": 0.3424446583253128, "grad_norm": 1.9770557880401611, "learning_rate": 0.00018590641378854965, "loss": 1.2489, "step": 8895 }, { "epoch": 0.3426371511068335, "grad_norm": 1.4175180196762085, "learning_rate": 0.00018589093117567625, "loss": 1.1292, "step": 8900 }, { "epoch": 0.3428296438883542, "grad_norm": 1.066177487373352, "learning_rate": 0.00018587544070864612, "loss": 1.1182, "step": 8905 }, { "epoch": 0.34302213666987486, "grad_norm": 2.6207172870635986, "learning_rate": 0.00018585994238887586, "loss": 1.1, "step": 8910 }, { "epoch": 0.3432146294513956, "grad_norm": 1.6905888319015503, "learning_rate": 0.0001858444362177826, "loss": 1.3135, "step": 8915 }, { "epoch": 0.34340712223291625, "grad_norm": 1.117883324623108, "learning_rate": 0.00018582892219678435, "loss": 1.3394, "step": 8920 }, { "epoch": 0.343599615014437, "grad_norm": 1.549805760383606, "learning_rate": 0.00018581340032729972, "loss": 1.1957, "step": 8925 }, { "epoch": 0.34379210779595765, "grad_norm": 1.165260672569275, "learning_rate": 0.00018579787061074807, "loss": 1.2406, "step": 8930 }, { 
"epoch": 0.34398460057747837, "grad_norm": 1.1872533559799194, "learning_rate": 0.00018578233304854952, "loss": 1.1831, "step": 8935 }, { "epoch": 0.34417709335899904, "grad_norm": 0.8727648854255676, "learning_rate": 0.00018576678764212489, "loss": 1.2645, "step": 8940 }, { "epoch": 0.3443695861405197, "grad_norm": 1.1179304122924805, "learning_rate": 0.00018575123439289567, "loss": 1.297, "step": 8945 }, { "epoch": 0.34456207892204044, "grad_norm": 1.9064927101135254, "learning_rate": 0.0001857356733022841, "loss": 1.3917, "step": 8950 }, { "epoch": 0.3447545717035611, "grad_norm": 2.100154399871826, "learning_rate": 0.00018572010437171315, "loss": 1.1723, "step": 8955 }, { "epoch": 0.34494706448508183, "grad_norm": 1.0105838775634766, "learning_rate": 0.00018570452760260654, "loss": 1.0851, "step": 8960 }, { "epoch": 0.3451395572666025, "grad_norm": 1.760038137435913, "learning_rate": 0.0001856889429963886, "loss": 1.0612, "step": 8965 }, { "epoch": 0.3453320500481232, "grad_norm": 1.5740501880645752, "learning_rate": 0.00018567335055448444, "loss": 1.117, "step": 8970 }, { "epoch": 0.3455245428296439, "grad_norm": 1.4148597717285156, "learning_rate": 0.00018565775027831993, "loss": 1.2003, "step": 8975 }, { "epoch": 0.34571703561116457, "grad_norm": 1.2243534326553345, "learning_rate": 0.00018564214216932159, "loss": 1.2106, "step": 8980 }, { "epoch": 0.3459095283926853, "grad_norm": 1.3532603979110718, "learning_rate": 0.00018562652622891666, "loss": 1.1703, "step": 8985 }, { "epoch": 0.34610202117420596, "grad_norm": 1.6701220273971558, "learning_rate": 0.00018561090245853315, "loss": 1.2409, "step": 8990 }, { "epoch": 0.34629451395572663, "grad_norm": 1.6342322826385498, "learning_rate": 0.00018559527085959968, "loss": 1.2981, "step": 8995 }, { "epoch": 0.34648700673724736, "grad_norm": 2.4354701042175293, "learning_rate": 0.00018557963143354576, "loss": 1.1021, "step": 9000 }, { "epoch": 0.34667949951876803, "grad_norm": 1.5688186883926392, "learning_rate": 0.00018556398418180146, "loss": 1.2649, "step": 9005 }, { "epoch": 0.34687199230028876, "grad_norm": 2.2158894538879395, "learning_rate": 0.0001855483291057976, "loss": 1.2335, "step": 9010 }, { "epoch": 0.3470644850818094, "grad_norm": 1.7294437885284424, "learning_rate": 0.00018553266620696573, "loss": 1.3235, "step": 9015 }, { "epoch": 0.34725697786333015, "grad_norm": 1.1023756265640259, "learning_rate": 0.00018551699548673814, "loss": 1.3515, "step": 9020 }, { "epoch": 0.3474494706448508, "grad_norm": 1.4505863189697266, "learning_rate": 0.00018550131694654784, "loss": 1.3773, "step": 9025 }, { "epoch": 0.3476419634263715, "grad_norm": 2.221957206726074, "learning_rate": 0.00018548563058782847, "loss": 1.0896, "step": 9030 }, { "epoch": 0.3478344562078922, "grad_norm": 0.917010486125946, "learning_rate": 0.0001854699364120145, "loss": 1.1569, "step": 9035 }, { "epoch": 0.3480269489894129, "grad_norm": 1.4631186723709106, "learning_rate": 0.00018545423442054105, "loss": 1.2169, "step": 9040 }, { "epoch": 0.3482194417709336, "grad_norm": 1.0917268991470337, "learning_rate": 0.0001854385246148439, "loss": 1.2425, "step": 9045 }, { "epoch": 0.3484119345524543, "grad_norm": 1.5985426902770996, "learning_rate": 0.00018542280699635968, "loss": 1.0944, "step": 9050 }, { "epoch": 0.34860442733397495, "grad_norm": 1.5402495861053467, "learning_rate": 0.0001854070815665256, "loss": 1.1497, "step": 9055 }, { "epoch": 0.3487969201154957, "grad_norm": 1.211295485496521, "learning_rate": 0.00018539134832677972, "loss": 1.0403, "step": 
9060 }, { "epoch": 0.34898941289701635, "grad_norm": 1.0569374561309814, "learning_rate": 0.00018537560727856068, "loss": 1.2886, "step": 9065 }, { "epoch": 0.3491819056785371, "grad_norm": 1.550212025642395, "learning_rate": 0.00018535985842330793, "loss": 1.2654, "step": 9070 }, { "epoch": 0.34937439846005774, "grad_norm": 1.7941083908081055, "learning_rate": 0.00018534410176246154, "loss": 1.2757, "step": 9075 }, { "epoch": 0.3495668912415784, "grad_norm": 0.9004856944084167, "learning_rate": 0.00018532833729746243, "loss": 1.2045, "step": 9080 }, { "epoch": 0.34975938402309914, "grad_norm": 0.9916037321090698, "learning_rate": 0.00018531256502975216, "loss": 1.1788, "step": 9085 }, { "epoch": 0.3499518768046198, "grad_norm": 1.0524908304214478, "learning_rate": 0.00018529678496077292, "loss": 1.3298, "step": 9090 }, { "epoch": 0.35014436958614054, "grad_norm": 2.7244019508361816, "learning_rate": 0.00018528099709196774, "loss": 1.3274, "step": 9095 }, { "epoch": 0.3503368623676612, "grad_norm": 1.4286680221557617, "learning_rate": 0.0001852652014247803, "loss": 1.193, "step": 9100 }, { "epoch": 0.35052935514918193, "grad_norm": 1.0943810939788818, "learning_rate": 0.00018524939796065503, "loss": 1.2953, "step": 9105 }, { "epoch": 0.3507218479307026, "grad_norm": 1.1513092517852783, "learning_rate": 0.00018523358670103704, "loss": 1.3436, "step": 9110 }, { "epoch": 0.35091434071222327, "grad_norm": 2.142829656600952, "learning_rate": 0.00018521776764737218, "loss": 1.2998, "step": 9115 }, { "epoch": 0.351106833493744, "grad_norm": 0.9734616875648499, "learning_rate": 0.00018520194080110699, "loss": 1.2794, "step": 9120 }, { "epoch": 0.35129932627526467, "grad_norm": 1.0793628692626953, "learning_rate": 0.00018518610616368868, "loss": 1.2574, "step": 9125 }, { "epoch": 0.3514918190567854, "grad_norm": 2.409484386444092, "learning_rate": 0.00018517026373656532, "loss": 1.1601, "step": 9130 }, { "epoch": 0.35168431183830606, "grad_norm": 1.1166318655014038, "learning_rate": 0.0001851544135211855, "loss": 1.2705, "step": 9135 }, { "epoch": 0.35187680461982673, "grad_norm": 1.183131217956543, "learning_rate": 0.0001851385555189987, "loss": 1.132, "step": 9140 }, { "epoch": 0.35206929740134746, "grad_norm": 1.3792176246643066, "learning_rate": 0.00018512268973145497, "loss": 1.1271, "step": 9145 }, { "epoch": 0.35226179018286813, "grad_norm": 1.3978809118270874, "learning_rate": 0.00018510681616000513, "loss": 1.3828, "step": 9150 }, { "epoch": 0.35245428296438885, "grad_norm": 1.0242118835449219, "learning_rate": 0.00018509093480610078, "loss": 1.1982, "step": 9155 }, { "epoch": 0.3526467757459095, "grad_norm": 1.326621174812317, "learning_rate": 0.00018507504567119408, "loss": 1.0175, "step": 9160 }, { "epoch": 0.3528392685274302, "grad_norm": 1.1905460357666016, "learning_rate": 0.00018505914875673805, "loss": 1.3367, "step": 9165 }, { "epoch": 0.3530317613089509, "grad_norm": 1.5423171520233154, "learning_rate": 0.0001850432440641863, "loss": 1.1721, "step": 9170 }, { "epoch": 0.3532242540904716, "grad_norm": 1.0577900409698486, "learning_rate": 0.00018502733159499326, "loss": 1.2173, "step": 9175 }, { "epoch": 0.3534167468719923, "grad_norm": 0.8053417205810547, "learning_rate": 0.000185011411350614, "loss": 1.1492, "step": 9180 }, { "epoch": 0.353609239653513, "grad_norm": 1.076053261756897, "learning_rate": 0.0001849954833325043, "loss": 1.2117, "step": 9185 }, { "epoch": 0.3538017324350337, "grad_norm": 1.206359624862671, "learning_rate": 0.0001849795475421207, "loss": 1.1659, "step": 
9190 }, { "epoch": 0.3539942252165544, "grad_norm": 1.4652369022369385, "learning_rate": 0.00018496360398092046, "loss": 1.2605, "step": 9195 }, { "epoch": 0.35418671799807505, "grad_norm": 1.158055067062378, "learning_rate": 0.00018494765265036144, "loss": 1.414, "step": 9200 }, { "epoch": 0.3543792107795958, "grad_norm": 2.4634461402893066, "learning_rate": 0.0001849316935519023, "loss": 1.1982, "step": 9205 }, { "epoch": 0.35457170356111645, "grad_norm": 1.875139594078064, "learning_rate": 0.00018491572668700242, "loss": 1.4133, "step": 9210 }, { "epoch": 0.3547641963426372, "grad_norm": 1.0054875612258911, "learning_rate": 0.00018489975205712185, "loss": 1.2294, "step": 9215 }, { "epoch": 0.35495668912415784, "grad_norm": 2.2620842456817627, "learning_rate": 0.00018488376966372134, "loss": 1.2672, "step": 9220 }, { "epoch": 0.3551491819056785, "grad_norm": 1.584251880645752, "learning_rate": 0.00018486777950826243, "loss": 1.4366, "step": 9225 }, { "epoch": 0.35534167468719924, "grad_norm": 1.6498923301696777, "learning_rate": 0.00018485178159220725, "loss": 1.3502, "step": 9230 }, { "epoch": 0.3555341674687199, "grad_norm": 1.6700108051300049, "learning_rate": 0.00018483577591701876, "loss": 1.2462, "step": 9235 }, { "epoch": 0.35572666025024063, "grad_norm": 1.6976680755615234, "learning_rate": 0.00018481976248416052, "loss": 1.4637, "step": 9240 }, { "epoch": 0.3559191530317613, "grad_norm": 0.9686551094055176, "learning_rate": 0.0001848037412950969, "loss": 1.1902, "step": 9245 }, { "epoch": 0.35611164581328203, "grad_norm": 1.2102336883544922, "learning_rate": 0.00018478771235129292, "loss": 1.586, "step": 9250 }, { "epoch": 0.3563041385948027, "grad_norm": 1.7220674753189087, "learning_rate": 0.0001847716756542143, "loss": 1.2324, "step": 9255 }, { "epoch": 0.35649663137632337, "grad_norm": 1.7433216571807861, "learning_rate": 0.0001847556312053275, "loss": 1.4454, "step": 9260 }, { "epoch": 0.3566891241578441, "grad_norm": 0.9930455088615417, "learning_rate": 0.0001847395790060997, "loss": 1.1601, "step": 9265 }, { "epoch": 0.35688161693936477, "grad_norm": 1.1169023513793945, "learning_rate": 0.00018472351905799873, "loss": 1.2534, "step": 9270 }, { "epoch": 0.3570741097208855, "grad_norm": 1.238748550415039, "learning_rate": 0.00018470745136249316, "loss": 1.2174, "step": 9275 }, { "epoch": 0.35726660250240616, "grad_norm": 2.130223035812378, "learning_rate": 0.00018469137592105235, "loss": 1.3975, "step": 9280 }, { "epoch": 0.35745909528392683, "grad_norm": 1.4341787099838257, "learning_rate": 0.0001846752927351462, "loss": 1.1725, "step": 9285 }, { "epoch": 0.35765158806544756, "grad_norm": 1.948145866394043, "learning_rate": 0.00018465920180624548, "loss": 1.2741, "step": 9290 }, { "epoch": 0.3578440808469682, "grad_norm": 1.0314382314682007, "learning_rate": 0.00018464310313582157, "loss": 1.0998, "step": 9295 }, { "epoch": 0.35803657362848895, "grad_norm": 1.0461472272872925, "learning_rate": 0.0001846269967253466, "loss": 1.1953, "step": 9300 }, { "epoch": 0.3582290664100096, "grad_norm": 1.781084656715393, "learning_rate": 0.00018461088257629334, "loss": 1.3629, "step": 9305 }, { "epoch": 0.3584215591915303, "grad_norm": 1.9082306623458862, "learning_rate": 0.00018459476069013537, "loss": 1.2675, "step": 9310 }, { "epoch": 0.358614051973051, "grad_norm": 1.803348422050476, "learning_rate": 0.00018457863106834693, "loss": 1.2303, "step": 9315 }, { "epoch": 0.3588065447545717, "grad_norm": 1.5346139669418335, "learning_rate": 0.000184562493712403, "loss": 1.3354, "step": 
9320 }, { "epoch": 0.3589990375360924, "grad_norm": 1.3731290102005005, "learning_rate": 0.00018454634862377916, "loss": 1.4874, "step": 9325 }, { "epoch": 0.3591915303176131, "grad_norm": 1.186759352684021, "learning_rate": 0.0001845301958039518, "loss": 1.29, "step": 9330 }, { "epoch": 0.3593840230991338, "grad_norm": 3.729174852371216, "learning_rate": 0.00018451403525439802, "loss": 1.2589, "step": 9335 }, { "epoch": 0.3595765158806545, "grad_norm": 2.46051025390625, "learning_rate": 0.00018449786697659554, "loss": 1.1818, "step": 9340 }, { "epoch": 0.35976900866217515, "grad_norm": 1.6652323007583618, "learning_rate": 0.00018448169097202288, "loss": 1.2719, "step": 9345 }, { "epoch": 0.3599615014436959, "grad_norm": 1.375410556793213, "learning_rate": 0.00018446550724215922, "loss": 1.2687, "step": 9350 }, { "epoch": 0.36015399422521654, "grad_norm": 1.9113675355911255, "learning_rate": 0.00018444931578848447, "loss": 1.2475, "step": 9355 }, { "epoch": 0.36034648700673727, "grad_norm": 1.8949065208435059, "learning_rate": 0.0001844331166124792, "loss": 1.3439, "step": 9360 }, { "epoch": 0.36053897978825794, "grad_norm": 1.0940630435943604, "learning_rate": 0.00018441690971562476, "loss": 1.203, "step": 9365 }, { "epoch": 0.3607314725697786, "grad_norm": 1.2999101877212524, "learning_rate": 0.00018440069509940315, "loss": 1.2729, "step": 9370 }, { "epoch": 0.36092396535129934, "grad_norm": 1.3675721883773804, "learning_rate": 0.00018438447276529702, "loss": 1.2024, "step": 9375 }, { "epoch": 0.36111645813282, "grad_norm": 1.6651533842086792, "learning_rate": 0.00018436824271478988, "loss": 1.2235, "step": 9380 }, { "epoch": 0.36130895091434073, "grad_norm": 2.16670823097229, "learning_rate": 0.00018435200494936585, "loss": 1.4486, "step": 9385 }, { "epoch": 0.3615014436958614, "grad_norm": 1.3305730819702148, "learning_rate": 0.00018433575947050972, "loss": 1.2003, "step": 9390 }, { "epoch": 0.36169393647738207, "grad_norm": 1.5913615226745605, "learning_rate": 0.00018431950627970708, "loss": 1.2722, "step": 9395 }, { "epoch": 0.3618864292589028, "grad_norm": 0.9965779781341553, "learning_rate": 0.00018430324537844415, "loss": 1.0604, "step": 9400 }, { "epoch": 0.36207892204042347, "grad_norm": 1.7614198923110962, "learning_rate": 0.00018428697676820788, "loss": 1.2734, "step": 9405 }, { "epoch": 0.3622714148219442, "grad_norm": 1.190706491470337, "learning_rate": 0.00018427070045048594, "loss": 1.2309, "step": 9410 }, { "epoch": 0.36246390760346486, "grad_norm": 1.1487165689468384, "learning_rate": 0.00018425441642676667, "loss": 1.2049, "step": 9415 }, { "epoch": 0.3626564003849856, "grad_norm": 1.0437067747116089, "learning_rate": 0.00018423812469853918, "loss": 1.3632, "step": 9420 }, { "epoch": 0.36284889316650626, "grad_norm": 1.7774686813354492, "learning_rate": 0.00018422182526729318, "loss": 1.1797, "step": 9425 }, { "epoch": 0.36304138594802693, "grad_norm": 1.3748910427093506, "learning_rate": 0.0001842055181345192, "loss": 1.4438, "step": 9430 }, { "epoch": 0.36323387872954765, "grad_norm": 0.891248881816864, "learning_rate": 0.00018418920330170842, "loss": 1.3017, "step": 9435 }, { "epoch": 0.3634263715110683, "grad_norm": 1.5410393476486206, "learning_rate": 0.00018417288077035267, "loss": 1.2239, "step": 9440 }, { "epoch": 0.36361886429258905, "grad_norm": 1.3638213872909546, "learning_rate": 0.00018415655054194457, "loss": 1.2245, "step": 9445 }, { "epoch": 0.3638113570741097, "grad_norm": 1.84505033493042, "learning_rate": 0.00018414021261797743, "loss": 1.1362, 
"step": 9450 }, { "epoch": 0.3640038498556304, "grad_norm": 1.5999794006347656, "learning_rate": 0.00018412386699994518, "loss": 1.1647, "step": 9455 }, { "epoch": 0.3641963426371511, "grad_norm": 1.55308997631073, "learning_rate": 0.0001841075136893426, "loss": 1.2612, "step": 9460 }, { "epoch": 0.3643888354186718, "grad_norm": 1.3549528121948242, "learning_rate": 0.00018409115268766505, "loss": 1.2095, "step": 9465 }, { "epoch": 0.3645813282001925, "grad_norm": 1.123184323310852, "learning_rate": 0.00018407478399640862, "loss": 1.3047, "step": 9470 }, { "epoch": 0.3647738209817132, "grad_norm": 1.3776748180389404, "learning_rate": 0.00018405840761707016, "loss": 1.1064, "step": 9475 }, { "epoch": 0.36496631376323385, "grad_norm": 1.3778200149536133, "learning_rate": 0.00018404202355114718, "loss": 1.0956, "step": 9480 }, { "epoch": 0.3651588065447546, "grad_norm": 0.9069898128509521, "learning_rate": 0.00018402563180013783, "loss": 1.141, "step": 9485 }, { "epoch": 0.36535129932627525, "grad_norm": 1.3908804655075073, "learning_rate": 0.0001840092323655411, "loss": 1.2679, "step": 9490 }, { "epoch": 0.365543792107796, "grad_norm": 1.3785732984542847, "learning_rate": 0.00018399282524885654, "loss": 1.22, "step": 9495 }, { "epoch": 0.36573628488931664, "grad_norm": 1.1326193809509277, "learning_rate": 0.00018397641045158453, "loss": 1.2289, "step": 9500 }, { "epoch": 0.36592877767083737, "grad_norm": 1.2267814874649048, "learning_rate": 0.0001839599879752261, "loss": 1.1337, "step": 9505 }, { "epoch": 0.36612127045235804, "grad_norm": 0.8690314888954163, "learning_rate": 0.00018394355782128295, "loss": 1.2535, "step": 9510 }, { "epoch": 0.3663137632338787, "grad_norm": 1.448415994644165, "learning_rate": 0.00018392711999125748, "loss": 1.1405, "step": 9515 }, { "epoch": 0.36650625601539943, "grad_norm": 1.8989317417144775, "learning_rate": 0.00018391067448665288, "loss": 1.091, "step": 9520 }, { "epoch": 0.3666987487969201, "grad_norm": 1.2263299226760864, "learning_rate": 0.00018389422130897295, "loss": 1.1925, "step": 9525 }, { "epoch": 0.36689124157844083, "grad_norm": 0.8818153142929077, "learning_rate": 0.00018387776045972225, "loss": 1.2961, "step": 9530 }, { "epoch": 0.3670837343599615, "grad_norm": 1.0975017547607422, "learning_rate": 0.00018386129194040597, "loss": 1.414, "step": 9535 }, { "epoch": 0.36727622714148217, "grad_norm": 2.2097692489624023, "learning_rate": 0.00018384481575253004, "loss": 1.1941, "step": 9540 }, { "epoch": 0.3674687199230029, "grad_norm": 1.2249376773834229, "learning_rate": 0.0001838283318976012, "loss": 1.4472, "step": 9545 }, { "epoch": 0.36766121270452357, "grad_norm": 1.0000889301300049, "learning_rate": 0.0001838118403771267, "loss": 1.2399, "step": 9550 }, { "epoch": 0.3678537054860443, "grad_norm": 1.0249544382095337, "learning_rate": 0.00018379534119261458, "loss": 1.3182, "step": 9555 }, { "epoch": 0.36804619826756496, "grad_norm": 1.2347283363342285, "learning_rate": 0.00018377883434557362, "loss": 1.1313, "step": 9560 }, { "epoch": 0.3682386910490857, "grad_norm": 1.1021714210510254, "learning_rate": 0.0001837623198375132, "loss": 1.2381, "step": 9565 }, { "epoch": 0.36843118383060636, "grad_norm": 1.0923985242843628, "learning_rate": 0.00018374579766994355, "loss": 1.3386, "step": 9570 }, { "epoch": 0.368623676612127, "grad_norm": 1.7709978818893433, "learning_rate": 0.00018372926784437547, "loss": 1.2405, "step": 9575 }, { "epoch": 0.36881616939364775, "grad_norm": 1.316901683807373, "learning_rate": 0.00018371273036232047, "loss": 
1.1244, "step": 9580 }, { "epoch": 0.3690086621751684, "grad_norm": 1.7281345129013062, "learning_rate": 0.00018369618522529085, "loss": 1.2979, "step": 9585 }, { "epoch": 0.36920115495668915, "grad_norm": 1.6363762617111206, "learning_rate": 0.00018367963243479953, "loss": 1.1528, "step": 9590 }, { "epoch": 0.3693936477382098, "grad_norm": 1.7078179121017456, "learning_rate": 0.00018366307199236013, "loss": 1.2833, "step": 9595 }, { "epoch": 0.3695861405197305, "grad_norm": 1.9110232591629028, "learning_rate": 0.000183646503899487, "loss": 1.4191, "step": 9600 }, { "epoch": 0.3697786333012512, "grad_norm": 0.952301025390625, "learning_rate": 0.00018362992815769525, "loss": 1.1504, "step": 9605 }, { "epoch": 0.3699711260827719, "grad_norm": 0.9142165780067444, "learning_rate": 0.0001836133447685005, "loss": 1.2617, "step": 9610 }, { "epoch": 0.3701636188642926, "grad_norm": 1.5571134090423584, "learning_rate": 0.0001835967537334193, "loss": 1.3054, "step": 9615 }, { "epoch": 0.3703561116458133, "grad_norm": 1.799795389175415, "learning_rate": 0.00018358015505396877, "loss": 1.0603, "step": 9620 }, { "epoch": 0.37054860442733395, "grad_norm": 1.6660315990447998, "learning_rate": 0.0001835635487316667, "loss": 1.1757, "step": 9625 }, { "epoch": 0.3707410972088547, "grad_norm": 0.9840423464775085, "learning_rate": 0.00018354693476803168, "loss": 0.9815, "step": 9630 }, { "epoch": 0.37093358999037535, "grad_norm": 2.0538954734802246, "learning_rate": 0.00018353031316458286, "loss": 1.2396, "step": 9635 }, { "epoch": 0.37112608277189607, "grad_norm": 1.2079198360443115, "learning_rate": 0.0001835136839228403, "loss": 1.2731, "step": 9640 }, { "epoch": 0.37131857555341674, "grad_norm": 1.7076921463012695, "learning_rate": 0.00018349704704432457, "loss": 1.1388, "step": 9645 }, { "epoch": 0.37151106833493747, "grad_norm": 1.0324435234069824, "learning_rate": 0.00018348040253055698, "loss": 0.9949, "step": 9650 }, { "epoch": 0.37170356111645814, "grad_norm": 1.3635584115982056, "learning_rate": 0.0001834637503830596, "loss": 1.307, "step": 9655 }, { "epoch": 0.3718960538979788, "grad_norm": 1.6683429479599, "learning_rate": 0.00018344709060335513, "loss": 1.1687, "step": 9660 }, { "epoch": 0.37208854667949953, "grad_norm": 2.3687121868133545, "learning_rate": 0.00018343042319296702, "loss": 1.4163, "step": 9665 }, { "epoch": 0.3722810394610202, "grad_norm": 1.9078242778778076, "learning_rate": 0.00018341374815341937, "loss": 1.2986, "step": 9670 }, { "epoch": 0.37247353224254093, "grad_norm": 1.6381220817565918, "learning_rate": 0.00018339706548623706, "loss": 1.5092, "step": 9675 }, { "epoch": 0.3726660250240616, "grad_norm": 1.3529161214828491, "learning_rate": 0.00018338037519294553, "loss": 1.2296, "step": 9680 }, { "epoch": 0.37285851780558227, "grad_norm": 1.1034053564071655, "learning_rate": 0.00018336367727507104, "loss": 1.2774, "step": 9685 }, { "epoch": 0.373051010587103, "grad_norm": 2.0935397148132324, "learning_rate": 0.0001833469717341405, "loss": 1.2247, "step": 9690 }, { "epoch": 0.37324350336862366, "grad_norm": 1.6294866800308228, "learning_rate": 0.0001833302585716815, "loss": 1.3766, "step": 9695 }, { "epoch": 0.3734359961501444, "grad_norm": 1.6927978992462158, "learning_rate": 0.0001833135377892224, "loss": 1.3069, "step": 9700 }, { "epoch": 0.37362848893166506, "grad_norm": 0.8497247695922852, "learning_rate": 0.00018329680938829212, "loss": 1.0906, "step": 9705 }, { "epoch": 0.37382098171318573, "grad_norm": 1.9347554445266724, "learning_rate": 0.00018328007337042046, 
"loss": 1.277, "step": 9710 }, { "epoch": 0.37401347449470645, "grad_norm": 1.023130178451538, "learning_rate": 0.00018326332973713776, "loss": 1.254, "step": 9715 }, { "epoch": 0.3742059672762271, "grad_norm": 1.7206385135650635, "learning_rate": 0.0001832465784899751, "loss": 1.2141, "step": 9720 }, { "epoch": 0.37439846005774785, "grad_norm": 1.2445294857025146, "learning_rate": 0.00018322981963046433, "loss": 1.3817, "step": 9725 }, { "epoch": 0.3745909528392685, "grad_norm": 1.832334280014038, "learning_rate": 0.00018321305316013788, "loss": 1.3584, "step": 9730 }, { "epoch": 0.37478344562078925, "grad_norm": 1.2087010145187378, "learning_rate": 0.00018319627908052898, "loss": 1.116, "step": 9735 }, { "epoch": 0.3749759384023099, "grad_norm": 1.286687970161438, "learning_rate": 0.00018317949739317147, "loss": 1.1913, "step": 9740 }, { "epoch": 0.3751684311838306, "grad_norm": 1.44833242893219, "learning_rate": 0.00018316270809959993, "loss": 1.2713, "step": 9745 }, { "epoch": 0.3753609239653513, "grad_norm": 1.1395667791366577, "learning_rate": 0.00018314591120134963, "loss": 1.2912, "step": 9750 }, { "epoch": 0.375553416746872, "grad_norm": 1.1399837732315063, "learning_rate": 0.00018312910669995654, "loss": 1.2804, "step": 9755 }, { "epoch": 0.3757459095283927, "grad_norm": 1.814249038696289, "learning_rate": 0.00018311229459695735, "loss": 1.1062, "step": 9760 }, { "epoch": 0.3759384023099134, "grad_norm": 1.4851144552230835, "learning_rate": 0.00018309547489388933, "loss": 1.2826, "step": 9765 }, { "epoch": 0.37613089509143405, "grad_norm": 0.9308827519416809, "learning_rate": 0.00018307864759229065, "loss": 1.3706, "step": 9770 }, { "epoch": 0.3763233878729548, "grad_norm": 3.707566261291504, "learning_rate": 0.00018306181269369998, "loss": 1.2292, "step": 9775 }, { "epoch": 0.37651588065447544, "grad_norm": 2.6666324138641357, "learning_rate": 0.00018304497019965677, "loss": 1.4645, "step": 9780 }, { "epoch": 0.37670837343599617, "grad_norm": 1.5997512340545654, "learning_rate": 0.00018302812011170114, "loss": 1.2812, "step": 9785 }, { "epoch": 0.37690086621751684, "grad_norm": 0.8998873233795166, "learning_rate": 0.00018301126243137395, "loss": 1.195, "step": 9790 }, { "epoch": 0.3770933589990375, "grad_norm": 1.407524585723877, "learning_rate": 0.0001829943971602167, "loss": 1.1793, "step": 9795 }, { "epoch": 0.37728585178055823, "grad_norm": 1.1469497680664062, "learning_rate": 0.00018297752429977164, "loss": 1.3624, "step": 9800 }, { "epoch": 0.3774783445620789, "grad_norm": 1.4583423137664795, "learning_rate": 0.00018296064385158164, "loss": 1.2033, "step": 9805 }, { "epoch": 0.37767083734359963, "grad_norm": 1.0782575607299805, "learning_rate": 0.00018294375581719036, "loss": 1.1823, "step": 9810 }, { "epoch": 0.3778633301251203, "grad_norm": 1.1890922784805298, "learning_rate": 0.00018292686019814202, "loss": 1.2711, "step": 9815 }, { "epoch": 0.378055822906641, "grad_norm": 0.854491651058197, "learning_rate": 0.00018290995699598165, "loss": 1.1953, "step": 9820 }, { "epoch": 0.3782483156881617, "grad_norm": 1.2184374332427979, "learning_rate": 0.00018289304621225497, "loss": 1.2052, "step": 9825 }, { "epoch": 0.37844080846968237, "grad_norm": 1.1952948570251465, "learning_rate": 0.0001828761278485083, "loss": 1.2516, "step": 9830 }, { "epoch": 0.3786333012512031, "grad_norm": 2.1117265224456787, "learning_rate": 0.00018285920190628879, "loss": 1.2834, "step": 9835 }, { "epoch": 0.37882579403272376, "grad_norm": 1.1815403699874878, "learning_rate": 0.00018284226838714412, 
"loss": 1.0574, "step": 9840 }, { "epoch": 0.3790182868142445, "grad_norm": 1.3763145208358765, "learning_rate": 0.00018282532729262278, "loss": 1.2813, "step": 9845 }, { "epoch": 0.37921077959576516, "grad_norm": 1.5308822393417358, "learning_rate": 0.00018280837862427393, "loss": 1.2118, "step": 9850 }, { "epoch": 0.3794032723772858, "grad_norm": 1.1991111040115356, "learning_rate": 0.00018279142238364745, "loss": 1.0999, "step": 9855 }, { "epoch": 0.37959576515880655, "grad_norm": 1.7062435150146484, "learning_rate": 0.0001827744585722938, "loss": 1.2103, "step": 9860 }, { "epoch": 0.3797882579403272, "grad_norm": 1.5572453737258911, "learning_rate": 0.00018275748719176425, "loss": 1.112, "step": 9865 }, { "epoch": 0.37998075072184795, "grad_norm": 0.9328321218490601, "learning_rate": 0.00018274050824361072, "loss": 1.2688, "step": 9870 }, { "epoch": 0.3801732435033686, "grad_norm": 1.290634036064148, "learning_rate": 0.0001827235217293858, "loss": 1.1486, "step": 9875 }, { "epoch": 0.38036573628488934, "grad_norm": 1.7471963167190552, "learning_rate": 0.00018270652765064283, "loss": 1.2584, "step": 9880 }, { "epoch": 0.38055822906641, "grad_norm": 1.4827409982681274, "learning_rate": 0.00018268952600893577, "loss": 1.3655, "step": 9885 }, { "epoch": 0.3807507218479307, "grad_norm": 1.0229063034057617, "learning_rate": 0.00018267251680581935, "loss": 1.1955, "step": 9890 }, { "epoch": 0.3809432146294514, "grad_norm": 1.3075898885726929, "learning_rate": 0.0001826555000428489, "loss": 0.9779, "step": 9895 }, { "epoch": 0.3811357074109721, "grad_norm": 1.5942119359970093, "learning_rate": 0.00018263847572158053, "loss": 1.2556, "step": 9900 }, { "epoch": 0.3813282001924928, "grad_norm": 0.9223330616950989, "learning_rate": 0.00018262144384357097, "loss": 1.1109, "step": 9905 }, { "epoch": 0.3815206929740135, "grad_norm": 1.7757457494735718, "learning_rate": 0.00018260440441037766, "loss": 1.2219, "step": 9910 }, { "epoch": 0.38171318575553415, "grad_norm": 1.4870551824569702, "learning_rate": 0.00018258735742355883, "loss": 1.3312, "step": 9915 }, { "epoch": 0.38190567853705487, "grad_norm": 1.2982031106948853, "learning_rate": 0.00018257030288467322, "loss": 1.2421, "step": 9920 }, { "epoch": 0.38209817131857554, "grad_norm": 1.016822338104248, "learning_rate": 0.0001825532407952804, "loss": 1.3542, "step": 9925 }, { "epoch": 0.38229066410009627, "grad_norm": 1.0763219594955444, "learning_rate": 0.00018253617115694058, "loss": 1.2579, "step": 9930 }, { "epoch": 0.38248315688161694, "grad_norm": 1.7673341035842896, "learning_rate": 0.00018251909397121464, "loss": 1.1875, "step": 9935 }, { "epoch": 0.3826756496631376, "grad_norm": 1.3719041347503662, "learning_rate": 0.00018250200923966423, "loss": 1.1493, "step": 9940 }, { "epoch": 0.38286814244465833, "grad_norm": 1.8589760065078735, "learning_rate": 0.00018248491696385157, "loss": 1.2751, "step": 9945 }, { "epoch": 0.383060635226179, "grad_norm": 1.6069539785385132, "learning_rate": 0.0001824678171453397, "loss": 1.415, "step": 9950 }, { "epoch": 0.38325312800769973, "grad_norm": 1.7131226062774658, "learning_rate": 0.0001824507097856922, "loss": 1.1773, "step": 9955 }, { "epoch": 0.3834456207892204, "grad_norm": 0.7622759342193604, "learning_rate": 0.0001824335948864735, "loss": 1.1588, "step": 9960 }, { "epoch": 0.3836381135707411, "grad_norm": 1.6202800273895264, "learning_rate": 0.0001824164724492486, "loss": 1.3064, "step": 9965 }, { "epoch": 0.3838306063522618, "grad_norm": 1.5452194213867188, "learning_rate": 
0.0001823993424755833, "loss": 1.2993, "step": 9970 }, { "epoch": 0.38402309913378246, "grad_norm": 1.013929009437561, "learning_rate": 0.00018238220496704396, "loss": 1.3123, "step": 9975 }, { "epoch": 0.3842155919153032, "grad_norm": 0.9624648094177246, "learning_rate": 0.0001823650599251977, "loss": 1.0517, "step": 9980 }, { "epoch": 0.38440808469682386, "grad_norm": 1.2065962553024292, "learning_rate": 0.00018234790735161232, "loss": 1.1954, "step": 9985 }, { "epoch": 0.3846005774783446, "grad_norm": 1.425376057624817, "learning_rate": 0.00018233074724785634, "loss": 1.069, "step": 9990 }, { "epoch": 0.38479307025986526, "grad_norm": 1.0355112552642822, "learning_rate": 0.00018231357961549888, "loss": 1.0839, "step": 9995 }, { "epoch": 0.3849855630413859, "grad_norm": 1.7273633480072021, "learning_rate": 0.00018229640445610988, "loss": 1.1324, "step": 10000 } ], "logging_steps": 5, "max_steps": 51950, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 10000, "total_flos": 3.133033729973453e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }