diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,14021 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.3849855630413859, + "eval_steps": 500, + "global_step": 10000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00019249278152069297, + "grad_norm": 1.9932657480239868, + "learning_rate": 0.00016, + "loss": 2.962, + "step": 5 + }, + { + "epoch": 0.00038498556304138594, + "grad_norm": 1.2827370166778564, + "learning_rate": 0.000199999997073812, + "loss": 2.2983, + "step": 10 + }, + { + "epoch": 0.0005774783445620789, + "grad_norm": 1.3585917949676514, + "learning_rate": 0.0001999999851861734, + "loss": 1.9389, + "step": 15 + }, + { + "epoch": 0.0007699711260827719, + "grad_norm": 2.117544412612915, + "learning_rate": 0.00019999996415419864, + "loss": 1.6659, + "step": 20 + }, + { + "epoch": 0.0009624639076034649, + "grad_norm": 0.8802940249443054, + "learning_rate": 0.0001999999339778896, + "loss": 1.6015, + "step": 25 + }, + { + "epoch": 0.0011549566891241579, + "grad_norm": 1.256873369216919, + "learning_rate": 0.000199999894657249, + "loss": 1.7428, + "step": 30 + }, + { + "epoch": 0.001347449470644851, + "grad_norm": 1.9709804058074951, + "learning_rate": 0.0001999998461922805, + "loss": 1.4316, + "step": 35 + }, + { + "epoch": 0.0015399422521655437, + "grad_norm": 1.2085392475128174, + "learning_rate": 0.00019999978858298848, + "loss": 1.8392, + "step": 40 + }, + { + "epoch": 0.0017324350336862368, + "grad_norm": 0.9966161847114563, + "learning_rate": 0.00019999972182937827, + "loss": 1.6381, + "step": 45 + }, + { + "epoch": 0.0019249278152069298, + "grad_norm": 1.5572378635406494, + "learning_rate": 0.0001999996459314559, + "loss": 1.6214, + "step": 50 + }, + { + "epoch": 0.0021174205967276227, + "grad_norm": 0.9813450574874878, + "learning_rate": 0.00019999956088922837, + "loss": 1.5337, + "step": 55 + }, + { + "epoch": 0.0023099133782483157, + "grad_norm": 1.140754222869873, + "learning_rate": 0.00019999946670270341, + "loss": 1.5865, + "step": 60 + }, + { + "epoch": 0.0025024061597690088, + "grad_norm": 1.7033613920211792, + "learning_rate": 0.0001999993633718897, + "loss": 1.5483, + "step": 65 + }, + { + "epoch": 0.002694898941289702, + "grad_norm": 0.8782416582107544, + "learning_rate": 0.00019999925089679658, + "loss": 1.7574, + "step": 70 + }, + { + "epoch": 0.0028873917228103944, + "grad_norm": 0.94110506772995, + "learning_rate": 0.00019999912927743445, + "loss": 1.747, + "step": 75 + }, + { + "epoch": 0.0030798845043310875, + "grad_norm": 2.9130144119262695, + "learning_rate": 0.00019999899851381436, + "loss": 1.5482, + "step": 80 + }, + { + "epoch": 0.0032723772858517805, + "grad_norm": 1.444981336593628, + "learning_rate": 0.00019999885860594828, + "loss": 1.7935, + "step": 85 + }, + { + "epoch": 0.0034648700673724736, + "grad_norm": 0.8361923098564148, + "learning_rate": 0.00019999870955384906, + "loss": 1.5566, + "step": 90 + }, + { + "epoch": 0.0036573628488931666, + "grad_norm": 1.0198391675949097, + "learning_rate": 0.00019999855135753025, + "loss": 1.6608, + "step": 95 + }, + { + "epoch": 0.0038498556304138597, + "grad_norm": 0.9720978736877441, + "learning_rate": 0.00019999838401700632, + "loss": 1.4217, + "step": 100 + }, + { + "epoch": 0.004042348411934553, + "grad_norm": 0.7735599279403687, + "learning_rate": 0.00019999820753229263, + "loss": 1.4195, + "step": 105 + }, + { + "epoch": 0.004234841193455245, + "grad_norm": 1.1776920557022095, + "learning_rate": 0.0001999980219034053, + "loss": 1.7147, + "step": 110 + }, + { + "epoch": 0.004427333974975939, + "grad_norm": 1.156069278717041, + "learning_rate": 0.0001999978271303613, + "loss": 1.7, + "step": 115 + }, + { + "epoch": 0.0046198267564966315, + "grad_norm": 1.2335503101348877, + "learning_rate": 0.0001999976232131784, + "loss": 1.3309, + "step": 120 + }, + { + "epoch": 0.004812319538017324, + "grad_norm": 1.0332967042922974, + "learning_rate": 0.0001999974101518753, + "loss": 1.7515, + "step": 125 + }, + { + "epoch": 0.0050048123195380175, + "grad_norm": 1.561087727546692, + "learning_rate": 0.00019999718794647145, + "loss": 1.5517, + "step": 130 + }, + { + "epoch": 0.00519730510105871, + "grad_norm": 1.3611408472061157, + "learning_rate": 0.00019999695659698717, + "loss": 1.5771, + "step": 135 + }, + { + "epoch": 0.005389797882579404, + "grad_norm": 1.5531154870986938, + "learning_rate": 0.0001999967161034437, + "loss": 1.4217, + "step": 140 + }, + { + "epoch": 0.005582290664100096, + "grad_norm": 1.5827676057815552, + "learning_rate": 0.00019999646646586287, + "loss": 1.611, + "step": 145 + }, + { + "epoch": 0.005774783445620789, + "grad_norm": 1.1693483591079712, + "learning_rate": 0.00019999620768426763, + "loss": 1.3961, + "step": 150 + }, + { + "epoch": 0.005967276227141482, + "grad_norm": 1.4277936220169067, + "learning_rate": 0.00019999593975868164, + "loss": 1.638, + "step": 155 + }, + { + "epoch": 0.006159769008662175, + "grad_norm": 1.2951083183288574, + "learning_rate": 0.00019999566268912933, + "loss": 1.6187, + "step": 160 + }, + { + "epoch": 0.0063522617901828685, + "grad_norm": 2.4885995388031006, + "learning_rate": 0.0001999953764756361, + "loss": 1.5669, + "step": 165 + }, + { + "epoch": 0.006544754571703561, + "grad_norm": 1.3352105617523193, + "learning_rate": 0.00019999508111822811, + "loss": 1.3157, + "step": 170 + }, + { + "epoch": 0.006737247353224254, + "grad_norm": 1.2560889720916748, + "learning_rate": 0.00019999477661693233, + "loss": 1.7011, + "step": 175 + }, + { + "epoch": 0.006929740134744947, + "grad_norm": 2.4167582988739014, + "learning_rate": 0.00019999446297177666, + "loss": 1.4827, + "step": 180 + }, + { + "epoch": 0.00712223291626564, + "grad_norm": 1.0598788261413574, + "learning_rate": 0.00019999414018278974, + "loss": 1.5718, + "step": 185 + }, + { + "epoch": 0.007314725697786333, + "grad_norm": 1.5576567649841309, + "learning_rate": 0.00019999380825000111, + "loss": 1.7717, + "step": 190 + }, + { + "epoch": 0.007507218479307026, + "grad_norm": 1.005711317062378, + "learning_rate": 0.0001999934671734411, + "loss": 1.5085, + "step": 195 + }, + { + "epoch": 0.007699711260827719, + "grad_norm": 1.7211413383483887, + "learning_rate": 0.00019999311695314095, + "loss": 1.623, + "step": 200 + }, + { + "epoch": 0.007892204042348411, + "grad_norm": 1.5765767097473145, + "learning_rate": 0.00019999275758913261, + "loss": 1.5982, + "step": 205 + }, + { + "epoch": 0.008084696823869105, + "grad_norm": 1.0989298820495605, + "learning_rate": 0.00019999238908144896, + "loss": 1.3306, + "step": 210 + }, + { + "epoch": 0.008277189605389798, + "grad_norm": 1.0234464406967163, + "learning_rate": 0.0001999920114301238, + "loss": 1.5856, + "step": 215 + }, + { + "epoch": 0.00846968238691049, + "grad_norm": 1.6681355237960815, + "learning_rate": 0.0001999916246351915, + "loss": 1.4777, + "step": 220 + }, + { + "epoch": 0.008662175168431183, + "grad_norm": 0.9723508358001709, + "learning_rate": 0.00019999122869668754, + "loss": 1.5357, + "step": 225 + }, + { + "epoch": 0.008854667949951878, + "grad_norm": 0.8840959072113037, + "learning_rate": 0.0001999908236146481, + "loss": 1.5296, + "step": 230 + }, + { + "epoch": 0.00904716073147257, + "grad_norm": 0.9913238883018494, + "learning_rate": 0.0001999904093891102, + "loss": 1.5846, + "step": 235 + }, + { + "epoch": 0.009239653512993263, + "grad_norm": 1.129952073097229, + "learning_rate": 0.00019998998602011178, + "loss": 1.4455, + "step": 240 + }, + { + "epoch": 0.009432146294513956, + "grad_norm": 1.0377521514892578, + "learning_rate": 0.00019998955350769148, + "loss": 1.4212, + "step": 245 + }, + { + "epoch": 0.009624639076034648, + "grad_norm": 2.2103137969970703, + "learning_rate": 0.00019998911185188886, + "loss": 1.5812, + "step": 250 + }, + { + "epoch": 0.009817131857555342, + "grad_norm": 0.8716953992843628, + "learning_rate": 0.00019998866105274437, + "loss": 1.5326, + "step": 255 + }, + { + "epoch": 0.010009624639076035, + "grad_norm": 1.1956042051315308, + "learning_rate": 0.00019998820111029916, + "loss": 1.7183, + "step": 260 + }, + { + "epoch": 0.010202117420596728, + "grad_norm": 2.747600555419922, + "learning_rate": 0.00019998773202459534, + "loss": 1.7952, + "step": 265 + }, + { + "epoch": 0.01039461020211742, + "grad_norm": 1.3412338495254517, + "learning_rate": 0.00019998725379567577, + "loss": 1.3538, + "step": 270 + }, + { + "epoch": 0.010587102983638113, + "grad_norm": 1.651822805404663, + "learning_rate": 0.00019998676642358422, + "loss": 1.5458, + "step": 275 + }, + { + "epoch": 0.010779595765158807, + "grad_norm": 1.3036198616027832, + "learning_rate": 0.00019998626990836522, + "loss": 1.7305, + "step": 280 + }, + { + "epoch": 0.0109720885466795, + "grad_norm": 0.8263657093048096, + "learning_rate": 0.00019998576425006416, + "loss": 1.3767, + "step": 285 + }, + { + "epoch": 0.011164581328200193, + "grad_norm": 2.022136926651001, + "learning_rate": 0.00019998524944872737, + "loss": 1.5823, + "step": 290 + }, + { + "epoch": 0.011357074109720885, + "grad_norm": 1.1224019527435303, + "learning_rate": 0.00019998472550440178, + "loss": 1.5723, + "step": 295 + }, + { + "epoch": 0.011549566891241578, + "grad_norm": 1.375664234161377, + "learning_rate": 0.00019998419241713542, + "loss": 1.5224, + "step": 300 + }, + { + "epoch": 0.011742059672762272, + "grad_norm": 1.2721813917160034, + "learning_rate": 0.000199983650186977, + "loss": 1.7217, + "step": 305 + }, + { + "epoch": 0.011934552454282965, + "grad_norm": 1.4723321199417114, + "learning_rate": 0.0001999830988139761, + "loss": 1.4666, + "step": 310 + }, + { + "epoch": 0.012127045235803657, + "grad_norm": 0.695198118686676, + "learning_rate": 0.00019998253829818315, + "loss": 1.2672, + "step": 315 + }, + { + "epoch": 0.01231953801732435, + "grad_norm": 1.716638207435608, + "learning_rate": 0.00019998196863964937, + "loss": 1.3461, + "step": 320 + }, + { + "epoch": 0.012512030798845043, + "grad_norm": 1.1060154438018799, + "learning_rate": 0.0001999813898384269, + "loss": 1.3816, + "step": 325 + }, + { + "epoch": 0.012704523580365737, + "grad_norm": 1.6124354600906372, + "learning_rate": 0.00019998080189456862, + "loss": 1.5232, + "step": 330 + }, + { + "epoch": 0.01289701636188643, + "grad_norm": 1.5060306787490845, + "learning_rate": 0.00019998020480812832, + "loss": 1.5767, + "step": 335 + }, + { + "epoch": 0.013089509143407122, + "grad_norm": 1.1920175552368164, + "learning_rate": 0.00019997959857916063, + "loss": 1.6112, + "step": 340 + }, + { + "epoch": 0.013282001924927815, + "grad_norm": 1.1669896841049194, + "learning_rate": 0.00019997898320772096, + "loss": 1.3679, + "step": 345 + }, + { + "epoch": 0.013474494706448507, + "grad_norm": 1.1692086458206177, + "learning_rate": 0.00019997835869386553, + "loss": 1.4147, + "step": 350 + }, + { + "epoch": 0.013666987487969202, + "grad_norm": 2.0466034412384033, + "learning_rate": 0.00019997772503765153, + "loss": 1.5261, + "step": 355 + }, + { + "epoch": 0.013859480269489894, + "grad_norm": 1.1581529378890991, + "learning_rate": 0.00019997708223913686, + "loss": 1.5441, + "step": 360 + }, + { + "epoch": 0.014051973051010587, + "grad_norm": 1.4370143413543701, + "learning_rate": 0.0001999764302983803, + "loss": 1.651, + "step": 365 + }, + { + "epoch": 0.01424446583253128, + "grad_norm": 0.998635470867157, + "learning_rate": 0.00019997576921544147, + "loss": 1.4311, + "step": 370 + }, + { + "epoch": 0.014436958614051972, + "grad_norm": 1.2625153064727783, + "learning_rate": 0.00019997509899038086, + "loss": 1.4634, + "step": 375 + }, + { + "epoch": 0.014629451395572667, + "grad_norm": 1.171949863433838, + "learning_rate": 0.00019997441962325968, + "loss": 1.2474, + "step": 380 + }, + { + "epoch": 0.01482194417709336, + "grad_norm": 1.4312052726745605, + "learning_rate": 0.00019997373111414009, + "loss": 1.4814, + "step": 385 + }, + { + "epoch": 0.015014436958614052, + "grad_norm": 1.1508846282958984, + "learning_rate": 0.00019997303346308508, + "loss": 1.6291, + "step": 390 + }, + { + "epoch": 0.015206929740134744, + "grad_norm": 1.2096014022827148, + "learning_rate": 0.0001999723266701584, + "loss": 1.5507, + "step": 395 + }, + { + "epoch": 0.015399422521655439, + "grad_norm": 0.996391773223877, + "learning_rate": 0.00019997161073542473, + "loss": 1.6402, + "step": 400 + }, + { + "epoch": 0.015591915303176131, + "grad_norm": 1.6977828741073608, + "learning_rate": 0.00019997088565894947, + "loss": 1.5706, + "step": 405 + }, + { + "epoch": 0.015784408084696822, + "grad_norm": 1.4707343578338623, + "learning_rate": 0.000199970151440799, + "loss": 1.6348, + "step": 410 + }, + { + "epoch": 0.015976900866217517, + "grad_norm": 1.5461647510528564, + "learning_rate": 0.0001999694080810404, + "loss": 1.4836, + "step": 415 + }, + { + "epoch": 0.01616939364773821, + "grad_norm": 1.6253695487976074, + "learning_rate": 0.00019996865557974166, + "loss": 1.5834, + "step": 420 + }, + { + "epoch": 0.016361886429258902, + "grad_norm": 1.671321988105774, + "learning_rate": 0.00019996789393697165, + "loss": 1.3816, + "step": 425 + }, + { + "epoch": 0.016554379210779596, + "grad_norm": 0.9412807822227478, + "learning_rate": 0.00019996712315279992, + "loss": 1.443, + "step": 430 + }, + { + "epoch": 0.016746871992300287, + "grad_norm": 0.8705793023109436, + "learning_rate": 0.000199966343227297, + "loss": 1.4938, + "step": 435 + }, + { + "epoch": 0.01693936477382098, + "grad_norm": 1.6019854545593262, + "learning_rate": 0.00019996555416053422, + "loss": 1.3622, + "step": 440 + }, + { + "epoch": 0.017131857555341676, + "grad_norm": 1.0340136289596558, + "learning_rate": 0.00019996475595258372, + "loss": 1.5803, + "step": 445 + }, + { + "epoch": 0.017324350336862367, + "grad_norm": 1.4469108581542969, + "learning_rate": 0.0001999639486035185, + "loss": 1.6074, + "step": 450 + }, + { + "epoch": 0.01751684311838306, + "grad_norm": 1.3311457633972168, + "learning_rate": 0.00019996313211341238, + "loss": 1.5337, + "step": 455 + }, + { + "epoch": 0.017709335899903755, + "grad_norm": 0.9691542387008667, + "learning_rate": 0.00019996230648234003, + "loss": 1.3835, + "step": 460 + }, + { + "epoch": 0.017901828681424446, + "grad_norm": 1.0229564905166626, + "learning_rate": 0.00019996147171037691, + "loss": 1.4925, + "step": 465 + }, + { + "epoch": 0.01809432146294514, + "grad_norm": 1.0120052099227905, + "learning_rate": 0.00019996062779759942, + "loss": 1.4781, + "step": 470 + }, + { + "epoch": 0.01828681424446583, + "grad_norm": 0.8471246361732483, + "learning_rate": 0.00019995977474408468, + "loss": 1.4961, + "step": 475 + }, + { + "epoch": 0.018479307025986526, + "grad_norm": 2.020277261734009, + "learning_rate": 0.00019995891254991072, + "loss": 1.5299, + "step": 480 + }, + { + "epoch": 0.01867179980750722, + "grad_norm": 1.2169212102890015, + "learning_rate": 0.00019995804121515637, + "loss": 1.4626, + "step": 485 + }, + { + "epoch": 0.01886429258902791, + "grad_norm": 2.31048321723938, + "learning_rate": 0.00019995716073990133, + "loss": 1.3653, + "step": 490 + }, + { + "epoch": 0.019056785370548605, + "grad_norm": 1.8170429468154907, + "learning_rate": 0.0001999562711242261, + "loss": 1.3537, + "step": 495 + }, + { + "epoch": 0.019249278152069296, + "grad_norm": 1.1187188625335693, + "learning_rate": 0.00019995537236821198, + "loss": 1.6358, + "step": 500 + }, + { + "epoch": 0.01944177093358999, + "grad_norm": 1.2112963199615479, + "learning_rate": 0.0001999544644719412, + "loss": 1.4565, + "step": 505 + }, + { + "epoch": 0.019634263715110685, + "grad_norm": 1.3345009088516235, + "learning_rate": 0.0001999535474354968, + "loss": 1.647, + "step": 510 + }, + { + "epoch": 0.019826756496631376, + "grad_norm": 1.3109021186828613, + "learning_rate": 0.00019995262125896266, + "loss": 1.5462, + "step": 515 + }, + { + "epoch": 0.02001924927815207, + "grad_norm": 1.1681957244873047, + "learning_rate": 0.00019995168594242338, + "loss": 1.5292, + "step": 520 + }, + { + "epoch": 0.02021174205967276, + "grad_norm": 0.9509350657463074, + "learning_rate": 0.00019995074148596457, + "loss": 1.5566, + "step": 525 + }, + { + "epoch": 0.020404234841193455, + "grad_norm": 0.6594029664993286, + "learning_rate": 0.00019994978788967255, + "loss": 1.3693, + "step": 530 + }, + { + "epoch": 0.02059672762271415, + "grad_norm": 0.8029458522796631, + "learning_rate": 0.00019994882515363452, + "loss": 1.4664, + "step": 535 + }, + { + "epoch": 0.02078922040423484, + "grad_norm": 1.1551908254623413, + "learning_rate": 0.00019994785327793856, + "loss": 1.5342, + "step": 540 + }, + { + "epoch": 0.020981713185755535, + "grad_norm": 1.3600980043411255, + "learning_rate": 0.0001999468722626735, + "loss": 1.5262, + "step": 545 + }, + { + "epoch": 0.021174205967276226, + "grad_norm": 1.0333319902420044, + "learning_rate": 0.00019994588210792906, + "loss": 1.5079, + "step": 550 + }, + { + "epoch": 0.02136669874879692, + "grad_norm": 1.2757694721221924, + "learning_rate": 0.00019994488281379578, + "loss": 1.7721, + "step": 555 + }, + { + "epoch": 0.021559191530317615, + "grad_norm": 1.1292661428451538, + "learning_rate": 0.00019994387438036505, + "loss": 1.5077, + "step": 560 + }, + { + "epoch": 0.021751684311838305, + "grad_norm": 1.105522871017456, + "learning_rate": 0.00019994285680772906, + "loss": 1.6468, + "step": 565 + }, + { + "epoch": 0.021944177093359, + "grad_norm": 1.6378583908081055, + "learning_rate": 0.00019994183009598086, + "loss": 1.5432, + "step": 570 + }, + { + "epoch": 0.02213666987487969, + "grad_norm": 0.931384801864624, + "learning_rate": 0.0001999407942452144, + "loss": 1.3818, + "step": 575 + }, + { + "epoch": 0.022329162656400385, + "grad_norm": 1.0986119508743286, + "learning_rate": 0.0001999397492555243, + "loss": 1.552, + "step": 580 + }, + { + "epoch": 0.02252165543792108, + "grad_norm": 1.121957540512085, + "learning_rate": 0.00019993869512700623, + "loss": 1.5241, + "step": 585 + }, + { + "epoch": 0.02271414821944177, + "grad_norm": 1.2508270740509033, + "learning_rate": 0.00019993763185975646, + "loss": 1.6431, + "step": 590 + }, + { + "epoch": 0.022906641000962465, + "grad_norm": 1.293603777885437, + "learning_rate": 0.00019993655945387234, + "loss": 1.3788, + "step": 595 + }, + { + "epoch": 0.023099133782483156, + "grad_norm": 1.3218696117401123, + "learning_rate": 0.00019993547790945183, + "loss": 1.398, + "step": 600 + }, + { + "epoch": 0.02329162656400385, + "grad_norm": 0.8816308975219727, + "learning_rate": 0.0001999343872265939, + "loss": 1.4239, + "step": 605 + }, + { + "epoch": 0.023484119345524544, + "grad_norm": 1.9127452373504639, + "learning_rate": 0.00019993328740539824, + "loss": 1.549, + "step": 610 + }, + { + "epoch": 0.023676612127045235, + "grad_norm": 2.071992874145508, + "learning_rate": 0.0001999321784459655, + "loss": 1.6769, + "step": 615 + }, + { + "epoch": 0.02386910490856593, + "grad_norm": 1.335153579711914, + "learning_rate": 0.000199931060348397, + "loss": 1.6157, + "step": 620 + }, + { + "epoch": 0.02406159769008662, + "grad_norm": 1.1237496137619019, + "learning_rate": 0.000199929933112795, + "loss": 1.4733, + "step": 625 + }, + { + "epoch": 0.024254090471607315, + "grad_norm": 1.2557927370071411, + "learning_rate": 0.00019992879673926258, + "loss": 1.3888, + "step": 630 + }, + { + "epoch": 0.02444658325312801, + "grad_norm": 1.0877735614776611, + "learning_rate": 0.00019992765122790371, + "loss": 1.4241, + "step": 635 + }, + { + "epoch": 0.0246390760346487, + "grad_norm": 1.0029325485229492, + "learning_rate": 0.00019992649657882307, + "loss": 1.6504, + "step": 640 + }, + { + "epoch": 0.024831568816169394, + "grad_norm": 1.5832372903823853, + "learning_rate": 0.00019992533279212626, + "loss": 1.4662, + "step": 645 + }, + { + "epoch": 0.025024061597690085, + "grad_norm": 1.1658433675765991, + "learning_rate": 0.00019992415986791974, + "loss": 1.3723, + "step": 650 + }, + { + "epoch": 0.02521655437921078, + "grad_norm": 1.8895657062530518, + "learning_rate": 0.00019992297780631072, + "loss": 1.457, + "step": 655 + }, + { + "epoch": 0.025409047160731474, + "grad_norm": 1.193961501121521, + "learning_rate": 0.00019992178660740732, + "loss": 1.623, + "step": 660 + }, + { + "epoch": 0.025601539942252165, + "grad_norm": 0.9851275086402893, + "learning_rate": 0.00019992058627131844, + "loss": 1.6884, + "step": 665 + }, + { + "epoch": 0.02579403272377286, + "grad_norm": 1.5353829860687256, + "learning_rate": 0.00019991937679815386, + "loss": 1.3246, + "step": 670 + }, + { + "epoch": 0.02598652550529355, + "grad_norm": 1.2476325035095215, + "learning_rate": 0.0001999181581880242, + "loss": 1.596, + "step": 675 + }, + { + "epoch": 0.026179018286814244, + "grad_norm": 1.1163430213928223, + "learning_rate": 0.00019991693044104083, + "loss": 1.5077, + "step": 680 + }, + { + "epoch": 0.02637151106833494, + "grad_norm": 1.1388076543807983, + "learning_rate": 0.0001999156935573161, + "loss": 1.4827, + "step": 685 + }, + { + "epoch": 0.02656400384985563, + "grad_norm": 0.9100907444953918, + "learning_rate": 0.00019991444753696304, + "loss": 1.3429, + "step": 690 + }, + { + "epoch": 0.026756496631376324, + "grad_norm": 2.032510995864868, + "learning_rate": 0.00019991319238009565, + "loss": 1.5473, + "step": 695 + }, + { + "epoch": 0.026948989412897015, + "grad_norm": 1.0866800546646118, + "learning_rate": 0.00019991192808682868, + "loss": 1.5552, + "step": 700 + }, + { + "epoch": 0.02714148219441771, + "grad_norm": 1.3941971063613892, + "learning_rate": 0.00019991065465727774, + "loss": 1.4103, + "step": 705 + }, + { + "epoch": 0.027333974975938403, + "grad_norm": 1.721247911453247, + "learning_rate": 0.0001999093720915593, + "loss": 1.4965, + "step": 710 + }, + { + "epoch": 0.027526467757459094, + "grad_norm": 1.4090749025344849, + "learning_rate": 0.00019990808038979058, + "loss": 1.3159, + "step": 715 + }, + { + "epoch": 0.02771896053897979, + "grad_norm": 1.731886625289917, + "learning_rate": 0.00019990677955208973, + "loss": 1.4392, + "step": 720 + }, + { + "epoch": 0.02791145332050048, + "grad_norm": 1.9695488214492798, + "learning_rate": 0.00019990546957857576, + "loss": 1.6206, + "step": 725 + }, + { + "epoch": 0.028103946102021174, + "grad_norm": 0.7977893352508545, + "learning_rate": 0.0001999041504693684, + "loss": 1.5764, + "step": 730 + }, + { + "epoch": 0.02829643888354187, + "grad_norm": 0.9448668360710144, + "learning_rate": 0.00019990282222458826, + "loss": 1.3149, + "step": 735 + }, + { + "epoch": 0.02848893166506256, + "grad_norm": 1.0612679719924927, + "learning_rate": 0.00019990148484435682, + "loss": 1.4942, + "step": 740 + }, + { + "epoch": 0.028681424446583254, + "grad_norm": 1.4038052558898926, + "learning_rate": 0.0001999001383287964, + "loss": 1.5184, + "step": 745 + }, + { + "epoch": 0.028873917228103944, + "grad_norm": 1.0545177459716797, + "learning_rate": 0.0001998987826780301, + "loss": 1.5617, + "step": 750 + }, + { + "epoch": 0.02906641000962464, + "grad_norm": 2.392878532409668, + "learning_rate": 0.0001998974178921819, + "loss": 1.3638, + "step": 755 + }, + { + "epoch": 0.029258902791145333, + "grad_norm": 1.1004624366760254, + "learning_rate": 0.0001998960439713766, + "loss": 1.5162, + "step": 760 + }, + { + "epoch": 0.029451395572666024, + "grad_norm": 1.2530279159545898, + "learning_rate": 0.0001998946609157398, + "loss": 1.5422, + "step": 765 + }, + { + "epoch": 0.02964388835418672, + "grad_norm": 0.8240470290184021, + "learning_rate": 0.00019989326872539803, + "loss": 1.3828, + "step": 770 + }, + { + "epoch": 0.029836381135707413, + "grad_norm": 0.9734111428260803, + "learning_rate": 0.00019989186740047857, + "loss": 1.7041, + "step": 775 + }, + { + "epoch": 0.030028873917228104, + "grad_norm": 0.9785217642784119, + "learning_rate": 0.00019989045694110953, + "loss": 1.6267, + "step": 780 + }, + { + "epoch": 0.030221366698748798, + "grad_norm": 1.3278164863586426, + "learning_rate": 0.00019988903734741994, + "loss": 1.5041, + "step": 785 + }, + { + "epoch": 0.03041385948026949, + "grad_norm": 1.9143437147140503, + "learning_rate": 0.00019988760861953958, + "loss": 1.4728, + "step": 790 + }, + { + "epoch": 0.030606352261790183, + "grad_norm": 1.5717315673828125, + "learning_rate": 0.0001998861707575991, + "loss": 1.3824, + "step": 795 + }, + { + "epoch": 0.030798845043310877, + "grad_norm": 1.0486010313034058, + "learning_rate": 0.00019988472376173, + "loss": 1.6186, + "step": 800 + }, + { + "epoch": 0.03099133782483157, + "grad_norm": 1.1566083431243896, + "learning_rate": 0.00019988326763206458, + "loss": 1.3773, + "step": 805 + }, + { + "epoch": 0.031183830606352263, + "grad_norm": 1.6336543560028076, + "learning_rate": 0.00019988180236873602, + "loss": 1.2998, + "step": 810 + }, + { + "epoch": 0.031376323387872954, + "grad_norm": 1.4655206203460693, + "learning_rate": 0.00019988032797187824, + "loss": 1.3966, + "step": 815 + }, + { + "epoch": 0.031568816169393644, + "grad_norm": 2.0325050354003906, + "learning_rate": 0.00019987884444162618, + "loss": 1.3464, + "step": 820 + }, + { + "epoch": 0.03176130895091434, + "grad_norm": 1.254342794418335, + "learning_rate": 0.0001998773517781154, + "loss": 1.5236, + "step": 825 + }, + { + "epoch": 0.03195380173243503, + "grad_norm": 0.8909908533096313, + "learning_rate": 0.00019987584998148244, + "loss": 1.4838, + "step": 830 + }, + { + "epoch": 0.032146294513955724, + "grad_norm": 1.1440258026123047, + "learning_rate": 0.00019987433905186458, + "loss": 1.3952, + "step": 835 + }, + { + "epoch": 0.03233878729547642, + "grad_norm": 1.2138668298721313, + "learning_rate": 0.00019987281898940003, + "loss": 1.5982, + "step": 840 + }, + { + "epoch": 0.03253128007699711, + "grad_norm": 1.1847470998764038, + "learning_rate": 0.00019987128979422782, + "loss": 1.4313, + "step": 845 + }, + { + "epoch": 0.032723772858517804, + "grad_norm": 1.4961762428283691, + "learning_rate": 0.0001998697514664877, + "loss": 1.5187, + "step": 850 + }, + { + "epoch": 0.0329162656400385, + "grad_norm": 1.4735344648361206, + "learning_rate": 0.00019986820400632043, + "loss": 1.5443, + "step": 855 + }, + { + "epoch": 0.03310875842155919, + "grad_norm": 1.1350771188735962, + "learning_rate": 0.00019986664741386743, + "loss": 1.5219, + "step": 860 + }, + { + "epoch": 0.03330125120307988, + "grad_norm": 1.098781943321228, + "learning_rate": 0.0001998650816892711, + "loss": 1.6074, + "step": 865 + }, + { + "epoch": 0.033493743984600574, + "grad_norm": 1.9639078378677368, + "learning_rate": 0.0001998635068326746, + "loss": 1.342, + "step": 870 + }, + { + "epoch": 0.03368623676612127, + "grad_norm": 1.1193336248397827, + "learning_rate": 0.00019986192284422193, + "loss": 1.5647, + "step": 875 + }, + { + "epoch": 0.03387872954764196, + "grad_norm": 1.0558106899261475, + "learning_rate": 0.00019986032972405793, + "loss": 1.2448, + "step": 880 + }, + { + "epoch": 0.034071222329162654, + "grad_norm": 1.1178051233291626, + "learning_rate": 0.0001998587274723283, + "loss": 1.3455, + "step": 885 + }, + { + "epoch": 0.03426371511068335, + "grad_norm": 1.728400468826294, + "learning_rate": 0.0001998571160891795, + "loss": 1.44, + "step": 890 + }, + { + "epoch": 0.03445620789220404, + "grad_norm": 1.158931016921997, + "learning_rate": 0.000199855495574759, + "loss": 1.4247, + "step": 895 + }, + { + "epoch": 0.03464870067372473, + "grad_norm": 1.8745627403259277, + "learning_rate": 0.0001998538659292149, + "loss": 1.4036, + "step": 900 + }, + { + "epoch": 0.03484119345524543, + "grad_norm": 1.4273000955581665, + "learning_rate": 0.0001998522271526962, + "loss": 1.4857, + "step": 905 + }, + { + "epoch": 0.03503368623676612, + "grad_norm": 1.1671931743621826, + "learning_rate": 0.0001998505792453528, + "loss": 1.7199, + "step": 910 + }, + { + "epoch": 0.03522617901828681, + "grad_norm": 1.1703475713729858, + "learning_rate": 0.00019984892220733537, + "loss": 1.5659, + "step": 915 + }, + { + "epoch": 0.03541867179980751, + "grad_norm": 0.8550274968147278, + "learning_rate": 0.00019984725603879546, + "loss": 1.3608, + "step": 920 + }, + { + "epoch": 0.0356111645813282, + "grad_norm": 1.676072359085083, + "learning_rate": 0.0001998455807398854, + "loss": 1.4841, + "step": 925 + }, + { + "epoch": 0.03580365736284889, + "grad_norm": 1.362423062324524, + "learning_rate": 0.00019984389631075842, + "loss": 1.5501, + "step": 930 + }, + { + "epoch": 0.03599615014436958, + "grad_norm": 1.1643259525299072, + "learning_rate": 0.0001998422027515685, + "loss": 1.4954, + "step": 935 + }, + { + "epoch": 0.03618864292589028, + "grad_norm": 1.4984415769577026, + "learning_rate": 0.00019984050006247053, + "loss": 1.337, + "step": 940 + }, + { + "epoch": 0.03638113570741097, + "grad_norm": 1.399708867073059, + "learning_rate": 0.00019983878824362023, + "loss": 1.5546, + "step": 945 + }, + { + "epoch": 0.03657362848893166, + "grad_norm": 1.8458516597747803, + "learning_rate": 0.00019983706729517412, + "loss": 1.5268, + "step": 950 + }, + { + "epoch": 0.03676612127045236, + "grad_norm": 1.1428085565567017, + "learning_rate": 0.00019983533721728956, + "loss": 1.4454, + "step": 955 + }, + { + "epoch": 0.03695861405197305, + "grad_norm": 1.2200374603271484, + "learning_rate": 0.00019983359801012475, + "loss": 1.5586, + "step": 960 + }, + { + "epoch": 0.03715110683349374, + "grad_norm": 1.3679723739624023, + "learning_rate": 0.00019983184967383875, + "loss": 1.3948, + "step": 965 + }, + { + "epoch": 0.03734359961501444, + "grad_norm": 1.489397644996643, + "learning_rate": 0.00019983009220859142, + "loss": 1.5154, + "step": 970 + }, + { + "epoch": 0.03753609239653513, + "grad_norm": 1.0442456007003784, + "learning_rate": 0.00019982832561454345, + "loss": 1.5704, + "step": 975 + }, + { + "epoch": 0.03772858517805582, + "grad_norm": 1.7480882406234741, + "learning_rate": 0.00019982654989185642, + "loss": 1.5235, + "step": 980 + }, + { + "epoch": 0.03792107795957651, + "grad_norm": 1.0078760385513306, + "learning_rate": 0.00019982476504069272, + "loss": 1.3936, + "step": 985 + }, + { + "epoch": 0.03811357074109721, + "grad_norm": 1.0461446046829224, + "learning_rate": 0.0001998229710612155, + "loss": 1.6994, + "step": 990 + }, + { + "epoch": 0.0383060635226179, + "grad_norm": 2.1919922828674316, + "learning_rate": 0.00019982116795358885, + "loss": 1.5739, + "step": 995 + }, + { + "epoch": 0.03849855630413859, + "grad_norm": 1.7092692852020264, + "learning_rate": 0.00019981935571797768, + "loss": 1.2746, + "step": 1000 + }, + { + "epoch": 0.03869104908565929, + "grad_norm": 1.3044835329055786, + "learning_rate": 0.00019981753435454764, + "loss": 1.5254, + "step": 1005 + }, + { + "epoch": 0.03888354186717998, + "grad_norm": 1.1550064086914062, + "learning_rate": 0.0001998157038634653, + "loss": 1.6154, + "step": 1010 + }, + { + "epoch": 0.03907603464870067, + "grad_norm": 2.0250370502471924, + "learning_rate": 0.00019981386424489808, + "loss": 1.4807, + "step": 1015 + }, + { + "epoch": 0.03926852743022137, + "grad_norm": 1.036095380783081, + "learning_rate": 0.00019981201549901419, + "loss": 1.4124, + "step": 1020 + }, + { + "epoch": 0.03946102021174206, + "grad_norm": 1.126434564590454, + "learning_rate": 0.0001998101576259827, + "loss": 1.4959, + "step": 1025 + }, + { + "epoch": 0.03965351299326275, + "grad_norm": 1.2912375926971436, + "learning_rate": 0.00019980829062597342, + "loss": 1.5006, + "step": 1030 + }, + { + "epoch": 0.03984600577478344, + "grad_norm": 1.5378974676132202, + "learning_rate": 0.00019980641449915713, + "loss": 1.3073, + "step": 1035 + }, + { + "epoch": 0.04003849855630414, + "grad_norm": 1.52741277217865, + "learning_rate": 0.0001998045292457054, + "loss": 1.3709, + "step": 1040 + }, + { + "epoch": 0.04023099133782483, + "grad_norm": 1.6989667415618896, + "learning_rate": 0.00019980263486579064, + "loss": 1.4784, + "step": 1045 + }, + { + "epoch": 0.04042348411934552, + "grad_norm": 1.0623974800109863, + "learning_rate": 0.00019980073135958607, + "loss": 1.5163, + "step": 1050 + }, + { + "epoch": 0.04061597690086622, + "grad_norm": 1.323283314704895, + "learning_rate": 0.0001997988187272657, + "loss": 1.4793, + "step": 1055 + }, + { + "epoch": 0.04080846968238691, + "grad_norm": 1.4508922100067139, + "learning_rate": 0.00019979689696900447, + "loss": 1.4746, + "step": 1060 + }, + { + "epoch": 0.0410009624639076, + "grad_norm": 1.159579873085022, + "learning_rate": 0.0001997949660849781, + "loss": 1.2928, + "step": 1065 + }, + { + "epoch": 0.0411934552454283, + "grad_norm": 1.5187591314315796, + "learning_rate": 0.0001997930260753632, + "loss": 1.5116, + "step": 1070 + }, + { + "epoch": 0.04138594802694899, + "grad_norm": 1.7137175798416138, + "learning_rate": 0.0001997910769403371, + "loss": 1.6406, + "step": 1075 + }, + { + "epoch": 0.04157844080846968, + "grad_norm": 1.221326470375061, + "learning_rate": 0.00019978911868007807, + "loss": 1.418, + "step": 1080 + }, + { + "epoch": 0.04177093358999037, + "grad_norm": 1.0666981935501099, + "learning_rate": 0.0001997871512947652, + "loss": 1.3768, + "step": 1085 + }, + { + "epoch": 0.04196342637151107, + "grad_norm": 0.9577809572219849, + "learning_rate": 0.00019978517478457834, + "loss": 1.4915, + "step": 1090 + }, + { + "epoch": 0.04215591915303176, + "grad_norm": 2.3966264724731445, + "learning_rate": 0.00019978318914969827, + "loss": 1.7057, + "step": 1095 + }, + { + "epoch": 0.04234841193455245, + "grad_norm": 1.0523775815963745, + "learning_rate": 0.0001997811943903066, + "loss": 1.3887, + "step": 1100 + }, + { + "epoch": 0.04254090471607315, + "grad_norm": 1.3975977897644043, + "learning_rate": 0.00019977919050658566, + "loss": 1.5335, + "step": 1105 + }, + { + "epoch": 0.04273339749759384, + "grad_norm": 1.5198701620101929, + "learning_rate": 0.0001997771774987187, + "loss": 1.3939, + "step": 1110 + }, + { + "epoch": 0.04292589027911453, + "grad_norm": 0.7943345308303833, + "learning_rate": 0.00019977515536688984, + "loss": 1.5908, + "step": 1115 + }, + { + "epoch": 0.04311838306063523, + "grad_norm": 0.9602519869804382, + "learning_rate": 0.00019977312411128398, + "loss": 1.3225, + "step": 1120 + }, + { + "epoch": 0.04331087584215592, + "grad_norm": 1.0204732418060303, + "learning_rate": 0.00019977108373208687, + "loss": 1.518, + "step": 1125 + }, + { + "epoch": 0.04350336862367661, + "grad_norm": 1.2130141258239746, + "learning_rate": 0.00019976903422948503, + "loss": 1.3693, + "step": 1130 + }, + { + "epoch": 0.0436958614051973, + "grad_norm": 0.854958176612854, + "learning_rate": 0.00019976697560366598, + "loss": 1.4907, + "step": 1135 + }, + { + "epoch": 0.043888354186718, + "grad_norm": 1.3699367046356201, + "learning_rate": 0.00019976490785481789, + "loss": 1.4448, + "step": 1140 + }, + { + "epoch": 0.04408084696823869, + "grad_norm": 1.1766821146011353, + "learning_rate": 0.00019976283098312983, + "loss": 1.5171, + "step": 1145 + }, + { + "epoch": 0.04427333974975938, + "grad_norm": 1.6543035507202148, + "learning_rate": 0.00019976074498879174, + "loss": 1.2751, + "step": 1150 + }, + { + "epoch": 0.04446583253128008, + "grad_norm": 1.2228333950042725, + "learning_rate": 0.0001997586498719944, + "loss": 1.4522, + "step": 1155 + }, + { + "epoch": 0.04465832531280077, + "grad_norm": 1.2733262777328491, + "learning_rate": 0.00019975654563292937, + "loss": 1.6292, + "step": 1160 + }, + { + "epoch": 0.04485081809432146, + "grad_norm": 1.3934366703033447, + "learning_rate": 0.00019975443227178904, + "loss": 1.433, + "step": 1165 + }, + { + "epoch": 0.04504331087584216, + "grad_norm": 1.5495753288269043, + "learning_rate": 0.00019975230978876672, + "loss": 1.5803, + "step": 1170 + }, + { + "epoch": 0.04523580365736285, + "grad_norm": 1.0099114179611206, + "learning_rate": 0.00019975017818405646, + "loss": 1.3434, + "step": 1175 + }, + { + "epoch": 0.04542829643888354, + "grad_norm": 0.9009067416191101, + "learning_rate": 0.0001997480374578532, + "loss": 1.2312, + "step": 1180 + }, + { + "epoch": 0.04562078922040423, + "grad_norm": 1.8678425550460815, + "learning_rate": 0.00019974588761035266, + "loss": 1.6331, + "step": 1185 + }, + { + "epoch": 0.04581328200192493, + "grad_norm": 0.8258862495422363, + "learning_rate": 0.00019974372864175148, + "loss": 1.4584, + "step": 1190 + }, + { + "epoch": 0.04600577478344562, + "grad_norm": 1.44557523727417, + "learning_rate": 0.00019974156055224706, + "loss": 1.4866, + "step": 1195 + }, + { + "epoch": 0.04619826756496631, + "grad_norm": 1.7249491214752197, + "learning_rate": 0.00019973938334203763, + "loss": 1.3704, + "step": 1200 + }, + { + "epoch": 0.04639076034648701, + "grad_norm": 1.005623698234558, + "learning_rate": 0.0001997371970113223, + "loss": 1.1993, + "step": 1205 + }, + { + "epoch": 0.0465832531280077, + "grad_norm": 1.4596670866012573, + "learning_rate": 0.00019973500156030105, + "loss": 1.4996, + "step": 1210 + }, + { + "epoch": 0.04677574590952839, + "grad_norm": 1.3085503578186035, + "learning_rate": 0.00019973279698917454, + "loss": 1.441, + "step": 1215 + }, + { + "epoch": 0.04696823869104909, + "grad_norm": 0.9477142691612244, + "learning_rate": 0.00019973058329814445, + "loss": 1.5278, + "step": 1220 + }, + { + "epoch": 0.04716073147256978, + "grad_norm": 0.9040088653564453, + "learning_rate": 0.00019972836048741318, + "loss": 1.5374, + "step": 1225 + }, + { + "epoch": 0.04735322425409047, + "grad_norm": 1.7435801029205322, + "learning_rate": 0.00019972612855718395, + "loss": 1.3884, + "step": 1230 + }, + { + "epoch": 0.04754571703561117, + "grad_norm": 1.180665135383606, + "learning_rate": 0.00019972388750766088, + "loss": 1.2097, + "step": 1235 + }, + { + "epoch": 0.04773820981713186, + "grad_norm": 1.066064715385437, + "learning_rate": 0.00019972163733904895, + "loss": 1.4299, + "step": 1240 + }, + { + "epoch": 0.04793070259865255, + "grad_norm": 1.1051660776138306, + "learning_rate": 0.00019971937805155382, + "loss": 1.5055, + "step": 1245 + }, + { + "epoch": 0.04812319538017324, + "grad_norm": 1.2021822929382324, + "learning_rate": 0.0001997171096453822, + "loss": 1.5842, + "step": 1250 + }, + { + "epoch": 0.04831568816169394, + "grad_norm": 2.1715807914733887, + "learning_rate": 0.00019971483212074146, + "loss": 1.4096, + "step": 1255 + }, + { + "epoch": 0.04850818094321463, + "grad_norm": 1.1615819931030273, + "learning_rate": 0.00019971254547783987, + "loss": 1.2554, + "step": 1260 + }, + { + "epoch": 0.04870067372473532, + "grad_norm": 1.5363492965698242, + "learning_rate": 0.00019971024971688652, + "loss": 1.5773, + "step": 1265 + }, + { + "epoch": 0.04889316650625602, + "grad_norm": 1.3774447441101074, + "learning_rate": 0.00019970794483809137, + "loss": 1.3441, + "step": 1270 + }, + { + "epoch": 0.04908565928777671, + "grad_norm": 2.065901041030884, + "learning_rate": 0.00019970563084166515, + "loss": 1.6342, + "step": 1275 + }, + { + "epoch": 0.0492781520692974, + "grad_norm": 1.3221025466918945, + "learning_rate": 0.0001997033077278195, + "loss": 1.4967, + "step": 1280 + }, + { + "epoch": 0.0494706448508181, + "grad_norm": 1.6636276245117188, + "learning_rate": 0.00019970097549676684, + "loss": 1.4936, + "step": 1285 + }, + { + "epoch": 0.04966313763233879, + "grad_norm": 1.4630615711212158, + "learning_rate": 0.0001996986341487204, + "loss": 1.4096, + "step": 1290 + }, + { + "epoch": 0.04985563041385948, + "grad_norm": 1.9586588144302368, + "learning_rate": 0.00019969628368389432, + "loss": 1.5956, + "step": 1295 + }, + { + "epoch": 0.05004812319538017, + "grad_norm": 1.0234311819076538, + "learning_rate": 0.00019969392410250353, + "loss": 1.247, + "step": 1300 + }, + { + "epoch": 0.05024061597690087, + "grad_norm": 1.7005319595336914, + "learning_rate": 0.0001996915554047638, + "loss": 1.4179, + "step": 1305 + }, + { + "epoch": 0.05043310875842156, + "grad_norm": 1.3052936792373657, + "learning_rate": 0.0001996891775908917, + "loss": 1.4002, + "step": 1310 + }, + { + "epoch": 0.05062560153994225, + "grad_norm": 1.0146903991699219, + "learning_rate": 0.00019968679066110473, + "loss": 1.5062, + "step": 1315 + }, + { + "epoch": 0.05081809432146295, + "grad_norm": 0.9611810445785522, + "learning_rate": 0.00019968439461562104, + "loss": 1.5303, + "step": 1320 + }, + { + "epoch": 0.05101058710298364, + "grad_norm": 0.8518236875534058, + "learning_rate": 0.0001996819894546599, + "loss": 1.3589, + "step": 1325 + }, + { + "epoch": 0.05120307988450433, + "grad_norm": 1.6918632984161377, + "learning_rate": 0.00019967957517844111, + "loss": 1.4589, + "step": 1330 + }, + { + "epoch": 0.05139557266602503, + "grad_norm": 1.4838560819625854, + "learning_rate": 0.00019967715178718551, + "loss": 1.2714, + "step": 1335 + }, + { + "epoch": 0.05158806544754572, + "grad_norm": 1.291231632232666, + "learning_rate": 0.00019967471928111465, + "loss": 1.6378, + "step": 1340 + }, + { + "epoch": 0.05178055822906641, + "grad_norm": 1.2091941833496094, + "learning_rate": 0.00019967227766045102, + "loss": 1.3985, + "step": 1345 + }, + { + "epoch": 0.0519730510105871, + "grad_norm": 1.2294058799743652, + "learning_rate": 0.00019966982692541785, + "loss": 1.498, + "step": 1350 + }, + { + "epoch": 0.0521655437921078, + "grad_norm": 1.1644397974014282, + "learning_rate": 0.00019966736707623928, + "loss": 1.4185, + "step": 1355 + }, + { + "epoch": 0.05235803657362849, + "grad_norm": 1.7669397592544556, + "learning_rate": 0.0001996648981131402, + "loss": 1.3564, + "step": 1360 + }, + { + "epoch": 0.05255052935514918, + "grad_norm": 0.7178487777709961, + "learning_rate": 0.00019966242003634644, + "loss": 1.2015, + "step": 1365 + }, + { + "epoch": 0.05274302213666988, + "grad_norm": 0.8149698376655579, + "learning_rate": 0.00019965993284608457, + "loss": 1.4046, + "step": 1370 + }, + { + "epoch": 0.05293551491819057, + "grad_norm": 1.3934742212295532, + "learning_rate": 0.00019965743654258198, + "loss": 1.5289, + "step": 1375 + }, + { + "epoch": 0.05312800769971126, + "grad_norm": 1.060002326965332, + "learning_rate": 0.00019965493112606702, + "loss": 1.391, + "step": 1380 + }, + { + "epoch": 0.05332050048123196, + "grad_norm": 1.1154258251190186, + "learning_rate": 0.00019965241659676875, + "loss": 1.3004, + "step": 1385 + }, + { + "epoch": 0.05351299326275265, + "grad_norm": 1.8101186752319336, + "learning_rate": 0.00019964989295491713, + "loss": 1.4968, + "step": 1390 + }, + { + "epoch": 0.05370548604427334, + "grad_norm": 1.075211524963379, + "learning_rate": 0.00019964736020074294, + "loss": 1.5198, + "step": 1395 + }, + { + "epoch": 0.05389797882579403, + "grad_norm": 2.0130980014801025, + "learning_rate": 0.00019964481833447775, + "loss": 1.5495, + "step": 1400 + }, + { + "epoch": 0.05409047160731473, + "grad_norm": 1.214570164680481, + "learning_rate": 0.000199642267356354, + "loss": 1.5886, + "step": 1405 + }, + { + "epoch": 0.05428296438883542, + "grad_norm": 1.6430037021636963, + "learning_rate": 0.00019963970726660497, + "loss": 1.5293, + "step": 1410 + }, + { + "epoch": 0.05447545717035611, + "grad_norm": 0.94575035572052, + "learning_rate": 0.00019963713806546478, + "loss": 1.276, + "step": 1415 + }, + { + "epoch": 0.05466794995187681, + "grad_norm": 1.1988322734832764, + "learning_rate": 0.00019963455975316832, + "loss": 1.3151, + "step": 1420 + }, + { + "epoch": 0.0548604427333975, + "grad_norm": 1.2768787145614624, + "learning_rate": 0.00019963197232995142, + "loss": 1.5559, + "step": 1425 + }, + { + "epoch": 0.05505293551491819, + "grad_norm": 1.5184259414672852, + "learning_rate": 0.0001996293757960506, + "loss": 1.2998, + "step": 1430 + }, + { + "epoch": 0.055245428296438887, + "grad_norm": 6.240184783935547, + "learning_rate": 0.0001996267701517034, + "loss": 1.4497, + "step": 1435 + }, + { + "epoch": 0.05543792107795958, + "grad_norm": 1.4356882572174072, + "learning_rate": 0.00019962415539714803, + "loss": 1.6364, + "step": 1440 + }, + { + "epoch": 0.05563041385948027, + "grad_norm": 0.9310120940208435, + "learning_rate": 0.00019962153153262358, + "loss": 1.417, + "step": 1445 + }, + { + "epoch": 0.05582290664100096, + "grad_norm": 1.2131333351135254, + "learning_rate": 0.00019961889855837, + "loss": 1.4059, + "step": 1450 + }, + { + "epoch": 0.05601539942252166, + "grad_norm": 1.2134804725646973, + "learning_rate": 0.00019961625647462808, + "loss": 1.458, + "step": 1455 + }, + { + "epoch": 0.05620789220404235, + "grad_norm": 1.5725634098052979, + "learning_rate": 0.0001996136052816394, + "loss": 1.352, + "step": 1460 + }, + { + "epoch": 0.05640038498556304, + "grad_norm": 0.9882212281227112, + "learning_rate": 0.00019961094497964642, + "loss": 1.1665, + "step": 1465 + }, + { + "epoch": 0.05659287776708374, + "grad_norm": 1.055966854095459, + "learning_rate": 0.00019960827556889235, + "loss": 1.388, + "step": 1470 + }, + { + "epoch": 0.05678537054860443, + "grad_norm": 1.0809309482574463, + "learning_rate": 0.00019960559704962133, + "loss": 1.4287, + "step": 1475 + }, + { + "epoch": 0.05697786333012512, + "grad_norm": 1.0014935731887817, + "learning_rate": 0.00019960290942207828, + "loss": 1.5539, + "step": 1480 + }, + { + "epoch": 0.057170356111645816, + "grad_norm": 1.1717151403427124, + "learning_rate": 0.000199600212686509, + "loss": 1.3619, + "step": 1485 + }, + { + "epoch": 0.05736284889316651, + "grad_norm": 1.3981553316116333, + "learning_rate": 0.00019959750684316, + "loss": 1.3303, + "step": 1490 + }, + { + "epoch": 0.0575553416746872, + "grad_norm": 0.7471413016319275, + "learning_rate": 0.00019959479189227884, + "loss": 1.4048, + "step": 1495 + }, + { + "epoch": 0.05774783445620789, + "grad_norm": 1.1570223569869995, + "learning_rate": 0.00019959206783411372, + "loss": 1.6713, + "step": 1500 + }, + { + "epoch": 0.05794032723772859, + "grad_norm": 1.4656585454940796, + "learning_rate": 0.00019958933466891366, + "loss": 1.3911, + "step": 1505 + }, + { + "epoch": 0.05813282001924928, + "grad_norm": 1.5338329076766968, + "learning_rate": 0.0001995865923969287, + "loss": 1.578, + "step": 1510 + }, + { + "epoch": 0.05832531280076997, + "grad_norm": 0.9481655955314636, + "learning_rate": 0.0001995838410184096, + "loss": 1.2903, + "step": 1515 + }, + { + "epoch": 0.058517805582290666, + "grad_norm": 1.4928970336914062, + "learning_rate": 0.00019958108053360788, + "loss": 1.4139, + "step": 1520 + }, + { + "epoch": 0.05871029836381136, + "grad_norm": 1.015381932258606, + "learning_rate": 0.00019957831094277604, + "loss": 1.5427, + "step": 1525 + }, + { + "epoch": 0.05890279114533205, + "grad_norm": 1.3471331596374512, + "learning_rate": 0.0001995755322461673, + "loss": 1.3763, + "step": 1530 + }, + { + "epoch": 0.059095283926852746, + "grad_norm": 2.0942165851593018, + "learning_rate": 0.00019957274444403576, + "loss": 1.4669, + "step": 1535 + }, + { + "epoch": 0.05928777670837344, + "grad_norm": 1.4853599071502686, + "learning_rate": 0.00019956994753663634, + "loss": 1.4259, + "step": 1540 + }, + { + "epoch": 0.05948026948989413, + "grad_norm": 1.3337596654891968, + "learning_rate": 0.0001995671415242248, + "loss": 1.4169, + "step": 1545 + }, + { + "epoch": 0.059672762271414825, + "grad_norm": 1.3816536664962769, + "learning_rate": 0.00019956432640705777, + "loss": 1.3679, + "step": 1550 + }, + { + "epoch": 0.059865255052935516, + "grad_norm": 1.1726235151290894, + "learning_rate": 0.00019956150218539262, + "loss": 1.4076, + "step": 1555 + }, + { + "epoch": 0.06005774783445621, + "grad_norm": 1.419520378112793, + "learning_rate": 0.00019955866885948764, + "loss": 1.3621, + "step": 1560 + }, + { + "epoch": 0.0602502406159769, + "grad_norm": 1.4154486656188965, + "learning_rate": 0.0001995558264296019, + "loss": 1.4221, + "step": 1565 + }, + { + "epoch": 0.060442733397497596, + "grad_norm": 1.4721988439559937, + "learning_rate": 0.00019955297489599537, + "loss": 1.3641, + "step": 1570 + }, + { + "epoch": 0.06063522617901829, + "grad_norm": 1.1087952852249146, + "learning_rate": 0.0001995501142589287, + "loss": 1.3734, + "step": 1575 + }, + { + "epoch": 0.06082771896053898, + "grad_norm": 1.4815518856048584, + "learning_rate": 0.00019954724451866357, + "loss": 1.4042, + "step": 1580 + }, + { + "epoch": 0.061020211742059675, + "grad_norm": 1.835754632949829, + "learning_rate": 0.00019954436567546236, + "loss": 1.2457, + "step": 1585 + }, + { + "epoch": 0.061212704523580366, + "grad_norm": 1.3139601945877075, + "learning_rate": 0.00019954147772958836, + "loss": 1.4457, + "step": 1590 + }, + { + "epoch": 0.06140519730510106, + "grad_norm": 1.155369758605957, + "learning_rate": 0.0001995385806813056, + "loss": 1.3483, + "step": 1595 + }, + { + "epoch": 0.061597690086621755, + "grad_norm": 1.1897907257080078, + "learning_rate": 0.00019953567453087902, + "loss": 1.467, + "step": 1600 + }, + { + "epoch": 0.061790182868142446, + "grad_norm": 1.0794181823730469, + "learning_rate": 0.00019953275927857438, + "loss": 1.5171, + "step": 1605 + }, + { + "epoch": 0.06198267564966314, + "grad_norm": 0.9538444876670837, + "learning_rate": 0.00019952983492465824, + "loss": 1.2643, + "step": 1610 + }, + { + "epoch": 0.06217516843118383, + "grad_norm": 1.1179461479187012, + "learning_rate": 0.00019952690146939804, + "loss": 1.408, + "step": 1615 + }, + { + "epoch": 0.062367661212704525, + "grad_norm": 1.8034144639968872, + "learning_rate": 0.00019952395891306197, + "loss": 1.3685, + "step": 1620 + }, + { + "epoch": 0.06256015399422522, + "grad_norm": 1.04547119140625, + "learning_rate": 0.00019952100725591912, + "loss": 1.4271, + "step": 1625 + }, + { + "epoch": 0.06275264677574591, + "grad_norm": 1.3097724914550781, + "learning_rate": 0.00019951804649823949, + "loss": 1.3303, + "step": 1630 + }, + { + "epoch": 0.0629451395572666, + "grad_norm": 1.8794469833374023, + "learning_rate": 0.00019951507664029374, + "loss": 1.5223, + "step": 1635 + }, + { + "epoch": 0.06313763233878729, + "grad_norm": 1.4077703952789307, + "learning_rate": 0.00019951209768235344, + "loss": 1.5582, + "step": 1640 + }, + { + "epoch": 0.06333012512030799, + "grad_norm": 1.2244471311569214, + "learning_rate": 0.000199509109624691, + "loss": 1.3437, + "step": 1645 + }, + { + "epoch": 0.06352261790182868, + "grad_norm": 1.4610791206359863, + "learning_rate": 0.00019950611246757972, + "loss": 1.6944, + "step": 1650 + }, + { + "epoch": 0.06371511068334937, + "grad_norm": 1.544989824295044, + "learning_rate": 0.00019950310621129358, + "loss": 1.3288, + "step": 1655 + }, + { + "epoch": 0.06390760346487007, + "grad_norm": 1.4837945699691772, + "learning_rate": 0.00019950009085610755, + "loss": 1.1296, + "step": 1660 + }, + { + "epoch": 0.06410009624639076, + "grad_norm": 2.2527410984039307, + "learning_rate": 0.0001994970664022973, + "loss": 1.3105, + "step": 1665 + }, + { + "epoch": 0.06429258902791145, + "grad_norm": 1.3723945617675781, + "learning_rate": 0.00019949403285013948, + "loss": 1.3976, + "step": 1670 + }, + { + "epoch": 0.06448508180943215, + "grad_norm": 1.571265459060669, + "learning_rate": 0.0001994909901999114, + "loss": 1.4603, + "step": 1675 + }, + { + "epoch": 0.06467757459095284, + "grad_norm": 1.2445194721221924, + "learning_rate": 0.00019948793845189137, + "loss": 1.3072, + "step": 1680 + }, + { + "epoch": 0.06487006737247353, + "grad_norm": 2.068112373352051, + "learning_rate": 0.00019948487760635842, + "loss": 1.4638, + "step": 1685 + }, + { + "epoch": 0.06506256015399423, + "grad_norm": 1.0896637439727783, + "learning_rate": 0.00019948180766359244, + "loss": 1.3184, + "step": 1690 + }, + { + "epoch": 0.06525505293551492, + "grad_norm": 2.0666351318359375, + "learning_rate": 0.00019947872862387413, + "loss": 1.3944, + "step": 1695 + }, + { + "epoch": 0.06544754571703561, + "grad_norm": 1.5204085111618042, + "learning_rate": 0.00019947564048748508, + "loss": 1.3795, + "step": 1700 + }, + { + "epoch": 0.0656400384985563, + "grad_norm": 0.9768043160438538, + "learning_rate": 0.00019947254325470768, + "loss": 1.3329, + "step": 1705 + }, + { + "epoch": 0.065832531280077, + "grad_norm": 1.3453469276428223, + "learning_rate": 0.00019946943692582516, + "loss": 1.304, + "step": 1710 + }, + { + "epoch": 0.06602502406159769, + "grad_norm": 1.0725489854812622, + "learning_rate": 0.00019946632150112152, + "loss": 1.5547, + "step": 1715 + }, + { + "epoch": 0.06621751684311838, + "grad_norm": 1.5973418951034546, + "learning_rate": 0.0001994631969808817, + "loss": 1.3263, + "step": 1720 + }, + { + "epoch": 0.06641000962463908, + "grad_norm": 1.2451751232147217, + "learning_rate": 0.0001994600633653914, + "loss": 1.4935, + "step": 1725 + }, + { + "epoch": 0.06660250240615977, + "grad_norm": 1.3474830389022827, + "learning_rate": 0.00019945692065493717, + "loss": 1.6282, + "step": 1730 + }, + { + "epoch": 0.06679499518768046, + "grad_norm": 1.7913939952850342, + "learning_rate": 0.00019945376884980643, + "loss": 1.2935, + "step": 1735 + }, + { + "epoch": 0.06698748796920115, + "grad_norm": 1.0764446258544922, + "learning_rate": 0.00019945060795028728, + "loss": 1.6034, + "step": 1740 + }, + { + "epoch": 0.06717998075072185, + "grad_norm": 1.0572975873947144, + "learning_rate": 0.00019944743795666887, + "loss": 1.3997, + "step": 1745 + }, + { + "epoch": 0.06737247353224254, + "grad_norm": 1.3195079565048218, + "learning_rate": 0.00019944425886924102, + "loss": 1.4838, + "step": 1750 + }, + { + "epoch": 0.06756496631376323, + "grad_norm": 1.0044989585876465, + "learning_rate": 0.00019944107068829448, + "loss": 1.388, + "step": 1755 + }, + { + "epoch": 0.06775745909528393, + "grad_norm": 1.8276032209396362, + "learning_rate": 0.0001994378734141207, + "loss": 1.447, + "step": 1760 + }, + { + "epoch": 0.06794995187680462, + "grad_norm": 1.5056366920471191, + "learning_rate": 0.00019943466704701218, + "loss": 1.5153, + "step": 1765 + }, + { + "epoch": 0.06814244465832531, + "grad_norm": 1.6947304010391235, + "learning_rate": 0.00019943145158726205, + "loss": 1.5551, + "step": 1770 + }, + { + "epoch": 0.068334937439846, + "grad_norm": 0.9702686667442322, + "learning_rate": 0.00019942822703516433, + "loss": 1.3168, + "step": 1775 + }, + { + "epoch": 0.0685274302213667, + "grad_norm": 1.6755216121673584, + "learning_rate": 0.0001994249933910139, + "loss": 1.6223, + "step": 1780 + }, + { + "epoch": 0.06871992300288739, + "grad_norm": 1.3666303157806396, + "learning_rate": 0.00019942175065510643, + "loss": 1.5748, + "step": 1785 + }, + { + "epoch": 0.06891241578440808, + "grad_norm": 1.3785196542739868, + "learning_rate": 0.0001994184988277385, + "loss": 1.4033, + "step": 1790 + }, + { + "epoch": 0.06910490856592878, + "grad_norm": 1.081828236579895, + "learning_rate": 0.00019941523790920743, + "loss": 1.4, + "step": 1795 + }, + { + "epoch": 0.06929740134744947, + "grad_norm": 1.1024401187896729, + "learning_rate": 0.0001994119678998114, + "loss": 1.4751, + "step": 1800 + }, + { + "epoch": 0.06948989412897016, + "grad_norm": 3.584055185317993, + "learning_rate": 0.0001994086887998495, + "loss": 1.3449, + "step": 1805 + }, + { + "epoch": 0.06968238691049086, + "grad_norm": 0.9418397545814514, + "learning_rate": 0.0001994054006096215, + "loss": 1.3217, + "step": 1810 + }, + { + "epoch": 0.06987487969201155, + "grad_norm": 1.6071193218231201, + "learning_rate": 0.00019940210332942813, + "loss": 1.3636, + "step": 1815 + }, + { + "epoch": 0.07006737247353224, + "grad_norm": 2.0080580711364746, + "learning_rate": 0.00019939879695957084, + "loss": 1.4779, + "step": 1820 + }, + { + "epoch": 0.07025986525505294, + "grad_norm": 1.169058918952942, + "learning_rate": 0.00019939548150035207, + "loss": 1.4031, + "step": 1825 + }, + { + "epoch": 0.07045235803657363, + "grad_norm": 0.9863006472587585, + "learning_rate": 0.00019939215695207496, + "loss": 1.3832, + "step": 1830 + }, + { + "epoch": 0.07064485081809432, + "grad_norm": 1.2257460355758667, + "learning_rate": 0.00019938882331504347, + "loss": 1.4967, + "step": 1835 + }, + { + "epoch": 0.07083734359961502, + "grad_norm": 1.0062893629074097, + "learning_rate": 0.00019938548058956253, + "loss": 1.2637, + "step": 1840 + }, + { + "epoch": 0.0710298363811357, + "grad_norm": 1.4179530143737793, + "learning_rate": 0.0001993821287759377, + "loss": 1.2961, + "step": 1845 + }, + { + "epoch": 0.0712223291626564, + "grad_norm": 1.2181779146194458, + "learning_rate": 0.00019937876787447557, + "loss": 1.4104, + "step": 1850 + }, + { + "epoch": 0.07141482194417709, + "grad_norm": 1.6110061407089233, + "learning_rate": 0.00019937539788548344, + "loss": 1.4045, + "step": 1855 + }, + { + "epoch": 0.07160731472569778, + "grad_norm": 1.2814903259277344, + "learning_rate": 0.0001993720188092695, + "loss": 1.4194, + "step": 1860 + }, + { + "epoch": 0.07179980750721848, + "grad_norm": 1.382265329360962, + "learning_rate": 0.00019936863064614268, + "loss": 1.5848, + "step": 1865 + }, + { + "epoch": 0.07199230028873917, + "grad_norm": 1.4708553552627563, + "learning_rate": 0.00019936523339641286, + "loss": 1.6196, + "step": 1870 + }, + { + "epoch": 0.07218479307025986, + "grad_norm": 1.0691862106323242, + "learning_rate": 0.0001993618270603907, + "loss": 1.4939, + "step": 1875 + }, + { + "epoch": 0.07237728585178056, + "grad_norm": 0.9476374387741089, + "learning_rate": 0.0001993584116383876, + "loss": 1.5043, + "step": 1880 + }, + { + "epoch": 0.07256977863330125, + "grad_norm": 1.37090003490448, + "learning_rate": 0.000199354987130716, + "loss": 1.4371, + "step": 1885 + }, + { + "epoch": 0.07276227141482194, + "grad_norm": 1.2001820802688599, + "learning_rate": 0.000199351553537689, + "loss": 1.3048, + "step": 1890 + }, + { + "epoch": 0.07295476419634264, + "grad_norm": 1.1123398542404175, + "learning_rate": 0.00019934811085962055, + "loss": 1.4398, + "step": 1895 + }, + { + "epoch": 0.07314725697786333, + "grad_norm": 1.638574242591858, + "learning_rate": 0.0001993446590968255, + "loss": 1.3563, + "step": 1900 + }, + { + "epoch": 0.07333974975938402, + "grad_norm": 1.9532630443572998, + "learning_rate": 0.00019934119824961948, + "loss": 1.3723, + "step": 1905 + }, + { + "epoch": 0.07353224254090472, + "grad_norm": 1.3247241973876953, + "learning_rate": 0.0001993377283183189, + "loss": 1.4474, + "step": 1910 + }, + { + "epoch": 0.0737247353224254, + "grad_norm": 1.203049659729004, + "learning_rate": 0.00019933424930324118, + "loss": 1.3347, + "step": 1915 + }, + { + "epoch": 0.0739172281039461, + "grad_norm": 1.8858312368392944, + "learning_rate": 0.00019933076120470436, + "loss": 1.4754, + "step": 1920 + }, + { + "epoch": 0.0741097208854668, + "grad_norm": 1.117814540863037, + "learning_rate": 0.00019932726402302744, + "loss": 1.4828, + "step": 1925 + }, + { + "epoch": 0.07430221366698748, + "grad_norm": 1.0317554473876953, + "learning_rate": 0.00019932375775853021, + "loss": 1.5034, + "step": 1930 + }, + { + "epoch": 0.07449470644850818, + "grad_norm": 2.315903902053833, + "learning_rate": 0.00019932024241153332, + "loss": 1.4311, + "step": 1935 + }, + { + "epoch": 0.07468719923002888, + "grad_norm": 1.5780115127563477, + "learning_rate": 0.00019931671798235817, + "loss": 1.3917, + "step": 1940 + }, + { + "epoch": 0.07487969201154956, + "grad_norm": 1.3360038995742798, + "learning_rate": 0.00019931318447132706, + "loss": 1.3634, + "step": 1945 + }, + { + "epoch": 0.07507218479307026, + "grad_norm": 2.275620937347412, + "learning_rate": 0.00019930964187876314, + "loss": 1.414, + "step": 1950 + }, + { + "epoch": 0.07526467757459095, + "grad_norm": 1.7956300973892212, + "learning_rate": 0.00019930609020499032, + "loss": 1.5117, + "step": 1955 + }, + { + "epoch": 0.07545717035611164, + "grad_norm": 1.6429657936096191, + "learning_rate": 0.0001993025294503334, + "loss": 1.4436, + "step": 1960 + }, + { + "epoch": 0.07564966313763234, + "grad_norm": 1.432246446609497, + "learning_rate": 0.000199298959615118, + "loss": 1.3952, + "step": 1965 + }, + { + "epoch": 0.07584215591915303, + "grad_norm": 1.0579869747161865, + "learning_rate": 0.00019929538069967051, + "loss": 1.4369, + "step": 1970 + }, + { + "epoch": 0.07603464870067372, + "grad_norm": 1.766543984413147, + "learning_rate": 0.00019929179270431824, + "loss": 1.5033, + "step": 1975 + }, + { + "epoch": 0.07622714148219442, + "grad_norm": 1.0774848461151123, + "learning_rate": 0.00019928819562938928, + "loss": 1.3399, + "step": 1980 + }, + { + "epoch": 0.0764196342637151, + "grad_norm": 1.0951963663101196, + "learning_rate": 0.00019928458947521252, + "loss": 1.3656, + "step": 1985 + }, + { + "epoch": 0.0766121270452358, + "grad_norm": 1.278283953666687, + "learning_rate": 0.0001992809742421178, + "loss": 1.3467, + "step": 1990 + }, + { + "epoch": 0.0768046198267565, + "grad_norm": 1.139508605003357, + "learning_rate": 0.00019927734993043566, + "loss": 1.4316, + "step": 1995 + }, + { + "epoch": 0.07699711260827719, + "grad_norm": 1.39482581615448, + "learning_rate": 0.00019927371654049748, + "loss": 1.2032, + "step": 2000 + }, + { + "epoch": 0.07718960538979788, + "grad_norm": 0.9154567718505859, + "learning_rate": 0.0001992700740726356, + "loss": 1.5053, + "step": 2005 + }, + { + "epoch": 0.07738209817131858, + "grad_norm": 1.5105671882629395, + "learning_rate": 0.00019926642252718303, + "loss": 1.5059, + "step": 2010 + }, + { + "epoch": 0.07757459095283926, + "grad_norm": 1.4019540548324585, + "learning_rate": 0.00019926276190447367, + "loss": 1.4051, + "step": 2015 + }, + { + "epoch": 0.07776708373435996, + "grad_norm": 1.619841456413269, + "learning_rate": 0.00019925909220484234, + "loss": 1.1784, + "step": 2020 + }, + { + "epoch": 0.07795957651588066, + "grad_norm": 1.6128195524215698, + "learning_rate": 0.0001992554134286245, + "loss": 1.4623, + "step": 2025 + }, + { + "epoch": 0.07815206929740134, + "grad_norm": 1.2766104936599731, + "learning_rate": 0.00019925172557615665, + "loss": 1.3162, + "step": 2030 + }, + { + "epoch": 0.07834456207892204, + "grad_norm": 1.2187426090240479, + "learning_rate": 0.00019924802864777598, + "loss": 1.2874, + "step": 2035 + }, + { + "epoch": 0.07853705486044274, + "grad_norm": 1.1050268411636353, + "learning_rate": 0.00019924432264382055, + "loss": 1.433, + "step": 2040 + }, + { + "epoch": 0.07872954764196342, + "grad_norm": 1.6128287315368652, + "learning_rate": 0.00019924060756462925, + "loss": 1.4698, + "step": 2045 + }, + { + "epoch": 0.07892204042348412, + "grad_norm": 1.6588749885559082, + "learning_rate": 0.00019923688341054176, + "loss": 1.4972, + "step": 2050 + }, + { + "epoch": 0.0791145332050048, + "grad_norm": 1.135289192199707, + "learning_rate": 0.0001992331501818987, + "loss": 1.3991, + "step": 2055 + }, + { + "epoch": 0.0793070259865255, + "grad_norm": 1.757759928703308, + "learning_rate": 0.00019922940787904137, + "loss": 1.3736, + "step": 2060 + }, + { + "epoch": 0.0794995187680462, + "grad_norm": 0.9943239092826843, + "learning_rate": 0.00019922565650231207, + "loss": 1.4476, + "step": 2065 + }, + { + "epoch": 0.07969201154956689, + "grad_norm": 0.9459586143493652, + "learning_rate": 0.00019922189605205379, + "loss": 1.3913, + "step": 2070 + }, + { + "epoch": 0.07988450433108758, + "grad_norm": 1.2325133085250854, + "learning_rate": 0.00019921812652861037, + "loss": 1.4658, + "step": 2075 + }, + { + "epoch": 0.08007699711260828, + "grad_norm": 1.2397321462631226, + "learning_rate": 0.00019921434793232658, + "loss": 1.2552, + "step": 2080 + }, + { + "epoch": 0.08026948989412896, + "grad_norm": 0.9636020660400391, + "learning_rate": 0.0001992105602635479, + "loss": 1.3296, + "step": 2085 + }, + { + "epoch": 0.08046198267564966, + "grad_norm": 0.900841474533081, + "learning_rate": 0.00019920676352262067, + "loss": 1.2329, + "step": 2090 + }, + { + "epoch": 0.08065447545717036, + "grad_norm": 1.0425807237625122, + "learning_rate": 0.00019920295770989213, + "loss": 1.1604, + "step": 2095 + }, + { + "epoch": 0.08084696823869104, + "grad_norm": 1.1449722051620483, + "learning_rate": 0.00019919914282571024, + "loss": 1.3233, + "step": 2100 + }, + { + "epoch": 0.08103946102021174, + "grad_norm": 1.2076728343963623, + "learning_rate": 0.00019919531887042387, + "loss": 1.3449, + "step": 2105 + }, + { + "epoch": 0.08123195380173244, + "grad_norm": 0.968323826789856, + "learning_rate": 0.00019919148584438272, + "loss": 1.4273, + "step": 2110 + }, + { + "epoch": 0.08142444658325312, + "grad_norm": 1.7322039604187012, + "learning_rate": 0.00019918764374793726, + "loss": 1.4994, + "step": 2115 + }, + { + "epoch": 0.08161693936477382, + "grad_norm": 1.4216794967651367, + "learning_rate": 0.00019918379258143884, + "loss": 1.4071, + "step": 2120 + }, + { + "epoch": 0.08180943214629452, + "grad_norm": 1.2262970209121704, + "learning_rate": 0.00019917993234523963, + "loss": 1.3528, + "step": 2125 + }, + { + "epoch": 0.0820019249278152, + "grad_norm": 1.3137859106063843, + "learning_rate": 0.0001991760630396926, + "loss": 1.4367, + "step": 2130 + }, + { + "epoch": 0.0821944177093359, + "grad_norm": 1.364478588104248, + "learning_rate": 0.00019917218466515156, + "loss": 1.6896, + "step": 2135 + }, + { + "epoch": 0.0823869104908566, + "grad_norm": 1.2037614583969116, + "learning_rate": 0.00019916829722197124, + "loss": 1.5371, + "step": 2140 + }, + { + "epoch": 0.08257940327237728, + "grad_norm": 1.7590453624725342, + "learning_rate": 0.00019916440071050706, + "loss": 1.6331, + "step": 2145 + }, + { + "epoch": 0.08277189605389798, + "grad_norm": 1.6112565994262695, + "learning_rate": 0.00019916049513111532, + "loss": 1.5066, + "step": 2150 + }, + { + "epoch": 0.08296438883541868, + "grad_norm": 0.937174916267395, + "learning_rate": 0.00019915658048415318, + "loss": 1.4698, + "step": 2155 + }, + { + "epoch": 0.08315688161693936, + "grad_norm": 1.8568309545516968, + "learning_rate": 0.00019915265676997862, + "loss": 1.3197, + "step": 2160 + }, + { + "epoch": 0.08334937439846006, + "grad_norm": 1.9865350723266602, + "learning_rate": 0.00019914872398895043, + "loss": 1.4883, + "step": 2165 + }, + { + "epoch": 0.08354186717998074, + "grad_norm": 1.0227729082107544, + "learning_rate": 0.0001991447821414282, + "loss": 1.3967, + "step": 2170 + }, + { + "epoch": 0.08373435996150144, + "grad_norm": 1.3028923273086548, + "learning_rate": 0.00019914083122777245, + "loss": 1.4296, + "step": 2175 + }, + { + "epoch": 0.08392685274302214, + "grad_norm": 1.6131690740585327, + "learning_rate": 0.00019913687124834442, + "loss": 1.2983, + "step": 2180 + }, + { + "epoch": 0.08411934552454282, + "grad_norm": 1.1791858673095703, + "learning_rate": 0.00019913290220350622, + "loss": 1.4632, + "step": 2185 + }, + { + "epoch": 0.08431183830606352, + "grad_norm": 1.8457857370376587, + "learning_rate": 0.00019912892409362085, + "loss": 1.3623, + "step": 2190 + }, + { + "epoch": 0.08450433108758422, + "grad_norm": 1.525680422782898, + "learning_rate": 0.00019912493691905198, + "loss": 1.2729, + "step": 2195 + }, + { + "epoch": 0.0846968238691049, + "grad_norm": 1.3267451524734497, + "learning_rate": 0.0001991209406801643, + "loss": 1.3808, + "step": 2200 + }, + { + "epoch": 0.0848893166506256, + "grad_norm": 1.37312912940979, + "learning_rate": 0.00019911693537732323, + "loss": 1.6072, + "step": 2205 + }, + { + "epoch": 0.0850818094321463, + "grad_norm": 1.3433706760406494, + "learning_rate": 0.000199112921010895, + "loss": 1.4956, + "step": 2210 + }, + { + "epoch": 0.08527430221366698, + "grad_norm": 1.220732569694519, + "learning_rate": 0.00019910889758124672, + "loss": 1.4875, + "step": 2215 + }, + { + "epoch": 0.08546679499518768, + "grad_norm": 0.9385544657707214, + "learning_rate": 0.00019910486508874627, + "loss": 1.4202, + "step": 2220 + }, + { + "epoch": 0.08565928777670838, + "grad_norm": 0.8727134466171265, + "learning_rate": 0.0001991008235337624, + "loss": 1.2268, + "step": 2225 + }, + { + "epoch": 0.08585178055822906, + "grad_norm": 2.276063919067383, + "learning_rate": 0.00019909677291666473, + "loss": 1.3911, + "step": 2230 + }, + { + "epoch": 0.08604427333974976, + "grad_norm": 1.2023353576660156, + "learning_rate": 0.00019909271323782364, + "loss": 1.4754, + "step": 2235 + }, + { + "epoch": 0.08623676612127046, + "grad_norm": 0.9018556475639343, + "learning_rate": 0.00019908864449761033, + "loss": 1.4073, + "step": 2240 + }, + { + "epoch": 0.08642925890279114, + "grad_norm": 1.2011221647262573, + "learning_rate": 0.00019908456669639687, + "loss": 1.3213, + "step": 2245 + }, + { + "epoch": 0.08662175168431184, + "grad_norm": 1.9858746528625488, + "learning_rate": 0.0001990804798345562, + "loss": 1.3403, + "step": 2250 + }, + { + "epoch": 0.08681424446583254, + "grad_norm": 1.0072557926177979, + "learning_rate": 0.000199076383912462, + "loss": 1.3387, + "step": 2255 + }, + { + "epoch": 0.08700673724735322, + "grad_norm": 1.4516913890838623, + "learning_rate": 0.00019907227893048877, + "loss": 1.3755, + "step": 2260 + }, + { + "epoch": 0.08719923002887392, + "grad_norm": 1.0636364221572876, + "learning_rate": 0.00019906816488901195, + "loss": 1.2495, + "step": 2265 + }, + { + "epoch": 0.0873917228103946, + "grad_norm": 1.8495078086853027, + "learning_rate": 0.0001990640417884077, + "loss": 1.4166, + "step": 2270 + }, + { + "epoch": 0.0875842155919153, + "grad_norm": 2.327951431274414, + "learning_rate": 0.00019905990962905312, + "loss": 1.3934, + "step": 2275 + }, + { + "epoch": 0.087776708373436, + "grad_norm": 1.5719425678253174, + "learning_rate": 0.00019905576841132595, + "loss": 1.3932, + "step": 2280 + }, + { + "epoch": 0.08796920115495668, + "grad_norm": 1.5799787044525146, + "learning_rate": 0.000199051618135605, + "loss": 1.5148, + "step": 2285 + }, + { + "epoch": 0.08816169393647738, + "grad_norm": 0.7972100377082825, + "learning_rate": 0.00019904745880226966, + "loss": 1.2456, + "step": 2290 + }, + { + "epoch": 0.08835418671799808, + "grad_norm": 1.4252464771270752, + "learning_rate": 0.00019904329041170042, + "loss": 1.4287, + "step": 2295 + }, + { + "epoch": 0.08854667949951876, + "grad_norm": 1.5532910823822021, + "learning_rate": 0.00019903911296427834, + "loss": 1.3685, + "step": 2300 + }, + { + "epoch": 0.08873917228103946, + "grad_norm": 1.3019160032272339, + "learning_rate": 0.00019903492646038544, + "loss": 1.3928, + "step": 2305 + }, + { + "epoch": 0.08893166506256016, + "grad_norm": 1.7292853593826294, + "learning_rate": 0.00019903073090040457, + "loss": 1.369, + "step": 2310 + }, + { + "epoch": 0.08912415784408084, + "grad_norm": 1.1780908107757568, + "learning_rate": 0.00019902652628471938, + "loss": 1.2541, + "step": 2315 + }, + { + "epoch": 0.08931665062560154, + "grad_norm": 1.353721261024475, + "learning_rate": 0.00019902231261371433, + "loss": 1.2658, + "step": 2320 + }, + { + "epoch": 0.08950914340712224, + "grad_norm": 1.0020657777786255, + "learning_rate": 0.0001990180898877748, + "loss": 1.3319, + "step": 2325 + }, + { + "epoch": 0.08970163618864292, + "grad_norm": 1.1655325889587402, + "learning_rate": 0.00019901385810728686, + "loss": 1.3783, + "step": 2330 + }, + { + "epoch": 0.08989412897016362, + "grad_norm": 1.2237039804458618, + "learning_rate": 0.00019900961727263748, + "loss": 1.2919, + "step": 2335 + }, + { + "epoch": 0.09008662175168432, + "grad_norm": 1.6417179107666016, + "learning_rate": 0.0001990053673842145, + "loss": 1.471, + "step": 2340 + }, + { + "epoch": 0.090279114533205, + "grad_norm": 1.2170498371124268, + "learning_rate": 0.00019900110844240653, + "loss": 1.3889, + "step": 2345 + }, + { + "epoch": 0.0904716073147257, + "grad_norm": 1.1462334394454956, + "learning_rate": 0.00019899684044760304, + "loss": 1.4191, + "step": 2350 + }, + { + "epoch": 0.0906641000962464, + "grad_norm": 0.961063802242279, + "learning_rate": 0.00019899256340019425, + "loss": 1.5019, + "step": 2355 + }, + { + "epoch": 0.09085659287776708, + "grad_norm": 0.9323278069496155, + "learning_rate": 0.0001989882773005713, + "loss": 1.3988, + "step": 2360 + }, + { + "epoch": 0.09104908565928778, + "grad_norm": 1.8326833248138428, + "learning_rate": 0.00019898398214912612, + "loss": 1.4211, + "step": 2365 + }, + { + "epoch": 0.09124157844080846, + "grad_norm": 1.2725722789764404, + "learning_rate": 0.00019897967794625153, + "loss": 1.3274, + "step": 2370 + }, + { + "epoch": 0.09143407122232916, + "grad_norm": 0.9105005860328674, + "learning_rate": 0.00019897536469234102, + "loss": 1.3309, + "step": 2375 + }, + { + "epoch": 0.09162656400384986, + "grad_norm": 1.3157737255096436, + "learning_rate": 0.00019897104238778907, + "loss": 1.4086, + "step": 2380 + }, + { + "epoch": 0.09181905678537054, + "grad_norm": 1.9295995235443115, + "learning_rate": 0.00019896671103299094, + "loss": 1.3849, + "step": 2385 + }, + { + "epoch": 0.09201154956689124, + "grad_norm": 1.0183601379394531, + "learning_rate": 0.00019896237062834267, + "loss": 1.4397, + "step": 2390 + }, + { + "epoch": 0.09220404234841194, + "grad_norm": 1.118998646736145, + "learning_rate": 0.00019895802117424118, + "loss": 1.568, + "step": 2395 + }, + { + "epoch": 0.09239653512993262, + "grad_norm": 1.6463871002197266, + "learning_rate": 0.00019895366267108416, + "loss": 1.2755, + "step": 2400 + }, + { + "epoch": 0.09258902791145332, + "grad_norm": 1.3326902389526367, + "learning_rate": 0.00019894929511927022, + "loss": 1.4369, + "step": 2405 + }, + { + "epoch": 0.09278152069297402, + "grad_norm": 1.4168566465377808, + "learning_rate": 0.00019894491851919871, + "loss": 1.4323, + "step": 2410 + }, + { + "epoch": 0.0929740134744947, + "grad_norm": 1.3266388177871704, + "learning_rate": 0.00019894053287126986, + "loss": 1.17, + "step": 2415 + }, + { + "epoch": 0.0931665062560154, + "grad_norm": 1.7362377643585205, + "learning_rate": 0.0001989361381758847, + "loss": 1.5996, + "step": 2420 + }, + { + "epoch": 0.0933589990375361, + "grad_norm": 1.1684424877166748, + "learning_rate": 0.00019893173443344511, + "loss": 1.3486, + "step": 2425 + }, + { + "epoch": 0.09355149181905678, + "grad_norm": 1.3784310817718506, + "learning_rate": 0.00019892732164435376, + "loss": 1.2775, + "step": 2430 + }, + { + "epoch": 0.09374398460057748, + "grad_norm": 1.1288561820983887, + "learning_rate": 0.00019892289980901414, + "loss": 1.2044, + "step": 2435 + }, + { + "epoch": 0.09393647738209818, + "grad_norm": 1.1601535081863403, + "learning_rate": 0.00019891846892783067, + "loss": 1.4937, + "step": 2440 + }, + { + "epoch": 0.09412897016361886, + "grad_norm": 1.3866316080093384, + "learning_rate": 0.0001989140290012085, + "loss": 1.913, + "step": 2445 + }, + { + "epoch": 0.09432146294513956, + "grad_norm": 1.4638808965682983, + "learning_rate": 0.00019890958002955362, + "loss": 1.4114, + "step": 2450 + }, + { + "epoch": 0.09451395572666026, + "grad_norm": 1.4660701751708984, + "learning_rate": 0.00019890512201327284, + "loss": 1.3607, + "step": 2455 + }, + { + "epoch": 0.09470644850818094, + "grad_norm": 0.9787619113922119, + "learning_rate": 0.00019890065495277388, + "loss": 1.3729, + "step": 2460 + }, + { + "epoch": 0.09489894128970164, + "grad_norm": 1.4845494031906128, + "learning_rate": 0.00019889617884846517, + "loss": 1.3326, + "step": 2465 + }, + { + "epoch": 0.09509143407122234, + "grad_norm": 1.2955145835876465, + "learning_rate": 0.000198891693700756, + "loss": 1.3738, + "step": 2470 + }, + { + "epoch": 0.09528392685274302, + "grad_norm": 1.7431209087371826, + "learning_rate": 0.00019888719951005656, + "loss": 1.3676, + "step": 2475 + }, + { + "epoch": 0.09547641963426372, + "grad_norm": 0.923613965511322, + "learning_rate": 0.00019888269627677777, + "loss": 1.4142, + "step": 2480 + }, + { + "epoch": 0.0956689124157844, + "grad_norm": 1.0258625745773315, + "learning_rate": 0.0001988781840013315, + "loss": 1.3868, + "step": 2485 + }, + { + "epoch": 0.0958614051973051, + "grad_norm": 1.1365761756896973, + "learning_rate": 0.00019887366268413025, + "loss": 1.2871, + "step": 2490 + }, + { + "epoch": 0.0960538979788258, + "grad_norm": 2.3250112533569336, + "learning_rate": 0.00019886913232558754, + "loss": 1.4345, + "step": 2495 + }, + { + "epoch": 0.09624639076034648, + "grad_norm": 1.1625771522521973, + "learning_rate": 0.00019886459292611767, + "loss": 1.5796, + "step": 2500 + }, + { + "epoch": 0.09643888354186718, + "grad_norm": 1.7454233169555664, + "learning_rate": 0.00019886004448613562, + "loss": 1.6151, + "step": 2505 + }, + { + "epoch": 0.09663137632338788, + "grad_norm": 1.3514907360076904, + "learning_rate": 0.00019885548700605745, + "loss": 1.4529, + "step": 2510 + }, + { + "epoch": 0.09682386910490856, + "grad_norm": 1.9735958576202393, + "learning_rate": 0.00019885092048629982, + "loss": 1.4945, + "step": 2515 + }, + { + "epoch": 0.09701636188642926, + "grad_norm": 1.190207600593567, + "learning_rate": 0.00019884634492728037, + "loss": 1.473, + "step": 2520 + }, + { + "epoch": 0.09720885466794996, + "grad_norm": 1.1596134901046753, + "learning_rate": 0.00019884176032941743, + "loss": 1.3745, + "step": 2525 + }, + { + "epoch": 0.09740134744947064, + "grad_norm": 1.0496324300765991, + "learning_rate": 0.0001988371666931303, + "loss": 1.3853, + "step": 2530 + }, + { + "epoch": 0.09759384023099134, + "grad_norm": 1.2820552587509155, + "learning_rate": 0.000198832564018839, + "loss": 1.4205, + "step": 2535 + }, + { + "epoch": 0.09778633301251204, + "grad_norm": 0.9559310674667358, + "learning_rate": 0.00019882795230696446, + "loss": 1.2517, + "step": 2540 + }, + { + "epoch": 0.09797882579403272, + "grad_norm": 1.026782751083374, + "learning_rate": 0.00019882333155792835, + "loss": 1.335, + "step": 2545 + }, + { + "epoch": 0.09817131857555342, + "grad_norm": 1.3378793001174927, + "learning_rate": 0.00019881870177215319, + "loss": 1.3419, + "step": 2550 + }, + { + "epoch": 0.09836381135707412, + "grad_norm": 1.0646761655807495, + "learning_rate": 0.00019881406295006238, + "loss": 1.3793, + "step": 2555 + }, + { + "epoch": 0.0985563041385948, + "grad_norm": 1.3302899599075317, + "learning_rate": 0.00019880941509208005, + "loss": 1.3056, + "step": 2560 + }, + { + "epoch": 0.0987487969201155, + "grad_norm": 1.3029305934906006, + "learning_rate": 0.00019880475819863134, + "loss": 1.3028, + "step": 2565 + }, + { + "epoch": 0.0989412897016362, + "grad_norm": 1.6653764247894287, + "learning_rate": 0.00019880009227014197, + "loss": 1.4698, + "step": 2570 + }, + { + "epoch": 0.09913378248315688, + "grad_norm": 1.5575610399246216, + "learning_rate": 0.00019879541730703865, + "loss": 1.2843, + "step": 2575 + }, + { + "epoch": 0.09932627526467758, + "grad_norm": 1.1219451427459717, + "learning_rate": 0.0001987907333097489, + "loss": 1.2824, + "step": 2580 + }, + { + "epoch": 0.09951876804619826, + "grad_norm": 1.680050253868103, + "learning_rate": 0.000198786040278701, + "loss": 1.431, + "step": 2585 + }, + { + "epoch": 0.09971126082771896, + "grad_norm": 2.5341451168060303, + "learning_rate": 0.00019878133821432412, + "loss": 1.3925, + "step": 2590 + }, + { + "epoch": 0.09990375360923966, + "grad_norm": 1.132542610168457, + "learning_rate": 0.00019877662711704824, + "loss": 1.4082, + "step": 2595 + }, + { + "epoch": 0.10009624639076034, + "grad_norm": 1.0605584383010864, + "learning_rate": 0.0001987719069873041, + "loss": 1.2904, + "step": 2600 + }, + { + "epoch": 0.10028873917228104, + "grad_norm": 1.161116361618042, + "learning_rate": 0.0001987671778255234, + "loss": 1.2922, + "step": 2605 + }, + { + "epoch": 0.10048123195380174, + "grad_norm": 2.2763168811798096, + "learning_rate": 0.0001987624396321386, + "loss": 1.4692, + "step": 2610 + }, + { + "epoch": 0.10067372473532242, + "grad_norm": 1.547316312789917, + "learning_rate": 0.00019875769240758286, + "loss": 1.458, + "step": 2615 + }, + { + "epoch": 0.10086621751684312, + "grad_norm": 1.0679529905319214, + "learning_rate": 0.0001987529361522904, + "loss": 1.3075, + "step": 2620 + }, + { + "epoch": 0.10105871029836382, + "grad_norm": 1.9426227807998657, + "learning_rate": 0.0001987481708666961, + "loss": 1.4985, + "step": 2625 + }, + { + "epoch": 0.1012512030798845, + "grad_norm": 1.1619765758514404, + "learning_rate": 0.00019874339655123575, + "loss": 1.329, + "step": 2630 + }, + { + "epoch": 0.1014436958614052, + "grad_norm": 0.8115332722663879, + "learning_rate": 0.00019873861320634587, + "loss": 1.218, + "step": 2635 + }, + { + "epoch": 0.1016361886429259, + "grad_norm": 1.2575538158416748, + "learning_rate": 0.0001987338208324639, + "loss": 1.3133, + "step": 2640 + }, + { + "epoch": 0.10182868142444658, + "grad_norm": 0.9605635404586792, + "learning_rate": 0.00019872901943002806, + "loss": 1.4462, + "step": 2645 + }, + { + "epoch": 0.10202117420596728, + "grad_norm": 1.7909116744995117, + "learning_rate": 0.00019872420899947742, + "loss": 1.257, + "step": 2650 + }, + { + "epoch": 0.10221366698748797, + "grad_norm": 1.5501129627227783, + "learning_rate": 0.00019871938954125185, + "loss": 1.2825, + "step": 2655 + }, + { + "epoch": 0.10240615976900866, + "grad_norm": 1.4636069536209106, + "learning_rate": 0.00019871456105579208, + "loss": 1.3909, + "step": 2660 + }, + { + "epoch": 0.10259865255052936, + "grad_norm": 1.4283297061920166, + "learning_rate": 0.0001987097235435396, + "loss": 1.2148, + "step": 2665 + }, + { + "epoch": 0.10279114533205005, + "grad_norm": 1.316149115562439, + "learning_rate": 0.00019870487700493684, + "loss": 1.393, + "step": 2670 + }, + { + "epoch": 0.10298363811357074, + "grad_norm": 0.8449459671974182, + "learning_rate": 0.00019870002144042689, + "loss": 1.4969, + "step": 2675 + }, + { + "epoch": 0.10317613089509144, + "grad_norm": 1.3309835195541382, + "learning_rate": 0.00019869515685045383, + "loss": 1.4927, + "step": 2680 + }, + { + "epoch": 0.10336862367661212, + "grad_norm": 0.9159907102584839, + "learning_rate": 0.00019869028323546246, + "loss": 1.3526, + "step": 2685 + }, + { + "epoch": 0.10356111645813282, + "grad_norm": 2.2842464447021484, + "learning_rate": 0.00019868540059589845, + "loss": 1.3646, + "step": 2690 + }, + { + "epoch": 0.10375360923965352, + "grad_norm": 0.9444146156311035, + "learning_rate": 0.00019868050893220832, + "loss": 1.349, + "step": 2695 + }, + { + "epoch": 0.1039461020211742, + "grad_norm": 1.8546898365020752, + "learning_rate": 0.0001986756082448393, + "loss": 1.3195, + "step": 2700 + }, + { + "epoch": 0.1041385948026949, + "grad_norm": 1.310783863067627, + "learning_rate": 0.00019867069853423961, + "loss": 1.6065, + "step": 2705 + }, + { + "epoch": 0.1043310875842156, + "grad_norm": 1.248542308807373, + "learning_rate": 0.00019866577980085813, + "loss": 1.1987, + "step": 2710 + }, + { + "epoch": 0.10452358036573628, + "grad_norm": 1.421844482421875, + "learning_rate": 0.00019866085204514472, + "loss": 1.3576, + "step": 2715 + }, + { + "epoch": 0.10471607314725698, + "grad_norm": 1.1641993522644043, + "learning_rate": 0.00019865591526754996, + "loss": 1.436, + "step": 2720 + }, + { + "epoch": 0.10490856592877768, + "grad_norm": 1.1122993230819702, + "learning_rate": 0.0001986509694685253, + "loss": 1.4218, + "step": 2725 + }, + { + "epoch": 0.10510105871029836, + "grad_norm": 1.222016453742981, + "learning_rate": 0.00019864601464852295, + "loss": 1.2965, + "step": 2730 + }, + { + "epoch": 0.10529355149181906, + "grad_norm": 1.6765378713607788, + "learning_rate": 0.00019864105080799602, + "loss": 1.3908, + "step": 2735 + }, + { + "epoch": 0.10548604427333975, + "grad_norm": 1.8405592441558838, + "learning_rate": 0.00019863607794739845, + "loss": 1.2583, + "step": 2740 + }, + { + "epoch": 0.10567853705486044, + "grad_norm": 1.3908604383468628, + "learning_rate": 0.00019863109606718497, + "loss": 1.2726, + "step": 2745 + }, + { + "epoch": 0.10587102983638114, + "grad_norm": 1.3825894594192505, + "learning_rate": 0.0001986261051678111, + "loss": 1.3234, + "step": 2750 + }, + { + "epoch": 0.10606352261790183, + "grad_norm": 1.5409029722213745, + "learning_rate": 0.00019862110524973328, + "loss": 1.4151, + "step": 2755 + }, + { + "epoch": 0.10625601539942252, + "grad_norm": 2.1902191638946533, + "learning_rate": 0.00019861609631340868, + "loss": 1.3865, + "step": 2760 + }, + { + "epoch": 0.10644850818094322, + "grad_norm": 0.9851712584495544, + "learning_rate": 0.00019861107835929533, + "loss": 1.4799, + "step": 2765 + }, + { + "epoch": 0.10664100096246391, + "grad_norm": 1.2206732034683228, + "learning_rate": 0.0001986060513878521, + "loss": 1.3456, + "step": 2770 + }, + { + "epoch": 0.1068334937439846, + "grad_norm": 1.3443645238876343, + "learning_rate": 0.0001986010153995387, + "loss": 1.2586, + "step": 2775 + }, + { + "epoch": 0.1070259865255053, + "grad_norm": 1.1602864265441895, + "learning_rate": 0.00019859597039481561, + "loss": 1.1789, + "step": 2780 + }, + { + "epoch": 0.107218479307026, + "grad_norm": 0.8068190813064575, + "learning_rate": 0.00019859091637414414, + "loss": 1.4228, + "step": 2785 + }, + { + "epoch": 0.10741097208854668, + "grad_norm": 1.4439321756362915, + "learning_rate": 0.0001985858533379865, + "loss": 1.4365, + "step": 2790 + }, + { + "epoch": 0.10760346487006738, + "grad_norm": 1.0814299583435059, + "learning_rate": 0.00019858078128680564, + "loss": 1.2755, + "step": 2795 + }, + { + "epoch": 0.10779595765158806, + "grad_norm": 1.7848068475723267, + "learning_rate": 0.00019857570022106536, + "loss": 1.4061, + "step": 2800 + }, + { + "epoch": 0.10798845043310876, + "grad_norm": 1.3163549900054932, + "learning_rate": 0.0001985706101412303, + "loss": 1.3599, + "step": 2805 + }, + { + "epoch": 0.10818094321462945, + "grad_norm": 1.439104437828064, + "learning_rate": 0.0001985655110477659, + "loss": 1.3054, + "step": 2810 + }, + { + "epoch": 0.10837343599615014, + "grad_norm": 0.892706036567688, + "learning_rate": 0.0001985604029411385, + "loss": 1.3504, + "step": 2815 + }, + { + "epoch": 0.10856592877767084, + "grad_norm": 1.102704405784607, + "learning_rate": 0.0001985552858218151, + "loss": 1.3902, + "step": 2820 + }, + { + "epoch": 0.10875842155919153, + "grad_norm": 1.21804678440094, + "learning_rate": 0.0001985501596902637, + "loss": 1.36, + "step": 2825 + }, + { + "epoch": 0.10895091434071222, + "grad_norm": 1.6015477180480957, + "learning_rate": 0.00019854502454695302, + "loss": 1.6163, + "step": 2830 + }, + { + "epoch": 0.10914340712223292, + "grad_norm": 1.3947224617004395, + "learning_rate": 0.00019853988039235265, + "loss": 1.2207, + "step": 2835 + }, + { + "epoch": 0.10933589990375361, + "grad_norm": 1.616458535194397, + "learning_rate": 0.00019853472722693302, + "loss": 1.2081, + "step": 2840 + }, + { + "epoch": 0.1095283926852743, + "grad_norm": 2.1588330268859863, + "learning_rate": 0.00019852956505116528, + "loss": 1.4428, + "step": 2845 + }, + { + "epoch": 0.109720885466795, + "grad_norm": 1.2287509441375732, + "learning_rate": 0.00019852439386552152, + "loss": 1.4548, + "step": 2850 + }, + { + "epoch": 0.1099133782483157, + "grad_norm": 1.7198657989501953, + "learning_rate": 0.00019851921367047463, + "loss": 1.2034, + "step": 2855 + }, + { + "epoch": 0.11010587102983638, + "grad_norm": 1.4924067258834839, + "learning_rate": 0.00019851402446649825, + "loss": 1.3635, + "step": 2860 + }, + { + "epoch": 0.11029836381135708, + "grad_norm": 1.3675332069396973, + "learning_rate": 0.00019850882625406695, + "loss": 1.29, + "step": 2865 + }, + { + "epoch": 0.11049085659287777, + "grad_norm": 1.2170599699020386, + "learning_rate": 0.00019850361903365603, + "loss": 1.3495, + "step": 2870 + }, + { + "epoch": 0.11068334937439846, + "grad_norm": 1.6067026853561401, + "learning_rate": 0.00019849840280574167, + "loss": 1.4679, + "step": 2875 + }, + { + "epoch": 0.11087584215591915, + "grad_norm": 1.0457261800765991, + "learning_rate": 0.00019849317757080092, + "loss": 1.3289, + "step": 2880 + }, + { + "epoch": 0.11106833493743985, + "grad_norm": 0.6958736181259155, + "learning_rate": 0.00019848794332931146, + "loss": 0.9412, + "step": 2885 + }, + { + "epoch": 0.11126082771896054, + "grad_norm": 0.9687005281448364, + "learning_rate": 0.00019848270008175205, + "loss": 1.2777, + "step": 2890 + }, + { + "epoch": 0.11145332050048123, + "grad_norm": 0.8073298931121826, + "learning_rate": 0.00019847744782860213, + "loss": 1.4295, + "step": 2895 + }, + { + "epoch": 0.11164581328200192, + "grad_norm": 0.8794350624084473, + "learning_rate": 0.00019847218657034193, + "loss": 1.2199, + "step": 2900 + }, + { + "epoch": 0.11183830606352262, + "grad_norm": 1.644554853439331, + "learning_rate": 0.00019846691630745258, + "loss": 1.3076, + "step": 2905 + }, + { + "epoch": 0.11203079884504331, + "grad_norm": 1.0819231271743774, + "learning_rate": 0.00019846163704041603, + "loss": 1.385, + "step": 2910 + }, + { + "epoch": 0.112223291626564, + "grad_norm": 1.4424269199371338, + "learning_rate": 0.000198456348769715, + "loss": 1.4287, + "step": 2915 + }, + { + "epoch": 0.1124157844080847, + "grad_norm": 1.289413332939148, + "learning_rate": 0.00019845105149583308, + "loss": 1.25, + "step": 2920 + }, + { + "epoch": 0.1126082771896054, + "grad_norm": 1.4669229984283447, + "learning_rate": 0.00019844574521925474, + "loss": 1.5371, + "step": 2925 + }, + { + "epoch": 0.11280076997112608, + "grad_norm": 2.102736473083496, + "learning_rate": 0.0001984404299404651, + "loss": 1.5017, + "step": 2930 + }, + { + "epoch": 0.11299326275264678, + "grad_norm": 1.1487330198287964, + "learning_rate": 0.00019843510565995025, + "loss": 1.3164, + "step": 2935 + }, + { + "epoch": 0.11318575553416747, + "grad_norm": 1.259538173675537, + "learning_rate": 0.00019842977237819707, + "loss": 1.2946, + "step": 2940 + }, + { + "epoch": 0.11337824831568816, + "grad_norm": 2.3158466815948486, + "learning_rate": 0.00019842443009569324, + "loss": 1.4614, + "step": 2945 + }, + { + "epoch": 0.11357074109720885, + "grad_norm": 1.5077046155929565, + "learning_rate": 0.0001984190788129273, + "loss": 1.3478, + "step": 2950 + }, + { + "epoch": 0.11376323387872955, + "grad_norm": 1.2548809051513672, + "learning_rate": 0.00019841371853038852, + "loss": 1.3351, + "step": 2955 + }, + { + "epoch": 0.11395572666025024, + "grad_norm": 1.4622430801391602, + "learning_rate": 0.00019840834924856715, + "loss": 1.2788, + "step": 2960 + }, + { + "epoch": 0.11414821944177093, + "grad_norm": 0.9759154319763184, + "learning_rate": 0.00019840297096795415, + "loss": 1.2793, + "step": 2965 + }, + { + "epoch": 0.11434071222329163, + "grad_norm": 1.2217987775802612, + "learning_rate": 0.00019839758368904128, + "loss": 1.284, + "step": 2970 + }, + { + "epoch": 0.11453320500481232, + "grad_norm": 2.180697441101074, + "learning_rate": 0.00019839326738746614, + "loss": 1.4163, + "step": 2975 + }, + { + "epoch": 0.11472569778633301, + "grad_norm": 1.156293511390686, + "learning_rate": 0.00019838786391285554, + "loss": 1.3045, + "step": 2980 + }, + { + "epoch": 0.11491819056785371, + "grad_norm": 1.1444417238235474, + "learning_rate": 0.00019838245144132658, + "loss": 1.4522, + "step": 2985 + }, + { + "epoch": 0.1151106833493744, + "grad_norm": 1.3959949016571045, + "learning_rate": 0.00019837702997337414, + "loss": 1.3959, + "step": 2990 + }, + { + "epoch": 0.1153031761308951, + "grad_norm": 1.2789435386657715, + "learning_rate": 0.00019837159950949402, + "loss": 1.2951, + "step": 2995 + }, + { + "epoch": 0.11549566891241578, + "grad_norm": 1.0902299880981445, + "learning_rate": 0.00019836616005018275, + "loss": 1.4573, + "step": 3000 + }, + { + "epoch": 0.11568816169393648, + "grad_norm": 1.452920913696289, + "learning_rate": 0.0001983607115959378, + "loss": 1.4688, + "step": 3005 + }, + { + "epoch": 0.11588065447545717, + "grad_norm": 2.192514419555664, + "learning_rate": 0.0001983552541472573, + "loss": 1.4282, + "step": 3010 + }, + { + "epoch": 0.11607314725697786, + "grad_norm": 1.938883900642395, + "learning_rate": 0.0001983497877046404, + "loss": 1.6123, + "step": 3015 + }, + { + "epoch": 0.11626564003849855, + "grad_norm": 2.4365732669830322, + "learning_rate": 0.0001983443122685869, + "loss": 1.4987, + "step": 3020 + }, + { + "epoch": 0.11645813282001925, + "grad_norm": 1.827972173690796, + "learning_rate": 0.0001983388278395975, + "loss": 1.2196, + "step": 3025 + }, + { + "epoch": 0.11665062560153994, + "grad_norm": 1.6184618473052979, + "learning_rate": 0.00019833333441817374, + "loss": 1.5257, + "step": 3030 + }, + { + "epoch": 0.11684311838306063, + "grad_norm": 1.0191036462783813, + "learning_rate": 0.00019832783200481797, + "loss": 1.4799, + "step": 3035 + }, + { + "epoch": 0.11703561116458133, + "grad_norm": 1.1552925109863281, + "learning_rate": 0.0001983223206000333, + "loss": 1.2014, + "step": 3040 + }, + { + "epoch": 0.11722810394610202, + "grad_norm": 0.9793531894683838, + "learning_rate": 0.00019831680020432376, + "loss": 1.2092, + "step": 3045 + }, + { + "epoch": 0.11742059672762271, + "grad_norm": 1.480634331703186, + "learning_rate": 0.0001983112708181941, + "loss": 1.3238, + "step": 3050 + }, + { + "epoch": 0.11761308950914341, + "grad_norm": 1.5112073421478271, + "learning_rate": 0.00019830573244215, + "loss": 1.5513, + "step": 3055 + }, + { + "epoch": 0.1178055822906641, + "grad_norm": 1.4130852222442627, + "learning_rate": 0.00019830018507669786, + "loss": 1.4368, + "step": 3060 + }, + { + "epoch": 0.1179980750721848, + "grad_norm": 1.401934027671814, + "learning_rate": 0.000198294628722345, + "loss": 1.243, + "step": 3065 + }, + { + "epoch": 0.11819056785370549, + "grad_norm": 1.8309379816055298, + "learning_rate": 0.00019828906337959946, + "loss": 1.1656, + "step": 3070 + }, + { + "epoch": 0.11838306063522618, + "grad_norm": 0.8511875867843628, + "learning_rate": 0.0001982834890489702, + "loss": 1.406, + "step": 3075 + }, + { + "epoch": 0.11857555341674687, + "grad_norm": 1.4291598796844482, + "learning_rate": 0.00019827790573096694, + "loss": 1.3963, + "step": 3080 + }, + { + "epoch": 0.11876804619826757, + "grad_norm": 0.6835631132125854, + "learning_rate": 0.0001982723134261002, + "loss": 1.1238, + "step": 3085 + }, + { + "epoch": 0.11896053897978826, + "grad_norm": 1.6569236516952515, + "learning_rate": 0.00019826671213488145, + "loss": 1.3335, + "step": 3090 + }, + { + "epoch": 0.11915303176130895, + "grad_norm": 1.0488132238388062, + "learning_rate": 0.00019826110185782277, + "loss": 1.3009, + "step": 3095 + }, + { + "epoch": 0.11934552454282965, + "grad_norm": 1.3253639936447144, + "learning_rate": 0.00019825548259543726, + "loss": 1.3863, + "step": 3100 + }, + { + "epoch": 0.11953801732435033, + "grad_norm": 0.9408076405525208, + "learning_rate": 0.00019824985434823878, + "loss": 1.3184, + "step": 3105 + }, + { + "epoch": 0.11973051010587103, + "grad_norm": 0.9649772644042969, + "learning_rate": 0.00019824421711674194, + "loss": 1.2427, + "step": 3110 + }, + { + "epoch": 0.11992300288739172, + "grad_norm": 1.7673052549362183, + "learning_rate": 0.00019823857090146225, + "loss": 1.2804, + "step": 3115 + }, + { + "epoch": 0.12011549566891241, + "grad_norm": 1.230724811553955, + "learning_rate": 0.00019823291570291604, + "loss": 1.3527, + "step": 3120 + }, + { + "epoch": 0.12030798845043311, + "grad_norm": 2.382617473602295, + "learning_rate": 0.0001982272515216204, + "loss": 1.4123, + "step": 3125 + }, + { + "epoch": 0.1205004812319538, + "grad_norm": 1.2811720371246338, + "learning_rate": 0.00019822157835809332, + "loss": 1.3935, + "step": 3130 + }, + { + "epoch": 0.1206929740134745, + "grad_norm": 1.9592630863189697, + "learning_rate": 0.00019821589621285356, + "loss": 1.2387, + "step": 3135 + }, + { + "epoch": 0.12088546679499519, + "grad_norm": 1.659197449684143, + "learning_rate": 0.0001982102050864207, + "loss": 1.4228, + "step": 3140 + }, + { + "epoch": 0.12107795957651588, + "grad_norm": 1.2591451406478882, + "learning_rate": 0.00019820450497931517, + "loss": 1.3192, + "step": 3145 + }, + { + "epoch": 0.12127045235803657, + "grad_norm": 1.1670453548431396, + "learning_rate": 0.00019819879589205822, + "loss": 1.2593, + "step": 3150 + }, + { + "epoch": 0.12146294513955727, + "grad_norm": 1.680776834487915, + "learning_rate": 0.0001981930778251719, + "loss": 1.5809, + "step": 3155 + }, + { + "epoch": 0.12165543792107796, + "grad_norm": 1.388492226600647, + "learning_rate": 0.00019818735077917904, + "loss": 1.5646, + "step": 3160 + }, + { + "epoch": 0.12184793070259865, + "grad_norm": 1.3851470947265625, + "learning_rate": 0.00019818161475460342, + "loss": 1.3282, + "step": 3165 + }, + { + "epoch": 0.12204042348411935, + "grad_norm": 1.252103567123413, + "learning_rate": 0.0001981758697519695, + "loss": 1.3326, + "step": 3170 + }, + { + "epoch": 0.12223291626564003, + "grad_norm": 2.6637227535247803, + "learning_rate": 0.0001981701157718027, + "loss": 1.4247, + "step": 3175 + }, + { + "epoch": 0.12242540904716073, + "grad_norm": 1.4228829145431519, + "learning_rate": 0.00019816435281462907, + "loss": 1.3287, + "step": 3180 + }, + { + "epoch": 0.12261790182868143, + "grad_norm": 1.0654631853103638, + "learning_rate": 0.00019815858088097565, + "loss": 1.3651, + "step": 3185 + }, + { + "epoch": 0.12281039461020211, + "grad_norm": 1.1779879331588745, + "learning_rate": 0.00019815279997137028, + "loss": 1.2699, + "step": 3190 + }, + { + "epoch": 0.12300288739172281, + "grad_norm": 0.966482937335968, + "learning_rate": 0.0001981470100863416, + "loss": 1.3029, + "step": 3195 + }, + { + "epoch": 0.12319538017324351, + "grad_norm": 1.13119375705719, + "learning_rate": 0.00019814121122641894, + "loss": 1.3431, + "step": 3200 + }, + { + "epoch": 0.1233878729547642, + "grad_norm": 1.0690468549728394, + "learning_rate": 0.00019813540339213263, + "loss": 1.237, + "step": 3205 + }, + { + "epoch": 0.12358036573628489, + "grad_norm": 1.169592022895813, + "learning_rate": 0.00019812958658401382, + "loss": 1.3341, + "step": 3210 + }, + { + "epoch": 0.12377285851780558, + "grad_norm": 0.9310591816902161, + "learning_rate": 0.00019812376080259435, + "loss": 1.3168, + "step": 3215 + }, + { + "epoch": 0.12396535129932627, + "grad_norm": 1.1262513399124146, + "learning_rate": 0.00019811792604840694, + "loss": 1.322, + "step": 3220 + }, + { + "epoch": 0.12415784408084697, + "grad_norm": 1.0723376274108887, + "learning_rate": 0.00019811208232198518, + "loss": 1.2814, + "step": 3225 + }, + { + "epoch": 0.12435033686236766, + "grad_norm": 1.5084266662597656, + "learning_rate": 0.00019810622962386344, + "loss": 1.3136, + "step": 3230 + }, + { + "epoch": 0.12454282964388835, + "grad_norm": 1.5219266414642334, + "learning_rate": 0.0001981003679545769, + "loss": 1.2971, + "step": 3235 + }, + { + "epoch": 0.12473532242540905, + "grad_norm": 1.8135708570480347, + "learning_rate": 0.00019809449731466154, + "loss": 1.3987, + "step": 3240 + }, + { + "epoch": 0.12492781520692973, + "grad_norm": 1.9838290214538574, + "learning_rate": 0.00019808861770465424, + "loss": 1.4063, + "step": 3245 + }, + { + "epoch": 0.12512030798845045, + "grad_norm": 0.9821895956993103, + "learning_rate": 0.00019808272912509258, + "loss": 1.4336, + "step": 3250 + }, + { + "epoch": 0.12531280076997112, + "grad_norm": 1.0371532440185547, + "learning_rate": 0.00019807683157651513, + "loss": 1.4659, + "step": 3255 + }, + { + "epoch": 0.12550529355149181, + "grad_norm": 1.2441003322601318, + "learning_rate": 0.0001980709250594611, + "loss": 1.3807, + "step": 3260 + }, + { + "epoch": 0.1256977863330125, + "grad_norm": 1.6097456216812134, + "learning_rate": 0.00019806500957447067, + "loss": 1.4115, + "step": 3265 + }, + { + "epoch": 0.1258902791145332, + "grad_norm": 1.4005634784698486, + "learning_rate": 0.0001980590851220847, + "loss": 1.6008, + "step": 3270 + }, + { + "epoch": 0.1260827718960539, + "grad_norm": 1.1883544921875, + "learning_rate": 0.00019805315170284498, + "loss": 1.3768, + "step": 3275 + }, + { + "epoch": 0.12627526467757458, + "grad_norm": 1.2404242753982544, + "learning_rate": 0.00019804720931729413, + "loss": 1.463, + "step": 3280 + }, + { + "epoch": 0.12646775745909528, + "grad_norm": 0.625027596950531, + "learning_rate": 0.00019804125796597544, + "loss": 1.3286, + "step": 3285 + }, + { + "epoch": 0.12666025024061597, + "grad_norm": 1.5616633892059326, + "learning_rate": 0.0001980352976494332, + "loss": 1.4161, + "step": 3290 + }, + { + "epoch": 0.12685274302213667, + "grad_norm": 0.8003360629081726, + "learning_rate": 0.0001980293283682124, + "loss": 1.4117, + "step": 3295 + }, + { + "epoch": 0.12704523580365737, + "grad_norm": 1.0671011209487915, + "learning_rate": 0.0001980233501228589, + "loss": 1.4192, + "step": 3300 + }, + { + "epoch": 0.12723772858517807, + "grad_norm": 1.4135669469833374, + "learning_rate": 0.0001980173629139194, + "loss": 1.3046, + "step": 3305 + }, + { + "epoch": 0.12743022136669874, + "grad_norm": 1.0450470447540283, + "learning_rate": 0.00019801136674194134, + "loss": 1.4156, + "step": 3310 + }, + { + "epoch": 0.12762271414821943, + "grad_norm": 1.1435261964797974, + "learning_rate": 0.00019800536160747306, + "loss": 1.2311, + "step": 3315 + }, + { + "epoch": 0.12781520692974013, + "grad_norm": 1.5508229732513428, + "learning_rate": 0.0001979993475110637, + "loss": 1.4224, + "step": 3320 + }, + { + "epoch": 0.12800769971126083, + "grad_norm": 0.9542085528373718, + "learning_rate": 0.0001979933244532632, + "loss": 1.2423, + "step": 3325 + }, + { + "epoch": 0.12820019249278153, + "grad_norm": 1.5797593593597412, + "learning_rate": 0.0001979872924346223, + "loss": 1.3357, + "step": 3330 + }, + { + "epoch": 0.12839268527430223, + "grad_norm": 1.0982688665390015, + "learning_rate": 0.00019798125145569263, + "loss": 1.2404, + "step": 3335 + }, + { + "epoch": 0.1285851780558229, + "grad_norm": 1.5471248626708984, + "learning_rate": 0.0001979752015170266, + "loss": 1.3556, + "step": 3340 + }, + { + "epoch": 0.1287776708373436, + "grad_norm": 1.64442777633667, + "learning_rate": 0.0001979691426191774, + "loss": 1.3407, + "step": 3345 + }, + { + "epoch": 0.1289701636188643, + "grad_norm": 1.494186520576477, + "learning_rate": 0.0001979630747626991, + "loss": 1.4509, + "step": 3350 + }, + { + "epoch": 0.129162656400385, + "grad_norm": 0.9598186612129211, + "learning_rate": 0.00019795699794814654, + "loss": 1.3221, + "step": 3355 + }, + { + "epoch": 0.1293551491819057, + "grad_norm": 1.1328315734863281, + "learning_rate": 0.00019795091217607544, + "loss": 1.5129, + "step": 3360 + }, + { + "epoch": 0.12954764196342639, + "grad_norm": 1.0476043224334717, + "learning_rate": 0.00019794481744704227, + "loss": 1.3448, + "step": 3365 + }, + { + "epoch": 0.12974013474494706, + "grad_norm": 1.2570463418960571, + "learning_rate": 0.0001979387137616044, + "loss": 1.2726, + "step": 3370 + }, + { + "epoch": 0.12993262752646775, + "grad_norm": 1.395627498626709, + "learning_rate": 0.00019793260112031992, + "loss": 1.1469, + "step": 3375 + }, + { + "epoch": 0.13012512030798845, + "grad_norm": 2.2382960319519043, + "learning_rate": 0.00019792647952374782, + "loss": 1.3375, + "step": 3380 + }, + { + "epoch": 0.13031761308950915, + "grad_norm": 1.4930087327957153, + "learning_rate": 0.00019792034897244784, + "loss": 1.3684, + "step": 3385 + }, + { + "epoch": 0.13051010587102985, + "grad_norm": 0.9732452034950256, + "learning_rate": 0.00019791420946698064, + "loss": 1.0792, + "step": 3390 + }, + { + "epoch": 0.13070259865255052, + "grad_norm": 1.9484987258911133, + "learning_rate": 0.0001979080610079076, + "loss": 1.4284, + "step": 3395 + }, + { + "epoch": 0.13089509143407121, + "grad_norm": 1.3746837377548218, + "learning_rate": 0.00019790190359579097, + "loss": 1.4393, + "step": 3400 + }, + { + "epoch": 0.1310875842155919, + "grad_norm": 1.2191319465637207, + "learning_rate": 0.0001978957372311938, + "loss": 1.2184, + "step": 3405 + }, + { + "epoch": 0.1312800769971126, + "grad_norm": 1.0825196504592896, + "learning_rate": 0.00019788956191467994, + "loss": 1.3891, + "step": 3410 + }, + { + "epoch": 0.1314725697786333, + "grad_norm": 1.9972898960113525, + "learning_rate": 0.00019788337764681412, + "loss": 1.3207, + "step": 3415 + }, + { + "epoch": 0.131665062560154, + "grad_norm": 1.3864003419876099, + "learning_rate": 0.00019787718442816182, + "loss": 1.3791, + "step": 3420 + }, + { + "epoch": 0.13185755534167468, + "grad_norm": 1.3315006494522095, + "learning_rate": 0.0001978709822592894, + "loss": 1.4253, + "step": 3425 + }, + { + "epoch": 0.13205004812319537, + "grad_norm": 1.0171843767166138, + "learning_rate": 0.00019786477114076397, + "loss": 1.2974, + "step": 3430 + }, + { + "epoch": 0.13224254090471607, + "grad_norm": 1.293380618095398, + "learning_rate": 0.00019785855107315353, + "loss": 1.3616, + "step": 3435 + }, + { + "epoch": 0.13243503368623677, + "grad_norm": 2.0498528480529785, + "learning_rate": 0.00019785232205702681, + "loss": 1.3431, + "step": 3440 + }, + { + "epoch": 0.13262752646775747, + "grad_norm": 0.8635803461074829, + "learning_rate": 0.0001978460840929535, + "loss": 1.3672, + "step": 3445 + }, + { + "epoch": 0.13282001924927817, + "grad_norm": 0.9983857274055481, + "learning_rate": 0.00019783983718150392, + "loss": 1.4856, + "step": 3450 + }, + { + "epoch": 0.13301251203079884, + "grad_norm": 4.542407989501953, + "learning_rate": 0.00019783358132324937, + "loss": 1.4599, + "step": 3455 + }, + { + "epoch": 0.13320500481231953, + "grad_norm": 1.5495860576629639, + "learning_rate": 0.00019782731651876194, + "loss": 1.3641, + "step": 3460 + }, + { + "epoch": 0.13339749759384023, + "grad_norm": 1.2070780992507935, + "learning_rate": 0.00019782104276861443, + "loss": 1.3596, + "step": 3465 + }, + { + "epoch": 0.13358999037536093, + "grad_norm": 1.1749752759933472, + "learning_rate": 0.00019781476007338058, + "loss": 1.2387, + "step": 3470 + }, + { + "epoch": 0.13378248315688163, + "grad_norm": 1.8580079078674316, + "learning_rate": 0.00019780846843363485, + "loss": 1.3966, + "step": 3475 + }, + { + "epoch": 0.1339749759384023, + "grad_norm": 1.9713795185089111, + "learning_rate": 0.00019780216784995265, + "loss": 1.2541, + "step": 3480 + }, + { + "epoch": 0.134167468719923, + "grad_norm": 1.4017597436904907, + "learning_rate": 0.00019779585832291002, + "loss": 1.4827, + "step": 3485 + }, + { + "epoch": 0.1343599615014437, + "grad_norm": 1.188761591911316, + "learning_rate": 0.00019778953985308406, + "loss": 1.3972, + "step": 3490 + }, + { + "epoch": 0.1345524542829644, + "grad_norm": 1.0930372476577759, + "learning_rate": 0.00019778321244105242, + "loss": 1.4706, + "step": 3495 + }, + { + "epoch": 0.1347449470644851, + "grad_norm": 1.3041532039642334, + "learning_rate": 0.0001977768760873938, + "loss": 1.1929, + "step": 3500 + }, + { + "epoch": 0.13493743984600579, + "grad_norm": 2.6741833686828613, + "learning_rate": 0.00019777053079268753, + "loss": 1.268, + "step": 3505 + }, + { + "epoch": 0.13512993262752646, + "grad_norm": 1.091823935508728, + "learning_rate": 0.0001977641765575139, + "loss": 1.2776, + "step": 3510 + }, + { + "epoch": 0.13532242540904715, + "grad_norm": 0.9205764532089233, + "learning_rate": 0.00019775781338245398, + "loss": 1.3007, + "step": 3515 + }, + { + "epoch": 0.13551491819056785, + "grad_norm": 1.6321576833724976, + "learning_rate": 0.00019775144126808958, + "loss": 1.4214, + "step": 3520 + }, + { + "epoch": 0.13570741097208855, + "grad_norm": 1.7947146892547607, + "learning_rate": 0.00019774506021500343, + "loss": 1.3895, + "step": 3525 + }, + { + "epoch": 0.13589990375360925, + "grad_norm": 1.6696717739105225, + "learning_rate": 0.00019773867022377902, + "loss": 1.3968, + "step": 3530 + }, + { + "epoch": 0.13609239653512994, + "grad_norm": 1.1003444194793701, + "learning_rate": 0.0001977322712950007, + "loss": 1.4084, + "step": 3535 + }, + { + "epoch": 0.13628488931665061, + "grad_norm": 1.0268352031707764, + "learning_rate": 0.00019772586342925357, + "loss": 1.254, + "step": 3540 + }, + { + "epoch": 0.1364773820981713, + "grad_norm": 1.3906810283660889, + "learning_rate": 0.0001977194466271236, + "loss": 1.3266, + "step": 3545 + }, + { + "epoch": 0.136669874879692, + "grad_norm": 1.1786664724349976, + "learning_rate": 0.00019771302088919757, + "loss": 1.3114, + "step": 3550 + }, + { + "epoch": 0.1368623676612127, + "grad_norm": 1.0252714157104492, + "learning_rate": 0.00019770658621606307, + "loss": 1.2089, + "step": 3555 + }, + { + "epoch": 0.1370548604427334, + "grad_norm": 0.8099033236503601, + "learning_rate": 0.00019770014260830853, + "loss": 1.2607, + "step": 3560 + }, + { + "epoch": 0.1372473532242541, + "grad_norm": 1.3679542541503906, + "learning_rate": 0.0001976936900665231, + "loss": 1.376, + "step": 3565 + }, + { + "epoch": 0.13743984600577477, + "grad_norm": 1.7685283422470093, + "learning_rate": 0.00019768722859129693, + "loss": 1.4522, + "step": 3570 + }, + { + "epoch": 0.13763233878729547, + "grad_norm": 1.0158277750015259, + "learning_rate": 0.00019768075818322081, + "loss": 1.2714, + "step": 3575 + }, + { + "epoch": 0.13782483156881617, + "grad_norm": 1.7043020725250244, + "learning_rate": 0.00019767427884288642, + "loss": 1.5669, + "step": 3580 + }, + { + "epoch": 0.13801732435033687, + "grad_norm": 1.8171344995498657, + "learning_rate": 0.00019766779057088627, + "loss": 1.4186, + "step": 3585 + }, + { + "epoch": 0.13820981713185757, + "grad_norm": 1.0524088144302368, + "learning_rate": 0.00019766129336781365, + "loss": 1.167, + "step": 3590 + }, + { + "epoch": 0.13840230991337824, + "grad_norm": 1.558383584022522, + "learning_rate": 0.0001976547872342627, + "loss": 1.5015, + "step": 3595 + }, + { + "epoch": 0.13859480269489893, + "grad_norm": 1.9925919771194458, + "learning_rate": 0.00019764827217082838, + "loss": 1.3661, + "step": 3600 + }, + { + "epoch": 0.13878729547641963, + "grad_norm": 1.5693559646606445, + "learning_rate": 0.0001976417481781064, + "loss": 1.3389, + "step": 3605 + }, + { + "epoch": 0.13897978825794033, + "grad_norm": 1.2609871625900269, + "learning_rate": 0.00019763521525669343, + "loss": 1.2883, + "step": 3610 + }, + { + "epoch": 0.13917228103946103, + "grad_norm": 1.4910306930541992, + "learning_rate": 0.00019762867340718674, + "loss": 1.4237, + "step": 3615 + }, + { + "epoch": 0.13936477382098172, + "grad_norm": 0.9409481287002563, + "learning_rate": 0.0001976221226301846, + "loss": 1.4289, + "step": 3620 + }, + { + "epoch": 0.1395572666025024, + "grad_norm": 0.9263445138931274, + "learning_rate": 0.00019761556292628604, + "loss": 1.2987, + "step": 3625 + }, + { + "epoch": 0.1397497593840231, + "grad_norm": 0.9329832792282104, + "learning_rate": 0.0001976089942960909, + "loss": 1.3709, + "step": 3630 + }, + { + "epoch": 0.1399422521655438, + "grad_norm": 1.7852829694747925, + "learning_rate": 0.00019760241674019984, + "loss": 1.2282, + "step": 3635 + }, + { + "epoch": 0.1401347449470645, + "grad_norm": 1.0068609714508057, + "learning_rate": 0.0001975958302592143, + "loss": 1.3143, + "step": 3640 + }, + { + "epoch": 0.14032723772858519, + "grad_norm": 2.1680188179016113, + "learning_rate": 0.0001975892348537366, + "loss": 1.4447, + "step": 3645 + }, + { + "epoch": 0.14051973051010588, + "grad_norm": 1.633169412612915, + "learning_rate": 0.00019758263052436988, + "loss": 1.2633, + "step": 3650 + }, + { + "epoch": 0.14071222329162655, + "grad_norm": 1.3609623908996582, + "learning_rate": 0.000197576017271718, + "loss": 1.3352, + "step": 3655 + }, + { + "epoch": 0.14090471607314725, + "grad_norm": 1.50294828414917, + "learning_rate": 0.00019756939509638573, + "loss": 1.3557, + "step": 3660 + }, + { + "epoch": 0.14109720885466795, + "grad_norm": 0.9931232333183289, + "learning_rate": 0.0001975627639989786, + "loss": 1.4719, + "step": 3665 + }, + { + "epoch": 0.14128970163618865, + "grad_norm": 1.3870011568069458, + "learning_rate": 0.000197556123980103, + "loss": 1.5173, + "step": 3670 + }, + { + "epoch": 0.14148219441770934, + "grad_norm": 1.274064540863037, + "learning_rate": 0.00019754947504036608, + "loss": 1.3951, + "step": 3675 + }, + { + "epoch": 0.14167468719923004, + "grad_norm": 1.6096014976501465, + "learning_rate": 0.00019754281718037593, + "loss": 1.4478, + "step": 3680 + }, + { + "epoch": 0.1418671799807507, + "grad_norm": 1.155772089958191, + "learning_rate": 0.00019753615040074131, + "loss": 1.229, + "step": 3685 + }, + { + "epoch": 0.1420596727622714, + "grad_norm": 1.123856544494629, + "learning_rate": 0.0001975294747020718, + "loss": 1.5036, + "step": 3690 + }, + { + "epoch": 0.1422521655437921, + "grad_norm": 1.541308879852295, + "learning_rate": 0.00019752279008497796, + "loss": 1.1174, + "step": 3695 + }, + { + "epoch": 0.1424446583253128, + "grad_norm": 1.8912441730499268, + "learning_rate": 0.00019751609655007098, + "loss": 1.3753, + "step": 3700 + }, + { + "epoch": 0.1426371511068335, + "grad_norm": 1.7746648788452148, + "learning_rate": 0.00019750939409796293, + "loss": 1.3115, + "step": 3705 + }, + { + "epoch": 0.14282964388835417, + "grad_norm": 1.2228045463562012, + "learning_rate": 0.00019750268272926676, + "loss": 1.3477, + "step": 3710 + }, + { + "epoch": 0.14302213666987487, + "grad_norm": 1.5031695365905762, + "learning_rate": 0.00019749596244459614, + "loss": 1.1905, + "step": 3715 + }, + { + "epoch": 0.14321462945139557, + "grad_norm": 2.871879816055298, + "learning_rate": 0.0001974892332445656, + "loss": 1.3334, + "step": 3720 + }, + { + "epoch": 0.14340712223291627, + "grad_norm": 1.1911511421203613, + "learning_rate": 0.00019748249512979048, + "loss": 1.2528, + "step": 3725 + }, + { + "epoch": 0.14359961501443697, + "grad_norm": 1.2722115516662598, + "learning_rate": 0.00019747574810088697, + "loss": 1.3314, + "step": 3730 + }, + { + "epoch": 0.14379210779595766, + "grad_norm": 1.0464539527893066, + "learning_rate": 0.00019746899215847198, + "loss": 1.1621, + "step": 3735 + }, + { + "epoch": 0.14398460057747833, + "grad_norm": 1.8877158164978027, + "learning_rate": 0.00019746222730316338, + "loss": 1.2534, + "step": 3740 + }, + { + "epoch": 0.14417709335899903, + "grad_norm": 1.5137780904769897, + "learning_rate": 0.00019745545353557967, + "loss": 1.1738, + "step": 3745 + }, + { + "epoch": 0.14436958614051973, + "grad_norm": 1.7104227542877197, + "learning_rate": 0.00019744867085634034, + "loss": 1.2868, + "step": 3750 + }, + { + "epoch": 0.14456207892204043, + "grad_norm": 1.2920212745666504, + "learning_rate": 0.00019744187926606558, + "loss": 1.3054, + "step": 3755 + }, + { + "epoch": 0.14475457170356112, + "grad_norm": 2.3661959171295166, + "learning_rate": 0.00019743507876537647, + "loss": 1.3187, + "step": 3760 + }, + { + "epoch": 0.14494706448508182, + "grad_norm": 1.4622807502746582, + "learning_rate": 0.00019742826935489487, + "loss": 1.1548, + "step": 3765 + }, + { + "epoch": 0.1451395572666025, + "grad_norm": 1.7818437814712524, + "learning_rate": 0.00019742145103524342, + "loss": 1.4081, + "step": 3770 + }, + { + "epoch": 0.1453320500481232, + "grad_norm": 1.023716926574707, + "learning_rate": 0.00019741462380704566, + "loss": 1.3367, + "step": 3775 + }, + { + "epoch": 0.1455245428296439, + "grad_norm": 1.4382961988449097, + "learning_rate": 0.00019740778767092585, + "loss": 1.3498, + "step": 3780 + }, + { + "epoch": 0.14571703561116459, + "grad_norm": 1.5282870531082153, + "learning_rate": 0.0001974009426275091, + "loss": 1.2685, + "step": 3785 + }, + { + "epoch": 0.14590952839268528, + "grad_norm": 1.2222365140914917, + "learning_rate": 0.0001973940886774214, + "loss": 1.2273, + "step": 3790 + }, + { + "epoch": 0.14610202117420595, + "grad_norm": 1.3231360912322998, + "learning_rate": 0.00019738722582128944, + "loss": 1.5449, + "step": 3795 + }, + { + "epoch": 0.14629451395572665, + "grad_norm": 1.2198995351791382, + "learning_rate": 0.00019738035405974085, + "loss": 1.4927, + "step": 3800 + }, + { + "epoch": 0.14648700673724735, + "grad_norm": 1.1108288764953613, + "learning_rate": 0.00019737347339340394, + "loss": 1.3894, + "step": 3805 + }, + { + "epoch": 0.14667949951876805, + "grad_norm": 1.1478091478347778, + "learning_rate": 0.0001973665838229079, + "loss": 1.342, + "step": 3810 + }, + { + "epoch": 0.14687199230028875, + "grad_norm": 1.555680751800537, + "learning_rate": 0.0001973596853488828, + "loss": 1.269, + "step": 3815 + }, + { + "epoch": 0.14706448508180944, + "grad_norm": 1.2819339036941528, + "learning_rate": 0.0001973527779719594, + "loss": 1.3462, + "step": 3820 + }, + { + "epoch": 0.1472569778633301, + "grad_norm": 1.6733057498931885, + "learning_rate": 0.00019734586169276939, + "loss": 1.3179, + "step": 3825 + }, + { + "epoch": 0.1474494706448508, + "grad_norm": 1.8622225522994995, + "learning_rate": 0.00019733893651194517, + "loss": 1.452, + "step": 3830 + }, + { + "epoch": 0.1476419634263715, + "grad_norm": 1.2225052118301392, + "learning_rate": 0.00019733200243012006, + "loss": 1.2925, + "step": 3835 + }, + { + "epoch": 0.1478344562078922, + "grad_norm": 0.7980884313583374, + "learning_rate": 0.00019732505944792804, + "loss": 1.1505, + "step": 3840 + }, + { + "epoch": 0.1480269489894129, + "grad_norm": 1.3874131441116333, + "learning_rate": 0.00019731810756600405, + "loss": 1.2989, + "step": 3845 + }, + { + "epoch": 0.1482194417709336, + "grad_norm": 1.4387590885162354, + "learning_rate": 0.00019731114678498378, + "loss": 1.3295, + "step": 3850 + }, + { + "epoch": 0.14841193455245427, + "grad_norm": 1.8189646005630493, + "learning_rate": 0.00019730417710550383, + "loss": 1.2926, + "step": 3855 + }, + { + "epoch": 0.14860442733397497, + "grad_norm": 0.9577664732933044, + "learning_rate": 0.0001972971985282014, + "loss": 1.2375, + "step": 3860 + }, + { + "epoch": 0.14879692011549567, + "grad_norm": 1.7154825925827026, + "learning_rate": 0.00019729021105371474, + "loss": 1.2853, + "step": 3865 + }, + { + "epoch": 0.14898941289701637, + "grad_norm": 2.1061089038848877, + "learning_rate": 0.00019728321468268277, + "loss": 1.3391, + "step": 3870 + }, + { + "epoch": 0.14918190567853706, + "grad_norm": 1.0177017450332642, + "learning_rate": 0.00019727620941574524, + "loss": 1.2801, + "step": 3875 + }, + { + "epoch": 0.14937439846005776, + "grad_norm": 1.0773547887802124, + "learning_rate": 0.00019726919525354277, + "loss": 1.3063, + "step": 3880 + }, + { + "epoch": 0.14956689124157843, + "grad_norm": 0.9082854986190796, + "learning_rate": 0.00019726217219671673, + "loss": 1.3601, + "step": 3885 + }, + { + "epoch": 0.14975938402309913, + "grad_norm": 1.341280221939087, + "learning_rate": 0.00019725514024590934, + "loss": 1.4052, + "step": 3890 + }, + { + "epoch": 0.14995187680461983, + "grad_norm": 2.240399122238159, + "learning_rate": 0.00019724809940176364, + "loss": 1.1955, + "step": 3895 + }, + { + "epoch": 0.15014436958614052, + "grad_norm": 1.549137830734253, + "learning_rate": 0.00019724104966492348, + "loss": 1.3089, + "step": 3900 + }, + { + "epoch": 0.15033686236766122, + "grad_norm": 1.6887294054031372, + "learning_rate": 0.00019723399103603346, + "loss": 1.4147, + "step": 3905 + }, + { + "epoch": 0.1505293551491819, + "grad_norm": 1.793087363243103, + "learning_rate": 0.0001972269235157391, + "loss": 1.2674, + "step": 3910 + }, + { + "epoch": 0.1507218479307026, + "grad_norm": 1.718336820602417, + "learning_rate": 0.00019721984710468663, + "loss": 1.2716, + "step": 3915 + }, + { + "epoch": 0.1509143407122233, + "grad_norm": 2.2342288494110107, + "learning_rate": 0.0001972127618035232, + "loss": 0.965, + "step": 3920 + }, + { + "epoch": 0.15110683349374399, + "grad_norm": 1.5450822114944458, + "learning_rate": 0.00019720566761289665, + "loss": 1.3461, + "step": 3925 + }, + { + "epoch": 0.15129932627526468, + "grad_norm": 1.4395346641540527, + "learning_rate": 0.0001971985645334557, + "loss": 1.3462, + "step": 3930 + }, + { + "epoch": 0.15149181905678538, + "grad_norm": 1.1160500049591064, + "learning_rate": 0.00019719145256584994, + "loss": 1.3334, + "step": 3935 + }, + { + "epoch": 0.15168431183830605, + "grad_norm": 1.0270999670028687, + "learning_rate": 0.00019718433171072967, + "loss": 1.2737, + "step": 3940 + }, + { + "epoch": 0.15187680461982675, + "grad_norm": 1.4266023635864258, + "learning_rate": 0.00019717720196874608, + "loss": 1.3639, + "step": 3945 + }, + { + "epoch": 0.15206929740134745, + "grad_norm": 1.552283525466919, + "learning_rate": 0.00019717006334055108, + "loss": 1.301, + "step": 3950 + }, + { + "epoch": 0.15226179018286815, + "grad_norm": 1.5459437370300293, + "learning_rate": 0.0001971629158267975, + "loss": 1.265, + "step": 3955 + }, + { + "epoch": 0.15245428296438884, + "grad_norm": 1.4866915941238403, + "learning_rate": 0.00019715575942813888, + "loss": 1.5694, + "step": 3960 + }, + { + "epoch": 0.15264677574590954, + "grad_norm": 1.1116254329681396, + "learning_rate": 0.00019714859414522967, + "loss": 1.4858, + "step": 3965 + }, + { + "epoch": 0.1528392685274302, + "grad_norm": 1.1708245277404785, + "learning_rate": 0.0001971414199787251, + "loss": 1.3582, + "step": 3970 + }, + { + "epoch": 0.1530317613089509, + "grad_norm": 1.1672711372375488, + "learning_rate": 0.00019713423692928114, + "loss": 1.3393, + "step": 3975 + }, + { + "epoch": 0.1532242540904716, + "grad_norm": 1.4800153970718384, + "learning_rate": 0.0001971270449975547, + "loss": 1.22, + "step": 3980 + }, + { + "epoch": 0.1534167468719923, + "grad_norm": 1.92826509475708, + "learning_rate": 0.00019711984418420338, + "loss": 1.3902, + "step": 3985 + }, + { + "epoch": 0.153609239653513, + "grad_norm": 1.2292252779006958, + "learning_rate": 0.00019711263448988567, + "loss": 1.2327, + "step": 3990 + }, + { + "epoch": 0.1538017324350337, + "grad_norm": 1.1007169485092163, + "learning_rate": 0.00019710541591526085, + "loss": 1.4284, + "step": 3995 + }, + { + "epoch": 0.15399422521655437, + "grad_norm": 0.9456301927566528, + "learning_rate": 0.00019709818846098905, + "loss": 1.1589, + "step": 4000 + }, + { + "epoch": 0.15418671799807507, + "grad_norm": 1.518704891204834, + "learning_rate": 0.0001970909521277311, + "loss": 1.3976, + "step": 4005 + }, + { + "epoch": 0.15437921077959577, + "grad_norm": 1.3318589925765991, + "learning_rate": 0.00019708370691614872, + "loss": 1.3635, + "step": 4010 + }, + { + "epoch": 0.15457170356111646, + "grad_norm": 1.752626657485962, + "learning_rate": 0.0001970764528269045, + "loss": 1.3175, + "step": 4015 + }, + { + "epoch": 0.15476419634263716, + "grad_norm": 2.055469512939453, + "learning_rate": 0.00019706918986066172, + "loss": 1.2873, + "step": 4020 + }, + { + "epoch": 0.15495668912415783, + "grad_norm": 2.1063289642333984, + "learning_rate": 0.00019706191801808457, + "loss": 1.3208, + "step": 4025 + }, + { + "epoch": 0.15514918190567853, + "grad_norm": 1.2449209690093994, + "learning_rate": 0.00019705463729983798, + "loss": 1.2863, + "step": 4030 + }, + { + "epoch": 0.15534167468719923, + "grad_norm": 1.4950852394104004, + "learning_rate": 0.00019704734770658778, + "loss": 1.2338, + "step": 4035 + }, + { + "epoch": 0.15553416746871992, + "grad_norm": 0.9372254014015198, + "learning_rate": 0.00019704004923900046, + "loss": 1.2105, + "step": 4040 + }, + { + "epoch": 0.15572666025024062, + "grad_norm": 1.2273038625717163, + "learning_rate": 0.00019703274189774347, + "loss": 1.3584, + "step": 4045 + }, + { + "epoch": 0.15591915303176132, + "grad_norm": 1.1560612916946411, + "learning_rate": 0.00019702542568348502, + "loss": 1.432, + "step": 4050 + }, + { + "epoch": 0.156111645813282, + "grad_norm": 1.2214939594268799, + "learning_rate": 0.00019701810059689415, + "loss": 1.3237, + "step": 4055 + }, + { + "epoch": 0.1563041385948027, + "grad_norm": 1.255182147026062, + "learning_rate": 0.00019701076663864066, + "loss": 1.5111, + "step": 4060 + }, + { + "epoch": 0.1564966313763234, + "grad_norm": 1.2496423721313477, + "learning_rate": 0.0001970034238093952, + "loss": 1.3917, + "step": 4065 + }, + { + "epoch": 0.15668912415784408, + "grad_norm": 2.773935556411743, + "learning_rate": 0.00019699607210982918, + "loss": 1.3072, + "step": 4070 + }, + { + "epoch": 0.15688161693936478, + "grad_norm": 2.5853006839752197, + "learning_rate": 0.00019698871154061497, + "loss": 1.2737, + "step": 4075 + }, + { + "epoch": 0.15707410972088548, + "grad_norm": 0.9573465585708618, + "learning_rate": 0.00019698134210242553, + "loss": 1.411, + "step": 4080 + }, + { + "epoch": 0.15726660250240615, + "grad_norm": 2.204242467880249, + "learning_rate": 0.00019697396379593482, + "loss": 1.2493, + "step": 4085 + }, + { + "epoch": 0.15745909528392685, + "grad_norm": 1.4688855409622192, + "learning_rate": 0.0001969665766218175, + "loss": 1.273, + "step": 4090 + }, + { + "epoch": 0.15765158806544755, + "grad_norm": 2.1439919471740723, + "learning_rate": 0.0001969591805807491, + "loss": 1.4691, + "step": 4095 + }, + { + "epoch": 0.15784408084696824, + "grad_norm": 1.4877434968948364, + "learning_rate": 0.00019695177567340594, + "loss": 1.4427, + "step": 4100 + }, + { + "epoch": 0.15803657362848894, + "grad_norm": 1.3709458112716675, + "learning_rate": 0.00019694436190046514, + "loss": 1.2713, + "step": 4105 + }, + { + "epoch": 0.1582290664100096, + "grad_norm": 2.1676931381225586, + "learning_rate": 0.00019693693926260464, + "loss": 1.1888, + "step": 4110 + }, + { + "epoch": 0.1584215591915303, + "grad_norm": 1.1726205348968506, + "learning_rate": 0.0001969295077605032, + "loss": 1.3544, + "step": 4115 + }, + { + "epoch": 0.158614051973051, + "grad_norm": 1.2441811561584473, + "learning_rate": 0.00019692206739484037, + "loss": 1.4796, + "step": 4120 + }, + { + "epoch": 0.1588065447545717, + "grad_norm": 1.4889960289001465, + "learning_rate": 0.00019691461816629652, + "loss": 1.418, + "step": 4125 + }, + { + "epoch": 0.1589990375360924, + "grad_norm": 1.3810794353485107, + "learning_rate": 0.00019690716007555282, + "loss": 1.6398, + "step": 4130 + }, + { + "epoch": 0.1591915303176131, + "grad_norm": 1.589390754699707, + "learning_rate": 0.00019689969312329132, + "loss": 1.3203, + "step": 4135 + }, + { + "epoch": 0.15938402309913377, + "grad_norm": 0.8731974959373474, + "learning_rate": 0.00019689221731019477, + "loss": 1.2408, + "step": 4140 + }, + { + "epoch": 0.15957651588065447, + "grad_norm": 1.046852707862854, + "learning_rate": 0.00019688473263694678, + "loss": 1.1249, + "step": 4145 + }, + { + "epoch": 0.15976900866217517, + "grad_norm": 0.8767102360725403, + "learning_rate": 0.0001968772391042318, + "loss": 1.2611, + "step": 4150 + }, + { + "epoch": 0.15996150144369586, + "grad_norm": 1.1452685594558716, + "learning_rate": 0.0001968697367127351, + "loss": 1.2992, + "step": 4155 + }, + { + "epoch": 0.16015399422521656, + "grad_norm": 0.9254185557365417, + "learning_rate": 0.00019686222546314266, + "loss": 1.3894, + "step": 4160 + }, + { + "epoch": 0.16034648700673726, + "grad_norm": 0.9607768654823303, + "learning_rate": 0.00019685470535614133, + "loss": 1.3076, + "step": 4165 + }, + { + "epoch": 0.16053897978825793, + "grad_norm": 1.2880384922027588, + "learning_rate": 0.0001968471763924188, + "loss": 1.3868, + "step": 4170 + }, + { + "epoch": 0.16073147256977863, + "grad_norm": 1.1116464138031006, + "learning_rate": 0.00019683963857266356, + "loss": 1.2489, + "step": 4175 + }, + { + "epoch": 0.16092396535129933, + "grad_norm": 0.9132522940635681, + "learning_rate": 0.0001968320918975649, + "loss": 1.3788, + "step": 4180 + }, + { + "epoch": 0.16111645813282002, + "grad_norm": 1.1793001890182495, + "learning_rate": 0.00019682453636781283, + "loss": 1.4742, + "step": 4185 + }, + { + "epoch": 0.16130895091434072, + "grad_norm": 1.1624877452850342, + "learning_rate": 0.00019681697198409835, + "loss": 1.3547, + "step": 4190 + }, + { + "epoch": 0.16150144369586142, + "grad_norm": 1.1367181539535522, + "learning_rate": 0.00019680939874711312, + "loss": 1.3692, + "step": 4195 + }, + { + "epoch": 0.1616939364773821, + "grad_norm": 1.0168886184692383, + "learning_rate": 0.00019680181665754972, + "loss": 1.4148, + "step": 4200 + }, + { + "epoch": 0.1618864292589028, + "grad_norm": 1.3179705142974854, + "learning_rate": 0.0001967942257161014, + "loss": 1.2674, + "step": 4205 + }, + { + "epoch": 0.16207892204042348, + "grad_norm": 0.8679062724113464, + "learning_rate": 0.00019678662592346235, + "loss": 1.4001, + "step": 4210 + }, + { + "epoch": 0.16227141482194418, + "grad_norm": 0.8477693200111389, + "learning_rate": 0.00019677901728032754, + "loss": 1.3527, + "step": 4215 + }, + { + "epoch": 0.16246390760346488, + "grad_norm": 1.280357003211975, + "learning_rate": 0.00019677139978739266, + "loss": 1.2576, + "step": 4220 + }, + { + "epoch": 0.16265640038498555, + "grad_norm": 3.5572381019592285, + "learning_rate": 0.00019676377344535434, + "loss": 1.3059, + "step": 4225 + }, + { + "epoch": 0.16284889316650625, + "grad_norm": 0.9162838459014893, + "learning_rate": 0.0001967561382549099, + "loss": 1.3655, + "step": 4230 + }, + { + "epoch": 0.16304138594802695, + "grad_norm": 1.0635076761245728, + "learning_rate": 0.00019674849421675764, + "loss": 1.2356, + "step": 4235 + }, + { + "epoch": 0.16323387872954764, + "grad_norm": 2.3638720512390137, + "learning_rate": 0.00019674084133159642, + "loss": 1.3598, + "step": 4240 + }, + { + "epoch": 0.16342637151106834, + "grad_norm": 1.013108730316162, + "learning_rate": 0.00019673317960012615, + "loss": 1.6119, + "step": 4245 + }, + { + "epoch": 0.16361886429258904, + "grad_norm": 1.391450047492981, + "learning_rate": 0.00019672550902304737, + "loss": 1.2481, + "step": 4250 + }, + { + "epoch": 0.1638113570741097, + "grad_norm": 1.5574865341186523, + "learning_rate": 0.00019671782960106157, + "loss": 1.345, + "step": 4255 + }, + { + "epoch": 0.1640038498556304, + "grad_norm": 1.8456825017929077, + "learning_rate": 0.00019671014133487095, + "loss": 1.3582, + "step": 4260 + }, + { + "epoch": 0.1641963426371511, + "grad_norm": 1.4087297916412354, + "learning_rate": 0.00019670244422517855, + "loss": 1.3162, + "step": 4265 + }, + { + "epoch": 0.1643888354186718, + "grad_norm": 1.167403221130371, + "learning_rate": 0.0001966947382726882, + "loss": 1.3841, + "step": 4270 + }, + { + "epoch": 0.1645813282001925, + "grad_norm": 1.3395906686782837, + "learning_rate": 0.0001966870234781046, + "loss": 1.1306, + "step": 4275 + }, + { + "epoch": 0.1647738209817132, + "grad_norm": 0.8549813628196716, + "learning_rate": 0.00019667929984213317, + "loss": 1.3017, + "step": 4280 + }, + { + "epoch": 0.16496631376323387, + "grad_norm": 0.8681890368461609, + "learning_rate": 0.00019667156736548021, + "loss": 1.2152, + "step": 4285 + }, + { + "epoch": 0.16515880654475457, + "grad_norm": 1.8476097583770752, + "learning_rate": 0.00019666382604885283, + "loss": 1.2571, + "step": 4290 + }, + { + "epoch": 0.16535129932627526, + "grad_norm": 1.6583194732666016, + "learning_rate": 0.00019665607589295888, + "loss": 1.3866, + "step": 4295 + }, + { + "epoch": 0.16554379210779596, + "grad_norm": 1.6784121990203857, + "learning_rate": 0.00019664831689850712, + "loss": 1.2966, + "step": 4300 + }, + { + "epoch": 0.16573628488931666, + "grad_norm": 1.5268521308898926, + "learning_rate": 0.00019664054906620696, + "loss": 1.3086, + "step": 4305 + }, + { + "epoch": 0.16592877767083736, + "grad_norm": 2.0114951133728027, + "learning_rate": 0.00019663277239676877, + "loss": 1.2137, + "step": 4310 + }, + { + "epoch": 0.16612127045235803, + "grad_norm": 1.4572757482528687, + "learning_rate": 0.00019662498689090372, + "loss": 1.2505, + "step": 4315 + }, + { + "epoch": 0.16631376323387873, + "grad_norm": 1.4267566204071045, + "learning_rate": 0.00019661719254932369, + "loss": 1.1485, + "step": 4320 + }, + { + "epoch": 0.16650625601539942, + "grad_norm": 0.9921162128448486, + "learning_rate": 0.00019660938937274142, + "loss": 1.304, + "step": 4325 + }, + { + "epoch": 0.16669874879692012, + "grad_norm": 1.3901869058609009, + "learning_rate": 0.00019660157736187047, + "loss": 1.4347, + "step": 4330 + }, + { + "epoch": 0.16689124157844082, + "grad_norm": 1.5446443557739258, + "learning_rate": 0.0001965937565174252, + "loss": 1.3157, + "step": 4335 + }, + { + "epoch": 0.1670837343599615, + "grad_norm": 1.2553350925445557, + "learning_rate": 0.0001965859268401208, + "loss": 1.1882, + "step": 4340 + }, + { + "epoch": 0.1672762271414822, + "grad_norm": 1.9385195970535278, + "learning_rate": 0.0001965780883306732, + "loss": 1.4522, + "step": 4345 + }, + { + "epoch": 0.16746871992300288, + "grad_norm": 1.426032543182373, + "learning_rate": 0.00019657024098979916, + "loss": 1.1029, + "step": 4350 + }, + { + "epoch": 0.16766121270452358, + "grad_norm": 1.5562461614608765, + "learning_rate": 0.0001965623848182163, + "loss": 1.4837, + "step": 4355 + }, + { + "epoch": 0.16785370548604428, + "grad_norm": 1.0057613849639893, + "learning_rate": 0.00019655451981664306, + "loss": 1.3095, + "step": 4360 + }, + { + "epoch": 0.16804619826756498, + "grad_norm": 1.447845697402954, + "learning_rate": 0.00019654664598579857, + "loss": 1.4002, + "step": 4365 + }, + { + "epoch": 0.16823869104908565, + "grad_norm": 0.9452415108680725, + "learning_rate": 0.00019653876332640288, + "loss": 1.3324, + "step": 4370 + }, + { + "epoch": 0.16843118383060635, + "grad_norm": 1.7831186056137085, + "learning_rate": 0.00019653087183917677, + "loss": 1.3004, + "step": 4375 + }, + { + "epoch": 0.16862367661212704, + "grad_norm": 1.0656229257583618, + "learning_rate": 0.0001965229715248419, + "loss": 1.5165, + "step": 4380 + }, + { + "epoch": 0.16881616939364774, + "grad_norm": 1.0360915660858154, + "learning_rate": 0.0001965150623841207, + "loss": 1.2842, + "step": 4385 + }, + { + "epoch": 0.16900866217516844, + "grad_norm": 1.286447525024414, + "learning_rate": 0.00019650714441773643, + "loss": 1.2902, + "step": 4390 + }, + { + "epoch": 0.16920115495668914, + "grad_norm": 1.2435790300369263, + "learning_rate": 0.00019649921762641306, + "loss": 1.3049, + "step": 4395 + }, + { + "epoch": 0.1693936477382098, + "grad_norm": 1.9299678802490234, + "learning_rate": 0.0001964912820108755, + "loss": 1.3057, + "step": 4400 + }, + { + "epoch": 0.1695861405197305, + "grad_norm": 1.7493208646774292, + "learning_rate": 0.0001964833375718494, + "loss": 1.3225, + "step": 4405 + }, + { + "epoch": 0.1697786333012512, + "grad_norm": 1.3697878122329712, + "learning_rate": 0.0001964753843100612, + "loss": 1.3518, + "step": 4410 + }, + { + "epoch": 0.1699711260827719, + "grad_norm": 1.343985676765442, + "learning_rate": 0.0001964674222262382, + "loss": 1.3195, + "step": 4415 + }, + { + "epoch": 0.1701636188642926, + "grad_norm": 1.0094975233078003, + "learning_rate": 0.00019645945132110853, + "loss": 1.3184, + "step": 4420 + }, + { + "epoch": 0.17035611164581327, + "grad_norm": 1.6048771142959595, + "learning_rate": 0.00019645147159540096, + "loss": 1.3307, + "step": 4425 + }, + { + "epoch": 0.17054860442733397, + "grad_norm": 2.14099383354187, + "learning_rate": 0.00019644348304984524, + "loss": 1.3221, + "step": 4430 + }, + { + "epoch": 0.17074109720885466, + "grad_norm": 2.5571303367614746, + "learning_rate": 0.00019643548568517192, + "loss": 1.3092, + "step": 4435 + }, + { + "epoch": 0.17093358999037536, + "grad_norm": 1.1076972484588623, + "learning_rate": 0.00019642747950211225, + "loss": 1.1981, + "step": 4440 + }, + { + "epoch": 0.17112608277189606, + "grad_norm": 1.1315946578979492, + "learning_rate": 0.00019641946450139831, + "loss": 1.335, + "step": 4445 + }, + { + "epoch": 0.17131857555341676, + "grad_norm": 1.33171808719635, + "learning_rate": 0.00019641144068376312, + "loss": 1.4677, + "step": 4450 + }, + { + "epoch": 0.17151106833493743, + "grad_norm": 0.87531977891922, + "learning_rate": 0.0001964034080499403, + "loss": 1.1795, + "step": 4455 + }, + { + "epoch": 0.17170356111645813, + "grad_norm": 1.6923136711120605, + "learning_rate": 0.00019639536660066446, + "loss": 1.2491, + "step": 4460 + }, + { + "epoch": 0.17189605389797882, + "grad_norm": 1.481703519821167, + "learning_rate": 0.0001963873163366709, + "loss": 1.2894, + "step": 4465 + }, + { + "epoch": 0.17208854667949952, + "grad_norm": 3.3689515590667725, + "learning_rate": 0.00019637925725869576, + "loss": 1.3785, + "step": 4470 + }, + { + "epoch": 0.17228103946102022, + "grad_norm": 2.498059034347534, + "learning_rate": 0.000196371189367476, + "loss": 1.2854, + "step": 4475 + }, + { + "epoch": 0.17247353224254092, + "grad_norm": 1.2852959632873535, + "learning_rate": 0.00019636311266374939, + "loss": 1.2272, + "step": 4480 + }, + { + "epoch": 0.1726660250240616, + "grad_norm": 0.9257192015647888, + "learning_rate": 0.00019635502714825446, + "loss": 1.1707, + "step": 4485 + }, + { + "epoch": 0.17285851780558228, + "grad_norm": 0.989142656326294, + "learning_rate": 0.00019634693282173058, + "loss": 1.3174, + "step": 4490 + }, + { + "epoch": 0.17305101058710298, + "grad_norm": 1.4923882484436035, + "learning_rate": 0.00019633882968491794, + "loss": 1.2334, + "step": 4495 + }, + { + "epoch": 0.17324350336862368, + "grad_norm": 1.2684218883514404, + "learning_rate": 0.0001963307177385575, + "loss": 1.2468, + "step": 4500 + }, + { + "epoch": 0.17343599615014438, + "grad_norm": 0.9474775791168213, + "learning_rate": 0.0001963225969833911, + "loss": 1.2767, + "step": 4505 + }, + { + "epoch": 0.17362848893166508, + "grad_norm": 2.477541446685791, + "learning_rate": 0.00019631446742016126, + "loss": 1.4144, + "step": 4510 + }, + { + "epoch": 0.17382098171318575, + "grad_norm": 1.040477991104126, + "learning_rate": 0.00019630632904961138, + "loss": 1.5665, + "step": 4515 + }, + { + "epoch": 0.17401347449470644, + "grad_norm": 1.3127304315567017, + "learning_rate": 0.0001962981818724857, + "loss": 1.3511, + "step": 4520 + }, + { + "epoch": 0.17420596727622714, + "grad_norm": 1.6968106031417847, + "learning_rate": 0.0001962900258895292, + "loss": 1.3202, + "step": 4525 + }, + { + "epoch": 0.17439846005774784, + "grad_norm": 2.2431318759918213, + "learning_rate": 0.0001962818611014877, + "loss": 1.351, + "step": 4530 + }, + { + "epoch": 0.17459095283926854, + "grad_norm": 1.2938642501831055, + "learning_rate": 0.00019627368750910779, + "loss": 1.276, + "step": 4535 + }, + { + "epoch": 0.1747834456207892, + "grad_norm": 1.1331931352615356, + "learning_rate": 0.00019626550511313694, + "loss": 1.4734, + "step": 4540 + }, + { + "epoch": 0.1749759384023099, + "grad_norm": 1.4755507707595825, + "learning_rate": 0.00019625731391432333, + "loss": 1.24, + "step": 4545 + }, + { + "epoch": 0.1751684311838306, + "grad_norm": 1.5442554950714111, + "learning_rate": 0.00019624911391341604, + "loss": 1.0894, + "step": 4550 + }, + { + "epoch": 0.1753609239653513, + "grad_norm": 1.2970473766326904, + "learning_rate": 0.00019624090511116481, + "loss": 1.3262, + "step": 4555 + }, + { + "epoch": 0.175553416746872, + "grad_norm": 2.1946523189544678, + "learning_rate": 0.0001962326875083204, + "loss": 1.4652, + "step": 4560 + }, + { + "epoch": 0.1757459095283927, + "grad_norm": 1.1216411590576172, + "learning_rate": 0.00019622446110563417, + "loss": 1.1608, + "step": 4565 + }, + { + "epoch": 0.17593840230991337, + "grad_norm": 1.996535301208496, + "learning_rate": 0.00019621622590385842, + "loss": 1.2568, + "step": 4570 + }, + { + "epoch": 0.17613089509143406, + "grad_norm": 1.9742660522460938, + "learning_rate": 0.0001962079819037462, + "loss": 1.3335, + "step": 4575 + }, + { + "epoch": 0.17632338787295476, + "grad_norm": 1.985192060470581, + "learning_rate": 0.00019619972910605134, + "loss": 1.3529, + "step": 4580 + }, + { + "epoch": 0.17651588065447546, + "grad_norm": 0.8765020966529846, + "learning_rate": 0.00019619146751152848, + "loss": 1.3956, + "step": 4585 + }, + { + "epoch": 0.17670837343599616, + "grad_norm": 1.483407974243164, + "learning_rate": 0.00019618319712093319, + "loss": 1.4396, + "step": 4590 + }, + { + "epoch": 0.17690086621751686, + "grad_norm": 1.5663124322891235, + "learning_rate": 0.00019617491793502164, + "loss": 1.3896, + "step": 4595 + }, + { + "epoch": 0.17709335899903753, + "grad_norm": 1.3831099271774292, + "learning_rate": 0.00019616662995455096, + "loss": 1.2669, + "step": 4600 + }, + { + "epoch": 0.17728585178055822, + "grad_norm": 0.8688403964042664, + "learning_rate": 0.00019615833318027898, + "loss": 1.2098, + "step": 4605 + }, + { + "epoch": 0.17747834456207892, + "grad_norm": 1.9218660593032837, + "learning_rate": 0.00019615002761296446, + "loss": 1.1568, + "step": 4610 + }, + { + "epoch": 0.17767083734359962, + "grad_norm": 1.5095698833465576, + "learning_rate": 0.00019614171325336684, + "loss": 1.0516, + "step": 4615 + }, + { + "epoch": 0.17786333012512032, + "grad_norm": 0.9288404583930969, + "learning_rate": 0.00019613339010224646, + "loss": 1.075, + "step": 4620 + }, + { + "epoch": 0.17805582290664101, + "grad_norm": 1.414787769317627, + "learning_rate": 0.00019612505816036434, + "loss": 1.2158, + "step": 4625 + }, + { + "epoch": 0.17824831568816168, + "grad_norm": 1.3182802200317383, + "learning_rate": 0.0001961167174284824, + "loss": 1.3719, + "step": 4630 + }, + { + "epoch": 0.17844080846968238, + "grad_norm": 1.1671231985092163, + "learning_rate": 0.0001961083679073634, + "loss": 1.3067, + "step": 4635 + }, + { + "epoch": 0.17863330125120308, + "grad_norm": 1.11225163936615, + "learning_rate": 0.0001961000095977708, + "loss": 1.1593, + "step": 4640 + }, + { + "epoch": 0.17882579403272378, + "grad_norm": 1.235335111618042, + "learning_rate": 0.00019609164250046894, + "loss": 1.2232, + "step": 4645 + }, + { + "epoch": 0.17901828681424448, + "grad_norm": 1.0023348331451416, + "learning_rate": 0.00019608326661622291, + "loss": 1.2926, + "step": 4650 + }, + { + "epoch": 0.17921077959576515, + "grad_norm": 1.7143383026123047, + "learning_rate": 0.00019607488194579867, + "loss": 1.3149, + "step": 4655 + }, + { + "epoch": 0.17940327237728584, + "grad_norm": 1.135324478149414, + "learning_rate": 0.00019606648848996287, + "loss": 1.4155, + "step": 4660 + }, + { + "epoch": 0.17959576515880654, + "grad_norm": 0.7830592393875122, + "learning_rate": 0.0001960580862494831, + "loss": 1.2632, + "step": 4665 + }, + { + "epoch": 0.17978825794032724, + "grad_norm": 1.546481966972351, + "learning_rate": 0.0001960496752251277, + "loss": 1.4674, + "step": 4670 + }, + { + "epoch": 0.17998075072184794, + "grad_norm": 1.5377360582351685, + "learning_rate": 0.00019604125541766574, + "loss": 1.0782, + "step": 4675 + }, + { + "epoch": 0.18017324350336864, + "grad_norm": 2.1382510662078857, + "learning_rate": 0.0001960328268278672, + "loss": 1.3008, + "step": 4680 + }, + { + "epoch": 0.1803657362848893, + "grad_norm": 1.4963937997817993, + "learning_rate": 0.00019602438945650277, + "loss": 1.2601, + "step": 4685 + }, + { + "epoch": 0.18055822906641, + "grad_norm": 1.4736862182617188, + "learning_rate": 0.00019601594330434405, + "loss": 1.163, + "step": 4690 + }, + { + "epoch": 0.1807507218479307, + "grad_norm": 0.9905889630317688, + "learning_rate": 0.00019600748837216337, + "loss": 1.3675, + "step": 4695 + }, + { + "epoch": 0.1809432146294514, + "grad_norm": 1.1800122261047363, + "learning_rate": 0.00019599902466073385, + "loss": 1.3252, + "step": 4700 + }, + { + "epoch": 0.1811357074109721, + "grad_norm": 1.1933966875076294, + "learning_rate": 0.00019599055217082949, + "loss": 1.2163, + "step": 4705 + }, + { + "epoch": 0.1813282001924928, + "grad_norm": 1.3980772495269775, + "learning_rate": 0.000195982070903225, + "loss": 1.2807, + "step": 4710 + }, + { + "epoch": 0.18152069297401346, + "grad_norm": 2.541808605194092, + "learning_rate": 0.00019597358085869594, + "loss": 1.1333, + "step": 4715 + }, + { + "epoch": 0.18171318575553416, + "grad_norm": 1.616479516029358, + "learning_rate": 0.0001959650820380187, + "loss": 1.2991, + "step": 4720 + }, + { + "epoch": 0.18190567853705486, + "grad_norm": 0.9473749399185181, + "learning_rate": 0.00019595657444197037, + "loss": 1.2273, + "step": 4725 + }, + { + "epoch": 0.18209817131857556, + "grad_norm": 1.3119609355926514, + "learning_rate": 0.000195948058071329, + "loss": 1.2754, + "step": 4730 + }, + { + "epoch": 0.18229066410009626, + "grad_norm": 1.0062682628631592, + "learning_rate": 0.00019593953292687332, + "loss": 1.2494, + "step": 4735 + }, + { + "epoch": 0.18248315688161693, + "grad_norm": 1.2124086618423462, + "learning_rate": 0.0001959309990093829, + "loss": 1.3725, + "step": 4740 + }, + { + "epoch": 0.18267564966313762, + "grad_norm": 1.2050824165344238, + "learning_rate": 0.0001959224563196381, + "loss": 1.5103, + "step": 4745 + }, + { + "epoch": 0.18286814244465832, + "grad_norm": 0.9262427091598511, + "learning_rate": 0.00019591390485842008, + "loss": 1.4155, + "step": 4750 + }, + { + "epoch": 0.18306063522617902, + "grad_norm": 1.5612881183624268, + "learning_rate": 0.00019590534462651086, + "loss": 1.2289, + "step": 4755 + }, + { + "epoch": 0.18325312800769972, + "grad_norm": 1.5384646654129028, + "learning_rate": 0.00019589677562469312, + "loss": 1.2474, + "step": 4760 + }, + { + "epoch": 0.18344562078922041, + "grad_norm": 1.397716999053955, + "learning_rate": 0.00019588819785375057, + "loss": 1.4273, + "step": 4765 + }, + { + "epoch": 0.18363811357074108, + "grad_norm": 1.169207215309143, + "learning_rate": 0.00019587961131446754, + "loss": 1.3963, + "step": 4770 + }, + { + "epoch": 0.18383060635226178, + "grad_norm": 1.5064833164215088, + "learning_rate": 0.00019587101600762916, + "loss": 1.5192, + "step": 4775 + }, + { + "epoch": 0.18402309913378248, + "grad_norm": 0.9700071811676025, + "learning_rate": 0.00019586241193402147, + "loss": 1.2697, + "step": 4780 + }, + { + "epoch": 0.18421559191530318, + "grad_norm": 1.2304507493972778, + "learning_rate": 0.00019585379909443123, + "loss": 1.3025, + "step": 4785 + }, + { + "epoch": 0.18440808469682388, + "grad_norm": 1.3768020868301392, + "learning_rate": 0.00019584517748964605, + "loss": 1.3785, + "step": 4790 + }, + { + "epoch": 0.18460057747834457, + "grad_norm": 1.062251091003418, + "learning_rate": 0.0001958365471204543, + "loss": 1.5416, + "step": 4795 + }, + { + "epoch": 0.18479307025986524, + "grad_norm": 0.9126803874969482, + "learning_rate": 0.00019582790798764518, + "loss": 1.1479, + "step": 4800 + }, + { + "epoch": 0.18498556304138594, + "grad_norm": 1.579830288887024, + "learning_rate": 0.00019581926009200866, + "loss": 1.3315, + "step": 4805 + }, + { + "epoch": 0.18517805582290664, + "grad_norm": 2.351717710494995, + "learning_rate": 0.00019581060343433555, + "loss": 1.2503, + "step": 4810 + }, + { + "epoch": 0.18537054860442734, + "grad_norm": 1.1480222940444946, + "learning_rate": 0.00019580193801541746, + "loss": 1.2048, + "step": 4815 + }, + { + "epoch": 0.18556304138594804, + "grad_norm": 1.606439471244812, + "learning_rate": 0.00019579326383604675, + "loss": 1.5204, + "step": 4820 + }, + { + "epoch": 0.18575553416746873, + "grad_norm": 1.520969271659851, + "learning_rate": 0.00019578458089701664, + "loss": 1.2584, + "step": 4825 + }, + { + "epoch": 0.1859480269489894, + "grad_norm": 1.9096931219100952, + "learning_rate": 0.00019577588919912113, + "loss": 1.5508, + "step": 4830 + }, + { + "epoch": 0.1861405197305101, + "grad_norm": 1.004654884338379, + "learning_rate": 0.00019576718874315501, + "loss": 1.2249, + "step": 4835 + }, + { + "epoch": 0.1863330125120308, + "grad_norm": 1.0160667896270752, + "learning_rate": 0.00019575847952991388, + "loss": 1.0782, + "step": 4840 + }, + { + "epoch": 0.1865255052935515, + "grad_norm": 1.4719328880310059, + "learning_rate": 0.0001957497615601941, + "loss": 1.4679, + "step": 4845 + }, + { + "epoch": 0.1867179980750722, + "grad_norm": 1.229625940322876, + "learning_rate": 0.00019574103483479296, + "loss": 1.347, + "step": 4850 + }, + { + "epoch": 0.18691049085659286, + "grad_norm": 3.0996217727661133, + "learning_rate": 0.00019573229935450842, + "loss": 1.3325, + "step": 4855 + }, + { + "epoch": 0.18710298363811356, + "grad_norm": 1.59645676612854, + "learning_rate": 0.00019572355512013922, + "loss": 1.2983, + "step": 4860 + }, + { + "epoch": 0.18729547641963426, + "grad_norm": 1.373542070388794, + "learning_rate": 0.00019571480213248504, + "loss": 1.3285, + "step": 4865 + }, + { + "epoch": 0.18748796920115496, + "grad_norm": 0.9625198245048523, + "learning_rate": 0.00019570604039234626, + "loss": 1.2823, + "step": 4870 + }, + { + "epoch": 0.18768046198267566, + "grad_norm": 1.1096363067626953, + "learning_rate": 0.00019569726990052407, + "loss": 1.2508, + "step": 4875 + }, + { + "epoch": 0.18787295476419635, + "grad_norm": 1.2040042877197266, + "learning_rate": 0.0001956884906578205, + "loss": 1.3767, + "step": 4880 + }, + { + "epoch": 0.18806544754571702, + "grad_norm": 1.103530764579773, + "learning_rate": 0.00019567970266503833, + "loss": 1.4559, + "step": 4885 + }, + { + "epoch": 0.18825794032723772, + "grad_norm": 1.1266409158706665, + "learning_rate": 0.0001956709059229812, + "loss": 1.0687, + "step": 4890 + }, + { + "epoch": 0.18845043310875842, + "grad_norm": 1.2266972064971924, + "learning_rate": 0.00019566210043245344, + "loss": 1.1801, + "step": 4895 + }, + { + "epoch": 0.18864292589027912, + "grad_norm": 1.416676640510559, + "learning_rate": 0.0001956532861942603, + "loss": 1.346, + "step": 4900 + }, + { + "epoch": 0.18883541867179982, + "grad_norm": 1.5538910627365112, + "learning_rate": 0.0001956444632092078, + "loss": 1.3498, + "step": 4905 + }, + { + "epoch": 0.1890279114533205, + "grad_norm": 1.1525146961212158, + "learning_rate": 0.00019563563147810274, + "loss": 1.39, + "step": 4910 + }, + { + "epoch": 0.18922040423484118, + "grad_norm": 1.6796061992645264, + "learning_rate": 0.00019562679100175266, + "loss": 1.3377, + "step": 4915 + }, + { + "epoch": 0.18941289701636188, + "grad_norm": 1.6094450950622559, + "learning_rate": 0.00019561794178096607, + "loss": 1.3057, + "step": 4920 + }, + { + "epoch": 0.18960538979788258, + "grad_norm": 1.8123548030853271, + "learning_rate": 0.00019560908381655208, + "loss": 1.1257, + "step": 4925 + }, + { + "epoch": 0.18979788257940328, + "grad_norm": 1.5495673418045044, + "learning_rate": 0.00019560021710932074, + "loss": 1.303, + "step": 4930 + }, + { + "epoch": 0.18999037536092397, + "grad_norm": 1.623429298400879, + "learning_rate": 0.00019559134166008283, + "loss": 1.1491, + "step": 4935 + }, + { + "epoch": 0.19018286814244467, + "grad_norm": 1.2682925462722778, + "learning_rate": 0.00019558245746964997, + "loss": 1.3774, + "step": 4940 + }, + { + "epoch": 0.19037536092396534, + "grad_norm": 0.9362719058990479, + "learning_rate": 0.00019557356453883456, + "loss": 1.2936, + "step": 4945 + }, + { + "epoch": 0.19056785370548604, + "grad_norm": 1.4271594285964966, + "learning_rate": 0.00019556466286844976, + "loss": 1.3865, + "step": 4950 + }, + { + "epoch": 0.19076034648700674, + "grad_norm": 1.4094691276550293, + "learning_rate": 0.00019555575245930963, + "loss": 1.2941, + "step": 4955 + }, + { + "epoch": 0.19095283926852744, + "grad_norm": 0.9695935249328613, + "learning_rate": 0.00019554683331222893, + "loss": 1.1724, + "step": 4960 + }, + { + "epoch": 0.19114533205004813, + "grad_norm": 1.110616683959961, + "learning_rate": 0.00019553790542802327, + "loss": 1.3999, + "step": 4965 + }, + { + "epoch": 0.1913378248315688, + "grad_norm": 1.5389796495437622, + "learning_rate": 0.000195528968807509, + "loss": 1.2693, + "step": 4970 + }, + { + "epoch": 0.1915303176130895, + "grad_norm": 1.921168565750122, + "learning_rate": 0.00019552002345150338, + "loss": 1.2392, + "step": 4975 + }, + { + "epoch": 0.1917228103946102, + "grad_norm": 1.3342314958572388, + "learning_rate": 0.00019551106936082437, + "loss": 1.2477, + "step": 4980 + }, + { + "epoch": 0.1919153031761309, + "grad_norm": 1.745754361152649, + "learning_rate": 0.0001955021065362908, + "loss": 1.7169, + "step": 4985 + }, + { + "epoch": 0.1921077959576516, + "grad_norm": 1.090145468711853, + "learning_rate": 0.0001954931349787222, + "loss": 1.1156, + "step": 4990 + }, + { + "epoch": 0.1923002887391723, + "grad_norm": 1.5357612371444702, + "learning_rate": 0.00019548415468893899, + "loss": 1.5436, + "step": 4995 + }, + { + "epoch": 0.19249278152069296, + "grad_norm": 1.0309633016586304, + "learning_rate": 0.00019547516566776238, + "loss": 1.3212, + "step": 5000 + }, + { + "epoch": 0.19268527430221366, + "grad_norm": 1.000688076019287, + "learning_rate": 0.0001954661679160143, + "loss": 1.2821, + "step": 5005 + }, + { + "epoch": 0.19287776708373436, + "grad_norm": 1.268754243850708, + "learning_rate": 0.0001954571614345176, + "loss": 1.2168, + "step": 5010 + }, + { + "epoch": 0.19307025986525506, + "grad_norm": 1.3859111070632935, + "learning_rate": 0.00019544814622409582, + "loss": 1.0701, + "step": 5015 + }, + { + "epoch": 0.19326275264677575, + "grad_norm": 2.248309850692749, + "learning_rate": 0.00019543912228557337, + "loss": 1.3548, + "step": 5020 + }, + { + "epoch": 0.19345524542829645, + "grad_norm": 1.0269944667816162, + "learning_rate": 0.00019543008961977538, + "loss": 1.213, + "step": 5025 + }, + { + "epoch": 0.19364773820981712, + "grad_norm": 1.0082924365997314, + "learning_rate": 0.00019542104822752789, + "loss": 1.2395, + "step": 5030 + }, + { + "epoch": 0.19384023099133782, + "grad_norm": 2.1287014484405518, + "learning_rate": 0.00019541199810965766, + "loss": 1.3794, + "step": 5035 + }, + { + "epoch": 0.19403272377285852, + "grad_norm": 1.230859637260437, + "learning_rate": 0.0001954029392669922, + "loss": 1.3985, + "step": 5040 + }, + { + "epoch": 0.19422521655437922, + "grad_norm": 1.0987460613250732, + "learning_rate": 0.00019539387170035996, + "loss": 1.2637, + "step": 5045 + }, + { + "epoch": 0.1944177093358999, + "grad_norm": 1.2570157051086426, + "learning_rate": 0.00019538479541059007, + "loss": 1.2752, + "step": 5050 + }, + { + "epoch": 0.19461020211742058, + "grad_norm": 0.5122241377830505, + "learning_rate": 0.00019537571039851252, + "loss": 1.1927, + "step": 5055 + }, + { + "epoch": 0.19480269489894128, + "grad_norm": 1.7925124168395996, + "learning_rate": 0.00019536661666495807, + "loss": 1.1414, + "step": 5060 + }, + { + "epoch": 0.19499518768046198, + "grad_norm": 0.8517950773239136, + "learning_rate": 0.00019535751421075826, + "loss": 1.2359, + "step": 5065 + }, + { + "epoch": 0.19518768046198268, + "grad_norm": 0.582260012626648, + "learning_rate": 0.00019534840303674544, + "loss": 1.3528, + "step": 5070 + }, + { + "epoch": 0.19538017324350337, + "grad_norm": 1.3547414541244507, + "learning_rate": 0.0001953392831437528, + "loss": 1.296, + "step": 5075 + }, + { + "epoch": 0.19557266602502407, + "grad_norm": Infinity, + "learning_rate": 0.0001953319809522536, + "loss": 1.4074, + "step": 5080 + }, + { + "epoch": 0.19576515880654474, + "grad_norm": 2.2984917163848877, + "learning_rate": 0.00019532284536719936, + "loss": 1.2002, + "step": 5085 + }, + { + "epoch": 0.19595765158806544, + "grad_norm": 1.4113095998764038, + "learning_rate": 0.0001953137010655024, + "loss": 1.2755, + "step": 5090 + }, + { + "epoch": 0.19615014436958614, + "grad_norm": 1.921242594718933, + "learning_rate": 0.00019530454804799881, + "loss": 1.2431, + "step": 5095 + }, + { + "epoch": 0.19634263715110684, + "grad_norm": 1.3097113370895386, + "learning_rate": 0.0001952953863155257, + "loss": 1.415, + "step": 5100 + }, + { + "epoch": 0.19653512993262753, + "grad_norm": 2.1493217945098877, + "learning_rate": 0.00019528621586892072, + "loss": 1.4282, + "step": 5105 + }, + { + "epoch": 0.19672762271414823, + "grad_norm": 1.2487257719039917, + "learning_rate": 0.0001952770367090226, + "loss": 1.3512, + "step": 5110 + }, + { + "epoch": 0.1969201154956689, + "grad_norm": 0.9984391331672668, + "learning_rate": 0.00019526784883667055, + "loss": 1.5437, + "step": 5115 + }, + { + "epoch": 0.1971126082771896, + "grad_norm": 1.241417646408081, + "learning_rate": 0.00019525865225270486, + "loss": 1.2399, + "step": 5120 + }, + { + "epoch": 0.1973051010587103, + "grad_norm": 1.5192227363586426, + "learning_rate": 0.00019524944695796642, + "loss": 1.3236, + "step": 5125 + }, + { + "epoch": 0.197497593840231, + "grad_norm": 1.7465555667877197, + "learning_rate": 0.00019524023295329704, + "loss": 1.4247, + "step": 5130 + }, + { + "epoch": 0.1976900866217517, + "grad_norm": 1.455175757408142, + "learning_rate": 0.00019523101023953925, + "loss": 1.5053, + "step": 5135 + }, + { + "epoch": 0.1978825794032724, + "grad_norm": 2.164982318878174, + "learning_rate": 0.00019522177881753643, + "loss": 1.2796, + "step": 5140 + }, + { + "epoch": 0.19807507218479306, + "grad_norm": 1.58863365650177, + "learning_rate": 0.00019521253868813273, + "loss": 1.349, + "step": 5145 + }, + { + "epoch": 0.19826756496631376, + "grad_norm": 1.5380641222000122, + "learning_rate": 0.0001952032898521731, + "loss": 1.3107, + "step": 5150 + }, + { + "epoch": 0.19846005774783446, + "grad_norm": 1.1790603399276733, + "learning_rate": 0.00019519403231050327, + "loss": 1.2178, + "step": 5155 + }, + { + "epoch": 0.19865255052935515, + "grad_norm": 1.7905482053756714, + "learning_rate": 0.0001951847660639698, + "loss": 1.3579, + "step": 5160 + }, + { + "epoch": 0.19884504331087585, + "grad_norm": 1.1262041330337524, + "learning_rate": 0.00019517549111342, + "loss": 1.2988, + "step": 5165 + }, + { + "epoch": 0.19903753609239652, + "grad_norm": 1.6370010375976562, + "learning_rate": 0.00019516620745970199, + "loss": 1.2326, + "step": 5170 + }, + { + "epoch": 0.19923002887391722, + "grad_norm": 1.1789335012435913, + "learning_rate": 0.00019515691510366476, + "loss": 1.1357, + "step": 5175 + }, + { + "epoch": 0.19942252165543792, + "grad_norm": 1.167226791381836, + "learning_rate": 0.000195147614046158, + "loss": 1.4007, + "step": 5180 + }, + { + "epoch": 0.19961501443695862, + "grad_norm": 1.3708933591842651, + "learning_rate": 0.00019513830428803225, + "loss": 1.3029, + "step": 5185 + }, + { + "epoch": 0.1998075072184793, + "grad_norm": 1.6595165729522705, + "learning_rate": 0.00019512898583013875, + "loss": 1.3159, + "step": 5190 + }, + { + "epoch": 0.2, + "grad_norm": 1.1252923011779785, + "learning_rate": 0.00019511965867332972, + "loss": 1.1894, + "step": 5195 + }, + { + "epoch": 0.20019249278152068, + "grad_norm": 0.8440331816673279, + "learning_rate": 0.00019511032281845797, + "loss": 1.2108, + "step": 5200 + }, + { + "epoch": 0.20038498556304138, + "grad_norm": 1.427147626876831, + "learning_rate": 0.0001951009782663773, + "loss": 1.197, + "step": 5205 + }, + { + "epoch": 0.20057747834456208, + "grad_norm": 1.3509503602981567, + "learning_rate": 0.00019509162501794213, + "loss": 1.3348, + "step": 5210 + }, + { + "epoch": 0.20076997112608277, + "grad_norm": 1.533103108406067, + "learning_rate": 0.00019508226307400777, + "loss": 1.1919, + "step": 5215 + }, + { + "epoch": 0.20096246390760347, + "grad_norm": 1.1347332000732422, + "learning_rate": 0.0001950728924354303, + "loss": 1.2954, + "step": 5220 + }, + { + "epoch": 0.20115495668912417, + "grad_norm": 1.65277099609375, + "learning_rate": 0.00019506351310306664, + "loss": 1.2686, + "step": 5225 + }, + { + "epoch": 0.20134744947064484, + "grad_norm": 1.0601050853729248, + "learning_rate": 0.00019505412507777442, + "loss": 1.4066, + "step": 5230 + }, + { + "epoch": 0.20153994225216554, + "grad_norm": 0.9429787397384644, + "learning_rate": 0.00019504472836041217, + "loss": 1.208, + "step": 5235 + }, + { + "epoch": 0.20173243503368624, + "grad_norm": 0.9101033806800842, + "learning_rate": 0.00019503532295183908, + "loss": 1.3172, + "step": 5240 + }, + { + "epoch": 0.20192492781520693, + "grad_norm": 1.1404805183410645, + "learning_rate": 0.0001950259088529153, + "loss": 1.1539, + "step": 5245 + }, + { + "epoch": 0.20211742059672763, + "grad_norm": 1.1555522680282593, + "learning_rate": 0.00019501648606450161, + "loss": 1.3754, + "step": 5250 + }, + { + "epoch": 0.20230991337824833, + "grad_norm": 1.5473912954330444, + "learning_rate": 0.00019500705458745974, + "loss": 1.1878, + "step": 5255 + }, + { + "epoch": 0.202502406159769, + "grad_norm": 1.8766716718673706, + "learning_rate": 0.00019499761442265208, + "loss": 1.2445, + "step": 5260 + }, + { + "epoch": 0.2026948989412897, + "grad_norm": 1.7951183319091797, + "learning_rate": 0.00019498816557094188, + "loss": 1.3496, + "step": 5265 + }, + { + "epoch": 0.2028873917228104, + "grad_norm": 1.6615973711013794, + "learning_rate": 0.00019497870803319317, + "loss": 1.2919, + "step": 5270 + }, + { + "epoch": 0.2030798845043311, + "grad_norm": 1.2885236740112305, + "learning_rate": 0.00019496924181027078, + "loss": 1.1807, + "step": 5275 + }, + { + "epoch": 0.2032723772858518, + "grad_norm": 0.9546861052513123, + "learning_rate": 0.00019495976690304034, + "loss": 1.309, + "step": 5280 + }, + { + "epoch": 0.20346487006737246, + "grad_norm": 1.6904189586639404, + "learning_rate": 0.0001949502833123683, + "loss": 1.2244, + "step": 5285 + }, + { + "epoch": 0.20365736284889316, + "grad_norm": 1.394254446029663, + "learning_rate": 0.0001949407910391218, + "loss": 1.2877, + "step": 5290 + }, + { + "epoch": 0.20384985563041386, + "grad_norm": 0.8937919735908508, + "learning_rate": 0.0001949312900841689, + "loss": 1.2389, + "step": 5295 + }, + { + "epoch": 0.20404234841193455, + "grad_norm": 1.1096867322921753, + "learning_rate": 0.00019492178044837837, + "loss": 1.3766, + "step": 5300 + }, + { + "epoch": 0.20423484119345525, + "grad_norm": 1.009758472442627, + "learning_rate": 0.00019491226213261983, + "loss": 1.2281, + "step": 5305 + }, + { + "epoch": 0.20442733397497595, + "grad_norm": 1.4888296127319336, + "learning_rate": 0.00019490273513776365, + "loss": 1.0624, + "step": 5310 + }, + { + "epoch": 0.20461982675649662, + "grad_norm": 1.4901612997055054, + "learning_rate": 0.00019489319946468104, + "loss": 1.1554, + "step": 5315 + }, + { + "epoch": 0.20481231953801732, + "grad_norm": 1.2920863628387451, + "learning_rate": 0.0001948836551142439, + "loss": 1.2103, + "step": 5320 + }, + { + "epoch": 0.20500481231953802, + "grad_norm": 1.3616580963134766, + "learning_rate": 0.00019487410208732508, + "loss": 1.3246, + "step": 5325 + }, + { + "epoch": 0.2051973051010587, + "grad_norm": 1.0202921628952026, + "learning_rate": 0.0001948645403847981, + "loss": 1.3046, + "step": 5330 + }, + { + "epoch": 0.2053897978825794, + "grad_norm": 1.0083186626434326, + "learning_rate": 0.00019485497000753735, + "loss": 1.2541, + "step": 5335 + }, + { + "epoch": 0.2055822906641001, + "grad_norm": 1.137617588043213, + "learning_rate": 0.0001948453909564179, + "loss": 1.3143, + "step": 5340 + }, + { + "epoch": 0.20577478344562078, + "grad_norm": 1.6331067085266113, + "learning_rate": 0.00019483580323231578, + "loss": 1.1129, + "step": 5345 + }, + { + "epoch": 0.20596727622714148, + "grad_norm": 1.4032361507415771, + "learning_rate": 0.00019482620683610767, + "loss": 1.3412, + "step": 5350 + }, + { + "epoch": 0.20615976900866217, + "grad_norm": 1.3207452297210693, + "learning_rate": 0.00019481660176867108, + "loss": 1.4614, + "step": 5355 + }, + { + "epoch": 0.20635226179018287, + "grad_norm": 0.9236577749252319, + "learning_rate": 0.0001948069880308844, + "loss": 1.3131, + "step": 5360 + }, + { + "epoch": 0.20654475457170357, + "grad_norm": 2.2021703720092773, + "learning_rate": 0.0001947973656236267, + "loss": 1.2434, + "step": 5365 + }, + { + "epoch": 0.20673724735322424, + "grad_norm": 1.5074305534362793, + "learning_rate": 0.00019478773454777789, + "loss": 1.4204, + "step": 5370 + }, + { + "epoch": 0.20692974013474494, + "grad_norm": 1.5073877573013306, + "learning_rate": 0.00019477809480421865, + "loss": 1.4193, + "step": 5375 + }, + { + "epoch": 0.20712223291626564, + "grad_norm": 1.0522600412368774, + "learning_rate": 0.00019476844639383049, + "loss": 1.228, + "step": 5380 + }, + { + "epoch": 0.20731472569778633, + "grad_norm": 1.1478843688964844, + "learning_rate": 0.0001947587893174957, + "loss": 1.2315, + "step": 5385 + }, + { + "epoch": 0.20750721847930703, + "grad_norm": 0.922837495803833, + "learning_rate": 0.00019474912357609733, + "loss": 1.2567, + "step": 5390 + }, + { + "epoch": 0.20769971126082773, + "grad_norm": 1.156615972518921, + "learning_rate": 0.0001947394491705193, + "loss": 1.443, + "step": 5395 + }, + { + "epoch": 0.2078922040423484, + "grad_norm": 1.909555435180664, + "learning_rate": 0.0001947297661016462, + "loss": 1.1625, + "step": 5400 + }, + { + "epoch": 0.2080846968238691, + "grad_norm": 1.8379411697387695, + "learning_rate": 0.00019472007437036352, + "loss": 1.3015, + "step": 5405 + }, + { + "epoch": 0.2082771896053898, + "grad_norm": 1.188402771949768, + "learning_rate": 0.00019471037397755754, + "loss": 1.3294, + "step": 5410 + }, + { + "epoch": 0.2084696823869105, + "grad_norm": 1.597538948059082, + "learning_rate": 0.00019470066492411521, + "loss": 1.3824, + "step": 5415 + }, + { + "epoch": 0.2086621751684312, + "grad_norm": 1.0081026554107666, + "learning_rate": 0.00019469094721092444, + "loss": 1.2914, + "step": 5420 + }, + { + "epoch": 0.2088546679499519, + "grad_norm": 1.3790476322174072, + "learning_rate": 0.0001946812208388738, + "loss": 1.2817, + "step": 5425 + }, + { + "epoch": 0.20904716073147256, + "grad_norm": 1.777570128440857, + "learning_rate": 0.00019467148580885272, + "loss": 1.2253, + "step": 5430 + }, + { + "epoch": 0.20923965351299326, + "grad_norm": 1.1196024417877197, + "learning_rate": 0.00019466174212175142, + "loss": 1.2956, + "step": 5435 + }, + { + "epoch": 0.20943214629451395, + "grad_norm": 2.940906524658203, + "learning_rate": 0.00019465198977846086, + "loss": 1.3912, + "step": 5440 + }, + { + "epoch": 0.20962463907603465, + "grad_norm": 1.9075424671173096, + "learning_rate": 0.00019464222877987286, + "loss": 1.2518, + "step": 5445 + }, + { + "epoch": 0.20981713185755535, + "grad_norm": 1.0282469987869263, + "learning_rate": 0.00019463245912687996, + "loss": 1.2569, + "step": 5450 + }, + { + "epoch": 0.21000962463907605, + "grad_norm": 1.1651009321212769, + "learning_rate": 0.0001946226808203756, + "loss": 1.4676, + "step": 5455 + }, + { + "epoch": 0.21020211742059672, + "grad_norm": 1.1911680698394775, + "learning_rate": 0.00019461289386125388, + "loss": 1.3822, + "step": 5460 + }, + { + "epoch": 0.21039461020211742, + "grad_norm": 0.7187578082084656, + "learning_rate": 0.00019460309825040974, + "loss": 1.1462, + "step": 5465 + }, + { + "epoch": 0.2105871029836381, + "grad_norm": 2.401764154434204, + "learning_rate": 0.000194593293988739, + "loss": 1.3187, + "step": 5470 + }, + { + "epoch": 0.2107795957651588, + "grad_norm": 1.783333659172058, + "learning_rate": 0.0001945834810771381, + "loss": 1.3539, + "step": 5475 + }, + { + "epoch": 0.2109720885466795, + "grad_norm": 0.9923986196517944, + "learning_rate": 0.00019457365951650445, + "loss": 1.4837, + "step": 5480 + }, + { + "epoch": 0.21116458132820018, + "grad_norm": 1.0704642534255981, + "learning_rate": 0.00019456382930773612, + "loss": 1.2345, + "step": 5485 + }, + { + "epoch": 0.21135707410972088, + "grad_norm": 1.5242959260940552, + "learning_rate": 0.000194553990451732, + "loss": 1.2113, + "step": 5490 + }, + { + "epoch": 0.21154956689124157, + "grad_norm": 1.3185608386993408, + "learning_rate": 0.00019454414294939185, + "loss": 1.4083, + "step": 5495 + }, + { + "epoch": 0.21174205967276227, + "grad_norm": 1.1448662281036377, + "learning_rate": 0.00019453428680161615, + "loss": 1.4091, + "step": 5500 + }, + { + "epoch": 0.21193455245428297, + "grad_norm": 1.172396183013916, + "learning_rate": 0.0001945244220093061, + "loss": 1.1414, + "step": 5505 + }, + { + "epoch": 0.21212704523580367, + "grad_norm": 2.988346576690674, + "learning_rate": 0.00019451454857336383, + "loss": 1.3968, + "step": 5510 + }, + { + "epoch": 0.21231953801732434, + "grad_norm": 0.8824801445007324, + "learning_rate": 0.00019450466649469222, + "loss": 1.2229, + "step": 5515 + }, + { + "epoch": 0.21251203079884504, + "grad_norm": 1.7703745365142822, + "learning_rate": 0.00019449477577419488, + "loss": 1.3073, + "step": 5520 + }, + { + "epoch": 0.21270452358036573, + "grad_norm": 1.3374749422073364, + "learning_rate": 0.00019448487641277629, + "loss": 1.3908, + "step": 5525 + }, + { + "epoch": 0.21289701636188643, + "grad_norm": 1.2366503477096558, + "learning_rate": 0.00019447496841134163, + "loss": 1.2764, + "step": 5530 + }, + { + "epoch": 0.21308950914340713, + "grad_norm": 1.242353081703186, + "learning_rate": 0.00019446505177079696, + "loss": 1.3136, + "step": 5535 + }, + { + "epoch": 0.21328200192492783, + "grad_norm": 1.046583652496338, + "learning_rate": 0.00019445512649204907, + "loss": 1.1483, + "step": 5540 + }, + { + "epoch": 0.2134744947064485, + "grad_norm": 1.6280517578125, + "learning_rate": 0.00019444519257600558, + "loss": 1.4076, + "step": 5545 + }, + { + "epoch": 0.2136669874879692, + "grad_norm": 1.7472679615020752, + "learning_rate": 0.00019443525002357486, + "loss": 1.2842, + "step": 5550 + }, + { + "epoch": 0.2138594802694899, + "grad_norm": 1.101185917854309, + "learning_rate": 0.00019442529883566612, + "loss": 1.3037, + "step": 5555 + }, + { + "epoch": 0.2140519730510106, + "grad_norm": 1.8548834323883057, + "learning_rate": 0.0001944153390131893, + "loss": 1.4081, + "step": 5560 + }, + { + "epoch": 0.2142444658325313, + "grad_norm": 1.4205219745635986, + "learning_rate": 0.00019440537055705515, + "loss": 1.3419, + "step": 5565 + }, + { + "epoch": 0.214436958614052, + "grad_norm": 1.135933756828308, + "learning_rate": 0.0001943953934681753, + "loss": 0.9906, + "step": 5570 + }, + { + "epoch": 0.21462945139557266, + "grad_norm": 1.7350742816925049, + "learning_rate": 0.00019438540774746198, + "loss": 1.1193, + "step": 5575 + }, + { + "epoch": 0.21482194417709335, + "grad_norm": 1.891998291015625, + "learning_rate": 0.00019437541339582836, + "loss": 1.2271, + "step": 5580 + }, + { + "epoch": 0.21501443695861405, + "grad_norm": 1.2564722299575806, + "learning_rate": 0.0001943654104141884, + "loss": 1.5134, + "step": 5585 + }, + { + "epoch": 0.21520692974013475, + "grad_norm": 1.3632197380065918, + "learning_rate": 0.00019435539880345673, + "loss": 1.1772, + "step": 5590 + }, + { + "epoch": 0.21539942252165545, + "grad_norm": 1.8670414686203003, + "learning_rate": 0.00019434537856454894, + "loss": 1.2685, + "step": 5595 + }, + { + "epoch": 0.21559191530317612, + "grad_norm": 2.5948314666748047, + "learning_rate": 0.00019433534969838122, + "loss": 1.487, + "step": 5600 + }, + { + "epoch": 0.21578440808469682, + "grad_norm": 1.2312328815460205, + "learning_rate": 0.00019432531220587071, + "loss": 1.3394, + "step": 5605 + }, + { + "epoch": 0.2159769008662175, + "grad_norm": 0.9402896165847778, + "learning_rate": 0.0001943152660879352, + "loss": 1.1471, + "step": 5610 + }, + { + "epoch": 0.2161693936477382, + "grad_norm": 0.3871050477027893, + "learning_rate": 0.00019430521134549346, + "loss": 0.9597, + "step": 5615 + }, + { + "epoch": 0.2163618864292589, + "grad_norm": 0.9395222067832947, + "learning_rate": 0.0001942951479794648, + "loss": 1.3055, + "step": 5620 + }, + { + "epoch": 0.2165543792107796, + "grad_norm": 0.8928638696670532, + "learning_rate": 0.00019428507599076955, + "loss": 1.4099, + "step": 5625 + }, + { + "epoch": 0.21674687199230028, + "grad_norm": 1.8891551494598389, + "learning_rate": 0.00019427499538032865, + "loss": 1.5009, + "step": 5630 + }, + { + "epoch": 0.21693936477382098, + "grad_norm": 0.6684243679046631, + "learning_rate": 0.00019426490614906394, + "loss": 1.2251, + "step": 5635 + }, + { + "epoch": 0.21713185755534167, + "grad_norm": 1.5765355825424194, + "learning_rate": 0.00019425480829789803, + "loss": 1.1114, + "step": 5640 + }, + { + "epoch": 0.21732435033686237, + "grad_norm": 0.9966096878051758, + "learning_rate": 0.00019424470182775427, + "loss": 1.2907, + "step": 5645 + }, + { + "epoch": 0.21751684311838307, + "grad_norm": 1.263469934463501, + "learning_rate": 0.00019423458673955684, + "loss": 1.1443, + "step": 5650 + }, + { + "epoch": 0.21770933589990377, + "grad_norm": 1.5138813257217407, + "learning_rate": 0.0001942244630342307, + "loss": 1.2699, + "step": 5655 + }, + { + "epoch": 0.21790182868142444, + "grad_norm": 1.0215526819229126, + "learning_rate": 0.00019421433071270156, + "loss": 1.4265, + "step": 5660 + }, + { + "epoch": 0.21809432146294513, + "grad_norm": 0.7587301731109619, + "learning_rate": 0.00019420418977589605, + "loss": 1.1706, + "step": 5665 + }, + { + "epoch": 0.21828681424446583, + "grad_norm": 0.9531148672103882, + "learning_rate": 0.0001941940402247414, + "loss": 1.4041, + "step": 5670 + }, + { + "epoch": 0.21847930702598653, + "grad_norm": 1.098739743232727, + "learning_rate": 0.00019418388206016575, + "loss": 1.3476, + "step": 5675 + }, + { + "epoch": 0.21867179980750723, + "grad_norm": 1.0307271480560303, + "learning_rate": 0.000194173715283098, + "loss": 1.2333, + "step": 5680 + }, + { + "epoch": 0.2188642925890279, + "grad_norm": 1.538256049156189, + "learning_rate": 0.00019416353989446785, + "loss": 1.4489, + "step": 5685 + }, + { + "epoch": 0.2190567853705486, + "grad_norm": 1.5411714315414429, + "learning_rate": 0.00019415335589520574, + "loss": 1.2597, + "step": 5690 + }, + { + "epoch": 0.2192492781520693, + "grad_norm": 1.3543205261230469, + "learning_rate": 0.00019414316328624293, + "loss": 1.265, + "step": 5695 + }, + { + "epoch": 0.21944177093359, + "grad_norm": 0.7644770741462708, + "learning_rate": 0.0001941329620685115, + "loss": 1.1888, + "step": 5700 + }, + { + "epoch": 0.2196342637151107, + "grad_norm": 2.1122093200683594, + "learning_rate": 0.00019412275224294423, + "loss": 1.1301, + "step": 5705 + }, + { + "epoch": 0.2198267564966314, + "grad_norm": 1.4159448146820068, + "learning_rate": 0.00019411253381047477, + "loss": 1.209, + "step": 5710 + }, + { + "epoch": 0.22001924927815206, + "grad_norm": 1.4212615489959717, + "learning_rate": 0.00019410230677203755, + "loss": 1.3268, + "step": 5715 + }, + { + "epoch": 0.22021174205967275, + "grad_norm": 1.2042075395584106, + "learning_rate": 0.00019409207112856778, + "loss": 1.1976, + "step": 5720 + }, + { + "epoch": 0.22040423484119345, + "grad_norm": 1.5765044689178467, + "learning_rate": 0.00019408182688100136, + "loss": 1.3631, + "step": 5725 + }, + { + "epoch": 0.22059672762271415, + "grad_norm": 2.197000026702881, + "learning_rate": 0.00019407157403027514, + "loss": 1.2964, + "step": 5730 + }, + { + "epoch": 0.22078922040423485, + "grad_norm": 1.3434042930603027, + "learning_rate": 0.00019406131257732664, + "loss": 1.244, + "step": 5735 + }, + { + "epoch": 0.22098171318575555, + "grad_norm": 1.2889900207519531, + "learning_rate": 0.0001940510425230942, + "loss": 1.1333, + "step": 5740 + }, + { + "epoch": 0.22117420596727622, + "grad_norm": 0.8795220851898193, + "learning_rate": 0.00019404076386851692, + "loss": 1.2635, + "step": 5745 + }, + { + "epoch": 0.22136669874879691, + "grad_norm": 1.0312747955322266, + "learning_rate": 0.00019403047661453477, + "loss": 1.3195, + "step": 5750 + }, + { + "epoch": 0.2215591915303176, + "grad_norm": 1.5083264112472534, + "learning_rate": 0.00019402018076208845, + "loss": 1.3417, + "step": 5755 + }, + { + "epoch": 0.2217516843118383, + "grad_norm": 1.1538232564926147, + "learning_rate": 0.00019400987631211936, + "loss": 1.2956, + "step": 5760 + }, + { + "epoch": 0.221944177093359, + "grad_norm": 1.975381851196289, + "learning_rate": 0.0001939995632655699, + "loss": 1.4641, + "step": 5765 + }, + { + "epoch": 0.2221366698748797, + "grad_norm": 1.3251721858978271, + "learning_rate": 0.00019398924162338305, + "loss": 1.3429, + "step": 5770 + }, + { + "epoch": 0.22232916265640038, + "grad_norm": 1.1281229257583618, + "learning_rate": 0.0001939789113865027, + "loss": 1.2155, + "step": 5775 + }, + { + "epoch": 0.22252165543792107, + "grad_norm": 2.6070075035095215, + "learning_rate": 0.00019396857255587344, + "loss": 1.2634, + "step": 5780 + }, + { + "epoch": 0.22271414821944177, + "grad_norm": 1.0815184116363525, + "learning_rate": 0.00019395822513244067, + "loss": 1.1176, + "step": 5785 + }, + { + "epoch": 0.22290664100096247, + "grad_norm": 2.819180965423584, + "learning_rate": 0.0001939478691171507, + "loss": 1.2624, + "step": 5790 + }, + { + "epoch": 0.22309913378248317, + "grad_norm": 1.180055022239685, + "learning_rate": 0.0001939375045109504, + "loss": 1.3433, + "step": 5795 + }, + { + "epoch": 0.22329162656400384, + "grad_norm": 1.1582396030426025, + "learning_rate": 0.0001939271313147876, + "loss": 1.2815, + "step": 5800 + }, + { + "epoch": 0.22348411934552453, + "grad_norm": 2.32379412651062, + "learning_rate": 0.00019391674952961085, + "loss": 1.4095, + "step": 5805 + }, + { + "epoch": 0.22367661212704523, + "grad_norm": 1.5146657228469849, + "learning_rate": 0.0001939063591563695, + "loss": 1.2434, + "step": 5810 + }, + { + "epoch": 0.22386910490856593, + "grad_norm": 1.6434500217437744, + "learning_rate": 0.00019389596019601365, + "loss": 1.1739, + "step": 5815 + }, + { + "epoch": 0.22406159769008663, + "grad_norm": 1.7917993068695068, + "learning_rate": 0.0001938855526494943, + "loss": 1.5106, + "step": 5820 + }, + { + "epoch": 0.22425409047160733, + "grad_norm": 1.10679030418396, + "learning_rate": 0.00019387513651776303, + "loss": 1.284, + "step": 5825 + }, + { + "epoch": 0.224446583253128, + "grad_norm": 1.521506905555725, + "learning_rate": 0.00019386471180177247, + "loss": 1.4129, + "step": 5830 + }, + { + "epoch": 0.2246390760346487, + "grad_norm": 1.4055581092834473, + "learning_rate": 0.00019385427850247572, + "loss": 1.2476, + "step": 5835 + }, + { + "epoch": 0.2248315688161694, + "grad_norm": 0.9506363868713379, + "learning_rate": 0.00019384383662082703, + "loss": 1.3105, + "step": 5840 + }, + { + "epoch": 0.2250240615976901, + "grad_norm": 1.354658842086792, + "learning_rate": 0.00019383338615778107, + "loss": 1.29, + "step": 5845 + }, + { + "epoch": 0.2252165543792108, + "grad_norm": 0.8972203135490417, + "learning_rate": 0.00019382292711429353, + "loss": 1.3407, + "step": 5850 + }, + { + "epoch": 0.22540904716073148, + "grad_norm": 0.9989115595817566, + "learning_rate": 0.00019381245949132085, + "loss": 1.1662, + "step": 5855 + }, + { + "epoch": 0.22560153994225216, + "grad_norm": 1.1133052110671997, + "learning_rate": 0.0001938019832898202, + "loss": 1.2674, + "step": 5860 + }, + { + "epoch": 0.22579403272377285, + "grad_norm": 1.3640556335449219, + "learning_rate": 0.00019379149851074957, + "loss": 1.1989, + "step": 5865 + }, + { + "epoch": 0.22598652550529355, + "grad_norm": 1.2812589406967163, + "learning_rate": 0.0001937810051550677, + "loss": 1.4749, + "step": 5870 + }, + { + "epoch": 0.22617901828681425, + "grad_norm": 1.223944902420044, + "learning_rate": 0.00019377050322373412, + "loss": 1.305, + "step": 5875 + }, + { + "epoch": 0.22637151106833495, + "grad_norm": 1.3493690490722656, + "learning_rate": 0.00019375999271770925, + "loss": 1.458, + "step": 5880 + }, + { + "epoch": 0.22656400384985564, + "grad_norm": 1.4042202234268188, + "learning_rate": 0.0001937494736379541, + "loss": 1.1714, + "step": 5885 + }, + { + "epoch": 0.22675649663137631, + "grad_norm": 1.6239880323410034, + "learning_rate": 0.00019373894598543066, + "loss": 1.3224, + "step": 5890 + }, + { + "epoch": 0.226948989412897, + "grad_norm": 1.096960425376892, + "learning_rate": 0.00019372840976110154, + "loss": 1.128, + "step": 5895 + }, + { + "epoch": 0.2271414821944177, + "grad_norm": 1.6740233898162842, + "learning_rate": 0.00019371786496593028, + "loss": 1.195, + "step": 5900 + }, + { + "epoch": 0.2273339749759384, + "grad_norm": 1.454030156135559, + "learning_rate": 0.00019370731160088105, + "loss": 1.2641, + "step": 5905 + }, + { + "epoch": 0.2275264677574591, + "grad_norm": 1.4465221166610718, + "learning_rate": 0.00019369674966691897, + "loss": 1.331, + "step": 5910 + }, + { + "epoch": 0.22771896053897978, + "grad_norm": 1.6115851402282715, + "learning_rate": 0.00019368617916500978, + "loss": 1.4061, + "step": 5915 + }, + { + "epoch": 0.22791145332050047, + "grad_norm": 1.0165706872940063, + "learning_rate": 0.00019367560009612013, + "loss": 1.177, + "step": 5920 + }, + { + "epoch": 0.22810394610202117, + "grad_norm": 1.5200728178024292, + "learning_rate": 0.00019366501246121737, + "loss": 1.1323, + "step": 5925 + }, + { + "epoch": 0.22829643888354187, + "grad_norm": 1.4613386392593384, + "learning_rate": 0.00019365441626126976, + "loss": 1.4626, + "step": 5930 + }, + { + "epoch": 0.22848893166506257, + "grad_norm": 1.2502466440200806, + "learning_rate": 0.00019364381149724613, + "loss": 1.2797, + "step": 5935 + }, + { + "epoch": 0.22868142444658326, + "grad_norm": 1.2946960926055908, + "learning_rate": 0.0001936331981701163, + "loss": 1.3844, + "step": 5940 + }, + { + "epoch": 0.22887391722810393, + "grad_norm": 1.2478231191635132, + "learning_rate": 0.00019362257628085074, + "loss": 1.2855, + "step": 5945 + }, + { + "epoch": 0.22906641000962463, + "grad_norm": 1.0097830295562744, + "learning_rate": 0.0001936119458304208, + "loss": 1.1223, + "step": 5950 + }, + { + "epoch": 0.22925890279114533, + "grad_norm": 1.3235141038894653, + "learning_rate": 0.00019360130681979852, + "loss": 1.284, + "step": 5955 + }, + { + "epoch": 0.22945139557266603, + "grad_norm": 1.6869986057281494, + "learning_rate": 0.00019359065924995678, + "loss": 1.517, + "step": 5960 + }, + { + "epoch": 0.22964388835418673, + "grad_norm": 0.9644334316253662, + "learning_rate": 0.00019358000312186925, + "loss": 1.0607, + "step": 5965 + }, + { + "epoch": 0.22983638113570742, + "grad_norm": 1.063192367553711, + "learning_rate": 0.0001935693384365103, + "loss": 0.9187, + "step": 5970 + }, + { + "epoch": 0.2300288739172281, + "grad_norm": 1.0339081287384033, + "learning_rate": 0.00019355866519485523, + "loss": 1.2946, + "step": 5975 + }, + { + "epoch": 0.2302213666987488, + "grad_norm": 1.3194791078567505, + "learning_rate": 0.00019354798339788, + "loss": 1.4293, + "step": 5980 + }, + { + "epoch": 0.2304138594802695, + "grad_norm": 1.8870794773101807, + "learning_rate": 0.00019353729304656136, + "loss": 1.4124, + "step": 5985 + }, + { + "epoch": 0.2306063522617902, + "grad_norm": 1.132385015487671, + "learning_rate": 0.00019352659414187694, + "loss": 1.1949, + "step": 5990 + }, + { + "epoch": 0.23079884504331089, + "grad_norm": 2.763613700866699, + "learning_rate": 0.000193515886684805, + "loss": 1.2341, + "step": 5995 + }, + { + "epoch": 0.23099133782483156, + "grad_norm": 1.6793404817581177, + "learning_rate": 0.00019350517067632473, + "loss": 1.3597, + "step": 6000 + }, + { + "epoch": 0.23118383060635225, + "grad_norm": 1.1538963317871094, + "learning_rate": 0.000193494446117416, + "loss": 1.1981, + "step": 6005 + }, + { + "epoch": 0.23137632338787295, + "grad_norm": 1.0233584642410278, + "learning_rate": 0.00019348371300905955, + "loss": 1.2821, + "step": 6010 + }, + { + "epoch": 0.23156881616939365, + "grad_norm": 1.3905096054077148, + "learning_rate": 0.0001934729713522368, + "loss": 1.3471, + "step": 6015 + }, + { + "epoch": 0.23176130895091435, + "grad_norm": 1.345563292503357, + "learning_rate": 0.00019346222114793, + "loss": 1.0454, + "step": 6020 + }, + { + "epoch": 0.23195380173243504, + "grad_norm": 0.739811897277832, + "learning_rate": 0.00019345146239712225, + "loss": 1.3125, + "step": 6025 + }, + { + "epoch": 0.23214629451395571, + "grad_norm": 1.977918028831482, + "learning_rate": 0.0001934406951007973, + "loss": 1.3328, + "step": 6030 + }, + { + "epoch": 0.2323387872954764, + "grad_norm": 0.9505223035812378, + "learning_rate": 0.00019342991925993977, + "loss": 1.1388, + "step": 6035 + }, + { + "epoch": 0.2325312800769971, + "grad_norm": 1.257755160331726, + "learning_rate": 0.00019341913487553502, + "loss": 1.3064, + "step": 6040 + }, + { + "epoch": 0.2327237728585178, + "grad_norm": 1.2003203630447388, + "learning_rate": 0.00019340834194856926, + "loss": 1.4369, + "step": 6045 + }, + { + "epoch": 0.2329162656400385, + "grad_norm": 1.2289738655090332, + "learning_rate": 0.0001933975404800294, + "loss": 1.1462, + "step": 6050 + }, + { + "epoch": 0.2331087584215592, + "grad_norm": 1.227171540260315, + "learning_rate": 0.00019338673047090317, + "loss": 1.1829, + "step": 6055 + }, + { + "epoch": 0.23330125120307987, + "grad_norm": 1.2766560316085815, + "learning_rate": 0.00019337591192217904, + "loss": 1.2572, + "step": 6060 + }, + { + "epoch": 0.23349374398460057, + "grad_norm": 2.6716904640197754, + "learning_rate": 0.00019336508483484634, + "loss": 1.0195, + "step": 6065 + }, + { + "epoch": 0.23368623676612127, + "grad_norm": 1.1586931943893433, + "learning_rate": 0.00019335424920989512, + "loss": 1.4932, + "step": 6070 + }, + { + "epoch": 0.23387872954764197, + "grad_norm": 1.0196670293807983, + "learning_rate": 0.00019334340504831624, + "loss": 1.3497, + "step": 6075 + }, + { + "epoch": 0.23407122232916266, + "grad_norm": 1.6527109146118164, + "learning_rate": 0.00019333255235110127, + "loss": 1.1239, + "step": 6080 + }, + { + "epoch": 0.23426371511068336, + "grad_norm": 0.9913870096206665, + "learning_rate": 0.00019332169111924271, + "loss": 1.2757, + "step": 6085 + }, + { + "epoch": 0.23445620789220403, + "grad_norm": 1.1027697324752808, + "learning_rate": 0.00019331082135373367, + "loss": 1.2512, + "step": 6090 + }, + { + "epoch": 0.23464870067372473, + "grad_norm": 1.9269218444824219, + "learning_rate": 0.00019329994305556815, + "loss": 1.4698, + "step": 6095 + }, + { + "epoch": 0.23484119345524543, + "grad_norm": 1.1504942178726196, + "learning_rate": 0.00019328905622574086, + "loss": 1.4844, + "step": 6100 + }, + { + "epoch": 0.23503368623676613, + "grad_norm": 1.1164321899414062, + "learning_rate": 0.0001932781608652474, + "loss": 1.2972, + "step": 6105 + }, + { + "epoch": 0.23522617901828682, + "grad_norm": 1.283000111579895, + "learning_rate": 0.00019326725697508407, + "loss": 1.3117, + "step": 6110 + }, + { + "epoch": 0.2354186717998075, + "grad_norm": 1.3553595542907715, + "learning_rate": 0.00019325634455624787, + "loss": 1.027, + "step": 6115 + }, + { + "epoch": 0.2356111645813282, + "grad_norm": 2.1605517864227295, + "learning_rate": 0.00019324542360973674, + "loss": 1.2211, + "step": 6120 + }, + { + "epoch": 0.2358036573628489, + "grad_norm": 1.1028283834457397, + "learning_rate": 0.00019323449413654933, + "loss": 1.3034, + "step": 6125 + }, + { + "epoch": 0.2359961501443696, + "grad_norm": 1.1728841066360474, + "learning_rate": 0.00019322355613768505, + "loss": 1.3135, + "step": 6130 + }, + { + "epoch": 0.23618864292589029, + "grad_norm": 1.7304178476333618, + "learning_rate": 0.0001932126096141441, + "loss": 1.3516, + "step": 6135 + }, + { + "epoch": 0.23638113570741098, + "grad_norm": 1.3326451778411865, + "learning_rate": 0.00019320165456692748, + "loss": 1.3371, + "step": 6140 + }, + { + "epoch": 0.23657362848893165, + "grad_norm": 1.6894330978393555, + "learning_rate": 0.00019319069099703697, + "loss": 1.2126, + "step": 6145 + }, + { + "epoch": 0.23676612127045235, + "grad_norm": 1.7248213291168213, + "learning_rate": 0.0001931797189054751, + "loss": 1.193, + "step": 6150 + }, + { + "epoch": 0.23695861405197305, + "grad_norm": 1.1517174243927002, + "learning_rate": 0.0001931687382932452, + "loss": 1.1472, + "step": 6155 + }, + { + "epoch": 0.23715110683349375, + "grad_norm": 2.4606590270996094, + "learning_rate": 0.00019315774916135134, + "loss": 1.524, + "step": 6160 + }, + { + "epoch": 0.23734359961501444, + "grad_norm": 1.6130386590957642, + "learning_rate": 0.00019314675151079844, + "loss": 1.052, + "step": 6165 + }, + { + "epoch": 0.23753609239653514, + "grad_norm": 1.3845412731170654, + "learning_rate": 0.00019313574534259216, + "loss": 1.2557, + "step": 6170 + }, + { + "epoch": 0.2377285851780558, + "grad_norm": 1.3509567975997925, + "learning_rate": 0.00019312473065773893, + "loss": 1.3083, + "step": 6175 + }, + { + "epoch": 0.2379210779595765, + "grad_norm": 1.358113408088684, + "learning_rate": 0.000193113707457246, + "loss": 1.2226, + "step": 6180 + }, + { + "epoch": 0.2381135707410972, + "grad_norm": 0.9598337411880493, + "learning_rate": 0.00019310267574212134, + "loss": 1.1861, + "step": 6185 + }, + { + "epoch": 0.2383060635226179, + "grad_norm": 1.347159743309021, + "learning_rate": 0.0001930916355133737, + "loss": 1.2782, + "step": 6190 + }, + { + "epoch": 0.2384985563041386, + "grad_norm": 1.0227164030075073, + "learning_rate": 0.0001930805867720127, + "loss": 1.2909, + "step": 6195 + }, + { + "epoch": 0.2386910490856593, + "grad_norm": 1.8373135328292847, + "learning_rate": 0.00019306952951904865, + "loss": 1.3371, + "step": 6200 + }, + { + "epoch": 0.23888354186717997, + "grad_norm": 2.130218267440796, + "learning_rate": 0.00019305846375549263, + "loss": 1.3275, + "step": 6205 + }, + { + "epoch": 0.23907603464870067, + "grad_norm": 1.3699109554290771, + "learning_rate": 0.00019304738948235656, + "loss": 1.172, + "step": 6210 + }, + { + "epoch": 0.23926852743022137, + "grad_norm": 1.8254964351654053, + "learning_rate": 0.0001930363067006531, + "loss": 1.166, + "step": 6215 + }, + { + "epoch": 0.23946102021174206, + "grad_norm": 2.6475026607513428, + "learning_rate": 0.00019302521541139571, + "loss": 1.3168, + "step": 6220 + }, + { + "epoch": 0.23965351299326276, + "grad_norm": 1.4869440793991089, + "learning_rate": 0.0001930141156155986, + "loss": 1.1112, + "step": 6225 + }, + { + "epoch": 0.23984600577478343, + "grad_norm": 1.0316526889801025, + "learning_rate": 0.00019300300731427678, + "loss": 1.3845, + "step": 6230 + }, + { + "epoch": 0.24003849855630413, + "grad_norm": 1.1549556255340576, + "learning_rate": 0.00019299189050844603, + "loss": 1.378, + "step": 6235 + }, + { + "epoch": 0.24023099133782483, + "grad_norm": 1.9833987951278687, + "learning_rate": 0.00019298076519912294, + "loss": 1.2631, + "step": 6240 + }, + { + "epoch": 0.24042348411934553, + "grad_norm": 1.1354988813400269, + "learning_rate": 0.00019296963138732478, + "loss": 1.6525, + "step": 6245 + }, + { + "epoch": 0.24061597690086622, + "grad_norm": 1.6483670473098755, + "learning_rate": 0.0001929584890740697, + "loss": 0.9828, + "step": 6250 + }, + { + "epoch": 0.24080846968238692, + "grad_norm": 1.537610650062561, + "learning_rate": 0.00019294733826037659, + "loss": 1.3566, + "step": 6255 + }, + { + "epoch": 0.2410009624639076, + "grad_norm": 1.207406759262085, + "learning_rate": 0.0001929361789472651, + "loss": 1.3306, + "step": 6260 + }, + { + "epoch": 0.2411934552454283, + "grad_norm": 1.4772666692733765, + "learning_rate": 0.00019292501113575572, + "loss": 1.3117, + "step": 6265 + }, + { + "epoch": 0.241385948026949, + "grad_norm": 1.8285613059997559, + "learning_rate": 0.00019291383482686962, + "loss": 1.3711, + "step": 6270 + }, + { + "epoch": 0.24157844080846969, + "grad_norm": 0.9223503470420837, + "learning_rate": 0.00019290265002162884, + "loss": 1.1712, + "step": 6275 + }, + { + "epoch": 0.24177093358999038, + "grad_norm": 2.1818087100982666, + "learning_rate": 0.00019289145672105612, + "loss": 1.1596, + "step": 6280 + }, + { + "epoch": 0.24196342637151108, + "grad_norm": 0.8749092817306519, + "learning_rate": 0.00019288025492617504, + "loss": 1.0726, + "step": 6285 + }, + { + "epoch": 0.24215591915303175, + "grad_norm": 1.1598855257034302, + "learning_rate": 0.00019286904463800995, + "loss": 1.2931, + "step": 6290 + }, + { + "epoch": 0.24234841193455245, + "grad_norm": 1.4357101917266846, + "learning_rate": 0.0001928578258575859, + "loss": 1.2612, + "step": 6295 + }, + { + "epoch": 0.24254090471607315, + "grad_norm": 0.9731203317642212, + "learning_rate": 0.0001928465985859288, + "loss": 1.178, + "step": 6300 + }, + { + "epoch": 0.24273339749759384, + "grad_norm": 1.1217381954193115, + "learning_rate": 0.00019283536282406534, + "loss": 1.285, + "step": 6305 + }, + { + "epoch": 0.24292589027911454, + "grad_norm": 1.415860891342163, + "learning_rate": 0.0001928241185730229, + "loss": 1.399, + "step": 6310 + }, + { + "epoch": 0.2431183830606352, + "grad_norm": 0.9067175388336182, + "learning_rate": 0.00019281286583382973, + "loss": 1.2336, + "step": 6315 + }, + { + "epoch": 0.2433108758421559, + "grad_norm": 1.6320233345031738, + "learning_rate": 0.0001928016046075148, + "loss": 1.4348, + "step": 6320 + }, + { + "epoch": 0.2435033686236766, + "grad_norm": 1.3945854902267456, + "learning_rate": 0.0001927903348951079, + "loss": 1.1614, + "step": 6325 + }, + { + "epoch": 0.2436958614051973, + "grad_norm": 1.37948477268219, + "learning_rate": 0.00019277905669763952, + "loss": 1.2058, + "step": 6330 + }, + { + "epoch": 0.243888354186718, + "grad_norm": 1.3325083255767822, + "learning_rate": 0.00019276777001614104, + "loss": 1.2737, + "step": 6335 + }, + { + "epoch": 0.2440808469682387, + "grad_norm": 1.5902581214904785, + "learning_rate": 0.00019275647485164453, + "loss": 1.3706, + "step": 6340 + }, + { + "epoch": 0.24427333974975937, + "grad_norm": 1.1309142112731934, + "learning_rate": 0.00019274517120518284, + "loss": 1.2408, + "step": 6345 + }, + { + "epoch": 0.24446583253128007, + "grad_norm": 1.9998489618301392, + "learning_rate": 0.0001927338590777896, + "loss": 1.3079, + "step": 6350 + }, + { + "epoch": 0.24465832531280077, + "grad_norm": 1.569667100906372, + "learning_rate": 0.00019272253847049927, + "loss": 1.2365, + "step": 6355 + }, + { + "epoch": 0.24485081809432147, + "grad_norm": 1.2294694185256958, + "learning_rate": 0.00019271120938434702, + "loss": 1.3544, + "step": 6360 + }, + { + "epoch": 0.24504331087584216, + "grad_norm": 1.9876806735992432, + "learning_rate": 0.00019269987182036883, + "loss": 1.3675, + "step": 6365 + }, + { + "epoch": 0.24523580365736286, + "grad_norm": 1.3317819833755493, + "learning_rate": 0.0001926885257796015, + "loss": 1.0949, + "step": 6370 + }, + { + "epoch": 0.24542829643888353, + "grad_norm": 1.7602546215057373, + "learning_rate": 0.00019267717126308242, + "loss": 1.3168, + "step": 6375 + }, + { + "epoch": 0.24562078922040423, + "grad_norm": 1.5651274919509888, + "learning_rate": 0.00019266580827184996, + "loss": 1.2802, + "step": 6380 + }, + { + "epoch": 0.24581328200192493, + "grad_norm": 0.9537544846534729, + "learning_rate": 0.0001926544368069432, + "loss": 1.1876, + "step": 6385 + }, + { + "epoch": 0.24600577478344562, + "grad_norm": 0.9649773240089417, + "learning_rate": 0.000192643056869402, + "loss": 1.1378, + "step": 6390 + }, + { + "epoch": 0.24619826756496632, + "grad_norm": 1.6363686323165894, + "learning_rate": 0.00019263166846026692, + "loss": 1.3284, + "step": 6395 + }, + { + "epoch": 0.24639076034648702, + "grad_norm": 1.748897910118103, + "learning_rate": 0.00019262027158057943, + "loss": 1.4314, + "step": 6400 + }, + { + "epoch": 0.2465832531280077, + "grad_norm": 2.138967990875244, + "learning_rate": 0.00019260886623138164, + "loss": 1.2244, + "step": 6405 + }, + { + "epoch": 0.2467757459095284, + "grad_norm": 2.517312526702881, + "learning_rate": 0.0001925974524137165, + "loss": 1.3394, + "step": 6410 + }, + { + "epoch": 0.24696823869104909, + "grad_norm": 1.7510714530944824, + "learning_rate": 0.00019258603012862772, + "loss": 1.3369, + "step": 6415 + }, + { + "epoch": 0.24716073147256978, + "grad_norm": 1.1651504039764404, + "learning_rate": 0.00019257459937715985, + "loss": 1.2953, + "step": 6420 + }, + { + "epoch": 0.24735322425409048, + "grad_norm": 1.325554609298706, + "learning_rate": 0.0001925631601603581, + "loss": 1.3062, + "step": 6425 + }, + { + "epoch": 0.24754571703561115, + "grad_norm": 1.0340043306350708, + "learning_rate": 0.00019255171247926852, + "loss": 1.337, + "step": 6430 + }, + { + "epoch": 0.24773820981713185, + "grad_norm": 1.677131175994873, + "learning_rate": 0.00019254025633493792, + "loss": 1.3179, + "step": 6435 + }, + { + "epoch": 0.24793070259865255, + "grad_norm": 2.475339651107788, + "learning_rate": 0.00019252879172841395, + "loss": 1.4765, + "step": 6440 + }, + { + "epoch": 0.24812319538017324, + "grad_norm": 1.1302917003631592, + "learning_rate": 0.00019251731866074486, + "loss": 1.3029, + "step": 6445 + }, + { + "epoch": 0.24831568816169394, + "grad_norm": 1.3425379991531372, + "learning_rate": 0.0001925058371329799, + "loss": 1.1263, + "step": 6450 + }, + { + "epoch": 0.24850818094321464, + "grad_norm": 1.0058633089065552, + "learning_rate": 0.0001924943471461689, + "loss": 1.1059, + "step": 6455 + }, + { + "epoch": 0.2487006737247353, + "grad_norm": 1.9793190956115723, + "learning_rate": 0.0001924828487013626, + "loss": 1.5268, + "step": 6460 + }, + { + "epoch": 0.248893166506256, + "grad_norm": 1.0673744678497314, + "learning_rate": 0.00019247134179961242, + "loss": 1.2199, + "step": 6465 + }, + { + "epoch": 0.2490856592877767, + "grad_norm": 1.1182838678359985, + "learning_rate": 0.00019245982644197057, + "loss": 1.5456, + "step": 6470 + }, + { + "epoch": 0.2492781520692974, + "grad_norm": 0.9264312982559204, + "learning_rate": 0.00019244830262949014, + "loss": 1.2367, + "step": 6475 + }, + { + "epoch": 0.2494706448508181, + "grad_norm": 1.2094528675079346, + "learning_rate": 0.00019243677036322478, + "loss": 1.2026, + "step": 6480 + }, + { + "epoch": 0.2496631376323388, + "grad_norm": 1.275902509689331, + "learning_rate": 0.00019242522964422917, + "loss": 1.206, + "step": 6485 + }, + { + "epoch": 0.24985563041385947, + "grad_norm": 1.515559434890747, + "learning_rate": 0.00019241368047355853, + "loss": 1.2222, + "step": 6490 + }, + { + "epoch": 0.25004812319538017, + "grad_norm": 0.9974495768547058, + "learning_rate": 0.000192402122852269, + "loss": 1.5274, + "step": 6495 + }, + { + "epoch": 0.2502406159769009, + "grad_norm": 1.8940407037734985, + "learning_rate": 0.00019239055678141746, + "loss": 1.3639, + "step": 6500 + }, + { + "epoch": 0.25043310875842156, + "grad_norm": 1.7484371662139893, + "learning_rate": 0.00019237898226206153, + "loss": 1.3517, + "step": 6505 + }, + { + "epoch": 0.25062560153994223, + "grad_norm": 1.004660725593567, + "learning_rate": 0.00019236739929525963, + "loss": 1.0603, + "step": 6510 + }, + { + "epoch": 0.25081809432146296, + "grad_norm": 0.9729489684104919, + "learning_rate": 0.00019235580788207093, + "loss": 1.3252, + "step": 6515 + }, + { + "epoch": 0.25101058710298363, + "grad_norm": 0.4645654857158661, + "learning_rate": 0.00019234420802355539, + "loss": 1.1804, + "step": 6520 + }, + { + "epoch": 0.25120307988450435, + "grad_norm": 1.0810743570327759, + "learning_rate": 0.00019233259972077378, + "loss": 1.3045, + "step": 6525 + }, + { + "epoch": 0.251395572666025, + "grad_norm": 1.1666224002838135, + "learning_rate": 0.00019232098297478756, + "loss": 1.324, + "step": 6530 + }, + { + "epoch": 0.2515880654475457, + "grad_norm": 1.06947660446167, + "learning_rate": 0.000192309357786659, + "loss": 1.3131, + "step": 6535 + }, + { + "epoch": 0.2517805582290664, + "grad_norm": 1.1774028539657593, + "learning_rate": 0.0001922977241574512, + "loss": 1.301, + "step": 6540 + }, + { + "epoch": 0.2519730510105871, + "grad_norm": 1.528041958808899, + "learning_rate": 0.0001922860820882279, + "loss": 1.2542, + "step": 6545 + }, + { + "epoch": 0.2521655437921078, + "grad_norm": 1.1932915449142456, + "learning_rate": 0.00019227443158005377, + "loss": 1.125, + "step": 6550 + }, + { + "epoch": 0.2523580365736285, + "grad_norm": 1.3258370161056519, + "learning_rate": 0.0001922627726339941, + "loss": 1.3776, + "step": 6555 + }, + { + "epoch": 0.25255052935514916, + "grad_norm": 0.994076132774353, + "learning_rate": 0.0001922511052511151, + "loss": 1.0908, + "step": 6560 + }, + { + "epoch": 0.2527430221366699, + "grad_norm": 1.0820032358169556, + "learning_rate": 0.00019223942943248358, + "loss": 1.215, + "step": 6565 + }, + { + "epoch": 0.25293551491819055, + "grad_norm": 0.9792138338088989, + "learning_rate": 0.00019222774517916734, + "loss": 1.2413, + "step": 6570 + }, + { + "epoch": 0.2531280076997113, + "grad_norm": 1.1704801321029663, + "learning_rate": 0.0001922160524922347, + "loss": 1.5203, + "step": 6575 + }, + { + "epoch": 0.25332050048123195, + "grad_norm": 1.6249198913574219, + "learning_rate": 0.00019220435137275494, + "loss": 1.2771, + "step": 6580 + }, + { + "epoch": 0.2535129932627527, + "grad_norm": 1.3218034505844116, + "learning_rate": 0.00019219264182179804, + "loss": 1.4433, + "step": 6585 + }, + { + "epoch": 0.25370548604427334, + "grad_norm": 1.7230724096298218, + "learning_rate": 0.0001921809238404348, + "loss": 1.1069, + "step": 6590 + }, + { + "epoch": 0.253897978825794, + "grad_norm": 1.3148738145828247, + "learning_rate": 0.00019216919742973669, + "loss": 1.2386, + "step": 6595 + }, + { + "epoch": 0.25409047160731474, + "grad_norm": 1.257513403892517, + "learning_rate": 0.00019215746259077605, + "loss": 1.3476, + "step": 6600 + }, + { + "epoch": 0.2542829643888354, + "grad_norm": 0.965403139591217, + "learning_rate": 0.00019214571932462592, + "loss": 1.1045, + "step": 6605 + }, + { + "epoch": 0.25447545717035613, + "grad_norm": 0.8903887867927551, + "learning_rate": 0.0001921339676323602, + "loss": 1.1481, + "step": 6610 + }, + { + "epoch": 0.2546679499518768, + "grad_norm": 1.284529209136963, + "learning_rate": 0.00019212220751505345, + "loss": 1.3179, + "step": 6615 + }, + { + "epoch": 0.2548604427333975, + "grad_norm": 2.3491082191467285, + "learning_rate": 0.0001921104389737811, + "loss": 1.3042, + "step": 6620 + }, + { + "epoch": 0.2550529355149182, + "grad_norm": 1.4170057773590088, + "learning_rate": 0.00019209866200961927, + "loss": 1.3775, + "step": 6625 + }, + { + "epoch": 0.25524542829643887, + "grad_norm": 1.4182847738265991, + "learning_rate": 0.00019208687662364488, + "loss": 1.3895, + "step": 6630 + }, + { + "epoch": 0.2554379210779596, + "grad_norm": 1.2162110805511475, + "learning_rate": 0.00019207508281693568, + "loss": 1.0754, + "step": 6635 + }, + { + "epoch": 0.25563041385948027, + "grad_norm": 1.473873257637024, + "learning_rate": 0.00019206328059057006, + "loss": 1.3323, + "step": 6640 + }, + { + "epoch": 0.25582290664100094, + "grad_norm": 1.2990386486053467, + "learning_rate": 0.0001920514699456273, + "loss": 1.2304, + "step": 6645 + }, + { + "epoch": 0.25601539942252166, + "grad_norm": 1.2828303575515747, + "learning_rate": 0.00019203965088318743, + "loss": 1.2566, + "step": 6650 + }, + { + "epoch": 0.25620789220404233, + "grad_norm": 0.9165570735931396, + "learning_rate": 0.00019202782340433115, + "loss": 1.2186, + "step": 6655 + }, + { + "epoch": 0.25640038498556306, + "grad_norm": 2.0381886959075928, + "learning_rate": 0.00019201598751014006, + "loss": 1.114, + "step": 6660 + }, + { + "epoch": 0.2565928777670837, + "grad_norm": 1.252790093421936, + "learning_rate": 0.00019200414320169647, + "loss": 1.2354, + "step": 6665 + }, + { + "epoch": 0.25678537054860445, + "grad_norm": 1.1557594537734985, + "learning_rate": 0.00019199229048008347, + "loss": 1.3652, + "step": 6670 + }, + { + "epoch": 0.2569778633301251, + "grad_norm": 1.356181025505066, + "learning_rate": 0.0001919804293463849, + "loss": 1.1026, + "step": 6675 + }, + { + "epoch": 0.2571703561116458, + "grad_norm": 1.2493314743041992, + "learning_rate": 0.00019196855980168536, + "loss": 1.2225, + "step": 6680 + }, + { + "epoch": 0.2573628488931665, + "grad_norm": 1.7480677366256714, + "learning_rate": 0.00019195668184707025, + "loss": 1.2898, + "step": 6685 + }, + { + "epoch": 0.2575553416746872, + "grad_norm": 1.0522620677947998, + "learning_rate": 0.00019194479548362577, + "loss": 1.1404, + "step": 6690 + }, + { + "epoch": 0.2577478344562079, + "grad_norm": 1.4085676670074463, + "learning_rate": 0.00019193290071243882, + "loss": 1.5024, + "step": 6695 + }, + { + "epoch": 0.2579403272377286, + "grad_norm": 1.393096923828125, + "learning_rate": 0.0001919209975345971, + "loss": 1.2555, + "step": 6700 + }, + { + "epoch": 0.25813282001924925, + "grad_norm": 1.5740808248519897, + "learning_rate": 0.00019190908595118907, + "loss": 1.2362, + "step": 6705 + }, + { + "epoch": 0.25832531280077, + "grad_norm": 1.3243273496627808, + "learning_rate": 0.00019189716596330395, + "loss": 1.2517, + "step": 6710 + }, + { + "epoch": 0.25851780558229065, + "grad_norm": 2.5867626667022705, + "learning_rate": 0.00019188523757203177, + "loss": 1.3509, + "step": 6715 + }, + { + "epoch": 0.2587102983638114, + "grad_norm": 1.450181484222412, + "learning_rate": 0.00019187330077846334, + "loss": 1.3451, + "step": 6720 + }, + { + "epoch": 0.25890279114533205, + "grad_norm": 1.4387754201889038, + "learning_rate": 0.0001918613555836901, + "loss": 1.2518, + "step": 6725 + }, + { + "epoch": 0.25909528392685277, + "grad_norm": 1.427882432937622, + "learning_rate": 0.00019184940198880448, + "loss": 1.235, + "step": 6730 + }, + { + "epoch": 0.25928777670837344, + "grad_norm": 1.060436487197876, + "learning_rate": 0.00019183743999489947, + "loss": 1.4583, + "step": 6735 + }, + { + "epoch": 0.2594802694898941, + "grad_norm": 1.0780494213104248, + "learning_rate": 0.00019182546960306893, + "loss": 1.1134, + "step": 6740 + }, + { + "epoch": 0.25967276227141484, + "grad_norm": 1.3795710802078247, + "learning_rate": 0.0001918134908144075, + "loss": 1.2979, + "step": 6745 + }, + { + "epoch": 0.2598652550529355, + "grad_norm": 2.0972957611083984, + "learning_rate": 0.00019180150363001051, + "loss": 1.6512, + "step": 6750 + }, + { + "epoch": 0.26005774783445623, + "grad_norm": 1.129204273223877, + "learning_rate": 0.00019178950805097416, + "loss": 1.2263, + "step": 6755 + }, + { + "epoch": 0.2602502406159769, + "grad_norm": 0.8816843628883362, + "learning_rate": 0.00019177750407839536, + "loss": 1.2265, + "step": 6760 + }, + { + "epoch": 0.26044273339749757, + "grad_norm": 1.5167860984802246, + "learning_rate": 0.00019176549171337178, + "loss": 1.226, + "step": 6765 + }, + { + "epoch": 0.2606352261790183, + "grad_norm": 1.329172968864441, + "learning_rate": 0.00019175347095700188, + "loss": 1.3375, + "step": 6770 + }, + { + "epoch": 0.26082771896053897, + "grad_norm": 1.8215051889419556, + "learning_rate": 0.00019174144181038485, + "loss": 1.2453, + "step": 6775 + }, + { + "epoch": 0.2610202117420597, + "grad_norm": 1.147878646850586, + "learning_rate": 0.00019172940427462072, + "loss": 1.3137, + "step": 6780 + }, + { + "epoch": 0.26121270452358036, + "grad_norm": 1.5783206224441528, + "learning_rate": 0.0001917173583508102, + "loss": 1.1803, + "step": 6785 + }, + { + "epoch": 0.26140519730510103, + "grad_norm": 1.7433182001113892, + "learning_rate": 0.00019170530404005485, + "loss": 1.171, + "step": 6790 + }, + { + "epoch": 0.26159769008662176, + "grad_norm": 1.5278960466384888, + "learning_rate": 0.0001916932413434569, + "loss": 1.2274, + "step": 6795 + }, + { + "epoch": 0.26179018286814243, + "grad_norm": 1.375710368156433, + "learning_rate": 0.00019168117026211948, + "loss": 1.241, + "step": 6800 + }, + { + "epoch": 0.26198267564966315, + "grad_norm": 2.146165370941162, + "learning_rate": 0.00019166909079714636, + "loss": 1.2778, + "step": 6805 + }, + { + "epoch": 0.2621751684311838, + "grad_norm": 1.7670506238937378, + "learning_rate": 0.00019165700294964216, + "loss": 1.3293, + "step": 6810 + }, + { + "epoch": 0.26236766121270455, + "grad_norm": 1.5492186546325684, + "learning_rate": 0.00019164490672071217, + "loss": 1.2808, + "step": 6815 + }, + { + "epoch": 0.2625601539942252, + "grad_norm": 1.4138727188110352, + "learning_rate": 0.00019163280211146257, + "loss": 1.2352, + "step": 6820 + }, + { + "epoch": 0.2627526467757459, + "grad_norm": 1.185674786567688, + "learning_rate": 0.00019162068912300024, + "loss": 1.1883, + "step": 6825 + }, + { + "epoch": 0.2629451395572666, + "grad_norm": 1.717349886894226, + "learning_rate": 0.0001916085677564328, + "loss": 1.1329, + "step": 6830 + }, + { + "epoch": 0.2631376323387873, + "grad_norm": 1.1391080617904663, + "learning_rate": 0.00019159643801286872, + "loss": 1.4104, + "step": 6835 + }, + { + "epoch": 0.263330125120308, + "grad_norm": 1.0915690660476685, + "learning_rate": 0.00019158429989341716, + "loss": 1.2813, + "step": 6840 + }, + { + "epoch": 0.2635226179018287, + "grad_norm": 1.120492696762085, + "learning_rate": 0.000191572153399188, + "loss": 1.2669, + "step": 6845 + }, + { + "epoch": 0.26371511068334935, + "grad_norm": 1.0648150444030762, + "learning_rate": 0.0001915599985312921, + "loss": 1.2581, + "step": 6850 + }, + { + "epoch": 0.2639076034648701, + "grad_norm": 1.7173513174057007, + "learning_rate": 0.0001915478352908408, + "loss": 1.2081, + "step": 6855 + }, + { + "epoch": 0.26410009624639075, + "grad_norm": 1.3801002502441406, + "learning_rate": 0.00019153566367894644, + "loss": 1.4625, + "step": 6860 + }, + { + "epoch": 0.2642925890279115, + "grad_norm": 2.5863940715789795, + "learning_rate": 0.00019152348369672203, + "loss": 1.4777, + "step": 6865 + }, + { + "epoch": 0.26448508180943214, + "grad_norm": 1.5995707511901855, + "learning_rate": 0.0001915112953452813, + "loss": 1.2089, + "step": 6870 + }, + { + "epoch": 0.2646775745909528, + "grad_norm": 1.2661023139953613, + "learning_rate": 0.0001914990986257388, + "loss": 1.1937, + "step": 6875 + }, + { + "epoch": 0.26487006737247354, + "grad_norm": 1.4782702922821045, + "learning_rate": 0.00019148689353920987, + "loss": 1.2462, + "step": 6880 + }, + { + "epoch": 0.2650625601539942, + "grad_norm": 1.8557063341140747, + "learning_rate": 0.0001914746800868106, + "loss": 1.425, + "step": 6885 + }, + { + "epoch": 0.26525505293551493, + "grad_norm": 2.825359582901001, + "learning_rate": 0.00019146245826965775, + "loss": 1.3628, + "step": 6890 + }, + { + "epoch": 0.2654475457170356, + "grad_norm": 1.7262654304504395, + "learning_rate": 0.00019145022808886902, + "loss": 1.2902, + "step": 6895 + }, + { + "epoch": 0.26564003849855633, + "grad_norm": 0.9676236510276794, + "learning_rate": 0.00019143798954556268, + "loss": 1.3342, + "step": 6900 + }, + { + "epoch": 0.265832531280077, + "grad_norm": 1.4607850313186646, + "learning_rate": 0.00019142574264085797, + "loss": 1.3084, + "step": 6905 + }, + { + "epoch": 0.26602502406159767, + "grad_norm": 2.181511878967285, + "learning_rate": 0.0001914134873758747, + "loss": 1.1746, + "step": 6910 + }, + { + "epoch": 0.2662175168431184, + "grad_norm": 1.4534579515457153, + "learning_rate": 0.00019140122375173362, + "loss": 1.3071, + "step": 6915 + }, + { + "epoch": 0.26641000962463907, + "grad_norm": 1.607039213180542, + "learning_rate": 0.00019138895176955604, + "loss": 1.2883, + "step": 6920 + }, + { + "epoch": 0.2666025024061598, + "grad_norm": 0.9929762482643127, + "learning_rate": 0.00019137667143046425, + "loss": 1.1122, + "step": 6925 + }, + { + "epoch": 0.26679499518768046, + "grad_norm": 1.6732393503189087, + "learning_rate": 0.0001913643827355812, + "loss": 1.149, + "step": 6930 + }, + { + "epoch": 0.26698748796920113, + "grad_norm": 1.3785120248794556, + "learning_rate": 0.0001913520856860305, + "loss": 1.3759, + "step": 6935 + }, + { + "epoch": 0.26717998075072186, + "grad_norm": 1.8252770900726318, + "learning_rate": 0.0001913397802829368, + "loss": 1.2633, + "step": 6940 + }, + { + "epoch": 0.2673724735322425, + "grad_norm": 1.6789536476135254, + "learning_rate": 0.0001913274665274252, + "loss": 1.2741, + "step": 6945 + }, + { + "epoch": 0.26756496631376325, + "grad_norm": 2.0153861045837402, + "learning_rate": 0.00019131514442062184, + "loss": 1.196, + "step": 6950 + }, + { + "epoch": 0.2677574590952839, + "grad_norm": 1.0000704526901245, + "learning_rate": 0.0001913028139636534, + "loss": 1.1872, + "step": 6955 + }, + { + "epoch": 0.2679499518768046, + "grad_norm": 1.2803142070770264, + "learning_rate": 0.00019129047515764743, + "loss": 1.2655, + "step": 6960 + }, + { + "epoch": 0.2681424446583253, + "grad_norm": 0.9827659130096436, + "learning_rate": 0.00019127812800373225, + "loss": 1.3503, + "step": 6965 + }, + { + "epoch": 0.268334937439846, + "grad_norm": 1.3766348361968994, + "learning_rate": 0.00019126577250303697, + "loss": 1.2851, + "step": 6970 + }, + { + "epoch": 0.2685274302213667, + "grad_norm": 2.285708427429199, + "learning_rate": 0.00019125340865669134, + "loss": 1.3247, + "step": 6975 + }, + { + "epoch": 0.2687199230028874, + "grad_norm": 1.79937744140625, + "learning_rate": 0.000191241036465826, + "loss": 1.0306, + "step": 6980 + }, + { + "epoch": 0.2689124157844081, + "grad_norm": 1.6062885522842407, + "learning_rate": 0.0001912286559315723, + "loss": 1.2068, + "step": 6985 + }, + { + "epoch": 0.2691049085659288, + "grad_norm": 1.9590744972229004, + "learning_rate": 0.00019121626705506233, + "loss": 1.2195, + "step": 6990 + }, + { + "epoch": 0.26929740134744945, + "grad_norm": 1.366186261177063, + "learning_rate": 0.000191203869837429, + "loss": 1.1627, + "step": 6995 + }, + { + "epoch": 0.2694898941289702, + "grad_norm": 0.9655261635780334, + "learning_rate": 0.00019119146427980593, + "loss": 1.053, + "step": 7000 + }, + { + "epoch": 0.26968238691049085, + "grad_norm": 1.4636151790618896, + "learning_rate": 0.00019117905038332756, + "loss": 1.0954, + "step": 7005 + }, + { + "epoch": 0.26987487969201157, + "grad_norm": 1.4435783624649048, + "learning_rate": 0.00019116662814912903, + "loss": 1.2102, + "step": 7010 + }, + { + "epoch": 0.27006737247353224, + "grad_norm": 0.9880768060684204, + "learning_rate": 0.00019115419757834628, + "loss": 1.0698, + "step": 7015 + }, + { + "epoch": 0.2702598652550529, + "grad_norm": 1.516515851020813, + "learning_rate": 0.000191141758672116, + "loss": 1.3894, + "step": 7020 + }, + { + "epoch": 0.27045235803657364, + "grad_norm": 2.1763806343078613, + "learning_rate": 0.00019112931143157563, + "loss": 1.3794, + "step": 7025 + }, + { + "epoch": 0.2706448508180943, + "grad_norm": 1.2275705337524414, + "learning_rate": 0.00019111685585786344, + "loss": 1.2897, + "step": 7030 + }, + { + "epoch": 0.27083734359961503, + "grad_norm": 0.966526985168457, + "learning_rate": 0.00019110439195211835, + "loss": 1.2112, + "step": 7035 + }, + { + "epoch": 0.2710298363811357, + "grad_norm": 1.251911997795105, + "learning_rate": 0.00019109191971548016, + "loss": 1.2481, + "step": 7040 + }, + { + "epoch": 0.27122232916265643, + "grad_norm": 2.3555140495300293, + "learning_rate": 0.0001910794391490893, + "loss": 1.3372, + "step": 7045 + }, + { + "epoch": 0.2714148219441771, + "grad_norm": 1.229268193244934, + "learning_rate": 0.0001910669502540871, + "loss": 1.4362, + "step": 7050 + }, + { + "epoch": 0.27160731472569777, + "grad_norm": 1.2356593608856201, + "learning_rate": 0.00019105445303161555, + "loss": 1.379, + "step": 7055 + }, + { + "epoch": 0.2717998075072185, + "grad_norm": 1.910232424736023, + "learning_rate": 0.00019104194748281747, + "loss": 1.2902, + "step": 7060 + }, + { + "epoch": 0.27199230028873916, + "grad_norm": 1.9058904647827148, + "learning_rate": 0.0001910294336088364, + "loss": 1.3313, + "step": 7065 + }, + { + "epoch": 0.2721847930702599, + "grad_norm": 0.8631892800331116, + "learning_rate": 0.0001910169114108166, + "loss": 1.2843, + "step": 7070 + }, + { + "epoch": 0.27237728585178056, + "grad_norm": 1.2212119102478027, + "learning_rate": 0.0001910043808899032, + "loss": 1.2588, + "step": 7075 + }, + { + "epoch": 0.27256977863330123, + "grad_norm": 2.3140738010406494, + "learning_rate": 0.00019099184204724202, + "loss": 1.1781, + "step": 7080 + }, + { + "epoch": 0.27276227141482196, + "grad_norm": 1.0162906646728516, + "learning_rate": 0.00019097929488397965, + "loss": 1.3433, + "step": 7085 + }, + { + "epoch": 0.2729547641963426, + "grad_norm": 1.719766616821289, + "learning_rate": 0.00019096673940126343, + "loss": 1.1469, + "step": 7090 + }, + { + "epoch": 0.27314725697786335, + "grad_norm": 1.5173147916793823, + "learning_rate": 0.00019095417560024153, + "loss": 1.1663, + "step": 7095 + }, + { + "epoch": 0.273339749759384, + "grad_norm": 2.1228654384613037, + "learning_rate": 0.00019094160348206277, + "loss": 1.3433, + "step": 7100 + }, + { + "epoch": 0.2735322425409047, + "grad_norm": 1.3896198272705078, + "learning_rate": 0.00019092902304787679, + "loss": 1.1782, + "step": 7105 + }, + { + "epoch": 0.2737247353224254, + "grad_norm": 1.6935322284698486, + "learning_rate": 0.00019091643429883402, + "loss": 1.1867, + "step": 7110 + }, + { + "epoch": 0.2739172281039461, + "grad_norm": 1.5454139709472656, + "learning_rate": 0.00019090383723608558, + "loss": 1.3938, + "step": 7115 + }, + { + "epoch": 0.2741097208854668, + "grad_norm": 1.1493245363235474, + "learning_rate": 0.00019089123186078342, + "loss": 1.2127, + "step": 7120 + }, + { + "epoch": 0.2743022136669875, + "grad_norm": 1.7321335077285767, + "learning_rate": 0.00019087861817408021, + "loss": 1.3068, + "step": 7125 + }, + { + "epoch": 0.2744947064485082, + "grad_norm": 1.7654987573623657, + "learning_rate": 0.00019086599617712936, + "loss": 1.3236, + "step": 7130 + }, + { + "epoch": 0.2746871992300289, + "grad_norm": 1.0047959089279175, + "learning_rate": 0.0001908533658710851, + "loss": 1.404, + "step": 7135 + }, + { + "epoch": 0.27487969201154955, + "grad_norm": 1.9708582162857056, + "learning_rate": 0.0001908407272571024, + "loss": 1.2387, + "step": 7140 + }, + { + "epoch": 0.2750721847930703, + "grad_norm": 2.097369432449341, + "learning_rate": 0.00019082808033633696, + "loss": 1.189, + "step": 7145 + }, + { + "epoch": 0.27526467757459094, + "grad_norm": 1.1789932250976562, + "learning_rate": 0.00019081542510994523, + "loss": 1.4815, + "step": 7150 + }, + { + "epoch": 0.27545717035611167, + "grad_norm": 1.7205069065093994, + "learning_rate": 0.00019080276157908447, + "loss": 1.2906, + "step": 7155 + }, + { + "epoch": 0.27564966313763234, + "grad_norm": 1.7320606708526611, + "learning_rate": 0.0001907900897449127, + "loss": 1.339, + "step": 7160 + }, + { + "epoch": 0.275842155919153, + "grad_norm": 2.100649356842041, + "learning_rate": 0.00019077740960858863, + "loss": 1.3145, + "step": 7165 + }, + { + "epoch": 0.27603464870067373, + "grad_norm": 1.9302312135696411, + "learning_rate": 0.00019076472117127182, + "loss": 1.3082, + "step": 7170 + }, + { + "epoch": 0.2762271414821944, + "grad_norm": 0.5863549113273621, + "learning_rate": 0.0001907520244341225, + "loss": 1.0183, + "step": 7175 + }, + { + "epoch": 0.27641963426371513, + "grad_norm": 1.0428977012634277, + "learning_rate": 0.00019073931939830174, + "loss": 1.2488, + "step": 7180 + }, + { + "epoch": 0.2766121270452358, + "grad_norm": 1.1643081903457642, + "learning_rate": 0.0001907266060649713, + "loss": 1.476, + "step": 7185 + }, + { + "epoch": 0.27680461982675647, + "grad_norm": 1.0771207809448242, + "learning_rate": 0.00019071388443529376, + "loss": 1.3134, + "step": 7190 + }, + { + "epoch": 0.2769971126082772, + "grad_norm": 1.9787309169769287, + "learning_rate": 0.00019070115451043238, + "loss": 1.3884, + "step": 7195 + }, + { + "epoch": 0.27718960538979787, + "grad_norm": 2.095546245574951, + "learning_rate": 0.0001906884162915513, + "loss": 1.1221, + "step": 7200 + }, + { + "epoch": 0.2773820981713186, + "grad_norm": 2.0389225482940674, + "learning_rate": 0.00019067566977981528, + "loss": 1.0463, + "step": 7205 + }, + { + "epoch": 0.27757459095283926, + "grad_norm": 0.9991855621337891, + "learning_rate": 0.00019066291497638993, + "loss": 1.341, + "step": 7210 + }, + { + "epoch": 0.27776708373436, + "grad_norm": 1.411401391029358, + "learning_rate": 0.0001906501518824416, + "loss": 1.434, + "step": 7215 + }, + { + "epoch": 0.27795957651588066, + "grad_norm": 1.61775803565979, + "learning_rate": 0.0001906373804991374, + "loss": 1.1553, + "step": 7220 + }, + { + "epoch": 0.2781520692974013, + "grad_norm": 2.546022653579712, + "learning_rate": 0.00019062460082764515, + "loss": 1.2496, + "step": 7225 + }, + { + "epoch": 0.27834456207892205, + "grad_norm": 1.2731270790100098, + "learning_rate": 0.00019061181286913348, + "loss": 1.3236, + "step": 7230 + }, + { + "epoch": 0.2785370548604427, + "grad_norm": 1.0163904428482056, + "learning_rate": 0.00019059901662477177, + "loss": 1.2854, + "step": 7235 + }, + { + "epoch": 0.27872954764196345, + "grad_norm": 1.0653849840164185, + "learning_rate": 0.0001905862120957302, + "loss": 1.6351, + "step": 7240 + }, + { + "epoch": 0.2789220404234841, + "grad_norm": 1.081264853477478, + "learning_rate": 0.00019057339928317958, + "loss": 1.2466, + "step": 7245 + }, + { + "epoch": 0.2791145332050048, + "grad_norm": 1.3285462856292725, + "learning_rate": 0.00019056057818829156, + "loss": 1.2087, + "step": 7250 + }, + { + "epoch": 0.2793070259865255, + "grad_norm": 1.067254900932312, + "learning_rate": 0.0001905477488122386, + "loss": 1.3877, + "step": 7255 + }, + { + "epoch": 0.2794995187680462, + "grad_norm": 0.9383085370063782, + "learning_rate": 0.0001905349111561938, + "loss": 1.0643, + "step": 7260 + }, + { + "epoch": 0.2796920115495669, + "grad_norm": 2.7797493934631348, + "learning_rate": 0.00019052206522133117, + "loss": 1.3828, + "step": 7265 + }, + { + "epoch": 0.2798845043310876, + "grad_norm": 1.410261631011963, + "learning_rate": 0.0001905092110088253, + "loss": 1.3019, + "step": 7270 + }, + { + "epoch": 0.28007699711260825, + "grad_norm": 2.313541889190674, + "learning_rate": 0.0001904963485198517, + "loss": 1.2058, + "step": 7275 + }, + { + "epoch": 0.280269489894129, + "grad_norm": 1.4474842548370361, + "learning_rate": 0.00019048347775558645, + "loss": 1.2187, + "step": 7280 + }, + { + "epoch": 0.28046198267564965, + "grad_norm": 1.5846171379089355, + "learning_rate": 0.00019047059871720657, + "loss": 1.0326, + "step": 7285 + }, + { + "epoch": 0.28065447545717037, + "grad_norm": 1.1118413209915161, + "learning_rate": 0.00019045771140588976, + "loss": 1.2881, + "step": 7290 + }, + { + "epoch": 0.28084696823869104, + "grad_norm": 2.5894134044647217, + "learning_rate": 0.00019044481582281448, + "loss": 1.3885, + "step": 7295 + }, + { + "epoch": 0.28103946102021177, + "grad_norm": 1.6019679307937622, + "learning_rate": 0.00019043191196915993, + "loss": 1.3247, + "step": 7300 + }, + { + "epoch": 0.28123195380173244, + "grad_norm": 1.3384417295455933, + "learning_rate": 0.00019041899984610606, + "loss": 1.346, + "step": 7305 + }, + { + "epoch": 0.2814244465832531, + "grad_norm": 1.3584142923355103, + "learning_rate": 0.00019040607945483367, + "loss": 1.3418, + "step": 7310 + }, + { + "epoch": 0.28161693936477383, + "grad_norm": 1.379162073135376, + "learning_rate": 0.00019039315079652416, + "loss": 1.293, + "step": 7315 + }, + { + "epoch": 0.2818094321462945, + "grad_norm": 1.499841570854187, + "learning_rate": 0.00019038021387235982, + "loss": 1.2131, + "step": 7320 + }, + { + "epoch": 0.28200192492781523, + "grad_norm": 1.9813991785049438, + "learning_rate": 0.00019036726868352366, + "loss": 1.3282, + "step": 7325 + }, + { + "epoch": 0.2821944177093359, + "grad_norm": 1.404096245765686, + "learning_rate": 0.00019035431523119938, + "loss": 1.2238, + "step": 7330 + }, + { + "epoch": 0.28238691049085657, + "grad_norm": 1.1089609861373901, + "learning_rate": 0.00019034135351657152, + "loss": 1.1705, + "step": 7335 + }, + { + "epoch": 0.2825794032723773, + "grad_norm": 1.0567266941070557, + "learning_rate": 0.00019032838354082535, + "loss": 1.1228, + "step": 7340 + }, + { + "epoch": 0.28277189605389796, + "grad_norm": 1.2407151460647583, + "learning_rate": 0.00019031540530514685, + "loss": 1.1154, + "step": 7345 + }, + { + "epoch": 0.2829643888354187, + "grad_norm": 1.3094842433929443, + "learning_rate": 0.00019030241881072283, + "loss": 1.2251, + "step": 7350 + }, + { + "epoch": 0.28315688161693936, + "grad_norm": 0.9434831142425537, + "learning_rate": 0.00019028942405874082, + "loss": 1.0644, + "step": 7355 + }, + { + "epoch": 0.2833493743984601, + "grad_norm": 1.107958197593689, + "learning_rate": 0.0001902764210503891, + "loss": 1.295, + "step": 7360 + }, + { + "epoch": 0.28354186717998076, + "grad_norm": 1.4402803182601929, + "learning_rate": 0.00019026340978685666, + "loss": 1.3339, + "step": 7365 + }, + { + "epoch": 0.2837343599615014, + "grad_norm": 1.1564158201217651, + "learning_rate": 0.0001902503902693334, + "loss": 1.252, + "step": 7370 + }, + { + "epoch": 0.28392685274302215, + "grad_norm": 1.8258494138717651, + "learning_rate": 0.00019023736249900973, + "loss": 1.3495, + "step": 7375 + }, + { + "epoch": 0.2841193455245428, + "grad_norm": 1.1436362266540527, + "learning_rate": 0.00019022432647707708, + "loss": 1.4295, + "step": 7380 + }, + { + "epoch": 0.28431183830606355, + "grad_norm": 1.1649361848831177, + "learning_rate": 0.00019021128220472747, + "loss": 1.3438, + "step": 7385 + }, + { + "epoch": 0.2845043310875842, + "grad_norm": 1.7044711112976074, + "learning_rate": 0.00019019822968315364, + "loss": 1.2735, + "step": 7390 + }, + { + "epoch": 0.2846968238691049, + "grad_norm": 0.8998376727104187, + "learning_rate": 0.00019018516891354924, + "loss": 1.1817, + "step": 7395 + }, + { + "epoch": 0.2848893166506256, + "grad_norm": 1.8617538213729858, + "learning_rate": 0.00019017209989710855, + "loss": 1.3235, + "step": 7400 + }, + { + "epoch": 0.2850818094321463, + "grad_norm": 0.9981639981269836, + "learning_rate": 0.00019015902263502669, + "loss": 1.1171, + "step": 7405 + }, + { + "epoch": 0.285274302213667, + "grad_norm": 0.935457170009613, + "learning_rate": 0.00019014593712849944, + "loss": 1.1926, + "step": 7410 + }, + { + "epoch": 0.2854667949951877, + "grad_norm": 1.3465532064437866, + "learning_rate": 0.00019013284337872341, + "loss": 1.5102, + "step": 7415 + }, + { + "epoch": 0.28565928777670835, + "grad_norm": 1.3213337659835815, + "learning_rate": 0.00019011974138689595, + "loss": 1.2597, + "step": 7420 + }, + { + "epoch": 0.2858517805582291, + "grad_norm": 1.655229091644287, + "learning_rate": 0.0001901066311542151, + "loss": 1.0345, + "step": 7425 + }, + { + "epoch": 0.28604427333974974, + "grad_norm": 1.0165207386016846, + "learning_rate": 0.00019009351268187974, + "loss": 1.2854, + "step": 7430 + }, + { + "epoch": 0.28623676612127047, + "grad_norm": 1.3425116539001465, + "learning_rate": 0.00019008038597108945, + "loss": 1.381, + "step": 7435 + }, + { + "epoch": 0.28642925890279114, + "grad_norm": 1.2017732858657837, + "learning_rate": 0.0001900672510230446, + "loss": 1.2171, + "step": 7440 + }, + { + "epoch": 0.28662175168431187, + "grad_norm": 1.4958349466323853, + "learning_rate": 0.00019005410783894626, + "loss": 1.3524, + "step": 7445 + }, + { + "epoch": 0.28681424446583254, + "grad_norm": 1.1109000444412231, + "learning_rate": 0.00019004095641999636, + "loss": 1.2046, + "step": 7450 + }, + { + "epoch": 0.2870067372473532, + "grad_norm": 1.5347834825515747, + "learning_rate": 0.00019002779676739745, + "loss": 1.2295, + "step": 7455 + }, + { + "epoch": 0.28719923002887393, + "grad_norm": 1.5204600095748901, + "learning_rate": 0.00019001462888235286, + "loss": 1.0319, + "step": 7460 + }, + { + "epoch": 0.2873917228103946, + "grad_norm": 2.0644850730895996, + "learning_rate": 0.00019000145276606677, + "loss": 1.2371, + "step": 7465 + }, + { + "epoch": 0.2875842155919153, + "grad_norm": 1.5903024673461914, + "learning_rate": 0.00018998826841974407, + "loss": 1.3781, + "step": 7470 + }, + { + "epoch": 0.287776708373436, + "grad_norm": 1.045086145401001, + "learning_rate": 0.00018997507584459032, + "loss": 1.0918, + "step": 7475 + }, + { + "epoch": 0.28796920115495667, + "grad_norm": 1.499211311340332, + "learning_rate": 0.0001899618750418119, + "loss": 1.2377, + "step": 7480 + }, + { + "epoch": 0.2881616939364774, + "grad_norm": 1.2885223627090454, + "learning_rate": 0.00018994866601261597, + "loss": 1.2936, + "step": 7485 + }, + { + "epoch": 0.28835418671799806, + "grad_norm": 1.9687073230743408, + "learning_rate": 0.00018993544875821035, + "loss": 1.2043, + "step": 7490 + }, + { + "epoch": 0.2885466794995188, + "grad_norm": 0.9758608937263489, + "learning_rate": 0.00018992222327980375, + "loss": 1.0775, + "step": 7495 + }, + { + "epoch": 0.28873917228103946, + "grad_norm": 1.4256442785263062, + "learning_rate": 0.00018990898957860547, + "loss": 1.2608, + "step": 7500 + }, + { + "epoch": 0.28893166506256013, + "grad_norm": 1.267991304397583, + "learning_rate": 0.00018989574765582572, + "loss": 1.3826, + "step": 7505 + }, + { + "epoch": 0.28912415784408085, + "grad_norm": 1.4104158878326416, + "learning_rate": 0.00018988249751267534, + "loss": 1.1589, + "step": 7510 + }, + { + "epoch": 0.2893166506256015, + "grad_norm": 0.9540778994560242, + "learning_rate": 0.000189869239150366, + "loss": 1.196, + "step": 7515 + }, + { + "epoch": 0.28950914340712225, + "grad_norm": 4.175881385803223, + "learning_rate": 0.00018985597257011006, + "loss": 1.3408, + "step": 7520 + }, + { + "epoch": 0.2897016361886429, + "grad_norm": 1.79558527469635, + "learning_rate": 0.00018984269777312066, + "loss": 1.0596, + "step": 7525 + }, + { + "epoch": 0.28989412897016364, + "grad_norm": 1.5449460744857788, + "learning_rate": 0.0001898294147606117, + "loss": 1.2628, + "step": 7530 + }, + { + "epoch": 0.2900866217516843, + "grad_norm": 1.5056041479110718, + "learning_rate": 0.00018981612353379784, + "loss": 1.132, + "step": 7535 + }, + { + "epoch": 0.290279114533205, + "grad_norm": 1.7045507431030273, + "learning_rate": 0.00018980282409389445, + "loss": 1.1663, + "step": 7540 + }, + { + "epoch": 0.2904716073147257, + "grad_norm": 1.203892469406128, + "learning_rate": 0.00018978951644211766, + "loss": 1.1168, + "step": 7545 + }, + { + "epoch": 0.2906641000962464, + "grad_norm": 0.9239038228988647, + "learning_rate": 0.0001897762005796844, + "loss": 1.3328, + "step": 7550 + }, + { + "epoch": 0.2908565928777671, + "grad_norm": 1.3521167039871216, + "learning_rate": 0.00018976287650781238, + "loss": 1.2766, + "step": 7555 + }, + { + "epoch": 0.2910490856592878, + "grad_norm": 1.3824992179870605, + "learning_rate": 0.00018974954422771987, + "loss": 1.0153, + "step": 7560 + }, + { + "epoch": 0.29124157844080845, + "grad_norm": 0.9183006286621094, + "learning_rate": 0.00018973620374062607, + "loss": 1.0558, + "step": 7565 + }, + { + "epoch": 0.29143407122232917, + "grad_norm": 1.7128045558929443, + "learning_rate": 0.0001897228550477509, + "loss": 1.316, + "step": 7570 + }, + { + "epoch": 0.29162656400384984, + "grad_norm": 1.3998011350631714, + "learning_rate": 0.000189709498150315, + "loss": 1.2359, + "step": 7575 + }, + { + "epoch": 0.29181905678537057, + "grad_norm": 1.2251836061477661, + "learning_rate": 0.00018969613304953975, + "loss": 1.2464, + "step": 7580 + }, + { + "epoch": 0.29201154956689124, + "grad_norm": 1.3014954328536987, + "learning_rate": 0.00018968275974664734, + "loss": 1.0624, + "step": 7585 + }, + { + "epoch": 0.2922040423484119, + "grad_norm": 1.8785862922668457, + "learning_rate": 0.00018966937824286062, + "loss": 1.3491, + "step": 7590 + }, + { + "epoch": 0.29239653512993263, + "grad_norm": 1.0634154081344604, + "learning_rate": 0.00018965598853940327, + "loss": 1.1012, + "step": 7595 + }, + { + "epoch": 0.2925890279114533, + "grad_norm": 0.9114715456962585, + "learning_rate": 0.00018964259063749967, + "loss": 1.3738, + "step": 7600 + }, + { + "epoch": 0.29278152069297403, + "grad_norm": 1.9063506126403809, + "learning_rate": 0.00018962918453837503, + "loss": 1.1161, + "step": 7605 + }, + { + "epoch": 0.2929740134744947, + "grad_norm": 1.12264084815979, + "learning_rate": 0.00018961577024325516, + "loss": 1.4191, + "step": 7610 + }, + { + "epoch": 0.2931665062560154, + "grad_norm": 1.4751306772232056, + "learning_rate": 0.00018960234775336677, + "loss": 1.2153, + "step": 7615 + }, + { + "epoch": 0.2933589990375361, + "grad_norm": 1.4374860525131226, + "learning_rate": 0.00018958891706993724, + "loss": 1.1999, + "step": 7620 + }, + { + "epoch": 0.29355149181905676, + "grad_norm": 1.5792250633239746, + "learning_rate": 0.0001895754781941947, + "loss": 1.266, + "step": 7625 + }, + { + "epoch": 0.2937439846005775, + "grad_norm": 1.3390734195709229, + "learning_rate": 0.00018956203112736807, + "loss": 1.2703, + "step": 7630 + }, + { + "epoch": 0.29393647738209816, + "grad_norm": 1.2470978498458862, + "learning_rate": 0.00018954857587068701, + "loss": 1.0415, + "step": 7635 + }, + { + "epoch": 0.2941289701636189, + "grad_norm": 1.6102235317230225, + "learning_rate": 0.00018953511242538186, + "loss": 1.2707, + "step": 7640 + }, + { + "epoch": 0.29432146294513956, + "grad_norm": 1.334554672241211, + "learning_rate": 0.0001895216407926838, + "loss": 1.2672, + "step": 7645 + }, + { + "epoch": 0.2945139557266602, + "grad_norm": 1.2881218194961548, + "learning_rate": 0.00018950816097382475, + "loss": 1.1641, + "step": 7650 + }, + { + "epoch": 0.29470644850818095, + "grad_norm": 1.2150179147720337, + "learning_rate": 0.00018949467297003732, + "loss": 1.2636, + "step": 7655 + }, + { + "epoch": 0.2948989412897016, + "grad_norm": 1.1388130187988281, + "learning_rate": 0.00018948117678255485, + "loss": 1.2354, + "step": 7660 + }, + { + "epoch": 0.29509143407122235, + "grad_norm": 0.785776674747467, + "learning_rate": 0.0001894676724126115, + "loss": 1.2621, + "step": 7665 + }, + { + "epoch": 0.295283926852743, + "grad_norm": 1.005819320678711, + "learning_rate": 0.00018945415986144223, + "loss": 1.1175, + "step": 7670 + }, + { + "epoch": 0.29547641963426374, + "grad_norm": 2.2892065048217773, + "learning_rate": 0.00018944063913028264, + "loss": 1.148, + "step": 7675 + }, + { + "epoch": 0.2956689124157844, + "grad_norm": 2.0920302867889404, + "learning_rate": 0.00018942711022036903, + "loss": 1.178, + "step": 7680 + }, + { + "epoch": 0.2958614051973051, + "grad_norm": 1.228538155555725, + "learning_rate": 0.00018941357313293863, + "loss": 1.2499, + "step": 7685 + }, + { + "epoch": 0.2960538979788258, + "grad_norm": 1.8671079874038696, + "learning_rate": 0.00018940002786922925, + "loss": 1.2361, + "step": 7690 + }, + { + "epoch": 0.2962463907603465, + "grad_norm": 1.7283247709274292, + "learning_rate": 0.00018938647443047957, + "loss": 1.2695, + "step": 7695 + }, + { + "epoch": 0.2964388835418672, + "grad_norm": 1.9629713296890259, + "learning_rate": 0.0001893729128179289, + "loss": 1.5226, + "step": 7700 + }, + { + "epoch": 0.2966313763233879, + "grad_norm": 1.2868784666061401, + "learning_rate": 0.00018935934303281743, + "loss": 1.3237, + "step": 7705 + }, + { + "epoch": 0.29682386910490854, + "grad_norm": 1.3925827741622925, + "learning_rate": 0.000189345765076386, + "loss": 1.4075, + "step": 7710 + }, + { + "epoch": 0.29701636188642927, + "grad_norm": 1.1560002565383911, + "learning_rate": 0.0001893321789498762, + "loss": 1.3212, + "step": 7715 + }, + { + "epoch": 0.29720885466794994, + "grad_norm": 1.207263708114624, + "learning_rate": 0.0001893185846545304, + "loss": 1.3106, + "step": 7720 + }, + { + "epoch": 0.29740134744947067, + "grad_norm": Infinity, + "learning_rate": 0.00018930770333752716, + "loss": 1.5499, + "step": 7725 + }, + { + "epoch": 0.29759384023099134, + "grad_norm": 1.2437909841537476, + "learning_rate": 0.0001892940943414097, + "loss": 1.2797, + "step": 7730 + }, + { + "epoch": 0.297786333012512, + "grad_norm": 0.8919286131858826, + "learning_rate": 0.00018928047717993885, + "loss": 1.1074, + "step": 7735 + }, + { + "epoch": 0.29797882579403273, + "grad_norm": 1.219995379447937, + "learning_rate": 0.00018926685185435978, + "loss": 1.0856, + "step": 7740 + }, + { + "epoch": 0.2981713185755534, + "grad_norm": 0.8819857835769653, + "learning_rate": 0.00018925321836591846, + "loss": 1.3518, + "step": 7745 + }, + { + "epoch": 0.2983638113570741, + "grad_norm": 1.2268033027648926, + "learning_rate": 0.00018923957671586154, + "loss": 1.3786, + "step": 7750 + }, + { + "epoch": 0.2985563041385948, + "grad_norm": 0.9456066489219666, + "learning_rate": 0.0001892259269054365, + "loss": 1.3424, + "step": 7755 + }, + { + "epoch": 0.2987487969201155, + "grad_norm": 1.5397047996520996, + "learning_rate": 0.0001892122689358915, + "loss": 1.3618, + "step": 7760 + }, + { + "epoch": 0.2989412897016362, + "grad_norm": 1.3874872922897339, + "learning_rate": 0.0001891986028084755, + "loss": 1.2717, + "step": 7765 + }, + { + "epoch": 0.29913378248315686, + "grad_norm": 1.1725342273712158, + "learning_rate": 0.00018918492852443817, + "loss": 1.4347, + "step": 7770 + }, + { + "epoch": 0.2993262752646776, + "grad_norm": 1.2135777473449707, + "learning_rate": 0.0001891712460850299, + "loss": 1.1892, + "step": 7775 + }, + { + "epoch": 0.29951876804619826, + "grad_norm": 1.549715280532837, + "learning_rate": 0.00018915755549150188, + "loss": 1.2041, + "step": 7780 + }, + { + "epoch": 0.299711260827719, + "grad_norm": 0.9927541613578796, + "learning_rate": 0.00018914385674510605, + "loss": 1.2198, + "step": 7785 + }, + { + "epoch": 0.29990375360923965, + "grad_norm": 1.3314557075500488, + "learning_rate": 0.00018913014984709502, + "loss": 1.1805, + "step": 7790 + }, + { + "epoch": 0.3000962463907603, + "grad_norm": 1.4021222591400146, + "learning_rate": 0.00018911643479872225, + "loss": 1.3375, + "step": 7795 + }, + { + "epoch": 0.30028873917228105, + "grad_norm": 1.0226534605026245, + "learning_rate": 0.00018910271160124182, + "loss": 1.329, + "step": 7800 + }, + { + "epoch": 0.3004812319538017, + "grad_norm": 0.8493847846984863, + "learning_rate": 0.0001890889802559087, + "loss": 1.4581, + "step": 7805 + }, + { + "epoch": 0.30067372473532245, + "grad_norm": 1.0437967777252197, + "learning_rate": 0.00018907524076397847, + "loss": 1.409, + "step": 7810 + }, + { + "epoch": 0.3008662175168431, + "grad_norm": 2.574695110321045, + "learning_rate": 0.00018906149312670754, + "loss": 1.3962, + "step": 7815 + }, + { + "epoch": 0.3010587102983638, + "grad_norm": 1.3757768869400024, + "learning_rate": 0.00018904773734535306, + "loss": 1.4098, + "step": 7820 + }, + { + "epoch": 0.3012512030798845, + "grad_norm": 1.2249635457992554, + "learning_rate": 0.0001890339734211729, + "loss": 1.1643, + "step": 7825 + }, + { + "epoch": 0.3014436958614052, + "grad_norm": 1.6329936981201172, + "learning_rate": 0.00018902020135542564, + "loss": 1.1914, + "step": 7830 + }, + { + "epoch": 0.3016361886429259, + "grad_norm": 1.0217385292053223, + "learning_rate": 0.0001890064211493707, + "loss": 1.043, + "step": 7835 + }, + { + "epoch": 0.3018286814244466, + "grad_norm": 1.448754072189331, + "learning_rate": 0.0001889926328042681, + "loss": 1.0953, + "step": 7840 + }, + { + "epoch": 0.3020211742059673, + "grad_norm": 0.9284221529960632, + "learning_rate": 0.00018897883632137881, + "loss": 1.321, + "step": 7845 + }, + { + "epoch": 0.30221366698748797, + "grad_norm": 1.4679608345031738, + "learning_rate": 0.00018896503170196435, + "loss": 1.2266, + "step": 7850 + }, + { + "epoch": 0.30240615976900864, + "grad_norm": 1.1148631572723389, + "learning_rate": 0.00018895121894728709, + "loss": 1.1666, + "step": 7855 + }, + { + "epoch": 0.30259865255052937, + "grad_norm": 1.0431932210922241, + "learning_rate": 0.00018893739805861008, + "loss": 1.2986, + "step": 7860 + }, + { + "epoch": 0.30279114533205004, + "grad_norm": 1.5691524744033813, + "learning_rate": 0.00018892356903719718, + "loss": 1.3928, + "step": 7865 + }, + { + "epoch": 0.30298363811357076, + "grad_norm": 1.6849128007888794, + "learning_rate": 0.000188909731884313, + "loss": 1.3569, + "step": 7870 + }, + { + "epoch": 0.30317613089509143, + "grad_norm": 1.1832456588745117, + "learning_rate": 0.00018889588660122276, + "loss": 1.2984, + "step": 7875 + }, + { + "epoch": 0.3033686236766121, + "grad_norm": 1.3270272016525269, + "learning_rate": 0.0001888820331891926, + "loss": 1.1498, + "step": 7880 + }, + { + "epoch": 0.30356111645813283, + "grad_norm": 1.6383373737335205, + "learning_rate": 0.0001888681716494893, + "loss": 1.4725, + "step": 7885 + }, + { + "epoch": 0.3037536092396535, + "grad_norm": 1.1068469285964966, + "learning_rate": 0.00018885430198338038, + "loss": 1.3326, + "step": 7890 + }, + { + "epoch": 0.3039461020211742, + "grad_norm": 1.8454192876815796, + "learning_rate": 0.00018884042419213412, + "loss": 1.2307, + "step": 7895 + }, + { + "epoch": 0.3041385948026949, + "grad_norm": 1.160762906074524, + "learning_rate": 0.00018882653827701965, + "loss": 1.6025, + "step": 7900 + }, + { + "epoch": 0.30433108758421556, + "grad_norm": 1.9325065612792969, + "learning_rate": 0.00018881264423930663, + "loss": 1.3071, + "step": 7905 + }, + { + "epoch": 0.3045235803657363, + "grad_norm": 0.9047966003417969, + "learning_rate": 0.00018879874208026562, + "loss": 1.3166, + "step": 7910 + }, + { + "epoch": 0.30471607314725696, + "grad_norm": 0.9753623008728027, + "learning_rate": 0.00018878483180116793, + "loss": 1.3702, + "step": 7915 + }, + { + "epoch": 0.3049085659287777, + "grad_norm": 1.210321307182312, + "learning_rate": 0.00018877091340328549, + "loss": 1.3775, + "step": 7920 + }, + { + "epoch": 0.30510105871029836, + "grad_norm": 1.287484049797058, + "learning_rate": 0.00018875698688789106, + "loss": 1.3534, + "step": 7925 + }, + { + "epoch": 0.3052935514918191, + "grad_norm": 1.1604797840118408, + "learning_rate": 0.00018874305225625814, + "loss": 1.2154, + "step": 7930 + }, + { + "epoch": 0.30548604427333975, + "grad_norm": 1.4771429300308228, + "learning_rate": 0.00018872910950966097, + "loss": 1.2438, + "step": 7935 + }, + { + "epoch": 0.3056785370548604, + "grad_norm": 1.1472980976104736, + "learning_rate": 0.00018871515864937453, + "loss": 1.0805, + "step": 7940 + }, + { + "epoch": 0.30587102983638115, + "grad_norm": 1.1015262603759766, + "learning_rate": 0.0001887011996766745, + "loss": 1.0594, + "step": 7945 + }, + { + "epoch": 0.3060635226179018, + "grad_norm": 1.5410771369934082, + "learning_rate": 0.00018868723259283737, + "loss": 1.2624, + "step": 7950 + }, + { + "epoch": 0.30625601539942254, + "grad_norm": 1.2014496326446533, + "learning_rate": 0.0001886732573991403, + "loss": 1.2259, + "step": 7955 + }, + { + "epoch": 0.3064485081809432, + "grad_norm": 2.0007143020629883, + "learning_rate": 0.0001886592740968612, + "loss": 1.3877, + "step": 7960 + }, + { + "epoch": 0.3066410009624639, + "grad_norm": 1.2455111742019653, + "learning_rate": 0.00018864528268727887, + "loss": 1.3254, + "step": 7965 + }, + { + "epoch": 0.3068334937439846, + "grad_norm": 1.2766424417495728, + "learning_rate": 0.00018863128317167264, + "loss": 1.2663, + "step": 7970 + }, + { + "epoch": 0.3070259865255053, + "grad_norm": 1.2151165008544922, + "learning_rate": 0.0001886172755513227, + "loss": 1.3597, + "step": 7975 + }, + { + "epoch": 0.307218479307026, + "grad_norm": 1.1774568557739258, + "learning_rate": 0.0001886032598275099, + "loss": 1.1311, + "step": 7980 + }, + { + "epoch": 0.3074109720885467, + "grad_norm": 1.43276846408844, + "learning_rate": 0.00018858923600151596, + "loss": 1.1123, + "step": 7985 + }, + { + "epoch": 0.3076034648700674, + "grad_norm": 1.691684603691101, + "learning_rate": 0.00018857520407462326, + "loss": 1.4089, + "step": 7990 + }, + { + "epoch": 0.30779595765158807, + "grad_norm": 1.7944872379302979, + "learning_rate": 0.00018856116404811487, + "loss": 1.3098, + "step": 7995 + }, + { + "epoch": 0.30798845043310874, + "grad_norm": 1.2894377708435059, + "learning_rate": 0.00018854711592327473, + "loss": 1.2128, + "step": 8000 + }, + { + "epoch": 0.30818094321462947, + "grad_norm": 2.52504301071167, + "learning_rate": 0.00018853305970138737, + "loss": 1.4214, + "step": 8005 + }, + { + "epoch": 0.30837343599615014, + "grad_norm": 1.0757540464401245, + "learning_rate": 0.0001885189953837382, + "loss": 1.1836, + "step": 8010 + }, + { + "epoch": 0.30856592877767086, + "grad_norm": 0.9253488183021545, + "learning_rate": 0.0001885049229716133, + "loss": 1.0756, + "step": 8015 + }, + { + "epoch": 0.30875842155919153, + "grad_norm": 2.042194366455078, + "learning_rate": 0.00018849084246629945, + "loss": 1.4017, + "step": 8020 + }, + { + "epoch": 0.3089509143407122, + "grad_norm": 1.750023603439331, + "learning_rate": 0.00018847675386908427, + "loss": 1.2352, + "step": 8025 + }, + { + "epoch": 0.3091434071222329, + "grad_norm": 1.5334408283233643, + "learning_rate": 0.00018846265718125605, + "loss": 1.3053, + "step": 8030 + }, + { + "epoch": 0.3093358999037536, + "grad_norm": 1.262428641319275, + "learning_rate": 0.00018844855240410387, + "loss": 1.28, + "step": 8035 + }, + { + "epoch": 0.3095283926852743, + "grad_norm": 1.1430000066757202, + "learning_rate": 0.0001884344395389175, + "loss": 1.2133, + "step": 8040 + }, + { + "epoch": 0.309720885466795, + "grad_norm": 1.792740821838379, + "learning_rate": 0.0001884203185869874, + "loss": 1.3004, + "step": 8045 + }, + { + "epoch": 0.30991337824831566, + "grad_norm": 1.7067112922668457, + "learning_rate": 0.00018840618954960495, + "loss": 1.4131, + "step": 8050 + }, + { + "epoch": 0.3101058710298364, + "grad_norm": 1.5428810119628906, + "learning_rate": 0.00018839205242806206, + "loss": 1.2361, + "step": 8055 + }, + { + "epoch": 0.31029836381135706, + "grad_norm": 1.078902244567871, + "learning_rate": 0.00018837790722365152, + "loss": 1.2126, + "step": 8060 + }, + { + "epoch": 0.3104908565928778, + "grad_norm": 1.5348985195159912, + "learning_rate": 0.00018836375393766684, + "loss": 1.2591, + "step": 8065 + }, + { + "epoch": 0.31068334937439845, + "grad_norm": 1.2026286125183105, + "learning_rate": 0.00018834959257140222, + "loss": 1.3059, + "step": 8070 + }, + { + "epoch": 0.3108758421559192, + "grad_norm": 1.3559043407440186, + "learning_rate": 0.0001883354231261526, + "loss": 1.2006, + "step": 8075 + }, + { + "epoch": 0.31106833493743985, + "grad_norm": 1.2358171939849854, + "learning_rate": 0.00018832124560321374, + "loss": 1.2656, + "step": 8080 + }, + { + "epoch": 0.3112608277189605, + "grad_norm": 1.720358967781067, + "learning_rate": 0.00018830706000388202, + "loss": 1.3493, + "step": 8085 + }, + { + "epoch": 0.31145332050048125, + "grad_norm": 1.4281798601150513, + "learning_rate": 0.00018829286632945463, + "loss": 1.1485, + "step": 8090 + }, + { + "epoch": 0.3116458132820019, + "grad_norm": 1.6174485683441162, + "learning_rate": 0.00018827866458122951, + "loss": 1.4384, + "step": 8095 + }, + { + "epoch": 0.31183830606352264, + "grad_norm": 1.0020065307617188, + "learning_rate": 0.00018826445476050532, + "loss": 1.0489, + "step": 8100 + }, + { + "epoch": 0.3120307988450433, + "grad_norm": 1.8663140535354614, + "learning_rate": 0.0001882502368685814, + "loss": 1.3252, + "step": 8105 + }, + { + "epoch": 0.312223291626564, + "grad_norm": 1.4404470920562744, + "learning_rate": 0.00018823601090675796, + "loss": 1.1452, + "step": 8110 + }, + { + "epoch": 0.3124157844080847, + "grad_norm": 1.3358442783355713, + "learning_rate": 0.00018822177687633583, + "loss": 1.1581, + "step": 8115 + }, + { + "epoch": 0.3126082771896054, + "grad_norm": 1.6938860416412354, + "learning_rate": 0.00018820753477861662, + "loss": 1.5378, + "step": 8120 + }, + { + "epoch": 0.3128007699711261, + "grad_norm": 1.1914762258529663, + "learning_rate": 0.00018819328461490268, + "loss": 1.172, + "step": 8125 + }, + { + "epoch": 0.3129932627526468, + "grad_norm": 2.0504634380340576, + "learning_rate": 0.0001881790263864971, + "loss": 1.2462, + "step": 8130 + }, + { + "epoch": 0.31318575553416744, + "grad_norm": 1.548021912574768, + "learning_rate": 0.00018816476009470367, + "loss": 1.271, + "step": 8135 + }, + { + "epoch": 0.31337824831568817, + "grad_norm": 1.2875434160232544, + "learning_rate": 0.00018815048574082698, + "loss": 1.2484, + "step": 8140 + }, + { + "epoch": 0.31357074109720884, + "grad_norm": 0.936850905418396, + "learning_rate": 0.00018813620332617227, + "loss": 1.2765, + "step": 8145 + }, + { + "epoch": 0.31376323387872956, + "grad_norm": 1.2823413610458374, + "learning_rate": 0.00018812191285204566, + "loss": 1.1859, + "step": 8150 + }, + { + "epoch": 0.31395572666025023, + "grad_norm": 2.052490472793579, + "learning_rate": 0.00018810761431975386, + "loss": 1.2033, + "step": 8155 + }, + { + "epoch": 0.31414821944177096, + "grad_norm": 2.4439830780029297, + "learning_rate": 0.00018809330773060442, + "loss": 1.3678, + "step": 8160 + }, + { + "epoch": 0.31434071222329163, + "grad_norm": 1.9978455305099487, + "learning_rate": 0.0001880789930859055, + "loss": 1.25, + "step": 8165 + }, + { + "epoch": 0.3145332050048123, + "grad_norm": 1.2606321573257446, + "learning_rate": 0.00018806467038696615, + "loss": 1.4966, + "step": 8170 + }, + { + "epoch": 0.314725697786333, + "grad_norm": 1.4588353633880615, + "learning_rate": 0.00018805033963509605, + "loss": 1.1843, + "step": 8175 + }, + { + "epoch": 0.3149181905678537, + "grad_norm": 2.8686156272888184, + "learning_rate": 0.00018803600083160574, + "loss": 1.3017, + "step": 8180 + }, + { + "epoch": 0.3151106833493744, + "grad_norm": 1.812328815460205, + "learning_rate": 0.00018802165397780626, + "loss": 1.4141, + "step": 8185 + }, + { + "epoch": 0.3153031761308951, + "grad_norm": 1.4686119556427002, + "learning_rate": 0.00018800729907500968, + "loss": 1.4522, + "step": 8190 + }, + { + "epoch": 0.31549566891241576, + "grad_norm": 1.766160249710083, + "learning_rate": 0.00018799293612452856, + "loss": 1.1501, + "step": 8195 + }, + { + "epoch": 0.3156881616939365, + "grad_norm": 1.5843030214309692, + "learning_rate": 0.00018797856512767634, + "loss": 1.2997, + "step": 8200 + }, + { + "epoch": 0.31588065447545716, + "grad_norm": 1.2028679847717285, + "learning_rate": 0.00018796418608576712, + "loss": 1.108, + "step": 8205 + }, + { + "epoch": 0.3160731472569779, + "grad_norm": 1.4626559019088745, + "learning_rate": 0.0001879497990001158, + "loss": 1.116, + "step": 8210 + }, + { + "epoch": 0.31626564003849855, + "grad_norm": 1.956745982170105, + "learning_rate": 0.000187935403872038, + "loss": 1.2741, + "step": 8215 + }, + { + "epoch": 0.3164581328200192, + "grad_norm": 1.1932622194290161, + "learning_rate": 0.00018792100070285002, + "loss": 1.1966, + "step": 8220 + }, + { + "epoch": 0.31665062560153995, + "grad_norm": 2.212184429168701, + "learning_rate": 0.00018790658949386892, + "loss": 1.1485, + "step": 8225 + }, + { + "epoch": 0.3168431183830606, + "grad_norm": 0.867708146572113, + "learning_rate": 0.00018789217024641256, + "loss": 1.2457, + "step": 8230 + }, + { + "epoch": 0.31703561116458134, + "grad_norm": 2.4929304122924805, + "learning_rate": 0.0001878777429617995, + "loss": 1.1819, + "step": 8235 + }, + { + "epoch": 0.317228103946102, + "grad_norm": 1.4232670068740845, + "learning_rate": 0.00018786330764134897, + "loss": 1.2189, + "step": 8240 + }, + { + "epoch": 0.31742059672762274, + "grad_norm": 1.8306447267532349, + "learning_rate": 0.00018784886428638094, + "loss": 1.2939, + "step": 8245 + }, + { + "epoch": 0.3176130895091434, + "grad_norm": 0.9103988409042358, + "learning_rate": 0.00018783441289821627, + "loss": 1.2982, + "step": 8250 + }, + { + "epoch": 0.3178055822906641, + "grad_norm": 1.08035409450531, + "learning_rate": 0.0001878199534781764, + "loss": 1.2777, + "step": 8255 + }, + { + "epoch": 0.3179980750721848, + "grad_norm": 1.1342133283615112, + "learning_rate": 0.0001878054860275835, + "loss": 1.1476, + "step": 8260 + }, + { + "epoch": 0.3181905678537055, + "grad_norm": 1.7727190256118774, + "learning_rate": 0.0001877910105477606, + "loss": 1.1887, + "step": 8265 + }, + { + "epoch": 0.3183830606352262, + "grad_norm": 2.5168001651763916, + "learning_rate": 0.0001877765270400313, + "loss": 1.0494, + "step": 8270 + }, + { + "epoch": 0.31857555341674687, + "grad_norm": 1.2397305965423584, + "learning_rate": 0.0001877620355057201, + "loss": 1.321, + "step": 8275 + }, + { + "epoch": 0.31876804619826754, + "grad_norm": 1.3002814054489136, + "learning_rate": 0.0001877475359461521, + "loss": 1.1543, + "step": 8280 + }, + { + "epoch": 0.31896053897978827, + "grad_norm": 1.5683960914611816, + "learning_rate": 0.00018773302836265322, + "loss": 1.1987, + "step": 8285 + }, + { + "epoch": 0.31915303176130894, + "grad_norm": 1.6934245824813843, + "learning_rate": 0.00018771851275655008, + "loss": 1.2946, + "step": 8290 + }, + { + "epoch": 0.31934552454282966, + "grad_norm": 1.4387637376785278, + "learning_rate": 0.00018770398912917004, + "loss": 1.2151, + "step": 8295 + }, + { + "epoch": 0.31953801732435033, + "grad_norm": 1.3155730962753296, + "learning_rate": 0.00018768945748184117, + "loss": 1.1692, + "step": 8300 + }, + { + "epoch": 0.31973051010587106, + "grad_norm": 1.039670467376709, + "learning_rate": 0.0001876749178158923, + "loss": 1.2783, + "step": 8305 + }, + { + "epoch": 0.3199230028873917, + "grad_norm": 1.1988794803619385, + "learning_rate": 0.00018766037013265302, + "loss": 1.1775, + "step": 8310 + }, + { + "epoch": 0.3201154956689124, + "grad_norm": 1.39814031124115, + "learning_rate": 0.00018764581443345355, + "loss": 1.2256, + "step": 8315 + }, + { + "epoch": 0.3203079884504331, + "grad_norm": 1.7934690713882446, + "learning_rate": 0.00018763125071962495, + "loss": 1.3505, + "step": 8320 + }, + { + "epoch": 0.3205004812319538, + "grad_norm": 1.5974578857421875, + "learning_rate": 0.00018761667899249899, + "loss": 1.1725, + "step": 8325 + }, + { + "epoch": 0.3206929740134745, + "grad_norm": 0.9480400085449219, + "learning_rate": 0.00018760209925340818, + "loss": 1.2059, + "step": 8330 + }, + { + "epoch": 0.3208854667949952, + "grad_norm": 1.9734187126159668, + "learning_rate": 0.00018758751150368564, + "loss": 1.2116, + "step": 8335 + }, + { + "epoch": 0.32107795957651586, + "grad_norm": 0.9984979033470154, + "learning_rate": 0.00018757291574466543, + "loss": 1.1347, + "step": 8340 + }, + { + "epoch": 0.3212704523580366, + "grad_norm": 0.96681147813797, + "learning_rate": 0.00018755831197768215, + "loss": 1.2824, + "step": 8345 + }, + { + "epoch": 0.32146294513955725, + "grad_norm": 1.5365724563598633, + "learning_rate": 0.00018754370020407127, + "loss": 1.3718, + "step": 8350 + }, + { + "epoch": 0.321655437921078, + "grad_norm": 1.6202696561813354, + "learning_rate": 0.00018752908042516897, + "loss": 1.3233, + "step": 8355 + }, + { + "epoch": 0.32184793070259865, + "grad_norm": 2.0272514820098877, + "learning_rate": 0.00018751445264231207, + "loss": 1.3406, + "step": 8360 + }, + { + "epoch": 0.3220404234841193, + "grad_norm": 1.1724604368209839, + "learning_rate": 0.0001874998168568382, + "loss": 1.2649, + "step": 8365 + }, + { + "epoch": 0.32223291626564005, + "grad_norm": 1.0908805131912231, + "learning_rate": 0.00018748517307008573, + "loss": 1.2924, + "step": 8370 + }, + { + "epoch": 0.3224254090471607, + "grad_norm": 1.0658169984817505, + "learning_rate": 0.0001874705212833937, + "loss": 1.1266, + "step": 8375 + }, + { + "epoch": 0.32261790182868144, + "grad_norm": 1.2267755270004272, + "learning_rate": 0.00018745586149810194, + "loss": 1.172, + "step": 8380 + }, + { + "epoch": 0.3228103946102021, + "grad_norm": 0.9808927178382874, + "learning_rate": 0.000187441193715551, + "loss": 1.1241, + "step": 8385 + }, + { + "epoch": 0.32300288739172284, + "grad_norm": 1.2251529693603516, + "learning_rate": 0.00018742651793708212, + "loss": 1.1649, + "step": 8390 + }, + { + "epoch": 0.3231953801732435, + "grad_norm": 1.7396290302276611, + "learning_rate": 0.00018741183416403734, + "loss": 1.173, + "step": 8395 + }, + { + "epoch": 0.3233878729547642, + "grad_norm": 1.1498087644577026, + "learning_rate": 0.00018739714239775936, + "loss": 1.266, + "step": 8400 + }, + { + "epoch": 0.3235803657362849, + "grad_norm": 0.9458256959915161, + "learning_rate": 0.0001873824426395917, + "loss": 1.1651, + "step": 8405 + }, + { + "epoch": 0.3237728585178056, + "grad_norm": 1.701441764831543, + "learning_rate": 0.00018736773489087845, + "loss": 1.4314, + "step": 8410 + }, + { + "epoch": 0.3239653512993263, + "grad_norm": 1.3168058395385742, + "learning_rate": 0.00018735301915296466, + "loss": 1.3837, + "step": 8415 + }, + { + "epoch": 0.32415784408084697, + "grad_norm": 1.2277673482894897, + "learning_rate": 0.0001873382954271959, + "loss": 1.2433, + "step": 8420 + }, + { + "epoch": 0.32435033686236764, + "grad_norm": 1.3443776369094849, + "learning_rate": 0.00018732356371491858, + "loss": 1.1514, + "step": 8425 + }, + { + "epoch": 0.32454282964388836, + "grad_norm": 1.3421462774276733, + "learning_rate": 0.00018730882401747984, + "loss": 1.2908, + "step": 8430 + }, + { + "epoch": 0.32473532242540903, + "grad_norm": 2.7043700218200684, + "learning_rate": 0.0001872940763362275, + "loss": 1.426, + "step": 8435 + }, + { + "epoch": 0.32492781520692976, + "grad_norm": 1.2363086938858032, + "learning_rate": 0.00018727932067251016, + "loss": 1.2172, + "step": 8440 + }, + { + "epoch": 0.32512030798845043, + "grad_norm": 1.7551484107971191, + "learning_rate": 0.00018726455702767713, + "loss": 1.2379, + "step": 8445 + }, + { + "epoch": 0.3253128007699711, + "grad_norm": 1.2935433387756348, + "learning_rate": 0.00018724978540307844, + "loss": 1.2109, + "step": 8450 + }, + { + "epoch": 0.3255052935514918, + "grad_norm": 1.723219871520996, + "learning_rate": 0.00018723500580006483, + "loss": 1.3996, + "step": 8455 + }, + { + "epoch": 0.3256977863330125, + "grad_norm": 1.1455639600753784, + "learning_rate": 0.0001872202182199878, + "loss": 1.1223, + "step": 8460 + }, + { + "epoch": 0.3258902791145332, + "grad_norm": 1.194926381111145, + "learning_rate": 0.0001872054226641996, + "loss": 1.3301, + "step": 8465 + }, + { + "epoch": 0.3260827718960539, + "grad_norm": 1.9672341346740723, + "learning_rate": 0.00018719061913405322, + "loss": 1.3884, + "step": 8470 + }, + { + "epoch": 0.3262752646775746, + "grad_norm": 1.5594457387924194, + "learning_rate": 0.0001871758076309023, + "loss": 1.1862, + "step": 8475 + }, + { + "epoch": 0.3264677574590953, + "grad_norm": 1.141787052154541, + "learning_rate": 0.0001871609881561012, + "loss": 1.2375, + "step": 8480 + }, + { + "epoch": 0.32666025024061596, + "grad_norm": 1.1914411783218384, + "learning_rate": 0.0001871461607110052, + "loss": 1.397, + "step": 8485 + }, + { + "epoch": 0.3268527430221367, + "grad_norm": 1.2841687202453613, + "learning_rate": 0.00018713132529697007, + "loss": 1.3052, + "step": 8490 + }, + { + "epoch": 0.32704523580365735, + "grad_norm": 2.2977144718170166, + "learning_rate": 0.0001871164819153524, + "loss": 1.2819, + "step": 8495 + }, + { + "epoch": 0.3272377285851781, + "grad_norm": 1.62446928024292, + "learning_rate": 0.00018710163056750957, + "loss": 1.1739, + "step": 8500 + }, + { + "epoch": 0.32743022136669875, + "grad_norm": 1.471348524093628, + "learning_rate": 0.00018708677125479963, + "loss": 1.0684, + "step": 8505 + }, + { + "epoch": 0.3276227141482194, + "grad_norm": 1.0703455209732056, + "learning_rate": 0.00018707190397858133, + "loss": 1.0832, + "step": 8510 + }, + { + "epoch": 0.32781520692974014, + "grad_norm": 1.3942466974258423, + "learning_rate": 0.00018705702874021425, + "loss": 1.1855, + "step": 8515 + }, + { + "epoch": 0.3280076997112608, + "grad_norm": 1.1790398359298706, + "learning_rate": 0.00018704214554105856, + "loss": 1.1459, + "step": 8520 + }, + { + "epoch": 0.32820019249278154, + "grad_norm": 1.2982394695281982, + "learning_rate": 0.00018702725438247527, + "loss": 1.2642, + "step": 8525 + }, + { + "epoch": 0.3283926852743022, + "grad_norm": 1.4757968187332153, + "learning_rate": 0.00018701235526582608, + "loss": 1.291, + "step": 8530 + }, + { + "epoch": 0.3285851780558229, + "grad_norm": 1.6837409734725952, + "learning_rate": 0.0001870004302436148, + "loss": 1.3796, + "step": 8535 + }, + { + "epoch": 0.3287776708373436, + "grad_norm": 1.1914480924606323, + "learning_rate": 0.00018698551680588075, + "loss": 1.2608, + "step": 8540 + }, + { + "epoch": 0.3289701636188643, + "grad_norm": 1.2581427097320557, + "learning_rate": 0.00018697059541389742, + "loss": 1.3011, + "step": 8545 + }, + { + "epoch": 0.329162656400385, + "grad_norm": 1.5642743110656738, + "learning_rate": 0.0001869556660690293, + "loss": 1.2273, + "step": 8550 + }, + { + "epoch": 0.32935514918190567, + "grad_norm": 1.621721863746643, + "learning_rate": 0.0001869407287726415, + "loss": 1.1648, + "step": 8555 + }, + { + "epoch": 0.3295476419634264, + "grad_norm": 0.9840386509895325, + "learning_rate": 0.00018692578352610002, + "loss": 1.2741, + "step": 8560 + }, + { + "epoch": 0.32974013474494707, + "grad_norm": 1.5852268934249878, + "learning_rate": 0.00018691083033077144, + "loss": 1.2913, + "step": 8565 + }, + { + "epoch": 0.32993262752646774, + "grad_norm": 1.280247688293457, + "learning_rate": 0.00018689586918802314, + "loss": 1.172, + "step": 8570 + }, + { + "epoch": 0.33012512030798846, + "grad_norm": 1.3940321207046509, + "learning_rate": 0.0001868809000992233, + "loss": 1.175, + "step": 8575 + }, + { + "epoch": 0.33031761308950913, + "grad_norm": 1.0753341913223267, + "learning_rate": 0.00018686592306574063, + "loss": 1.3922, + "step": 8580 + }, + { + "epoch": 0.33051010587102986, + "grad_norm": 1.5959515571594238, + "learning_rate": 0.00018685093808894476, + "loss": 1.2741, + "step": 8585 + }, + { + "epoch": 0.33070259865255053, + "grad_norm": 1.1567896604537964, + "learning_rate": 0.00018683594517020593, + "loss": 1.1325, + "step": 8590 + }, + { + "epoch": 0.3308950914340712, + "grad_norm": 1.202486276626587, + "learning_rate": 0.0001868209443108951, + "loss": 1.1915, + "step": 8595 + }, + { + "epoch": 0.3310875842155919, + "grad_norm": 1.6866669654846191, + "learning_rate": 0.00018680593551238412, + "loss": 1.2806, + "step": 8600 + }, + { + "epoch": 0.3312800769971126, + "grad_norm": 1.1932209730148315, + "learning_rate": 0.00018679091877604536, + "loss": 1.2254, + "step": 8605 + }, + { + "epoch": 0.3314725697786333, + "grad_norm": 1.5348761081695557, + "learning_rate": 0.000186775894103252, + "loss": 1.1519, + "step": 8610 + }, + { + "epoch": 0.331665062560154, + "grad_norm": 1.908500075340271, + "learning_rate": 0.00018676086149537792, + "loss": 1.3105, + "step": 8615 + }, + { + "epoch": 0.3318575553416747, + "grad_norm": 2.0427961349487305, + "learning_rate": 0.00018674582095379788, + "loss": 1.1415, + "step": 8620 + }, + { + "epoch": 0.3320500481231954, + "grad_norm": 1.0964915752410889, + "learning_rate": 0.00018673077247988707, + "loss": 1.2041, + "step": 8625 + }, + { + "epoch": 0.33224254090471605, + "grad_norm": 1.2229498624801636, + "learning_rate": 0.00018671571607502168, + "loss": 1.2975, + "step": 8630 + }, + { + "epoch": 0.3324350336862368, + "grad_norm": 1.3551470041275024, + "learning_rate": 0.00018670065174057854, + "loss": 1.1592, + "step": 8635 + }, + { + "epoch": 0.33262752646775745, + "grad_norm": 0.8810299634933472, + "learning_rate": 0.0001866855794779351, + "loss": 1.1414, + "step": 8640 + }, + { + "epoch": 0.3328200192492782, + "grad_norm": 1.5907199382781982, + "learning_rate": 0.00018667049928846967, + "loss": 1.2191, + "step": 8645 + }, + { + "epoch": 0.33301251203079885, + "grad_norm": 2.042478561401367, + "learning_rate": 0.0001866554111735612, + "loss": 1.1619, + "step": 8650 + }, + { + "epoch": 0.3332050048123195, + "grad_norm": 1.6686564683914185, + "learning_rate": 0.00018664031513458942, + "loss": 1.2534, + "step": 8655 + }, + { + "epoch": 0.33339749759384024, + "grad_norm": 1.7643070220947266, + "learning_rate": 0.0001866252111729348, + "loss": 1.2631, + "step": 8660 + }, + { + "epoch": 0.3335899903753609, + "grad_norm": 1.4883722066879272, + "learning_rate": 0.0001866100992899784, + "loss": 1.1786, + "step": 8665 + }, + { + "epoch": 0.33378248315688164, + "grad_norm": 0.9850770235061646, + "learning_rate": 0.00018659497948710218, + "loss": 1.4181, + "step": 8670 + }, + { + "epoch": 0.3339749759384023, + "grad_norm": 0.9056932926177979, + "learning_rate": 0.00018657985176568875, + "loss": 1.0365, + "step": 8675 + }, + { + "epoch": 0.334167468719923, + "grad_norm": 1.9456449747085571, + "learning_rate": 0.00018656471612712137, + "loss": 1.227, + "step": 8680 + }, + { + "epoch": 0.3343599615014437, + "grad_norm": 1.289870262145996, + "learning_rate": 0.00018654957257278415, + "loss": 1.32, + "step": 8685 + }, + { + "epoch": 0.3345524542829644, + "grad_norm": 1.048143744468689, + "learning_rate": 0.00018653442110406189, + "loss": 1.2123, + "step": 8690 + }, + { + "epoch": 0.3347449470644851, + "grad_norm": 1.1696733236312866, + "learning_rate": 0.00018651926172234004, + "loss": 1.0226, + "step": 8695 + }, + { + "epoch": 0.33493743984600577, + "grad_norm": 1.4806257486343384, + "learning_rate": 0.00018650409442900486, + "loss": 1.1715, + "step": 8700 + }, + { + "epoch": 0.3351299326275265, + "grad_norm": 1.525719404220581, + "learning_rate": 0.00018648891922544325, + "loss": 1.2037, + "step": 8705 + }, + { + "epoch": 0.33532242540904716, + "grad_norm": 1.3378442525863647, + "learning_rate": 0.00018647373611304293, + "loss": 1.2188, + "step": 8710 + }, + { + "epoch": 0.33551491819056783, + "grad_norm": 0.870988130569458, + "learning_rate": 0.00018645854509319226, + "loss": 1.2153, + "step": 8715 + }, + { + "epoch": 0.33570741097208856, + "grad_norm": 1.5496007204055786, + "learning_rate": 0.00018644334616728042, + "loss": 1.1974, + "step": 8720 + }, + { + "epoch": 0.33589990375360923, + "grad_norm": 1.0248416662216187, + "learning_rate": 0.00018642813933669717, + "loss": 1.2845, + "step": 8725 + }, + { + "epoch": 0.33609239653512996, + "grad_norm": 1.9984816312789917, + "learning_rate": 0.00018641292460283313, + "loss": 1.3144, + "step": 8730 + }, + { + "epoch": 0.3362848893166506, + "grad_norm": 1.3114112615585327, + "learning_rate": 0.00018639770196707955, + "loss": 1.209, + "step": 8735 + }, + { + "epoch": 0.3364773820981713, + "grad_norm": 1.1683485507965088, + "learning_rate": 0.00018638247143082848, + "loss": 1.2688, + "step": 8740 + }, + { + "epoch": 0.336669874879692, + "grad_norm": 1.507900595664978, + "learning_rate": 0.0001863672329954726, + "loss": 1.1325, + "step": 8745 + }, + { + "epoch": 0.3368623676612127, + "grad_norm": 1.3393852710723877, + "learning_rate": 0.00018635198666240542, + "loss": 1.1573, + "step": 8750 + }, + { + "epoch": 0.3370548604427334, + "grad_norm": 1.0203709602355957, + "learning_rate": 0.00018633673243302108, + "loss": 1.2922, + "step": 8755 + }, + { + "epoch": 0.3372473532242541, + "grad_norm": 0.8483877778053284, + "learning_rate": 0.00018632147030871448, + "loss": 1.2252, + "step": 8760 + }, + { + "epoch": 0.33743984600577476, + "grad_norm": 0.983748197555542, + "learning_rate": 0.00018630620029088125, + "loss": 1.2027, + "step": 8765 + }, + { + "epoch": 0.3376323387872955, + "grad_norm": 1.2489101886749268, + "learning_rate": 0.00018629092238091775, + "loss": 1.1962, + "step": 8770 + }, + { + "epoch": 0.33782483156881615, + "grad_norm": 1.4553676843643188, + "learning_rate": 0.000186275636580221, + "loss": 1.3698, + "step": 8775 + }, + { + "epoch": 0.3380173243503369, + "grad_norm": 0.9494854807853699, + "learning_rate": 0.0001862603428901888, + "loss": 1.25, + "step": 8780 + }, + { + "epoch": 0.33820981713185755, + "grad_norm": 0.8667522072792053, + "learning_rate": 0.00018624504131221968, + "loss": 1.222, + "step": 8785 + }, + { + "epoch": 0.3384023099133783, + "grad_norm": 1.4215630292892456, + "learning_rate": 0.00018622973184771285, + "loss": 1.2592, + "step": 8790 + }, + { + "epoch": 0.33859480269489894, + "grad_norm": 0.9913888573646545, + "learning_rate": 0.00018621441449806828, + "loss": 1.2904, + "step": 8795 + }, + { + "epoch": 0.3387872954764196, + "grad_norm": 0.9612273573875427, + "learning_rate": 0.00018619908926468664, + "loss": 1.24, + "step": 8800 + }, + { + "epoch": 0.33897978825794034, + "grad_norm": 1.656568169593811, + "learning_rate": 0.00018618375614896926, + "loss": 1.1763, + "step": 8805 + }, + { + "epoch": 0.339172281039461, + "grad_norm": 1.4496088027954102, + "learning_rate": 0.0001861684151523183, + "loss": 1.2045, + "step": 8810 + }, + { + "epoch": 0.33936477382098174, + "grad_norm": 1.3886058330535889, + "learning_rate": 0.0001861530662761366, + "loss": 1.3111, + "step": 8815 + }, + { + "epoch": 0.3395572666025024, + "grad_norm": 1.644887089729309, + "learning_rate": 0.0001861377095218277, + "loss": 1.3172, + "step": 8820 + }, + { + "epoch": 0.3397497593840231, + "grad_norm": 1.1925910711288452, + "learning_rate": 0.00018612234489079587, + "loss": 1.3268, + "step": 8825 + }, + { + "epoch": 0.3399422521655438, + "grad_norm": 1.1367309093475342, + "learning_rate": 0.0001861069723844461, + "loss": 1.1209, + "step": 8830 + }, + { + "epoch": 0.34013474494706447, + "grad_norm": 1.0649480819702148, + "learning_rate": 0.00018609159200418414, + "loss": 1.1514, + "step": 8835 + }, + { + "epoch": 0.3403272377285852, + "grad_norm": 1.1887884140014648, + "learning_rate": 0.00018607620375141637, + "loss": 1.1026, + "step": 8840 + }, + { + "epoch": 0.34051973051010587, + "grad_norm": 1.9125694036483765, + "learning_rate": 0.00018606080762754995, + "loss": 1.4718, + "step": 8845 + }, + { + "epoch": 0.34071222329162654, + "grad_norm": 1.1742594242095947, + "learning_rate": 0.00018604540363399282, + "loss": 1.3206, + "step": 8850 + }, + { + "epoch": 0.34090471607314726, + "grad_norm": 1.504146695137024, + "learning_rate": 0.0001860299917721535, + "loss": 1.1639, + "step": 8855 + }, + { + "epoch": 0.34109720885466793, + "grad_norm": 0.8869237899780273, + "learning_rate": 0.00018601457204344131, + "loss": 1.2674, + "step": 8860 + }, + { + "epoch": 0.34128970163618866, + "grad_norm": 0.8492304682731628, + "learning_rate": 0.00018599914444926636, + "loss": 1.2732, + "step": 8865 + }, + { + "epoch": 0.34148219441770933, + "grad_norm": 1.1681571006774902, + "learning_rate": 0.00018598370899103932, + "loss": 1.2995, + "step": 8870 + }, + { + "epoch": 0.34167468719923005, + "grad_norm": 1.6912837028503418, + "learning_rate": 0.00018596826567017166, + "loss": 1.3217, + "step": 8875 + }, + { + "epoch": 0.3418671799807507, + "grad_norm": 1.0427602529525757, + "learning_rate": 0.0001859528144880756, + "loss": 1.05, + "step": 8880 + }, + { + "epoch": 0.3420596727622714, + "grad_norm": 1.9644991159439087, + "learning_rate": 0.00018593735544616404, + "loss": 1.1087, + "step": 8885 + }, + { + "epoch": 0.3422521655437921, + "grad_norm": 1.966264247894287, + "learning_rate": 0.0001859218885458506, + "loss": 1.2221, + "step": 8890 + }, + { + "epoch": 0.3424446583253128, + "grad_norm": 1.9770557880401611, + "learning_rate": 0.00018590641378854965, + "loss": 1.2489, + "step": 8895 + }, + { + "epoch": 0.3426371511068335, + "grad_norm": 1.4175180196762085, + "learning_rate": 0.00018589093117567625, + "loss": 1.1292, + "step": 8900 + }, + { + "epoch": 0.3428296438883542, + "grad_norm": 1.066177487373352, + "learning_rate": 0.00018587544070864612, + "loss": 1.1182, + "step": 8905 + }, + { + "epoch": 0.34302213666987486, + "grad_norm": 2.6207172870635986, + "learning_rate": 0.00018585994238887586, + "loss": 1.1, + "step": 8910 + }, + { + "epoch": 0.3432146294513956, + "grad_norm": 1.6905888319015503, + "learning_rate": 0.0001858444362177826, + "loss": 1.3135, + "step": 8915 + }, + { + "epoch": 0.34340712223291625, + "grad_norm": 1.117883324623108, + "learning_rate": 0.00018582892219678435, + "loss": 1.3394, + "step": 8920 + }, + { + "epoch": 0.343599615014437, + "grad_norm": 1.549805760383606, + "learning_rate": 0.00018581340032729972, + "loss": 1.1957, + "step": 8925 + }, + { + "epoch": 0.34379210779595765, + "grad_norm": 1.165260672569275, + "learning_rate": 0.00018579787061074807, + "loss": 1.2406, + "step": 8930 + }, + { + "epoch": 0.34398460057747837, + "grad_norm": 1.1872533559799194, + "learning_rate": 0.00018578233304854952, + "loss": 1.1831, + "step": 8935 + }, + { + "epoch": 0.34417709335899904, + "grad_norm": 0.8727648854255676, + "learning_rate": 0.00018576678764212489, + "loss": 1.2645, + "step": 8940 + }, + { + "epoch": 0.3443695861405197, + "grad_norm": 1.1179304122924805, + "learning_rate": 0.00018575123439289567, + "loss": 1.297, + "step": 8945 + }, + { + "epoch": 0.34456207892204044, + "grad_norm": 1.9064927101135254, + "learning_rate": 0.0001857356733022841, + "loss": 1.3917, + "step": 8950 + }, + { + "epoch": 0.3447545717035611, + "grad_norm": 2.100154399871826, + "learning_rate": 0.00018572010437171315, + "loss": 1.1723, + "step": 8955 + }, + { + "epoch": 0.34494706448508183, + "grad_norm": 1.0105838775634766, + "learning_rate": 0.00018570452760260654, + "loss": 1.0851, + "step": 8960 + }, + { + "epoch": 0.3451395572666025, + "grad_norm": 1.760038137435913, + "learning_rate": 0.0001856889429963886, + "loss": 1.0612, + "step": 8965 + }, + { + "epoch": 0.3453320500481232, + "grad_norm": 1.5740501880645752, + "learning_rate": 0.00018567335055448444, + "loss": 1.117, + "step": 8970 + }, + { + "epoch": 0.3455245428296439, + "grad_norm": 1.4148597717285156, + "learning_rate": 0.00018565775027831993, + "loss": 1.2003, + "step": 8975 + }, + { + "epoch": 0.34571703561116457, + "grad_norm": 1.2243534326553345, + "learning_rate": 0.00018564214216932159, + "loss": 1.2106, + "step": 8980 + }, + { + "epoch": 0.3459095283926853, + "grad_norm": 1.3532603979110718, + "learning_rate": 0.00018562652622891666, + "loss": 1.1703, + "step": 8985 + }, + { + "epoch": 0.34610202117420596, + "grad_norm": 1.6701220273971558, + "learning_rate": 0.00018561090245853315, + "loss": 1.2409, + "step": 8990 + }, + { + "epoch": 0.34629451395572663, + "grad_norm": 1.6342322826385498, + "learning_rate": 0.00018559527085959968, + "loss": 1.2981, + "step": 8995 + }, + { + "epoch": 0.34648700673724736, + "grad_norm": 2.4354701042175293, + "learning_rate": 0.00018557963143354576, + "loss": 1.1021, + "step": 9000 + }, + { + "epoch": 0.34667949951876803, + "grad_norm": 1.5688186883926392, + "learning_rate": 0.00018556398418180146, + "loss": 1.2649, + "step": 9005 + }, + { + "epoch": 0.34687199230028876, + "grad_norm": 2.2158894538879395, + "learning_rate": 0.0001855483291057976, + "loss": 1.2335, + "step": 9010 + }, + { + "epoch": 0.3470644850818094, + "grad_norm": 1.7294437885284424, + "learning_rate": 0.00018553266620696573, + "loss": 1.3235, + "step": 9015 + }, + { + "epoch": 0.34725697786333015, + "grad_norm": 1.1023756265640259, + "learning_rate": 0.00018551699548673814, + "loss": 1.3515, + "step": 9020 + }, + { + "epoch": 0.3474494706448508, + "grad_norm": 1.4505863189697266, + "learning_rate": 0.00018550131694654784, + "loss": 1.3773, + "step": 9025 + }, + { + "epoch": 0.3476419634263715, + "grad_norm": 2.221957206726074, + "learning_rate": 0.00018548563058782847, + "loss": 1.0896, + "step": 9030 + }, + { + "epoch": 0.3478344562078922, + "grad_norm": 0.917010486125946, + "learning_rate": 0.0001854699364120145, + "loss": 1.1569, + "step": 9035 + }, + { + "epoch": 0.3480269489894129, + "grad_norm": 1.4631186723709106, + "learning_rate": 0.00018545423442054105, + "loss": 1.2169, + "step": 9040 + }, + { + "epoch": 0.3482194417709336, + "grad_norm": 1.0917268991470337, + "learning_rate": 0.0001854385246148439, + "loss": 1.2425, + "step": 9045 + }, + { + "epoch": 0.3484119345524543, + "grad_norm": 1.5985426902770996, + "learning_rate": 0.00018542280699635968, + "loss": 1.0944, + "step": 9050 + }, + { + "epoch": 0.34860442733397495, + "grad_norm": 1.5402495861053467, + "learning_rate": 0.0001854070815665256, + "loss": 1.1497, + "step": 9055 + }, + { + "epoch": 0.3487969201154957, + "grad_norm": 1.211295485496521, + "learning_rate": 0.00018539134832677972, + "loss": 1.0403, + "step": 9060 + }, + { + "epoch": 0.34898941289701635, + "grad_norm": 1.0569374561309814, + "learning_rate": 0.00018537560727856068, + "loss": 1.2886, + "step": 9065 + }, + { + "epoch": 0.3491819056785371, + "grad_norm": 1.550212025642395, + "learning_rate": 0.00018535985842330793, + "loss": 1.2654, + "step": 9070 + }, + { + "epoch": 0.34937439846005774, + "grad_norm": 1.7941083908081055, + "learning_rate": 0.00018534410176246154, + "loss": 1.2757, + "step": 9075 + }, + { + "epoch": 0.3495668912415784, + "grad_norm": 0.9004856944084167, + "learning_rate": 0.00018532833729746243, + "loss": 1.2045, + "step": 9080 + }, + { + "epoch": 0.34975938402309914, + "grad_norm": 0.9916037321090698, + "learning_rate": 0.00018531256502975216, + "loss": 1.1788, + "step": 9085 + }, + { + "epoch": 0.3499518768046198, + "grad_norm": 1.0524908304214478, + "learning_rate": 0.00018529678496077292, + "loss": 1.3298, + "step": 9090 + }, + { + "epoch": 0.35014436958614054, + "grad_norm": 2.7244019508361816, + "learning_rate": 0.00018528099709196774, + "loss": 1.3274, + "step": 9095 + }, + { + "epoch": 0.3503368623676612, + "grad_norm": 1.4286680221557617, + "learning_rate": 0.0001852652014247803, + "loss": 1.193, + "step": 9100 + }, + { + "epoch": 0.35052935514918193, + "grad_norm": 1.0943810939788818, + "learning_rate": 0.00018524939796065503, + "loss": 1.2953, + "step": 9105 + }, + { + "epoch": 0.3507218479307026, + "grad_norm": 1.1513092517852783, + "learning_rate": 0.00018523358670103704, + "loss": 1.3436, + "step": 9110 + }, + { + "epoch": 0.35091434071222327, + "grad_norm": 2.142829656600952, + "learning_rate": 0.00018521776764737218, + "loss": 1.2998, + "step": 9115 + }, + { + "epoch": 0.351106833493744, + "grad_norm": 0.9734616875648499, + "learning_rate": 0.00018520194080110699, + "loss": 1.2794, + "step": 9120 + }, + { + "epoch": 0.35129932627526467, + "grad_norm": 1.0793628692626953, + "learning_rate": 0.00018518610616368868, + "loss": 1.2574, + "step": 9125 + }, + { + "epoch": 0.3514918190567854, + "grad_norm": 2.409484386444092, + "learning_rate": 0.00018517026373656532, + "loss": 1.1601, + "step": 9130 + }, + { + "epoch": 0.35168431183830606, + "grad_norm": 1.1166318655014038, + "learning_rate": 0.0001851544135211855, + "loss": 1.2705, + "step": 9135 + }, + { + "epoch": 0.35187680461982673, + "grad_norm": 1.183131217956543, + "learning_rate": 0.0001851385555189987, + "loss": 1.132, + "step": 9140 + }, + { + "epoch": 0.35206929740134746, + "grad_norm": 1.3792176246643066, + "learning_rate": 0.00018512268973145497, + "loss": 1.1271, + "step": 9145 + }, + { + "epoch": 0.35226179018286813, + "grad_norm": 1.3978809118270874, + "learning_rate": 0.00018510681616000513, + "loss": 1.3828, + "step": 9150 + }, + { + "epoch": 0.35245428296438885, + "grad_norm": 1.0242118835449219, + "learning_rate": 0.00018509093480610078, + "loss": 1.1982, + "step": 9155 + }, + { + "epoch": 0.3526467757459095, + "grad_norm": 1.326621174812317, + "learning_rate": 0.00018507504567119408, + "loss": 1.0175, + "step": 9160 + }, + { + "epoch": 0.3528392685274302, + "grad_norm": 1.1905460357666016, + "learning_rate": 0.00018505914875673805, + "loss": 1.3367, + "step": 9165 + }, + { + "epoch": 0.3530317613089509, + "grad_norm": 1.5423171520233154, + "learning_rate": 0.0001850432440641863, + "loss": 1.1721, + "step": 9170 + }, + { + "epoch": 0.3532242540904716, + "grad_norm": 1.0577900409698486, + "learning_rate": 0.00018502733159499326, + "loss": 1.2173, + "step": 9175 + }, + { + "epoch": 0.3534167468719923, + "grad_norm": 0.8053417205810547, + "learning_rate": 0.000185011411350614, + "loss": 1.1492, + "step": 9180 + }, + { + "epoch": 0.353609239653513, + "grad_norm": 1.076053261756897, + "learning_rate": 0.0001849954833325043, + "loss": 1.2117, + "step": 9185 + }, + { + "epoch": 0.3538017324350337, + "grad_norm": 1.206359624862671, + "learning_rate": 0.0001849795475421207, + "loss": 1.1659, + "step": 9190 + }, + { + "epoch": 0.3539942252165544, + "grad_norm": 1.4652369022369385, + "learning_rate": 0.00018496360398092046, + "loss": 1.2605, + "step": 9195 + }, + { + "epoch": 0.35418671799807505, + "grad_norm": 1.158055067062378, + "learning_rate": 0.00018494765265036144, + "loss": 1.414, + "step": 9200 + }, + { + "epoch": 0.3543792107795958, + "grad_norm": 2.4634461402893066, + "learning_rate": 0.0001849316935519023, + "loss": 1.1982, + "step": 9205 + }, + { + "epoch": 0.35457170356111645, + "grad_norm": 1.875139594078064, + "learning_rate": 0.00018491572668700242, + "loss": 1.4133, + "step": 9210 + }, + { + "epoch": 0.3547641963426372, + "grad_norm": 1.0054875612258911, + "learning_rate": 0.00018489975205712185, + "loss": 1.2294, + "step": 9215 + }, + { + "epoch": 0.35495668912415784, + "grad_norm": 2.2620842456817627, + "learning_rate": 0.00018488376966372134, + "loss": 1.2672, + "step": 9220 + }, + { + "epoch": 0.3551491819056785, + "grad_norm": 1.584251880645752, + "learning_rate": 0.00018486777950826243, + "loss": 1.4366, + "step": 9225 + }, + { + "epoch": 0.35534167468719924, + "grad_norm": 1.6498923301696777, + "learning_rate": 0.00018485178159220725, + "loss": 1.3502, + "step": 9230 + }, + { + "epoch": 0.3555341674687199, + "grad_norm": 1.6700108051300049, + "learning_rate": 0.00018483577591701876, + "loss": 1.2462, + "step": 9235 + }, + { + "epoch": 0.35572666025024063, + "grad_norm": 1.6976680755615234, + "learning_rate": 0.00018481976248416052, + "loss": 1.4637, + "step": 9240 + }, + { + "epoch": 0.3559191530317613, + "grad_norm": 0.9686551094055176, + "learning_rate": 0.0001848037412950969, + "loss": 1.1902, + "step": 9245 + }, + { + "epoch": 0.35611164581328203, + "grad_norm": 1.2102336883544922, + "learning_rate": 0.00018478771235129292, + "loss": 1.586, + "step": 9250 + }, + { + "epoch": 0.3563041385948027, + "grad_norm": 1.7220674753189087, + "learning_rate": 0.0001847716756542143, + "loss": 1.2324, + "step": 9255 + }, + { + "epoch": 0.35649663137632337, + "grad_norm": 1.7433216571807861, + "learning_rate": 0.0001847556312053275, + "loss": 1.4454, + "step": 9260 + }, + { + "epoch": 0.3566891241578441, + "grad_norm": 0.9930455088615417, + "learning_rate": 0.0001847395790060997, + "loss": 1.1601, + "step": 9265 + }, + { + "epoch": 0.35688161693936477, + "grad_norm": 1.1169023513793945, + "learning_rate": 0.00018472351905799873, + "loss": 1.2534, + "step": 9270 + }, + { + "epoch": 0.3570741097208855, + "grad_norm": 1.238748550415039, + "learning_rate": 0.00018470745136249316, + "loss": 1.2174, + "step": 9275 + }, + { + "epoch": 0.35726660250240616, + "grad_norm": 2.130223035812378, + "learning_rate": 0.00018469137592105235, + "loss": 1.3975, + "step": 9280 + }, + { + "epoch": 0.35745909528392683, + "grad_norm": 1.4341787099838257, + "learning_rate": 0.0001846752927351462, + "loss": 1.1725, + "step": 9285 + }, + { + "epoch": 0.35765158806544756, + "grad_norm": 1.948145866394043, + "learning_rate": 0.00018465920180624548, + "loss": 1.2741, + "step": 9290 + }, + { + "epoch": 0.3578440808469682, + "grad_norm": 1.0314382314682007, + "learning_rate": 0.00018464310313582157, + "loss": 1.0998, + "step": 9295 + }, + { + "epoch": 0.35803657362848895, + "grad_norm": 1.0461472272872925, + "learning_rate": 0.0001846269967253466, + "loss": 1.1953, + "step": 9300 + }, + { + "epoch": 0.3582290664100096, + "grad_norm": 1.781084656715393, + "learning_rate": 0.00018461088257629334, + "loss": 1.3629, + "step": 9305 + }, + { + "epoch": 0.3584215591915303, + "grad_norm": 1.9082306623458862, + "learning_rate": 0.00018459476069013537, + "loss": 1.2675, + "step": 9310 + }, + { + "epoch": 0.358614051973051, + "grad_norm": 1.803348422050476, + "learning_rate": 0.00018457863106834693, + "loss": 1.2303, + "step": 9315 + }, + { + "epoch": 0.3588065447545717, + "grad_norm": 1.5346139669418335, + "learning_rate": 0.000184562493712403, + "loss": 1.3354, + "step": 9320 + }, + { + "epoch": 0.3589990375360924, + "grad_norm": 1.3731290102005005, + "learning_rate": 0.00018454634862377916, + "loss": 1.4874, + "step": 9325 + }, + { + "epoch": 0.3591915303176131, + "grad_norm": 1.186759352684021, + "learning_rate": 0.0001845301958039518, + "loss": 1.29, + "step": 9330 + }, + { + "epoch": 0.3593840230991338, + "grad_norm": 3.729174852371216, + "learning_rate": 0.00018451403525439802, + "loss": 1.2589, + "step": 9335 + }, + { + "epoch": 0.3595765158806545, + "grad_norm": 2.46051025390625, + "learning_rate": 0.00018449786697659554, + "loss": 1.1818, + "step": 9340 + }, + { + "epoch": 0.35976900866217515, + "grad_norm": 1.6652323007583618, + "learning_rate": 0.00018448169097202288, + "loss": 1.2719, + "step": 9345 + }, + { + "epoch": 0.3599615014436959, + "grad_norm": 1.375410556793213, + "learning_rate": 0.00018446550724215922, + "loss": 1.2687, + "step": 9350 + }, + { + "epoch": 0.36015399422521654, + "grad_norm": 1.9113675355911255, + "learning_rate": 0.00018444931578848447, + "loss": 1.2475, + "step": 9355 + }, + { + "epoch": 0.36034648700673727, + "grad_norm": 1.8949065208435059, + "learning_rate": 0.0001844331166124792, + "loss": 1.3439, + "step": 9360 + }, + { + "epoch": 0.36053897978825794, + "grad_norm": 1.0940630435943604, + "learning_rate": 0.00018441690971562476, + "loss": 1.203, + "step": 9365 + }, + { + "epoch": 0.3607314725697786, + "grad_norm": 1.2999101877212524, + "learning_rate": 0.00018440069509940315, + "loss": 1.2729, + "step": 9370 + }, + { + "epoch": 0.36092396535129934, + "grad_norm": 1.3675721883773804, + "learning_rate": 0.00018438447276529702, + "loss": 1.2024, + "step": 9375 + }, + { + "epoch": 0.36111645813282, + "grad_norm": 1.6651533842086792, + "learning_rate": 0.00018436824271478988, + "loss": 1.2235, + "step": 9380 + }, + { + "epoch": 0.36130895091434073, + "grad_norm": 2.16670823097229, + "learning_rate": 0.00018435200494936585, + "loss": 1.4486, + "step": 9385 + }, + { + "epoch": 0.3615014436958614, + "grad_norm": 1.3305730819702148, + "learning_rate": 0.00018433575947050972, + "loss": 1.2003, + "step": 9390 + }, + { + "epoch": 0.36169393647738207, + "grad_norm": 1.5913615226745605, + "learning_rate": 0.00018431950627970708, + "loss": 1.2722, + "step": 9395 + }, + { + "epoch": 0.3618864292589028, + "grad_norm": 0.9965779781341553, + "learning_rate": 0.00018430324537844415, + "loss": 1.0604, + "step": 9400 + }, + { + "epoch": 0.36207892204042347, + "grad_norm": 1.7614198923110962, + "learning_rate": 0.00018428697676820788, + "loss": 1.2734, + "step": 9405 + }, + { + "epoch": 0.3622714148219442, + "grad_norm": 1.190706491470337, + "learning_rate": 0.00018427070045048594, + "loss": 1.2309, + "step": 9410 + }, + { + "epoch": 0.36246390760346486, + "grad_norm": 1.1487165689468384, + "learning_rate": 0.00018425441642676667, + "loss": 1.2049, + "step": 9415 + }, + { + "epoch": 0.3626564003849856, + "grad_norm": 1.0437067747116089, + "learning_rate": 0.00018423812469853918, + "loss": 1.3632, + "step": 9420 + }, + { + "epoch": 0.36284889316650626, + "grad_norm": 1.7774686813354492, + "learning_rate": 0.00018422182526729318, + "loss": 1.1797, + "step": 9425 + }, + { + "epoch": 0.36304138594802693, + "grad_norm": 1.3748910427093506, + "learning_rate": 0.0001842055181345192, + "loss": 1.4438, + "step": 9430 + }, + { + "epoch": 0.36323387872954765, + "grad_norm": 0.891248881816864, + "learning_rate": 0.00018418920330170842, + "loss": 1.3017, + "step": 9435 + }, + { + "epoch": 0.3634263715110683, + "grad_norm": 1.5410393476486206, + "learning_rate": 0.00018417288077035267, + "loss": 1.2239, + "step": 9440 + }, + { + "epoch": 0.36361886429258905, + "grad_norm": 1.3638213872909546, + "learning_rate": 0.00018415655054194457, + "loss": 1.2245, + "step": 9445 + }, + { + "epoch": 0.3638113570741097, + "grad_norm": 1.84505033493042, + "learning_rate": 0.00018414021261797743, + "loss": 1.1362, + "step": 9450 + }, + { + "epoch": 0.3640038498556304, + "grad_norm": 1.5999794006347656, + "learning_rate": 0.00018412386699994518, + "loss": 1.1647, + "step": 9455 + }, + { + "epoch": 0.3641963426371511, + "grad_norm": 1.55308997631073, + "learning_rate": 0.0001841075136893426, + "loss": 1.2612, + "step": 9460 + }, + { + "epoch": 0.3643888354186718, + "grad_norm": 1.3549528121948242, + "learning_rate": 0.00018409115268766505, + "loss": 1.2095, + "step": 9465 + }, + { + "epoch": 0.3645813282001925, + "grad_norm": 1.123184323310852, + "learning_rate": 0.00018407478399640862, + "loss": 1.3047, + "step": 9470 + }, + { + "epoch": 0.3647738209817132, + "grad_norm": 1.3776748180389404, + "learning_rate": 0.00018405840761707016, + "loss": 1.1064, + "step": 9475 + }, + { + "epoch": 0.36496631376323385, + "grad_norm": 1.3778200149536133, + "learning_rate": 0.00018404202355114718, + "loss": 1.0956, + "step": 9480 + }, + { + "epoch": 0.3651588065447546, + "grad_norm": 0.9069898128509521, + "learning_rate": 0.00018402563180013783, + "loss": 1.141, + "step": 9485 + }, + { + "epoch": 0.36535129932627525, + "grad_norm": 1.3908804655075073, + "learning_rate": 0.0001840092323655411, + "loss": 1.2679, + "step": 9490 + }, + { + "epoch": 0.365543792107796, + "grad_norm": 1.3785732984542847, + "learning_rate": 0.00018399282524885654, + "loss": 1.22, + "step": 9495 + }, + { + "epoch": 0.36573628488931664, + "grad_norm": 1.1326193809509277, + "learning_rate": 0.00018397641045158453, + "loss": 1.2289, + "step": 9500 + }, + { + "epoch": 0.36592877767083737, + "grad_norm": 1.2267814874649048, + "learning_rate": 0.0001839599879752261, + "loss": 1.1337, + "step": 9505 + }, + { + "epoch": 0.36612127045235804, + "grad_norm": 0.8690314888954163, + "learning_rate": 0.00018394355782128295, + "loss": 1.2535, + "step": 9510 + }, + { + "epoch": 0.3663137632338787, + "grad_norm": 1.448415994644165, + "learning_rate": 0.00018392711999125748, + "loss": 1.1405, + "step": 9515 + }, + { + "epoch": 0.36650625601539943, + "grad_norm": 1.8989317417144775, + "learning_rate": 0.00018391067448665288, + "loss": 1.091, + "step": 9520 + }, + { + "epoch": 0.3666987487969201, + "grad_norm": 1.2263299226760864, + "learning_rate": 0.00018389422130897295, + "loss": 1.1925, + "step": 9525 + }, + { + "epoch": 0.36689124157844083, + "grad_norm": 0.8818153142929077, + "learning_rate": 0.00018387776045972225, + "loss": 1.2961, + "step": 9530 + }, + { + "epoch": 0.3670837343599615, + "grad_norm": 1.0975017547607422, + "learning_rate": 0.00018386129194040597, + "loss": 1.414, + "step": 9535 + }, + { + "epoch": 0.36727622714148217, + "grad_norm": 2.2097692489624023, + "learning_rate": 0.00018384481575253004, + "loss": 1.1941, + "step": 9540 + }, + { + "epoch": 0.3674687199230029, + "grad_norm": 1.2249376773834229, + "learning_rate": 0.0001838283318976012, + "loss": 1.4472, + "step": 9545 + }, + { + "epoch": 0.36766121270452357, + "grad_norm": 1.0000889301300049, + "learning_rate": 0.0001838118403771267, + "loss": 1.2399, + "step": 9550 + }, + { + "epoch": 0.3678537054860443, + "grad_norm": 1.0249544382095337, + "learning_rate": 0.00018379534119261458, + "loss": 1.3182, + "step": 9555 + }, + { + "epoch": 0.36804619826756496, + "grad_norm": 1.2347283363342285, + "learning_rate": 0.00018377883434557362, + "loss": 1.1313, + "step": 9560 + }, + { + "epoch": 0.3682386910490857, + "grad_norm": 1.1021714210510254, + "learning_rate": 0.0001837623198375132, + "loss": 1.2381, + "step": 9565 + }, + { + "epoch": 0.36843118383060636, + "grad_norm": 1.0923985242843628, + "learning_rate": 0.00018374579766994355, + "loss": 1.3386, + "step": 9570 + }, + { + "epoch": 0.368623676612127, + "grad_norm": 1.7709978818893433, + "learning_rate": 0.00018372926784437547, + "loss": 1.2405, + "step": 9575 + }, + { + "epoch": 0.36881616939364775, + "grad_norm": 1.316901683807373, + "learning_rate": 0.00018371273036232047, + "loss": 1.1244, + "step": 9580 + }, + { + "epoch": 0.3690086621751684, + "grad_norm": 1.7281345129013062, + "learning_rate": 0.00018369618522529085, + "loss": 1.2979, + "step": 9585 + }, + { + "epoch": 0.36920115495668915, + "grad_norm": 1.6363762617111206, + "learning_rate": 0.00018367963243479953, + "loss": 1.1528, + "step": 9590 + }, + { + "epoch": 0.3693936477382098, + "grad_norm": 1.7078179121017456, + "learning_rate": 0.00018366307199236013, + "loss": 1.2833, + "step": 9595 + }, + { + "epoch": 0.3695861405197305, + "grad_norm": 1.9110232591629028, + "learning_rate": 0.000183646503899487, + "loss": 1.4191, + "step": 9600 + }, + { + "epoch": 0.3697786333012512, + "grad_norm": 0.952301025390625, + "learning_rate": 0.00018362992815769525, + "loss": 1.1504, + "step": 9605 + }, + { + "epoch": 0.3699711260827719, + "grad_norm": 0.9142165780067444, + "learning_rate": 0.0001836133447685005, + "loss": 1.2617, + "step": 9610 + }, + { + "epoch": 0.3701636188642926, + "grad_norm": 1.5571134090423584, + "learning_rate": 0.0001835967537334193, + "loss": 1.3054, + "step": 9615 + }, + { + "epoch": 0.3703561116458133, + "grad_norm": 1.799795389175415, + "learning_rate": 0.00018358015505396877, + "loss": 1.0603, + "step": 9620 + }, + { + "epoch": 0.37054860442733395, + "grad_norm": 1.6660315990447998, + "learning_rate": 0.0001835635487316667, + "loss": 1.1757, + "step": 9625 + }, + { + "epoch": 0.3707410972088547, + "grad_norm": 0.9840423464775085, + "learning_rate": 0.00018354693476803168, + "loss": 0.9815, + "step": 9630 + }, + { + "epoch": 0.37093358999037535, + "grad_norm": 2.0538954734802246, + "learning_rate": 0.00018353031316458286, + "loss": 1.2396, + "step": 9635 + }, + { + "epoch": 0.37112608277189607, + "grad_norm": 1.2079198360443115, + "learning_rate": 0.0001835136839228403, + "loss": 1.2731, + "step": 9640 + }, + { + "epoch": 0.37131857555341674, + "grad_norm": 1.7076921463012695, + "learning_rate": 0.00018349704704432457, + "loss": 1.1388, + "step": 9645 + }, + { + "epoch": 0.37151106833493747, + "grad_norm": 1.0324435234069824, + "learning_rate": 0.00018348040253055698, + "loss": 0.9949, + "step": 9650 + }, + { + "epoch": 0.37170356111645814, + "grad_norm": 1.3635584115982056, + "learning_rate": 0.0001834637503830596, + "loss": 1.307, + "step": 9655 + }, + { + "epoch": 0.3718960538979788, + "grad_norm": 1.6683429479599, + "learning_rate": 0.00018344709060335513, + "loss": 1.1687, + "step": 9660 + }, + { + "epoch": 0.37208854667949953, + "grad_norm": 2.3687121868133545, + "learning_rate": 0.00018343042319296702, + "loss": 1.4163, + "step": 9665 + }, + { + "epoch": 0.3722810394610202, + "grad_norm": 1.9078242778778076, + "learning_rate": 0.00018341374815341937, + "loss": 1.2986, + "step": 9670 + }, + { + "epoch": 0.37247353224254093, + "grad_norm": 1.6381220817565918, + "learning_rate": 0.00018339706548623706, + "loss": 1.5092, + "step": 9675 + }, + { + "epoch": 0.3726660250240616, + "grad_norm": 1.3529161214828491, + "learning_rate": 0.00018338037519294553, + "loss": 1.2296, + "step": 9680 + }, + { + "epoch": 0.37285851780558227, + "grad_norm": 1.1034053564071655, + "learning_rate": 0.00018336367727507104, + "loss": 1.2774, + "step": 9685 + }, + { + "epoch": 0.373051010587103, + "grad_norm": 2.0935397148132324, + "learning_rate": 0.0001833469717341405, + "loss": 1.2247, + "step": 9690 + }, + { + "epoch": 0.37324350336862366, + "grad_norm": 1.6294866800308228, + "learning_rate": 0.0001833302585716815, + "loss": 1.3766, + "step": 9695 + }, + { + "epoch": 0.3734359961501444, + "grad_norm": 1.6927978992462158, + "learning_rate": 0.0001833135377892224, + "loss": 1.3069, + "step": 9700 + }, + { + "epoch": 0.37362848893166506, + "grad_norm": 0.8497247695922852, + "learning_rate": 0.00018329680938829212, + "loss": 1.0906, + "step": 9705 + }, + { + "epoch": 0.37382098171318573, + "grad_norm": 1.9347554445266724, + "learning_rate": 0.00018328007337042046, + "loss": 1.277, + "step": 9710 + }, + { + "epoch": 0.37401347449470645, + "grad_norm": 1.023130178451538, + "learning_rate": 0.00018326332973713776, + "loss": 1.254, + "step": 9715 + }, + { + "epoch": 0.3742059672762271, + "grad_norm": 1.7206385135650635, + "learning_rate": 0.0001832465784899751, + "loss": 1.2141, + "step": 9720 + }, + { + "epoch": 0.37439846005774785, + "grad_norm": 1.2445294857025146, + "learning_rate": 0.00018322981963046433, + "loss": 1.3817, + "step": 9725 + }, + { + "epoch": 0.3745909528392685, + "grad_norm": 1.832334280014038, + "learning_rate": 0.00018321305316013788, + "loss": 1.3584, + "step": 9730 + }, + { + "epoch": 0.37478344562078925, + "grad_norm": 1.2087010145187378, + "learning_rate": 0.00018319627908052898, + "loss": 1.116, + "step": 9735 + }, + { + "epoch": 0.3749759384023099, + "grad_norm": 1.286687970161438, + "learning_rate": 0.00018317949739317147, + "loss": 1.1913, + "step": 9740 + }, + { + "epoch": 0.3751684311838306, + "grad_norm": 1.44833242893219, + "learning_rate": 0.00018316270809959993, + "loss": 1.2713, + "step": 9745 + }, + { + "epoch": 0.3753609239653513, + "grad_norm": 1.1395667791366577, + "learning_rate": 0.00018314591120134963, + "loss": 1.2912, + "step": 9750 + }, + { + "epoch": 0.375553416746872, + "grad_norm": 1.1399837732315063, + "learning_rate": 0.00018312910669995654, + "loss": 1.2804, + "step": 9755 + }, + { + "epoch": 0.3757459095283927, + "grad_norm": 1.814249038696289, + "learning_rate": 0.00018311229459695735, + "loss": 1.1062, + "step": 9760 + }, + { + "epoch": 0.3759384023099134, + "grad_norm": 1.4851144552230835, + "learning_rate": 0.00018309547489388933, + "loss": 1.2826, + "step": 9765 + }, + { + "epoch": 0.37613089509143405, + "grad_norm": 0.9308827519416809, + "learning_rate": 0.00018307864759229065, + "loss": 1.3706, + "step": 9770 + }, + { + "epoch": 0.3763233878729548, + "grad_norm": 3.707566261291504, + "learning_rate": 0.00018306181269369998, + "loss": 1.2292, + "step": 9775 + }, + { + "epoch": 0.37651588065447544, + "grad_norm": 2.6666324138641357, + "learning_rate": 0.00018304497019965677, + "loss": 1.4645, + "step": 9780 + }, + { + "epoch": 0.37670837343599617, + "grad_norm": 1.5997512340545654, + "learning_rate": 0.00018302812011170114, + "loss": 1.2812, + "step": 9785 + }, + { + "epoch": 0.37690086621751684, + "grad_norm": 0.8998873233795166, + "learning_rate": 0.00018301126243137395, + "loss": 1.195, + "step": 9790 + }, + { + "epoch": 0.3770933589990375, + "grad_norm": 1.407524585723877, + "learning_rate": 0.0001829943971602167, + "loss": 1.1793, + "step": 9795 + }, + { + "epoch": 0.37728585178055823, + "grad_norm": 1.1469497680664062, + "learning_rate": 0.00018297752429977164, + "loss": 1.3624, + "step": 9800 + }, + { + "epoch": 0.3774783445620789, + "grad_norm": 1.4583423137664795, + "learning_rate": 0.00018296064385158164, + "loss": 1.2033, + "step": 9805 + }, + { + "epoch": 0.37767083734359963, + "grad_norm": 1.0782575607299805, + "learning_rate": 0.00018294375581719036, + "loss": 1.1823, + "step": 9810 + }, + { + "epoch": 0.3778633301251203, + "grad_norm": 1.1890922784805298, + "learning_rate": 0.00018292686019814202, + "loss": 1.2711, + "step": 9815 + }, + { + "epoch": 0.378055822906641, + "grad_norm": 0.854491651058197, + "learning_rate": 0.00018290995699598165, + "loss": 1.1953, + "step": 9820 + }, + { + "epoch": 0.3782483156881617, + "grad_norm": 1.2184374332427979, + "learning_rate": 0.00018289304621225497, + "loss": 1.2052, + "step": 9825 + }, + { + "epoch": 0.37844080846968237, + "grad_norm": 1.1952948570251465, + "learning_rate": 0.0001828761278485083, + "loss": 1.2516, + "step": 9830 + }, + { + "epoch": 0.3786333012512031, + "grad_norm": 2.1117265224456787, + "learning_rate": 0.00018285920190628879, + "loss": 1.2834, + "step": 9835 + }, + { + "epoch": 0.37882579403272376, + "grad_norm": 1.1815403699874878, + "learning_rate": 0.00018284226838714412, + "loss": 1.0574, + "step": 9840 + }, + { + "epoch": 0.3790182868142445, + "grad_norm": 1.3763145208358765, + "learning_rate": 0.00018282532729262278, + "loss": 1.2813, + "step": 9845 + }, + { + "epoch": 0.37921077959576516, + "grad_norm": 1.5308822393417358, + "learning_rate": 0.00018280837862427393, + "loss": 1.2118, + "step": 9850 + }, + { + "epoch": 0.3794032723772858, + "grad_norm": 1.1991111040115356, + "learning_rate": 0.00018279142238364745, + "loss": 1.0999, + "step": 9855 + }, + { + "epoch": 0.37959576515880655, + "grad_norm": 1.7062435150146484, + "learning_rate": 0.0001827744585722938, + "loss": 1.2103, + "step": 9860 + }, + { + "epoch": 0.3797882579403272, + "grad_norm": 1.5572453737258911, + "learning_rate": 0.00018275748719176425, + "loss": 1.112, + "step": 9865 + }, + { + "epoch": 0.37998075072184795, + "grad_norm": 0.9328321218490601, + "learning_rate": 0.00018274050824361072, + "loss": 1.2688, + "step": 9870 + }, + { + "epoch": 0.3801732435033686, + "grad_norm": 1.290634036064148, + "learning_rate": 0.0001827235217293858, + "loss": 1.1486, + "step": 9875 + }, + { + "epoch": 0.38036573628488934, + "grad_norm": 1.7471963167190552, + "learning_rate": 0.00018270652765064283, + "loss": 1.2584, + "step": 9880 + }, + { + "epoch": 0.38055822906641, + "grad_norm": 1.4827409982681274, + "learning_rate": 0.00018268952600893577, + "loss": 1.3655, + "step": 9885 + }, + { + "epoch": 0.3807507218479307, + "grad_norm": 1.0229063034057617, + "learning_rate": 0.00018267251680581935, + "loss": 1.1955, + "step": 9890 + }, + { + "epoch": 0.3809432146294514, + "grad_norm": 1.3075898885726929, + "learning_rate": 0.0001826555000428489, + "loss": 0.9779, + "step": 9895 + }, + { + "epoch": 0.3811357074109721, + "grad_norm": 1.5942119359970093, + "learning_rate": 0.00018263847572158053, + "loss": 1.2556, + "step": 9900 + }, + { + "epoch": 0.3813282001924928, + "grad_norm": 0.9223330616950989, + "learning_rate": 0.00018262144384357097, + "loss": 1.1109, + "step": 9905 + }, + { + "epoch": 0.3815206929740135, + "grad_norm": 1.7757457494735718, + "learning_rate": 0.00018260440441037766, + "loss": 1.2219, + "step": 9910 + }, + { + "epoch": 0.38171318575553415, + "grad_norm": 1.4870551824569702, + "learning_rate": 0.00018258735742355883, + "loss": 1.3312, + "step": 9915 + }, + { + "epoch": 0.38190567853705487, + "grad_norm": 1.2982031106948853, + "learning_rate": 0.00018257030288467322, + "loss": 1.2421, + "step": 9920 + }, + { + "epoch": 0.38209817131857554, + "grad_norm": 1.016822338104248, + "learning_rate": 0.0001825532407952804, + "loss": 1.3542, + "step": 9925 + }, + { + "epoch": 0.38229066410009627, + "grad_norm": 1.0763219594955444, + "learning_rate": 0.00018253617115694058, + "loss": 1.2579, + "step": 9930 + }, + { + "epoch": 0.38248315688161694, + "grad_norm": 1.7673341035842896, + "learning_rate": 0.00018251909397121464, + "loss": 1.1875, + "step": 9935 + }, + { + "epoch": 0.3826756496631376, + "grad_norm": 1.3719041347503662, + "learning_rate": 0.00018250200923966423, + "loss": 1.1493, + "step": 9940 + }, + { + "epoch": 0.38286814244465833, + "grad_norm": 1.8589760065078735, + "learning_rate": 0.00018248491696385157, + "loss": 1.2751, + "step": 9945 + }, + { + "epoch": 0.383060635226179, + "grad_norm": 1.6069539785385132, + "learning_rate": 0.0001824678171453397, + "loss": 1.415, + "step": 9950 + }, + { + "epoch": 0.38325312800769973, + "grad_norm": 1.7131226062774658, + "learning_rate": 0.0001824507097856922, + "loss": 1.1773, + "step": 9955 + }, + { + "epoch": 0.3834456207892204, + "grad_norm": 0.7622759342193604, + "learning_rate": 0.0001824335948864735, + "loss": 1.1588, + "step": 9960 + }, + { + "epoch": 0.3836381135707411, + "grad_norm": 1.6202800273895264, + "learning_rate": 0.0001824164724492486, + "loss": 1.3064, + "step": 9965 + }, + { + "epoch": 0.3838306063522618, + "grad_norm": 1.5452194213867188, + "learning_rate": 0.0001823993424755833, + "loss": 1.2993, + "step": 9970 + }, + { + "epoch": 0.38402309913378246, + "grad_norm": 1.013929009437561, + "learning_rate": 0.00018238220496704396, + "loss": 1.3123, + "step": 9975 + }, + { + "epoch": 0.3842155919153032, + "grad_norm": 0.9624648094177246, + "learning_rate": 0.0001823650599251977, + "loss": 1.0517, + "step": 9980 + }, + { + "epoch": 0.38440808469682386, + "grad_norm": 1.2065962553024292, + "learning_rate": 0.00018234790735161232, + "loss": 1.1954, + "step": 9985 + }, + { + "epoch": 0.3846005774783446, + "grad_norm": 1.425376057624817, + "learning_rate": 0.00018233074724785634, + "loss": 1.069, + "step": 9990 + }, + { + "epoch": 0.38479307025986526, + "grad_norm": 1.0355112552642822, + "learning_rate": 0.00018231357961549888, + "loss": 1.0839, + "step": 9995 + }, + { + "epoch": 0.3849855630413859, + "grad_norm": 1.7273633480072021, + "learning_rate": 0.00018229640445610988, + "loss": 1.1324, + "step": 10000 + } + ], + "logging_steps": 5, + "max_steps": 51950, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 10000, + "total_flos": 3.133033729973453e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}