diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 1.539942252165544, + "epoch": 1.9249278152069298, "eval_steps": 500, - "global_step": 40000, + "global_step": 50000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -56007,6 +56007,14006 @@ "learning_rate": 2.5063532569201454e-05, "loss": 0.765, "step": 40000 + }, + { + "epoch": 1.5401347449470646, + "grad_norm": 1.959747076034546, + "learning_rate": 2.5043512613218933e-05, + "loss": 0.8532, + "step": 40005 + }, + { + "epoch": 1.5403272377285853, + "grad_norm": 1.0300486087799072, + "learning_rate": 2.502349951151064e-05, + "loss": 0.7775, + "step": 40010 + }, + { + "epoch": 1.540519730510106, + "grad_norm": 1.6674004793167114, + "learning_rate": 2.5003493265906664e-05, + "loss": 0.9549, + "step": 40015 + }, + { + "epoch": 1.5407122232916266, + "grad_norm": 1.486302137374878, + "learning_rate": 2.4983493878236374e-05, + "loss": 0.7536, + "step": 40020 + }, + { + "epoch": 1.5409047160731473, + "grad_norm": 0.8989318013191223, + "learning_rate": 2.496350135032869e-05, + "loss": 0.805, + "step": 40025 + }, + { + "epoch": 1.541097208854668, + "grad_norm": 1.692155361175537, + "learning_rate": 2.49435156840117e-05, + "loss": 0.8884, + "step": 40030 + }, + { + "epoch": 1.5412897016361886, + "grad_norm": 0.7467209100723267, + "learning_rate": 2.4923536881112997e-05, + "loss": 0.7268, + "step": 40035 + }, + { + "epoch": 1.5414821944177093, + "grad_norm": 1.0695582628250122, + "learning_rate": 2.490356494345951e-05, + "loss": 0.6906, + "step": 40040 + }, + { + "epoch": 1.54167468719923, + "grad_norm": 1.119673728942871, + "learning_rate": 2.4883599872877583e-05, + "loss": 0.8238, + "step": 40045 + }, + { + "epoch": 1.5418671799807506, + "grad_norm": 1.4105931520462036, + "learning_rate": 2.4863641671192806e-05, + "loss": 0.7352, + "step": 40050 + }, + { + "epoch": 1.5420596727622713, + "grad_norm": 0.8062414526939392, + "learning_rate": 2.4843690340230265e-05, + "loss": 0.8423, + "step": 40055 + }, + { + "epoch": 1.542252165543792, + "grad_norm": 1.100274682044983, + "learning_rate": 2.4823745881814374e-05, + "loss": 0.7665, + "step": 40060 + }, + { + "epoch": 1.5424446583253129, + "grad_norm": 1.5957090854644775, + "learning_rate": 2.4803808297768937e-05, + "loss": 0.7665, + "step": 40065 + }, + { + "epoch": 1.5426371511068335, + "grad_norm": 1.4533089399337769, + "learning_rate": 2.4783877589917125e-05, + "loss": 0.7874, + "step": 40070 + }, + { + "epoch": 1.5428296438883542, + "grad_norm": 1.0134419202804565, + "learning_rate": 2.4763953760081414e-05, + "loss": 0.8836, + "step": 40075 + }, + { + "epoch": 1.5430221366698749, + "grad_norm": 1.2606679201126099, + "learning_rate": 2.4744036810083738e-05, + "loss": 0.8233, + "step": 40080 + }, + { + "epoch": 1.5432146294513955, + "grad_norm": 1.0272793769836426, + "learning_rate": 2.472412674174538e-05, + "loss": 0.8782, + "step": 40085 + }, + { + "epoch": 1.5434071222329164, + "grad_norm": 0.853097140789032, + "learning_rate": 2.4704223556886998e-05, + "loss": 0.7858, + "step": 40090 + }, + { + "epoch": 1.543599615014437, + "grad_norm": 1.5345498323440552, + "learning_rate": 2.4684327257328522e-05, + "loss": 0.7227, + "step": 40095 + }, + { + "epoch": 1.5437921077959578, + "grad_norm": 1.565811276435852, + "learning_rate": 2.4664437844889454e-05, + "loss": 0.8012, + "step": 40100 + }, + { + "epoch": 1.5439846005774784, + "grad_norm": 1.7416857481002808, + "learning_rate": 2.4644555321388462e-05, + "loss": 0.7459, + "step": 40105 + }, + { + "epoch": 1.544177093358999, + "grad_norm": 1.2711718082427979, + "learning_rate": 2.4624679688643716e-05, + "loss": 0.7688, + "step": 40110 + }, + { + "epoch": 1.5443695861405198, + "grad_norm": 1.38042414188385, + "learning_rate": 2.4604810948472677e-05, + "loss": 0.8042, + "step": 40115 + }, + { + "epoch": 1.5445620789220404, + "grad_norm": 1.0315542221069336, + "learning_rate": 2.458494910269228e-05, + "loss": 0.8793, + "step": 40120 + }, + { + "epoch": 1.5447545717035611, + "grad_norm": 1.4990910291671753, + "learning_rate": 2.456509415311864e-05, + "loss": 0.766, + "step": 40125 + }, + { + "epoch": 1.5449470644850818, + "grad_norm": 1.0002154111862183, + "learning_rate": 2.4545246101567476e-05, + "loss": 0.8452, + "step": 40130 + }, + { + "epoch": 1.5451395572666025, + "grad_norm": 1.0912269353866577, + "learning_rate": 2.4525404949853702e-05, + "loss": 0.71, + "step": 40135 + }, + { + "epoch": 1.5453320500481231, + "grad_norm": 1.2968130111694336, + "learning_rate": 2.4505570699791668e-05, + "loss": 0.6927, + "step": 40140 + }, + { + "epoch": 1.5455245428296438, + "grad_norm": 0.9810501933097839, + "learning_rate": 2.448574335319508e-05, + "loss": 0.821, + "step": 40145 + }, + { + "epoch": 1.5457170356111645, + "grad_norm": 1.6552939414978027, + "learning_rate": 2.446592291187706e-05, + "loss": 0.8898, + "step": 40150 + }, + { + "epoch": 1.5459095283926851, + "grad_norm": 2.9171223640441895, + "learning_rate": 2.4446109377649996e-05, + "loss": 0.8895, + "step": 40155 + }, + { + "epoch": 1.5461020211742058, + "grad_norm": 1.7031809091567993, + "learning_rate": 2.4426302752325735e-05, + "loss": 0.7551, + "step": 40160 + }, + { + "epoch": 1.5462945139557267, + "grad_norm": 0.709255039691925, + "learning_rate": 2.4406503037715445e-05, + "loss": 0.7308, + "step": 40165 + }, + { + "epoch": 1.5464870067372474, + "grad_norm": 1.2134255170822144, + "learning_rate": 2.4386710235629708e-05, + "loss": 0.8311, + "step": 40170 + }, + { + "epoch": 1.546679499518768, + "grad_norm": 0.7918189764022827, + "learning_rate": 2.436692434787844e-05, + "loss": 0.7764, + "step": 40175 + }, + { + "epoch": 1.5468719923002887, + "grad_norm": 1.3109943866729736, + "learning_rate": 2.4347145376270896e-05, + "loss": 0.8472, + "step": 40180 + }, + { + "epoch": 1.5470644850818094, + "grad_norm": 1.0307000875473022, + "learning_rate": 2.4327373322615754e-05, + "loss": 0.8618, + "step": 40185 + }, + { + "epoch": 1.5472569778633303, + "grad_norm": 0.8745335936546326, + "learning_rate": 2.430760818872103e-05, + "loss": 0.7465, + "step": 40190 + }, + { + "epoch": 1.547449470644851, + "grad_norm": 1.271261215209961, + "learning_rate": 2.428784997639415e-05, + "loss": 0.6754, + "step": 40195 + }, + { + "epoch": 1.5476419634263716, + "grad_norm": 0.9863643646240234, + "learning_rate": 2.42680986874418e-05, + "loss": 0.7717, + "step": 40200 + }, + { + "epoch": 1.5478344562078923, + "grad_norm": 1.5643993616104126, + "learning_rate": 2.4248354323670185e-05, + "loss": 0.8209, + "step": 40205 + }, + { + "epoch": 1.548026948989413, + "grad_norm": 1.2394016981124878, + "learning_rate": 2.4228616886884713e-05, + "loss": 0.8473, + "step": 40210 + }, + { + "epoch": 1.5482194417709336, + "grad_norm": 1.1060245037078857, + "learning_rate": 2.420888637889034e-05, + "loss": 0.6498, + "step": 40215 + }, + { + "epoch": 1.5484119345524543, + "grad_norm": 1.6194653511047363, + "learning_rate": 2.4189162801491206e-05, + "loss": 0.839, + "step": 40220 + }, + { + "epoch": 1.548604427333975, + "grad_norm": 0.9702569842338562, + "learning_rate": 2.4169446156490938e-05, + "loss": 0.8054, + "step": 40225 + }, + { + "epoch": 1.5487969201154956, + "grad_norm": 0.964542031288147, + "learning_rate": 2.4149736445692483e-05, + "loss": 0.7966, + "step": 40230 + }, + { + "epoch": 1.5489894128970163, + "grad_norm": 1.230900526046753, + "learning_rate": 2.413003367089821e-05, + "loss": 0.8332, + "step": 40235 + }, + { + "epoch": 1.549181905678537, + "grad_norm": 0.8581804633140564, + "learning_rate": 2.411033783390969e-05, + "loss": 0.8305, + "step": 40240 + }, + { + "epoch": 1.5493743984600576, + "grad_norm": 1.232013463973999, + "learning_rate": 2.4090648936528125e-05, + "loss": 0.7663, + "step": 40245 + }, + { + "epoch": 1.5495668912415783, + "grad_norm": 1.6917791366577148, + "learning_rate": 2.407096698055382e-05, + "loss": 0.8289, + "step": 40250 + }, + { + "epoch": 1.549759384023099, + "grad_norm": 1.128301978111267, + "learning_rate": 2.4051291967786605e-05, + "loss": 0.8291, + "step": 40255 + }, + { + "epoch": 1.5499518768046199, + "grad_norm": 1.8056789636611938, + "learning_rate": 2.4031623900025624e-05, + "loss": 0.7495, + "step": 40260 + }, + { + "epoch": 1.5501443695861405, + "grad_norm": 1.2374507188796997, + "learning_rate": 2.4011962779069432e-05, + "loss": 0.7374, + "step": 40265 + }, + { + "epoch": 1.5503368623676612, + "grad_norm": 1.8159735202789307, + "learning_rate": 2.3992308606715828e-05, + "loss": 0.7937, + "step": 40270 + }, + { + "epoch": 1.5505293551491819, + "grad_norm": 1.4417248964309692, + "learning_rate": 2.3972661384762096e-05, + "loss": 0.8844, + "step": 40275 + }, + { + "epoch": 1.5507218479307026, + "grad_norm": 1.61116623878479, + "learning_rate": 2.3953021115004858e-05, + "loss": 0.8133, + "step": 40280 + }, + { + "epoch": 1.5509143407122234, + "grad_norm": 1.6744754314422607, + "learning_rate": 2.393338779924006e-05, + "loss": 0.7884, + "step": 40285 + }, + { + "epoch": 1.5511068334937441, + "grad_norm": 1.0945570468902588, + "learning_rate": 2.3913761439263095e-05, + "loss": 0.9085, + "step": 40290 + }, + { + "epoch": 1.5512993262752648, + "grad_norm": 1.8907279968261719, + "learning_rate": 2.3894142036868583e-05, + "loss": 0.9115, + "step": 40295 + }, + { + "epoch": 1.5514918190567855, + "grad_norm": 1.0637871026992798, + "learning_rate": 2.3874529593850624e-05, + "loss": 0.7011, + "step": 40300 + }, + { + "epoch": 1.5516843118383061, + "grad_norm": 1.661790132522583, + "learning_rate": 2.3854924112002665e-05, + "loss": 0.7207, + "step": 40305 + }, + { + "epoch": 1.5518768046198268, + "grad_norm": 1.138431429862976, + "learning_rate": 2.3835325593117498e-05, + "loss": 0.8999, + "step": 40310 + }, + { + "epoch": 1.5520692974013475, + "grad_norm": 1.0955361127853394, + "learning_rate": 2.381573403898719e-05, + "loss": 0.8033, + "step": 40315 + }, + { + "epoch": 1.5522617901828681, + "grad_norm": 0.4663851857185364, + "learning_rate": 2.3796149451403405e-05, + "loss": 0.8223, + "step": 40320 + }, + { + "epoch": 1.5524542829643888, + "grad_norm": 0.9485166072845459, + "learning_rate": 2.3776571832156914e-05, + "loss": 0.8846, + "step": 40325 + }, + { + "epoch": 1.5526467757459095, + "grad_norm": 0.8865009546279907, + "learning_rate": 2.3757001183037998e-05, + "loss": 0.7741, + "step": 40330 + }, + { + "epoch": 1.5528392685274301, + "grad_norm": 1.5837032794952393, + "learning_rate": 2.3737437505836257e-05, + "loss": 0.8743, + "step": 40335 + }, + { + "epoch": 1.5530317613089508, + "grad_norm": 1.4707670211791992, + "learning_rate": 2.3717880802340698e-05, + "loss": 0.8894, + "step": 40340 + }, + { + "epoch": 1.5532242540904715, + "grad_norm": 1.321226954460144, + "learning_rate": 2.3698331074339553e-05, + "loss": 0.9362, + "step": 40345 + }, + { + "epoch": 1.5534167468719922, + "grad_norm": 1.60404372215271, + "learning_rate": 2.3678788323620638e-05, + "loss": 0.7989, + "step": 40350 + }, + { + "epoch": 1.5536092396535128, + "grad_norm": 1.4875679016113281, + "learning_rate": 2.3659252551970922e-05, + "loss": 0.8456, + "step": 40355 + }, + { + "epoch": 1.5538017324350337, + "grad_norm": 1.7115007638931274, + "learning_rate": 2.3639723761176847e-05, + "loss": 0.8346, + "step": 40360 + }, + { + "epoch": 1.5539942252165544, + "grad_norm": 1.4487700462341309, + "learning_rate": 2.362020195302419e-05, + "loss": 0.7359, + "step": 40365 + }, + { + "epoch": 1.554186717998075, + "grad_norm": 1.6501834392547607, + "learning_rate": 2.3600687129298126e-05, + "loss": 0.769, + "step": 40370 + }, + { + "epoch": 1.5543792107795957, + "grad_norm": 1.4506645202636719, + "learning_rate": 2.3581179291783094e-05, + "loss": 0.9698, + "step": 40375 + }, + { + "epoch": 1.5545717035611166, + "grad_norm": 1.098969578742981, + "learning_rate": 2.356167844226299e-05, + "loss": 0.8277, + "step": 40380 + }, + { + "epoch": 1.5547641963426373, + "grad_norm": 0.9754076600074768, + "learning_rate": 2.3542184582521034e-05, + "loss": 0.754, + "step": 40385 + }, + { + "epoch": 1.554956689124158, + "grad_norm": 1.5836515426635742, + "learning_rate": 2.3522697714339814e-05, + "loss": 0.8996, + "step": 40390 + }, + { + "epoch": 1.5551491819056786, + "grad_norm": 1.1244460344314575, + "learning_rate": 2.3503217839501302e-05, + "loss": 0.721, + "step": 40395 + }, + { + "epoch": 1.5553416746871993, + "grad_norm": 0.96147620677948, + "learning_rate": 2.3483744959786735e-05, + "loss": 0.7998, + "step": 40400 + }, + { + "epoch": 1.55553416746872, + "grad_norm": 0.9615457653999329, + "learning_rate": 2.3464279076976837e-05, + "loss": 0.9238, + "step": 40405 + }, + { + "epoch": 1.5557266602502406, + "grad_norm": 2.3235690593719482, + "learning_rate": 2.344482019285159e-05, + "loss": 0.9334, + "step": 40410 + }, + { + "epoch": 1.5559191530317613, + "grad_norm": 1.226501703262329, + "learning_rate": 2.3425368309190455e-05, + "loss": 0.8288, + "step": 40415 + }, + { + "epoch": 1.556111645813282, + "grad_norm": 1.4257621765136719, + "learning_rate": 2.3405923427772057e-05, + "loss": 0.5693, + "step": 40420 + }, + { + "epoch": 1.5563041385948027, + "grad_norm": 1.0261151790618896, + "learning_rate": 2.3386485550374636e-05, + "loss": 0.9288, + "step": 40425 + }, + { + "epoch": 1.5564966313763233, + "grad_norm": 1.0929063558578491, + "learning_rate": 2.3367054678775557e-05, + "loss": 0.7563, + "step": 40430 + }, + { + "epoch": 1.556689124157844, + "grad_norm": 0.8762004375457764, + "learning_rate": 2.3347630814751687e-05, + "loss": 0.7824, + "step": 40435 + }, + { + "epoch": 1.5568816169393647, + "grad_norm": 1.4772645235061646, + "learning_rate": 2.3328213960079205e-05, + "loss": 0.8197, + "step": 40440 + }, + { + "epoch": 1.5570741097208853, + "grad_norm": 1.4557557106018066, + "learning_rate": 2.330880411653368e-05, + "loss": 0.8008, + "step": 40445 + }, + { + "epoch": 1.557266602502406, + "grad_norm": 1.2183635234832764, + "learning_rate": 2.3289401285889934e-05, + "loss": 0.769, + "step": 40450 + }, + { + "epoch": 1.557459095283927, + "grad_norm": 0.9675846695899963, + "learning_rate": 2.327000546992233e-05, + "loss": 0.8019, + "step": 40455 + }, + { + "epoch": 1.5576515880654476, + "grad_norm": 1.5185277462005615, + "learning_rate": 2.32506166704044e-05, + "loss": 0.6456, + "step": 40460 + }, + { + "epoch": 1.5578440808469682, + "grad_norm": 1.6056936979293823, + "learning_rate": 2.3231234889109165e-05, + "loss": 0.8468, + "step": 40465 + }, + { + "epoch": 1.558036573628489, + "grad_norm": 1.182332992553711, + "learning_rate": 2.3211860127808948e-05, + "loss": 0.991, + "step": 40470 + }, + { + "epoch": 1.5582290664100096, + "grad_norm": 1.1876064538955688, + "learning_rate": 2.3192492388275454e-05, + "loss": 0.7254, + "step": 40475 + }, + { + "epoch": 1.5584215591915305, + "grad_norm": 1.7529163360595703, + "learning_rate": 2.3173131672279725e-05, + "loss": 0.9351, + "step": 40480 + }, + { + "epoch": 1.5586140519730511, + "grad_norm": 1.0285496711730957, + "learning_rate": 2.3153777981592207e-05, + "loss": 0.835, + "step": 40485 + }, + { + "epoch": 1.5588065447545718, + "grad_norm": 1.4586035013198853, + "learning_rate": 2.3134431317982597e-05, + "loss": 0.7694, + "step": 40490 + }, + { + "epoch": 1.5589990375360925, + "grad_norm": 1.656645655632019, + "learning_rate": 2.311509168322006e-05, + "loss": 0.7945, + "step": 40495 + }, + { + "epoch": 1.5591915303176132, + "grad_norm": 1.66303551197052, + "learning_rate": 2.30957590790731e-05, + "loss": 0.7964, + "step": 40500 + }, + { + "epoch": 1.5593840230991338, + "grad_norm": 1.375352382659912, + "learning_rate": 2.307643350730947e-05, + "loss": 0.9546, + "step": 40505 + }, + { + "epoch": 1.5595765158806545, + "grad_norm": 1.1943459510803223, + "learning_rate": 2.305711496969648e-05, + "loss": 0.6616, + "step": 40510 + }, + { + "epoch": 1.5597690086621752, + "grad_norm": 1.4341318607330322, + "learning_rate": 2.3037803468000597e-05, + "loss": 0.8164, + "step": 40515 + }, + { + "epoch": 1.5599615014436958, + "grad_norm": 1.3383629322052002, + "learning_rate": 2.301849900398776e-05, + "loss": 0.8711, + "step": 40520 + }, + { + "epoch": 1.5601539942252165, + "grad_norm": 1.8261584043502808, + "learning_rate": 2.2999201579423236e-05, + "loss": 0.8195, + "step": 40525 + }, + { + "epoch": 1.5603464870067372, + "grad_norm": 2.052516222000122, + "learning_rate": 2.2979911196071668e-05, + "loss": 0.9955, + "step": 40530 + }, + { + "epoch": 1.5605389797882578, + "grad_norm": 1.2053110599517822, + "learning_rate": 2.2960627855696958e-05, + "loss": 0.8021, + "step": 40535 + }, + { + "epoch": 1.5607314725697785, + "grad_norm": 1.301857590675354, + "learning_rate": 2.294135156006255e-05, + "loss": 0.8293, + "step": 40540 + }, + { + "epoch": 1.5609239653512992, + "grad_norm": 1.1416163444519043, + "learning_rate": 2.2922082310931036e-05, + "loss": 0.8486, + "step": 40545 + }, + { + "epoch": 1.56111645813282, + "grad_norm": 1.3656333684921265, + "learning_rate": 2.2902820110064503e-05, + "loss": 0.7598, + "step": 40550 + }, + { + "epoch": 1.5613089509143407, + "grad_norm": 2.20866060256958, + "learning_rate": 2.288356495922436e-05, + "loss": 0.7617, + "step": 40555 + }, + { + "epoch": 1.5615014436958614, + "grad_norm": 1.5742281675338745, + "learning_rate": 2.2864316860171375e-05, + "loss": 0.9051, + "step": 40560 + }, + { + "epoch": 1.561693936477382, + "grad_norm": 1.7272670269012451, + "learning_rate": 2.2845075814665573e-05, + "loss": 0.8531, + "step": 40565 + }, + { + "epoch": 1.5618864292589028, + "grad_norm": 1.8699864149093628, + "learning_rate": 2.2825841824466543e-05, + "loss": 0.7353, + "step": 40570 + }, + { + "epoch": 1.5620789220404236, + "grad_norm": 1.5581755638122559, + "learning_rate": 2.2806614891333022e-05, + "loss": 0.7466, + "step": 40575 + }, + { + "epoch": 1.5622714148219443, + "grad_norm": 1.340266227722168, + "learning_rate": 2.2787395017023205e-05, + "loss": 0.7033, + "step": 40580 + }, + { + "epoch": 1.562463907603465, + "grad_norm": 2.0326457023620605, + "learning_rate": 2.276818220329463e-05, + "loss": 0.8551, + "step": 40585 + }, + { + "epoch": 1.5626564003849857, + "grad_norm": 1.2885262966156006, + "learning_rate": 2.2748976451904203e-05, + "loss": 0.7386, + "step": 40590 + }, + { + "epoch": 1.5628488931665063, + "grad_norm": 0.5644479393959045, + "learning_rate": 2.2729777764608108e-05, + "loss": 0.618, + "step": 40595 + }, + { + "epoch": 1.563041385948027, + "grad_norm": 1.572131633758545, + "learning_rate": 2.2710586143161972e-05, + "loss": 0.7215, + "step": 40600 + }, + { + "epoch": 1.5632338787295477, + "grad_norm": 1.1839364767074585, + "learning_rate": 2.2691401589320737e-05, + "loss": 0.7607, + "step": 40605 + }, + { + "epoch": 1.5634263715110683, + "grad_norm": 1.3653178215026855, + "learning_rate": 2.2672224104838713e-05, + "loss": 0.6762, + "step": 40610 + }, + { + "epoch": 1.563618864292589, + "grad_norm": 1.8901535272598267, + "learning_rate": 2.2653053691469563e-05, + "loss": 0.8515, + "step": 40615 + }, + { + "epoch": 1.5638113570741097, + "grad_norm": 1.8246430158615112, + "learning_rate": 2.2633890350966248e-05, + "loss": 0.85, + "step": 40620 + }, + { + "epoch": 1.5640038498556303, + "grad_norm": 1.524446725845337, + "learning_rate": 2.2614734085081158e-05, + "loss": 0.7622, + "step": 40625 + }, + { + "epoch": 1.564196342637151, + "grad_norm": 1.142157793045044, + "learning_rate": 2.2595584895566e-05, + "loss": 0.7196, + "step": 40630 + }, + { + "epoch": 1.5643888354186717, + "grad_norm": 1.0782500505447388, + "learning_rate": 2.2576442784171892e-05, + "loss": 0.7491, + "step": 40635 + }, + { + "epoch": 1.5645813282001924, + "grad_norm": 1.034097671508789, + "learning_rate": 2.2557307752649137e-05, + "loss": 0.851, + "step": 40640 + }, + { + "epoch": 1.564773820981713, + "grad_norm": 1.894054889678955, + "learning_rate": 2.253817980274764e-05, + "loss": 0.6389, + "step": 40645 + }, + { + "epoch": 1.564966313763234, + "grad_norm": 0.6015737056732178, + "learning_rate": 2.251905893621642e-05, + "loss": 0.749, + "step": 40650 + }, + { + "epoch": 1.5651588065447546, + "grad_norm": 0.9904623627662659, + "learning_rate": 2.2499945154804013e-05, + "loss": 0.836, + "step": 40655 + }, + { + "epoch": 1.5653512993262753, + "grad_norm": 1.3582662343978882, + "learning_rate": 2.2480838460258226e-05, + "loss": 0.8239, + "step": 40660 + }, + { + "epoch": 1.565543792107796, + "grad_norm": 1.4158751964569092, + "learning_rate": 2.2461738854326263e-05, + "loss": 0.8528, + "step": 40665 + }, + { + "epoch": 1.5657362848893166, + "grad_norm": 1.0276498794555664, + "learning_rate": 2.244264633875459e-05, + "loss": 0.7761, + "step": 40670 + }, + { + "epoch": 1.5659287776708375, + "grad_norm": 1.4849311113357544, + "learning_rate": 2.24235609152892e-05, + "loss": 0.934, + "step": 40675 + }, + { + "epoch": 1.5661212704523582, + "grad_norm": 1.024477481842041, + "learning_rate": 2.2404482585675225e-05, + "loss": 0.675, + "step": 40680 + }, + { + "epoch": 1.5663137632338788, + "grad_norm": 1.2070591449737549, + "learning_rate": 2.2385411351657303e-05, + "loss": 0.7506, + "step": 40685 + }, + { + "epoch": 1.5665062560153995, + "grad_norm": 1.2458285093307495, + "learning_rate": 2.2366347214979366e-05, + "loss": 0.6916, + "step": 40690 + }, + { + "epoch": 1.5666987487969202, + "grad_norm": 1.7189850807189941, + "learning_rate": 2.2347290177384726e-05, + "loss": 0.7032, + "step": 40695 + }, + { + "epoch": 1.5668912415784408, + "grad_norm": 1.3378642797470093, + "learning_rate": 2.2328240240615972e-05, + "loss": 0.6995, + "step": 40700 + }, + { + "epoch": 1.5670837343599615, + "grad_norm": 1.4483795166015625, + "learning_rate": 2.2309197406415117e-05, + "loss": 0.7532, + "step": 40705 + }, + { + "epoch": 1.5672762271414822, + "grad_norm": 1.1107079982757568, + "learning_rate": 2.2290161676523503e-05, + "loss": 0.7787, + "step": 40710 + }, + { + "epoch": 1.5674687199230029, + "grad_norm": 1.3404992818832397, + "learning_rate": 2.2271133052681825e-05, + "loss": 0.7247, + "step": 40715 + }, + { + "epoch": 1.5676612127045235, + "grad_norm": 1.0901662111282349, + "learning_rate": 2.2252111536630148e-05, + "loss": 0.8229, + "step": 40720 + }, + { + "epoch": 1.5678537054860442, + "grad_norm": 1.3521157503128052, + "learning_rate": 2.2233097130107782e-05, + "loss": 0.6774, + "step": 40725 + }, + { + "epoch": 1.5680461982675649, + "grad_norm": 1.1512130498886108, + "learning_rate": 2.221408983485358e-05, + "loss": 0.8416, + "step": 40730 + }, + { + "epoch": 1.5682386910490855, + "grad_norm": 1.2217384576797485, + "learning_rate": 2.219508965260555e-05, + "loss": 0.8276, + "step": 40735 + }, + { + "epoch": 1.5684311838306062, + "grad_norm": 1.0364052057266235, + "learning_rate": 2.217609658510117e-05, + "loss": 0.7916, + "step": 40740 + }, + { + "epoch": 1.568623676612127, + "grad_norm": 1.143720269203186, + "learning_rate": 2.2157110634077215e-05, + "loss": 0.98, + "step": 40745 + }, + { + "epoch": 1.5688161693936478, + "grad_norm": 1.7398204803466797, + "learning_rate": 2.2138131801269857e-05, + "loss": 0.8718, + "step": 40750 + }, + { + "epoch": 1.5690086621751684, + "grad_norm": 1.4039459228515625, + "learning_rate": 2.2119160088414502e-05, + "loss": 1.0399, + "step": 40755 + }, + { + "epoch": 1.569201154956689, + "grad_norm": 1.1382685899734497, + "learning_rate": 2.2100195497246103e-05, + "loss": 0.8066, + "step": 40760 + }, + { + "epoch": 1.5693936477382098, + "grad_norm": 1.2807059288024902, + "learning_rate": 2.208123802949875e-05, + "loss": 0.8231, + "step": 40765 + }, + { + "epoch": 1.5695861405197307, + "grad_norm": 1.0869920253753662, + "learning_rate": 2.2062287686906026e-05, + "loss": 0.7107, + "step": 40770 + }, + { + "epoch": 1.5697786333012513, + "grad_norm": 1.4672589302062988, + "learning_rate": 2.20433444712008e-05, + "loss": 0.6363, + "step": 40775 + }, + { + "epoch": 1.569971126082772, + "grad_norm": 1.185530185699463, + "learning_rate": 2.2024408384115337e-05, + "loss": 0.7595, + "step": 40780 + }, + { + "epoch": 1.5701636188642927, + "grad_norm": 1.210688829421997, + "learning_rate": 2.2005479427381126e-05, + "loss": 0.7845, + "step": 40785 + }, + { + "epoch": 1.5703561116458133, + "grad_norm": 0.9549643397331238, + "learning_rate": 2.1986557602729207e-05, + "loss": 0.7966, + "step": 40790 + }, + { + "epoch": 1.570548604427334, + "grad_norm": 1.4014770984649658, + "learning_rate": 2.1967642911889787e-05, + "loss": 0.7121, + "step": 40795 + }, + { + "epoch": 1.5707410972088547, + "grad_norm": 1.2537885904312134, + "learning_rate": 2.1948735356592497e-05, + "loss": 0.8161, + "step": 40800 + }, + { + "epoch": 1.5709335899903754, + "grad_norm": 1.4344686269760132, + "learning_rate": 2.1929834938566317e-05, + "loss": 0.9185, + "step": 40805 + }, + { + "epoch": 1.571126082771896, + "grad_norm": 1.5772898197174072, + "learning_rate": 2.1910941659539585e-05, + "loss": 0.7617, + "step": 40810 + }, + { + "epoch": 1.5713185755534167, + "grad_norm": 0.9699328541755676, + "learning_rate": 2.189205552123993e-05, + "loss": 0.7618, + "step": 40815 + }, + { + "epoch": 1.5715110683349374, + "grad_norm": 1.8218029737472534, + "learning_rate": 2.1873176525394378e-05, + "loss": 0.8044, + "step": 40820 + }, + { + "epoch": 1.571703561116458, + "grad_norm": 1.0945191383361816, + "learning_rate": 2.1854304673729287e-05, + "loss": 0.8599, + "step": 40825 + }, + { + "epoch": 1.5718960538979787, + "grad_norm": 1.4021522998809814, + "learning_rate": 2.183543996797036e-05, + "loss": 0.8188, + "step": 40830 + }, + { + "epoch": 1.5720885466794994, + "grad_norm": 0.9821120500564575, + "learning_rate": 2.181658240984269e-05, + "loss": 0.8562, + "step": 40835 + }, + { + "epoch": 1.57228103946102, + "grad_norm": 1.1095430850982666, + "learning_rate": 2.1797732001070613e-05, + "loss": 0.7987, + "step": 40840 + }, + { + "epoch": 1.572473532242541, + "grad_norm": 1.1971663236618042, + "learning_rate": 2.17788887433779e-05, + "loss": 0.942, + "step": 40845 + }, + { + "epoch": 1.5726660250240616, + "grad_norm": 1.4306426048278809, + "learning_rate": 2.176005263848765e-05, + "loss": 0.922, + "step": 40850 + }, + { + "epoch": 1.5728585178055823, + "grad_norm": 1.3114017248153687, + "learning_rate": 2.1741223688122313e-05, + "loss": 0.76, + "step": 40855 + }, + { + "epoch": 1.573051010587103, + "grad_norm": 1.3243577480316162, + "learning_rate": 2.1722401894003608e-05, + "loss": 0.7896, + "step": 40860 + }, + { + "epoch": 1.5732435033686238, + "grad_norm": 1.723612666130066, + "learning_rate": 2.1703587257852755e-05, + "loss": 0.9003, + "step": 40865 + }, + { + "epoch": 1.5734359961501445, + "grad_norm": 1.3303959369659424, + "learning_rate": 2.1684779781390152e-05, + "loss": 0.8007, + "step": 40870 + }, + { + "epoch": 1.5736284889316652, + "grad_norm": 1.080396056175232, + "learning_rate": 2.166597946633565e-05, + "loss": 0.7348, + "step": 40875 + }, + { + "epoch": 1.5738209817131859, + "grad_norm": 0.7364131808280945, + "learning_rate": 2.1647186314408408e-05, + "loss": 0.8049, + "step": 40880 + }, + { + "epoch": 1.5740134744947065, + "grad_norm": 0.581088662147522, + "learning_rate": 2.162840032732697e-05, + "loss": 0.7795, + "step": 40885 + }, + { + "epoch": 1.5742059672762272, + "grad_norm": 1.5997105836868286, + "learning_rate": 2.1609621506809097e-05, + "loss": 0.7561, + "step": 40890 + }, + { + "epoch": 1.5743984600577479, + "grad_norm": 1.5176271200180054, + "learning_rate": 2.1590849854572114e-05, + "loss": 0.6649, + "step": 40895 + }, + { + "epoch": 1.5745909528392685, + "grad_norm": 2.400883197784424, + "learning_rate": 2.1572085372332463e-05, + "loss": 0.8173, + "step": 40900 + }, + { + "epoch": 1.5747834456207892, + "grad_norm": 1.7215204238891602, + "learning_rate": 2.1553328061806065e-05, + "loss": 0.8483, + "step": 40905 + }, + { + "epoch": 1.5749759384023099, + "grad_norm": 1.2602962255477905, + "learning_rate": 2.1534577924708155e-05, + "loss": 0.7841, + "step": 40910 + }, + { + "epoch": 1.5751684311838305, + "grad_norm": 2.122758388519287, + "learning_rate": 2.1515834962753346e-05, + "loss": 0.7834, + "step": 40915 + }, + { + "epoch": 1.5753609239653512, + "grad_norm": 1.6246029138565063, + "learning_rate": 2.1497099177655476e-05, + "loss": 0.7015, + "step": 40920 + }, + { + "epoch": 1.5755534167468719, + "grad_norm": 1.3783003091812134, + "learning_rate": 2.147837057112786e-05, + "loss": 0.9372, + "step": 40925 + }, + { + "epoch": 1.5757459095283926, + "grad_norm": 1.430508017539978, + "learning_rate": 2.14596491448831e-05, + "loss": 0.8844, + "step": 40930 + }, + { + "epoch": 1.5759384023099132, + "grad_norm": 1.1913694143295288, + "learning_rate": 2.1440934900633148e-05, + "loss": 0.8196, + "step": 40935 + }, + { + "epoch": 1.5761308950914341, + "grad_norm": 0.9103003740310669, + "learning_rate": 2.1422227840089303e-05, + "loss": 0.8644, + "step": 40940 + }, + { + "epoch": 1.5763233878729548, + "grad_norm": 1.2136942148208618, + "learning_rate": 2.1403527964962176e-05, + "loss": 0.8063, + "step": 40945 + }, + { + "epoch": 1.5765158806544755, + "grad_norm": 1.0555609464645386, + "learning_rate": 2.1384835276961767e-05, + "loss": 0.7176, + "step": 40950 + }, + { + "epoch": 1.5767083734359961, + "grad_norm": 1.4605985879898071, + "learning_rate": 2.1369886302441366e-05, + "loss": 0.8852, + "step": 40955 + }, + { + "epoch": 1.5769008662175168, + "grad_norm": 1.9542622566223145, + "learning_rate": 2.1351206555576065e-05, + "loss": 0.8881, + "step": 40960 + }, + { + "epoch": 1.5770933589990377, + "grad_norm": 0.9330739974975586, + "learning_rate": 2.1332534000621972e-05, + "loss": 0.701, + "step": 40965 + }, + { + "epoch": 1.5772858517805584, + "grad_norm": 1.3597526550292969, + "learning_rate": 2.1313868639286494e-05, + "loss": 0.928, + "step": 40970 + }, + { + "epoch": 1.577478344562079, + "grad_norm": 1.327373743057251, + "learning_rate": 2.1295210473276484e-05, + "loss": 0.8114, + "step": 40975 + }, + { + "epoch": 1.5776708373435997, + "grad_norm": 1.1426680088043213, + "learning_rate": 2.12765595042981e-05, + "loss": 0.8321, + "step": 40980 + }, + { + "epoch": 1.5778633301251204, + "grad_norm": 1.3431953191757202, + "learning_rate": 2.1257915734056854e-05, + "loss": 0.7583, + "step": 40985 + }, + { + "epoch": 1.578055822906641, + "grad_norm": 1.6750779151916504, + "learning_rate": 2.123927916425763e-05, + "loss": 0.7295, + "step": 40990 + }, + { + "epoch": 1.5782483156881617, + "grad_norm": 1.191678762435913, + "learning_rate": 2.122064979660455e-05, + "loss": 0.731, + "step": 40995 + }, + { + "epoch": 1.5784408084696824, + "grad_norm": 1.6331219673156738, + "learning_rate": 2.120202763280118e-05, + "loss": 0.7803, + "step": 41000 + }, + { + "epoch": 1.578633301251203, + "grad_norm": 1.1647868156433105, + "learning_rate": 2.1183412674550396e-05, + "loss": 0.8047, + "step": 41005 + }, + { + "epoch": 1.5788257940327237, + "grad_norm": 1.082160472869873, + "learning_rate": 2.1164804923554438e-05, + "loss": 0.7405, + "step": 41010 + }, + { + "epoch": 1.5790182868142444, + "grad_norm": 2.297400951385498, + "learning_rate": 2.1146204381514788e-05, + "loss": 1.0183, + "step": 41015 + }, + { + "epoch": 1.579210779595765, + "grad_norm": 1.0821828842163086, + "learning_rate": 2.1127611050132435e-05, + "loss": 0.9013, + "step": 41020 + }, + { + "epoch": 1.5794032723772857, + "grad_norm": 0.7487876415252686, + "learning_rate": 2.1109024931107547e-05, + "loss": 0.7388, + "step": 41025 + }, + { + "epoch": 1.5795957651588064, + "grad_norm": 1.8875432014465332, + "learning_rate": 2.109044602613971e-05, + "loss": 0.9369, + "step": 41030 + }, + { + "epoch": 1.5797882579403273, + "grad_norm": 1.49079167842865, + "learning_rate": 2.1071874336927876e-05, + "loss": 0.8084, + "step": 41035 + }, + { + "epoch": 1.579980750721848, + "grad_norm": 1.307343602180481, + "learning_rate": 2.10533098651703e-05, + "loss": 0.8797, + "step": 41040 + }, + { + "epoch": 1.5801732435033686, + "grad_norm": 1.5751981735229492, + "learning_rate": 2.1034752612564503e-05, + "loss": 0.6406, + "step": 41045 + }, + { + "epoch": 1.5803657362848893, + "grad_norm": 1.0212968587875366, + "learning_rate": 2.1016202580807544e-05, + "loss": 0.8681, + "step": 41050 + }, + { + "epoch": 1.58055822906641, + "grad_norm": 0.9244958758354187, + "learning_rate": 2.0997659771595612e-05, + "loss": 0.76, + "step": 41055 + }, + { + "epoch": 1.5807507218479309, + "grad_norm": 2.024540662765503, + "learning_rate": 2.0979124186624356e-05, + "loss": 0.8869, + "step": 41060 + }, + { + "epoch": 1.5809432146294515, + "grad_norm": 1.2309986352920532, + "learning_rate": 2.0960595827588713e-05, + "loss": 0.8485, + "step": 41065 + }, + { + "epoch": 1.5811357074109722, + "grad_norm": 1.1824150085449219, + "learning_rate": 2.0942074696183033e-05, + "loss": 0.8635, + "step": 41070 + }, + { + "epoch": 1.5813282001924929, + "grad_norm": 1.0491746664047241, + "learning_rate": 2.092356079410086e-05, + "loss": 0.8331, + "step": 41075 + }, + { + "epoch": 1.5815206929740135, + "grad_norm": 1.1368030309677124, + "learning_rate": 2.090505412303526e-05, + "loss": 0.6937, + "step": 41080 + }, + { + "epoch": 1.5817131857555342, + "grad_norm": 1.2097387313842773, + "learning_rate": 2.0886554684678485e-05, + "loss": 0.7318, + "step": 41085 + }, + { + "epoch": 1.5819056785370549, + "grad_norm": 2.0431954860687256, + "learning_rate": 2.0868062480722206e-05, + "loss": 0.8352, + "step": 41090 + }, + { + "epoch": 1.5820981713185756, + "grad_norm": 1.763728141784668, + "learning_rate": 2.084957751285741e-05, + "loss": 0.7479, + "step": 41095 + }, + { + "epoch": 1.5822906641000962, + "grad_norm": 1.1527445316314697, + "learning_rate": 2.0831099782774455e-05, + "loss": 0.8316, + "step": 41100 + }, + { + "epoch": 1.582483156881617, + "grad_norm": 1.493467092514038, + "learning_rate": 2.0812629292162955e-05, + "loss": 0.772, + "step": 41105 + }, + { + "epoch": 1.5826756496631376, + "grad_norm": 1.3418041467666626, + "learning_rate": 2.0794166042711936e-05, + "loss": 0.7486, + "step": 41110 + }, + { + "epoch": 1.5828681424446582, + "grad_norm": 0.8369565606117249, + "learning_rate": 2.0775710036109762e-05, + "loss": 0.7444, + "step": 41115 + }, + { + "epoch": 1.583060635226179, + "grad_norm": 1.1210306882858276, + "learning_rate": 2.0757261274044048e-05, + "loss": 0.7875, + "step": 41120 + }, + { + "epoch": 1.5832531280076996, + "grad_norm": 1.1858896017074585, + "learning_rate": 2.07388197582019e-05, + "loss": 0.7042, + "step": 41125 + }, + { + "epoch": 1.5834456207892202, + "grad_norm": 0.9110993146896362, + "learning_rate": 2.072038549026961e-05, + "loss": 0.7668, + "step": 41130 + }, + { + "epoch": 1.5836381135707411, + "grad_norm": 1.5564924478530884, + "learning_rate": 2.070195847193288e-05, + "loss": 0.7743, + "step": 41135 + }, + { + "epoch": 1.5838306063522618, + "grad_norm": 1.6773813962936401, + "learning_rate": 2.068353870487675e-05, + "loss": 0.7443, + "step": 41140 + }, + { + "epoch": 1.5840230991337825, + "grad_norm": 1.0506762266159058, + "learning_rate": 2.066512619078561e-05, + "loss": 0.681, + "step": 41145 + }, + { + "epoch": 1.5842155919153031, + "grad_norm": 1.599478006362915, + "learning_rate": 2.0646720931343078e-05, + "loss": 0.7801, + "step": 41150 + }, + { + "epoch": 1.5844080846968238, + "grad_norm": 0.9615776538848877, + "learning_rate": 2.06283229282323e-05, + "loss": 0.6929, + "step": 41155 + }, + { + "epoch": 1.5846005774783447, + "grad_norm": 1.4779179096221924, + "learning_rate": 2.0609932183135582e-05, + "loss": 0.8112, + "step": 41160 + }, + { + "epoch": 1.5847930702598654, + "grad_norm": 1.1211433410644531, + "learning_rate": 2.059154869773465e-05, + "loss": 0.7598, + "step": 41165 + }, + { + "epoch": 1.584985563041386, + "grad_norm": 1.0643943548202515, + "learning_rate": 2.0573172473710567e-05, + "loss": 0.8607, + "step": 41170 + }, + { + "epoch": 1.5851780558229067, + "grad_norm": 2.5210201740264893, + "learning_rate": 2.0554803512743724e-05, + "loss": 0.9052, + "step": 41175 + }, + { + "epoch": 1.5853705486044274, + "grad_norm": 1.6734479665756226, + "learning_rate": 2.053644181651376e-05, + "loss": 0.8745, + "step": 41180 + }, + { + "epoch": 1.585563041385948, + "grad_norm": 1.5669529438018799, + "learning_rate": 2.051808738669987e-05, + "loss": 0.7996, + "step": 41185 + }, + { + "epoch": 1.5857555341674687, + "grad_norm": 1.2366806268692017, + "learning_rate": 2.0499740224980325e-05, + "loss": 0.719, + "step": 41190 + }, + { + "epoch": 1.5859480269489894, + "grad_norm": 1.0737662315368652, + "learning_rate": 2.0481400333032897e-05, + "loss": 0.6766, + "step": 41195 + }, + { + "epoch": 1.58614051973051, + "grad_norm": 2.486666679382324, + "learning_rate": 2.0463067712534656e-05, + "loss": 0.9854, + "step": 41200 + }, + { + "epoch": 1.5863330125120307, + "grad_norm": 1.0239930152893066, + "learning_rate": 2.0444742365162005e-05, + "loss": 0.8435, + "step": 41205 + }, + { + "epoch": 1.5865255052935514, + "grad_norm": 1.0103557109832764, + "learning_rate": 2.042642429259064e-05, + "loss": 0.8264, + "step": 41210 + }, + { + "epoch": 1.586717998075072, + "grad_norm": 1.7183647155761719, + "learning_rate": 2.040811349649564e-05, + "loss": 0.781, + "step": 41215 + }, + { + "epoch": 1.5869104908565927, + "grad_norm": 1.1959103345870972, + "learning_rate": 2.038980997855142e-05, + "loss": 0.7821, + "step": 41220 + }, + { + "epoch": 1.5871029836381134, + "grad_norm": 1.4438652992248535, + "learning_rate": 2.03715137404317e-05, + "loss": 0.8455, + "step": 41225 + }, + { + "epoch": 1.5872954764196343, + "grad_norm": 1.0840461254119873, + "learning_rate": 2.0353224783809587e-05, + "loss": 0.7846, + "step": 41230 + }, + { + "epoch": 1.587487969201155, + "grad_norm": 1.6811954975128174, + "learning_rate": 2.033494311035744e-05, + "loss": 0.8039, + "step": 41235 + }, + { + "epoch": 1.5876804619826757, + "grad_norm": 1.2145577669143677, + "learning_rate": 2.0316668721747e-05, + "loss": 0.8147, + "step": 41240 + }, + { + "epoch": 1.5878729547641963, + "grad_norm": 1.1736549139022827, + "learning_rate": 2.0298401619649353e-05, + "loss": 1.0046, + "step": 41245 + }, + { + "epoch": 1.588065447545717, + "grad_norm": 1.1645379066467285, + "learning_rate": 2.028014180573491e-05, + "loss": 0.8703, + "step": 41250 + }, + { + "epoch": 1.5882579403272379, + "grad_norm": 1.2913013696670532, + "learning_rate": 2.0261889281673397e-05, + "loss": 0.8041, + "step": 41255 + }, + { + "epoch": 1.5884504331087586, + "grad_norm": 1.1715826988220215, + "learning_rate": 2.0243644049133926e-05, + "loss": 0.8894, + "step": 41260 + }, + { + "epoch": 1.5886429258902792, + "grad_norm": 1.0594674348831177, + "learning_rate": 2.0225406109784805e-05, + "loss": 0.8548, + "step": 41265 + }, + { + "epoch": 1.5888354186718, + "grad_norm": 2.4181666374206543, + "learning_rate": 2.0207175465293904e-05, + "loss": 0.8165, + "step": 41270 + }, + { + "epoch": 1.5890279114533206, + "grad_norm": 0.9365816712379456, + "learning_rate": 2.0188952117328186e-05, + "loss": 0.8741, + "step": 41275 + }, + { + "epoch": 1.5892204042348412, + "grad_norm": 2.1108558177948, + "learning_rate": 2.0170736067554108e-05, + "loss": 0.8202, + "step": 41280 + }, + { + "epoch": 1.589412897016362, + "grad_norm": 1.5129166841506958, + "learning_rate": 2.0152527317637394e-05, + "loss": 0.7616, + "step": 41285 + }, + { + "epoch": 1.5896053897978826, + "grad_norm": 1.32972252368927, + "learning_rate": 2.0134325869243144e-05, + "loss": 0.905, + "step": 41290 + }, + { + "epoch": 1.5897978825794032, + "grad_norm": 0.950168788433075, + "learning_rate": 2.011613172403567e-05, + "loss": 0.6316, + "step": 41295 + }, + { + "epoch": 1.589990375360924, + "grad_norm": 0.9195626378059387, + "learning_rate": 2.009794488367882e-05, + "loss": 0.8385, + "step": 41300 + }, + { + "epoch": 1.5901828681424446, + "grad_norm": 1.176831603050232, + "learning_rate": 2.007976534983559e-05, + "loss": 0.8977, + "step": 41305 + }, + { + "epoch": 1.5903753609239653, + "grad_norm": 1.0982027053833008, + "learning_rate": 2.0061593124168398e-05, + "loss": 0.8981, + "step": 41310 + }, + { + "epoch": 1.590567853705486, + "grad_norm": 1.0368854999542236, + "learning_rate": 2.0043428208338987e-05, + "loss": 0.6795, + "step": 41315 + }, + { + "epoch": 1.5907603464870066, + "grad_norm": 0.7842391729354858, + "learning_rate": 2.0025270604008384e-05, + "loss": 0.6644, + "step": 41320 + }, + { + "epoch": 1.5909528392685275, + "grad_norm": 1.4070217609405518, + "learning_rate": 2.0007120312837e-05, + "loss": 0.7587, + "step": 41325 + }, + { + "epoch": 1.5911453320500482, + "grad_norm": 1.3950802087783813, + "learning_rate": 1.998897733648456e-05, + "loss": 0.8808, + "step": 41330 + }, + { + "epoch": 1.5913378248315688, + "grad_norm": 0.873643696308136, + "learning_rate": 1.9970841676610143e-05, + "loss": 0.8078, + "step": 41335 + }, + { + "epoch": 1.5915303176130895, + "grad_norm": 1.5144951343536377, + "learning_rate": 1.995271333487205e-05, + "loss": 0.7164, + "step": 41340 + }, + { + "epoch": 1.5917228103946102, + "grad_norm": 1.031936526298523, + "learning_rate": 1.993459231292811e-05, + "loss": 0.6438, + "step": 41345 + }, + { + "epoch": 1.591915303176131, + "grad_norm": 1.1276479959487915, + "learning_rate": 1.991647861243531e-05, + "loss": 0.8856, + "step": 41350 + }, + { + "epoch": 1.5921077959576517, + "grad_norm": 0.9697341322898865, + "learning_rate": 1.9898372235050022e-05, + "loss": 0.788, + "step": 41355 + }, + { + "epoch": 1.5923002887391724, + "grad_norm": 1.0358269214630127, + "learning_rate": 1.9880273182427965e-05, + "loss": 0.7337, + "step": 41360 + }, + { + "epoch": 1.592492781520693, + "grad_norm": 1.1483445167541504, + "learning_rate": 1.9862181456224216e-05, + "loss": 0.8329, + "step": 41365 + }, + { + "epoch": 1.5926852743022137, + "grad_norm": 0.7864820957183838, + "learning_rate": 1.9844097058093047e-05, + "loss": 0.664, + "step": 41370 + }, + { + "epoch": 1.5928777670837344, + "grad_norm": 0.8901979327201843, + "learning_rate": 1.9826019989688283e-05, + "loss": 0.7352, + "step": 41375 + }, + { + "epoch": 1.593070259865255, + "grad_norm": 1.5492757558822632, + "learning_rate": 1.9807950252662854e-05, + "loss": 0.8297, + "step": 41380 + }, + { + "epoch": 1.5932627526467757, + "grad_norm": 1.5439116954803467, + "learning_rate": 1.9789887848669143e-05, + "loss": 0.7502, + "step": 41385 + }, + { + "epoch": 1.5934552454282964, + "grad_norm": 1.6057932376861572, + "learning_rate": 1.9771832779358857e-05, + "loss": 0.8005, + "step": 41390 + }, + { + "epoch": 1.593647738209817, + "grad_norm": 1.489683985710144, + "learning_rate": 1.9753785046383022e-05, + "loss": 0.8509, + "step": 41395 + }, + { + "epoch": 1.5938402309913378, + "grad_norm": 1.2398594617843628, + "learning_rate": 1.9735744651391906e-05, + "loss": 0.839, + "step": 41400 + }, + { + "epoch": 1.5940327237728584, + "grad_norm": 0.5610536932945251, + "learning_rate": 1.9717711596035292e-05, + "loss": 0.686, + "step": 41405 + }, + { + "epoch": 1.594225216554379, + "grad_norm": 1.1076955795288086, + "learning_rate": 1.9699685881962115e-05, + "loss": 0.7216, + "step": 41410 + }, + { + "epoch": 1.5944177093358998, + "grad_norm": 0.8716223835945129, + "learning_rate": 1.9681667510820713e-05, + "loss": 0.8062, + "step": 41415 + }, + { + "epoch": 1.5946102021174204, + "grad_norm": 0.9298526048660278, + "learning_rate": 1.9663656484258764e-05, + "loss": 0.8884, + "step": 41420 + }, + { + "epoch": 1.5948026948989413, + "grad_norm": 1.1228723526000977, + "learning_rate": 1.9645652803923266e-05, + "loss": 0.7249, + "step": 41425 + }, + { + "epoch": 1.594995187680462, + "grad_norm": 1.0350871086120605, + "learning_rate": 1.9627656471460498e-05, + "loss": 0.7171, + "step": 41430 + }, + { + "epoch": 1.5951876804619827, + "grad_norm": 1.5256117582321167, + "learning_rate": 1.9609667488516138e-05, + "loss": 1.0133, + "step": 41435 + }, + { + "epoch": 1.5953801732435033, + "grad_norm": 1.4316469430923462, + "learning_rate": 1.9591685856735144e-05, + "loss": 0.7591, + "step": 41440 + }, + { + "epoch": 1.595572666025024, + "grad_norm": 1.8641667366027832, + "learning_rate": 1.9573711577761812e-05, + "loss": 0.8817, + "step": 41445 + }, + { + "epoch": 1.595765158806545, + "grad_norm": 1.348741054534912, + "learning_rate": 1.9555744653239815e-05, + "loss": 0.9214, + "step": 41450 + }, + { + "epoch": 1.5959576515880656, + "grad_norm": 1.2496349811553955, + "learning_rate": 1.9537785084812044e-05, + "loss": 0.7907, + "step": 41455 + }, + { + "epoch": 1.5961501443695862, + "grad_norm": 1.1491204500198364, + "learning_rate": 1.9519832874120824e-05, + "loss": 0.8712, + "step": 41460 + }, + { + "epoch": 1.596342637151107, + "grad_norm": 0.7588009238243103, + "learning_rate": 1.9501888022807745e-05, + "loss": 0.7473, + "step": 41465 + }, + { + "epoch": 1.5965351299326276, + "grad_norm": 1.0782458782196045, + "learning_rate": 1.9483950532513783e-05, + "loss": 0.7309, + "step": 41470 + }, + { + "epoch": 1.5967276227141483, + "grad_norm": 1.52632474899292, + "learning_rate": 1.9466020404879127e-05, + "loss": 0.8698, + "step": 41475 + }, + { + "epoch": 1.596920115495669, + "grad_norm": 1.286037564277649, + "learning_rate": 1.9448097641543462e-05, + "loss": 0.8768, + "step": 41480 + }, + { + "epoch": 1.5971126082771896, + "grad_norm": 0.8476093411445618, + "learning_rate": 1.9430182244145646e-05, + "loss": 0.9017, + "step": 41485 + }, + { + "epoch": 1.5973051010587103, + "grad_norm": 1.0609407424926758, + "learning_rate": 1.9412274214323923e-05, + "loss": 0.8112, + "step": 41490 + }, + { + "epoch": 1.597497593840231, + "grad_norm": 1.7310614585876465, + "learning_rate": 1.9394373553715885e-05, + "loss": 1.0244, + "step": 41495 + }, + { + "epoch": 1.5976900866217516, + "grad_norm": 1.7635456323623657, + "learning_rate": 1.9376480263958453e-05, + "loss": 0.8217, + "step": 41500 + }, + { + "epoch": 1.5978825794032723, + "grad_norm": 0.9183743000030518, + "learning_rate": 1.9358594346687765e-05, + "loss": 0.8512, + "step": 41505 + }, + { + "epoch": 1.598075072184793, + "grad_norm": 1.0998347997665405, + "learning_rate": 1.9340715803539466e-05, + "loss": 0.7804, + "step": 41510 + }, + { + "epoch": 1.5982675649663136, + "grad_norm": 1.388878345489502, + "learning_rate": 1.9322844636148375e-05, + "loss": 0.8285, + "step": 41515 + }, + { + "epoch": 1.5984600577478345, + "grad_norm": 1.1632678508758545, + "learning_rate": 1.930498084614869e-05, + "loss": 0.7176, + "step": 41520 + }, + { + "epoch": 1.5986525505293552, + "grad_norm": 0.8704983592033386, + "learning_rate": 1.9287124435173964e-05, + "loss": 0.7018, + "step": 41525 + }, + { + "epoch": 1.5988450433108758, + "grad_norm": 1.2163587808609009, + "learning_rate": 1.9269275404857022e-05, + "loss": 0.6076, + "step": 41530 + }, + { + "epoch": 1.5990375360923965, + "grad_norm": 0.8491012454032898, + "learning_rate": 1.9251433756830095e-05, + "loss": 0.7527, + "step": 41535 + }, + { + "epoch": 1.5992300288739172, + "grad_norm": 0.996997058391571, + "learning_rate": 1.9233599492724607e-05, + "loss": 0.6848, + "step": 41540 + }, + { + "epoch": 1.599422521655438, + "grad_norm": 1.3385977745056152, + "learning_rate": 1.9215772614171413e-05, + "loss": 0.8753, + "step": 41545 + }, + { + "epoch": 1.5996150144369587, + "grad_norm": 0.9963300228118896, + "learning_rate": 1.919795312280067e-05, + "loss": 0.8053, + "step": 41550 + }, + { + "epoch": 1.5998075072184794, + "grad_norm": 0.8091634511947632, + "learning_rate": 1.918014102024187e-05, + "loss": 0.8183, + "step": 41555 + }, + { + "epoch": 1.6, + "grad_norm": 1.6417735815048218, + "learning_rate": 1.916233630812374e-05, + "loss": 0.7796, + "step": 41560 + }, + { + "epoch": 1.6001924927815208, + "grad_norm": 0.8539758324623108, + "learning_rate": 1.914453898807451e-05, + "loss": 0.7709, + "step": 41565 + }, + { + "epoch": 1.6003849855630414, + "grad_norm": 1.14285409450531, + "learning_rate": 1.912674906172155e-05, + "loss": 0.762, + "step": 41570 + }, + { + "epoch": 1.600577478344562, + "grad_norm": 1.2252355813980103, + "learning_rate": 1.910896653069165e-05, + "loss": 0.9033, + "step": 41575 + }, + { + "epoch": 1.6007699711260828, + "grad_norm": 1.3306511640548706, + "learning_rate": 1.9091191396610895e-05, + "loss": 0.8857, + "step": 41580 + }, + { + "epoch": 1.6009624639076034, + "grad_norm": 0.9147464632987976, + "learning_rate": 1.9073423661104762e-05, + "loss": 0.7447, + "step": 41585 + }, + { + "epoch": 1.601154956689124, + "grad_norm": 1.4393258094787598, + "learning_rate": 1.9055663325797877e-05, + "loss": 0.7681, + "step": 41590 + }, + { + "epoch": 1.6013474494706448, + "grad_norm": 1.5441241264343262, + "learning_rate": 1.903791039231443e-05, + "loss": 0.9034, + "step": 41595 + }, + { + "epoch": 1.6015399422521654, + "grad_norm": 1.5993388891220093, + "learning_rate": 1.9020164862277724e-05, + "loss": 0.7531, + "step": 41600 + }, + { + "epoch": 1.6017324350336861, + "grad_norm": 1.3545401096343994, + "learning_rate": 1.900242673731051e-05, + "loss": 0.7417, + "step": 41605 + }, + { + "epoch": 1.6019249278152068, + "grad_norm": 2.0168793201446533, + "learning_rate": 1.89846960190348e-05, + "loss": 0.83, + "step": 41610 + }, + { + "epoch": 1.6021174205967275, + "grad_norm": 0.8939989805221558, + "learning_rate": 1.8966972709071985e-05, + "loss": 0.7973, + "step": 41615 + }, + { + "epoch": 1.6023099133782484, + "grad_norm": 1.6125478744506836, + "learning_rate": 1.894925680904268e-05, + "loss": 0.8518, + "step": 41620 + }, + { + "epoch": 1.602502406159769, + "grad_norm": 1.4295742511749268, + "learning_rate": 1.8931548320566972e-05, + "loss": 0.782, + "step": 41625 + }, + { + "epoch": 1.6026948989412897, + "grad_norm": 0.9275886416435242, + "learning_rate": 1.8913847245264116e-05, + "loss": 0.8498, + "step": 41630 + }, + { + "epoch": 1.6028873917228104, + "grad_norm": 1.2459815740585327, + "learning_rate": 1.8896153584752785e-05, + "loss": 0.8219, + "step": 41635 + }, + { + "epoch": 1.6030798845043313, + "grad_norm": 1.0280636548995972, + "learning_rate": 1.887846734065094e-05, + "loss": 0.7993, + "step": 41640 + }, + { + "epoch": 1.603272377285852, + "grad_norm": 0.9304599761962891, + "learning_rate": 1.886078851457591e-05, + "loss": 0.8449, + "step": 41645 + }, + { + "epoch": 1.6034648700673726, + "grad_norm": 1.3438136577606201, + "learning_rate": 1.884311710814425e-05, + "loss": 0.706, + "step": 41650 + }, + { + "epoch": 1.6036573628488933, + "grad_norm": 1.3000657558441162, + "learning_rate": 1.8825453122971904e-05, + "loss": 0.8801, + "step": 41655 + }, + { + "epoch": 1.603849855630414, + "grad_norm": 0.9097469449043274, + "learning_rate": 1.880779656067414e-05, + "loss": 0.7417, + "step": 41660 + }, + { + "epoch": 1.6040423484119346, + "grad_norm": 1.5576119422912598, + "learning_rate": 1.8790147422865532e-05, + "loss": 0.8143, + "step": 41665 + }, + { + "epoch": 1.6042348411934553, + "grad_norm": 1.0649263858795166, + "learning_rate": 1.8772505711160003e-05, + "loss": 0.6215, + "step": 41670 + }, + { + "epoch": 1.604427333974976, + "grad_norm": 1.2119357585906982, + "learning_rate": 1.8754871427170716e-05, + "loss": 0.7959, + "step": 41675 + }, + { + "epoch": 1.6046198267564966, + "grad_norm": 1.5147475004196167, + "learning_rate": 1.8737244572510238e-05, + "loss": 0.6919, + "step": 41680 + }, + { + "epoch": 1.6048123195380173, + "grad_norm": 1.540199637413025, + "learning_rate": 1.8719625148790432e-05, + "loss": 0.9114, + "step": 41685 + }, + { + "epoch": 1.605004812319538, + "grad_norm": 1.3708090782165527, + "learning_rate": 1.8702013157622488e-05, + "loss": 0.7575, + "step": 41690 + }, + { + "epoch": 1.6051973051010586, + "grad_norm": 0.8762415647506714, + "learning_rate": 1.8684408600616855e-05, + "loss": 0.7135, + "step": 41695 + }, + { + "epoch": 1.6053897978825793, + "grad_norm": 1.0719411373138428, + "learning_rate": 1.866681147938343e-05, + "loss": 0.7409, + "step": 41700 + }, + { + "epoch": 1.6055822906641, + "grad_norm": 1.3425824642181396, + "learning_rate": 1.864922179553128e-05, + "loss": 1.0134, + "step": 41705 + }, + { + "epoch": 1.6057747834456206, + "grad_norm": 1.2449208498001099, + "learning_rate": 1.8631639550668912e-05, + "loss": 0.8516, + "step": 41710 + }, + { + "epoch": 1.6059672762271415, + "grad_norm": 0.9909605979919434, + "learning_rate": 1.861406474640408e-05, + "loss": 0.8218, + "step": 41715 + }, + { + "epoch": 1.6061597690086622, + "grad_norm": 1.1478825807571411, + "learning_rate": 1.8596497384343926e-05, + "loss": 0.9639, + "step": 41720 + }, + { + "epoch": 1.6063522617901829, + "grad_norm": 1.305647611618042, + "learning_rate": 1.857893746609478e-05, + "loss": 0.8432, + "step": 41725 + }, + { + "epoch": 1.6065447545717035, + "grad_norm": 1.028073787689209, + "learning_rate": 1.8561384993262497e-05, + "loss": 0.7462, + "step": 41730 + }, + { + "epoch": 1.6067372473532242, + "grad_norm": 1.058986783027649, + "learning_rate": 1.8543839967452047e-05, + "loss": 0.8153, + "step": 41735 + }, + { + "epoch": 1.606929740134745, + "grad_norm": 1.5460129976272583, + "learning_rate": 1.8526302390267836e-05, + "loss": 0.7346, + "step": 41740 + }, + { + "epoch": 1.6071222329162658, + "grad_norm": 1.0355900526046753, + "learning_rate": 1.8508772263313556e-05, + "loss": 0.8171, + "step": 41745 + }, + { + "epoch": 1.6073147256977864, + "grad_norm": 1.0379014015197754, + "learning_rate": 1.849124958819224e-05, + "loss": 0.7432, + "step": 41750 + }, + { + "epoch": 1.607507218479307, + "grad_norm": 1.5380711555480957, + "learning_rate": 1.847373436650619e-05, + "loss": 0.9295, + "step": 41755 + }, + { + "epoch": 1.6076997112608278, + "grad_norm": 1.3490667343139648, + "learning_rate": 1.8456226599857064e-05, + "loss": 0.9094, + "step": 41760 + }, + { + "epoch": 1.6078922040423484, + "grad_norm": 1.5740834474563599, + "learning_rate": 1.8438726289845833e-05, + "loss": 0.7674, + "step": 41765 + }, + { + "epoch": 1.6080846968238691, + "grad_norm": 0.9285767674446106, + "learning_rate": 1.8421233438072795e-05, + "loss": 0.7368, + "step": 41770 + }, + { + "epoch": 1.6082771896053898, + "grad_norm": 1.3774783611297607, + "learning_rate": 1.840374804613757e-05, + "loss": 0.7023, + "step": 41775 + }, + { + "epoch": 1.6084696823869105, + "grad_norm": 1.00751793384552, + "learning_rate": 1.8386270115639013e-05, + "loss": 0.7488, + "step": 41780 + }, + { + "epoch": 1.6086621751684311, + "grad_norm": 1.7420424222946167, + "learning_rate": 1.836879964817546e-05, + "loss": 0.9531, + "step": 41785 + }, + { + "epoch": 1.6088546679499518, + "grad_norm": 1.4065346717834473, + "learning_rate": 1.8351336645344408e-05, + "loss": 0.8083, + "step": 41790 + }, + { + "epoch": 1.6090471607314725, + "grad_norm": 1.2782877683639526, + "learning_rate": 1.8333881108742736e-05, + "loss": 0.9427, + "step": 41795 + }, + { + "epoch": 1.6092396535129931, + "grad_norm": 1.675103783607483, + "learning_rate": 1.8316433039966653e-05, + "loss": 0.9521, + "step": 41800 + }, + { + "epoch": 1.6094321462945138, + "grad_norm": 1.5068132877349854, + "learning_rate": 1.8298992440611686e-05, + "loss": 0.8201, + "step": 41805 + }, + { + "epoch": 1.6096246390760347, + "grad_norm": 1.5667356252670288, + "learning_rate": 1.828155931227259e-05, + "loss": 0.882, + "step": 41810 + }, + { + "epoch": 1.6098171318575554, + "grad_norm": 1.710042119026184, + "learning_rate": 1.8264133656543613e-05, + "loss": 0.867, + "step": 41815 + }, + { + "epoch": 1.610009624639076, + "grad_norm": 1.5348536968231201, + "learning_rate": 1.824671547501814e-05, + "loss": 0.8304, + "step": 41820 + }, + { + "epoch": 1.6102021174205967, + "grad_norm": 1.3562618494033813, + "learning_rate": 1.8229304769288956e-05, + "loss": 0.8852, + "step": 41825 + }, + { + "epoch": 1.6103946102021174, + "grad_norm": 1.1753439903259277, + "learning_rate": 1.8211901540948183e-05, + "loss": 0.8705, + "step": 41830 + }, + { + "epoch": 1.6105871029836383, + "grad_norm": 0.7911993265151978, + "learning_rate": 1.8194505791587245e-05, + "loss": 0.642, + "step": 41835 + }, + { + "epoch": 1.610779595765159, + "grad_norm": 1.2742297649383545, + "learning_rate": 1.8177117522796784e-05, + "loss": 0.7778, + "step": 41840 + }, + { + "epoch": 1.6109720885466796, + "grad_norm": 0.5650262236595154, + "learning_rate": 1.8159736736166943e-05, + "loss": 0.6833, + "step": 41845 + }, + { + "epoch": 1.6111645813282003, + "grad_norm": 0.8346420526504517, + "learning_rate": 1.8142363433287026e-05, + "loss": 0.7115, + "step": 41850 + }, + { + "epoch": 1.611357074109721, + "grad_norm": 2.0539052486419678, + "learning_rate": 1.812499761574571e-05, + "loss": 0.9054, + "step": 41855 + }, + { + "epoch": 1.6115495668912416, + "grad_norm": 1.3676276206970215, + "learning_rate": 1.8107639285131005e-05, + "loss": 0.7522, + "step": 41860 + }, + { + "epoch": 1.6117420596727623, + "grad_norm": 0.8344974517822266, + "learning_rate": 1.809028844303018e-05, + "loss": 0.7912, + "step": 41865 + }, + { + "epoch": 1.611934552454283, + "grad_norm": 2.75577449798584, + "learning_rate": 1.807294509102988e-05, + "loss": 0.7293, + "step": 41870 + }, + { + "epoch": 1.6121270452358036, + "grad_norm": 1.7650033235549927, + "learning_rate": 1.8055609230716032e-05, + "loss": 0.7737, + "step": 41875 + }, + { + "epoch": 1.6123195380173243, + "grad_norm": 1.5389347076416016, + "learning_rate": 1.8038280863673907e-05, + "loss": 0.7563, + "step": 41880 + }, + { + "epoch": 1.612512030798845, + "grad_norm": 1.5038697719573975, + "learning_rate": 1.8020959991488006e-05, + "loss": 0.7828, + "step": 41885 + }, + { + "epoch": 1.6127045235803656, + "grad_norm": 1.3098371028900146, + "learning_rate": 1.8003646615742308e-05, + "loss": 0.7512, + "step": 41890 + }, + { + "epoch": 1.6128970163618863, + "grad_norm": 1.2375686168670654, + "learning_rate": 1.7986340738019912e-05, + "loss": 0.8669, + "step": 41895 + }, + { + "epoch": 1.613089509143407, + "grad_norm": 1.7652937173843384, + "learning_rate": 1.7969042359903376e-05, + "loss": 0.6875, + "step": 41900 + }, + { + "epoch": 1.6132820019249277, + "grad_norm": 1.404327630996704, + "learning_rate": 1.795175148297451e-05, + "loss": 0.786, + "step": 41905 + }, + { + "epoch": 1.6134744947064485, + "grad_norm": 1.2358356714248657, + "learning_rate": 1.7934468108814472e-05, + "loss": 0.7852, + "step": 41910 + }, + { + "epoch": 1.6136669874879692, + "grad_norm": 1.1059602499008179, + "learning_rate": 1.7917192239003644e-05, + "loss": 0.7523, + "step": 41915 + }, + { + "epoch": 1.6138594802694899, + "grad_norm": 1.4840894937515259, + "learning_rate": 1.7899923875121882e-05, + "loss": 0.7958, + "step": 41920 + }, + { + "epoch": 1.6140519730510106, + "grad_norm": 1.5999846458435059, + "learning_rate": 1.7882663018748193e-05, + "loss": 0.8017, + "step": 41925 + }, + { + "epoch": 1.6142444658325312, + "grad_norm": 1.9015750885009766, + "learning_rate": 1.7865409671460996e-05, + "loss": 0.8296, + "step": 41930 + }, + { + "epoch": 1.6144369586140521, + "grad_norm": 1.0894391536712646, + "learning_rate": 1.7848163834837995e-05, + "loss": 0.8116, + "step": 41935 + }, + { + "epoch": 1.6146294513955728, + "grad_norm": 1.131320834159851, + "learning_rate": 1.783092551045623e-05, + "loss": 0.7442, + "step": 41940 + }, + { + "epoch": 1.6148219441770935, + "grad_norm": 2.2992284297943115, + "learning_rate": 1.781369469989196e-05, + "loss": 0.8746, + "step": 41945 + }, + { + "epoch": 1.6150144369586141, + "grad_norm": 1.1256681680679321, + "learning_rate": 1.7796471404720916e-05, + "loss": 0.721, + "step": 41950 + }, + { + "epoch": 1.6152069297401348, + "grad_norm": 1.115007758140564, + "learning_rate": 1.7779255626518e-05, + "loss": 0.7711, + "step": 41955 + }, + { + "epoch": 1.6153994225216555, + "grad_norm": 1.9384006261825562, + "learning_rate": 1.7762047366857483e-05, + "loss": 1.0144, + "step": 41960 + }, + { + "epoch": 1.6155919153031761, + "grad_norm": 1.0422606468200684, + "learning_rate": 1.7744846627312962e-05, + "loss": 0.8137, + "step": 41965 + }, + { + "epoch": 1.6157844080846968, + "grad_norm": 0.9894946217536926, + "learning_rate": 1.7727653409457358e-05, + "loss": 0.7198, + "step": 41970 + }, + { + "epoch": 1.6159769008662175, + "grad_norm": 1.2634601593017578, + "learning_rate": 1.771046771486281e-05, + "loss": 0.9481, + "step": 41975 + }, + { + "epoch": 1.6161693936477382, + "grad_norm": 1.1557183265686035, + "learning_rate": 1.7693289545100876e-05, + "loss": 0.8676, + "step": 41980 + }, + { + "epoch": 1.6163618864292588, + "grad_norm": 1.1935524940490723, + "learning_rate": 1.767611890174238e-05, + "loss": 0.8162, + "step": 41985 + }, + { + "epoch": 1.6165543792107795, + "grad_norm": 1.1359078884124756, + "learning_rate": 1.7658955786357455e-05, + "loss": 0.8553, + "step": 41990 + }, + { + "epoch": 1.6167468719923002, + "grad_norm": 1.311218500137329, + "learning_rate": 1.76418002005156e-05, + "loss": 0.745, + "step": 41995 + }, + { + "epoch": 1.6169393647738208, + "grad_norm": 1.595218300819397, + "learning_rate": 1.7624652145785523e-05, + "loss": 0.6425, + "step": 42000 + }, + { + "epoch": 1.6171318575553417, + "grad_norm": 1.5510958433151245, + "learning_rate": 1.7607511623735317e-05, + "loss": 0.7513, + "step": 42005 + }, + { + "epoch": 1.6173243503368624, + "grad_norm": 1.0332719087600708, + "learning_rate": 1.759037863593237e-05, + "loss": 0.7869, + "step": 42010 + }, + { + "epoch": 1.617516843118383, + "grad_norm": 1.556477427482605, + "learning_rate": 1.7573253183943404e-05, + "loss": 0.6109, + "step": 42015 + }, + { + "epoch": 1.6177093358999037, + "grad_norm": 3.525594472885132, + "learning_rate": 1.755613526933436e-05, + "loss": 0.9172, + "step": 42020 + }, + { + "epoch": 1.6179018286814244, + "grad_norm": 2.301964044570923, + "learning_rate": 1.7539024893670664e-05, + "loss": 1.1278, + "step": 42025 + }, + { + "epoch": 1.6180943214629453, + "grad_norm": 0.9457361698150635, + "learning_rate": 1.7521922058516827e-05, + "loss": 0.6383, + "step": 42030 + }, + { + "epoch": 1.618286814244466, + "grad_norm": 2.5374155044555664, + "learning_rate": 1.7504826765436898e-05, + "loss": 0.74, + "step": 42035 + }, + { + "epoch": 1.6184793070259866, + "grad_norm": 1.9584330320358276, + "learning_rate": 1.7487739015994064e-05, + "loss": 0.693, + "step": 42040 + }, + { + "epoch": 1.6186717998075073, + "grad_norm": 1.7019546031951904, + "learning_rate": 1.7470658811750905e-05, + "loss": 0.8639, + "step": 42045 + }, + { + "epoch": 1.618864292589028, + "grad_norm": 0.9637229442596436, + "learning_rate": 1.7453586154269287e-05, + "loss": 0.7905, + "step": 42050 + }, + { + "epoch": 1.6190567853705486, + "grad_norm": 1.0330854654312134, + "learning_rate": 1.7436521045110422e-05, + "loss": 0.6918, + "step": 42055 + }, + { + "epoch": 1.6192492781520693, + "grad_norm": 1.4338631629943848, + "learning_rate": 1.741946348583474e-05, + "loss": 0.7246, + "step": 42060 + }, + { + "epoch": 1.61944177093359, + "grad_norm": 1.057032823562622, + "learning_rate": 1.740241347800209e-05, + "loss": 0.8211, + "step": 42065 + }, + { + "epoch": 1.6196342637151107, + "grad_norm": 1.3909273147583008, + "learning_rate": 1.738537102317156e-05, + "loss": 0.8421, + "step": 42070 + }, + { + "epoch": 1.6198267564966313, + "grad_norm": 1.2499020099639893, + "learning_rate": 1.7368336122901573e-05, + "loss": 0.7806, + "step": 42075 + }, + { + "epoch": 1.620019249278152, + "grad_norm": 0.8923901319503784, + "learning_rate": 1.7351308778749897e-05, + "loss": 0.6891, + "step": 42080 + }, + { + "epoch": 1.6202117420596727, + "grad_norm": 1.582690954208374, + "learning_rate": 1.7334288992273505e-05, + "loss": 0.8041, + "step": 42085 + }, + { + "epoch": 1.6204042348411933, + "grad_norm": 2.02708101272583, + "learning_rate": 1.731727676502878e-05, + "loss": 0.9111, + "step": 42090 + }, + { + "epoch": 1.620596727622714, + "grad_norm": 0.7452544569969177, + "learning_rate": 1.730027209857137e-05, + "loss": 0.7931, + "step": 42095 + }, + { + "epoch": 1.6207892204042347, + "grad_norm": 1.4981223344802856, + "learning_rate": 1.7283274994456267e-05, + "loss": 0.8637, + "step": 42100 + }, + { + "epoch": 1.6209817131857556, + "grad_norm": 1.1747374534606934, + "learning_rate": 1.7266285454237664e-05, + "loss": 0.8657, + "step": 42105 + }, + { + "epoch": 1.6211742059672762, + "grad_norm": 1.1444958448410034, + "learning_rate": 1.7249303479469247e-05, + "loss": 0.8245, + "step": 42110 + }, + { + "epoch": 1.621366698748797, + "grad_norm": 1.2287684679031372, + "learning_rate": 1.7232329071703833e-05, + "loss": 0.8287, + "step": 42115 + }, + { + "epoch": 1.6215591915303176, + "grad_norm": 2.1734228134155273, + "learning_rate": 1.7215362232493638e-05, + "loss": 0.8472, + "step": 42120 + }, + { + "epoch": 1.6217516843118385, + "grad_norm": 1.0564271211624146, + "learning_rate": 1.719840296339017e-05, + "loss": 0.7898, + "step": 42125 + }, + { + "epoch": 1.6219441770933591, + "grad_norm": 1.1211154460906982, + "learning_rate": 1.718145126594426e-05, + "loss": 0.8268, + "step": 42130 + }, + { + "epoch": 1.6221366698748798, + "grad_norm": 1.5351982116699219, + "learning_rate": 1.7164507141705967e-05, + "loss": 0.8677, + "step": 42135 + }, + { + "epoch": 1.6223291626564005, + "grad_norm": 1.2579121589660645, + "learning_rate": 1.7147570592224803e-05, + "loss": 0.9406, + "step": 42140 + }, + { + "epoch": 1.6225216554379212, + "grad_norm": 1.0373250246047974, + "learning_rate": 1.7130641619049436e-05, + "loss": 0.7835, + "step": 42145 + }, + { + "epoch": 1.6227141482194418, + "grad_norm": 1.5844688415527344, + "learning_rate": 1.7113720223727937e-05, + "loss": 0.8265, + "step": 42150 + }, + { + "epoch": 1.6229066410009625, + "grad_norm": 1.3220007419586182, + "learning_rate": 1.7096806407807653e-05, + "loss": 0.8124, + "step": 42155 + }, + { + "epoch": 1.6230991337824832, + "grad_norm": 1.0909702777862549, + "learning_rate": 1.7079900172835263e-05, + "loss": 0.7539, + "step": 42160 + }, + { + "epoch": 1.6232916265640038, + "grad_norm": 1.1731175184249878, + "learning_rate": 1.7063001520356658e-05, + "loss": 0.8448, + "step": 42165 + }, + { + "epoch": 1.6234841193455245, + "grad_norm": 1.3320856094360352, + "learning_rate": 1.7046110451917207e-05, + "loss": 0.81, + "step": 42170 + }, + { + "epoch": 1.6236766121270452, + "grad_norm": 1.6085408926010132, + "learning_rate": 1.7029226969061407e-05, + "loss": 0.8147, + "step": 42175 + }, + { + "epoch": 1.6238691049085658, + "grad_norm": 1.2450767755508423, + "learning_rate": 1.7012351073333168e-05, + "loss": 0.7533, + "step": 42180 + }, + { + "epoch": 1.6240615976900865, + "grad_norm": 0.9313931465148926, + "learning_rate": 1.6995482766275682e-05, + "loss": 0.8205, + "step": 42185 + }, + { + "epoch": 1.6242540904716072, + "grad_norm": 2.4100301265716553, + "learning_rate": 1.697862204943148e-05, + "loss": 0.8043, + "step": 42190 + }, + { + "epoch": 1.6244465832531279, + "grad_norm": 1.371580958366394, + "learning_rate": 1.6965138941945725e-05, + "loss": 1.6104, + "step": 42195 + }, + { + "epoch": 1.6246390760346487, + "grad_norm": 0.8591129779815674, + "learning_rate": 1.6948291891370227e-05, + "loss": 0.7858, + "step": 42200 + }, + { + "epoch": 1.6248315688161694, + "grad_norm": 1.4864912033081055, + "learning_rate": 1.6931452435323226e-05, + "loss": 0.8206, + "step": 42205 + }, + { + "epoch": 1.62502406159769, + "grad_norm": 1.010176420211792, + "learning_rate": 1.6914620575344663e-05, + "loss": 0.766, + "step": 42210 + }, + { + "epoch": 1.6252165543792108, + "grad_norm": 1.7786433696746826, + "learning_rate": 1.6897796312973634e-05, + "loss": 0.8346, + "step": 42215 + }, + { + "epoch": 1.6254090471607314, + "grad_norm": 0.9494479894638062, + "learning_rate": 1.688097964974863e-05, + "loss": 0.8556, + "step": 42220 + }, + { + "epoch": 1.6256015399422523, + "grad_norm": 1.153810739517212, + "learning_rate": 1.686417058720743e-05, + "loss": 0.7756, + "step": 42225 + }, + { + "epoch": 1.625794032723773, + "grad_norm": 1.232348918914795, + "learning_rate": 1.6847369126887124e-05, + "loss": 0.8343, + "step": 42230 + }, + { + "epoch": 1.6259865255052937, + "grad_norm": 1.010253667831421, + "learning_rate": 1.6830575270324022e-05, + "loss": 0.9215, + "step": 42235 + }, + { + "epoch": 1.6261790182868143, + "grad_norm": 2.2589335441589355, + "learning_rate": 1.6813789019053926e-05, + "loss": 0.7625, + "step": 42240 + }, + { + "epoch": 1.626371511068335, + "grad_norm": 2.0202784538269043, + "learning_rate": 1.679701037461173e-05, + "loss": 0.8108, + "step": 42245 + }, + { + "epoch": 1.6265640038498557, + "grad_norm": 1.7455151081085205, + "learning_rate": 1.6780239338531777e-05, + "loss": 0.946, + "step": 42250 + }, + { + "epoch": 1.6267564966313763, + "grad_norm": 1.2754290103912354, + "learning_rate": 1.676347591234765e-05, + "loss": 0.9888, + "step": 42255 + }, + { + "epoch": 1.626948989412897, + "grad_norm": 1.6412559747695923, + "learning_rate": 1.6746720097592285e-05, + "loss": 0.8801, + "step": 42260 + }, + { + "epoch": 1.6271414821944177, + "grad_norm": 1.385617733001709, + "learning_rate": 1.6729971895797835e-05, + "loss": 0.8472, + "step": 42265 + }, + { + "epoch": 1.6273339749759383, + "grad_norm": 1.6804052591323853, + "learning_rate": 1.6713231308495846e-05, + "loss": 0.8365, + "step": 42270 + }, + { + "epoch": 1.627526467757459, + "grad_norm": 1.2974660396575928, + "learning_rate": 1.6696498337217125e-05, + "loss": 0.7316, + "step": 42275 + }, + { + "epoch": 1.6277189605389797, + "grad_norm": 1.354999303817749, + "learning_rate": 1.6679772983491804e-05, + "loss": 0.9026, + "step": 42280 + }, + { + "epoch": 1.6279114533205004, + "grad_norm": 1.667365312576294, + "learning_rate": 1.666305524884931e-05, + "loss": 0.8071, + "step": 42285 + }, + { + "epoch": 1.628103946102021, + "grad_norm": 0.764616847038269, + "learning_rate": 1.664634513481832e-05, + "loss": 0.6431, + "step": 42290 + }, + { + "epoch": 1.628296438883542, + "grad_norm": 1.2956533432006836, + "learning_rate": 1.6629642642926947e-05, + "loss": 0.8066, + "step": 42295 + }, + { + "epoch": 1.6284889316650626, + "grad_norm": 1.1321804523468018, + "learning_rate": 1.661294777470245e-05, + "loss": 0.7923, + "step": 42300 + }, + { + "epoch": 1.6286814244465833, + "grad_norm": 1.4168885946273804, + "learning_rate": 1.659626053167149e-05, + "loss": 0.6876, + "step": 42305 + }, + { + "epoch": 1.628873917228104, + "grad_norm": 1.1318153142929077, + "learning_rate": 1.6579580915360003e-05, + "loss": 1.0224, + "step": 42310 + }, + { + "epoch": 1.6290664100096246, + "grad_norm": 1.2682236433029175, + "learning_rate": 1.656290892729325e-05, + "loss": 0.6783, + "step": 42315 + }, + { + "epoch": 1.6292589027911455, + "grad_norm": 1.0369542837142944, + "learning_rate": 1.654624456899572e-05, + "loss": 0.746, + "step": 42320 + }, + { + "epoch": 1.6294513955726662, + "grad_norm": 1.284934639930725, + "learning_rate": 1.6529587841991336e-05, + "loss": 0.7735, + "step": 42325 + }, + { + "epoch": 1.6296438883541868, + "grad_norm": 0.9802597761154175, + "learning_rate": 1.6512938747803186e-05, + "loss": 0.7332, + "step": 42330 + }, + { + "epoch": 1.6298363811357075, + "grad_norm": 1.6991342306137085, + "learning_rate": 1.649629728795372e-05, + "loss": 0.8675, + "step": 42335 + }, + { + "epoch": 1.6300288739172282, + "grad_norm": 1.371645212173462, + "learning_rate": 1.6479663463964722e-05, + "loss": 0.7313, + "step": 42340 + }, + { + "epoch": 1.6302213666987488, + "grad_norm": 1.8463810682296753, + "learning_rate": 1.646303727735724e-05, + "loss": 0.9124, + "step": 42345 + }, + { + "epoch": 1.6304138594802695, + "grad_norm": 2.088982582092285, + "learning_rate": 1.6446418729651604e-05, + "loss": 0.8852, + "step": 42350 + }, + { + "epoch": 1.6306063522617902, + "grad_norm": 1.3795658349990845, + "learning_rate": 1.6429807822367482e-05, + "loss": 0.838, + "step": 42355 + }, + { + "epoch": 1.6307988450433109, + "grad_norm": 1.5469810962677002, + "learning_rate": 1.641320455702383e-05, + "loss": 0.9927, + "step": 42360 + }, + { + "epoch": 1.6309913378248315, + "grad_norm": 1.1996501684188843, + "learning_rate": 1.6396608935138902e-05, + "loss": 0.7755, + "step": 42365 + }, + { + "epoch": 1.6311838306063522, + "grad_norm": 0.9938015937805176, + "learning_rate": 1.63800209582303e-05, + "loss": 0.7619, + "step": 42370 + }, + { + "epoch": 1.6313763233878729, + "grad_norm": 1.932384729385376, + "learning_rate": 1.636344062781482e-05, + "loss": 0.8774, + "step": 42375 + }, + { + "epoch": 1.6315688161693935, + "grad_norm": 0.921647310256958, + "learning_rate": 1.6346867945408662e-05, + "loss": 0.7319, + "step": 42380 + }, + { + "epoch": 1.6317613089509142, + "grad_norm": 1.4289840459823608, + "learning_rate": 1.6330302912527263e-05, + "loss": 0.7474, + "step": 42385 + }, + { + "epoch": 1.6319538017324349, + "grad_norm": 2.0691914558410645, + "learning_rate": 1.6313745530685443e-05, + "loss": 0.9581, + "step": 42390 + }, + { + "epoch": 1.6321462945139558, + "grad_norm": 1.3220711946487427, + "learning_rate": 1.6297195801397157e-05, + "loss": 0.6706, + "step": 42395 + }, + { + "epoch": 1.6323387872954764, + "grad_norm": 1.0470640659332275, + "learning_rate": 1.6280653726175897e-05, + "loss": 0.6795, + "step": 42400 + }, + { + "epoch": 1.632531280076997, + "grad_norm": 1.722261667251587, + "learning_rate": 1.626411930653423e-05, + "loss": 0.8447, + "step": 42405 + }, + { + "epoch": 1.6327237728585178, + "grad_norm": 2.425245761871338, + "learning_rate": 1.624759254398417e-05, + "loss": 0.9633, + "step": 42410 + }, + { + "epoch": 1.6329162656400384, + "grad_norm": 1.6226695775985718, + "learning_rate": 1.6231073440036947e-05, + "loss": 0.8426, + "step": 42415 + }, + { + "epoch": 1.6331087584215593, + "grad_norm": 2.568894147872925, + "learning_rate": 1.621456199620317e-05, + "loss": 0.729, + "step": 42420 + }, + { + "epoch": 1.63330125120308, + "grad_norm": 1.7268556356430054, + "learning_rate": 1.6198058213992617e-05, + "loss": 0.7958, + "step": 42425 + }, + { + "epoch": 1.6334937439846007, + "grad_norm": 1.1411241292953491, + "learning_rate": 1.618156209491456e-05, + "loss": 0.8699, + "step": 42430 + }, + { + "epoch": 1.6336862367661213, + "grad_norm": 1.1613513231277466, + "learning_rate": 1.6165073640477368e-05, + "loss": 0.8642, + "step": 42435 + }, + { + "epoch": 1.633878729547642, + "grad_norm": 1.3415052890777588, + "learning_rate": 1.6148592852188838e-05, + "loss": 0.9454, + "step": 42440 + }, + { + "epoch": 1.6340712223291627, + "grad_norm": 0.5342742204666138, + "learning_rate": 1.613211973155604e-05, + "loss": 0.8002, + "step": 42445 + }, + { + "epoch": 1.6342637151106834, + "grad_norm": 1.4904274940490723, + "learning_rate": 1.6115654280085335e-05, + "loss": 0.8969, + "step": 42450 + }, + { + "epoch": 1.634456207892204, + "grad_norm": 0.9641575813293457, + "learning_rate": 1.609919649928231e-05, + "loss": 0.7795, + "step": 42455 + }, + { + "epoch": 1.6346487006737247, + "grad_norm": 1.267978310585022, + "learning_rate": 1.6082746390652026e-05, + "loss": 0.7523, + "step": 42460 + }, + { + "epoch": 1.6348411934552454, + "grad_norm": 1.0590537786483765, + "learning_rate": 1.606630395569866e-05, + "loss": 0.7176, + "step": 42465 + }, + { + "epoch": 1.635033686236766, + "grad_norm": 0.7965215444564819, + "learning_rate": 1.604986919592578e-05, + "loss": 0.8919, + "step": 42470 + }, + { + "epoch": 1.6352261790182867, + "grad_norm": 0.9418613314628601, + "learning_rate": 1.603344211283625e-05, + "loss": 0.7038, + "step": 42475 + }, + { + "epoch": 1.6354186717998074, + "grad_norm": 1.7966370582580566, + "learning_rate": 1.6017022707932237e-05, + "loss": 0.7553, + "step": 42480 + }, + { + "epoch": 1.635611164581328, + "grad_norm": 0.9546367526054382, + "learning_rate": 1.6000610982715135e-05, + "loss": 0.7796, + "step": 42485 + }, + { + "epoch": 1.635803657362849, + "grad_norm": 1.3808022737503052, + "learning_rate": 1.598420693868571e-05, + "loss": 0.6758, + "step": 42490 + }, + { + "epoch": 1.6359961501443696, + "grad_norm": 0.980531632900238, + "learning_rate": 1.5967810577344034e-05, + "loss": 0.8068, + "step": 42495 + }, + { + "epoch": 1.6361886429258903, + "grad_norm": 1.2579573392868042, + "learning_rate": 1.5951421900189366e-05, + "loss": 0.9325, + "step": 42500 + }, + { + "epoch": 1.636381135707411, + "grad_norm": 1.0082838535308838, + "learning_rate": 1.5935040908720455e-05, + "loss": 0.8162, + "step": 42505 + }, + { + "epoch": 1.6365736284889316, + "grad_norm": 1.9807016849517822, + "learning_rate": 1.5918667604435132e-05, + "loss": 0.7665, + "step": 42510 + }, + { + "epoch": 1.6367661212704525, + "grad_norm": 1.1579536199569702, + "learning_rate": 1.5902301988830682e-05, + "loss": 0.856, + "step": 42515 + }, + { + "epoch": 1.6369586140519732, + "grad_norm": 1.3920001983642578, + "learning_rate": 1.588594406340361e-05, + "loss": 0.8391, + "step": 42520 + }, + { + "epoch": 1.6371511068334939, + "grad_norm": 1.4542222023010254, + "learning_rate": 1.5869593829649787e-05, + "loss": 0.81, + "step": 42525 + }, + { + "epoch": 1.6373435996150145, + "grad_norm": 1.2081284523010254, + "learning_rate": 1.5853251289064242e-05, + "loss": 0.8316, + "step": 42530 + }, + { + "epoch": 1.6375360923965352, + "grad_norm": 3.1090595722198486, + "learning_rate": 1.583691644314148e-05, + "loss": 0.9793, + "step": 42535 + }, + { + "epoch": 1.6377285851780559, + "grad_norm": 1.5080655813217163, + "learning_rate": 1.5820589293375142e-05, + "loss": 0.9199, + "step": 42540 + }, + { + "epoch": 1.6379210779595765, + "grad_norm": 1.2855561971664429, + "learning_rate": 1.5804269841258323e-05, + "loss": 0.7934, + "step": 42545 + }, + { + "epoch": 1.6381135707410972, + "grad_norm": 1.6619151830673218, + "learning_rate": 1.578795808828326e-05, + "loss": 0.8615, + "step": 42550 + }, + { + "epoch": 1.6383060635226179, + "grad_norm": 1.0249853134155273, + "learning_rate": 1.5771654035941574e-05, + "loss": 0.8015, + "step": 42555 + }, + { + "epoch": 1.6384985563041385, + "grad_norm": 1.0512340068817139, + "learning_rate": 1.575535768572416e-05, + "loss": 0.7564, + "step": 42560 + }, + { + "epoch": 1.6386910490856592, + "grad_norm": 1.487186074256897, + "learning_rate": 1.5739069039121245e-05, + "loss": 0.8638, + "step": 42565 + }, + { + "epoch": 1.6388835418671799, + "grad_norm": 1.0381027460098267, + "learning_rate": 1.5722788097622267e-05, + "loss": 0.917, + "step": 42570 + }, + { + "epoch": 1.6390760346487006, + "grad_norm": 1.8956249952316284, + "learning_rate": 1.5706514862716028e-05, + "loss": 0.8835, + "step": 42575 + }, + { + "epoch": 1.6392685274302212, + "grad_norm": 0.9786503314971924, + "learning_rate": 1.5690249335890605e-05, + "loss": 0.7697, + "step": 42580 + }, + { + "epoch": 1.6394610202117421, + "grad_norm": 1.1818338632583618, + "learning_rate": 1.567399151863339e-05, + "loss": 0.8998, + "step": 42585 + }, + { + "epoch": 1.6396535129932628, + "grad_norm": 1.3527082204818726, + "learning_rate": 1.565774141243106e-05, + "loss": 0.7461, + "step": 42590 + }, + { + "epoch": 1.6398460057747835, + "grad_norm": 1.5790470838546753, + "learning_rate": 1.5641499018769545e-05, + "loss": 0.8563, + "step": 42595 + }, + { + "epoch": 1.6400384985563041, + "grad_norm": 0.7987781763076782, + "learning_rate": 1.5625264339134115e-05, + "loss": 0.7464, + "step": 42600 + }, + { + "epoch": 1.6402309913378248, + "grad_norm": 1.0493252277374268, + "learning_rate": 1.5609037375009326e-05, + "loss": 0.6797, + "step": 42605 + }, + { + "epoch": 1.6404234841193457, + "grad_norm": 1.0951621532440186, + "learning_rate": 1.559281812787906e-05, + "loss": 0.7957, + "step": 42610 + }, + { + "epoch": 1.6406159769008664, + "grad_norm": 1.529111623764038, + "learning_rate": 1.5576606599226383e-05, + "loss": 0.894, + "step": 42615 + }, + { + "epoch": 1.640808469682387, + "grad_norm": 0.8548210263252258, + "learning_rate": 1.5560402790533823e-05, + "loss": 0.7031, + "step": 42620 + }, + { + "epoch": 1.6410009624639077, + "grad_norm": 1.763548493385315, + "learning_rate": 1.554420670328305e-05, + "loss": 0.7808, + "step": 42625 + }, + { + "epoch": 1.6411934552454284, + "grad_norm": 1.7161123752593994, + "learning_rate": 1.5528018338955098e-05, + "loss": 0.686, + "step": 42630 + }, + { + "epoch": 1.641385948026949, + "grad_norm": 2.433823347091675, + "learning_rate": 1.5511837699030295e-05, + "loss": 0.788, + "step": 42635 + }, + { + "epoch": 1.6415784408084697, + "grad_norm": 1.3295754194259644, + "learning_rate": 1.549566478498827e-05, + "loss": 0.765, + "step": 42640 + }, + { + "epoch": 1.6417709335899904, + "grad_norm": 1.3078267574310303, + "learning_rate": 1.547949959830787e-05, + "loss": 0.7929, + "step": 42645 + }, + { + "epoch": 1.641963426371511, + "grad_norm": 2.7279410362243652, + "learning_rate": 1.5463342140467373e-05, + "loss": 0.9705, + "step": 42650 + }, + { + "epoch": 1.6421559191530317, + "grad_norm": 1.5727055072784424, + "learning_rate": 1.5447192412944223e-05, + "loss": 0.8038, + "step": 42655 + }, + { + "epoch": 1.6423484119345524, + "grad_norm": 1.5005487203598022, + "learning_rate": 1.5431050417215208e-05, + "loss": 0.8304, + "step": 42660 + }, + { + "epoch": 1.642540904716073, + "grad_norm": 1.3737221956253052, + "learning_rate": 1.541491615475642e-05, + "loss": 0.9049, + "step": 42665 + }, + { + "epoch": 1.6427333974975937, + "grad_norm": 0.9405394196510315, + "learning_rate": 1.5398789627043243e-05, + "loss": 0.7638, + "step": 42670 + }, + { + "epoch": 1.6429258902791144, + "grad_norm": 1.3767449855804443, + "learning_rate": 1.5382670835550293e-05, + "loss": 0.791, + "step": 42675 + }, + { + "epoch": 1.643118383060635, + "grad_norm": 0.9052772521972656, + "learning_rate": 1.5366559781751566e-05, + "loss": 0.8165, + "step": 42680 + }, + { + "epoch": 1.643310875842156, + "grad_norm": 1.9760810136795044, + "learning_rate": 1.53504564671203e-05, + "loss": 1.0684, + "step": 42685 + }, + { + "epoch": 1.6435033686236766, + "grad_norm": 0.9905810952186584, + "learning_rate": 1.533436089312904e-05, + "loss": 0.7722, + "step": 42690 + }, + { + "epoch": 1.6436958614051973, + "grad_norm": 1.814131259918213, + "learning_rate": 1.531827306124963e-05, + "loss": 0.7637, + "step": 42695 + }, + { + "epoch": 1.643888354186718, + "grad_norm": 1.4231481552124023, + "learning_rate": 1.530219297295318e-05, + "loss": 0.8173, + "step": 42700 + }, + { + "epoch": 1.6440808469682386, + "grad_norm": 1.3890655040740967, + "learning_rate": 1.5286120629710098e-05, + "loss": 0.7146, + "step": 42705 + }, + { + "epoch": 1.6442733397497595, + "grad_norm": 1.4815847873687744, + "learning_rate": 1.527005603299011e-05, + "loss": 0.8625, + "step": 42710 + }, + { + "epoch": 1.6444658325312802, + "grad_norm": 1.5338302850723267, + "learning_rate": 1.5253999184262235e-05, + "loss": 0.6705, + "step": 42715 + }, + { + "epoch": 1.6446583253128009, + "grad_norm": 1.2656974792480469, + "learning_rate": 1.523795008499469e-05, + "loss": 0.9487, + "step": 42720 + }, + { + "epoch": 1.6448508180943215, + "grad_norm": 1.0494598150253296, + "learning_rate": 1.5221908736655167e-05, + "loss": 0.7775, + "step": 42725 + }, + { + "epoch": 1.6450433108758422, + "grad_norm": 1.1466418504714966, + "learning_rate": 1.5205875140710458e-05, + "loss": 0.9629, + "step": 42730 + }, + { + "epoch": 1.6452358036573629, + "grad_norm": 1.3125535249710083, + "learning_rate": 1.5189849298626769e-05, + "loss": 0.9284, + "step": 42735 + }, + { + "epoch": 1.6454282964388836, + "grad_norm": 1.2060669660568237, + "learning_rate": 1.5173831211869539e-05, + "loss": 0.8197, + "step": 42740 + }, + { + "epoch": 1.6456207892204042, + "grad_norm": 1.393114447593689, + "learning_rate": 1.5157820881903539e-05, + "loss": 0.8467, + "step": 42745 + }, + { + "epoch": 1.645813282001925, + "grad_norm": 2.256540060043335, + "learning_rate": 1.5141818310192756e-05, + "loss": 0.9494, + "step": 42750 + }, + { + "epoch": 1.6460057747834456, + "grad_norm": 1.1511859893798828, + "learning_rate": 1.5125823498200598e-05, + "loss": 0.8653, + "step": 42755 + }, + { + "epoch": 1.6461982675649662, + "grad_norm": 1.7524446249008179, + "learning_rate": 1.5109836447389613e-05, + "loss": 0.7703, + "step": 42760 + }, + { + "epoch": 1.646390760346487, + "grad_norm": 1.9260464906692505, + "learning_rate": 1.5093857159221747e-05, + "loss": 0.8431, + "step": 42765 + }, + { + "epoch": 1.6465832531280076, + "grad_norm": 1.6984754800796509, + "learning_rate": 1.5077885635158185e-05, + "loss": 0.7436, + "step": 42770 + }, + { + "epoch": 1.6467757459095282, + "grad_norm": 1.1173107624053955, + "learning_rate": 1.5061921876659446e-05, + "loss": 0.7743, + "step": 42775 + }, + { + "epoch": 1.6469682386910491, + "grad_norm": 1.5459610223770142, + "learning_rate": 1.5045965885185253e-05, + "loss": 0.7828, + "step": 42780 + }, + { + "epoch": 1.6471607314725698, + "grad_norm": 1.2092872858047485, + "learning_rate": 1.503001766219475e-05, + "loss": 0.8356, + "step": 42785 + }, + { + "epoch": 1.6473532242540905, + "grad_norm": 1.4177119731903076, + "learning_rate": 1.5014077209146227e-05, + "loss": 0.6802, + "step": 42790 + }, + { + "epoch": 1.6475457170356111, + "grad_norm": 1.1195024251937866, + "learning_rate": 1.499814452749737e-05, + "loss": 0.8973, + "step": 42795 + }, + { + "epoch": 1.6477382098171318, + "grad_norm": 1.1029400825500488, + "learning_rate": 1.4982219618705119e-05, + "loss": 0.7615, + "step": 42800 + }, + { + "epoch": 1.6479307025986527, + "grad_norm": 1.2463111877441406, + "learning_rate": 1.4966302484225681e-05, + "loss": 0.8533, + "step": 42805 + }, + { + "epoch": 1.6481231953801734, + "grad_norm": 1.6013331413269043, + "learning_rate": 1.4950393125514605e-05, + "loss": 0.7185, + "step": 42810 + }, + { + "epoch": 1.648315688161694, + "grad_norm": 1.4675661325454712, + "learning_rate": 1.4934491544026663e-05, + "loss": 0.6616, + "step": 42815 + }, + { + "epoch": 1.6485081809432147, + "grad_norm": 1.2098175287246704, + "learning_rate": 1.4918597741215957e-05, + "loss": 0.7673, + "step": 42820 + }, + { + "epoch": 1.6487006737247354, + "grad_norm": 0.9863401651382446, + "learning_rate": 1.4902711718535866e-05, + "loss": 0.7465, + "step": 42825 + }, + { + "epoch": 1.648893166506256, + "grad_norm": 1.357666254043579, + "learning_rate": 1.4886833477439099e-05, + "loss": 0.8188, + "step": 42830 + }, + { + "epoch": 1.6490856592877767, + "grad_norm": 2.8012890815734863, + "learning_rate": 1.4870963019377548e-05, + "loss": 0.8675, + "step": 42835 + }, + { + "epoch": 1.6492781520692974, + "grad_norm": 2.035005807876587, + "learning_rate": 1.4855100345802542e-05, + "loss": 0.9065, + "step": 42840 + }, + { + "epoch": 1.649470644850818, + "grad_norm": 1.256974458694458, + "learning_rate": 1.4839245458164553e-05, + "loss": 0.8464, + "step": 42845 + }, + { + "epoch": 1.6496631376323387, + "grad_norm": 1.1360011100769043, + "learning_rate": 1.4823398357913432e-05, + "loss": 0.8485, + "step": 42850 + }, + { + "epoch": 1.6498556304138594, + "grad_norm": 1.2088253498077393, + "learning_rate": 1.4807559046498287e-05, + "loss": 0.8786, + "step": 42855 + }, + { + "epoch": 1.65004812319538, + "grad_norm": 1.3639460802078247, + "learning_rate": 1.4791727525367539e-05, + "loss": 0.9163, + "step": 42860 + }, + { + "epoch": 1.6502406159769007, + "grad_norm": 0.8863676190376282, + "learning_rate": 1.4775903795968804e-05, + "loss": 0.7723, + "step": 42865 + }, + { + "epoch": 1.6504331087584214, + "grad_norm": 1.2062976360321045, + "learning_rate": 1.476008785974916e-05, + "loss": 0.7728, + "step": 42870 + }, + { + "epoch": 1.650625601539942, + "grad_norm": 2.016284227371216, + "learning_rate": 1.4744279718154797e-05, + "loss": 0.8003, + "step": 42875 + }, + { + "epoch": 1.650818094321463, + "grad_norm": 2.4403231143951416, + "learning_rate": 1.4728479372631287e-05, + "loss": 0.8282, + "step": 42880 + }, + { + "epoch": 1.6510105871029837, + "grad_norm": 1.3682914972305298, + "learning_rate": 1.4712686824623466e-05, + "loss": 0.8877, + "step": 42885 + }, + { + "epoch": 1.6512030798845043, + "grad_norm": 1.542219877243042, + "learning_rate": 1.469690207557548e-05, + "loss": 0.8553, + "step": 42890 + }, + { + "epoch": 1.651395572666025, + "grad_norm": 1.1081209182739258, + "learning_rate": 1.4681125126930695e-05, + "loss": 0.8886, + "step": 42895 + }, + { + "epoch": 1.6515880654475459, + "grad_norm": 2.007857084274292, + "learning_rate": 1.4665355980131834e-05, + "loss": 0.8171, + "step": 42900 + }, + { + "epoch": 1.6517805582290666, + "grad_norm": 1.1662068367004395, + "learning_rate": 1.4649594636620878e-05, + "loss": 0.8906, + "step": 42905 + }, + { + "epoch": 1.6519730510105872, + "grad_norm": 1.0259616374969482, + "learning_rate": 1.4633841097839096e-05, + "loss": 0.6473, + "step": 42910 + }, + { + "epoch": 1.652165543792108, + "grad_norm": 0.8987205624580383, + "learning_rate": 1.4618095365227069e-05, + "loss": 0.7654, + "step": 42915 + }, + { + "epoch": 1.6523580365736286, + "grad_norm": 1.9955406188964844, + "learning_rate": 1.460235744022459e-05, + "loss": 0.882, + "step": 42920 + }, + { + "epoch": 1.6525505293551492, + "grad_norm": 1.3027300834655762, + "learning_rate": 1.458662732427083e-05, + "loss": 0.9002, + "step": 42925 + }, + { + "epoch": 1.65274302213667, + "grad_norm": 0.7334115505218506, + "learning_rate": 1.457090501880417e-05, + "loss": 0.8804, + "step": 42930 + }, + { + "epoch": 1.6529355149181906, + "grad_norm": 1.0303765535354614, + "learning_rate": 1.4555190525262363e-05, + "loss": 0.7637, + "step": 42935 + }, + { + "epoch": 1.6531280076997112, + "grad_norm": 0.9630566239356995, + "learning_rate": 1.4539483845082324e-05, + "loss": 0.8624, + "step": 42940 + }, + { + "epoch": 1.653320500481232, + "grad_norm": 1.3009661436080933, + "learning_rate": 1.4523784979700395e-05, + "loss": 0.799, + "step": 42945 + }, + { + "epoch": 1.6535129932627526, + "grad_norm": 1.6627331972122192, + "learning_rate": 1.4508093930552092e-05, + "loss": 0.9133, + "step": 42950 + }, + { + "epoch": 1.6537054860442733, + "grad_norm": 1.2457365989685059, + "learning_rate": 1.4492410699072256e-05, + "loss": 0.8653, + "step": 42955 + }, + { + "epoch": 1.653897978825794, + "grad_norm": 2.1877925395965576, + "learning_rate": 1.447673528669503e-05, + "loss": 0.9323, + "step": 42960 + }, + { + "epoch": 1.6540904716073146, + "grad_norm": 1.2046828269958496, + "learning_rate": 1.4461067694853847e-05, + "loss": 0.7993, + "step": 42965 + }, + { + "epoch": 1.6542829643888353, + "grad_norm": 1.9960542917251587, + "learning_rate": 1.4445407924981325e-05, + "loss": 0.7632, + "step": 42970 + }, + { + "epoch": 1.6544754571703562, + "grad_norm": 1.4354302883148193, + "learning_rate": 1.4429755978509551e-05, + "loss": 0.7538, + "step": 42975 + }, + { + "epoch": 1.6546679499518768, + "grad_norm": 1.3290390968322754, + "learning_rate": 1.4414111856869727e-05, + "loss": 0.7158, + "step": 42980 + }, + { + "epoch": 1.6548604427333975, + "grad_norm": 0.9498152136802673, + "learning_rate": 1.4398475561492409e-05, + "loss": 0.7065, + "step": 42985 + }, + { + "epoch": 1.6550529355149182, + "grad_norm": 1.9382153749465942, + "learning_rate": 1.438284709380745e-05, + "loss": 0.6704, + "step": 42990 + }, + { + "epoch": 1.6552454282964388, + "grad_norm": 1.109632968902588, + "learning_rate": 1.4367226455243988e-05, + "loss": 0.8012, + "step": 42995 + }, + { + "epoch": 1.6554379210779597, + "grad_norm": 1.8244105577468872, + "learning_rate": 1.4351613647230344e-05, + "loss": 0.7368, + "step": 43000 + }, + { + "epoch": 1.6556304138594804, + "grad_norm": 1.2546757459640503, + "learning_rate": 1.4336008671194311e-05, + "loss": 0.7095, + "step": 43005 + }, + { + "epoch": 1.655822906641001, + "grad_norm": 1.7501899003982544, + "learning_rate": 1.4320411528562806e-05, + "loss": 0.6884, + "step": 43010 + }, + { + "epoch": 1.6560153994225217, + "grad_norm": 1.3198050260543823, + "learning_rate": 1.430482222076207e-05, + "loss": 0.679, + "step": 43015 + }, + { + "epoch": 1.6562078922040424, + "grad_norm": 1.1397435665130615, + "learning_rate": 1.428924074921768e-05, + "loss": 0.7621, + "step": 43020 + }, + { + "epoch": 1.656400384985563, + "grad_norm": 1.1400476694107056, + "learning_rate": 1.427366711535445e-05, + "loss": 0.7215, + "step": 43025 + }, + { + "epoch": 1.6565928777670837, + "grad_norm": 1.151114821434021, + "learning_rate": 1.4258101320596462e-05, + "loss": 0.856, + "step": 43030 + }, + { + "epoch": 1.6567853705486044, + "grad_norm": 1.7196004390716553, + "learning_rate": 1.4242543366367122e-05, + "loss": 0.8217, + "step": 43035 + }, + { + "epoch": 1.656977863330125, + "grad_norm": 1.8382436037063599, + "learning_rate": 1.4226993254089127e-05, + "loss": 0.8671, + "step": 43040 + }, + { + "epoch": 1.6571703561116458, + "grad_norm": 0.8144475221633911, + "learning_rate": 1.4211450985184349e-05, + "loss": 0.9136, + "step": 43045 + }, + { + "epoch": 1.6573628488931664, + "grad_norm": 1.0420215129852295, + "learning_rate": 1.4195916561074129e-05, + "loss": 0.8111, + "step": 43050 + }, + { + "epoch": 1.657555341674687, + "grad_norm": 1.1034629344940186, + "learning_rate": 1.41803899831789e-05, + "loss": 0.7063, + "step": 43055 + }, + { + "epoch": 1.6577478344562078, + "grad_norm": 1.1261588335037231, + "learning_rate": 1.4164871252918544e-05, + "loss": 0.9457, + "step": 43060 + }, + { + "epoch": 1.6579403272377284, + "grad_norm": 1.1537283658981323, + "learning_rate": 1.4149360371712084e-05, + "loss": 1.0019, + "step": 43065 + }, + { + "epoch": 1.6581328200192493, + "grad_norm": 1.0885237455368042, + "learning_rate": 1.4133857340977908e-05, + "loss": 0.8041, + "step": 43070 + }, + { + "epoch": 1.65832531280077, + "grad_norm": 1.2921056747436523, + "learning_rate": 1.411836216213367e-05, + "loss": 0.8121, + "step": 43075 + }, + { + "epoch": 1.6585178055822907, + "grad_norm": 1.4436019659042358, + "learning_rate": 1.4102874836596325e-05, + "loss": 0.7213, + "step": 43080 + }, + { + "epoch": 1.6587102983638113, + "grad_norm": 1.3824903964996338, + "learning_rate": 1.4087395365782008e-05, + "loss": 0.8929, + "step": 43085 + }, + { + "epoch": 1.658902791145332, + "grad_norm": 1.0888464450836182, + "learning_rate": 1.407192375110632e-05, + "loss": 0.828, + "step": 43090 + }, + { + "epoch": 1.659095283926853, + "grad_norm": 1.3013052940368652, + "learning_rate": 1.405645999398395e-05, + "loss": 0.7308, + "step": 43095 + }, + { + "epoch": 1.6592877767083736, + "grad_norm": 1.4749294519424438, + "learning_rate": 1.4041004095828992e-05, + "loss": 0.8799, + "step": 43100 + }, + { + "epoch": 1.6594802694898942, + "grad_norm": 1.4504752159118652, + "learning_rate": 1.4025556058054789e-05, + "loss": 0.7252, + "step": 43105 + }, + { + "epoch": 1.659672762271415, + "grad_norm": 1.064914584159851, + "learning_rate": 1.4010115882073971e-05, + "loss": 0.8309, + "step": 43110 + }, + { + "epoch": 1.6598652550529356, + "grad_norm": 1.1333822011947632, + "learning_rate": 1.3994683569298406e-05, + "loss": 0.8019, + "step": 43115 + }, + { + "epoch": 1.6600577478344563, + "grad_norm": 1.1221379041671753, + "learning_rate": 1.39792591211393e-05, + "loss": 0.9147, + "step": 43120 + }, + { + "epoch": 1.660250240615977, + "grad_norm": 1.3781883716583252, + "learning_rate": 1.3963842539007098e-05, + "loss": 0.7619, + "step": 43125 + }, + { + "epoch": 1.6604427333974976, + "grad_norm": 0.8834213614463806, + "learning_rate": 1.394843382431158e-05, + "loss": 0.9033, + "step": 43130 + }, + { + "epoch": 1.6606352261790183, + "grad_norm": 1.049788236618042, + "learning_rate": 1.3933032978461757e-05, + "loss": 0.6857, + "step": 43135 + }, + { + "epoch": 1.660827718960539, + "grad_norm": 1.520080804824829, + "learning_rate": 1.3917640002865905e-05, + "loss": 0.8151, + "step": 43140 + }, + { + "epoch": 1.6610202117420596, + "grad_norm": 1.1372543573379517, + "learning_rate": 1.3902254898931633e-05, + "loss": 0.7423, + "step": 43145 + }, + { + "epoch": 1.6612127045235803, + "grad_norm": 0.8767010569572449, + "learning_rate": 1.3886877668065802e-05, + "loss": 0.874, + "step": 43150 + }, + { + "epoch": 1.661405197305101, + "grad_norm": 1.2810707092285156, + "learning_rate": 1.3871508311674587e-05, + "loss": 0.7602, + "step": 43155 + }, + { + "epoch": 1.6615976900866216, + "grad_norm": 1.0464438199996948, + "learning_rate": 1.3856146831163341e-05, + "loss": 0.6807, + "step": 43160 + }, + { + "epoch": 1.6617901828681423, + "grad_norm": 2.142082452774048, + "learning_rate": 1.3840793227936867e-05, + "loss": 0.7548, + "step": 43165 + }, + { + "epoch": 1.6619826756496632, + "grad_norm": 0.9313360452651978, + "learning_rate": 1.382544750339907e-05, + "loss": 0.804, + "step": 43170 + }, + { + "epoch": 1.6621751684311838, + "grad_norm": 0.9089387059211731, + "learning_rate": 1.3810109658953252e-05, + "loss": 0.6871, + "step": 43175 + }, + { + "epoch": 1.6623676612127045, + "grad_norm": 1.2716861963272095, + "learning_rate": 1.3794779696001948e-05, + "loss": 0.914, + "step": 43180 + }, + { + "epoch": 1.6625601539942252, + "grad_norm": 1.5578489303588867, + "learning_rate": 1.3779457615947e-05, + "loss": 0.7148, + "step": 43185 + }, + { + "epoch": 1.6627526467757459, + "grad_norm": 0.8890004754066467, + "learning_rate": 1.3764143420189457e-05, + "loss": 0.7453, + "step": 43190 + }, + { + "epoch": 1.6629451395572667, + "grad_norm": 1.7845152616500854, + "learning_rate": 1.3748837110129774e-05, + "loss": 0.8449, + "step": 43195 + }, + { + "epoch": 1.6631376323387874, + "grad_norm": 1.1087902784347534, + "learning_rate": 1.3733538687167558e-05, + "loss": 0.8144, + "step": 43200 + }, + { + "epoch": 1.663330125120308, + "grad_norm": 1.7164037227630615, + "learning_rate": 1.3718248152701773e-05, + "loss": 0.8841, + "step": 43205 + }, + { + "epoch": 1.6635226179018288, + "grad_norm": 1.1538746356964111, + "learning_rate": 1.3702965508130616e-05, + "loss": 0.9051, + "step": 43210 + }, + { + "epoch": 1.6637151106833494, + "grad_norm": 1.328781247138977, + "learning_rate": 1.3687690754851634e-05, + "loss": 0.7957, + "step": 43215 + }, + { + "epoch": 1.66390760346487, + "grad_norm": 1.2953364849090576, + "learning_rate": 1.367242389426151e-05, + "loss": 0.8322, + "step": 43220 + }, + { + "epoch": 1.6641000962463908, + "grad_norm": 1.3127518892288208, + "learning_rate": 1.3657164927756405e-05, + "loss": 0.7086, + "step": 43225 + }, + { + "epoch": 1.6642925890279114, + "grad_norm": 1.5213780403137207, + "learning_rate": 1.3641913856731569e-05, + "loss": 0.8327, + "step": 43230 + }, + { + "epoch": 1.664485081809432, + "grad_norm": 1.8218622207641602, + "learning_rate": 1.3626670682581655e-05, + "loss": 0.9758, + "step": 43235 + }, + { + "epoch": 1.6646775745909528, + "grad_norm": 1.4787392616271973, + "learning_rate": 1.3611435406700546e-05, + "loss": 0.6978, + "step": 43240 + }, + { + "epoch": 1.6648700673724735, + "grad_norm": 1.1849435567855835, + "learning_rate": 1.3596208030481372e-05, + "loss": 0.9329, + "step": 43245 + }, + { + "epoch": 1.6650625601539941, + "grad_norm": 0.34040671586990356, + "learning_rate": 1.358098855531661e-05, + "loss": 0.67, + "step": 43250 + }, + { + "epoch": 1.6652550529355148, + "grad_norm": 1.1814277172088623, + "learning_rate": 1.3565776982597966e-05, + "loss": 0.8672, + "step": 43255 + }, + { + "epoch": 1.6654475457170355, + "grad_norm": 1.2392293214797974, + "learning_rate": 1.3550573313716463e-05, + "loss": 0.9071, + "step": 43260 + }, + { + "epoch": 1.6656400384985564, + "grad_norm": 1.1878242492675781, + "learning_rate": 1.3535377550062323e-05, + "loss": 0.7641, + "step": 43265 + }, + { + "epoch": 1.665832531280077, + "grad_norm": 0.885995090007782, + "learning_rate": 1.3520189693025164e-05, + "loss": 0.7289, + "step": 43270 + }, + { + "epoch": 1.6660250240615977, + "grad_norm": 1.223071575164795, + "learning_rate": 1.3505009743993757e-05, + "loss": 0.6685, + "step": 43275 + }, + { + "epoch": 1.6662175168431184, + "grad_norm": 1.0630251169204712, + "learning_rate": 1.3489837704356235e-05, + "loss": 0.8338, + "step": 43280 + }, + { + "epoch": 1.666410009624639, + "grad_norm": 1.187340497970581, + "learning_rate": 1.3474673575499986e-05, + "loss": 0.6967, + "step": 43285 + }, + { + "epoch": 1.66660250240616, + "grad_norm": 2.237581729888916, + "learning_rate": 1.3459517358811668e-05, + "loss": 0.8451, + "step": 43290 + }, + { + "epoch": 1.6667949951876806, + "grad_norm": 1.1633507013320923, + "learning_rate": 1.3444369055677175e-05, + "loss": 0.7811, + "step": 43295 + }, + { + "epoch": 1.6669874879692013, + "grad_norm": 1.9150440692901611, + "learning_rate": 1.3429228667481797e-05, + "loss": 0.8467, + "step": 43300 + }, + { + "epoch": 1.667179980750722, + "grad_norm": 0.9145646691322327, + "learning_rate": 1.341409619560996e-05, + "loss": 0.8669, + "step": 43305 + }, + { + "epoch": 1.6673724735322426, + "grad_norm": 1.4156630039215088, + "learning_rate": 1.3398971641445434e-05, + "loss": 0.7688, + "step": 43310 + }, + { + "epoch": 1.6675649663137633, + "grad_norm": 1.122641682624817, + "learning_rate": 1.3383855006371281e-05, + "loss": 0.9059, + "step": 43315 + }, + { + "epoch": 1.667757459095284, + "grad_norm": 1.1816550493240356, + "learning_rate": 1.3368746291769806e-05, + "loss": 0.6822, + "step": 43320 + }, + { + "epoch": 1.6679499518768046, + "grad_norm": 1.2884527444839478, + "learning_rate": 1.3353645499022605e-05, + "loss": 0.989, + "step": 43325 + }, + { + "epoch": 1.6681424446583253, + "grad_norm": 2.128404140472412, + "learning_rate": 1.333855262951056e-05, + "loss": 0.835, + "step": 43330 + }, + { + "epoch": 1.668334937439846, + "grad_norm": 1.3156408071517944, + "learning_rate": 1.3323467684613789e-05, + "loss": 0.7899, + "step": 43335 + }, + { + "epoch": 1.6685274302213666, + "grad_norm": 2.0468761920928955, + "learning_rate": 1.3308390665711701e-05, + "loss": 0.7012, + "step": 43340 + }, + { + "epoch": 1.6687199230028873, + "grad_norm": 1.0322657823562622, + "learning_rate": 1.3293321574183016e-05, + "loss": 0.6967, + "step": 43345 + }, + { + "epoch": 1.668912415784408, + "grad_norm": 0.9102632999420166, + "learning_rate": 1.3278260411405697e-05, + "loss": 0.7604, + "step": 43350 + }, + { + "epoch": 1.6691049085659286, + "grad_norm": 1.8231315612792969, + "learning_rate": 1.3263207178756997e-05, + "loss": 0.6853, + "step": 43355 + }, + { + "epoch": 1.6692974013474493, + "grad_norm": 1.3293402194976807, + "learning_rate": 1.3248161877613408e-05, + "loss": 0.9419, + "step": 43360 + }, + { + "epoch": 1.6694898941289702, + "grad_norm": 1.4297016859054565, + "learning_rate": 1.3233124509350736e-05, + "loss": 0.9883, + "step": 43365 + }, + { + "epoch": 1.6696823869104909, + "grad_norm": 1.4253541231155396, + "learning_rate": 1.3218095075344051e-05, + "loss": 0.8432, + "step": 43370 + }, + { + "epoch": 1.6698748796920115, + "grad_norm": 1.1191109418869019, + "learning_rate": 1.3203073576967717e-05, + "loss": 0.87, + "step": 43375 + }, + { + "epoch": 1.6700673724735322, + "grad_norm": 1.2235162258148193, + "learning_rate": 1.3188060015595271e-05, + "loss": 0.943, + "step": 43380 + }, + { + "epoch": 1.670259865255053, + "grad_norm": 0.8941404223442078, + "learning_rate": 1.3173054392599715e-05, + "loss": 0.7541, + "step": 43385 + }, + { + "epoch": 1.6704523580365738, + "grad_norm": 1.3058192729949951, + "learning_rate": 1.3158056709353139e-05, + "loss": 0.7509, + "step": 43390 + }, + { + "epoch": 1.6706448508180944, + "grad_norm": 1.0127733945846558, + "learning_rate": 1.3143066967226992e-05, + "loss": 0.6995, + "step": 43395 + }, + { + "epoch": 1.670837343599615, + "grad_norm": 1.4045414924621582, + "learning_rate": 1.3128085167592007e-05, + "loss": 0.751, + "step": 43400 + }, + { + "epoch": 1.6710298363811358, + "grad_norm": 1.8028631210327148, + "learning_rate": 1.3113111311818171e-05, + "loss": 0.8459, + "step": 43405 + }, + { + "epoch": 1.6712223291626565, + "grad_norm": 2.1013052463531494, + "learning_rate": 1.3098145401274697e-05, + "loss": 0.8916, + "step": 43410 + }, + { + "epoch": 1.6714148219441771, + "grad_norm": 1.4260973930358887, + "learning_rate": 1.3083187437330192e-05, + "loss": 0.6563, + "step": 43415 + }, + { + "epoch": 1.6716073147256978, + "grad_norm": 1.321631908416748, + "learning_rate": 1.3068237421352414e-05, + "loss": 0.7359, + "step": 43420 + }, + { + "epoch": 1.6717998075072185, + "grad_norm": 1.628443717956543, + "learning_rate": 1.3053295354708439e-05, + "loss": 0.7925, + "step": 43425 + }, + { + "epoch": 1.6719923002887391, + "grad_norm": 1.8474700450897217, + "learning_rate": 1.3038361238764641e-05, + "loss": 0.8156, + "step": 43430 + }, + { + "epoch": 1.6721847930702598, + "grad_norm": 1.3773387670516968, + "learning_rate": 1.3023435074886658e-05, + "loss": 0.7901, + "step": 43435 + }, + { + "epoch": 1.6723772858517805, + "grad_norm": 0.8466984033584595, + "learning_rate": 1.3008516864439357e-05, + "loss": 0.8321, + "step": 43440 + }, + { + "epoch": 1.6725697786333011, + "grad_norm": 1.169967770576477, + "learning_rate": 1.2993606608786913e-05, + "loss": 0.7029, + "step": 43445 + }, + { + "epoch": 1.6727622714148218, + "grad_norm": 1.8645579814910889, + "learning_rate": 1.2978704309292789e-05, + "loss": 0.8151, + "step": 43450 + }, + { + "epoch": 1.6729547641963425, + "grad_norm": 0.84092777967453, + "learning_rate": 1.2963809967319685e-05, + "loss": 0.8796, + "step": 43455 + }, + { + "epoch": 1.6731472569778634, + "grad_norm": 1.5797427892684937, + "learning_rate": 1.2948923584229622e-05, + "loss": 0.8069, + "step": 43460 + }, + { + "epoch": 1.673339749759384, + "grad_norm": 1.3241761922836304, + "learning_rate": 1.2934045161383824e-05, + "loss": 0.9362, + "step": 43465 + }, + { + "epoch": 1.6735322425409047, + "grad_norm": 1.459402322769165, + "learning_rate": 1.2919174700142822e-05, + "loss": 0.8827, + "step": 43470 + }, + { + "epoch": 1.6737247353224254, + "grad_norm": 1.3563963174819946, + "learning_rate": 1.2904312201866443e-05, + "loss": 0.7035, + "step": 43475 + }, + { + "epoch": 1.673917228103946, + "grad_norm": 1.3299168348312378, + "learning_rate": 1.2889457667913785e-05, + "loss": 0.8429, + "step": 43480 + }, + { + "epoch": 1.674109720885467, + "grad_norm": 1.492050051689148, + "learning_rate": 1.2874611099643108e-05, + "loss": 0.9821, + "step": 43485 + }, + { + "epoch": 1.6743022136669876, + "grad_norm": 0.9983692169189453, + "learning_rate": 1.2859772498412149e-05, + "loss": 0.6268, + "step": 43490 + }, + { + "epoch": 1.6744947064485083, + "grad_norm": 1.0508897304534912, + "learning_rate": 1.2844941865577719e-05, + "loss": 0.8499, + "step": 43495 + }, + { + "epoch": 1.674687199230029, + "grad_norm": 1.9004470109939575, + "learning_rate": 1.2830119202496016e-05, + "loss": 0.8397, + "step": 43500 + }, + { + "epoch": 1.6748796920115496, + "grad_norm": 1.5322777032852173, + "learning_rate": 1.2815304510522453e-05, + "loss": 0.9092, + "step": 43505 + }, + { + "epoch": 1.6750721847930703, + "grad_norm": 1.5270432233810425, + "learning_rate": 1.2800497791011768e-05, + "loss": 0.739, + "step": 43510 + }, + { + "epoch": 1.675264677574591, + "grad_norm": 1.1705281734466553, + "learning_rate": 1.2785699045317878e-05, + "loss": 0.7841, + "step": 43515 + }, + { + "epoch": 1.6754571703561116, + "grad_norm": 1.2318379878997803, + "learning_rate": 1.2770908274794102e-05, + "loss": 0.7705, + "step": 43520 + }, + { + "epoch": 1.6756496631376323, + "grad_norm": 1.481648325920105, + "learning_rate": 1.2756125480792912e-05, + "loss": 0.92, + "step": 43525 + }, + { + "epoch": 1.675842155919153, + "grad_norm": 1.4268490076065063, + "learning_rate": 1.2741350664666108e-05, + "loss": 0.8993, + "step": 43530 + }, + { + "epoch": 1.6760346487006736, + "grad_norm": 1.973768711090088, + "learning_rate": 1.2726583827764748e-05, + "loss": 0.7195, + "step": 43535 + }, + { + "epoch": 1.6762271414821943, + "grad_norm": 1.2458196878433228, + "learning_rate": 1.271182497143919e-05, + "loss": 0.7451, + "step": 43540 + }, + { + "epoch": 1.676419634263715, + "grad_norm": 1.4529706239700317, + "learning_rate": 1.2697074097038964e-05, + "loss": 0.726, + "step": 43545 + }, + { + "epoch": 1.6766121270452357, + "grad_norm": 2.0508511066436768, + "learning_rate": 1.2682331205913012e-05, + "loss": 0.8247, + "step": 43550 + }, + { + "epoch": 1.6768046198267565, + "grad_norm": 1.2535037994384766, + "learning_rate": 1.2667596299409434e-05, + "loss": 0.8559, + "step": 43555 + }, + { + "epoch": 1.6769971126082772, + "grad_norm": 1.0279282331466675, + "learning_rate": 1.2652869378875654e-05, + "loss": 0.7823, + "step": 43560 + }, + { + "epoch": 1.6771896053897979, + "grad_norm": 0.9795730113983154, + "learning_rate": 1.2638150445658337e-05, + "loss": 0.6945, + "step": 43565 + }, + { + "epoch": 1.6773820981713186, + "grad_norm": 1.2508692741394043, + "learning_rate": 1.2623439501103452e-05, + "loss": 0.6393, + "step": 43570 + }, + { + "epoch": 1.6775745909528392, + "grad_norm": 1.5805447101593018, + "learning_rate": 1.260873654655622e-05, + "loss": 0.7677, + "step": 43575 + }, + { + "epoch": 1.6777670837343601, + "grad_norm": 1.477809190750122, + "learning_rate": 1.2594041583361105e-05, + "loss": 0.8169, + "step": 43580 + }, + { + "epoch": 1.6779595765158808, + "grad_norm": 1.8884755373001099, + "learning_rate": 1.257935461286187e-05, + "loss": 0.7033, + "step": 43585 + }, + { + "epoch": 1.6781520692974015, + "grad_norm": 1.1278042793273926, + "learning_rate": 1.2564675636401557e-05, + "loss": 0.6912, + "step": 43590 + }, + { + "epoch": 1.6783445620789221, + "grad_norm": 1.1749690771102905, + "learning_rate": 1.2550004655322457e-05, + "loss": 0.8726, + "step": 43595 + }, + { + "epoch": 1.6785370548604428, + "grad_norm": 2.014747142791748, + "learning_rate": 1.2535341670966094e-05, + "loss": 0.835, + "step": 43600 + }, + { + "epoch": 1.6787295476419635, + "grad_norm": 1.0013171434402466, + "learning_rate": 1.2520686684673377e-05, + "loss": 0.7554, + "step": 43605 + }, + { + "epoch": 1.6789220404234841, + "grad_norm": 2.000584602355957, + "learning_rate": 1.2506039697784345e-05, + "loss": 0.9073, + "step": 43610 + }, + { + "epoch": 1.6791145332050048, + "grad_norm": 1.03412926197052, + "learning_rate": 1.2491400711638378e-05, + "loss": 0.9339, + "step": 43615 + }, + { + "epoch": 1.6793070259865255, + "grad_norm": 1.905938744544983, + "learning_rate": 1.2476769727574133e-05, + "loss": 0.7793, + "step": 43620 + }, + { + "epoch": 1.6794995187680462, + "grad_norm": 1.2626748085021973, + "learning_rate": 1.2462146746929538e-05, + "loss": 0.6789, + "step": 43625 + }, + { + "epoch": 1.6796920115495668, + "grad_norm": 1.5359333753585815, + "learning_rate": 1.2447531771041677e-05, + "loss": 0.7657, + "step": 43630 + }, + { + "epoch": 1.6798845043310875, + "grad_norm": 0.68184894323349, + "learning_rate": 1.2432924801247115e-05, + "loss": 1.0114, + "step": 43635 + }, + { + "epoch": 1.6800769971126082, + "grad_norm": 2.0016860961914062, + "learning_rate": 1.2418325838881462e-05, + "loss": 0.7352, + "step": 43640 + }, + { + "epoch": 1.6802694898941288, + "grad_norm": 1.29106605052948, + "learning_rate": 1.240373488527975e-05, + "loss": 0.743, + "step": 43645 + }, + { + "epoch": 1.6804619826756495, + "grad_norm": 1.8269158601760864, + "learning_rate": 1.23891519417762e-05, + "loss": 0.8502, + "step": 43650 + }, + { + "epoch": 1.6806544754571704, + "grad_norm": 0.9514836072921753, + "learning_rate": 1.2374577009704357e-05, + "loss": 0.8288, + "step": 43655 + }, + { + "epoch": 1.680846968238691, + "grad_norm": 1.3545150756835938, + "learning_rate": 1.2360010090396968e-05, + "loss": 0.9582, + "step": 43660 + }, + { + "epoch": 1.6810394610202117, + "grad_norm": 1.4287010431289673, + "learning_rate": 1.2345451185186097e-05, + "loss": 0.8761, + "step": 43665 + }, + { + "epoch": 1.6812319538017324, + "grad_norm": 0.9901794195175171, + "learning_rate": 1.2330900295403048e-05, + "loss": 0.878, + "step": 43670 + }, + { + "epoch": 1.681424446583253, + "grad_norm": 1.1503381729125977, + "learning_rate": 1.231635742237841e-05, + "loss": 0.747, + "step": 43675 + }, + { + "epoch": 1.681616939364774, + "grad_norm": 0.8054229617118835, + "learning_rate": 1.2301822567442067e-05, + "loss": 0.724, + "step": 43680 + }, + { + "epoch": 1.6818094321462946, + "grad_norm": 2.6240620613098145, + "learning_rate": 1.2287295731923077e-05, + "loss": 0.7472, + "step": 43685 + }, + { + "epoch": 1.6820019249278153, + "grad_norm": 0.9073877930641174, + "learning_rate": 1.2272776917149841e-05, + "loss": 0.7837, + "step": 43690 + }, + { + "epoch": 1.682194417709336, + "grad_norm": 0.9025770425796509, + "learning_rate": 1.2258266124450024e-05, + "loss": 0.8009, + "step": 43695 + }, + { + "epoch": 1.6823869104908566, + "grad_norm": 0.9586546421051025, + "learning_rate": 1.224376335515055e-05, + "loss": 0.8342, + "step": 43700 + }, + { + "epoch": 1.6825794032723773, + "grad_norm": 1.0272797346115112, + "learning_rate": 1.222926861057755e-05, + "loss": 0.8561, + "step": 43705 + }, + { + "epoch": 1.682771896053898, + "grad_norm": 1.4825763702392578, + "learning_rate": 1.2214781892056548e-05, + "loss": 0.938, + "step": 43710 + }, + { + "epoch": 1.6829643888354187, + "grad_norm": 1.5734376907348633, + "learning_rate": 1.2200303200912199e-05, + "loss": 0.7255, + "step": 43715 + }, + { + "epoch": 1.6831568816169393, + "grad_norm": 1.2651904821395874, + "learning_rate": 1.21858325384685e-05, + "loss": 0.762, + "step": 43720 + }, + { + "epoch": 1.68334937439846, + "grad_norm": 1.516546368598938, + "learning_rate": 1.2171369906048703e-05, + "loss": 0.7557, + "step": 43725 + }, + { + "epoch": 1.6835418671799807, + "grad_norm": 1.8338733911514282, + "learning_rate": 1.2156915304975325e-05, + "loss": 0.7487, + "step": 43730 + }, + { + "epoch": 1.6837343599615013, + "grad_norm": 2.001396417617798, + "learning_rate": 1.21424687365701e-05, + "loss": 0.8106, + "step": 43735 + }, + { + "epoch": 1.683926852743022, + "grad_norm": 1.8002668619155884, + "learning_rate": 1.212803020215415e-05, + "loss": 0.6761, + "step": 43740 + }, + { + "epoch": 1.6841193455245427, + "grad_norm": 1.0602316856384277, + "learning_rate": 1.2113599703047728e-05, + "loss": 0.7588, + "step": 43745 + }, + { + "epoch": 1.6843118383060636, + "grad_norm": 1.10280442237854, + "learning_rate": 1.2099177240570403e-05, + "loss": 0.8926, + "step": 43750 + }, + { + "epoch": 1.6845043310875842, + "grad_norm": 1.1837788820266724, + "learning_rate": 1.208476281604104e-05, + "loss": 0.8216, + "step": 43755 + }, + { + "epoch": 1.684696823869105, + "grad_norm": 1.3022428750991821, + "learning_rate": 1.2070356430777752e-05, + "loss": 0.6659, + "step": 43760 + }, + { + "epoch": 1.6848893166506256, + "grad_norm": 1.8487025499343872, + "learning_rate": 1.205595808609784e-05, + "loss": 0.7536, + "step": 43765 + }, + { + "epoch": 1.6850818094321462, + "grad_norm": 1.2362842559814453, + "learning_rate": 1.2041567783318031e-05, + "loss": 0.8764, + "step": 43770 + }, + { + "epoch": 1.6852743022136671, + "grad_norm": 1.0950088500976562, + "learning_rate": 1.2027185523754159e-05, + "loss": 0.9292, + "step": 43775 + }, + { + "epoch": 1.6854667949951878, + "grad_norm": 1.8685953617095947, + "learning_rate": 1.2012811308721395e-05, + "loss": 0.6901, + "step": 43780 + }, + { + "epoch": 1.6856592877767085, + "grad_norm": 1.1124467849731445, + "learning_rate": 1.1998445139534209e-05, + "loss": 0.835, + "step": 43785 + }, + { + "epoch": 1.6858517805582292, + "grad_norm": 1.07558012008667, + "learning_rate": 1.1984087017506228e-05, + "loss": 0.8145, + "step": 43790 + }, + { + "epoch": 1.6860442733397498, + "grad_norm": 1.631927490234375, + "learning_rate": 1.1969736943950439e-05, + "loss": 0.9775, + "step": 43795 + }, + { + "epoch": 1.6862367661212705, + "grad_norm": 1.3678085803985596, + "learning_rate": 1.1955394920179053e-05, + "loss": 0.7821, + "step": 43800 + }, + { + "epoch": 1.6864292589027912, + "grad_norm": 1.4719831943511963, + "learning_rate": 1.1941060947503591e-05, + "loss": 0.7762, + "step": 43805 + }, + { + "epoch": 1.6866217516843118, + "grad_norm": 1.012670636177063, + "learning_rate": 1.1926735027234726e-05, + "loss": 0.8692, + "step": 43810 + }, + { + "epoch": 1.6868142444658325, + "grad_norm": 1.3539128303527832, + "learning_rate": 1.1912417160682543e-05, + "loss": 0.6571, + "step": 43815 + }, + { + "epoch": 1.6870067372473532, + "grad_norm": 1.2712576389312744, + "learning_rate": 1.1898107349156274e-05, + "loss": 0.7417, + "step": 43820 + }, + { + "epoch": 1.6871992300288738, + "grad_norm": 2.003711462020874, + "learning_rate": 1.188380559396446e-05, + "loss": 0.8174, + "step": 43825 + }, + { + "epoch": 1.6873917228103945, + "grad_norm": 2.019716739654541, + "learning_rate": 1.186951189641491e-05, + "loss": 0.6646, + "step": 43830 + }, + { + "epoch": 1.6875842155919152, + "grad_norm": 1.0991610288619995, + "learning_rate": 1.1855226257814688e-05, + "loss": 0.7247, + "step": 43835 + }, + { + "epoch": 1.6877767083734359, + "grad_norm": 1.6221376657485962, + "learning_rate": 1.184094867947011e-05, + "loss": 0.8466, + "step": 43840 + }, + { + "epoch": 1.6879692011549567, + "grad_norm": 1.4020202159881592, + "learning_rate": 1.1826679162686805e-05, + "loss": 0.8511, + "step": 43845 + }, + { + "epoch": 1.6881616939364774, + "grad_norm": 1.3694734573364258, + "learning_rate": 1.1812417708769552e-05, + "loss": 0.8935, + "step": 43850 + }, + { + "epoch": 1.688354186717998, + "grad_norm": 1.3666578531265259, + "learning_rate": 1.1798164319022554e-05, + "loss": 0.8224, + "step": 43855 + }, + { + "epoch": 1.6885466794995188, + "grad_norm": 1.5298011302947998, + "learning_rate": 1.1783918994749122e-05, + "loss": 0.834, + "step": 43860 + }, + { + "epoch": 1.6887391722810394, + "grad_norm": 1.589240550994873, + "learning_rate": 1.1769681737251914e-05, + "loss": 0.7738, + "step": 43865 + }, + { + "epoch": 1.6889316650625603, + "grad_norm": 1.132411241531372, + "learning_rate": 1.1755452547832846e-05, + "loss": 0.8, + "step": 43870 + }, + { + "epoch": 1.689124157844081, + "grad_norm": 0.9924208521842957, + "learning_rate": 1.1741231427793097e-05, + "loss": 0.7401, + "step": 43875 + }, + { + "epoch": 1.6893166506256017, + "grad_norm": 1.3757164478302002, + "learning_rate": 1.172701837843304e-05, + "loss": 0.9371, + "step": 43880 + }, + { + "epoch": 1.6895091434071223, + "grad_norm": 1.4748291969299316, + "learning_rate": 1.1712813401052414e-05, + "loss": 0.7713, + "step": 43885 + }, + { + "epoch": 1.689701636188643, + "grad_norm": 2.083977460861206, + "learning_rate": 1.1698616496950143e-05, + "loss": 0.7284, + "step": 43890 + }, + { + "epoch": 1.6898941289701637, + "grad_norm": 1.163346529006958, + "learning_rate": 1.1684427667424458e-05, + "loss": 0.7961, + "step": 43895 + }, + { + "epoch": 1.6900866217516843, + "grad_norm": 1.0689622163772583, + "learning_rate": 1.1670246913772841e-05, + "loss": 0.8038, + "step": 43900 + }, + { + "epoch": 1.690279114533205, + "grad_norm": 1.596536636352539, + "learning_rate": 1.1656074237291991e-05, + "loss": 0.852, + "step": 43905 + }, + { + "epoch": 1.6904716073147257, + "grad_norm": 2.0381526947021484, + "learning_rate": 1.1641909639277936e-05, + "loss": 0.8369, + "step": 43910 + }, + { + "epoch": 1.6906641000962463, + "grad_norm": 0.8638001084327698, + "learning_rate": 1.162775312102592e-05, + "loss": 0.5935, + "step": 43915 + }, + { + "epoch": 1.690856592877767, + "grad_norm": 1.7793620824813843, + "learning_rate": 1.161360468383048e-05, + "loss": 0.8603, + "step": 43920 + }, + { + "epoch": 1.6910490856592877, + "grad_norm": 1.2274523973464966, + "learning_rate": 1.1599464328985355e-05, + "loss": 0.8225, + "step": 43925 + }, + { + "epoch": 1.6912415784408084, + "grad_norm": 1.3787938356399536, + "learning_rate": 1.158533205778366e-05, + "loss": 0.9415, + "step": 43930 + }, + { + "epoch": 1.691434071222329, + "grad_norm": 1.7021337747573853, + "learning_rate": 1.157120787151763e-05, + "loss": 0.9423, + "step": 43935 + }, + { + "epoch": 1.6916265640038497, + "grad_norm": 1.1812869310379028, + "learning_rate": 1.1557091771478855e-05, + "loss": 0.8007, + "step": 43940 + }, + { + "epoch": 1.6918190567853706, + "grad_norm": 1.0627899169921875, + "learning_rate": 1.1542983758958148e-05, + "loss": 0.8179, + "step": 43945 + }, + { + "epoch": 1.6920115495668913, + "grad_norm": 1.115240216255188, + "learning_rate": 1.152888383524563e-05, + "loss": 0.811, + "step": 43950 + }, + { + "epoch": 1.692204042348412, + "grad_norm": 0.7895986437797546, + "learning_rate": 1.151479200163058e-05, + "loss": 0.8463, + "step": 43955 + }, + { + "epoch": 1.6923965351299326, + "grad_norm": 1.0171902179718018, + "learning_rate": 1.1500708259401682e-05, + "loss": 0.7925, + "step": 43960 + }, + { + "epoch": 1.6925890279114533, + "grad_norm": 0.9279621243476868, + "learning_rate": 1.1486632609846726e-05, + "loss": 0.7073, + "step": 43965 + }, + { + "epoch": 1.6927815206929742, + "grad_norm": 1.056172490119934, + "learning_rate": 1.1472565054252882e-05, + "loss": 0.7971, + "step": 43970 + }, + { + "epoch": 1.6929740134744948, + "grad_norm": 1.2332504987716675, + "learning_rate": 1.1458505593906522e-05, + "loss": 0.8648, + "step": 43975 + }, + { + "epoch": 1.6931665062560155, + "grad_norm": 1.2039694786071777, + "learning_rate": 1.1444454230093315e-05, + "loss": 0.7207, + "step": 43980 + }, + { + "epoch": 1.6933589990375362, + "grad_norm": 1.3070398569107056, + "learning_rate": 1.1430410964098115e-05, + "loss": 0.6398, + "step": 43985 + }, + { + "epoch": 1.6935514918190568, + "grad_norm": 1.1500900983810425, + "learning_rate": 1.1416375797205114e-05, + "loss": 0.7082, + "step": 43990 + }, + { + "epoch": 1.6937439846005775, + "grad_norm": 2.088250160217285, + "learning_rate": 1.1402348730697731e-05, + "loss": 0.7245, + "step": 43995 + }, + { + "epoch": 1.6939364773820982, + "grad_norm": 1.7187893390655518, + "learning_rate": 1.1388329765858651e-05, + "loss": 0.7806, + "step": 44000 + }, + { + "epoch": 1.6941289701636189, + "grad_norm": 1.2111575603485107, + "learning_rate": 1.137431890396985e-05, + "loss": 0.6781, + "step": 44005 + }, + { + "epoch": 1.6943214629451395, + "grad_norm": 1.4132143259048462, + "learning_rate": 1.1360316146312455e-05, + "loss": 1.0329, + "step": 44010 + }, + { + "epoch": 1.6945139557266602, + "grad_norm": 1.3241591453552246, + "learning_rate": 1.1346321494166978e-05, + "loss": 0.8164, + "step": 44015 + }, + { + "epoch": 1.6947064485081809, + "grad_norm": 2.9741406440734863, + "learning_rate": 1.1332334948813117e-05, + "loss": 0.8342, + "step": 44020 + }, + { + "epoch": 1.6948989412897015, + "grad_norm": 1.5332659482955933, + "learning_rate": 1.1318356511529871e-05, + "loss": 0.6292, + "step": 44025 + }, + { + "epoch": 1.6950914340712222, + "grad_norm": 1.2463070154190063, + "learning_rate": 1.1304386183595428e-05, + "loss": 0.7732, + "step": 44030 + }, + { + "epoch": 1.6952839268527429, + "grad_norm": 1.6358565092086792, + "learning_rate": 1.1290423966287345e-05, + "loss": 0.7089, + "step": 44035 + }, + { + "epoch": 1.6954764196342638, + "grad_norm": 1.1293914318084717, + "learning_rate": 1.1276469860882332e-05, + "loss": 0.8117, + "step": 44040 + }, + { + "epoch": 1.6956689124157844, + "grad_norm": 1.2490708827972412, + "learning_rate": 1.1262523868656405e-05, + "loss": 0.8166, + "step": 44045 + }, + { + "epoch": 1.695861405197305, + "grad_norm": 1.3659740686416626, + "learning_rate": 1.124858599088484e-05, + "loss": 0.7455, + "step": 44050 + }, + { + "epoch": 1.6960538979788258, + "grad_norm": 1.2188682556152344, + "learning_rate": 1.1234656228842177e-05, + "loss": 0.8226, + "step": 44055 + }, + { + "epoch": 1.6962463907603464, + "grad_norm": 1.0454777479171753, + "learning_rate": 1.122073458380215e-05, + "loss": 0.8888, + "step": 44060 + }, + { + "epoch": 1.6964388835418673, + "grad_norm": 2.2984609603881836, + "learning_rate": 1.1206821057037886e-05, + "loss": 0.8094, + "step": 44065 + }, + { + "epoch": 1.696631376323388, + "grad_norm": 1.107490062713623, + "learning_rate": 1.11929156498216e-05, + "loss": 0.8469, + "step": 44070 + }, + { + "epoch": 1.6968238691049087, + "grad_norm": 0.9354246854782104, + "learning_rate": 1.1179018363424899e-05, + "loss": 0.6261, + "step": 44075 + }, + { + "epoch": 1.6970163618864293, + "grad_norm": 1.164993405342102, + "learning_rate": 1.1165129199118574e-05, + "loss": 0.61, + "step": 44080 + }, + { + "epoch": 1.69720885466795, + "grad_norm": 0.9452062845230103, + "learning_rate": 1.1151248158172722e-05, + "loss": 0.7721, + "step": 44085 + }, + { + "epoch": 1.6974013474494707, + "grad_norm": 0.8486310839653015, + "learning_rate": 1.1137375241856619e-05, + "loss": 0.9994, + "step": 44090 + }, + { + "epoch": 1.6975938402309914, + "grad_norm": 1.195550560951233, + "learning_rate": 1.1123510451438934e-05, + "loss": 0.812, + "step": 44095 + }, + { + "epoch": 1.697786333012512, + "grad_norm": 1.6935982704162598, + "learning_rate": 1.1109653788187447e-05, + "loss": 0.7816, + "step": 44100 + }, + { + "epoch": 1.6979788257940327, + "grad_norm": 1.238194465637207, + "learning_rate": 1.1095805253369274e-05, + "loss": 0.8888, + "step": 44105 + }, + { + "epoch": 1.6981713185755534, + "grad_norm": 1.1349773406982422, + "learning_rate": 1.108196484825077e-05, + "loss": 0.6549, + "step": 44110 + }, + { + "epoch": 1.698363811357074, + "grad_norm": 1.0032782554626465, + "learning_rate": 1.1068132574097557e-05, + "loss": 0.7298, + "step": 44115 + }, + { + "epoch": 1.6985563041385947, + "grad_norm": 2.31643009185791, + "learning_rate": 1.1054308432174521e-05, + "loss": 0.871, + "step": 44120 + }, + { + "epoch": 1.6987487969201154, + "grad_norm": 1.0429223775863647, + "learning_rate": 1.1040492423745752e-05, + "loss": 0.7892, + "step": 44125 + }, + { + "epoch": 1.698941289701636, + "grad_norm": 1.4634356498718262, + "learning_rate": 1.102668455007464e-05, + "loss": 0.679, + "step": 44130 + }, + { + "epoch": 1.6991337824831567, + "grad_norm": 1.5524640083312988, + "learning_rate": 1.1012884812423829e-05, + "loss": 0.9072, + "step": 44135 + }, + { + "epoch": 1.6993262752646776, + "grad_norm": 0.9184067845344543, + "learning_rate": 1.0999093212055244e-05, + "loss": 0.6238, + "step": 44140 + }, + { + "epoch": 1.6995187680461983, + "grad_norm": 1.144820213317871, + "learning_rate": 1.0985309750229966e-05, + "loss": 0.7566, + "step": 44145 + }, + { + "epoch": 1.699711260827719, + "grad_norm": 1.1043323278427124, + "learning_rate": 1.0971534428208485e-05, + "loss": 0.7834, + "step": 44150 + }, + { + "epoch": 1.6999037536092396, + "grad_norm": 0.9373242259025574, + "learning_rate": 1.0957767247250395e-05, + "loss": 0.7216, + "step": 44155 + }, + { + "epoch": 1.7000962463907605, + "grad_norm": 0.977449357509613, + "learning_rate": 1.0944008208614643e-05, + "loss": 0.7585, + "step": 44160 + }, + { + "epoch": 1.7002887391722812, + "grad_norm": 1.5112518072128296, + "learning_rate": 1.09302573135594e-05, + "loss": 0.7718, + "step": 44165 + }, + { + "epoch": 1.7004812319538019, + "grad_norm": 1.095468282699585, + "learning_rate": 1.0916514563342106e-05, + "loss": 0.9487, + "step": 44170 + }, + { + "epoch": 1.7006737247353225, + "grad_norm": 1.084921956062317, + "learning_rate": 1.0902779959219401e-05, + "loss": 0.8464, + "step": 44175 + }, + { + "epoch": 1.7008662175168432, + "grad_norm": 2.3119983673095703, + "learning_rate": 1.0889053502447278e-05, + "loss": 0.7722, + "step": 44180 + }, + { + "epoch": 1.7010587102983639, + "grad_norm": 0.5821973085403442, + "learning_rate": 1.08753351942809e-05, + "loss": 0.715, + "step": 44185 + }, + { + "epoch": 1.7012512030798845, + "grad_norm": 1.1047406196594238, + "learning_rate": 1.086162503597472e-05, + "loss": 0.7629, + "step": 44190 + }, + { + "epoch": 1.7014436958614052, + "grad_norm": 1.3281866312026978, + "learning_rate": 1.0847923028782437e-05, + "loss": 0.7576, + "step": 44195 + }, + { + "epoch": 1.7016361886429259, + "grad_norm": 1.2818950414657593, + "learning_rate": 1.0834229173957045e-05, + "loss": 0.722, + "step": 44200 + }, + { + "epoch": 1.7018286814244465, + "grad_norm": 0.4923422336578369, + "learning_rate": 1.08205434727507e-05, + "loss": 0.7202, + "step": 44205 + }, + { + "epoch": 1.7020211742059672, + "grad_norm": 1.5802128314971924, + "learning_rate": 1.0806865926414889e-05, + "loss": 0.7272, + "step": 44210 + }, + { + "epoch": 1.7022136669874879, + "grad_norm": 2.5249600410461426, + "learning_rate": 1.0793196536200346e-05, + "loss": 0.8254, + "step": 44215 + }, + { + "epoch": 1.7024061597690086, + "grad_norm": 1.4376665353775024, + "learning_rate": 1.0779535303357035e-05, + "loss": 0.8114, + "step": 44220 + }, + { + "epoch": 1.7025986525505292, + "grad_norm": 1.4501975774765015, + "learning_rate": 1.0765882229134205e-05, + "loss": 0.953, + "step": 44225 + }, + { + "epoch": 1.70279114533205, + "grad_norm": 1.0345442295074463, + "learning_rate": 1.0752237314780311e-05, + "loss": 0.7878, + "step": 44230 + }, + { + "epoch": 1.7029836381135708, + "grad_norm": 0.9220666289329529, + "learning_rate": 1.07386005615431e-05, + "loss": 0.8297, + "step": 44235 + }, + { + "epoch": 1.7031761308950915, + "grad_norm": 1.8300734758377075, + "learning_rate": 1.0724971970669561e-05, + "loss": 0.7797, + "step": 44240 + }, + { + "epoch": 1.7033686236766121, + "grad_norm": 1.5990431308746338, + "learning_rate": 1.0711351543405967e-05, + "loss": 0.79, + "step": 44245 + }, + { + "epoch": 1.7035611164581328, + "grad_norm": 1.114249587059021, + "learning_rate": 1.0697739280997753e-05, + "loss": 0.7867, + "step": 44250 + }, + { + "epoch": 1.7037536092396535, + "grad_norm": 1.217193603515625, + "learning_rate": 1.0684135184689748e-05, + "loss": 0.756, + "step": 44255 + }, + { + "epoch": 1.7039461020211744, + "grad_norm": 1.1771777868270874, + "learning_rate": 1.0670539255725886e-05, + "loss": 0.7378, + "step": 44260 + }, + { + "epoch": 1.704138594802695, + "grad_norm": 1.1569862365722656, + "learning_rate": 1.0656951495349466e-05, + "loss": 0.8281, + "step": 44265 + }, + { + "epoch": 1.7043310875842157, + "grad_norm": 1.5429112911224365, + "learning_rate": 1.064337190480299e-05, + "loss": 0.7918, + "step": 44270 + }, + { + "epoch": 1.7045235803657364, + "grad_norm": 1.192523717880249, + "learning_rate": 1.0629800485328235e-05, + "loss": 0.8646, + "step": 44275 + }, + { + "epoch": 1.704716073147257, + "grad_norm": 2.4051706790924072, + "learning_rate": 1.061623723816616e-05, + "loss": 0.7809, + "step": 44280 + }, + { + "epoch": 1.7049085659287777, + "grad_norm": 1.8021824359893799, + "learning_rate": 1.0602682164557121e-05, + "loss": 0.8048, + "step": 44285 + }, + { + "epoch": 1.7051010587102984, + "grad_norm": 0.9928067922592163, + "learning_rate": 1.0589135265740569e-05, + "loss": 0.8615, + "step": 44290 + }, + { + "epoch": 1.705293551491819, + "grad_norm": 0.8101016879081726, + "learning_rate": 1.0575596542955312e-05, + "loss": 0.6574, + "step": 44295 + }, + { + "epoch": 1.7054860442733397, + "grad_norm": 1.1880675554275513, + "learning_rate": 1.0562065997439364e-05, + "loss": 0.7205, + "step": 44300 + }, + { + "epoch": 1.7056785370548604, + "grad_norm": 1.8503644466400146, + "learning_rate": 1.054854363043003e-05, + "loss": 0.8311, + "step": 44305 + }, + { + "epoch": 1.705871029836381, + "grad_norm": 1.2988847494125366, + "learning_rate": 1.053502944316378e-05, + "loss": 0.8, + "step": 44310 + }, + { + "epoch": 1.7060635226179017, + "grad_norm": 1.75754714012146, + "learning_rate": 1.0521523436876479e-05, + "loss": 0.7782, + "step": 44315 + }, + { + "epoch": 1.7062560153994224, + "grad_norm": 1.1873159408569336, + "learning_rate": 1.0508025612803096e-05, + "loss": 0.8162, + "step": 44320 + }, + { + "epoch": 1.706448508180943, + "grad_norm": 1.8230105638504028, + "learning_rate": 1.0494535972177932e-05, + "loss": 0.7841, + "step": 44325 + }, + { + "epoch": 1.706641000962464, + "grad_norm": 1.1108636856079102, + "learning_rate": 1.0481054516234546e-05, + "loss": 0.7711, + "step": 44330 + }, + { + "epoch": 1.7068334937439846, + "grad_norm": 1.4503856897354126, + "learning_rate": 1.0467581246205726e-05, + "loss": 0.8515, + "step": 44335 + }, + { + "epoch": 1.7070259865255053, + "grad_norm": 1.6712833642959595, + "learning_rate": 1.0454116163323491e-05, + "loss": 0.7857, + "step": 44340 + }, + { + "epoch": 1.707218479307026, + "grad_norm": 2.288127899169922, + "learning_rate": 1.0440659268819143e-05, + "loss": 0.6903, + "step": 44345 + }, + { + "epoch": 1.7074109720885466, + "grad_norm": 1.7935099601745605, + "learning_rate": 1.0427210563923228e-05, + "loss": 0.7954, + "step": 44350 + }, + { + "epoch": 1.7076034648700675, + "grad_norm": 1.5855534076690674, + "learning_rate": 1.0413770049865546e-05, + "loss": 0.8879, + "step": 44355 + }, + { + "epoch": 1.7077959576515882, + "grad_norm": 0.9473410844802856, + "learning_rate": 1.0400337727875153e-05, + "loss": 1.0308, + "step": 44360 + }, + { + "epoch": 1.7079884504331089, + "grad_norm": 1.1841708421707153, + "learning_rate": 1.0386913599180293e-05, + "loss": 0.9523, + "step": 44365 + }, + { + "epoch": 1.7081809432146295, + "grad_norm": 1.8651456832885742, + "learning_rate": 1.037349766500859e-05, + "loss": 0.8538, + "step": 44370 + }, + { + "epoch": 1.7083734359961502, + "grad_norm": 1.2280970811843872, + "learning_rate": 1.036008992658679e-05, + "loss": 0.7538, + "step": 44375 + }, + { + "epoch": 1.7085659287776709, + "grad_norm": 1.300763726234436, + "learning_rate": 1.0346690385140956e-05, + "loss": 0.9075, + "step": 44380 + }, + { + "epoch": 1.7087584215591916, + "grad_norm": 1.2974258661270142, + "learning_rate": 1.0333299041896383e-05, + "loss": 0.8188, + "step": 44385 + }, + { + "epoch": 1.7089509143407122, + "grad_norm": 1.9830495119094849, + "learning_rate": 1.0319915898077648e-05, + "loss": 0.7887, + "step": 44390 + }, + { + "epoch": 1.709143407122233, + "grad_norm": 1.0923196077346802, + "learning_rate": 1.0306540954908483e-05, + "loss": 0.8392, + "step": 44395 + }, + { + "epoch": 1.7093358999037536, + "grad_norm": 1.537200689315796, + "learning_rate": 1.0293174213612023e-05, + "loss": 0.8336, + "step": 44400 + }, + { + "epoch": 1.7095283926852742, + "grad_norm": 1.2134389877319336, + "learning_rate": 1.027981567541051e-05, + "loss": 0.7743, + "step": 44405 + }, + { + "epoch": 1.709720885466795, + "grad_norm": 0.6608363389968872, + "learning_rate": 1.026646534152551e-05, + "loss": 0.6123, + "step": 44410 + }, + { + "epoch": 1.7099133782483156, + "grad_norm": 1.0624284744262695, + "learning_rate": 1.0253123213177828e-05, + "loss": 0.6536, + "step": 44415 + }, + { + "epoch": 1.7101058710298362, + "grad_norm": 1.9946283102035522, + "learning_rate": 1.0239789291587531e-05, + "loss": 0.8706, + "step": 44420 + }, + { + "epoch": 1.710298363811357, + "grad_norm": 1.8715145587921143, + "learning_rate": 1.0226463577973877e-05, + "loss": 0.8116, + "step": 44425 + }, + { + "epoch": 1.7104908565928778, + "grad_norm": 1.597334384918213, + "learning_rate": 1.0213146073555424e-05, + "loss": 0.6751, + "step": 44430 + }, + { + "epoch": 1.7106833493743985, + "grad_norm": 1.811631202697754, + "learning_rate": 1.0199836779549987e-05, + "loss": 0.779, + "step": 44435 + }, + { + "epoch": 1.7108758421559191, + "grad_norm": 1.2666516304016113, + "learning_rate": 1.0186535697174603e-05, + "loss": 0.8132, + "step": 44440 + }, + { + "epoch": 1.7110683349374398, + "grad_norm": 1.3092554807662964, + "learning_rate": 1.017324282764559e-05, + "loss": 0.6895, + "step": 44445 + }, + { + "epoch": 1.7112608277189605, + "grad_norm": 1.3765099048614502, + "learning_rate": 1.0159958172178452e-05, + "loss": 0.7155, + "step": 44450 + }, + { + "epoch": 1.7114533205004814, + "grad_norm": 1.301037311553955, + "learning_rate": 1.0146681731988006e-05, + "loss": 0.827, + "step": 44455 + }, + { + "epoch": 1.711645813282002, + "grad_norm": 1.229777455329895, + "learning_rate": 1.0133413508288292e-05, + "loss": 0.7375, + "step": 44460 + }, + { + "epoch": 1.7118383060635227, + "grad_norm": 1.2582491636276245, + "learning_rate": 1.0120153502292618e-05, + "loss": 0.9079, + "step": 44465 + }, + { + "epoch": 1.7120307988450434, + "grad_norm": 1.4134953022003174, + "learning_rate": 1.0106901715213468e-05, + "loss": 0.7504, + "step": 44470 + }, + { + "epoch": 1.712223291626564, + "grad_norm": 1.3563551902770996, + "learning_rate": 1.0093658148262709e-05, + "loss": 0.7363, + "step": 44475 + }, + { + "epoch": 1.7124157844080847, + "grad_norm": 1.642959713935852, + "learning_rate": 1.0080422802651312e-05, + "loss": 0.7629, + "step": 44480 + }, + { + "epoch": 1.7126082771896054, + "grad_norm": 1.9216701984405518, + "learning_rate": 1.0067195679589591e-05, + "loss": 0.8328, + "step": 44485 + }, + { + "epoch": 1.712800769971126, + "grad_norm": 0.9144423007965088, + "learning_rate": 1.0053976780287078e-05, + "loss": 0.7265, + "step": 44490 + }, + { + "epoch": 1.7129932627526467, + "grad_norm": 1.0702438354492188, + "learning_rate": 1.0040766105952559e-05, + "loss": 0.8567, + "step": 44495 + }, + { + "epoch": 1.7131857555341674, + "grad_norm": 1.9012739658355713, + "learning_rate": 1.0027563657794026e-05, + "loss": 0.8425, + "step": 44500 + }, + { + "epoch": 1.713378248315688, + "grad_norm": 1.2741745710372925, + "learning_rate": 1.0014369437018823e-05, + "loss": 0.8694, + "step": 44505 + }, + { + "epoch": 1.7135707410972087, + "grad_norm": 2.203899383544922, + "learning_rate": 1.0001183444833417e-05, + "loss": 0.7839, + "step": 44510 + }, + { + "epoch": 1.7137632338787294, + "grad_norm": 1.059478998184204, + "learning_rate": 9.98800568244359e-06, + "loss": 0.7702, + "step": 44515 + }, + { + "epoch": 1.71395572666025, + "grad_norm": 2.4111287593841553, + "learning_rate": 9.974836151054367e-06, + "loss": 0.7798, + "step": 44520 + }, + { + "epoch": 1.714148219441771, + "grad_norm": 1.2681207656860352, + "learning_rate": 9.96167485187004e-06, + "loss": 0.818, + "step": 44525 + }, + { + "epoch": 1.7143407122232917, + "grad_norm": 1.0277059078216553, + "learning_rate": 9.948521786094079e-06, + "loss": 0.6552, + "step": 44530 + }, + { + "epoch": 1.7145332050048123, + "grad_norm": 0.9331089854240417, + "learning_rate": 9.935376954929265e-06, + "loss": 0.8268, + "step": 44535 + }, + { + "epoch": 1.714725697786333, + "grad_norm": 1.3255749940872192, + "learning_rate": 9.922240359577606e-06, + "loss": 0.8625, + "step": 44540 + }, + { + "epoch": 1.7149181905678537, + "grad_norm": 1.65230393409729, + "learning_rate": 9.90911200124035e-06, + "loss": 0.7516, + "step": 44545 + }, + { + "epoch": 1.7151106833493746, + "grad_norm": 1.164333462715149, + "learning_rate": 9.895991881118028e-06, + "loss": 0.8635, + "step": 44550 + }, + { + "epoch": 1.7153031761308952, + "grad_norm": 1.0087164640426636, + "learning_rate": 9.882880000410344e-06, + "loss": 0.703, + "step": 44555 + }, + { + "epoch": 1.715495668912416, + "grad_norm": 1.272603988647461, + "learning_rate": 9.869776360316307e-06, + "loss": 0.6907, + "step": 44560 + }, + { + "epoch": 1.7156881616939366, + "grad_norm": 0.8092613816261292, + "learning_rate": 9.85668096203417e-06, + "loss": 0.7481, + "step": 44565 + }, + { + "epoch": 1.7158806544754572, + "grad_norm": 2.1582627296447754, + "learning_rate": 9.84359380676143e-06, + "loss": 0.9793, + "step": 44570 + }, + { + "epoch": 1.716073147256978, + "grad_norm": 2.2021024227142334, + "learning_rate": 9.830514895694775e-06, + "loss": 0.7774, + "step": 44575 + }, + { + "epoch": 1.7162656400384986, + "grad_norm": 0.9477373361587524, + "learning_rate": 9.817444230030247e-06, + "loss": 0.7769, + "step": 44580 + }, + { + "epoch": 1.7164581328200192, + "grad_norm": 1.2492976188659668, + "learning_rate": 9.804381810963015e-06, + "loss": 0.7954, + "step": 44585 + }, + { + "epoch": 1.71665062560154, + "grad_norm": 1.2578282356262207, + "learning_rate": 9.791327639687587e-06, + "loss": 0.9006, + "step": 44590 + }, + { + "epoch": 1.7168431183830606, + "grad_norm": 1.1162382364273071, + "learning_rate": 9.778281717397652e-06, + "loss": 0.755, + "step": 44595 + }, + { + "epoch": 1.7170356111645813, + "grad_norm": 1.123497724533081, + "learning_rate": 9.765244045286227e-06, + "loss": 0.711, + "step": 44600 + }, + { + "epoch": 1.717228103946102, + "grad_norm": 1.5501790046691895, + "learning_rate": 9.75221462454543e-06, + "loss": 0.8798, + "step": 44605 + }, + { + "epoch": 1.7174205967276226, + "grad_norm": 1.510923147201538, + "learning_rate": 9.739193456366813e-06, + "loss": 0.8363, + "step": 44610 + }, + { + "epoch": 1.7176130895091433, + "grad_norm": 1.6598632335662842, + "learning_rate": 9.726180541941e-06, + "loss": 0.8614, + "step": 44615 + }, + { + "epoch": 1.717805582290664, + "grad_norm": 1.1419432163238525, + "learning_rate": 9.713175882458003e-06, + "loss": 0.8118, + "step": 44620 + }, + { + "epoch": 1.7179980750721848, + "grad_norm": 1.0711899995803833, + "learning_rate": 9.700179479106953e-06, + "loss": 0.7971, + "step": 44625 + }, + { + "epoch": 1.7181905678537055, + "grad_norm": 1.0233666896820068, + "learning_rate": 9.687191333076306e-06, + "loss": 0.7606, + "step": 44630 + }, + { + "epoch": 1.7183830606352262, + "grad_norm": 2.493830919265747, + "learning_rate": 9.674211445553738e-06, + "loss": 0.9428, + "step": 44635 + }, + { + "epoch": 1.7185755534167468, + "grad_norm": 1.0951639413833618, + "learning_rate": 9.661239817726209e-06, + "loss": 0.9051, + "step": 44640 + }, + { + "epoch": 1.7187680461982677, + "grad_norm": 1.3658279180526733, + "learning_rate": 9.648276450779836e-06, + "loss": 0.5982, + "step": 44645 + }, + { + "epoch": 1.7189605389797884, + "grad_norm": 1.604722499847412, + "learning_rate": 9.635321345900061e-06, + "loss": 0.8538, + "step": 44650 + }, + { + "epoch": 1.719153031761309, + "grad_norm": 1.2309184074401855, + "learning_rate": 9.622374504271536e-06, + "loss": 0.7353, + "step": 44655 + }, + { + "epoch": 1.7193455245428297, + "grad_norm": 1.8740421533584595, + "learning_rate": 9.60943592707817e-06, + "loss": 0.8622, + "step": 44660 + }, + { + "epoch": 1.7195380173243504, + "grad_norm": 1.5069630146026611, + "learning_rate": 9.596505615503116e-06, + "loss": 0.7259, + "step": 44665 + }, + { + "epoch": 1.719730510105871, + "grad_norm": 0.8005820512771606, + "learning_rate": 9.583583570728738e-06, + "loss": 0.876, + "step": 44670 + }, + { + "epoch": 1.7199230028873917, + "grad_norm": 1.9598007202148438, + "learning_rate": 9.570669793936694e-06, + "loss": 0.799, + "step": 44675 + }, + { + "epoch": 1.7201154956689124, + "grad_norm": 1.1604948043823242, + "learning_rate": 9.557764286307846e-06, + "loss": 0.7464, + "step": 44680 + }, + { + "epoch": 1.720307988450433, + "grad_norm": 0.9628298282623291, + "learning_rate": 9.544867049022354e-06, + "loss": 0.7404, + "step": 44685 + }, + { + "epoch": 1.7205004812319538, + "grad_norm": 1.5390831232070923, + "learning_rate": 9.53197808325953e-06, + "loss": 0.8366, + "step": 44690 + }, + { + "epoch": 1.7206929740134744, + "grad_norm": 0.8520711660385132, + "learning_rate": 9.51909739019804e-06, + "loss": 0.7651, + "step": 44695 + }, + { + "epoch": 1.720885466794995, + "grad_norm": 1.1799733638763428, + "learning_rate": 9.506224971015709e-06, + "loss": 0.7417, + "step": 44700 + }, + { + "epoch": 1.7210779595765158, + "grad_norm": 1.2342482805252075, + "learning_rate": 9.49336082688962e-06, + "loss": 0.829, + "step": 44705 + }, + { + "epoch": 1.7212704523580364, + "grad_norm": 1.1500221490859985, + "learning_rate": 9.48050495899615e-06, + "loss": 0.8516, + "step": 44710 + }, + { + "epoch": 1.721462945139557, + "grad_norm": 1.6604115962982178, + "learning_rate": 9.467657368510874e-06, + "loss": 0.8381, + "step": 44715 + }, + { + "epoch": 1.721655437921078, + "grad_norm": 1.3249857425689697, + "learning_rate": 9.454818056608573e-06, + "loss": 0.7726, + "step": 44720 + }, + { + "epoch": 1.7218479307025987, + "grad_norm": 1.1568611860275269, + "learning_rate": 9.441987024463384e-06, + "loss": 0.7704, + "step": 44725 + }, + { + "epoch": 1.7220404234841193, + "grad_norm": 1.5718170404434204, + "learning_rate": 9.429164273248581e-06, + "loss": 0.7952, + "step": 44730 + }, + { + "epoch": 1.72223291626564, + "grad_norm": 1.3416763544082642, + "learning_rate": 9.416349804136726e-06, + "loss": 0.77, + "step": 44735 + }, + { + "epoch": 1.7224254090471607, + "grad_norm": 1.5594481229782104, + "learning_rate": 9.403543618299614e-06, + "loss": 0.7476, + "step": 44740 + }, + { + "epoch": 1.7226179018286816, + "grad_norm": 1.517449975013733, + "learning_rate": 9.390745716908312e-06, + "loss": 0.6838, + "step": 44745 + }, + { + "epoch": 1.7228103946102022, + "grad_norm": 0.9632403254508972, + "learning_rate": 9.377956101133068e-06, + "loss": 0.7085, + "step": 44750 + }, + { + "epoch": 1.723002887391723, + "grad_norm": 1.5700565576553345, + "learning_rate": 9.365174772143426e-06, + "loss": 0.8404, + "step": 44755 + }, + { + "epoch": 1.7231953801732436, + "grad_norm": 1.4765150547027588, + "learning_rate": 9.35240173110814e-06, + "loss": 0.811, + "step": 44760 + }, + { + "epoch": 1.7233878729547643, + "grad_norm": 1.0645157098770142, + "learning_rate": 9.339636979195244e-06, + "loss": 0.7075, + "step": 44765 + }, + { + "epoch": 1.723580365736285, + "grad_norm": 1.7466483116149902, + "learning_rate": 9.326880517571978e-06, + "loss": 0.8613, + "step": 44770 + }, + { + "epoch": 1.7237728585178056, + "grad_norm": 1.016494870185852, + "learning_rate": 9.314132347404824e-06, + "loss": 0.697, + "step": 44775 + }, + { + "epoch": 1.7239653512993263, + "grad_norm": 1.2336843013763428, + "learning_rate": 9.301392469859527e-06, + "loss": 0.8933, + "step": 44780 + }, + { + "epoch": 1.724157844080847, + "grad_norm": 1.7032203674316406, + "learning_rate": 9.288660886101075e-06, + "loss": 0.7887, + "step": 44785 + }, + { + "epoch": 1.7243503368623676, + "grad_norm": 0.8340054154396057, + "learning_rate": 9.275937597293682e-06, + "loss": 0.8859, + "step": 44790 + }, + { + "epoch": 1.7245428296438883, + "grad_norm": 1.1701608896255493, + "learning_rate": 9.263222604600774e-06, + "loss": 0.7429, + "step": 44795 + }, + { + "epoch": 1.724735322425409, + "grad_norm": 0.891636073589325, + "learning_rate": 9.25051590918512e-06, + "loss": 0.7672, + "step": 44800 + }, + { + "epoch": 1.7249278152069296, + "grad_norm": 1.2118922472000122, + "learning_rate": 9.237817512208602e-06, + "loss": 0.8563, + "step": 44805 + }, + { + "epoch": 1.7251203079884503, + "grad_norm": 1.1670253276824951, + "learning_rate": 9.225127414832436e-06, + "loss": 0.8383, + "step": 44810 + }, + { + "epoch": 1.7253128007699712, + "grad_norm": 1.722845435142517, + "learning_rate": 9.212445618217035e-06, + "loss": 0.8429, + "step": 44815 + }, + { + "epoch": 1.7255052935514918, + "grad_norm": 1.2573529481887817, + "learning_rate": 9.199772123522088e-06, + "loss": 0.9031, + "step": 44820 + }, + { + "epoch": 1.7256977863330125, + "grad_norm": 1.844647765159607, + "learning_rate": 9.187106931906442e-06, + "loss": 0.8546, + "step": 44825 + }, + { + "epoch": 1.7258902791145332, + "grad_norm": 2.288663864135742, + "learning_rate": 9.174450044528327e-06, + "loss": 0.7559, + "step": 44830 + }, + { + "epoch": 1.7260827718960539, + "grad_norm": 1.2201417684555054, + "learning_rate": 9.161801462545084e-06, + "loss": 0.7306, + "step": 44835 + }, + { + "epoch": 1.7262752646775748, + "grad_norm": 1.2007523775100708, + "learning_rate": 9.14916118711333e-06, + "loss": 0.7666, + "step": 44840 + }, + { + "epoch": 1.7264677574590954, + "grad_norm": 0.9047454595565796, + "learning_rate": 9.136529219388968e-06, + "loss": 0.7994, + "step": 44845 + }, + { + "epoch": 1.726660250240616, + "grad_norm": 0.8384250402450562, + "learning_rate": 9.123905560527102e-06, + "loss": 0.6711, + "step": 44850 + }, + { + "epoch": 1.7268527430221368, + "grad_norm": 1.0188740491867065, + "learning_rate": 9.111290211682044e-06, + "loss": 0.7907, + "step": 44855 + }, + { + "epoch": 1.7270452358036574, + "grad_norm": 1.1332343816757202, + "learning_rate": 9.09868317400745e-06, + "loss": 0.7105, + "step": 44860 + }, + { + "epoch": 1.727237728585178, + "grad_norm": 0.6927266716957092, + "learning_rate": 9.086084448656095e-06, + "loss": 0.9879, + "step": 44865 + }, + { + "epoch": 1.7274302213666988, + "grad_norm": 1.233681559562683, + "learning_rate": 9.073494036780062e-06, + "loss": 0.7542, + "step": 44870 + }, + { + "epoch": 1.7276227141482194, + "grad_norm": 1.4534273147583008, + "learning_rate": 9.06091193953067e-06, + "loss": 0.7775, + "step": 44875 + }, + { + "epoch": 1.72781520692974, + "grad_norm": 1.0983202457427979, + "learning_rate": 9.048338158058467e-06, + "loss": 0.9082, + "step": 44880 + }, + { + "epoch": 1.7280076997112608, + "grad_norm": 0.7827137112617493, + "learning_rate": 9.035772693513256e-06, + "loss": 0.7013, + "step": 44885 + }, + { + "epoch": 1.7282001924927815, + "grad_norm": 1.1753028631210327, + "learning_rate": 9.023215547044028e-06, + "loss": 0.6837, + "step": 44890 + }, + { + "epoch": 1.7283926852743021, + "grad_norm": 1.4475585222244263, + "learning_rate": 9.010666719799077e-06, + "loss": 0.7162, + "step": 44895 + }, + { + "epoch": 1.7285851780558228, + "grad_norm": 1.2361037731170654, + "learning_rate": 8.998126212925906e-06, + "loss": 0.7424, + "step": 44900 + }, + { + "epoch": 1.7287776708373435, + "grad_norm": 1.0601775646209717, + "learning_rate": 8.985594027571276e-06, + "loss": 0.6877, + "step": 44905 + }, + { + "epoch": 1.7289701636188641, + "grad_norm": 1.1087630987167358, + "learning_rate": 8.973070164881126e-06, + "loss": 0.7211, + "step": 44910 + }, + { + "epoch": 1.729162656400385, + "grad_norm": 1.3524638414382935, + "learning_rate": 8.96055462600074e-06, + "loss": 0.7259, + "step": 44915 + }, + { + "epoch": 1.7293551491819057, + "grad_norm": 1.305503487586975, + "learning_rate": 8.948047412074534e-06, + "loss": 0.8636, + "step": 44920 + }, + { + "epoch": 1.7295476419634264, + "grad_norm": 1.1058146953582764, + "learning_rate": 8.935548524246229e-06, + "loss": 0.8791, + "step": 44925 + }, + { + "epoch": 1.729740134744947, + "grad_norm": 1.93928062915802, + "learning_rate": 8.923057963658766e-06, + "loss": 0.7892, + "step": 44930 + }, + { + "epoch": 1.7299326275264677, + "grad_norm": 0.9320961236953735, + "learning_rate": 8.91057573145434e-06, + "loss": 0.8925, + "step": 44935 + }, + { + "epoch": 1.7301251203079886, + "grad_norm": 1.0383516550064087, + "learning_rate": 8.898101828774318e-06, + "loss": 0.7483, + "step": 44940 + }, + { + "epoch": 1.7303176130895093, + "grad_norm": 1.3876311779022217, + "learning_rate": 8.885636256759422e-06, + "loss": 0.8479, + "step": 44945 + }, + { + "epoch": 1.73051010587103, + "grad_norm": 2.2601609230041504, + "learning_rate": 8.873179016549505e-06, + "loss": 0.9106, + "step": 44950 + }, + { + "epoch": 1.7307025986525506, + "grad_norm": 0.9025186896324158, + "learning_rate": 8.8607301092837e-06, + "loss": 0.7625, + "step": 44955 + }, + { + "epoch": 1.7308950914340713, + "grad_norm": 2.425801992416382, + "learning_rate": 8.848289536100374e-06, + "loss": 0.9814, + "step": 44960 + }, + { + "epoch": 1.731087584215592, + "grad_norm": 1.1960424184799194, + "learning_rate": 8.835857298137173e-06, + "loss": 0.7639, + "step": 44965 + }, + { + "epoch": 1.7312800769971126, + "grad_norm": 1.3602185249328613, + "learning_rate": 8.8234333965309e-06, + "loss": 0.7965, + "step": 44970 + }, + { + "epoch": 1.7314725697786333, + "grad_norm": 1.1339243650436401, + "learning_rate": 8.811017832417645e-06, + "loss": 0.9131, + "step": 44975 + }, + { + "epoch": 1.731665062560154, + "grad_norm": 1.8668417930603027, + "learning_rate": 8.798610606932744e-06, + "loss": 0.8395, + "step": 44980 + }, + { + "epoch": 1.7318575553416746, + "grad_norm": 2.117316246032715, + "learning_rate": 8.786211721210747e-06, + "loss": 0.6666, + "step": 44985 + }, + { + "epoch": 1.7320500481231953, + "grad_norm": 0.8929471373558044, + "learning_rate": 8.773821176385466e-06, + "loss": 0.829, + "step": 44990 + }, + { + "epoch": 1.732242540904716, + "grad_norm": 2.004077196121216, + "learning_rate": 8.761438973589908e-06, + "loss": 0.8636, + "step": 44995 + }, + { + "epoch": 1.7324350336862366, + "grad_norm": 0.8660832643508911, + "learning_rate": 8.749065113956357e-06, + "loss": 0.7696, + "step": 45000 + }, + { + "epoch": 1.7326275264677573, + "grad_norm": 1.3923887014389038, + "learning_rate": 8.736699598616305e-06, + "loss": 0.8972, + "step": 45005 + }, + { + "epoch": 1.7328200192492782, + "grad_norm": 0.835381805896759, + "learning_rate": 8.72434242870055e-06, + "loss": 0.7019, + "step": 45010 + }, + { + "epoch": 1.7330125120307989, + "grad_norm": 0.8896954655647278, + "learning_rate": 8.711993605338985e-06, + "loss": 0.8025, + "step": 45015 + }, + { + "epoch": 1.7332050048123195, + "grad_norm": 1.4621267318725586, + "learning_rate": 8.699653129660912e-06, + "loss": 0.7648, + "step": 45020 + }, + { + "epoch": 1.7333974975938402, + "grad_norm": 1.1056898832321167, + "learning_rate": 8.687321002794734e-06, + "loss": 0.7427, + "step": 45025 + }, + { + "epoch": 1.7335899903753609, + "grad_norm": 1.546595573425293, + "learning_rate": 8.674997225868165e-06, + "loss": 0.8088, + "step": 45030 + }, + { + "epoch": 1.7337824831568818, + "grad_norm": 0.7969051599502563, + "learning_rate": 8.662681800008121e-06, + "loss": 0.7576, + "step": 45035 + }, + { + "epoch": 1.7339749759384024, + "grad_norm": 0.9694302678108215, + "learning_rate": 8.650374726340793e-06, + "loss": 0.9295, + "step": 45040 + }, + { + "epoch": 1.734167468719923, + "grad_norm": 0.5235655903816223, + "learning_rate": 8.638076005991524e-06, + "loss": 0.7925, + "step": 45045 + }, + { + "epoch": 1.7343599615014438, + "grad_norm": 1.4736652374267578, + "learning_rate": 8.625785640085026e-06, + "loss": 0.8665, + "step": 45050 + }, + { + "epoch": 1.7345524542829645, + "grad_norm": 1.5209852457046509, + "learning_rate": 8.613503629745113e-06, + "loss": 0.8142, + "step": 45055 + }, + { + "epoch": 1.7347449470644851, + "grad_norm": 1.1320667266845703, + "learning_rate": 8.601229976094904e-06, + "loss": 0.7985, + "step": 45060 + }, + { + "epoch": 1.7349374398460058, + "grad_norm": 1.1330043077468872, + "learning_rate": 8.588964680256761e-06, + "loss": 0.8037, + "step": 45065 + }, + { + "epoch": 1.7351299326275265, + "grad_norm": 1.6776443719863892, + "learning_rate": 8.576707743352275e-06, + "loss": 0.8429, + "step": 45070 + }, + { + "epoch": 1.7353224254090471, + "grad_norm": 0.8801172971725464, + "learning_rate": 8.564459166502204e-06, + "loss": 0.8425, + "step": 45075 + }, + { + "epoch": 1.7355149181905678, + "grad_norm": 0.9202898740768433, + "learning_rate": 8.552218950826662e-06, + "loss": 0.7558, + "step": 45080 + }, + { + "epoch": 1.7357074109720885, + "grad_norm": 0.8619052767753601, + "learning_rate": 8.539987097444912e-06, + "loss": 0.7801, + "step": 45085 + }, + { + "epoch": 1.7358999037536091, + "grad_norm": 1.3250690698623657, + "learning_rate": 8.527763607475459e-06, + "loss": 0.8215, + "step": 45090 + }, + { + "epoch": 1.7360923965351298, + "grad_norm": 1.4232019186019897, + "learning_rate": 8.515548482036106e-06, + "loss": 0.9854, + "step": 45095 + }, + { + "epoch": 1.7362848893166505, + "grad_norm": 0.8504436016082764, + "learning_rate": 8.503341722243785e-06, + "loss": 0.7121, + "step": 45100 + }, + { + "epoch": 1.7364773820981714, + "grad_norm": 1.5012342929840088, + "learning_rate": 8.491143329214768e-06, + "loss": 0.8461, + "step": 45105 + }, + { + "epoch": 1.736669874879692, + "grad_norm": 1.4541362524032593, + "learning_rate": 8.478953304064485e-06, + "loss": 0.7671, + "step": 45110 + }, + { + "epoch": 1.7368623676612127, + "grad_norm": 1.0640056133270264, + "learning_rate": 8.466771647907679e-06, + "loss": 0.7623, + "step": 45115 + }, + { + "epoch": 1.7370548604427334, + "grad_norm": 1.897068738937378, + "learning_rate": 8.454598361858223e-06, + "loss": 0.8152, + "step": 45120 + }, + { + "epoch": 1.737247353224254, + "grad_norm": 1.7493306398391724, + "learning_rate": 8.44243344702934e-06, + "loss": 0.8049, + "step": 45125 + }, + { + "epoch": 1.737439846005775, + "grad_norm": 1.0308282375335693, + "learning_rate": 8.430276904533375e-06, + "loss": 0.9274, + "step": 45130 + }, + { + "epoch": 1.7376323387872956, + "grad_norm": 1.5391184091567993, + "learning_rate": 8.418128735482033e-06, + "loss": 0.8391, + "step": 45135 + }, + { + "epoch": 1.7378248315688163, + "grad_norm": 1.622266173362732, + "learning_rate": 8.40598894098612e-06, + "loss": 0.7576, + "step": 45140 + }, + { + "epoch": 1.738017324350337, + "grad_norm": 1.7522730827331543, + "learning_rate": 8.393857522155758e-06, + "loss": 0.6953, + "step": 45145 + }, + { + "epoch": 1.7382098171318576, + "grad_norm": 1.5102694034576416, + "learning_rate": 8.3817344801003e-06, + "loss": 0.7033, + "step": 45150 + }, + { + "epoch": 1.7384023099133783, + "grad_norm": 1.0262799263000488, + "learning_rate": 8.369619815928321e-06, + "loss": 0.8477, + "step": 45155 + }, + { + "epoch": 1.738594802694899, + "grad_norm": 1.1589692831039429, + "learning_rate": 8.357513530747585e-06, + "loss": 0.6228, + "step": 45160 + }, + { + "epoch": 1.7387872954764196, + "grad_norm": 1.038752555847168, + "learning_rate": 8.345415625665187e-06, + "loss": 0.8333, + "step": 45165 + }, + { + "epoch": 1.7389797882579403, + "grad_norm": 1.9438576698303223, + "learning_rate": 8.333326101787365e-06, + "loss": 0.8507, + "step": 45170 + }, + { + "epoch": 1.739172281039461, + "grad_norm": 1.3162816762924194, + "learning_rate": 8.321244960219632e-06, + "loss": 0.8264, + "step": 45175 + }, + { + "epoch": 1.7393647738209816, + "grad_norm": 1.4058345556259155, + "learning_rate": 8.309172202066728e-06, + "loss": 0.8086, + "step": 45180 + }, + { + "epoch": 1.7395572666025023, + "grad_norm": 1.2361959218978882, + "learning_rate": 8.297107828432649e-06, + "loss": 0.6796, + "step": 45185 + }, + { + "epoch": 1.739749759384023, + "grad_norm": 1.3547639846801758, + "learning_rate": 8.285051840420565e-06, + "loss": 0.8473, + "step": 45190 + }, + { + "epoch": 1.7399422521655437, + "grad_norm": 1.3733632564544678, + "learning_rate": 8.273004239132932e-06, + "loss": 0.7467, + "step": 45195 + }, + { + "epoch": 1.7401347449470643, + "grad_norm": 1.2235511541366577, + "learning_rate": 8.26096502567143e-06, + "loss": 0.8072, + "step": 45200 + }, + { + "epoch": 1.7403272377285852, + "grad_norm": 1.30812406539917, + "learning_rate": 8.24893420113696e-06, + "loss": 0.8308, + "step": 45205 + }, + { + "epoch": 1.740519730510106, + "grad_norm": 1.1475272178649902, + "learning_rate": 8.236911766629674e-06, + "loss": 0.7597, + "step": 45210 + }, + { + "epoch": 1.7407122232916266, + "grad_norm": 1.2318791151046753, + "learning_rate": 8.224897723248926e-06, + "loss": 0.7935, + "step": 45215 + }, + { + "epoch": 1.7409047160731472, + "grad_norm": 0.8889015913009644, + "learning_rate": 8.212892072093314e-06, + "loss": 0.7292, + "step": 45220 + }, + { + "epoch": 1.741097208854668, + "grad_norm": 2.2487030029296875, + "learning_rate": 8.200894814260695e-06, + "loss": 0.8981, + "step": 45225 + }, + { + "epoch": 1.7412897016361888, + "grad_norm": 2.454103469848633, + "learning_rate": 8.188905950848157e-06, + "loss": 0.9552, + "step": 45230 + }, + { + "epoch": 1.7414821944177095, + "grad_norm": 1.027070164680481, + "learning_rate": 8.176925482951925e-06, + "loss": 0.8115, + "step": 45235 + }, + { + "epoch": 1.7416746871992301, + "grad_norm": 1.2418460845947266, + "learning_rate": 8.16495341166763e-06, + "loss": 0.7263, + "step": 45240 + }, + { + "epoch": 1.7418671799807508, + "grad_norm": 1.0683330297470093, + "learning_rate": 8.152989738089978e-06, + "loss": 0.8064, + "step": 45245 + }, + { + "epoch": 1.7420596727622715, + "grad_norm": 0.9566577672958374, + "learning_rate": 8.14103446331298e-06, + "loss": 0.7102, + "step": 45250 + }, + { + "epoch": 1.7422521655437921, + "grad_norm": 0.8572260737419128, + "learning_rate": 8.129087588429873e-06, + "loss": 0.8575, + "step": 45255 + }, + { + "epoch": 1.7424446583253128, + "grad_norm": 0.9579412341117859, + "learning_rate": 8.11714911453314e-06, + "loss": 0.8609, + "step": 45260 + }, + { + "epoch": 1.7426371511068335, + "grad_norm": 1.0199538469314575, + "learning_rate": 8.105219042714406e-06, + "loss": 0.7884, + "step": 45265 + }, + { + "epoch": 1.7428296438883542, + "grad_norm": 1.9061774015426636, + "learning_rate": 8.093297374064679e-06, + "loss": 0.7509, + "step": 45270 + }, + { + "epoch": 1.7430221366698748, + "grad_norm": 1.1046141386032104, + "learning_rate": 8.081384109674073e-06, + "loss": 0.861, + "step": 45275 + }, + { + "epoch": 1.7432146294513955, + "grad_norm": 0.9744033217430115, + "learning_rate": 8.069479250631972e-06, + "loss": 0.8627, + "step": 45280 + }, + { + "epoch": 1.7434071222329162, + "grad_norm": 1.1140841245651245, + "learning_rate": 8.057582798027019e-06, + "loss": 0.8167, + "step": 45285 + }, + { + "epoch": 1.7435996150144368, + "grad_norm": 0.7895367741584778, + "learning_rate": 8.045694752947076e-06, + "loss": 0.7456, + "step": 45290 + }, + { + "epoch": 1.7437921077959575, + "grad_norm": 1.113579511642456, + "learning_rate": 8.033815116479182e-06, + "loss": 0.7822, + "step": 45295 + }, + { + "epoch": 1.7439846005774784, + "grad_norm": 1.654111385345459, + "learning_rate": 8.021943889709682e-06, + "loss": 0.8276, + "step": 45300 + }, + { + "epoch": 1.744177093358999, + "grad_norm": 1.1563928127288818, + "learning_rate": 8.010081073724107e-06, + "loss": 0.7169, + "step": 45305 + }, + { + "epoch": 1.7443695861405197, + "grad_norm": 1.4437462091445923, + "learning_rate": 7.998226669607245e-06, + "loss": 0.759, + "step": 45310 + }, + { + "epoch": 1.7445620789220404, + "grad_norm": 1.2180962562561035, + "learning_rate": 7.986380678443117e-06, + "loss": 0.8452, + "step": 45315 + }, + { + "epoch": 1.744754571703561, + "grad_norm": 1.4761182069778442, + "learning_rate": 7.974543101314912e-06, + "loss": 0.8095, + "step": 45320 + }, + { + "epoch": 1.744947064485082, + "grad_norm": 0.9857079982757568, + "learning_rate": 7.962713939305143e-06, + "loss": 0.7167, + "step": 45325 + }, + { + "epoch": 1.7451395572666026, + "grad_norm": 1.562577724456787, + "learning_rate": 7.95089319349549e-06, + "loss": 0.829, + "step": 45330 + }, + { + "epoch": 1.7453320500481233, + "grad_norm": 1.0676238536834717, + "learning_rate": 7.939080864966897e-06, + "loss": 0.7187, + "step": 45335 + }, + { + "epoch": 1.745524542829644, + "grad_norm": 1.6058528423309326, + "learning_rate": 7.92727695479948e-06, + "loss": 0.7018, + "step": 45340 + }, + { + "epoch": 1.7457170356111646, + "grad_norm": 1.4000190496444702, + "learning_rate": 7.915481464072694e-06, + "loss": 0.7528, + "step": 45345 + }, + { + "epoch": 1.7459095283926853, + "grad_norm": 1.476828694343567, + "learning_rate": 7.903694393865102e-06, + "loss": 0.6115, + "step": 45350 + }, + { + "epoch": 1.746102021174206, + "grad_norm": 0.9761121273040771, + "learning_rate": 7.891915745254574e-06, + "loss": 0.775, + "step": 45355 + }, + { + "epoch": 1.7462945139557267, + "grad_norm": 1.4030824899673462, + "learning_rate": 7.880145519318205e-06, + "loss": 0.7931, + "step": 45360 + }, + { + "epoch": 1.7464870067372473, + "grad_norm": 2.230025291442871, + "learning_rate": 7.868383717132299e-06, + "loss": 0.9373, + "step": 45365 + }, + { + "epoch": 1.746679499518768, + "grad_norm": 1.5030139684677124, + "learning_rate": 7.856630339772341e-06, + "loss": 0.8815, + "step": 45370 + }, + { + "epoch": 1.7468719923002887, + "grad_norm": 1.0745930671691895, + "learning_rate": 7.844885388313194e-06, + "loss": 0.7682, + "step": 45375 + }, + { + "epoch": 1.7470644850818093, + "grad_norm": 1.3051297664642334, + "learning_rate": 7.833148863828766e-06, + "loss": 0.7516, + "step": 45380 + }, + { + "epoch": 1.74725697786333, + "grad_norm": 1.2218395471572876, + "learning_rate": 7.821420767392362e-06, + "loss": 0.7248, + "step": 45385 + }, + { + "epoch": 1.7474494706448507, + "grad_norm": 1.7142714262008667, + "learning_rate": 7.809701100076384e-06, + "loss": 0.953, + "step": 45390 + }, + { + "epoch": 1.7476419634263713, + "grad_norm": 1.6819877624511719, + "learning_rate": 7.797989862952525e-06, + "loss": 0.8154, + "step": 45395 + }, + { + "epoch": 1.7478344562078922, + "grad_norm": 2.180309295654297, + "learning_rate": 7.786287057091723e-06, + "loss": 0.7794, + "step": 45400 + }, + { + "epoch": 1.748026948989413, + "grad_norm": 1.5509096384048462, + "learning_rate": 7.77459268356413e-06, + "loss": 0.9624, + "step": 45405 + }, + { + "epoch": 1.7482194417709336, + "grad_norm": 1.4458515644073486, + "learning_rate": 7.762906743439069e-06, + "loss": 0.836, + "step": 45410 + }, + { + "epoch": 1.7484119345524542, + "grad_norm": 1.2456862926483154, + "learning_rate": 7.751229237785173e-06, + "loss": 0.863, + "step": 45415 + }, + { + "epoch": 1.7486044273339751, + "grad_norm": 1.0490598678588867, + "learning_rate": 7.739560167670279e-06, + "loss": 0.8947, + "step": 45420 + }, + { + "epoch": 1.7487969201154958, + "grad_norm": 1.548740029335022, + "learning_rate": 7.727899534161431e-06, + "loss": 0.8212, + "step": 45425 + }, + { + "epoch": 1.7489894128970165, + "grad_norm": 1.1737068891525269, + "learning_rate": 7.716247338324945e-06, + "loss": 0.6706, + "step": 45430 + }, + { + "epoch": 1.7491819056785372, + "grad_norm": 1.5018718242645264, + "learning_rate": 7.7046035812263e-06, + "loss": 0.7855, + "step": 45435 + }, + { + "epoch": 1.7493743984600578, + "grad_norm": 1.2772295475006104, + "learning_rate": 7.692968263930255e-06, + "loss": 0.7608, + "step": 45440 + }, + { + "epoch": 1.7495668912415785, + "grad_norm": 1.0926804542541504, + "learning_rate": 7.681341387500784e-06, + "loss": 0.7587, + "step": 45445 + }, + { + "epoch": 1.7497593840230992, + "grad_norm": 1.2796525955200195, + "learning_rate": 7.669722953001113e-06, + "loss": 0.8821, + "step": 45450 + }, + { + "epoch": 1.7499518768046198, + "grad_norm": 0.9413089752197266, + "learning_rate": 7.658112961493602e-06, + "loss": 0.7892, + "step": 45455 + }, + { + "epoch": 1.7501443695861405, + "grad_norm": 1.1582038402557373, + "learning_rate": 7.646511414039981e-06, + "loss": 0.759, + "step": 45460 + }, + { + "epoch": 1.7503368623676612, + "grad_norm": 1.9622395038604736, + "learning_rate": 7.6349183117011e-06, + "loss": 0.7499, + "step": 45465 + }, + { + "epoch": 1.7505293551491818, + "grad_norm": 1.9417619705200195, + "learning_rate": 7.623333655537068e-06, + "loss": 0.8312, + "step": 45470 + }, + { + "epoch": 1.7507218479307025, + "grad_norm": 1.0554540157318115, + "learning_rate": 7.611757446607238e-06, + "loss": 0.8, + "step": 45475 + }, + { + "epoch": 1.7509143407122232, + "grad_norm": 0.9951033592224121, + "learning_rate": 7.6001896859701935e-06, + "loss": 0.7422, + "step": 45480 + }, + { + "epoch": 1.7511068334937439, + "grad_norm": 1.446395993232727, + "learning_rate": 7.58863037468367e-06, + "loss": 0.8357, + "step": 45485 + }, + { + "epoch": 1.7512993262752645, + "grad_norm": 1.0408679246902466, + "learning_rate": 7.577079513804764e-06, + "loss": 0.8824, + "step": 45490 + }, + { + "epoch": 1.7514918190567854, + "grad_norm": 2.0275330543518066, + "learning_rate": 7.565537104389675e-06, + "loss": 0.865, + "step": 45495 + }, + { + "epoch": 1.751684311838306, + "grad_norm": 0.5688580870628357, + "learning_rate": 7.5540031474938936e-06, + "loss": 0.7585, + "step": 45500 + }, + { + "epoch": 1.7518768046198268, + "grad_norm": 0.9278910160064697, + "learning_rate": 7.54247764417213e-06, + "loss": 0.8563, + "step": 45505 + }, + { + "epoch": 1.7520692974013474, + "grad_norm": 1.2069793939590454, + "learning_rate": 7.53096059547832e-06, + "loss": 0.8896, + "step": 45510 + }, + { + "epoch": 1.752261790182868, + "grad_norm": 1.3929320573806763, + "learning_rate": 7.51945200246561e-06, + "loss": 0.6949, + "step": 45515 + }, + { + "epoch": 1.752454282964389, + "grad_norm": 1.403387427330017, + "learning_rate": 7.507951866186369e-06, + "loss": 0.8077, + "step": 45520 + }, + { + "epoch": 1.7526467757459097, + "grad_norm": 1.3101508617401123, + "learning_rate": 7.496460187692233e-06, + "loss": 0.8342, + "step": 45525 + }, + { + "epoch": 1.7528392685274303, + "grad_norm": 1.3768970966339111, + "learning_rate": 7.484976968034041e-06, + "loss": 0.7386, + "step": 45530 + }, + { + "epoch": 1.753031761308951, + "grad_norm": 1.012230396270752, + "learning_rate": 7.473502208261862e-06, + "loss": 0.7605, + "step": 45535 + }, + { + "epoch": 1.7532242540904717, + "grad_norm": 0.9598119258880615, + "learning_rate": 7.462035909424947e-06, + "loss": 0.7581, + "step": 45540 + }, + { + "epoch": 1.7534167468719923, + "grad_norm": 1.7553858757019043, + "learning_rate": 7.450578072571857e-06, + "loss": 0.6762, + "step": 45545 + }, + { + "epoch": 1.753609239653513, + "grad_norm": 1.4688769578933716, + "learning_rate": 7.439128698750309e-06, + "loss": 0.661, + "step": 45550 + }, + { + "epoch": 1.7538017324350337, + "grad_norm": 1.9736151695251465, + "learning_rate": 7.427687789007299e-06, + "loss": 0.8019, + "step": 45555 + }, + { + "epoch": 1.7539942252165543, + "grad_norm": 0.9081621170043945, + "learning_rate": 7.416255344388967e-06, + "loss": 0.8455, + "step": 45560 + }, + { + "epoch": 1.754186717998075, + "grad_norm": 0.812998354434967, + "learning_rate": 7.404831365940812e-06, + "loss": 0.8729, + "step": 45565 + }, + { + "epoch": 1.7543792107795957, + "grad_norm": 1.4121613502502441, + "learning_rate": 7.393415854707419e-06, + "loss": 0.7995, + "step": 45570 + }, + { + "epoch": 1.7545717035611164, + "grad_norm": 1.363328456878662, + "learning_rate": 7.382008811732688e-06, + "loss": 0.7346, + "step": 45575 + }, + { + "epoch": 1.754764196342637, + "grad_norm": 1.5376365184783936, + "learning_rate": 7.370610238059716e-06, + "loss": 0.703, + "step": 45580 + }, + { + "epoch": 1.7549566891241577, + "grad_norm": 1.3588625192642212, + "learning_rate": 7.359220134730838e-06, + "loss": 0.8671, + "step": 45585 + }, + { + "epoch": 1.7551491819056786, + "grad_norm": 1.2630447149276733, + "learning_rate": 7.347838502787563e-06, + "loss": 0.8949, + "step": 45590 + }, + { + "epoch": 1.7553416746871993, + "grad_norm": 1.2255010604858398, + "learning_rate": 7.336465343270715e-06, + "loss": 0.8899, + "step": 45595 + }, + { + "epoch": 1.75553416746872, + "grad_norm": 0.93909752368927, + "learning_rate": 7.325100657220263e-06, + "loss": 0.6891, + "step": 45600 + }, + { + "epoch": 1.7557266602502406, + "grad_norm": 1.0947200059890747, + "learning_rate": 7.31374444567543e-06, + "loss": 0.7669, + "step": 45605 + }, + { + "epoch": 1.7559191530317613, + "grad_norm": 1.0405867099761963, + "learning_rate": 7.302396709674686e-06, + "loss": 0.9153, + "step": 45610 + }, + { + "epoch": 1.7561116458132822, + "grad_norm": 1.1569056510925293, + "learning_rate": 7.291057450255712e-06, + "loss": 0.7694, + "step": 45615 + }, + { + "epoch": 1.7563041385948028, + "grad_norm": 2.7271223068237305, + "learning_rate": 7.279726668455366e-06, + "loss": 0.9885, + "step": 45620 + }, + { + "epoch": 1.7564966313763235, + "grad_norm": 1.421797752380371, + "learning_rate": 7.268404365309822e-06, + "loss": 0.9199, + "step": 45625 + }, + { + "epoch": 1.7566891241578442, + "grad_norm": 1.7297556400299072, + "learning_rate": 7.257090541854405e-06, + "loss": 0.8034, + "step": 45630 + }, + { + "epoch": 1.7568816169393648, + "grad_norm": 1.0657942295074463, + "learning_rate": 7.245785199123689e-06, + "loss": 0.7211, + "step": 45635 + }, + { + "epoch": 1.7570741097208855, + "grad_norm": 1.2450711727142334, + "learning_rate": 7.23448833815148e-06, + "loss": 0.9191, + "step": 45640 + }, + { + "epoch": 1.7572666025024062, + "grad_norm": 1.1879651546478271, + "learning_rate": 7.223199959970784e-06, + "loss": 0.7244, + "step": 45645 + }, + { + "epoch": 1.7574590952839269, + "grad_norm": 1.0603852272033691, + "learning_rate": 7.211920065613875e-06, + "loss": 0.757, + "step": 45650 + }, + { + "epoch": 1.7576515880654475, + "grad_norm": 1.9512847661972046, + "learning_rate": 7.200648656112208e-06, + "loss": 0.6762, + "step": 45655 + }, + { + "epoch": 1.7578440808469682, + "grad_norm": 1.7029409408569336, + "learning_rate": 7.189385732496479e-06, + "loss": 0.7148, + "step": 45660 + }, + { + "epoch": 1.7580365736284889, + "grad_norm": 1.464708685874939, + "learning_rate": 7.178131295796609e-06, + "loss": 0.7853, + "step": 45665 + }, + { + "epoch": 1.7582290664100095, + "grad_norm": 1.0993046760559082, + "learning_rate": 7.166885347041763e-06, + "loss": 0.8098, + "step": 45670 + }, + { + "epoch": 1.7584215591915302, + "grad_norm": 1.787937045097351, + "learning_rate": 7.155647887260253e-06, + "loss": 0.9171, + "step": 45675 + }, + { + "epoch": 1.7586140519730509, + "grad_norm": 1.478559136390686, + "learning_rate": 7.144418917479734e-06, + "loss": 0.7519, + "step": 45680 + }, + { + "epoch": 1.7588065447545715, + "grad_norm": 1.5054091215133667, + "learning_rate": 7.133198438726985e-06, + "loss": 0.6784, + "step": 45685 + }, + { + "epoch": 1.7589990375360924, + "grad_norm": 1.190794587135315, + "learning_rate": 7.121986452028051e-06, + "loss": 0.7847, + "step": 45690 + }, + { + "epoch": 1.759191530317613, + "grad_norm": 1.474478006362915, + "learning_rate": 7.110782958408191e-06, + "loss": 0.7256, + "step": 45695 + }, + { + "epoch": 1.7593840230991338, + "grad_norm": 0.9803133010864258, + "learning_rate": 7.0995879588919176e-06, + "loss": 0.7769, + "step": 45700 + }, + { + "epoch": 1.7595765158806544, + "grad_norm": 1.1921695470809937, + "learning_rate": 7.08840145450288e-06, + "loss": 0.7835, + "step": 45705 + }, + { + "epoch": 1.7597690086621751, + "grad_norm": 0.8569211363792419, + "learning_rate": 7.077223446264081e-06, + "loss": 0.7431, + "step": 45710 + }, + { + "epoch": 1.759961501443696, + "grad_norm": 1.798243522644043, + "learning_rate": 7.066053935197625e-06, + "loss": 0.9124, + "step": 45715 + }, + { + "epoch": 1.7601539942252167, + "grad_norm": 1.0873985290527344, + "learning_rate": 7.054892922324896e-06, + "loss": 0.7808, + "step": 45720 + }, + { + "epoch": 1.7603464870067373, + "grad_norm": 0.899448573589325, + "learning_rate": 7.043740408666511e-06, + "loss": 0.6514, + "step": 45725 + }, + { + "epoch": 1.760538979788258, + "grad_norm": 0.8346114754676819, + "learning_rate": 7.032596395242308e-06, + "loss": 0.7472, + "step": 45730 + }, + { + "epoch": 1.7607314725697787, + "grad_norm": 1.0554609298706055, + "learning_rate": 7.021460883071296e-06, + "loss": 0.7476, + "step": 45735 + }, + { + "epoch": 1.7609239653512994, + "grad_norm": 2.069948196411133, + "learning_rate": 7.010333873171749e-06, + "loss": 0.8979, + "step": 45740 + }, + { + "epoch": 1.76111645813282, + "grad_norm": 1.2894495725631714, + "learning_rate": 6.999215366561174e-06, + "loss": 0.8933, + "step": 45745 + }, + { + "epoch": 1.7613089509143407, + "grad_norm": 0.8491725325584412, + "learning_rate": 6.988105364256281e-06, + "loss": 0.7834, + "step": 45750 + }, + { + "epoch": 1.7615014436958614, + "grad_norm": 1.3714752197265625, + "learning_rate": 6.977003867273024e-06, + "loss": 0.8987, + "step": 45755 + }, + { + "epoch": 1.761693936477382, + "grad_norm": 1.4427109956741333, + "learning_rate": 6.965910876626525e-06, + "loss": 0.7007, + "step": 45760 + }, + { + "epoch": 1.7618864292589027, + "grad_norm": 2.3159167766571045, + "learning_rate": 6.954826393331182e-06, + "loss": 0.7829, + "step": 45765 + }, + { + "epoch": 1.7620789220404234, + "grad_norm": 1.3657716512680054, + "learning_rate": 6.9437504184005964e-06, + "loss": 0.8195, + "step": 45770 + }, + { + "epoch": 1.762271414821944, + "grad_norm": 1.9438031911849976, + "learning_rate": 6.932682952847602e-06, + "loss": 0.851, + "step": 45775 + }, + { + "epoch": 1.7624639076034647, + "grad_norm": 1.6237000226974487, + "learning_rate": 6.9216239976842125e-06, + "loss": 0.7393, + "step": 45780 + }, + { + "epoch": 1.7626564003849856, + "grad_norm": 1.7042378187179565, + "learning_rate": 6.910573553921762e-06, + "loss": 0.7954, + "step": 45785 + }, + { + "epoch": 1.7628488931665063, + "grad_norm": 1.3898320198059082, + "learning_rate": 6.899531622570665e-06, + "loss": 0.7336, + "step": 45790 + }, + { + "epoch": 1.763041385948027, + "grad_norm": 0.9973713159561157, + "learning_rate": 6.888498204640681e-06, + "loss": 0.7636, + "step": 45795 + }, + { + "epoch": 1.7632338787295476, + "grad_norm": 1.350479006767273, + "learning_rate": 6.877473301140725e-06, + "loss": 0.7871, + "step": 45800 + }, + { + "epoch": 1.7634263715110683, + "grad_norm": 0.9988759756088257, + "learning_rate": 6.866456913078967e-06, + "loss": 0.7535, + "step": 45805 + }, + { + "epoch": 1.7636188642925892, + "grad_norm": 1.9600189924240112, + "learning_rate": 6.855449041462736e-06, + "loss": 0.8776, + "step": 45810 + }, + { + "epoch": 1.7638113570741099, + "grad_norm": 1.4199215173721313, + "learning_rate": 6.844449687298704e-06, + "loss": 0.8597, + "step": 45815 + }, + { + "epoch": 1.7640038498556305, + "grad_norm": 0.9643407464027405, + "learning_rate": 6.833458851592633e-06, + "loss": 0.8721, + "step": 45820 + }, + { + "epoch": 1.7641963426371512, + "grad_norm": 1.3059306144714355, + "learning_rate": 6.822476535349576e-06, + "loss": 0.8427, + "step": 45825 + }, + { + "epoch": 1.7643888354186719, + "grad_norm": 1.1498661041259766, + "learning_rate": 6.811502739573794e-06, + "loss": 0.8962, + "step": 45830 + }, + { + "epoch": 1.7645813282001925, + "grad_norm": 1.7601451873779297, + "learning_rate": 6.800537465268786e-06, + "loss": 0.6432, + "step": 45835 + }, + { + "epoch": 1.7647738209817132, + "grad_norm": 1.7763153314590454, + "learning_rate": 6.7895807134372265e-06, + "loss": 0.7724, + "step": 45840 + }, + { + "epoch": 1.7649663137632339, + "grad_norm": 0.8977097868919373, + "learning_rate": 6.778632485081038e-06, + "loss": 0.9375, + "step": 45845 + }, + { + "epoch": 1.7651588065447545, + "grad_norm": 2.03190541267395, + "learning_rate": 6.767692781201385e-06, + "loss": 0.8175, + "step": 45850 + }, + { + "epoch": 1.7653512993262752, + "grad_norm": 1.9310617446899414, + "learning_rate": 6.756761602798611e-06, + "loss": 0.7512, + "step": 45855 + }, + { + "epoch": 1.7655437921077959, + "grad_norm": 1.507844090461731, + "learning_rate": 6.7458389508723295e-06, + "loss": 0.989, + "step": 45860 + }, + { + "epoch": 1.7657362848893166, + "grad_norm": 1.0742762088775635, + "learning_rate": 6.734924826421296e-06, + "loss": 0.7757, + "step": 45865 + }, + { + "epoch": 1.7659287776708372, + "grad_norm": 3.061866283416748, + "learning_rate": 6.724019230443579e-06, + "loss": 0.8364, + "step": 45870 + }, + { + "epoch": 1.766121270452358, + "grad_norm": 1.1504889726638794, + "learning_rate": 6.713122163936392e-06, + "loss": 0.7237, + "step": 45875 + }, + { + "epoch": 1.7663137632338786, + "grad_norm": 0.9810017347335815, + "learning_rate": 6.702233627896237e-06, + "loss": 0.8952, + "step": 45880 + }, + { + "epoch": 1.7665062560153995, + "grad_norm": 1.719298005104065, + "learning_rate": 6.691353623318752e-06, + "loss": 0.9511, + "step": 45885 + }, + { + "epoch": 1.7666987487969201, + "grad_norm": 1.0837730169296265, + "learning_rate": 6.680482151198886e-06, + "loss": 1.0137, + "step": 45890 + }, + { + "epoch": 1.7668912415784408, + "grad_norm": 1.030633807182312, + "learning_rate": 6.669619212530709e-06, + "loss": 0.8197, + "step": 45895 + }, + { + "epoch": 1.7670837343599615, + "grad_norm": 1.0519740581512451, + "learning_rate": 6.658764808307638e-06, + "loss": 0.7061, + "step": 45900 + }, + { + "epoch": 1.7672762271414824, + "grad_norm": 2.0290040969848633, + "learning_rate": 6.647918939522168e-06, + "loss": 0.8013, + "step": 45905 + }, + { + "epoch": 1.767468719923003, + "grad_norm": 1.4919228553771973, + "learning_rate": 6.637081607166129e-06, + "loss": 0.7577, + "step": 45910 + }, + { + "epoch": 1.7676612127045237, + "grad_norm": 1.4189355373382568, + "learning_rate": 6.626252812230494e-06, + "loss": 0.6824, + "step": 45915 + }, + { + "epoch": 1.7678537054860444, + "grad_norm": 1.654653787612915, + "learning_rate": 6.615432555705503e-06, + "loss": 0.7618, + "step": 45920 + }, + { + "epoch": 1.768046198267565, + "grad_norm": 1.591550350189209, + "learning_rate": 6.604620838580566e-06, + "loss": 0.8461, + "step": 45925 + }, + { + "epoch": 1.7682386910490857, + "grad_norm": 1.0825369358062744, + "learning_rate": 6.593817661844393e-06, + "loss": 0.8319, + "step": 45930 + }, + { + "epoch": 1.7684311838306064, + "grad_norm": 1.893295407295227, + "learning_rate": 6.5830230264848245e-06, + "loss": 0.8209, + "step": 45935 + }, + { + "epoch": 1.768623676612127, + "grad_norm": 0.8829330205917358, + "learning_rate": 6.572236933488962e-06, + "loss": 0.7583, + "step": 45940 + }, + { + "epoch": 1.7688161693936477, + "grad_norm": 1.44454026222229, + "learning_rate": 6.561459383843138e-06, + "loss": 0.8129, + "step": 45945 + }, + { + "epoch": 1.7690086621751684, + "grad_norm": 1.0190069675445557, + "learning_rate": 6.5506903785328865e-06, + "loss": 0.7915, + "step": 45950 + }, + { + "epoch": 1.769201154956689, + "grad_norm": 1.2443015575408936, + "learning_rate": 6.539929918542953e-06, + "loss": 0.6906, + "step": 45955 + }, + { + "epoch": 1.7693936477382097, + "grad_norm": 1.2522984743118286, + "learning_rate": 6.529178004857295e-06, + "loss": 0.7546, + "step": 45960 + }, + { + "epoch": 1.7695861405197304, + "grad_norm": 0.4453613758087158, + "learning_rate": 6.5184346384591365e-06, + "loss": 0.7618, + "step": 45965 + }, + { + "epoch": 1.769778633301251, + "grad_norm": 1.1776552200317383, + "learning_rate": 6.507699820330859e-06, + "loss": 0.7245, + "step": 45970 + }, + { + "epoch": 1.7699711260827717, + "grad_norm": 0.9290987253189087, + "learning_rate": 6.4969735514541216e-06, + "loss": 0.8716, + "step": 45975 + }, + { + "epoch": 1.7701636188642926, + "grad_norm": 1.227397084236145, + "learning_rate": 6.48625583280974e-06, + "loss": 0.9719, + "step": 45980 + }, + { + "epoch": 1.7703561116458133, + "grad_norm": 1.5179232358932495, + "learning_rate": 6.4755466653777965e-06, + "loss": 0.7412, + "step": 45985 + }, + { + "epoch": 1.770548604427334, + "grad_norm": 1.2842813730239868, + "learning_rate": 6.464846050137552e-06, + "loss": 0.8631, + "step": 45990 + }, + { + "epoch": 1.7707410972088546, + "grad_norm": 1.102016806602478, + "learning_rate": 6.454153988067557e-06, + "loss": 0.7781, + "step": 45995 + }, + { + "epoch": 1.7709335899903753, + "grad_norm": 1.2582801580429077, + "learning_rate": 6.443470480145452e-06, + "loss": 0.8891, + "step": 46000 + }, + { + "epoch": 1.7711260827718962, + "grad_norm": 1.6584640741348267, + "learning_rate": 6.432795527348246e-06, + "loss": 0.845, + "step": 46005 + }, + { + "epoch": 1.7713185755534169, + "grad_norm": 0.9719191789627075, + "learning_rate": 6.422129130652055e-06, + "loss": 0.7511, + "step": 46010 + }, + { + "epoch": 1.7715110683349375, + "grad_norm": 0.968356192111969, + "learning_rate": 6.411471291032245e-06, + "loss": 0.8289, + "step": 46015 + }, + { + "epoch": 1.7717035611164582, + "grad_norm": 1.0656431913375854, + "learning_rate": 6.400822009463437e-06, + "loss": 0.8338, + "step": 46020 + }, + { + "epoch": 1.7718960538979789, + "grad_norm": 0.6920823454856873, + "learning_rate": 6.3901812869194165e-06, + "loss": 0.6625, + "step": 46025 + }, + { + "epoch": 1.7720885466794996, + "grad_norm": 1.6432719230651855, + "learning_rate": 6.379549124373185e-06, + "loss": 0.8689, + "step": 46030 + }, + { + "epoch": 1.7722810394610202, + "grad_norm": 1.784363865852356, + "learning_rate": 6.368925522797042e-06, + "loss": 0.8466, + "step": 46035 + }, + { + "epoch": 1.772473532242541, + "grad_norm": 0.9978721141815186, + "learning_rate": 6.358310483162389e-06, + "loss": 0.6407, + "step": 46040 + }, + { + "epoch": 1.7726660250240616, + "grad_norm": 1.793863296508789, + "learning_rate": 6.347704006439936e-06, + "loss": 0.7827, + "step": 46045 + }, + { + "epoch": 1.7728585178055822, + "grad_norm": 0.9259586930274963, + "learning_rate": 6.337106093599543e-06, + "loss": 0.8073, + "step": 46050 + }, + { + "epoch": 1.773051010587103, + "grad_norm": 1.971684455871582, + "learning_rate": 6.326516745610367e-06, + "loss": 0.7771, + "step": 46055 + }, + { + "epoch": 1.7732435033686236, + "grad_norm": 1.9362154006958008, + "learning_rate": 6.315935963440689e-06, + "loss": 0.9013, + "step": 46060 + }, + { + "epoch": 1.7734359961501442, + "grad_norm": 1.4866061210632324, + "learning_rate": 6.305363748058057e-06, + "loss": 0.9351, + "step": 46065 + }, + { + "epoch": 1.773628488931665, + "grad_norm": 0.8735952973365784, + "learning_rate": 6.2948001004292435e-06, + "loss": 0.7682, + "step": 46070 + }, + { + "epoch": 1.7738209817131858, + "grad_norm": 1.655439019203186, + "learning_rate": 6.2842450215202295e-06, + "loss": 0.9094, + "step": 46075 + }, + { + "epoch": 1.7740134744947065, + "grad_norm": 1.2115354537963867, + "learning_rate": 6.2736985122962e-06, + "loss": 0.8336, + "step": 46080 + }, + { + "epoch": 1.7742059672762271, + "grad_norm": 1.3702316284179688, + "learning_rate": 6.263160573721561e-06, + "loss": 0.7794, + "step": 46085 + }, + { + "epoch": 1.7743984600577478, + "grad_norm": 0.8656373023986816, + "learning_rate": 6.252631206759929e-06, + "loss": 0.6995, + "step": 46090 + }, + { + "epoch": 1.7745909528392685, + "grad_norm": 1.3100272417068481, + "learning_rate": 6.242110412374158e-06, + "loss": 1.0182, + "step": 46095 + }, + { + "epoch": 1.7747834456207894, + "grad_norm": 0.7958429455757141, + "learning_rate": 6.231598191526311e-06, + "loss": 0.7499, + "step": 46100 + }, + { + "epoch": 1.77497593840231, + "grad_norm": 1.7569608688354492, + "learning_rate": 6.221094545177619e-06, + "loss": 0.6442, + "step": 46105 + }, + { + "epoch": 1.7751684311838307, + "grad_norm": 1.1067934036254883, + "learning_rate": 6.2105994742886365e-06, + "loss": 0.7889, + "step": 46110 + }, + { + "epoch": 1.7753609239653514, + "grad_norm": 1.4211006164550781, + "learning_rate": 6.200112979819017e-06, + "loss": 0.7786, + "step": 46115 + }, + { + "epoch": 1.775553416746872, + "grad_norm": 0.9069014191627502, + "learning_rate": 6.189635062727695e-06, + "loss": 0.8214, + "step": 46120 + }, + { + "epoch": 1.7757459095283927, + "grad_norm": 1.2809123992919922, + "learning_rate": 6.179165723972824e-06, + "loss": 0.8923, + "step": 46125 + }, + { + "epoch": 1.7759384023099134, + "grad_norm": 1.3749951124191284, + "learning_rate": 6.16870496451174e-06, + "loss": 0.7566, + "step": 46130 + }, + { + "epoch": 1.776130895091434, + "grad_norm": 1.8189775943756104, + "learning_rate": 6.158252785300988e-06, + "loss": 0.9776, + "step": 46135 + }, + { + "epoch": 1.7763233878729547, + "grad_norm": 1.2725715637207031, + "learning_rate": 6.147809187296405e-06, + "loss": 0.7726, + "step": 46140 + }, + { + "epoch": 1.7765158806544754, + "grad_norm": 1.479418396949768, + "learning_rate": 6.137374171452948e-06, + "loss": 0.8255, + "step": 46145 + }, + { + "epoch": 1.776708373435996, + "grad_norm": 0.9075431227684021, + "learning_rate": 6.1269477387248306e-06, + "loss": 0.9074, + "step": 46150 + }, + { + "epoch": 1.7769008662175168, + "grad_norm": 1.1781271696090698, + "learning_rate": 6.116529890065492e-06, + "loss": 0.7702, + "step": 46155 + }, + { + "epoch": 1.7770933589990374, + "grad_norm": 1.5959038734436035, + "learning_rate": 6.1061206264275805e-06, + "loss": 0.7611, + "step": 46160 + }, + { + "epoch": 1.777285851780558, + "grad_norm": 1.5484418869018555, + "learning_rate": 6.095719948762934e-06, + "loss": 0.8188, + "step": 46165 + }, + { + "epoch": 1.7774783445620788, + "grad_norm": 1.0949671268463135, + "learning_rate": 6.085327858022672e-06, + "loss": 0.7638, + "step": 46170 + }, + { + "epoch": 1.7776708373435997, + "grad_norm": 1.3612221479415894, + "learning_rate": 6.07494435515702e-06, + "loss": 0.8475, + "step": 46175 + }, + { + "epoch": 1.7778633301251203, + "grad_norm": 1.385720133781433, + "learning_rate": 6.06456944111552e-06, + "loss": 0.7755, + "step": 46180 + }, + { + "epoch": 1.778055822906641, + "grad_norm": 1.639599323272705, + "learning_rate": 6.05420311684689e-06, + "loss": 0.7855, + "step": 46185 + }, + { + "epoch": 1.7782483156881617, + "grad_norm": 1.1654934883117676, + "learning_rate": 6.043845383299018e-06, + "loss": 0.9169, + "step": 46190 + }, + { + "epoch": 1.7784408084696823, + "grad_norm": 0.8145331740379333, + "learning_rate": 6.033496241419112e-06, + "loss": 0.7727, + "step": 46195 + }, + { + "epoch": 1.7786333012512032, + "grad_norm": 1.7755072116851807, + "learning_rate": 6.0231556921534925e-06, + "loss": 0.9423, + "step": 46200 + }, + { + "epoch": 1.778825794032724, + "grad_norm": 0.9639081358909607, + "learning_rate": 6.012823736447748e-06, + "loss": 0.7255, + "step": 46205 + }, + { + "epoch": 1.7790182868142446, + "grad_norm": 1.6836614608764648, + "learning_rate": 6.002500375246667e-06, + "loss": 0.7516, + "step": 46210 + }, + { + "epoch": 1.7792107795957652, + "grad_norm": 0.9299353957176208, + "learning_rate": 5.9921856094942604e-06, + "loss": 0.7697, + "step": 46215 + }, + { + "epoch": 1.779403272377286, + "grad_norm": 1.0631067752838135, + "learning_rate": 5.981879440133709e-06, + "loss": 0.732, + "step": 46220 + }, + { + "epoch": 1.7795957651588066, + "grad_norm": 1.403578519821167, + "learning_rate": 5.97158186810749e-06, + "loss": 0.7957, + "step": 46225 + }, + { + "epoch": 1.7797882579403272, + "grad_norm": 2.0105881690979004, + "learning_rate": 5.961292894357217e-06, + "loss": 0.681, + "step": 46230 + }, + { + "epoch": 1.779980750721848, + "grad_norm": 1.8827532529830933, + "learning_rate": 5.951012519823762e-06, + "loss": 0.9873, + "step": 46235 + }, + { + "epoch": 1.7801732435033686, + "grad_norm": 0.4478518068790436, + "learning_rate": 5.940740745447193e-06, + "loss": 0.7578, + "step": 46240 + }, + { + "epoch": 1.7803657362848893, + "grad_norm": 1.727774977684021, + "learning_rate": 5.930477572166815e-06, + "loss": 0.7855, + "step": 46245 + }, + { + "epoch": 1.78055822906641, + "grad_norm": 0.8507458567619324, + "learning_rate": 5.920223000921077e-06, + "loss": 0.7533, + "step": 46250 + }, + { + "epoch": 1.7807507218479306, + "grad_norm": 1.1110725402832031, + "learning_rate": 5.909977032647762e-06, + "loss": 0.9928, + "step": 46255 + }, + { + "epoch": 1.7809432146294513, + "grad_norm": 1.1344168186187744, + "learning_rate": 5.899739668283732e-06, + "loss": 0.7474, + "step": 46260 + }, + { + "epoch": 1.781135707410972, + "grad_norm": 1.6794506311416626, + "learning_rate": 5.8895109087651616e-06, + "loss": 0.7804, + "step": 46265 + }, + { + "epoch": 1.7813282001924928, + "grad_norm": 1.0263005495071411, + "learning_rate": 5.879290755027378e-06, + "loss": 0.8814, + "step": 46270 + }, + { + "epoch": 1.7815206929740135, + "grad_norm": 1.7495372295379639, + "learning_rate": 5.869079208004991e-06, + "loss": 0.9627, + "step": 46275 + }, + { + "epoch": 1.7817131857555342, + "grad_norm": 1.242300271987915, + "learning_rate": 5.85887626863173e-06, + "loss": 0.8365, + "step": 46280 + }, + { + "epoch": 1.7819056785370548, + "grad_norm": 2.084526777267456, + "learning_rate": 5.848681937840605e-06, + "loss": 0.8406, + "step": 46285 + }, + { + "epoch": 1.7820981713185755, + "grad_norm": 0.9705522656440735, + "learning_rate": 5.8384962165638265e-06, + "loss": 0.7655, + "step": 46290 + }, + { + "epoch": 1.7822906641000964, + "grad_norm": 1.0004565715789795, + "learning_rate": 5.8283191057328045e-06, + "loss": 0.7498, + "step": 46295 + }, + { + "epoch": 1.782483156881617, + "grad_norm": 1.0826503038406372, + "learning_rate": 5.818150606278183e-06, + "loss": 0.8392, + "step": 46300 + }, + { + "epoch": 1.7826756496631377, + "grad_norm": 2.7211012840270996, + "learning_rate": 5.807990719129786e-06, + "loss": 0.8496, + "step": 46305 + }, + { + "epoch": 1.7828681424446584, + "grad_norm": 2.3229408264160156, + "learning_rate": 5.797839445216657e-06, + "loss": 0.7426, + "step": 46310 + }, + { + "epoch": 1.783060635226179, + "grad_norm": 2.3727757930755615, + "learning_rate": 5.78769678546709e-06, + "loss": 0.7056, + "step": 46315 + }, + { + "epoch": 1.7832531280076998, + "grad_norm": 1.557446837425232, + "learning_rate": 5.777562740808562e-06, + "loss": 0.8112, + "step": 46320 + }, + { + "epoch": 1.7834456207892204, + "grad_norm": 1.3999435901641846, + "learning_rate": 5.7674373121677226e-06, + "loss": 0.7352, + "step": 46325 + }, + { + "epoch": 1.783638113570741, + "grad_norm": 1.531625509262085, + "learning_rate": 5.757320500470542e-06, + "loss": 0.6754, + "step": 46330 + }, + { + "epoch": 1.7838306063522618, + "grad_norm": 1.059308648109436, + "learning_rate": 5.74721230664208e-06, + "loss": 0.7609, + "step": 46335 + }, + { + "epoch": 1.7840230991337824, + "grad_norm": 1.5941221714019775, + "learning_rate": 5.737112731606698e-06, + "loss": 0.8795, + "step": 46340 + }, + { + "epoch": 1.784215591915303, + "grad_norm": 1.1354868412017822, + "learning_rate": 5.7270217762879105e-06, + "loss": 0.7511, + "step": 46345 + }, + { + "epoch": 1.7844080846968238, + "grad_norm": 1.1096879243850708, + "learning_rate": 5.716939441608504e-06, + "loss": 0.6635, + "step": 46350 + }, + { + "epoch": 1.7846005774783444, + "grad_norm": 2.472426176071167, + "learning_rate": 5.706865728490373e-06, + "loss": 0.805, + "step": 46355 + }, + { + "epoch": 1.784793070259865, + "grad_norm": 1.060925841331482, + "learning_rate": 5.696800637854782e-06, + "loss": 0.8253, + "step": 46360 + }, + { + "epoch": 1.784985563041386, + "grad_norm": 1.1766963005065918, + "learning_rate": 5.686744170622049e-06, + "loss": 0.6249, + "step": 46365 + }, + { + "epoch": 1.7851780558229067, + "grad_norm": 1.531556248664856, + "learning_rate": 5.676696327711795e-06, + "loss": 0.7294, + "step": 46370 + }, + { + "epoch": 1.7853705486044273, + "grad_norm": 1.9163962602615356, + "learning_rate": 5.666657110042828e-06, + "loss": 0.7667, + "step": 46375 + }, + { + "epoch": 1.785563041385948, + "grad_norm": 1.7495805025100708, + "learning_rate": 5.65662651853317e-06, + "loss": 0.8198, + "step": 46380 + }, + { + "epoch": 1.7857555341674687, + "grad_norm": 0.7979353666305542, + "learning_rate": 5.6466045541000546e-06, + "loss": 0.7781, + "step": 46385 + }, + { + "epoch": 1.7859480269489896, + "grad_norm": 1.199220895767212, + "learning_rate": 5.636591217659903e-06, + "loss": 0.8034, + "step": 46390 + }, + { + "epoch": 1.7861405197305102, + "grad_norm": 1.2027233839035034, + "learning_rate": 5.626586510128384e-06, + "loss": 0.7879, + "step": 46395 + }, + { + "epoch": 1.786333012512031, + "grad_norm": 0.7976201772689819, + "learning_rate": 5.616590432420376e-06, + "loss": 0.6871, + "step": 46400 + }, + { + "epoch": 1.7865255052935516, + "grad_norm": 1.0275331735610962, + "learning_rate": 5.606602985449949e-06, + "loss": 0.7664, + "step": 46405 + }, + { + "epoch": 1.7867179980750723, + "grad_norm": 2.2883615493774414, + "learning_rate": 5.59662417013036e-06, + "loss": 0.957, + "step": 46410 + }, + { + "epoch": 1.786910490856593, + "grad_norm": 0.8585185408592224, + "learning_rate": 5.586653987374125e-06, + "loss": 0.6444, + "step": 46415 + }, + { + "epoch": 1.7871029836381136, + "grad_norm": 2.087956666946411, + "learning_rate": 5.576692438092956e-06, + "loss": 0.8373, + "step": 46420 + }, + { + "epoch": 1.7872954764196343, + "grad_norm": 1.8517224788665771, + "learning_rate": 5.566739523197762e-06, + "loss": 0.8641, + "step": 46425 + }, + { + "epoch": 1.787487969201155, + "grad_norm": 1.3989286422729492, + "learning_rate": 5.556795243598678e-06, + "loss": 0.8464, + "step": 46430 + }, + { + "epoch": 1.7876804619826756, + "grad_norm": 1.8883378505706787, + "learning_rate": 5.546859600205057e-06, + "loss": 0.8401, + "step": 46435 + }, + { + "epoch": 1.7878729547641963, + "grad_norm": 1.1194407939910889, + "learning_rate": 5.536932593925403e-06, + "loss": 0.8559, + "step": 46440 + }, + { + "epoch": 1.788065447545717, + "grad_norm": 1.501118779182434, + "learning_rate": 5.527014225667526e-06, + "loss": 0.7472, + "step": 46445 + }, + { + "epoch": 1.7882579403272376, + "grad_norm": 1.1292697191238403, + "learning_rate": 5.517104496338365e-06, + "loss": 0.8151, + "step": 46450 + }, + { + "epoch": 1.7884504331087583, + "grad_norm": 1.5353429317474365, + "learning_rate": 5.507203406844097e-06, + "loss": 0.693, + "step": 46455 + }, + { + "epoch": 1.788642925890279, + "grad_norm": 1.1052043437957764, + "learning_rate": 5.497310958090129e-06, + "loss": 0.74, + "step": 46460 + }, + { + "epoch": 1.7888354186717998, + "grad_norm": 1.2323806285858154, + "learning_rate": 5.487427150981061e-06, + "loss": 0.7457, + "step": 46465 + }, + { + "epoch": 1.7890279114533205, + "grad_norm": 1.003144383430481, + "learning_rate": 5.477551986420659e-06, + "loss": 0.6708, + "step": 46470 + }, + { + "epoch": 1.7892204042348412, + "grad_norm": 1.9035112857818604, + "learning_rate": 5.467685465312e-06, + "loss": 0.7999, + "step": 46475 + }, + { + "epoch": 1.7894128970163619, + "grad_norm": 2.7132394313812256, + "learning_rate": 5.457827588557285e-06, + "loss": 0.7343, + "step": 46480 + }, + { + "epoch": 1.7896053897978825, + "grad_norm": 2.29404878616333, + "learning_rate": 5.4479783570579366e-06, + "loss": 0.8014, + "step": 46485 + }, + { + "epoch": 1.7897978825794034, + "grad_norm": 1.4308083057403564, + "learning_rate": 5.438137771714635e-06, + "loss": 0.888, + "step": 46490 + }, + { + "epoch": 1.789990375360924, + "grad_norm": 1.0539064407348633, + "learning_rate": 5.428305833427216e-06, + "loss": 0.8976, + "step": 46495 + }, + { + "epoch": 1.7901828681424448, + "grad_norm": 1.6149170398712158, + "learning_rate": 5.41848254309475e-06, + "loss": 0.8853, + "step": 46500 + }, + { + "epoch": 1.7903753609239654, + "grad_norm": 0.8334898948669434, + "learning_rate": 5.408667901615494e-06, + "loss": 0.6841, + "step": 46505 + }, + { + "epoch": 1.790567853705486, + "grad_norm": 1.9171321392059326, + "learning_rate": 5.398861909886965e-06, + "loss": 0.6301, + "step": 46510 + }, + { + "epoch": 1.7907603464870068, + "grad_norm": 1.2018779516220093, + "learning_rate": 5.391023344927259e-06, + "loss": 1.3233, + "step": 46515 + }, + { + "epoch": 1.7909528392685274, + "grad_norm": 1.1932569742202759, + "learning_rate": 5.381232925009128e-06, + "loss": 0.7556, + "step": 46520 + }, + { + "epoch": 1.7911453320500481, + "grad_norm": 0.987067699432373, + "learning_rate": 5.37145115735046e-06, + "loss": 0.6867, + "step": 46525 + }, + { + "epoch": 1.7913378248315688, + "grad_norm": 1.0372260808944702, + "learning_rate": 5.361678042845731e-06, + "loss": 0.7445, + "step": 46530 + }, + { + "epoch": 1.7915303176130895, + "grad_norm": 0.9253237843513489, + "learning_rate": 5.351913582388635e-06, + "loss": 0.7732, + "step": 46535 + }, + { + "epoch": 1.7917228103946101, + "grad_norm": 0.9296650886535645, + "learning_rate": 5.342157776872025e-06, + "loss": 0.7397, + "step": 46540 + }, + { + "epoch": 1.7919153031761308, + "grad_norm": 2.767267942428589, + "learning_rate": 5.332410627188067e-06, + "loss": 0.7794, + "step": 46545 + }, + { + "epoch": 1.7921077959576515, + "grad_norm": 1.5188485383987427, + "learning_rate": 5.322672134228024e-06, + "loss": 0.7275, + "step": 46550 + }, + { + "epoch": 1.7923002887391721, + "grad_norm": 1.1152452230453491, + "learning_rate": 5.312942298882439e-06, + "loss": 0.7639, + "step": 46555 + }, + { + "epoch": 1.792492781520693, + "grad_norm": 1.1599907875061035, + "learning_rate": 5.303221122041036e-06, + "loss": 0.7895, + "step": 46560 + }, + { + "epoch": 1.7926852743022137, + "grad_norm": 1.0555464029312134, + "learning_rate": 5.293508604592768e-06, + "loss": 0.8518, + "step": 46565 + }, + { + "epoch": 1.7928777670837344, + "grad_norm": 1.0384188890457153, + "learning_rate": 5.283804747425747e-06, + "loss": 0.7783, + "step": 46570 + }, + { + "epoch": 1.793070259865255, + "grad_norm": 1.148019790649414, + "learning_rate": 5.274109551427342e-06, + "loss": 0.8909, + "step": 46575 + }, + { + "epoch": 1.7932627526467757, + "grad_norm": 2.0219950675964355, + "learning_rate": 5.264423017484122e-06, + "loss": 0.7428, + "step": 46580 + }, + { + "epoch": 1.7934552454282966, + "grad_norm": 1.3461076021194458, + "learning_rate": 5.254745146481843e-06, + "loss": 0.675, + "step": 46585 + }, + { + "epoch": 1.7936477382098173, + "grad_norm": 1.6037160158157349, + "learning_rate": 5.2450759393055104e-06, + "loss": 0.8687, + "step": 46590 + }, + { + "epoch": 1.793840230991338, + "grad_norm": 0.929437518119812, + "learning_rate": 5.235415396839283e-06, + "loss": 0.9081, + "step": 46595 + }, + { + "epoch": 1.7940327237728586, + "grad_norm": 0.9498231410980225, + "learning_rate": 5.225763519966542e-06, + "loss": 0.7378, + "step": 46600 + }, + { + "epoch": 1.7942252165543793, + "grad_norm": 1.3678570985794067, + "learning_rate": 5.216120309569917e-06, + "loss": 1.001, + "step": 46605 + }, + { + "epoch": 1.7944177093359, + "grad_norm": 1.7113560438156128, + "learning_rate": 5.206485766531222e-06, + "loss": 0.8138, + "step": 46610 + }, + { + "epoch": 1.7946102021174206, + "grad_norm": 1.4284687042236328, + "learning_rate": 5.19685989173142e-06, + "loss": 0.7692, + "step": 46615 + }, + { + "epoch": 1.7948026948989413, + "grad_norm": 1.2268130779266357, + "learning_rate": 5.1872426860507975e-06, + "loss": 0.7408, + "step": 46620 + }, + { + "epoch": 1.794995187680462, + "grad_norm": 1.4930412769317627, + "learning_rate": 5.177634150368738e-06, + "loss": 0.9012, + "step": 46625 + }, + { + "epoch": 1.7951876804619826, + "grad_norm": 1.3684518337249756, + "learning_rate": 5.1680342855638945e-06, + "loss": 0.927, + "step": 46630 + }, + { + "epoch": 1.7953801732435033, + "grad_norm": 1.5305689573287964, + "learning_rate": 5.158443092514109e-06, + "loss": 0.8608, + "step": 46635 + }, + { + "epoch": 1.795572666025024, + "grad_norm": 1.6696339845657349, + "learning_rate": 5.148860572096459e-06, + "loss": 0.8078, + "step": 46640 + }, + { + "epoch": 1.7957651588065446, + "grad_norm": 1.3109546899795532, + "learning_rate": 5.139286725187143e-06, + "loss": 0.7532, + "step": 46645 + }, + { + "epoch": 1.7959576515880653, + "grad_norm": 1.0849995613098145, + "learning_rate": 5.129721552661681e-06, + "loss": 0.9083, + "step": 46650 + }, + { + "epoch": 1.796150144369586, + "grad_norm": 2.3269553184509277, + "learning_rate": 5.120165055394721e-06, + "loss": 0.7511, + "step": 46655 + }, + { + "epoch": 1.7963426371511069, + "grad_norm": 1.75252103805542, + "learning_rate": 5.110617234260151e-06, + "loss": 0.7646, + "step": 46660 + }, + { + "epoch": 1.7965351299326275, + "grad_norm": 1.0314127206802368, + "learning_rate": 5.1010780901310395e-06, + "loss": 0.6695, + "step": 46665 + }, + { + "epoch": 1.7967276227141482, + "grad_norm": 1.0665658712387085, + "learning_rate": 5.091547623879711e-06, + "loss": 0.8726, + "step": 46670 + }, + { + "epoch": 1.7969201154956689, + "grad_norm": 1.8659719228744507, + "learning_rate": 5.082025836377624e-06, + "loss": 0.8587, + "step": 46675 + }, + { + "epoch": 1.7971126082771898, + "grad_norm": 1.0363880395889282, + "learning_rate": 5.072512728495493e-06, + "loss": 0.7135, + "step": 46680 + }, + { + "epoch": 1.7973051010587104, + "grad_norm": 1.0198173522949219, + "learning_rate": 5.063008301103245e-06, + "loss": 0.8571, + "step": 46685 + }, + { + "epoch": 1.7974975938402311, + "grad_norm": 1.313674807548523, + "learning_rate": 5.0535125550699834e-06, + "loss": 0.8776, + "step": 46690 + }, + { + "epoch": 1.7976900866217518, + "grad_norm": 0.9799014925956726, + "learning_rate": 5.044025491264049e-06, + "loss": 0.8095, + "step": 46695 + }, + { + "epoch": 1.7978825794032725, + "grad_norm": 0.9720066785812378, + "learning_rate": 5.0345471105529345e-06, + "loss": 0.7772, + "step": 46700 + }, + { + "epoch": 1.7980750721847931, + "grad_norm": 1.2402182817459106, + "learning_rate": 5.025077413803425e-06, + "loss": 0.7378, + "step": 46705 + }, + { + "epoch": 1.7982675649663138, + "grad_norm": 1.4397085905075073, + "learning_rate": 5.015616401881418e-06, + "loss": 0.8355, + "step": 46710 + }, + { + "epoch": 1.7984600577478345, + "grad_norm": 1.2105292081832886, + "learning_rate": 5.0061640756520754e-06, + "loss": 0.8271, + "step": 46715 + }, + { + "epoch": 1.7986525505293551, + "grad_norm": 1.2763471603393555, + "learning_rate": 4.996720435979763e-06, + "loss": 0.7776, + "step": 46720 + }, + { + "epoch": 1.7988450433108758, + "grad_norm": 1.6655856370925903, + "learning_rate": 4.987285483728033e-06, + "loss": 0.7638, + "step": 46725 + }, + { + "epoch": 1.7990375360923965, + "grad_norm": 1.3366689682006836, + "learning_rate": 4.977859219759617e-06, + "loss": 0.6957, + "step": 46730 + }, + { + "epoch": 1.7992300288739171, + "grad_norm": 0.9900131225585938, + "learning_rate": 4.968441644936539e-06, + "loss": 0.7898, + "step": 46735 + }, + { + "epoch": 1.7994225216554378, + "grad_norm": 1.191714882850647, + "learning_rate": 4.95903276011993e-06, + "loss": 0.8838, + "step": 46740 + }, + { + "epoch": 1.7996150144369585, + "grad_norm": 1.0521550178527832, + "learning_rate": 4.949632566170181e-06, + "loss": 0.7767, + "step": 46745 + }, + { + "epoch": 1.7998075072184792, + "grad_norm": 1.6969459056854248, + "learning_rate": 4.940241063946893e-06, + "loss": 0.8962, + "step": 46750 + }, + { + "epoch": 1.8, + "grad_norm": 1.4018580913543701, + "learning_rate": 4.930858254308856e-06, + "loss": 0.8324, + "step": 46755 + }, + { + "epoch": 1.8001924927815207, + "grad_norm": 1.4890011548995972, + "learning_rate": 4.921484138114029e-06, + "loss": 0.6562, + "step": 46760 + }, + { + "epoch": 1.8003849855630414, + "grad_norm": 1.6725586652755737, + "learning_rate": 4.912118716219672e-06, + "loss": 0.7311, + "step": 46765 + }, + { + "epoch": 1.800577478344562, + "grad_norm": 1.5450921058654785, + "learning_rate": 4.9027619894821404e-06, + "loss": 0.8046, + "step": 46770 + }, + { + "epoch": 1.8007699711260827, + "grad_norm": 1.5963780879974365, + "learning_rate": 4.893413958757065e-06, + "loss": 0.7562, + "step": 46775 + }, + { + "epoch": 1.8009624639076036, + "grad_norm": 2.59673810005188, + "learning_rate": 4.88407462489926e-06, + "loss": 0.7906, + "step": 46780 + }, + { + "epoch": 1.8011549566891243, + "grad_norm": 1.0897094011306763, + "learning_rate": 4.874743988762753e-06, + "loss": 0.6483, + "step": 46785 + }, + { + "epoch": 1.801347449470645, + "grad_norm": 1.5644094944000244, + "learning_rate": 4.865422051200752e-06, + "loss": 0.668, + "step": 46790 + }, + { + "epoch": 1.8015399422521656, + "grad_norm": 1.7178467512130737, + "learning_rate": 4.856108813065696e-06, + "loss": 0.7188, + "step": 46795 + }, + { + "epoch": 1.8017324350336863, + "grad_norm": 1.8276166915893555, + "learning_rate": 4.846804275209216e-06, + "loss": 0.7897, + "step": 46800 + }, + { + "epoch": 1.801924927815207, + "grad_norm": 1.107142448425293, + "learning_rate": 4.8375084384821635e-06, + "loss": 0.6513, + "step": 46805 + }, + { + "epoch": 1.8021174205967276, + "grad_norm": 1.1322238445281982, + "learning_rate": 4.82822130373457e-06, + "loss": 0.8737, + "step": 46810 + }, + { + "epoch": 1.8023099133782483, + "grad_norm": 1.5215245485305786, + "learning_rate": 4.818942871815679e-06, + "loss": 0.7255, + "step": 46815 + }, + { + "epoch": 1.802502406159769, + "grad_norm": 1.0915231704711914, + "learning_rate": 4.809673143573934e-06, + "loss": 0.6991, + "step": 46820 + }, + { + "epoch": 1.8026948989412896, + "grad_norm": 1.1544078588485718, + "learning_rate": 4.800412119857012e-06, + "loss": 0.8788, + "step": 46825 + }, + { + "epoch": 1.8028873917228103, + "grad_norm": 1.521331787109375, + "learning_rate": 4.791159801511769e-06, + "loss": 0.8318, + "step": 46830 + }, + { + "epoch": 1.803079884504331, + "grad_norm": 1.9487007856369019, + "learning_rate": 4.781916189384239e-06, + "loss": 0.7977, + "step": 46835 + }, + { + "epoch": 1.8032723772858517, + "grad_norm": 0.9875978827476501, + "learning_rate": 4.772681284319736e-06, + "loss": 0.7773, + "step": 46840 + }, + { + "epoch": 1.8034648700673723, + "grad_norm": 0.9986359477043152, + "learning_rate": 4.7634550871626935e-06, + "loss": 0.7418, + "step": 46845 + }, + { + "epoch": 1.8036573628488932, + "grad_norm": 0.9883870482444763, + "learning_rate": 4.754237598756806e-06, + "loss": 0.6378, + "step": 46850 + }, + { + "epoch": 1.803849855630414, + "grad_norm": 1.2948468923568726, + "learning_rate": 4.745028819944941e-06, + "loss": 0.6721, + "step": 46855 + }, + { + "epoch": 1.8040423484119346, + "grad_norm": 1.422836422920227, + "learning_rate": 4.735828751569194e-06, + "loss": 0.77, + "step": 46860 + }, + { + "epoch": 1.8042348411934552, + "grad_norm": 0.956969678401947, + "learning_rate": 4.726637394470812e-06, + "loss": 0.7077, + "step": 46865 + }, + { + "epoch": 1.804427333974976, + "grad_norm": 1.3407295942306519, + "learning_rate": 4.717454749490336e-06, + "loss": 0.7287, + "step": 46870 + }, + { + "epoch": 1.8046198267564968, + "grad_norm": 1.0364344120025635, + "learning_rate": 4.7082808174674255e-06, + "loss": 0.8322, + "step": 46875 + }, + { + "epoch": 1.8048123195380175, + "grad_norm": 1.2396444082260132, + "learning_rate": 4.6991155992409885e-06, + "loss": 0.6264, + "step": 46880 + }, + { + "epoch": 1.8050048123195381, + "grad_norm": 1.7426707744598389, + "learning_rate": 4.6899590956491105e-06, + "loss": 0.6881, + "step": 46885 + }, + { + "epoch": 1.8051973051010588, + "grad_norm": 1.3045077323913574, + "learning_rate": 4.68081130752912e-06, + "loss": 0.7758, + "step": 46890 + }, + { + "epoch": 1.8053897978825795, + "grad_norm": 1.462531566619873, + "learning_rate": 4.671672235717494e-06, + "loss": 0.9664, + "step": 46895 + }, + { + "epoch": 1.8055822906641001, + "grad_norm": 1.063500165939331, + "learning_rate": 4.662541881049942e-06, + "loss": 0.8255, + "step": 46900 + }, + { + "epoch": 1.8057747834456208, + "grad_norm": 1.0657175779342651, + "learning_rate": 4.653420244361395e-06, + "loss": 0.8206, + "step": 46905 + }, + { + "epoch": 1.8059672762271415, + "grad_norm": 1.0056036710739136, + "learning_rate": 4.644307326485941e-06, + "loss": 0.8588, + "step": 46910 + }, + { + "epoch": 1.8061597690086622, + "grad_norm": 1.1666234731674194, + "learning_rate": 4.635203128256927e-06, + "loss": 0.5983, + "step": 46915 + }, + { + "epoch": 1.8063522617901828, + "grad_norm": 1.6504344940185547, + "learning_rate": 4.626107650506839e-06, + "loss": 0.7481, + "step": 46920 + }, + { + "epoch": 1.8065447545717035, + "grad_norm": 1.4112893342971802, + "learning_rate": 4.617020894067403e-06, + "loss": 0.9829, + "step": 46925 + }, + { + "epoch": 1.8067372473532242, + "grad_norm": 1.058632493019104, + "learning_rate": 4.607942859769565e-06, + "loss": 0.7069, + "step": 46930 + }, + { + "epoch": 1.8069297401347448, + "grad_norm": 1.1201543807983398, + "learning_rate": 4.598873548443427e-06, + "loss": 0.8061, + "step": 46935 + }, + { + "epoch": 1.8071222329162655, + "grad_norm": 0.8915508389472961, + "learning_rate": 4.589812960918338e-06, + "loss": 0.82, + "step": 46940 + }, + { + "epoch": 1.8073147256977862, + "grad_norm": 2.371730327606201, + "learning_rate": 4.580761098022835e-06, + "loss": 0.8294, + "step": 46945 + }, + { + "epoch": 1.807507218479307, + "grad_norm": 1.3790137767791748, + "learning_rate": 4.571717960584598e-06, + "loss": 0.7928, + "step": 46950 + }, + { + "epoch": 1.8076997112608277, + "grad_norm": 1.2915151119232178, + "learning_rate": 4.562683549430624e-06, + "loss": 0.7858, + "step": 46955 + }, + { + "epoch": 1.8078922040423484, + "grad_norm": 1.653926968574524, + "learning_rate": 4.553657865387018e-06, + "loss": 0.8588, + "step": 46960 + }, + { + "epoch": 1.808084696823869, + "grad_norm": 1.2625577449798584, + "learning_rate": 4.54464090927913e-06, + "loss": 0.8673, + "step": 46965 + }, + { + "epoch": 1.8082771896053897, + "grad_norm": 0.9362401962280273, + "learning_rate": 4.5356326819314894e-06, + "loss": 0.7839, + "step": 46970 + }, + { + "epoch": 1.8084696823869106, + "grad_norm": 1.078352451324463, + "learning_rate": 4.526633184167861e-06, + "loss": 0.8862, + "step": 46975 + }, + { + "epoch": 1.8086621751684313, + "grad_norm": 1.3022172451019287, + "learning_rate": 4.517642416811152e-06, + "loss": 0.8553, + "step": 46980 + }, + { + "epoch": 1.808854667949952, + "grad_norm": 1.2438225746154785, + "learning_rate": 4.50866038068356e-06, + "loss": 0.7513, + "step": 46985 + }, + { + "epoch": 1.8090471607314726, + "grad_norm": 0.9540335536003113, + "learning_rate": 4.4996870766063845e-06, + "loss": 0.7287, + "step": 46990 + }, + { + "epoch": 1.8092396535129933, + "grad_norm": 1.5209485292434692, + "learning_rate": 4.490722505400191e-06, + "loss": 0.8749, + "step": 46995 + }, + { + "epoch": 1.809432146294514, + "grad_norm": 1.1515183448791504, + "learning_rate": 4.481766667884757e-06, + "loss": 0.8147, + "step": 47000 + }, + { + "epoch": 1.8096246390760347, + "grad_norm": 1.2854846715927124, + "learning_rate": 4.472819564878994e-06, + "loss": 0.8026, + "step": 47005 + }, + { + "epoch": 1.8098171318575553, + "grad_norm": 1.7539061307907104, + "learning_rate": 4.4638811972010696e-06, + "loss": 0.8365, + "step": 47010 + }, + { + "epoch": 1.810009624639076, + "grad_norm": 1.8201215267181396, + "learning_rate": 4.454951565668341e-06, + "loss": 0.9819, + "step": 47015 + }, + { + "epoch": 1.8102021174205967, + "grad_norm": 1.3828089237213135, + "learning_rate": 4.4460306710973786e-06, + "loss": 0.7404, + "step": 47020 + }, + { + "epoch": 1.8103946102021173, + "grad_norm": 2.6402223110198975, + "learning_rate": 4.437118514303895e-06, + "loss": 0.887, + "step": 47025 + }, + { + "epoch": 1.810587102983638, + "grad_norm": 1.3061926364898682, + "learning_rate": 4.428215096102906e-06, + "loss": 0.8308, + "step": 47030 + }, + { + "epoch": 1.8107795957651587, + "grad_norm": 1.3701341152191162, + "learning_rate": 4.419320417308526e-06, + "loss": 0.7879, + "step": 47035 + }, + { + "epoch": 1.8109720885466793, + "grad_norm": 1.355481505393982, + "learning_rate": 4.410434478734127e-06, + "loss": 0.8011, + "step": 47040 + }, + { + "epoch": 1.8111645813282002, + "grad_norm": 1.3704595565795898, + "learning_rate": 4.401557281192281e-06, + "loss": 0.8246, + "step": 47045 + }, + { + "epoch": 1.811357074109721, + "grad_norm": 1.3024649620056152, + "learning_rate": 4.39268882549474e-06, + "loss": 0.8316, + "step": 47050 + }, + { + "epoch": 1.8115495668912416, + "grad_norm": 1.1199791431427002, + "learning_rate": 4.383829112452454e-06, + "loss": 0.8883, + "step": 47055 + }, + { + "epoch": 1.8117420596727623, + "grad_norm": 1.2826857566833496, + "learning_rate": 4.374978142875608e-06, + "loss": 0.7167, + "step": 47060 + }, + { + "epoch": 1.811934552454283, + "grad_norm": 0.9372978806495667, + "learning_rate": 4.366135917573555e-06, + "loss": 0.7517, + "step": 47065 + }, + { + "epoch": 1.8121270452358038, + "grad_norm": 1.2647088766098022, + "learning_rate": 4.357302437354848e-06, + "loss": 0.6803, + "step": 47070 + }, + { + "epoch": 1.8123195380173245, + "grad_norm": 0.9146122932434082, + "learning_rate": 4.348477703027254e-06, + "loss": 0.8121, + "step": 47075 + }, + { + "epoch": 1.8125120307988452, + "grad_norm": 1.3470911979675293, + "learning_rate": 4.3396617153977585e-06, + "loss": 0.7432, + "step": 47080 + }, + { + "epoch": 1.8127045235803658, + "grad_norm": 1.119855284690857, + "learning_rate": 4.330854475272483e-06, + "loss": 0.694, + "step": 47085 + }, + { + "epoch": 1.8128970163618865, + "grad_norm": 0.8852443099021912, + "learning_rate": 4.322055983456841e-06, + "loss": 0.8118, + "step": 47090 + }, + { + "epoch": 1.8130895091434072, + "grad_norm": 1.6684072017669678, + "learning_rate": 4.313266240755354e-06, + "loss": 0.9023, + "step": 47095 + }, + { + "epoch": 1.8132820019249278, + "grad_norm": 1.1562302112579346, + "learning_rate": 4.304485247971812e-06, + "loss": 0.7498, + "step": 47100 + }, + { + "epoch": 1.8134744947064485, + "grad_norm": 1.1458892822265625, + "learning_rate": 4.2957130059091635e-06, + "loss": 0.7468, + "step": 47105 + }, + { + "epoch": 1.8136669874879692, + "grad_norm": 1.3492836952209473, + "learning_rate": 4.286949515369587e-06, + "loss": 0.7202, + "step": 47110 + }, + { + "epoch": 1.8138594802694898, + "grad_norm": 1.3468396663665771, + "learning_rate": 4.278194777154432e-06, + "loss": 0.9034, + "step": 47115 + }, + { + "epoch": 1.8140519730510105, + "grad_norm": 1.1316415071487427, + "learning_rate": 4.269448792064257e-06, + "loss": 0.8748, + "step": 47120 + }, + { + "epoch": 1.8142444658325312, + "grad_norm": 2.3300271034240723, + "learning_rate": 4.260711560898833e-06, + "loss": 0.6797, + "step": 47125 + }, + { + "epoch": 1.8144369586140519, + "grad_norm": 1.528640866279602, + "learning_rate": 4.251983084457134e-06, + "loss": 0.8478, + "step": 47130 + }, + { + "epoch": 1.8146294513955725, + "grad_norm": 1.0944819450378418, + "learning_rate": 4.2432633635373084e-06, + "loss": 0.7424, + "step": 47135 + }, + { + "epoch": 1.8148219441770932, + "grad_norm": 1.4346030950546265, + "learning_rate": 4.234552398936709e-06, + "loss": 0.7892, + "step": 47140 + }, + { + "epoch": 1.815014436958614, + "grad_norm": 1.987533450126648, + "learning_rate": 4.225850191451908e-06, + "loss": 0.728, + "step": 47145 + }, + { + "epoch": 1.8152069297401348, + "grad_norm": 1.4819614887237549, + "learning_rate": 4.217156741878658e-06, + "loss": 0.8437, + "step": 47150 + }, + { + "epoch": 1.8153994225216554, + "grad_norm": 1.6680911779403687, + "learning_rate": 4.2084720510119355e-06, + "loss": 0.9606, + "step": 47155 + }, + { + "epoch": 1.815591915303176, + "grad_norm": 1.2909523248672485, + "learning_rate": 4.19979611964586e-06, + "loss": 0.8903, + "step": 47160 + }, + { + "epoch": 1.815784408084697, + "grad_norm": 1.6894687414169312, + "learning_rate": 4.19112894857383e-06, + "loss": 0.8166, + "step": 47165 + }, + { + "epoch": 1.8159769008662177, + "grad_norm": 1.0353437662124634, + "learning_rate": 4.1824705385883786e-06, + "loss": 0.6971, + "step": 47170 + }, + { + "epoch": 1.8161693936477383, + "grad_norm": 1.8149842023849487, + "learning_rate": 4.17382089048125e-06, + "loss": 0.8447, + "step": 47175 + }, + { + "epoch": 1.816361886429259, + "grad_norm": 1.26813805103302, + "learning_rate": 4.165180005043423e-06, + "loss": 0.7669, + "step": 47180 + }, + { + "epoch": 1.8165543792107797, + "grad_norm": 1.110489010810852, + "learning_rate": 4.156547883065043e-06, + "loss": 0.7015, + "step": 47185 + }, + { + "epoch": 1.8167468719923003, + "grad_norm": 1.940280795097351, + "learning_rate": 4.147924525335434e-06, + "loss": 0.7819, + "step": 47190 + }, + { + "epoch": 1.816939364773821, + "grad_norm": 0.8619933724403381, + "learning_rate": 4.139309932643187e-06, + "loss": 0.9221, + "step": 47195 + }, + { + "epoch": 1.8171318575553417, + "grad_norm": 1.2719132900238037, + "learning_rate": 4.130704105776018e-06, + "loss": 0.7197, + "step": 47200 + }, + { + "epoch": 1.8173243503368623, + "grad_norm": 1.1760082244873047, + "learning_rate": 4.1221070455208735e-06, + "loss": 0.8961, + "step": 47205 + }, + { + "epoch": 1.817516843118383, + "grad_norm": 1.3913214206695557, + "learning_rate": 4.113518752663914e-06, + "loss": 0.787, + "step": 47210 + }, + { + "epoch": 1.8177093358999037, + "grad_norm": 1.941649079322815, + "learning_rate": 4.104939227990478e-06, + "loss": 0.918, + "step": 47215 + }, + { + "epoch": 1.8179018286814244, + "grad_norm": 0.8367112278938293, + "learning_rate": 4.096368472285106e-06, + "loss": 0.636, + "step": 47220 + }, + { + "epoch": 1.818094321462945, + "grad_norm": 1.0153110027313232, + "learning_rate": 4.0878064863315356e-06, + "loss": 0.8449, + "step": 47225 + }, + { + "epoch": 1.8182868142444657, + "grad_norm": 1.1490223407745361, + "learning_rate": 4.079253270912687e-06, + "loss": 0.7839, + "step": 47230 + }, + { + "epoch": 1.8184793070259864, + "grad_norm": 1.2465404272079468, + "learning_rate": 4.070708826810721e-06, + "loss": 0.6734, + "step": 47235 + }, + { + "epoch": 1.8186717998075073, + "grad_norm": 1.5955170392990112, + "learning_rate": 4.06217315480697e-06, + "loss": 0.868, + "step": 47240 + }, + { + "epoch": 1.818864292589028, + "grad_norm": 2.5519890785217285, + "learning_rate": 4.05364625568192e-06, + "loss": 0.6769, + "step": 47245 + }, + { + "epoch": 1.8190567853705486, + "grad_norm": 0.968748152256012, + "learning_rate": 4.04512813021537e-06, + "loss": 0.7728, + "step": 47250 + }, + { + "epoch": 1.8192492781520693, + "grad_norm": 1.381900668144226, + "learning_rate": 4.0366187791861855e-06, + "loss": 0.6947, + "step": 47255 + }, + { + "epoch": 1.81944177093359, + "grad_norm": 0.9397858381271362, + "learning_rate": 4.028118203372521e-06, + "loss": 0.7413, + "step": 47260 + }, + { + "epoch": 1.8196342637151108, + "grad_norm": 1.2155348062515259, + "learning_rate": 4.019626403551691e-06, + "loss": 0.8101, + "step": 47265 + }, + { + "epoch": 1.8198267564966315, + "grad_norm": 1.4671454429626465, + "learning_rate": 4.011143380500226e-06, + "loss": 0.6367, + "step": 47270 + }, + { + "epoch": 1.8200192492781522, + "grad_norm": 1.2641961574554443, + "learning_rate": 4.002669134993808e-06, + "loss": 0.8871, + "step": 47275 + }, + { + "epoch": 1.8202117420596728, + "grad_norm": 1.482218861579895, + "learning_rate": 3.994203667807395e-06, + "loss": 0.7126, + "step": 47280 + }, + { + "epoch": 1.8204042348411935, + "grad_norm": 1.0232921838760376, + "learning_rate": 3.985746979715066e-06, + "loss": 0.7237, + "step": 47285 + }, + { + "epoch": 1.8205967276227142, + "grad_norm": 2.7515757083892822, + "learning_rate": 3.977299071490148e-06, + "loss": 0.7837, + "step": 47290 + }, + { + "epoch": 1.8207892204042349, + "grad_norm": 1.110560417175293, + "learning_rate": 3.968859943905124e-06, + "loss": 0.7886, + "step": 47295 + }, + { + "epoch": 1.8209817131857555, + "grad_norm": 1.8675193786621094, + "learning_rate": 3.96042959773173e-06, + "loss": 0.6506, + "step": 47300 + }, + { + "epoch": 1.8211742059672762, + "grad_norm": 1.233778953552246, + "learning_rate": 3.952008033740817e-06, + "loss": 0.7465, + "step": 47305 + }, + { + "epoch": 1.8213666987487969, + "grad_norm": 1.6183221340179443, + "learning_rate": 3.943595252702537e-06, + "loss": 0.7255, + "step": 47310 + }, + { + "epoch": 1.8215591915303175, + "grad_norm": 1.736594796180725, + "learning_rate": 3.935191255386139e-06, + "loss": 0.6987, + "step": 47315 + }, + { + "epoch": 1.8217516843118382, + "grad_norm": 1.2381728887557983, + "learning_rate": 3.926796042560133e-06, + "loss": 0.9369, + "step": 47320 + }, + { + "epoch": 1.8219441770933589, + "grad_norm": 0.9554552435874939, + "learning_rate": 3.9184096149922025e-06, + "loss": 0.8767, + "step": 47325 + }, + { + "epoch": 1.8221366698748795, + "grad_norm": 1.7278703451156616, + "learning_rate": 3.9100319734492465e-06, + "loss": 0.8719, + "step": 47330 + }, + { + "epoch": 1.8223291626564004, + "grad_norm": 0.6862698197364807, + "learning_rate": 3.901663118697308e-06, + "loss": 0.8551, + "step": 47335 + }, + { + "epoch": 1.822521655437921, + "grad_norm": 1.7460417747497559, + "learning_rate": 3.893303051501685e-06, + "loss": 0.7529, + "step": 47340 + }, + { + "epoch": 1.8227141482194418, + "grad_norm": 1.0978227853775024, + "learning_rate": 3.884951772626854e-06, + "loss": 0.6831, + "step": 47345 + }, + { + "epoch": 1.8229066410009624, + "grad_norm": 1.4301860332489014, + "learning_rate": 3.8766092828364856e-06, + "loss": 0.7509, + "step": 47350 + }, + { + "epoch": 1.8230991337824831, + "grad_norm": 1.623776912689209, + "learning_rate": 3.868275582893444e-06, + "loss": 0.891, + "step": 47355 + }, + { + "epoch": 1.823291626564004, + "grad_norm": 0.9064698219299316, + "learning_rate": 3.859950673559765e-06, + "loss": 0.923, + "step": 47360 + }, + { + "epoch": 1.8234841193455247, + "grad_norm": 1.351654052734375, + "learning_rate": 3.8516345555967396e-06, + "loss": 0.8076, + "step": 47365 + }, + { + "epoch": 1.8236766121270453, + "grad_norm": 1.3697307109832764, + "learning_rate": 3.8433272297648035e-06, + "loss": 0.8001, + "step": 47370 + }, + { + "epoch": 1.823869104908566, + "grad_norm": 1.3666245937347412, + "learning_rate": 3.835028696823628e-06, + "loss": 0.7855, + "step": 47375 + }, + { + "epoch": 1.8240615976900867, + "grad_norm": 1.0923709869384766, + "learning_rate": 3.826738957532017e-06, + "loss": 0.8134, + "step": 47380 + }, + { + "epoch": 1.8242540904716074, + "grad_norm": 1.4689079523086548, + "learning_rate": 3.818458012648063e-06, + "loss": 0.8976, + "step": 47385 + }, + { + "epoch": 1.824446583253128, + "grad_norm": 0.9157153367996216, + "learning_rate": 3.810185862928972e-06, + "loss": 0.7001, + "step": 47390 + }, + { + "epoch": 1.8246390760346487, + "grad_norm": 1.6523871421813965, + "learning_rate": 3.801922509131184e-06, + "loss": 0.7818, + "step": 47395 + }, + { + "epoch": 1.8248315688161694, + "grad_norm": 1.4929468631744385, + "learning_rate": 3.793667952010327e-06, + "loss": 0.7585, + "step": 47400 + }, + { + "epoch": 1.82502406159769, + "grad_norm": 0.9369055032730103, + "learning_rate": 3.7854221923212307e-06, + "loss": 0.6929, + "step": 47405 + }, + { + "epoch": 1.8252165543792107, + "grad_norm": 2.0922114849090576, + "learning_rate": 3.7771852308178925e-06, + "loss": 0.8816, + "step": 47410 + }, + { + "epoch": 1.8254090471607314, + "grad_norm": 1.3968013525009155, + "learning_rate": 3.7689570682535758e-06, + "loss": 0.8213, + "step": 47415 + }, + { + "epoch": 1.825601539942252, + "grad_norm": 1.271930456161499, + "learning_rate": 3.7607377053806567e-06, + "loss": 0.7507, + "step": 47420 + }, + { + "epoch": 1.8257940327237727, + "grad_norm": 1.283036231994629, + "learning_rate": 3.752527142950735e-06, + "loss": 0.7738, + "step": 47425 + }, + { + "epoch": 1.8259865255052934, + "grad_norm": 1.001495122909546, + "learning_rate": 3.7443253817146306e-06, + "loss": 0.8228, + "step": 47430 + }, + { + "epoch": 1.8261790182868143, + "grad_norm": 1.5130256414413452, + "learning_rate": 3.736132422422345e-06, + "loss": 0.8288, + "step": 47435 + }, + { + "epoch": 1.826371511068335, + "grad_norm": 0.9629761576652527, + "learning_rate": 3.7279482658230445e-06, + "loss": 0.819, + "step": 47440 + }, + { + "epoch": 1.8265640038498556, + "grad_norm": 1.0789672136306763, + "learning_rate": 3.7197729126651317e-06, + "loss": 0.8386, + "step": 47445 + }, + { + "epoch": 1.8267564966313763, + "grad_norm": 1.1328613758087158, + "learning_rate": 3.7116063636961964e-06, + "loss": 0.916, + "step": 47450 + }, + { + "epoch": 1.826948989412897, + "grad_norm": 0.8800186514854431, + "learning_rate": 3.703448619662997e-06, + "loss": 0.7408, + "step": 47455 + }, + { + "epoch": 1.8271414821944179, + "grad_norm": 1.6841456890106201, + "learning_rate": 3.695299681311537e-06, + "loss": 0.8142, + "step": 47460 + }, + { + "epoch": 1.8273339749759385, + "grad_norm": 1.3652926683425903, + "learning_rate": 3.6871595493869316e-06, + "loss": 0.8027, + "step": 47465 + }, + { + "epoch": 1.8275264677574592, + "grad_norm": 1.3112471103668213, + "learning_rate": 3.679028224633596e-06, + "loss": 0.9375, + "step": 47470 + }, + { + "epoch": 1.8277189605389799, + "grad_norm": 1.2834017276763916, + "learning_rate": 3.670905707795047e-06, + "loss": 0.8115, + "step": 47475 + }, + { + "epoch": 1.8279114533205005, + "grad_norm": 1.075247049331665, + "learning_rate": 3.6627919996140457e-06, + "loss": 0.8848, + "step": 47480 + }, + { + "epoch": 1.8281039461020212, + "grad_norm": 1.0291975736618042, + "learning_rate": 3.6546871008325433e-06, + "loss": 0.9769, + "step": 47485 + }, + { + "epoch": 1.8282964388835419, + "grad_norm": 1.429732084274292, + "learning_rate": 3.646591012191691e-06, + "loss": 0.6725, + "step": 47490 + }, + { + "epoch": 1.8284889316650625, + "grad_norm": 1.1338505744934082, + "learning_rate": 3.6385037344317862e-06, + "loss": 0.8288, + "step": 47495 + }, + { + "epoch": 1.8286814244465832, + "grad_norm": 1.1439357995986938, + "learning_rate": 3.6304252682924036e-06, + "loss": 0.8643, + "step": 47500 + }, + { + "epoch": 1.8288739172281039, + "grad_norm": 2.053274631500244, + "learning_rate": 3.6223556145122186e-06, + "loss": 0.9137, + "step": 47505 + }, + { + "epoch": 1.8290664100096246, + "grad_norm": 0.9340258836746216, + "learning_rate": 3.6142947738291744e-06, + "loss": 0.702, + "step": 47510 + }, + { + "epoch": 1.8292589027911452, + "grad_norm": 2.1745176315307617, + "learning_rate": 3.6062427469803705e-06, + "loss": 0.8425, + "step": 47515 + }, + { + "epoch": 1.829451395572666, + "grad_norm": 1.1325429677963257, + "learning_rate": 3.5981995347021403e-06, + "loss": 0.8262, + "step": 47520 + }, + { + "epoch": 1.8296438883541866, + "grad_norm": 1.0720462799072266, + "learning_rate": 3.5901651377299285e-06, + "loss": 0.8925, + "step": 47525 + }, + { + "epoch": 1.8298363811357075, + "grad_norm": 0.8257774114608765, + "learning_rate": 3.5821395567984805e-06, + "loss": 0.7577, + "step": 47530 + }, + { + "epoch": 1.8300288739172281, + "grad_norm": 1.3935045003890991, + "learning_rate": 3.5741227926416545e-06, + "loss": 0.7907, + "step": 47535 + }, + { + "epoch": 1.8302213666987488, + "grad_norm": 1.0141493082046509, + "learning_rate": 3.56611484599253e-06, + "loss": 0.6785, + "step": 47540 + }, + { + "epoch": 1.8304138594802695, + "grad_norm": 1.2392221689224243, + "learning_rate": 3.558115717583388e-06, + "loss": 0.9234, + "step": 47545 + }, + { + "epoch": 1.8306063522617901, + "grad_norm": 1.5585402250289917, + "learning_rate": 3.5501254081457104e-06, + "loss": 0.9621, + "step": 47550 + }, + { + "epoch": 1.830798845043311, + "grad_norm": 1.6715527772903442, + "learning_rate": 3.5421439184101234e-06, + "loss": 0.869, + "step": 47555 + }, + { + "epoch": 1.8309913378248317, + "grad_norm": 0.8488251566886902, + "learning_rate": 3.5341712491065103e-06, + "loss": 0.8197, + "step": 47560 + }, + { + "epoch": 1.8311838306063524, + "grad_norm": 2.0500576496124268, + "learning_rate": 3.52620740096391e-06, + "loss": 0.9029, + "step": 47565 + }, + { + "epoch": 1.831376323387873, + "grad_norm": 2.1723897457122803, + "learning_rate": 3.518252374710551e-06, + "loss": 0.7979, + "step": 47570 + }, + { + "epoch": 1.8315688161693937, + "grad_norm": 0.6698965430259705, + "learning_rate": 3.510306171073896e-06, + "loss": 0.8421, + "step": 47575 + }, + { + "epoch": 1.8317613089509144, + "grad_norm": 1.0468194484710693, + "learning_rate": 3.5023687907805415e-06, + "loss": 0.7916, + "step": 47580 + }, + { + "epoch": 1.831953801732435, + "grad_norm": 1.206809163093567, + "learning_rate": 3.4944402345563177e-06, + "loss": 0.7774, + "step": 47585 + }, + { + "epoch": 1.8321462945139557, + "grad_norm": 1.2247503995895386, + "learning_rate": 3.486520503126256e-06, + "loss": 0.8677, + "step": 47590 + }, + { + "epoch": 1.8323387872954764, + "grad_norm": 1.510204553604126, + "learning_rate": 3.4786095972145547e-06, + "loss": 0.8027, + "step": 47595 + }, + { + "epoch": 1.832531280076997, + "grad_norm": 0.9858173131942749, + "learning_rate": 3.4707075175445915e-06, + "loss": 0.7336, + "step": 47600 + }, + { + "epoch": 1.8327237728585177, + "grad_norm": 1.1785179376602173, + "learning_rate": 3.462814264838998e-06, + "loss": 0.7863, + "step": 47605 + }, + { + "epoch": 1.8329162656400384, + "grad_norm": 1.1377897262573242, + "learning_rate": 3.4549298398195316e-06, + "loss": 0.7096, + "step": 47610 + }, + { + "epoch": 1.833108758421559, + "grad_norm": 1.5963387489318848, + "learning_rate": 3.4470542432071704e-06, + "loss": 0.8752, + "step": 47615 + }, + { + "epoch": 1.8333012512030797, + "grad_norm": 1.3239431381225586, + "learning_rate": 3.4391874757221054e-06, + "loss": 0.7784, + "step": 47620 + }, + { + "epoch": 1.8334937439846006, + "grad_norm": 1.5421568155288696, + "learning_rate": 3.431329538083694e-06, + "loss": 0.7401, + "step": 47625 + }, + { + "epoch": 1.8336862367661213, + "grad_norm": 1.7763359546661377, + "learning_rate": 3.423480431010462e-06, + "loss": 0.739, + "step": 47630 + }, + { + "epoch": 1.833878729547642, + "grad_norm": 1.8040746450424194, + "learning_rate": 3.415640155220212e-06, + "loss": 0.848, + "step": 47635 + }, + { + "epoch": 1.8340712223291626, + "grad_norm": 1.0281445980072021, + "learning_rate": 3.4078087114298495e-06, + "loss": 0.7374, + "step": 47640 + }, + { + "epoch": 1.8342637151106833, + "grad_norm": 1.061551570892334, + "learning_rate": 3.399986100355501e-06, + "loss": 0.7567, + "step": 47645 + }, + { + "epoch": 1.8344562078922042, + "grad_norm": 0.8794161677360535, + "learning_rate": 3.392172322712517e-06, + "loss": 0.7726, + "step": 47650 + }, + { + "epoch": 1.8346487006737249, + "grad_norm": 2.0824060440063477, + "learning_rate": 3.384367379215425e-06, + "loss": 0.9414, + "step": 47655 + }, + { + "epoch": 1.8348411934552455, + "grad_norm": 1.3683054447174072, + "learning_rate": 3.3765712705778884e-06, + "loss": 0.7369, + "step": 47660 + }, + { + "epoch": 1.8350336862367662, + "grad_norm": 1.2808324098587036, + "learning_rate": 3.3687839975128477e-06, + "loss": 0.7476, + "step": 47665 + }, + { + "epoch": 1.8352261790182869, + "grad_norm": 1.3409979343414307, + "learning_rate": 3.3610055607323887e-06, + "loss": 0.7112, + "step": 47670 + }, + { + "epoch": 1.8354186717998076, + "grad_norm": 1.0510163307189941, + "learning_rate": 3.353235960947787e-06, + "loss": 0.7489, + "step": 47675 + }, + { + "epoch": 1.8356111645813282, + "grad_norm": 0.9734552502632141, + "learning_rate": 3.345475198869552e-06, + "loss": 0.6948, + "step": 47680 + }, + { + "epoch": 1.835803657362849, + "grad_norm": 0.9429619908332825, + "learning_rate": 3.337723275207316e-06, + "loss": 0.5987, + "step": 47685 + }, + { + "epoch": 1.8359961501443696, + "grad_norm": 1.4983386993408203, + "learning_rate": 3.3299801906699567e-06, + "loss": 0.809, + "step": 47690 + }, + { + "epoch": 1.8361886429258902, + "grad_norm": 2.2176480293273926, + "learning_rate": 3.3222459459655297e-06, + "loss": 0.8457, + "step": 47695 + }, + { + "epoch": 1.836381135707411, + "grad_norm": 0.8564404249191284, + "learning_rate": 3.3145205418012915e-06, + "loss": 0.8729, + "step": 47700 + }, + { + "epoch": 1.8365736284889316, + "grad_norm": 1.057340383529663, + "learning_rate": 3.3068039788836435e-06, + "loss": 0.7122, + "step": 47705 + }, + { + "epoch": 1.8367661212704522, + "grad_norm": 1.1386967897415161, + "learning_rate": 3.299096257918255e-06, + "loss": 0.7144, + "step": 47710 + }, + { + "epoch": 1.836958614051973, + "grad_norm": 1.4860408306121826, + "learning_rate": 3.2913973796099174e-06, + "loss": 0.6879, + "step": 47715 + }, + { + "epoch": 1.8371511068334936, + "grad_norm": 1.474790096282959, + "learning_rate": 3.2837073446626677e-06, + "loss": 0.9557, + "step": 47720 + }, + { + "epoch": 1.8373435996150145, + "grad_norm": 1.0030909776687622, + "learning_rate": 3.276026153779688e-06, + "loss": 0.7611, + "step": 47725 + }, + { + "epoch": 1.8375360923965351, + "grad_norm": 1.4009109735488892, + "learning_rate": 3.2683538076633714e-06, + "loss": 0.7646, + "step": 47730 + }, + { + "epoch": 1.8377285851780558, + "grad_norm": 1.6726889610290527, + "learning_rate": 3.2606903070153127e-06, + "loss": 0.8261, + "step": 47735 + }, + { + "epoch": 1.8379210779595765, + "grad_norm": 1.5140515565872192, + "learning_rate": 3.253035652536307e-06, + "loss": 0.8136, + "step": 47740 + }, + { + "epoch": 1.8381135707410972, + "grad_norm": 1.0216679573059082, + "learning_rate": 3.2453898449262834e-06, + "loss": 0.8699, + "step": 47745 + }, + { + "epoch": 1.838306063522618, + "grad_norm": 0.9776056408882141, + "learning_rate": 3.2377528848844154e-06, + "loss": 0.6758, + "step": 47750 + }, + { + "epoch": 1.8384985563041387, + "grad_norm": 1.2438172101974487, + "learning_rate": 3.2301247731090557e-06, + "loss": 0.848, + "step": 47755 + }, + { + "epoch": 1.8386910490856594, + "grad_norm": 1.5708495378494263, + "learning_rate": 3.2225055102977464e-06, + "loss": 0.7504, + "step": 47760 + }, + { + "epoch": 1.83888354186718, + "grad_norm": 1.5115718841552734, + "learning_rate": 3.2148950971472302e-06, + "loss": 0.8011, + "step": 47765 + }, + { + "epoch": 1.8390760346487007, + "grad_norm": 1.2615517377853394, + "learning_rate": 3.207293534353395e-06, + "loss": 0.8359, + "step": 47770 + }, + { + "epoch": 1.8392685274302214, + "grad_norm": 1.3792976140975952, + "learning_rate": 3.1997008226113734e-06, + "loss": 0.8824, + "step": 47775 + }, + { + "epoch": 1.839461020211742, + "grad_norm": 0.8459932804107666, + "learning_rate": 3.192116962615477e-06, + "loss": 0.7326, + "step": 47780 + }, + { + "epoch": 1.8396535129932627, + "grad_norm": 1.6874253749847412, + "learning_rate": 3.184541955059195e-06, + "loss": 0.9437, + "step": 47785 + }, + { + "epoch": 1.8398460057747834, + "grad_norm": 1.0827080011367798, + "learning_rate": 3.1769758006351846e-06, + "loss": 0.7969, + "step": 47790 + }, + { + "epoch": 1.840038498556304, + "grad_norm": 1.4325987100601196, + "learning_rate": 3.1694185000353703e-06, + "loss": 0.6621, + "step": 47795 + }, + { + "epoch": 1.8402309913378248, + "grad_norm": 0.8930256366729736, + "learning_rate": 3.1618700539507774e-06, + "loss": 0.8178, + "step": 47800 + }, + { + "epoch": 1.8404234841193454, + "grad_norm": 1.8248862028121948, + "learning_rate": 3.154330463071675e-06, + "loss": 0.6502, + "step": 47805 + }, + { + "epoch": 1.840615976900866, + "grad_norm": 1.318647027015686, + "learning_rate": 3.146799728087513e-06, + "loss": 0.7669, + "step": 47810 + }, + { + "epoch": 1.8408084696823868, + "grad_norm": 1.4266585111618042, + "learning_rate": 3.139277849686928e-06, + "loss": 0.8331, + "step": 47815 + }, + { + "epoch": 1.8410009624639077, + "grad_norm": 1.6008580923080444, + "learning_rate": 3.131764828557715e-06, + "loss": 0.7556, + "step": 47820 + }, + { + "epoch": 1.8411934552454283, + "grad_norm": 1.1239802837371826, + "learning_rate": 3.1242606653869355e-06, + "loss": 0.7312, + "step": 47825 + }, + { + "epoch": 1.841385948026949, + "grad_norm": 1.142177939414978, + "learning_rate": 3.116765360860774e-06, + "loss": 0.7444, + "step": 47830 + }, + { + "epoch": 1.8415784408084697, + "grad_norm": 1.0553315877914429, + "learning_rate": 3.109278915664615e-06, + "loss": 0.8287, + "step": 47835 + }, + { + "epoch": 1.8417709335899903, + "grad_norm": 1.4490617513656616, + "learning_rate": 3.101801330483067e-06, + "loss": 0.9347, + "step": 47840 + }, + { + "epoch": 1.8419634263715112, + "grad_norm": 2.255176305770874, + "learning_rate": 3.0943326059999056e-06, + "loss": 0.8066, + "step": 47845 + }, + { + "epoch": 1.842155919153032, + "grad_norm": 1.7092933654785156, + "learning_rate": 3.0868727428980617e-06, + "loss": 0.9151, + "step": 47850 + }, + { + "epoch": 1.8423484119345526, + "grad_norm": 1.135002613067627, + "learning_rate": 3.079421741859734e-06, + "loss": 0.873, + "step": 47855 + }, + { + "epoch": 1.8425409047160732, + "grad_norm": 1.5216232538223267, + "learning_rate": 3.071979603566233e-06, + "loss": 0.9477, + "step": 47860 + }, + { + "epoch": 1.842733397497594, + "grad_norm": 1.8921546936035156, + "learning_rate": 3.0645463286981148e-06, + "loss": 0.7256, + "step": 47865 + }, + { + "epoch": 1.8429258902791146, + "grad_norm": 1.7546651363372803, + "learning_rate": 3.0571219179351016e-06, + "loss": 0.7983, + "step": 47870 + }, + { + "epoch": 1.8431183830606352, + "grad_norm": 1.1996418237686157, + "learning_rate": 3.0497063719561068e-06, + "loss": 0.7587, + "step": 47875 + }, + { + "epoch": 1.843310875842156, + "grad_norm": 1.1013127565383911, + "learning_rate": 3.0422996914392098e-06, + "loss": 0.7332, + "step": 47880 + }, + { + "epoch": 1.8435033686236766, + "grad_norm": 1.216601014137268, + "learning_rate": 3.0349018770617354e-06, + "loss": 0.8414, + "step": 47885 + }, + { + "epoch": 1.8436958614051973, + "grad_norm": 0.8273448944091797, + "learning_rate": 3.0275129295001315e-06, + "loss": 0.7199, + "step": 47890 + }, + { + "epoch": 1.843888354186718, + "grad_norm": 1.1838254928588867, + "learning_rate": 3.020132849430102e-06, + "loss": 0.7555, + "step": 47895 + }, + { + "epoch": 1.8440808469682386, + "grad_norm": 0.9097992181777954, + "learning_rate": 3.0127616375264956e-06, + "loss": 0.7774, + "step": 47900 + }, + { + "epoch": 1.8442733397497593, + "grad_norm": 3.2226083278656006, + "learning_rate": 3.0053992944633404e-06, + "loss": 0.8008, + "step": 47905 + }, + { + "epoch": 1.84446583253128, + "grad_norm": 1.555508017539978, + "learning_rate": 2.998045820913886e-06, + "loss": 0.814, + "step": 47910 + }, + { + "epoch": 1.8446583253128006, + "grad_norm": 1.5878740549087524, + "learning_rate": 2.990701217550573e-06, + "loss": 0.7585, + "step": 47915 + }, + { + "epoch": 1.8448508180943215, + "grad_norm": 1.0371932983398438, + "learning_rate": 2.983365485045009e-06, + "loss": 0.8261, + "step": 47920 + }, + { + "epoch": 1.8450433108758422, + "grad_norm": 0.9782282114028931, + "learning_rate": 2.976038624067978e-06, + "loss": 0.6692, + "step": 47925 + }, + { + "epoch": 1.8452358036573628, + "grad_norm": 1.5847156047821045, + "learning_rate": 2.9687206352895125e-06, + "loss": 0.846, + "step": 47930 + }, + { + "epoch": 1.8454282964388835, + "grad_norm": 0.9324734807014465, + "learning_rate": 2.961411519378754e-06, + "loss": 0.8455, + "step": 47935 + }, + { + "epoch": 1.8456207892204044, + "grad_norm": 1.0669317245483398, + "learning_rate": 2.9541112770041013e-06, + "loss": 0.944, + "step": 47940 + }, + { + "epoch": 1.845813282001925, + "grad_norm": 1.0065456628799438, + "learning_rate": 2.9468199088330985e-06, + "loss": 0.7346, + "step": 47945 + }, + { + "epoch": 1.8460057747834457, + "grad_norm": 1.4775437116622925, + "learning_rate": 2.9395374155325007e-06, + "loss": 0.6656, + "step": 47950 + }, + { + "epoch": 1.8461982675649664, + "grad_norm": 0.8717533349990845, + "learning_rate": 2.93226379776822e-06, + "loss": 0.696, + "step": 47955 + }, + { + "epoch": 1.846390760346487, + "grad_norm": 0.9198839664459229, + "learning_rate": 2.924999056205424e-06, + "loss": 0.8188, + "step": 47960 + }, + { + "epoch": 1.8465832531280078, + "grad_norm": 1.2768006324768066, + "learning_rate": 2.917743191508393e-06, + "loss": 0.8166, + "step": 47965 + }, + { + "epoch": 1.8467757459095284, + "grad_norm": 1.1566869020462036, + "learning_rate": 2.9104962043406293e-06, + "loss": 0.7985, + "step": 47970 + }, + { + "epoch": 1.846968238691049, + "grad_norm": 1.2734941244125366, + "learning_rate": 2.9032580953648357e-06, + "loss": 0.8511, + "step": 47975 + }, + { + "epoch": 1.8471607314725698, + "grad_norm": 0.7918460369110107, + "learning_rate": 2.8960288652428726e-06, + "loss": 0.7173, + "step": 47980 + }, + { + "epoch": 1.8473532242540904, + "grad_norm": 1.3652775287628174, + "learning_rate": 2.8888085146358324e-06, + "loss": 0.7498, + "step": 47985 + }, + { + "epoch": 1.847545717035611, + "grad_norm": 1.2592366933822632, + "learning_rate": 2.881597044203943e-06, + "loss": 0.8583, + "step": 47990 + }, + { + "epoch": 1.8477382098171318, + "grad_norm": 2.0500288009643555, + "learning_rate": 2.8743944546066437e-06, + "loss": 0.854, + "step": 47995 + }, + { + "epoch": 1.8479307025986524, + "grad_norm": 1.140693187713623, + "learning_rate": 2.867200746502585e-06, + "loss": 0.6055, + "step": 48000 + }, + { + "epoch": 1.8481231953801731, + "grad_norm": 1.287409782409668, + "learning_rate": 2.8600159205495748e-06, + "loss": 0.8655, + "step": 48005 + }, + { + "epoch": 1.8483156881616938, + "grad_norm": 1.279222846031189, + "learning_rate": 2.8528399774045977e-06, + "loss": 0.7086, + "step": 48010 + }, + { + "epoch": 1.8485081809432147, + "grad_norm": 1.2571754455566406, + "learning_rate": 2.845672917723885e-06, + "loss": 0.8281, + "step": 48015 + }, + { + "epoch": 1.8487006737247353, + "grad_norm": 1.407504916191101, + "learning_rate": 2.838514742162779e-06, + "loss": 0.8478, + "step": 48020 + }, + { + "epoch": 1.848893166506256, + "grad_norm": 1.116768479347229, + "learning_rate": 2.831365451375867e-06, + "loss": 0.7778, + "step": 48025 + }, + { + "epoch": 1.8490856592877767, + "grad_norm": 1.208387017250061, + "learning_rate": 2.824225046016904e-06, + "loss": 0.7465, + "step": 48030 + }, + { + "epoch": 1.8492781520692974, + "grad_norm": 1.3610413074493408, + "learning_rate": 2.8170935267388343e-06, + "loss": 0.7879, + "step": 48035 + }, + { + "epoch": 1.8494706448508182, + "grad_norm": 0.9448495507240295, + "learning_rate": 2.8099708941937697e-06, + "loss": 0.7607, + "step": 48040 + }, + { + "epoch": 1.849663137632339, + "grad_norm": 1.0731843709945679, + "learning_rate": 2.8028571490330556e-06, + "loss": 0.7323, + "step": 48045 + }, + { + "epoch": 1.8498556304138596, + "grad_norm": 1.2336262464523315, + "learning_rate": 2.795752291907183e-06, + "loss": 0.827, + "step": 48050 + }, + { + "epoch": 1.8500481231953803, + "grad_norm": 1.0452556610107422, + "learning_rate": 2.7886563234658327e-06, + "loss": 0.7965, + "step": 48055 + }, + { + "epoch": 1.850240615976901, + "grad_norm": 1.5202009677886963, + "learning_rate": 2.7815692443579066e-06, + "loss": 0.7943, + "step": 48060 + }, + { + "epoch": 1.8504331087584216, + "grad_norm": 1.1691776514053345, + "learning_rate": 2.774491055231465e-06, + "loss": 0.7282, + "step": 48065 + }, + { + "epoch": 1.8506256015399423, + "grad_norm": 2.0975561141967773, + "learning_rate": 2.7674217567337348e-06, + "loss": 0.8538, + "step": 48070 + }, + { + "epoch": 1.850818094321463, + "grad_norm": 1.0445414781570435, + "learning_rate": 2.760361349511198e-06, + "loss": 0.9576, + "step": 48075 + }, + { + "epoch": 1.8510105871029836, + "grad_norm": 1.9726266860961914, + "learning_rate": 2.75330983420945e-06, + "loss": 0.8439, + "step": 48080 + }, + { + "epoch": 1.8512030798845043, + "grad_norm": 1.502845048904419, + "learning_rate": 2.746267211473319e-06, + "loss": 0.9805, + "step": 48085 + }, + { + "epoch": 1.851395572666025, + "grad_norm": 1.754075050354004, + "learning_rate": 2.7392334819468123e-06, + "loss": 0.9215, + "step": 48090 + }, + { + "epoch": 1.8515880654475456, + "grad_norm": 0.9861617088317871, + "learning_rate": 2.7322086462731157e-06, + "loss": 0.9203, + "step": 48095 + }, + { + "epoch": 1.8517805582290663, + "grad_norm": 1.416224718093872, + "learning_rate": 2.7251927050945813e-06, + "loss": 0.834, + "step": 48100 + }, + { + "epoch": 1.851973051010587, + "grad_norm": 1.3885353803634644, + "learning_rate": 2.7181856590527967e-06, + "loss": 0.8002, + "step": 48105 + }, + { + "epoch": 1.8521655437921078, + "grad_norm": 1.1199766397476196, + "learning_rate": 2.7111875087885042e-06, + "loss": 0.8423, + "step": 48110 + }, + { + "epoch": 1.8523580365736285, + "grad_norm": 0.9421118497848511, + "learning_rate": 2.7041982549416144e-06, + "loss": 0.7188, + "step": 48115 + }, + { + "epoch": 1.8525505293551492, + "grad_norm": 2.011782646179199, + "learning_rate": 2.697217898151294e-06, + "loss": 0.8969, + "step": 48120 + }, + { + "epoch": 1.8527430221366699, + "grad_norm": 1.3184289932250977, + "learning_rate": 2.6902464390558103e-06, + "loss": 0.6961, + "step": 48125 + }, + { + "epoch": 1.8529355149181905, + "grad_norm": 1.5041390657424927, + "learning_rate": 2.683283878292675e-06, + "loss": 0.7917, + "step": 48130 + }, + { + "epoch": 1.8531280076997114, + "grad_norm": 1.7774921655654907, + "learning_rate": 2.6763302164985573e-06, + "loss": 1.0091, + "step": 48135 + }, + { + "epoch": 1.853320500481232, + "grad_norm": 1.642179250717163, + "learning_rate": 2.6693854543093476e-06, + "loss": 0.7543, + "step": 48140 + }, + { + "epoch": 1.8535129932627528, + "grad_norm": 1.1686973571777344, + "learning_rate": 2.662449592360061e-06, + "loss": 0.7695, + "step": 48145 + }, + { + "epoch": 1.8537054860442734, + "grad_norm": 0.9292591214179993, + "learning_rate": 2.655522631284979e-06, + "loss": 0.7245, + "step": 48150 + }, + { + "epoch": 1.853897978825794, + "grad_norm": 2.467902660369873, + "learning_rate": 2.6486045717174836e-06, + "loss": 0.8604, + "step": 48155 + }, + { + "epoch": 1.8540904716073148, + "grad_norm": 1.306139349937439, + "learning_rate": 2.641695414290224e-06, + "loss": 0.7734, + "step": 48160 + }, + { + "epoch": 1.8542829643888354, + "grad_norm": 1.376107096672058, + "learning_rate": 2.634795159634962e-06, + "loss": 0.6813, + "step": 48165 + }, + { + "epoch": 1.8544754571703561, + "grad_norm": 1.3315315246582031, + "learning_rate": 2.6279038083827146e-06, + "loss": 0.7047, + "step": 48170 + }, + { + "epoch": 1.8546679499518768, + "grad_norm": 1.155645489692688, + "learning_rate": 2.6210213611636115e-06, + "loss": 0.7626, + "step": 48175 + }, + { + "epoch": 1.8548604427333975, + "grad_norm": 1.1847624778747559, + "learning_rate": 2.6141478186070487e-06, + "loss": 0.8735, + "step": 48180 + }, + { + "epoch": 1.8550529355149181, + "grad_norm": 1.0727204084396362, + "learning_rate": 2.6072831813415354e-06, + "loss": 0.7699, + "step": 48185 + }, + { + "epoch": 1.8552454282964388, + "grad_norm": 1.2296266555786133, + "learning_rate": 2.600427449994813e-06, + "loss": 0.5567, + "step": 48190 + }, + { + "epoch": 1.8554379210779595, + "grad_norm": 1.0693732500076294, + "learning_rate": 2.593580625193781e-06, + "loss": 0.6918, + "step": 48195 + }, + { + "epoch": 1.8556304138594801, + "grad_norm": 1.5725680589675903, + "learning_rate": 2.58674270756456e-06, + "loss": 0.835, + "step": 48200 + }, + { + "epoch": 1.8558229066410008, + "grad_norm": 1.0935879945755005, + "learning_rate": 2.5799136977323948e-06, + "loss": 0.822, + "step": 48205 + }, + { + "epoch": 1.8560153994225217, + "grad_norm": 0.9157683849334717, + "learning_rate": 2.573093596321774e-06, + "loss": 0.8057, + "step": 48210 + }, + { + "epoch": 1.8562078922040424, + "grad_norm": 1.6122345924377441, + "learning_rate": 2.566282403956355e-06, + "loss": 0.9039, + "step": 48215 + }, + { + "epoch": 1.856400384985563, + "grad_norm": 1.6490685939788818, + "learning_rate": 2.5594801212589613e-06, + "loss": 0.9311, + "step": 48220 + }, + { + "epoch": 1.8565928777670837, + "grad_norm": 1.5705087184906006, + "learning_rate": 2.5526867488516513e-06, + "loss": 0.6325, + "step": 48225 + }, + { + "epoch": 1.8567853705486044, + "grad_norm": 1.2620983123779297, + "learning_rate": 2.5459022873555726e-06, + "loss": 0.7455, + "step": 48230 + }, + { + "epoch": 1.8569778633301253, + "grad_norm": 1.1116305589675903, + "learning_rate": 2.5391267373911842e-06, + "loss": 0.65, + "step": 48235 + }, + { + "epoch": 1.857170356111646, + "grad_norm": 1.5795679092407227, + "learning_rate": 2.532360099578024e-06, + "loss": 0.8738, + "step": 48240 + }, + { + "epoch": 1.8573628488931666, + "grad_norm": 1.2260464429855347, + "learning_rate": 2.5256023745348746e-06, + "loss": 0.9088, + "step": 48245 + }, + { + "epoch": 1.8575553416746873, + "grad_norm": 1.582663893699646, + "learning_rate": 2.518853562879675e-06, + "loss": 0.7772, + "step": 48250 + }, + { + "epoch": 1.857747834456208, + "grad_norm": 1.497031807899475, + "learning_rate": 2.5121136652295764e-06, + "loss": 0.8043, + "step": 48255 + }, + { + "epoch": 1.8579403272377286, + "grad_norm": 1.0930918455123901, + "learning_rate": 2.505382682200863e-06, + "loss": 0.7846, + "step": 48260 + }, + { + "epoch": 1.8581328200192493, + "grad_norm": 1.2151498794555664, + "learning_rate": 2.4986606144090762e-06, + "loss": 0.7133, + "step": 48265 + }, + { + "epoch": 1.85832531280077, + "grad_norm": 1.189026951789856, + "learning_rate": 2.49194746246888e-06, + "loss": 0.7892, + "step": 48270 + }, + { + "epoch": 1.8585178055822906, + "grad_norm": 1.6892609596252441, + "learning_rate": 2.4852432269941607e-06, + "loss": 0.8151, + "step": 48275 + }, + { + "epoch": 1.8587102983638113, + "grad_norm": 1.126964807510376, + "learning_rate": 2.4785479085979724e-06, + "loss": 1.0332, + "step": 48280 + }, + { + "epoch": 1.858902791145332, + "grad_norm": 1.3323931694030762, + "learning_rate": 2.471861507892559e-06, + "loss": 0.7541, + "step": 48285 + }, + { + "epoch": 1.8590952839268526, + "grad_norm": 1.3149107694625854, + "learning_rate": 2.465184025489331e-06, + "loss": 0.7269, + "step": 48290 + }, + { + "epoch": 1.8592877767083733, + "grad_norm": 1.9822214841842651, + "learning_rate": 2.458515461998945e-06, + "loss": 0.7885, + "step": 48295 + }, + { + "epoch": 1.859480269489894, + "grad_norm": 1.3917293548583984, + "learning_rate": 2.4518558180311456e-06, + "loss": 0.7677, + "step": 48300 + }, + { + "epoch": 1.8596727622714149, + "grad_norm": 0.9052494764328003, + "learning_rate": 2.4452050941949357e-06, + "loss": 0.7665, + "step": 48305 + }, + { + "epoch": 1.8598652550529355, + "grad_norm": 1.0716874599456787, + "learning_rate": 2.4385632910984834e-06, + "loss": 0.6622, + "step": 48310 + }, + { + "epoch": 1.8600577478344562, + "grad_norm": 1.5617578029632568, + "learning_rate": 2.431930409349137e-06, + "loss": 0.7643, + "step": 48315 + }, + { + "epoch": 1.8602502406159769, + "grad_norm": 1.0250791311264038, + "learning_rate": 2.4253064495534106e-06, + "loss": 0.8006, + "step": 48320 + }, + { + "epoch": 1.8604427333974976, + "grad_norm": 1.2734397649765015, + "learning_rate": 2.4186914123170423e-06, + "loss": 0.6411, + "step": 48325 + }, + { + "epoch": 1.8606352261790184, + "grad_norm": 0.9825404286384583, + "learning_rate": 2.412085298244937e-06, + "loss": 0.7262, + "step": 48330 + }, + { + "epoch": 1.8608277189605391, + "grad_norm": 1.2708152532577515, + "learning_rate": 2.4054881079411564e-06, + "loss": 0.7606, + "step": 48335 + }, + { + "epoch": 1.8610202117420598, + "grad_norm": 1.1390200853347778, + "learning_rate": 2.3988998420089947e-06, + "loss": 0.734, + "step": 48340 + }, + { + "epoch": 1.8612127045235805, + "grad_norm": 1.1848487854003906, + "learning_rate": 2.3923205010508932e-06, + "loss": 0.742, + "step": 48345 + }, + { + "epoch": 1.8614051973051011, + "grad_norm": 1.2743182182312012, + "learning_rate": 2.385750085668481e-06, + "loss": 0.7798, + "step": 48350 + }, + { + "epoch": 1.8615976900866218, + "grad_norm": 0.963897168636322, + "learning_rate": 2.3791885964625894e-06, + "loss": 0.8682, + "step": 48355 + }, + { + "epoch": 1.8617901828681425, + "grad_norm": 1.480070948600769, + "learning_rate": 2.3726360340332376e-06, + "loss": 0.7644, + "step": 48360 + }, + { + "epoch": 1.8619826756496631, + "grad_norm": 0.9536148905754089, + "learning_rate": 2.3660923989795803e-06, + "loss": 0.8048, + "step": 48365 + }, + { + "epoch": 1.8621751684311838, + "grad_norm": 1.9504202604293823, + "learning_rate": 2.3595576919000163e-06, + "loss": 0.8524, + "step": 48370 + }, + { + "epoch": 1.8623676612127045, + "grad_norm": 0.9313097596168518, + "learning_rate": 2.3530319133920896e-06, + "loss": 0.8765, + "step": 48375 + }, + { + "epoch": 1.8625601539942251, + "grad_norm": 1.1792408227920532, + "learning_rate": 2.3465150640525456e-06, + "loss": 0.7123, + "step": 48380 + }, + { + "epoch": 1.8627526467757458, + "grad_norm": 0.8612557649612427, + "learning_rate": 2.3400071444772964e-06, + "loss": 0.6412, + "step": 48385 + }, + { + "epoch": 1.8629451395572665, + "grad_norm": 1.3216156959533691, + "learning_rate": 2.3335081552614768e-06, + "loss": 0.7946, + "step": 48390 + }, + { + "epoch": 1.8631376323387872, + "grad_norm": 0.8052036166191101, + "learning_rate": 2.3270180969993226e-06, + "loss": 0.8188, + "step": 48395 + }, + { + "epoch": 1.8633301251203078, + "grad_norm": 1.8680044412612915, + "learning_rate": 2.3205369702843703e-06, + "loss": 0.767, + "step": 48400 + }, + { + "epoch": 1.8635226179018287, + "grad_norm": 0.8687813878059387, + "learning_rate": 2.314064775709224e-06, + "loss": 0.751, + "step": 48405 + }, + { + "epoch": 1.8637151106833494, + "grad_norm": 1.280096173286438, + "learning_rate": 2.3076015138657537e-06, + "loss": 0.8564, + "step": 48410 + }, + { + "epoch": 1.86390760346487, + "grad_norm": 0.4581018090248108, + "learning_rate": 2.301147185344965e-06, + "loss": 0.6742, + "step": 48415 + }, + { + "epoch": 1.8641000962463907, + "grad_norm": 1.88551926612854, + "learning_rate": 2.294701790737086e-06, + "loss": 0.7482, + "step": 48420 + }, + { + "epoch": 1.8642925890279116, + "grad_norm": 1.5057963132858276, + "learning_rate": 2.2882653306314673e-06, + "loss": 0.8031, + "step": 48425 + }, + { + "epoch": 1.8644850818094323, + "grad_norm": 1.2725811004638672, + "learning_rate": 2.2818378056167155e-06, + "loss": 0.7703, + "step": 48430 + }, + { + "epoch": 1.864677574590953, + "grad_norm": 1.0294675827026367, + "learning_rate": 2.275419216280572e-06, + "loss": 0.8374, + "step": 48435 + }, + { + "epoch": 1.8648700673724736, + "grad_norm": 1.523382306098938, + "learning_rate": 2.2690095632099785e-06, + "loss": 0.9034, + "step": 48440 + }, + { + "epoch": 1.8650625601539943, + "grad_norm": 1.2364825010299683, + "learning_rate": 2.2626088469910547e-06, + "loss": 0.6834, + "step": 48445 + }, + { + "epoch": 1.865255052935515, + "grad_norm": 1.2433017492294312, + "learning_rate": 2.2562170682090877e-06, + "loss": 0.7689, + "step": 48450 + }, + { + "epoch": 1.8654475457170356, + "grad_norm": 1.3385852575302124, + "learning_rate": 2.2498342274485774e-06, + "loss": 0.8025, + "step": 48455 + }, + { + "epoch": 1.8656400384985563, + "grad_norm": 2.2688310146331787, + "learning_rate": 2.2434603252932006e-06, + "loss": 0.8375, + "step": 48460 + }, + { + "epoch": 1.865832531280077, + "grad_norm": 1.4485090970993042, + "learning_rate": 2.237095362325803e-06, + "loss": 0.9097, + "step": 48465 + }, + { + "epoch": 1.8660250240615976, + "grad_norm": 1.457838773727417, + "learning_rate": 2.230739339128396e-06, + "loss": 0.7724, + "step": 48470 + }, + { + "epoch": 1.8662175168431183, + "grad_norm": 1.2256953716278076, + "learning_rate": 2.2243922562822374e-06, + "loss": 0.769, + "step": 48475 + }, + { + "epoch": 1.866410009624639, + "grad_norm": 1.5888760089874268, + "learning_rate": 2.218054114367685e-06, + "loss": 0.7524, + "step": 48480 + }, + { + "epoch": 1.8666025024061597, + "grad_norm": 1.6216797828674316, + "learning_rate": 2.2117249139643415e-06, + "loss": 0.8443, + "step": 48485 + }, + { + "epoch": 1.8667949951876803, + "grad_norm": 0.9898841381072998, + "learning_rate": 2.2054046556509666e-06, + "loss": 0.7376, + "step": 48490 + }, + { + "epoch": 1.866987487969201, + "grad_norm": 1.0620770454406738, + "learning_rate": 2.1990933400055093e-06, + "loss": 0.8391, + "step": 48495 + }, + { + "epoch": 1.867179980750722, + "grad_norm": 0.9419720768928528, + "learning_rate": 2.192790967605085e-06, + "loss": 0.7434, + "step": 48500 + }, + { + "epoch": 1.8673724735322426, + "grad_norm": 2.1842215061187744, + "learning_rate": 2.1864975390260334e-06, + "loss": 0.9107, + "step": 48505 + }, + { + "epoch": 1.8675649663137632, + "grad_norm": 2.277456283569336, + "learning_rate": 2.180213054843816e-06, + "loss": 0.8018, + "step": 48510 + }, + { + "epoch": 1.867757459095284, + "grad_norm": 1.2959650754928589, + "learning_rate": 2.1739375156331176e-06, + "loss": 0.6821, + "step": 48515 + }, + { + "epoch": 1.8679499518768046, + "grad_norm": 1.474094271659851, + "learning_rate": 2.1676709219677905e-06, + "loss": 0.7479, + "step": 48520 + }, + { + "epoch": 1.8681424446583255, + "grad_norm": 3.1948869228363037, + "learning_rate": 2.161413274420876e-06, + "loss": 1.0174, + "step": 48525 + }, + { + "epoch": 1.8683349374398461, + "grad_norm": 1.1724430322647095, + "learning_rate": 2.1551645735646053e-06, + "loss": 0.8481, + "step": 48530 + }, + { + "epoch": 1.8685274302213668, + "grad_norm": 1.5295108556747437, + "learning_rate": 2.1489248199703773e-06, + "loss": 0.7603, + "step": 48535 + }, + { + "epoch": 1.8687199230028875, + "grad_norm": 1.8773775100708008, + "learning_rate": 2.142694014208757e-06, + "loss": 0.8019, + "step": 48540 + }, + { + "epoch": 1.8689124157844081, + "grad_norm": 1.0877622365951538, + "learning_rate": 2.136472156849523e-06, + "loss": 0.8444, + "step": 48545 + }, + { + "epoch": 1.8691049085659288, + "grad_norm": 1.389959692955017, + "learning_rate": 2.130259248461641e-06, + "loss": 0.9192, + "step": 48550 + }, + { + "epoch": 1.8692974013474495, + "grad_norm": 0.84064120054245, + "learning_rate": 2.1240552896131918e-06, + "loss": 0.7529, + "step": 48555 + }, + { + "epoch": 1.8694898941289702, + "grad_norm": 0.8932579159736633, + "learning_rate": 2.117860280871542e-06, + "loss": 0.7907, + "step": 48560 + }, + { + "epoch": 1.8696823869104908, + "grad_norm": 1.7808706760406494, + "learning_rate": 2.1116742228031616e-06, + "loss": 0.8738, + "step": 48565 + }, + { + "epoch": 1.8698748796920115, + "grad_norm": 1.6530510187149048, + "learning_rate": 2.105497115973709e-06, + "loss": 0.8155, + "step": 48570 + }, + { + "epoch": 1.8700673724735322, + "grad_norm": 1.3979318141937256, + "learning_rate": 2.0993289609480547e-06, + "loss": 0.8067, + "step": 48575 + }, + { + "epoch": 1.8702598652550528, + "grad_norm": 1.452545404434204, + "learning_rate": 2.0931697582902476e-06, + "loss": 0.8966, + "step": 48580 + }, + { + "epoch": 1.8704523580365735, + "grad_norm": 1.182625651359558, + "learning_rate": 2.087019508563481e-06, + "loss": 0.7396, + "step": 48585 + }, + { + "epoch": 1.8706448508180942, + "grad_norm": 1.7071011066436768, + "learning_rate": 2.080878212330173e-06, + "loss": 0.773, + "step": 48590 + }, + { + "epoch": 1.870837343599615, + "grad_norm": 0.9710677862167358, + "learning_rate": 2.074745870151895e-06, + "loss": 0.762, + "step": 48595 + }, + { + "epoch": 1.8710298363811357, + "grad_norm": 1.9075227975845337, + "learning_rate": 2.068622482589411e-06, + "loss": 0.7325, + "step": 48600 + }, + { + "epoch": 1.8712223291626564, + "grad_norm": 1.70927894115448, + "learning_rate": 2.0625080502026606e-06, + "loss": 0.7394, + "step": 48605 + }, + { + "epoch": 1.871414821944177, + "grad_norm": 1.2238922119140625, + "learning_rate": 2.0564025735507864e-06, + "loss": 0.8871, + "step": 48610 + }, + { + "epoch": 1.8716073147256977, + "grad_norm": 1.3306856155395508, + "learning_rate": 2.0503060531920528e-06, + "loss": 0.7314, + "step": 48615 + }, + { + "epoch": 1.8717998075072186, + "grad_norm": 1.1816201210021973, + "learning_rate": 2.0442184896840023e-06, + "loss": 0.7342, + "step": 48620 + }, + { + "epoch": 1.8719923002887393, + "grad_norm": 1.3129255771636963, + "learning_rate": 2.0381398835832567e-06, + "loss": 0.6599, + "step": 48625 + }, + { + "epoch": 1.87218479307026, + "grad_norm": 2.268023729324341, + "learning_rate": 2.032070235445682e-06, + "loss": 0.9488, + "step": 48630 + }, + { + "epoch": 1.8723772858517806, + "grad_norm": 1.5048736333847046, + "learning_rate": 2.0260095458263128e-06, + "loss": 0.7311, + "step": 48635 + }, + { + "epoch": 1.8725697786333013, + "grad_norm": 2.065556287765503, + "learning_rate": 2.019957815279361e-06, + "loss": 0.9459, + "step": 48640 + }, + { + "epoch": 1.872762271414822, + "grad_norm": 1.1562378406524658, + "learning_rate": 2.0139150443581944e-06, + "loss": 0.8719, + "step": 48645 + }, + { + "epoch": 1.8729547641963427, + "grad_norm": 1.01041579246521, + "learning_rate": 2.007881233615394e-06, + "loss": 0.8623, + "step": 48650 + }, + { + "epoch": 1.8731472569778633, + "grad_norm": 1.1639820337295532, + "learning_rate": 2.0018563836027293e-06, + "loss": 0.7525, + "step": 48655 + }, + { + "epoch": 1.873339749759384, + "grad_norm": 1.1873496770858765, + "learning_rate": 1.995840494871104e-06, + "loss": 0.7482, + "step": 48660 + }, + { + "epoch": 1.8735322425409047, + "grad_norm": 1.128056526184082, + "learning_rate": 1.989833567970667e-06, + "loss": 0.8526, + "step": 48665 + }, + { + "epoch": 1.8737247353224253, + "grad_norm": 2.4238860607147217, + "learning_rate": 1.98383560345069e-06, + "loss": 0.848, + "step": 48670 + }, + { + "epoch": 1.873917228103946, + "grad_norm": 2.036635160446167, + "learning_rate": 1.977846601859634e-06, + "loss": 0.8757, + "step": 48675 + }, + { + "epoch": 1.8741097208854667, + "grad_norm": 1.680403709411621, + "learning_rate": 1.9718665637451726e-06, + "loss": 0.9098, + "step": 48680 + }, + { + "epoch": 1.8743022136669873, + "grad_norm": 0.7588726878166199, + "learning_rate": 1.965895489654157e-06, + "loss": 0.7572, + "step": 48685 + }, + { + "epoch": 1.874494706448508, + "grad_norm": 1.095284104347229, + "learning_rate": 1.9599333801325505e-06, + "loss": 0.8009, + "step": 48690 + }, + { + "epoch": 1.874687199230029, + "grad_norm": 1.7757399082183838, + "learning_rate": 1.9539802357256055e-06, + "loss": 0.7788, + "step": 48695 + }, + { + "epoch": 1.8748796920115496, + "grad_norm": 1.0243725776672363, + "learning_rate": 1.9480360569776647e-06, + "loss": 0.8988, + "step": 48700 + }, + { + "epoch": 1.8750721847930703, + "grad_norm": 1.7366422414779663, + "learning_rate": 1.942100844432293e-06, + "loss": 0.7865, + "step": 48705 + }, + { + "epoch": 1.875264677574591, + "grad_norm": 1.935095191001892, + "learning_rate": 1.936174598632212e-06, + "loss": 0.9178, + "step": 48710 + }, + { + "epoch": 1.8754571703561116, + "grad_norm": 0.9639614224433899, + "learning_rate": 1.9302573201193776e-06, + "loss": 0.6945, + "step": 48715 + }, + { + "epoch": 1.8756496631376325, + "grad_norm": 1.3461066484451294, + "learning_rate": 1.924349009434834e-06, + "loss": 0.7487, + "step": 48720 + }, + { + "epoch": 1.8758421559191532, + "grad_norm": 1.1384443044662476, + "learning_rate": 1.9184496671188933e-06, + "loss": 0.7095, + "step": 48725 + }, + { + "epoch": 1.8760346487006738, + "grad_norm": 0.8311898112297058, + "learning_rate": 1.9125592937109916e-06, + "loss": 0.7814, + "step": 48730 + }, + { + "epoch": 1.8762271414821945, + "grad_norm": 1.4966212511062622, + "learning_rate": 1.906677889749775e-06, + "loss": 0.9245, + "step": 48735 + }, + { + "epoch": 1.8764196342637152, + "grad_norm": 1.414469599723816, + "learning_rate": 1.900805455773058e-06, + "loss": 0.8376, + "step": 48740 + }, + { + "epoch": 1.8766121270452358, + "grad_norm": 1.3187638521194458, + "learning_rate": 1.8949419923178336e-06, + "loss": 0.7, + "step": 48745 + }, + { + "epoch": 1.8768046198267565, + "grad_norm": 1.3405956029891968, + "learning_rate": 1.8890874999202946e-06, + "loss": 0.9212, + "step": 48750 + }, + { + "epoch": 1.8769971126082772, + "grad_norm": 1.8978711366653442, + "learning_rate": 1.8832419791157574e-06, + "loss": 0.8214, + "step": 48755 + }, + { + "epoch": 1.8771896053897978, + "grad_norm": 1.8074771165847778, + "learning_rate": 1.8774054304387834e-06, + "loss": 0.8833, + "step": 48760 + }, + { + "epoch": 1.8773820981713185, + "grad_norm": 1.301705241203308, + "learning_rate": 1.871577854423079e-06, + "loss": 0.7134, + "step": 48765 + }, + { + "epoch": 1.8775745909528392, + "grad_norm": 0.8135727047920227, + "learning_rate": 1.8657592516015398e-06, + "loss": 0.6364, + "step": 48770 + }, + { + "epoch": 1.8777670837343599, + "grad_norm": 0.7817474007606506, + "learning_rate": 1.8599496225062296e-06, + "loss": 0.6671, + "step": 48775 + }, + { + "epoch": 1.8779595765158805, + "grad_norm": 1.2370343208312988, + "learning_rate": 1.8541489676684232e-06, + "loss": 0.8199, + "step": 48780 + }, + { + "epoch": 1.8781520692974012, + "grad_norm": 1.585356593132019, + "learning_rate": 1.8483572876185296e-06, + "loss": 0.9021, + "step": 48785 + }, + { + "epoch": 1.878344562078922, + "grad_norm": 1.4099061489105225, + "learning_rate": 1.8425745828861585e-06, + "loss": 0.7381, + "step": 48790 + }, + { + "epoch": 1.8785370548604428, + "grad_norm": 2.376617670059204, + "learning_rate": 1.8368008540001203e-06, + "loss": 0.7274, + "step": 48795 + }, + { + "epoch": 1.8787295476419634, + "grad_norm": 2.1324985027313232, + "learning_rate": 1.8310361014883703e-06, + "loss": 0.953, + "step": 48800 + }, + { + "epoch": 1.878922040423484, + "grad_norm": 2.0138044357299805, + "learning_rate": 1.8252803258780538e-06, + "loss": 0.8029, + "step": 48805 + }, + { + "epoch": 1.8791145332050048, + "grad_norm": 1.857802152633667, + "learning_rate": 1.8195335276955162e-06, + "loss": 0.8503, + "step": 48810 + }, + { + "epoch": 1.8793070259865257, + "grad_norm": 1.9087456464767456, + "learning_rate": 1.813795707466237e-06, + "loss": 0.8598, + "step": 48815 + }, + { + "epoch": 1.8794995187680463, + "grad_norm": 1.3068690299987793, + "learning_rate": 1.8080668657149192e-06, + "loss": 0.6669, + "step": 48820 + }, + { + "epoch": 1.879692011549567, + "grad_norm": 1.2544077634811401, + "learning_rate": 1.802347002965421e-06, + "loss": 0.851, + "step": 48825 + }, + { + "epoch": 1.8798845043310877, + "grad_norm": 1.29012930393219, + "learning_rate": 1.7966361197408022e-06, + "loss": 0.8332, + "step": 48830 + }, + { + "epoch": 1.8800769971126083, + "grad_norm": 1.0903306007385254, + "learning_rate": 1.7909342165632558e-06, + "loss": 0.6718, + "step": 48835 + }, + { + "epoch": 1.880269489894129, + "grad_norm": 1.1692110300064087, + "learning_rate": 1.7852412939542208e-06, + "loss": 0.8295, + "step": 48840 + }, + { + "epoch": 1.8804619826756497, + "grad_norm": 1.5184317827224731, + "learning_rate": 1.7795573524342356e-06, + "loss": 0.7913, + "step": 48845 + }, + { + "epoch": 1.8806544754571703, + "grad_norm": 1.50426185131073, + "learning_rate": 1.7738823925230964e-06, + "loss": 0.8514, + "step": 48850 + }, + { + "epoch": 1.880846968238691, + "grad_norm": 1.971408486366272, + "learning_rate": 1.7682164147397208e-06, + "loss": 0.8384, + "step": 48855 + }, + { + "epoch": 1.8810394610202117, + "grad_norm": 0.9322729706764221, + "learning_rate": 1.7625594196022166e-06, + "loss": 0.7074, + "step": 48860 + }, + { + "epoch": 1.8812319538017324, + "grad_norm": 1.4018663167953491, + "learning_rate": 1.7569114076278924e-06, + "loss": 0.7446, + "step": 48865 + }, + { + "epoch": 1.881424446583253, + "grad_norm": 1.2637794017791748, + "learning_rate": 1.7512723793332242e-06, + "loss": 0.7912, + "step": 48870 + }, + { + "epoch": 1.8816169393647737, + "grad_norm": 0.7229753732681274, + "learning_rate": 1.7456423352338658e-06, + "loss": 0.7339, + "step": 48875 + }, + { + "epoch": 1.8818094321462944, + "grad_norm": 0.9690874814987183, + "learning_rate": 1.7400212758446276e-06, + "loss": 0.727, + "step": 48880 + }, + { + "epoch": 1.8820019249278153, + "grad_norm": 1.093895673751831, + "learning_rate": 1.7344092016795433e-06, + "loss": 0.7939, + "step": 48885 + }, + { + "epoch": 1.882194417709336, + "grad_norm": 1.0549845695495605, + "learning_rate": 1.7288061132517686e-06, + "loss": 0.8103, + "step": 48890 + }, + { + "epoch": 1.8823869104908566, + "grad_norm": 1.6850600242614746, + "learning_rate": 1.723212011073705e-06, + "loss": 0.7877, + "step": 48895 + }, + { + "epoch": 1.8825794032723773, + "grad_norm": 1.4307023286819458, + "learning_rate": 1.7176268956568653e-06, + "loss": 0.7448, + "step": 48900 + }, + { + "epoch": 1.882771896053898, + "grad_norm": 1.4105719327926636, + "learning_rate": 1.7120507675120078e-06, + "loss": 0.9085, + "step": 48905 + }, + { + "epoch": 1.8829643888354188, + "grad_norm": 1.330667495727539, + "learning_rate": 1.7064836271489803e-06, + "loss": 0.8304, + "step": 48910 + }, + { + "epoch": 1.8831568816169395, + "grad_norm": 1.1804107427597046, + "learning_rate": 1.7009254750769088e-06, + "loss": 0.8184, + "step": 48915 + }, + { + "epoch": 1.8833493743984602, + "grad_norm": 0.7987260818481445, + "learning_rate": 1.695376311804031e-06, + "loss": 0.6531, + "step": 48920 + }, + { + "epoch": 1.8835418671799808, + "grad_norm": 0.7400739789009094, + "learning_rate": 1.6898361378377747e-06, + "loss": 0.7269, + "step": 48925 + }, + { + "epoch": 1.8837343599615015, + "grad_norm": 1.043596863746643, + "learning_rate": 1.6843049536847567e-06, + "loss": 0.7467, + "step": 48930 + }, + { + "epoch": 1.8839268527430222, + "grad_norm": 1.6760101318359375, + "learning_rate": 1.6787827598507721e-06, + "loss": 0.6619, + "step": 48935 + }, + { + "epoch": 1.8841193455245429, + "grad_norm": 1.3724150657653809, + "learning_rate": 1.6732695568407842e-06, + "loss": 0.7495, + "step": 48940 + }, + { + "epoch": 1.8843118383060635, + "grad_norm": 0.6713613867759705, + "learning_rate": 1.6677653451589448e-06, + "loss": 0.7356, + "step": 48945 + }, + { + "epoch": 1.8845043310875842, + "grad_norm": 1.0061167478561401, + "learning_rate": 1.6622701253085626e-06, + "loss": 0.8709, + "step": 48950 + }, + { + "epoch": 1.8846968238691049, + "grad_norm": 1.6838310956954956, + "learning_rate": 1.656783897792158e-06, + "loss": 0.8505, + "step": 48955 + }, + { + "epoch": 1.8848893166506255, + "grad_norm": 1.262903094291687, + "learning_rate": 1.651306663111396e-06, + "loss": 0.8333, + "step": 48960 + }, + { + "epoch": 1.8850818094321462, + "grad_norm": 0.9866868257522583, + "learning_rate": 1.645838421767154e-06, + "loss": 0.8363, + "step": 48965 + }, + { + "epoch": 1.8852743022136669, + "grad_norm": 1.7241532802581787, + "learning_rate": 1.6403791742594433e-06, + "loss": 0.8477, + "step": 48970 + }, + { + "epoch": 1.8854667949951875, + "grad_norm": 1.6335369348526, + "learning_rate": 1.6349289210874752e-06, + "loss": 0.7239, + "step": 48975 + }, + { + "epoch": 1.8856592877767082, + "grad_norm": 2.1081769466400146, + "learning_rate": 1.6294876627496624e-06, + "loss": 0.6491, + "step": 48980 + }, + { + "epoch": 1.885851780558229, + "grad_norm": 1.3898777961730957, + "learning_rate": 1.6240553997435403e-06, + "loss": 0.7792, + "step": 48985 + }, + { + "epoch": 1.8860442733397498, + "grad_norm": 0.9727959632873535, + "learning_rate": 1.6186321325658893e-06, + "loss": 0.7878, + "step": 48990 + }, + { + "epoch": 1.8862367661212704, + "grad_norm": 1.1837464570999146, + "learning_rate": 1.6132178617126016e-06, + "loss": 1.0175, + "step": 48995 + }, + { + "epoch": 1.8864292589027911, + "grad_norm": 2.5681560039520264, + "learning_rate": 1.607812587678792e-06, + "loss": 0.9884, + "step": 49000 + }, + { + "epoch": 1.8866217516843118, + "grad_norm": 0.915593683719635, + "learning_rate": 1.602416310958732e-06, + "loss": 0.677, + "step": 49005 + }, + { + "epoch": 1.8868142444658327, + "grad_norm": 1.0929181575775146, + "learning_rate": 1.5970290320458715e-06, + "loss": 0.7596, + "step": 49010 + }, + { + "epoch": 1.8870067372473533, + "grad_norm": 1.0402964353561401, + "learning_rate": 1.5916507514328494e-06, + "loss": 0.6634, + "step": 49015 + }, + { + "epoch": 1.887199230028874, + "grad_norm": 1.7485926151275635, + "learning_rate": 1.5862814696114836e-06, + "loss": 0.7347, + "step": 49020 + }, + { + "epoch": 1.8873917228103947, + "grad_norm": 2.60465669631958, + "learning_rate": 1.5809211870727259e-06, + "loss": 0.9895, + "step": 49025 + }, + { + "epoch": 1.8875842155919154, + "grad_norm": 1.0043087005615234, + "learning_rate": 1.5755699043067728e-06, + "loss": 0.7079, + "step": 49030 + }, + { + "epoch": 1.887776708373436, + "grad_norm": 0.876469075679779, + "learning_rate": 1.5702276218029444e-06, + "loss": 0.6647, + "step": 49035 + }, + { + "epoch": 1.8879692011549567, + "grad_norm": 1.5398415327072144, + "learning_rate": 1.5648943400497495e-06, + "loss": 0.8207, + "step": 49040 + }, + { + "epoch": 1.8881616939364774, + "grad_norm": 0.8192068934440613, + "learning_rate": 1.5595700595349093e-06, + "loss": 0.8108, + "step": 49045 + }, + { + "epoch": 1.888354186717998, + "grad_norm": 2.2673983573913574, + "learning_rate": 1.554254780745279e-06, + "loss": 0.8681, + "step": 49050 + }, + { + "epoch": 1.8885466794995187, + "grad_norm": 1.8705366849899292, + "learning_rate": 1.5489485041669026e-06, + "loss": 0.7649, + "step": 49055 + }, + { + "epoch": 1.8887391722810394, + "grad_norm": 1.0904033184051514, + "learning_rate": 1.5436512302850148e-06, + "loss": 0.7311, + "step": 49060 + }, + { + "epoch": 1.88893166506256, + "grad_norm": 1.5426052808761597, + "learning_rate": 1.5383629595839944e-06, + "loss": 0.8157, + "step": 49065 + }, + { + "epoch": 1.8891241578440807, + "grad_norm": 1.1552188396453857, + "learning_rate": 1.5330836925474434e-06, + "loss": 0.8184, + "step": 49070 + }, + { + "epoch": 1.8893166506256014, + "grad_norm": 1.0506844520568848, + "learning_rate": 1.5278134296580981e-06, + "loss": 0.7162, + "step": 49075 + }, + { + "epoch": 1.8895091434071223, + "grad_norm": 1.1629083156585693, + "learning_rate": 1.5225521713979063e-06, + "loss": 0.8053, + "step": 49080 + }, + { + "epoch": 1.889701636188643, + "grad_norm": 1.4255765676498413, + "learning_rate": 1.5172999182479496e-06, + "loss": 0.7563, + "step": 49085 + }, + { + "epoch": 1.8898941289701636, + "grad_norm": 1.1012247800827026, + "learning_rate": 1.512056670688533e-06, + "loss": 0.7282, + "step": 49090 + }, + { + "epoch": 1.8900866217516843, + "grad_norm": 1.0495306253433228, + "learning_rate": 1.5068224291991174e-06, + "loss": 0.9035, + "step": 49095 + }, + { + "epoch": 1.890279114533205, + "grad_norm": 1.2971009016036987, + "learning_rate": 1.5015971942583196e-06, + "loss": 0.7806, + "step": 49100 + }, + { + "epoch": 1.8904716073147259, + "grad_norm": 1.4070019721984863, + "learning_rate": 1.4963809663439908e-06, + "loss": 0.7514, + "step": 49105 + }, + { + "epoch": 1.8906641000962465, + "grad_norm": 1.4686723947525024, + "learning_rate": 1.4911737459330722e-06, + "loss": 0.8591, + "step": 49110 + }, + { + "epoch": 1.8908565928777672, + "grad_norm": 1.885849952697754, + "learning_rate": 1.48597553350176e-06, + "loss": 0.771, + "step": 49115 + }, + { + "epoch": 1.8910490856592879, + "grad_norm": 1.6949317455291748, + "learning_rate": 1.4807863295253965e-06, + "loss": 0.9033, + "step": 49120 + }, + { + "epoch": 1.8912415784408085, + "grad_norm": 1.8095852136611938, + "learning_rate": 1.4756061344784912e-06, + "loss": 0.7384, + "step": 49125 + }, + { + "epoch": 1.8914340712223292, + "grad_norm": 1.4184739589691162, + "learning_rate": 1.4704349488347313e-06, + "loss": 0.8234, + "step": 49130 + }, + { + "epoch": 1.8916265640038499, + "grad_norm": 1.2031188011169434, + "learning_rate": 1.4652727730670058e-06, + "loss": 0.6538, + "step": 49135 + }, + { + "epoch": 1.8918190567853705, + "grad_norm": 0.8883029222488403, + "learning_rate": 1.4601196076473478e-06, + "loss": 0.778, + "step": 49140 + }, + { + "epoch": 1.8920115495668912, + "grad_norm": 1.1456712484359741, + "learning_rate": 1.4549754530469805e-06, + "loss": 0.8069, + "step": 49145 + }, + { + "epoch": 1.8922040423484119, + "grad_norm": 1.2302826642990112, + "learning_rate": 1.4498403097363167e-06, + "loss": 0.6927, + "step": 49150 + }, + { + "epoch": 1.8923965351299326, + "grad_norm": 1.2416324615478516, + "learning_rate": 1.4457386835319498e-06, + "loss": 0.8545, + "step": 49155 + }, + { + "epoch": 1.8925890279114532, + "grad_norm": 1.3639918565750122, + "learning_rate": 1.4406197617255034e-06, + "loss": 0.9177, + "step": 49160 + }, + { + "epoch": 1.892781520692974, + "grad_norm": 1.4873005151748657, + "learning_rate": 1.4355098525214883e-06, + "loss": 0.7928, + "step": 49165 + }, + { + "epoch": 1.8929740134744946, + "grad_norm": 0.8439772725105286, + "learning_rate": 1.4304089563871525e-06, + "loss": 0.9517, + "step": 49170 + }, + { + "epoch": 1.8931665062560152, + "grad_norm": 1.5179928541183472, + "learning_rate": 1.4253170737889787e-06, + "loss": 0.8324, + "step": 49175 + }, + { + "epoch": 1.8933589990375361, + "grad_norm": 2.050947904586792, + "learning_rate": 1.4202342051925387e-06, + "loss": 0.8277, + "step": 49180 + }, + { + "epoch": 1.8935514918190568, + "grad_norm": 1.4222488403320312, + "learning_rate": 1.415160351062661e-06, + "loss": 0.8468, + "step": 49185 + }, + { + "epoch": 1.8937439846005775, + "grad_norm": 1.9091302156448364, + "learning_rate": 1.4100955118632964e-06, + "loss": 0.8484, + "step": 49190 + }, + { + "epoch": 1.8939364773820981, + "grad_norm": 1.2363388538360596, + "learning_rate": 1.4050396880576189e-06, + "loss": 0.7329, + "step": 49195 + }, + { + "epoch": 1.894128970163619, + "grad_norm": 1.5010732412338257, + "learning_rate": 1.3999928801079033e-06, + "loss": 0.7513, + "step": 49200 + }, + { + "epoch": 1.8943214629451397, + "grad_norm": 0.9450666904449463, + "learning_rate": 1.3949550884756913e-06, + "loss": 0.7275, + "step": 49205 + }, + { + "epoch": 1.8945139557266604, + "grad_norm": 1.6268372535705566, + "learning_rate": 1.3899263136216367e-06, + "loss": 0.773, + "step": 49210 + }, + { + "epoch": 1.894706448508181, + "grad_norm": 1.2723169326782227, + "learning_rate": 1.384906556005583e-06, + "loss": 0.7464, + "step": 49215 + }, + { + "epoch": 1.8948989412897017, + "grad_norm": 1.1981759071350098, + "learning_rate": 1.3798958160865517e-06, + "loss": 0.8826, + "step": 49220 + }, + { + "epoch": 1.8950914340712224, + "grad_norm": 1.8858412504196167, + "learning_rate": 1.374894094322765e-06, + "loss": 0.7508, + "step": 49225 + }, + { + "epoch": 1.895283926852743, + "grad_norm": 1.544263482093811, + "learning_rate": 1.3699013911715685e-06, + "loss": 0.7827, + "step": 49230 + }, + { + "epoch": 1.8954764196342637, + "grad_norm": 1.369147777557373, + "learning_rate": 1.3649177070895414e-06, + "loss": 0.8175, + "step": 49235 + }, + { + "epoch": 1.8956689124157844, + "grad_norm": 1.682911992073059, + "learning_rate": 1.3599430425323856e-06, + "loss": 0.6453, + "step": 49240 + }, + { + "epoch": 1.895861405197305, + "grad_norm": 1.029921293258667, + "learning_rate": 1.354977397955004e-06, + "loss": 0.7134, + "step": 49245 + }, + { + "epoch": 1.8960538979788257, + "grad_norm": 1.0889908075332642, + "learning_rate": 1.3500207738114777e-06, + "loss": 0.7324, + "step": 49250 + }, + { + "epoch": 1.8962463907603464, + "grad_norm": 1.1377679109573364, + "learning_rate": 1.3450731705550557e-06, + "loss": 0.7995, + "step": 49255 + }, + { + "epoch": 1.896438883541867, + "grad_norm": 1.6833134889602661, + "learning_rate": 1.3401345886381644e-06, + "loss": 0.6787, + "step": 49260 + }, + { + "epoch": 1.8966313763233877, + "grad_norm": 1.080384373664856, + "learning_rate": 1.3352050285123986e-06, + "loss": 0.7251, + "step": 49265 + }, + { + "epoch": 1.8968238691049084, + "grad_norm": 1.5266867876052856, + "learning_rate": 1.3302844906285417e-06, + "loss": 0.8563, + "step": 49270 + }, + { + "epoch": 1.8970163618864293, + "grad_norm": 1.8439712524414062, + "learning_rate": 1.325372975436545e-06, + "loss": 0.7046, + "step": 49275 + }, + { + "epoch": 1.89720885466795, + "grad_norm": 1.3293750286102295, + "learning_rate": 1.3204704833855275e-06, + "loss": 0.8011, + "step": 49280 + }, + { + "epoch": 1.8974013474494706, + "grad_norm": 0.9964110255241394, + "learning_rate": 1.3155770149237856e-06, + "loss": 0.7107, + "step": 49285 + }, + { + "epoch": 1.8975938402309913, + "grad_norm": 1.0830775499343872, + "learning_rate": 1.310692570498806e-06, + "loss": 0.9186, + "step": 49290 + }, + { + "epoch": 1.897786333012512, + "grad_norm": 2.34660267829895, + "learning_rate": 1.3058171505572424e-06, + "loss": 0.8195, + "step": 49295 + }, + { + "epoch": 1.8979788257940329, + "grad_norm": 1.1595919132232666, + "learning_rate": 1.300950755544894e-06, + "loss": 0.8621, + "step": 49300 + }, + { + "epoch": 1.8981713185755535, + "grad_norm": 1.1527278423309326, + "learning_rate": 1.2960933859067937e-06, + "loss": 0.7031, + "step": 49305 + }, + { + "epoch": 1.8983638113570742, + "grad_norm": 1.7028868198394775, + "learning_rate": 1.291245042087097e-06, + "loss": 0.8185, + "step": 49310 + }, + { + "epoch": 1.8985563041385949, + "grad_norm": 1.3725398778915405, + "learning_rate": 1.2864057245291384e-06, + "loss": 0.7387, + "step": 49315 + }, + { + "epoch": 1.8987487969201156, + "grad_norm": 0.7600288987159729, + "learning_rate": 1.2815754336754748e-06, + "loss": 0.7169, + "step": 49320 + }, + { + "epoch": 1.8989412897016362, + "grad_norm": 1.943352460861206, + "learning_rate": 1.2767541699677865e-06, + "loss": 0.8994, + "step": 49325 + }, + { + "epoch": 1.899133782483157, + "grad_norm": 1.6857982873916626, + "learning_rate": 1.271941933846943e-06, + "loss": 0.8606, + "step": 49330 + }, + { + "epoch": 1.8993262752646776, + "grad_norm": 1.2063673734664917, + "learning_rate": 1.2671387257530033e-06, + "loss": 0.8712, + "step": 49335 + }, + { + "epoch": 1.8995187680461982, + "grad_norm": 1.1214938163757324, + "learning_rate": 1.2623445461251826e-06, + "loss": 0.8094, + "step": 49340 + }, + { + "epoch": 1.899711260827719, + "grad_norm": 1.7402042150497437, + "learning_rate": 1.257559395401875e-06, + "loss": 0.7574, + "step": 49345 + }, + { + "epoch": 1.8999037536092396, + "grad_norm": 2.6430463790893555, + "learning_rate": 1.2527832740206413e-06, + "loss": 0.89, + "step": 49350 + }, + { + "epoch": 1.9000962463907602, + "grad_norm": 1.625260829925537, + "learning_rate": 1.2480161824182435e-06, + "loss": 0.8766, + "step": 49355 + }, + { + "epoch": 1.900288739172281, + "grad_norm": 1.6484109163284302, + "learning_rate": 1.2432581210305883e-06, + "loss": 0.6991, + "step": 49360 + }, + { + "epoch": 1.9004812319538016, + "grad_norm": 1.4141255617141724, + "learning_rate": 1.2385090902927943e-06, + "loss": 0.8387, + "step": 49365 + }, + { + "epoch": 1.9006737247353225, + "grad_norm": 2.3338122367858887, + "learning_rate": 1.233769090639092e-06, + "loss": 0.9757, + "step": 49370 + }, + { + "epoch": 1.9008662175168431, + "grad_norm": 1.6034566164016724, + "learning_rate": 1.2290381225029345e-06, + "loss": 0.8093, + "step": 49375 + }, + { + "epoch": 1.9010587102983638, + "grad_norm": 1.138555645942688, + "learning_rate": 1.224316186316954e-06, + "loss": 0.7738, + "step": 49380 + }, + { + "epoch": 1.9012512030798845, + "grad_norm": 1.7570353746414185, + "learning_rate": 1.2196032825129377e-06, + "loss": 0.8496, + "step": 49385 + }, + { + "epoch": 1.9014436958614052, + "grad_norm": 0.8414403796195984, + "learning_rate": 1.2148994115218194e-06, + "loss": 0.8599, + "step": 49390 + }, + { + "epoch": 1.901636188642926, + "grad_norm": 1.4490456581115723, + "learning_rate": 1.2102045737737655e-06, + "loss": 0.7314, + "step": 49395 + }, + { + "epoch": 1.9018286814244467, + "grad_norm": 1.2178351879119873, + "learning_rate": 1.2055187696980885e-06, + "loss": 0.8655, + "step": 49400 + }, + { + "epoch": 1.9020211742059674, + "grad_norm": 1.1484450101852417, + "learning_rate": 1.2008419997232567e-06, + "loss": 0.7352, + "step": 49405 + }, + { + "epoch": 1.902213666987488, + "grad_norm": 2.422426462173462, + "learning_rate": 1.1961742642769502e-06, + "loss": 0.8654, + "step": 49410 + }, + { + "epoch": 1.9024061597690087, + "grad_norm": 1.6864680051803589, + "learning_rate": 1.1915155637859942e-06, + "loss": 0.7319, + "step": 49415 + }, + { + "epoch": 1.9025986525505294, + "grad_norm": 1.0465269088745117, + "learning_rate": 1.1868658986763704e-06, + "loss": 0.6556, + "step": 49420 + }, + { + "epoch": 1.90279114533205, + "grad_norm": 1.5697258710861206, + "learning_rate": 1.182225269373305e-06, + "loss": 0.8959, + "step": 49425 + }, + { + "epoch": 1.9029836381135707, + "grad_norm": 1.0091673135757446, + "learning_rate": 1.1775936763011252e-06, + "loss": 0.8843, + "step": 49430 + }, + { + "epoch": 1.9031761308950914, + "grad_norm": 1.5295612812042236, + "learning_rate": 1.1729711198833592e-06, + "loss": 0.7747, + "step": 49435 + }, + { + "epoch": 1.903368623676612, + "grad_norm": 1.655747413635254, + "learning_rate": 1.1683576005427243e-06, + "loss": 0.7618, + "step": 49440 + }, + { + "epoch": 1.9035611164581328, + "grad_norm": 1.3846567869186401, + "learning_rate": 1.163753118701083e-06, + "loss": 0.7765, + "step": 49445 + }, + { + "epoch": 1.9037536092396534, + "grad_norm": 1.6685348749160767, + "learning_rate": 1.1591576747794874e-06, + "loss": 0.7638, + "step": 49450 + }, + { + "epoch": 1.903946102021174, + "grad_norm": 1.5088547468185425, + "learning_rate": 1.154571269198168e-06, + "loss": 0.7608, + "step": 49455 + }, + { + "epoch": 1.9041385948026948, + "grad_norm": 2.2280006408691406, + "learning_rate": 1.1499939023765116e-06, + "loss": 0.8825, + "step": 49460 + }, + { + "epoch": 1.9043310875842154, + "grad_norm": 1.4215261936187744, + "learning_rate": 1.1454255747330834e-06, + "loss": 0.953, + "step": 49465 + }, + { + "epoch": 1.9045235803657363, + "grad_norm": 1.827169418334961, + "learning_rate": 1.1408662866856379e-06, + "loss": 0.7853, + "step": 49470 + }, + { + "epoch": 1.904716073147257, + "grad_norm": 1.3283016681671143, + "learning_rate": 1.1363160386510975e-06, + "loss": 0.6103, + "step": 49475 + }, + { + "epoch": 1.9049085659287777, + "grad_norm": 1.2778635025024414, + "learning_rate": 1.13177483104554e-06, + "loss": 0.7057, + "step": 49480 + }, + { + "epoch": 1.9051010587102983, + "grad_norm": 1.315544605255127, + "learning_rate": 1.1272426642842337e-06, + "loss": 0.7023, + "step": 49485 + }, + { + "epoch": 1.905293551491819, + "grad_norm": 1.4004102945327759, + "learning_rate": 1.1227195387816136e-06, + "loss": 0.6889, + "step": 49490 + }, + { + "epoch": 1.90548604427334, + "grad_norm": 2.1009254455566406, + "learning_rate": 1.118205454951271e-06, + "loss": 0.8759, + "step": 49495 + }, + { + "epoch": 1.9056785370548606, + "grad_norm": 1.2553610801696777, + "learning_rate": 1.1137004132060314e-06, + "loss": 0.7435, + "step": 49500 + }, + { + "epoch": 1.9058710298363812, + "grad_norm": 0.8491818308830261, + "learning_rate": 1.1092044139578205e-06, + "loss": 0.6341, + "step": 49505 + }, + { + "epoch": 1.906063522617902, + "grad_norm": 1.680105447769165, + "learning_rate": 1.1047174576177654e-06, + "loss": 0.8919, + "step": 49510 + }, + { + "epoch": 1.9062560153994226, + "grad_norm": 1.8832811117172241, + "learning_rate": 1.1002395445961932e-06, + "loss": 0.7662, + "step": 49515 + }, + { + "epoch": 1.9064485081809432, + "grad_norm": 1.2031275033950806, + "learning_rate": 1.0957706753025432e-06, + "loss": 0.7988, + "step": 49520 + }, + { + "epoch": 1.906641000962464, + "grad_norm": 2.169948101043701, + "learning_rate": 1.0913108501454993e-06, + "loss": 0.8608, + "step": 49525 + }, + { + "epoch": 1.9068334937439846, + "grad_norm": 1.2307795286178589, + "learning_rate": 1.086860069532869e-06, + "loss": 0.7195, + "step": 49530 + }, + { + "epoch": 1.9070259865255053, + "grad_norm": 1.0904724597930908, + "learning_rate": 1.0824183338716377e-06, + "loss": 0.7586, + "step": 49535 + }, + { + "epoch": 1.907218479307026, + "grad_norm": 2.3190550804138184, + "learning_rate": 1.0779856435679913e-06, + "loss": 0.9954, + "step": 49540 + }, + { + "epoch": 1.9074109720885466, + "grad_norm": 1.5595754384994507, + "learning_rate": 1.0735619990272506e-06, + "loss": 0.7631, + "step": 49545 + }, + { + "epoch": 1.9076034648700673, + "grad_norm": 1.5422192811965942, + "learning_rate": 1.069147400653936e-06, + "loss": 0.7709, + "step": 49550 + }, + { + "epoch": 1.907795957651588, + "grad_norm": 1.085741639137268, + "learning_rate": 1.0647418488517358e-06, + "loss": 0.6323, + "step": 49555 + }, + { + "epoch": 1.9079884504331086, + "grad_norm": 1.0943596363067627, + "learning_rate": 1.0603453440235168e-06, + "loss": 0.7902, + "step": 49560 + }, + { + "epoch": 1.9081809432146295, + "grad_norm": 0.9542067646980286, + "learning_rate": 1.0559578865712905e-06, + "loss": 0.8609, + "step": 49565 + }, + { + "epoch": 1.9083734359961502, + "grad_norm": 2.1979501247406006, + "learning_rate": 1.0515794768962806e-06, + "loss": 0.7887, + "step": 49570 + }, + { + "epoch": 1.9085659287776708, + "grad_norm": 1.0749517679214478, + "learning_rate": 1.0472101153988446e-06, + "loss": 0.8218, + "step": 49575 + }, + { + "epoch": 1.9087584215591915, + "grad_norm": 1.4718987941741943, + "learning_rate": 1.0428498024785404e-06, + "loss": 0.8026, + "step": 49580 + }, + { + "epoch": 1.9089509143407122, + "grad_norm": 1.4935468435287476, + "learning_rate": 1.038498538534105e-06, + "loss": 0.7906, + "step": 49585 + }, + { + "epoch": 1.909143407122233, + "grad_norm": 1.2869309186935425, + "learning_rate": 1.034156323963409e-06, + "loss": 0.7009, + "step": 49590 + }, + { + "epoch": 1.9093358999037537, + "grad_norm": 1.7238849401474, + "learning_rate": 1.0298231591635232e-06, + "loss": 0.6669, + "step": 49595 + }, + { + "epoch": 1.9095283926852744, + "grad_norm": 1.0271257162094116, + "learning_rate": 1.0254990445306978e-06, + "loss": 0.893, + "step": 49600 + }, + { + "epoch": 1.909720885466795, + "grad_norm": 1.4019641876220703, + "learning_rate": 1.0211839804603385e-06, + "loss": 0.7958, + "step": 49605 + }, + { + "epoch": 1.9099133782483158, + "grad_norm": 2.0241641998291016, + "learning_rate": 1.0168779673470296e-06, + "loss": 0.9105, + "step": 49610 + }, + { + "epoch": 1.9101058710298364, + "grad_norm": 1.333714485168457, + "learning_rate": 1.0125810055845231e-06, + "loss": 0.7325, + "step": 49615 + }, + { + "epoch": 1.910298363811357, + "grad_norm": 1.3105881214141846, + "learning_rate": 1.00829309556576e-06, + "loss": 0.8457, + "step": 49620 + }, + { + "epoch": 1.9104908565928778, + "grad_norm": 1.3533077239990234, + "learning_rate": 1.0040142376828266e-06, + "loss": 0.7181, + "step": 49625 + }, + { + "epoch": 1.9106833493743984, + "grad_norm": 1.5712995529174805, + "learning_rate": 9.9974443232701e-07, + "loss": 0.9438, + "step": 49630 + }, + { + "epoch": 1.910875842155919, + "grad_norm": 1.0359176397323608, + "learning_rate": 9.954836798887424e-07, + "loss": 0.7692, + "step": 49635 + }, + { + "epoch": 1.9110683349374398, + "grad_norm": 1.355884313583374, + "learning_rate": 9.912319807576452e-07, + "loss": 0.856, + "step": 49640 + }, + { + "epoch": 1.9112608277189604, + "grad_norm": 1.1907180547714233, + "learning_rate": 9.86989335322519e-07, + "loss": 0.7598, + "step": 49645 + }, + { + "epoch": 1.9114533205004811, + "grad_norm": 1.5118666887283325, + "learning_rate": 9.827557439713086e-07, + "loss": 0.9634, + "step": 49650 + }, + { + "epoch": 1.9116458132820018, + "grad_norm": 1.2580753564834595, + "learning_rate": 9.7853120709116e-07, + "loss": 0.7613, + "step": 49655 + }, + { + "epoch": 1.9118383060635225, + "grad_norm": 1.043454885482788, + "learning_rate": 9.743157250683644e-07, + "loss": 0.7669, + "step": 49660 + }, + { + "epoch": 1.9120307988450433, + "grad_norm": 1.603769302368164, + "learning_rate": 9.701092982884351e-07, + "loss": 0.8249, + "step": 49665 + }, + { + "epoch": 1.912223291626564, + "grad_norm": 1.0182993412017822, + "learning_rate": 9.659119271359762e-07, + "loss": 0.7097, + "step": 49670 + }, + { + "epoch": 1.9124157844080847, + "grad_norm": 1.6166123151779175, + "learning_rate": 9.617236119948358e-07, + "loss": 0.6695, + "step": 49675 + }, + { + "epoch": 1.9126082771896054, + "grad_norm": 1.0179489850997925, + "learning_rate": 9.575443532480076e-07, + "loss": 0.7893, + "step": 49680 + }, + { + "epoch": 1.9128007699711262, + "grad_norm": 1.4863917827606201, + "learning_rate": 9.533741512776417e-07, + "loss": 0.7844, + "step": 49685 + }, + { + "epoch": 1.912993262752647, + "grad_norm": 2.095845937728882, + "learning_rate": 9.492130064650995e-07, + "loss": 0.7687, + "step": 49690 + }, + { + "epoch": 1.9131857555341676, + "grad_norm": 0.9106748104095459, + "learning_rate": 9.450609191908766e-07, + "loss": 0.8065, + "step": 49695 + }, + { + "epoch": 1.9133782483156883, + "grad_norm": 0.5192959308624268, + "learning_rate": 9.40917889834636e-07, + "loss": 0.8384, + "step": 49700 + }, + { + "epoch": 1.913570741097209, + "grad_norm": 0.8835710287094116, + "learning_rate": 9.367839187752636e-07, + "loss": 0.8304, + "step": 49705 + }, + { + "epoch": 1.9137632338787296, + "grad_norm": 1.3475233316421509, + "learning_rate": 9.326590063907681e-07, + "loss": 0.8051, + "step": 49710 + }, + { + "epoch": 1.9139557266602503, + "grad_norm": 1.0911017656326294, + "learning_rate": 9.285431530583366e-07, + "loss": 0.6483, + "step": 49715 + }, + { + "epoch": 1.914148219441771, + "grad_norm": 1.183143973350525, + "learning_rate": 9.244363591543459e-07, + "loss": 0.9217, + "step": 49720 + }, + { + "epoch": 1.9143407122232916, + "grad_norm": 1.7121336460113525, + "learning_rate": 9.2033862505434e-07, + "loss": 0.7959, + "step": 49725 + }, + { + "epoch": 1.9145332050048123, + "grad_norm": 1.2456363439559937, + "learning_rate": 9.162499511330192e-07, + "loss": 0.7527, + "step": 49730 + }, + { + "epoch": 1.914725697786333, + "grad_norm": 2.138845205307007, + "learning_rate": 9.121703377642732e-07, + "loss": 0.9166, + "step": 49735 + }, + { + "epoch": 1.9149181905678536, + "grad_norm": 1.7342472076416016, + "learning_rate": 9.080997853211592e-07, + "loss": 0.7697, + "step": 49740 + }, + { + "epoch": 1.9151106833493743, + "grad_norm": 1.9589825868606567, + "learning_rate": 9.040382941758907e-07, + "loss": 0.86, + "step": 49745 + }, + { + "epoch": 1.915303176130895, + "grad_norm": 0.9614568948745728, + "learning_rate": 8.999858646998704e-07, + "loss": 0.8351, + "step": 49750 + }, + { + "epoch": 1.9154956689124156, + "grad_norm": 2.190661668777466, + "learning_rate": 8.959424972636576e-07, + "loss": 0.8423, + "step": 49755 + }, + { + "epoch": 1.9156881616939365, + "grad_norm": 1.1599905490875244, + "learning_rate": 8.91908192237012e-07, + "loss": 0.9698, + "step": 49760 + }, + { + "epoch": 1.9158806544754572, + "grad_norm": 0.9257279634475708, + "learning_rate": 8.878829499888164e-07, + "loss": 0.7918, + "step": 49765 + }, + { + "epoch": 1.9160731472569779, + "grad_norm": 1.8219150304794312, + "learning_rate": 8.838667708871873e-07, + "loss": 0.7945, + "step": 49770 + }, + { + "epoch": 1.9162656400384985, + "grad_norm": 1.6939747333526611, + "learning_rate": 8.798596552993421e-07, + "loss": 0.8348, + "step": 49775 + }, + { + "epoch": 1.9164581328200192, + "grad_norm": 1.0439796447753906, + "learning_rate": 8.758616035917211e-07, + "loss": 0.8166, + "step": 49780 + }, + { + "epoch": 1.91665062560154, + "grad_norm": 1.6584985256195068, + "learning_rate": 8.718726161299206e-07, + "loss": 0.7026, + "step": 49785 + }, + { + "epoch": 1.9168431183830608, + "grad_norm": 1.5818768739700317, + "learning_rate": 8.678926932787157e-07, + "loss": 0.7577, + "step": 49790 + }, + { + "epoch": 1.9170356111645814, + "grad_norm": 1.8825992345809937, + "learning_rate": 8.639218354020151e-07, + "loss": 0.6868, + "step": 49795 + }, + { + "epoch": 1.917228103946102, + "grad_norm": 1.7052052021026611, + "learning_rate": 8.599600428629617e-07, + "loss": 0.705, + "step": 49800 + }, + { + "epoch": 1.9174205967276228, + "grad_norm": 1.2492605447769165, + "learning_rate": 8.560073160238213e-07, + "loss": 0.7604, + "step": 49805 + }, + { + "epoch": 1.9176130895091434, + "grad_norm": 1.3016527891159058, + "learning_rate": 8.520636552460381e-07, + "loss": 0.681, + "step": 49810 + }, + { + "epoch": 1.9178055822906641, + "grad_norm": 1.7560960054397583, + "learning_rate": 8.481290608902348e-07, + "loss": 0.8405, + "step": 49815 + }, + { + "epoch": 1.9179980750721848, + "grad_norm": 1.0738979578018188, + "learning_rate": 8.442035333162013e-07, + "loss": 0.9153, + "step": 49820 + }, + { + "epoch": 1.9181905678537055, + "grad_norm": 2.123007297515869, + "learning_rate": 8.402870728829282e-07, + "loss": 0.8122, + "step": 49825 + }, + { + "epoch": 1.9183830606352261, + "grad_norm": 1.9177762269973755, + "learning_rate": 8.363796799485069e-07, + "loss": 0.9144, + "step": 49830 + }, + { + "epoch": 1.9185755534167468, + "grad_norm": 1.4648776054382324, + "learning_rate": 8.324813548702847e-07, + "loss": 0.8814, + "step": 49835 + }, + { + "epoch": 1.9187680461982675, + "grad_norm": 1.2725647687911987, + "learning_rate": 8.285920980047102e-07, + "loss": 0.8659, + "step": 49840 + }, + { + "epoch": 1.9189605389797881, + "grad_norm": 0.6527199149131775, + "learning_rate": 8.247119097074319e-07, + "loss": 0.7217, + "step": 49845 + }, + { + "epoch": 1.9191530317613088, + "grad_norm": 1.6857417821884155, + "learning_rate": 8.208407903332771e-07, + "loss": 0.8141, + "step": 49850 + }, + { + "epoch": 1.9193455245428297, + "grad_norm": 0.8140450716018677, + "learning_rate": 8.169787402362406e-07, + "loss": 0.7364, + "step": 49855 + }, + { + "epoch": 1.9195380173243504, + "grad_norm": 1.138654112815857, + "learning_rate": 8.131257597694508e-07, + "loss": 0.9553, + "step": 49860 + }, + { + "epoch": 1.919730510105871, + "grad_norm": 1.369598150253296, + "learning_rate": 8.092818492852705e-07, + "loss": 0.7387, + "step": 49865 + }, + { + "epoch": 1.9199230028873917, + "grad_norm": 1.1333906650543213, + "learning_rate": 8.054470091351851e-07, + "loss": 0.7376, + "step": 49870 + }, + { + "epoch": 1.9201154956689124, + "grad_norm": 1.7450077533721924, + "learning_rate": 8.016212396698697e-07, + "loss": 0.9635, + "step": 49875 + }, + { + "epoch": 1.9203079884504333, + "grad_norm": 0.9076559543609619, + "learning_rate": 7.978045412391555e-07, + "loss": 0.7919, + "step": 49880 + }, + { + "epoch": 1.920500481231954, + "grad_norm": 1.402205228805542, + "learning_rate": 7.939969141920744e-07, + "loss": 0.8091, + "step": 49885 + }, + { + "epoch": 1.9206929740134746, + "grad_norm": 1.2806485891342163, + "learning_rate": 7.901983588767814e-07, + "loss": 0.7798, + "step": 49890 + }, + { + "epoch": 1.9208854667949953, + "grad_norm": 2.2308027744293213, + "learning_rate": 7.864088756406429e-07, + "loss": 0.829, + "step": 49895 + }, + { + "epoch": 1.921077959576516, + "grad_norm": 1.4024001359939575, + "learning_rate": 7.826284648301929e-07, + "loss": 0.7619, + "step": 49900 + }, + { + "epoch": 1.9212704523580366, + "grad_norm": 1.3569003343582153, + "learning_rate": 7.788571267911104e-07, + "loss": 0.7127, + "step": 49905 + }, + { + "epoch": 1.9214629451395573, + "grad_norm": 2.439872980117798, + "learning_rate": 7.75094861868264e-07, + "loss": 0.9349, + "step": 49910 + }, + { + "epoch": 1.921655437921078, + "grad_norm": 1.2316006422042847, + "learning_rate": 7.713416704056897e-07, + "loss": 0.8283, + "step": 49915 + }, + { + "epoch": 1.9218479307025986, + "grad_norm": 1.0408631563186646, + "learning_rate": 7.675975527465906e-07, + "loss": 0.765, + "step": 49920 + }, + { + "epoch": 1.9220404234841193, + "grad_norm": 1.4583711624145508, + "learning_rate": 7.638625092333373e-07, + "loss": 0.8184, + "step": 49925 + }, + { + "epoch": 1.92223291626564, + "grad_norm": 2.3234689235687256, + "learning_rate": 7.601365402074789e-07, + "loss": 0.8176, + "step": 49930 + }, + { + "epoch": 1.9224254090471606, + "grad_norm": 1.4219454526901245, + "learning_rate": 7.564196460097316e-07, + "loss": 0.6931, + "step": 49935 + }, + { + "epoch": 1.9226179018286813, + "grad_norm": 1.5232850313186646, + "learning_rate": 7.527118269799793e-07, + "loss": 0.8985, + "step": 49940 + }, + { + "epoch": 1.922810394610202, + "grad_norm": 1.4048689603805542, + "learning_rate": 7.490130834572728e-07, + "loss": 0.9178, + "step": 49945 + }, + { + "epoch": 1.9230028873917226, + "grad_norm": 0.891716718673706, + "learning_rate": 7.453234157798416e-07, + "loss": 0.9839, + "step": 49950 + }, + { + "epoch": 1.9231953801732435, + "grad_norm": 1.1254078149795532, + "learning_rate": 7.416428242850937e-07, + "loss": 0.928, + "step": 49955 + }, + { + "epoch": 1.9233878729547642, + "grad_norm": 1.3273284435272217, + "learning_rate": 7.379713093095708e-07, + "loss": 0.7957, + "step": 49960 + }, + { + "epoch": 1.9235803657362849, + "grad_norm": 1.1084377765655518, + "learning_rate": 7.343088711890267e-07, + "loss": 0.9596, + "step": 49965 + }, + { + "epoch": 1.9237728585178056, + "grad_norm": 1.641728162765503, + "learning_rate": 7.306555102583601e-07, + "loss": 0.8442, + "step": 49970 + }, + { + "epoch": 1.9239653512993262, + "grad_norm": 1.5067929029464722, + "learning_rate": 7.270112268516483e-07, + "loss": 0.7893, + "step": 49975 + }, + { + "epoch": 1.9241578440808471, + "grad_norm": 0.8102709054946899, + "learning_rate": 7.233760213021357e-07, + "loss": 0.7961, + "step": 49980 + }, + { + "epoch": 1.9243503368623678, + "grad_norm": 2.6088552474975586, + "learning_rate": 7.197498939422343e-07, + "loss": 0.8979, + "step": 49985 + }, + { + "epoch": 1.9245428296438885, + "grad_norm": 1.865166187286377, + "learning_rate": 7.161328451035454e-07, + "loss": 0.8434, + "step": 49990 + }, + { + "epoch": 1.9247353224254091, + "grad_norm": 1.1561760902404785, + "learning_rate": 7.125248751167934e-07, + "loss": 0.7265, + "step": 49995 + }, + { + "epoch": 1.9249278152069298, + "grad_norm": 1.1306687593460083, + "learning_rate": 7.089259843119478e-07, + "loss": 0.9502, + "step": 50000 } ], "logging_steps": 5, @@ -56014,7 +70014,7 @@ "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 10000, - "total_flos": 1.2478892695491133e+18, + "total_flos": 1.5612551596802458e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null