{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9991386735572783, "eval_steps": 100, "global_step": 3045, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004921865386981666, "grad_norm": 10.908417701721191, "learning_rate": 3.278688524590164e-07, "loss": 2.6851, "mean_token_accuracy": 0.490550322830677, "step": 5 }, { "epoch": 0.009843730773963333, "grad_norm": 10.821477890014648, "learning_rate": 6.557377049180328e-07, "loss": 2.6916, "mean_token_accuracy": 0.4892874449491501, "step": 10 }, { "epoch": 0.014765596160944998, "grad_norm": 9.100831031799316, "learning_rate": 9.836065573770493e-07, "loss": 2.6563, "mean_token_accuracy": 0.49268135130405427, "step": 15 }, { "epoch": 0.019687461547926666, "grad_norm": 6.744043827056885, "learning_rate": 1.3114754098360657e-06, "loss": 2.4838, "mean_token_accuracy": 0.503991749882698, "step": 20 }, { "epoch": 0.02460932693490833, "grad_norm": 4.111428737640381, "learning_rate": 1.6393442622950819e-06, "loss": 2.3481, "mean_token_accuracy": 0.5121142826974392, "step": 25 }, { "epoch": 0.029531192321889995, "grad_norm": 3.504826068878174, "learning_rate": 1.9672131147540985e-06, "loss": 2.1834, "mean_token_accuracy": 0.525759468972683, "step": 30 }, { "epoch": 0.034453057708871665, "grad_norm": 2.371668577194214, "learning_rate": 2.295081967213115e-06, "loss": 1.9992, "mean_token_accuracy": 0.5471328645944595, "step": 35 }, { "epoch": 0.03937492309585333, "grad_norm": 1.910736083984375, "learning_rate": 2.6229508196721314e-06, "loss": 1.8619, "mean_token_accuracy": 0.5657269343733787, "step": 40 }, { "epoch": 0.044296788482835, "grad_norm": 1.6694586277008057, "learning_rate": 2.9508196721311478e-06, "loss": 1.7324, "mean_token_accuracy": 0.582801228761673, "step": 45 }, { "epoch": 0.04921865386981666, "grad_norm": 1.3371120691299438, "learning_rate": 3.2786885245901638e-06, "loss": 1.5922, "mean_token_accuracy": 0.6066210582852364, "step": 50 }, { "epoch": 0.054140519256798324, "grad_norm": 1.153715968132019, "learning_rate": 3.6065573770491806e-06, "loss": 1.4607, "mean_token_accuracy": 0.629358272254467, "step": 55 }, { "epoch": 0.05906238464377999, "grad_norm": 1.011682391166687, "learning_rate": 3.934426229508197e-06, "loss": 1.3312, "mean_token_accuracy": 0.6534152328968048, "step": 60 }, { "epoch": 0.06398425003076166, "grad_norm": 0.8580278158187866, "learning_rate": 4.2622950819672135e-06, "loss": 1.2163, "mean_token_accuracy": 0.676006656885147, "step": 65 }, { "epoch": 0.06890611541774333, "grad_norm": 0.7737818360328674, "learning_rate": 4.59016393442623e-06, "loss": 1.1256, "mean_token_accuracy": 0.695121419429779, "step": 70 }, { "epoch": 0.073827980804725, "grad_norm": 0.6026164889335632, "learning_rate": 4.918032786885246e-06, "loss": 1.0456, "mean_token_accuracy": 0.7120692700147628, "step": 75 }, { "epoch": 0.07874984619170666, "grad_norm": 20.797266006469727, "learning_rate": 5.245901639344263e-06, "loss": 0.9884, "mean_token_accuracy": 0.7246918171644211, "step": 80 }, { "epoch": 0.08367171157868833, "grad_norm": 24.53761100769043, "learning_rate": 5.573770491803278e-06, "loss": 0.9471, "mean_token_accuracy": 0.7344574183225632, "step": 85 }, { "epoch": 0.08859357696567, "grad_norm": 7.69836950302124, "learning_rate": 5.9016393442622956e-06, "loss": 0.9291, "mean_token_accuracy": 0.7384938269853591, "step": 90 }, { "epoch": 0.09351544235265165, "grad_norm": 0.42971891164779663, "learning_rate": 6.229508196721312e-06, "loss": 0.9071, "mean_token_accuracy": 0.743149445950985, "step": 95 }, { "epoch": 0.09843730773963331, "grad_norm": 0.4011496901512146, "learning_rate": 6.5573770491803276e-06, "loss": 0.8839, "mean_token_accuracy": 0.7489838138222694, "step": 100 }, { "epoch": 0.10335917312661498, "grad_norm": 0.4182426631450653, "learning_rate": 6.885245901639345e-06, "loss": 0.864, "mean_token_accuracy": 0.7533508613705635, "step": 105 }, { "epoch": 0.10828103851359665, "grad_norm": 0.4418739080429077, "learning_rate": 7.213114754098361e-06, "loss": 0.8461, "mean_token_accuracy": 0.7571793958544731, "step": 110 }, { "epoch": 0.11320290390057831, "grad_norm": 4.76384973526001, "learning_rate": 7.540983606557377e-06, "loss": 0.8478, "mean_token_accuracy": 0.7560782924294471, "step": 115 }, { "epoch": 0.11812476928755998, "grad_norm": 0.426782488822937, "learning_rate": 7.868852459016394e-06, "loss": 0.8262, "mean_token_accuracy": 0.7621309965848923, "step": 120 }, { "epoch": 0.12304663467454165, "grad_norm": 3.5404343605041504, "learning_rate": 8.19672131147541e-06, "loss": 0.8239, "mean_token_accuracy": 0.7624999329447746, "step": 125 }, { "epoch": 0.12796850006152333, "grad_norm": 0.6128109097480774, "learning_rate": 8.524590163934427e-06, "loss": 0.8125, "mean_token_accuracy": 0.7650709196925163, "step": 130 }, { "epoch": 0.132890365448505, "grad_norm": 0.4441392719745636, "learning_rate": 8.852459016393443e-06, "loss": 0.8178, "mean_token_accuracy": 0.7635303542017937, "step": 135 }, { "epoch": 0.13781223083548666, "grad_norm": 0.6959536075592041, "learning_rate": 9.18032786885246e-06, "loss": 0.797, "mean_token_accuracy": 0.7682553365826607, "step": 140 }, { "epoch": 0.14273409622246833, "grad_norm": 0.4633159935474396, "learning_rate": 9.508196721311476e-06, "loss": 0.7972, "mean_token_accuracy": 0.7677757993340493, "step": 145 }, { "epoch": 0.14765596160945, "grad_norm": 0.3808494806289673, "learning_rate": 9.836065573770493e-06, "loss": 0.7956, "mean_token_accuracy": 0.7682796508073807, "step": 150 }, { "epoch": 0.15257782699643166, "grad_norm": 1.2230223417282104, "learning_rate": 1.0163934426229509e-05, "loss": 0.7714, "mean_token_accuracy": 0.7741705477237701, "step": 155 }, { "epoch": 0.15749969238341333, "grad_norm": 1.2708261013031006, "learning_rate": 1.0491803278688525e-05, "loss": 0.7671, "mean_token_accuracy": 0.7750522747635842, "step": 160 }, { "epoch": 0.162421557770395, "grad_norm": 0.4153311252593994, "learning_rate": 1.0819672131147544e-05, "loss": 0.762, "mean_token_accuracy": 0.776003035902977, "step": 165 }, { "epoch": 0.16734342315737666, "grad_norm": 0.48690149188041687, "learning_rate": 1.1147540983606557e-05, "loss": 0.7611, "mean_token_accuracy": 0.776053948700428, "step": 170 }, { "epoch": 0.17226528854435832, "grad_norm": 0.3839600682258606, "learning_rate": 1.1475409836065575e-05, "loss": 0.7518, "mean_token_accuracy": 0.7784286484122276, "step": 175 }, { "epoch": 0.17718715393134, "grad_norm": 0.33650702238082886, "learning_rate": 1.1803278688524591e-05, "loss": 0.7425, "mean_token_accuracy": 0.7807790979743003, "step": 180 }, { "epoch": 0.18210901931832166, "grad_norm": 0.34878674149513245, "learning_rate": 1.2131147540983608e-05, "loss": 0.7469, "mean_token_accuracy": 0.779270826280117, "step": 185 }, { "epoch": 0.1870308847053033, "grad_norm": 0.4435058534145355, "learning_rate": 1.2459016393442624e-05, "loss": 0.7414, "mean_token_accuracy": 0.7804962411522866, "step": 190 }, { "epoch": 0.19195275009228496, "grad_norm": 0.34793269634246826, "learning_rate": 1.2786885245901642e-05, "loss": 0.7368, "mean_token_accuracy": 0.7817707493901253, "step": 195 }, { "epoch": 0.19687461547926663, "grad_norm": 0.32821062207221985, "learning_rate": 1.3114754098360655e-05, "loss": 0.7309, "mean_token_accuracy": 0.7830819576978684, "step": 200 }, { "epoch": 0.2017964808662483, "grad_norm": 0.3908160626888275, "learning_rate": 1.3442622950819673e-05, "loss": 0.7349, "mean_token_accuracy": 0.7820746794342994, "step": 205 }, { "epoch": 0.20671834625322996, "grad_norm": 1.239039659500122, "learning_rate": 1.377049180327869e-05, "loss": 0.7315, "mean_token_accuracy": 0.7830250725150109, "step": 210 }, { "epoch": 0.21164021164021163, "grad_norm": 0.437558650970459, "learning_rate": 1.4098360655737706e-05, "loss": 0.7213, "mean_token_accuracy": 0.785545514523983, "step": 215 }, { "epoch": 0.2165620770271933, "grad_norm": 0.3581276535987854, "learning_rate": 1.4426229508196722e-05, "loss": 0.7156, "mean_token_accuracy": 0.7868386089801789, "step": 220 }, { "epoch": 0.22148394241417496, "grad_norm": 0.393839031457901, "learning_rate": 1.4754098360655739e-05, "loss": 0.7108, "mean_token_accuracy": 0.7875275865197182, "step": 225 }, { "epoch": 0.22640580780115663, "grad_norm": 0.4203226566314697, "learning_rate": 1.5081967213114754e-05, "loss": 0.7115, "mean_token_accuracy": 0.7875282734632492, "step": 230 }, { "epoch": 0.2313276731881383, "grad_norm": 0.4379311501979828, "learning_rate": 1.5409836065573772e-05, "loss": 0.7176, "mean_token_accuracy": 0.7859495177865028, "step": 235 }, { "epoch": 0.23624953857511996, "grad_norm": 0.5987364053726196, "learning_rate": 1.5737704918032788e-05, "loss": 0.7047, "mean_token_accuracy": 0.7892461016774177, "step": 240 }, { "epoch": 0.24117140396210163, "grad_norm": 0.39721059799194336, "learning_rate": 1.6065573770491805e-05, "loss": 0.7082, "mean_token_accuracy": 0.7879156336188317, "step": 245 }, { "epoch": 0.2460932693490833, "grad_norm": 0.35150638222694397, "learning_rate": 1.639344262295082e-05, "loss": 0.7015, "mean_token_accuracy": 0.7899731829762459, "step": 250 }, { "epoch": 0.25101513473606496, "grad_norm": 0.37812677025794983, "learning_rate": 1.6721311475409837e-05, "loss": 0.7112, "mean_token_accuracy": 0.7869908154010773, "step": 255 }, { "epoch": 0.25593700012304665, "grad_norm": 0.37921008467674255, "learning_rate": 1.7049180327868854e-05, "loss": 0.695, "mean_token_accuracy": 0.7912393018603325, "step": 260 }, { "epoch": 0.2608588655100283, "grad_norm": 0.3776193857192993, "learning_rate": 1.737704918032787e-05, "loss": 0.6975, "mean_token_accuracy": 0.7903847828507423, "step": 265 }, { "epoch": 0.26578073089701, "grad_norm": 0.34160885214805603, "learning_rate": 1.7704918032786887e-05, "loss": 0.7005, "mean_token_accuracy": 0.7901133581995964, "step": 270 }, { "epoch": 0.2707025962839916, "grad_norm": 0.3151760399341583, "learning_rate": 1.8032786885245903e-05, "loss": 0.6838, "mean_token_accuracy": 0.7940751999616623, "step": 275 }, { "epoch": 0.2756244616709733, "grad_norm": 0.3251655101776123, "learning_rate": 1.836065573770492e-05, "loss": 0.683, "mean_token_accuracy": 0.7942519947886467, "step": 280 }, { "epoch": 0.28054632705795496, "grad_norm": 0.392980694770813, "learning_rate": 1.8688524590163936e-05, "loss": 0.6779, "mean_token_accuracy": 0.7953907087445259, "step": 285 }, { "epoch": 0.28546819244493665, "grad_norm": 0.42777085304260254, "learning_rate": 1.9016393442622952e-05, "loss": 0.696, "mean_token_accuracy": 0.7913835749030114, "step": 290 }, { "epoch": 0.2903900578319183, "grad_norm": 0.38064613938331604, "learning_rate": 1.934426229508197e-05, "loss": 0.6777, "mean_token_accuracy": 0.79527537971735, "step": 295 }, { "epoch": 0.2953119232189, "grad_norm": 0.35906219482421875, "learning_rate": 1.9672131147540985e-05, "loss": 0.6772, "mean_token_accuracy": 0.7954441845417023, "step": 300 }, { "epoch": 0.3002337886058816, "grad_norm": 0.4336443543434143, "learning_rate": 2e-05, "loss": 0.6672, "mean_token_accuracy": 0.7982369065284729, "step": 305 }, { "epoch": 0.3051556539928633, "grad_norm": 0.35013464093208313, "learning_rate": 1.9999835673561284e-05, "loss": 0.6823, "mean_token_accuracy": 0.7940784975886345, "step": 310 }, { "epoch": 0.31007751937984496, "grad_norm": 0.4209573566913605, "learning_rate": 1.9999342699645774e-05, "loss": 0.6705, "mean_token_accuracy": 0.7970875754952431, "step": 315 }, { "epoch": 0.31499938476682665, "grad_norm": 0.3402932584285736, "learning_rate": 1.9998521094455198e-05, "loss": 0.6733, "mean_token_accuracy": 0.7962517961859703, "step": 320 }, { "epoch": 0.3199212501538083, "grad_norm": 0.3613898456096649, "learning_rate": 1.9997370884991842e-05, "loss": 0.6659, "mean_token_accuracy": 0.7986094921827316, "step": 325 }, { "epoch": 0.32484311554079, "grad_norm": 0.8141839504241943, "learning_rate": 1.9995892109057675e-05, "loss": 0.6682, "mean_token_accuracy": 0.7979325890541077, "step": 330 }, { "epoch": 0.3297649809277716, "grad_norm": 0.32822492718696594, "learning_rate": 1.99940848152531e-05, "loss": 0.6592, "mean_token_accuracy": 0.799762362241745, "step": 335 }, { "epoch": 0.3346868463147533, "grad_norm": 0.32193639874458313, "learning_rate": 1.9991949062975336e-05, "loss": 0.6669, "mean_token_accuracy": 0.7977916583418846, "step": 340 }, { "epoch": 0.33960871170173496, "grad_norm": 0.6516172885894775, "learning_rate": 1.9989484922416503e-05, "loss": 0.6636, "mean_token_accuracy": 0.7989253982901573, "step": 345 }, { "epoch": 0.34453057708871665, "grad_norm": 0.6252678036689758, "learning_rate": 1.9986692474561292e-05, "loss": 0.6549, "mean_token_accuracy": 0.8010424450039864, "step": 350 }, { "epoch": 0.3494524424756983, "grad_norm": 0.39426907896995544, "learning_rate": 1.9983571811184297e-05, "loss": 0.6583, "mean_token_accuracy": 0.8001298069953918, "step": 355 }, { "epoch": 0.35437430786268, "grad_norm": 0.4398311972618103, "learning_rate": 1.9980123034847025e-05, "loss": 0.6569, "mean_token_accuracy": 0.8002386093139648, "step": 360 }, { "epoch": 0.3592961732496616, "grad_norm": 0.36181896924972534, "learning_rate": 1.9976346258894502e-05, "loss": 0.6572, "mean_token_accuracy": 0.7999640181660652, "step": 365 }, { "epoch": 0.3642180386366433, "grad_norm": 0.33937492966651917, "learning_rate": 1.9972241607451552e-05, "loss": 0.6534, "mean_token_accuracy": 0.8008638471364975, "step": 370 }, { "epoch": 0.36913990402362495, "grad_norm": 0.3220241665840149, "learning_rate": 1.996780921541873e-05, "loss": 0.6491, "mean_token_accuracy": 0.8024497851729393, "step": 375 }, { "epoch": 0.3740617694106066, "grad_norm": 0.3588990867137909, "learning_rate": 1.9963049228467875e-05, "loss": 0.6519, "mean_token_accuracy": 0.8013440445065498, "step": 380 }, { "epoch": 0.3789836347975883, "grad_norm": 0.3850741982460022, "learning_rate": 1.9957961803037325e-05, "loss": 0.6539, "mean_token_accuracy": 0.8007026329636574, "step": 385 }, { "epoch": 0.3839055001845699, "grad_norm": 0.39418673515319824, "learning_rate": 1.9952547106326787e-05, "loss": 0.6511, "mean_token_accuracy": 0.8013290241360664, "step": 390 }, { "epoch": 0.3888273655715516, "grad_norm": 0.33889254927635193, "learning_rate": 1.9946805316291817e-05, "loss": 0.6523, "mean_token_accuracy": 0.8005807921290398, "step": 395 }, { "epoch": 0.39374923095853326, "grad_norm": 0.7381798624992371, "learning_rate": 1.9940736621638e-05, "loss": 0.649, "mean_token_accuracy": 0.8016207367181778, "step": 400 }, { "epoch": 0.39867109634551495, "grad_norm": 0.3772973120212555, "learning_rate": 1.993434122181474e-05, "loss": 0.6458, "mean_token_accuracy": 0.802768674492836, "step": 405 }, { "epoch": 0.4035929617324966, "grad_norm": 0.33333730697631836, "learning_rate": 1.992761932700868e-05, "loss": 0.6444, "mean_token_accuracy": 0.8025879472494125, "step": 410 }, { "epoch": 0.4085148271194783, "grad_norm": 0.3165677785873413, "learning_rate": 1.9920571158136837e-05, "loss": 0.639, "mean_token_accuracy": 0.8042329683899879, "step": 415 }, { "epoch": 0.4134366925064599, "grad_norm": 0.3313787579536438, "learning_rate": 1.9913196946839304e-05, "loss": 0.6422, "mean_token_accuracy": 0.803669148683548, "step": 420 }, { "epoch": 0.4183585578934416, "grad_norm": 0.2832159101963043, "learning_rate": 1.990549693547166e-05, "loss": 0.6378, "mean_token_accuracy": 0.8049987867474556, "step": 425 }, { "epoch": 0.42328042328042326, "grad_norm": 0.3278089463710785, "learning_rate": 1.9897471377096992e-05, "loss": 0.638, "mean_token_accuracy": 0.8043939173221588, "step": 430 }, { "epoch": 0.42820228866740495, "grad_norm": 0.33513346314430237, "learning_rate": 1.9889120535477584e-05, "loss": 0.6366, "mean_token_accuracy": 0.80514996945858, "step": 435 }, { "epoch": 0.4331241540543866, "grad_norm": 0.36697131395339966, "learning_rate": 1.9880444685066252e-05, "loss": 0.6322, "mean_token_accuracy": 0.8064638406038285, "step": 440 }, { "epoch": 0.4380460194413683, "grad_norm": 0.34239935874938965, "learning_rate": 1.987144411099731e-05, "loss": 0.6328, "mean_token_accuracy": 0.8058159291744232, "step": 445 }, { "epoch": 0.4429678848283499, "grad_norm": 0.29778754711151123, "learning_rate": 1.9862119109077226e-05, "loss": 0.6442, "mean_token_accuracy": 0.8030599504709244, "step": 450 }, { "epoch": 0.4478897502153316, "grad_norm": 0.31139907240867615, "learning_rate": 1.985246998577486e-05, "loss": 0.6507, "mean_token_accuracy": 0.8007849171757698, "step": 455 }, { "epoch": 0.45281161560231326, "grad_norm": 0.32070034742355347, "learning_rate": 1.984249705821143e-05, "loss": 0.6405, "mean_token_accuracy": 0.8038340613245964, "step": 460 }, { "epoch": 0.45773348098929495, "grad_norm": 0.3086022734642029, "learning_rate": 1.9832200654150077e-05, "loss": 0.6316, "mean_token_accuracy": 0.8058078184723854, "step": 465 }, { "epoch": 0.4626553463762766, "grad_norm": 0.30972251296043396, "learning_rate": 1.9821581111985072e-05, "loss": 0.6343, "mean_token_accuracy": 0.8051379904150963, "step": 470 }, { "epoch": 0.4675772117632583, "grad_norm": 0.2832852005958557, "learning_rate": 1.981063878073073e-05, "loss": 0.6324, "mean_token_accuracy": 0.8058837354183197, "step": 475 }, { "epoch": 0.4724990771502399, "grad_norm": 0.909318208694458, "learning_rate": 1.979937402000991e-05, "loss": 0.6319, "mean_token_accuracy": 0.8056973502039909, "step": 480 }, { "epoch": 0.4774209425372216, "grad_norm": 0.31788304448127747, "learning_rate": 1.9787787200042224e-05, "loss": 0.6354, "mean_token_accuracy": 0.8051144614815712, "step": 485 }, { "epoch": 0.48234280792420325, "grad_norm": 0.2922450602054596, "learning_rate": 1.977587870163184e-05, "loss": 0.6278, "mean_token_accuracy": 0.8066384568810463, "step": 490 }, { "epoch": 0.48726467331118495, "grad_norm": 0.287406325340271, "learning_rate": 1.9763648916154982e-05, "loss": 0.6271, "mean_token_accuracy": 0.8069956362247467, "step": 495 }, { "epoch": 0.4921865386981666, "grad_norm": 0.34040403366088867, "learning_rate": 1.975109824554707e-05, "loss": 0.6288, "mean_token_accuracy": 0.806525257229805, "step": 500 }, { "epoch": 0.4971084040851483, "grad_norm": 0.3302447199821472, "learning_rate": 1.973822710228951e-05, "loss": 0.6257, "mean_token_accuracy": 0.8072399228811264, "step": 505 }, { "epoch": 0.5020302694721299, "grad_norm": 0.288161963224411, "learning_rate": 1.972503590939612e-05, "loss": 0.6234, "mean_token_accuracy": 0.8078823387622833, "step": 510 }, { "epoch": 0.5069521348591116, "grad_norm": 0.3387835919857025, "learning_rate": 1.971152510039926e-05, "loss": 0.6269, "mean_token_accuracy": 0.8067226454615593, "step": 515 }, { "epoch": 0.5118740002460933, "grad_norm": 0.290519118309021, "learning_rate": 1.9697695119335547e-05, "loss": 0.6213, "mean_token_accuracy": 0.8083379164338111, "step": 520 }, { "epoch": 0.5167958656330749, "grad_norm": 0.3701138496398926, "learning_rate": 1.9683546420731292e-05, "loss": 0.6246, "mean_token_accuracy": 0.8079604268074035, "step": 525 }, { "epoch": 0.5217177310200566, "grad_norm": 0.39614954590797424, "learning_rate": 1.9669079469587548e-05, "loss": 0.6287, "mean_token_accuracy": 0.8067878499627114, "step": 530 }, { "epoch": 0.5266395964070383, "grad_norm": 0.32784542441368103, "learning_rate": 1.965429474136482e-05, "loss": 0.6156, "mean_token_accuracy": 0.8098407059907913, "step": 535 }, { "epoch": 0.53156146179402, "grad_norm": 0.30213144421577454, "learning_rate": 1.963919272196746e-05, "loss": 0.6207, "mean_token_accuracy": 0.8086924180388451, "step": 540 }, { "epoch": 0.5364833271810016, "grad_norm": 0.32220178842544556, "learning_rate": 1.9623773907727682e-05, "loss": 0.6157, "mean_token_accuracy": 0.8098208606243134, "step": 545 }, { "epoch": 0.5414051925679833, "grad_norm": 0.3250666856765747, "learning_rate": 1.9608038805389253e-05, "loss": 0.6195, "mean_token_accuracy": 0.8085113659501075, "step": 550 }, { "epoch": 0.546327057954965, "grad_norm": 0.36724722385406494, "learning_rate": 1.9591987932090836e-05, "loss": 0.6115, "mean_token_accuracy": 0.8109661117196083, "step": 555 }, { "epoch": 0.5512489233419466, "grad_norm": 0.30343472957611084, "learning_rate": 1.9575621815349e-05, "loss": 0.6204, "mean_token_accuracy": 0.8083494484424592, "step": 560 }, { "epoch": 0.5561707887289282, "grad_norm": 0.3323419988155365, "learning_rate": 1.9558940993040885e-05, "loss": 0.6232, "mean_token_accuracy": 0.8077159106731415, "step": 565 }, { "epoch": 0.5610926541159099, "grad_norm": 0.31035885214805603, "learning_rate": 1.954194601338651e-05, "loss": 0.6157, "mean_token_accuracy": 0.8096732005476952, "step": 570 }, { "epoch": 0.5660145195028916, "grad_norm": 0.2931119501590729, "learning_rate": 1.952463743493078e-05, "loss": 0.6199, "mean_token_accuracy": 0.808499938249588, "step": 575 }, { "epoch": 0.5709363848898733, "grad_norm": 0.27563023567199707, "learning_rate": 1.9507015826525096e-05, "loss": 0.6046, "mean_token_accuracy": 0.8128907606005669, "step": 580 }, { "epoch": 0.5758582502768549, "grad_norm": 0.28453299403190613, "learning_rate": 1.9489081767308696e-05, "loss": 0.6105, "mean_token_accuracy": 0.8113355338573456, "step": 585 }, { "epoch": 0.5807801156638366, "grad_norm": 0.37042465806007385, "learning_rate": 1.9470835846689596e-05, "loss": 0.6127, "mean_token_accuracy": 0.8106034889817237, "step": 590 }, { "epoch": 0.5857019810508183, "grad_norm": 0.2963549792766571, "learning_rate": 1.9452278664325227e-05, "loss": 0.6194, "mean_token_accuracy": 0.8086869075894356, "step": 595 }, { "epoch": 0.5906238464378, "grad_norm": 0.2905316948890686, "learning_rate": 1.9433410830102724e-05, "loss": 0.61, "mean_token_accuracy": 0.811042046546936, "step": 600 }, { "epoch": 0.5955457118247816, "grad_norm": 0.2674277424812317, "learning_rate": 1.9414232964118893e-05, "loss": 0.6119, "mean_token_accuracy": 0.8104571312665939, "step": 605 }, { "epoch": 0.6004675772117632, "grad_norm": 0.28245261311531067, "learning_rate": 1.939474569665981e-05, "loss": 0.6115, "mean_token_accuracy": 0.8106845885515213, "step": 610 }, { "epoch": 0.6053894425987449, "grad_norm": 0.2713403105735779, "learning_rate": 1.937494966818014e-05, "loss": 0.6096, "mean_token_accuracy": 0.8106750875711441, "step": 615 }, { "epoch": 0.6103113079857266, "grad_norm": 0.31770050525665283, "learning_rate": 1.9354845529282042e-05, "loss": 0.6142, "mean_token_accuracy": 0.8098479628562927, "step": 620 }, { "epoch": 0.6152331733727082, "grad_norm": 0.28526055812835693, "learning_rate": 1.933443394069383e-05, "loss": 0.6062, "mean_token_accuracy": 0.8120482847094536, "step": 625 }, { "epoch": 0.6201550387596899, "grad_norm": 0.5695453882217407, "learning_rate": 1.9313715573248238e-05, "loss": 0.6122, "mean_token_accuracy": 0.8099897101521492, "step": 630 }, { "epoch": 0.6250769041466716, "grad_norm": 0.2738396227359772, "learning_rate": 1.9292691107860374e-05, "loss": 0.6031, "mean_token_accuracy": 0.8127053424715995, "step": 635 }, { "epoch": 0.6299987695336533, "grad_norm": 0.28948965668678284, "learning_rate": 1.927136123550534e-05, "loss": 0.6115, "mean_token_accuracy": 0.8103477448225022, "step": 640 }, { "epoch": 0.6349206349206349, "grad_norm": 0.27830740809440613, "learning_rate": 1.9249726657195534e-05, "loss": 0.608, "mean_token_accuracy": 0.8116561621427536, "step": 645 }, { "epoch": 0.6398425003076166, "grad_norm": 0.2712289094924927, "learning_rate": 1.922778808395759e-05, "loss": 0.6054, "mean_token_accuracy": 0.8125208973884582, "step": 650 }, { "epoch": 0.6447643656945983, "grad_norm": 0.29063907265663147, "learning_rate": 1.9205546236809037e-05, "loss": 0.6047, "mean_token_accuracy": 0.8123130992054939, "step": 655 }, { "epoch": 0.64968623108158, "grad_norm": 0.293261855840683, "learning_rate": 1.9183001846734573e-05, "loss": 0.603, "mean_token_accuracy": 0.8129645109176635, "step": 660 }, { "epoch": 0.6546080964685616, "grad_norm": 0.2849041223526001, "learning_rate": 1.9160155654662075e-05, "loss": 0.5926, "mean_token_accuracy": 0.8157610684633255, "step": 665 }, { "epoch": 0.6595299618555432, "grad_norm": 0.2975578010082245, "learning_rate": 1.9137008411438213e-05, "loss": 0.6034, "mean_token_accuracy": 0.8125734269618988, "step": 670 }, { "epoch": 0.6644518272425249, "grad_norm": 0.286842405796051, "learning_rate": 1.9113560877803798e-05, "loss": 0.6045, "mean_token_accuracy": 0.8125320598483086, "step": 675 }, { "epoch": 0.6693736926295066, "grad_norm": 0.33480602502822876, "learning_rate": 1.9089813824368765e-05, "loss": 0.5975, "mean_token_accuracy": 0.8142675384879112, "step": 680 }, { "epoch": 0.6742955580164882, "grad_norm": 0.29252228140830994, "learning_rate": 1.9065768031586864e-05, "loss": 0.6056, "mean_token_accuracy": 0.8120014935731887, "step": 685 }, { "epoch": 0.6792174234034699, "grad_norm": 0.2882521450519562, "learning_rate": 1.9041424289729994e-05, "loss": 0.595, "mean_token_accuracy": 0.8150214269757271, "step": 690 }, { "epoch": 0.6841392887904516, "grad_norm": 0.29731523990631104, "learning_rate": 1.901678339886223e-05, "loss": 0.6013, "mean_token_accuracy": 0.8131750777363778, "step": 695 }, { "epoch": 0.6890611541774333, "grad_norm": 0.26834896206855774, "learning_rate": 1.8991846168813547e-05, "loss": 0.5918, "mean_token_accuracy": 0.8156168267130852, "step": 700 }, { "epoch": 0.6939830195644149, "grad_norm": 0.29199543595314026, "learning_rate": 1.896661341915318e-05, "loss": 0.6033, "mean_token_accuracy": 0.8124941572546959, "step": 705 }, { "epoch": 0.6989048849513966, "grad_norm": 0.28719085454940796, "learning_rate": 1.8941085979162714e-05, "loss": 0.5992, "mean_token_accuracy": 0.8138533607125282, "step": 710 }, { "epoch": 0.7038267503383783, "grad_norm": 0.28042468428611755, "learning_rate": 1.891526468780881e-05, "loss": 0.605, "mean_token_accuracy": 0.8121193930506706, "step": 715 }, { "epoch": 0.70874861572536, "grad_norm": 0.272483766078949, "learning_rate": 1.8889150393715627e-05, "loss": 0.5943, "mean_token_accuracy": 0.8147971466183662, "step": 720 }, { "epoch": 0.7136704811123415, "grad_norm": 0.24886226654052734, "learning_rate": 1.8862743955136966e-05, "loss": 0.5957, "mean_token_accuracy": 0.8145680665969849, "step": 725 }, { "epoch": 0.7185923464993232, "grad_norm": 0.26445212960243225, "learning_rate": 1.8836046239928025e-05, "loss": 0.5948, "mean_token_accuracy": 0.8148575246334075, "step": 730 }, { "epoch": 0.7235142118863049, "grad_norm": 0.2891506850719452, "learning_rate": 1.8809058125516894e-05, "loss": 0.5968, "mean_token_accuracy": 0.8141703933477402, "step": 735 }, { "epoch": 0.7284360772732866, "grad_norm": 0.28364264965057373, "learning_rate": 1.8781780498875727e-05, "loss": 0.6035, "mean_token_accuracy": 0.8124788105487823, "step": 740 }, { "epoch": 0.7333579426602682, "grad_norm": 0.2917366921901703, "learning_rate": 1.8754214256491564e-05, "loss": 0.5928, "mean_token_accuracy": 0.8153851807117463, "step": 745 }, { "epoch": 0.7382798080472499, "grad_norm": 0.2714190185070038, "learning_rate": 1.8726360304336896e-05, "loss": 0.601, "mean_token_accuracy": 0.8129221558570862, "step": 750 }, { "epoch": 0.7432016734342316, "grad_norm": 0.29474568367004395, "learning_rate": 1.8698219557839875e-05, "loss": 0.5963, "mean_token_accuracy": 0.8142225205898285, "step": 755 }, { "epoch": 0.7481235388212132, "grad_norm": 0.2684454619884491, "learning_rate": 1.866979294185423e-05, "loss": 0.5933, "mean_token_accuracy": 0.8149216592311859, "step": 760 }, { "epoch": 0.7530454042081949, "grad_norm": 0.26693102717399597, "learning_rate": 1.864108139062888e-05, "loss": 0.5908, "mean_token_accuracy": 0.8157912597060204, "step": 765 }, { "epoch": 0.7579672695951766, "grad_norm": 0.27418771386146545, "learning_rate": 1.8612085847777215e-05, "loss": 0.5913, "mean_token_accuracy": 0.8156127855181694, "step": 770 }, { "epoch": 0.7628891349821583, "grad_norm": 0.30855274200439453, "learning_rate": 1.858280726624609e-05, "loss": 0.5922, "mean_token_accuracy": 0.81515374481678, "step": 775 }, { "epoch": 0.7678110003691399, "grad_norm": 0.2978297472000122, "learning_rate": 1.855324660828452e-05, "loss": 0.5999, "mean_token_accuracy": 0.8132428601384163, "step": 780 }, { "epoch": 0.7727328657561215, "grad_norm": 0.30609989166259766, "learning_rate": 1.8523404845412028e-05, "loss": 0.5931, "mean_token_accuracy": 0.8152095600962639, "step": 785 }, { "epoch": 0.7776547311431032, "grad_norm": 0.28423747420310974, "learning_rate": 1.849328295838674e-05, "loss": 0.5939, "mean_token_accuracy": 0.8150446817278862, "step": 790 }, { "epoch": 0.7825765965300849, "grad_norm": 0.39114367961883545, "learning_rate": 1.8462881937173144e-05, "loss": 0.5886, "mean_token_accuracy": 0.8164272159337997, "step": 795 }, { "epoch": 0.7874984619170665, "grad_norm": 0.2761843502521515, "learning_rate": 1.8432202780909542e-05, "loss": 0.594, "mean_token_accuracy": 0.8146432772278785, "step": 800 }, { "epoch": 0.7924203273040482, "grad_norm": 0.26402318477630615, "learning_rate": 1.8401246497875238e-05, "loss": 0.5892, "mean_token_accuracy": 0.8162309199571609, "step": 805 }, { "epoch": 0.7973421926910299, "grad_norm": 0.26799553632736206, "learning_rate": 1.8370014105457378e-05, "loss": 0.5901, "mean_token_accuracy": 0.8156055212020874, "step": 810 }, { "epoch": 0.8022640580780116, "grad_norm": 0.3189884126186371, "learning_rate": 1.8338506630117527e-05, "loss": 0.5821, "mean_token_accuracy": 0.8177683308720589, "step": 815 }, { "epoch": 0.8071859234649932, "grad_norm": 0.26993831992149353, "learning_rate": 1.8306725107357933e-05, "loss": 0.5887, "mean_token_accuracy": 0.8162371620535851, "step": 820 }, { "epoch": 0.8121077888519749, "grad_norm": 0.33908817172050476, "learning_rate": 1.827467058168748e-05, "loss": 0.5932, "mean_token_accuracy": 0.8148850262165069, "step": 825 }, { "epoch": 0.8170296542389566, "grad_norm": 0.2749953866004944, "learning_rate": 1.824234410658738e-05, "loss": 0.5807, "mean_token_accuracy": 0.8185225054621696, "step": 830 }, { "epoch": 0.8219515196259383, "grad_norm": 0.28679126501083374, "learning_rate": 1.8209746744476538e-05, "loss": 0.5844, "mean_token_accuracy": 0.81742594987154, "step": 835 }, { "epoch": 0.8268733850129198, "grad_norm": 0.29817092418670654, "learning_rate": 1.817687956667664e-05, "loss": 0.584, "mean_token_accuracy": 0.8173492252826691, "step": 840 }, { "epoch": 0.8317952503999015, "grad_norm": 0.2705828547477722, "learning_rate": 1.8143743653376944e-05, "loss": 0.5955, "mean_token_accuracy": 0.8145547702908515, "step": 845 }, { "epoch": 0.8367171157868832, "grad_norm": 0.28381243348121643, "learning_rate": 1.811034009359877e-05, "loss": 0.5833, "mean_token_accuracy": 0.8177738025784492, "step": 850 }, { "epoch": 0.8416389811738649, "grad_norm": 0.2846708595752716, "learning_rate": 1.8076669985159726e-05, "loss": 0.5817, "mean_token_accuracy": 0.8179952159523964, "step": 855 }, { "epoch": 0.8465608465608465, "grad_norm": 0.2997231185436249, "learning_rate": 1.8042734434637615e-05, "loss": 0.5934, "mean_token_accuracy": 0.8149283960461616, "step": 860 }, { "epoch": 0.8514827119478282, "grad_norm": 0.29204457998275757, "learning_rate": 1.8008534557334064e-05, "loss": 0.5795, "mean_token_accuracy": 0.8184737205505371, "step": 865 }, { "epoch": 0.8564045773348099, "grad_norm": 0.30441614985466003, "learning_rate": 1.7974071477237887e-05, "loss": 0.585, "mean_token_accuracy": 0.8171376779675483, "step": 870 }, { "epoch": 0.8613264427217916, "grad_norm": 0.2779221832752228, "learning_rate": 1.7939346326988127e-05, "loss": 0.5889, "mean_token_accuracy": 0.8160797134041786, "step": 875 }, { "epoch": 0.8662483081087732, "grad_norm": 0.250242680311203, "learning_rate": 1.7904360247836838e-05, "loss": 0.5894, "mean_token_accuracy": 0.81572295576334, "step": 880 }, { "epoch": 0.8711701734957549, "grad_norm": 0.26801884174346924, "learning_rate": 1.7869114389611574e-05, "loss": 0.5853, "mean_token_accuracy": 0.8168028473854065, "step": 885 }, { "epoch": 0.8760920388827366, "grad_norm": 0.33699533343315125, "learning_rate": 1.7833609910677613e-05, "loss": 0.5804, "mean_token_accuracy": 0.8181165441870689, "step": 890 }, { "epoch": 0.8810139042697183, "grad_norm": 0.28362491726875305, "learning_rate": 1.7797847977899873e-05, "loss": 0.5823, "mean_token_accuracy": 0.8177706867456436, "step": 895 }, { "epoch": 0.8859357696566998, "grad_norm": 0.2863147556781769, "learning_rate": 1.7761829766604556e-05, "loss": 0.5797, "mean_token_accuracy": 0.8185298308730126, "step": 900 }, { "epoch": 0.8908576350436815, "grad_norm": 0.27263742685317993, "learning_rate": 1.7725556460540553e-05, "loss": 0.5825, "mean_token_accuracy": 0.8175166144967079, "step": 905 }, { "epoch": 0.8957795004306632, "grad_norm": 0.28120777010917664, "learning_rate": 1.7689029251840492e-05, "loss": 0.5788, "mean_token_accuracy": 0.8185988172888756, "step": 910 }, { "epoch": 0.9007013658176449, "grad_norm": 0.3469211459159851, "learning_rate": 1.7652249340981608e-05, "loss": 0.5877, "mean_token_accuracy": 0.8159551978111267, "step": 915 }, { "epoch": 0.9056232312046265, "grad_norm": 0.3101508617401123, "learning_rate": 1.7615217936746246e-05, "loss": 0.5819, "mean_token_accuracy": 0.8174650520086288, "step": 920 }, { "epoch": 0.9105450965916082, "grad_norm": 0.38838618993759155, "learning_rate": 1.757793625618217e-05, "loss": 0.5755, "mean_token_accuracy": 0.8196040257811547, "step": 925 }, { "epoch": 0.9154669619785899, "grad_norm": 0.3253493309020996, "learning_rate": 1.7540405524562533e-05, "loss": 0.5777, "mean_token_accuracy": 0.8182825416326522, "step": 930 }, { "epoch": 0.9203888273655716, "grad_norm": 0.2917826175689697, "learning_rate": 1.750262697534563e-05, "loss": 0.5809, "mean_token_accuracy": 0.8180661648511887, "step": 935 }, { "epoch": 0.9253106927525532, "grad_norm": 0.25714483857154846, "learning_rate": 1.7464601850134353e-05, "loss": 0.5752, "mean_token_accuracy": 0.8194984391331672, "step": 940 }, { "epoch": 0.9302325581395349, "grad_norm": 0.28597357869148254, "learning_rate": 1.742633139863538e-05, "loss": 0.579, "mean_token_accuracy": 0.8184013769030571, "step": 945 }, { "epoch": 0.9351544235265166, "grad_norm": 0.9777734875679016, "learning_rate": 1.738781687861812e-05, "loss": 0.5789, "mean_token_accuracy": 0.8188063263893127, "step": 950 }, { "epoch": 0.9400762889134983, "grad_norm": 0.26717498898506165, "learning_rate": 1.7349059555873348e-05, "loss": 0.5754, "mean_token_accuracy": 0.8191799059510231, "step": 955 }, { "epoch": 0.9449981543004798, "grad_norm": 0.29053807258605957, "learning_rate": 1.731006070417163e-05, "loss": 0.5726, "mean_token_accuracy": 0.8204409092664718, "step": 960 }, { "epoch": 0.9499200196874615, "grad_norm": 0.3052172362804413, "learning_rate": 1.7270821605221448e-05, "loss": 0.5764, "mean_token_accuracy": 0.819102555513382, "step": 965 }, { "epoch": 0.9548418850744432, "grad_norm": 0.33640167117118835, "learning_rate": 1.7231343548627085e-05, "loss": 0.5789, "mean_token_accuracy": 0.8184890508651733, "step": 970 }, { "epoch": 0.9597637504614249, "grad_norm": 0.2829669415950775, "learning_rate": 1.7191627831846226e-05, "loss": 0.5803, "mean_token_accuracy": 0.8179109930992127, "step": 975 }, { "epoch": 0.9646856158484065, "grad_norm": 0.2560986280441284, "learning_rate": 1.7151675760147325e-05, "loss": 0.5721, "mean_token_accuracy": 0.8198479250073433, "step": 980 }, { "epoch": 0.9696074812353882, "grad_norm": 0.27663761377334595, "learning_rate": 1.7111488646566728e-05, "loss": 0.5851, "mean_token_accuracy": 0.8171452388167382, "step": 985 }, { "epoch": 0.9745293466223699, "grad_norm": 0.2673356235027313, "learning_rate": 1.7071067811865477e-05, "loss": 0.5751, "mean_token_accuracy": 0.8194502517580986, "step": 990 }, { "epoch": 0.9794512120093516, "grad_norm": 0.2639131546020508, "learning_rate": 1.7030414584485938e-05, "loss": 0.5757, "mean_token_accuracy": 0.8192202031612397, "step": 995 }, { "epoch": 0.9843730773963332, "grad_norm": 0.2639618515968323, "learning_rate": 1.6989530300508126e-05, "loss": 0.576, "mean_token_accuracy": 0.8191347226500512, "step": 1000 }, { "epoch": 0.9892949427833149, "grad_norm": 0.2554817199707031, "learning_rate": 1.6948416303605796e-05, "loss": 0.5778, "mean_token_accuracy": 0.8186899140477181, "step": 1005 }, { "epoch": 0.9942168081702966, "grad_norm": 0.25301820039749146, "learning_rate": 1.690707394500229e-05, "loss": 0.576, "mean_token_accuracy": 0.8191317170858383, "step": 1010 }, { "epoch": 0.9991386735572783, "grad_norm": 0.2470293790102005, "learning_rate": 1.6865504583426117e-05, "loss": 0.5707, "mean_token_accuracy": 0.8204790607094765, "step": 1015 }, { "epoch": 1.0049218653869816, "grad_norm": 0.3501671254634857, "learning_rate": 1.6823709585066308e-05, "loss": 0.6648, "mean_token_accuracy": 0.824617318990754, "step": 1020 }, { "epoch": 1.0098437307739634, "grad_norm": 0.30985623598098755, "learning_rate": 1.6781690323527512e-05, "loss": 0.5503, "mean_token_accuracy": 0.8255873426795006, "step": 1025 }, { "epoch": 1.014765596160945, "grad_norm": 0.2879364788532257, "learning_rate": 1.6739448179784846e-05, "loss": 0.5529, "mean_token_accuracy": 0.8247572600841522, "step": 1030 }, { "epoch": 1.0196874615479268, "grad_norm": 0.27657514810562134, "learning_rate": 1.669698454213852e-05, "loss": 0.55, "mean_token_accuracy": 0.8258542969822884, "step": 1035 }, { "epoch": 1.0246093269349084, "grad_norm": 0.259316623210907, "learning_rate": 1.665430080616821e-05, "loss": 0.5435, "mean_token_accuracy": 0.8273309215903282, "step": 1040 }, { "epoch": 1.02953119232189, "grad_norm": 0.27227073907852173, "learning_rate": 1.6611398374687172e-05, "loss": 0.5494, "mean_token_accuracy": 0.8259153485298156, "step": 1045 }, { "epoch": 1.0344530577088717, "grad_norm": 0.2718289792537689, "learning_rate": 1.6568278657696166e-05, "loss": 0.5445, "mean_token_accuracy": 0.827112241089344, "step": 1050 }, { "epoch": 1.0393749230958533, "grad_norm": 0.28744345903396606, "learning_rate": 1.6524943072337094e-05, "loss": 0.5501, "mean_token_accuracy": 0.8256638810038567, "step": 1055 }, { "epoch": 1.044296788482835, "grad_norm": 0.26266416907310486, "learning_rate": 1.6481393042846442e-05, "loss": 0.5467, "mean_token_accuracy": 0.8264568135142326, "step": 1060 }, { "epoch": 1.0492186538698167, "grad_norm": 0.25888925790786743, "learning_rate": 1.6437630000508466e-05, "loss": 0.5522, "mean_token_accuracy": 0.8247309610247612, "step": 1065 }, { "epoch": 1.0541405192567983, "grad_norm": 0.25061705708503723, "learning_rate": 1.6393655383608132e-05, "loss": 0.5459, "mean_token_accuracy": 0.8267670929431915, "step": 1070 }, { "epoch": 1.0590623846437799, "grad_norm": 0.25011131167411804, "learning_rate": 1.634947063738389e-05, "loss": 0.5483, "mean_token_accuracy": 0.8261876925826073, "step": 1075 }, { "epoch": 1.0639842500307617, "grad_norm": 0.26051655411720276, "learning_rate": 1.630507721398013e-05, "loss": 0.5452, "mean_token_accuracy": 0.82709851115942, "step": 1080 }, { "epoch": 1.0689061154177433, "grad_norm": 0.2643815279006958, "learning_rate": 1.6260476572399494e-05, "loss": 0.5497, "mean_token_accuracy": 0.825461483001709, "step": 1085 }, { "epoch": 1.073827980804725, "grad_norm": 0.3040525019168854, "learning_rate": 1.6215670178454893e-05, "loss": 0.5478, "mean_token_accuracy": 0.8264098614454269, "step": 1090 }, { "epoch": 1.0787498461917067, "grad_norm": 0.28461357951164246, "learning_rate": 1.6170659504721365e-05, "loss": 0.5474, "mean_token_accuracy": 0.8261038646101951, "step": 1095 }, { "epoch": 1.0836717115786882, "grad_norm": 0.24723611772060394, "learning_rate": 1.6125446030487642e-05, "loss": 0.542, "mean_token_accuracy": 0.8277976959943771, "step": 1100 }, { "epoch": 1.08859357696567, "grad_norm": 0.4478602707386017, "learning_rate": 1.608003124170758e-05, "loss": 0.5435, "mean_token_accuracy": 0.8271990329027176, "step": 1105 }, { "epoch": 1.0935154423526516, "grad_norm": 0.2758786082267761, "learning_rate": 1.6034416630951265e-05, "loss": 0.5546, "mean_token_accuracy": 0.8245001256465911, "step": 1110 }, { "epoch": 1.0984373077396332, "grad_norm": 0.8616223335266113, "learning_rate": 1.598860369735601e-05, "loss": 0.5419, "mean_token_accuracy": 0.827488873898983, "step": 1115 }, { "epoch": 1.103359173126615, "grad_norm": 0.24690531194210052, "learning_rate": 1.594259394657707e-05, "loss": 0.5493, "mean_token_accuracy": 0.8259517803788186, "step": 1120 }, { "epoch": 1.1082810385135966, "grad_norm": 0.24601490795612335, "learning_rate": 1.589638889073813e-05, "loss": 0.5563, "mean_token_accuracy": 0.8240275859832764, "step": 1125 }, { "epoch": 1.1132029039005784, "grad_norm": 0.32801708579063416, "learning_rate": 1.584999004838165e-05, "loss": 0.5474, "mean_token_accuracy": 0.8265691444277763, "step": 1130 }, { "epoch": 1.11812476928756, "grad_norm": 0.25093355774879456, "learning_rate": 1.5803398944418934e-05, "loss": 0.5426, "mean_token_accuracy": 0.8273544386029243, "step": 1135 }, { "epoch": 1.1230466346745416, "grad_norm": 0.2600312829017639, "learning_rate": 1.5756617110080023e-05, "loss": 0.5522, "mean_token_accuracy": 0.8249027922749519, "step": 1140 }, { "epoch": 1.1279685000615234, "grad_norm": 0.26066142320632935, "learning_rate": 1.570964608286336e-05, "loss": 0.5442, "mean_token_accuracy": 0.8270187392830849, "step": 1145 }, { "epoch": 1.132890365448505, "grad_norm": 0.27738282084465027, "learning_rate": 1.5662487406485273e-05, "loss": 0.5361, "mean_token_accuracy": 0.8295004799962044, "step": 1150 }, { "epoch": 1.1378122308354865, "grad_norm": 0.3502300977706909, "learning_rate": 1.561514263082923e-05, "loss": 0.5482, "mean_token_accuracy": 0.8256632193922997, "step": 1155 }, { "epoch": 1.1427340962224684, "grad_norm": 0.5840310454368591, "learning_rate": 1.5567613311894908e-05, "loss": 0.5337, "mean_token_accuracy": 0.8303180441260338, "step": 1160 }, { "epoch": 1.14765596160945, "grad_norm": 0.2714439034461975, "learning_rate": 1.5519901011747046e-05, "loss": 0.5479, "mean_token_accuracy": 0.8258592769503593, "step": 1165 }, { "epoch": 1.1525778269964317, "grad_norm": 0.2692211866378784, "learning_rate": 1.5472007298464117e-05, "loss": 0.5439, "mean_token_accuracy": 0.8271799921989441, "step": 1170 }, { "epoch": 1.1574996923834133, "grad_norm": 0.2637535631656647, "learning_rate": 1.5423933746086793e-05, "loss": 0.5382, "mean_token_accuracy": 0.8288466781377792, "step": 1175 }, { "epoch": 1.162421557770395, "grad_norm": 0.25311315059661865, "learning_rate": 1.5375681934566203e-05, "loss": 0.5399, "mean_token_accuracy": 0.8281501397490502, "step": 1180 }, { "epoch": 1.1673434231573767, "grad_norm": 0.25321346521377563, "learning_rate": 1.532725344971202e-05, "loss": 0.5482, "mean_token_accuracy": 0.8261646762490272, "step": 1185 }, { "epoch": 1.1722652885443583, "grad_norm": 0.25499051809310913, "learning_rate": 1.527864988314033e-05, "loss": 0.5425, "mean_token_accuracy": 0.8275581628084183, "step": 1190 }, { "epoch": 1.17718715393134, "grad_norm": 0.2546637952327728, "learning_rate": 1.5229872832221336e-05, "loss": 0.5397, "mean_token_accuracy": 0.8283757612109184, "step": 1195 }, { "epoch": 1.1821090193183217, "grad_norm": 0.2738707363605499, "learning_rate": 1.5180923900026847e-05, "loss": 0.5386, "mean_token_accuracy": 0.8282813474535942, "step": 1200 }, { "epoch": 1.1870308847053033, "grad_norm": 0.2539266347885132, "learning_rate": 1.5131804695277612e-05, "loss": 0.5462, "mean_token_accuracy": 0.826425202190876, "step": 1205 }, { "epoch": 1.1919527500922849, "grad_norm": 0.2745126187801361, "learning_rate": 1.5082516832290424e-05, "loss": 0.5404, "mean_token_accuracy": 0.8284027636051178, "step": 1210 }, { "epoch": 1.1968746154792667, "grad_norm": 0.2544495165348053, "learning_rate": 1.5033061930925081e-05, "loss": 0.532, "mean_token_accuracy": 0.8300672218203544, "step": 1215 }, { "epoch": 1.2017964808662482, "grad_norm": 0.27299556136131287, "learning_rate": 1.4983441616531152e-05, "loss": 0.5396, "mean_token_accuracy": 0.8280036672949791, "step": 1220 }, { "epoch": 1.20671834625323, "grad_norm": 0.28981074690818787, "learning_rate": 1.4933657519894542e-05, "loss": 0.5524, "mean_token_accuracy": 0.8247063636779786, "step": 1225 }, { "epoch": 1.2116402116402116, "grad_norm": 0.30510908365249634, "learning_rate": 1.4883711277183917e-05, "loss": 0.5379, "mean_token_accuracy": 0.8288484767079354, "step": 1230 }, { "epoch": 1.2165620770271932, "grad_norm": 0.2616790533065796, "learning_rate": 1.483360452989691e-05, "loss": 0.5415, "mean_token_accuracy": 0.8275775909423828, "step": 1235 }, { "epoch": 1.221483942414175, "grad_norm": 0.2551945745944977, "learning_rate": 1.4783338924806191e-05, "loss": 0.5347, "mean_token_accuracy": 0.8295770674943924, "step": 1240 }, { "epoch": 1.2264058078011566, "grad_norm": 0.28227224946022034, "learning_rate": 1.4732916113905336e-05, "loss": 0.5425, "mean_token_accuracy": 0.8273839592933655, "step": 1245 }, { "epoch": 1.2313276731881384, "grad_norm": 0.260978102684021, "learning_rate": 1.4682337754354534e-05, "loss": 0.5431, "mean_token_accuracy": 0.8270445480942726, "step": 1250 }, { "epoch": 1.23624953857512, "grad_norm": 0.279462605714798, "learning_rate": 1.4631605508426124e-05, "loss": 0.5379, "mean_token_accuracy": 0.828822860121727, "step": 1255 }, { "epoch": 1.2411714039621016, "grad_norm": 0.2665978670120239, "learning_rate": 1.4580721043449968e-05, "loss": 0.5403, "mean_token_accuracy": 0.8279185205698013, "step": 1260 }, { "epoch": 1.2460932693490834, "grad_norm": 0.24216796457767487, "learning_rate": 1.4529686031758642e-05, "loss": 0.5409, "mean_token_accuracy": 0.8280630350112915, "step": 1265 }, { "epoch": 1.251015134736065, "grad_norm": 0.2504848837852478, "learning_rate": 1.4478502150632503e-05, "loss": 0.5389, "mean_token_accuracy": 0.8282234400510788, "step": 1270 }, { "epoch": 1.2559370001230468, "grad_norm": 0.25835323333740234, "learning_rate": 1.4427171082244523e-05, "loss": 0.5471, "mean_token_accuracy": 0.8258385419845581, "step": 1275 }, { "epoch": 1.2608588655100283, "grad_norm": 0.26074373722076416, "learning_rate": 1.4375694513605037e-05, "loss": 0.5413, "mean_token_accuracy": 0.8273946106433868, "step": 1280 }, { "epoch": 1.26578073089701, "grad_norm": 0.2714027762413025, "learning_rate": 1.4324074136506283e-05, "loss": 0.5399, "mean_token_accuracy": 0.8278847292065621, "step": 1285 }, { "epoch": 1.2707025962839915, "grad_norm": 0.24950872361660004, "learning_rate": 1.427231164746681e-05, "loss": 0.5429, "mean_token_accuracy": 0.827368488907814, "step": 1290 }, { "epoch": 1.2756244616709733, "grad_norm": 0.2415134608745575, "learning_rate": 1.4220408747675714e-05, "loss": 0.5417, "mean_token_accuracy": 0.8275652229785919, "step": 1295 }, { "epoch": 1.280546327057955, "grad_norm": 0.23719871044158936, "learning_rate": 1.4168367142936736e-05, "loss": 0.5442, "mean_token_accuracy": 0.8268394738435745, "step": 1300 }, { "epoch": 1.2854681924449367, "grad_norm": 0.2537670135498047, "learning_rate": 1.4116188543612182e-05, "loss": 0.5329, "mean_token_accuracy": 0.8299818679690361, "step": 1305 }, { "epoch": 1.2903900578319183, "grad_norm": 0.2709537446498871, "learning_rate": 1.4063874664566734e-05, "loss": 0.5419, "mean_token_accuracy": 0.8275921046733856, "step": 1310 }, { "epoch": 1.2953119232188999, "grad_norm": 0.26924365758895874, "learning_rate": 1.4011427225111091e-05, "loss": 0.5321, "mean_token_accuracy": 0.8305203005671501, "step": 1315 }, { "epoch": 1.3002337886058817, "grad_norm": 0.2832610607147217, "learning_rate": 1.3958847948945428e-05, "loss": 0.5391, "mean_token_accuracy": 0.8282249644398689, "step": 1320 }, { "epoch": 1.3051556539928633, "grad_norm": 0.2596539258956909, "learning_rate": 1.3906138564102794e-05, "loss": 0.5356, "mean_token_accuracy": 0.829230573773384, "step": 1325 }, { "epoch": 1.310077519379845, "grad_norm": 0.2699119448661804, "learning_rate": 1.3853300802892285e-05, "loss": 0.5417, "mean_token_accuracy": 0.8279038980603218, "step": 1330 }, { "epoch": 1.3149993847668267, "grad_norm": 0.2658538520336151, "learning_rate": 1.380033640184213e-05, "loss": 0.5462, "mean_token_accuracy": 0.8260830625891685, "step": 1335 }, { "epoch": 1.3199212501538082, "grad_norm": 0.25977060198783875, "learning_rate": 1.3747247101642605e-05, "loss": 0.5347, "mean_token_accuracy": 0.8293716937303544, "step": 1340 }, { "epoch": 1.32484311554079, "grad_norm": 0.24537616968154907, "learning_rate": 1.369403464708884e-05, "loss": 0.5367, "mean_token_accuracy": 0.8292932540178299, "step": 1345 }, { "epoch": 1.3297649809277716, "grad_norm": 0.2559899091720581, "learning_rate": 1.3640700787023465e-05, "loss": 0.5398, "mean_token_accuracy": 0.8283236369490623, "step": 1350 }, { "epoch": 1.3346868463147534, "grad_norm": 0.274198979139328, "learning_rate": 1.358724727427914e-05, "loss": 0.5376, "mean_token_accuracy": 0.8286082163453102, "step": 1355 }, { "epoch": 1.339608711701735, "grad_norm": 0.22712701559066772, "learning_rate": 1.3533675865620937e-05, "loss": 0.5336, "mean_token_accuracy": 0.8294816762208939, "step": 1360 }, { "epoch": 1.3445305770887166, "grad_norm": 0.24095574021339417, "learning_rate": 1.3479988321688619e-05, "loss": 0.536, "mean_token_accuracy": 0.829172083735466, "step": 1365 }, { "epoch": 1.3494524424756982, "grad_norm": 0.2448059618473053, "learning_rate": 1.3426186406938769e-05, "loss": 0.5337, "mean_token_accuracy": 0.8295143947005272, "step": 1370 }, { "epoch": 1.35437430786268, "grad_norm": 0.2575864791870117, "learning_rate": 1.337227188958679e-05, "loss": 0.5456, "mean_token_accuracy": 0.8261685460805893, "step": 1375 }, { "epoch": 1.3592961732496616, "grad_norm": 0.25145259499549866, "learning_rate": 1.3318246541548812e-05, "loss": 0.5319, "mean_token_accuracy": 0.8304190933704376, "step": 1380 }, { "epoch": 1.3642180386366434, "grad_norm": 0.2565249502658844, "learning_rate": 1.3264112138383445e-05, "loss": 0.5358, "mean_token_accuracy": 0.8293601229786873, "step": 1385 }, { "epoch": 1.369139904023625, "grad_norm": 0.8961818814277649, "learning_rate": 1.3209870459233422e-05, "loss": 0.528, "mean_token_accuracy": 0.8313272252678872, "step": 1390 }, { "epoch": 1.3740617694106065, "grad_norm": 0.26537856459617615, "learning_rate": 1.315552328676714e-05, "loss": 0.531, "mean_token_accuracy": 0.8308784514665604, "step": 1395 }, { "epoch": 1.3789836347975883, "grad_norm": 0.28985780477523804, "learning_rate": 1.3101072407120056e-05, "loss": 0.5406, "mean_token_accuracy": 0.8277209624648094, "step": 1400 }, { "epoch": 1.38390550018457, "grad_norm": 0.2510998249053955, "learning_rate": 1.3046519609836002e-05, "loss": 0.5406, "mean_token_accuracy": 0.827545890212059, "step": 1405 }, { "epoch": 1.3888273655715517, "grad_norm": 0.2563679814338684, "learning_rate": 1.2991866687808355e-05, "loss": 0.5394, "mean_token_accuracy": 0.8279638543725014, "step": 1410 }, { "epoch": 1.3937492309585333, "grad_norm": 0.2674863338470459, "learning_rate": 1.2937115437221119e-05, "loss": 0.547, "mean_token_accuracy": 0.8261717170476913, "step": 1415 }, { "epoch": 1.398671096345515, "grad_norm": 0.24103465676307678, "learning_rate": 1.2882267657489908e-05, "loss": 0.5428, "mean_token_accuracy": 0.8272509336471557, "step": 1420 }, { "epoch": 1.4035929617324965, "grad_norm": 0.22528545558452606, "learning_rate": 1.2827325151202783e-05, "loss": 0.5368, "mean_token_accuracy": 0.8288370996713639, "step": 1425 }, { "epoch": 1.4085148271194783, "grad_norm": 0.23950906097888947, "learning_rate": 1.2772289724061015e-05, "loss": 0.5309, "mean_token_accuracy": 0.8302434518933296, "step": 1430 }, { "epoch": 1.4134366925064599, "grad_norm": 0.22913850843906403, "learning_rate": 1.2717163184819761e-05, "loss": 0.5397, "mean_token_accuracy": 0.8278713747859001, "step": 1435 }, { "epoch": 1.4183585578934417, "grad_norm": 0.22565315663814545, "learning_rate": 1.2661947345228593e-05, "loss": 0.546, "mean_token_accuracy": 0.826079449057579, "step": 1440 }, { "epoch": 1.4232804232804233, "grad_norm": 0.2397647351026535, "learning_rate": 1.2606644019971967e-05, "loss": 0.5396, "mean_token_accuracy": 0.8280595645308495, "step": 1445 }, { "epoch": 1.4282022886674048, "grad_norm": 0.23136766254901886, "learning_rate": 1.255125502660958e-05, "loss": 0.5288, "mean_token_accuracy": 0.8313645005226136, "step": 1450 }, { "epoch": 1.4331241540543866, "grad_norm": 0.2330116331577301, "learning_rate": 1.2495782185516638e-05, "loss": 0.5364, "mean_token_accuracy": 0.828608725965023, "step": 1455 }, { "epoch": 1.4380460194413682, "grad_norm": 0.23435364663600922, "learning_rate": 1.2440227319824024e-05, "loss": 0.5323, "mean_token_accuracy": 0.8299019247293472, "step": 1460 }, { "epoch": 1.44296788482835, "grad_norm": 0.2517502009868622, "learning_rate": 1.2384592255358385e-05, "loss": 0.537, "mean_token_accuracy": 0.8284672737121582, "step": 1465 }, { "epoch": 1.4478897502153316, "grad_norm": 0.2454364001750946, "learning_rate": 1.2328878820582122e-05, "loss": 0.5282, "mean_token_accuracy": 0.8314993128180503, "step": 1470 }, { "epoch": 1.4528116156023132, "grad_norm": 0.2604913115501404, "learning_rate": 1.2273088846533303e-05, "loss": 0.5404, "mean_token_accuracy": 0.8278495371341705, "step": 1475 }, { "epoch": 1.457733480989295, "grad_norm": 0.277908593416214, "learning_rate": 1.2217224166765478e-05, "loss": 0.5285, "mean_token_accuracy": 0.8310411602258683, "step": 1480 }, { "epoch": 1.4626553463762766, "grad_norm": 0.23699437081813812, "learning_rate": 1.216128661728742e-05, "loss": 0.5359, "mean_token_accuracy": 0.8288247928023338, "step": 1485 }, { "epoch": 1.4675772117632584, "grad_norm": 0.2528901994228363, "learning_rate": 1.2105278036502787e-05, "loss": 0.543, "mean_token_accuracy": 0.8267820864915848, "step": 1490 }, { "epoch": 1.47249907715024, "grad_norm": 0.25504714250564575, "learning_rate": 1.204920026514971e-05, "loss": 0.5391, "mean_token_accuracy": 0.8281295597553253, "step": 1495 }, { "epoch": 1.4774209425372216, "grad_norm": 0.26783859729766846, "learning_rate": 1.1993055146240273e-05, "loss": 0.5325, "mean_token_accuracy": 0.8299062862992287, "step": 1500 }, { "epoch": 1.4823428079242031, "grad_norm": 0.25482243299484253, "learning_rate": 1.1936844524999966e-05, "loss": 0.5271, "mean_token_accuracy": 0.8315476939082146, "step": 1505 }, { "epoch": 1.487264673311185, "grad_norm": 0.2603563964366913, "learning_rate": 1.1880570248807033e-05, "loss": 0.5299, "mean_token_accuracy": 0.8303808271884918, "step": 1510 }, { "epoch": 1.4921865386981665, "grad_norm": 0.2345011830329895, "learning_rate": 1.1824234167131748e-05, "loss": 0.5274, "mean_token_accuracy": 0.8310874328017235, "step": 1515 }, { "epoch": 1.4971084040851483, "grad_norm": 0.3448658883571625, "learning_rate": 1.1767838131475654e-05, "loss": 0.5318, "mean_token_accuracy": 0.8301808550953865, "step": 1520 }, { "epoch": 1.50203026947213, "grad_norm": 0.26358914375305176, "learning_rate": 1.171138399531068e-05, "loss": 0.5341, "mean_token_accuracy": 0.8296466439962387, "step": 1525 }, { "epoch": 1.5069521348591115, "grad_norm": 0.23463788628578186, "learning_rate": 1.1654873614018266e-05, "loss": 0.5337, "mean_token_accuracy": 0.8297147572040557, "step": 1530 }, { "epoch": 1.5118740002460933, "grad_norm": 0.37559443712234497, "learning_rate": 1.1598308844828348e-05, "loss": 0.5281, "mean_token_accuracy": 0.8311620846390724, "step": 1535 }, { "epoch": 1.516795865633075, "grad_norm": 0.24298147857189178, "learning_rate": 1.1541691546758343e-05, "loss": 0.5353, "mean_token_accuracy": 0.8288328930735588, "step": 1540 }, { "epoch": 1.5217177310200567, "grad_norm": 0.2316361665725708, "learning_rate": 1.1485023580552039e-05, "loss": 0.5217, "mean_token_accuracy": 0.8330785930156708, "step": 1545 }, { "epoch": 1.5266395964070383, "grad_norm": 0.22819174826145172, "learning_rate": 1.1428306808618456e-05, "loss": 0.53, "mean_token_accuracy": 0.8303656697273254, "step": 1550 }, { "epoch": 1.5315614617940199, "grad_norm": 0.22326573729515076, "learning_rate": 1.1371543094970624e-05, "loss": 0.53, "mean_token_accuracy": 0.8304451867938042, "step": 1555 }, { "epoch": 1.5364833271810014, "grad_norm": 0.23267020285129547, "learning_rate": 1.131473430516432e-05, "loss": 0.5284, "mean_token_accuracy": 0.8309284761548043, "step": 1560 }, { "epoch": 1.5414051925679833, "grad_norm": 0.3377299904823303, "learning_rate": 1.1257882306236776e-05, "loss": 0.5336, "mean_token_accuracy": 0.8295429393649101, "step": 1565 }, { "epoch": 1.546327057954965, "grad_norm": 0.24768434464931488, "learning_rate": 1.1200988966645286e-05, "loss": 0.5326, "mean_token_accuracy": 0.8297705203294754, "step": 1570 }, { "epoch": 1.5512489233419466, "grad_norm": 0.22998486459255219, "learning_rate": 1.1144056156205834e-05, "loss": 0.5298, "mean_token_accuracy": 0.8307420760393143, "step": 1575 }, { "epoch": 1.5561707887289282, "grad_norm": 0.22251376509666443, "learning_rate": 1.1087085746031612e-05, "loss": 0.528, "mean_token_accuracy": 0.8313020512461662, "step": 1580 }, { "epoch": 1.5610926541159098, "grad_norm": 0.2297334372997284, "learning_rate": 1.1030079608471544e-05, "loss": 0.5335, "mean_token_accuracy": 0.8294809475541115, "step": 1585 }, { "epoch": 1.5660145195028916, "grad_norm": 0.23138615489006042, "learning_rate": 1.0973039617048748e-05, "loss": 0.5333, "mean_token_accuracy": 0.829520358145237, "step": 1590 }, { "epoch": 1.5709363848898734, "grad_norm": 0.23547935485839844, "learning_rate": 1.091596764639895e-05, "loss": 0.5267, "mean_token_accuracy": 0.8314588502049446, "step": 1595 }, { "epoch": 1.575858250276855, "grad_norm": 0.2409500926733017, "learning_rate": 1.0858865572208892e-05, "loss": 0.5346, "mean_token_accuracy": 0.8291632473468781, "step": 1600 }, { "epoch": 1.5807801156638366, "grad_norm": 0.2276252955198288, "learning_rate": 1.080173527115467e-05, "loss": 0.5273, "mean_token_accuracy": 0.831089685857296, "step": 1605 }, { "epoch": 1.5857019810508182, "grad_norm": 0.2589430809020996, "learning_rate": 1.0744578620840065e-05, "loss": 0.5388, "mean_token_accuracy": 0.8279580160975456, "step": 1610 }, { "epoch": 1.5906238464378, "grad_norm": 0.2499450445175171, "learning_rate": 1.0687397499734842e-05, "loss": 0.5268, "mean_token_accuracy": 0.8311406090855599, "step": 1615 }, { "epoch": 1.5955457118247816, "grad_norm": 0.2377663552761078, "learning_rate": 1.0630193787112994e-05, "loss": 0.5257, "mean_token_accuracy": 0.8319837361574173, "step": 1620 }, { "epoch": 1.6004675772117634, "grad_norm": 0.24260112643241882, "learning_rate": 1.0572969362991e-05, "loss": 0.5316, "mean_token_accuracy": 0.8302173331379891, "step": 1625 }, { "epoch": 1.605389442598745, "grad_norm": 1.525187611579895, "learning_rate": 1.0515726108066025e-05, "loss": 0.5315, "mean_token_accuracy": 0.8299267381429672, "step": 1630 }, { "epoch": 1.6103113079857265, "grad_norm": 0.23062676191329956, "learning_rate": 1.0458465903654107e-05, "loss": 0.5298, "mean_token_accuracy": 0.8305988430976867, "step": 1635 }, { "epoch": 1.615233173372708, "grad_norm": 0.23293638229370117, "learning_rate": 1.0401190631628348e-05, "loss": 0.5304, "mean_token_accuracy": 0.8300972327589988, "step": 1640 }, { "epoch": 1.62015503875969, "grad_norm": 0.22877627611160278, "learning_rate": 1.034390217435704e-05, "loss": 0.5287, "mean_token_accuracy": 0.8309306666254997, "step": 1645 }, { "epoch": 1.6250769041466717, "grad_norm": 0.23190174996852875, "learning_rate": 1.0286602414641818e-05, "loss": 0.5303, "mean_token_accuracy": 0.8306381091475487, "step": 1650 }, { "epoch": 1.6299987695336533, "grad_norm": 0.23290394246578217, "learning_rate": 1.0229293235655768e-05, "loss": 0.5221, "mean_token_accuracy": 0.8326445773243905, "step": 1655 }, { "epoch": 1.6349206349206349, "grad_norm": 0.22114625573158264, "learning_rate": 1.0171976520881552e-05, "loss": 0.5263, "mean_token_accuracy": 0.8315576672554016, "step": 1660 }, { "epoch": 1.6398425003076165, "grad_norm": 0.2297578752040863, "learning_rate": 1.011465415404949e-05, "loss": 0.5252, "mean_token_accuracy": 0.8321317434310913, "step": 1665 }, { "epoch": 1.6447643656945983, "grad_norm": 0.23588469624519348, "learning_rate": 1.005732801907567e-05, "loss": 0.5262, "mean_token_accuracy": 0.831513050198555, "step": 1670 }, { "epoch": 1.64968623108158, "grad_norm": 0.22704197466373444, "learning_rate": 1e-05, "loss": 0.5382, "mean_token_accuracy": 0.8281245142221451, "step": 1675 }, { "epoch": 1.6546080964685617, "grad_norm": 0.22588326036930084, "learning_rate": 9.942671980924336e-06, "loss": 0.5286, "mean_token_accuracy": 0.8307414755225182, "step": 1680 }, { "epoch": 1.6595299618555432, "grad_norm": 0.22511065006256104, "learning_rate": 9.88534584595051e-06, "loss": 0.5279, "mean_token_accuracy": 0.83111013174057, "step": 1685 }, { "epoch": 1.6644518272425248, "grad_norm": 0.24989110231399536, "learning_rate": 9.82802347911845e-06, "loss": 0.5257, "mean_token_accuracy": 0.8317268043756485, "step": 1690 }, { "epoch": 1.6693736926295066, "grad_norm": 0.23859356343746185, "learning_rate": 9.770706764344235e-06, "loss": 0.534, "mean_token_accuracy": 0.8294050306081772, "step": 1695 }, { "epoch": 1.6742955580164882, "grad_norm": 0.2304782122373581, "learning_rate": 9.713397585358189e-06, "loss": 0.528, "mean_token_accuracy": 0.8308202102780342, "step": 1700 }, { "epoch": 1.67921742340347, "grad_norm": 0.2276812344789505, "learning_rate": 9.65609782564296e-06, "loss": 0.5267, "mean_token_accuracy": 0.8312249034643173, "step": 1705 }, { "epoch": 1.6841392887904516, "grad_norm": 0.3979962170124054, "learning_rate": 9.598809368371656e-06, "loss": 0.5266, "mean_token_accuracy": 0.8312003433704376, "step": 1710 }, { "epoch": 1.6890611541774332, "grad_norm": 0.25581249594688416, "learning_rate": 9.541534096345896e-06, "loss": 0.526, "mean_token_accuracy": 0.8315127685666084, "step": 1715 }, { "epoch": 1.6939830195644148, "grad_norm": 0.2141893208026886, "learning_rate": 9.484273891933982e-06, "loss": 0.5252, "mean_token_accuracy": 0.8317378848791123, "step": 1720 }, { "epoch": 1.6989048849513966, "grad_norm": 0.4327445924282074, "learning_rate": 9.427030637009002e-06, "loss": 0.5361, "mean_token_accuracy": 0.828312310576439, "step": 1725 }, { "epoch": 1.7038267503383784, "grad_norm": 0.22412188351154327, "learning_rate": 9.369806212887008e-06, "loss": 0.5299, "mean_token_accuracy": 0.830331552028656, "step": 1730 }, { "epoch": 1.70874861572536, "grad_norm": 0.22056014835834503, "learning_rate": 9.312602500265162e-06, "loss": 0.5259, "mean_token_accuracy": 0.831749576330185, "step": 1735 }, { "epoch": 1.7136704811123415, "grad_norm": 0.23633216321468353, "learning_rate": 9.255421379159935e-06, "loss": 0.5152, "mean_token_accuracy": 0.8346669390797615, "step": 1740 }, { "epoch": 1.7185923464993231, "grad_norm": 0.21674410998821259, "learning_rate": 9.198264728845332e-06, "loss": 0.5188, "mean_token_accuracy": 0.8335284858942031, "step": 1745 }, { "epoch": 1.723514211886305, "grad_norm": 0.22083686292171478, "learning_rate": 9.14113442779111e-06, "loss": 0.5283, "mean_token_accuracy": 0.8306051269173622, "step": 1750 }, { "epoch": 1.7284360772732867, "grad_norm": 0.2326516956090927, "learning_rate": 9.084032353601053e-06, "loss": 0.5329, "mean_token_accuracy": 0.8295654147863388, "step": 1755 }, { "epoch": 1.7333579426602683, "grad_norm": 0.23140785098075867, "learning_rate": 9.026960382951253e-06, "loss": 0.5243, "mean_token_accuracy": 0.8315977454185486, "step": 1760 }, { "epoch": 1.73827980804725, "grad_norm": 0.24312028288841248, "learning_rate": 8.969920391528459e-06, "loss": 0.5218, "mean_token_accuracy": 0.8328249961137771, "step": 1765 }, { "epoch": 1.7432016734342315, "grad_norm": 0.22412382066249847, "learning_rate": 8.912914253968391e-06, "loss": 0.5312, "mean_token_accuracy": 0.8298890963196754, "step": 1770 }, { "epoch": 1.748123538821213, "grad_norm": 0.2266296148300171, "learning_rate": 8.855943843794171e-06, "loss": 0.5234, "mean_token_accuracy": 0.8323718756437302, "step": 1775 }, { "epoch": 1.7530454042081949, "grad_norm": 0.21898606419563293, "learning_rate": 8.799011033354716e-06, "loss": 0.5288, "mean_token_accuracy": 0.8307971671223641, "step": 1780 }, { "epoch": 1.7579672695951767, "grad_norm": 0.2306451052427292, "learning_rate": 8.742117693763229e-06, "loss": 0.5271, "mean_token_accuracy": 0.8316369831562043, "step": 1785 }, { "epoch": 1.7628891349821583, "grad_norm": 0.22924001514911652, "learning_rate": 8.685265694835681e-06, "loss": 0.5272, "mean_token_accuracy": 0.8311286598443985, "step": 1790 }, { "epoch": 1.7678110003691399, "grad_norm": 0.33131736516952515, "learning_rate": 8.628456905029383e-06, "loss": 0.5195, "mean_token_accuracy": 0.833528995513916, "step": 1795 }, { "epoch": 1.7727328657561214, "grad_norm": 0.24447475373744965, "learning_rate": 8.571693191381545e-06, "loss": 0.5221, "mean_token_accuracy": 0.8324113413691521, "step": 1800 }, { "epoch": 1.7776547311431032, "grad_norm": 0.23472720384597778, "learning_rate": 8.514976419447963e-06, "loss": 0.5282, "mean_token_accuracy": 0.8306461483240127, "step": 1805 }, { "epoch": 1.782576596530085, "grad_norm": 0.25232747197151184, "learning_rate": 8.458308453241664e-06, "loss": 0.519, "mean_token_accuracy": 0.8334705844521523, "step": 1810 }, { "epoch": 1.7874984619170666, "grad_norm": 0.22827033698558807, "learning_rate": 8.401691155171654e-06, "loss": 0.5353, "mean_token_accuracy": 0.8289692014455795, "step": 1815 }, { "epoch": 1.7924203273040482, "grad_norm": 0.21775387227535248, "learning_rate": 8.345126385981737e-06, "loss": 0.5217, "mean_token_accuracy": 0.8326601728796958, "step": 1820 }, { "epoch": 1.7973421926910298, "grad_norm": 0.22691109776496887, "learning_rate": 8.288616004689321e-06, "loss": 0.5208, "mean_token_accuracy": 0.8330274626612664, "step": 1825 }, { "epoch": 1.8022640580780116, "grad_norm": 0.23031188547611237, "learning_rate": 8.23216186852435e-06, "loss": 0.5251, "mean_token_accuracy": 0.8317318856716156, "step": 1830 }, { "epoch": 1.8071859234649932, "grad_norm": 0.23658455908298492, "learning_rate": 8.175765832868252e-06, "loss": 0.5263, "mean_token_accuracy": 0.8314035385847092, "step": 1835 }, { "epoch": 1.812107788851975, "grad_norm": 0.21728812158107758, "learning_rate": 8.119429751192972e-06, "loss": 0.5283, "mean_token_accuracy": 0.830833038687706, "step": 1840 }, { "epoch": 1.8170296542389566, "grad_norm": 0.22863180935382843, "learning_rate": 8.063155475000037e-06, "loss": 0.5231, "mean_token_accuracy": 0.8322245612740516, "step": 1845 }, { "epoch": 1.8219515196259382, "grad_norm": 0.22922097146511078, "learning_rate": 8.006944853759732e-06, "loss": 0.5242, "mean_token_accuracy": 0.8318595319986344, "step": 1850 }, { "epoch": 1.8268733850129197, "grad_norm": 0.209337517619133, "learning_rate": 7.950799734850292e-06, "loss": 0.5195, "mean_token_accuracy": 0.8333837404847145, "step": 1855 }, { "epoch": 1.8317952503999015, "grad_norm": 0.22603721916675568, "learning_rate": 7.894721963497214e-06, "loss": 0.5218, "mean_token_accuracy": 0.8325009673833847, "step": 1860 }, { "epoch": 1.8367171157868833, "grad_norm": 0.2327803522348404, "learning_rate": 7.838713382712583e-06, "loss": 0.5111, "mean_token_accuracy": 0.8357574358582497, "step": 1865 }, { "epoch": 1.841638981173865, "grad_norm": 0.23280593752861023, "learning_rate": 7.782775833234522e-06, "loss": 0.5333, "mean_token_accuracy": 0.8295109212398529, "step": 1870 }, { "epoch": 1.8465608465608465, "grad_norm": 0.2219589352607727, "learning_rate": 7.726911153466699e-06, "loss": 0.5255, "mean_token_accuracy": 0.8316129177808762, "step": 1875 }, { "epoch": 1.851482711947828, "grad_norm": 0.22274133563041687, "learning_rate": 7.67112117941788e-06, "loss": 0.5197, "mean_token_accuracy": 0.8331713795661926, "step": 1880 }, { "epoch": 1.85640457733481, "grad_norm": 0.20765641331672668, "learning_rate": 7.615407744641618e-06, "loss": 0.5222, "mean_token_accuracy": 0.8323680445551872, "step": 1885 }, { "epoch": 1.8613264427217917, "grad_norm": 0.22262942790985107, "learning_rate": 7.559772680175979e-06, "loss": 0.5256, "mean_token_accuracy": 0.8315785735845566, "step": 1890 }, { "epoch": 1.8662483081087733, "grad_norm": 0.23786763846874237, "learning_rate": 7.504217814483364e-06, "loss": 0.5225, "mean_token_accuracy": 0.8326525434851646, "step": 1895 }, { "epoch": 1.8711701734957549, "grad_norm": 0.22120903432369232, "learning_rate": 7.448744973390423e-06, "loss": 0.5322, "mean_token_accuracy": 0.8296578034758568, "step": 1900 }, { "epoch": 1.8760920388827365, "grad_norm": 0.22359086573123932, "learning_rate": 7.393355980028039e-06, "loss": 0.524, "mean_token_accuracy": 0.8320103421807289, "step": 1905 }, { "epoch": 1.8810139042697183, "grad_norm": 0.21293464303016663, "learning_rate": 7.338052654771407e-06, "loss": 0.5201, "mean_token_accuracy": 0.8330625906586647, "step": 1910 }, { "epoch": 1.8859357696566998, "grad_norm": 0.212773397564888, "learning_rate": 7.282836815180241e-06, "loss": 0.5212, "mean_token_accuracy": 0.8328917175531387, "step": 1915 }, { "epoch": 1.8908576350436817, "grad_norm": 0.2229495495557785, "learning_rate": 7.227710275938987e-06, "loss": 0.5177, "mean_token_accuracy": 0.8338592052459717, "step": 1920 }, { "epoch": 1.8957795004306632, "grad_norm": 0.22714777290821075, "learning_rate": 7.172674848797218e-06, "loss": 0.5196, "mean_token_accuracy": 0.8332103446125985, "step": 1925 }, { "epoch": 1.9007013658176448, "grad_norm": 0.5862542986869812, "learning_rate": 7.117732342510093e-06, "loss": 0.5148, "mean_token_accuracy": 0.8348309084773063, "step": 1930 }, { "epoch": 1.9056232312046264, "grad_norm": 0.21524302661418915, "learning_rate": 7.062884562778883e-06, "loss": 0.5225, "mean_token_accuracy": 0.8324376299977303, "step": 1935 }, { "epoch": 1.9105450965916082, "grad_norm": 0.22445465624332428, "learning_rate": 7.008133312191649e-06, "loss": 0.5239, "mean_token_accuracy": 0.8318991348147392, "step": 1940 }, { "epoch": 1.91546696197859, "grad_norm": 0.21925503015518188, "learning_rate": 6.953480390164001e-06, "loss": 0.5243, "mean_token_accuracy": 0.8320589557290077, "step": 1945 }, { "epoch": 1.9203888273655716, "grad_norm": 0.21358764171600342, "learning_rate": 6.898927592879945e-06, "loss": 0.5276, "mean_token_accuracy": 0.8309697136282921, "step": 1950 }, { "epoch": 1.9253106927525532, "grad_norm": 0.21541139483451843, "learning_rate": 6.844476713232863e-06, "loss": 0.5183, "mean_token_accuracy": 0.8336074352264404, "step": 1955 }, { "epoch": 1.9302325581395348, "grad_norm": 0.253334105014801, "learning_rate": 6.790129540766581e-06, "loss": 0.5217, "mean_token_accuracy": 0.8321399599313736, "step": 1960 }, { "epoch": 1.9351544235265166, "grad_norm": 0.2311272770166397, "learning_rate": 6.735887861616555e-06, "loss": 0.5226, "mean_token_accuracy": 0.832192762196064, "step": 1965 }, { "epoch": 1.9400762889134984, "grad_norm": 0.2155195027589798, "learning_rate": 6.68175345845119e-06, "loss": 0.5214, "mean_token_accuracy": 0.8325791984796524, "step": 1970 }, { "epoch": 1.94499815430048, "grad_norm": 0.2229234129190445, "learning_rate": 6.627728110413214e-06, "loss": 0.5228, "mean_token_accuracy": 0.8320748254656791, "step": 1975 }, { "epoch": 1.9499200196874615, "grad_norm": 0.2595667839050293, "learning_rate": 6.5738135930612355e-06, "loss": 0.5257, "mean_token_accuracy": 0.831524421274662, "step": 1980 }, { "epoch": 1.9548418850744431, "grad_norm": 0.21894799172878265, "learning_rate": 6.520011678311382e-06, "loss": 0.5135, "mean_token_accuracy": 0.8349313631653785, "step": 1985 }, { "epoch": 1.959763750461425, "grad_norm": 0.215131938457489, "learning_rate": 6.466324134379066e-06, "loss": 0.5125, "mean_token_accuracy": 0.8354373678565026, "step": 1990 }, { "epoch": 1.9646856158484065, "grad_norm": 0.227864071726799, "learning_rate": 6.412752725720864e-06, "loss": 0.5166, "mean_token_accuracy": 0.8339696109294892, "step": 1995 }, { "epoch": 1.9696074812353883, "grad_norm": 0.21633465588092804, "learning_rate": 6.359299212976535e-06, "loss": 0.5236, "mean_token_accuracy": 0.8324458003044128, "step": 2000 }, { "epoch": 1.97452934662237, "grad_norm": 0.2214214950799942, "learning_rate": 6.305965352911162e-06, "loss": 0.5186, "mean_token_accuracy": 0.8334563329815865, "step": 2005 }, { "epoch": 1.9794512120093515, "grad_norm": 0.20772044360637665, "learning_rate": 6.252752898357397e-06, "loss": 0.5146, "mean_token_accuracy": 0.8346970349550247, "step": 2010 }, { "epoch": 1.984373077396333, "grad_norm": 0.2208469659090042, "learning_rate": 6.1996635981578755e-06, "loss": 0.521, "mean_token_accuracy": 0.8330862745642662, "step": 2015 }, { "epoch": 1.9892949427833149, "grad_norm": 0.21841764450073242, "learning_rate": 6.146699197107715e-06, "loss": 0.5141, "mean_token_accuracy": 0.8346462666988372, "step": 2020 }, { "epoch": 1.9942168081702967, "grad_norm": 0.22905802726745605, "learning_rate": 6.093861435897208e-06, "loss": 0.5161, "mean_token_accuracy": 0.8341751024127007, "step": 2025 }, { "epoch": 1.9991386735572783, "grad_norm": 0.2205893099308014, "learning_rate": 6.041152051054575e-06, "loss": 0.5135, "mean_token_accuracy": 0.8350084885954857, "step": 2030 }, { "epoch": 2.0049218653869816, "grad_norm": 0.27798768877983093, "learning_rate": 5.988572774888913e-06, "loss": 0.5979, "mean_token_accuracy": 0.8386082910909886, "step": 2035 }, { "epoch": 2.009843730773963, "grad_norm": 0.24996507167816162, "learning_rate": 5.936125335433265e-06, "loss": 0.4945, "mean_token_accuracy": 0.839720045030117, "step": 2040 }, { "epoch": 2.014765596160945, "grad_norm": 0.2548527121543884, "learning_rate": 5.883811456387821e-06, "loss": 0.4941, "mean_token_accuracy": 0.8400543674826622, "step": 2045 }, { "epoch": 2.0196874615479268, "grad_norm": 0.2184976190328598, "learning_rate": 5.831632857063271e-06, "loss": 0.4902, "mean_token_accuracy": 0.8409830510616303, "step": 2050 }, { "epoch": 2.0246093269349084, "grad_norm": 0.22762830555438995, "learning_rate": 5.779591252324286e-06, "loss": 0.4904, "mean_token_accuracy": 0.8408440828323365, "step": 2055 }, { "epoch": 2.02953119232189, "grad_norm": 0.23035886883735657, "learning_rate": 5.7276883525331915e-06, "loss": 0.4943, "mean_token_accuracy": 0.8397367835044861, "step": 2060 }, { "epoch": 2.0344530577088715, "grad_norm": 0.22349004447460175, "learning_rate": 5.675925863493721e-06, "loss": 0.5009, "mean_token_accuracy": 0.8379953891038895, "step": 2065 }, { "epoch": 2.0393749230958536, "grad_norm": 0.22588923573493958, "learning_rate": 5.6243054863949675e-06, "loss": 0.494, "mean_token_accuracy": 0.8397265374660492, "step": 2070 }, { "epoch": 2.044296788482835, "grad_norm": 0.2168150246143341, "learning_rate": 5.5728289177554805e-06, "loss": 0.4975, "mean_token_accuracy": 0.8389487206935883, "step": 2075 }, { "epoch": 2.0492186538698167, "grad_norm": 0.22331282496452332, "learning_rate": 5.521497849367501e-06, "loss": 0.4859, "mean_token_accuracy": 0.8422671511769295, "step": 2080 }, { "epoch": 2.0541405192567983, "grad_norm": 0.21221551299095154, "learning_rate": 5.4703139682413585e-06, "loss": 0.4866, "mean_token_accuracy": 0.8420242533087731, "step": 2085 }, { "epoch": 2.05906238464378, "grad_norm": 0.22058208286762238, "learning_rate": 5.419278956550037e-06, "loss": 0.4955, "mean_token_accuracy": 0.8394055813550949, "step": 2090 }, { "epoch": 2.0639842500307615, "grad_norm": 0.22200560569763184, "learning_rate": 5.368394491573876e-06, "loss": 0.493, "mean_token_accuracy": 0.8402127623558044, "step": 2095 }, { "epoch": 2.0689061154177435, "grad_norm": 0.2220141738653183, "learning_rate": 5.31766224564547e-06, "loss": 0.4958, "mean_token_accuracy": 0.8393116250634194, "step": 2100 }, { "epoch": 2.073827980804725, "grad_norm": 0.21074913442134857, "learning_rate": 5.267083886094668e-06, "loss": 0.4931, "mean_token_accuracy": 0.840206652879715, "step": 2105 }, { "epoch": 2.0787498461917067, "grad_norm": 0.2276320606470108, "learning_rate": 5.216661075193814e-06, "loss": 0.4955, "mean_token_accuracy": 0.8393134921789169, "step": 2110 }, { "epoch": 2.0836717115786882, "grad_norm": 0.2224099338054657, "learning_rate": 5.166395470103092e-06, "loss": 0.4937, "mean_token_accuracy": 0.8397904768586159, "step": 2115 }, { "epoch": 2.08859357696567, "grad_norm": 0.22312206029891968, "learning_rate": 5.116288722816087e-06, "loss": 0.493, "mean_token_accuracy": 0.8403119757771492, "step": 2120 }, { "epoch": 2.093515442352652, "grad_norm": 0.2194313257932663, "learning_rate": 5.06634248010546e-06, "loss": 0.4935, "mean_token_accuracy": 0.8400413483381272, "step": 2125 }, { "epoch": 2.0984373077396334, "grad_norm": 0.22484691441059113, "learning_rate": 5.016558383468851e-06, "loss": 0.49, "mean_token_accuracy": 0.8409391462802887, "step": 2130 }, { "epoch": 2.103359173126615, "grad_norm": 0.22470517456531525, "learning_rate": 4.9669380690749215e-06, "loss": 0.497, "mean_token_accuracy": 0.8389460816979408, "step": 2135 }, { "epoch": 2.1082810385135966, "grad_norm": 0.21832752227783203, "learning_rate": 4.91748316770958e-06, "loss": 0.4926, "mean_token_accuracy": 0.8401527449488639, "step": 2140 }, { "epoch": 2.113202903900578, "grad_norm": 0.21521726250648499, "learning_rate": 4.868195304722391e-06, "loss": 0.4979, "mean_token_accuracy": 0.8387278065085411, "step": 2145 }, { "epoch": 2.1181247692875598, "grad_norm": 0.21682803332805634, "learning_rate": 4.819076099973152e-06, "loss": 0.5014, "mean_token_accuracy": 0.83763497620821, "step": 2150 }, { "epoch": 2.123046634674542, "grad_norm": 0.2204725295305252, "learning_rate": 4.77012716777867e-06, "loss": 0.4989, "mean_token_accuracy": 0.8380599915981293, "step": 2155 }, { "epoch": 2.1279685000615234, "grad_norm": 0.2179991751909256, "learning_rate": 4.721350116859675e-06, "loss": 0.4946, "mean_token_accuracy": 0.8396460056304932, "step": 2160 }, { "epoch": 2.132890365448505, "grad_norm": 0.21851445734500885, "learning_rate": 4.672746550287985e-06, "loss": 0.4947, "mean_token_accuracy": 0.8395410850644112, "step": 2165 }, { "epoch": 2.1378122308354865, "grad_norm": 0.21560297906398773, "learning_rate": 4.6243180654337975e-06, "loss": 0.4857, "mean_token_accuracy": 0.8421663656830788, "step": 2170 }, { "epoch": 2.142734096222468, "grad_norm": 0.21567942202091217, "learning_rate": 4.576066253913209e-06, "loss": 0.493, "mean_token_accuracy": 0.840301775932312, "step": 2175 }, { "epoch": 2.14765596160945, "grad_norm": 0.22145864367485046, "learning_rate": 4.527992701535884e-06, "loss": 0.4844, "mean_token_accuracy": 0.8423144072294235, "step": 2180 }, { "epoch": 2.1525778269964317, "grad_norm": 0.217710942029953, "learning_rate": 4.480098988252958e-06, "loss": 0.4919, "mean_token_accuracy": 0.84017314016819, "step": 2185 }, { "epoch": 2.1574996923834133, "grad_norm": 0.2169259786605835, "learning_rate": 4.432386688105095e-06, "loss": 0.4929, "mean_token_accuracy": 0.840173925459385, "step": 2190 }, { "epoch": 2.162421557770395, "grad_norm": 0.21104402840137482, "learning_rate": 4.384857369170772e-06, "loss": 0.4875, "mean_token_accuracy": 0.8417868033051491, "step": 2195 }, { "epoch": 2.1673434231573765, "grad_norm": 0.21658702194690704, "learning_rate": 4.337512593514729e-06, "loss": 0.4947, "mean_token_accuracy": 0.8395476669073105, "step": 2200 }, { "epoch": 2.1722652885443585, "grad_norm": 0.22858913242816925, "learning_rate": 4.290353917136639e-06, "loss": 0.4901, "mean_token_accuracy": 0.8408517464995384, "step": 2205 }, { "epoch": 2.17718715393134, "grad_norm": 0.4094144105911255, "learning_rate": 4.243382889919981e-06, "loss": 0.496, "mean_token_accuracy": 0.8392629832029342, "step": 2210 }, { "epoch": 2.1821090193183217, "grad_norm": 0.21924547851085663, "learning_rate": 4.1966010555810696e-06, "loss": 0.4899, "mean_token_accuracy": 0.841227824985981, "step": 2215 }, { "epoch": 2.1870308847053033, "grad_norm": 0.21283064782619476, "learning_rate": 4.1500099516183555e-06, "loss": 0.4913, "mean_token_accuracy": 0.8405321702361107, "step": 2220 }, { "epoch": 2.191952750092285, "grad_norm": 0.21150268614292145, "learning_rate": 4.1036111092618725e-06, "loss": 0.4895, "mean_token_accuracy": 0.8410715743899345, "step": 2225 }, { "epoch": 2.1968746154792664, "grad_norm": 0.20887652039527893, "learning_rate": 4.057406053422933e-06, "loss": 0.4935, "mean_token_accuracy": 0.8398977249860764, "step": 2230 }, { "epoch": 2.2017964808662485, "grad_norm": 0.20756816864013672, "learning_rate": 4.011396302643989e-06, "loss": 0.4846, "mean_token_accuracy": 0.842858923971653, "step": 2235 }, { "epoch": 2.20671834625323, "grad_norm": 0.23419924080371857, "learning_rate": 3.965583369048737e-06, "loss": 0.4963, "mean_token_accuracy": 0.8392103880643844, "step": 2240 }, { "epoch": 2.2116402116402116, "grad_norm": 0.21532607078552246, "learning_rate": 3.919968758292425e-06, "loss": 0.4883, "mean_token_accuracy": 0.8413224458694458, "step": 2245 }, { "epoch": 2.216562077027193, "grad_norm": 0.2164084017276764, "learning_rate": 3.874553969512358e-06, "loss": 0.4885, "mean_token_accuracy": 0.8415488794445991, "step": 2250 }, { "epoch": 2.221483942414175, "grad_norm": 0.21010589599609375, "learning_rate": 3.82934049527864e-06, "loss": 0.4918, "mean_token_accuracy": 0.8404750242829323, "step": 2255 }, { "epoch": 2.226405807801157, "grad_norm": 0.20962242782115936, "learning_rate": 3.784329821545105e-06, "loss": 0.4962, "mean_token_accuracy": 0.839095975458622, "step": 2260 }, { "epoch": 2.2313276731881384, "grad_norm": 0.20551133155822754, "learning_rate": 3.739523427600509e-06, "loss": 0.4911, "mean_token_accuracy": 0.8407798200845719, "step": 2265 }, { "epoch": 2.23624953857512, "grad_norm": 0.21332746744155884, "learning_rate": 3.6949227860198712e-06, "loss": 0.492, "mean_token_accuracy": 0.8405194252729415, "step": 2270 }, { "epoch": 2.2411714039621016, "grad_norm": 0.26087722182273865, "learning_rate": 3.650529362616113e-06, "loss": 0.4875, "mean_token_accuracy": 0.8417001351714134, "step": 2275 }, { "epoch": 2.246093269349083, "grad_norm": 0.20974403619766235, "learning_rate": 3.606344616391867e-06, "loss": 0.4938, "mean_token_accuracy": 0.8395893201231956, "step": 2280 }, { "epoch": 2.2510151347360647, "grad_norm": 0.22249352931976318, "learning_rate": 3.5623699994915363e-06, "loss": 0.4916, "mean_token_accuracy": 0.840800578892231, "step": 2285 }, { "epoch": 2.2559370001230468, "grad_norm": 0.20673160254955292, "learning_rate": 3.5186069571535575e-06, "loss": 0.4876, "mean_token_accuracy": 0.8417642295360566, "step": 2290 }, { "epoch": 2.2608588655100283, "grad_norm": 0.2050849050283432, "learning_rate": 3.475056927662912e-06, "loss": 0.4922, "mean_token_accuracy": 0.8401932448148728, "step": 2295 }, { "epoch": 2.26578073089701, "grad_norm": 0.2113514542579651, "learning_rate": 3.4317213423038386e-06, "loss": 0.4925, "mean_token_accuracy": 0.8401719897985458, "step": 2300 }, { "epoch": 2.2707025962839915, "grad_norm": 0.21461407840251923, "learning_rate": 3.388601625312833e-06, "loss": 0.4892, "mean_token_accuracy": 0.841229310631752, "step": 2305 }, { "epoch": 2.275624461670973, "grad_norm": 0.20549601316452026, "learning_rate": 3.345699193831795e-06, "loss": 0.4917, "mean_token_accuracy": 0.8405207619071007, "step": 2310 }, { "epoch": 2.280546327057955, "grad_norm": 0.21262629330158234, "learning_rate": 3.3030154578614783e-06, "loss": 0.4898, "mean_token_accuracy": 0.8410497605800629, "step": 2315 }, { "epoch": 2.2854681924449367, "grad_norm": 0.2351827323436737, "learning_rate": 3.2605518202151577e-06, "loss": 0.4945, "mean_token_accuracy": 0.8394208237528801, "step": 2320 }, { "epoch": 2.2903900578319183, "grad_norm": 0.21704116463661194, "learning_rate": 3.218309676472492e-06, "loss": 0.489, "mean_token_accuracy": 0.8411409676074981, "step": 2325 }, { "epoch": 2.2953119232189, "grad_norm": 0.20750364661216736, "learning_rate": 3.1762904149336947e-06, "loss": 0.4942, "mean_token_accuracy": 0.8396940395236016, "step": 2330 }, { "epoch": 2.3002337886058815, "grad_norm": 0.20055250823497772, "learning_rate": 3.134495416573884e-06, "loss": 0.4871, "mean_token_accuracy": 0.8417407006025315, "step": 2335 }, { "epoch": 2.3051556539928635, "grad_norm": 0.20621967315673828, "learning_rate": 3.0929260549977116e-06, "loss": 0.4883, "mean_token_accuracy": 0.8415425732731819, "step": 2340 }, { "epoch": 2.310077519379845, "grad_norm": 0.210305854678154, "learning_rate": 3.0515836963942056e-06, "loss": 0.4929, "mean_token_accuracy": 0.8403278931975364, "step": 2345 }, { "epoch": 2.3149993847668267, "grad_norm": 0.25147390365600586, "learning_rate": 3.01046969949188e-06, "loss": 0.4909, "mean_token_accuracy": 0.8407050803303718, "step": 2350 }, { "epoch": 2.3199212501538082, "grad_norm": 0.21020571887493134, "learning_rate": 2.9695854155140648e-06, "loss": 0.4895, "mean_token_accuracy": 0.8410211369395256, "step": 2355 }, { "epoch": 2.32484311554079, "grad_norm": 0.21094508469104767, "learning_rate": 2.9289321881345257e-06, "loss": 0.4889, "mean_token_accuracy": 0.841056476533413, "step": 2360 }, { "epoch": 2.329764980927772, "grad_norm": 0.21813294291496277, "learning_rate": 2.8885113534332742e-06, "loss": 0.4928, "mean_token_accuracy": 0.8402146637439728, "step": 2365 }, { "epoch": 2.3346868463147534, "grad_norm": 0.21038471162319183, "learning_rate": 2.8483242398526723e-06, "loss": 0.4875, "mean_token_accuracy": 0.8416903391480446, "step": 2370 }, { "epoch": 2.339608711701735, "grad_norm": 0.21476763486862183, "learning_rate": 2.80837216815378e-06, "loss": 0.4883, "mean_token_accuracy": 0.8410104081034661, "step": 2375 }, { "epoch": 2.3445305770887166, "grad_norm": 0.2148827761411667, "learning_rate": 2.7686564513729198e-06, "loss": 0.4938, "mean_token_accuracy": 0.8401752710342407, "step": 2380 }, { "epoch": 2.349452442475698, "grad_norm": 0.20347550511360168, "learning_rate": 2.7291783947785544e-06, "loss": 0.4891, "mean_token_accuracy": 0.841368468105793, "step": 2385 }, { "epoch": 2.35437430786268, "grad_norm": 0.2156437486410141, "learning_rate": 2.689939295828371e-06, "loss": 0.4926, "mean_token_accuracy": 0.8401880413293839, "step": 2390 }, { "epoch": 2.359296173249662, "grad_norm": 0.20905110239982605, "learning_rate": 2.650940444126654e-06, "loss": 0.4915, "mean_token_accuracy": 0.8407162860035896, "step": 2395 }, { "epoch": 2.3642180386366434, "grad_norm": 0.20476758480072021, "learning_rate": 2.6121831213818825e-06, "loss": 0.4932, "mean_token_accuracy": 0.840287271142006, "step": 2400 }, { "epoch": 2.369139904023625, "grad_norm": 0.1986178457736969, "learning_rate": 2.5736686013646226e-06, "loss": 0.4857, "mean_token_accuracy": 0.8420573100447655, "step": 2405 }, { "epoch": 2.3740617694106065, "grad_norm": 0.21784992516040802, "learning_rate": 2.535398149865651e-06, "loss": 0.4888, "mean_token_accuracy": 0.8410965353250504, "step": 2410 }, { "epoch": 2.378983634797588, "grad_norm": 0.20018485188484192, "learning_rate": 2.4973730246543736e-06, "loss": 0.4913, "mean_token_accuracy": 0.8406006515026092, "step": 2415 }, { "epoch": 2.3839055001845697, "grad_norm": 0.21187762916088104, "learning_rate": 2.4595944754374723e-06, "loss": 0.4972, "mean_token_accuracy": 0.8388384222984314, "step": 2420 }, { "epoch": 2.3888273655715517, "grad_norm": 0.2048918604850769, "learning_rate": 2.422063743817832e-06, "loss": 0.4936, "mean_token_accuracy": 0.8397043973207474, "step": 2425 }, { "epoch": 2.3937492309585333, "grad_norm": 0.2068692445755005, "learning_rate": 2.3847820632537565e-06, "loss": 0.4973, "mean_token_accuracy": 0.8392092302441597, "step": 2430 }, { "epoch": 2.398671096345515, "grad_norm": 0.2050062119960785, "learning_rate": 2.347750659018397e-06, "loss": 0.4964, "mean_token_accuracy": 0.8390960440039634, "step": 2435 }, { "epoch": 2.4035929617324965, "grad_norm": 0.20241810381412506, "learning_rate": 2.3109707481595113e-06, "loss": 0.4826, "mean_token_accuracy": 0.8431760326027871, "step": 2440 }, { "epoch": 2.408514827119478, "grad_norm": 0.2023165076971054, "learning_rate": 2.27444353945945e-06, "loss": 0.484, "mean_token_accuracy": 0.8427256375551224, "step": 2445 }, { "epoch": 2.41343669250646, "grad_norm": 0.2395012527704239, "learning_rate": 2.2381702333954436e-06, "loss": 0.4843, "mean_token_accuracy": 0.8425970792770385, "step": 2450 }, { "epoch": 2.4183585578934417, "grad_norm": 0.20210982859134674, "learning_rate": 2.2021520221001304e-06, "loss": 0.488, "mean_token_accuracy": 0.8415813356637954, "step": 2455 }, { "epoch": 2.4232804232804233, "grad_norm": 0.2082945853471756, "learning_rate": 2.16639008932239e-06, "loss": 0.4937, "mean_token_accuracy": 0.8398790895938874, "step": 2460 }, { "epoch": 2.428202288667405, "grad_norm": 0.20752127468585968, "learning_rate": 2.130885610388428e-06, "loss": 0.4959, "mean_token_accuracy": 0.839399340748787, "step": 2465 }, { "epoch": 2.4331241540543864, "grad_norm": 0.20869506895542145, "learning_rate": 2.0956397521631666e-06, "loss": 0.4868, "mean_token_accuracy": 0.8415920332074165, "step": 2470 }, { "epoch": 2.4380460194413685, "grad_norm": 0.20477741956710815, "learning_rate": 2.0606536730118767e-06, "loss": 0.4829, "mean_token_accuracy": 0.8429039210081101, "step": 2475 }, { "epoch": 2.44296788482835, "grad_norm": 0.20474423468112946, "learning_rate": 2.0259285227621152e-06, "loss": 0.4981, "mean_token_accuracy": 0.8382045805454255, "step": 2480 }, { "epoch": 2.4478897502153316, "grad_norm": 0.20369385182857513, "learning_rate": 1.9914654426659374e-06, "loss": 0.4926, "mean_token_accuracy": 0.839960803091526, "step": 2485 }, { "epoch": 2.452811615602313, "grad_norm": 0.2068207710981369, "learning_rate": 1.9572655653623884e-06, "loss": 0.4935, "mean_token_accuracy": 0.8397150009870529, "step": 2490 }, { "epoch": 2.457733480989295, "grad_norm": 0.20661979913711548, "learning_rate": 1.9233300148402767e-06, "loss": 0.4924, "mean_token_accuracy": 0.8401017665863038, "step": 2495 }, { "epoch": 2.462655346376277, "grad_norm": 0.21355277299880981, "learning_rate": 1.88965990640123e-06, "loss": 0.487, "mean_token_accuracy": 0.8420075699687004, "step": 2500 }, { "epoch": 2.4675772117632584, "grad_norm": 0.209817573428154, "learning_rate": 1.8562563466230577e-06, "loss": 0.4924, "mean_token_accuracy": 0.8402795165777206, "step": 2505 }, { "epoch": 2.47249907715024, "grad_norm": 0.1972341388463974, "learning_rate": 1.823120433323361e-06, "loss": 0.4912, "mean_token_accuracy": 0.8408435776829719, "step": 2510 }, { "epoch": 2.4774209425372216, "grad_norm": 0.20761115849018097, "learning_rate": 1.7902532555234653e-06, "loss": 0.4977, "mean_token_accuracy": 0.838873790204525, "step": 2515 }, { "epoch": 2.482342807924203, "grad_norm": 0.22367697954177856, "learning_rate": 1.757655893412622e-06, "loss": 0.4876, "mean_token_accuracy": 0.8413331776857376, "step": 2520 }, { "epoch": 2.487264673311185, "grad_norm": 0.20876270532608032, "learning_rate": 1.7253294183125223e-06, "loss": 0.4901, "mean_token_accuracy": 0.8411200374364853, "step": 2525 }, { "epoch": 2.4921865386981668, "grad_norm": 0.20132075250148773, "learning_rate": 1.6932748926420695e-06, "loss": 0.4953, "mean_token_accuracy": 0.8395631924271584, "step": 2530 }, { "epoch": 2.4971084040851483, "grad_norm": 0.1999741941690445, "learning_rate": 1.661493369882473e-06, "loss": 0.4796, "mean_token_accuracy": 0.843748077750206, "step": 2535 }, { "epoch": 2.50203026947213, "grad_norm": 0.21044902503490448, "learning_rate": 1.6299858945426251e-06, "loss": 0.4856, "mean_token_accuracy": 0.8423863723874092, "step": 2540 }, { "epoch": 2.5069521348591115, "grad_norm": 0.19819578528404236, "learning_rate": 1.5987535021247668e-06, "loss": 0.4855, "mean_token_accuracy": 0.8423318341374397, "step": 2545 }, { "epoch": 2.5118740002460935, "grad_norm": 0.2015785425901413, "learning_rate": 1.5677972190904623e-06, "loss": 0.4873, "mean_token_accuracy": 0.8417120486497879, "step": 2550 }, { "epoch": 2.5167958656330747, "grad_norm": 0.20403100550174713, "learning_rate": 1.537118062826859e-06, "loss": 0.4809, "mean_token_accuracy": 0.8435953631997108, "step": 2555 }, { "epoch": 2.5217177310200567, "grad_norm": 0.2051580399274826, "learning_rate": 1.5067170416132603e-06, "loss": 0.4841, "mean_token_accuracy": 0.842904870212078, "step": 2560 }, { "epoch": 2.5266395964070383, "grad_norm": 0.20559805631637573, "learning_rate": 1.4765951545879732e-06, "loss": 0.4953, "mean_token_accuracy": 0.8392938315868378, "step": 2565 }, { "epoch": 2.53156146179402, "grad_norm": 0.21315298974514008, "learning_rate": 1.4467533917154842e-06, "loss": 0.4812, "mean_token_accuracy": 0.8433891490101815, "step": 2570 }, { "epoch": 2.5364833271810014, "grad_norm": 0.33885088562965393, "learning_rate": 1.4171927337539103e-06, "loss": 0.4925, "mean_token_accuracy": 0.8398235127329826, "step": 2575 }, { "epoch": 2.541405192567983, "grad_norm": 0.19653761386871338, "learning_rate": 1.3879141522227878e-06, "loss": 0.4903, "mean_token_accuracy": 0.8408400386571884, "step": 2580 }, { "epoch": 2.546327057954965, "grad_norm": 0.19870713353157043, "learning_rate": 1.3589186093711227e-06, "loss": 0.4811, "mean_token_accuracy": 0.8433947190642357, "step": 2585 }, { "epoch": 2.5512489233419466, "grad_norm": 0.20051565766334534, "learning_rate": 1.3302070581457716e-06, "loss": 0.4994, "mean_token_accuracy": 0.838576278090477, "step": 2590 }, { "epoch": 2.5561707887289282, "grad_norm": 0.2312447875738144, "learning_rate": 1.3017804421601298e-06, "loss": 0.492, "mean_token_accuracy": 0.8404266074299812, "step": 2595 }, { "epoch": 2.56109265411591, "grad_norm": 0.21526625752449036, "learning_rate": 1.273639695663108e-06, "loss": 0.4916, "mean_token_accuracy": 0.8403177246451378, "step": 2600 }, { "epoch": 2.5660145195028914, "grad_norm": 0.4974516034126282, "learning_rate": 1.245785743508441e-06, "loss": 0.4887, "mean_token_accuracy": 0.8414172142744064, "step": 2605 }, { "epoch": 2.5709363848898734, "grad_norm": 0.19956116378307343, "learning_rate": 1.2182195011242747e-06, "loss": 0.5017, "mean_token_accuracy": 0.837465213239193, "step": 2610 }, { "epoch": 2.575858250276855, "grad_norm": 0.19986701011657715, "learning_rate": 1.1909418744831048e-06, "loss": 0.4878, "mean_token_accuracy": 0.8414024114608765, "step": 2615 }, { "epoch": 2.5807801156638366, "grad_norm": 0.20174540579319, "learning_rate": 1.1639537600719764e-06, "loss": 0.4858, "mean_token_accuracy": 0.8420050874352455, "step": 2620 }, { "epoch": 2.585701981050818, "grad_norm": 0.20654183626174927, "learning_rate": 1.1372560448630377e-06, "loss": 0.4938, "mean_token_accuracy": 0.8395126640796662, "step": 2625 }, { "epoch": 2.5906238464377997, "grad_norm": 0.19598302245140076, "learning_rate": 1.1108496062843743e-06, "loss": 0.486, "mean_token_accuracy": 0.8420949026942253, "step": 2630 }, { "epoch": 2.5955457118247818, "grad_norm": 0.20486712455749512, "learning_rate": 1.0847353121911952e-06, "loss": 0.4891, "mean_token_accuracy": 0.8409939989447593, "step": 2635 }, { "epoch": 2.6004675772117634, "grad_norm": 0.2051970511674881, "learning_rate": 1.0589140208372872e-06, "loss": 0.4871, "mean_token_accuracy": 0.8416621774435044, "step": 2640 }, { "epoch": 2.605389442598745, "grad_norm": 0.20128969848155975, "learning_rate": 1.0333865808468203e-06, "loss": 0.4824, "mean_token_accuracy": 0.8431450635194778, "step": 2645 }, { "epoch": 2.6103113079857265, "grad_norm": 0.2007114738225937, "learning_rate": 1.008153831186457e-06, "loss": 0.4917, "mean_token_accuracy": 0.8406037405133248, "step": 2650 }, { "epoch": 2.615233173372708, "grad_norm": 0.19757139682769775, "learning_rate": 9.83216601137773e-07, "loss": 0.488, "mean_token_accuracy": 0.8414921492338181, "step": 2655 }, { "epoch": 2.62015503875969, "grad_norm": 0.21764694154262543, "learning_rate": 9.58575710270011e-07, "loss": 0.4819, "mean_token_accuracy": 0.8431682124733925, "step": 2660 }, { "epoch": 2.6250769041466717, "grad_norm": 0.20229902863502502, "learning_rate": 9.342319684131396e-07, "loss": 0.4916, "mean_token_accuracy": 0.8404648944735527, "step": 2665 }, { "epoch": 2.6299987695336533, "grad_norm": 0.22413024306297302, "learning_rate": 9.101861756312369e-07, "loss": 0.489, "mean_token_accuracy": 0.8410172060132026, "step": 2670 }, { "epoch": 2.634920634920635, "grad_norm": 0.1993047147989273, "learning_rate": 8.864391221962065e-07, "loss": 0.488, "mean_token_accuracy": 0.841397476196289, "step": 2675 }, { "epoch": 2.6398425003076165, "grad_norm": 0.20383085310459137, "learning_rate": 8.629915885617912e-07, "loss": 0.4906, "mean_token_accuracy": 0.8405807599425316, "step": 2680 }, { "epoch": 2.6447643656945985, "grad_norm": 0.19943130016326904, "learning_rate": 8.398443453379268e-07, "loss": 0.4872, "mean_token_accuracy": 0.841593649983406, "step": 2685 }, { "epoch": 2.64968623108158, "grad_norm": 0.19960327446460724, "learning_rate": 8.169981532654269e-07, "loss": 0.4854, "mean_token_accuracy": 0.8422250881791115, "step": 2690 }, { "epoch": 2.6546080964685617, "grad_norm": 0.20726507902145386, "learning_rate": 7.944537631909666e-07, "loss": 0.4855, "mean_token_accuracy": 0.8422259956598281, "step": 2695 }, { "epoch": 2.6595299618555432, "grad_norm": 0.19812346994876862, "learning_rate": 7.722119160424113e-07, "loss": 0.4867, "mean_token_accuracy": 0.842007802426815, "step": 2700 }, { "epoch": 2.664451827242525, "grad_norm": 0.19591908156871796, "learning_rate": 7.502733428044684e-07, "loss": 0.486, "mean_token_accuracy": 0.8423181056976319, "step": 2705 }, { "epoch": 2.669373692629507, "grad_norm": 0.195572167634964, "learning_rate": 7.286387644946602e-07, "loss": 0.4965, "mean_token_accuracy": 0.8387840166687965, "step": 2710 }, { "epoch": 2.674295558016488, "grad_norm": 0.2031807154417038, "learning_rate": 7.073088921396287e-07, "loss": 0.4907, "mean_token_accuracy": 0.840399731695652, "step": 2715 }, { "epoch": 2.67921742340347, "grad_norm": 0.2004314363002777, "learning_rate": 6.862844267517643e-07, "loss": 0.4846, "mean_token_accuracy": 0.8423734799027442, "step": 2720 }, { "epoch": 2.6841392887904516, "grad_norm": 0.20816642045974731, "learning_rate": 6.655660593061719e-07, "loss": 0.4982, "mean_token_accuracy": 0.8385626211762428, "step": 2725 }, { "epoch": 2.689061154177433, "grad_norm": 0.20351089537143707, "learning_rate": 6.451544707179635e-07, "loss": 0.4948, "mean_token_accuracy": 0.8395294427871705, "step": 2730 }, { "epoch": 2.6939830195644148, "grad_norm": 0.20076881349086761, "learning_rate": 6.250503318198664e-07, "loss": 0.4888, "mean_token_accuracy": 0.8412301942706109, "step": 2735 }, { "epoch": 2.6989048849513964, "grad_norm": 0.25244539976119995, "learning_rate": 6.052543033401892e-07, "loss": 0.4918, "mean_token_accuracy": 0.8402833178639412, "step": 2740 }, { "epoch": 2.7038267503383784, "grad_norm": 0.2058088779449463, "learning_rate": 5.857670358811096e-07, "loss": 0.4914, "mean_token_accuracy": 0.8405940279364585, "step": 2745 }, { "epoch": 2.70874861572536, "grad_norm": 0.2002749741077423, "learning_rate": 5.665891698972769e-07, "loss": 0.4956, "mean_token_accuracy": 0.8391197189688683, "step": 2750 }, { "epoch": 2.7136704811123415, "grad_norm": 0.19865228235721588, "learning_rate": 5.477213356747746e-07, "loss": 0.4894, "mean_token_accuracy": 0.8410469844937325, "step": 2755 }, { "epoch": 2.718592346499323, "grad_norm": 0.20059484243392944, "learning_rate": 5.291641533104053e-07, "loss": 0.4817, "mean_token_accuracy": 0.8434463173151017, "step": 2760 }, { "epoch": 2.7235142118863047, "grad_norm": 0.19962534308433533, "learning_rate": 5.109182326913053e-07, "loss": 0.4815, "mean_token_accuracy": 0.8433682397007942, "step": 2765 }, { "epoch": 2.7284360772732867, "grad_norm": 0.1976374238729477, "learning_rate": 4.929841734749063e-07, "loss": 0.4824, "mean_token_accuracy": 0.8429444268345833, "step": 2770 }, { "epoch": 2.7333579426602683, "grad_norm": 0.1919257491827011, "learning_rate": 4.7536256506922507e-07, "loss": 0.4858, "mean_token_accuracy": 0.8420413583517075, "step": 2775 }, { "epoch": 2.73827980804725, "grad_norm": 0.21447736024856567, "learning_rate": 4.580539866134914e-07, "loss": 0.4898, "mean_token_accuracy": 0.8408365085721016, "step": 2780 }, { "epoch": 2.7432016734342315, "grad_norm": 0.20053516328334808, "learning_rate": 4.410590069591192e-07, "loss": 0.4918, "mean_token_accuracy": 0.8403174698352813, "step": 2785 }, { "epoch": 2.748123538821213, "grad_norm": 0.3303152620792389, "learning_rate": 4.2437818465100313e-07, "loss": 0.4812, "mean_token_accuracy": 0.8434215649962425, "step": 2790 }, { "epoch": 2.753045404208195, "grad_norm": 0.194558247923851, "learning_rate": 4.0801206790916815e-07, "loss": 0.4804, "mean_token_accuracy": 0.8438364923000335, "step": 2795 }, { "epoch": 2.7579672695951767, "grad_norm": 0.19499559700489044, "learning_rate": 3.919611946107493e-07, "loss": 0.4825, "mean_token_accuracy": 0.8429989367723465, "step": 2800 }, { "epoch": 2.7628891349821583, "grad_norm": 0.19578364491462708, "learning_rate": 3.762260922723182e-07, "loss": 0.4866, "mean_token_accuracy": 0.8416179150342942, "step": 2805 }, { "epoch": 2.76781100036914, "grad_norm": 0.20279313623905182, "learning_rate": 3.6080727803254003e-07, "loss": 0.4913, "mean_token_accuracy": 0.8406556889414787, "step": 2810 }, { "epoch": 2.7727328657561214, "grad_norm": 0.20414599776268005, "learning_rate": 3.457052586351817e-07, "loss": 0.4921, "mean_token_accuracy": 0.8403137296438217, "step": 2815 }, { "epoch": 2.7776547311431035, "grad_norm": 0.20257827639579773, "learning_rate": 3.309205304124552e-07, "loss": 0.4888, "mean_token_accuracy": 0.841057425737381, "step": 2820 }, { "epoch": 2.782576596530085, "grad_norm": 0.19924387335777283, "learning_rate": 3.1645357926870957e-07, "loss": 0.4966, "mean_token_accuracy": 0.8389097020030022, "step": 2825 }, { "epoch": 2.7874984619170666, "grad_norm": 0.20351967215538025, "learning_rate": 3.0230488066445465e-07, "loss": 0.4912, "mean_token_accuracy": 0.8404456153512001, "step": 2830 }, { "epoch": 2.792420327304048, "grad_norm": 0.199168398976326, "learning_rate": 2.8847489960074136e-07, "loss": 0.4936, "mean_token_accuracy": 0.8398653537034988, "step": 2835 }, { "epoch": 2.79734219269103, "grad_norm": 0.19794094562530518, "learning_rate": 2.7496409060387973e-07, "loss": 0.4962, "mean_token_accuracy": 0.8388495057821274, "step": 2840 }, { "epoch": 2.802264058078012, "grad_norm": 0.19937507808208466, "learning_rate": 2.6177289771049274e-07, "loss": 0.4895, "mean_token_accuracy": 0.8410208597779274, "step": 2845 }, { "epoch": 2.807185923464993, "grad_norm": 0.19925516843795776, "learning_rate": 2.489017544529315e-07, "loss": 0.4875, "mean_token_accuracy": 0.8415358811616898, "step": 2850 }, { "epoch": 2.812107788851975, "grad_norm": 0.19592879712581635, "learning_rate": 2.3635108384502003e-07, "loss": 0.4949, "mean_token_accuracy": 0.839320321381092, "step": 2855 }, { "epoch": 2.8170296542389566, "grad_norm": 0.19561193883419037, "learning_rate": 2.2412129836816287e-07, "loss": 0.4913, "mean_token_accuracy": 0.840375654399395, "step": 2860 }, { "epoch": 2.821951519625938, "grad_norm": 0.1935349404811859, "learning_rate": 2.1221279995777833e-07, "loss": 0.4859, "mean_token_accuracy": 0.8416187852621079, "step": 2865 }, { "epoch": 2.8268733850129197, "grad_norm": 0.19886697828769684, "learning_rate": 2.0062597999009114e-07, "loss": 0.4821, "mean_token_accuracy": 0.8432388514280319, "step": 2870 }, { "epoch": 2.8317952503999013, "grad_norm": 0.19826510548591614, "learning_rate": 1.8936121926927508e-07, "loss": 0.49, "mean_token_accuracy": 0.8409401133656502, "step": 2875 }, { "epoch": 2.8367171157868833, "grad_norm": 0.21422724425792694, "learning_rate": 1.7841888801493178e-07, "loss": 0.4897, "mean_token_accuracy": 0.840906199812889, "step": 2880 }, { "epoch": 2.841638981173865, "grad_norm": 0.2021849900484085, "learning_rate": 1.677993458499272e-07, "loss": 0.4871, "mean_token_accuracy": 0.8416887044906616, "step": 2885 }, { "epoch": 2.8465608465608465, "grad_norm": 0.19902034103870392, "learning_rate": 1.5750294178856872e-07, "loss": 0.4884, "mean_token_accuracy": 0.8414162322878838, "step": 2890 }, { "epoch": 2.851482711947828, "grad_norm": 0.19861221313476562, "learning_rate": 1.4753001422514125e-07, "loss": 0.4926, "mean_token_accuracy": 0.8401012614369392, "step": 2895 }, { "epoch": 2.8564045773348097, "grad_norm": 0.19735361635684967, "learning_rate": 1.378808909227769e-07, "loss": 0.4849, "mean_token_accuracy": 0.8422791570425033, "step": 2900 }, { "epoch": 2.8613264427217917, "grad_norm": 0.20118270814418793, "learning_rate": 1.2855588900269057e-07, "loss": 0.4912, "mean_token_accuracy": 0.8406861796975136, "step": 2905 }, { "epoch": 2.8662483081087733, "grad_norm": 0.19249391555786133, "learning_rate": 1.1955531493375137e-07, "loss": 0.4795, "mean_token_accuracy": 0.8438849881291389, "step": 2910 }, { "epoch": 2.871170173495755, "grad_norm": 0.19686251878738403, "learning_rate": 1.1087946452241871e-07, "loss": 0.4937, "mean_token_accuracy": 0.8399393901228904, "step": 2915 }, { "epoch": 2.8760920388827365, "grad_norm": 0.1956326812505722, "learning_rate": 1.0252862290301092e-07, "loss": 0.4887, "mean_token_accuracy": 0.841577798128128, "step": 2920 }, { "epoch": 2.881013904269718, "grad_norm": 0.2053905874490738, "learning_rate": 9.45030645283418e-08, "loss": 0.4897, "mean_token_accuracy": 0.8410707041621208, "step": 2925 }, { "epoch": 2.8859357696567, "grad_norm": 0.19495834410190582, "learning_rate": 8.68030531606967e-08, "loss": 0.4927, "mean_token_accuracy": 0.8402184978127479, "step": 2930 }, { "epoch": 2.8908576350436817, "grad_norm": 0.1992396116256714, "learning_rate": 7.94288418631639e-08, "loss": 0.4857, "mean_token_accuracy": 0.842261828482151, "step": 2935 }, { "epoch": 2.8957795004306632, "grad_norm": 0.20448440313339233, "learning_rate": 7.238067299131901e-08, "loss": 0.4907, "mean_token_accuracy": 0.841072927415371, "step": 2940 }, { "epoch": 2.900701365817645, "grad_norm": 0.19940471649169922, "learning_rate": 6.565877818526245e-08, "loss": 0.4886, "mean_token_accuracy": 0.8412072688341141, "step": 2945 }, { "epoch": 2.9056232312046264, "grad_norm": 0.19256047904491425, "learning_rate": 5.926337836199891e-08, "loss": 0.4867, "mean_token_accuracy": 0.8416444838047028, "step": 2950 }, { "epoch": 2.9105450965916084, "grad_norm": 0.19797919690608978, "learning_rate": 5.319468370818537e-08, "loss": 0.4897, "mean_token_accuracy": 0.8410748258233071, "step": 2955 }, { "epoch": 2.91546696197859, "grad_norm": 0.1998082846403122, "learning_rate": 4.7452893673216596e-08, "loss": 0.4845, "mean_token_accuracy": 0.8427498519420624, "step": 2960 }, { "epoch": 2.9203888273655716, "grad_norm": 0.19540701806545258, "learning_rate": 4.203819696267486e-08, "loss": 0.4907, "mean_token_accuracy": 0.8408638656139373, "step": 2965 }, { "epoch": 2.925310692752553, "grad_norm": 0.19913552701473236, "learning_rate": 3.6950771532126004e-08, "loss": 0.4983, "mean_token_accuracy": 0.8385754480957985, "step": 2970 }, { "epoch": 2.9302325581395348, "grad_norm": 0.19257843494415283, "learning_rate": 3.2190784581270786e-08, "loss": 0.4878, "mean_token_accuracy": 0.841645573079586, "step": 2975 }, { "epoch": 2.935154423526517, "grad_norm": 0.19568364322185516, "learning_rate": 2.7758392548449253e-08, "loss": 0.4891, "mean_token_accuracy": 0.8412896126508713, "step": 2980 }, { "epoch": 2.9400762889134984, "grad_norm": 0.20067226886749268, "learning_rate": 2.3653741105499338e-08, "loss": 0.4836, "mean_token_accuracy": 0.8427690804004669, "step": 2985 }, { "epoch": 2.94499815430048, "grad_norm": 0.19799287617206573, "learning_rate": 1.9876965152975102e-08, "loss": 0.4895, "mean_token_accuracy": 0.8405489608645439, "step": 2990 }, { "epoch": 2.9499200196874615, "grad_norm": 1.0325350761413574, "learning_rate": 1.6428188815703627e-08, "loss": 0.4896, "mean_token_accuracy": 0.8411920800805092, "step": 2995 }, { "epoch": 2.954841885074443, "grad_norm": 0.1966339498758316, "learning_rate": 1.3307525438711611e-08, "loss": 0.488, "mean_token_accuracy": 0.841396550834179, "step": 3000 }, { "epoch": 2.959763750461425, "grad_norm": 0.2234841138124466, "learning_rate": 1.0515077583498346e-08, "loss": 0.4911, "mean_token_accuracy": 0.8406392633914948, "step": 3005 }, { "epoch": 2.9646856158484063, "grad_norm": 0.27488455176353455, "learning_rate": 8.050937024666195e-09, "loss": 0.4942, "mean_token_accuracy": 0.8396434351801872, "step": 3010 }, { "epoch": 2.9696074812353883, "grad_norm": 0.1911349892616272, "learning_rate": 5.9151847469041125e-09, "loss": 0.4823, "mean_token_accuracy": 0.8430395260453224, "step": 3015 }, { "epoch": 2.97452934662237, "grad_norm": 0.19882096350193024, "learning_rate": 4.1078909423253325e-09, "loss": 0.4995, "mean_token_accuracy": 0.8379872292280197, "step": 3020 }, { "epoch": 2.9794512120093515, "grad_norm": 0.20069076120853424, "learning_rate": 2.629115008160321e-09, "loss": 0.4964, "mean_token_accuracy": 0.8388297706842422, "step": 3025 }, { "epoch": 2.984373077396333, "grad_norm": 0.19437766075134277, "learning_rate": 1.4789055448061195e-09, "loss": 0.4851, "mean_token_accuracy": 0.8421405225992202, "step": 3030 }, { "epoch": 2.9892949427833146, "grad_norm": 0.19950829446315765, "learning_rate": 6.573003542276191e-10, "loss": 0.4889, "mean_token_accuracy": 0.8408236041665077, "step": 3035 }, { "epoch": 2.9942168081702967, "grad_norm": 0.19173409044742584, "learning_rate": 1.6432643871633346e-10, "loss": 0.4873, "mean_token_accuracy": 0.8419449985027313, "step": 3040 }, { "epoch": 2.9991386735572783, "grad_norm": 0.1980327069759369, "learning_rate": 0.0, "loss": 0.4895, "mean_token_accuracy": 0.8409327268600464, "step": 3045 }, { "epoch": 2.9991386735572783, "step": 3045, "total_flos": 2550348896010240.0, "train_loss": 0.5881131024979214, "train_runtime": 268544.791, "train_samples_per_second": 1.452, "train_steps_per_second": 0.011 } ], "logging_steps": 5, "max_steps": 3045, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2550348896010240.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }