{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 100, "global_step": 1578, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0063371356147021544, "grad_norm": 7.199723076955636, "learning_rate": 3.164556962025317e-07, "loss": 1.4397, "mean_token_accuracy": 0.6951015710830688, "step": 5 }, { "epoch": 0.012674271229404309, "grad_norm": 7.116001217650177, "learning_rate": 6.329113924050634e-07, "loss": 1.4552, "mean_token_accuracy": 0.6930991888046265, "step": 10 }, { "epoch": 0.019011406844106463, "grad_norm": 4.64551484224683, "learning_rate": 9.493670886075951e-07, "loss": 1.3993, "mean_token_accuracy": 0.6986153647303581, "step": 15 }, { "epoch": 0.025348542458808618, "grad_norm": 3.0027875956032366, "learning_rate": 1.2658227848101267e-06, "loss": 1.3103, "mean_token_accuracy": 0.7071203991770745, "step": 20 }, { "epoch": 0.031685678073510776, "grad_norm": 3.1251366551644244, "learning_rate": 1.5822784810126585e-06, "loss": 1.2458, "mean_token_accuracy": 0.7130974352359771, "step": 25 }, { "epoch": 0.03802281368821293, "grad_norm": 2.289149516732586, "learning_rate": 1.8987341772151901e-06, "loss": 1.1709, "mean_token_accuracy": 0.7238569274544716, "step": 30 }, { "epoch": 0.044359949302915085, "grad_norm": 2.0282034523858945, "learning_rate": 2.2151898734177215e-06, "loss": 1.1025, "mean_token_accuracy": 0.7365155085921288, "step": 35 }, { "epoch": 0.050697084917617236, "grad_norm": 1.4393865600169713, "learning_rate": 2.5316455696202535e-06, "loss": 1.0754, "mean_token_accuracy": 0.7417579337954521, "step": 40 }, { "epoch": 0.057034220532319393, "grad_norm": 0.9968108242787993, "learning_rate": 2.848101265822785e-06, "loss": 1.0382, "mean_token_accuracy": 0.7486504480242729, "step": 45 }, { "epoch": 0.06337135614702155, "grad_norm": 0.9450280587173171, "learning_rate": 3.164556962025317e-06, "loss": 1.0089, "mean_token_accuracy": 0.7545697376132011, "step": 50 }, { "epoch": 0.0697084917617237, "grad_norm": 0.9255531790602832, "learning_rate": 3.4810126582278487e-06, "loss": 0.974, "mean_token_accuracy": 0.7610737249255181, "step": 55 }, { "epoch": 0.07604562737642585, "grad_norm": 0.784862301963824, "learning_rate": 3.7974683544303802e-06, "loss": 0.9446, "mean_token_accuracy": 0.7658546909689903, "step": 60 }, { "epoch": 0.08238276299112801, "grad_norm": 0.8654366770506445, "learning_rate": 4.113924050632912e-06, "loss": 0.9532, "mean_token_accuracy": 0.764112365245819, "step": 65 }, { "epoch": 0.08871989860583017, "grad_norm": 0.8440093108811262, "learning_rate": 4.430379746835443e-06, "loss": 0.9019, "mean_token_accuracy": 0.7735387146472931, "step": 70 }, { "epoch": 0.09505703422053231, "grad_norm": 0.7242515634546599, "learning_rate": 4.746835443037975e-06, "loss": 0.8972, "mean_token_accuracy": 0.7742968738079071, "step": 75 }, { "epoch": 0.10139416983523447, "grad_norm": 0.7207614994286641, "learning_rate": 5.063291139240507e-06, "loss": 0.8872, "mean_token_accuracy": 0.7761535227298737, "step": 80 }, { "epoch": 0.10773130544993663, "grad_norm": 0.742791199954622, "learning_rate": 5.379746835443038e-06, "loss": 0.8559, "mean_token_accuracy": 0.7819930538535118, "step": 85 }, { "epoch": 0.11406844106463879, "grad_norm": 0.7678641716925835, "learning_rate": 5.69620253164557e-06, "loss": 0.8473, "mean_token_accuracy": 0.7834332928061485, "step": 90 }, { "epoch": 0.12040557667934093, "grad_norm": 0.71180994773894, "learning_rate": 6.012658227848101e-06, "loss": 0.8352, "mean_token_accuracy": 0.7855397373437881, "step": 95 }, { "epoch": 0.1267427122940431, "grad_norm": 0.7993738785147041, "learning_rate": 6.329113924050634e-06, "loss": 0.8589, "mean_token_accuracy": 0.7812221512198448, "step": 100 }, { "epoch": 0.13307984790874525, "grad_norm": 0.7568194042750847, "learning_rate": 6.645569620253165e-06, "loss": 0.8431, "mean_token_accuracy": 0.7850423708558083, "step": 105 }, { "epoch": 0.1394169835234474, "grad_norm": 0.7969657403354691, "learning_rate": 6.962025316455697e-06, "loss": 0.8146, "mean_token_accuracy": 0.7894491747021675, "step": 110 }, { "epoch": 0.14575411913814956, "grad_norm": 0.7814384559927074, "learning_rate": 7.2784810126582285e-06, "loss": 0.816, "mean_token_accuracy": 0.7893038675189018, "step": 115 }, { "epoch": 0.1520912547528517, "grad_norm": 0.7970973600599863, "learning_rate": 7.5949367088607605e-06, "loss": 0.8168, "mean_token_accuracy": 0.7892953917384148, "step": 120 }, { "epoch": 0.15842839036755388, "grad_norm": 0.7289531586042841, "learning_rate": 7.911392405063292e-06, "loss": 0.8036, "mean_token_accuracy": 0.7918314695358276, "step": 125 }, { "epoch": 0.16476552598225602, "grad_norm": 0.8996289034177167, "learning_rate": 8.227848101265824e-06, "loss": 0.7886, "mean_token_accuracy": 0.7948763906955719, "step": 130 }, { "epoch": 0.17110266159695817, "grad_norm": 0.9505048466982942, "learning_rate": 8.544303797468356e-06, "loss": 0.7765, "mean_token_accuracy": 0.7972663462162017, "step": 135 }, { "epoch": 0.17743979721166034, "grad_norm": 0.8547089186827208, "learning_rate": 8.860759493670886e-06, "loss": 0.7778, "mean_token_accuracy": 0.7966541960835457, "step": 140 }, { "epoch": 0.18377693282636248, "grad_norm": 0.8115832093940138, "learning_rate": 9.177215189873418e-06, "loss": 0.7755, "mean_token_accuracy": 0.7976241648197174, "step": 145 }, { "epoch": 0.19011406844106463, "grad_norm": 0.7240367508508893, "learning_rate": 9.49367088607595e-06, "loss": 0.7679, "mean_token_accuracy": 0.7988486766815186, "step": 150 }, { "epoch": 0.1964512040557668, "grad_norm": 0.8604548037210017, "learning_rate": 9.810126582278482e-06, "loss": 0.7666, "mean_token_accuracy": 0.7985615819692612, "step": 155 }, { "epoch": 0.20278833967046894, "grad_norm": 0.7650650036074902, "learning_rate": 9.99995105342046e-06, "loss": 0.7615, "mean_token_accuracy": 0.8001444712281227, "step": 160 }, { "epoch": 0.20912547528517111, "grad_norm": 0.7907667013526878, "learning_rate": 9.999400415406145e-06, "loss": 0.7662, "mean_token_accuracy": 0.7991914421319961, "step": 165 }, { "epoch": 0.21546261089987326, "grad_norm": 0.8483269008499935, "learning_rate": 9.998238023756727e-06, "loss": 0.7597, "mean_token_accuracy": 0.800473365187645, "step": 170 }, { "epoch": 0.2217997465145754, "grad_norm": 0.8423617289320647, "learning_rate": 9.996464020708734e-06, "loss": 0.7598, "mean_token_accuracy": 0.7996020078659057, "step": 175 }, { "epoch": 0.22813688212927757, "grad_norm": 0.8057636513151334, "learning_rate": 9.994078623338757e-06, "loss": 0.7566, "mean_token_accuracy": 0.800896917283535, "step": 180 }, { "epoch": 0.23447401774397972, "grad_norm": 0.8990102449117932, "learning_rate": 9.991082123536902e-06, "loss": 0.7522, "mean_token_accuracy": 0.8013818353414536, "step": 185 }, { "epoch": 0.24081115335868186, "grad_norm": 0.9698242122380497, "learning_rate": 9.987474887971067e-06, "loss": 0.7463, "mean_token_accuracy": 0.8028701841831207, "step": 190 }, { "epoch": 0.24714828897338403, "grad_norm": 0.960138107922893, "learning_rate": 9.983257358042076e-06, "loss": 0.7401, "mean_token_accuracy": 0.8041222214698791, "step": 195 }, { "epoch": 0.2534854245880862, "grad_norm": 0.8305799821436418, "learning_rate": 9.978430049829672e-06, "loss": 0.7601, "mean_token_accuracy": 0.8001280605793, "step": 200 }, { "epoch": 0.2598225602027883, "grad_norm": 0.7129991697669212, "learning_rate": 9.972993554029357e-06, "loss": 0.7575, "mean_token_accuracy": 0.8003058210015297, "step": 205 }, { "epoch": 0.2661596958174905, "grad_norm": 0.8819778135317846, "learning_rate": 9.966948535880118e-06, "loss": 0.7444, "mean_token_accuracy": 0.8032929092645645, "step": 210 }, { "epoch": 0.27249683143219267, "grad_norm": 0.7833769376025013, "learning_rate": 9.960295735083023e-06, "loss": 0.7151, "mean_token_accuracy": 0.8091372177004814, "step": 215 }, { "epoch": 0.2788339670468948, "grad_norm": 1.2183716099383828, "learning_rate": 9.953035965710707e-06, "loss": 0.7346, "mean_token_accuracy": 0.8045761153101921, "step": 220 }, { "epoch": 0.28517110266159695, "grad_norm": 0.901875985078722, "learning_rate": 9.945170116107758e-06, "loss": 0.7337, "mean_token_accuracy": 0.805341312289238, "step": 225 }, { "epoch": 0.2915082382762991, "grad_norm": 0.8010789142797352, "learning_rate": 9.936699148782018e-06, "loss": 0.737, "mean_token_accuracy": 0.8051745280623436, "step": 230 }, { "epoch": 0.29784537389100124, "grad_norm": 0.7625075089546256, "learning_rate": 9.927624100286795e-06, "loss": 0.7288, "mean_token_accuracy": 0.8064413368701935, "step": 235 }, { "epoch": 0.3041825095057034, "grad_norm": 0.7665506308733163, "learning_rate": 9.917946081094033e-06, "loss": 0.7001, "mean_token_accuracy": 0.8119662031531334, "step": 240 }, { "epoch": 0.3105196451204056, "grad_norm": 0.8173005458001997, "learning_rate": 9.907666275458432e-06, "loss": 0.7171, "mean_token_accuracy": 0.8087792381644249, "step": 245 }, { "epoch": 0.31685678073510776, "grad_norm": 0.7944707699072153, "learning_rate": 9.896785941272524e-06, "loss": 0.7169, "mean_token_accuracy": 0.808886106312275, "step": 250 }, { "epoch": 0.3231939163498099, "grad_norm": 0.7652270272633694, "learning_rate": 9.885306409912767e-06, "loss": 0.7122, "mean_token_accuracy": 0.8092179223895073, "step": 255 }, { "epoch": 0.32953105196451205, "grad_norm": 0.8298720811434571, "learning_rate": 9.87322908607661e-06, "loss": 0.7106, "mean_token_accuracy": 0.8099273145198822, "step": 260 }, { "epoch": 0.3358681875792142, "grad_norm": 0.6635750146125453, "learning_rate": 9.860555447610626e-06, "loss": 0.7205, "mean_token_accuracy": 0.8083759486675263, "step": 265 }, { "epoch": 0.34220532319391633, "grad_norm": 0.7701167215766528, "learning_rate": 9.847287045329665e-06, "loss": 0.7178, "mean_token_accuracy": 0.8084105476737022, "step": 270 }, { "epoch": 0.3485424588086185, "grad_norm": 0.8129554147937268, "learning_rate": 9.833425502827087e-06, "loss": 0.7191, "mean_token_accuracy": 0.8078344166278839, "step": 275 }, { "epoch": 0.3548795944233207, "grad_norm": 0.7153635278024493, "learning_rate": 9.818972516276096e-06, "loss": 0.6973, "mean_token_accuracy": 0.8126269072294235, "step": 280 }, { "epoch": 0.3612167300380228, "grad_norm": 0.7019835045316667, "learning_rate": 9.803929854222182e-06, "loss": 0.704, "mean_token_accuracy": 0.8114176645874978, "step": 285 }, { "epoch": 0.36755386565272496, "grad_norm": 0.7615682616292789, "learning_rate": 9.788299357366717e-06, "loss": 0.7089, "mean_token_accuracy": 0.8106587365269661, "step": 290 }, { "epoch": 0.37389100126742714, "grad_norm": 0.9786947635111585, "learning_rate": 9.772082938341706e-06, "loss": 0.7014, "mean_token_accuracy": 0.8121261984109879, "step": 295 }, { "epoch": 0.38022813688212925, "grad_norm": 0.8212453521500733, "learning_rate": 9.755282581475769e-06, "loss": 0.7072, "mean_token_accuracy": 0.8106094494462013, "step": 300 }, { "epoch": 0.3865652724968314, "grad_norm": 0.865055505917381, "learning_rate": 9.7379003425513e-06, "loss": 0.7163, "mean_token_accuracy": 0.8092033118009567, "step": 305 }, { "epoch": 0.3929024081115336, "grad_norm": 0.6716631342519259, "learning_rate": 9.71993834855293e-06, "loss": 0.7045, "mean_token_accuracy": 0.8109571009874343, "step": 310 }, { "epoch": 0.39923954372623577, "grad_norm": 0.7649280200479434, "learning_rate": 9.701398797407258e-06, "loss": 0.7044, "mean_token_accuracy": 0.8110996559262276, "step": 315 }, { "epoch": 0.4055766793409379, "grad_norm": 0.732205588585856, "learning_rate": 9.68228395771388e-06, "loss": 0.6906, "mean_token_accuracy": 0.8138323068618775, "step": 320 }, { "epoch": 0.41191381495564006, "grad_norm": 0.8349157198044287, "learning_rate": 9.662596168467823e-06, "loss": 0.6963, "mean_token_accuracy": 0.8128764078021049, "step": 325 }, { "epoch": 0.41825095057034223, "grad_norm": 0.7284163977404773, "learning_rate": 9.6423378387733e-06, "loss": 0.6926, "mean_token_accuracy": 0.8138028383255005, "step": 330 }, { "epoch": 0.42458808618504434, "grad_norm": 0.6903566919121088, "learning_rate": 9.621511447548946e-06, "loss": 0.6992, "mean_token_accuracy": 0.8125665381550788, "step": 335 }, { "epoch": 0.4309252217997465, "grad_norm": 0.7031778472809708, "learning_rate": 9.600119543224467e-06, "loss": 0.6935, "mean_token_accuracy": 0.8134042397141457, "step": 340 }, { "epoch": 0.4372623574144487, "grad_norm": 0.8781454719155253, "learning_rate": 9.578164743428808e-06, "loss": 0.6938, "mean_token_accuracy": 0.8132070809602737, "step": 345 }, { "epoch": 0.4435994930291508, "grad_norm": 0.8306105321262149, "learning_rate": 9.55564973466984e-06, "loss": 0.6928, "mean_token_accuracy": 0.8133361831307411, "step": 350 }, { "epoch": 0.449936628643853, "grad_norm": 0.7046997598602632, "learning_rate": 9.532577272005637e-06, "loss": 0.679, "mean_token_accuracy": 0.8159057974815369, "step": 355 }, { "epoch": 0.45627376425855515, "grad_norm": 0.889929783644245, "learning_rate": 9.508950178707335e-06, "loss": 0.6872, "mean_token_accuracy": 0.8148621737957, "step": 360 }, { "epoch": 0.46261089987325726, "grad_norm": 0.9776823189593525, "learning_rate": 9.484771345913673e-06, "loss": 0.6902, "mean_token_accuracy": 0.8141683742403985, "step": 365 }, { "epoch": 0.46894803548795944, "grad_norm": 0.7706175050493524, "learning_rate": 9.460043732277213e-06, "loss": 0.6908, "mean_token_accuracy": 0.8145220652222633, "step": 370 }, { "epoch": 0.4752851711026616, "grad_norm": 0.6524214668295174, "learning_rate": 9.434770363602307e-06, "loss": 0.6983, "mean_token_accuracy": 0.8123016864061355, "step": 375 }, { "epoch": 0.4816223067173637, "grad_norm": 0.717843515473759, "learning_rate": 9.408954332474845e-06, "loss": 0.6677, "mean_token_accuracy": 0.8185531318187713, "step": 380 }, { "epoch": 0.4879594423320659, "grad_norm": 0.7618453217486776, "learning_rate": 9.382598797883811e-06, "loss": 0.6795, "mean_token_accuracy": 0.8164624303579331, "step": 385 }, { "epoch": 0.49429657794676807, "grad_norm": 0.7324610745032628, "learning_rate": 9.355706984834765e-06, "loss": 0.6836, "mean_token_accuracy": 0.8149291038513183, "step": 390 }, { "epoch": 0.5006337135614702, "grad_norm": 0.6779242960146721, "learning_rate": 9.328282183955179e-06, "loss": 0.6884, "mean_token_accuracy": 0.8146958678960801, "step": 395 }, { "epoch": 0.5069708491761724, "grad_norm": 0.827105664596769, "learning_rate": 9.300327751091806e-06, "loss": 0.6873, "mean_token_accuracy": 0.814927139878273, "step": 400 }, { "epoch": 0.5133079847908745, "grad_norm": 0.6798030291558349, "learning_rate": 9.271847106900022e-06, "loss": 0.6659, "mean_token_accuracy": 0.8187542855739594, "step": 405 }, { "epoch": 0.5196451204055766, "grad_norm": 0.6549281773308409, "learning_rate": 9.242843736425269e-06, "loss": 0.6749, "mean_token_accuracy": 0.8172334164381028, "step": 410 }, { "epoch": 0.5259822560202788, "grad_norm": 0.702757870059226, "learning_rate": 9.213321188676595e-06, "loss": 0.6799, "mean_token_accuracy": 0.8162769109010697, "step": 415 }, { "epoch": 0.532319391634981, "grad_norm": 0.663720096138884, "learning_rate": 9.183283076192386e-06, "loss": 0.6688, "mean_token_accuracy": 0.8184930950403213, "step": 420 }, { "epoch": 0.5386565272496832, "grad_norm": 0.6874302061015839, "learning_rate": 9.152733074598312e-06, "loss": 0.6742, "mean_token_accuracy": 0.8174020066857338, "step": 425 }, { "epoch": 0.5449936628643853, "grad_norm": 0.7315428759079102, "learning_rate": 9.121674922157558e-06, "loss": 0.6738, "mean_token_accuracy": 0.817636775970459, "step": 430 }, { "epoch": 0.5513307984790875, "grad_norm": 0.786643495084509, "learning_rate": 9.090112419313395e-06, "loss": 0.6736, "mean_token_accuracy": 0.817160977423191, "step": 435 }, { "epoch": 0.5576679340937896, "grad_norm": 0.6591137928729695, "learning_rate": 9.058049428224128e-06, "loss": 0.6617, "mean_token_accuracy": 0.8197388723492622, "step": 440 }, { "epoch": 0.5640050697084917, "grad_norm": 0.7812371618484959, "learning_rate": 9.025489872290511e-06, "loss": 0.6634, "mean_token_accuracy": 0.8193035304546357, "step": 445 }, { "epoch": 0.5703422053231939, "grad_norm": 0.6845961589145344, "learning_rate": 8.99243773567565e-06, "loss": 0.6834, "mean_token_accuracy": 0.8159178540110588, "step": 450 }, { "epoch": 0.5766793409378961, "grad_norm": 0.6657472122738636, "learning_rate": 8.958897062817491e-06, "loss": 0.6892, "mean_token_accuracy": 0.8144657433032989, "step": 455 }, { "epoch": 0.5830164765525983, "grad_norm": 0.6161660996953076, "learning_rate": 8.924871957933904e-06, "loss": 0.6746, "mean_token_accuracy": 0.8171708762645722, "step": 460 }, { "epoch": 0.5893536121673004, "grad_norm": 0.702287591616562, "learning_rate": 8.890366584520482e-06, "loss": 0.6696, "mean_token_accuracy": 0.8184025406837463, "step": 465 }, { "epoch": 0.5956907477820025, "grad_norm": 0.6857028656369465, "learning_rate": 8.855385164841072e-06, "loss": 0.6758, "mean_token_accuracy": 0.8170812010765076, "step": 470 }, { "epoch": 0.6020278833967047, "grad_norm": 0.6231296442226781, "learning_rate": 8.819931979411107e-06, "loss": 0.6734, "mean_token_accuracy": 0.81716128885746, "step": 475 }, { "epoch": 0.6083650190114068, "grad_norm": 0.6948180768376443, "learning_rate": 8.78401136647383e-06, "loss": 0.654, "mean_token_accuracy": 0.8216980487108231, "step": 480 }, { "epoch": 0.614702154626109, "grad_norm": 0.6911375954678098, "learning_rate": 8.747627721469437e-06, "loss": 0.6635, "mean_token_accuracy": 0.8201975762844086, "step": 485 }, { "epoch": 0.6210392902408112, "grad_norm": 0.7213580744708442, "learning_rate": 8.710785496497226e-06, "loss": 0.6651, "mean_token_accuracy": 0.8194010749459266, "step": 490 }, { "epoch": 0.6273764258555133, "grad_norm": 0.6543956981962712, "learning_rate": 8.673489199770819e-06, "loss": 0.6607, "mean_token_accuracy": 0.8201611772179603, "step": 495 }, { "epoch": 0.6337135614702155, "grad_norm": 0.7036498788596002, "learning_rate": 8.635743395066511e-06, "loss": 0.651, "mean_token_accuracy": 0.8222277790307999, "step": 500 }, { "epoch": 0.6400506970849176, "grad_norm": 0.6168799535449692, "learning_rate": 8.597552701164818e-06, "loss": 0.6592, "mean_token_accuracy": 0.8199419066309929, "step": 505 }, { "epoch": 0.6463878326996197, "grad_norm": 0.697481072279894, "learning_rate": 8.558921791285304e-06, "loss": 0.6513, "mean_token_accuracy": 0.8216616123914718, "step": 510 }, { "epoch": 0.6527249683143219, "grad_norm": 0.7978637983888536, "learning_rate": 8.519855392514734e-06, "loss": 0.6469, "mean_token_accuracy": 0.8225123390555382, "step": 515 }, { "epoch": 0.6590621039290241, "grad_norm": 0.709080907162204, "learning_rate": 8.480358285228648e-06, "loss": 0.6656, "mean_token_accuracy": 0.8191539570689201, "step": 520 }, { "epoch": 0.6653992395437263, "grad_norm": 0.7897211062263881, "learning_rate": 8.440435302506405e-06, "loss": 0.6412, "mean_token_accuracy": 0.8238195776939392, "step": 525 }, { "epoch": 0.6717363751584284, "grad_norm": 0.6661911891034582, "learning_rate": 8.400091329539784e-06, "loss": 0.6611, "mean_token_accuracy": 0.8201816022396088, "step": 530 }, { "epoch": 0.6780735107731305, "grad_norm": 0.6195761512766995, "learning_rate": 8.359331303035205e-06, "loss": 0.6593, "mean_token_accuracy": 0.8203893005847931, "step": 535 }, { "epoch": 0.6844106463878327, "grad_norm": 0.6310438648482855, "learning_rate": 8.31816021060964e-06, "loss": 0.6634, "mean_token_accuracy": 0.8192948743700981, "step": 540 }, { "epoch": 0.6907477820025348, "grad_norm": 0.6512895144306936, "learning_rate": 8.276583090180311e-06, "loss": 0.6666, "mean_token_accuracy": 0.8186753287911415, "step": 545 }, { "epoch": 0.697084917617237, "grad_norm": 0.6124105415248355, "learning_rate": 8.234605029348224e-06, "loss": 0.6511, "mean_token_accuracy": 0.8219994261860848, "step": 550 }, { "epoch": 0.7034220532319392, "grad_norm": 0.6601442192692848, "learning_rate": 8.192231164775609e-06, "loss": 0.6391, "mean_token_accuracy": 0.8252027094364166, "step": 555 }, { "epoch": 0.7097591888466414, "grad_norm": 0.7028934088221325, "learning_rate": 8.149466681557384e-06, "loss": 0.6558, "mean_token_accuracy": 0.8209778189659118, "step": 560 }, { "epoch": 0.7160963244613435, "grad_norm": 0.7857750596740971, "learning_rate": 8.106316812586676e-06, "loss": 0.6486, "mean_token_accuracy": 0.8220974311232567, "step": 565 }, { "epoch": 0.7224334600760456, "grad_norm": 0.8961234969300941, "learning_rate": 8.062786837914492e-06, "loss": 0.6386, "mean_token_accuracy": 0.824979268014431, "step": 570 }, { "epoch": 0.7287705956907478, "grad_norm": 0.686280664966789, "learning_rate": 8.01888208410362e-06, "loss": 0.6622, "mean_token_accuracy": 0.8198226556181908, "step": 575 }, { "epoch": 0.7351077313054499, "grad_norm": 0.8344864616701414, "learning_rate": 7.974607923576859e-06, "loss": 0.6537, "mean_token_accuracy": 0.821578212082386, "step": 580 }, { "epoch": 0.7414448669201521, "grad_norm": 0.9938826585970929, "learning_rate": 7.9299697739596e-06, "loss": 0.6544, "mean_token_accuracy": 0.8208117336034775, "step": 585 }, { "epoch": 0.7477820025348543, "grad_norm": 0.6249233717628465, "learning_rate": 7.884973097416908e-06, "loss": 0.6591, "mean_token_accuracy": 0.8208227157592773, "step": 590 }, { "epoch": 0.7541191381495564, "grad_norm": 0.6761543444596165, "learning_rate": 7.83962339998514e-06, "loss": 0.6439, "mean_token_accuracy": 0.8236203759908676, "step": 595 }, { "epoch": 0.7604562737642585, "grad_norm": 0.8850862666109794, "learning_rate": 7.793926230898187e-06, "loss": 0.6418, "mean_token_accuracy": 0.8238036289811135, "step": 600 }, { "epoch": 0.7667934093789607, "grad_norm": 0.6931013119334469, "learning_rate": 7.747887181908464e-06, "loss": 0.6513, "mean_token_accuracy": 0.8221172288060188, "step": 605 }, { "epoch": 0.7731305449936628, "grad_norm": 0.9142852384539434, "learning_rate": 7.701511886602643e-06, "loss": 0.6522, "mean_token_accuracy": 0.8214233443140984, "step": 610 }, { "epoch": 0.779467680608365, "grad_norm": 0.693942629552867, "learning_rate": 7.65480601971232e-06, "loss": 0.6555, "mean_token_accuracy": 0.8214162334799766, "step": 615 }, { "epoch": 0.7858048162230672, "grad_norm": 0.7185534733688809, "learning_rate": 7.6077752964196095e-06, "loss": 0.6514, "mean_token_accuracy": 0.821819719672203, "step": 620 }, { "epoch": 0.7921419518377694, "grad_norm": 0.7933553753697458, "learning_rate": 7.560425471657814e-06, "loss": 0.6507, "mean_token_accuracy": 0.8215969070792198, "step": 625 }, { "epoch": 0.7984790874524715, "grad_norm": 0.9942445013974323, "learning_rate": 7.512762339407214e-06, "loss": 0.6426, "mean_token_accuracy": 0.8233709827065467, "step": 630 }, { "epoch": 0.8048162230671736, "grad_norm": 0.7111122316238967, "learning_rate": 7.464791731986084e-06, "loss": 0.6446, "mean_token_accuracy": 0.8233424022793769, "step": 635 }, { "epoch": 0.8111533586818758, "grad_norm": 0.6760326829400325, "learning_rate": 7.4165195193370245e-06, "loss": 0.6411, "mean_token_accuracy": 0.8234749510884285, "step": 640 }, { "epoch": 0.8174904942965779, "grad_norm": 0.7157859491675397, "learning_rate": 7.3679516083086785e-06, "loss": 0.6403, "mean_token_accuracy": 0.8245514526963234, "step": 645 }, { "epoch": 0.8238276299112801, "grad_norm": 0.6125130117848593, "learning_rate": 7.319093941932941e-06, "loss": 0.648, "mean_token_accuracy": 0.8229272648692131, "step": 650 }, { "epoch": 0.8301647655259823, "grad_norm": 0.6193392226038144, "learning_rate": 7.269952498697734e-06, "loss": 0.6568, "mean_token_accuracy": 0.8208993718028068, "step": 655 }, { "epoch": 0.8365019011406845, "grad_norm": 0.5569382668639404, "learning_rate": 7.2205332918154525e-06, "loss": 0.6471, "mean_token_accuracy": 0.8230623930692673, "step": 660 }, { "epoch": 0.8428390367553865, "grad_norm": 0.6854397276184668, "learning_rate": 7.170842368487145e-06, "loss": 0.6394, "mean_token_accuracy": 0.8240847915410996, "step": 665 }, { "epoch": 0.8491761723700887, "grad_norm": 0.7247430930413721, "learning_rate": 7.120885809162561e-06, "loss": 0.6496, "mean_token_accuracy": 0.8226393803954124, "step": 670 }, { "epoch": 0.8555133079847909, "grad_norm": 0.5833185802048395, "learning_rate": 7.070669726796095e-06, "loss": 0.644, "mean_token_accuracy": 0.8238432243466377, "step": 675 }, { "epoch": 0.861850443599493, "grad_norm": 0.6587621871435737, "learning_rate": 7.020200266098791e-06, "loss": 0.6367, "mean_token_accuracy": 0.8251640364527703, "step": 680 }, { "epoch": 0.8681875792141952, "grad_norm": 0.9240470458812879, "learning_rate": 6.969483602786429e-06, "loss": 0.6335, "mean_token_accuracy": 0.8250990778207778, "step": 685 }, { "epoch": 0.8745247148288974, "grad_norm": 0.6647921620988979, "learning_rate": 6.918525942823836e-06, "loss": 0.6358, "mean_token_accuracy": 0.8253032699227333, "step": 690 }, { "epoch": 0.8808618504435995, "grad_norm": 0.7460235517208977, "learning_rate": 6.8673335216654945e-06, "loss": 0.6364, "mean_token_accuracy": 0.8251613467931748, "step": 695 }, { "epoch": 0.8871989860583016, "grad_norm": 0.5692165964237054, "learning_rate": 6.815912603492531e-06, "loss": 0.63, "mean_token_accuracy": 0.8269012838602066, "step": 700 }, { "epoch": 0.8935361216730038, "grad_norm": 0.7678044598257266, "learning_rate": 6.7642694804462026e-06, "loss": 0.641, "mean_token_accuracy": 0.8240568235516548, "step": 705 }, { "epoch": 0.899873257287706, "grad_norm": 0.6476587911488177, "learning_rate": 6.712410471857955e-06, "loss": 0.6389, "mean_token_accuracy": 0.8243090897798538, "step": 710 }, { "epoch": 0.9062103929024081, "grad_norm": 0.6996232991940935, "learning_rate": 6.660341923476152e-06, "loss": 0.6309, "mean_token_accuracy": 0.8264057099819183, "step": 715 }, { "epoch": 0.9125475285171103, "grad_norm": 0.6140056059724183, "learning_rate": 6.608070206689583e-06, "loss": 0.6284, "mean_token_accuracy": 0.826878672838211, "step": 720 }, { "epoch": 0.9188846641318125, "grad_norm": 0.5994244215051143, "learning_rate": 6.555601717747815e-06, "loss": 0.6469, "mean_token_accuracy": 0.8231760680675506, "step": 725 }, { "epoch": 0.9252217997465145, "grad_norm": 0.671715865180922, "learning_rate": 6.502942876978524e-06, "loss": 0.626, "mean_token_accuracy": 0.8275385439395905, "step": 730 }, { "epoch": 0.9315589353612167, "grad_norm": 0.6964725986892187, "learning_rate": 6.450100128001861e-06, "loss": 0.615, "mean_token_accuracy": 0.8296460658311844, "step": 735 }, { "epoch": 0.9378960709759189, "grad_norm": 0.6643867039068622, "learning_rate": 6.397079936941975e-06, "loss": 0.6425, "mean_token_accuracy": 0.823666226863861, "step": 740 }, { "epoch": 0.944233206590621, "grad_norm": 0.612108400302355, "learning_rate": 6.343888791635797e-06, "loss": 0.6222, "mean_token_accuracy": 0.8274678066372871, "step": 745 }, { "epoch": 0.9505703422053232, "grad_norm": 0.5888135214791528, "learning_rate": 6.2905332008391304e-06, "loss": 0.6457, "mean_token_accuracy": 0.8232318565249443, "step": 750 }, { "epoch": 0.9569074778200254, "grad_norm": 0.6023978340437303, "learning_rate": 6.237019693430227e-06, "loss": 0.6244, "mean_token_accuracy": 0.8275379940867424, "step": 755 }, { "epoch": 0.9632446134347274, "grad_norm": 0.5860893552553069, "learning_rate": 6.18335481761086e-06, "loss": 0.6258, "mean_token_accuracy": 0.8275753378868103, "step": 760 }, { "epoch": 0.9695817490494296, "grad_norm": 0.6183329734308459, "learning_rate": 6.1295451401050645e-06, "loss": 0.6487, "mean_token_accuracy": 0.8231626331806183, "step": 765 }, { "epoch": 0.9759188846641318, "grad_norm": 0.6472859730529533, "learning_rate": 6.075597245355589e-06, "loss": 0.6367, "mean_token_accuracy": 0.8252906337380409, "step": 770 }, { "epoch": 0.982256020278834, "grad_norm": 0.7048827333728572, "learning_rate": 6.021517734718193e-06, "loss": 0.6331, "mean_token_accuracy": 0.8252324685454369, "step": 775 }, { "epoch": 0.9885931558935361, "grad_norm": 0.670917489168749, "learning_rate": 5.967313225653863e-06, "loss": 0.6311, "mean_token_accuracy": 0.8262254923582077, "step": 780 }, { "epoch": 0.9949302915082383, "grad_norm": 0.6294039276801214, "learning_rate": 5.912990350919075e-06, "loss": 0.6366, "mean_token_accuracy": 0.8250793889164925, "step": 785 }, { "epoch": 1.0012674271229405, "grad_norm": 0.5750660780176277, "learning_rate": 5.85855575775416e-06, "loss": 0.6356, "mean_token_accuracy": 0.8255759388208389, "step": 790 }, { "epoch": 1.0076045627376427, "grad_norm": 0.5873083803261483, "learning_rate": 5.804016107069922e-06, "loss": 0.5899, "mean_token_accuracy": 0.8365576922893524, "step": 795 }, { "epoch": 1.0139416983523448, "grad_norm": 0.7245732012982048, "learning_rate": 5.749378072632572e-06, "loss": 0.5924, "mean_token_accuracy": 0.8353384211659431, "step": 800 }, { "epoch": 1.020278833967047, "grad_norm": 0.5625102045050165, "learning_rate": 5.694648340247087e-06, "loss": 0.5855, "mean_token_accuracy": 0.8365451633930207, "step": 805 }, { "epoch": 1.026615969581749, "grad_norm": 0.5811796962078384, "learning_rate": 5.639833606939103e-06, "loss": 0.5835, "mean_token_accuracy": 0.8374374285340309, "step": 810 }, { "epoch": 1.0329531051964511, "grad_norm": 0.6064815307080854, "learning_rate": 5.584940580135423e-06, "loss": 0.5918, "mean_token_accuracy": 0.835510890185833, "step": 815 }, { "epoch": 1.0392902408111533, "grad_norm": 0.5469353793310435, "learning_rate": 5.529975976843268e-06, "loss": 0.5765, "mean_token_accuracy": 0.839336322247982, "step": 820 }, { "epoch": 1.0456273764258555, "grad_norm": 0.591194718615289, "learning_rate": 5.474946522828344e-06, "loss": 0.571, "mean_token_accuracy": 0.8397138401865959, "step": 825 }, { "epoch": 1.0519645120405576, "grad_norm": 0.6529351075330402, "learning_rate": 5.419858951791842e-06, "loss": 0.587, "mean_token_accuracy": 0.8367372244596482, "step": 830 }, { "epoch": 1.0583016476552598, "grad_norm": 0.5750159656566077, "learning_rate": 5.364720004546467e-06, "loss": 0.5713, "mean_token_accuracy": 0.8396085217595101, "step": 835 }, { "epoch": 1.064638783269962, "grad_norm": 0.5356356446036812, "learning_rate": 5.3095364281915905e-06, "loss": 0.5743, "mean_token_accuracy": 0.8390779420733452, "step": 840 }, { "epoch": 1.0709759188846641, "grad_norm": 0.5657627605570825, "learning_rate": 5.254314975287649e-06, "loss": 0.5768, "mean_token_accuracy": 0.8388962477445603, "step": 845 }, { "epoch": 1.0773130544993663, "grad_norm": 0.5994046834255601, "learning_rate": 5.199062403029851e-06, "loss": 0.5779, "mean_token_accuracy": 0.838576190173626, "step": 850 }, { "epoch": 1.0836501901140685, "grad_norm": 0.5512378922303693, "learning_rate": 5.143785472421341e-06, "loss": 0.5736, "mean_token_accuracy": 0.8392498835921287, "step": 855 }, { "epoch": 1.0899873257287707, "grad_norm": 0.6231063067990558, "learning_rate": 5.088490947445884e-06, "loss": 0.5787, "mean_token_accuracy": 0.8382582783699035, "step": 860 }, { "epoch": 1.0963244613434728, "grad_norm": 0.6258759211004005, "learning_rate": 5.033185594240184e-06, "loss": 0.5867, "mean_token_accuracy": 0.8368578165769577, "step": 865 }, { "epoch": 1.102661596958175, "grad_norm": 0.5758426513877979, "learning_rate": 4.977876180265948e-06, "loss": 0.5781, "mean_token_accuracy": 0.8380098447203637, "step": 870 }, { "epoch": 1.1089987325728772, "grad_norm": 0.5362099307940532, "learning_rate": 4.922569473481779e-06, "loss": 0.579, "mean_token_accuracy": 0.8374864637851716, "step": 875 }, { "epoch": 1.1153358681875791, "grad_norm": 0.6117359275633708, "learning_rate": 4.867272241515013e-06, "loss": 0.5745, "mean_token_accuracy": 0.8394086301326752, "step": 880 }, { "epoch": 1.1216730038022813, "grad_norm": 0.6163525031585341, "learning_rate": 4.811991250833598e-06, "loss": 0.575, "mean_token_accuracy": 0.8387202203273774, "step": 885 }, { "epoch": 1.1280101394169835, "grad_norm": 0.5344005108748248, "learning_rate": 4.756733265918111e-06, "loss": 0.5805, "mean_token_accuracy": 0.8385160818696022, "step": 890 }, { "epoch": 1.1343472750316856, "grad_norm": 0.5606427842219186, "learning_rate": 4.701505048434017e-06, "loss": 0.58, "mean_token_accuracy": 0.837983712553978, "step": 895 }, { "epoch": 1.1406844106463878, "grad_norm": 0.5635525365545201, "learning_rate": 4.646313356404278e-06, "loss": 0.5721, "mean_token_accuracy": 0.8402201250195503, "step": 900 }, { "epoch": 1.14702154626109, "grad_norm": 0.5279253266696204, "learning_rate": 4.5911649433824055e-06, "loss": 0.5722, "mean_token_accuracy": 0.8398120388388634, "step": 905 }, { "epoch": 1.1533586818757922, "grad_norm": 0.5355638715895371, "learning_rate": 4.536066557626057e-06, "loss": 0.5717, "mean_token_accuracy": 0.8396236389875412, "step": 910 }, { "epoch": 1.1596958174904943, "grad_norm": 0.5298050755566127, "learning_rate": 4.481024941271283e-06, "loss": 0.5825, "mean_token_accuracy": 0.837471354007721, "step": 915 }, { "epoch": 1.1660329531051965, "grad_norm": 0.6099091835516977, "learning_rate": 4.426046829507525e-06, "loss": 0.5739, "mean_token_accuracy": 0.8395572647452354, "step": 920 }, { "epoch": 1.1723700887198987, "grad_norm": 0.5282685583180019, "learning_rate": 4.371138949753457e-06, "loss": 0.5758, "mean_token_accuracy": 0.8386889979243278, "step": 925 }, { "epoch": 1.1787072243346008, "grad_norm": 0.5498053929758666, "learning_rate": 4.316308020833788e-06, "loss": 0.5717, "mean_token_accuracy": 0.8401581376791001, "step": 930 }, { "epoch": 1.1850443599493028, "grad_norm": 0.545684866299052, "learning_rate": 4.261560752157106e-06, "loss": 0.5821, "mean_token_accuracy": 0.8375889748334885, "step": 935 }, { "epoch": 1.1913814955640052, "grad_norm": 0.5275676276739754, "learning_rate": 4.20690384289488e-06, "loss": 0.5865, "mean_token_accuracy": 0.8369634434580803, "step": 940 }, { "epoch": 1.1977186311787071, "grad_norm": 0.5147709084468725, "learning_rate": 4.152343981161713e-06, "loss": 0.5735, "mean_token_accuracy": 0.8388126537203788, "step": 945 }, { "epoch": 1.2040557667934093, "grad_norm": 0.5553445767071536, "learning_rate": 4.097887843196949e-06, "loss": 0.5706, "mean_token_accuracy": 0.8400391504168511, "step": 950 }, { "epoch": 1.2103929024081115, "grad_norm": 0.5755093000837989, "learning_rate": 4.043542092547729e-06, "loss": 0.5738, "mean_token_accuracy": 0.8393745362758637, "step": 955 }, { "epoch": 1.2167300380228137, "grad_norm": 0.5323369182306833, "learning_rate": 3.989313379253609e-06, "loss": 0.5707, "mean_token_accuracy": 0.8395906254649163, "step": 960 }, { "epoch": 1.2230671736375158, "grad_norm": 0.5398923057065517, "learning_rate": 3.935208339032819e-06, "loss": 0.5773, "mean_token_accuracy": 0.8380544230341911, "step": 965 }, { "epoch": 1.229404309252218, "grad_norm": 0.5138506118324425, "learning_rate": 3.881233592470287e-06, "loss": 0.5697, "mean_token_accuracy": 0.8401115134358406, "step": 970 }, { "epoch": 1.2357414448669202, "grad_norm": 0.531254594806762, "learning_rate": 3.827395744207504e-06, "loss": 0.5802, "mean_token_accuracy": 0.8385789826512337, "step": 975 }, { "epoch": 1.2420785804816223, "grad_norm": 0.5209427066358759, "learning_rate": 3.773701382134345e-06, "loss": 0.5788, "mean_token_accuracy": 0.8383644595742226, "step": 980 }, { "epoch": 1.2484157160963245, "grad_norm": 0.4981386065382922, "learning_rate": 3.7201570765829405e-06, "loss": 0.5803, "mean_token_accuracy": 0.8378679618239403, "step": 985 }, { "epoch": 1.2547528517110267, "grad_norm": 0.5310216835837045, "learning_rate": 3.666769379523695e-06, "loss": 0.5816, "mean_token_accuracy": 0.8382963240146637, "step": 990 }, { "epoch": 1.2610899873257289, "grad_norm": 0.5302964748937399, "learning_rate": 3.6135448237635505e-06, "loss": 0.568, "mean_token_accuracy": 0.8408621445298194, "step": 995 }, { "epoch": 1.2674271229404308, "grad_norm": 0.6043312455865852, "learning_rate": 3.5604899221466003e-06, "loss": 0.5797, "mean_token_accuracy": 0.837955892086029, "step": 1000 }, { "epoch": 1.2737642585551332, "grad_norm": 0.5404711838738012, "learning_rate": 3.507611166757141e-06, "loss": 0.577, "mean_token_accuracy": 0.8382121488451958, "step": 1005 }, { "epoch": 1.2801013941698351, "grad_norm": 0.5313905403777647, "learning_rate": 3.4549150281252635e-06, "loss": 0.5759, "mean_token_accuracy": 0.8386555135250091, "step": 1010 }, { "epoch": 1.2864385297845373, "grad_norm": 0.5312545340451698, "learning_rate": 3.4024079544350874e-06, "loss": 0.5766, "mean_token_accuracy": 0.8384982272982597, "step": 1015 }, { "epoch": 1.2927756653992395, "grad_norm": 0.574010488002488, "learning_rate": 3.3500963707357236e-06, "loss": 0.5817, "mean_token_accuracy": 0.838199020922184, "step": 1020 }, { "epoch": 1.2991128010139417, "grad_norm": 0.5162313236359333, "learning_rate": 3.297986678155074e-06, "loss": 0.5596, "mean_token_accuracy": 0.8421908557415009, "step": 1025 }, { "epoch": 1.3054499366286438, "grad_norm": 0.6187258006031299, "learning_rate": 3.24608525311655e-06, "loss": 0.5633, "mean_token_accuracy": 0.842179323732853, "step": 1030 }, { "epoch": 1.311787072243346, "grad_norm": 0.5140882862368508, "learning_rate": 3.1943984465588253e-06, "loss": 0.5704, "mean_token_accuracy": 0.8403183802962303, "step": 1035 }, { "epoch": 1.3181242078580482, "grad_norm": 0.5261806551468972, "learning_rate": 3.142932583158693e-06, "loss": 0.5664, "mean_token_accuracy": 0.8412504211068154, "step": 1040 }, { "epoch": 1.3244613434727504, "grad_norm": 0.5355046745744655, "learning_rate": 3.0916939605571534e-06, "loss": 0.5668, "mean_token_accuracy": 0.8411947041749954, "step": 1045 }, { "epoch": 1.3307984790874525, "grad_norm": 0.5828342485781398, "learning_rate": 3.040688848588788e-06, "loss": 0.5683, "mean_token_accuracy": 0.8403848618268966, "step": 1050 }, { "epoch": 1.3371356147021547, "grad_norm": 0.515568887419182, "learning_rate": 2.989923488514566e-06, "loss": 0.5734, "mean_token_accuracy": 0.8396067947149277, "step": 1055 }, { "epoch": 1.3434727503168569, "grad_norm": 0.533119717549416, "learning_rate": 2.9394040922581123e-06, "loss": 0.5788, "mean_token_accuracy": 0.8387560814619064, "step": 1060 }, { "epoch": 1.3498098859315588, "grad_norm": 0.5574493299249907, "learning_rate": 2.889136841645592e-06, "loss": 0.5738, "mean_token_accuracy": 0.839569516479969, "step": 1065 }, { "epoch": 1.3561470215462612, "grad_norm": 0.5301348229908708, "learning_rate": 2.839127887649271e-06, "loss": 0.5751, "mean_token_accuracy": 0.8394772946834564, "step": 1070 }, { "epoch": 1.3624841571609632, "grad_norm": 0.5071728571486687, "learning_rate": 2.789383349634841e-06, "loss": 0.5711, "mean_token_accuracy": 0.8398226588964463, "step": 1075 }, { "epoch": 1.3688212927756653, "grad_norm": 0.4997381831510659, "learning_rate": 2.73990931461263e-06, "loss": 0.5783, "mean_token_accuracy": 0.8384912863373757, "step": 1080 }, { "epoch": 1.3751584283903675, "grad_norm": 0.5019388182436546, "learning_rate": 2.690711836492758e-06, "loss": 0.5711, "mean_token_accuracy": 0.8396464511752129, "step": 1085 }, { "epoch": 1.3814955640050697, "grad_norm": 0.5165116686484276, "learning_rate": 2.6417969353443484e-06, "loss": 0.5721, "mean_token_accuracy": 0.8395859107375145, "step": 1090 }, { "epoch": 1.3878326996197718, "grad_norm": 0.5372603660779312, "learning_rate": 2.5931705966588803e-06, "loss": 0.5826, "mean_token_accuracy": 0.8370852112770081, "step": 1095 }, { "epoch": 1.394169835234474, "grad_norm": 0.5104565997924485, "learning_rate": 2.544838770617772e-06, "loss": 0.5785, "mean_token_accuracy": 0.8393797069787979, "step": 1100 }, { "epoch": 1.4005069708491762, "grad_norm": 0.5336610190327751, "learning_rate": 2.496807371364283e-06, "loss": 0.5759, "mean_token_accuracy": 0.8390834912657738, "step": 1105 }, { "epoch": 1.4068441064638784, "grad_norm": 0.662951455066245, "learning_rate": 2.44908227627983e-06, "loss": 0.5712, "mean_token_accuracy": 0.8397842928767204, "step": 1110 }, { "epoch": 1.4131812420785805, "grad_norm": 0.5438222471825553, "learning_rate": 2.4016693252647954e-06, "loss": 0.5703, "mean_token_accuracy": 0.8397609844803811, "step": 1115 }, { "epoch": 1.4195183776932827, "grad_norm": 0.5457903944622784, "learning_rate": 2.3545743200239303e-06, "loss": 0.5756, "mean_token_accuracy": 0.8387856274843216, "step": 1120 }, { "epoch": 1.4258555133079849, "grad_norm": 0.5413159299268847, "learning_rate": 2.3078030233564203e-06, "loss": 0.5796, "mean_token_accuracy": 0.8379950270056724, "step": 1125 }, { "epoch": 1.4321926489226868, "grad_norm": 0.5017485230997426, "learning_rate": 2.2613611584507227e-06, "loss": 0.5843, "mean_token_accuracy": 0.8371415048837662, "step": 1130 }, { "epoch": 1.4385297845373892, "grad_norm": 0.5036035556859302, "learning_rate": 2.215254408184249e-06, "loss": 0.5733, "mean_token_accuracy": 0.8397385001182556, "step": 1135 }, { "epoch": 1.4448669201520912, "grad_norm": 0.5512472367603704, "learning_rate": 2.169488414427969e-06, "loss": 0.5665, "mean_token_accuracy": 0.8411229193210602, "step": 1140 }, { "epoch": 1.4512040557667933, "grad_norm": 0.5122324337296091, "learning_rate": 2.1240687773560476e-06, "loss": 0.5754, "mean_token_accuracy": 0.838901475071907, "step": 1145 }, { "epoch": 1.4575411913814955, "grad_norm": 0.514428924855705, "learning_rate": 2.0790010547605743e-06, "loss": 0.5773, "mean_token_accuracy": 0.8385174334049225, "step": 1150 }, { "epoch": 1.4638783269961977, "grad_norm": 0.541489817693485, "learning_rate": 2.0342907613714837e-06, "loss": 0.5724, "mean_token_accuracy": 0.839878860116005, "step": 1155 }, { "epoch": 1.4702154626108999, "grad_norm": 0.5233399327286699, "learning_rate": 1.989943368181741e-06, "loss": 0.5683, "mean_token_accuracy": 0.8406485706567765, "step": 1160 }, { "epoch": 1.476552598225602, "grad_norm": 0.4977622157535387, "learning_rate": 1.945964301777883e-06, "loss": 0.5568, "mean_token_accuracy": 0.8429565221071244, "step": 1165 }, { "epoch": 1.4828897338403042, "grad_norm": 0.502171168050283, "learning_rate": 1.9023589436759954e-06, "loss": 0.555, "mean_token_accuracy": 0.8435925453901291, "step": 1170 }, { "epoch": 1.4892268694550064, "grad_norm": 0.5026240018805591, "learning_rate": 1.859132629663194e-06, "loss": 0.5609, "mean_token_accuracy": 0.8420811951160431, "step": 1175 }, { "epoch": 1.4955640050697085, "grad_norm": 0.5071369135189446, "learning_rate": 1.8162906491447136e-06, "loss": 0.5751, "mean_token_accuracy": 0.8397066414356231, "step": 1180 }, { "epoch": 1.5019011406844105, "grad_norm": 0.5012155091792143, "learning_rate": 1.7738382444966668e-06, "loss": 0.5714, "mean_token_accuracy": 0.839833353459835, "step": 1185 }, { "epoch": 1.508238276299113, "grad_norm": 0.4943163959620169, "learning_rate": 1.7317806104245599e-06, "loss": 0.5614, "mean_token_accuracy": 0.8422631338238716, "step": 1190 }, { "epoch": 1.5145754119138148, "grad_norm": 0.5168969148185261, "learning_rate": 1.6901228933276381e-06, "loss": 0.5734, "mean_token_accuracy": 0.8398737594485283, "step": 1195 }, { "epoch": 1.5209125475285172, "grad_norm": 0.5085722470934201, "learning_rate": 1.6488701906691462e-06, "loss": 0.5743, "mean_token_accuracy": 0.8395018294453621, "step": 1200 }, { "epoch": 1.5272496831432192, "grad_norm": 0.5145560441594629, "learning_rate": 1.6080275503525754e-06, "loss": 0.5714, "mean_token_accuracy": 0.8400074362754821, "step": 1205 }, { "epoch": 1.5335868187579216, "grad_norm": 0.5142209477213089, "learning_rate": 1.5675999701039734e-06, "loss": 0.5731, "mean_token_accuracy": 0.8395378664135933, "step": 1210 }, { "epoch": 1.5399239543726235, "grad_norm": 0.4817695083655761, "learning_rate": 1.5275923968603967e-06, "loss": 0.5668, "mean_token_accuracy": 0.840859878063202, "step": 1215 }, { "epoch": 1.5462610899873257, "grad_norm": 0.4958218170076731, "learning_rate": 1.4880097261645765e-06, "loss": 0.575, "mean_token_accuracy": 0.8392793446779251, "step": 1220 }, { "epoch": 1.5525982256020279, "grad_norm": 0.5150469794513786, "learning_rate": 1.4488568015658738e-06, "loss": 0.5702, "mean_token_accuracy": 0.8403733685612679, "step": 1225 }, { "epoch": 1.55893536121673, "grad_norm": 0.5415616286404993, "learning_rate": 1.4101384140275947e-06, "loss": 0.5724, "mean_token_accuracy": 0.8399771124124527, "step": 1230 }, { "epoch": 1.5652724968314322, "grad_norm": 0.5125659970580118, "learning_rate": 1.3718593013407455e-06, "loss": 0.565, "mean_token_accuracy": 0.8413113921880722, "step": 1235 }, { "epoch": 1.5716096324461344, "grad_norm": 0.5172557001838594, "learning_rate": 1.3340241475442889e-06, "loss": 0.5666, "mean_token_accuracy": 0.8413270160555839, "step": 1240 }, { "epoch": 1.5779467680608366, "grad_norm": 0.5218390924731011, "learning_rate": 1.296637582351979e-06, "loss": 0.5811, "mean_token_accuracy": 0.8378918588161468, "step": 1245 }, { "epoch": 1.5842839036755385, "grad_norm": 0.49941956793616216, "learning_rate": 1.2597041805858469e-06, "loss": 0.5597, "mean_token_accuracy": 0.8421694174408912, "step": 1250 }, { "epoch": 1.590621039290241, "grad_norm": 0.4810003693281146, "learning_rate": 1.2232284616163986e-06, "loss": 0.5646, "mean_token_accuracy": 0.8418364375829697, "step": 1255 }, { "epoch": 1.5969581749049429, "grad_norm": 0.49642278969512443, "learning_rate": 1.1872148888096024e-06, "loss": 0.5686, "mean_token_accuracy": 0.840269310772419, "step": 1260 }, { "epoch": 1.6032953105196452, "grad_norm": 0.5258808050772633, "learning_rate": 1.1516678689807249e-06, "loss": 0.5665, "mean_token_accuracy": 0.8409392833709717, "step": 1265 }, { "epoch": 1.6096324461343472, "grad_norm": 0.4807160453689938, "learning_rate": 1.1165917518550913e-06, "loss": 0.5671, "mean_token_accuracy": 0.8411058440804482, "step": 1270 }, { "epoch": 1.6159695817490496, "grad_norm": 0.48965855513910594, "learning_rate": 1.0819908295358284e-06, "loss": 0.5588, "mean_token_accuracy": 0.8429983571171761, "step": 1275 }, { "epoch": 1.6223067173637515, "grad_norm": 0.5202990154276527, "learning_rate": 1.0478693359786612e-06, "loss": 0.5716, "mean_token_accuracy": 0.8400727063417435, "step": 1280 }, { "epoch": 1.6286438529784537, "grad_norm": 0.5171890350253132, "learning_rate": 1.0142314464738195e-06, "loss": 0.5517, "mean_token_accuracy": 0.8443869799375534, "step": 1285 }, { "epoch": 1.6349809885931559, "grad_norm": 0.48132181865431867, "learning_rate": 9.810812771351335e-07, "loss": 0.5784, "mean_token_accuracy": 0.8387523666024208, "step": 1290 }, { "epoch": 1.641318124207858, "grad_norm": 0.48031587809861415, "learning_rate": 9.484228843963577e-07, "loss": 0.5609, "mean_token_accuracy": 0.8421882972121238, "step": 1295 }, { "epoch": 1.6476552598225602, "grad_norm": 0.48862410273482815, "learning_rate": 9.16260264514805e-07, "loss": 0.5739, "mean_token_accuracy": 0.8393760696053505, "step": 1300 }, { "epoch": 1.6539923954372624, "grad_norm": 0.4974345984726092, "learning_rate": 8.845973530823443e-07, "loss": 0.5623, "mean_token_accuracy": 0.842260554432869, "step": 1305 }, { "epoch": 1.6603295310519646, "grad_norm": 0.4969870292569671, "learning_rate": 8.534380245438212e-07, "loss": 0.5806, "mean_token_accuracy": 0.8379565149545669, "step": 1310 }, { "epoch": 1.6666666666666665, "grad_norm": 0.51170305488906, "learning_rate": 8.22786091722958e-07, "loss": 0.5744, "mean_token_accuracy": 0.8394851118326188, "step": 1315 }, { "epoch": 1.673003802281369, "grad_norm": 0.4882536601279716, "learning_rate": 7.926453053557948e-07, "loss": 0.5694, "mean_token_accuracy": 0.8412208631634712, "step": 1320 }, { "epoch": 1.6793409378960709, "grad_norm": 0.5201633381345815, "learning_rate": 7.630193536317354e-07, "loss": 0.5779, "mean_token_accuracy": 0.8387572214007377, "step": 1325 }, { "epoch": 1.6856780735107733, "grad_norm": 0.4872309884092355, "learning_rate": 7.339118617422325e-07, "loss": 0.5721, "mean_token_accuracy": 0.840134784579277, "step": 1330 }, { "epoch": 1.6920152091254752, "grad_norm": 0.4742262519043048, "learning_rate": 7.05326391437195e-07, "loss": 0.567, "mean_token_accuracy": 0.8408115699887275, "step": 1335 }, { "epoch": 1.6983523447401776, "grad_norm": 0.48084605496078786, "learning_rate": 6.772664405891505e-07, "loss": 0.5739, "mean_token_accuracy": 0.8401078969240189, "step": 1340 }, { "epoch": 1.7046894803548795, "grad_norm": 0.4836055364313366, "learning_rate": 6.49735442765228e-07, "loss": 0.5771, "mean_token_accuracy": 0.8388657510280609, "step": 1345 }, { "epoch": 1.7110266159695817, "grad_norm": 0.4955193703741457, "learning_rate": 6.227367668070084e-07, "loss": 0.5641, "mean_token_accuracy": 0.8420116931200028, "step": 1350 }, { "epoch": 1.717363751584284, "grad_norm": 0.47888043666477453, "learning_rate": 5.962737164182942e-07, "loss": 0.5695, "mean_token_accuracy": 0.8411467924714089, "step": 1355 }, { "epoch": 1.723700887198986, "grad_norm": 0.48358100558875267, "learning_rate": 5.703495297608486e-07, "loss": 0.5672, "mean_token_accuracy": 0.8408854246139527, "step": 1360 }, { "epoch": 1.7300380228136882, "grad_norm": 0.48450381732440073, "learning_rate": 5.449673790581611e-07, "loss": 0.5756, "mean_token_accuracy": 0.8394671693444252, "step": 1365 }, { "epoch": 1.7363751584283904, "grad_norm": 0.524224789009983, "learning_rate": 5.201303702072724e-07, "loss": 0.564, "mean_token_accuracy": 0.8414558693766594, "step": 1370 }, { "epoch": 1.7427122940430926, "grad_norm": 0.47448699280100953, "learning_rate": 4.958415423987229e-07, "loss": 0.5576, "mean_token_accuracy": 0.8432327851653099, "step": 1375 }, { "epoch": 1.7490494296577945, "grad_norm": 0.4999589058798834, "learning_rate": 4.721038677446599e-07, "loss": 0.5543, "mean_token_accuracy": 0.8434969082474708, "step": 1380 }, { "epoch": 1.755386565272497, "grad_norm": 0.49519130319734356, "learning_rate": 4.4892025091515465e-07, "loss": 0.5744, "mean_token_accuracy": 0.8392727747559547, "step": 1385 }, { "epoch": 1.7617237008871989, "grad_norm": 0.47996153862103574, "learning_rate": 4.2629352878276964e-07, "loss": 0.5757, "mean_token_accuracy": 0.8395681723952293, "step": 1390 }, { "epoch": 1.7680608365019013, "grad_norm": 0.4743677174789034, "learning_rate": 4.04226470075425e-07, "loss": 0.5793, "mean_token_accuracy": 0.8383775666356087, "step": 1395 }, { "epoch": 1.7743979721166032, "grad_norm": 0.47358657093352546, "learning_rate": 3.8272177503760277e-07, "loss": 0.5666, "mean_token_accuracy": 0.8409555062651635, "step": 1400 }, { "epoch": 1.7807351077313056, "grad_norm": 0.47898043422535136, "learning_rate": 3.6178207509992623e-07, "loss": 0.5588, "mean_token_accuracy": 0.8429359510540962, "step": 1405 }, { "epoch": 1.7870722433460076, "grad_norm": 0.48612638069980213, "learning_rate": 3.4140993255717123e-07, "loss": 0.5687, "mean_token_accuracy": 0.840995529294014, "step": 1410 }, { "epoch": 1.7934093789607097, "grad_norm": 0.47802067614271637, "learning_rate": 3.216078402547218e-07, "loss": 0.5651, "mean_token_accuracy": 0.8413813829421997, "step": 1415 }, { "epoch": 1.799746514575412, "grad_norm": 0.45575767680162316, "learning_rate": 3.0237822128353744e-07, "loss": 0.5551, "mean_token_accuracy": 0.8439073666930199, "step": 1420 }, { "epoch": 1.806083650190114, "grad_norm": 0.5008888425261698, "learning_rate": 2.8372342868364934e-07, "loss": 0.5763, "mean_token_accuracy": 0.8394736155867577, "step": 1425 }, { "epoch": 1.8124207858048162, "grad_norm": 0.47883052147679717, "learning_rate": 2.656457451562283e-07, "loss": 0.5847, "mean_token_accuracy": 0.8371838569641114, "step": 1430 }, { "epoch": 1.8187579214195184, "grad_norm": 0.48136837053701437, "learning_rate": 2.4814738278426287e-07, "loss": 0.5713, "mean_token_accuracy": 0.8400285989046097, "step": 1435 }, { "epoch": 1.8250950570342206, "grad_norm": 0.47630227923243995, "learning_rate": 2.3123048276187722e-07, "loss": 0.5663, "mean_token_accuracy": 0.8415055811405182, "step": 1440 }, { "epoch": 1.8314321926489225, "grad_norm": 0.48067639897927306, "learning_rate": 2.1489711513232038e-07, "loss": 0.5702, "mean_token_accuracy": 0.8404717803001404, "step": 1445 }, { "epoch": 1.837769328263625, "grad_norm": 0.48817733468841595, "learning_rate": 1.991492785346677e-07, "loss": 0.5659, "mean_token_accuracy": 0.8410487651824952, "step": 1450 }, { "epoch": 1.8441064638783269, "grad_norm": 0.4753854139627654, "learning_rate": 1.8398889995925428e-07, "loss": 0.5612, "mean_token_accuracy": 0.842425537109375, "step": 1455 }, { "epoch": 1.8504435994930293, "grad_norm": 0.4979097318389579, "learning_rate": 1.694178345118791e-07, "loss": 0.5554, "mean_token_accuracy": 0.843775661289692, "step": 1460 }, { "epoch": 1.8567807351077312, "grad_norm": 0.4829356927499738, "learning_rate": 1.5543786518680436e-07, "loss": 0.556, "mean_token_accuracy": 0.8434767201542854, "step": 1465 }, { "epoch": 1.8631178707224336, "grad_norm": 0.4651233227253299, "learning_rate": 1.4205070264857901e-07, "loss": 0.5704, "mean_token_accuracy": 0.8402711316943169, "step": 1470 }, { "epoch": 1.8694550063371356, "grad_norm": 0.47253676852018517, "learning_rate": 1.292579850227099e-07, "loss": 0.5777, "mean_token_accuracy": 0.8392020970582962, "step": 1475 }, { "epoch": 1.8757921419518377, "grad_norm": 0.4800740772721781, "learning_rate": 1.170612776952168e-07, "loss": 0.566, "mean_token_accuracy": 0.8414452761411667, "step": 1480 }, { "epoch": 1.88212927756654, "grad_norm": 0.46528025174750537, "learning_rate": 1.0546207312107814e-07, "loss": 0.5636, "mean_token_accuracy": 0.8416185140609741, "step": 1485 }, { "epoch": 1.888466413181242, "grad_norm": 0.47693097112640276, "learning_rate": 9.44617906416101e-08, "loss": 0.5727, "mean_token_accuracy": 0.8405211389064788, "step": 1490 }, { "epoch": 1.8948035487959443, "grad_norm": 0.4787485103517413, "learning_rate": 8.406177631078594e-08, "loss": 0.5708, "mean_token_accuracy": 0.8403903424739838, "step": 1495 }, { "epoch": 1.9011406844106464, "grad_norm": 0.45967120152380847, "learning_rate": 7.426330273052618e-08, "loss": 0.5496, "mean_token_accuracy": 0.8449963420629502, "step": 1500 }, { "epoch": 1.9074778200253486, "grad_norm": 0.46451147059266606, "learning_rate": 6.506756889497756e-08, "loss": 0.5608, "mean_token_accuracy": 0.8425014033913613, "step": 1505 }, { "epoch": 1.9138149556400506, "grad_norm": 0.5057760468937542, "learning_rate": 5.647570004379432e-08, "loss": 0.5602, "mean_token_accuracy": 0.8427406966686248, "step": 1510 }, { "epoch": 1.920152091254753, "grad_norm": 0.48061481353459495, "learning_rate": 4.848874752445221e-08, "loss": 0.5675, "mean_token_accuracy": 0.8411912024021149, "step": 1515 }, { "epoch": 1.926489226869455, "grad_norm": 0.4689935228428535, "learning_rate": 4.110768866359638e-08, "loss": 0.5631, "mean_token_accuracy": 0.8418816044926644, "step": 1520 }, { "epoch": 1.9328263624841573, "grad_norm": 0.4698265767310371, "learning_rate": 3.43334266474521e-08, "loss": 0.5635, "mean_token_accuracy": 0.8423062637448311, "step": 1525 }, { "epoch": 1.9391634980988592, "grad_norm": 0.49190745957035076, "learning_rate": 2.8166790411304766e-08, "loss": 0.5644, "mean_token_accuracy": 0.8418506249785424, "step": 1530 }, { "epoch": 1.9455006337135616, "grad_norm": 0.4676519055114557, "learning_rate": 2.260853453806944e-08, "loss": 0.5691, "mean_token_accuracy": 0.8408907786011696, "step": 1535 }, { "epoch": 1.9518377693282636, "grad_norm": 0.4857147511585138, "learning_rate": 1.7659339165952417e-08, "loss": 0.5699, "mean_token_accuracy": 0.8406305849552155, "step": 1540 }, { "epoch": 1.9581749049429658, "grad_norm": 0.48303973039403075, "learning_rate": 1.3319809905228409e-08, "loss": 0.5765, "mean_token_accuracy": 0.8395203098654747, "step": 1545 }, { "epoch": 1.964512040557668, "grad_norm": 0.47745966065458134, "learning_rate": 9.590477764135353e-09, "loss": 0.5641, "mean_token_accuracy": 0.8417988792061806, "step": 1550 }, { "epoch": 1.97084917617237, "grad_norm": 0.4675012014451023, "learning_rate": 6.47179908389417e-09, "loss": 0.5699, "mean_token_accuracy": 0.8404615536332131, "step": 1555 }, { "epoch": 1.9771863117870723, "grad_norm": 0.4956658011789385, "learning_rate": 3.964155482871213e-09, "loss": 0.5592, "mean_token_accuracy": 0.842540180683136, "step": 1560 }, { "epoch": 1.9835234474017744, "grad_norm": 0.4689038627708401, "learning_rate": 2.0678538098806158e-09, "loss": 0.5745, "mean_token_accuracy": 0.8394525855779648, "step": 1565 }, { "epoch": 1.9898605830164766, "grad_norm": 0.4660374899806601, "learning_rate": 7.83126106637111e-10, "loss": 0.5643, "mean_token_accuracy": 0.8416339352726936, "step": 1570 }, { "epoch": 1.9961977186311786, "grad_norm": 0.4729664926744204, "learning_rate": 1.1012957935985224e-10, "loss": 0.5636, "mean_token_accuracy": 0.8414568796753883, "step": 1575 }, { "epoch": 2.0, "mean_token_accuracy": 0.8406301041444143, "step": 1578, "total_flos": 827207983300608.0, "train_loss": 0.6521280055868006, "train_runtime": 235151.6683, "train_samples_per_second": 1.718, "train_steps_per_second": 0.007 } ], "logging_steps": 5, "max_steps": 1578, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 827207983300608.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }