Lansechen's picture
Model save
3cefa9e verified
raw
history blame
137 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9991386735572783,
"eval_steps": 100,
"global_step": 3045,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004921865386981666,
"grad_norm": 10.908417701721191,
"learning_rate": 3.278688524590164e-07,
"loss": 2.6851,
"mean_token_accuracy": 0.490550322830677,
"step": 5
},
{
"epoch": 0.009843730773963333,
"grad_norm": 10.821477890014648,
"learning_rate": 6.557377049180328e-07,
"loss": 2.6916,
"mean_token_accuracy": 0.4892874449491501,
"step": 10
},
{
"epoch": 0.014765596160944998,
"grad_norm": 9.100831031799316,
"learning_rate": 9.836065573770493e-07,
"loss": 2.6563,
"mean_token_accuracy": 0.49268135130405427,
"step": 15
},
{
"epoch": 0.019687461547926666,
"grad_norm": 6.744043827056885,
"learning_rate": 1.3114754098360657e-06,
"loss": 2.4838,
"mean_token_accuracy": 0.503991749882698,
"step": 20
},
{
"epoch": 0.02460932693490833,
"grad_norm": 4.111428737640381,
"learning_rate": 1.6393442622950819e-06,
"loss": 2.3481,
"mean_token_accuracy": 0.5121142826974392,
"step": 25
},
{
"epoch": 0.029531192321889995,
"grad_norm": 3.504826068878174,
"learning_rate": 1.9672131147540985e-06,
"loss": 2.1834,
"mean_token_accuracy": 0.525759468972683,
"step": 30
},
{
"epoch": 0.034453057708871665,
"grad_norm": 2.371668577194214,
"learning_rate": 2.295081967213115e-06,
"loss": 1.9992,
"mean_token_accuracy": 0.5471328645944595,
"step": 35
},
{
"epoch": 0.03937492309585333,
"grad_norm": 1.910736083984375,
"learning_rate": 2.6229508196721314e-06,
"loss": 1.8619,
"mean_token_accuracy": 0.5657269343733787,
"step": 40
},
{
"epoch": 0.044296788482835,
"grad_norm": 1.6694586277008057,
"learning_rate": 2.9508196721311478e-06,
"loss": 1.7324,
"mean_token_accuracy": 0.582801228761673,
"step": 45
},
{
"epoch": 0.04921865386981666,
"grad_norm": 1.3371120691299438,
"learning_rate": 3.2786885245901638e-06,
"loss": 1.5922,
"mean_token_accuracy": 0.6066210582852364,
"step": 50
},
{
"epoch": 0.054140519256798324,
"grad_norm": 1.153715968132019,
"learning_rate": 3.6065573770491806e-06,
"loss": 1.4607,
"mean_token_accuracy": 0.629358272254467,
"step": 55
},
{
"epoch": 0.05906238464377999,
"grad_norm": 1.011682391166687,
"learning_rate": 3.934426229508197e-06,
"loss": 1.3312,
"mean_token_accuracy": 0.6534152328968048,
"step": 60
},
{
"epoch": 0.06398425003076166,
"grad_norm": 0.8580278158187866,
"learning_rate": 4.2622950819672135e-06,
"loss": 1.2163,
"mean_token_accuracy": 0.676006656885147,
"step": 65
},
{
"epoch": 0.06890611541774333,
"grad_norm": 0.7737818360328674,
"learning_rate": 4.59016393442623e-06,
"loss": 1.1256,
"mean_token_accuracy": 0.695121419429779,
"step": 70
},
{
"epoch": 0.073827980804725,
"grad_norm": 0.6026164889335632,
"learning_rate": 4.918032786885246e-06,
"loss": 1.0456,
"mean_token_accuracy": 0.7120692700147628,
"step": 75
},
{
"epoch": 0.07874984619170666,
"grad_norm": 20.797266006469727,
"learning_rate": 5.245901639344263e-06,
"loss": 0.9884,
"mean_token_accuracy": 0.7246918171644211,
"step": 80
},
{
"epoch": 0.08367171157868833,
"grad_norm": 24.53761100769043,
"learning_rate": 5.573770491803278e-06,
"loss": 0.9471,
"mean_token_accuracy": 0.7344574183225632,
"step": 85
},
{
"epoch": 0.08859357696567,
"grad_norm": 7.69836950302124,
"learning_rate": 5.9016393442622956e-06,
"loss": 0.9291,
"mean_token_accuracy": 0.7384938269853591,
"step": 90
},
{
"epoch": 0.09351544235265165,
"grad_norm": 0.42971891164779663,
"learning_rate": 6.229508196721312e-06,
"loss": 0.9071,
"mean_token_accuracy": 0.743149445950985,
"step": 95
},
{
"epoch": 0.09843730773963331,
"grad_norm": 0.4011496901512146,
"learning_rate": 6.5573770491803276e-06,
"loss": 0.8839,
"mean_token_accuracy": 0.7489838138222694,
"step": 100
},
{
"epoch": 0.10335917312661498,
"grad_norm": 0.4182426631450653,
"learning_rate": 6.885245901639345e-06,
"loss": 0.864,
"mean_token_accuracy": 0.7533508613705635,
"step": 105
},
{
"epoch": 0.10828103851359665,
"grad_norm": 0.4418739080429077,
"learning_rate": 7.213114754098361e-06,
"loss": 0.8461,
"mean_token_accuracy": 0.7571793958544731,
"step": 110
},
{
"epoch": 0.11320290390057831,
"grad_norm": 4.76384973526001,
"learning_rate": 7.540983606557377e-06,
"loss": 0.8478,
"mean_token_accuracy": 0.7560782924294471,
"step": 115
},
{
"epoch": 0.11812476928755998,
"grad_norm": 0.426782488822937,
"learning_rate": 7.868852459016394e-06,
"loss": 0.8262,
"mean_token_accuracy": 0.7621309965848923,
"step": 120
},
{
"epoch": 0.12304663467454165,
"grad_norm": 3.5404343605041504,
"learning_rate": 8.19672131147541e-06,
"loss": 0.8239,
"mean_token_accuracy": 0.7624999329447746,
"step": 125
},
{
"epoch": 0.12796850006152333,
"grad_norm": 0.6128109097480774,
"learning_rate": 8.524590163934427e-06,
"loss": 0.8125,
"mean_token_accuracy": 0.7650709196925163,
"step": 130
},
{
"epoch": 0.132890365448505,
"grad_norm": 0.4441392719745636,
"learning_rate": 8.852459016393443e-06,
"loss": 0.8178,
"mean_token_accuracy": 0.7635303542017937,
"step": 135
},
{
"epoch": 0.13781223083548666,
"grad_norm": 0.6959536075592041,
"learning_rate": 9.18032786885246e-06,
"loss": 0.797,
"mean_token_accuracy": 0.7682553365826607,
"step": 140
},
{
"epoch": 0.14273409622246833,
"grad_norm": 0.4633159935474396,
"learning_rate": 9.508196721311476e-06,
"loss": 0.7972,
"mean_token_accuracy": 0.7677757993340493,
"step": 145
},
{
"epoch": 0.14765596160945,
"grad_norm": 0.3808494806289673,
"learning_rate": 9.836065573770493e-06,
"loss": 0.7956,
"mean_token_accuracy": 0.7682796508073807,
"step": 150
},
{
"epoch": 0.15257782699643166,
"grad_norm": 1.2230223417282104,
"learning_rate": 1.0163934426229509e-05,
"loss": 0.7714,
"mean_token_accuracy": 0.7741705477237701,
"step": 155
},
{
"epoch": 0.15749969238341333,
"grad_norm": 1.2708261013031006,
"learning_rate": 1.0491803278688525e-05,
"loss": 0.7671,
"mean_token_accuracy": 0.7750522747635842,
"step": 160
},
{
"epoch": 0.162421557770395,
"grad_norm": 0.4153311252593994,
"learning_rate": 1.0819672131147544e-05,
"loss": 0.762,
"mean_token_accuracy": 0.776003035902977,
"step": 165
},
{
"epoch": 0.16734342315737666,
"grad_norm": 0.48690149188041687,
"learning_rate": 1.1147540983606557e-05,
"loss": 0.7611,
"mean_token_accuracy": 0.776053948700428,
"step": 170
},
{
"epoch": 0.17226528854435832,
"grad_norm": 0.3839600682258606,
"learning_rate": 1.1475409836065575e-05,
"loss": 0.7518,
"mean_token_accuracy": 0.7784286484122276,
"step": 175
},
{
"epoch": 0.17718715393134,
"grad_norm": 0.33650702238082886,
"learning_rate": 1.1803278688524591e-05,
"loss": 0.7425,
"mean_token_accuracy": 0.7807790979743003,
"step": 180
},
{
"epoch": 0.18210901931832166,
"grad_norm": 0.34878674149513245,
"learning_rate": 1.2131147540983608e-05,
"loss": 0.7469,
"mean_token_accuracy": 0.779270826280117,
"step": 185
},
{
"epoch": 0.1870308847053033,
"grad_norm": 0.4435058534145355,
"learning_rate": 1.2459016393442624e-05,
"loss": 0.7414,
"mean_token_accuracy": 0.7804962411522866,
"step": 190
},
{
"epoch": 0.19195275009228496,
"grad_norm": 0.34793269634246826,
"learning_rate": 1.2786885245901642e-05,
"loss": 0.7368,
"mean_token_accuracy": 0.7817707493901253,
"step": 195
},
{
"epoch": 0.19687461547926663,
"grad_norm": 0.32821062207221985,
"learning_rate": 1.3114754098360655e-05,
"loss": 0.7309,
"mean_token_accuracy": 0.7830819576978684,
"step": 200
},
{
"epoch": 0.2017964808662483,
"grad_norm": 0.3908160626888275,
"learning_rate": 1.3442622950819673e-05,
"loss": 0.7349,
"mean_token_accuracy": 0.7820746794342994,
"step": 205
},
{
"epoch": 0.20671834625322996,
"grad_norm": 1.239039659500122,
"learning_rate": 1.377049180327869e-05,
"loss": 0.7315,
"mean_token_accuracy": 0.7830250725150109,
"step": 210
},
{
"epoch": 0.21164021164021163,
"grad_norm": 0.437558650970459,
"learning_rate": 1.4098360655737706e-05,
"loss": 0.7213,
"mean_token_accuracy": 0.785545514523983,
"step": 215
},
{
"epoch": 0.2165620770271933,
"grad_norm": 0.3581276535987854,
"learning_rate": 1.4426229508196722e-05,
"loss": 0.7156,
"mean_token_accuracy": 0.7868386089801789,
"step": 220
},
{
"epoch": 0.22148394241417496,
"grad_norm": 0.393839031457901,
"learning_rate": 1.4754098360655739e-05,
"loss": 0.7108,
"mean_token_accuracy": 0.7875275865197182,
"step": 225
},
{
"epoch": 0.22640580780115663,
"grad_norm": 0.4203226566314697,
"learning_rate": 1.5081967213114754e-05,
"loss": 0.7115,
"mean_token_accuracy": 0.7875282734632492,
"step": 230
},
{
"epoch": 0.2313276731881383,
"grad_norm": 0.4379311501979828,
"learning_rate": 1.5409836065573772e-05,
"loss": 0.7176,
"mean_token_accuracy": 0.7859495177865028,
"step": 235
},
{
"epoch": 0.23624953857511996,
"grad_norm": 0.5987364053726196,
"learning_rate": 1.5737704918032788e-05,
"loss": 0.7047,
"mean_token_accuracy": 0.7892461016774177,
"step": 240
},
{
"epoch": 0.24117140396210163,
"grad_norm": 0.39721059799194336,
"learning_rate": 1.6065573770491805e-05,
"loss": 0.7082,
"mean_token_accuracy": 0.7879156336188317,
"step": 245
},
{
"epoch": 0.2460932693490833,
"grad_norm": 0.35150638222694397,
"learning_rate": 1.639344262295082e-05,
"loss": 0.7015,
"mean_token_accuracy": 0.7899731829762459,
"step": 250
},
{
"epoch": 0.25101513473606496,
"grad_norm": 0.37812677025794983,
"learning_rate": 1.6721311475409837e-05,
"loss": 0.7112,
"mean_token_accuracy": 0.7869908154010773,
"step": 255
},
{
"epoch": 0.25593700012304665,
"grad_norm": 0.37921008467674255,
"learning_rate": 1.7049180327868854e-05,
"loss": 0.695,
"mean_token_accuracy": 0.7912393018603325,
"step": 260
},
{
"epoch": 0.2608588655100283,
"grad_norm": 0.3776193857192993,
"learning_rate": 1.737704918032787e-05,
"loss": 0.6975,
"mean_token_accuracy": 0.7903847828507423,
"step": 265
},
{
"epoch": 0.26578073089701,
"grad_norm": 0.34160885214805603,
"learning_rate": 1.7704918032786887e-05,
"loss": 0.7005,
"mean_token_accuracy": 0.7901133581995964,
"step": 270
},
{
"epoch": 0.2707025962839916,
"grad_norm": 0.3151760399341583,
"learning_rate": 1.8032786885245903e-05,
"loss": 0.6838,
"mean_token_accuracy": 0.7940751999616623,
"step": 275
},
{
"epoch": 0.2756244616709733,
"grad_norm": 0.3251655101776123,
"learning_rate": 1.836065573770492e-05,
"loss": 0.683,
"mean_token_accuracy": 0.7942519947886467,
"step": 280
},
{
"epoch": 0.28054632705795496,
"grad_norm": 0.392980694770813,
"learning_rate": 1.8688524590163936e-05,
"loss": 0.6779,
"mean_token_accuracy": 0.7953907087445259,
"step": 285
},
{
"epoch": 0.28546819244493665,
"grad_norm": 0.42777085304260254,
"learning_rate": 1.9016393442622952e-05,
"loss": 0.696,
"mean_token_accuracy": 0.7913835749030114,
"step": 290
},
{
"epoch": 0.2903900578319183,
"grad_norm": 0.38064613938331604,
"learning_rate": 1.934426229508197e-05,
"loss": 0.6777,
"mean_token_accuracy": 0.79527537971735,
"step": 295
},
{
"epoch": 0.2953119232189,
"grad_norm": 0.35906219482421875,
"learning_rate": 1.9672131147540985e-05,
"loss": 0.6772,
"mean_token_accuracy": 0.7954441845417023,
"step": 300
},
{
"epoch": 0.3002337886058816,
"grad_norm": 0.4336443543434143,
"learning_rate": 2e-05,
"loss": 0.6672,
"mean_token_accuracy": 0.7982369065284729,
"step": 305
},
{
"epoch": 0.3051556539928633,
"grad_norm": 0.35013464093208313,
"learning_rate": 1.9999835673561284e-05,
"loss": 0.6823,
"mean_token_accuracy": 0.7940784975886345,
"step": 310
},
{
"epoch": 0.31007751937984496,
"grad_norm": 0.4209573566913605,
"learning_rate": 1.9999342699645774e-05,
"loss": 0.6705,
"mean_token_accuracy": 0.7970875754952431,
"step": 315
},
{
"epoch": 0.31499938476682665,
"grad_norm": 0.3402932584285736,
"learning_rate": 1.9998521094455198e-05,
"loss": 0.6733,
"mean_token_accuracy": 0.7962517961859703,
"step": 320
},
{
"epoch": 0.3199212501538083,
"grad_norm": 0.3613898456096649,
"learning_rate": 1.9997370884991842e-05,
"loss": 0.6659,
"mean_token_accuracy": 0.7986094921827316,
"step": 325
},
{
"epoch": 0.32484311554079,
"grad_norm": 0.8141839504241943,
"learning_rate": 1.9995892109057675e-05,
"loss": 0.6682,
"mean_token_accuracy": 0.7979325890541077,
"step": 330
},
{
"epoch": 0.3297649809277716,
"grad_norm": 0.32822492718696594,
"learning_rate": 1.99940848152531e-05,
"loss": 0.6592,
"mean_token_accuracy": 0.799762362241745,
"step": 335
},
{
"epoch": 0.3346868463147533,
"grad_norm": 0.32193639874458313,
"learning_rate": 1.9991949062975336e-05,
"loss": 0.6669,
"mean_token_accuracy": 0.7977916583418846,
"step": 340
},
{
"epoch": 0.33960871170173496,
"grad_norm": 0.6516172885894775,
"learning_rate": 1.9989484922416503e-05,
"loss": 0.6636,
"mean_token_accuracy": 0.7989253982901573,
"step": 345
},
{
"epoch": 0.34453057708871665,
"grad_norm": 0.6252678036689758,
"learning_rate": 1.9986692474561292e-05,
"loss": 0.6549,
"mean_token_accuracy": 0.8010424450039864,
"step": 350
},
{
"epoch": 0.3494524424756983,
"grad_norm": 0.39426907896995544,
"learning_rate": 1.9983571811184297e-05,
"loss": 0.6583,
"mean_token_accuracy": 0.8001298069953918,
"step": 355
},
{
"epoch": 0.35437430786268,
"grad_norm": 0.4398311972618103,
"learning_rate": 1.9980123034847025e-05,
"loss": 0.6569,
"mean_token_accuracy": 0.8002386093139648,
"step": 360
},
{
"epoch": 0.3592961732496616,
"grad_norm": 0.36181896924972534,
"learning_rate": 1.9976346258894502e-05,
"loss": 0.6572,
"mean_token_accuracy": 0.7999640181660652,
"step": 365
},
{
"epoch": 0.3642180386366433,
"grad_norm": 0.33937492966651917,
"learning_rate": 1.9972241607451552e-05,
"loss": 0.6534,
"mean_token_accuracy": 0.8008638471364975,
"step": 370
},
{
"epoch": 0.36913990402362495,
"grad_norm": 0.3220241665840149,
"learning_rate": 1.996780921541873e-05,
"loss": 0.6491,
"mean_token_accuracy": 0.8024497851729393,
"step": 375
},
{
"epoch": 0.3740617694106066,
"grad_norm": 0.3588990867137909,
"learning_rate": 1.9963049228467875e-05,
"loss": 0.6519,
"mean_token_accuracy": 0.8013440445065498,
"step": 380
},
{
"epoch": 0.3789836347975883,
"grad_norm": 0.3850741982460022,
"learning_rate": 1.9957961803037325e-05,
"loss": 0.6539,
"mean_token_accuracy": 0.8007026329636574,
"step": 385
},
{
"epoch": 0.3839055001845699,
"grad_norm": 0.39418673515319824,
"learning_rate": 1.9952547106326787e-05,
"loss": 0.6511,
"mean_token_accuracy": 0.8013290241360664,
"step": 390
},
{
"epoch": 0.3888273655715516,
"grad_norm": 0.33889254927635193,
"learning_rate": 1.9946805316291817e-05,
"loss": 0.6523,
"mean_token_accuracy": 0.8005807921290398,
"step": 395
},
{
"epoch": 0.39374923095853326,
"grad_norm": 0.7381798624992371,
"learning_rate": 1.9940736621638e-05,
"loss": 0.649,
"mean_token_accuracy": 0.8016207367181778,
"step": 400
},
{
"epoch": 0.39867109634551495,
"grad_norm": 0.3772973120212555,
"learning_rate": 1.993434122181474e-05,
"loss": 0.6458,
"mean_token_accuracy": 0.802768674492836,
"step": 405
},
{
"epoch": 0.4035929617324966,
"grad_norm": 0.33333730697631836,
"learning_rate": 1.992761932700868e-05,
"loss": 0.6444,
"mean_token_accuracy": 0.8025879472494125,
"step": 410
},
{
"epoch": 0.4085148271194783,
"grad_norm": 0.3165677785873413,
"learning_rate": 1.9920571158136837e-05,
"loss": 0.639,
"mean_token_accuracy": 0.8042329683899879,
"step": 415
},
{
"epoch": 0.4134366925064599,
"grad_norm": 0.3313787579536438,
"learning_rate": 1.9913196946839304e-05,
"loss": 0.6422,
"mean_token_accuracy": 0.803669148683548,
"step": 420
},
{
"epoch": 0.4183585578934416,
"grad_norm": 0.2832159101963043,
"learning_rate": 1.990549693547166e-05,
"loss": 0.6378,
"mean_token_accuracy": 0.8049987867474556,
"step": 425
},
{
"epoch": 0.42328042328042326,
"grad_norm": 0.3278089463710785,
"learning_rate": 1.9897471377096992e-05,
"loss": 0.638,
"mean_token_accuracy": 0.8043939173221588,
"step": 430
},
{
"epoch": 0.42820228866740495,
"grad_norm": 0.33513346314430237,
"learning_rate": 1.9889120535477584e-05,
"loss": 0.6366,
"mean_token_accuracy": 0.80514996945858,
"step": 435
},
{
"epoch": 0.4331241540543866,
"grad_norm": 0.36697131395339966,
"learning_rate": 1.9880444685066252e-05,
"loss": 0.6322,
"mean_token_accuracy": 0.8064638406038285,
"step": 440
},
{
"epoch": 0.4380460194413683,
"grad_norm": 0.34239935874938965,
"learning_rate": 1.987144411099731e-05,
"loss": 0.6328,
"mean_token_accuracy": 0.8058159291744232,
"step": 445
},
{
"epoch": 0.4429678848283499,
"grad_norm": 0.29778754711151123,
"learning_rate": 1.9862119109077226e-05,
"loss": 0.6442,
"mean_token_accuracy": 0.8030599504709244,
"step": 450
},
{
"epoch": 0.4478897502153316,
"grad_norm": 0.31139907240867615,
"learning_rate": 1.985246998577486e-05,
"loss": 0.6507,
"mean_token_accuracy": 0.8007849171757698,
"step": 455
},
{
"epoch": 0.45281161560231326,
"grad_norm": 0.32070034742355347,
"learning_rate": 1.984249705821143e-05,
"loss": 0.6405,
"mean_token_accuracy": 0.8038340613245964,
"step": 460
},
{
"epoch": 0.45773348098929495,
"grad_norm": 0.3086022734642029,
"learning_rate": 1.9832200654150077e-05,
"loss": 0.6316,
"mean_token_accuracy": 0.8058078184723854,
"step": 465
},
{
"epoch": 0.4626553463762766,
"grad_norm": 0.30972251296043396,
"learning_rate": 1.9821581111985072e-05,
"loss": 0.6343,
"mean_token_accuracy": 0.8051379904150963,
"step": 470
},
{
"epoch": 0.4675772117632583,
"grad_norm": 0.2832852005958557,
"learning_rate": 1.981063878073073e-05,
"loss": 0.6324,
"mean_token_accuracy": 0.8058837354183197,
"step": 475
},
{
"epoch": 0.4724990771502399,
"grad_norm": 0.909318208694458,
"learning_rate": 1.979937402000991e-05,
"loss": 0.6319,
"mean_token_accuracy": 0.8056973502039909,
"step": 480
},
{
"epoch": 0.4774209425372216,
"grad_norm": 0.31788304448127747,
"learning_rate": 1.9787787200042224e-05,
"loss": 0.6354,
"mean_token_accuracy": 0.8051144614815712,
"step": 485
},
{
"epoch": 0.48234280792420325,
"grad_norm": 0.2922450602054596,
"learning_rate": 1.977587870163184e-05,
"loss": 0.6278,
"mean_token_accuracy": 0.8066384568810463,
"step": 490
},
{
"epoch": 0.48726467331118495,
"grad_norm": 0.287406325340271,
"learning_rate": 1.9763648916154982e-05,
"loss": 0.6271,
"mean_token_accuracy": 0.8069956362247467,
"step": 495
},
{
"epoch": 0.4921865386981666,
"grad_norm": 0.34040403366088867,
"learning_rate": 1.975109824554707e-05,
"loss": 0.6288,
"mean_token_accuracy": 0.806525257229805,
"step": 500
},
{
"epoch": 0.4971084040851483,
"grad_norm": 0.3302447199821472,
"learning_rate": 1.973822710228951e-05,
"loss": 0.6257,
"mean_token_accuracy": 0.8072399228811264,
"step": 505
},
{
"epoch": 0.5020302694721299,
"grad_norm": 0.288161963224411,
"learning_rate": 1.972503590939612e-05,
"loss": 0.6234,
"mean_token_accuracy": 0.8078823387622833,
"step": 510
},
{
"epoch": 0.5069521348591116,
"grad_norm": 0.3387835919857025,
"learning_rate": 1.971152510039926e-05,
"loss": 0.6269,
"mean_token_accuracy": 0.8067226454615593,
"step": 515
},
{
"epoch": 0.5118740002460933,
"grad_norm": 0.290519118309021,
"learning_rate": 1.9697695119335547e-05,
"loss": 0.6213,
"mean_token_accuracy": 0.8083379164338111,
"step": 520
},
{
"epoch": 0.5167958656330749,
"grad_norm": 0.3701138496398926,
"learning_rate": 1.9683546420731292e-05,
"loss": 0.6246,
"mean_token_accuracy": 0.8079604268074035,
"step": 525
},
{
"epoch": 0.5217177310200566,
"grad_norm": 0.39614954590797424,
"learning_rate": 1.9669079469587548e-05,
"loss": 0.6287,
"mean_token_accuracy": 0.8067878499627114,
"step": 530
},
{
"epoch": 0.5266395964070383,
"grad_norm": 0.32784542441368103,
"learning_rate": 1.965429474136482e-05,
"loss": 0.6156,
"mean_token_accuracy": 0.8098407059907913,
"step": 535
},
{
"epoch": 0.53156146179402,
"grad_norm": 0.30213144421577454,
"learning_rate": 1.963919272196746e-05,
"loss": 0.6207,
"mean_token_accuracy": 0.8086924180388451,
"step": 540
},
{
"epoch": 0.5364833271810016,
"grad_norm": 0.32220178842544556,
"learning_rate": 1.9623773907727682e-05,
"loss": 0.6157,
"mean_token_accuracy": 0.8098208606243134,
"step": 545
},
{
"epoch": 0.5414051925679833,
"grad_norm": 0.3250666856765747,
"learning_rate": 1.9608038805389253e-05,
"loss": 0.6195,
"mean_token_accuracy": 0.8085113659501075,
"step": 550
},
{
"epoch": 0.546327057954965,
"grad_norm": 0.36724722385406494,
"learning_rate": 1.9591987932090836e-05,
"loss": 0.6115,
"mean_token_accuracy": 0.8109661117196083,
"step": 555
},
{
"epoch": 0.5512489233419466,
"grad_norm": 0.30343472957611084,
"learning_rate": 1.9575621815349e-05,
"loss": 0.6204,
"mean_token_accuracy": 0.8083494484424592,
"step": 560
},
{
"epoch": 0.5561707887289282,
"grad_norm": 0.3323419988155365,
"learning_rate": 1.9558940993040885e-05,
"loss": 0.6232,
"mean_token_accuracy": 0.8077159106731415,
"step": 565
},
{
"epoch": 0.5610926541159099,
"grad_norm": 0.31035885214805603,
"learning_rate": 1.954194601338651e-05,
"loss": 0.6157,
"mean_token_accuracy": 0.8096732005476952,
"step": 570
},
{
"epoch": 0.5660145195028916,
"grad_norm": 0.2931119501590729,
"learning_rate": 1.952463743493078e-05,
"loss": 0.6199,
"mean_token_accuracy": 0.808499938249588,
"step": 575
},
{
"epoch": 0.5709363848898733,
"grad_norm": 0.27563023567199707,
"learning_rate": 1.9507015826525096e-05,
"loss": 0.6046,
"mean_token_accuracy": 0.8128907606005669,
"step": 580
},
{
"epoch": 0.5758582502768549,
"grad_norm": 0.28453299403190613,
"learning_rate": 1.9489081767308696e-05,
"loss": 0.6105,
"mean_token_accuracy": 0.8113355338573456,
"step": 585
},
{
"epoch": 0.5807801156638366,
"grad_norm": 0.37042465806007385,
"learning_rate": 1.9470835846689596e-05,
"loss": 0.6127,
"mean_token_accuracy": 0.8106034889817237,
"step": 590
},
{
"epoch": 0.5857019810508183,
"grad_norm": 0.2963549792766571,
"learning_rate": 1.9452278664325227e-05,
"loss": 0.6194,
"mean_token_accuracy": 0.8086869075894356,
"step": 595
},
{
"epoch": 0.5906238464378,
"grad_norm": 0.2905316948890686,
"learning_rate": 1.9433410830102724e-05,
"loss": 0.61,
"mean_token_accuracy": 0.811042046546936,
"step": 600
},
{
"epoch": 0.5955457118247816,
"grad_norm": 0.2674277424812317,
"learning_rate": 1.9414232964118893e-05,
"loss": 0.6119,
"mean_token_accuracy": 0.8104571312665939,
"step": 605
},
{
"epoch": 0.6004675772117632,
"grad_norm": 0.28245261311531067,
"learning_rate": 1.939474569665981e-05,
"loss": 0.6115,
"mean_token_accuracy": 0.8106845885515213,
"step": 610
},
{
"epoch": 0.6053894425987449,
"grad_norm": 0.2713403105735779,
"learning_rate": 1.937494966818014e-05,
"loss": 0.6096,
"mean_token_accuracy": 0.8106750875711441,
"step": 615
},
{
"epoch": 0.6103113079857266,
"grad_norm": 0.31770050525665283,
"learning_rate": 1.9354845529282042e-05,
"loss": 0.6142,
"mean_token_accuracy": 0.8098479628562927,
"step": 620
},
{
"epoch": 0.6152331733727082,
"grad_norm": 0.28526055812835693,
"learning_rate": 1.933443394069383e-05,
"loss": 0.6062,
"mean_token_accuracy": 0.8120482847094536,
"step": 625
},
{
"epoch": 0.6201550387596899,
"grad_norm": 0.5695453882217407,
"learning_rate": 1.9313715573248238e-05,
"loss": 0.6122,
"mean_token_accuracy": 0.8099897101521492,
"step": 630
},
{
"epoch": 0.6250769041466716,
"grad_norm": 0.2738396227359772,
"learning_rate": 1.9292691107860374e-05,
"loss": 0.6031,
"mean_token_accuracy": 0.8127053424715995,
"step": 635
},
{
"epoch": 0.6299987695336533,
"grad_norm": 0.28948965668678284,
"learning_rate": 1.927136123550534e-05,
"loss": 0.6115,
"mean_token_accuracy": 0.8103477448225022,
"step": 640
},
{
"epoch": 0.6349206349206349,
"grad_norm": 0.27830740809440613,
"learning_rate": 1.9249726657195534e-05,
"loss": 0.608,
"mean_token_accuracy": 0.8116561621427536,
"step": 645
},
{
"epoch": 0.6398425003076166,
"grad_norm": 0.2712289094924927,
"learning_rate": 1.922778808395759e-05,
"loss": 0.6054,
"mean_token_accuracy": 0.8125208973884582,
"step": 650
},
{
"epoch": 0.6447643656945983,
"grad_norm": 0.29063907265663147,
"learning_rate": 1.9205546236809037e-05,
"loss": 0.6047,
"mean_token_accuracy": 0.8123130992054939,
"step": 655
},
{
"epoch": 0.64968623108158,
"grad_norm": 0.293261855840683,
"learning_rate": 1.9183001846734573e-05,
"loss": 0.603,
"mean_token_accuracy": 0.8129645109176635,
"step": 660
},
{
"epoch": 0.6546080964685616,
"grad_norm": 0.2849041223526001,
"learning_rate": 1.9160155654662075e-05,
"loss": 0.5926,
"mean_token_accuracy": 0.8157610684633255,
"step": 665
},
{
"epoch": 0.6595299618555432,
"grad_norm": 0.2975578010082245,
"learning_rate": 1.9137008411438213e-05,
"loss": 0.6034,
"mean_token_accuracy": 0.8125734269618988,
"step": 670
},
{
"epoch": 0.6644518272425249,
"grad_norm": 0.286842405796051,
"learning_rate": 1.9113560877803798e-05,
"loss": 0.6045,
"mean_token_accuracy": 0.8125320598483086,
"step": 675
},
{
"epoch": 0.6693736926295066,
"grad_norm": 0.33480602502822876,
"learning_rate": 1.9089813824368765e-05,
"loss": 0.5975,
"mean_token_accuracy": 0.8142675384879112,
"step": 680
},
{
"epoch": 0.6742955580164882,
"grad_norm": 0.29252228140830994,
"learning_rate": 1.9065768031586864e-05,
"loss": 0.6056,
"mean_token_accuracy": 0.8120014935731887,
"step": 685
},
{
"epoch": 0.6792174234034699,
"grad_norm": 0.2882521450519562,
"learning_rate": 1.9041424289729994e-05,
"loss": 0.595,
"mean_token_accuracy": 0.8150214269757271,
"step": 690
},
{
"epoch": 0.6841392887904516,
"grad_norm": 0.29731523990631104,
"learning_rate": 1.901678339886223e-05,
"loss": 0.6013,
"mean_token_accuracy": 0.8131750777363778,
"step": 695
},
{
"epoch": 0.6890611541774333,
"grad_norm": 0.26834896206855774,
"learning_rate": 1.8991846168813547e-05,
"loss": 0.5918,
"mean_token_accuracy": 0.8156168267130852,
"step": 700
},
{
"epoch": 0.6939830195644149,
"grad_norm": 0.29199543595314026,
"learning_rate": 1.896661341915318e-05,
"loss": 0.6033,
"mean_token_accuracy": 0.8124941572546959,
"step": 705
},
{
"epoch": 0.6989048849513966,
"grad_norm": 0.28719085454940796,
"learning_rate": 1.8941085979162714e-05,
"loss": 0.5992,
"mean_token_accuracy": 0.8138533607125282,
"step": 710
},
{
"epoch": 0.7038267503383783,
"grad_norm": 0.28042468428611755,
"learning_rate": 1.891526468780881e-05,
"loss": 0.605,
"mean_token_accuracy": 0.8121193930506706,
"step": 715
},
{
"epoch": 0.70874861572536,
"grad_norm": 0.272483766078949,
"learning_rate": 1.8889150393715627e-05,
"loss": 0.5943,
"mean_token_accuracy": 0.8147971466183662,
"step": 720
},
{
"epoch": 0.7136704811123415,
"grad_norm": 0.24886226654052734,
"learning_rate": 1.8862743955136966e-05,
"loss": 0.5957,
"mean_token_accuracy": 0.8145680665969849,
"step": 725
},
{
"epoch": 0.7185923464993232,
"grad_norm": 0.26445212960243225,
"learning_rate": 1.8836046239928025e-05,
"loss": 0.5948,
"mean_token_accuracy": 0.8148575246334075,
"step": 730
},
{
"epoch": 0.7235142118863049,
"grad_norm": 0.2891506850719452,
"learning_rate": 1.8809058125516894e-05,
"loss": 0.5968,
"mean_token_accuracy": 0.8141703933477402,
"step": 735
},
{
"epoch": 0.7284360772732866,
"grad_norm": 0.28364264965057373,
"learning_rate": 1.8781780498875727e-05,
"loss": 0.6035,
"mean_token_accuracy": 0.8124788105487823,
"step": 740
},
{
"epoch": 0.7333579426602682,
"grad_norm": 0.2917366921901703,
"learning_rate": 1.8754214256491564e-05,
"loss": 0.5928,
"mean_token_accuracy": 0.8153851807117463,
"step": 745
},
{
"epoch": 0.7382798080472499,
"grad_norm": 0.2714190185070038,
"learning_rate": 1.8726360304336896e-05,
"loss": 0.601,
"mean_token_accuracy": 0.8129221558570862,
"step": 750
},
{
"epoch": 0.7432016734342316,
"grad_norm": 0.29474568367004395,
"learning_rate": 1.8698219557839875e-05,
"loss": 0.5963,
"mean_token_accuracy": 0.8142225205898285,
"step": 755
},
{
"epoch": 0.7481235388212132,
"grad_norm": 0.2684454619884491,
"learning_rate": 1.866979294185423e-05,
"loss": 0.5933,
"mean_token_accuracy": 0.8149216592311859,
"step": 760
},
{
"epoch": 0.7530454042081949,
"grad_norm": 0.26693102717399597,
"learning_rate": 1.864108139062888e-05,
"loss": 0.5908,
"mean_token_accuracy": 0.8157912597060204,
"step": 765
},
{
"epoch": 0.7579672695951766,
"grad_norm": 0.27418771386146545,
"learning_rate": 1.8612085847777215e-05,
"loss": 0.5913,
"mean_token_accuracy": 0.8156127855181694,
"step": 770
},
{
"epoch": 0.7628891349821583,
"grad_norm": 0.30855274200439453,
"learning_rate": 1.858280726624609e-05,
"loss": 0.5922,
"mean_token_accuracy": 0.81515374481678,
"step": 775
},
{
"epoch": 0.7678110003691399,
"grad_norm": 0.2978297472000122,
"learning_rate": 1.855324660828452e-05,
"loss": 0.5999,
"mean_token_accuracy": 0.8132428601384163,
"step": 780
},
{
"epoch": 0.7727328657561215,
"grad_norm": 0.30609989166259766,
"learning_rate": 1.8523404845412028e-05,
"loss": 0.5931,
"mean_token_accuracy": 0.8152095600962639,
"step": 785
},
{
"epoch": 0.7776547311431032,
"grad_norm": 0.28423747420310974,
"learning_rate": 1.849328295838674e-05,
"loss": 0.5939,
"mean_token_accuracy": 0.8150446817278862,
"step": 790
},
{
"epoch": 0.7825765965300849,
"grad_norm": 0.39114367961883545,
"learning_rate": 1.8462881937173144e-05,
"loss": 0.5886,
"mean_token_accuracy": 0.8164272159337997,
"step": 795
},
{
"epoch": 0.7874984619170665,
"grad_norm": 0.2761843502521515,
"learning_rate": 1.8432202780909542e-05,
"loss": 0.594,
"mean_token_accuracy": 0.8146432772278785,
"step": 800
},
{
"epoch": 0.7924203273040482,
"grad_norm": 0.26402318477630615,
"learning_rate": 1.8401246497875238e-05,
"loss": 0.5892,
"mean_token_accuracy": 0.8162309199571609,
"step": 805
},
{
"epoch": 0.7973421926910299,
"grad_norm": 0.26799553632736206,
"learning_rate": 1.8370014105457378e-05,
"loss": 0.5901,
"mean_token_accuracy": 0.8156055212020874,
"step": 810
},
{
"epoch": 0.8022640580780116,
"grad_norm": 0.3189884126186371,
"learning_rate": 1.8338506630117527e-05,
"loss": 0.5821,
"mean_token_accuracy": 0.8177683308720589,
"step": 815
},
{
"epoch": 0.8071859234649932,
"grad_norm": 0.26993831992149353,
"learning_rate": 1.8306725107357933e-05,
"loss": 0.5887,
"mean_token_accuracy": 0.8162371620535851,
"step": 820
},
{
"epoch": 0.8121077888519749,
"grad_norm": 0.33908817172050476,
"learning_rate": 1.827467058168748e-05,
"loss": 0.5932,
"mean_token_accuracy": 0.8148850262165069,
"step": 825
},
{
"epoch": 0.8170296542389566,
"grad_norm": 0.2749953866004944,
"learning_rate": 1.824234410658738e-05,
"loss": 0.5807,
"mean_token_accuracy": 0.8185225054621696,
"step": 830
},
{
"epoch": 0.8219515196259383,
"grad_norm": 0.28679126501083374,
"learning_rate": 1.8209746744476538e-05,
"loss": 0.5844,
"mean_token_accuracy": 0.81742594987154,
"step": 835
},
{
"epoch": 0.8268733850129198,
"grad_norm": 0.29817092418670654,
"learning_rate": 1.817687956667664e-05,
"loss": 0.584,
"mean_token_accuracy": 0.8173492252826691,
"step": 840
},
{
"epoch": 0.8317952503999015,
"grad_norm": 0.2705828547477722,
"learning_rate": 1.8143743653376944e-05,
"loss": 0.5955,
"mean_token_accuracy": 0.8145547702908515,
"step": 845
},
{
"epoch": 0.8367171157868832,
"grad_norm": 0.28381243348121643,
"learning_rate": 1.811034009359877e-05,
"loss": 0.5833,
"mean_token_accuracy": 0.8177738025784492,
"step": 850
},
{
"epoch": 0.8416389811738649,
"grad_norm": 0.2846708595752716,
"learning_rate": 1.8076669985159726e-05,
"loss": 0.5817,
"mean_token_accuracy": 0.8179952159523964,
"step": 855
},
{
"epoch": 0.8465608465608465,
"grad_norm": 0.2997231185436249,
"learning_rate": 1.8042734434637615e-05,
"loss": 0.5934,
"mean_token_accuracy": 0.8149283960461616,
"step": 860
},
{
"epoch": 0.8514827119478282,
"grad_norm": 0.29204457998275757,
"learning_rate": 1.8008534557334064e-05,
"loss": 0.5795,
"mean_token_accuracy": 0.8184737205505371,
"step": 865
},
{
"epoch": 0.8564045773348099,
"grad_norm": 0.30441614985466003,
"learning_rate": 1.7974071477237887e-05,
"loss": 0.585,
"mean_token_accuracy": 0.8171376779675483,
"step": 870
},
{
"epoch": 0.8613264427217916,
"grad_norm": 0.2779221832752228,
"learning_rate": 1.7939346326988127e-05,
"loss": 0.5889,
"mean_token_accuracy": 0.8160797134041786,
"step": 875
},
{
"epoch": 0.8662483081087732,
"grad_norm": 0.250242680311203,
"learning_rate": 1.7904360247836838e-05,
"loss": 0.5894,
"mean_token_accuracy": 0.81572295576334,
"step": 880
},
{
"epoch": 0.8711701734957549,
"grad_norm": 0.26801884174346924,
"learning_rate": 1.7869114389611574e-05,
"loss": 0.5853,
"mean_token_accuracy": 0.8168028473854065,
"step": 885
},
{
"epoch": 0.8760920388827366,
"grad_norm": 0.33699533343315125,
"learning_rate": 1.7833609910677613e-05,
"loss": 0.5804,
"mean_token_accuracy": 0.8181165441870689,
"step": 890
},
{
"epoch": 0.8810139042697183,
"grad_norm": 0.28362491726875305,
"learning_rate": 1.7797847977899873e-05,
"loss": 0.5823,
"mean_token_accuracy": 0.8177706867456436,
"step": 895
},
{
"epoch": 0.8859357696566998,
"grad_norm": 0.2863147556781769,
"learning_rate": 1.7761829766604556e-05,
"loss": 0.5797,
"mean_token_accuracy": 0.8185298308730126,
"step": 900
},
{
"epoch": 0.8908576350436815,
"grad_norm": 0.27263742685317993,
"learning_rate": 1.7725556460540553e-05,
"loss": 0.5825,
"mean_token_accuracy": 0.8175166144967079,
"step": 905
},
{
"epoch": 0.8957795004306632,
"grad_norm": 0.28120777010917664,
"learning_rate": 1.7689029251840492e-05,
"loss": 0.5788,
"mean_token_accuracy": 0.8185988172888756,
"step": 910
},
{
"epoch": 0.9007013658176449,
"grad_norm": 0.3469211459159851,
"learning_rate": 1.7652249340981608e-05,
"loss": 0.5877,
"mean_token_accuracy": 0.8159551978111267,
"step": 915
},
{
"epoch": 0.9056232312046265,
"grad_norm": 0.3101508617401123,
"learning_rate": 1.7615217936746246e-05,
"loss": 0.5819,
"mean_token_accuracy": 0.8174650520086288,
"step": 920
},
{
"epoch": 0.9105450965916082,
"grad_norm": 0.38838618993759155,
"learning_rate": 1.757793625618217e-05,
"loss": 0.5755,
"mean_token_accuracy": 0.8196040257811547,
"step": 925
},
{
"epoch": 0.9154669619785899,
"grad_norm": 0.3253493309020996,
"learning_rate": 1.7540405524562533e-05,
"loss": 0.5777,
"mean_token_accuracy": 0.8182825416326522,
"step": 930
},
{
"epoch": 0.9203888273655716,
"grad_norm": 0.2917826175689697,
"learning_rate": 1.750262697534563e-05,
"loss": 0.5809,
"mean_token_accuracy": 0.8180661648511887,
"step": 935
},
{
"epoch": 0.9253106927525532,
"grad_norm": 0.25714483857154846,
"learning_rate": 1.7464601850134353e-05,
"loss": 0.5752,
"mean_token_accuracy": 0.8194984391331672,
"step": 940
},
{
"epoch": 0.9302325581395349,
"grad_norm": 0.28597357869148254,
"learning_rate": 1.742633139863538e-05,
"loss": 0.579,
"mean_token_accuracy": 0.8184013769030571,
"step": 945
},
{
"epoch": 0.9351544235265166,
"grad_norm": 0.9777734875679016,
"learning_rate": 1.738781687861812e-05,
"loss": 0.5789,
"mean_token_accuracy": 0.8188063263893127,
"step": 950
},
{
"epoch": 0.9400762889134983,
"grad_norm": 0.26717498898506165,
"learning_rate": 1.7349059555873348e-05,
"loss": 0.5754,
"mean_token_accuracy": 0.8191799059510231,
"step": 955
},
{
"epoch": 0.9449981543004798,
"grad_norm": 0.29053807258605957,
"learning_rate": 1.731006070417163e-05,
"loss": 0.5726,
"mean_token_accuracy": 0.8204409092664718,
"step": 960
},
{
"epoch": 0.9499200196874615,
"grad_norm": 0.3052172362804413,
"learning_rate": 1.7270821605221448e-05,
"loss": 0.5764,
"mean_token_accuracy": 0.819102555513382,
"step": 965
},
{
"epoch": 0.9548418850744432,
"grad_norm": 0.33640167117118835,
"learning_rate": 1.7231343548627085e-05,
"loss": 0.5789,
"mean_token_accuracy": 0.8184890508651733,
"step": 970
},
{
"epoch": 0.9597637504614249,
"grad_norm": 0.2829669415950775,
"learning_rate": 1.7191627831846226e-05,
"loss": 0.5803,
"mean_token_accuracy": 0.8179109930992127,
"step": 975
},
{
"epoch": 0.9646856158484065,
"grad_norm": 0.2560986280441284,
"learning_rate": 1.7151675760147325e-05,
"loss": 0.5721,
"mean_token_accuracy": 0.8198479250073433,
"step": 980
},
{
"epoch": 0.9696074812353882,
"grad_norm": 0.27663761377334595,
"learning_rate": 1.7111488646566728e-05,
"loss": 0.5851,
"mean_token_accuracy": 0.8171452388167382,
"step": 985
},
{
"epoch": 0.9745293466223699,
"grad_norm": 0.2673356235027313,
"learning_rate": 1.7071067811865477e-05,
"loss": 0.5751,
"mean_token_accuracy": 0.8194502517580986,
"step": 990
},
{
"epoch": 0.9794512120093516,
"grad_norm": 0.2639131546020508,
"learning_rate": 1.7030414584485938e-05,
"loss": 0.5757,
"mean_token_accuracy": 0.8192202031612397,
"step": 995
},
{
"epoch": 0.9843730773963332,
"grad_norm": 0.2639618515968323,
"learning_rate": 1.6989530300508126e-05,
"loss": 0.576,
"mean_token_accuracy": 0.8191347226500512,
"step": 1000
},
{
"epoch": 0.9892949427833149,
"grad_norm": 0.2554817199707031,
"learning_rate": 1.6948416303605796e-05,
"loss": 0.5778,
"mean_token_accuracy": 0.8186899140477181,
"step": 1005
},
{
"epoch": 0.9942168081702966,
"grad_norm": 0.25301820039749146,
"learning_rate": 1.690707394500229e-05,
"loss": 0.576,
"mean_token_accuracy": 0.8191317170858383,
"step": 1010
},
{
"epoch": 0.9991386735572783,
"grad_norm": 0.2470293790102005,
"learning_rate": 1.6865504583426117e-05,
"loss": 0.5707,
"mean_token_accuracy": 0.8204790607094765,
"step": 1015
},
{
"epoch": 1.0049218653869816,
"grad_norm": 0.3501671254634857,
"learning_rate": 1.6823709585066308e-05,
"loss": 0.6648,
"mean_token_accuracy": 0.824617318990754,
"step": 1020
},
{
"epoch": 1.0098437307739634,
"grad_norm": 0.30985623598098755,
"learning_rate": 1.6781690323527512e-05,
"loss": 0.5503,
"mean_token_accuracy": 0.8255873426795006,
"step": 1025
},
{
"epoch": 1.014765596160945,
"grad_norm": 0.2879364788532257,
"learning_rate": 1.6739448179784846e-05,
"loss": 0.5529,
"mean_token_accuracy": 0.8247572600841522,
"step": 1030
},
{
"epoch": 1.0196874615479268,
"grad_norm": 0.27657514810562134,
"learning_rate": 1.669698454213852e-05,
"loss": 0.55,
"mean_token_accuracy": 0.8258542969822884,
"step": 1035
},
{
"epoch": 1.0246093269349084,
"grad_norm": 0.259316623210907,
"learning_rate": 1.665430080616821e-05,
"loss": 0.5435,
"mean_token_accuracy": 0.8273309215903282,
"step": 1040
},
{
"epoch": 1.02953119232189,
"grad_norm": 0.27227073907852173,
"learning_rate": 1.6611398374687172e-05,
"loss": 0.5494,
"mean_token_accuracy": 0.8259153485298156,
"step": 1045
},
{
"epoch": 1.0344530577088717,
"grad_norm": 0.2718289792537689,
"learning_rate": 1.6568278657696166e-05,
"loss": 0.5445,
"mean_token_accuracy": 0.827112241089344,
"step": 1050
},
{
"epoch": 1.0393749230958533,
"grad_norm": 0.28744345903396606,
"learning_rate": 1.6524943072337094e-05,
"loss": 0.5501,
"mean_token_accuracy": 0.8256638810038567,
"step": 1055
},
{
"epoch": 1.044296788482835,
"grad_norm": 0.26266416907310486,
"learning_rate": 1.6481393042846442e-05,
"loss": 0.5467,
"mean_token_accuracy": 0.8264568135142326,
"step": 1060
},
{
"epoch": 1.0492186538698167,
"grad_norm": 0.25888925790786743,
"learning_rate": 1.6437630000508466e-05,
"loss": 0.5522,
"mean_token_accuracy": 0.8247309610247612,
"step": 1065
},
{
"epoch": 1.0541405192567983,
"grad_norm": 0.25061705708503723,
"learning_rate": 1.6393655383608132e-05,
"loss": 0.5459,
"mean_token_accuracy": 0.8267670929431915,
"step": 1070
},
{
"epoch": 1.0590623846437799,
"grad_norm": 0.25011131167411804,
"learning_rate": 1.634947063738389e-05,
"loss": 0.5483,
"mean_token_accuracy": 0.8261876925826073,
"step": 1075
},
{
"epoch": 1.0639842500307617,
"grad_norm": 0.26051655411720276,
"learning_rate": 1.630507721398013e-05,
"loss": 0.5452,
"mean_token_accuracy": 0.82709851115942,
"step": 1080
},
{
"epoch": 1.0689061154177433,
"grad_norm": 0.2643815279006958,
"learning_rate": 1.6260476572399494e-05,
"loss": 0.5497,
"mean_token_accuracy": 0.825461483001709,
"step": 1085
},
{
"epoch": 1.073827980804725,
"grad_norm": 0.3040525019168854,
"learning_rate": 1.6215670178454893e-05,
"loss": 0.5478,
"mean_token_accuracy": 0.8264098614454269,
"step": 1090
},
{
"epoch": 1.0787498461917067,
"grad_norm": 0.28461357951164246,
"learning_rate": 1.6170659504721365e-05,
"loss": 0.5474,
"mean_token_accuracy": 0.8261038646101951,
"step": 1095
},
{
"epoch": 1.0836717115786882,
"grad_norm": 0.24723611772060394,
"learning_rate": 1.6125446030487642e-05,
"loss": 0.542,
"mean_token_accuracy": 0.8277976959943771,
"step": 1100
},
{
"epoch": 1.08859357696567,
"grad_norm": 0.4478602707386017,
"learning_rate": 1.608003124170758e-05,
"loss": 0.5435,
"mean_token_accuracy": 0.8271990329027176,
"step": 1105
},
{
"epoch": 1.0935154423526516,
"grad_norm": 0.2758786082267761,
"learning_rate": 1.6034416630951265e-05,
"loss": 0.5546,
"mean_token_accuracy": 0.8245001256465911,
"step": 1110
},
{
"epoch": 1.0984373077396332,
"grad_norm": 0.8616223335266113,
"learning_rate": 1.598860369735601e-05,
"loss": 0.5419,
"mean_token_accuracy": 0.827488873898983,
"step": 1115
},
{
"epoch": 1.103359173126615,
"grad_norm": 0.24690531194210052,
"learning_rate": 1.594259394657707e-05,
"loss": 0.5493,
"mean_token_accuracy": 0.8259517803788186,
"step": 1120
},
{
"epoch": 1.1082810385135966,
"grad_norm": 0.24601490795612335,
"learning_rate": 1.589638889073813e-05,
"loss": 0.5563,
"mean_token_accuracy": 0.8240275859832764,
"step": 1125
},
{
"epoch": 1.1132029039005784,
"grad_norm": 0.32801708579063416,
"learning_rate": 1.584999004838165e-05,
"loss": 0.5474,
"mean_token_accuracy": 0.8265691444277763,
"step": 1130
},
{
"epoch": 1.11812476928756,
"grad_norm": 0.25093355774879456,
"learning_rate": 1.5803398944418934e-05,
"loss": 0.5426,
"mean_token_accuracy": 0.8273544386029243,
"step": 1135
},
{
"epoch": 1.1230466346745416,
"grad_norm": 0.2600312829017639,
"learning_rate": 1.5756617110080023e-05,
"loss": 0.5522,
"mean_token_accuracy": 0.8249027922749519,
"step": 1140
},
{
"epoch": 1.1279685000615234,
"grad_norm": 0.26066142320632935,
"learning_rate": 1.570964608286336e-05,
"loss": 0.5442,
"mean_token_accuracy": 0.8270187392830849,
"step": 1145
},
{
"epoch": 1.132890365448505,
"grad_norm": 0.27738282084465027,
"learning_rate": 1.5662487406485273e-05,
"loss": 0.5361,
"mean_token_accuracy": 0.8295004799962044,
"step": 1150
},
{
"epoch": 1.1378122308354865,
"grad_norm": 0.3502300977706909,
"learning_rate": 1.561514263082923e-05,
"loss": 0.5482,
"mean_token_accuracy": 0.8256632193922997,
"step": 1155
},
{
"epoch": 1.1427340962224684,
"grad_norm": 0.5840310454368591,
"learning_rate": 1.5567613311894908e-05,
"loss": 0.5337,
"mean_token_accuracy": 0.8303180441260338,
"step": 1160
},
{
"epoch": 1.14765596160945,
"grad_norm": 0.2714439034461975,
"learning_rate": 1.5519901011747046e-05,
"loss": 0.5479,
"mean_token_accuracy": 0.8258592769503593,
"step": 1165
},
{
"epoch": 1.1525778269964317,
"grad_norm": 0.2692211866378784,
"learning_rate": 1.5472007298464117e-05,
"loss": 0.5439,
"mean_token_accuracy": 0.8271799921989441,
"step": 1170
},
{
"epoch": 1.1574996923834133,
"grad_norm": 0.2637535631656647,
"learning_rate": 1.5423933746086793e-05,
"loss": 0.5382,
"mean_token_accuracy": 0.8288466781377792,
"step": 1175
},
{
"epoch": 1.162421557770395,
"grad_norm": 0.25311315059661865,
"learning_rate": 1.5375681934566203e-05,
"loss": 0.5399,
"mean_token_accuracy": 0.8281501397490502,
"step": 1180
},
{
"epoch": 1.1673434231573767,
"grad_norm": 0.25321346521377563,
"learning_rate": 1.532725344971202e-05,
"loss": 0.5482,
"mean_token_accuracy": 0.8261646762490272,
"step": 1185
},
{
"epoch": 1.1722652885443583,
"grad_norm": 0.25499051809310913,
"learning_rate": 1.527864988314033e-05,
"loss": 0.5425,
"mean_token_accuracy": 0.8275581628084183,
"step": 1190
},
{
"epoch": 1.17718715393134,
"grad_norm": 0.2546637952327728,
"learning_rate": 1.5229872832221336e-05,
"loss": 0.5397,
"mean_token_accuracy": 0.8283757612109184,
"step": 1195
},
{
"epoch": 1.1821090193183217,
"grad_norm": 0.2738707363605499,
"learning_rate": 1.5180923900026847e-05,
"loss": 0.5386,
"mean_token_accuracy": 0.8282813474535942,
"step": 1200
},
{
"epoch": 1.1870308847053033,
"grad_norm": 0.2539266347885132,
"learning_rate": 1.5131804695277612e-05,
"loss": 0.5462,
"mean_token_accuracy": 0.826425202190876,
"step": 1205
},
{
"epoch": 1.1919527500922849,
"grad_norm": 0.2745126187801361,
"learning_rate": 1.5082516832290424e-05,
"loss": 0.5404,
"mean_token_accuracy": 0.8284027636051178,
"step": 1210
},
{
"epoch": 1.1968746154792667,
"grad_norm": 0.2544495165348053,
"learning_rate": 1.5033061930925081e-05,
"loss": 0.532,
"mean_token_accuracy": 0.8300672218203544,
"step": 1215
},
{
"epoch": 1.2017964808662482,
"grad_norm": 0.27299556136131287,
"learning_rate": 1.4983441616531152e-05,
"loss": 0.5396,
"mean_token_accuracy": 0.8280036672949791,
"step": 1220
},
{
"epoch": 1.20671834625323,
"grad_norm": 0.28981074690818787,
"learning_rate": 1.4933657519894542e-05,
"loss": 0.5524,
"mean_token_accuracy": 0.8247063636779786,
"step": 1225
},
{
"epoch": 1.2116402116402116,
"grad_norm": 0.30510908365249634,
"learning_rate": 1.4883711277183917e-05,
"loss": 0.5379,
"mean_token_accuracy": 0.8288484767079354,
"step": 1230
},
{
"epoch": 1.2165620770271932,
"grad_norm": 0.2616790533065796,
"learning_rate": 1.483360452989691e-05,
"loss": 0.5415,
"mean_token_accuracy": 0.8275775909423828,
"step": 1235
},
{
"epoch": 1.221483942414175,
"grad_norm": 0.2551945745944977,
"learning_rate": 1.4783338924806191e-05,
"loss": 0.5347,
"mean_token_accuracy": 0.8295770674943924,
"step": 1240
},
{
"epoch": 1.2264058078011566,
"grad_norm": 0.28227224946022034,
"learning_rate": 1.4732916113905336e-05,
"loss": 0.5425,
"mean_token_accuracy": 0.8273839592933655,
"step": 1245
},
{
"epoch": 1.2313276731881384,
"grad_norm": 0.260978102684021,
"learning_rate": 1.4682337754354534e-05,
"loss": 0.5431,
"mean_token_accuracy": 0.8270445480942726,
"step": 1250
},
{
"epoch": 1.23624953857512,
"grad_norm": 0.279462605714798,
"learning_rate": 1.4631605508426124e-05,
"loss": 0.5379,
"mean_token_accuracy": 0.828822860121727,
"step": 1255
},
{
"epoch": 1.2411714039621016,
"grad_norm": 0.2665978670120239,
"learning_rate": 1.4580721043449968e-05,
"loss": 0.5403,
"mean_token_accuracy": 0.8279185205698013,
"step": 1260
},
{
"epoch": 1.2460932693490834,
"grad_norm": 0.24216796457767487,
"learning_rate": 1.4529686031758642e-05,
"loss": 0.5409,
"mean_token_accuracy": 0.8280630350112915,
"step": 1265
},
{
"epoch": 1.251015134736065,
"grad_norm": 0.2504848837852478,
"learning_rate": 1.4478502150632503e-05,
"loss": 0.5389,
"mean_token_accuracy": 0.8282234400510788,
"step": 1270
},
{
"epoch": 1.2559370001230468,
"grad_norm": 0.25835323333740234,
"learning_rate": 1.4427171082244523e-05,
"loss": 0.5471,
"mean_token_accuracy": 0.8258385419845581,
"step": 1275
},
{
"epoch": 1.2608588655100283,
"grad_norm": 0.26074373722076416,
"learning_rate": 1.4375694513605037e-05,
"loss": 0.5413,
"mean_token_accuracy": 0.8273946106433868,
"step": 1280
},
{
"epoch": 1.26578073089701,
"grad_norm": 0.2714027762413025,
"learning_rate": 1.4324074136506283e-05,
"loss": 0.5399,
"mean_token_accuracy": 0.8278847292065621,
"step": 1285
},
{
"epoch": 1.2707025962839915,
"grad_norm": 0.24950872361660004,
"learning_rate": 1.427231164746681e-05,
"loss": 0.5429,
"mean_token_accuracy": 0.827368488907814,
"step": 1290
},
{
"epoch": 1.2756244616709733,
"grad_norm": 0.2415134608745575,
"learning_rate": 1.4220408747675714e-05,
"loss": 0.5417,
"mean_token_accuracy": 0.8275652229785919,
"step": 1295
},
{
"epoch": 1.280546327057955,
"grad_norm": 0.23719871044158936,
"learning_rate": 1.4168367142936736e-05,
"loss": 0.5442,
"mean_token_accuracy": 0.8268394738435745,
"step": 1300
},
{
"epoch": 1.2854681924449367,
"grad_norm": 0.2537670135498047,
"learning_rate": 1.4116188543612182e-05,
"loss": 0.5329,
"mean_token_accuracy": 0.8299818679690361,
"step": 1305
},
{
"epoch": 1.2903900578319183,
"grad_norm": 0.2709537446498871,
"learning_rate": 1.4063874664566734e-05,
"loss": 0.5419,
"mean_token_accuracy": 0.8275921046733856,
"step": 1310
},
{
"epoch": 1.2953119232188999,
"grad_norm": 0.26924365758895874,
"learning_rate": 1.4011427225111091e-05,
"loss": 0.5321,
"mean_token_accuracy": 0.8305203005671501,
"step": 1315
},
{
"epoch": 1.3002337886058817,
"grad_norm": 0.2832610607147217,
"learning_rate": 1.3958847948945428e-05,
"loss": 0.5391,
"mean_token_accuracy": 0.8282249644398689,
"step": 1320
},
{
"epoch": 1.3051556539928633,
"grad_norm": 0.2596539258956909,
"learning_rate": 1.3906138564102794e-05,
"loss": 0.5356,
"mean_token_accuracy": 0.829230573773384,
"step": 1325
},
{
"epoch": 1.310077519379845,
"grad_norm": 0.2699119448661804,
"learning_rate": 1.3853300802892285e-05,
"loss": 0.5417,
"mean_token_accuracy": 0.8279038980603218,
"step": 1330
},
{
"epoch": 1.3149993847668267,
"grad_norm": 0.2658538520336151,
"learning_rate": 1.380033640184213e-05,
"loss": 0.5462,
"mean_token_accuracy": 0.8260830625891685,
"step": 1335
},
{
"epoch": 1.3199212501538082,
"grad_norm": 0.25977060198783875,
"learning_rate": 1.3747247101642605e-05,
"loss": 0.5347,
"mean_token_accuracy": 0.8293716937303544,
"step": 1340
},
{
"epoch": 1.32484311554079,
"grad_norm": 0.24537616968154907,
"learning_rate": 1.369403464708884e-05,
"loss": 0.5367,
"mean_token_accuracy": 0.8292932540178299,
"step": 1345
},
{
"epoch": 1.3297649809277716,
"grad_norm": 0.2559899091720581,
"learning_rate": 1.3640700787023465e-05,
"loss": 0.5398,
"mean_token_accuracy": 0.8283236369490623,
"step": 1350
},
{
"epoch": 1.3346868463147534,
"grad_norm": 0.274198979139328,
"learning_rate": 1.358724727427914e-05,
"loss": 0.5376,
"mean_token_accuracy": 0.8286082163453102,
"step": 1355
},
{
"epoch": 1.339608711701735,
"grad_norm": 0.22712701559066772,
"learning_rate": 1.3533675865620937e-05,
"loss": 0.5336,
"mean_token_accuracy": 0.8294816762208939,
"step": 1360
},
{
"epoch": 1.3445305770887166,
"grad_norm": 0.24095574021339417,
"learning_rate": 1.3479988321688619e-05,
"loss": 0.536,
"mean_token_accuracy": 0.829172083735466,
"step": 1365
},
{
"epoch": 1.3494524424756982,
"grad_norm": 0.2448059618473053,
"learning_rate": 1.3426186406938769e-05,
"loss": 0.5337,
"mean_token_accuracy": 0.8295143947005272,
"step": 1370
},
{
"epoch": 1.35437430786268,
"grad_norm": 0.2575864791870117,
"learning_rate": 1.337227188958679e-05,
"loss": 0.5456,
"mean_token_accuracy": 0.8261685460805893,
"step": 1375
},
{
"epoch": 1.3592961732496616,
"grad_norm": 0.25145259499549866,
"learning_rate": 1.3318246541548812e-05,
"loss": 0.5319,
"mean_token_accuracy": 0.8304190933704376,
"step": 1380
},
{
"epoch": 1.3642180386366434,
"grad_norm": 0.2565249502658844,
"learning_rate": 1.3264112138383445e-05,
"loss": 0.5358,
"mean_token_accuracy": 0.8293601229786873,
"step": 1385
},
{
"epoch": 1.369139904023625,
"grad_norm": 0.8961818814277649,
"learning_rate": 1.3209870459233422e-05,
"loss": 0.528,
"mean_token_accuracy": 0.8313272252678872,
"step": 1390
},
{
"epoch": 1.3740617694106065,
"grad_norm": 0.26537856459617615,
"learning_rate": 1.315552328676714e-05,
"loss": 0.531,
"mean_token_accuracy": 0.8308784514665604,
"step": 1395
},
{
"epoch": 1.3789836347975883,
"grad_norm": 0.28985780477523804,
"learning_rate": 1.3101072407120056e-05,
"loss": 0.5406,
"mean_token_accuracy": 0.8277209624648094,
"step": 1400
},
{
"epoch": 1.38390550018457,
"grad_norm": 0.2510998249053955,
"learning_rate": 1.3046519609836002e-05,
"loss": 0.5406,
"mean_token_accuracy": 0.827545890212059,
"step": 1405
},
{
"epoch": 1.3888273655715517,
"grad_norm": 0.2563679814338684,
"learning_rate": 1.2991866687808355e-05,
"loss": 0.5394,
"mean_token_accuracy": 0.8279638543725014,
"step": 1410
},
{
"epoch": 1.3937492309585333,
"grad_norm": 0.2674863338470459,
"learning_rate": 1.2937115437221119e-05,
"loss": 0.547,
"mean_token_accuracy": 0.8261717170476913,
"step": 1415
},
{
"epoch": 1.398671096345515,
"grad_norm": 0.24103465676307678,
"learning_rate": 1.2882267657489908e-05,
"loss": 0.5428,
"mean_token_accuracy": 0.8272509336471557,
"step": 1420
},
{
"epoch": 1.4035929617324965,
"grad_norm": 0.22528545558452606,
"learning_rate": 1.2827325151202783e-05,
"loss": 0.5368,
"mean_token_accuracy": 0.8288370996713639,
"step": 1425
},
{
"epoch": 1.4085148271194783,
"grad_norm": 0.23950906097888947,
"learning_rate": 1.2772289724061015e-05,
"loss": 0.5309,
"mean_token_accuracy": 0.8302434518933296,
"step": 1430
},
{
"epoch": 1.4134366925064599,
"grad_norm": 0.22913850843906403,
"learning_rate": 1.2717163184819761e-05,
"loss": 0.5397,
"mean_token_accuracy": 0.8278713747859001,
"step": 1435
},
{
"epoch": 1.4183585578934417,
"grad_norm": 0.22565315663814545,
"learning_rate": 1.2661947345228593e-05,
"loss": 0.546,
"mean_token_accuracy": 0.826079449057579,
"step": 1440
},
{
"epoch": 1.4232804232804233,
"grad_norm": 0.2397647351026535,
"learning_rate": 1.2606644019971967e-05,
"loss": 0.5396,
"mean_token_accuracy": 0.8280595645308495,
"step": 1445
},
{
"epoch": 1.4282022886674048,
"grad_norm": 0.23136766254901886,
"learning_rate": 1.255125502660958e-05,
"loss": 0.5288,
"mean_token_accuracy": 0.8313645005226136,
"step": 1450
},
{
"epoch": 1.4331241540543866,
"grad_norm": 0.2330116331577301,
"learning_rate": 1.2495782185516638e-05,
"loss": 0.5364,
"mean_token_accuracy": 0.828608725965023,
"step": 1455
},
{
"epoch": 1.4380460194413682,
"grad_norm": 0.23435364663600922,
"learning_rate": 1.2440227319824024e-05,
"loss": 0.5323,
"mean_token_accuracy": 0.8299019247293472,
"step": 1460
},
{
"epoch": 1.44296788482835,
"grad_norm": 0.2517502009868622,
"learning_rate": 1.2384592255358385e-05,
"loss": 0.537,
"mean_token_accuracy": 0.8284672737121582,
"step": 1465
},
{
"epoch": 1.4478897502153316,
"grad_norm": 0.2454364001750946,
"learning_rate": 1.2328878820582122e-05,
"loss": 0.5282,
"mean_token_accuracy": 0.8314993128180503,
"step": 1470
},
{
"epoch": 1.4528116156023132,
"grad_norm": 0.2604913115501404,
"learning_rate": 1.2273088846533303e-05,
"loss": 0.5404,
"mean_token_accuracy": 0.8278495371341705,
"step": 1475
},
{
"epoch": 1.457733480989295,
"grad_norm": 0.277908593416214,
"learning_rate": 1.2217224166765478e-05,
"loss": 0.5285,
"mean_token_accuracy": 0.8310411602258683,
"step": 1480
},
{
"epoch": 1.4626553463762766,
"grad_norm": 0.23699437081813812,
"learning_rate": 1.216128661728742e-05,
"loss": 0.5359,
"mean_token_accuracy": 0.8288247928023338,
"step": 1485
},
{
"epoch": 1.4675772117632584,
"grad_norm": 0.2528901994228363,
"learning_rate": 1.2105278036502787e-05,
"loss": 0.543,
"mean_token_accuracy": 0.8267820864915848,
"step": 1490
},
{
"epoch": 1.47249907715024,
"grad_norm": 0.25504714250564575,
"learning_rate": 1.204920026514971e-05,
"loss": 0.5391,
"mean_token_accuracy": 0.8281295597553253,
"step": 1495
},
{
"epoch": 1.4774209425372216,
"grad_norm": 0.26783859729766846,
"learning_rate": 1.1993055146240273e-05,
"loss": 0.5325,
"mean_token_accuracy": 0.8299062862992287,
"step": 1500
},
{
"epoch": 1.4823428079242031,
"grad_norm": 0.25482243299484253,
"learning_rate": 1.1936844524999966e-05,
"loss": 0.5271,
"mean_token_accuracy": 0.8315476939082146,
"step": 1505
},
{
"epoch": 1.487264673311185,
"grad_norm": 0.2603563964366913,
"learning_rate": 1.1880570248807033e-05,
"loss": 0.5299,
"mean_token_accuracy": 0.8303808271884918,
"step": 1510
},
{
"epoch": 1.4921865386981665,
"grad_norm": 0.2345011830329895,
"learning_rate": 1.1824234167131748e-05,
"loss": 0.5274,
"mean_token_accuracy": 0.8310874328017235,
"step": 1515
},
{
"epoch": 1.4971084040851483,
"grad_norm": 0.3448658883571625,
"learning_rate": 1.1767838131475654e-05,
"loss": 0.5318,
"mean_token_accuracy": 0.8301808550953865,
"step": 1520
},
{
"epoch": 1.50203026947213,
"grad_norm": 0.26358914375305176,
"learning_rate": 1.171138399531068e-05,
"loss": 0.5341,
"mean_token_accuracy": 0.8296466439962387,
"step": 1525
},
{
"epoch": 1.5069521348591115,
"grad_norm": 0.23463788628578186,
"learning_rate": 1.1654873614018266e-05,
"loss": 0.5337,
"mean_token_accuracy": 0.8297147572040557,
"step": 1530
},
{
"epoch": 1.5118740002460933,
"grad_norm": 0.37559443712234497,
"learning_rate": 1.1598308844828348e-05,
"loss": 0.5281,
"mean_token_accuracy": 0.8311620846390724,
"step": 1535
},
{
"epoch": 1.516795865633075,
"grad_norm": 0.24298147857189178,
"learning_rate": 1.1541691546758343e-05,
"loss": 0.5353,
"mean_token_accuracy": 0.8288328930735588,
"step": 1540
},
{
"epoch": 1.5217177310200567,
"grad_norm": 0.2316361665725708,
"learning_rate": 1.1485023580552039e-05,
"loss": 0.5217,
"mean_token_accuracy": 0.8330785930156708,
"step": 1545
},
{
"epoch": 1.5266395964070383,
"grad_norm": 0.22819174826145172,
"learning_rate": 1.1428306808618456e-05,
"loss": 0.53,
"mean_token_accuracy": 0.8303656697273254,
"step": 1550
},
{
"epoch": 1.5315614617940199,
"grad_norm": 0.22326573729515076,
"learning_rate": 1.1371543094970624e-05,
"loss": 0.53,
"mean_token_accuracy": 0.8304451867938042,
"step": 1555
},
{
"epoch": 1.5364833271810014,
"grad_norm": 0.23267020285129547,
"learning_rate": 1.131473430516432e-05,
"loss": 0.5284,
"mean_token_accuracy": 0.8309284761548043,
"step": 1560
},
{
"epoch": 1.5414051925679833,
"grad_norm": 0.3377299904823303,
"learning_rate": 1.1257882306236776e-05,
"loss": 0.5336,
"mean_token_accuracy": 0.8295429393649101,
"step": 1565
},
{
"epoch": 1.546327057954965,
"grad_norm": 0.24768434464931488,
"learning_rate": 1.1200988966645286e-05,
"loss": 0.5326,
"mean_token_accuracy": 0.8297705203294754,
"step": 1570
},
{
"epoch": 1.5512489233419466,
"grad_norm": 0.22998486459255219,
"learning_rate": 1.1144056156205834e-05,
"loss": 0.5298,
"mean_token_accuracy": 0.8307420760393143,
"step": 1575
},
{
"epoch": 1.5561707887289282,
"grad_norm": 0.22251376509666443,
"learning_rate": 1.1087085746031612e-05,
"loss": 0.528,
"mean_token_accuracy": 0.8313020512461662,
"step": 1580
},
{
"epoch": 1.5610926541159098,
"grad_norm": 0.2297334372997284,
"learning_rate": 1.1030079608471544e-05,
"loss": 0.5335,
"mean_token_accuracy": 0.8294809475541115,
"step": 1585
},
{
"epoch": 1.5660145195028916,
"grad_norm": 0.23138615489006042,
"learning_rate": 1.0973039617048748e-05,
"loss": 0.5333,
"mean_token_accuracy": 0.829520358145237,
"step": 1590
},
{
"epoch": 1.5709363848898734,
"grad_norm": 0.23547935485839844,
"learning_rate": 1.091596764639895e-05,
"loss": 0.5267,
"mean_token_accuracy": 0.8314588502049446,
"step": 1595
},
{
"epoch": 1.575858250276855,
"grad_norm": 0.2409500926733017,
"learning_rate": 1.0858865572208892e-05,
"loss": 0.5346,
"mean_token_accuracy": 0.8291632473468781,
"step": 1600
},
{
"epoch": 1.5807801156638366,
"grad_norm": 0.2276252955198288,
"learning_rate": 1.080173527115467e-05,
"loss": 0.5273,
"mean_token_accuracy": 0.831089685857296,
"step": 1605
},
{
"epoch": 1.5857019810508182,
"grad_norm": 0.2589430809020996,
"learning_rate": 1.0744578620840065e-05,
"loss": 0.5388,
"mean_token_accuracy": 0.8279580160975456,
"step": 1610
},
{
"epoch": 1.5906238464378,
"grad_norm": 0.2499450445175171,
"learning_rate": 1.0687397499734842e-05,
"loss": 0.5268,
"mean_token_accuracy": 0.8311406090855599,
"step": 1615
},
{
"epoch": 1.5955457118247816,
"grad_norm": 0.2377663552761078,
"learning_rate": 1.0630193787112994e-05,
"loss": 0.5257,
"mean_token_accuracy": 0.8319837361574173,
"step": 1620
},
{
"epoch": 1.6004675772117634,
"grad_norm": 0.24260112643241882,
"learning_rate": 1.0572969362991e-05,
"loss": 0.5316,
"mean_token_accuracy": 0.8302173331379891,
"step": 1625
},
{
"epoch": 1.605389442598745,
"grad_norm": 1.525187611579895,
"learning_rate": 1.0515726108066025e-05,
"loss": 0.5315,
"mean_token_accuracy": 0.8299267381429672,
"step": 1630
},
{
"epoch": 1.6103113079857265,
"grad_norm": 0.23062676191329956,
"learning_rate": 1.0458465903654107e-05,
"loss": 0.5298,
"mean_token_accuracy": 0.8305988430976867,
"step": 1635
},
{
"epoch": 1.615233173372708,
"grad_norm": 0.23293638229370117,
"learning_rate": 1.0401190631628348e-05,
"loss": 0.5304,
"mean_token_accuracy": 0.8300972327589988,
"step": 1640
},
{
"epoch": 1.62015503875969,
"grad_norm": 0.22877627611160278,
"learning_rate": 1.034390217435704e-05,
"loss": 0.5287,
"mean_token_accuracy": 0.8309306666254997,
"step": 1645
},
{
"epoch": 1.6250769041466717,
"grad_norm": 0.23190174996852875,
"learning_rate": 1.0286602414641818e-05,
"loss": 0.5303,
"mean_token_accuracy": 0.8306381091475487,
"step": 1650
},
{
"epoch": 1.6299987695336533,
"grad_norm": 0.23290394246578217,
"learning_rate": 1.0229293235655768e-05,
"loss": 0.5221,
"mean_token_accuracy": 0.8326445773243905,
"step": 1655
},
{
"epoch": 1.6349206349206349,
"grad_norm": 0.22114625573158264,
"learning_rate": 1.0171976520881552e-05,
"loss": 0.5263,
"mean_token_accuracy": 0.8315576672554016,
"step": 1660
},
{
"epoch": 1.6398425003076165,
"grad_norm": 0.2297578752040863,
"learning_rate": 1.011465415404949e-05,
"loss": 0.5252,
"mean_token_accuracy": 0.8321317434310913,
"step": 1665
},
{
"epoch": 1.6447643656945983,
"grad_norm": 0.23588469624519348,
"learning_rate": 1.005732801907567e-05,
"loss": 0.5262,
"mean_token_accuracy": 0.831513050198555,
"step": 1670
},
{
"epoch": 1.64968623108158,
"grad_norm": 0.22704197466373444,
"learning_rate": 1e-05,
"loss": 0.5382,
"mean_token_accuracy": 0.8281245142221451,
"step": 1675
},
{
"epoch": 1.6546080964685617,
"grad_norm": 0.22588326036930084,
"learning_rate": 9.942671980924336e-06,
"loss": 0.5286,
"mean_token_accuracy": 0.8307414755225182,
"step": 1680
},
{
"epoch": 1.6595299618555432,
"grad_norm": 0.22511065006256104,
"learning_rate": 9.88534584595051e-06,
"loss": 0.5279,
"mean_token_accuracy": 0.83111013174057,
"step": 1685
},
{
"epoch": 1.6644518272425248,
"grad_norm": 0.24989110231399536,
"learning_rate": 9.82802347911845e-06,
"loss": 0.5257,
"mean_token_accuracy": 0.8317268043756485,
"step": 1690
},
{
"epoch": 1.6693736926295066,
"grad_norm": 0.23859356343746185,
"learning_rate": 9.770706764344235e-06,
"loss": 0.534,
"mean_token_accuracy": 0.8294050306081772,
"step": 1695
},
{
"epoch": 1.6742955580164882,
"grad_norm": 0.2304782122373581,
"learning_rate": 9.713397585358189e-06,
"loss": 0.528,
"mean_token_accuracy": 0.8308202102780342,
"step": 1700
},
{
"epoch": 1.67921742340347,
"grad_norm": 0.2276812344789505,
"learning_rate": 9.65609782564296e-06,
"loss": 0.5267,
"mean_token_accuracy": 0.8312249034643173,
"step": 1705
},
{
"epoch": 1.6841392887904516,
"grad_norm": 0.3979962170124054,
"learning_rate": 9.598809368371656e-06,
"loss": 0.5266,
"mean_token_accuracy": 0.8312003433704376,
"step": 1710
},
{
"epoch": 1.6890611541774332,
"grad_norm": 0.25581249594688416,
"learning_rate": 9.541534096345896e-06,
"loss": 0.526,
"mean_token_accuracy": 0.8315127685666084,
"step": 1715
},
{
"epoch": 1.6939830195644148,
"grad_norm": 0.2141893208026886,
"learning_rate": 9.484273891933982e-06,
"loss": 0.5252,
"mean_token_accuracy": 0.8317378848791123,
"step": 1720
},
{
"epoch": 1.6989048849513966,
"grad_norm": 0.4327445924282074,
"learning_rate": 9.427030637009002e-06,
"loss": 0.5361,
"mean_token_accuracy": 0.828312310576439,
"step": 1725
},
{
"epoch": 1.7038267503383784,
"grad_norm": 0.22412188351154327,
"learning_rate": 9.369806212887008e-06,
"loss": 0.5299,
"mean_token_accuracy": 0.830331552028656,
"step": 1730
},
{
"epoch": 1.70874861572536,
"grad_norm": 0.22056014835834503,
"learning_rate": 9.312602500265162e-06,
"loss": 0.5259,
"mean_token_accuracy": 0.831749576330185,
"step": 1735
},
{
"epoch": 1.7136704811123415,
"grad_norm": 0.23633216321468353,
"learning_rate": 9.255421379159935e-06,
"loss": 0.5152,
"mean_token_accuracy": 0.8346669390797615,
"step": 1740
},
{
"epoch": 1.7185923464993231,
"grad_norm": 0.21674410998821259,
"learning_rate": 9.198264728845332e-06,
"loss": 0.5188,
"mean_token_accuracy": 0.8335284858942031,
"step": 1745
},
{
"epoch": 1.723514211886305,
"grad_norm": 0.22083686292171478,
"learning_rate": 9.14113442779111e-06,
"loss": 0.5283,
"mean_token_accuracy": 0.8306051269173622,
"step": 1750
},
{
"epoch": 1.7284360772732867,
"grad_norm": 0.2326516956090927,
"learning_rate": 9.084032353601053e-06,
"loss": 0.5329,
"mean_token_accuracy": 0.8295654147863388,
"step": 1755
},
{
"epoch": 1.7333579426602683,
"grad_norm": 0.23140785098075867,
"learning_rate": 9.026960382951253e-06,
"loss": 0.5243,
"mean_token_accuracy": 0.8315977454185486,
"step": 1760
},
{
"epoch": 1.73827980804725,
"grad_norm": 0.24312028288841248,
"learning_rate": 8.969920391528459e-06,
"loss": 0.5218,
"mean_token_accuracy": 0.8328249961137771,
"step": 1765
},
{
"epoch": 1.7432016734342315,
"grad_norm": 0.22412382066249847,
"learning_rate": 8.912914253968391e-06,
"loss": 0.5312,
"mean_token_accuracy": 0.8298890963196754,
"step": 1770
},
{
"epoch": 1.748123538821213,
"grad_norm": 0.2266296148300171,
"learning_rate": 8.855943843794171e-06,
"loss": 0.5234,
"mean_token_accuracy": 0.8323718756437302,
"step": 1775
},
{
"epoch": 1.7530454042081949,
"grad_norm": 0.21898606419563293,
"learning_rate": 8.799011033354716e-06,
"loss": 0.5288,
"mean_token_accuracy": 0.8307971671223641,
"step": 1780
},
{
"epoch": 1.7579672695951767,
"grad_norm": 0.2306451052427292,
"learning_rate": 8.742117693763229e-06,
"loss": 0.5271,
"mean_token_accuracy": 0.8316369831562043,
"step": 1785
},
{
"epoch": 1.7628891349821583,
"grad_norm": 0.22924001514911652,
"learning_rate": 8.685265694835681e-06,
"loss": 0.5272,
"mean_token_accuracy": 0.8311286598443985,
"step": 1790
},
{
"epoch": 1.7678110003691399,
"grad_norm": 0.33131736516952515,
"learning_rate": 8.628456905029383e-06,
"loss": 0.5195,
"mean_token_accuracy": 0.833528995513916,
"step": 1795
},
{
"epoch": 1.7727328657561214,
"grad_norm": 0.24447475373744965,
"learning_rate": 8.571693191381545e-06,
"loss": 0.5221,
"mean_token_accuracy": 0.8324113413691521,
"step": 1800
},
{
"epoch": 1.7776547311431032,
"grad_norm": 0.23472720384597778,
"learning_rate": 8.514976419447963e-06,
"loss": 0.5282,
"mean_token_accuracy": 0.8306461483240127,
"step": 1805
},
{
"epoch": 1.782576596530085,
"grad_norm": 0.25232747197151184,
"learning_rate": 8.458308453241664e-06,
"loss": 0.519,
"mean_token_accuracy": 0.8334705844521523,
"step": 1810
},
{
"epoch": 1.7874984619170666,
"grad_norm": 0.22827033698558807,
"learning_rate": 8.401691155171654e-06,
"loss": 0.5353,
"mean_token_accuracy": 0.8289692014455795,
"step": 1815
},
{
"epoch": 1.7924203273040482,
"grad_norm": 0.21775387227535248,
"learning_rate": 8.345126385981737e-06,
"loss": 0.5217,
"mean_token_accuracy": 0.8326601728796958,
"step": 1820
},
{
"epoch": 1.7973421926910298,
"grad_norm": 0.22691109776496887,
"learning_rate": 8.288616004689321e-06,
"loss": 0.5208,
"mean_token_accuracy": 0.8330274626612664,
"step": 1825
},
{
"epoch": 1.8022640580780116,
"grad_norm": 0.23031188547611237,
"learning_rate": 8.23216186852435e-06,
"loss": 0.5251,
"mean_token_accuracy": 0.8317318856716156,
"step": 1830
},
{
"epoch": 1.8071859234649932,
"grad_norm": 0.23658455908298492,
"learning_rate": 8.175765832868252e-06,
"loss": 0.5263,
"mean_token_accuracy": 0.8314035385847092,
"step": 1835
},
{
"epoch": 1.812107788851975,
"grad_norm": 0.21728812158107758,
"learning_rate": 8.119429751192972e-06,
"loss": 0.5283,
"mean_token_accuracy": 0.830833038687706,
"step": 1840
},
{
"epoch": 1.8170296542389566,
"grad_norm": 0.22863180935382843,
"learning_rate": 8.063155475000037e-06,
"loss": 0.5231,
"mean_token_accuracy": 0.8322245612740516,
"step": 1845
},
{
"epoch": 1.8219515196259382,
"grad_norm": 0.22922097146511078,
"learning_rate": 8.006944853759732e-06,
"loss": 0.5242,
"mean_token_accuracy": 0.8318595319986344,
"step": 1850
},
{
"epoch": 1.8268733850129197,
"grad_norm": 0.209337517619133,
"learning_rate": 7.950799734850292e-06,
"loss": 0.5195,
"mean_token_accuracy": 0.8333837404847145,
"step": 1855
},
{
"epoch": 1.8317952503999015,
"grad_norm": 0.22603721916675568,
"learning_rate": 7.894721963497214e-06,
"loss": 0.5218,
"mean_token_accuracy": 0.8325009673833847,
"step": 1860
},
{
"epoch": 1.8367171157868833,
"grad_norm": 0.2327803522348404,
"learning_rate": 7.838713382712583e-06,
"loss": 0.5111,
"mean_token_accuracy": 0.8357574358582497,
"step": 1865
},
{
"epoch": 1.841638981173865,
"grad_norm": 0.23280593752861023,
"learning_rate": 7.782775833234522e-06,
"loss": 0.5333,
"mean_token_accuracy": 0.8295109212398529,
"step": 1870
},
{
"epoch": 1.8465608465608465,
"grad_norm": 0.2219589352607727,
"learning_rate": 7.726911153466699e-06,
"loss": 0.5255,
"mean_token_accuracy": 0.8316129177808762,
"step": 1875
},
{
"epoch": 1.851482711947828,
"grad_norm": 0.22274133563041687,
"learning_rate": 7.67112117941788e-06,
"loss": 0.5197,
"mean_token_accuracy": 0.8331713795661926,
"step": 1880
},
{
"epoch": 1.85640457733481,
"grad_norm": 0.20765641331672668,
"learning_rate": 7.615407744641618e-06,
"loss": 0.5222,
"mean_token_accuracy": 0.8323680445551872,
"step": 1885
},
{
"epoch": 1.8613264427217917,
"grad_norm": 0.22262942790985107,
"learning_rate": 7.559772680175979e-06,
"loss": 0.5256,
"mean_token_accuracy": 0.8315785735845566,
"step": 1890
},
{
"epoch": 1.8662483081087733,
"grad_norm": 0.23786763846874237,
"learning_rate": 7.504217814483364e-06,
"loss": 0.5225,
"mean_token_accuracy": 0.8326525434851646,
"step": 1895
},
{
"epoch": 1.8711701734957549,
"grad_norm": 0.22120903432369232,
"learning_rate": 7.448744973390423e-06,
"loss": 0.5322,
"mean_token_accuracy": 0.8296578034758568,
"step": 1900
},
{
"epoch": 1.8760920388827365,
"grad_norm": 0.22359086573123932,
"learning_rate": 7.393355980028039e-06,
"loss": 0.524,
"mean_token_accuracy": 0.8320103421807289,
"step": 1905
},
{
"epoch": 1.8810139042697183,
"grad_norm": 0.21293464303016663,
"learning_rate": 7.338052654771407e-06,
"loss": 0.5201,
"mean_token_accuracy": 0.8330625906586647,
"step": 1910
},
{
"epoch": 1.8859357696566998,
"grad_norm": 0.212773397564888,
"learning_rate": 7.282836815180241e-06,
"loss": 0.5212,
"mean_token_accuracy": 0.8328917175531387,
"step": 1915
},
{
"epoch": 1.8908576350436817,
"grad_norm": 0.2229495495557785,
"learning_rate": 7.227710275938987e-06,
"loss": 0.5177,
"mean_token_accuracy": 0.8338592052459717,
"step": 1920
},
{
"epoch": 1.8957795004306632,
"grad_norm": 0.22714777290821075,
"learning_rate": 7.172674848797218e-06,
"loss": 0.5196,
"mean_token_accuracy": 0.8332103446125985,
"step": 1925
},
{
"epoch": 1.9007013658176448,
"grad_norm": 0.5862542986869812,
"learning_rate": 7.117732342510093e-06,
"loss": 0.5148,
"mean_token_accuracy": 0.8348309084773063,
"step": 1930
},
{
"epoch": 1.9056232312046264,
"grad_norm": 0.21524302661418915,
"learning_rate": 7.062884562778883e-06,
"loss": 0.5225,
"mean_token_accuracy": 0.8324376299977303,
"step": 1935
},
{
"epoch": 1.9105450965916082,
"grad_norm": 0.22445465624332428,
"learning_rate": 7.008133312191649e-06,
"loss": 0.5239,
"mean_token_accuracy": 0.8318991348147392,
"step": 1940
},
{
"epoch": 1.91546696197859,
"grad_norm": 0.21925503015518188,
"learning_rate": 6.953480390164001e-06,
"loss": 0.5243,
"mean_token_accuracy": 0.8320589557290077,
"step": 1945
},
{
"epoch": 1.9203888273655716,
"grad_norm": 0.21358764171600342,
"learning_rate": 6.898927592879945e-06,
"loss": 0.5276,
"mean_token_accuracy": 0.8309697136282921,
"step": 1950
},
{
"epoch": 1.9253106927525532,
"grad_norm": 0.21541139483451843,
"learning_rate": 6.844476713232863e-06,
"loss": 0.5183,
"mean_token_accuracy": 0.8336074352264404,
"step": 1955
},
{
"epoch": 1.9302325581395348,
"grad_norm": 0.253334105014801,
"learning_rate": 6.790129540766581e-06,
"loss": 0.5217,
"mean_token_accuracy": 0.8321399599313736,
"step": 1960
},
{
"epoch": 1.9351544235265166,
"grad_norm": 0.2311272770166397,
"learning_rate": 6.735887861616555e-06,
"loss": 0.5226,
"mean_token_accuracy": 0.832192762196064,
"step": 1965
},
{
"epoch": 1.9400762889134984,
"grad_norm": 0.2155195027589798,
"learning_rate": 6.68175345845119e-06,
"loss": 0.5214,
"mean_token_accuracy": 0.8325791984796524,
"step": 1970
},
{
"epoch": 1.94499815430048,
"grad_norm": 0.2229234129190445,
"learning_rate": 6.627728110413214e-06,
"loss": 0.5228,
"mean_token_accuracy": 0.8320748254656791,
"step": 1975
},
{
"epoch": 1.9499200196874615,
"grad_norm": 0.2595667839050293,
"learning_rate": 6.5738135930612355e-06,
"loss": 0.5257,
"mean_token_accuracy": 0.831524421274662,
"step": 1980
},
{
"epoch": 1.9548418850744431,
"grad_norm": 0.21894799172878265,
"learning_rate": 6.520011678311382e-06,
"loss": 0.5135,
"mean_token_accuracy": 0.8349313631653785,
"step": 1985
},
{
"epoch": 1.959763750461425,
"grad_norm": 0.215131938457489,
"learning_rate": 6.466324134379066e-06,
"loss": 0.5125,
"mean_token_accuracy": 0.8354373678565026,
"step": 1990
},
{
"epoch": 1.9646856158484065,
"grad_norm": 0.227864071726799,
"learning_rate": 6.412752725720864e-06,
"loss": 0.5166,
"mean_token_accuracy": 0.8339696109294892,
"step": 1995
},
{
"epoch": 1.9696074812353883,
"grad_norm": 0.21633465588092804,
"learning_rate": 6.359299212976535e-06,
"loss": 0.5236,
"mean_token_accuracy": 0.8324458003044128,
"step": 2000
},
{
"epoch": 1.97452934662237,
"grad_norm": 0.2214214950799942,
"learning_rate": 6.305965352911162e-06,
"loss": 0.5186,
"mean_token_accuracy": 0.8334563329815865,
"step": 2005
},
{
"epoch": 1.9794512120093515,
"grad_norm": 0.20772044360637665,
"learning_rate": 6.252752898357397e-06,
"loss": 0.5146,
"mean_token_accuracy": 0.8346970349550247,
"step": 2010
},
{
"epoch": 1.984373077396333,
"grad_norm": 0.2208469659090042,
"learning_rate": 6.1996635981578755e-06,
"loss": 0.521,
"mean_token_accuracy": 0.8330862745642662,
"step": 2015
},
{
"epoch": 1.9892949427833149,
"grad_norm": 0.21841764450073242,
"learning_rate": 6.146699197107715e-06,
"loss": 0.5141,
"mean_token_accuracy": 0.8346462666988372,
"step": 2020
},
{
"epoch": 1.9942168081702967,
"grad_norm": 0.22905802726745605,
"learning_rate": 6.093861435897208e-06,
"loss": 0.5161,
"mean_token_accuracy": 0.8341751024127007,
"step": 2025
},
{
"epoch": 1.9991386735572783,
"grad_norm": 0.2205893099308014,
"learning_rate": 6.041152051054575e-06,
"loss": 0.5135,
"mean_token_accuracy": 0.8350084885954857,
"step": 2030
},
{
"epoch": 2.0049218653869816,
"grad_norm": 0.27798768877983093,
"learning_rate": 5.988572774888913e-06,
"loss": 0.5979,
"mean_token_accuracy": 0.8386082910909886,
"step": 2035
},
{
"epoch": 2.009843730773963,
"grad_norm": 0.24996507167816162,
"learning_rate": 5.936125335433265e-06,
"loss": 0.4945,
"mean_token_accuracy": 0.839720045030117,
"step": 2040
},
{
"epoch": 2.014765596160945,
"grad_norm": 0.2548527121543884,
"learning_rate": 5.883811456387821e-06,
"loss": 0.4941,
"mean_token_accuracy": 0.8400543674826622,
"step": 2045
},
{
"epoch": 2.0196874615479268,
"grad_norm": 0.2184976190328598,
"learning_rate": 5.831632857063271e-06,
"loss": 0.4902,
"mean_token_accuracy": 0.8409830510616303,
"step": 2050
},
{
"epoch": 2.0246093269349084,
"grad_norm": 0.22762830555438995,
"learning_rate": 5.779591252324286e-06,
"loss": 0.4904,
"mean_token_accuracy": 0.8408440828323365,
"step": 2055
},
{
"epoch": 2.02953119232189,
"grad_norm": 0.23035886883735657,
"learning_rate": 5.7276883525331915e-06,
"loss": 0.4943,
"mean_token_accuracy": 0.8397367835044861,
"step": 2060
},
{
"epoch": 2.0344530577088715,
"grad_norm": 0.22349004447460175,
"learning_rate": 5.675925863493721e-06,
"loss": 0.5009,
"mean_token_accuracy": 0.8379953891038895,
"step": 2065
},
{
"epoch": 2.0393749230958536,
"grad_norm": 0.22588923573493958,
"learning_rate": 5.6243054863949675e-06,
"loss": 0.494,
"mean_token_accuracy": 0.8397265374660492,
"step": 2070
},
{
"epoch": 2.044296788482835,
"grad_norm": 0.2168150246143341,
"learning_rate": 5.5728289177554805e-06,
"loss": 0.4975,
"mean_token_accuracy": 0.8389487206935883,
"step": 2075
},
{
"epoch": 2.0492186538698167,
"grad_norm": 0.22331282496452332,
"learning_rate": 5.521497849367501e-06,
"loss": 0.4859,
"mean_token_accuracy": 0.8422671511769295,
"step": 2080
},
{
"epoch": 2.0541405192567983,
"grad_norm": 0.21221551299095154,
"learning_rate": 5.4703139682413585e-06,
"loss": 0.4866,
"mean_token_accuracy": 0.8420242533087731,
"step": 2085
},
{
"epoch": 2.05906238464378,
"grad_norm": 0.22058208286762238,
"learning_rate": 5.419278956550037e-06,
"loss": 0.4955,
"mean_token_accuracy": 0.8394055813550949,
"step": 2090
},
{
"epoch": 2.0639842500307615,
"grad_norm": 0.22200560569763184,
"learning_rate": 5.368394491573876e-06,
"loss": 0.493,
"mean_token_accuracy": 0.8402127623558044,
"step": 2095
},
{
"epoch": 2.0689061154177435,
"grad_norm": 0.2220141738653183,
"learning_rate": 5.31766224564547e-06,
"loss": 0.4958,
"mean_token_accuracy": 0.8393116250634194,
"step": 2100
},
{
"epoch": 2.073827980804725,
"grad_norm": 0.21074913442134857,
"learning_rate": 5.267083886094668e-06,
"loss": 0.4931,
"mean_token_accuracy": 0.840206652879715,
"step": 2105
},
{
"epoch": 2.0787498461917067,
"grad_norm": 0.2276320606470108,
"learning_rate": 5.216661075193814e-06,
"loss": 0.4955,
"mean_token_accuracy": 0.8393134921789169,
"step": 2110
},
{
"epoch": 2.0836717115786882,
"grad_norm": 0.2224099338054657,
"learning_rate": 5.166395470103092e-06,
"loss": 0.4937,
"mean_token_accuracy": 0.8397904768586159,
"step": 2115
},
{
"epoch": 2.08859357696567,
"grad_norm": 0.22312206029891968,
"learning_rate": 5.116288722816087e-06,
"loss": 0.493,
"mean_token_accuracy": 0.8403119757771492,
"step": 2120
},
{
"epoch": 2.093515442352652,
"grad_norm": 0.2194313257932663,
"learning_rate": 5.06634248010546e-06,
"loss": 0.4935,
"mean_token_accuracy": 0.8400413483381272,
"step": 2125
},
{
"epoch": 2.0984373077396334,
"grad_norm": 0.22484691441059113,
"learning_rate": 5.016558383468851e-06,
"loss": 0.49,
"mean_token_accuracy": 0.8409391462802887,
"step": 2130
},
{
"epoch": 2.103359173126615,
"grad_norm": 0.22470517456531525,
"learning_rate": 4.9669380690749215e-06,
"loss": 0.497,
"mean_token_accuracy": 0.8389460816979408,
"step": 2135
},
{
"epoch": 2.1082810385135966,
"grad_norm": 0.21832752227783203,
"learning_rate": 4.91748316770958e-06,
"loss": 0.4926,
"mean_token_accuracy": 0.8401527449488639,
"step": 2140
},
{
"epoch": 2.113202903900578,
"grad_norm": 0.21521726250648499,
"learning_rate": 4.868195304722391e-06,
"loss": 0.4979,
"mean_token_accuracy": 0.8387278065085411,
"step": 2145
},
{
"epoch": 2.1181247692875598,
"grad_norm": 0.21682803332805634,
"learning_rate": 4.819076099973152e-06,
"loss": 0.5014,
"mean_token_accuracy": 0.83763497620821,
"step": 2150
},
{
"epoch": 2.123046634674542,
"grad_norm": 0.2204725295305252,
"learning_rate": 4.77012716777867e-06,
"loss": 0.4989,
"mean_token_accuracy": 0.8380599915981293,
"step": 2155
},
{
"epoch": 2.1279685000615234,
"grad_norm": 0.2179991751909256,
"learning_rate": 4.721350116859675e-06,
"loss": 0.4946,
"mean_token_accuracy": 0.8396460056304932,
"step": 2160
},
{
"epoch": 2.132890365448505,
"grad_norm": 0.21851445734500885,
"learning_rate": 4.672746550287985e-06,
"loss": 0.4947,
"mean_token_accuracy": 0.8395410850644112,
"step": 2165
},
{
"epoch": 2.1378122308354865,
"grad_norm": 0.21560297906398773,
"learning_rate": 4.6243180654337975e-06,
"loss": 0.4857,
"mean_token_accuracy": 0.8421663656830788,
"step": 2170
},
{
"epoch": 2.142734096222468,
"grad_norm": 0.21567942202091217,
"learning_rate": 4.576066253913209e-06,
"loss": 0.493,
"mean_token_accuracy": 0.840301775932312,
"step": 2175
},
{
"epoch": 2.14765596160945,
"grad_norm": 0.22145864367485046,
"learning_rate": 4.527992701535884e-06,
"loss": 0.4844,
"mean_token_accuracy": 0.8423144072294235,
"step": 2180
},
{
"epoch": 2.1525778269964317,
"grad_norm": 0.217710942029953,
"learning_rate": 4.480098988252958e-06,
"loss": 0.4919,
"mean_token_accuracy": 0.84017314016819,
"step": 2185
},
{
"epoch": 2.1574996923834133,
"grad_norm": 0.2169259786605835,
"learning_rate": 4.432386688105095e-06,
"loss": 0.4929,
"mean_token_accuracy": 0.840173925459385,
"step": 2190
},
{
"epoch": 2.162421557770395,
"grad_norm": 0.21104402840137482,
"learning_rate": 4.384857369170772e-06,
"loss": 0.4875,
"mean_token_accuracy": 0.8417868033051491,
"step": 2195
},
{
"epoch": 2.1673434231573765,
"grad_norm": 0.21658702194690704,
"learning_rate": 4.337512593514729e-06,
"loss": 0.4947,
"mean_token_accuracy": 0.8395476669073105,
"step": 2200
},
{
"epoch": 2.1722652885443585,
"grad_norm": 0.22858913242816925,
"learning_rate": 4.290353917136639e-06,
"loss": 0.4901,
"mean_token_accuracy": 0.8408517464995384,
"step": 2205
},
{
"epoch": 2.17718715393134,
"grad_norm": 0.4094144105911255,
"learning_rate": 4.243382889919981e-06,
"loss": 0.496,
"mean_token_accuracy": 0.8392629832029342,
"step": 2210
},
{
"epoch": 2.1821090193183217,
"grad_norm": 0.21924547851085663,
"learning_rate": 4.1966010555810696e-06,
"loss": 0.4899,
"mean_token_accuracy": 0.841227824985981,
"step": 2215
},
{
"epoch": 2.1870308847053033,
"grad_norm": 0.21283064782619476,
"learning_rate": 4.1500099516183555e-06,
"loss": 0.4913,
"mean_token_accuracy": 0.8405321702361107,
"step": 2220
},
{
"epoch": 2.191952750092285,
"grad_norm": 0.21150268614292145,
"learning_rate": 4.1036111092618725e-06,
"loss": 0.4895,
"mean_token_accuracy": 0.8410715743899345,
"step": 2225
},
{
"epoch": 2.1968746154792664,
"grad_norm": 0.20887652039527893,
"learning_rate": 4.057406053422933e-06,
"loss": 0.4935,
"mean_token_accuracy": 0.8398977249860764,
"step": 2230
},
{
"epoch": 2.2017964808662485,
"grad_norm": 0.20756816864013672,
"learning_rate": 4.011396302643989e-06,
"loss": 0.4846,
"mean_token_accuracy": 0.842858923971653,
"step": 2235
},
{
"epoch": 2.20671834625323,
"grad_norm": 0.23419924080371857,
"learning_rate": 3.965583369048737e-06,
"loss": 0.4963,
"mean_token_accuracy": 0.8392103880643844,
"step": 2240
},
{
"epoch": 2.2116402116402116,
"grad_norm": 0.21532607078552246,
"learning_rate": 3.919968758292425e-06,
"loss": 0.4883,
"mean_token_accuracy": 0.8413224458694458,
"step": 2245
},
{
"epoch": 2.216562077027193,
"grad_norm": 0.2164084017276764,
"learning_rate": 3.874553969512358e-06,
"loss": 0.4885,
"mean_token_accuracy": 0.8415488794445991,
"step": 2250
},
{
"epoch": 2.221483942414175,
"grad_norm": 0.21010589599609375,
"learning_rate": 3.82934049527864e-06,
"loss": 0.4918,
"mean_token_accuracy": 0.8404750242829323,
"step": 2255
},
{
"epoch": 2.226405807801157,
"grad_norm": 0.20962242782115936,
"learning_rate": 3.784329821545105e-06,
"loss": 0.4962,
"mean_token_accuracy": 0.839095975458622,
"step": 2260
},
{
"epoch": 2.2313276731881384,
"grad_norm": 0.20551133155822754,
"learning_rate": 3.739523427600509e-06,
"loss": 0.4911,
"mean_token_accuracy": 0.8407798200845719,
"step": 2265
},
{
"epoch": 2.23624953857512,
"grad_norm": 0.21332746744155884,
"learning_rate": 3.6949227860198712e-06,
"loss": 0.492,
"mean_token_accuracy": 0.8405194252729415,
"step": 2270
},
{
"epoch": 2.2411714039621016,
"grad_norm": 0.26087722182273865,
"learning_rate": 3.650529362616113e-06,
"loss": 0.4875,
"mean_token_accuracy": 0.8417001351714134,
"step": 2275
},
{
"epoch": 2.246093269349083,
"grad_norm": 0.20974403619766235,
"learning_rate": 3.606344616391867e-06,
"loss": 0.4938,
"mean_token_accuracy": 0.8395893201231956,
"step": 2280
},
{
"epoch": 2.2510151347360647,
"grad_norm": 0.22249352931976318,
"learning_rate": 3.5623699994915363e-06,
"loss": 0.4916,
"mean_token_accuracy": 0.840800578892231,
"step": 2285
},
{
"epoch": 2.2559370001230468,
"grad_norm": 0.20673160254955292,
"learning_rate": 3.5186069571535575e-06,
"loss": 0.4876,
"mean_token_accuracy": 0.8417642295360566,
"step": 2290
},
{
"epoch": 2.2608588655100283,
"grad_norm": 0.2050849050283432,
"learning_rate": 3.475056927662912e-06,
"loss": 0.4922,
"mean_token_accuracy": 0.8401932448148728,
"step": 2295
},
{
"epoch": 2.26578073089701,
"grad_norm": 0.2113514542579651,
"learning_rate": 3.4317213423038386e-06,
"loss": 0.4925,
"mean_token_accuracy": 0.8401719897985458,
"step": 2300
},
{
"epoch": 2.2707025962839915,
"grad_norm": 0.21461407840251923,
"learning_rate": 3.388601625312833e-06,
"loss": 0.4892,
"mean_token_accuracy": 0.841229310631752,
"step": 2305
},
{
"epoch": 2.275624461670973,
"grad_norm": 0.20549601316452026,
"learning_rate": 3.345699193831795e-06,
"loss": 0.4917,
"mean_token_accuracy": 0.8405207619071007,
"step": 2310
},
{
"epoch": 2.280546327057955,
"grad_norm": 0.21262629330158234,
"learning_rate": 3.3030154578614783e-06,
"loss": 0.4898,
"mean_token_accuracy": 0.8410497605800629,
"step": 2315
},
{
"epoch": 2.2854681924449367,
"grad_norm": 0.2351827323436737,
"learning_rate": 3.2605518202151577e-06,
"loss": 0.4945,
"mean_token_accuracy": 0.8394208237528801,
"step": 2320
},
{
"epoch": 2.2903900578319183,
"grad_norm": 0.21704116463661194,
"learning_rate": 3.218309676472492e-06,
"loss": 0.489,
"mean_token_accuracy": 0.8411409676074981,
"step": 2325
},
{
"epoch": 2.2953119232189,
"grad_norm": 0.20750364661216736,
"learning_rate": 3.1762904149336947e-06,
"loss": 0.4942,
"mean_token_accuracy": 0.8396940395236016,
"step": 2330
},
{
"epoch": 2.3002337886058815,
"grad_norm": 0.20055250823497772,
"learning_rate": 3.134495416573884e-06,
"loss": 0.4871,
"mean_token_accuracy": 0.8417407006025315,
"step": 2335
},
{
"epoch": 2.3051556539928635,
"grad_norm": 0.20621967315673828,
"learning_rate": 3.0929260549977116e-06,
"loss": 0.4883,
"mean_token_accuracy": 0.8415425732731819,
"step": 2340
},
{
"epoch": 2.310077519379845,
"grad_norm": 0.210305854678154,
"learning_rate": 3.0515836963942056e-06,
"loss": 0.4929,
"mean_token_accuracy": 0.8403278931975364,
"step": 2345
},
{
"epoch": 2.3149993847668267,
"grad_norm": 0.25147390365600586,
"learning_rate": 3.01046969949188e-06,
"loss": 0.4909,
"mean_token_accuracy": 0.8407050803303718,
"step": 2350
},
{
"epoch": 2.3199212501538082,
"grad_norm": 0.21020571887493134,
"learning_rate": 2.9695854155140648e-06,
"loss": 0.4895,
"mean_token_accuracy": 0.8410211369395256,
"step": 2355
},
{
"epoch": 2.32484311554079,
"grad_norm": 0.21094508469104767,
"learning_rate": 2.9289321881345257e-06,
"loss": 0.4889,
"mean_token_accuracy": 0.841056476533413,
"step": 2360
},
{
"epoch": 2.329764980927772,
"grad_norm": 0.21813294291496277,
"learning_rate": 2.8885113534332742e-06,
"loss": 0.4928,
"mean_token_accuracy": 0.8402146637439728,
"step": 2365
},
{
"epoch": 2.3346868463147534,
"grad_norm": 0.21038471162319183,
"learning_rate": 2.8483242398526723e-06,
"loss": 0.4875,
"mean_token_accuracy": 0.8416903391480446,
"step": 2370
},
{
"epoch": 2.339608711701735,
"grad_norm": 0.21476763486862183,
"learning_rate": 2.80837216815378e-06,
"loss": 0.4883,
"mean_token_accuracy": 0.8410104081034661,
"step": 2375
},
{
"epoch": 2.3445305770887166,
"grad_norm": 0.2148827761411667,
"learning_rate": 2.7686564513729198e-06,
"loss": 0.4938,
"mean_token_accuracy": 0.8401752710342407,
"step": 2380
},
{
"epoch": 2.349452442475698,
"grad_norm": 0.20347550511360168,
"learning_rate": 2.7291783947785544e-06,
"loss": 0.4891,
"mean_token_accuracy": 0.841368468105793,
"step": 2385
},
{
"epoch": 2.35437430786268,
"grad_norm": 0.2156437486410141,
"learning_rate": 2.689939295828371e-06,
"loss": 0.4926,
"mean_token_accuracy": 0.8401880413293839,
"step": 2390
},
{
"epoch": 2.359296173249662,
"grad_norm": 0.20905110239982605,
"learning_rate": 2.650940444126654e-06,
"loss": 0.4915,
"mean_token_accuracy": 0.8407162860035896,
"step": 2395
},
{
"epoch": 2.3642180386366434,
"grad_norm": 0.20476758480072021,
"learning_rate": 2.6121831213818825e-06,
"loss": 0.4932,
"mean_token_accuracy": 0.840287271142006,
"step": 2400
},
{
"epoch": 2.369139904023625,
"grad_norm": 0.1986178457736969,
"learning_rate": 2.5736686013646226e-06,
"loss": 0.4857,
"mean_token_accuracy": 0.8420573100447655,
"step": 2405
},
{
"epoch": 2.3740617694106065,
"grad_norm": 0.21784992516040802,
"learning_rate": 2.535398149865651e-06,
"loss": 0.4888,
"mean_token_accuracy": 0.8410965353250504,
"step": 2410
},
{
"epoch": 2.378983634797588,
"grad_norm": 0.20018485188484192,
"learning_rate": 2.4973730246543736e-06,
"loss": 0.4913,
"mean_token_accuracy": 0.8406006515026092,
"step": 2415
},
{
"epoch": 2.3839055001845697,
"grad_norm": 0.21187762916088104,
"learning_rate": 2.4595944754374723e-06,
"loss": 0.4972,
"mean_token_accuracy": 0.8388384222984314,
"step": 2420
},
{
"epoch": 2.3888273655715517,
"grad_norm": 0.2048918604850769,
"learning_rate": 2.422063743817832e-06,
"loss": 0.4936,
"mean_token_accuracy": 0.8397043973207474,
"step": 2425
},
{
"epoch": 2.3937492309585333,
"grad_norm": 0.2068692445755005,
"learning_rate": 2.3847820632537565e-06,
"loss": 0.4973,
"mean_token_accuracy": 0.8392092302441597,
"step": 2430
},
{
"epoch": 2.398671096345515,
"grad_norm": 0.2050062119960785,
"learning_rate": 2.347750659018397e-06,
"loss": 0.4964,
"mean_token_accuracy": 0.8390960440039634,
"step": 2435
},
{
"epoch": 2.4035929617324965,
"grad_norm": 0.20241810381412506,
"learning_rate": 2.3109707481595113e-06,
"loss": 0.4826,
"mean_token_accuracy": 0.8431760326027871,
"step": 2440
},
{
"epoch": 2.408514827119478,
"grad_norm": 0.2023165076971054,
"learning_rate": 2.27444353945945e-06,
"loss": 0.484,
"mean_token_accuracy": 0.8427256375551224,
"step": 2445
},
{
"epoch": 2.41343669250646,
"grad_norm": 0.2395012527704239,
"learning_rate": 2.2381702333954436e-06,
"loss": 0.4843,
"mean_token_accuracy": 0.8425970792770385,
"step": 2450
},
{
"epoch": 2.4183585578934417,
"grad_norm": 0.20210982859134674,
"learning_rate": 2.2021520221001304e-06,
"loss": 0.488,
"mean_token_accuracy": 0.8415813356637954,
"step": 2455
},
{
"epoch": 2.4232804232804233,
"grad_norm": 0.2082945853471756,
"learning_rate": 2.16639008932239e-06,
"loss": 0.4937,
"mean_token_accuracy": 0.8398790895938874,
"step": 2460
},
{
"epoch": 2.428202288667405,
"grad_norm": 0.20752127468585968,
"learning_rate": 2.130885610388428e-06,
"loss": 0.4959,
"mean_token_accuracy": 0.839399340748787,
"step": 2465
},
{
"epoch": 2.4331241540543864,
"grad_norm": 0.20869506895542145,
"learning_rate": 2.0956397521631666e-06,
"loss": 0.4868,
"mean_token_accuracy": 0.8415920332074165,
"step": 2470
},
{
"epoch": 2.4380460194413685,
"grad_norm": 0.20477741956710815,
"learning_rate": 2.0606536730118767e-06,
"loss": 0.4829,
"mean_token_accuracy": 0.8429039210081101,
"step": 2475
},
{
"epoch": 2.44296788482835,
"grad_norm": 0.20474423468112946,
"learning_rate": 2.0259285227621152e-06,
"loss": 0.4981,
"mean_token_accuracy": 0.8382045805454255,
"step": 2480
},
{
"epoch": 2.4478897502153316,
"grad_norm": 0.20369385182857513,
"learning_rate": 1.9914654426659374e-06,
"loss": 0.4926,
"mean_token_accuracy": 0.839960803091526,
"step": 2485
},
{
"epoch": 2.452811615602313,
"grad_norm": 0.2068207710981369,
"learning_rate": 1.9572655653623884e-06,
"loss": 0.4935,
"mean_token_accuracy": 0.8397150009870529,
"step": 2490
},
{
"epoch": 2.457733480989295,
"grad_norm": 0.20661979913711548,
"learning_rate": 1.9233300148402767e-06,
"loss": 0.4924,
"mean_token_accuracy": 0.8401017665863038,
"step": 2495
},
{
"epoch": 2.462655346376277,
"grad_norm": 0.21355277299880981,
"learning_rate": 1.88965990640123e-06,
"loss": 0.487,
"mean_token_accuracy": 0.8420075699687004,
"step": 2500
},
{
"epoch": 2.4675772117632584,
"grad_norm": 0.209817573428154,
"learning_rate": 1.8562563466230577e-06,
"loss": 0.4924,
"mean_token_accuracy": 0.8402795165777206,
"step": 2505
},
{
"epoch": 2.47249907715024,
"grad_norm": 0.1972341388463974,
"learning_rate": 1.823120433323361e-06,
"loss": 0.4912,
"mean_token_accuracy": 0.8408435776829719,
"step": 2510
},
{
"epoch": 2.4774209425372216,
"grad_norm": 0.20761115849018097,
"learning_rate": 1.7902532555234653e-06,
"loss": 0.4977,
"mean_token_accuracy": 0.838873790204525,
"step": 2515
},
{
"epoch": 2.482342807924203,
"grad_norm": 0.22367697954177856,
"learning_rate": 1.757655893412622e-06,
"loss": 0.4876,
"mean_token_accuracy": 0.8413331776857376,
"step": 2520
},
{
"epoch": 2.487264673311185,
"grad_norm": 0.20876270532608032,
"learning_rate": 1.7253294183125223e-06,
"loss": 0.4901,
"mean_token_accuracy": 0.8411200374364853,
"step": 2525
},
{
"epoch": 2.4921865386981668,
"grad_norm": 0.20132075250148773,
"learning_rate": 1.6932748926420695e-06,
"loss": 0.4953,
"mean_token_accuracy": 0.8395631924271584,
"step": 2530
},
{
"epoch": 2.4971084040851483,
"grad_norm": 0.1999741941690445,
"learning_rate": 1.661493369882473e-06,
"loss": 0.4796,
"mean_token_accuracy": 0.843748077750206,
"step": 2535
},
{
"epoch": 2.50203026947213,
"grad_norm": 0.21044902503490448,
"learning_rate": 1.6299858945426251e-06,
"loss": 0.4856,
"mean_token_accuracy": 0.8423863723874092,
"step": 2540
},
{
"epoch": 2.5069521348591115,
"grad_norm": 0.19819578528404236,
"learning_rate": 1.5987535021247668e-06,
"loss": 0.4855,
"mean_token_accuracy": 0.8423318341374397,
"step": 2545
},
{
"epoch": 2.5118740002460935,
"grad_norm": 0.2015785425901413,
"learning_rate": 1.5677972190904623e-06,
"loss": 0.4873,
"mean_token_accuracy": 0.8417120486497879,
"step": 2550
},
{
"epoch": 2.5167958656330747,
"grad_norm": 0.20403100550174713,
"learning_rate": 1.537118062826859e-06,
"loss": 0.4809,
"mean_token_accuracy": 0.8435953631997108,
"step": 2555
},
{
"epoch": 2.5217177310200567,
"grad_norm": 0.2051580399274826,
"learning_rate": 1.5067170416132603e-06,
"loss": 0.4841,
"mean_token_accuracy": 0.842904870212078,
"step": 2560
},
{
"epoch": 2.5266395964070383,
"grad_norm": 0.20559805631637573,
"learning_rate": 1.4765951545879732e-06,
"loss": 0.4953,
"mean_token_accuracy": 0.8392938315868378,
"step": 2565
},
{
"epoch": 2.53156146179402,
"grad_norm": 0.21315298974514008,
"learning_rate": 1.4467533917154842e-06,
"loss": 0.4812,
"mean_token_accuracy": 0.8433891490101815,
"step": 2570
},
{
"epoch": 2.5364833271810014,
"grad_norm": 0.33885088562965393,
"learning_rate": 1.4171927337539103e-06,
"loss": 0.4925,
"mean_token_accuracy": 0.8398235127329826,
"step": 2575
},
{
"epoch": 2.541405192567983,
"grad_norm": 0.19653761386871338,
"learning_rate": 1.3879141522227878e-06,
"loss": 0.4903,
"mean_token_accuracy": 0.8408400386571884,
"step": 2580
},
{
"epoch": 2.546327057954965,
"grad_norm": 0.19870713353157043,
"learning_rate": 1.3589186093711227e-06,
"loss": 0.4811,
"mean_token_accuracy": 0.8433947190642357,
"step": 2585
},
{
"epoch": 2.5512489233419466,
"grad_norm": 0.20051565766334534,
"learning_rate": 1.3302070581457716e-06,
"loss": 0.4994,
"mean_token_accuracy": 0.838576278090477,
"step": 2590
},
{
"epoch": 2.5561707887289282,
"grad_norm": 0.2312447875738144,
"learning_rate": 1.3017804421601298e-06,
"loss": 0.492,
"mean_token_accuracy": 0.8404266074299812,
"step": 2595
},
{
"epoch": 2.56109265411591,
"grad_norm": 0.21526625752449036,
"learning_rate": 1.273639695663108e-06,
"loss": 0.4916,
"mean_token_accuracy": 0.8403177246451378,
"step": 2600
},
{
"epoch": 2.5660145195028914,
"grad_norm": 0.4974516034126282,
"learning_rate": 1.245785743508441e-06,
"loss": 0.4887,
"mean_token_accuracy": 0.8414172142744064,
"step": 2605
},
{
"epoch": 2.5709363848898734,
"grad_norm": 0.19956116378307343,
"learning_rate": 1.2182195011242747e-06,
"loss": 0.5017,
"mean_token_accuracy": 0.837465213239193,
"step": 2610
},
{
"epoch": 2.575858250276855,
"grad_norm": 0.19986701011657715,
"learning_rate": 1.1909418744831048e-06,
"loss": 0.4878,
"mean_token_accuracy": 0.8414024114608765,
"step": 2615
},
{
"epoch": 2.5807801156638366,
"grad_norm": 0.20174540579319,
"learning_rate": 1.1639537600719764e-06,
"loss": 0.4858,
"mean_token_accuracy": 0.8420050874352455,
"step": 2620
},
{
"epoch": 2.585701981050818,
"grad_norm": 0.20654183626174927,
"learning_rate": 1.1372560448630377e-06,
"loss": 0.4938,
"mean_token_accuracy": 0.8395126640796662,
"step": 2625
},
{
"epoch": 2.5906238464377997,
"grad_norm": 0.19598302245140076,
"learning_rate": 1.1108496062843743e-06,
"loss": 0.486,
"mean_token_accuracy": 0.8420949026942253,
"step": 2630
},
{
"epoch": 2.5955457118247818,
"grad_norm": 0.20486712455749512,
"learning_rate": 1.0847353121911952e-06,
"loss": 0.4891,
"mean_token_accuracy": 0.8409939989447593,
"step": 2635
},
{
"epoch": 2.6004675772117634,
"grad_norm": 0.2051970511674881,
"learning_rate": 1.0589140208372872e-06,
"loss": 0.4871,
"mean_token_accuracy": 0.8416621774435044,
"step": 2640
},
{
"epoch": 2.605389442598745,
"grad_norm": 0.20128969848155975,
"learning_rate": 1.0333865808468203e-06,
"loss": 0.4824,
"mean_token_accuracy": 0.8431450635194778,
"step": 2645
},
{
"epoch": 2.6103113079857265,
"grad_norm": 0.2007114738225937,
"learning_rate": 1.008153831186457e-06,
"loss": 0.4917,
"mean_token_accuracy": 0.8406037405133248,
"step": 2650
},
{
"epoch": 2.615233173372708,
"grad_norm": 0.19757139682769775,
"learning_rate": 9.83216601137773e-07,
"loss": 0.488,
"mean_token_accuracy": 0.8414921492338181,
"step": 2655
},
{
"epoch": 2.62015503875969,
"grad_norm": 0.21764694154262543,
"learning_rate": 9.58575710270011e-07,
"loss": 0.4819,
"mean_token_accuracy": 0.8431682124733925,
"step": 2660
},
{
"epoch": 2.6250769041466717,
"grad_norm": 0.20229902863502502,
"learning_rate": 9.342319684131396e-07,
"loss": 0.4916,
"mean_token_accuracy": 0.8404648944735527,
"step": 2665
},
{
"epoch": 2.6299987695336533,
"grad_norm": 0.22413024306297302,
"learning_rate": 9.101861756312369e-07,
"loss": 0.489,
"mean_token_accuracy": 0.8410172060132026,
"step": 2670
},
{
"epoch": 2.634920634920635,
"grad_norm": 0.1993047147989273,
"learning_rate": 8.864391221962065e-07,
"loss": 0.488,
"mean_token_accuracy": 0.841397476196289,
"step": 2675
},
{
"epoch": 2.6398425003076165,
"grad_norm": 0.20383085310459137,
"learning_rate": 8.629915885617912e-07,
"loss": 0.4906,
"mean_token_accuracy": 0.8405807599425316,
"step": 2680
},
{
"epoch": 2.6447643656945985,
"grad_norm": 0.19943130016326904,
"learning_rate": 8.398443453379268e-07,
"loss": 0.4872,
"mean_token_accuracy": 0.841593649983406,
"step": 2685
},
{
"epoch": 2.64968623108158,
"grad_norm": 0.19960327446460724,
"learning_rate": 8.169981532654269e-07,
"loss": 0.4854,
"mean_token_accuracy": 0.8422250881791115,
"step": 2690
},
{
"epoch": 2.6546080964685617,
"grad_norm": 0.20726507902145386,
"learning_rate": 7.944537631909666e-07,
"loss": 0.4855,
"mean_token_accuracy": 0.8422259956598281,
"step": 2695
},
{
"epoch": 2.6595299618555432,
"grad_norm": 0.19812346994876862,
"learning_rate": 7.722119160424113e-07,
"loss": 0.4867,
"mean_token_accuracy": 0.842007802426815,
"step": 2700
},
{
"epoch": 2.664451827242525,
"grad_norm": 0.19591908156871796,
"learning_rate": 7.502733428044684e-07,
"loss": 0.486,
"mean_token_accuracy": 0.8423181056976319,
"step": 2705
},
{
"epoch": 2.669373692629507,
"grad_norm": 0.195572167634964,
"learning_rate": 7.286387644946602e-07,
"loss": 0.4965,
"mean_token_accuracy": 0.8387840166687965,
"step": 2710
},
{
"epoch": 2.674295558016488,
"grad_norm": 0.2031807154417038,
"learning_rate": 7.073088921396287e-07,
"loss": 0.4907,
"mean_token_accuracy": 0.840399731695652,
"step": 2715
},
{
"epoch": 2.67921742340347,
"grad_norm": 0.2004314363002777,
"learning_rate": 6.862844267517643e-07,
"loss": 0.4846,
"mean_token_accuracy": 0.8423734799027442,
"step": 2720
},
{
"epoch": 2.6841392887904516,
"grad_norm": 0.20816642045974731,
"learning_rate": 6.655660593061719e-07,
"loss": 0.4982,
"mean_token_accuracy": 0.8385626211762428,
"step": 2725
},
{
"epoch": 2.689061154177433,
"grad_norm": 0.20351089537143707,
"learning_rate": 6.451544707179635e-07,
"loss": 0.4948,
"mean_token_accuracy": 0.8395294427871705,
"step": 2730
},
{
"epoch": 2.6939830195644148,
"grad_norm": 0.20076881349086761,
"learning_rate": 6.250503318198664e-07,
"loss": 0.4888,
"mean_token_accuracy": 0.8412301942706109,
"step": 2735
},
{
"epoch": 2.6989048849513964,
"grad_norm": 0.25244539976119995,
"learning_rate": 6.052543033401892e-07,
"loss": 0.4918,
"mean_token_accuracy": 0.8402833178639412,
"step": 2740
},
{
"epoch": 2.7038267503383784,
"grad_norm": 0.2058088779449463,
"learning_rate": 5.857670358811096e-07,
"loss": 0.4914,
"mean_token_accuracy": 0.8405940279364585,
"step": 2745
},
{
"epoch": 2.70874861572536,
"grad_norm": 0.2002749741077423,
"learning_rate": 5.665891698972769e-07,
"loss": 0.4956,
"mean_token_accuracy": 0.8391197189688683,
"step": 2750
},
{
"epoch": 2.7136704811123415,
"grad_norm": 0.19865228235721588,
"learning_rate": 5.477213356747746e-07,
"loss": 0.4894,
"mean_token_accuracy": 0.8410469844937325,
"step": 2755
},
{
"epoch": 2.718592346499323,
"grad_norm": 0.20059484243392944,
"learning_rate": 5.291641533104053e-07,
"loss": 0.4817,
"mean_token_accuracy": 0.8434463173151017,
"step": 2760
},
{
"epoch": 2.7235142118863047,
"grad_norm": 0.19962534308433533,
"learning_rate": 5.109182326913053e-07,
"loss": 0.4815,
"mean_token_accuracy": 0.8433682397007942,
"step": 2765
},
{
"epoch": 2.7284360772732867,
"grad_norm": 0.1976374238729477,
"learning_rate": 4.929841734749063e-07,
"loss": 0.4824,
"mean_token_accuracy": 0.8429444268345833,
"step": 2770
},
{
"epoch": 2.7333579426602683,
"grad_norm": 0.1919257491827011,
"learning_rate": 4.7536256506922507e-07,
"loss": 0.4858,
"mean_token_accuracy": 0.8420413583517075,
"step": 2775
},
{
"epoch": 2.73827980804725,
"grad_norm": 0.21447736024856567,
"learning_rate": 4.580539866134914e-07,
"loss": 0.4898,
"mean_token_accuracy": 0.8408365085721016,
"step": 2780
},
{
"epoch": 2.7432016734342315,
"grad_norm": 0.20053516328334808,
"learning_rate": 4.410590069591192e-07,
"loss": 0.4918,
"mean_token_accuracy": 0.8403174698352813,
"step": 2785
},
{
"epoch": 2.748123538821213,
"grad_norm": 0.3303152620792389,
"learning_rate": 4.2437818465100313e-07,
"loss": 0.4812,
"mean_token_accuracy": 0.8434215649962425,
"step": 2790
},
{
"epoch": 2.753045404208195,
"grad_norm": 0.194558247923851,
"learning_rate": 4.0801206790916815e-07,
"loss": 0.4804,
"mean_token_accuracy": 0.8438364923000335,
"step": 2795
},
{
"epoch": 2.7579672695951767,
"grad_norm": 0.19499559700489044,
"learning_rate": 3.919611946107493e-07,
"loss": 0.4825,
"mean_token_accuracy": 0.8429989367723465,
"step": 2800
},
{
"epoch": 2.7628891349821583,
"grad_norm": 0.19578364491462708,
"learning_rate": 3.762260922723182e-07,
"loss": 0.4866,
"mean_token_accuracy": 0.8416179150342942,
"step": 2805
},
{
"epoch": 2.76781100036914,
"grad_norm": 0.20279313623905182,
"learning_rate": 3.6080727803254003e-07,
"loss": 0.4913,
"mean_token_accuracy": 0.8406556889414787,
"step": 2810
},
{
"epoch": 2.7727328657561214,
"grad_norm": 0.20414599776268005,
"learning_rate": 3.457052586351817e-07,
"loss": 0.4921,
"mean_token_accuracy": 0.8403137296438217,
"step": 2815
},
{
"epoch": 2.7776547311431035,
"grad_norm": 0.20257827639579773,
"learning_rate": 3.309205304124552e-07,
"loss": 0.4888,
"mean_token_accuracy": 0.841057425737381,
"step": 2820
},
{
"epoch": 2.782576596530085,
"grad_norm": 0.19924387335777283,
"learning_rate": 3.1645357926870957e-07,
"loss": 0.4966,
"mean_token_accuracy": 0.8389097020030022,
"step": 2825
},
{
"epoch": 2.7874984619170666,
"grad_norm": 0.20351967215538025,
"learning_rate": 3.0230488066445465e-07,
"loss": 0.4912,
"mean_token_accuracy": 0.8404456153512001,
"step": 2830
},
{
"epoch": 2.792420327304048,
"grad_norm": 0.199168398976326,
"learning_rate": 2.8847489960074136e-07,
"loss": 0.4936,
"mean_token_accuracy": 0.8398653537034988,
"step": 2835
},
{
"epoch": 2.79734219269103,
"grad_norm": 0.19794094562530518,
"learning_rate": 2.7496409060387973e-07,
"loss": 0.4962,
"mean_token_accuracy": 0.8388495057821274,
"step": 2840
},
{
"epoch": 2.802264058078012,
"grad_norm": 0.19937507808208466,
"learning_rate": 2.6177289771049274e-07,
"loss": 0.4895,
"mean_token_accuracy": 0.8410208597779274,
"step": 2845
},
{
"epoch": 2.807185923464993,
"grad_norm": 0.19925516843795776,
"learning_rate": 2.489017544529315e-07,
"loss": 0.4875,
"mean_token_accuracy": 0.8415358811616898,
"step": 2850
},
{
"epoch": 2.812107788851975,
"grad_norm": 0.19592879712581635,
"learning_rate": 2.3635108384502003e-07,
"loss": 0.4949,
"mean_token_accuracy": 0.839320321381092,
"step": 2855
},
{
"epoch": 2.8170296542389566,
"grad_norm": 0.19561193883419037,
"learning_rate": 2.2412129836816287e-07,
"loss": 0.4913,
"mean_token_accuracy": 0.840375654399395,
"step": 2860
},
{
"epoch": 2.821951519625938,
"grad_norm": 0.1935349404811859,
"learning_rate": 2.1221279995777833e-07,
"loss": 0.4859,
"mean_token_accuracy": 0.8416187852621079,
"step": 2865
},
{
"epoch": 2.8268733850129197,
"grad_norm": 0.19886697828769684,
"learning_rate": 2.0062597999009114e-07,
"loss": 0.4821,
"mean_token_accuracy": 0.8432388514280319,
"step": 2870
},
{
"epoch": 2.8317952503999013,
"grad_norm": 0.19826510548591614,
"learning_rate": 1.8936121926927508e-07,
"loss": 0.49,
"mean_token_accuracy": 0.8409401133656502,
"step": 2875
},
{
"epoch": 2.8367171157868833,
"grad_norm": 0.21422724425792694,
"learning_rate": 1.7841888801493178e-07,
"loss": 0.4897,
"mean_token_accuracy": 0.840906199812889,
"step": 2880
},
{
"epoch": 2.841638981173865,
"grad_norm": 0.2021849900484085,
"learning_rate": 1.677993458499272e-07,
"loss": 0.4871,
"mean_token_accuracy": 0.8416887044906616,
"step": 2885
},
{
"epoch": 2.8465608465608465,
"grad_norm": 0.19902034103870392,
"learning_rate": 1.5750294178856872e-07,
"loss": 0.4884,
"mean_token_accuracy": 0.8414162322878838,
"step": 2890
},
{
"epoch": 2.851482711947828,
"grad_norm": 0.19861221313476562,
"learning_rate": 1.4753001422514125e-07,
"loss": 0.4926,
"mean_token_accuracy": 0.8401012614369392,
"step": 2895
},
{
"epoch": 2.8564045773348097,
"grad_norm": 0.19735361635684967,
"learning_rate": 1.378808909227769e-07,
"loss": 0.4849,
"mean_token_accuracy": 0.8422791570425033,
"step": 2900
},
{
"epoch": 2.8613264427217917,
"grad_norm": 0.20118270814418793,
"learning_rate": 1.2855588900269057e-07,
"loss": 0.4912,
"mean_token_accuracy": 0.8406861796975136,
"step": 2905
},
{
"epoch": 2.8662483081087733,
"grad_norm": 0.19249391555786133,
"learning_rate": 1.1955531493375137e-07,
"loss": 0.4795,
"mean_token_accuracy": 0.8438849881291389,
"step": 2910
},
{
"epoch": 2.871170173495755,
"grad_norm": 0.19686251878738403,
"learning_rate": 1.1087946452241871e-07,
"loss": 0.4937,
"mean_token_accuracy": 0.8399393901228904,
"step": 2915
},
{
"epoch": 2.8760920388827365,
"grad_norm": 0.1956326812505722,
"learning_rate": 1.0252862290301092e-07,
"loss": 0.4887,
"mean_token_accuracy": 0.841577798128128,
"step": 2920
},
{
"epoch": 2.881013904269718,
"grad_norm": 0.2053905874490738,
"learning_rate": 9.45030645283418e-08,
"loss": 0.4897,
"mean_token_accuracy": 0.8410707041621208,
"step": 2925
},
{
"epoch": 2.8859357696567,
"grad_norm": 0.19495834410190582,
"learning_rate": 8.68030531606967e-08,
"loss": 0.4927,
"mean_token_accuracy": 0.8402184978127479,
"step": 2930
},
{
"epoch": 2.8908576350436817,
"grad_norm": 0.1992396116256714,
"learning_rate": 7.94288418631639e-08,
"loss": 0.4857,
"mean_token_accuracy": 0.842261828482151,
"step": 2935
},
{
"epoch": 2.8957795004306632,
"grad_norm": 0.20448440313339233,
"learning_rate": 7.238067299131901e-08,
"loss": 0.4907,
"mean_token_accuracy": 0.841072927415371,
"step": 2940
},
{
"epoch": 2.900701365817645,
"grad_norm": 0.19940471649169922,
"learning_rate": 6.565877818526245e-08,
"loss": 0.4886,
"mean_token_accuracy": 0.8412072688341141,
"step": 2945
},
{
"epoch": 2.9056232312046264,
"grad_norm": 0.19256047904491425,
"learning_rate": 5.926337836199891e-08,
"loss": 0.4867,
"mean_token_accuracy": 0.8416444838047028,
"step": 2950
},
{
"epoch": 2.9105450965916084,
"grad_norm": 0.19797919690608978,
"learning_rate": 5.319468370818537e-08,
"loss": 0.4897,
"mean_token_accuracy": 0.8410748258233071,
"step": 2955
},
{
"epoch": 2.91546696197859,
"grad_norm": 0.1998082846403122,
"learning_rate": 4.7452893673216596e-08,
"loss": 0.4845,
"mean_token_accuracy": 0.8427498519420624,
"step": 2960
},
{
"epoch": 2.9203888273655716,
"grad_norm": 0.19540701806545258,
"learning_rate": 4.203819696267486e-08,
"loss": 0.4907,
"mean_token_accuracy": 0.8408638656139373,
"step": 2965
},
{
"epoch": 2.925310692752553,
"grad_norm": 0.19913552701473236,
"learning_rate": 3.6950771532126004e-08,
"loss": 0.4983,
"mean_token_accuracy": 0.8385754480957985,
"step": 2970
},
{
"epoch": 2.9302325581395348,
"grad_norm": 0.19257843494415283,
"learning_rate": 3.2190784581270786e-08,
"loss": 0.4878,
"mean_token_accuracy": 0.841645573079586,
"step": 2975
},
{
"epoch": 2.935154423526517,
"grad_norm": 0.19568364322185516,
"learning_rate": 2.7758392548449253e-08,
"loss": 0.4891,
"mean_token_accuracy": 0.8412896126508713,
"step": 2980
},
{
"epoch": 2.9400762889134984,
"grad_norm": 0.20067226886749268,
"learning_rate": 2.3653741105499338e-08,
"loss": 0.4836,
"mean_token_accuracy": 0.8427690804004669,
"step": 2985
},
{
"epoch": 2.94499815430048,
"grad_norm": 0.19799287617206573,
"learning_rate": 1.9876965152975102e-08,
"loss": 0.4895,
"mean_token_accuracy": 0.8405489608645439,
"step": 2990
},
{
"epoch": 2.9499200196874615,
"grad_norm": 1.0325350761413574,
"learning_rate": 1.6428188815703627e-08,
"loss": 0.4896,
"mean_token_accuracy": 0.8411920800805092,
"step": 2995
},
{
"epoch": 2.954841885074443,
"grad_norm": 0.1966339498758316,
"learning_rate": 1.3307525438711611e-08,
"loss": 0.488,
"mean_token_accuracy": 0.841396550834179,
"step": 3000
},
{
"epoch": 2.959763750461425,
"grad_norm": 0.2234841138124466,
"learning_rate": 1.0515077583498346e-08,
"loss": 0.4911,
"mean_token_accuracy": 0.8406392633914948,
"step": 3005
},
{
"epoch": 2.9646856158484063,
"grad_norm": 0.27488455176353455,
"learning_rate": 8.050937024666195e-09,
"loss": 0.4942,
"mean_token_accuracy": 0.8396434351801872,
"step": 3010
},
{
"epoch": 2.9696074812353883,
"grad_norm": 0.1911349892616272,
"learning_rate": 5.9151847469041125e-09,
"loss": 0.4823,
"mean_token_accuracy": 0.8430395260453224,
"step": 3015
},
{
"epoch": 2.97452934662237,
"grad_norm": 0.19882096350193024,
"learning_rate": 4.1078909423253325e-09,
"loss": 0.4995,
"mean_token_accuracy": 0.8379872292280197,
"step": 3020
},
{
"epoch": 2.9794512120093515,
"grad_norm": 0.20069076120853424,
"learning_rate": 2.629115008160321e-09,
"loss": 0.4964,
"mean_token_accuracy": 0.8388297706842422,
"step": 3025
},
{
"epoch": 2.984373077396333,
"grad_norm": 0.19437766075134277,
"learning_rate": 1.4789055448061195e-09,
"loss": 0.4851,
"mean_token_accuracy": 0.8421405225992202,
"step": 3030
},
{
"epoch": 2.9892949427833146,
"grad_norm": 0.19950829446315765,
"learning_rate": 6.573003542276191e-10,
"loss": 0.4889,
"mean_token_accuracy": 0.8408236041665077,
"step": 3035
},
{
"epoch": 2.9942168081702967,
"grad_norm": 0.19173409044742584,
"learning_rate": 1.6432643871633346e-10,
"loss": 0.4873,
"mean_token_accuracy": 0.8419449985027313,
"step": 3040
},
{
"epoch": 2.9991386735572783,
"grad_norm": 0.1980327069759369,
"learning_rate": 0.0,
"loss": 0.4895,
"mean_token_accuracy": 0.8409327268600464,
"step": 3045
},
{
"epoch": 2.9991386735572783,
"step": 3045,
"total_flos": 2550348896010240.0,
"train_loss": 0.5881131024979214,
"train_runtime": 268544.791,
"train_samples_per_second": 1.452,
"train_steps_per_second": 0.011
}
],
"logging_steps": 5,
"max_steps": 3045,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2550348896010240.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}