|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.9991386735572783, |
|
"eval_steps": 100, |
|
"global_step": 3045, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.004921865386981666, |
|
"grad_norm": 10.908417701721191, |
|
"learning_rate": 3.278688524590164e-07, |
|
"loss": 2.6851, |
|
"mean_token_accuracy": 0.490550322830677, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.009843730773963333, |
|
"grad_norm": 10.821477890014648, |
|
"learning_rate": 6.557377049180328e-07, |
|
"loss": 2.6916, |
|
"mean_token_accuracy": 0.4892874449491501, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.014765596160944998, |
|
"grad_norm": 9.100831031799316, |
|
"learning_rate": 9.836065573770493e-07, |
|
"loss": 2.6563, |
|
"mean_token_accuracy": 0.49268135130405427, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.019687461547926666, |
|
"grad_norm": 6.744043827056885, |
|
"learning_rate": 1.3114754098360657e-06, |
|
"loss": 2.4838, |
|
"mean_token_accuracy": 0.503991749882698, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.02460932693490833, |
|
"grad_norm": 4.111428737640381, |
|
"learning_rate": 1.6393442622950819e-06, |
|
"loss": 2.3481, |
|
"mean_token_accuracy": 0.5121142826974392, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.029531192321889995, |
|
"grad_norm": 3.504826068878174, |
|
"learning_rate": 1.9672131147540985e-06, |
|
"loss": 2.1834, |
|
"mean_token_accuracy": 0.525759468972683, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.034453057708871665, |
|
"grad_norm": 2.371668577194214, |
|
"learning_rate": 2.295081967213115e-06, |
|
"loss": 1.9992, |
|
"mean_token_accuracy": 0.5471328645944595, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.03937492309585333, |
|
"grad_norm": 1.910736083984375, |
|
"learning_rate": 2.6229508196721314e-06, |
|
"loss": 1.8619, |
|
"mean_token_accuracy": 0.5657269343733787, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.044296788482835, |
|
"grad_norm": 1.6694586277008057, |
|
"learning_rate": 2.9508196721311478e-06, |
|
"loss": 1.7324, |
|
"mean_token_accuracy": 0.582801228761673, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.04921865386981666, |
|
"grad_norm": 1.3371120691299438, |
|
"learning_rate": 3.2786885245901638e-06, |
|
"loss": 1.5922, |
|
"mean_token_accuracy": 0.6066210582852364, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.054140519256798324, |
|
"grad_norm": 1.153715968132019, |
|
"learning_rate": 3.6065573770491806e-06, |
|
"loss": 1.4607, |
|
"mean_token_accuracy": 0.629358272254467, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.05906238464377999, |
|
"grad_norm": 1.011682391166687, |
|
"learning_rate": 3.934426229508197e-06, |
|
"loss": 1.3312, |
|
"mean_token_accuracy": 0.6534152328968048, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.06398425003076166, |
|
"grad_norm": 0.8580278158187866, |
|
"learning_rate": 4.2622950819672135e-06, |
|
"loss": 1.2163, |
|
"mean_token_accuracy": 0.676006656885147, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.06890611541774333, |
|
"grad_norm": 0.7737818360328674, |
|
"learning_rate": 4.59016393442623e-06, |
|
"loss": 1.1256, |
|
"mean_token_accuracy": 0.695121419429779, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.073827980804725, |
|
"grad_norm": 0.6026164889335632, |
|
"learning_rate": 4.918032786885246e-06, |
|
"loss": 1.0456, |
|
"mean_token_accuracy": 0.7120692700147628, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.07874984619170666, |
|
"grad_norm": 20.797266006469727, |
|
"learning_rate": 5.245901639344263e-06, |
|
"loss": 0.9884, |
|
"mean_token_accuracy": 0.7246918171644211, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.08367171157868833, |
|
"grad_norm": 24.53761100769043, |
|
"learning_rate": 5.573770491803278e-06, |
|
"loss": 0.9471, |
|
"mean_token_accuracy": 0.7344574183225632, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.08859357696567, |
|
"grad_norm": 7.69836950302124, |
|
"learning_rate": 5.9016393442622956e-06, |
|
"loss": 0.9291, |
|
"mean_token_accuracy": 0.7384938269853591, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.09351544235265165, |
|
"grad_norm": 0.42971891164779663, |
|
"learning_rate": 6.229508196721312e-06, |
|
"loss": 0.9071, |
|
"mean_token_accuracy": 0.743149445950985, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.09843730773963331, |
|
"grad_norm": 0.4011496901512146, |
|
"learning_rate": 6.5573770491803276e-06, |
|
"loss": 0.8839, |
|
"mean_token_accuracy": 0.7489838138222694, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.10335917312661498, |
|
"grad_norm": 0.4182426631450653, |
|
"learning_rate": 6.885245901639345e-06, |
|
"loss": 0.864, |
|
"mean_token_accuracy": 0.7533508613705635, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.10828103851359665, |
|
"grad_norm": 0.4418739080429077, |
|
"learning_rate": 7.213114754098361e-06, |
|
"loss": 0.8461, |
|
"mean_token_accuracy": 0.7571793958544731, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.11320290390057831, |
|
"grad_norm": 4.76384973526001, |
|
"learning_rate": 7.540983606557377e-06, |
|
"loss": 0.8478, |
|
"mean_token_accuracy": 0.7560782924294471, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.11812476928755998, |
|
"grad_norm": 0.426782488822937, |
|
"learning_rate": 7.868852459016394e-06, |
|
"loss": 0.8262, |
|
"mean_token_accuracy": 0.7621309965848923, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.12304663467454165, |
|
"grad_norm": 3.5404343605041504, |
|
"learning_rate": 8.19672131147541e-06, |
|
"loss": 0.8239, |
|
"mean_token_accuracy": 0.7624999329447746, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.12796850006152333, |
|
"grad_norm": 0.6128109097480774, |
|
"learning_rate": 8.524590163934427e-06, |
|
"loss": 0.8125, |
|
"mean_token_accuracy": 0.7650709196925163, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.132890365448505, |
|
"grad_norm": 0.4441392719745636, |
|
"learning_rate": 8.852459016393443e-06, |
|
"loss": 0.8178, |
|
"mean_token_accuracy": 0.7635303542017937, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.13781223083548666, |
|
"grad_norm": 0.6959536075592041, |
|
"learning_rate": 9.18032786885246e-06, |
|
"loss": 0.797, |
|
"mean_token_accuracy": 0.7682553365826607, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.14273409622246833, |
|
"grad_norm": 0.4633159935474396, |
|
"learning_rate": 9.508196721311476e-06, |
|
"loss": 0.7972, |
|
"mean_token_accuracy": 0.7677757993340493, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.14765596160945, |
|
"grad_norm": 0.3808494806289673, |
|
"learning_rate": 9.836065573770493e-06, |
|
"loss": 0.7956, |
|
"mean_token_accuracy": 0.7682796508073807, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.15257782699643166, |
|
"grad_norm": 1.2230223417282104, |
|
"learning_rate": 1.0163934426229509e-05, |
|
"loss": 0.7714, |
|
"mean_token_accuracy": 0.7741705477237701, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.15749969238341333, |
|
"grad_norm": 1.2708261013031006, |
|
"learning_rate": 1.0491803278688525e-05, |
|
"loss": 0.7671, |
|
"mean_token_accuracy": 0.7750522747635842, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.162421557770395, |
|
"grad_norm": 0.4153311252593994, |
|
"learning_rate": 1.0819672131147544e-05, |
|
"loss": 0.762, |
|
"mean_token_accuracy": 0.776003035902977, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.16734342315737666, |
|
"grad_norm": 0.48690149188041687, |
|
"learning_rate": 1.1147540983606557e-05, |
|
"loss": 0.7611, |
|
"mean_token_accuracy": 0.776053948700428, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.17226528854435832, |
|
"grad_norm": 0.3839600682258606, |
|
"learning_rate": 1.1475409836065575e-05, |
|
"loss": 0.7518, |
|
"mean_token_accuracy": 0.7784286484122276, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.17718715393134, |
|
"grad_norm": 0.33650702238082886, |
|
"learning_rate": 1.1803278688524591e-05, |
|
"loss": 0.7425, |
|
"mean_token_accuracy": 0.7807790979743003, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.18210901931832166, |
|
"grad_norm": 0.34878674149513245, |
|
"learning_rate": 1.2131147540983608e-05, |
|
"loss": 0.7469, |
|
"mean_token_accuracy": 0.779270826280117, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.1870308847053033, |
|
"grad_norm": 0.4435058534145355, |
|
"learning_rate": 1.2459016393442624e-05, |
|
"loss": 0.7414, |
|
"mean_token_accuracy": 0.7804962411522866, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.19195275009228496, |
|
"grad_norm": 0.34793269634246826, |
|
"learning_rate": 1.2786885245901642e-05, |
|
"loss": 0.7368, |
|
"mean_token_accuracy": 0.7817707493901253, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.19687461547926663, |
|
"grad_norm": 0.32821062207221985, |
|
"learning_rate": 1.3114754098360655e-05, |
|
"loss": 0.7309, |
|
"mean_token_accuracy": 0.7830819576978684, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.2017964808662483, |
|
"grad_norm": 0.3908160626888275, |
|
"learning_rate": 1.3442622950819673e-05, |
|
"loss": 0.7349, |
|
"mean_token_accuracy": 0.7820746794342994, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.20671834625322996, |
|
"grad_norm": 1.239039659500122, |
|
"learning_rate": 1.377049180327869e-05, |
|
"loss": 0.7315, |
|
"mean_token_accuracy": 0.7830250725150109, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.21164021164021163, |
|
"grad_norm": 0.437558650970459, |
|
"learning_rate": 1.4098360655737706e-05, |
|
"loss": 0.7213, |
|
"mean_token_accuracy": 0.785545514523983, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.2165620770271933, |
|
"grad_norm": 0.3581276535987854, |
|
"learning_rate": 1.4426229508196722e-05, |
|
"loss": 0.7156, |
|
"mean_token_accuracy": 0.7868386089801789, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.22148394241417496, |
|
"grad_norm": 0.393839031457901, |
|
"learning_rate": 1.4754098360655739e-05, |
|
"loss": 0.7108, |
|
"mean_token_accuracy": 0.7875275865197182, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.22640580780115663, |
|
"grad_norm": 0.4203226566314697, |
|
"learning_rate": 1.5081967213114754e-05, |
|
"loss": 0.7115, |
|
"mean_token_accuracy": 0.7875282734632492, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.2313276731881383, |
|
"grad_norm": 0.4379311501979828, |
|
"learning_rate": 1.5409836065573772e-05, |
|
"loss": 0.7176, |
|
"mean_token_accuracy": 0.7859495177865028, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.23624953857511996, |
|
"grad_norm": 0.5987364053726196, |
|
"learning_rate": 1.5737704918032788e-05, |
|
"loss": 0.7047, |
|
"mean_token_accuracy": 0.7892461016774177, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.24117140396210163, |
|
"grad_norm": 0.39721059799194336, |
|
"learning_rate": 1.6065573770491805e-05, |
|
"loss": 0.7082, |
|
"mean_token_accuracy": 0.7879156336188317, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.2460932693490833, |
|
"grad_norm": 0.35150638222694397, |
|
"learning_rate": 1.639344262295082e-05, |
|
"loss": 0.7015, |
|
"mean_token_accuracy": 0.7899731829762459, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.25101513473606496, |
|
"grad_norm": 0.37812677025794983, |
|
"learning_rate": 1.6721311475409837e-05, |
|
"loss": 0.7112, |
|
"mean_token_accuracy": 0.7869908154010773, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.25593700012304665, |
|
"grad_norm": 0.37921008467674255, |
|
"learning_rate": 1.7049180327868854e-05, |
|
"loss": 0.695, |
|
"mean_token_accuracy": 0.7912393018603325, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.2608588655100283, |
|
"grad_norm": 0.3776193857192993, |
|
"learning_rate": 1.737704918032787e-05, |
|
"loss": 0.6975, |
|
"mean_token_accuracy": 0.7903847828507423, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.26578073089701, |
|
"grad_norm": 0.34160885214805603, |
|
"learning_rate": 1.7704918032786887e-05, |
|
"loss": 0.7005, |
|
"mean_token_accuracy": 0.7901133581995964, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.2707025962839916, |
|
"grad_norm": 0.3151760399341583, |
|
"learning_rate": 1.8032786885245903e-05, |
|
"loss": 0.6838, |
|
"mean_token_accuracy": 0.7940751999616623, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.2756244616709733, |
|
"grad_norm": 0.3251655101776123, |
|
"learning_rate": 1.836065573770492e-05, |
|
"loss": 0.683, |
|
"mean_token_accuracy": 0.7942519947886467, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.28054632705795496, |
|
"grad_norm": 0.392980694770813, |
|
"learning_rate": 1.8688524590163936e-05, |
|
"loss": 0.6779, |
|
"mean_token_accuracy": 0.7953907087445259, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.28546819244493665, |
|
"grad_norm": 0.42777085304260254, |
|
"learning_rate": 1.9016393442622952e-05, |
|
"loss": 0.696, |
|
"mean_token_accuracy": 0.7913835749030114, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.2903900578319183, |
|
"grad_norm": 0.38064613938331604, |
|
"learning_rate": 1.934426229508197e-05, |
|
"loss": 0.6777, |
|
"mean_token_accuracy": 0.79527537971735, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.2953119232189, |
|
"grad_norm": 0.35906219482421875, |
|
"learning_rate": 1.9672131147540985e-05, |
|
"loss": 0.6772, |
|
"mean_token_accuracy": 0.7954441845417023, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.3002337886058816, |
|
"grad_norm": 0.4336443543434143, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6672, |
|
"mean_token_accuracy": 0.7982369065284729, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.3051556539928633, |
|
"grad_norm": 0.35013464093208313, |
|
"learning_rate": 1.9999835673561284e-05, |
|
"loss": 0.6823, |
|
"mean_token_accuracy": 0.7940784975886345, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.31007751937984496, |
|
"grad_norm": 0.4209573566913605, |
|
"learning_rate": 1.9999342699645774e-05, |
|
"loss": 0.6705, |
|
"mean_token_accuracy": 0.7970875754952431, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.31499938476682665, |
|
"grad_norm": 0.3402932584285736, |
|
"learning_rate": 1.9998521094455198e-05, |
|
"loss": 0.6733, |
|
"mean_token_accuracy": 0.7962517961859703, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.3199212501538083, |
|
"grad_norm": 0.3613898456096649, |
|
"learning_rate": 1.9997370884991842e-05, |
|
"loss": 0.6659, |
|
"mean_token_accuracy": 0.7986094921827316, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.32484311554079, |
|
"grad_norm": 0.8141839504241943, |
|
"learning_rate": 1.9995892109057675e-05, |
|
"loss": 0.6682, |
|
"mean_token_accuracy": 0.7979325890541077, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.3297649809277716, |
|
"grad_norm": 0.32822492718696594, |
|
"learning_rate": 1.99940848152531e-05, |
|
"loss": 0.6592, |
|
"mean_token_accuracy": 0.799762362241745, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.3346868463147533, |
|
"grad_norm": 0.32193639874458313, |
|
"learning_rate": 1.9991949062975336e-05, |
|
"loss": 0.6669, |
|
"mean_token_accuracy": 0.7977916583418846, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.33960871170173496, |
|
"grad_norm": 0.6516172885894775, |
|
"learning_rate": 1.9989484922416503e-05, |
|
"loss": 0.6636, |
|
"mean_token_accuracy": 0.7989253982901573, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.34453057708871665, |
|
"grad_norm": 0.6252678036689758, |
|
"learning_rate": 1.9986692474561292e-05, |
|
"loss": 0.6549, |
|
"mean_token_accuracy": 0.8010424450039864, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.3494524424756983, |
|
"grad_norm": 0.39426907896995544, |
|
"learning_rate": 1.9983571811184297e-05, |
|
"loss": 0.6583, |
|
"mean_token_accuracy": 0.8001298069953918, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.35437430786268, |
|
"grad_norm": 0.4398311972618103, |
|
"learning_rate": 1.9980123034847025e-05, |
|
"loss": 0.6569, |
|
"mean_token_accuracy": 0.8002386093139648, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.3592961732496616, |
|
"grad_norm": 0.36181896924972534, |
|
"learning_rate": 1.9976346258894502e-05, |
|
"loss": 0.6572, |
|
"mean_token_accuracy": 0.7999640181660652, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.3642180386366433, |
|
"grad_norm": 0.33937492966651917, |
|
"learning_rate": 1.9972241607451552e-05, |
|
"loss": 0.6534, |
|
"mean_token_accuracy": 0.8008638471364975, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.36913990402362495, |
|
"grad_norm": 0.3220241665840149, |
|
"learning_rate": 1.996780921541873e-05, |
|
"loss": 0.6491, |
|
"mean_token_accuracy": 0.8024497851729393, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.3740617694106066, |
|
"grad_norm": 0.3588990867137909, |
|
"learning_rate": 1.9963049228467875e-05, |
|
"loss": 0.6519, |
|
"mean_token_accuracy": 0.8013440445065498, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.3789836347975883, |
|
"grad_norm": 0.3850741982460022, |
|
"learning_rate": 1.9957961803037325e-05, |
|
"loss": 0.6539, |
|
"mean_token_accuracy": 0.8007026329636574, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.3839055001845699, |
|
"grad_norm": 0.39418673515319824, |
|
"learning_rate": 1.9952547106326787e-05, |
|
"loss": 0.6511, |
|
"mean_token_accuracy": 0.8013290241360664, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.3888273655715516, |
|
"grad_norm": 0.33889254927635193, |
|
"learning_rate": 1.9946805316291817e-05, |
|
"loss": 0.6523, |
|
"mean_token_accuracy": 0.8005807921290398, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.39374923095853326, |
|
"grad_norm": 0.7381798624992371, |
|
"learning_rate": 1.9940736621638e-05, |
|
"loss": 0.649, |
|
"mean_token_accuracy": 0.8016207367181778, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.39867109634551495, |
|
"grad_norm": 0.3772973120212555, |
|
"learning_rate": 1.993434122181474e-05, |
|
"loss": 0.6458, |
|
"mean_token_accuracy": 0.802768674492836, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.4035929617324966, |
|
"grad_norm": 0.33333730697631836, |
|
"learning_rate": 1.992761932700868e-05, |
|
"loss": 0.6444, |
|
"mean_token_accuracy": 0.8025879472494125, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.4085148271194783, |
|
"grad_norm": 0.3165677785873413, |
|
"learning_rate": 1.9920571158136837e-05, |
|
"loss": 0.639, |
|
"mean_token_accuracy": 0.8042329683899879, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.4134366925064599, |
|
"grad_norm": 0.3313787579536438, |
|
"learning_rate": 1.9913196946839304e-05, |
|
"loss": 0.6422, |
|
"mean_token_accuracy": 0.803669148683548, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.4183585578934416, |
|
"grad_norm": 0.2832159101963043, |
|
"learning_rate": 1.990549693547166e-05, |
|
"loss": 0.6378, |
|
"mean_token_accuracy": 0.8049987867474556, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.42328042328042326, |
|
"grad_norm": 0.3278089463710785, |
|
"learning_rate": 1.9897471377096992e-05, |
|
"loss": 0.638, |
|
"mean_token_accuracy": 0.8043939173221588, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.42820228866740495, |
|
"grad_norm": 0.33513346314430237, |
|
"learning_rate": 1.9889120535477584e-05, |
|
"loss": 0.6366, |
|
"mean_token_accuracy": 0.80514996945858, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.4331241540543866, |
|
"grad_norm": 0.36697131395339966, |
|
"learning_rate": 1.9880444685066252e-05, |
|
"loss": 0.6322, |
|
"mean_token_accuracy": 0.8064638406038285, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.4380460194413683, |
|
"grad_norm": 0.34239935874938965, |
|
"learning_rate": 1.987144411099731e-05, |
|
"loss": 0.6328, |
|
"mean_token_accuracy": 0.8058159291744232, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.4429678848283499, |
|
"grad_norm": 0.29778754711151123, |
|
"learning_rate": 1.9862119109077226e-05, |
|
"loss": 0.6442, |
|
"mean_token_accuracy": 0.8030599504709244, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.4478897502153316, |
|
"grad_norm": 0.31139907240867615, |
|
"learning_rate": 1.985246998577486e-05, |
|
"loss": 0.6507, |
|
"mean_token_accuracy": 0.8007849171757698, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.45281161560231326, |
|
"grad_norm": 0.32070034742355347, |
|
"learning_rate": 1.984249705821143e-05, |
|
"loss": 0.6405, |
|
"mean_token_accuracy": 0.8038340613245964, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.45773348098929495, |
|
"grad_norm": 0.3086022734642029, |
|
"learning_rate": 1.9832200654150077e-05, |
|
"loss": 0.6316, |
|
"mean_token_accuracy": 0.8058078184723854, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.4626553463762766, |
|
"grad_norm": 0.30972251296043396, |
|
"learning_rate": 1.9821581111985072e-05, |
|
"loss": 0.6343, |
|
"mean_token_accuracy": 0.8051379904150963, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.4675772117632583, |
|
"grad_norm": 0.2832852005958557, |
|
"learning_rate": 1.981063878073073e-05, |
|
"loss": 0.6324, |
|
"mean_token_accuracy": 0.8058837354183197, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.4724990771502399, |
|
"grad_norm": 0.909318208694458, |
|
"learning_rate": 1.979937402000991e-05, |
|
"loss": 0.6319, |
|
"mean_token_accuracy": 0.8056973502039909, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.4774209425372216, |
|
"grad_norm": 0.31788304448127747, |
|
"learning_rate": 1.9787787200042224e-05, |
|
"loss": 0.6354, |
|
"mean_token_accuracy": 0.8051144614815712, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.48234280792420325, |
|
"grad_norm": 0.2922450602054596, |
|
"learning_rate": 1.977587870163184e-05, |
|
"loss": 0.6278, |
|
"mean_token_accuracy": 0.8066384568810463, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.48726467331118495, |
|
"grad_norm": 0.287406325340271, |
|
"learning_rate": 1.9763648916154982e-05, |
|
"loss": 0.6271, |
|
"mean_token_accuracy": 0.8069956362247467, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.4921865386981666, |
|
"grad_norm": 0.34040403366088867, |
|
"learning_rate": 1.975109824554707e-05, |
|
"loss": 0.6288, |
|
"mean_token_accuracy": 0.806525257229805, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.4971084040851483, |
|
"grad_norm": 0.3302447199821472, |
|
"learning_rate": 1.973822710228951e-05, |
|
"loss": 0.6257, |
|
"mean_token_accuracy": 0.8072399228811264, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.5020302694721299, |
|
"grad_norm": 0.288161963224411, |
|
"learning_rate": 1.972503590939612e-05, |
|
"loss": 0.6234, |
|
"mean_token_accuracy": 0.8078823387622833, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.5069521348591116, |
|
"grad_norm": 0.3387835919857025, |
|
"learning_rate": 1.971152510039926e-05, |
|
"loss": 0.6269, |
|
"mean_token_accuracy": 0.8067226454615593, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.5118740002460933, |
|
"grad_norm": 0.290519118309021, |
|
"learning_rate": 1.9697695119335547e-05, |
|
"loss": 0.6213, |
|
"mean_token_accuracy": 0.8083379164338111, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.5167958656330749, |
|
"grad_norm": 0.3701138496398926, |
|
"learning_rate": 1.9683546420731292e-05, |
|
"loss": 0.6246, |
|
"mean_token_accuracy": 0.8079604268074035, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.5217177310200566, |
|
"grad_norm": 0.39614954590797424, |
|
"learning_rate": 1.9669079469587548e-05, |
|
"loss": 0.6287, |
|
"mean_token_accuracy": 0.8067878499627114, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.5266395964070383, |
|
"grad_norm": 0.32784542441368103, |
|
"learning_rate": 1.965429474136482e-05, |
|
"loss": 0.6156, |
|
"mean_token_accuracy": 0.8098407059907913, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.53156146179402, |
|
"grad_norm": 0.30213144421577454, |
|
"learning_rate": 1.963919272196746e-05, |
|
"loss": 0.6207, |
|
"mean_token_accuracy": 0.8086924180388451, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.5364833271810016, |
|
"grad_norm": 0.32220178842544556, |
|
"learning_rate": 1.9623773907727682e-05, |
|
"loss": 0.6157, |
|
"mean_token_accuracy": 0.8098208606243134, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.5414051925679833, |
|
"grad_norm": 0.3250666856765747, |
|
"learning_rate": 1.9608038805389253e-05, |
|
"loss": 0.6195, |
|
"mean_token_accuracy": 0.8085113659501075, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.546327057954965, |
|
"grad_norm": 0.36724722385406494, |
|
"learning_rate": 1.9591987932090836e-05, |
|
"loss": 0.6115, |
|
"mean_token_accuracy": 0.8109661117196083, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.5512489233419466, |
|
"grad_norm": 0.30343472957611084, |
|
"learning_rate": 1.9575621815349e-05, |
|
"loss": 0.6204, |
|
"mean_token_accuracy": 0.8083494484424592, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.5561707887289282, |
|
"grad_norm": 0.3323419988155365, |
|
"learning_rate": 1.9558940993040885e-05, |
|
"loss": 0.6232, |
|
"mean_token_accuracy": 0.8077159106731415, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.5610926541159099, |
|
"grad_norm": 0.31035885214805603, |
|
"learning_rate": 1.954194601338651e-05, |
|
"loss": 0.6157, |
|
"mean_token_accuracy": 0.8096732005476952, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.5660145195028916, |
|
"grad_norm": 0.2931119501590729, |
|
"learning_rate": 1.952463743493078e-05, |
|
"loss": 0.6199, |
|
"mean_token_accuracy": 0.808499938249588, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.5709363848898733, |
|
"grad_norm": 0.27563023567199707, |
|
"learning_rate": 1.9507015826525096e-05, |
|
"loss": 0.6046, |
|
"mean_token_accuracy": 0.8128907606005669, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.5758582502768549, |
|
"grad_norm": 0.28453299403190613, |
|
"learning_rate": 1.9489081767308696e-05, |
|
"loss": 0.6105, |
|
"mean_token_accuracy": 0.8113355338573456, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.5807801156638366, |
|
"grad_norm": 0.37042465806007385, |
|
"learning_rate": 1.9470835846689596e-05, |
|
"loss": 0.6127, |
|
"mean_token_accuracy": 0.8106034889817237, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.5857019810508183, |
|
"grad_norm": 0.2963549792766571, |
|
"learning_rate": 1.9452278664325227e-05, |
|
"loss": 0.6194, |
|
"mean_token_accuracy": 0.8086869075894356, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.5906238464378, |
|
"grad_norm": 0.2905316948890686, |
|
"learning_rate": 1.9433410830102724e-05, |
|
"loss": 0.61, |
|
"mean_token_accuracy": 0.811042046546936, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.5955457118247816, |
|
"grad_norm": 0.2674277424812317, |
|
"learning_rate": 1.9414232964118893e-05, |
|
"loss": 0.6119, |
|
"mean_token_accuracy": 0.8104571312665939, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.6004675772117632, |
|
"grad_norm": 0.28245261311531067, |
|
"learning_rate": 1.939474569665981e-05, |
|
"loss": 0.6115, |
|
"mean_token_accuracy": 0.8106845885515213, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.6053894425987449, |
|
"grad_norm": 0.2713403105735779, |
|
"learning_rate": 1.937494966818014e-05, |
|
"loss": 0.6096, |
|
"mean_token_accuracy": 0.8106750875711441, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.6103113079857266, |
|
"grad_norm": 0.31770050525665283, |
|
"learning_rate": 1.9354845529282042e-05, |
|
"loss": 0.6142, |
|
"mean_token_accuracy": 0.8098479628562927, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.6152331733727082, |
|
"grad_norm": 0.28526055812835693, |
|
"learning_rate": 1.933443394069383e-05, |
|
"loss": 0.6062, |
|
"mean_token_accuracy": 0.8120482847094536, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.6201550387596899, |
|
"grad_norm": 0.5695453882217407, |
|
"learning_rate": 1.9313715573248238e-05, |
|
"loss": 0.6122, |
|
"mean_token_accuracy": 0.8099897101521492, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.6250769041466716, |
|
"grad_norm": 0.2738396227359772, |
|
"learning_rate": 1.9292691107860374e-05, |
|
"loss": 0.6031, |
|
"mean_token_accuracy": 0.8127053424715995, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.6299987695336533, |
|
"grad_norm": 0.28948965668678284, |
|
"learning_rate": 1.927136123550534e-05, |
|
"loss": 0.6115, |
|
"mean_token_accuracy": 0.8103477448225022, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.6349206349206349, |
|
"grad_norm": 0.27830740809440613, |
|
"learning_rate": 1.9249726657195534e-05, |
|
"loss": 0.608, |
|
"mean_token_accuracy": 0.8116561621427536, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.6398425003076166, |
|
"grad_norm": 0.2712289094924927, |
|
"learning_rate": 1.922778808395759e-05, |
|
"loss": 0.6054, |
|
"mean_token_accuracy": 0.8125208973884582, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.6447643656945983, |
|
"grad_norm": 0.29063907265663147, |
|
"learning_rate": 1.9205546236809037e-05, |
|
"loss": 0.6047, |
|
"mean_token_accuracy": 0.8123130992054939, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.64968623108158, |
|
"grad_norm": 0.293261855840683, |
|
"learning_rate": 1.9183001846734573e-05, |
|
"loss": 0.603, |
|
"mean_token_accuracy": 0.8129645109176635, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.6546080964685616, |
|
"grad_norm": 0.2849041223526001, |
|
"learning_rate": 1.9160155654662075e-05, |
|
"loss": 0.5926, |
|
"mean_token_accuracy": 0.8157610684633255, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.6595299618555432, |
|
"grad_norm": 0.2975578010082245, |
|
"learning_rate": 1.9137008411438213e-05, |
|
"loss": 0.6034, |
|
"mean_token_accuracy": 0.8125734269618988, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.6644518272425249, |
|
"grad_norm": 0.286842405796051, |
|
"learning_rate": 1.9113560877803798e-05, |
|
"loss": 0.6045, |
|
"mean_token_accuracy": 0.8125320598483086, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.6693736926295066, |
|
"grad_norm": 0.33480602502822876, |
|
"learning_rate": 1.9089813824368765e-05, |
|
"loss": 0.5975, |
|
"mean_token_accuracy": 0.8142675384879112, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.6742955580164882, |
|
"grad_norm": 0.29252228140830994, |
|
"learning_rate": 1.9065768031586864e-05, |
|
"loss": 0.6056, |
|
"mean_token_accuracy": 0.8120014935731887, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.6792174234034699, |
|
"grad_norm": 0.2882521450519562, |
|
"learning_rate": 1.9041424289729994e-05, |
|
"loss": 0.595, |
|
"mean_token_accuracy": 0.8150214269757271, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.6841392887904516, |
|
"grad_norm": 0.29731523990631104, |
|
"learning_rate": 1.901678339886223e-05, |
|
"loss": 0.6013, |
|
"mean_token_accuracy": 0.8131750777363778, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.6890611541774333, |
|
"grad_norm": 0.26834896206855774, |
|
"learning_rate": 1.8991846168813547e-05, |
|
"loss": 0.5918, |
|
"mean_token_accuracy": 0.8156168267130852, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.6939830195644149, |
|
"grad_norm": 0.29199543595314026, |
|
"learning_rate": 1.896661341915318e-05, |
|
"loss": 0.6033, |
|
"mean_token_accuracy": 0.8124941572546959, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.6989048849513966, |
|
"grad_norm": 0.28719085454940796, |
|
"learning_rate": 1.8941085979162714e-05, |
|
"loss": 0.5992, |
|
"mean_token_accuracy": 0.8138533607125282, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.7038267503383783, |
|
"grad_norm": 0.28042468428611755, |
|
"learning_rate": 1.891526468780881e-05, |
|
"loss": 0.605, |
|
"mean_token_accuracy": 0.8121193930506706, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.70874861572536, |
|
"grad_norm": 0.272483766078949, |
|
"learning_rate": 1.8889150393715627e-05, |
|
"loss": 0.5943, |
|
"mean_token_accuracy": 0.8147971466183662, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.7136704811123415, |
|
"grad_norm": 0.24886226654052734, |
|
"learning_rate": 1.8862743955136966e-05, |
|
"loss": 0.5957, |
|
"mean_token_accuracy": 0.8145680665969849, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.7185923464993232, |
|
"grad_norm": 0.26445212960243225, |
|
"learning_rate": 1.8836046239928025e-05, |
|
"loss": 0.5948, |
|
"mean_token_accuracy": 0.8148575246334075, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.7235142118863049, |
|
"grad_norm": 0.2891506850719452, |
|
"learning_rate": 1.8809058125516894e-05, |
|
"loss": 0.5968, |
|
"mean_token_accuracy": 0.8141703933477402, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.7284360772732866, |
|
"grad_norm": 0.28364264965057373, |
|
"learning_rate": 1.8781780498875727e-05, |
|
"loss": 0.6035, |
|
"mean_token_accuracy": 0.8124788105487823, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.7333579426602682, |
|
"grad_norm": 0.2917366921901703, |
|
"learning_rate": 1.8754214256491564e-05, |
|
"loss": 0.5928, |
|
"mean_token_accuracy": 0.8153851807117463, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.7382798080472499, |
|
"grad_norm": 0.2714190185070038, |
|
"learning_rate": 1.8726360304336896e-05, |
|
"loss": 0.601, |
|
"mean_token_accuracy": 0.8129221558570862, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.7432016734342316, |
|
"grad_norm": 0.29474568367004395, |
|
"learning_rate": 1.8698219557839875e-05, |
|
"loss": 0.5963, |
|
"mean_token_accuracy": 0.8142225205898285, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.7481235388212132, |
|
"grad_norm": 0.2684454619884491, |
|
"learning_rate": 1.866979294185423e-05, |
|
"loss": 0.5933, |
|
"mean_token_accuracy": 0.8149216592311859, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.7530454042081949, |
|
"grad_norm": 0.26693102717399597, |
|
"learning_rate": 1.864108139062888e-05, |
|
"loss": 0.5908, |
|
"mean_token_accuracy": 0.8157912597060204, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.7579672695951766, |
|
"grad_norm": 0.27418771386146545, |
|
"learning_rate": 1.8612085847777215e-05, |
|
"loss": 0.5913, |
|
"mean_token_accuracy": 0.8156127855181694, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.7628891349821583, |
|
"grad_norm": 0.30855274200439453, |
|
"learning_rate": 1.858280726624609e-05, |
|
"loss": 0.5922, |
|
"mean_token_accuracy": 0.81515374481678, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.7678110003691399, |
|
"grad_norm": 0.2978297472000122, |
|
"learning_rate": 1.855324660828452e-05, |
|
"loss": 0.5999, |
|
"mean_token_accuracy": 0.8132428601384163, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.7727328657561215, |
|
"grad_norm": 0.30609989166259766, |
|
"learning_rate": 1.8523404845412028e-05, |
|
"loss": 0.5931, |
|
"mean_token_accuracy": 0.8152095600962639, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.7776547311431032, |
|
"grad_norm": 0.28423747420310974, |
|
"learning_rate": 1.849328295838674e-05, |
|
"loss": 0.5939, |
|
"mean_token_accuracy": 0.8150446817278862, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.7825765965300849, |
|
"grad_norm": 0.39114367961883545, |
|
"learning_rate": 1.8462881937173144e-05, |
|
"loss": 0.5886, |
|
"mean_token_accuracy": 0.8164272159337997, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.7874984619170665, |
|
"grad_norm": 0.2761843502521515, |
|
"learning_rate": 1.8432202780909542e-05, |
|
"loss": 0.594, |
|
"mean_token_accuracy": 0.8146432772278785, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.7924203273040482, |
|
"grad_norm": 0.26402318477630615, |
|
"learning_rate": 1.8401246497875238e-05, |
|
"loss": 0.5892, |
|
"mean_token_accuracy": 0.8162309199571609, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.7973421926910299, |
|
"grad_norm": 0.26799553632736206, |
|
"learning_rate": 1.8370014105457378e-05, |
|
"loss": 0.5901, |
|
"mean_token_accuracy": 0.8156055212020874, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.8022640580780116, |
|
"grad_norm": 0.3189884126186371, |
|
"learning_rate": 1.8338506630117527e-05, |
|
"loss": 0.5821, |
|
"mean_token_accuracy": 0.8177683308720589, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.8071859234649932, |
|
"grad_norm": 0.26993831992149353, |
|
"learning_rate": 1.8306725107357933e-05, |
|
"loss": 0.5887, |
|
"mean_token_accuracy": 0.8162371620535851, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.8121077888519749, |
|
"grad_norm": 0.33908817172050476, |
|
"learning_rate": 1.827467058168748e-05, |
|
"loss": 0.5932, |
|
"mean_token_accuracy": 0.8148850262165069, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.8170296542389566, |
|
"grad_norm": 0.2749953866004944, |
|
"learning_rate": 1.824234410658738e-05, |
|
"loss": 0.5807, |
|
"mean_token_accuracy": 0.8185225054621696, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.8219515196259383, |
|
"grad_norm": 0.28679126501083374, |
|
"learning_rate": 1.8209746744476538e-05, |
|
"loss": 0.5844, |
|
"mean_token_accuracy": 0.81742594987154, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.8268733850129198, |
|
"grad_norm": 0.29817092418670654, |
|
"learning_rate": 1.817687956667664e-05, |
|
"loss": 0.584, |
|
"mean_token_accuracy": 0.8173492252826691, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.8317952503999015, |
|
"grad_norm": 0.2705828547477722, |
|
"learning_rate": 1.8143743653376944e-05, |
|
"loss": 0.5955, |
|
"mean_token_accuracy": 0.8145547702908515, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.8367171157868832, |
|
"grad_norm": 0.28381243348121643, |
|
"learning_rate": 1.811034009359877e-05, |
|
"loss": 0.5833, |
|
"mean_token_accuracy": 0.8177738025784492, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.8416389811738649, |
|
"grad_norm": 0.2846708595752716, |
|
"learning_rate": 1.8076669985159726e-05, |
|
"loss": 0.5817, |
|
"mean_token_accuracy": 0.8179952159523964, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.8465608465608465, |
|
"grad_norm": 0.2997231185436249, |
|
"learning_rate": 1.8042734434637615e-05, |
|
"loss": 0.5934, |
|
"mean_token_accuracy": 0.8149283960461616, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.8514827119478282, |
|
"grad_norm": 0.29204457998275757, |
|
"learning_rate": 1.8008534557334064e-05, |
|
"loss": 0.5795, |
|
"mean_token_accuracy": 0.8184737205505371, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.8564045773348099, |
|
"grad_norm": 0.30441614985466003, |
|
"learning_rate": 1.7974071477237887e-05, |
|
"loss": 0.585, |
|
"mean_token_accuracy": 0.8171376779675483, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.8613264427217916, |
|
"grad_norm": 0.2779221832752228, |
|
"learning_rate": 1.7939346326988127e-05, |
|
"loss": 0.5889, |
|
"mean_token_accuracy": 0.8160797134041786, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.8662483081087732, |
|
"grad_norm": 0.250242680311203, |
|
"learning_rate": 1.7904360247836838e-05, |
|
"loss": 0.5894, |
|
"mean_token_accuracy": 0.81572295576334, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.8711701734957549, |
|
"grad_norm": 0.26801884174346924, |
|
"learning_rate": 1.7869114389611574e-05, |
|
"loss": 0.5853, |
|
"mean_token_accuracy": 0.8168028473854065, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.8760920388827366, |
|
"grad_norm": 0.33699533343315125, |
|
"learning_rate": 1.7833609910677613e-05, |
|
"loss": 0.5804, |
|
"mean_token_accuracy": 0.8181165441870689, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.8810139042697183, |
|
"grad_norm": 0.28362491726875305, |
|
"learning_rate": 1.7797847977899873e-05, |
|
"loss": 0.5823, |
|
"mean_token_accuracy": 0.8177706867456436, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.8859357696566998, |
|
"grad_norm": 0.2863147556781769, |
|
"learning_rate": 1.7761829766604556e-05, |
|
"loss": 0.5797, |
|
"mean_token_accuracy": 0.8185298308730126, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.8908576350436815, |
|
"grad_norm": 0.27263742685317993, |
|
"learning_rate": 1.7725556460540553e-05, |
|
"loss": 0.5825, |
|
"mean_token_accuracy": 0.8175166144967079, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.8957795004306632, |
|
"grad_norm": 0.28120777010917664, |
|
"learning_rate": 1.7689029251840492e-05, |
|
"loss": 0.5788, |
|
"mean_token_accuracy": 0.8185988172888756, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.9007013658176449, |
|
"grad_norm": 0.3469211459159851, |
|
"learning_rate": 1.7652249340981608e-05, |
|
"loss": 0.5877, |
|
"mean_token_accuracy": 0.8159551978111267, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.9056232312046265, |
|
"grad_norm": 0.3101508617401123, |
|
"learning_rate": 1.7615217936746246e-05, |
|
"loss": 0.5819, |
|
"mean_token_accuracy": 0.8174650520086288, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.9105450965916082, |
|
"grad_norm": 0.38838618993759155, |
|
"learning_rate": 1.757793625618217e-05, |
|
"loss": 0.5755, |
|
"mean_token_accuracy": 0.8196040257811547, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.9154669619785899, |
|
"grad_norm": 0.3253493309020996, |
|
"learning_rate": 1.7540405524562533e-05, |
|
"loss": 0.5777, |
|
"mean_token_accuracy": 0.8182825416326522, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.9203888273655716, |
|
"grad_norm": 0.2917826175689697, |
|
"learning_rate": 1.750262697534563e-05, |
|
"loss": 0.5809, |
|
"mean_token_accuracy": 0.8180661648511887, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.9253106927525532, |
|
"grad_norm": 0.25714483857154846, |
|
"learning_rate": 1.7464601850134353e-05, |
|
"loss": 0.5752, |
|
"mean_token_accuracy": 0.8194984391331672, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.9302325581395349, |
|
"grad_norm": 0.28597357869148254, |
|
"learning_rate": 1.742633139863538e-05, |
|
"loss": 0.579, |
|
"mean_token_accuracy": 0.8184013769030571, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.9351544235265166, |
|
"grad_norm": 0.9777734875679016, |
|
"learning_rate": 1.738781687861812e-05, |
|
"loss": 0.5789, |
|
"mean_token_accuracy": 0.8188063263893127, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.9400762889134983, |
|
"grad_norm": 0.26717498898506165, |
|
"learning_rate": 1.7349059555873348e-05, |
|
"loss": 0.5754, |
|
"mean_token_accuracy": 0.8191799059510231, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.9449981543004798, |
|
"grad_norm": 0.29053807258605957, |
|
"learning_rate": 1.731006070417163e-05, |
|
"loss": 0.5726, |
|
"mean_token_accuracy": 0.8204409092664718, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.9499200196874615, |
|
"grad_norm": 0.3052172362804413, |
|
"learning_rate": 1.7270821605221448e-05, |
|
"loss": 0.5764, |
|
"mean_token_accuracy": 0.819102555513382, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.9548418850744432, |
|
"grad_norm": 0.33640167117118835, |
|
"learning_rate": 1.7231343548627085e-05, |
|
"loss": 0.5789, |
|
"mean_token_accuracy": 0.8184890508651733, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.9597637504614249, |
|
"grad_norm": 0.2829669415950775, |
|
"learning_rate": 1.7191627831846226e-05, |
|
"loss": 0.5803, |
|
"mean_token_accuracy": 0.8179109930992127, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.9646856158484065, |
|
"grad_norm": 0.2560986280441284, |
|
"learning_rate": 1.7151675760147325e-05, |
|
"loss": 0.5721, |
|
"mean_token_accuracy": 0.8198479250073433, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.9696074812353882, |
|
"grad_norm": 0.27663761377334595, |
|
"learning_rate": 1.7111488646566728e-05, |
|
"loss": 0.5851, |
|
"mean_token_accuracy": 0.8171452388167382, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.9745293466223699, |
|
"grad_norm": 0.2673356235027313, |
|
"learning_rate": 1.7071067811865477e-05, |
|
"loss": 0.5751, |
|
"mean_token_accuracy": 0.8194502517580986, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.9794512120093516, |
|
"grad_norm": 0.2639131546020508, |
|
"learning_rate": 1.7030414584485938e-05, |
|
"loss": 0.5757, |
|
"mean_token_accuracy": 0.8192202031612397, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.9843730773963332, |
|
"grad_norm": 0.2639618515968323, |
|
"learning_rate": 1.6989530300508126e-05, |
|
"loss": 0.576, |
|
"mean_token_accuracy": 0.8191347226500512, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.9892949427833149, |
|
"grad_norm": 0.2554817199707031, |
|
"learning_rate": 1.6948416303605796e-05, |
|
"loss": 0.5778, |
|
"mean_token_accuracy": 0.8186899140477181, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.9942168081702966, |
|
"grad_norm": 0.25301820039749146, |
|
"learning_rate": 1.690707394500229e-05, |
|
"loss": 0.576, |
|
"mean_token_accuracy": 0.8191317170858383, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.9991386735572783, |
|
"grad_norm": 0.2470293790102005, |
|
"learning_rate": 1.6865504583426117e-05, |
|
"loss": 0.5707, |
|
"mean_token_accuracy": 0.8204790607094765, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 1.0049218653869816, |
|
"grad_norm": 0.3501671254634857, |
|
"learning_rate": 1.6823709585066308e-05, |
|
"loss": 0.6648, |
|
"mean_token_accuracy": 0.824617318990754, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.0098437307739634, |
|
"grad_norm": 0.30985623598098755, |
|
"learning_rate": 1.6781690323527512e-05, |
|
"loss": 0.5503, |
|
"mean_token_accuracy": 0.8255873426795006, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 1.014765596160945, |
|
"grad_norm": 0.2879364788532257, |
|
"learning_rate": 1.6739448179784846e-05, |
|
"loss": 0.5529, |
|
"mean_token_accuracy": 0.8247572600841522, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 1.0196874615479268, |
|
"grad_norm": 0.27657514810562134, |
|
"learning_rate": 1.669698454213852e-05, |
|
"loss": 0.55, |
|
"mean_token_accuracy": 0.8258542969822884, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 1.0246093269349084, |
|
"grad_norm": 0.259316623210907, |
|
"learning_rate": 1.665430080616821e-05, |
|
"loss": 0.5435, |
|
"mean_token_accuracy": 0.8273309215903282, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.02953119232189, |
|
"grad_norm": 0.27227073907852173, |
|
"learning_rate": 1.6611398374687172e-05, |
|
"loss": 0.5494, |
|
"mean_token_accuracy": 0.8259153485298156, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 1.0344530577088717, |
|
"grad_norm": 0.2718289792537689, |
|
"learning_rate": 1.6568278657696166e-05, |
|
"loss": 0.5445, |
|
"mean_token_accuracy": 0.827112241089344, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.0393749230958533, |
|
"grad_norm": 0.28744345903396606, |
|
"learning_rate": 1.6524943072337094e-05, |
|
"loss": 0.5501, |
|
"mean_token_accuracy": 0.8256638810038567, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 1.044296788482835, |
|
"grad_norm": 0.26266416907310486, |
|
"learning_rate": 1.6481393042846442e-05, |
|
"loss": 0.5467, |
|
"mean_token_accuracy": 0.8264568135142326, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 1.0492186538698167, |
|
"grad_norm": 0.25888925790786743, |
|
"learning_rate": 1.6437630000508466e-05, |
|
"loss": 0.5522, |
|
"mean_token_accuracy": 0.8247309610247612, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 1.0541405192567983, |
|
"grad_norm": 0.25061705708503723, |
|
"learning_rate": 1.6393655383608132e-05, |
|
"loss": 0.5459, |
|
"mean_token_accuracy": 0.8267670929431915, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 1.0590623846437799, |
|
"grad_norm": 0.25011131167411804, |
|
"learning_rate": 1.634947063738389e-05, |
|
"loss": 0.5483, |
|
"mean_token_accuracy": 0.8261876925826073, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 1.0639842500307617, |
|
"grad_norm": 0.26051655411720276, |
|
"learning_rate": 1.630507721398013e-05, |
|
"loss": 0.5452, |
|
"mean_token_accuracy": 0.82709851115942, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 1.0689061154177433, |
|
"grad_norm": 0.2643815279006958, |
|
"learning_rate": 1.6260476572399494e-05, |
|
"loss": 0.5497, |
|
"mean_token_accuracy": 0.825461483001709, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 1.073827980804725, |
|
"grad_norm": 0.3040525019168854, |
|
"learning_rate": 1.6215670178454893e-05, |
|
"loss": 0.5478, |
|
"mean_token_accuracy": 0.8264098614454269, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 1.0787498461917067, |
|
"grad_norm": 0.28461357951164246, |
|
"learning_rate": 1.6170659504721365e-05, |
|
"loss": 0.5474, |
|
"mean_token_accuracy": 0.8261038646101951, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 1.0836717115786882, |
|
"grad_norm": 0.24723611772060394, |
|
"learning_rate": 1.6125446030487642e-05, |
|
"loss": 0.542, |
|
"mean_token_accuracy": 0.8277976959943771, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.08859357696567, |
|
"grad_norm": 0.4478602707386017, |
|
"learning_rate": 1.608003124170758e-05, |
|
"loss": 0.5435, |
|
"mean_token_accuracy": 0.8271990329027176, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 1.0935154423526516, |
|
"grad_norm": 0.2758786082267761, |
|
"learning_rate": 1.6034416630951265e-05, |
|
"loss": 0.5546, |
|
"mean_token_accuracy": 0.8245001256465911, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 1.0984373077396332, |
|
"grad_norm": 0.8616223335266113, |
|
"learning_rate": 1.598860369735601e-05, |
|
"loss": 0.5419, |
|
"mean_token_accuracy": 0.827488873898983, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 1.103359173126615, |
|
"grad_norm": 0.24690531194210052, |
|
"learning_rate": 1.594259394657707e-05, |
|
"loss": 0.5493, |
|
"mean_token_accuracy": 0.8259517803788186, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 1.1082810385135966, |
|
"grad_norm": 0.24601490795612335, |
|
"learning_rate": 1.589638889073813e-05, |
|
"loss": 0.5563, |
|
"mean_token_accuracy": 0.8240275859832764, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 1.1132029039005784, |
|
"grad_norm": 0.32801708579063416, |
|
"learning_rate": 1.584999004838165e-05, |
|
"loss": 0.5474, |
|
"mean_token_accuracy": 0.8265691444277763, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 1.11812476928756, |
|
"grad_norm": 0.25093355774879456, |
|
"learning_rate": 1.5803398944418934e-05, |
|
"loss": 0.5426, |
|
"mean_token_accuracy": 0.8273544386029243, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 1.1230466346745416, |
|
"grad_norm": 0.2600312829017639, |
|
"learning_rate": 1.5756617110080023e-05, |
|
"loss": 0.5522, |
|
"mean_token_accuracy": 0.8249027922749519, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 1.1279685000615234, |
|
"grad_norm": 0.26066142320632935, |
|
"learning_rate": 1.570964608286336e-05, |
|
"loss": 0.5442, |
|
"mean_token_accuracy": 0.8270187392830849, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 1.132890365448505, |
|
"grad_norm": 0.27738282084465027, |
|
"learning_rate": 1.5662487406485273e-05, |
|
"loss": 0.5361, |
|
"mean_token_accuracy": 0.8295004799962044, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.1378122308354865, |
|
"grad_norm": 0.3502300977706909, |
|
"learning_rate": 1.561514263082923e-05, |
|
"loss": 0.5482, |
|
"mean_token_accuracy": 0.8256632193922997, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 1.1427340962224684, |
|
"grad_norm": 0.5840310454368591, |
|
"learning_rate": 1.5567613311894908e-05, |
|
"loss": 0.5337, |
|
"mean_token_accuracy": 0.8303180441260338, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 1.14765596160945, |
|
"grad_norm": 0.2714439034461975, |
|
"learning_rate": 1.5519901011747046e-05, |
|
"loss": 0.5479, |
|
"mean_token_accuracy": 0.8258592769503593, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 1.1525778269964317, |
|
"grad_norm": 0.2692211866378784, |
|
"learning_rate": 1.5472007298464117e-05, |
|
"loss": 0.5439, |
|
"mean_token_accuracy": 0.8271799921989441, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 1.1574996923834133, |
|
"grad_norm": 0.2637535631656647, |
|
"learning_rate": 1.5423933746086793e-05, |
|
"loss": 0.5382, |
|
"mean_token_accuracy": 0.8288466781377792, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 1.162421557770395, |
|
"grad_norm": 0.25311315059661865, |
|
"learning_rate": 1.5375681934566203e-05, |
|
"loss": 0.5399, |
|
"mean_token_accuracy": 0.8281501397490502, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 1.1673434231573767, |
|
"grad_norm": 0.25321346521377563, |
|
"learning_rate": 1.532725344971202e-05, |
|
"loss": 0.5482, |
|
"mean_token_accuracy": 0.8261646762490272, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 1.1722652885443583, |
|
"grad_norm": 0.25499051809310913, |
|
"learning_rate": 1.527864988314033e-05, |
|
"loss": 0.5425, |
|
"mean_token_accuracy": 0.8275581628084183, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 1.17718715393134, |
|
"grad_norm": 0.2546637952327728, |
|
"learning_rate": 1.5229872832221336e-05, |
|
"loss": 0.5397, |
|
"mean_token_accuracy": 0.8283757612109184, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 1.1821090193183217, |
|
"grad_norm": 0.2738707363605499, |
|
"learning_rate": 1.5180923900026847e-05, |
|
"loss": 0.5386, |
|
"mean_token_accuracy": 0.8282813474535942, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.1870308847053033, |
|
"grad_norm": 0.2539266347885132, |
|
"learning_rate": 1.5131804695277612e-05, |
|
"loss": 0.5462, |
|
"mean_token_accuracy": 0.826425202190876, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 1.1919527500922849, |
|
"grad_norm": 0.2745126187801361, |
|
"learning_rate": 1.5082516832290424e-05, |
|
"loss": 0.5404, |
|
"mean_token_accuracy": 0.8284027636051178, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 1.1968746154792667, |
|
"grad_norm": 0.2544495165348053, |
|
"learning_rate": 1.5033061930925081e-05, |
|
"loss": 0.532, |
|
"mean_token_accuracy": 0.8300672218203544, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 1.2017964808662482, |
|
"grad_norm": 0.27299556136131287, |
|
"learning_rate": 1.4983441616531152e-05, |
|
"loss": 0.5396, |
|
"mean_token_accuracy": 0.8280036672949791, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 1.20671834625323, |
|
"grad_norm": 0.28981074690818787, |
|
"learning_rate": 1.4933657519894542e-05, |
|
"loss": 0.5524, |
|
"mean_token_accuracy": 0.8247063636779786, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 1.2116402116402116, |
|
"grad_norm": 0.30510908365249634, |
|
"learning_rate": 1.4883711277183917e-05, |
|
"loss": 0.5379, |
|
"mean_token_accuracy": 0.8288484767079354, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 1.2165620770271932, |
|
"grad_norm": 0.2616790533065796, |
|
"learning_rate": 1.483360452989691e-05, |
|
"loss": 0.5415, |
|
"mean_token_accuracy": 0.8275775909423828, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 1.221483942414175, |
|
"grad_norm": 0.2551945745944977, |
|
"learning_rate": 1.4783338924806191e-05, |
|
"loss": 0.5347, |
|
"mean_token_accuracy": 0.8295770674943924, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 1.2264058078011566, |
|
"grad_norm": 0.28227224946022034, |
|
"learning_rate": 1.4732916113905336e-05, |
|
"loss": 0.5425, |
|
"mean_token_accuracy": 0.8273839592933655, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 1.2313276731881384, |
|
"grad_norm": 0.260978102684021, |
|
"learning_rate": 1.4682337754354534e-05, |
|
"loss": 0.5431, |
|
"mean_token_accuracy": 0.8270445480942726, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.23624953857512, |
|
"grad_norm": 0.279462605714798, |
|
"learning_rate": 1.4631605508426124e-05, |
|
"loss": 0.5379, |
|
"mean_token_accuracy": 0.828822860121727, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 1.2411714039621016, |
|
"grad_norm": 0.2665978670120239, |
|
"learning_rate": 1.4580721043449968e-05, |
|
"loss": 0.5403, |
|
"mean_token_accuracy": 0.8279185205698013, |
|
"step": 1260 |
|
}, |
|
{
"epoch": 1.2460932693490834,
"grad_norm": 0.24216796457767487,
"learning_rate": 1.4529686031758642e-05,
"loss": 0.5409,
"mean_token_accuracy": 0.8280630350112915,
"step": 1265
},
{
"epoch": 1.251015134736065,
"grad_norm": 0.2504848837852478,
"learning_rate": 1.4478502150632503e-05,
"loss": 0.5389,
"mean_token_accuracy": 0.8282234400510788,
"step": 1270
},
{
"epoch": 1.2559370001230468,
"grad_norm": 0.25835323333740234,
"learning_rate": 1.4427171082244523e-05,
"loss": 0.5471,
"mean_token_accuracy": 0.8258385419845581,
"step": 1275
},
{
"epoch": 1.2608588655100283,
"grad_norm": 0.26074373722076416,
"learning_rate": 1.4375694513605037e-05,
"loss": 0.5413,
"mean_token_accuracy": 0.8273946106433868,
"step": 1280
},
{
"epoch": 1.26578073089701,
"grad_norm": 0.2714027762413025,
"learning_rate": 1.4324074136506283e-05,
"loss": 0.5399,
"mean_token_accuracy": 0.8278847292065621,
"step": 1285
},
{
"epoch": 1.2707025962839915,
"grad_norm": 0.24950872361660004,
"learning_rate": 1.427231164746681e-05,
"loss": 0.5429,
"mean_token_accuracy": 0.827368488907814,
"step": 1290
},
{
"epoch": 1.2756244616709733,
"grad_norm": 0.2415134608745575,
"learning_rate": 1.4220408747675714e-05,
"loss": 0.5417,
"mean_token_accuracy": 0.8275652229785919,
"step": 1295
},
{
"epoch": 1.280546327057955,
"grad_norm": 0.23719871044158936,
"learning_rate": 1.4168367142936736e-05,
"loss": 0.5442,
"mean_token_accuracy": 0.8268394738435745,
"step": 1300
},
{
"epoch": 1.2854681924449367,
"grad_norm": 0.2537670135498047,
"learning_rate": 1.4116188543612182e-05,
"loss": 0.5329,
"mean_token_accuracy": 0.8299818679690361,
"step": 1305
},
{
"epoch": 1.2903900578319183,
"grad_norm": 0.2709537446498871,
"learning_rate": 1.4063874664566734e-05,
"loss": 0.5419,
"mean_token_accuracy": 0.8275921046733856,
"step": 1310
},
{
"epoch": 1.2953119232188999,
"grad_norm": 0.26924365758895874,
"learning_rate": 1.4011427225111091e-05,
"loss": 0.5321,
"mean_token_accuracy": 0.8305203005671501,
"step": 1315
},
{
"epoch": 1.3002337886058817,
"grad_norm": 0.2832610607147217,
"learning_rate": 1.3958847948945428e-05,
"loss": 0.5391,
"mean_token_accuracy": 0.8282249644398689,
"step": 1320
},
{
"epoch": 1.3051556539928633,
"grad_norm": 0.2596539258956909,
"learning_rate": 1.3906138564102794e-05,
"loss": 0.5356,
"mean_token_accuracy": 0.829230573773384,
"step": 1325
},
{
"epoch": 1.310077519379845,
"grad_norm": 0.2699119448661804,
"learning_rate": 1.3853300802892285e-05,
"loss": 0.5417,
"mean_token_accuracy": 0.8279038980603218,
"step": 1330
},
{
"epoch": 1.3149993847668267,
"grad_norm": 0.2658538520336151,
"learning_rate": 1.380033640184213e-05,
"loss": 0.5462,
"mean_token_accuracy": 0.8260830625891685,
"step": 1335
},
{
"epoch": 1.3199212501538082,
"grad_norm": 0.25977060198783875,
"learning_rate": 1.3747247101642605e-05,
"loss": 0.5347,
"mean_token_accuracy": 0.8293716937303544,
"step": 1340
},
{
"epoch": 1.32484311554079,
"grad_norm": 0.24537616968154907,
"learning_rate": 1.369403464708884e-05,
"loss": 0.5367,
"mean_token_accuracy": 0.8292932540178299,
"step": 1345
},
{
"epoch": 1.3297649809277716,
"grad_norm": 0.2559899091720581,
"learning_rate": 1.3640700787023465e-05,
"loss": 0.5398,
"mean_token_accuracy": 0.8283236369490623,
"step": 1350
},
{
"epoch": 1.3346868463147534,
"grad_norm": 0.274198979139328,
"learning_rate": 1.358724727427914e-05,
"loss": 0.5376,
"mean_token_accuracy": 0.8286082163453102,
"step": 1355
},
{
"epoch": 1.339608711701735,
"grad_norm": 0.22712701559066772,
"learning_rate": 1.3533675865620937e-05,
"loss": 0.5336,
"mean_token_accuracy": 0.8294816762208939,
"step": 1360
},
{
"epoch": 1.3445305770887166,
"grad_norm": 0.24095574021339417,
"learning_rate": 1.3479988321688619e-05,
"loss": 0.536,
"mean_token_accuracy": 0.829172083735466,
"step": 1365
},
{
"epoch": 1.3494524424756982,
"grad_norm": 0.2448059618473053,
"learning_rate": 1.3426186406938769e-05,
"loss": 0.5337,
"mean_token_accuracy": 0.8295143947005272,
"step": 1370
},
{
"epoch": 1.35437430786268,
"grad_norm": 0.2575864791870117,
"learning_rate": 1.337227188958679e-05,
"loss": 0.5456,
"mean_token_accuracy": 0.8261685460805893,
"step": 1375
},
{
"epoch": 1.3592961732496616,
"grad_norm": 0.25145259499549866,
"learning_rate": 1.3318246541548812e-05,
"loss": 0.5319,
"mean_token_accuracy": 0.8304190933704376,
"step": 1380
},
{
"epoch": 1.3642180386366434,
"grad_norm": 0.2565249502658844,
"learning_rate": 1.3264112138383445e-05,
"loss": 0.5358,
"mean_token_accuracy": 0.8293601229786873,
"step": 1385
},
{
"epoch": 1.369139904023625,
"grad_norm": 0.8961818814277649,
"learning_rate": 1.3209870459233422e-05,
"loss": 0.528,
"mean_token_accuracy": 0.8313272252678872,
"step": 1390
},
{
"epoch": 1.3740617694106065,
"grad_norm": 0.26537856459617615,
"learning_rate": 1.315552328676714e-05,
"loss": 0.531,
"mean_token_accuracy": 0.8308784514665604,
"step": 1395
},
{
"epoch": 1.3789836347975883,
"grad_norm": 0.28985780477523804,
"learning_rate": 1.3101072407120056e-05,
"loss": 0.5406,
"mean_token_accuracy": 0.8277209624648094,
"step": 1400
},
{
"epoch": 1.38390550018457,
"grad_norm": 0.2510998249053955,
"learning_rate": 1.3046519609836002e-05,
"loss": 0.5406,
"mean_token_accuracy": 0.827545890212059,
"step": 1405
},
{
"epoch": 1.3888273655715517,
"grad_norm": 0.2563679814338684,
"learning_rate": 1.2991866687808355e-05,
"loss": 0.5394,
"mean_token_accuracy": 0.8279638543725014,
"step": 1410
},
{
"epoch": 1.3937492309585333,
"grad_norm": 0.2674863338470459,
"learning_rate": 1.2937115437221119e-05,
"loss": 0.547,
"mean_token_accuracy": 0.8261717170476913,
"step": 1415
},
{
"epoch": 1.398671096345515,
"grad_norm": 0.24103465676307678,
"learning_rate": 1.2882267657489908e-05,
"loss": 0.5428,
"mean_token_accuracy": 0.8272509336471557,
"step": 1420
},
{
"epoch": 1.4035929617324965,
"grad_norm": 0.22528545558452606,
"learning_rate": 1.2827325151202783e-05,
"loss": 0.5368,
"mean_token_accuracy": 0.8288370996713639,
"step": 1425
},
{
"epoch": 1.4085148271194783,
"grad_norm": 0.23950906097888947,
"learning_rate": 1.2772289724061015e-05,
"loss": 0.5309,
"mean_token_accuracy": 0.8302434518933296,
"step": 1430
},
{
"epoch": 1.4134366925064599,
"grad_norm": 0.22913850843906403,
"learning_rate": 1.2717163184819761e-05,
"loss": 0.5397,
"mean_token_accuracy": 0.8278713747859001,
"step": 1435
},
{
"epoch": 1.4183585578934417,
"grad_norm": 0.22565315663814545,
"learning_rate": 1.2661947345228593e-05,
"loss": 0.546,
"mean_token_accuracy": 0.826079449057579,
"step": 1440
},
{
"epoch": 1.4232804232804233,
"grad_norm": 0.2397647351026535,
"learning_rate": 1.2606644019971967e-05,
"loss": 0.5396,
"mean_token_accuracy": 0.8280595645308495,
"step": 1445
},
{
"epoch": 1.4282022886674048,
"grad_norm": 0.23136766254901886,
"learning_rate": 1.255125502660958e-05,
"loss": 0.5288,
"mean_token_accuracy": 0.8313645005226136,
"step": 1450
},
{
"epoch": 1.4331241540543866,
"grad_norm": 0.2330116331577301,
"learning_rate": 1.2495782185516638e-05,
"loss": 0.5364,
"mean_token_accuracy": 0.828608725965023,
"step": 1455
},
{
"epoch": 1.4380460194413682,
"grad_norm": 0.23435364663600922,
"learning_rate": 1.2440227319824024e-05,
"loss": 0.5323,
"mean_token_accuracy": 0.8299019247293472,
"step": 1460
},
{
"epoch": 1.44296788482835,
"grad_norm": 0.2517502009868622,
"learning_rate": 1.2384592255358385e-05,
"loss": 0.537,
"mean_token_accuracy": 0.8284672737121582,
"step": 1465
},
{
"epoch": 1.4478897502153316,
"grad_norm": 0.2454364001750946,
"learning_rate": 1.2328878820582122e-05,
"loss": 0.5282,
"mean_token_accuracy": 0.8314993128180503,
"step": 1470
},
{
"epoch": 1.4528116156023132,
"grad_norm": 0.2604913115501404,
"learning_rate": 1.2273088846533303e-05,
"loss": 0.5404,
"mean_token_accuracy": 0.8278495371341705,
"step": 1475
},
{
"epoch": 1.457733480989295,
"grad_norm": 0.277908593416214,
"learning_rate": 1.2217224166765478e-05,
"loss": 0.5285,
"mean_token_accuracy": 0.8310411602258683,
"step": 1480
},
{
"epoch": 1.4626553463762766,
"grad_norm": 0.23699437081813812,
"learning_rate": 1.216128661728742e-05,
"loss": 0.5359,
"mean_token_accuracy": 0.8288247928023338,
"step": 1485
},
{
"epoch": 1.4675772117632584,
"grad_norm": 0.2528901994228363,
"learning_rate": 1.2105278036502787e-05,
"loss": 0.543,
"mean_token_accuracy": 0.8267820864915848,
"step": 1490
},
{
"epoch": 1.47249907715024,
"grad_norm": 0.25504714250564575,
"learning_rate": 1.204920026514971e-05,
"loss": 0.5391,
"mean_token_accuracy": 0.8281295597553253,
"step": 1495
},
{
"epoch": 1.4774209425372216,
"grad_norm": 0.26783859729766846,
"learning_rate": 1.1993055146240273e-05,
"loss": 0.5325,
"mean_token_accuracy": 0.8299062862992287,
"step": 1500
},
{
"epoch": 1.4823428079242031,
"grad_norm": 0.25482243299484253,
"learning_rate": 1.1936844524999966e-05,
"loss": 0.5271,
"mean_token_accuracy": 0.8315476939082146,
"step": 1505
},
{
"epoch": 1.487264673311185,
"grad_norm": 0.2603563964366913,
"learning_rate": 1.1880570248807033e-05,
"loss": 0.5299,
"mean_token_accuracy": 0.8303808271884918,
"step": 1510
},
{
"epoch": 1.4921865386981665,
"grad_norm": 0.2345011830329895,
"learning_rate": 1.1824234167131748e-05,
"loss": 0.5274,
"mean_token_accuracy": 0.8310874328017235,
"step": 1515
},
{
"epoch": 1.4971084040851483,
"grad_norm": 0.3448658883571625,
"learning_rate": 1.1767838131475654e-05,
"loss": 0.5318,
"mean_token_accuracy": 0.8301808550953865,
"step": 1520
},
{
"epoch": 1.50203026947213,
"grad_norm": 0.26358914375305176,
"learning_rate": 1.171138399531068e-05,
"loss": 0.5341,
"mean_token_accuracy": 0.8296466439962387,
"step": 1525
},
{
"epoch": 1.5069521348591115,
"grad_norm": 0.23463788628578186,
"learning_rate": 1.1654873614018266e-05,
"loss": 0.5337,
"mean_token_accuracy": 0.8297147572040557,
"step": 1530
},
{
"epoch": 1.5118740002460933,
"grad_norm": 0.37559443712234497,
"learning_rate": 1.1598308844828348e-05,
"loss": 0.5281,
"mean_token_accuracy": 0.8311620846390724,
"step": 1535
},
{
"epoch": 1.516795865633075,
"grad_norm": 0.24298147857189178,
"learning_rate": 1.1541691546758343e-05,
"loss": 0.5353,
"mean_token_accuracy": 0.8288328930735588,
"step": 1540
},
{
"epoch": 1.5217177310200567,
"grad_norm": 0.2316361665725708,
"learning_rate": 1.1485023580552039e-05,
"loss": 0.5217,
"mean_token_accuracy": 0.8330785930156708,
"step": 1545
},
{
"epoch": 1.5266395964070383,
"grad_norm": 0.22819174826145172,
"learning_rate": 1.1428306808618456e-05,
"loss": 0.53,
"mean_token_accuracy": 0.8303656697273254,
"step": 1550
},
{
"epoch": 1.5315614617940199,
"grad_norm": 0.22326573729515076,
"learning_rate": 1.1371543094970624e-05,
"loss": 0.53,
"mean_token_accuracy": 0.8304451867938042,
"step": 1555
},
{
"epoch": 1.5364833271810014,
"grad_norm": 0.23267020285129547,
"learning_rate": 1.131473430516432e-05,
"loss": 0.5284,
"mean_token_accuracy": 0.8309284761548043,
"step": 1560
},
{
"epoch": 1.5414051925679833,
"grad_norm": 0.3377299904823303,
"learning_rate": 1.1257882306236776e-05,
"loss": 0.5336,
"mean_token_accuracy": 0.8295429393649101,
"step": 1565
},
{
"epoch": 1.546327057954965,
"grad_norm": 0.24768434464931488,
"learning_rate": 1.1200988966645286e-05,
"loss": 0.5326,
"mean_token_accuracy": 0.8297705203294754,
"step": 1570
},
{
"epoch": 1.5512489233419466,
"grad_norm": 0.22998486459255219,
"learning_rate": 1.1144056156205834e-05,
"loss": 0.5298,
"mean_token_accuracy": 0.8307420760393143,
"step": 1575
},
{
"epoch": 1.5561707887289282,
"grad_norm": 0.22251376509666443,
"learning_rate": 1.1087085746031612e-05,
"loss": 0.528,
"mean_token_accuracy": 0.8313020512461662,
"step": 1580
},
{
"epoch": 1.5610926541159098,
"grad_norm": 0.2297334372997284,
"learning_rate": 1.1030079608471544e-05,
"loss": 0.5335,
"mean_token_accuracy": 0.8294809475541115,
"step": 1585
},
{
"epoch": 1.5660145195028916,
"grad_norm": 0.23138615489006042,
"learning_rate": 1.0973039617048748e-05,
"loss": 0.5333,
"mean_token_accuracy": 0.829520358145237,
"step": 1590
},
{
"epoch": 1.5709363848898734,
"grad_norm": 0.23547935485839844,
"learning_rate": 1.091596764639895e-05,
"loss": 0.5267,
"mean_token_accuracy": 0.8314588502049446,
"step": 1595
},
{
"epoch": 1.575858250276855,
"grad_norm": 0.2409500926733017,
"learning_rate": 1.0858865572208892e-05,
"loss": 0.5346,
"mean_token_accuracy": 0.8291632473468781,
"step": 1600
},
{
"epoch": 1.5807801156638366,
"grad_norm": 0.2276252955198288,
"learning_rate": 1.080173527115467e-05,
"loss": 0.5273,
"mean_token_accuracy": 0.831089685857296,
"step": 1605
},
{
"epoch": 1.5857019810508182,
"grad_norm": 0.2589430809020996,
"learning_rate": 1.0744578620840065e-05,
"loss": 0.5388,
"mean_token_accuracy": 0.8279580160975456,
"step": 1610
},
{
"epoch": 1.5906238464378,
"grad_norm": 0.2499450445175171,
"learning_rate": 1.0687397499734842e-05,
"loss": 0.5268,
"mean_token_accuracy": 0.8311406090855599,
"step": 1615
},
{
"epoch": 1.5955457118247816,
"grad_norm": 0.2377663552761078,
"learning_rate": 1.0630193787112994e-05,
"loss": 0.5257,
"mean_token_accuracy": 0.8319837361574173,
"step": 1620
},
{
"epoch": 1.6004675772117634,
"grad_norm": 0.24260112643241882,
"learning_rate": 1.0572969362991e-05,
"loss": 0.5316,
"mean_token_accuracy": 0.8302173331379891,
"step": 1625
},
{
"epoch": 1.605389442598745,
"grad_norm": 1.525187611579895,
"learning_rate": 1.0515726108066025e-05,
"loss": 0.5315,
"mean_token_accuracy": 0.8299267381429672,
"step": 1630
},
{
"epoch": 1.6103113079857265,
"grad_norm": 0.23062676191329956,
"learning_rate": 1.0458465903654107e-05,
"loss": 0.5298,
"mean_token_accuracy": 0.8305988430976867,
"step": 1635
},
{
"epoch": 1.615233173372708,
"grad_norm": 0.23293638229370117,
"learning_rate": 1.0401190631628348e-05,
"loss": 0.5304,
"mean_token_accuracy": 0.8300972327589988,
"step": 1640
},
{
"epoch": 1.62015503875969,
"grad_norm": 0.22877627611160278,
"learning_rate": 1.034390217435704e-05,
"loss": 0.5287,
"mean_token_accuracy": 0.8309306666254997,
"step": 1645
},
{
"epoch": 1.6250769041466717,
"grad_norm": 0.23190174996852875,
"learning_rate": 1.0286602414641818e-05,
"loss": 0.5303,
"mean_token_accuracy": 0.8306381091475487,
"step": 1650
},
{
"epoch": 1.6299987695336533,
"grad_norm": 0.23290394246578217,
"learning_rate": 1.0229293235655768e-05,
"loss": 0.5221,
"mean_token_accuracy": 0.8326445773243905,
"step": 1655
},
{
"epoch": 1.6349206349206349,
"grad_norm": 0.22114625573158264,
"learning_rate": 1.0171976520881552e-05,
"loss": 0.5263,
"mean_token_accuracy": 0.8315576672554016,
"step": 1660
},
{
"epoch": 1.6398425003076165,
"grad_norm": 0.2297578752040863,
"learning_rate": 1.011465415404949e-05,
"loss": 0.5252,
"mean_token_accuracy": 0.8321317434310913,
"step": 1665
},
{
"epoch": 1.6447643656945983,
"grad_norm": 0.23588469624519348,
"learning_rate": 1.005732801907567e-05,
"loss": 0.5262,
"mean_token_accuracy": 0.831513050198555,
"step": 1670
},
{
"epoch": 1.64968623108158,
"grad_norm": 0.22704197466373444,
"learning_rate": 1e-05,
"loss": 0.5382,
"mean_token_accuracy": 0.8281245142221451,
"step": 1675
},
{
"epoch": 1.6546080964685617,
"grad_norm": 0.22588326036930084,
"learning_rate": 9.942671980924336e-06,
"loss": 0.5286,
"mean_token_accuracy": 0.8307414755225182,
"step": 1680
},
{
"epoch": 1.6595299618555432,
"grad_norm": 0.22511065006256104,
"learning_rate": 9.88534584595051e-06,
"loss": 0.5279,
"mean_token_accuracy": 0.83111013174057,
"step": 1685
},
{
"epoch": 1.6644518272425248,
"grad_norm": 0.24989110231399536,
"learning_rate": 9.82802347911845e-06,
"loss": 0.5257,
"mean_token_accuracy": 0.8317268043756485,
"step": 1690
},
{
"epoch": 1.6693736926295066,
"grad_norm": 0.23859356343746185,
"learning_rate": 9.770706764344235e-06,
"loss": 0.534,
"mean_token_accuracy": 0.8294050306081772,
"step": 1695
},
{
"epoch": 1.6742955580164882,
"grad_norm": 0.2304782122373581,
"learning_rate": 9.713397585358189e-06,
"loss": 0.528,
"mean_token_accuracy": 0.8308202102780342,
"step": 1700
},
{
"epoch": 1.67921742340347,
"grad_norm": 0.2276812344789505,
"learning_rate": 9.65609782564296e-06,
"loss": 0.5267,
"mean_token_accuracy": 0.8312249034643173,
"step": 1705
},
{
"epoch": 1.6841392887904516,
"grad_norm": 0.3979962170124054,
"learning_rate": 9.598809368371656e-06,
"loss": 0.5266,
"mean_token_accuracy": 0.8312003433704376,
"step": 1710
},
{
"epoch": 1.6890611541774332,
"grad_norm": 0.25581249594688416,
"learning_rate": 9.541534096345896e-06,
"loss": 0.526,
"mean_token_accuracy": 0.8315127685666084,
"step": 1715
},
{
"epoch": 1.6939830195644148,
"grad_norm": 0.2141893208026886,
"learning_rate": 9.484273891933982e-06,
"loss": 0.5252,
"mean_token_accuracy": 0.8317378848791123,
"step": 1720
},
{
"epoch": 1.6989048849513966,
"grad_norm": 0.4327445924282074,
"learning_rate": 9.427030637009002e-06,
"loss": 0.5361,
"mean_token_accuracy": 0.828312310576439,
"step": 1725
},
{
"epoch": 1.7038267503383784,
"grad_norm": 0.22412188351154327,
"learning_rate": 9.369806212887008e-06,
"loss": 0.5299,
"mean_token_accuracy": 0.830331552028656,
"step": 1730
},
{
"epoch": 1.70874861572536,
"grad_norm": 0.22056014835834503,
"learning_rate": 9.312602500265162e-06,
"loss": 0.5259,
"mean_token_accuracy": 0.831749576330185,
"step": 1735
},
{
"epoch": 1.7136704811123415,
"grad_norm": 0.23633216321468353,
"learning_rate": 9.255421379159935e-06,
"loss": 0.5152,
"mean_token_accuracy": 0.8346669390797615,
"step": 1740
},
{
"epoch": 1.7185923464993231,
"grad_norm": 0.21674410998821259,
"learning_rate": 9.198264728845332e-06,
"loss": 0.5188,
"mean_token_accuracy": 0.8335284858942031,
"step": 1745
},
{
"epoch": 1.723514211886305,
"grad_norm": 0.22083686292171478,
"learning_rate": 9.14113442779111e-06,
"loss": 0.5283,
"mean_token_accuracy": 0.8306051269173622,
"step": 1750
},
{
"epoch": 1.7284360772732867,
"grad_norm": 0.2326516956090927,
"learning_rate": 9.084032353601053e-06,
"loss": 0.5329,
"mean_token_accuracy": 0.8295654147863388,
"step": 1755
},
{
"epoch": 1.7333579426602683,
"grad_norm": 0.23140785098075867,
"learning_rate": 9.026960382951253e-06,
"loss": 0.5243,
"mean_token_accuracy": 0.8315977454185486,
"step": 1760
},
{
"epoch": 1.73827980804725,
"grad_norm": 0.24312028288841248,
"learning_rate": 8.969920391528459e-06,
"loss": 0.5218,
"mean_token_accuracy": 0.8328249961137771,
"step": 1765
},
{
"epoch": 1.7432016734342315,
"grad_norm": 0.22412382066249847,
"learning_rate": 8.912914253968391e-06,
"loss": 0.5312,
"mean_token_accuracy": 0.8298890963196754,
"step": 1770
},
{
"epoch": 1.748123538821213,
"grad_norm": 0.2266296148300171,
"learning_rate": 8.855943843794171e-06,
"loss": 0.5234,
"mean_token_accuracy": 0.8323718756437302,
"step": 1775
},
{
"epoch": 1.7530454042081949,
"grad_norm": 0.21898606419563293,
"learning_rate": 8.799011033354716e-06,
"loss": 0.5288,
"mean_token_accuracy": 0.8307971671223641,
"step": 1780
},
{
"epoch": 1.7579672695951767,
"grad_norm": 0.2306451052427292,
"learning_rate": 8.742117693763229e-06,
"loss": 0.5271,
"mean_token_accuracy": 0.8316369831562043,
"step": 1785
},
{
"epoch": 1.7628891349821583,
"grad_norm": 0.22924001514911652,
"learning_rate": 8.685265694835681e-06,
"loss": 0.5272,
"mean_token_accuracy": 0.8311286598443985,
"step": 1790
},
{
"epoch": 1.7678110003691399,
"grad_norm": 0.33131736516952515,
"learning_rate": 8.628456905029383e-06,
"loss": 0.5195,
"mean_token_accuracy": 0.833528995513916,
"step": 1795
},
{
"epoch": 1.7727328657561214,
"grad_norm": 0.24447475373744965,
"learning_rate": 8.571693191381545e-06,
"loss": 0.5221,
"mean_token_accuracy": 0.8324113413691521,
"step": 1800
},
{
"epoch": 1.7776547311431032,
"grad_norm": 0.23472720384597778,
"learning_rate": 8.514976419447963e-06,
"loss": 0.5282,
"mean_token_accuracy": 0.8306461483240127,
"step": 1805
},
{
"epoch": 1.782576596530085,
"grad_norm": 0.25232747197151184,
"learning_rate": 8.458308453241664e-06,
"loss": 0.519,
"mean_token_accuracy": 0.8334705844521523,
"step": 1810
},
{
"epoch": 1.7874984619170666,
"grad_norm": 0.22827033698558807,
"learning_rate": 8.401691155171654e-06,
"loss": 0.5353,
"mean_token_accuracy": 0.8289692014455795,
"step": 1815
},
{
"epoch": 1.7924203273040482,
"grad_norm": 0.21775387227535248,
"learning_rate": 8.345126385981737e-06,
"loss": 0.5217,
"mean_token_accuracy": 0.8326601728796958,
"step": 1820
},
{
"epoch": 1.7973421926910298,
"grad_norm": 0.22691109776496887,
"learning_rate": 8.288616004689321e-06,
"loss": 0.5208,
"mean_token_accuracy": 0.8330274626612664,
"step": 1825
},
{
"epoch": 1.8022640580780116,
"grad_norm": 0.23031188547611237,
"learning_rate": 8.23216186852435e-06,
"loss": 0.5251,
"mean_token_accuracy": 0.8317318856716156,
"step": 1830
},
{
"epoch": 1.8071859234649932,
"grad_norm": 0.23658455908298492,
"learning_rate": 8.175765832868252e-06,
"loss": 0.5263,
"mean_token_accuracy": 0.8314035385847092,
"step": 1835
},
{
"epoch": 1.812107788851975,
"grad_norm": 0.21728812158107758,
"learning_rate": 8.119429751192972e-06,
"loss": 0.5283,
"mean_token_accuracy": 0.830833038687706,
"step": 1840
},
{
"epoch": 1.8170296542389566,
"grad_norm": 0.22863180935382843,
"learning_rate": 8.063155475000037e-06,
"loss": 0.5231,
"mean_token_accuracy": 0.8322245612740516,
"step": 1845
},
{
"epoch": 1.8219515196259382,
"grad_norm": 0.22922097146511078,
"learning_rate": 8.006944853759732e-06,
"loss": 0.5242,
"mean_token_accuracy": 0.8318595319986344,
"step": 1850
},
{
"epoch": 1.8268733850129197,
"grad_norm": 0.209337517619133,
"learning_rate": 7.950799734850292e-06,
"loss": 0.5195,
"mean_token_accuracy": 0.8333837404847145,
"step": 1855
},
{
"epoch": 1.8317952503999015,
"grad_norm": 0.22603721916675568,
"learning_rate": 7.894721963497214e-06,
"loss": 0.5218,
"mean_token_accuracy": 0.8325009673833847,
"step": 1860
},
{
"epoch": 1.8367171157868833,
"grad_norm": 0.2327803522348404,
"learning_rate": 7.838713382712583e-06,
"loss": 0.5111,
"mean_token_accuracy": 0.8357574358582497,
"step": 1865
},
{
"epoch": 1.841638981173865,
"grad_norm": 0.23280593752861023,
"learning_rate": 7.782775833234522e-06,
"loss": 0.5333,
"mean_token_accuracy": 0.8295109212398529,
"step": 1870
},
{
"epoch": 1.8465608465608465,
"grad_norm": 0.2219589352607727,
"learning_rate": 7.726911153466699e-06,
"loss": 0.5255,
"mean_token_accuracy": 0.8316129177808762,
"step": 1875
},
{
"epoch": 1.851482711947828,
"grad_norm": 0.22274133563041687,
"learning_rate": 7.67112117941788e-06,
"loss": 0.5197,
"mean_token_accuracy": 0.8331713795661926,
"step": 1880
},
{
"epoch": 1.85640457733481,
"grad_norm": 0.20765641331672668,
"learning_rate": 7.615407744641618e-06,
"loss": 0.5222,
"mean_token_accuracy": 0.8323680445551872,
"step": 1885
},
{
"epoch": 1.8613264427217917,
"grad_norm": 0.22262942790985107,
"learning_rate": 7.559772680175979e-06,
"loss": 0.5256,
"mean_token_accuracy": 0.8315785735845566,
"step": 1890
},
{
"epoch": 1.8662483081087733,
"grad_norm": 0.23786763846874237,
"learning_rate": 7.504217814483364e-06,
"loss": 0.5225,
"mean_token_accuracy": 0.8326525434851646,
"step": 1895
},
{
"epoch": 1.8711701734957549,
"grad_norm": 0.22120903432369232,
"learning_rate": 7.448744973390423e-06,
"loss": 0.5322,
"mean_token_accuracy": 0.8296578034758568,
"step": 1900
},
{
"epoch": 1.8760920388827365,
"grad_norm": 0.22359086573123932,
"learning_rate": 7.393355980028039e-06,
"loss": 0.524,
"mean_token_accuracy": 0.8320103421807289,
"step": 1905
},
{
"epoch": 1.8810139042697183,
"grad_norm": 0.21293464303016663,
"learning_rate": 7.338052654771407e-06,
"loss": 0.5201,
"mean_token_accuracy": 0.8330625906586647,
"step": 1910
},
{
"epoch": 1.8859357696566998,
"grad_norm": 0.212773397564888,
"learning_rate": 7.282836815180241e-06,
"loss": 0.5212,
"mean_token_accuracy": 0.8328917175531387,
"step": 1915
},
{
"epoch": 1.8908576350436817,
"grad_norm": 0.2229495495557785,
"learning_rate": 7.227710275938987e-06,
"loss": 0.5177,
"mean_token_accuracy": 0.8338592052459717,
"step": 1920
},
{
"epoch": 1.8957795004306632,
"grad_norm": 0.22714777290821075,
"learning_rate": 7.172674848797218e-06,
"loss": 0.5196,
"mean_token_accuracy": 0.8332103446125985,
"step": 1925
},
{
"epoch": 1.9007013658176448,
"grad_norm": 0.5862542986869812,
"learning_rate": 7.117732342510093e-06,
"loss": 0.5148,
"mean_token_accuracy": 0.8348309084773063,
"step": 1930
},
{
"epoch": 1.9056232312046264,
"grad_norm": 0.21524302661418915,
"learning_rate": 7.062884562778883e-06,
"loss": 0.5225,
"mean_token_accuracy": 0.8324376299977303,
"step": 1935
},
{
"epoch": 1.9105450965916082,
"grad_norm": 0.22445465624332428,
"learning_rate": 7.008133312191649e-06,
"loss": 0.5239,
"mean_token_accuracy": 0.8318991348147392,
"step": 1940
},
{
"epoch": 1.91546696197859,
"grad_norm": 0.21925503015518188,
"learning_rate": 6.953480390164001e-06,
"loss": 0.5243,
"mean_token_accuracy": 0.8320589557290077,
"step": 1945
},
{
"epoch": 1.9203888273655716,
"grad_norm": 0.21358764171600342,
"learning_rate": 6.898927592879945e-06,
"loss": 0.5276,
"mean_token_accuracy": 0.8309697136282921,
"step": 1950
},
{
"epoch": 1.9253106927525532,
"grad_norm": 0.21541139483451843,
"learning_rate": 6.844476713232863e-06,
"loss": 0.5183,
"mean_token_accuracy": 0.8336074352264404,
"step": 1955
},
{
"epoch": 1.9302325581395348,
"grad_norm": 0.253334105014801,
"learning_rate": 6.790129540766581e-06,
"loss": 0.5217,
"mean_token_accuracy": 0.8321399599313736,
"step": 1960
},
{
"epoch": 1.9351544235265166,
"grad_norm": 0.2311272770166397,
"learning_rate": 6.735887861616555e-06,
"loss": 0.5226,
"mean_token_accuracy": 0.832192762196064,
"step": 1965
},
{
"epoch": 1.9400762889134984,
"grad_norm": 0.2155195027589798,
"learning_rate": 6.68175345845119e-06,
"loss": 0.5214,
"mean_token_accuracy": 0.8325791984796524,
"step": 1970
},
{
"epoch": 1.94499815430048,
"grad_norm": 0.2229234129190445,
"learning_rate": 6.627728110413214e-06,
"loss": 0.5228,
"mean_token_accuracy": 0.8320748254656791,
"step": 1975
},
{
"epoch": 1.9499200196874615,
"grad_norm": 0.2595667839050293,
"learning_rate": 6.5738135930612355e-06,
"loss": 0.5257,
"mean_token_accuracy": 0.831524421274662,
"step": 1980
},
{
"epoch": 1.9548418850744431,
"grad_norm": 0.21894799172878265,
"learning_rate": 6.520011678311382e-06,
"loss": 0.5135,
"mean_token_accuracy": 0.8349313631653785,
"step": 1985
},
{
"epoch": 1.959763750461425,
"grad_norm": 0.215131938457489,
"learning_rate": 6.466324134379066e-06,
"loss": 0.5125,
"mean_token_accuracy": 0.8354373678565026,
"step": 1990
},
{
"epoch": 1.9646856158484065,
"grad_norm": 0.227864071726799,
"learning_rate": 6.412752725720864e-06,
"loss": 0.5166,
"mean_token_accuracy": 0.8339696109294892,
"step": 1995
},
{
"epoch": 1.9696074812353883,
"grad_norm": 0.21633465588092804,
"learning_rate": 6.359299212976535e-06,
"loss": 0.5236,
"mean_token_accuracy": 0.8324458003044128,
"step": 2000
},
{
"epoch": 1.97452934662237,
"grad_norm": 0.2214214950799942,
"learning_rate": 6.305965352911162e-06,
"loss": 0.5186,
"mean_token_accuracy": 0.8334563329815865,
"step": 2005
},
{
"epoch": 1.9794512120093515,
"grad_norm": 0.20772044360637665,
"learning_rate": 6.252752898357397e-06,
"loss": 0.5146,
"mean_token_accuracy": 0.8346970349550247,
"step": 2010
},
{
"epoch": 1.984373077396333,
"grad_norm": 0.2208469659090042,
"learning_rate": 6.1996635981578755e-06,
"loss": 0.521,
"mean_token_accuracy": 0.8330862745642662,
"step": 2015
},
{
"epoch": 1.9892949427833149,
"grad_norm": 0.21841764450073242,
"learning_rate": 6.146699197107715e-06,
"loss": 0.5141,
"mean_token_accuracy": 0.8346462666988372,
"step": 2020
},
{
"epoch": 1.9942168081702967,
"grad_norm": 0.22905802726745605,
"learning_rate": 6.093861435897208e-06,
"loss": 0.5161,
"mean_token_accuracy": 0.8341751024127007,
"step": 2025
},
{
"epoch": 1.9991386735572783,
"grad_norm": 0.2205893099308014,
"learning_rate": 6.041152051054575e-06,
"loss": 0.5135,
"mean_token_accuracy": 0.8350084885954857,
"step": 2030
},
{
"epoch": 2.0049218653869816,
"grad_norm": 0.27798768877983093,
"learning_rate": 5.988572774888913e-06,
"loss": 0.5979,
"mean_token_accuracy": 0.8386082910909886,
"step": 2035
},
{
"epoch": 2.009843730773963,
"grad_norm": 0.24996507167816162,
"learning_rate": 5.936125335433265e-06,
"loss": 0.4945,
"mean_token_accuracy": 0.839720045030117,
"step": 2040
},
{
"epoch": 2.014765596160945,
"grad_norm": 0.2548527121543884,
"learning_rate": 5.883811456387821e-06,
"loss": 0.4941,
"mean_token_accuracy": 0.8400543674826622,
"step": 2045
},
{
"epoch": 2.0196874615479268,
"grad_norm": 0.2184976190328598,
"learning_rate": 5.831632857063271e-06,
"loss": 0.4902,
"mean_token_accuracy": 0.8409830510616303,
"step": 2050
},
{
"epoch": 2.0246093269349084,
"grad_norm": 0.22762830555438995,
"learning_rate": 5.779591252324286e-06,
"loss": 0.4904,
"mean_token_accuracy": 0.8408440828323365,
"step": 2055
},
{
"epoch": 2.02953119232189,
"grad_norm": 0.23035886883735657,
"learning_rate": 5.7276883525331915e-06,
"loss": 0.4943,
"mean_token_accuracy": 0.8397367835044861,
"step": 2060
},
{
"epoch": 2.0344530577088715,
"grad_norm": 0.22349004447460175,
"learning_rate": 5.675925863493721e-06,
"loss": 0.5009,
"mean_token_accuracy": 0.8379953891038895,
"step": 2065
},
{
"epoch": 2.0393749230958536,
"grad_norm": 0.22588923573493958,
"learning_rate": 5.6243054863949675e-06,
"loss": 0.494,
"mean_token_accuracy": 0.8397265374660492,
"step": 2070
},
{
"epoch": 2.044296788482835,
"grad_norm": 0.2168150246143341,
"learning_rate": 5.5728289177554805e-06,
"loss": 0.4975,
"mean_token_accuracy": 0.8389487206935883,
"step": 2075
},
{
"epoch": 2.0492186538698167,
"grad_norm": 0.22331282496452332,
"learning_rate": 5.521497849367501e-06,
"loss": 0.4859,
"mean_token_accuracy": 0.8422671511769295,
"step": 2080
},
{
"epoch": 2.0541405192567983,
"grad_norm": 0.21221551299095154,
"learning_rate": 5.4703139682413585e-06,
"loss": 0.4866,
"mean_token_accuracy": 0.8420242533087731,
"step": 2085
},
{
"epoch": 2.05906238464378,
"grad_norm": 0.22058208286762238,
"learning_rate": 5.419278956550037e-06,
"loss": 0.4955,
"mean_token_accuracy": 0.8394055813550949,
"step": 2090
},
{
"epoch": 2.0639842500307615,
"grad_norm": 0.22200560569763184,
"learning_rate": 5.368394491573876e-06,
"loss": 0.493,
"mean_token_accuracy": 0.8402127623558044,
"step": 2095
},
{
"epoch": 2.0689061154177435,
"grad_norm": 0.2220141738653183,
"learning_rate": 5.31766224564547e-06,
"loss": 0.4958,
"mean_token_accuracy": 0.8393116250634194,
"step": 2100
},
{
"epoch": 2.073827980804725,
"grad_norm": 0.21074913442134857,
"learning_rate": 5.267083886094668e-06,
"loss": 0.4931,
"mean_token_accuracy": 0.840206652879715,
"step": 2105
},
{
"epoch": 2.0787498461917067,
"grad_norm": 0.2276320606470108,
"learning_rate": 5.216661075193814e-06,
"loss": 0.4955,
"mean_token_accuracy": 0.8393134921789169,
"step": 2110
},
{
"epoch": 2.0836717115786882,
"grad_norm": 0.2224099338054657,
"learning_rate": 5.166395470103092e-06,
"loss": 0.4937,
"mean_token_accuracy": 0.8397904768586159,
"step": 2115
},
{
"epoch": 2.08859357696567,
"grad_norm": 0.22312206029891968,
"learning_rate": 5.116288722816087e-06,
"loss": 0.493,
"mean_token_accuracy": 0.8403119757771492,
"step": 2120
},
{
"epoch": 2.093515442352652,
"grad_norm": 0.2194313257932663,
"learning_rate": 5.06634248010546e-06,
"loss": 0.4935,
"mean_token_accuracy": 0.8400413483381272,
"step": 2125
},
{
"epoch": 2.0984373077396334,
"grad_norm": 0.22484691441059113,
"learning_rate": 5.016558383468851e-06,
"loss": 0.49,
"mean_token_accuracy": 0.8409391462802887,
"step": 2130
},
{
"epoch": 2.103359173126615,
"grad_norm": 0.22470517456531525,
"learning_rate": 4.9669380690749215e-06,
"loss": 0.497,
"mean_token_accuracy": 0.8389460816979408,
"step": 2135
},
{
"epoch": 2.1082810385135966,
"grad_norm": 0.21832752227783203,
"learning_rate": 4.91748316770958e-06,
"loss": 0.4926,
"mean_token_accuracy": 0.8401527449488639,
"step": 2140
},
{
"epoch": 2.113202903900578,
"grad_norm": 0.21521726250648499,
"learning_rate": 4.868195304722391e-06,
"loss": 0.4979,
"mean_token_accuracy": 0.8387278065085411,
"step": 2145
},
{
"epoch": 2.1181247692875598,
"grad_norm": 0.21682803332805634,
"learning_rate": 4.819076099973152e-06,
"loss": 0.5014,
"mean_token_accuracy": 0.83763497620821,
"step": 2150
},
{
"epoch": 2.123046634674542,
"grad_norm": 0.2204725295305252,
"learning_rate": 4.77012716777867e-06,
"loss": 0.4989,
"mean_token_accuracy": 0.8380599915981293,
"step": 2155
},
{
"epoch": 2.1279685000615234,
"grad_norm": 0.2179991751909256,
"learning_rate": 4.721350116859675e-06,
"loss": 0.4946,
"mean_token_accuracy": 0.8396460056304932,
"step": 2160
},
{
"epoch": 2.132890365448505,
"grad_norm": 0.21851445734500885,
"learning_rate": 4.672746550287985e-06,
"loss": 0.4947,
"mean_token_accuracy": 0.8395410850644112,
"step": 2165
},
{
"epoch": 2.1378122308354865,
"grad_norm": 0.21560297906398773,
"learning_rate": 4.6243180654337975e-06,
"loss": 0.4857,
"mean_token_accuracy": 0.8421663656830788,
"step": 2170
},
{
"epoch": 2.142734096222468,
"grad_norm": 0.21567942202091217,
"learning_rate": 4.576066253913209e-06,
"loss": 0.493,
"mean_token_accuracy": 0.840301775932312,
"step": 2175
},
{
"epoch": 2.14765596160945,
"grad_norm": 0.22145864367485046,
"learning_rate": 4.527992701535884e-06,
"loss": 0.4844,
"mean_token_accuracy": 0.8423144072294235,
"step": 2180
},
{
"epoch": 2.1525778269964317,
"grad_norm": 0.217710942029953,
"learning_rate": 4.480098988252958e-06,
"loss": 0.4919,
"mean_token_accuracy": 0.84017314016819,
"step": 2185
},
{
"epoch": 2.1574996923834133,
"grad_norm": 0.2169259786605835,
"learning_rate": 4.432386688105095e-06,
"loss": 0.4929,
"mean_token_accuracy": 0.840173925459385,
"step": 2190
},
{
"epoch": 2.162421557770395,
"grad_norm": 0.21104402840137482,
"learning_rate": 4.384857369170772e-06,
"loss": 0.4875,
"mean_token_accuracy": 0.8417868033051491,
"step": 2195
},
{
"epoch": 2.1673434231573765,
"grad_norm": 0.21658702194690704,
"learning_rate": 4.337512593514729e-06,
"loss": 0.4947,
"mean_token_accuracy": 0.8395476669073105,
"step": 2200
},
{
"epoch": 2.1722652885443585,
"grad_norm": 0.22858913242816925,
"learning_rate": 4.290353917136639e-06,
"loss": 0.4901,
"mean_token_accuracy": 0.8408517464995384,
"step": 2205
},
{
"epoch": 2.17718715393134,
"grad_norm": 0.4094144105911255,
"learning_rate": 4.243382889919981e-06,
"loss": 0.496,
"mean_token_accuracy": 0.8392629832029342,
"step": 2210
},
{
"epoch": 2.1821090193183217,
"grad_norm": 0.21924547851085663,
"learning_rate": 4.1966010555810696e-06,
"loss": 0.4899,
"mean_token_accuracy": 0.841227824985981,
"step": 2215
},
{
"epoch": 2.1870308847053033,
"grad_norm": 0.21283064782619476,
"learning_rate": 4.1500099516183555e-06,
"loss": 0.4913,
"mean_token_accuracy": 0.8405321702361107,
"step": 2220
},
{
"epoch": 2.191952750092285,
"grad_norm": 0.21150268614292145,
"learning_rate": 4.1036111092618725e-06,
"loss": 0.4895,
"mean_token_accuracy": 0.8410715743899345,
"step": 2225
},
{
"epoch": 2.1968746154792664,
"grad_norm": 0.20887652039527893,
"learning_rate": 4.057406053422933e-06,
"loss": 0.4935,
"mean_token_accuracy": 0.8398977249860764,
"step": 2230
},
{
"epoch": 2.2017964808662485,
"grad_norm": 0.20756816864013672,
"learning_rate": 4.011396302643989e-06,
"loss": 0.4846,
"mean_token_accuracy": 0.842858923971653,
"step": 2235
},
{
"epoch": 2.20671834625323,
"grad_norm": 0.23419924080371857,
"learning_rate": 3.965583369048737e-06,
"loss": 0.4963,
"mean_token_accuracy": 0.8392103880643844,
"step": 2240
},
{
"epoch": 2.2116402116402116,
"grad_norm": 0.21532607078552246,
"learning_rate": 3.919968758292425e-06,
"loss": 0.4883,
"mean_token_accuracy": 0.8413224458694458,
"step": 2245
},
{
"epoch": 2.216562077027193,
"grad_norm": 0.2164084017276764,
"learning_rate": 3.874553969512358e-06,
"loss": 0.4885,
"mean_token_accuracy": 0.8415488794445991,
"step": 2250
},
{
"epoch": 2.221483942414175,
"grad_norm": 0.21010589599609375,
"learning_rate": 3.82934049527864e-06,
"loss": 0.4918,
"mean_token_accuracy": 0.8404750242829323,
"step": 2255
},
{
"epoch": 2.226405807801157,
"grad_norm": 0.20962242782115936,
"learning_rate": 3.784329821545105e-06,
"loss": 0.4962,
"mean_token_accuracy": 0.839095975458622,
"step": 2260
},
{
"epoch": 2.2313276731881384,
"grad_norm": 0.20551133155822754,
"learning_rate": 3.739523427600509e-06,
"loss": 0.4911,
"mean_token_accuracy": 0.8407798200845719,
"step": 2265
},
{
"epoch": 2.23624953857512,
"grad_norm": 0.21332746744155884,
"learning_rate": 3.6949227860198712e-06,
"loss": 0.492,
"mean_token_accuracy": 0.8405194252729415,
"step": 2270
},
{
"epoch": 2.2411714039621016,
"grad_norm": 0.26087722182273865,
"learning_rate": 3.650529362616113e-06,
"loss": 0.4875,
"mean_token_accuracy": 0.8417001351714134,
"step": 2275
},
{
"epoch": 2.246093269349083,
"grad_norm": 0.20974403619766235,
"learning_rate": 3.606344616391867e-06,
"loss": 0.4938,
"mean_token_accuracy": 0.8395893201231956,
"step": 2280
},
{
"epoch": 2.2510151347360647,
"grad_norm": 0.22249352931976318,
"learning_rate": 3.5623699994915363e-06,
"loss": 0.4916,
"mean_token_accuracy": 0.840800578892231,
"step": 2285
},
{
"epoch": 2.2559370001230468,
"grad_norm": 0.20673160254955292,
"learning_rate": 3.5186069571535575e-06,
"loss": 0.4876,
"mean_token_accuracy": 0.8417642295360566,
"step": 2290
},
{
"epoch": 2.2608588655100283,
"grad_norm": 0.2050849050283432,
"learning_rate": 3.475056927662912e-06,
"loss": 0.4922,
"mean_token_accuracy": 0.8401932448148728,
"step": 2295
},
{
"epoch": 2.26578073089701,
"grad_norm": 0.2113514542579651,
"learning_rate": 3.4317213423038386e-06,
"loss": 0.4925,
"mean_token_accuracy": 0.8401719897985458,
"step": 2300
},
{
"epoch": 2.2707025962839915,
"grad_norm": 0.21461407840251923,
"learning_rate": 3.388601625312833e-06,
"loss": 0.4892,
"mean_token_accuracy": 0.841229310631752,
"step": 2305
},
{
"epoch": 2.275624461670973,
"grad_norm": 0.20549601316452026,
"learning_rate": 3.345699193831795e-06,
"loss": 0.4917,
"mean_token_accuracy": 0.8405207619071007,
"step": 2310
},
{
"epoch": 2.280546327057955,
"grad_norm": 0.21262629330158234,
"learning_rate": 3.3030154578614783e-06,
"loss": 0.4898,
"mean_token_accuracy": 0.8410497605800629,
"step": 2315
},
{
"epoch": 2.2854681924449367,
"grad_norm": 0.2351827323436737,
"learning_rate": 3.2605518202151577e-06,
"loss": 0.4945,
"mean_token_accuracy": 0.8394208237528801,
"step": 2320
},
{
"epoch": 2.2903900578319183,
"grad_norm": 0.21704116463661194,
"learning_rate": 3.218309676472492e-06,
"loss": 0.489,
"mean_token_accuracy": 0.8411409676074981,
"step": 2325
},
{
"epoch": 2.2953119232189,
"grad_norm": 0.20750364661216736,
"learning_rate": 3.1762904149336947e-06,
"loss": 0.4942,
"mean_token_accuracy": 0.8396940395236016,
"step": 2330
},
{
"epoch": 2.3002337886058815,
"grad_norm": 0.20055250823497772,
"learning_rate": 3.134495416573884e-06,
"loss": 0.4871,
"mean_token_accuracy": 0.8417407006025315,
"step": 2335
},
{
"epoch": 2.3051556539928635,
"grad_norm": 0.20621967315673828,
"learning_rate": 3.0929260549977116e-06,
"loss": 0.4883,
"mean_token_accuracy": 0.8415425732731819,
"step": 2340
},
{
"epoch": 2.310077519379845,
"grad_norm": 0.210305854678154,
"learning_rate": 3.0515836963942056e-06,
"loss": 0.4929,
"mean_token_accuracy": 0.8403278931975364,
"step": 2345
},
{
"epoch": 2.3149993847668267,
"grad_norm": 0.25147390365600586,
"learning_rate": 3.01046969949188e-06,
"loss": 0.4909,
"mean_token_accuracy": 0.8407050803303718,
"step": 2350
},
{
"epoch": 2.3199212501538082,
"grad_norm": 0.21020571887493134,
"learning_rate": 2.9695854155140648e-06,
"loss": 0.4895,
"mean_token_accuracy": 0.8410211369395256,
"step": 2355
},
{
"epoch": 2.32484311554079,
"grad_norm": 0.21094508469104767,
"learning_rate": 2.9289321881345257e-06,
"loss": 0.4889,
"mean_token_accuracy": 0.841056476533413,
"step": 2360
},
{
"epoch": 2.329764980927772,
"grad_norm": 0.21813294291496277,
"learning_rate": 2.8885113534332742e-06,
"loss": 0.4928,
"mean_token_accuracy": 0.8402146637439728,
"step": 2365
},
{
"epoch": 2.3346868463147534,
"grad_norm": 0.21038471162319183,
"learning_rate": 2.8483242398526723e-06,
"loss": 0.4875,
"mean_token_accuracy": 0.8416903391480446,
"step": 2370
},
{
"epoch": 2.339608711701735,
"grad_norm": 0.21476763486862183,
"learning_rate": 2.80837216815378e-06,
"loss": 0.4883,
"mean_token_accuracy": 0.8410104081034661,
"step": 2375
},
{
"epoch": 2.3445305770887166,
"grad_norm": 0.2148827761411667,
"learning_rate": 2.7686564513729198e-06,
"loss": 0.4938,
"mean_token_accuracy": 0.8401752710342407,
"step": 2380
},
{
"epoch": 2.349452442475698,
"grad_norm": 0.20347550511360168,
"learning_rate": 2.7291783947785544e-06,
"loss": 0.4891,
"mean_token_accuracy": 0.841368468105793,
"step": 2385
},
{
"epoch": 2.35437430786268,
"grad_norm": 0.2156437486410141,
"learning_rate": 2.689939295828371e-06,
"loss": 0.4926,
"mean_token_accuracy": 0.8401880413293839,
"step": 2390
},
{
"epoch": 2.359296173249662,
"grad_norm": 0.20905110239982605,
"learning_rate": 2.650940444126654e-06,
"loss": 0.4915,
"mean_token_accuracy": 0.8407162860035896,
"step": 2395
},
{
"epoch": 2.3642180386366434,
"grad_norm": 0.20476758480072021,
"learning_rate": 2.6121831213818825e-06,
"loss": 0.4932,
"mean_token_accuracy": 0.840287271142006,
"step": 2400
},
{
"epoch": 2.369139904023625,
"grad_norm": 0.1986178457736969,
"learning_rate": 2.5736686013646226e-06,
"loss": 0.4857,
"mean_token_accuracy": 0.8420573100447655,
"step": 2405
},
{
"epoch": 2.3740617694106065,
"grad_norm": 0.21784992516040802,
"learning_rate": 2.535398149865651e-06,
"loss": 0.4888,
"mean_token_accuracy": 0.8410965353250504,
"step": 2410
},
{
"epoch": 2.378983634797588,
"grad_norm": 0.20018485188484192,
"learning_rate": 2.4973730246543736e-06,
"loss": 0.4913,
"mean_token_accuracy": 0.8406006515026092,
"step": 2415
},
{
"epoch": 2.3839055001845697,
"grad_norm": 0.21187762916088104,
"learning_rate": 2.4595944754374723e-06,
"loss": 0.4972,
"mean_token_accuracy": 0.8388384222984314,
"step": 2420
},
{
"epoch": 2.3888273655715517,
"grad_norm": 0.2048918604850769,
"learning_rate": 2.422063743817832e-06,
"loss": 0.4936,
"mean_token_accuracy": 0.8397043973207474,
"step": 2425
},
{
"epoch": 2.3937492309585333,
"grad_norm": 0.2068692445755005,
"learning_rate": 2.3847820632537565e-06,
"loss": 0.4973,
"mean_token_accuracy": 0.8392092302441597,
"step": 2430
},
{
"epoch": 2.398671096345515,
"grad_norm": 0.2050062119960785,
"learning_rate": 2.347750659018397e-06,
"loss": 0.4964,
"mean_token_accuracy": 0.8390960440039634,
"step": 2435
},
{
"epoch": 2.4035929617324965,
"grad_norm": 0.20241810381412506,
"learning_rate": 2.3109707481595113e-06,
"loss": 0.4826,
"mean_token_accuracy": 0.8431760326027871,
"step": 2440
},
{
"epoch": 2.408514827119478,
"grad_norm": 0.2023165076971054,
"learning_rate": 2.27444353945945e-06,
"loss": 0.484,
"mean_token_accuracy": 0.8427256375551224,
"step": 2445
},
{
"epoch": 2.41343669250646,
"grad_norm": 0.2395012527704239,
"learning_rate": 2.2381702333954436e-06,
"loss": 0.4843,
"mean_token_accuracy": 0.8425970792770385,
"step": 2450
},
{
"epoch": 2.4183585578934417,
"grad_norm": 0.20210982859134674,
"learning_rate": 2.2021520221001304e-06,
"loss": 0.488,
"mean_token_accuracy": 0.8415813356637954,
"step": 2455
},
{
"epoch": 2.4232804232804233,
"grad_norm": 0.2082945853471756,
"learning_rate": 2.16639008932239e-06,
"loss": 0.4937,
"mean_token_accuracy": 0.8398790895938874,
"step": 2460
},
{
"epoch": 2.428202288667405,
"grad_norm": 0.20752127468585968,
"learning_rate": 2.130885610388428e-06,
"loss": 0.4959,
"mean_token_accuracy": 0.839399340748787,
"step": 2465
},
{
"epoch": 2.4331241540543864,
"grad_norm": 0.20869506895542145,
"learning_rate": 2.0956397521631666e-06,
"loss": 0.4868,
"mean_token_accuracy": 0.8415920332074165,
"step": 2470
},
{
"epoch": 2.4380460194413685,
"grad_norm": 0.20477741956710815,
"learning_rate": 2.0606536730118767e-06,
"loss": 0.4829,
"mean_token_accuracy": 0.8429039210081101,
"step": 2475
},
{
"epoch": 2.44296788482835,
"grad_norm": 0.20474423468112946,
"learning_rate": 2.0259285227621152e-06,
"loss": 0.4981,
"mean_token_accuracy": 0.8382045805454255,
"step": 2480
},
{
"epoch": 2.4478897502153316,
"grad_norm": 0.20369385182857513,
"learning_rate": 1.9914654426659374e-06,
"loss": 0.4926,
"mean_token_accuracy": 0.839960803091526,
"step": 2485
},
{
"epoch": 2.452811615602313,
"grad_norm": 0.2068207710981369,
"learning_rate": 1.9572655653623884e-06,
"loss": 0.4935,
"mean_token_accuracy": 0.8397150009870529,
"step": 2490
},
{
"epoch": 2.457733480989295,
"grad_norm": 0.20661979913711548,
"learning_rate": 1.9233300148402767e-06,
"loss": 0.4924,
"mean_token_accuracy": 0.8401017665863038,
"step": 2495
},
{
"epoch": 2.462655346376277,
"grad_norm": 0.21355277299880981,
"learning_rate": 1.88965990640123e-06,
"loss": 0.487,
"mean_token_accuracy": 0.8420075699687004,
"step": 2500
},
{
"epoch": 2.4675772117632584,
"grad_norm": 0.209817573428154,
"learning_rate": 1.8562563466230577e-06,
"loss": 0.4924,
"mean_token_accuracy": 0.8402795165777206,
"step": 2505
},
{
"epoch": 2.47249907715024,
"grad_norm": 0.1972341388463974,
"learning_rate": 1.823120433323361e-06,
"loss": 0.4912,
"mean_token_accuracy": 0.8408435776829719,
"step": 2510
},
{
"epoch": 2.4774209425372216,
"grad_norm": 0.20761115849018097,
"learning_rate": 1.7902532555234653e-06,
"loss": 0.4977,
"mean_token_accuracy": 0.838873790204525,
"step": 2515
},
{
"epoch": 2.482342807924203,
"grad_norm": 0.22367697954177856,
"learning_rate": 1.757655893412622e-06,
"loss": 0.4876,
"mean_token_accuracy": 0.8413331776857376,
"step": 2520
},
{
"epoch": 2.487264673311185,
"grad_norm": 0.20876270532608032,
"learning_rate": 1.7253294183125223e-06,
"loss": 0.4901,
"mean_token_accuracy": 0.8411200374364853,
"step": 2525
},
{
"epoch": 2.4921865386981668,
"grad_norm": 0.20132075250148773,
"learning_rate": 1.6932748926420695e-06,
"loss": 0.4953,
"mean_token_accuracy": 0.8395631924271584,
"step": 2530
},
{
"epoch": 2.4971084040851483,
"grad_norm": 0.1999741941690445,
"learning_rate": 1.661493369882473e-06,
"loss": 0.4796,
"mean_token_accuracy": 0.843748077750206,
"step": 2535
},
{
"epoch": 2.50203026947213,
"grad_norm": 0.21044902503490448,
"learning_rate": 1.6299858945426251e-06,
"loss": 0.4856,
"mean_token_accuracy": 0.8423863723874092,
"step": 2540
},
{
"epoch": 2.5069521348591115,
"grad_norm": 0.19819578528404236,
"learning_rate": 1.5987535021247668e-06,
"loss": 0.4855,
"mean_token_accuracy": 0.8423318341374397,
"step": 2545
},
{
"epoch": 2.5118740002460935,
"grad_norm": 0.2015785425901413,
"learning_rate": 1.5677972190904623e-06,
"loss": 0.4873,
"mean_token_accuracy": 0.8417120486497879,
"step": 2550
},
{
"epoch": 2.5167958656330747,
"grad_norm": 0.20403100550174713,
"learning_rate": 1.537118062826859e-06,
"loss": 0.4809,
"mean_token_accuracy": 0.8435953631997108,
"step": 2555
},
{
"epoch": 2.5217177310200567,
"grad_norm": 0.2051580399274826,
"learning_rate": 1.5067170416132603e-06,
"loss": 0.4841,
"mean_token_accuracy": 0.842904870212078,
"step": 2560
},
{
"epoch": 2.5266395964070383,
"grad_norm": 0.20559805631637573,
"learning_rate": 1.4765951545879732e-06,
"loss": 0.4953,
"mean_token_accuracy": 0.8392938315868378,
"step": 2565
},
{
"epoch": 2.53156146179402,
"grad_norm": 0.21315298974514008,
"learning_rate": 1.4467533917154842e-06,
"loss": 0.4812,
"mean_token_accuracy": 0.8433891490101815,
"step": 2570
},
{
"epoch": 2.5364833271810014,
"grad_norm": 0.33885088562965393,
"learning_rate": 1.4171927337539103e-06,
"loss": 0.4925,
"mean_token_accuracy": 0.8398235127329826,
"step": 2575
},
{
"epoch": 2.541405192567983,
"grad_norm": 0.19653761386871338,
"learning_rate": 1.3879141522227878e-06,
"loss": 0.4903,
"mean_token_accuracy": 0.8408400386571884,
"step": 2580
},
{
"epoch": 2.546327057954965,
"grad_norm": 0.19870713353157043,
"learning_rate": 1.3589186093711227e-06,
"loss": 0.4811,
"mean_token_accuracy": 0.8433947190642357,
"step": 2585
},
{
"epoch": 2.5512489233419466,
"grad_norm": 0.20051565766334534,
"learning_rate": 1.3302070581457716e-06,
"loss": 0.4994,
"mean_token_accuracy": 0.838576278090477,
"step": 2590
},
{
"epoch": 2.5561707887289282,
"grad_norm": 0.2312447875738144,
"learning_rate": 1.3017804421601298e-06,
"loss": 0.492,
"mean_token_accuracy": 0.8404266074299812,
"step": 2595
},
{
"epoch": 2.56109265411591,
"grad_norm": 0.21526625752449036,
"learning_rate": 1.273639695663108e-06,
"loss": 0.4916,
"mean_token_accuracy": 0.8403177246451378,
"step": 2600
},
{
"epoch": 2.5660145195028914,
"grad_norm": 0.4974516034126282,
"learning_rate": 1.245785743508441e-06,
"loss": 0.4887,
"mean_token_accuracy": 0.8414172142744064,
"step": 2605
},
{
"epoch": 2.5709363848898734,
"grad_norm": 0.19956116378307343,
"learning_rate": 1.2182195011242747e-06,
"loss": 0.5017,
"mean_token_accuracy": 0.837465213239193,
"step": 2610
},
{
"epoch": 2.575858250276855,
"grad_norm": 0.19986701011657715,
"learning_rate": 1.1909418744831048e-06,
"loss": 0.4878,
"mean_token_accuracy": 0.8414024114608765,
"step": 2615
},
{
"epoch": 2.5807801156638366,
"grad_norm": 0.20174540579319,
"learning_rate": 1.1639537600719764e-06,
"loss": 0.4858,
"mean_token_accuracy": 0.8420050874352455,
"step": 2620
},
{
"epoch": 2.585701981050818,
"grad_norm": 0.20654183626174927,
"learning_rate": 1.1372560448630377e-06,
"loss": 0.4938,
"mean_token_accuracy": 0.8395126640796662,
"step": 2625
},
{
"epoch": 2.5906238464377997,
"grad_norm": 0.19598302245140076,
"learning_rate": 1.1108496062843743e-06,
"loss": 0.486,
"mean_token_accuracy": 0.8420949026942253,
"step": 2630
},
{
"epoch": 2.5955457118247818,
"grad_norm": 0.20486712455749512,
"learning_rate": 1.0847353121911952e-06,
"loss": 0.4891,
"mean_token_accuracy": 0.8409939989447593,
"step": 2635
},
{
"epoch": 2.6004675772117634,
"grad_norm": 0.2051970511674881,
"learning_rate": 1.0589140208372872e-06,
"loss": 0.4871,
"mean_token_accuracy": 0.8416621774435044,
"step": 2640
},
{
"epoch": 2.605389442598745,
"grad_norm": 0.20128969848155975,
"learning_rate": 1.0333865808468203e-06,
"loss": 0.4824,
"mean_token_accuracy": 0.8431450635194778,
"step": 2645
},
{
"epoch": 2.6103113079857265,
"grad_norm": 0.2007114738225937,
"learning_rate": 1.008153831186457e-06,
"loss": 0.4917,
"mean_token_accuracy": 0.8406037405133248,
"step": 2650
},
{
"epoch": 2.615233173372708,
"grad_norm": 0.19757139682769775,
"learning_rate": 9.83216601137773e-07,
"loss": 0.488,
"mean_token_accuracy": 0.8414921492338181,
"step": 2655
},
{
"epoch": 2.62015503875969,
"grad_norm": 0.21764694154262543,
"learning_rate": 9.58575710270011e-07,
"loss": 0.4819,
"mean_token_accuracy": 0.8431682124733925,
"step": 2660
},
{
"epoch": 2.6250769041466717,
"grad_norm": 0.20229902863502502,
"learning_rate": 9.342319684131396e-07,
"loss": 0.4916,
"mean_token_accuracy": 0.8404648944735527,
"step": 2665
},
{
"epoch": 2.6299987695336533,
"grad_norm": 0.22413024306297302,
"learning_rate": 9.101861756312369e-07,
"loss": 0.489,
"mean_token_accuracy": 0.8410172060132026,
"step": 2670
},
{
"epoch": 2.634920634920635,
"grad_norm": 0.1993047147989273,
"learning_rate": 8.864391221962065e-07,
"loss": 0.488,
"mean_token_accuracy": 0.841397476196289,
"step": 2675
},
{
"epoch": 2.6398425003076165,
"grad_norm": 0.20383085310459137,
"learning_rate": 8.629915885617912e-07,
"loss": 0.4906,
"mean_token_accuracy": 0.8405807599425316,
"step": 2680
},
{
"epoch": 2.6447643656945985,
"grad_norm": 0.19943130016326904,
"learning_rate": 8.398443453379268e-07,
"loss": 0.4872,
"mean_token_accuracy": 0.841593649983406,
"step": 2685
},
{
"epoch": 2.64968623108158,
"grad_norm": 0.19960327446460724,
"learning_rate": 8.169981532654269e-07,
"loss": 0.4854,
"mean_token_accuracy": 0.8422250881791115,
"step": 2690
},
{
"epoch": 2.6546080964685617,
"grad_norm": 0.20726507902145386,
"learning_rate": 7.944537631909666e-07,
"loss": 0.4855,
"mean_token_accuracy": 0.8422259956598281,
"step": 2695
},
{
"epoch": 2.6595299618555432,
"grad_norm": 0.19812346994876862,
"learning_rate": 7.722119160424113e-07,
"loss": 0.4867,
"mean_token_accuracy": 0.842007802426815,
"step": 2700
},
{
"epoch": 2.664451827242525,
"grad_norm": 0.19591908156871796,
"learning_rate": 7.502733428044684e-07,
"loss": 0.486,
"mean_token_accuracy": 0.8423181056976319,
"step": 2705
},
{
"epoch": 2.669373692629507,
"grad_norm": 0.195572167634964,
"learning_rate": 7.286387644946602e-07,
"loss": 0.4965,
"mean_token_accuracy": 0.8387840166687965,
"step": 2710
},
{
"epoch": 2.674295558016488,
"grad_norm": 0.2031807154417038,
"learning_rate": 7.073088921396287e-07,
"loss": 0.4907,
"mean_token_accuracy": 0.840399731695652,
"step": 2715
},
{
"epoch": 2.67921742340347,
"grad_norm": 0.2004314363002777,
"learning_rate": 6.862844267517643e-07,
"loss": 0.4846,
"mean_token_accuracy": 0.8423734799027442,
"step": 2720
},
{
"epoch": 2.6841392887904516,
"grad_norm": 0.20816642045974731,
"learning_rate": 6.655660593061719e-07,
"loss": 0.4982,
"mean_token_accuracy": 0.8385626211762428,
"step": 2725
},
{
"epoch": 2.689061154177433,
"grad_norm": 0.20351089537143707,
"learning_rate": 6.451544707179635e-07,
"loss": 0.4948,
"mean_token_accuracy": 0.8395294427871705,
"step": 2730
},
{
"epoch": 2.6939830195644148,
"grad_norm": 0.20076881349086761,
"learning_rate": 6.250503318198664e-07,
"loss": 0.4888,
"mean_token_accuracy": 0.8412301942706109,
"step": 2735
},
{
"epoch": 2.6989048849513964,
"grad_norm": 0.25244539976119995,
"learning_rate": 6.052543033401892e-07,
"loss": 0.4918,
"mean_token_accuracy": 0.8402833178639412,
"step": 2740
},
{
"epoch": 2.7038267503383784,
"grad_norm": 0.2058088779449463,
"learning_rate": 5.857670358811096e-07,
"loss": 0.4914,
"mean_token_accuracy": 0.8405940279364585,
"step": 2745
},
{
"epoch": 2.70874861572536,
"grad_norm": 0.2002749741077423,
"learning_rate": 5.665891698972769e-07,
"loss": 0.4956,
"mean_token_accuracy": 0.8391197189688683,
"step": 2750
},
{
"epoch": 2.7136704811123415,
"grad_norm": 0.19865228235721588,
"learning_rate": 5.477213356747746e-07,
"loss": 0.4894,
"mean_token_accuracy": 0.8410469844937325,
"step": 2755
},
{
"epoch": 2.718592346499323,
"grad_norm": 0.20059484243392944,
"learning_rate": 5.291641533104053e-07,
"loss": 0.4817,
"mean_token_accuracy": 0.8434463173151017,
"step": 2760
},
{ |
|
"epoch": 2.7235142118863047, |
|
"grad_norm": 0.19962534308433533, |
|
"learning_rate": 5.109182326913053e-07, |
|
"loss": 0.4815, |
|
"mean_token_accuracy": 0.8433682397007942, |
|
"step": 2765 |
|
}, |
|
{ |
|
"epoch": 2.7284360772732867, |
|
"grad_norm": 0.1976374238729477, |
|
"learning_rate": 4.929841734749063e-07, |
|
"loss": 0.4824, |
|
"mean_token_accuracy": 0.8429444268345833, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 2.7333579426602683, |
|
"grad_norm": 0.1919257491827011, |
|
"learning_rate": 4.7536256506922507e-07, |
|
"loss": 0.4858, |
|
"mean_token_accuracy": 0.8420413583517075, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 2.73827980804725, |
|
"grad_norm": 0.21447736024856567, |
|
"learning_rate": 4.580539866134914e-07, |
|
"loss": 0.4898, |
|
"mean_token_accuracy": 0.8408365085721016, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 2.7432016734342315, |
|
"grad_norm": 0.20053516328334808, |
|
"learning_rate": 4.410590069591192e-07, |
|
"loss": 0.4918, |
|
"mean_token_accuracy": 0.8403174698352813, |
|
"step": 2785 |
|
}, |
|
{ |
|
"epoch": 2.748123538821213, |
|
"grad_norm": 0.3303152620792389, |
|
"learning_rate": 4.2437818465100313e-07, |
|
"loss": 0.4812, |
|
"mean_token_accuracy": 0.8434215649962425, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 2.753045404208195, |
|
"grad_norm": 0.194558247923851, |
|
"learning_rate": 4.0801206790916815e-07, |
|
"loss": 0.4804, |
|
"mean_token_accuracy": 0.8438364923000335, |
|
"step": 2795 |
|
}, |
|
{ |
|
"epoch": 2.7579672695951767, |
|
"grad_norm": 0.19499559700489044, |
|
"learning_rate": 3.919611946107493e-07, |
|
"loss": 0.4825, |
|
"mean_token_accuracy": 0.8429989367723465, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 2.7628891349821583, |
|
"grad_norm": 0.19578364491462708, |
|
"learning_rate": 3.762260922723182e-07, |
|
"loss": 0.4866, |
|
"mean_token_accuracy": 0.8416179150342942, |
|
"step": 2805 |
|
}, |
|
{ |
|
"epoch": 2.76781100036914, |
|
"grad_norm": 0.20279313623905182, |
|
"learning_rate": 3.6080727803254003e-07, |
|
"loss": 0.4913, |
|
"mean_token_accuracy": 0.8406556889414787, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 2.7727328657561214, |
|
"grad_norm": 0.20414599776268005, |
|
"learning_rate": 3.457052586351817e-07, |
|
"loss": 0.4921, |
|
"mean_token_accuracy": 0.8403137296438217, |
|
"step": 2815 |
|
}, |
|
{ |
|
"epoch": 2.7776547311431035, |
|
"grad_norm": 0.20257827639579773, |
|
"learning_rate": 3.309205304124552e-07, |
|
"loss": 0.4888, |
|
"mean_token_accuracy": 0.841057425737381, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 2.782576596530085, |
|
"grad_norm": 0.19924387335777283, |
|
"learning_rate": 3.1645357926870957e-07, |
|
"loss": 0.4966, |
|
"mean_token_accuracy": 0.8389097020030022, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 2.7874984619170666, |
|
"grad_norm": 0.20351967215538025, |
|
"learning_rate": 3.0230488066445465e-07, |
|
"loss": 0.4912, |
|
"mean_token_accuracy": 0.8404456153512001, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 2.792420327304048, |
|
"grad_norm": 0.199168398976326, |
|
"learning_rate": 2.8847489960074136e-07, |
|
"loss": 0.4936, |
|
"mean_token_accuracy": 0.8398653537034988, |
|
"step": 2835 |
|
}, |
|
{ |
|
"epoch": 2.79734219269103, |
|
"grad_norm": 0.19794094562530518, |
|
"learning_rate": 2.7496409060387973e-07, |
|
"loss": 0.4962, |
|
"mean_token_accuracy": 0.8388495057821274, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 2.802264058078012, |
|
"grad_norm": 0.19937507808208466, |
|
"learning_rate": 2.6177289771049274e-07, |
|
"loss": 0.4895, |
|
"mean_token_accuracy": 0.8410208597779274, |
|
"step": 2845 |
|
}, |
|
{ |
|
"epoch": 2.807185923464993, |
|
"grad_norm": 0.19925516843795776, |
|
"learning_rate": 2.489017544529315e-07, |
|
"loss": 0.4875, |
|
"mean_token_accuracy": 0.8415358811616898, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 2.812107788851975, |
|
"grad_norm": 0.19592879712581635, |
|
"learning_rate": 2.3635108384502003e-07, |
|
"loss": 0.4949, |
|
"mean_token_accuracy": 0.839320321381092, |
|
"step": 2855 |
|
}, |
|
{ |
|
"epoch": 2.8170296542389566, |
|
"grad_norm": 0.19561193883419037, |
|
"learning_rate": 2.2412129836816287e-07, |
|
"loss": 0.4913, |
|
"mean_token_accuracy": 0.840375654399395, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 2.821951519625938, |
|
"grad_norm": 0.1935349404811859, |
|
"learning_rate": 2.1221279995777833e-07, |
|
"loss": 0.4859, |
|
"mean_token_accuracy": 0.8416187852621079, |
|
"step": 2865 |
|
}, |
|
{ |
|
"epoch": 2.8268733850129197, |
|
"grad_norm": 0.19886697828769684, |
|
"learning_rate": 2.0062597999009114e-07, |
|
"loss": 0.4821, |
|
"mean_token_accuracy": 0.8432388514280319, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 2.8317952503999013, |
|
"grad_norm": 0.19826510548591614, |
|
"learning_rate": 1.8936121926927508e-07, |
|
"loss": 0.49, |
|
"mean_token_accuracy": 0.8409401133656502, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 2.8367171157868833, |
|
"grad_norm": 0.21422724425792694, |
|
"learning_rate": 1.7841888801493178e-07, |
|
"loss": 0.4897, |
|
"mean_token_accuracy": 0.840906199812889, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 2.841638981173865, |
|
"grad_norm": 0.2021849900484085, |
|
"learning_rate": 1.677993458499272e-07, |
|
"loss": 0.4871, |
|
"mean_token_accuracy": 0.8416887044906616, |
|
"step": 2885 |
|
}, |
|
{ |
|
"epoch": 2.8465608465608465, |
|
"grad_norm": 0.19902034103870392, |
|
"learning_rate": 1.5750294178856872e-07, |
|
"loss": 0.4884, |
|
"mean_token_accuracy": 0.8414162322878838, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 2.851482711947828, |
|
"grad_norm": 0.19861221313476562, |
|
"learning_rate": 1.4753001422514125e-07, |
|
"loss": 0.4926, |
|
"mean_token_accuracy": 0.8401012614369392, |
|
"step": 2895 |
|
}, |
|
{ |
|
"epoch": 2.8564045773348097, |
|
"grad_norm": 0.19735361635684967, |
|
"learning_rate": 1.378808909227769e-07, |
|
"loss": 0.4849, |
|
"mean_token_accuracy": 0.8422791570425033, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 2.8613264427217917, |
|
"grad_norm": 0.20118270814418793, |
|
"learning_rate": 1.2855588900269057e-07, |
|
"loss": 0.4912, |
|
"mean_token_accuracy": 0.8406861796975136, |
|
"step": 2905 |
|
}, |
|
{ |
|
"epoch": 2.8662483081087733, |
|
"grad_norm": 0.19249391555786133, |
|
"learning_rate": 1.1955531493375137e-07, |
|
"loss": 0.4795, |
|
"mean_token_accuracy": 0.8438849881291389, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 2.871170173495755, |
|
"grad_norm": 0.19686251878738403, |
|
"learning_rate": 1.1087946452241871e-07, |
|
"loss": 0.4937, |
|
"mean_token_accuracy": 0.8399393901228904, |
|
"step": 2915 |
|
}, |
|
{ |
|
"epoch": 2.8760920388827365, |
|
"grad_norm": 0.1956326812505722, |
|
"learning_rate": 1.0252862290301092e-07, |
|
"loss": 0.4887, |
|
"mean_token_accuracy": 0.841577798128128, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 2.881013904269718, |
|
"grad_norm": 0.2053905874490738, |
|
"learning_rate": 9.45030645283418e-08, |
|
"loss": 0.4897, |
|
"mean_token_accuracy": 0.8410707041621208, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 2.8859357696567, |
|
"grad_norm": 0.19495834410190582, |
|
"learning_rate": 8.68030531606967e-08, |
|
"loss": 0.4927, |
|
"mean_token_accuracy": 0.8402184978127479, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 2.8908576350436817, |
|
"grad_norm": 0.1992396116256714, |
|
"learning_rate": 7.94288418631639e-08, |
|
"loss": 0.4857, |
|
"mean_token_accuracy": 0.842261828482151, |
|
"step": 2935 |
|
}, |
|
{ |
|
"epoch": 2.8957795004306632, |
|
"grad_norm": 0.20448440313339233, |
|
"learning_rate": 7.238067299131901e-08, |
|
"loss": 0.4907, |
|
"mean_token_accuracy": 0.841072927415371, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 2.900701365817645, |
|
"grad_norm": 0.19940471649169922, |
|
"learning_rate": 6.565877818526245e-08, |
|
"loss": 0.4886, |
|
"mean_token_accuracy": 0.8412072688341141, |
|
"step": 2945 |
|
}, |
|
{ |
|
"epoch": 2.9056232312046264, |
|
"grad_norm": 0.19256047904491425, |
|
"learning_rate": 5.926337836199891e-08, |
|
"loss": 0.4867, |
|
"mean_token_accuracy": 0.8416444838047028, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 2.9105450965916084, |
|
"grad_norm": 0.19797919690608978, |
|
"learning_rate": 5.319468370818537e-08, |
|
"loss": 0.4897, |
|
"mean_token_accuracy": 0.8410748258233071, |
|
"step": 2955 |
|
}, |
|
{ |
|
"epoch": 2.91546696197859, |
|
"grad_norm": 0.1998082846403122, |
|
"learning_rate": 4.7452893673216596e-08, |
|
"loss": 0.4845, |
|
"mean_token_accuracy": 0.8427498519420624, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 2.9203888273655716, |
|
"grad_norm": 0.19540701806545258, |
|
"learning_rate": 4.203819696267486e-08, |
|
"loss": 0.4907, |
|
"mean_token_accuracy": 0.8408638656139373, |
|
"step": 2965 |
|
}, |
|
{ |
|
"epoch": 2.925310692752553, |
|
"grad_norm": 0.19913552701473236, |
|
"learning_rate": 3.6950771532126004e-08, |
|
"loss": 0.4983, |
|
"mean_token_accuracy": 0.8385754480957985, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 2.9302325581395348, |
|
"grad_norm": 0.19257843494415283, |
|
"learning_rate": 3.2190784581270786e-08, |
|
"loss": 0.4878, |
|
"mean_token_accuracy": 0.841645573079586, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 2.935154423526517, |
|
"grad_norm": 0.19568364322185516, |
|
"learning_rate": 2.7758392548449253e-08, |
|
"loss": 0.4891, |
|
"mean_token_accuracy": 0.8412896126508713, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 2.9400762889134984, |
|
"grad_norm": 0.20067226886749268, |
|
"learning_rate": 2.3653741105499338e-08, |
|
"loss": 0.4836, |
|
"mean_token_accuracy": 0.8427690804004669, |
|
"step": 2985 |
|
}, |
|
{ |
|
"epoch": 2.94499815430048, |
|
"grad_norm": 0.19799287617206573, |
|
"learning_rate": 1.9876965152975102e-08, |
|
"loss": 0.4895, |
|
"mean_token_accuracy": 0.8405489608645439, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 2.9499200196874615, |
|
"grad_norm": 1.0325350761413574, |
|
"learning_rate": 1.6428188815703627e-08, |
|
"loss": 0.4896, |
|
"mean_token_accuracy": 0.8411920800805092, |
|
"step": 2995 |
|
}, |
|
{ |
|
"epoch": 2.954841885074443, |
|
"grad_norm": 0.1966339498758316, |
|
"learning_rate": 1.3307525438711611e-08, |
|
"loss": 0.488, |
|
"mean_token_accuracy": 0.841396550834179, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 2.959763750461425, |
|
"grad_norm": 0.2234841138124466, |
|
"learning_rate": 1.0515077583498346e-08, |
|
"loss": 0.4911, |
|
"mean_token_accuracy": 0.8406392633914948, |
|
"step": 3005 |
|
}, |
|
{ |
|
"epoch": 2.9646856158484063, |
|
"grad_norm": 0.27488455176353455, |
|
"learning_rate": 8.050937024666195e-09, |
|
"loss": 0.4942, |
|
"mean_token_accuracy": 0.8396434351801872, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 2.9696074812353883, |
|
"grad_norm": 0.1911349892616272, |
|
"learning_rate": 5.9151847469041125e-09, |
|
"loss": 0.4823, |
|
"mean_token_accuracy": 0.8430395260453224, |
|
"step": 3015 |
|
}, |
|
{ |
|
"epoch": 2.97452934662237, |
|
"grad_norm": 0.19882096350193024, |
|
"learning_rate": 4.1078909423253325e-09, |
|
"loss": 0.4995, |
|
"mean_token_accuracy": 0.8379872292280197, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 2.9794512120093515, |
|
"grad_norm": 0.20069076120853424, |
|
"learning_rate": 2.629115008160321e-09, |
|
"loss": 0.4964, |
|
"mean_token_accuracy": 0.8388297706842422, |
|
"step": 3025 |
|
}, |
|
{ |
|
"epoch": 2.984373077396333, |
|
"grad_norm": 0.19437766075134277, |
|
"learning_rate": 1.4789055448061195e-09, |
|
"loss": 0.4851, |
|
"mean_token_accuracy": 0.8421405225992202, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 2.9892949427833146, |
|
"grad_norm": 0.19950829446315765, |
|
"learning_rate": 6.573003542276191e-10, |
|
"loss": 0.4889, |
|
"mean_token_accuracy": 0.8408236041665077, |
|
"step": 3035 |
|
}, |
|
{ |
|
"epoch": 2.9942168081702967, |
|
"grad_norm": 0.19173409044742584, |
|
"learning_rate": 1.6432643871633346e-10, |
|
"loss": 0.4873, |
|
"mean_token_accuracy": 0.8419449985027313, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 2.9991386735572783, |
|
"grad_norm": 0.1980327069759369, |
|
"learning_rate": 0.0, |
|
"loss": 0.4895, |
|
"mean_token_accuracy": 0.8409327268600464, |
|
"step": 3045 |
|
}, |
|
{ |
|
"epoch": 2.9991386735572783, |
|
"step": 3045, |
|
"total_flos": 2550348896010240.0, |
|
"train_loss": 0.5881131024979214, |
|
"train_runtime": 268544.791, |
|
"train_samples_per_second": 1.452, |
|
"train_steps_per_second": 0.011 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 3045, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2550348896010240.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|