|
{ |
|
"best_global_step": 2080, |
|
"best_metric": 0.12297015637159348, |
|
"best_model_checkpoint": "./llama3-ft-deepspeed/checkpoint-2080", |
|
"epoch": 5.0, |
|
"eval_steps": 500, |
|
"global_step": 2080, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.024052916416115455, |
|
"grad_norm": 1.146120548248291, |
|
"learning_rate": 0.0001995673076923077, |
|
"loss": 1.6924, |
|
"mean_token_accuracy": 0.6730133682489395, |
|
"num_tokens": 10240.0, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.04810583283223091, |
|
"grad_norm": 1.4788161516189575, |
|
"learning_rate": 0.00019908653846153847, |
|
"loss": 0.7696, |
|
"mean_token_accuracy": 0.835153691470623, |
|
"num_tokens": 20480.0, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.07215874924834637, |
|
"grad_norm": 1.395352840423584, |
|
"learning_rate": 0.00019860576923076922, |
|
"loss": 0.4512, |
|
"mean_token_accuracy": 0.8917673289775848, |
|
"num_tokens": 30720.0, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.09621166566446182, |
|
"grad_norm": 1.7515279054641724, |
|
"learning_rate": 0.000198125, |
|
"loss": 0.3253, |
|
"mean_token_accuracy": 0.9200380504131317, |
|
"num_tokens": 40960.0, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.12026458208057728, |
|
"grad_norm": 0.7726802229881287, |
|
"learning_rate": 0.00019764423076923079, |
|
"loss": 0.2675, |
|
"mean_token_accuracy": 0.9355374664068222, |
|
"num_tokens": 51200.0, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.14431749849669273, |
|
"grad_norm": 0.6123030781745911, |
|
"learning_rate": 0.00019716346153846154, |
|
"loss": 0.2365, |
|
"mean_token_accuracy": 0.9364401072263717, |
|
"num_tokens": 61440.0, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.1683704149128082, |
|
"grad_norm": 0.6146537065505981, |
|
"learning_rate": 0.00019668269230769232, |
|
"loss": 0.2189, |
|
"mean_token_accuracy": 0.9391294255852699, |
|
"num_tokens": 71680.0, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.19242333132892364, |
|
"grad_norm": 0.5948600769042969, |
|
"learning_rate": 0.0001962019230769231, |
|
"loss": 0.2081, |
|
"mean_token_accuracy": 0.9405477479100227, |
|
"num_tokens": 81920.0, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.2164762477450391, |
|
"grad_norm": 0.5457659363746643, |
|
"learning_rate": 0.00019572115384615385, |
|
"loss": 0.2019, |
|
"mean_token_accuracy": 0.9453761458396912, |
|
"num_tokens": 92160.0, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.24052916416115455, |
|
"grad_norm": 0.49840784072875977, |
|
"learning_rate": 0.00019524038461538463, |
|
"loss": 0.2039, |
|
"mean_token_accuracy": 0.9441700100898742, |
|
"num_tokens": 102400.0, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.26458208057727, |
|
"grad_norm": 0.46384331583976746, |
|
"learning_rate": 0.00019475961538461541, |
|
"loss": 0.2028, |
|
"mean_token_accuracy": 0.9462413385510444, |
|
"num_tokens": 112640.0, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.28863499699338546, |
|
"grad_norm": 0.44089949131011963, |
|
"learning_rate": 0.00019427884615384617, |
|
"loss": 0.1871, |
|
"mean_token_accuracy": 0.9464857250452041, |
|
"num_tokens": 122880.0, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.3126879134095009, |
|
"grad_norm": 0.3707946240901947, |
|
"learning_rate": 0.00019379807692307695, |
|
"loss": 0.177, |
|
"mean_token_accuracy": 0.9484001085162163, |
|
"num_tokens": 133120.0, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.3367408298256164, |
|
"grad_norm": 0.3341757655143738, |
|
"learning_rate": 0.0001933173076923077, |
|
"loss": 0.1847, |
|
"mean_token_accuracy": 0.9493804201483727, |
|
"num_tokens": 143360.0, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.3607937462417318, |
|
"grad_norm": 0.4739825129508972, |
|
"learning_rate": 0.00019283653846153845, |
|
"loss": 0.1631, |
|
"mean_token_accuracy": 0.9556754723191261, |
|
"num_tokens": 153600.0, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.3848466626578473, |
|
"grad_norm": 0.3651292622089386, |
|
"learning_rate": 0.00019235576923076923, |
|
"loss": 0.1597, |
|
"mean_token_accuracy": 0.9511243969202041, |
|
"num_tokens": 163840.0, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.40889957907396274, |
|
"grad_norm": 0.3972345292568207, |
|
"learning_rate": 0.00019187500000000002, |
|
"loss": 0.19, |
|
"mean_token_accuracy": 0.9461832985281944, |
|
"num_tokens": 174080.0, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.4329524954900782, |
|
"grad_norm": 0.281608521938324, |
|
"learning_rate": 0.00019139423076923077, |
|
"loss": 0.1693, |
|
"mean_token_accuracy": 0.9498817101120949, |
|
"num_tokens": 184320.0, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.45700541190619365, |
|
"grad_norm": 0.3879983723163605, |
|
"learning_rate": 0.00019091346153846155, |
|
"loss": 0.1549, |
|
"mean_token_accuracy": 0.9546341434121132, |
|
"num_tokens": 194560.0, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.4810583283223091, |
|
"grad_norm": 0.31266576051712036, |
|
"learning_rate": 0.00019043269230769233, |
|
"loss": 0.1689, |
|
"mean_token_accuracy": 0.95065086632967, |
|
"num_tokens": 204800.0, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5051112447384245, |
|
"grad_norm": 0.41939643025398254, |
|
"learning_rate": 0.00018995192307692308, |
|
"loss": 0.1798, |
|
"mean_token_accuracy": 0.9457689806818962, |
|
"num_tokens": 215040.0, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.52916416115454, |
|
"grad_norm": 0.45524027943611145, |
|
"learning_rate": 0.00018947115384615386, |
|
"loss": 0.1708, |
|
"mean_token_accuracy": 0.9461533144116402, |
|
"num_tokens": 225280.0, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.5532170775706554, |
|
"grad_norm": 0.3720427453517914, |
|
"learning_rate": 0.00018899038461538462, |
|
"loss": 0.162, |
|
"mean_token_accuracy": 0.9516115784645081, |
|
"num_tokens": 235520.0, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.5772699939867709, |
|
"grad_norm": 0.3217442035675049, |
|
"learning_rate": 0.0001885096153846154, |
|
"loss": 0.163, |
|
"mean_token_accuracy": 0.9550325214862824, |
|
"num_tokens": 245760.0, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.6013229104028863, |
|
"grad_norm": 0.38512468338012695, |
|
"learning_rate": 0.00018802884615384618, |
|
"loss": 0.1607, |
|
"mean_token_accuracy": 0.9519079640507698, |
|
"num_tokens": 256000.0, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.6253758268190018, |
|
"grad_norm": 0.31379130482673645, |
|
"learning_rate": 0.00018754807692307693, |
|
"loss": 0.1379, |
|
"mean_token_accuracy": 0.9568226426839829, |
|
"num_tokens": 266240.0, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.6494287432351172, |
|
"grad_norm": 0.3925431966781616, |
|
"learning_rate": 0.00018706730769230768, |
|
"loss": 0.1527, |
|
"mean_token_accuracy": 0.9536312386393547, |
|
"num_tokens": 276480.0, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.6734816596512327, |
|
"grad_norm": 0.3034338355064392, |
|
"learning_rate": 0.00018658653846153847, |
|
"loss": 0.1648, |
|
"mean_token_accuracy": 0.9480760931968689, |
|
"num_tokens": 286720.0, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.6975345760673481, |
|
"grad_norm": 0.36241453886032104, |
|
"learning_rate": 0.00018610576923076925, |
|
"loss": 0.1757, |
|
"mean_token_accuracy": 0.9463522598147392, |
|
"num_tokens": 296960.0, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.7215874924834637, |
|
"grad_norm": 0.3405214846134186, |
|
"learning_rate": 0.000185625, |
|
"loss": 0.1631, |
|
"mean_token_accuracy": 0.949561494588852, |
|
"num_tokens": 307200.0, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.745640408899579, |
|
"grad_norm": 0.2847406268119812, |
|
"learning_rate": 0.00018514423076923078, |
|
"loss": 0.1533, |
|
"mean_token_accuracy": 0.9555268749594689, |
|
"num_tokens": 317440.0, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.7696933253156946, |
|
"grad_norm": 0.26305094361305237, |
|
"learning_rate": 0.00018466346153846153, |
|
"loss": 0.1611, |
|
"mean_token_accuracy": 0.9532571867108345, |
|
"num_tokens": 327680.0, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.79374624173181, |
|
"grad_norm": 0.2989274263381958, |
|
"learning_rate": 0.0001841826923076923, |
|
"loss": 0.1441, |
|
"mean_token_accuracy": 0.9533735632896423, |
|
"num_tokens": 337920.0, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.8177991581479255, |
|
"grad_norm": 0.254688560962677, |
|
"learning_rate": 0.0001837019230769231, |
|
"loss": 0.1495, |
|
"mean_token_accuracy": 0.9542877942323684, |
|
"num_tokens": 348160.0, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.8418520745640409, |
|
"grad_norm": 0.26552721858024597, |
|
"learning_rate": 0.00018322115384615385, |
|
"loss": 0.149, |
|
"mean_token_accuracy": 0.9527715161442757, |
|
"num_tokens": 358400.0, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.8659049909801564, |
|
"grad_norm": 0.2920278012752533, |
|
"learning_rate": 0.00018274038461538463, |
|
"loss": 0.1506, |
|
"mean_token_accuracy": 0.9556835174560547, |
|
"num_tokens": 368640.0, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.8899579073962718, |
|
"grad_norm": 0.34060242772102356, |
|
"learning_rate": 0.0001822596153846154, |
|
"loss": 0.1332, |
|
"mean_token_accuracy": 0.959219790995121, |
|
"num_tokens": 378880.0, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.9140108238123873, |
|
"grad_norm": 0.3549470603466034, |
|
"learning_rate": 0.00018177884615384616, |
|
"loss": 0.1553, |
|
"mean_token_accuracy": 0.9529827669262886, |
|
"num_tokens": 389120.0, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.9380637402285027, |
|
"grad_norm": 0.26259645819664, |
|
"learning_rate": 0.00018129807692307694, |
|
"loss": 0.1355, |
|
"mean_token_accuracy": 0.9546903222799301, |
|
"num_tokens": 399360.0, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.9621166566446182, |
|
"grad_norm": 0.28768494725227356, |
|
"learning_rate": 0.0001808173076923077, |
|
"loss": 0.1314, |
|
"mean_token_accuracy": 0.9578663051128388, |
|
"num_tokens": 409600.0, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.9861695730607336, |
|
"grad_norm": 0.23446713387966156, |
|
"learning_rate": 0.00018033653846153848, |
|
"loss": 0.1433, |
|
"mean_token_accuracy": 0.9555784195661545, |
|
"num_tokens": 419840.0, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.14436590671539307, |
|
"eval_mean_token_accuracy": 0.9552223542903332, |
|
"eval_num_tokens": 425728.0, |
|
"eval_runtime": 25.1182, |
|
"eval_samples_per_second": 14.73, |
|
"eval_steps_per_second": 1.871, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 1.009621166566446, |
|
"grad_norm": 0.2881328761577606, |
|
"learning_rate": 0.00017985576923076923, |
|
"loss": 0.1424, |
|
"mean_token_accuracy": 0.9537300452207907, |
|
"num_tokens": 429824.0, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.0336740829825617, |
|
"grad_norm": 1.1323237419128418, |
|
"learning_rate": 0.000179375, |
|
"loss": 0.117, |
|
"mean_token_accuracy": 0.960891704261303, |
|
"num_tokens": 440064.0, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.057726999398677, |
|
"grad_norm": 0.3971687853336334, |
|
"learning_rate": 0.00017889423076923076, |
|
"loss": 0.1233, |
|
"mean_token_accuracy": 0.9594320252537727, |
|
"num_tokens": 450304.0, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.0817799158147925, |
|
"grad_norm": 0.235861137509346, |
|
"learning_rate": 0.00017841346153846154, |
|
"loss": 0.1322, |
|
"mean_token_accuracy": 0.9574261009693146, |
|
"num_tokens": 460544.0, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.1058328322309081, |
|
"grad_norm": 0.2572536766529083, |
|
"learning_rate": 0.00017793269230769232, |
|
"loss": 0.1288, |
|
"mean_token_accuracy": 0.9600686863064766, |
|
"num_tokens": 470784.0, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.1298857486470235, |
|
"grad_norm": 0.33651918172836304, |
|
"learning_rate": 0.00017745192307692308, |
|
"loss": 0.1367, |
|
"mean_token_accuracy": 0.9558201640844345, |
|
"num_tokens": 481024.0, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.153938665063139, |
|
"grad_norm": 0.2679121196269989, |
|
"learning_rate": 0.00017697115384615386, |
|
"loss": 0.1218, |
|
"mean_token_accuracy": 0.9592645660042762, |
|
"num_tokens": 491264.0, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.1779915814792543, |
|
"grad_norm": 0.24321414530277252, |
|
"learning_rate": 0.00017649038461538464, |
|
"loss": 0.1338, |
|
"mean_token_accuracy": 0.958324646949768, |
|
"num_tokens": 501504.0, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.2020444978953697, |
|
"grad_norm": 0.29272493720054626, |
|
"learning_rate": 0.0001760096153846154, |
|
"loss": 0.1252, |
|
"mean_token_accuracy": 0.9605333045125007, |
|
"num_tokens": 511744.0, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.2260974143114853, |
|
"grad_norm": 0.24240529537200928, |
|
"learning_rate": 0.00017552884615384617, |
|
"loss": 0.1208, |
|
"mean_token_accuracy": 0.960656826198101, |
|
"num_tokens": 521984.0, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.2501503307276007, |
|
"grad_norm": 0.3100011944770813, |
|
"learning_rate": 0.00017504807692307695, |
|
"loss": 0.1254, |
|
"mean_token_accuracy": 0.9609130263328552, |
|
"num_tokens": 532224.0, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.2742032471437161, |
|
"grad_norm": 0.28045424818992615, |
|
"learning_rate": 0.00017456730769230768, |
|
"loss": 0.1304, |
|
"mean_token_accuracy": 0.9579013884067535, |
|
"num_tokens": 542464.0, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.2982561635598318, |
|
"grad_norm": 0.3832496702671051, |
|
"learning_rate": 0.00017408653846153846, |
|
"loss": 0.1427, |
|
"mean_token_accuracy": 0.9550067007541656, |
|
"num_tokens": 552704.0, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.3223090799759472, |
|
"grad_norm": 0.32106658816337585, |
|
"learning_rate": 0.00017360576923076924, |
|
"loss": 0.129, |
|
"mean_token_accuracy": 0.956248240172863, |
|
"num_tokens": 562944.0, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.3463619963920626, |
|
"grad_norm": 0.3141852915287018, |
|
"learning_rate": 0.000173125, |
|
"loss": 0.1145, |
|
"mean_token_accuracy": 0.9619206488132477, |
|
"num_tokens": 573184.0, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.370414912808178, |
|
"grad_norm": 0.31353870034217834, |
|
"learning_rate": 0.00017264423076923077, |
|
"loss": 0.126, |
|
"mean_token_accuracy": 0.9610635504126549, |
|
"num_tokens": 583424.0, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.3944678292242934, |
|
"grad_norm": 0.24016191065311432, |
|
"learning_rate": 0.00017216346153846155, |
|
"loss": 0.1318, |
|
"mean_token_accuracy": 0.9551108077168464, |
|
"num_tokens": 593664.0, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.418520745640409, |
|
"grad_norm": 0.2362779825925827, |
|
"learning_rate": 0.0001716826923076923, |
|
"loss": 0.1163, |
|
"mean_token_accuracy": 0.9617116883397102, |
|
"num_tokens": 603904.0, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.4425736620565244, |
|
"grad_norm": 0.2012677937746048, |
|
"learning_rate": 0.0001712019230769231, |
|
"loss": 0.1378, |
|
"mean_token_accuracy": 0.9564808994531632, |
|
"num_tokens": 614144.0, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.4666265784726398, |
|
"grad_norm": 0.33629047870635986, |
|
"learning_rate": 0.00017072115384615387, |
|
"loss": 0.1215, |
|
"mean_token_accuracy": 0.9621520712971687, |
|
"num_tokens": 624384.0, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.4906794948887554, |
|
"grad_norm": 0.30020079016685486, |
|
"learning_rate": 0.00017024038461538462, |
|
"loss": 0.1318, |
|
"mean_token_accuracy": 0.9570673331618309, |
|
"num_tokens": 634624.0, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.5147324113048706, |
|
"grad_norm": 0.2715687155723572, |
|
"learning_rate": 0.0001697596153846154, |
|
"loss": 0.1248, |
|
"mean_token_accuracy": 0.9611581727862358, |
|
"num_tokens": 644864.0, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.5387853277209862, |
|
"grad_norm": 0.2919619381427765, |
|
"learning_rate": 0.00016927884615384618, |
|
"loss": 0.116, |
|
"mean_token_accuracy": 0.9624506369233131, |
|
"num_tokens": 655104.0, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.5628382441371016, |
|
"grad_norm": 0.26573172211647034, |
|
"learning_rate": 0.00016879807692307694, |
|
"loss": 0.1253, |
|
"mean_token_accuracy": 0.9609674572944641, |
|
"num_tokens": 665344.0, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.586891160553217, |
|
"grad_norm": 0.3533133566379547, |
|
"learning_rate": 0.0001683173076923077, |
|
"loss": 0.129, |
|
"mean_token_accuracy": 0.9564981684088707, |
|
"num_tokens": 675584.0, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.6109440769693326, |
|
"grad_norm": 0.23616884648799896, |
|
"learning_rate": 0.00016783653846153847, |
|
"loss": 0.1198, |
|
"mean_token_accuracy": 0.9604200229048729, |
|
"num_tokens": 685824.0, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.634996993385448, |
|
"grad_norm": 0.29362770915031433, |
|
"learning_rate": 0.00016735576923076922, |
|
"loss": 0.1248, |
|
"mean_token_accuracy": 0.9565964505076409, |
|
"num_tokens": 696064.0, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.6590499098015634, |
|
"grad_norm": 0.32813316583633423, |
|
"learning_rate": 0.000166875, |
|
"loss": 0.1404, |
|
"mean_token_accuracy": 0.9544960707426071, |
|
"num_tokens": 706304.0, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.683102826217679, |
|
"grad_norm": 0.23794318735599518, |
|
"learning_rate": 0.00016639423076923078, |
|
"loss": 0.1239, |
|
"mean_token_accuracy": 0.9590726107358932, |
|
"num_tokens": 716544.0, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.7071557426337942, |
|
"grad_norm": 0.3290494680404663, |
|
"learning_rate": 0.00016591346153846154, |
|
"loss": 0.1136, |
|
"mean_token_accuracy": 0.9635635167360306, |
|
"num_tokens": 726784.0, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.7312086590499098, |
|
"grad_norm": 0.2637302577495575, |
|
"learning_rate": 0.00016543269230769232, |
|
"loss": 0.1211, |
|
"mean_token_accuracy": 0.957936991751194, |
|
"num_tokens": 737024.0, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.7552615754660252, |
|
"grad_norm": 0.2866067588329315, |
|
"learning_rate": 0.0001649519230769231, |
|
"loss": 0.1188, |
|
"mean_token_accuracy": 0.9595590904355049, |
|
"num_tokens": 747264.0, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.7793144918821406, |
|
"grad_norm": 0.31299787759780884, |
|
"learning_rate": 0.00016447115384615385, |
|
"loss": 0.1303, |
|
"mean_token_accuracy": 0.9595273941755295, |
|
"num_tokens": 757504.0, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.8033674082982563, |
|
"grad_norm": 0.25704699754714966, |
|
"learning_rate": 0.00016399038461538463, |
|
"loss": 0.12, |
|
"mean_token_accuracy": 0.9622810766100883, |
|
"num_tokens": 767744.0, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.8274203247143717, |
|
"grad_norm": 0.20122775435447693, |
|
"learning_rate": 0.0001635096153846154, |
|
"loss": 0.1153, |
|
"mean_token_accuracy": 0.961222605407238, |
|
"num_tokens": 777984.0, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.851473241130487, |
|
"grad_norm": 0.21299275755882263, |
|
"learning_rate": 0.00016302884615384617, |
|
"loss": 0.114, |
|
"mean_token_accuracy": 0.9608678832650185, |
|
"num_tokens": 788224.0, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.8755261575466027, |
|
"grad_norm": 0.25124669075012207, |
|
"learning_rate": 0.00016254807692307695, |
|
"loss": 0.122, |
|
"mean_token_accuracy": 0.962291096150875, |
|
"num_tokens": 798464.0, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.8995790739627179, |
|
"grad_norm": 0.2945667803287506, |
|
"learning_rate": 0.0001620673076923077, |
|
"loss": 0.1183, |
|
"mean_token_accuracy": 0.9604008629918098, |
|
"num_tokens": 808704.0, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.9236319903788335, |
|
"grad_norm": 0.2346087247133255, |
|
"learning_rate": 0.00016158653846153845, |
|
"loss": 0.1136, |
|
"mean_token_accuracy": 0.963145537674427, |
|
"num_tokens": 818944.0, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.9476849067949489, |
|
"grad_norm": 0.22119984030723572, |
|
"learning_rate": 0.00016110576923076923, |
|
"loss": 0.1235, |
|
"mean_token_accuracy": 0.9591810956597329, |
|
"num_tokens": 829184.0, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.9717378232110643, |
|
"grad_norm": 0.2481156885623932, |
|
"learning_rate": 0.00016062500000000001, |
|
"loss": 0.1208, |
|
"mean_token_accuracy": 0.9615909501910209, |
|
"num_tokens": 839424.0, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.99579073962718, |
|
"grad_norm": 0.24333243072032928, |
|
"learning_rate": 0.00016014423076923077, |
|
"loss": 0.121, |
|
"mean_token_accuracy": 0.9590676620602607, |
|
"num_tokens": 849664.0, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.1264333575963974, |
|
"eval_mean_token_accuracy": 0.9596163808031285, |
|
"eval_num_tokens": 851456.0, |
|
"eval_runtime": 24.9078, |
|
"eval_samples_per_second": 14.855, |
|
"eval_steps_per_second": 1.887, |
|
"step": 832 |
|
}, |
|
{ |
|
"epoch": 2.019242333132892, |
|
"grad_norm": 0.25562378764152527, |
|
"learning_rate": 0.00015966346153846155, |
|
"loss": 0.1082, |
|
"mean_token_accuracy": 0.9661401877036462, |
|
"num_tokens": 859648.0, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.0432952495490078, |
|
"grad_norm": 0.2851448059082031, |
|
"learning_rate": 0.00015918269230769233, |
|
"loss": 0.0988, |
|
"mean_token_accuracy": 0.9670166626572609, |
|
"num_tokens": 869888.0, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.0673481659651234, |
|
"grad_norm": 0.3552910387516022, |
|
"learning_rate": 0.00015870192307692308, |
|
"loss": 0.1058, |
|
"mean_token_accuracy": 0.961455948650837, |
|
"num_tokens": 880128.0, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 2.0914010823812386, |
|
"grad_norm": 0.18810558319091797, |
|
"learning_rate": 0.00015822115384615386, |
|
"loss": 0.0962, |
|
"mean_token_accuracy": 0.965205217897892, |
|
"num_tokens": 890368.0, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 2.115453998797354, |
|
"grad_norm": 0.24357327818870544, |
|
"learning_rate": 0.00015774038461538462, |
|
"loss": 0.1121, |
|
"mean_token_accuracy": 0.9618488609790802, |
|
"num_tokens": 900608.0, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 2.13950691521347, |
|
"grad_norm": 0.24040566384792328, |
|
"learning_rate": 0.0001572596153846154, |
|
"loss": 0.1064, |
|
"mean_token_accuracy": 0.9660862430930137, |
|
"num_tokens": 910848.0, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 2.163559831629585, |
|
"grad_norm": 0.24545449018478394, |
|
"learning_rate": 0.00015677884615384618, |
|
"loss": 0.0965, |
|
"mean_token_accuracy": 0.9677914634346962, |
|
"num_tokens": 921088.0, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.1876127480457006, |
|
"grad_norm": 0.30429190397262573, |
|
"learning_rate": 0.00015629807692307693, |
|
"loss": 0.105, |
|
"mean_token_accuracy": 0.9644672557711601, |
|
"num_tokens": 931328.0, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 2.2116656644618162, |
|
"grad_norm": 0.2934926450252533, |
|
"learning_rate": 0.00015581730769230768, |
|
"loss": 0.0968, |
|
"mean_token_accuracy": 0.965617573261261, |
|
"num_tokens": 941568.0, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 2.2357185808779314, |
|
"grad_norm": 0.3533467948436737, |
|
"learning_rate": 0.00015533653846153846, |
|
"loss": 0.1073, |
|
"mean_token_accuracy": 0.9617443799972534, |
|
"num_tokens": 951808.0, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 2.259771497294047, |
|
"grad_norm": 0.15458804368972778, |
|
"learning_rate": 0.00015485576923076924, |
|
"loss": 0.1069, |
|
"mean_token_accuracy": 0.9629932105541229, |
|
"num_tokens": 962048.0, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 2.283824413710162, |
|
"grad_norm": 0.6568232774734497, |
|
"learning_rate": 0.000154375, |
|
"loss": 0.1077, |
|
"mean_token_accuracy": 0.9618542537093162, |
|
"num_tokens": 972288.0, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 2.307877330126278, |
|
"grad_norm": 0.3730143904685974, |
|
"learning_rate": 0.00015389423076923078, |
|
"loss": 0.1139, |
|
"mean_token_accuracy": 0.9590795397758484, |
|
"num_tokens": 982528.0, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 2.3319302465423934, |
|
"grad_norm": 0.26199349761009216, |
|
"learning_rate": 0.00015341346153846153, |
|
"loss": 0.115, |
|
"mean_token_accuracy": 0.9604379132390022, |
|
"num_tokens": 992768.0, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 2.3559831629585086, |
|
"grad_norm": 0.3791094720363617, |
|
"learning_rate": 0.0001529326923076923, |
|
"loss": 0.1066, |
|
"mean_token_accuracy": 0.9632134348154068, |
|
"num_tokens": 1003008.0, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 2.3800360793746242, |
|
"grad_norm": 0.1697998046875, |
|
"learning_rate": 0.0001524519230769231, |
|
"loss": 0.1024, |
|
"mean_token_accuracy": 0.9637576416134834, |
|
"num_tokens": 1013248.0, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 2.4040889957907394, |
|
"grad_norm": 0.3326902389526367, |
|
"learning_rate": 0.00015197115384615385, |
|
"loss": 0.1066, |
|
"mean_token_accuracy": 0.9614829778671264, |
|
"num_tokens": 1023488.0, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.428141912206855, |
|
"grad_norm": 0.22747723758220673, |
|
"learning_rate": 0.00015149038461538463, |
|
"loss": 0.0924, |
|
"mean_token_accuracy": 0.9677571937441826, |
|
"num_tokens": 1033728.0, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 2.4521948286229707, |
|
"grad_norm": 0.25410860776901245, |
|
"learning_rate": 0.0001510096153846154, |
|
"loss": 0.1097, |
|
"mean_token_accuracy": 0.963656097650528, |
|
"num_tokens": 1043968.0, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 2.476247745039086, |
|
"grad_norm": 0.2268102765083313, |
|
"learning_rate": 0.00015052884615384616, |
|
"loss": 0.1018, |
|
"mean_token_accuracy": 0.9653624445199966, |
|
"num_tokens": 1054208.0, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 2.5003006614552015, |
|
"grad_norm": 0.23511724174022675, |
|
"learning_rate": 0.00015004807692307694, |
|
"loss": 0.0969, |
|
"mean_token_accuracy": 0.9658154919743538, |
|
"num_tokens": 1064448.0, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 2.5243535778713166, |
|
"grad_norm": 0.23023869097232819, |
|
"learning_rate": 0.0001495673076923077, |
|
"loss": 0.1053, |
|
"mean_token_accuracy": 0.96552574634552, |
|
"num_tokens": 1074688.0, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 2.5484064942874323, |
|
"grad_norm": 0.25081828236579895, |
|
"learning_rate": 0.00014908653846153845, |
|
"loss": 0.1044, |
|
"mean_token_accuracy": 0.9639346837997437, |
|
"num_tokens": 1084928.0, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 2.572459410703548, |
|
"grad_norm": 0.2557835280895233, |
|
"learning_rate": 0.00014860576923076923, |
|
"loss": 0.106, |
|
"mean_token_accuracy": 0.9623663991689682, |
|
"num_tokens": 1095168.0, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 2.5965123271196635, |
|
"grad_norm": 0.1407734751701355, |
|
"learning_rate": 0.000148125, |
|
"loss": 0.1092, |
|
"mean_token_accuracy": 0.9625474750995636, |
|
"num_tokens": 1105408.0, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 2.6205652435357787, |
|
"grad_norm": 0.2862281799316406, |
|
"learning_rate": 0.00014764423076923076, |
|
"loss": 0.1053, |
|
"mean_token_accuracy": 0.9632299616932869, |
|
"num_tokens": 1115648.0, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 2.6446181599518943, |
|
"grad_norm": 0.26334843039512634, |
|
"learning_rate": 0.00014716346153846154, |
|
"loss": 0.1209, |
|
"mean_token_accuracy": 0.9601381734013558, |
|
"num_tokens": 1125888.0, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.6686710763680095, |
|
"grad_norm": 0.20137883722782135, |
|
"learning_rate": 0.00014668269230769232, |
|
"loss": 0.1057, |
|
"mean_token_accuracy": 0.9626315057277679, |
|
"num_tokens": 1136128.0, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 2.692723992784125, |
|
"grad_norm": 0.3017769157886505, |
|
"learning_rate": 0.00014620192307692308, |
|
"loss": 0.104, |
|
"mean_token_accuracy": 0.9653211057186126, |
|
"num_tokens": 1146368.0, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 2.7167769092002407, |
|
"grad_norm": 0.24447724223136902, |
|
"learning_rate": 0.00014572115384615386, |
|
"loss": 0.111, |
|
"mean_token_accuracy": 0.9617329522967338, |
|
"num_tokens": 1156608.0, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 2.740829825616356, |
|
"grad_norm": 0.22615140676498413, |
|
"learning_rate": 0.00014524038461538464, |
|
"loss": 0.113, |
|
"mean_token_accuracy": 0.9617576941847801, |
|
"num_tokens": 1166848.0, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 2.7648827420324715, |
|
"grad_norm": 0.20445455610752106, |
|
"learning_rate": 0.0001447596153846154, |
|
"loss": 0.1064, |
|
"mean_token_accuracy": 0.965658301115036, |
|
"num_tokens": 1177088.0, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 2.7889356584485867, |
|
"grad_norm": 0.2821711003780365, |
|
"learning_rate": 0.00014427884615384617, |
|
"loss": 0.1088, |
|
"mean_token_accuracy": 0.9635483458638191, |
|
"num_tokens": 1187328.0, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 2.8129885748647023, |
|
"grad_norm": 0.21199429035186768, |
|
"learning_rate": 0.00014379807692307695, |
|
"loss": 0.1071, |
|
"mean_token_accuracy": 0.9626901999115944, |
|
"num_tokens": 1197568.0, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 2.837041491280818, |
|
"grad_norm": 0.23910683393478394, |
|
"learning_rate": 0.00014331730769230768, |
|
"loss": 0.1095, |
|
"mean_token_accuracy": 0.9642450660467148, |
|
"num_tokens": 1207808.0, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 2.861094407696933, |
|
"grad_norm": 0.22157999873161316, |
|
"learning_rate": 0.00014283653846153846, |
|
"loss": 0.0968, |
|
"mean_token_accuracy": 0.964881993830204, |
|
"num_tokens": 1218048.0, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 2.8851473241130488, |
|
"grad_norm": 0.25650596618652344, |
|
"learning_rate": 0.00014235576923076924, |
|
"loss": 0.1027, |
|
"mean_token_accuracy": 0.9612545475363732, |
|
"num_tokens": 1228288.0, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.909200240529164, |
|
"grad_norm": 0.32955703139305115, |
|
"learning_rate": 0.000141875, |
|
"loss": 0.1096, |
|
"mean_token_accuracy": 0.9617963835597039, |
|
"num_tokens": 1238528.0, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 2.9332531569452795, |
|
"grad_norm": 0.21780824661254883, |
|
"learning_rate": 0.00014139423076923077, |
|
"loss": 0.1033, |
|
"mean_token_accuracy": 0.9646632343530654, |
|
"num_tokens": 1248768.0, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 2.957306073361395, |
|
"grad_norm": 0.2339126020669937, |
|
"learning_rate": 0.00014091346153846155, |
|
"loss": 0.1031, |
|
"mean_token_accuracy": 0.964007930457592, |
|
"num_tokens": 1259008.0, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 2.981358989777511, |
|
"grad_norm": 0.31986188888549805, |
|
"learning_rate": 0.0001404326923076923, |
|
"loss": 0.1018, |
|
"mean_token_accuracy": 0.9668645352125168, |
|
"num_tokens": 1269248.0, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 0.12327124178409576, |
|
"eval_mean_token_accuracy": 0.9614017339462929, |
|
"eval_num_tokens": 1277184.0, |
|
"eval_runtime": 25.0119, |
|
"eval_samples_per_second": 14.793, |
|
"eval_steps_per_second": 1.879, |
|
"step": 1248 |
|
}, |
|
{ |
|
"epoch": 3.004810583283223, |
|
"grad_norm": 0.20351627469062805, |
|
"learning_rate": 0.0001399519230769231, |
|
"loss": 0.0923, |
|
"mean_token_accuracy": 0.9675439733725327, |
|
"num_tokens": 1279232.0, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 3.0288634996993387, |
|
"grad_norm": 0.3330595791339874, |
|
"learning_rate": 0.00013947115384615387, |
|
"loss": 0.0906, |
|
"mean_token_accuracy": 0.9674146190285683, |
|
"num_tokens": 1289472.0, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 3.052916416115454, |
|
"grad_norm": 0.2875508666038513, |
|
"learning_rate": 0.00013899038461538462, |
|
"loss": 0.0953, |
|
"mean_token_accuracy": 0.9661522597074509, |
|
"num_tokens": 1299712.0, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 3.0769693325315695, |
|
"grad_norm": 0.26365041732788086, |
|
"learning_rate": 0.0001385096153846154, |
|
"loss": 0.0898, |
|
"mean_token_accuracy": 0.9656254693865776, |
|
"num_tokens": 1309952.0, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 3.101022248947685, |
|
"grad_norm": 0.22576822340488434, |
|
"learning_rate": 0.00013802884615384618, |
|
"loss": 0.0911, |
|
"mean_token_accuracy": 0.9678579106926918, |
|
"num_tokens": 1320192.0, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 3.1250751653638003, |
|
"grad_norm": 0.2698352038860321, |
|
"learning_rate": 0.00013754807692307694, |
|
"loss": 0.098, |
|
"mean_token_accuracy": 0.9671777725219727, |
|
"num_tokens": 1330432.0, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 3.149128081779916, |
|
"grad_norm": 0.29429781436920166, |
|
"learning_rate": 0.0001370673076923077, |
|
"loss": 0.0883, |
|
"mean_token_accuracy": 0.9666375696659089, |
|
"num_tokens": 1340672.0, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 3.173180998196031, |
|
"grad_norm": 0.25614482164382935, |
|
"learning_rate": 0.00013658653846153847, |
|
"loss": 0.0915, |
|
"mean_token_accuracy": 0.967202040553093, |
|
"num_tokens": 1350912.0, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 3.1972339146121467, |
|
"grad_norm": 0.2897791862487793, |
|
"learning_rate": 0.00013610576923076922, |
|
"loss": 0.0956, |
|
"mean_token_accuracy": 0.965019428730011, |
|
"num_tokens": 1361152.0, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 3.2212868310282623, |
|
"grad_norm": 0.1813499927520752, |
|
"learning_rate": 0.000135625, |
|
"loss": 0.0941, |
|
"mean_token_accuracy": 0.9646437510848045, |
|
"num_tokens": 1371392.0, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 3.2453397474443775, |
|
"grad_norm": 0.19243107736110687, |
|
"learning_rate": 0.00013514423076923078, |
|
"loss": 0.091, |
|
"mean_token_accuracy": 0.9650159657001496, |
|
"num_tokens": 1381632.0, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 3.269392663860493, |
|
"grad_norm": 0.22238536179065704, |
|
"learning_rate": 0.00013466346153846154, |
|
"loss": 0.0924, |
|
"mean_token_accuracy": 0.9653570145368576, |
|
"num_tokens": 1391872.0, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 3.2934455802766087, |
|
"grad_norm": 0.15673308074474335, |
|
"learning_rate": 0.00013418269230769232, |
|
"loss": 0.0892, |
|
"mean_token_accuracy": 0.9664774596691131, |
|
"num_tokens": 1402112.0, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 3.317498496692724, |
|
"grad_norm": 0.2173730581998825, |
|
"learning_rate": 0.0001337019230769231, |
|
"loss": 0.0978, |
|
"mean_token_accuracy": 0.9646672755479813, |
|
"num_tokens": 1412352.0, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 3.3415514131088395, |
|
"grad_norm": 0.17452774941921234, |
|
"learning_rate": 0.00013322115384615385, |
|
"loss": 0.0988, |
|
"mean_token_accuracy": 0.9650528088212014, |
|
"num_tokens": 1422592.0, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 3.365604329524955, |
|
"grad_norm": 0.19794107973575592, |
|
"learning_rate": 0.00013274038461538463, |
|
"loss": 0.0906, |
|
"mean_token_accuracy": 0.9670823276042938, |
|
"num_tokens": 1432832.0, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 3.3896572459410703, |
|
"grad_norm": 0.17740049958229065, |
|
"learning_rate": 0.00013225961538461539, |
|
"loss": 0.0917, |
|
"mean_token_accuracy": 0.9663808658719063, |
|
"num_tokens": 1443072.0, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 3.413710162357186, |
|
"grad_norm": 0.24540650844573975, |
|
"learning_rate": 0.00013177884615384617, |
|
"loss": 0.0889, |
|
"mean_token_accuracy": 0.9661073669791221, |
|
"num_tokens": 1453312.0, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 3.437763078773301, |
|
"grad_norm": 0.2300598919391632, |
|
"learning_rate": 0.00013129807692307695, |
|
"loss": 0.0865, |
|
"mean_token_accuracy": 0.9688371613621711, |
|
"num_tokens": 1463552.0, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 3.4618159951894167, |
|
"grad_norm": 0.18420913815498352, |
|
"learning_rate": 0.0001308173076923077, |
|
"loss": 0.0963, |
|
"mean_token_accuracy": 0.9648661985993385, |
|
"num_tokens": 1473792.0, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 3.4858689116055324, |
|
"grad_norm": 0.27637794613838196, |
|
"learning_rate": 0.00013033653846153845, |
|
"loss": 0.0984, |
|
"mean_token_accuracy": 0.9647489741444588, |
|
"num_tokens": 1484032.0, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 3.5099218280216475, |
|
"grad_norm": 0.28515446186065674, |
|
"learning_rate": 0.00012985576923076923, |
|
"loss": 0.0915, |
|
"mean_token_accuracy": 0.9642152190208435, |
|
"num_tokens": 1494272.0, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 3.533974744437763, |
|
"grad_norm": 0.19887416064739227, |
|
"learning_rate": 0.00012937500000000001, |
|
"loss": 0.0963, |
|
"mean_token_accuracy": 0.9659793645143508, |
|
"num_tokens": 1504512.0, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 3.5580276608538783, |
|
"grad_norm": 0.2664344906806946, |
|
"learning_rate": 0.00012889423076923077, |
|
"loss": 0.0896, |
|
"mean_token_accuracy": 0.9680359989404679, |
|
"num_tokens": 1514752.0, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 3.582080577269994, |
|
"grad_norm": 0.23911890387535095, |
|
"learning_rate": 0.00012841346153846155, |
|
"loss": 0.0966, |
|
"mean_token_accuracy": 0.9650948256254196, |
|
"num_tokens": 1524992.0, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 3.6061334936861096, |
|
"grad_norm": 0.21762046217918396, |
|
"learning_rate": 0.0001279326923076923, |
|
"loss": 0.0952, |
|
"mean_token_accuracy": 0.9666063264012337, |
|
"num_tokens": 1535232.0, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 3.6301864101022248, |
|
"grad_norm": 0.263232558965683, |
|
"learning_rate": 0.00012745192307692308, |
|
"loss": 0.0993, |
|
"mean_token_accuracy": 0.9661560922861099, |
|
"num_tokens": 1545472.0, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 3.6542393265183404, |
|
"grad_norm": 0.20630541443824768, |
|
"learning_rate": 0.00012697115384615386, |
|
"loss": 0.0918, |
|
"mean_token_accuracy": 0.9665474951267242, |
|
"num_tokens": 1555712.0, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 3.6782922429344556, |
|
"grad_norm": 0.21548768877983093, |
|
"learning_rate": 0.00012649038461538462, |
|
"loss": 0.0942, |
|
"mean_token_accuracy": 0.9646090790629387, |
|
"num_tokens": 1565952.0, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 3.702345159350571, |
|
"grad_norm": 0.21956314146518707, |
|
"learning_rate": 0.0001260096153846154, |
|
"loss": 0.0963, |
|
"mean_token_accuracy": 0.9646272197365761, |
|
"num_tokens": 1576192.0, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 3.726398075766687, |
|
"grad_norm": 0.22929218411445618, |
|
"learning_rate": 0.00012552884615384618, |
|
"loss": 0.0946, |
|
"mean_token_accuracy": 0.9660244584083557, |
|
"num_tokens": 1586432.0, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 3.7504509921828024, |
|
"grad_norm": 0.20628125965595245, |
|
"learning_rate": 0.00012504807692307693, |
|
"loss": 0.0939, |
|
"mean_token_accuracy": 0.9671428605914116, |
|
"num_tokens": 1596672.0, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 3.7745039085989176, |
|
"grad_norm": 0.1761239767074585, |
|
"learning_rate": 0.00012456730769230768, |
|
"loss": 0.0985, |
|
"mean_token_accuracy": 0.9652422875165939, |
|
"num_tokens": 1606912.0, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 3.7985568250150332, |
|
"grad_norm": 0.20456120371818542, |
|
"learning_rate": 0.00012408653846153846, |
|
"loss": 0.0992, |
|
"mean_token_accuracy": 0.9635955929756165, |
|
"num_tokens": 1617152.0, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 3.8226097414311484, |
|
"grad_norm": 0.24852736294269562, |
|
"learning_rate": 0.00012360576923076922, |
|
"loss": 0.0969, |
|
"mean_token_accuracy": 0.9656632468104362, |
|
"num_tokens": 1627392.0, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 3.846662657847264, |
|
"grad_norm": 0.2334773987531662, |
|
"learning_rate": 0.000123125, |
|
"loss": 0.0945, |
|
"mean_token_accuracy": 0.9682855114340783, |
|
"num_tokens": 1637632.0, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 3.8707155742633796, |
|
"grad_norm": 0.2439630627632141, |
|
"learning_rate": 0.00012264423076923078, |
|
"loss": 0.0945, |
|
"mean_token_accuracy": 0.9671804904937744, |
|
"num_tokens": 1647872.0, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 3.894768490679495, |
|
"grad_norm": 0.21514557301998138, |
|
"learning_rate": 0.00012216346153846153, |
|
"loss": 0.0952, |
|
"mean_token_accuracy": 0.9666045814752579, |
|
"num_tokens": 1658112.0, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 3.9188214070956104, |
|
"grad_norm": 0.16191978752613068, |
|
"learning_rate": 0.00012168269230769231, |
|
"loss": 0.097, |
|
"mean_token_accuracy": 0.966627161204815, |
|
"num_tokens": 1668352.0, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 3.9428743235117256, |
|
"grad_norm": 0.17086118459701538, |
|
"learning_rate": 0.00012120192307692308, |
|
"loss": 0.0923, |
|
"mean_token_accuracy": 0.9672711491584778, |
|
"num_tokens": 1678592.0, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 3.9669272399278412, |
|
"grad_norm": 0.19103848934173584, |
|
"learning_rate": 0.00012072115384615386, |
|
"loss": 0.0921, |
|
"mean_token_accuracy": 0.9661801844835282, |
|
"num_tokens": 1688832.0, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 3.990980156343957, |
|
"grad_norm": 0.22176852822303772, |
|
"learning_rate": 0.00012024038461538463, |
|
"loss": 0.0917, |
|
"mean_token_accuracy": 0.9642520830035209, |
|
"num_tokens": 1699072.0, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 0.12371965497732162, |
|
"eval_mean_token_accuracy": 0.9598764219182603, |
|
"eval_num_tokens": 1702912.0, |
|
"eval_runtime": 25.0613, |
|
"eval_samples_per_second": 14.764, |
|
"eval_steps_per_second": 1.875, |
|
"step": 1664 |
|
}, |
|
{ |
|
"epoch": 4.0144317498496696, |
|
"grad_norm": 0.1616247445344925, |
|
"learning_rate": 0.0001197596153846154, |
|
"loss": 0.0846, |
|
"mean_token_accuracy": 0.9689040917616624, |
|
"num_tokens": 1709056.0, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 4.038484666265784, |
|
"grad_norm": 0.1974208503961563, |
|
"learning_rate": 0.00011927884615384617, |
|
"loss": 0.0902, |
|
"mean_token_accuracy": 0.9659950643777847, |
|
"num_tokens": 1719296.0, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 4.0625375826819, |
|
"grad_norm": 0.19191399216651917, |
|
"learning_rate": 0.00011879807692307694, |
|
"loss": 0.0876, |
|
"mean_token_accuracy": 0.9678742095828057, |
|
"num_tokens": 1729536.0, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 4.0865904990980155, |
|
"grad_norm": 0.17952245473861694, |
|
"learning_rate": 0.0001183173076923077, |
|
"loss": 0.0935, |
|
"mean_token_accuracy": 0.9667584493756294, |
|
"num_tokens": 1739776.0, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 4.110643415514131, |
|
"grad_norm": 0.2491922825574875, |
|
"learning_rate": 0.00011783653846153846, |
|
"loss": 0.0829, |
|
"mean_token_accuracy": 0.9714985772967338, |
|
"num_tokens": 1750016.0, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 4.134696331930247, |
|
"grad_norm": 0.23436853289604187, |
|
"learning_rate": 0.00011735576923076923, |
|
"loss": 0.0884, |
|
"mean_token_accuracy": 0.9674542516469955, |
|
"num_tokens": 1760256.0, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 4.158749248346362, |
|
"grad_norm": 0.21396978199481964, |
|
"learning_rate": 0.000116875, |
|
"loss": 0.0871, |
|
"mean_token_accuracy": 0.9674944341182709, |
|
"num_tokens": 1770496.0, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 4.182802164762477, |
|
"grad_norm": 0.15618039667606354, |
|
"learning_rate": 0.00011639423076923078, |
|
"loss": 0.0762, |
|
"mean_token_accuracy": 0.97198745906353, |
|
"num_tokens": 1780736.0, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 4.206855081178593, |
|
"grad_norm": 0.16134659945964813, |
|
"learning_rate": 0.00011591346153846154, |
|
"loss": 0.093, |
|
"mean_token_accuracy": 0.9636217251420021, |
|
"num_tokens": 1790976.0, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 4.230907997594708, |
|
"grad_norm": 0.12519049644470215, |
|
"learning_rate": 0.00011543269230769231, |
|
"loss": 0.0817, |
|
"mean_token_accuracy": 0.9709686204791069, |
|
"num_tokens": 1801216.0, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 4.254960914010824, |
|
"grad_norm": 0.17119652032852173, |
|
"learning_rate": 0.00011495192307692309, |
|
"loss": 0.0821, |
|
"mean_token_accuracy": 0.9688223898410797, |
|
"num_tokens": 1811456.0, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 4.27901383042694, |
|
"grad_norm": 0.13762086629867554, |
|
"learning_rate": 0.00011447115384615386, |
|
"loss": 0.0877, |
|
"mean_token_accuracy": 0.9673933520913124, |
|
"num_tokens": 1821696.0, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 4.303066746843054, |
|
"grad_norm": 0.22409266233444214, |
|
"learning_rate": 0.00011399038461538462, |
|
"loss": 0.0907, |
|
"mean_token_accuracy": 0.9678563743829727, |
|
"num_tokens": 1831936.0, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 4.32711966325917, |
|
"grad_norm": 0.21305358409881592, |
|
"learning_rate": 0.0001135096153846154, |
|
"loss": 0.0845, |
|
"mean_token_accuracy": 0.9678771600127221, |
|
"num_tokens": 1842176.0, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 4.351172579675286, |
|
"grad_norm": 0.24969004094600677, |
|
"learning_rate": 0.00011302884615384617, |
|
"loss": 0.0896, |
|
"mean_token_accuracy": 0.967383436858654, |
|
"num_tokens": 1852416.0, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 4.375225496091401, |
|
"grad_norm": 0.22428163886070251, |
|
"learning_rate": 0.00011254807692307694, |
|
"loss": 0.0895, |
|
"mean_token_accuracy": 0.9665916368365288, |
|
"num_tokens": 1862656.0, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 4.399278412507517, |
|
"grad_norm": 0.2337762862443924, |
|
"learning_rate": 0.00011206730769230769, |
|
"loss": 0.0926, |
|
"mean_token_accuracy": 0.964175409078598, |
|
"num_tokens": 1872896.0, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 4.4233313289236325, |
|
"grad_norm": 0.24770590662956238, |
|
"learning_rate": 0.00011158653846153846, |
|
"loss": 0.0916, |
|
"mean_token_accuracy": 0.9662819102406501, |
|
"num_tokens": 1883136.0, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 4.447384245339747, |
|
"grad_norm": 0.2134462147951126, |
|
"learning_rate": 0.00011110576923076923, |
|
"loss": 0.0881, |
|
"mean_token_accuracy": 0.9670575857162476, |
|
"num_tokens": 1893376.0, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 4.471437161755863, |
|
"grad_norm": 0.16387952864170074, |
|
"learning_rate": 0.000110625, |
|
"loss": 0.0871, |
|
"mean_token_accuracy": 0.9688689664006234, |
|
"num_tokens": 1903616.0, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 4.495490078171978, |
|
"grad_norm": 0.1718788743019104, |
|
"learning_rate": 0.00011014423076923077, |
|
"loss": 0.0887, |
|
"mean_token_accuracy": 0.9679651662707329, |
|
"num_tokens": 1913856.0, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 4.519542994588094, |
|
"grad_norm": 0.13868968188762665, |
|
"learning_rate": 0.00010966346153846154, |
|
"loss": 0.0869, |
|
"mean_token_accuracy": 0.9675926879048348, |
|
"num_tokens": 1924096.0, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 4.543595911004209, |
|
"grad_norm": 0.23226922750473022, |
|
"learning_rate": 0.00010918269230769232, |
|
"loss": 0.0893, |
|
"mean_token_accuracy": 0.9671968877315521, |
|
"num_tokens": 1934336.0, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 4.567648827420324, |
|
"grad_norm": 0.2364928126335144, |
|
"learning_rate": 0.00010870192307692309, |
|
"loss": 0.0868, |
|
"mean_token_accuracy": 0.9687703013420105, |
|
"num_tokens": 1944576.0, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 4.59170174383644, |
|
"grad_norm": 0.46209245920181274, |
|
"learning_rate": 0.00010822115384615385, |
|
"loss": 0.0908, |
|
"mean_token_accuracy": 0.9664398193359375, |
|
"num_tokens": 1954816.0, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 4.615754660252556, |
|
"grad_norm": 0.164833664894104, |
|
"learning_rate": 0.00010774038461538462, |
|
"loss": 0.0848, |
|
"mean_token_accuracy": 0.9686171919107437, |
|
"num_tokens": 1965056.0, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 4.639807576668671, |
|
"grad_norm": 0.2000456005334854, |
|
"learning_rate": 0.0001072596153846154, |
|
"loss": 0.0847, |
|
"mean_token_accuracy": 0.9683021917939186, |
|
"num_tokens": 1975296.0, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 4.663860493084787, |
|
"grad_norm": 0.1978590190410614, |
|
"learning_rate": 0.00010677884615384617, |
|
"loss": 0.0881, |
|
"mean_token_accuracy": 0.9670929208397865, |
|
"num_tokens": 1985536.0, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 4.687913409500902, |
|
"grad_norm": 0.3182586133480072, |
|
"learning_rate": 0.00010629807692307694, |
|
"loss": 0.0913, |
|
"mean_token_accuracy": 0.9663867846131324, |
|
"num_tokens": 1995776.0, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 4.711966325917017, |
|
"grad_norm": 0.18092605471611023, |
|
"learning_rate": 0.00010581730769230769, |
|
"loss": 0.0905, |
|
"mean_token_accuracy": 0.9664200291037559, |
|
"num_tokens": 2006016.0, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 4.736019242333133, |
|
"grad_norm": 0.17231811583042145, |
|
"learning_rate": 0.00010533653846153846, |
|
"loss": 0.0859, |
|
"mean_token_accuracy": 0.968836921453476, |
|
"num_tokens": 2016256.0, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 4.7600721587492485, |
|
"grad_norm": 0.15652543306350708, |
|
"learning_rate": 0.00010485576923076924, |
|
"loss": 0.0834, |
|
"mean_token_accuracy": 0.9684307157993317, |
|
"num_tokens": 2026496.0, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 4.784125075165364, |
|
"grad_norm": 0.13773974776268005, |
|
"learning_rate": 0.000104375, |
|
"loss": 0.0868, |
|
"mean_token_accuracy": 0.970444829761982, |
|
"num_tokens": 2036736.0, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 4.808177991581479, |
|
"grad_norm": 0.239529550075531, |
|
"learning_rate": 0.00010389423076923077, |
|
"loss": 0.0824, |
|
"mean_token_accuracy": 0.9694223091006279, |
|
"num_tokens": 2046976.0, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 4.8322309079975945, |
|
"grad_norm": 0.18709523975849152, |
|
"learning_rate": 0.00010341346153846154, |
|
"loss": 0.0949, |
|
"mean_token_accuracy": 0.965866394340992, |
|
"num_tokens": 2057216.0, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 4.85628382441371, |
|
"grad_norm": 0.2615446448326111, |
|
"learning_rate": 0.00010293269230769232, |
|
"loss": 0.083, |
|
"mean_token_accuracy": 0.9698115825653076, |
|
"num_tokens": 2067456.0, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 4.880336740829826, |
|
"grad_norm": 0.1910138875246048, |
|
"learning_rate": 0.00010245192307692308, |
|
"loss": 0.0907, |
|
"mean_token_accuracy": 0.9668553739786148, |
|
"num_tokens": 2077696.0, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 4.904389657245941, |
|
"grad_norm": 0.19025610387325287, |
|
"learning_rate": 0.00010197115384615385, |
|
"loss": 0.093, |
|
"mean_token_accuracy": 0.9664333924651146, |
|
"num_tokens": 2087936.0, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 4.928442573662057, |
|
"grad_norm": 0.35547563433647156, |
|
"learning_rate": 0.00010149038461538463, |
|
"loss": 0.0863, |
|
"mean_token_accuracy": 0.967856514453888, |
|
"num_tokens": 2098176.0, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 4.952495490078172, |
|
"grad_norm": 0.15040579438209534, |
|
"learning_rate": 0.0001010096153846154, |
|
"loss": 0.0823, |
|
"mean_token_accuracy": 0.9676536738872528, |
|
"num_tokens": 2108416.0, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 4.976548406494287, |
|
"grad_norm": 0.24159961938858032, |
|
"learning_rate": 0.00010052884615384617, |
|
"loss": 0.0853, |
|
"mean_token_accuracy": 0.9684896349906922, |
|
"num_tokens": 2118656.0, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.21156367659568787, |
|
"learning_rate": 0.00010004807692307693, |
|
"loss": 0.0872, |
|
"mean_token_accuracy": 0.9677492471841666, |
|
"num_tokens": 2128640.0, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 0.12297015637159348, |
|
"eval_mean_token_accuracy": 0.960414043132295, |
|
"eval_num_tokens": 2128640.0, |
|
"eval_runtime": 26.101, |
|
"eval_samples_per_second": 14.176, |
|
"eval_steps_per_second": 1.801, |
|
"step": 2080 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 4160, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.631112336965632e+16, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|