Divyanshu04's picture
Upload folder using huggingface_hub
679f965 verified
{
"best_global_step": 2080,
"best_metric": 0.12297015637159348,
"best_model_checkpoint": "./llama3-ft-deepspeed/checkpoint-2080",
"epoch": 5.0,
"eval_steps": 500,
"global_step": 2080,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.024052916416115455,
"grad_norm": 1.146120548248291,
"learning_rate": 0.0001995673076923077,
"loss": 1.6924,
"mean_token_accuracy": 0.6730133682489395,
"num_tokens": 10240.0,
"step": 10
},
{
"epoch": 0.04810583283223091,
"grad_norm": 1.4788161516189575,
"learning_rate": 0.00019908653846153847,
"loss": 0.7696,
"mean_token_accuracy": 0.835153691470623,
"num_tokens": 20480.0,
"step": 20
},
{
"epoch": 0.07215874924834637,
"grad_norm": 1.395352840423584,
"learning_rate": 0.00019860576923076922,
"loss": 0.4512,
"mean_token_accuracy": 0.8917673289775848,
"num_tokens": 30720.0,
"step": 30
},
{
"epoch": 0.09621166566446182,
"grad_norm": 1.7515279054641724,
"learning_rate": 0.000198125,
"loss": 0.3253,
"mean_token_accuracy": 0.9200380504131317,
"num_tokens": 40960.0,
"step": 40
},
{
"epoch": 0.12026458208057728,
"grad_norm": 0.7726802229881287,
"learning_rate": 0.00019764423076923079,
"loss": 0.2675,
"mean_token_accuracy": 0.9355374664068222,
"num_tokens": 51200.0,
"step": 50
},
{
"epoch": 0.14431749849669273,
"grad_norm": 0.6123030781745911,
"learning_rate": 0.00019716346153846154,
"loss": 0.2365,
"mean_token_accuracy": 0.9364401072263717,
"num_tokens": 61440.0,
"step": 60
},
{
"epoch": 0.1683704149128082,
"grad_norm": 0.6146537065505981,
"learning_rate": 0.00019668269230769232,
"loss": 0.2189,
"mean_token_accuracy": 0.9391294255852699,
"num_tokens": 71680.0,
"step": 70
},
{
"epoch": 0.19242333132892364,
"grad_norm": 0.5948600769042969,
"learning_rate": 0.0001962019230769231,
"loss": 0.2081,
"mean_token_accuracy": 0.9405477479100227,
"num_tokens": 81920.0,
"step": 80
},
{
"epoch": 0.2164762477450391,
"grad_norm": 0.5457659363746643,
"learning_rate": 0.00019572115384615385,
"loss": 0.2019,
"mean_token_accuracy": 0.9453761458396912,
"num_tokens": 92160.0,
"step": 90
},
{
"epoch": 0.24052916416115455,
"grad_norm": 0.49840784072875977,
"learning_rate": 0.00019524038461538463,
"loss": 0.2039,
"mean_token_accuracy": 0.9441700100898742,
"num_tokens": 102400.0,
"step": 100
},
{
"epoch": 0.26458208057727,
"grad_norm": 0.46384331583976746,
"learning_rate": 0.00019475961538461541,
"loss": 0.2028,
"mean_token_accuracy": 0.9462413385510444,
"num_tokens": 112640.0,
"step": 110
},
{
"epoch": 0.28863499699338546,
"grad_norm": 0.44089949131011963,
"learning_rate": 0.00019427884615384617,
"loss": 0.1871,
"mean_token_accuracy": 0.9464857250452041,
"num_tokens": 122880.0,
"step": 120
},
{
"epoch": 0.3126879134095009,
"grad_norm": 0.3707946240901947,
"learning_rate": 0.00019379807692307695,
"loss": 0.177,
"mean_token_accuracy": 0.9484001085162163,
"num_tokens": 133120.0,
"step": 130
},
{
"epoch": 0.3367408298256164,
"grad_norm": 0.3341757655143738,
"learning_rate": 0.0001933173076923077,
"loss": 0.1847,
"mean_token_accuracy": 0.9493804201483727,
"num_tokens": 143360.0,
"step": 140
},
{
"epoch": 0.3607937462417318,
"grad_norm": 0.4739825129508972,
"learning_rate": 0.00019283653846153845,
"loss": 0.1631,
"mean_token_accuracy": 0.9556754723191261,
"num_tokens": 153600.0,
"step": 150
},
{
"epoch": 0.3848466626578473,
"grad_norm": 0.3651292622089386,
"learning_rate": 0.00019235576923076923,
"loss": 0.1597,
"mean_token_accuracy": 0.9511243969202041,
"num_tokens": 163840.0,
"step": 160
},
{
"epoch": 0.40889957907396274,
"grad_norm": 0.3972345292568207,
"learning_rate": 0.00019187500000000002,
"loss": 0.19,
"mean_token_accuracy": 0.9461832985281944,
"num_tokens": 174080.0,
"step": 170
},
{
"epoch": 0.4329524954900782,
"grad_norm": 0.281608521938324,
"learning_rate": 0.00019139423076923077,
"loss": 0.1693,
"mean_token_accuracy": 0.9498817101120949,
"num_tokens": 184320.0,
"step": 180
},
{
"epoch": 0.45700541190619365,
"grad_norm": 0.3879983723163605,
"learning_rate": 0.00019091346153846155,
"loss": 0.1549,
"mean_token_accuracy": 0.9546341434121132,
"num_tokens": 194560.0,
"step": 190
},
{
"epoch": 0.4810583283223091,
"grad_norm": 0.31266576051712036,
"learning_rate": 0.00019043269230769233,
"loss": 0.1689,
"mean_token_accuracy": 0.95065086632967,
"num_tokens": 204800.0,
"step": 200
},
{
"epoch": 0.5051112447384245,
"grad_norm": 0.41939643025398254,
"learning_rate": 0.00018995192307692308,
"loss": 0.1798,
"mean_token_accuracy": 0.9457689806818962,
"num_tokens": 215040.0,
"step": 210
},
{
"epoch": 0.52916416115454,
"grad_norm": 0.45524027943611145,
"learning_rate": 0.00018947115384615386,
"loss": 0.1708,
"mean_token_accuracy": 0.9461533144116402,
"num_tokens": 225280.0,
"step": 220
},
{
"epoch": 0.5532170775706554,
"grad_norm": 0.3720427453517914,
"learning_rate": 0.00018899038461538462,
"loss": 0.162,
"mean_token_accuracy": 0.9516115784645081,
"num_tokens": 235520.0,
"step": 230
},
{
"epoch": 0.5772699939867709,
"grad_norm": 0.3217442035675049,
"learning_rate": 0.0001885096153846154,
"loss": 0.163,
"mean_token_accuracy": 0.9550325214862824,
"num_tokens": 245760.0,
"step": 240
},
{
"epoch": 0.6013229104028863,
"grad_norm": 0.38512468338012695,
"learning_rate": 0.00018802884615384618,
"loss": 0.1607,
"mean_token_accuracy": 0.9519079640507698,
"num_tokens": 256000.0,
"step": 250
},
{
"epoch": 0.6253758268190018,
"grad_norm": 0.31379130482673645,
"learning_rate": 0.00018754807692307693,
"loss": 0.1379,
"mean_token_accuracy": 0.9568226426839829,
"num_tokens": 266240.0,
"step": 260
},
{
"epoch": 0.6494287432351172,
"grad_norm": 0.3925431966781616,
"learning_rate": 0.00018706730769230768,
"loss": 0.1527,
"mean_token_accuracy": 0.9536312386393547,
"num_tokens": 276480.0,
"step": 270
},
{
"epoch": 0.6734816596512327,
"grad_norm": 0.3034338355064392,
"learning_rate": 0.00018658653846153847,
"loss": 0.1648,
"mean_token_accuracy": 0.9480760931968689,
"num_tokens": 286720.0,
"step": 280
},
{
"epoch": 0.6975345760673481,
"grad_norm": 0.36241453886032104,
"learning_rate": 0.00018610576923076925,
"loss": 0.1757,
"mean_token_accuracy": 0.9463522598147392,
"num_tokens": 296960.0,
"step": 290
},
{
"epoch": 0.7215874924834637,
"grad_norm": 0.3405214846134186,
"learning_rate": 0.000185625,
"loss": 0.1631,
"mean_token_accuracy": 0.949561494588852,
"num_tokens": 307200.0,
"step": 300
},
{
"epoch": 0.745640408899579,
"grad_norm": 0.2847406268119812,
"learning_rate": 0.00018514423076923078,
"loss": 0.1533,
"mean_token_accuracy": 0.9555268749594689,
"num_tokens": 317440.0,
"step": 310
},
{
"epoch": 0.7696933253156946,
"grad_norm": 0.26305094361305237,
"learning_rate": 0.00018466346153846153,
"loss": 0.1611,
"mean_token_accuracy": 0.9532571867108345,
"num_tokens": 327680.0,
"step": 320
},
{
"epoch": 0.79374624173181,
"grad_norm": 0.2989274263381958,
"learning_rate": 0.0001841826923076923,
"loss": 0.1441,
"mean_token_accuracy": 0.9533735632896423,
"num_tokens": 337920.0,
"step": 330
},
{
"epoch": 0.8177991581479255,
"grad_norm": 0.254688560962677,
"learning_rate": 0.0001837019230769231,
"loss": 0.1495,
"mean_token_accuracy": 0.9542877942323684,
"num_tokens": 348160.0,
"step": 340
},
{
"epoch": 0.8418520745640409,
"grad_norm": 0.26552721858024597,
"learning_rate": 0.00018322115384615385,
"loss": 0.149,
"mean_token_accuracy": 0.9527715161442757,
"num_tokens": 358400.0,
"step": 350
},
{
"epoch": 0.8659049909801564,
"grad_norm": 0.2920278012752533,
"learning_rate": 0.00018274038461538463,
"loss": 0.1506,
"mean_token_accuracy": 0.9556835174560547,
"num_tokens": 368640.0,
"step": 360
},
{
"epoch": 0.8899579073962718,
"grad_norm": 0.34060242772102356,
"learning_rate": 0.0001822596153846154,
"loss": 0.1332,
"mean_token_accuracy": 0.959219790995121,
"num_tokens": 378880.0,
"step": 370
},
{
"epoch": 0.9140108238123873,
"grad_norm": 0.3549470603466034,
"learning_rate": 0.00018177884615384616,
"loss": 0.1553,
"mean_token_accuracy": 0.9529827669262886,
"num_tokens": 389120.0,
"step": 380
},
{
"epoch": 0.9380637402285027,
"grad_norm": 0.26259645819664,
"learning_rate": 0.00018129807692307694,
"loss": 0.1355,
"mean_token_accuracy": 0.9546903222799301,
"num_tokens": 399360.0,
"step": 390
},
{
"epoch": 0.9621166566446182,
"grad_norm": 0.28768494725227356,
"learning_rate": 0.0001808173076923077,
"loss": 0.1314,
"mean_token_accuracy": 0.9578663051128388,
"num_tokens": 409600.0,
"step": 400
},
{
"epoch": 0.9861695730607336,
"grad_norm": 0.23446713387966156,
"learning_rate": 0.00018033653846153848,
"loss": 0.1433,
"mean_token_accuracy": 0.9555784195661545,
"num_tokens": 419840.0,
"step": 410
},
{
"epoch": 1.0,
"eval_loss": 0.14436590671539307,
"eval_mean_token_accuracy": 0.9552223542903332,
"eval_num_tokens": 425728.0,
"eval_runtime": 25.1182,
"eval_samples_per_second": 14.73,
"eval_steps_per_second": 1.871,
"step": 416
},
{
"epoch": 1.009621166566446,
"grad_norm": 0.2881328761577606,
"learning_rate": 0.00017985576923076923,
"loss": 0.1424,
"mean_token_accuracy": 0.9537300452207907,
"num_tokens": 429824.0,
"step": 420
},
{
"epoch": 1.0336740829825617,
"grad_norm": 1.1323237419128418,
"learning_rate": 0.000179375,
"loss": 0.117,
"mean_token_accuracy": 0.960891704261303,
"num_tokens": 440064.0,
"step": 430
},
{
"epoch": 1.057726999398677,
"grad_norm": 0.3971687853336334,
"learning_rate": 0.00017889423076923076,
"loss": 0.1233,
"mean_token_accuracy": 0.9594320252537727,
"num_tokens": 450304.0,
"step": 440
},
{
"epoch": 1.0817799158147925,
"grad_norm": 0.235861137509346,
"learning_rate": 0.00017841346153846154,
"loss": 0.1322,
"mean_token_accuracy": 0.9574261009693146,
"num_tokens": 460544.0,
"step": 450
},
{
"epoch": 1.1058328322309081,
"grad_norm": 0.2572536766529083,
"learning_rate": 0.00017793269230769232,
"loss": 0.1288,
"mean_token_accuracy": 0.9600686863064766,
"num_tokens": 470784.0,
"step": 460
},
{
"epoch": 1.1298857486470235,
"grad_norm": 0.33651918172836304,
"learning_rate": 0.00017745192307692308,
"loss": 0.1367,
"mean_token_accuracy": 0.9558201640844345,
"num_tokens": 481024.0,
"step": 470
},
{
"epoch": 1.153938665063139,
"grad_norm": 0.2679121196269989,
"learning_rate": 0.00017697115384615386,
"loss": 0.1218,
"mean_token_accuracy": 0.9592645660042762,
"num_tokens": 491264.0,
"step": 480
},
{
"epoch": 1.1779915814792543,
"grad_norm": 0.24321414530277252,
"learning_rate": 0.00017649038461538464,
"loss": 0.1338,
"mean_token_accuracy": 0.958324646949768,
"num_tokens": 501504.0,
"step": 490
},
{
"epoch": 1.2020444978953697,
"grad_norm": 0.29272493720054626,
"learning_rate": 0.0001760096153846154,
"loss": 0.1252,
"mean_token_accuracy": 0.9605333045125007,
"num_tokens": 511744.0,
"step": 500
},
{
"epoch": 1.2260974143114853,
"grad_norm": 0.24240529537200928,
"learning_rate": 0.00017552884615384617,
"loss": 0.1208,
"mean_token_accuracy": 0.960656826198101,
"num_tokens": 521984.0,
"step": 510
},
{
"epoch": 1.2501503307276007,
"grad_norm": 0.3100011944770813,
"learning_rate": 0.00017504807692307695,
"loss": 0.1254,
"mean_token_accuracy": 0.9609130263328552,
"num_tokens": 532224.0,
"step": 520
},
{
"epoch": 1.2742032471437161,
"grad_norm": 0.28045424818992615,
"learning_rate": 0.00017456730769230768,
"loss": 0.1304,
"mean_token_accuracy": 0.9579013884067535,
"num_tokens": 542464.0,
"step": 530
},
{
"epoch": 1.2982561635598318,
"grad_norm": 0.3832496702671051,
"learning_rate": 0.00017408653846153846,
"loss": 0.1427,
"mean_token_accuracy": 0.9550067007541656,
"num_tokens": 552704.0,
"step": 540
},
{
"epoch": 1.3223090799759472,
"grad_norm": 0.32106658816337585,
"learning_rate": 0.00017360576923076924,
"loss": 0.129,
"mean_token_accuracy": 0.956248240172863,
"num_tokens": 562944.0,
"step": 550
},
{
"epoch": 1.3463619963920626,
"grad_norm": 0.3141852915287018,
"learning_rate": 0.000173125,
"loss": 0.1145,
"mean_token_accuracy": 0.9619206488132477,
"num_tokens": 573184.0,
"step": 560
},
{
"epoch": 1.370414912808178,
"grad_norm": 0.31353870034217834,
"learning_rate": 0.00017264423076923077,
"loss": 0.126,
"mean_token_accuracy": 0.9610635504126549,
"num_tokens": 583424.0,
"step": 570
},
{
"epoch": 1.3944678292242934,
"grad_norm": 0.24016191065311432,
"learning_rate": 0.00017216346153846155,
"loss": 0.1318,
"mean_token_accuracy": 0.9551108077168464,
"num_tokens": 593664.0,
"step": 580
},
{
"epoch": 1.418520745640409,
"grad_norm": 0.2362779825925827,
"learning_rate": 0.0001716826923076923,
"loss": 0.1163,
"mean_token_accuracy": 0.9617116883397102,
"num_tokens": 603904.0,
"step": 590
},
{
"epoch": 1.4425736620565244,
"grad_norm": 0.2012677937746048,
"learning_rate": 0.0001712019230769231,
"loss": 0.1378,
"mean_token_accuracy": 0.9564808994531632,
"num_tokens": 614144.0,
"step": 600
},
{
"epoch": 1.4666265784726398,
"grad_norm": 0.33629047870635986,
"learning_rate": 0.00017072115384615387,
"loss": 0.1215,
"mean_token_accuracy": 0.9621520712971687,
"num_tokens": 624384.0,
"step": 610
},
{
"epoch": 1.4906794948887554,
"grad_norm": 0.30020079016685486,
"learning_rate": 0.00017024038461538462,
"loss": 0.1318,
"mean_token_accuracy": 0.9570673331618309,
"num_tokens": 634624.0,
"step": 620
},
{
"epoch": 1.5147324113048706,
"grad_norm": 0.2715687155723572,
"learning_rate": 0.0001697596153846154,
"loss": 0.1248,
"mean_token_accuracy": 0.9611581727862358,
"num_tokens": 644864.0,
"step": 630
},
{
"epoch": 1.5387853277209862,
"grad_norm": 0.2919619381427765,
"learning_rate": 0.00016927884615384618,
"loss": 0.116,
"mean_token_accuracy": 0.9624506369233131,
"num_tokens": 655104.0,
"step": 640
},
{
"epoch": 1.5628382441371016,
"grad_norm": 0.26573172211647034,
"learning_rate": 0.00016879807692307694,
"loss": 0.1253,
"mean_token_accuracy": 0.9609674572944641,
"num_tokens": 665344.0,
"step": 650
},
{
"epoch": 1.586891160553217,
"grad_norm": 0.3533133566379547,
"learning_rate": 0.0001683173076923077,
"loss": 0.129,
"mean_token_accuracy": 0.9564981684088707,
"num_tokens": 675584.0,
"step": 660
},
{
"epoch": 1.6109440769693326,
"grad_norm": 0.23616884648799896,
"learning_rate": 0.00016783653846153847,
"loss": 0.1198,
"mean_token_accuracy": 0.9604200229048729,
"num_tokens": 685824.0,
"step": 670
},
{
"epoch": 1.634996993385448,
"grad_norm": 0.29362770915031433,
"learning_rate": 0.00016735576923076922,
"loss": 0.1248,
"mean_token_accuracy": 0.9565964505076409,
"num_tokens": 696064.0,
"step": 680
},
{
"epoch": 1.6590499098015634,
"grad_norm": 0.32813316583633423,
"learning_rate": 0.000166875,
"loss": 0.1404,
"mean_token_accuracy": 0.9544960707426071,
"num_tokens": 706304.0,
"step": 690
},
{
"epoch": 1.683102826217679,
"grad_norm": 0.23794318735599518,
"learning_rate": 0.00016639423076923078,
"loss": 0.1239,
"mean_token_accuracy": 0.9590726107358932,
"num_tokens": 716544.0,
"step": 700
},
{
"epoch": 1.7071557426337942,
"grad_norm": 0.3290494680404663,
"learning_rate": 0.00016591346153846154,
"loss": 0.1136,
"mean_token_accuracy": 0.9635635167360306,
"num_tokens": 726784.0,
"step": 710
},
{
"epoch": 1.7312086590499098,
"grad_norm": 0.2637302577495575,
"learning_rate": 0.00016543269230769232,
"loss": 0.1211,
"mean_token_accuracy": 0.957936991751194,
"num_tokens": 737024.0,
"step": 720
},
{
"epoch": 1.7552615754660252,
"grad_norm": 0.2866067588329315,
"learning_rate": 0.0001649519230769231,
"loss": 0.1188,
"mean_token_accuracy": 0.9595590904355049,
"num_tokens": 747264.0,
"step": 730
},
{
"epoch": 1.7793144918821406,
"grad_norm": 0.31299787759780884,
"learning_rate": 0.00016447115384615385,
"loss": 0.1303,
"mean_token_accuracy": 0.9595273941755295,
"num_tokens": 757504.0,
"step": 740
},
{
"epoch": 1.8033674082982563,
"grad_norm": 0.25704699754714966,
"learning_rate": 0.00016399038461538463,
"loss": 0.12,
"mean_token_accuracy": 0.9622810766100883,
"num_tokens": 767744.0,
"step": 750
},
{
"epoch": 1.8274203247143717,
"grad_norm": 0.20122775435447693,
"learning_rate": 0.0001635096153846154,
"loss": 0.1153,
"mean_token_accuracy": 0.961222605407238,
"num_tokens": 777984.0,
"step": 760
},
{
"epoch": 1.851473241130487,
"grad_norm": 0.21299275755882263,
"learning_rate": 0.00016302884615384617,
"loss": 0.114,
"mean_token_accuracy": 0.9608678832650185,
"num_tokens": 788224.0,
"step": 770
},
{
"epoch": 1.8755261575466027,
"grad_norm": 0.25124669075012207,
"learning_rate": 0.00016254807692307695,
"loss": 0.122,
"mean_token_accuracy": 0.962291096150875,
"num_tokens": 798464.0,
"step": 780
},
{
"epoch": 1.8995790739627179,
"grad_norm": 0.2945667803287506,
"learning_rate": 0.0001620673076923077,
"loss": 0.1183,
"mean_token_accuracy": 0.9604008629918098,
"num_tokens": 808704.0,
"step": 790
},
{
"epoch": 1.9236319903788335,
"grad_norm": 0.2346087247133255,
"learning_rate": 0.00016158653846153845,
"loss": 0.1136,
"mean_token_accuracy": 0.963145537674427,
"num_tokens": 818944.0,
"step": 800
},
{
"epoch": 1.9476849067949489,
"grad_norm": 0.22119984030723572,
"learning_rate": 0.00016110576923076923,
"loss": 0.1235,
"mean_token_accuracy": 0.9591810956597329,
"num_tokens": 829184.0,
"step": 810
},
{
"epoch": 1.9717378232110643,
"grad_norm": 0.2481156885623932,
"learning_rate": 0.00016062500000000001,
"loss": 0.1208,
"mean_token_accuracy": 0.9615909501910209,
"num_tokens": 839424.0,
"step": 820
},
{
"epoch": 1.99579073962718,
"grad_norm": 0.24333243072032928,
"learning_rate": 0.00016014423076923077,
"loss": 0.121,
"mean_token_accuracy": 0.9590676620602607,
"num_tokens": 849664.0,
"step": 830
},
{
"epoch": 2.0,
"eval_loss": 0.1264333575963974,
"eval_mean_token_accuracy": 0.9596163808031285,
"eval_num_tokens": 851456.0,
"eval_runtime": 24.9078,
"eval_samples_per_second": 14.855,
"eval_steps_per_second": 1.887,
"step": 832
},
{
"epoch": 2.019242333132892,
"grad_norm": 0.25562378764152527,
"learning_rate": 0.00015966346153846155,
"loss": 0.1082,
"mean_token_accuracy": 0.9661401877036462,
"num_tokens": 859648.0,
"step": 840
},
{
"epoch": 2.0432952495490078,
"grad_norm": 0.2851448059082031,
"learning_rate": 0.00015918269230769233,
"loss": 0.0988,
"mean_token_accuracy": 0.9670166626572609,
"num_tokens": 869888.0,
"step": 850
},
{
"epoch": 2.0673481659651234,
"grad_norm": 0.3552910387516022,
"learning_rate": 0.00015870192307692308,
"loss": 0.1058,
"mean_token_accuracy": 0.961455948650837,
"num_tokens": 880128.0,
"step": 860
},
{
"epoch": 2.0914010823812386,
"grad_norm": 0.18810558319091797,
"learning_rate": 0.00015822115384615386,
"loss": 0.0962,
"mean_token_accuracy": 0.965205217897892,
"num_tokens": 890368.0,
"step": 870
},
{
"epoch": 2.115453998797354,
"grad_norm": 0.24357327818870544,
"learning_rate": 0.00015774038461538462,
"loss": 0.1121,
"mean_token_accuracy": 0.9618488609790802,
"num_tokens": 900608.0,
"step": 880
},
{
"epoch": 2.13950691521347,
"grad_norm": 0.24040566384792328,
"learning_rate": 0.0001572596153846154,
"loss": 0.1064,
"mean_token_accuracy": 0.9660862430930137,
"num_tokens": 910848.0,
"step": 890
},
{
"epoch": 2.163559831629585,
"grad_norm": 0.24545449018478394,
"learning_rate": 0.00015677884615384618,
"loss": 0.0965,
"mean_token_accuracy": 0.9677914634346962,
"num_tokens": 921088.0,
"step": 900
},
{
"epoch": 2.1876127480457006,
"grad_norm": 0.30429190397262573,
"learning_rate": 0.00015629807692307693,
"loss": 0.105,
"mean_token_accuracy": 0.9644672557711601,
"num_tokens": 931328.0,
"step": 910
},
{
"epoch": 2.2116656644618162,
"grad_norm": 0.2934926450252533,
"learning_rate": 0.00015581730769230768,
"loss": 0.0968,
"mean_token_accuracy": 0.965617573261261,
"num_tokens": 941568.0,
"step": 920
},
{
"epoch": 2.2357185808779314,
"grad_norm": 0.3533467948436737,
"learning_rate": 0.00015533653846153846,
"loss": 0.1073,
"mean_token_accuracy": 0.9617443799972534,
"num_tokens": 951808.0,
"step": 930
},
{
"epoch": 2.259771497294047,
"grad_norm": 0.15458804368972778,
"learning_rate": 0.00015485576923076924,
"loss": 0.1069,
"mean_token_accuracy": 0.9629932105541229,
"num_tokens": 962048.0,
"step": 940
},
{
"epoch": 2.283824413710162,
"grad_norm": 0.6568232774734497,
"learning_rate": 0.000154375,
"loss": 0.1077,
"mean_token_accuracy": 0.9618542537093162,
"num_tokens": 972288.0,
"step": 950
},
{
"epoch": 2.307877330126278,
"grad_norm": 0.3730143904685974,
"learning_rate": 0.00015389423076923078,
"loss": 0.1139,
"mean_token_accuracy": 0.9590795397758484,
"num_tokens": 982528.0,
"step": 960
},
{
"epoch": 2.3319302465423934,
"grad_norm": 0.26199349761009216,
"learning_rate": 0.00015341346153846153,
"loss": 0.115,
"mean_token_accuracy": 0.9604379132390022,
"num_tokens": 992768.0,
"step": 970
},
{
"epoch": 2.3559831629585086,
"grad_norm": 0.3791094720363617,
"learning_rate": 0.0001529326923076923,
"loss": 0.1066,
"mean_token_accuracy": 0.9632134348154068,
"num_tokens": 1003008.0,
"step": 980
},
{
"epoch": 2.3800360793746242,
"grad_norm": 0.1697998046875,
"learning_rate": 0.0001524519230769231,
"loss": 0.1024,
"mean_token_accuracy": 0.9637576416134834,
"num_tokens": 1013248.0,
"step": 990
},
{
"epoch": 2.4040889957907394,
"grad_norm": 0.3326902389526367,
"learning_rate": 0.00015197115384615385,
"loss": 0.1066,
"mean_token_accuracy": 0.9614829778671264,
"num_tokens": 1023488.0,
"step": 1000
},
{
"epoch": 2.428141912206855,
"grad_norm": 0.22747723758220673,
"learning_rate": 0.00015149038461538463,
"loss": 0.0924,
"mean_token_accuracy": 0.9677571937441826,
"num_tokens": 1033728.0,
"step": 1010
},
{
"epoch": 2.4521948286229707,
"grad_norm": 0.25410860776901245,
"learning_rate": 0.0001510096153846154,
"loss": 0.1097,
"mean_token_accuracy": 0.963656097650528,
"num_tokens": 1043968.0,
"step": 1020
},
{
"epoch": 2.476247745039086,
"grad_norm": 0.2268102765083313,
"learning_rate": 0.00015052884615384616,
"loss": 0.1018,
"mean_token_accuracy": 0.9653624445199966,
"num_tokens": 1054208.0,
"step": 1030
},
{
"epoch": 2.5003006614552015,
"grad_norm": 0.23511724174022675,
"learning_rate": 0.00015004807692307694,
"loss": 0.0969,
"mean_token_accuracy": 0.9658154919743538,
"num_tokens": 1064448.0,
"step": 1040
},
{
"epoch": 2.5243535778713166,
"grad_norm": 0.23023869097232819,
"learning_rate": 0.0001495673076923077,
"loss": 0.1053,
"mean_token_accuracy": 0.96552574634552,
"num_tokens": 1074688.0,
"step": 1050
},
{
"epoch": 2.5484064942874323,
"grad_norm": 0.25081828236579895,
"learning_rate": 0.00014908653846153845,
"loss": 0.1044,
"mean_token_accuracy": 0.9639346837997437,
"num_tokens": 1084928.0,
"step": 1060
},
{
"epoch": 2.572459410703548,
"grad_norm": 0.2557835280895233,
"learning_rate": 0.00014860576923076923,
"loss": 0.106,
"mean_token_accuracy": 0.9623663991689682,
"num_tokens": 1095168.0,
"step": 1070
},
{
"epoch": 2.5965123271196635,
"grad_norm": 0.1407734751701355,
"learning_rate": 0.000148125,
"loss": 0.1092,
"mean_token_accuracy": 0.9625474750995636,
"num_tokens": 1105408.0,
"step": 1080
},
{
"epoch": 2.6205652435357787,
"grad_norm": 0.2862281799316406,
"learning_rate": 0.00014764423076923076,
"loss": 0.1053,
"mean_token_accuracy": 0.9632299616932869,
"num_tokens": 1115648.0,
"step": 1090
},
{
"epoch": 2.6446181599518943,
"grad_norm": 0.26334843039512634,
"learning_rate": 0.00014716346153846154,
"loss": 0.1209,
"mean_token_accuracy": 0.9601381734013558,
"num_tokens": 1125888.0,
"step": 1100
},
{
"epoch": 2.6686710763680095,
"grad_norm": 0.20137883722782135,
"learning_rate": 0.00014668269230769232,
"loss": 0.1057,
"mean_token_accuracy": 0.9626315057277679,
"num_tokens": 1136128.0,
"step": 1110
},
{
"epoch": 2.692723992784125,
"grad_norm": 0.3017769157886505,
"learning_rate": 0.00014620192307692308,
"loss": 0.104,
"mean_token_accuracy": 0.9653211057186126,
"num_tokens": 1146368.0,
"step": 1120
},
{
"epoch": 2.7167769092002407,
"grad_norm": 0.24447724223136902,
"learning_rate": 0.00014572115384615386,
"loss": 0.111,
"mean_token_accuracy": 0.9617329522967338,
"num_tokens": 1156608.0,
"step": 1130
},
{
"epoch": 2.740829825616356,
"grad_norm": 0.22615140676498413,
"learning_rate": 0.00014524038461538464,
"loss": 0.113,
"mean_token_accuracy": 0.9617576941847801,
"num_tokens": 1166848.0,
"step": 1140
},
{
"epoch": 2.7648827420324715,
"grad_norm": 0.20445455610752106,
"learning_rate": 0.0001447596153846154,
"loss": 0.1064,
"mean_token_accuracy": 0.965658301115036,
"num_tokens": 1177088.0,
"step": 1150
},
{
"epoch": 2.7889356584485867,
"grad_norm": 0.2821711003780365,
"learning_rate": 0.00014427884615384617,
"loss": 0.1088,
"mean_token_accuracy": 0.9635483458638191,
"num_tokens": 1187328.0,
"step": 1160
},
{
"epoch": 2.8129885748647023,
"grad_norm": 0.21199429035186768,
"learning_rate": 0.00014379807692307695,
"loss": 0.1071,
"mean_token_accuracy": 0.9626901999115944,
"num_tokens": 1197568.0,
"step": 1170
},
{
"epoch": 2.837041491280818,
"grad_norm": 0.23910683393478394,
"learning_rate": 0.00014331730769230768,
"loss": 0.1095,
"mean_token_accuracy": 0.9642450660467148,
"num_tokens": 1207808.0,
"step": 1180
},
{
"epoch": 2.861094407696933,
"grad_norm": 0.22157999873161316,
"learning_rate": 0.00014283653846153846,
"loss": 0.0968,
"mean_token_accuracy": 0.964881993830204,
"num_tokens": 1218048.0,
"step": 1190
},
{
"epoch": 2.8851473241130488,
"grad_norm": 0.25650596618652344,
"learning_rate": 0.00014235576923076924,
"loss": 0.1027,
"mean_token_accuracy": 0.9612545475363732,
"num_tokens": 1228288.0,
"step": 1200
},
{
"epoch": 2.909200240529164,
"grad_norm": 0.32955703139305115,
"learning_rate": 0.000141875,
"loss": 0.1096,
"mean_token_accuracy": 0.9617963835597039,
"num_tokens": 1238528.0,
"step": 1210
},
{
"epoch": 2.9332531569452795,
"grad_norm": 0.21780824661254883,
"learning_rate": 0.00014139423076923077,
"loss": 0.1033,
"mean_token_accuracy": 0.9646632343530654,
"num_tokens": 1248768.0,
"step": 1220
},
{
"epoch": 2.957306073361395,
"grad_norm": 0.2339126020669937,
"learning_rate": 0.00014091346153846155,
"loss": 0.1031,
"mean_token_accuracy": 0.964007930457592,
"num_tokens": 1259008.0,
"step": 1230
},
{
"epoch": 2.981358989777511,
"grad_norm": 0.31986188888549805,
"learning_rate": 0.0001404326923076923,
"loss": 0.1018,
"mean_token_accuracy": 0.9668645352125168,
"num_tokens": 1269248.0,
"step": 1240
},
{
"epoch": 3.0,
"eval_loss": 0.12327124178409576,
"eval_mean_token_accuracy": 0.9614017339462929,
"eval_num_tokens": 1277184.0,
"eval_runtime": 25.0119,
"eval_samples_per_second": 14.793,
"eval_steps_per_second": 1.879,
"step": 1248
},
{
"epoch": 3.004810583283223,
"grad_norm": 0.20351627469062805,
"learning_rate": 0.0001399519230769231,
"loss": 0.0923,
"mean_token_accuracy": 0.9675439733725327,
"num_tokens": 1279232.0,
"step": 1250
},
{
"epoch": 3.0288634996993387,
"grad_norm": 0.3330595791339874,
"learning_rate": 0.00013947115384615387,
"loss": 0.0906,
"mean_token_accuracy": 0.9674146190285683,
"num_tokens": 1289472.0,
"step": 1260
},
{
"epoch": 3.052916416115454,
"grad_norm": 0.2875508666038513,
"learning_rate": 0.00013899038461538462,
"loss": 0.0953,
"mean_token_accuracy": 0.9661522597074509,
"num_tokens": 1299712.0,
"step": 1270
},
{
"epoch": 3.0769693325315695,
"grad_norm": 0.26365041732788086,
"learning_rate": 0.0001385096153846154,
"loss": 0.0898,
"mean_token_accuracy": 0.9656254693865776,
"num_tokens": 1309952.0,
"step": 1280
},
{
"epoch": 3.101022248947685,
"grad_norm": 0.22576822340488434,
"learning_rate": 0.00013802884615384618,
"loss": 0.0911,
"mean_token_accuracy": 0.9678579106926918,
"num_tokens": 1320192.0,
"step": 1290
},
{
"epoch": 3.1250751653638003,
"grad_norm": 0.2698352038860321,
"learning_rate": 0.00013754807692307694,
"loss": 0.098,
"mean_token_accuracy": 0.9671777725219727,
"num_tokens": 1330432.0,
"step": 1300
},
{
"epoch": 3.149128081779916,
"grad_norm": 0.29429781436920166,
"learning_rate": 0.0001370673076923077,
"loss": 0.0883,
"mean_token_accuracy": 0.9666375696659089,
"num_tokens": 1340672.0,
"step": 1310
},
{
"epoch": 3.173180998196031,
"grad_norm": 0.25614482164382935,
"learning_rate": 0.00013658653846153847,
"loss": 0.0915,
"mean_token_accuracy": 0.967202040553093,
"num_tokens": 1350912.0,
"step": 1320
},
{
"epoch": 3.1972339146121467,
"grad_norm": 0.2897791862487793,
"learning_rate": 0.00013610576923076922,
"loss": 0.0956,
"mean_token_accuracy": 0.965019428730011,
"num_tokens": 1361152.0,
"step": 1330
},
{
"epoch": 3.2212868310282623,
"grad_norm": 0.1813499927520752,
"learning_rate": 0.000135625,
"loss": 0.0941,
"mean_token_accuracy": 0.9646437510848045,
"num_tokens": 1371392.0,
"step": 1340
},
{
"epoch": 3.2453397474443775,
"grad_norm": 0.19243107736110687,
"learning_rate": 0.00013514423076923078,
"loss": 0.091,
"mean_token_accuracy": 0.9650159657001496,
"num_tokens": 1381632.0,
"step": 1350
},
{
"epoch": 3.269392663860493,
"grad_norm": 0.22238536179065704,
"learning_rate": 0.00013466346153846154,
"loss": 0.0924,
"mean_token_accuracy": 0.9653570145368576,
"num_tokens": 1391872.0,
"step": 1360
},
{
"epoch": 3.2934455802766087,
"grad_norm": 0.15673308074474335,
"learning_rate": 0.00013418269230769232,
"loss": 0.0892,
"mean_token_accuracy": 0.9664774596691131,
"num_tokens": 1402112.0,
"step": 1370
},
{
"epoch": 3.317498496692724,
"grad_norm": 0.2173730581998825,
"learning_rate": 0.0001337019230769231,
"loss": 0.0978,
"mean_token_accuracy": 0.9646672755479813,
"num_tokens": 1412352.0,
"step": 1380
},
{
"epoch": 3.3415514131088395,
"grad_norm": 0.17452774941921234,
"learning_rate": 0.00013322115384615385,
"loss": 0.0988,
"mean_token_accuracy": 0.9650528088212014,
"num_tokens": 1422592.0,
"step": 1390
},
{
"epoch": 3.365604329524955,
"grad_norm": 0.19794107973575592,
"learning_rate": 0.00013274038461538463,
"loss": 0.0906,
"mean_token_accuracy": 0.9670823276042938,
"num_tokens": 1432832.0,
"step": 1400
},
{
"epoch": 3.3896572459410703,
"grad_norm": 0.17740049958229065,
"learning_rate": 0.00013225961538461539,
"loss": 0.0917,
"mean_token_accuracy": 0.9663808658719063,
"num_tokens": 1443072.0,
"step": 1410
},
{
"epoch": 3.413710162357186,
"grad_norm": 0.24540650844573975,
"learning_rate": 0.00013177884615384617,
"loss": 0.0889,
"mean_token_accuracy": 0.9661073669791221,
"num_tokens": 1453312.0,
"step": 1420
},
{
"epoch": 3.437763078773301,
"grad_norm": 0.2300598919391632,
"learning_rate": 0.00013129807692307695,
"loss": 0.0865,
"mean_token_accuracy": 0.9688371613621711,
"num_tokens": 1463552.0,
"step": 1430
},
{
"epoch": 3.4618159951894167,
"grad_norm": 0.18420913815498352,
"learning_rate": 0.0001308173076923077,
"loss": 0.0963,
"mean_token_accuracy": 0.9648661985993385,
"num_tokens": 1473792.0,
"step": 1440
},
{
"epoch": 3.4858689116055324,
"grad_norm": 0.27637794613838196,
"learning_rate": 0.00013033653846153845,
"loss": 0.0984,
"mean_token_accuracy": 0.9647489741444588,
"num_tokens": 1484032.0,
"step": 1450
},
{
"epoch": 3.5099218280216475,
"grad_norm": 0.28515446186065674,
"learning_rate": 0.00012985576923076923,
"loss": 0.0915,
"mean_token_accuracy": 0.9642152190208435,
"num_tokens": 1494272.0,
"step": 1460
},
{
"epoch": 3.533974744437763,
"grad_norm": 0.19887416064739227,
"learning_rate": 0.00012937500000000001,
"loss": 0.0963,
"mean_token_accuracy": 0.9659793645143508,
"num_tokens": 1504512.0,
"step": 1470
},
{
"epoch": 3.5580276608538783,
"grad_norm": 0.2664344906806946,
"learning_rate": 0.00012889423076923077,
"loss": 0.0896,
"mean_token_accuracy": 0.9680359989404679,
"num_tokens": 1514752.0,
"step": 1480
},
{
"epoch": 3.582080577269994,
"grad_norm": 0.23911890387535095,
"learning_rate": 0.00012841346153846155,
"loss": 0.0966,
"mean_token_accuracy": 0.9650948256254196,
"num_tokens": 1524992.0,
"step": 1490
},
{
"epoch": 3.6061334936861096,
"grad_norm": 0.21762046217918396,
"learning_rate": 0.0001279326923076923,
"loss": 0.0952,
"mean_token_accuracy": 0.9666063264012337,
"num_tokens": 1535232.0,
"step": 1500
},
{
"epoch": 3.6301864101022248,
"grad_norm": 0.263232558965683,
"learning_rate": 0.00012745192307692308,
"loss": 0.0993,
"mean_token_accuracy": 0.9661560922861099,
"num_tokens": 1545472.0,
"step": 1510
},
{
"epoch": 3.6542393265183404,
"grad_norm": 0.20630541443824768,
"learning_rate": 0.00012697115384615386,
"loss": 0.0918,
"mean_token_accuracy": 0.9665474951267242,
"num_tokens": 1555712.0,
"step": 1520
},
{
"epoch": 3.6782922429344556,
"grad_norm": 0.21548768877983093,
"learning_rate": 0.00012649038461538462,
"loss": 0.0942,
"mean_token_accuracy": 0.9646090790629387,
"num_tokens": 1565952.0,
"step": 1530
},
{
"epoch": 3.702345159350571,
"grad_norm": 0.21956314146518707,
"learning_rate": 0.0001260096153846154,
"loss": 0.0963,
"mean_token_accuracy": 0.9646272197365761,
"num_tokens": 1576192.0,
"step": 1540
},
{
"epoch": 3.726398075766687,
"grad_norm": 0.22929218411445618,
"learning_rate": 0.00012552884615384618,
"loss": 0.0946,
"mean_token_accuracy": 0.9660244584083557,
"num_tokens": 1586432.0,
"step": 1550
},
{
"epoch": 3.7504509921828024,
"grad_norm": 0.20628125965595245,
"learning_rate": 0.00012504807692307693,
"loss": 0.0939,
"mean_token_accuracy": 0.9671428605914116,
"num_tokens": 1596672.0,
"step": 1560
},
{
"epoch": 3.7745039085989176,
"grad_norm": 0.1761239767074585,
"learning_rate": 0.00012456730769230768,
"loss": 0.0985,
"mean_token_accuracy": 0.9652422875165939,
"num_tokens": 1606912.0,
"step": 1570
},
{
"epoch": 3.7985568250150332,
"grad_norm": 0.20456120371818542,
"learning_rate": 0.00012408653846153846,
"loss": 0.0992,
"mean_token_accuracy": 0.9635955929756165,
"num_tokens": 1617152.0,
"step": 1580
},
{
"epoch": 3.8226097414311484,
"grad_norm": 0.24852736294269562,
"learning_rate": 0.00012360576923076922,
"loss": 0.0969,
"mean_token_accuracy": 0.9656632468104362,
"num_tokens": 1627392.0,
"step": 1590
},
{
"epoch": 3.846662657847264,
"grad_norm": 0.2334773987531662,
"learning_rate": 0.000123125,
"loss": 0.0945,
"mean_token_accuracy": 0.9682855114340783,
"num_tokens": 1637632.0,
"step": 1600
},
{
"epoch": 3.8707155742633796,
"grad_norm": 0.2439630627632141,
"learning_rate": 0.00012264423076923078,
"loss": 0.0945,
"mean_token_accuracy": 0.9671804904937744,
"num_tokens": 1647872.0,
"step": 1610
},
{
"epoch": 3.894768490679495,
"grad_norm": 0.21514557301998138,
"learning_rate": 0.00012216346153846153,
"loss": 0.0952,
"mean_token_accuracy": 0.9666045814752579,
"num_tokens": 1658112.0,
"step": 1620
},
{
"epoch": 3.9188214070956104,
"grad_norm": 0.16191978752613068,
"learning_rate": 0.00012168269230769231,
"loss": 0.097,
"mean_token_accuracy": 0.966627161204815,
"num_tokens": 1668352.0,
"step": 1630
},
{
"epoch": 3.9428743235117256,
"grad_norm": 0.17086118459701538,
"learning_rate": 0.00012120192307692308,
"loss": 0.0923,
"mean_token_accuracy": 0.9672711491584778,
"num_tokens": 1678592.0,
"step": 1640
},
{
"epoch": 3.9669272399278412,
"grad_norm": 0.19103848934173584,
"learning_rate": 0.00012072115384615386,
"loss": 0.0921,
"mean_token_accuracy": 0.9661801844835282,
"num_tokens": 1688832.0,
"step": 1650
},
{
"epoch": 3.990980156343957,
"grad_norm": 0.22176852822303772,
"learning_rate": 0.00012024038461538463,
"loss": 0.0917,
"mean_token_accuracy": 0.9642520830035209,
"num_tokens": 1699072.0,
"step": 1660
},
{
"epoch": 4.0,
"eval_loss": 0.12371965497732162,
"eval_mean_token_accuracy": 0.9598764219182603,
"eval_num_tokens": 1702912.0,
"eval_runtime": 25.0613,
"eval_samples_per_second": 14.764,
"eval_steps_per_second": 1.875,
"step": 1664
},
{
"epoch": 4.0144317498496696,
"grad_norm": 0.1616247445344925,
"learning_rate": 0.0001197596153846154,
"loss": 0.0846,
"mean_token_accuracy": 0.9689040917616624,
"num_tokens": 1709056.0,
"step": 1670
},
{
"epoch": 4.038484666265784,
"grad_norm": 0.1974208503961563,
"learning_rate": 0.00011927884615384617,
"loss": 0.0902,
"mean_token_accuracy": 0.9659950643777847,
"num_tokens": 1719296.0,
"step": 1680
},
{
"epoch": 4.0625375826819,
"grad_norm": 0.19191399216651917,
"learning_rate": 0.00011879807692307694,
"loss": 0.0876,
"mean_token_accuracy": 0.9678742095828057,
"num_tokens": 1729536.0,
"step": 1690
},
{
"epoch": 4.0865904990980155,
"grad_norm": 0.17952245473861694,
"learning_rate": 0.0001183173076923077,
"loss": 0.0935,
"mean_token_accuracy": 0.9667584493756294,
"num_tokens": 1739776.0,
"step": 1700
},
{
"epoch": 4.110643415514131,
"grad_norm": 0.2491922825574875,
"learning_rate": 0.00011783653846153846,
"loss": 0.0829,
"mean_token_accuracy": 0.9714985772967338,
"num_tokens": 1750016.0,
"step": 1710
},
{
"epoch": 4.134696331930247,
"grad_norm": 0.23436853289604187,
"learning_rate": 0.00011735576923076923,
"loss": 0.0884,
"mean_token_accuracy": 0.9674542516469955,
"num_tokens": 1760256.0,
"step": 1720
},
{
"epoch": 4.158749248346362,
"grad_norm": 0.21396978199481964,
"learning_rate": 0.000116875,
"loss": 0.0871,
"mean_token_accuracy": 0.9674944341182709,
"num_tokens": 1770496.0,
"step": 1730
},
{
"epoch": 4.182802164762477,
"grad_norm": 0.15618039667606354,
"learning_rate": 0.00011639423076923078,
"loss": 0.0762,
"mean_token_accuracy": 0.97198745906353,
"num_tokens": 1780736.0,
"step": 1740
},
{
"epoch": 4.206855081178593,
"grad_norm": 0.16134659945964813,
"learning_rate": 0.00011591346153846154,
"loss": 0.093,
"mean_token_accuracy": 0.9636217251420021,
"num_tokens": 1790976.0,
"step": 1750
},
{
"epoch": 4.230907997594708,
"grad_norm": 0.12519049644470215,
"learning_rate": 0.00011543269230769231,
"loss": 0.0817,
"mean_token_accuracy": 0.9709686204791069,
"num_tokens": 1801216.0,
"step": 1760
},
{
"epoch": 4.254960914010824,
"grad_norm": 0.17119652032852173,
"learning_rate": 0.00011495192307692309,
"loss": 0.0821,
"mean_token_accuracy": 0.9688223898410797,
"num_tokens": 1811456.0,
"step": 1770
},
{
"epoch": 4.27901383042694,
"grad_norm": 0.13762086629867554,
"learning_rate": 0.00011447115384615386,
"loss": 0.0877,
"mean_token_accuracy": 0.9673933520913124,
"num_tokens": 1821696.0,
"step": 1780
},
{
"epoch": 4.303066746843054,
"grad_norm": 0.22409266233444214,
"learning_rate": 0.00011399038461538462,
"loss": 0.0907,
"mean_token_accuracy": 0.9678563743829727,
"num_tokens": 1831936.0,
"step": 1790
},
{
"epoch": 4.32711966325917,
"grad_norm": 0.21305358409881592,
"learning_rate": 0.0001135096153846154,
"loss": 0.0845,
"mean_token_accuracy": 0.9678771600127221,
"num_tokens": 1842176.0,
"step": 1800
},
{
"epoch": 4.351172579675286,
"grad_norm": 0.24969004094600677,
"learning_rate": 0.00011302884615384617,
"loss": 0.0896,
"mean_token_accuracy": 0.967383436858654,
"num_tokens": 1852416.0,
"step": 1810
},
{
"epoch": 4.375225496091401,
"grad_norm": 0.22428163886070251,
"learning_rate": 0.00011254807692307694,
"loss": 0.0895,
"mean_token_accuracy": 0.9665916368365288,
"num_tokens": 1862656.0,
"step": 1820
},
{
"epoch": 4.399278412507517,
"grad_norm": 0.2337762862443924,
"learning_rate": 0.00011206730769230769,
"loss": 0.0926,
"mean_token_accuracy": 0.964175409078598,
"num_tokens": 1872896.0,
"step": 1830
},
{
"epoch": 4.4233313289236325,
"grad_norm": 0.24770590662956238,
"learning_rate": 0.00011158653846153846,
"loss": 0.0916,
"mean_token_accuracy": 0.9662819102406501,
"num_tokens": 1883136.0,
"step": 1840
},
{
"epoch": 4.447384245339747,
"grad_norm": 0.2134462147951126,
"learning_rate": 0.00011110576923076923,
"loss": 0.0881,
"mean_token_accuracy": 0.9670575857162476,
"num_tokens": 1893376.0,
"step": 1850
},
{
"epoch": 4.471437161755863,
"grad_norm": 0.16387952864170074,
"learning_rate": 0.000110625,
"loss": 0.0871,
"mean_token_accuracy": 0.9688689664006234,
"num_tokens": 1903616.0,
"step": 1860
},
{
"epoch": 4.495490078171978,
"grad_norm": 0.1718788743019104,
"learning_rate": 0.00011014423076923077,
"loss": 0.0887,
"mean_token_accuracy": 0.9679651662707329,
"num_tokens": 1913856.0,
"step": 1870
},
{
"epoch": 4.519542994588094,
"grad_norm": 0.13868968188762665,
"learning_rate": 0.00010966346153846154,
"loss": 0.0869,
"mean_token_accuracy": 0.9675926879048348,
"num_tokens": 1924096.0,
"step": 1880
},
{
"epoch": 4.543595911004209,
"grad_norm": 0.23226922750473022,
"learning_rate": 0.00010918269230769232,
"loss": 0.0893,
"mean_token_accuracy": 0.9671968877315521,
"num_tokens": 1934336.0,
"step": 1890
},
{
"epoch": 4.567648827420324,
"grad_norm": 0.2364928126335144,
"learning_rate": 0.00010870192307692309,
"loss": 0.0868,
"mean_token_accuracy": 0.9687703013420105,
"num_tokens": 1944576.0,
"step": 1900
},
{
"epoch": 4.59170174383644,
"grad_norm": 0.46209245920181274,
"learning_rate": 0.00010822115384615385,
"loss": 0.0908,
"mean_token_accuracy": 0.9664398193359375,
"num_tokens": 1954816.0,
"step": 1910
},
{
"epoch": 4.615754660252556,
"grad_norm": 0.164833664894104,
"learning_rate": 0.00010774038461538462,
"loss": 0.0848,
"mean_token_accuracy": 0.9686171919107437,
"num_tokens": 1965056.0,
"step": 1920
},
{
"epoch": 4.639807576668671,
"grad_norm": 0.2000456005334854,
"learning_rate": 0.0001072596153846154,
"loss": 0.0847,
"mean_token_accuracy": 0.9683021917939186,
"num_tokens": 1975296.0,
"step": 1930
},
{
"epoch": 4.663860493084787,
"grad_norm": 0.1978590190410614,
"learning_rate": 0.00010677884615384617,
"loss": 0.0881,
"mean_token_accuracy": 0.9670929208397865,
"num_tokens": 1985536.0,
"step": 1940
},
{
"epoch": 4.687913409500902,
"grad_norm": 0.3182586133480072,
"learning_rate": 0.00010629807692307694,
"loss": 0.0913,
"mean_token_accuracy": 0.9663867846131324,
"num_tokens": 1995776.0,
"step": 1950
},
{
"epoch": 4.711966325917017,
"grad_norm": 0.18092605471611023,
"learning_rate": 0.00010581730769230769,
"loss": 0.0905,
"mean_token_accuracy": 0.9664200291037559,
"num_tokens": 2006016.0,
"step": 1960
},
{
"epoch": 4.736019242333133,
"grad_norm": 0.17231811583042145,
"learning_rate": 0.00010533653846153846,
"loss": 0.0859,
"mean_token_accuracy": 0.968836921453476,
"num_tokens": 2016256.0,
"step": 1970
},
{
"epoch": 4.7600721587492485,
"grad_norm": 0.15652543306350708,
"learning_rate": 0.00010485576923076924,
"loss": 0.0834,
"mean_token_accuracy": 0.9684307157993317,
"num_tokens": 2026496.0,
"step": 1980
},
{
"epoch": 4.784125075165364,
"grad_norm": 0.13773974776268005,
"learning_rate": 0.000104375,
"loss": 0.0868,
"mean_token_accuracy": 0.970444829761982,
"num_tokens": 2036736.0,
"step": 1990
},
{
"epoch": 4.808177991581479,
"grad_norm": 0.239529550075531,
"learning_rate": 0.00010389423076923077,
"loss": 0.0824,
"mean_token_accuracy": 0.9694223091006279,
"num_tokens": 2046976.0,
"step": 2000
},
{
"epoch": 4.8322309079975945,
"grad_norm": 0.18709523975849152,
"learning_rate": 0.00010341346153846154,
"loss": 0.0949,
"mean_token_accuracy": 0.965866394340992,
"num_tokens": 2057216.0,
"step": 2010
},
{
"epoch": 4.85628382441371,
"grad_norm": 0.2615446448326111,
"learning_rate": 0.00010293269230769232,
"loss": 0.083,
"mean_token_accuracy": 0.9698115825653076,
"num_tokens": 2067456.0,
"step": 2020
},
{
"epoch": 4.880336740829826,
"grad_norm": 0.1910138875246048,
"learning_rate": 0.00010245192307692308,
"loss": 0.0907,
"mean_token_accuracy": 0.9668553739786148,
"num_tokens": 2077696.0,
"step": 2030
},
{
"epoch": 4.904389657245941,
"grad_norm": 0.19025610387325287,
"learning_rate": 0.00010197115384615385,
"loss": 0.093,
"mean_token_accuracy": 0.9664333924651146,
"num_tokens": 2087936.0,
"step": 2040
},
{
"epoch": 4.928442573662057,
"grad_norm": 0.35547563433647156,
"learning_rate": 0.00010149038461538463,
"loss": 0.0863,
"mean_token_accuracy": 0.967856514453888,
"num_tokens": 2098176.0,
"step": 2050
},
{
"epoch": 4.952495490078172,
"grad_norm": 0.15040579438209534,
"learning_rate": 0.0001010096153846154,
"loss": 0.0823,
"mean_token_accuracy": 0.9676536738872528,
"num_tokens": 2108416.0,
"step": 2060
},
{
"epoch": 4.976548406494287,
"grad_norm": 0.24159961938858032,
"learning_rate": 0.00010052884615384617,
"loss": 0.0853,
"mean_token_accuracy": 0.9684896349906922,
"num_tokens": 2118656.0,
"step": 2070
},
{
"epoch": 5.0,
"grad_norm": 0.21156367659568787,
"learning_rate": 0.00010004807692307693,
"loss": 0.0872,
"mean_token_accuracy": 0.9677492471841666,
"num_tokens": 2128640.0,
"step": 2080
},
{
"epoch": 5.0,
"eval_loss": 0.12297015637159348,
"eval_mean_token_accuracy": 0.960414043132295,
"eval_num_tokens": 2128640.0,
"eval_runtime": 26.101,
"eval_samples_per_second": 14.176,
"eval_steps_per_second": 1.801,
"step": 2080
}
],
"logging_steps": 10,
"max_steps": 4160,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.631112336965632e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}