{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0669380370872907,
"eval_steps": 800,
"global_step": 2360,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0004522840343735866,
"eval_loss": 2.6005213260650635,
"eval_runtime": 106.0609,
"eval_samples_per_second": 20.79,
"eval_steps_per_second": 5.205,
"step": 1
},
{
"epoch": 0.004522840343735866,
"grad_norm": 4.152566909790039,
"learning_rate": 2.5e-06,
"loss": 2.461,
"step": 10
},
{
"epoch": 0.009045680687471733,
"grad_norm": 3.253302812576294,
"learning_rate": 5e-06,
"loss": 2.1663,
"step": 20
},
{
"epoch": 0.013568521031207599,
"grad_norm": 3.0376248359680176,
"learning_rate": 7.5e-06,
"loss": 1.5723,
"step": 30
},
{
"epoch": 0.018091361374943465,
"grad_norm": 1.5493546724319458,
"learning_rate": 1e-05,
"loss": 1.0123,
"step": 40
},
{
"epoch": 0.022614201718679332,
"grad_norm": 0.7575089335441589,
"learning_rate": 1.25e-05,
"loss": 0.7936,
"step": 50
},
{
"epoch": 0.027137042062415198,
"grad_norm": 1.0295528173446655,
"learning_rate": 1.5e-05,
"loss": 0.688,
"step": 60
},
{
"epoch": 0.031659882406151064,
"grad_norm": 0.5441007614135742,
"learning_rate": 1.75e-05,
"loss": 0.6524,
"step": 70
},
{
"epoch": 0.03618272274988693,
"grad_norm": 0.8951964974403381,
"learning_rate": 2e-05,
"loss": 0.6575,
"step": 80
},
{
"epoch": 0.0407055630936228,
"grad_norm": 0.7289975881576538,
"learning_rate": 2.25e-05,
"loss": 0.6334,
"step": 90
},
{
"epoch": 0.045228403437358664,
"grad_norm": 0.8988147377967834,
"learning_rate": 2.5e-05,
"loss": 0.6591,
"step": 100
},
{
"epoch": 0.04975124378109453,
"grad_norm": 0.4641079902648926,
"learning_rate": 2.7500000000000004e-05,
"loss": 0.6724,
"step": 110
},
{
"epoch": 0.054274084124830396,
"grad_norm": 0.6655707955360413,
"learning_rate": 3e-05,
"loss": 0.6467,
"step": 120
},
{
"epoch": 0.05879692446856626,
"grad_norm": 0.7066192030906677,
"learning_rate": 3.2500000000000004e-05,
"loss": 0.6202,
"step": 130
},
{
"epoch": 0.06331976481230213,
"grad_norm": 0.9096384644508362,
"learning_rate": 3.5e-05,
"loss": 0.6344,
"step": 140
},
{
"epoch": 0.06784260515603799,
"grad_norm": 0.6074104905128479,
"learning_rate": 3.7500000000000003e-05,
"loss": 0.5695,
"step": 150
},
{
"epoch": 0.07236544549977386,
"grad_norm": 0.8516980409622192,
"learning_rate": 4e-05,
"loss": 0.5924,
"step": 160
},
{
"epoch": 0.07688828584350972,
"grad_norm": 0.7963458299636841,
"learning_rate": 4.25e-05,
"loss": 0.6169,
"step": 170
},
{
"epoch": 0.0814111261872456,
"grad_norm": 0.8972164988517761,
"learning_rate": 4.5e-05,
"loss": 0.5872,
"step": 180
},
{
"epoch": 0.08593396653098145,
"grad_norm": 4.4737162590026855,
"learning_rate": 4.75e-05,
"loss": 0.6164,
"step": 190
},
{
"epoch": 0.09045680687471733,
"grad_norm": 0.6346756815910339,
"learning_rate": 5e-05,
"loss": 0.5744,
"step": 200
},
{
"epoch": 0.09497964721845319,
"grad_norm": 0.8093542456626892,
"learning_rate": 5.25e-05,
"loss": 0.5741,
"step": 210
},
{
"epoch": 0.09950248756218906,
"grad_norm": 0.8128954768180847,
"learning_rate": 5.500000000000001e-05,
"loss": 0.5842,
"step": 220
},
{
"epoch": 0.10402532790592492,
"grad_norm": 0.5212199687957764,
"learning_rate": 5.7499999999999995e-05,
"loss": 0.5368,
"step": 230
},
{
"epoch": 0.10854816824966079,
"grad_norm": 0.7929616570472717,
"learning_rate": 6e-05,
"loss": 0.5645,
"step": 240
},
{
"epoch": 0.11307100859339665,
"grad_norm": 0.6961472630500793,
"learning_rate": 6.25e-05,
"loss": 0.5541,
"step": 250
},
{
"epoch": 0.11759384893713253,
"grad_norm": 0.6359185576438904,
"learning_rate": 6.500000000000001e-05,
"loss": 0.5642,
"step": 260
},
{
"epoch": 0.12211668928086838,
"grad_norm": 0.6574355959892273,
"learning_rate": 6.750000000000001e-05,
"loss": 0.5556,
"step": 270
},
{
"epoch": 0.12663952962460426,
"grad_norm": 0.5738745331764221,
"learning_rate": 7e-05,
"loss": 0.5582,
"step": 280
},
{
"epoch": 0.13116236996834013,
"grad_norm": 0.3199772834777832,
"learning_rate": 7.25e-05,
"loss": 0.5796,
"step": 290
},
{
"epoch": 0.13568521031207598,
"grad_norm": 0.3531240224838257,
"learning_rate": 7.500000000000001e-05,
"loss": 0.5612,
"step": 300
},
{
"epoch": 0.14020805065581185,
"grad_norm": 0.5375868678092957,
"learning_rate": 7.75e-05,
"loss": 0.5645,
"step": 310
},
{
"epoch": 0.14473089099954772,
"grad_norm": 0.2688968777656555,
"learning_rate": 8e-05,
"loss": 0.5485,
"step": 320
},
{
"epoch": 0.14925373134328357,
"grad_norm": 0.6451207995414734,
"learning_rate": 8.25e-05,
"loss": 0.5242,
"step": 330
},
{
"epoch": 0.15377657168701944,
"grad_norm": 2.1042656898498535,
"learning_rate": 8.5e-05,
"loss": 0.5639,
"step": 340
},
{
"epoch": 0.15829941203075532,
"grad_norm": 0.38682234287261963,
"learning_rate": 8.75e-05,
"loss": 0.5417,
"step": 350
},
{
"epoch": 0.1628222523744912,
"grad_norm": 0.5062561631202698,
"learning_rate": 9e-05,
"loss": 0.5509,
"step": 360
},
{
"epoch": 0.16734509271822703,
"grad_norm": 0.6663705706596375,
"learning_rate": 9.250000000000001e-05,
"loss": 0.557,
"step": 370
},
{
"epoch": 0.1718679330619629,
"grad_norm": 0.4430944323539734,
"learning_rate": 9.5e-05,
"loss": 0.5711,
"step": 380
},
{
"epoch": 0.17639077340569878,
"grad_norm": 0.6567730903625488,
"learning_rate": 9.75e-05,
"loss": 0.5437,
"step": 390
},
{
"epoch": 0.18091361374943465,
"grad_norm": 0.44569894671440125,
"learning_rate": 0.0001,
"loss": 0.5124,
"step": 400
},
{
"epoch": 0.1854364540931705,
"grad_norm": 3.6767895221710205,
"learning_rate": 9.999957281897735e-05,
"loss": 0.5489,
"step": 410
},
{
"epoch": 0.18995929443690637,
"grad_norm": 1.6086912155151367,
"learning_rate": 9.999829128320874e-05,
"loss": 0.5508,
"step": 420
},
{
"epoch": 0.19448213478064225,
"grad_norm": 0.42233482003211975,
"learning_rate": 9.999615541459207e-05,
"loss": 0.5205,
"step": 430
},
{
"epoch": 0.19900497512437812,
"grad_norm": 0.5907697677612305,
"learning_rate": 9.999316524962345e-05,
"loss": 0.5201,
"step": 440
},
{
"epoch": 0.20352781546811397,
"grad_norm": 0.5699201226234436,
"learning_rate": 9.998932083939656e-05,
"loss": 0.5263,
"step": 450
},
{
"epoch": 0.20805065581184984,
"grad_norm": 0.8701225519180298,
"learning_rate": 9.998462224960175e-05,
"loss": 0.5421,
"step": 460
},
{
"epoch": 0.2125734961555857,
"grad_norm": 0.38067907094955444,
"learning_rate": 9.997906956052494e-05,
"loss": 0.5388,
"step": 470
},
{
"epoch": 0.21709633649932158,
"grad_norm": 0.3497803509235382,
"learning_rate": 9.997266286704631e-05,
"loss": 0.5216,
"step": 480
},
{
"epoch": 0.22161917684305743,
"grad_norm": 1.2132240533828735,
"learning_rate": 9.996540227863854e-05,
"loss": 0.5204,
"step": 490
},
{
"epoch": 0.2261420171867933,
"grad_norm": 0.30633747577667236,
"learning_rate": 9.995728791936504e-05,
"loss": 0.5494,
"step": 500
},
{
"epoch": 0.23066485753052918,
"grad_norm": 0.6719753742218018,
"learning_rate": 9.994831992787788e-05,
"loss": 0.5489,
"step": 510
},
{
"epoch": 0.23518769787426505,
"grad_norm": 0.5767594575881958,
"learning_rate": 9.993849845741524e-05,
"loss": 0.5324,
"step": 520
},
{
"epoch": 0.2397105382180009,
"grad_norm": 0.236255943775177,
"learning_rate": 9.992782367579899e-05,
"loss": 0.5181,
"step": 530
},
{
"epoch": 0.24423337856173677,
"grad_norm": 0.788173258304596,
"learning_rate": 9.991629576543163e-05,
"loss": 0.5303,
"step": 540
},
{
"epoch": 0.24875621890547264,
"grad_norm": 0.2152377963066101,
"learning_rate": 9.990391492329341e-05,
"loss": 0.5284,
"step": 550
},
{
"epoch": 0.2532790592492085,
"grad_norm": 0.7022669911384583,
"learning_rate": 9.989068136093873e-05,
"loss": 0.5282,
"step": 560
},
{
"epoch": 0.25780189959294436,
"grad_norm": 0.5153821110725403,
"learning_rate": 9.987659530449268e-05,
"loss": 0.5221,
"step": 570
},
{
"epoch": 0.26232473993668026,
"grad_norm": 0.3221074640750885,
"learning_rate": 9.986165699464705e-05,
"loss": 0.5202,
"step": 580
},
{
"epoch": 0.2668475802804161,
"grad_norm": 0.22985632717609406,
"learning_rate": 9.98458666866564e-05,
"loss": 0.5148,
"step": 590
},
{
"epoch": 0.27137042062415195,
"grad_norm": 0.5174903869628906,
"learning_rate": 9.98292246503335e-05,
"loss": 0.5389,
"step": 600
},
{
"epoch": 0.27589326096788785,
"grad_norm": 0.41488873958587646,
"learning_rate": 9.981173117004484e-05,
"loss": 0.514,
"step": 610
},
{
"epoch": 0.2804161013116237,
"grad_norm": 0.4851066768169403,
"learning_rate": 9.979338654470569e-05,
"loss": 0.5268,
"step": 620
},
{
"epoch": 0.28493894165535955,
"grad_norm": 0.6160866618156433,
"learning_rate": 9.977419108777514e-05,
"loss": 0.5216,
"step": 630
},
{
"epoch": 0.28946178199909545,
"grad_norm": 0.47816914319992065,
"learning_rate": 9.975414512725057e-05,
"loss": 0.5058,
"step": 640
},
{
"epoch": 0.2939846223428313,
"grad_norm": 0.47290515899658203,
"learning_rate": 9.973324900566213e-05,
"loss": 0.5233,
"step": 650
},
{
"epoch": 0.29850746268656714,
"grad_norm": 0.5466296076774597,
"learning_rate": 9.97115030800669e-05,
"loss": 0.5208,
"step": 660
},
{
"epoch": 0.30303030303030304,
"grad_norm": 0.1515071988105774,
"learning_rate": 9.968890772204271e-05,
"loss": 0.5153,
"step": 670
},
{
"epoch": 0.3075531433740389,
"grad_norm": 0.22212214767932892,
"learning_rate": 9.966546331768191e-05,
"loss": 0.4956,
"step": 680
},
{
"epoch": 0.3120759837177748,
"grad_norm": 0.5279558300971985,
"learning_rate": 9.96411702675847e-05,
"loss": 0.5137,
"step": 690
},
{
"epoch": 0.31659882406151063,
"grad_norm": 0.49932244420051575,
"learning_rate": 9.961602898685226e-05,
"loss": 0.5226,
"step": 700
},
{
"epoch": 0.3211216644052465,
"grad_norm": 0.4041373133659363,
"learning_rate": 9.959003990507972e-05,
"loss": 0.5063,
"step": 710
},
{
"epoch": 0.3256445047489824,
"grad_norm": 0.22287319600582123,
"learning_rate": 9.956320346634876e-05,
"loss": 0.4965,
"step": 720
},
{
"epoch": 0.3301673450927182,
"grad_norm": 0.379319429397583,
"learning_rate": 9.953552012922012e-05,
"loss": 0.4926,
"step": 730
},
{
"epoch": 0.33469018543645407,
"grad_norm": 0.4465825855731964,
"learning_rate": 9.950699036672559e-05,
"loss": 0.5295,
"step": 740
},
{
"epoch": 0.33921302578018997,
"grad_norm": 0.8027335405349731,
"learning_rate": 9.947761466636014e-05,
"loss": 0.5249,
"step": 750
},
{
"epoch": 0.3437358661239258,
"grad_norm": 0.4452328383922577,
"learning_rate": 9.944739353007344e-05,
"loss": 0.5168,
"step": 760
},
{
"epoch": 0.3482587064676617,
"grad_norm": 0.5175443291664124,
"learning_rate": 9.941632747426129e-05,
"loss": 0.5063,
"step": 770
},
{
"epoch": 0.35278154681139756,
"grad_norm": 0.6492401361465454,
"learning_rate": 9.938441702975689e-05,
"loss": 0.5028,
"step": 780
},
{
"epoch": 0.3573043871551334,
"grad_norm": 0.28473520278930664,
"learning_rate": 9.93516627418217e-05,
"loss": 0.502,
"step": 790
},
{
"epoch": 0.3618272274988693,
"grad_norm": 0.29608818888664246,
"learning_rate": 9.931806517013612e-05,
"loss": 0.503,
"step": 800
},
{
"epoch": 0.3618272274988693,
"eval_loss": 0.5416414737701416,
"eval_runtime": 106.1622,
"eval_samples_per_second": 20.77,
"eval_steps_per_second": 5.2,
"step": 800
},
{
"epoch": 0.36635006784260515,
"grad_norm": 0.13932587206363678,
"learning_rate": 9.928362488878996e-05,
"loss": 0.5012,
"step": 810
},
{
"epoch": 0.370872908186341,
"grad_norm": 0.5137602686882019,
"learning_rate": 9.92483424862726e-05,
"loss": 0.5183,
"step": 820
},
{
"epoch": 0.3753957485300769,
"grad_norm": 0.33779120445251465,
"learning_rate": 9.921221856546293e-05,
"loss": 0.5007,
"step": 830
},
{
"epoch": 0.37991858887381275,
"grad_norm": 0.23543624579906464,
"learning_rate": 9.917525374361912e-05,
"loss": 0.5078,
"step": 840
},
{
"epoch": 0.38444142921754865,
"grad_norm": 0.4530757665634155,
"learning_rate": 9.913744865236798e-05,
"loss": 0.519,
"step": 850
},
{
"epoch": 0.3889642695612845,
"grad_norm": 0.29827597737312317,
"learning_rate": 9.90988039376942e-05,
"loss": 0.5017,
"step": 860
},
{
"epoch": 0.39348710990502034,
"grad_norm": 0.8379443883895874,
"learning_rate": 9.905932025992932e-05,
"loss": 0.517,
"step": 870
},
{
"epoch": 0.39800995024875624,
"grad_norm": 0.3807092607021332,
"learning_rate": 9.901899829374047e-05,
"loss": 0.5118,
"step": 880
},
{
"epoch": 0.4025327905924921,
"grad_norm": 0.3721909523010254,
"learning_rate": 9.897783872811882e-05,
"loss": 0.512,
"step": 890
},
{
"epoch": 0.40705563093622793,
"grad_norm": 0.16293685138225555,
"learning_rate": 9.893584226636772e-05,
"loss": 0.4883,
"step": 900
},
{
"epoch": 0.41157847127996383,
"grad_norm": 0.2745698094367981,
"learning_rate": 9.88930096260909e-05,
"loss": 0.4878,
"step": 910
},
{
"epoch": 0.4161013116236997,
"grad_norm": 0.2861226499080658,
"learning_rate": 9.884934153917997e-05,
"loss": 0.4861,
"step": 920
},
{
"epoch": 0.4206241519674355,
"grad_norm": 0.14675496518611908,
"learning_rate": 9.880483875180205e-05,
"loss": 0.502,
"step": 930
},
{
"epoch": 0.4251469923111714,
"grad_norm": 0.3355661928653717,
"learning_rate": 9.8759502024387e-05,
"loss": 0.4769,
"step": 940
},
{
"epoch": 0.42966983265490727,
"grad_norm": 0.3499309718608856,
"learning_rate": 9.871333213161438e-05,
"loss": 0.5044,
"step": 950
},
{
"epoch": 0.43419267299864317,
"grad_norm": 0.13604167103767395,
"learning_rate": 9.86663298624003e-05,
"loss": 0.4837,
"step": 960
},
{
"epoch": 0.438715513342379,
"grad_norm": 0.42066818475723267,
"learning_rate": 9.861849601988383e-05,
"loss": 0.4768,
"step": 970
},
{
"epoch": 0.44323835368611486,
"grad_norm": 0.2138914316892624,
"learning_rate": 9.856983142141339e-05,
"loss": 0.5059,
"step": 980
},
{
"epoch": 0.44776119402985076,
"grad_norm": 0.3707284927368164,
"learning_rate": 9.852033689853267e-05,
"loss": 0.5059,
"step": 990
},
{
"epoch": 0.4522840343735866,
"grad_norm": 0.6477210521697998,
"learning_rate": 9.847001329696653e-05,
"loss": 0.4977,
"step": 1000
},
{
"epoch": 0.45680687471732245,
"grad_norm": 0.14800307154655457,
"learning_rate": 9.841886147660645e-05,
"loss": 0.4976,
"step": 1010
},
{
"epoch": 0.46132971506105835,
"grad_norm": 0.47785401344299316,
"learning_rate": 9.836688231149592e-05,
"loss": 0.5012,
"step": 1020
},
{
"epoch": 0.4658525554047942,
"grad_norm": 0.12753424048423767,
"learning_rate": 9.831407668981546e-05,
"loss": 0.4982,
"step": 1030
},
{
"epoch": 0.4703753957485301,
"grad_norm": 0.2438424527645111,
"learning_rate": 9.826044551386744e-05,
"loss": 0.5074,
"step": 1040
},
{
"epoch": 0.47489823609226595,
"grad_norm": 1.7088356018066406,
"learning_rate": 9.820598970006069e-05,
"loss": 0.5032,
"step": 1050
},
{
"epoch": 0.4794210764360018,
"grad_norm": 0.4954736530780792,
"learning_rate": 9.815071017889482e-05,
"loss": 0.5021,
"step": 1060
},
{
"epoch": 0.4839439167797377,
"grad_norm": 0.2744172513484955,
"learning_rate": 9.809460789494432e-05,
"loss": 0.5182,
"step": 1070
},
{
"epoch": 0.48846675712347354,
"grad_norm": 0.3839784264564514,
"learning_rate": 9.803768380684242e-05,
"loss": 0.4917,
"step": 1080
},
{
"epoch": 0.4929895974672094,
"grad_norm": 0.1647842675447464,
"learning_rate": 9.797993888726473e-05,
"loss": 0.5027,
"step": 1090
},
{
"epoch": 0.4975124378109453,
"grad_norm": 0.3968697190284729,
"learning_rate": 9.792137412291265e-05,
"loss": 0.4932,
"step": 1100
},
{
"epoch": 0.5020352781546812,
"grad_norm": 0.28644707798957825,
"learning_rate": 9.786199051449636e-05,
"loss": 0.4935,
"step": 1110
},
{
"epoch": 0.506558118498417,
"grad_norm": 0.4823949933052063,
"learning_rate": 9.780178907671789e-05,
"loss": 0.4999,
"step": 1120
},
{
"epoch": 0.5110809588421529,
"grad_norm": 0.4848293364048004,
"learning_rate": 9.774077083825372e-05,
"loss": 0.5054,
"step": 1130
},
{
"epoch": 0.5156037991858887,
"grad_norm": 0.15839064121246338,
"learning_rate": 9.767893684173721e-05,
"loss": 0.4789,
"step": 1140
},
{
"epoch": 0.5201266395296246,
"grad_norm": 0.34784838557243347,
"learning_rate": 9.761628814374073e-05,
"loss": 0.4942,
"step": 1150
},
{
"epoch": 0.5246494798733605,
"grad_norm": 0.18030278384685516,
"learning_rate": 9.755282581475769e-05,
"loss": 0.4813,
"step": 1160
},
{
"epoch": 0.5291723202170964,
"grad_norm": 0.43726080656051636,
"learning_rate": 9.748855093918417e-05,
"loss": 0.5017,
"step": 1170
},
{
"epoch": 0.5336951605608322,
"grad_norm": 0.22014489769935608,
"learning_rate": 9.742346461530048e-05,
"loss": 0.5077,
"step": 1180
},
{
"epoch": 0.5382180009045681,
"grad_norm": 0.2954523265361786,
"learning_rate": 9.735756795525231e-05,
"loss": 0.4936,
"step": 1190
},
{
"epoch": 0.5427408412483039,
"grad_norm": 0.1660991609096527,
"learning_rate": 9.729086208503174e-05,
"loss": 0.4889,
"step": 1200
},
{
"epoch": 0.5472636815920398,
"grad_norm": 0.4176950454711914,
"learning_rate": 9.722334814445809e-05,
"loss": 0.5041,
"step": 1210
},
{
"epoch": 0.5517865219357757,
"grad_norm": 0.11009762436151505,
"learning_rate": 9.715502728715826e-05,
"loss": 0.4917,
"step": 1220
},
{
"epoch": 0.5563093622795116,
"grad_norm": 0.652611255645752,
"learning_rate": 9.708590068054728e-05,
"loss": 0.4923,
"step": 1230
},
{
"epoch": 0.5608322026232474,
"grad_norm": 0.16778019070625305,
"learning_rate": 9.701596950580806e-05,
"loss": 0.5064,
"step": 1240
},
{
"epoch": 0.5653550429669832,
"grad_norm": 0.3204459547996521,
"learning_rate": 9.694523495787149e-05,
"loss": 0.4911,
"step": 1250
},
{
"epoch": 0.5698778833107191,
"grad_norm": 0.4674127399921417,
"learning_rate": 9.687369824539577e-05,
"loss": 0.4971,
"step": 1260
},
{
"epoch": 0.574400723654455,
"grad_norm": 0.12897251546382904,
"learning_rate": 9.680136059074598e-05,
"loss": 0.4798,
"step": 1270
},
{
"epoch": 0.5789235639981909,
"grad_norm": 0.15009181201457977,
"learning_rate": 9.672822322997305e-05,
"loss": 0.4895,
"step": 1280
},
{
"epoch": 0.5834464043419267,
"grad_norm": 0.304299920797348,
"learning_rate": 9.665428741279266e-05,
"loss": 0.498,
"step": 1290
},
{
"epoch": 0.5879692446856626,
"grad_norm": 0.40142571926116943,
"learning_rate": 9.657955440256395e-05,
"loss": 0.4799,
"step": 1300
},
{
"epoch": 0.5924920850293984,
"grad_norm": 0.17843011021614075,
"learning_rate": 9.650402547626786e-05,
"loss": 0.4848,
"step": 1310
},
{
"epoch": 0.5970149253731343,
"grad_norm": 0.579076886177063,
"learning_rate": 9.642770192448536e-05,
"loss": 0.489,
"step": 1320
},
{
"epoch": 0.6015377657168702,
"grad_norm": 0.4647994637489319,
"learning_rate": 9.635058505137536e-05,
"loss": 0.4964,
"step": 1330
},
{
"epoch": 0.6060606060606061,
"grad_norm": 0.14807263016700745,
"learning_rate": 9.627267617465243e-05,
"loss": 0.492,
"step": 1340
},
{
"epoch": 0.6105834464043419,
"grad_norm": 0.14985252916812897,
"learning_rate": 9.619397662556435e-05,
"loss": 0.498,
"step": 1350
},
{
"epoch": 0.6151062867480778,
"grad_norm": 0.16624276340007782,
"learning_rate": 9.611448774886924e-05,
"loss": 0.4846,
"step": 1360
},
{
"epoch": 0.6196291270918136,
"grad_norm": 0.20036545395851135,
"learning_rate": 9.60342109028127e-05,
"loss": 0.4919,
"step": 1370
},
{
"epoch": 0.6241519674355496,
"grad_norm": 0.2734001576900482,
"learning_rate": 9.595314745910456e-05,
"loss": 0.4885,
"step": 1380
},
{
"epoch": 0.6286748077792854,
"grad_norm": 0.12794898450374603,
"learning_rate": 9.587129880289538e-05,
"loss": 0.5028,
"step": 1390
},
{
"epoch": 0.6331976481230213,
"grad_norm": 0.40597257018089294,
"learning_rate": 9.578866633275288e-05,
"loss": 0.5009,
"step": 1400
},
{
"epoch": 0.6377204884667571,
"grad_norm": 0.34428995847702026,
"learning_rate": 9.570525146063798e-05,
"loss": 0.4905,
"step": 1410
},
{
"epoch": 0.642243328810493,
"grad_norm": 0.33436307311058044,
"learning_rate": 9.562105561188069e-05,
"loss": 0.4891,
"step": 1420
},
{
"epoch": 0.6467661691542289,
"grad_norm": 0.375051885843277,
"learning_rate": 9.553608022515577e-05,
"loss": 0.5031,
"step": 1430
},
{
"epoch": 0.6512890094979648,
"grad_norm": 0.1948522925376892,
"learning_rate": 9.545032675245813e-05,
"loss": 0.4836,
"step": 1440
},
{
"epoch": 0.6558118498417006,
"grad_norm": 0.08413676917552948,
"learning_rate": 9.5363796659078e-05,
"loss": 0.4831,
"step": 1450
},
{
"epoch": 0.6603346901854364,
"grad_norm": 0.289809912443161,
"learning_rate": 9.527649142357596e-05,
"loss": 0.4851,
"step": 1460
},
{
"epoch": 0.6648575305291723,
"grad_norm": 0.14166517555713654,
"learning_rate": 9.518841253775755e-05,
"loss": 0.4786,
"step": 1470
},
{
"epoch": 0.6693803708729081,
"grad_norm": 0.13443373143672943,
"learning_rate": 9.509956150664796e-05,
"loss": 0.4929,
"step": 1480
},
{
"epoch": 0.6739032112166441,
"grad_norm": 0.1985711306333542,
"learning_rate": 9.500993984846614e-05,
"loss": 0.481,
"step": 1490
},
{
"epoch": 0.6784260515603799,
"grad_norm": 0.3250739276409149,
"learning_rate": 9.491954909459895e-05,
"loss": 0.4872,
"step": 1500
},
{
"epoch": 0.6829488919041158,
"grad_norm": 0.32657676935195923,
"learning_rate": 9.4828390789575e-05,
"loss": 0.49,
"step": 1510
},
{
"epoch": 0.6874717322478516,
"grad_norm": 0.22073891758918762,
"learning_rate": 9.473646649103818e-05,
"loss": 0.5,
"step": 1520
},
{
"epoch": 0.6919945725915875,
"grad_norm": 0.22980472445487976,
"learning_rate": 9.464377776972114e-05,
"loss": 0.4867,
"step": 1530
},
{
"epoch": 0.6965174129353234,
"grad_norm": 0.1441211700439453,
"learning_rate": 9.45503262094184e-05,
"loss": 0.4779,
"step": 1540
},
{
"epoch": 0.7010402532790593,
"grad_norm": 0.19868245720863342,
"learning_rate": 9.445611340695926e-05,
"loss": 0.4917,
"step": 1550
},
{
"epoch": 0.7055630936227951,
"grad_norm": 0.2807522714138031,
"learning_rate": 9.43611409721806e-05,
"loss": 0.4955,
"step": 1560
},
{
"epoch": 0.710085933966531,
"grad_norm": 0.4753507375717163,
"learning_rate": 9.426541052789925e-05,
"loss": 0.4884,
"step": 1570
},
{
"epoch": 0.7146087743102668,
"grad_norm": 0.16993290185928345,
"learning_rate": 9.416892370988444e-05,
"loss": 0.4816,
"step": 1580
},
{
"epoch": 0.7191316146540027,
"grad_norm": 0.31910213828086853,
"learning_rate": 9.407168216682962e-05,
"loss": 0.491,
"step": 1590
},
{
"epoch": 0.7236544549977386,
"grad_norm": 0.14142963290214539,
"learning_rate": 9.397368756032445e-05,
"loss": 0.4923,
"step": 1600
},
{
"epoch": 0.7236544549977386,
"eval_loss": 0.5151379704475403,
"eval_runtime": 104.895,
"eval_samples_per_second": 21.021,
"eval_steps_per_second": 5.262,
"step": 1600
},
{
"epoch": 0.7281772953414745,
"grad_norm": 0.26469337940216064,
"learning_rate": 9.387494156482643e-05,
"loss": 0.4913,
"step": 1610
},
{
"epoch": 0.7327001356852103,
"grad_norm": 0.261874794960022,
"learning_rate": 9.377544586763215e-05,
"loss": 0.4778,
"step": 1620
},
{
"epoch": 0.7372229760289462,
"grad_norm": 0.24810470640659332,
"learning_rate": 9.367520216884856e-05,
"loss": 0.4855,
"step": 1630
},
{
"epoch": 0.741745816372682,
"grad_norm": 0.1436118185520172,
"learning_rate": 9.357421218136386e-05,
"loss": 0.4847,
"step": 1640
},
{
"epoch": 0.746268656716418,
"grad_norm": 0.16066552698612213,
"learning_rate": 9.347247763081835e-05,
"loss": 0.4872,
"step": 1650
},
{
"epoch": 0.7507914970601538,
"grad_norm": 0.4255355894565582,
"learning_rate": 9.337000025557476e-05,
"loss": 0.4928,
"step": 1660
},
{
"epoch": 0.7553143374038896,
"grad_norm": 0.1905021220445633,
"learning_rate": 9.326678180668871e-05,
"loss": 0.4904,
"step": 1670
},
{
"epoch": 0.7598371777476255,
"grad_norm": 0.2327072024345398,
"learning_rate": 9.316282404787871e-05,
"loss": 0.4888,
"step": 1680
},
{
"epoch": 0.7643600180913613,
"grad_norm": 0.20930258929729462,
"learning_rate": 9.305812875549599e-05,
"loss": 0.4858,
"step": 1690
},
{
"epoch": 0.7688828584350973,
"grad_norm": 0.1341727375984192,
"learning_rate": 9.295269771849427e-05,
"loss": 0.4793,
"step": 1700
},
{
"epoch": 0.7734056987788331,
"grad_norm": 0.17430901527404785,
"learning_rate": 9.284653273839905e-05,
"loss": 0.4969,
"step": 1710
},
{
"epoch": 0.777928539122569,
"grad_norm": 0.5216471552848816,
"learning_rate": 9.273963562927695e-05,
"loss": 0.491,
"step": 1720
},
{
"epoch": 0.7824513794663048,
"grad_norm": 0.19990628957748413,
"learning_rate": 9.263200821770461e-05,
"loss": 0.4651,
"step": 1730
},
{
"epoch": 0.7869742198100407,
"grad_norm": 0.5802574753761292,
"learning_rate": 9.252365234273755e-05,
"loss": 0.4775,
"step": 1740
},
{
"epoch": 0.7914970601537765,
"grad_norm": 0.18451005220413208,
"learning_rate": 9.241456985587868e-05,
"loss": 0.4823,
"step": 1750
},
{
"epoch": 0.7960199004975125,
"grad_norm": 0.3577069640159607,
"learning_rate": 9.230476262104677e-05,
"loss": 0.4772,
"step": 1760
},
{
"epoch": 0.8005427408412483,
"grad_norm": 0.13898539543151855,
"learning_rate": 9.219423251454448e-05,
"loss": 0.4746,
"step": 1770
},
{
"epoch": 0.8050655811849842,
"grad_norm": 0.23426567018032074,
"learning_rate": 9.208298142502636e-05,
"loss": 0.4865,
"step": 1780
},
{
"epoch": 0.80958842152872,
"grad_norm": 0.33651697635650635,
"learning_rate": 9.197101125346657e-05,
"loss": 0.5107,
"step": 1790
},
{
"epoch": 0.8141112618724559,
"grad_norm": 0.41530391573905945,
"learning_rate": 9.185832391312644e-05,
"loss": 0.4836,
"step": 1800
},
{
"epoch": 0.8186341022161918,
"grad_norm": 0.4000149965286255,
"learning_rate": 9.174492132952166e-05,
"loss": 0.4799,
"step": 1810
},
{
"epoch": 0.8231569425599277,
"grad_norm": 0.2623349726200104,
"learning_rate": 9.163080544038952e-05,
"loss": 0.4738,
"step": 1820
},
{
"epoch": 0.8276797829036635,
"grad_norm": 0.2306557595729828,
"learning_rate": 9.151597819565571e-05,
"loss": 0.4833,
"step": 1830
},
{
"epoch": 0.8322026232473994,
"grad_norm": 0.11869548261165619,
"learning_rate": 9.140044155740101e-05,
"loss": 0.4836,
"step": 1840
},
{
"epoch": 0.8367254635911352,
"grad_norm": 0.4591820240020752,
"learning_rate": 9.12841974998278e-05,
"loss": 0.4942,
"step": 1850
},
{
"epoch": 0.841248303934871,
"grad_norm": 0.17287275195121765,
"learning_rate": 9.116724800922629e-05,
"loss": 0.4734,
"step": 1860
},
{
"epoch": 0.845771144278607,
"grad_norm": 0.15221983194351196,
"learning_rate": 9.104959508394061e-05,
"loss": 0.4806,
"step": 1870
},
{
"epoch": 0.8502939846223428,
"grad_norm": 0.4994131922721863,
"learning_rate": 9.093124073433463e-05,
"loss": 0.4837,
"step": 1880
},
{
"epoch": 0.8548168249660787,
"grad_norm": 0.47702082991600037,
"learning_rate": 9.081218698275763e-05,
"loss": 0.4836,
"step": 1890
},
{
"epoch": 0.8593396653098145,
"grad_norm": 0.18302541971206665,
"learning_rate": 9.069243586350975e-05,
"loss": 0.4837,
"step": 1900
},
{
"epoch": 0.8638625056535504,
"grad_norm": 0.35281088948249817,
"learning_rate": 9.057198942280722e-05,
"loss": 0.4799,
"step": 1910
},
{
"epoch": 0.8683853459972863,
"grad_norm": 0.30625414848327637,
"learning_rate": 9.045084971874738e-05,
"loss": 0.4743,
"step": 1920
},
{
"epoch": 0.8729081863410222,
"grad_norm": 0.4937283396720886,
"learning_rate": 9.032901882127354e-05,
"loss": 0.5007,
"step": 1930
},
{
"epoch": 0.877431026684758,
"grad_norm": 0.25247976183891296,
"learning_rate": 9.020649881213958e-05,
"loss": 0.4792,
"step": 1940
},
{
"epoch": 0.8819538670284939,
"grad_norm": 0.43513771891593933,
"learning_rate": 9.008329178487442e-05,
"loss": 0.486,
"step": 1950
},
{
"epoch": 0.8864767073722297,
"grad_norm": 0.4003300368785858,
"learning_rate": 8.995939984474624e-05,
"loss": 0.4871,
"step": 1960
},
{
"epoch": 0.8909995477159657,
"grad_norm": 0.14090712368488312,
"learning_rate": 8.983482510872645e-05,
"loss": 0.4849,
"step": 1970
},
{
"epoch": 0.8955223880597015,
"grad_norm": 0.5139490962028503,
"learning_rate": 8.970956970545355e-05,
"loss": 0.4954,
"step": 1980
},
{
"epoch": 0.9000452284034374,
"grad_norm": 0.15464723110198975,
"learning_rate": 8.958363577519684e-05,
"loss": 0.4874,
"step": 1990
},
{
"epoch": 0.9045680687471732,
"grad_norm": 0.6348758339881897,
"learning_rate": 8.945702546981969e-05,
"loss": 0.4893,
"step": 2000
},
{
"epoch": 0.9090909090909091,
"grad_norm": 0.34100142121315,
"learning_rate": 8.932974095274289e-05,
"loss": 0.4942,
"step": 2010
},
{
"epoch": 0.9136137494346449,
"grad_norm": 0.1424902230501175,
"learning_rate": 8.920178439890765e-05,
"loss": 0.4801,
"step": 2020
},
{
"epoch": 0.9181365897783809,
"grad_norm": 0.07340684533119202,
"learning_rate": 8.907315799473846e-05,
"loss": 0.4734,
"step": 2030
},
{
"epoch": 0.9226594301221167,
"grad_norm": 0.8894658088684082,
"learning_rate": 8.894386393810563e-05,
"loss": 0.4813,
"step": 2040
},
{
"epoch": 0.9271822704658526,
"grad_norm": 3.0061469078063965,
"learning_rate": 8.881390443828787e-05,
"loss": 0.4722,
"step": 2050
},
{
"epoch": 0.9317051108095884,
"grad_norm": 0.435508131980896,
"learning_rate": 8.868328171593448e-05,
"loss": 0.4856,
"step": 2060
},
{
"epoch": 0.9362279511533242,
"grad_norm": 0.30460214614868164,
"learning_rate": 8.855199800302736e-05,
"loss": 0.473,
"step": 2070
},
{
"epoch": 0.9407507914970602,
"grad_norm": 0.48862236738204956,
"learning_rate": 8.842005554284296e-05,
"loss": 0.4914,
"step": 2080
},
{
"epoch": 0.945273631840796,
"grad_norm": 0.16484391689300537,
"learning_rate": 8.828745658991386e-05,
"loss": 0.4872,
"step": 2090
},
{
"epoch": 0.9497964721845319,
"grad_norm": 0.19850239157676697,
"learning_rate": 8.815420340999033e-05,
"loss": 0.4978,
"step": 2100
},
{
"epoch": 0.9543193125282677,
"grad_norm": 0.15624162554740906,
"learning_rate": 8.802029828000156e-05,
"loss": 0.4904,
"step": 2110
},
{
"epoch": 0.9588421528720036,
"grad_norm": 0.20471015572547913,
"learning_rate": 8.788574348801675e-05,
"loss": 0.4818,
"step": 2120
},
{
"epoch": 0.9633649932157394,
"grad_norm": 0.879814088344574,
"learning_rate": 8.775054133320604e-05,
"loss": 0.4832,
"step": 2130
},
{
"epoch": 0.9678878335594754,
"grad_norm": 0.14262792468070984,
"learning_rate": 8.761469412580125e-05,
"loss": 0.4893,
"step": 2140
},
{
"epoch": 0.9724106739032112,
"grad_norm": 0.43501853942871094,
"learning_rate": 8.74782041870563e-05,
"loss": 0.4751,
"step": 2150
},
{
"epoch": 0.9769335142469471,
"grad_norm": 0.10217157751321793,
"learning_rate": 8.73410738492077e-05,
"loss": 0.4863,
"step": 2160
},
{
"epoch": 0.9814563545906829,
"grad_norm": 0.550026535987854,
"learning_rate": 8.720330545543453e-05,
"loss": 0.4832,
"step": 2170
},
{
"epoch": 0.9859791949344188,
"grad_norm": 0.283246785402298,
"learning_rate": 8.706490135981855e-05,
"loss": 0.4895,
"step": 2180
},
{
"epoch": 0.9905020352781547,
"grad_norm": 0.21039070188999176,
"learning_rate": 8.692586392730387e-05,
"loss": 0.494,
"step": 2190
},
{
"epoch": 0.9950248756218906,
"grad_norm": 0.7897614240646362,
"learning_rate": 8.678619553365659e-05,
"loss": 0.4871,
"step": 2200
},
{
"epoch": 0.9995477159656264,
"grad_norm": 0.3698125183582306,
"learning_rate": 8.66458985654242e-05,
"loss": 0.4764,
"step": 2210
},
{
"epoch": 1.0036182722749887,
"grad_norm": 0.529675304889679,
"learning_rate": 8.650497541989482e-05,
"loss": 0.4834,
"step": 2220
},
{
"epoch": 1.0081411126187245,
"grad_norm": 0.10685670375823975,
"learning_rate": 8.636342850505616e-05,
"loss": 0.4864,
"step": 2230
},
{
"epoch": 1.0126639529624604,
"grad_norm": 0.2046063244342804,
"learning_rate": 8.622126023955446e-05,
"loss": 0.4796,
"step": 2240
},
{
"epoch": 1.0171867933061962,
"grad_norm": 0.2893441319465637,
"learning_rate": 8.60784730526531e-05,
"loss": 0.4882,
"step": 2250
},
{
"epoch": 1.021709633649932,
"grad_norm": 0.12210704386234283,
"learning_rate": 8.59350693841912e-05,
"loss": 0.4805,
"step": 2260
},
{
"epoch": 1.0262324739936681,
"grad_norm": 4.210540294647217,
"learning_rate": 8.579105168454173e-05,
"loss": 0.4812,
"step": 2270
},
{
"epoch": 1.030755314337404,
"grad_norm": 0.3887004256248474,
"learning_rate": 8.564642241456986e-05,
"loss": 0.4804,
"step": 2280
},
{
"epoch": 1.0352781546811398,
"grad_norm": 0.4718762934207916,
"learning_rate": 8.550118404559075e-05,
"loss": 0.489,
"step": 2290
},
{
"epoch": 1.0398009950248757,
"grad_norm": 0.26404207944869995,
"learning_rate": 8.535533905932738e-05,
"loss": 0.4829,
"step": 2300
},
{
"epoch": 1.0443238353686115,
"grad_norm": 0.6389791965484619,
"learning_rate": 8.52088899478682e-05,
"loss": 0.4806,
"step": 2310
},
{
"epoch": 1.0488466757123474,
"grad_norm": 1.5533722639083862,
"learning_rate": 8.506183921362443e-05,
"loss": 0.4878,
"step": 2320
},
{
"epoch": 1.0533695160560832,
"grad_norm": 0.30983996391296387,
"learning_rate": 8.491418936928742e-05,
"loss": 0.4808,
"step": 2330
},
{
"epoch": 1.057892356399819,
"grad_norm": 0.16938813030719757,
"learning_rate": 8.476594293778561e-05,
"loss": 0.4863,
"step": 2340
},
{
"epoch": 1.062415196743555,
"grad_norm": 1.2061636447906494,
"learning_rate": 8.461710245224148e-05,
"loss": 0.4806,
"step": 2350
},
{
"epoch": 1.0669380370872907,
"grad_norm": 0.0852380022406578,
"learning_rate": 8.44676704559283e-05,
"loss": 0.4764,
"step": 2360
}
],
"logging_steps": 10,
"max_steps": 8000,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 40,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 9.09484398947074e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}