{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9720534629404617,
"eval_steps": 500,
"global_step": 1900,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0005116070857581378,
"grad_norm": 5.084794521331787,
"learning_rate": 5.115089514066497e-07,
"loss": 2.9408,
"step": 1
},
{
"epoch": 0.0010232141715162755,
"grad_norm": 5.157843112945557,
"learning_rate": 1.0230179028132994e-06,
"loss": 3.0401,
"step": 2
},
{
"epoch": 0.002046428343032551,
"grad_norm": 5.386075973510742,
"learning_rate": 2.0460358056265987e-06,
"loss": 3.071,
"step": 4
},
{
"epoch": 0.0030696425145488263,
"grad_norm": 5.142333984375,
"learning_rate": 3.069053708439898e-06,
"loss": 3.1039,
"step": 6
},
{
"epoch": 0.004092856686065102,
"grad_norm": 3.101259231567383,
"learning_rate": 4.092071611253197e-06,
"loss": 2.8181,
"step": 8
},
{
"epoch": 0.005116070857581378,
"grad_norm": 2.207404375076294,
"learning_rate": 5.1150895140664966e-06,
"loss": 2.5627,
"step": 10
},
{
"epoch": 0.006139285029097653,
"grad_norm": 1.8845449686050415,
"learning_rate": 6.138107416879796e-06,
"loss": 2.6209,
"step": 12
},
{
"epoch": 0.007162499200613928,
"grad_norm": 2.1659703254699707,
"learning_rate": 7.161125319693095e-06,
"loss": 2.5467,
"step": 14
},
{
"epoch": 0.008185713372130204,
"grad_norm": 1.6377224922180176,
"learning_rate": 8.184143222506395e-06,
"loss": 2.5057,
"step": 16
},
{
"epoch": 0.009208927543646479,
"grad_norm": 1.1535893678665161,
"learning_rate": 9.207161125319694e-06,
"loss": 2.538,
"step": 18
},
{
"epoch": 0.010232141715162756,
"grad_norm": 1.0513185262680054,
"learning_rate": 1.0230179028132993e-05,
"loss": 2.5247,
"step": 20
},
{
"epoch": 0.01125535588667903,
"grad_norm": 0.8281764388084412,
"learning_rate": 1.1253196930946292e-05,
"loss": 2.4595,
"step": 22
},
{
"epoch": 0.012278570058195305,
"grad_norm": 0.5129208564758301,
"learning_rate": 1.2276214833759591e-05,
"loss": 2.3742,
"step": 24
},
{
"epoch": 0.013301784229711582,
"grad_norm": 0.46613597869873047,
"learning_rate": 1.3299232736572892e-05,
"loss": 2.4564,
"step": 26
},
{
"epoch": 0.014324998401227857,
"grad_norm": 0.354717493057251,
"learning_rate": 1.432225063938619e-05,
"loss": 2.3467,
"step": 28
},
{
"epoch": 0.015348212572744133,
"grad_norm": 0.3325178623199463,
"learning_rate": 1.534526854219949e-05,
"loss": 2.3978,
"step": 30
},
{
"epoch": 0.016371426744260408,
"grad_norm": 0.32920145988464355,
"learning_rate": 1.636828644501279e-05,
"loss": 2.2522,
"step": 32
},
{
"epoch": 0.017394640915776683,
"grad_norm": 0.25466033816337585,
"learning_rate": 1.739130434782609e-05,
"loss": 2.243,
"step": 34
},
{
"epoch": 0.018417855087292958,
"grad_norm": 0.35631808638572693,
"learning_rate": 1.8414322250639388e-05,
"loss": 2.2527,
"step": 36
},
{
"epoch": 0.019441069258809233,
"grad_norm": 0.23582319915294647,
"learning_rate": 1.9437340153452684e-05,
"loss": 2.1452,
"step": 38
},
{
"epoch": 0.02046428343032551,
"grad_norm": 0.2491885870695114,
"learning_rate": 2.0460358056265986e-05,
"loss": 2.1778,
"step": 40
},
{
"epoch": 0.021487497601841786,
"grad_norm": 0.2993784546852112,
"learning_rate": 2.1483375959079285e-05,
"loss": 2.1006,
"step": 42
},
{
"epoch": 0.02251071177335806,
"grad_norm": 0.21940283477306366,
"learning_rate": 2.2506393861892585e-05,
"loss": 2.1752,
"step": 44
},
{
"epoch": 0.023533925944874336,
"grad_norm": 0.15252649784088135,
"learning_rate": 2.3529411764705884e-05,
"loss": 2.1295,
"step": 46
},
{
"epoch": 0.02455714011639061,
"grad_norm": 0.19182737171649933,
"learning_rate": 2.4552429667519183e-05,
"loss": 2.1181,
"step": 48
},
{
"epoch": 0.02558035428790689,
"grad_norm": 0.19416701793670654,
"learning_rate": 2.5575447570332482e-05,
"loss": 2.0953,
"step": 50
},
{
"epoch": 0.026603568459423164,
"grad_norm": 0.12562625110149384,
"learning_rate": 2.6598465473145784e-05,
"loss": 2.0856,
"step": 52
},
{
"epoch": 0.02762678263093944,
"grad_norm": 0.13417182862758636,
"learning_rate": 2.7621483375959077e-05,
"loss": 2.0948,
"step": 54
},
{
"epoch": 0.028649996802455713,
"grad_norm": 0.10808593034744263,
"learning_rate": 2.864450127877238e-05,
"loss": 2.0541,
"step": 56
},
{
"epoch": 0.02967321097397199,
"grad_norm": 0.14162665605545044,
"learning_rate": 2.966751918158568e-05,
"loss": 2.0756,
"step": 58
},
{
"epoch": 0.030696425145488267,
"grad_norm": 0.10216689854860306,
"learning_rate": 3.069053708439898e-05,
"loss": 2.0502,
"step": 60
},
{
"epoch": 0.03171963931700454,
"grad_norm": 0.0772320106625557,
"learning_rate": 3.171355498721228e-05,
"loss": 2.0598,
"step": 62
},
{
"epoch": 0.032742853488520816,
"grad_norm": 0.07200902700424194,
"learning_rate": 3.273657289002558e-05,
"loss": 2.0416,
"step": 64
},
{
"epoch": 0.03376606766003709,
"grad_norm": 0.07764917612075806,
"learning_rate": 3.375959079283887e-05,
"loss": 2.04,
"step": 66
},
{
"epoch": 0.034789281831553366,
"grad_norm": 0.07703404128551483,
"learning_rate": 3.478260869565218e-05,
"loss": 2.0426,
"step": 68
},
{
"epoch": 0.03581249600306964,
"grad_norm": 0.05096273496747017,
"learning_rate": 3.580562659846548e-05,
"loss": 2.0264,
"step": 70
},
{
"epoch": 0.036835710174585916,
"grad_norm": 0.07172555476427078,
"learning_rate": 3.6828644501278776e-05,
"loss": 1.9799,
"step": 72
},
{
"epoch": 0.03785892434610219,
"grad_norm": 0.05563480406999588,
"learning_rate": 3.7851662404092075e-05,
"loss": 1.9922,
"step": 74
},
{
"epoch": 0.038882138517618466,
"grad_norm": 0.04726962745189667,
"learning_rate": 3.887468030690537e-05,
"loss": 1.9826,
"step": 76
},
{
"epoch": 0.03990535268913475,
"grad_norm": 0.040130794048309326,
"learning_rate": 3.989769820971867e-05,
"loss": 1.9693,
"step": 78
},
{
"epoch": 0.04092856686065102,
"grad_norm": 0.051317401230335236,
"learning_rate": 4.092071611253197e-05,
"loss": 1.9454,
"step": 80
},
{
"epoch": 0.0419517810321673,
"grad_norm": 0.03843973949551582,
"learning_rate": 4.194373401534527e-05,
"loss": 1.9535,
"step": 82
},
{
"epoch": 0.04297499520368357,
"grad_norm": 0.04338320344686508,
"learning_rate": 4.296675191815857e-05,
"loss": 1.9017,
"step": 84
},
{
"epoch": 0.04399820937519985,
"grad_norm": 0.0422111339867115,
"learning_rate": 4.398976982097187e-05,
"loss": 1.9806,
"step": 86
},
{
"epoch": 0.04502142354671612,
"grad_norm": 0.043594423681497574,
"learning_rate": 4.501278772378517e-05,
"loss": 1.9809,
"step": 88
},
{
"epoch": 0.0460446377182324,
"grad_norm": 0.050932493060827255,
"learning_rate": 4.603580562659847e-05,
"loss": 2.002,
"step": 90
},
{
"epoch": 0.04706785188974867,
"grad_norm": 0.039923008531332016,
"learning_rate": 4.705882352941177e-05,
"loss": 1.9898,
"step": 92
},
{
"epoch": 0.048091066061264946,
"grad_norm": 0.04199720919132233,
"learning_rate": 4.8081841432225067e-05,
"loss": 1.9375,
"step": 94
},
{
"epoch": 0.04911428023278122,
"grad_norm": 0.03885011374950409,
"learning_rate": 4.9104859335038366e-05,
"loss": 1.9594,
"step": 96
},
{
"epoch": 0.0501374944042975,
"grad_norm": 0.04459952563047409,
"learning_rate": 5.0127877237851665e-05,
"loss": 1.9327,
"step": 98
},
{
"epoch": 0.05116070857581378,
"grad_norm": 0.04154925048351288,
"learning_rate": 5.1150895140664964e-05,
"loss": 1.9385,
"step": 100
},
{
"epoch": 0.05218392274733005,
"grad_norm": 0.04149138927459717,
"learning_rate": 5.217391304347826e-05,
"loss": 1.9251,
"step": 102
},
{
"epoch": 0.05320713691884633,
"grad_norm": 0.05338102579116821,
"learning_rate": 5.319693094629157e-05,
"loss": 1.9211,
"step": 104
},
{
"epoch": 0.0542303510903626,
"grad_norm": 0.04964439943432808,
"learning_rate": 5.421994884910486e-05,
"loss": 1.8863,
"step": 106
},
{
"epoch": 0.05525356526187888,
"grad_norm": 0.040731314569711685,
"learning_rate": 5.5242966751918154e-05,
"loss": 1.9002,
"step": 108
},
{
"epoch": 0.05627677943339515,
"grad_norm": 0.05813027173280716,
"learning_rate": 5.626598465473146e-05,
"loss": 1.8944,
"step": 110
},
{
"epoch": 0.05729999360491143,
"grad_norm": 0.04966093972325325,
"learning_rate": 5.728900255754476e-05,
"loss": 1.898,
"step": 112
},
{
"epoch": 0.0583232077764277,
"grad_norm": 0.050573479384183884,
"learning_rate": 5.8312020460358065e-05,
"loss": 1.8778,
"step": 114
},
{
"epoch": 0.05934642194794398,
"grad_norm": 0.05025520175695419,
"learning_rate": 5.933503836317136e-05,
"loss": 1.9044,
"step": 116
},
{
"epoch": 0.06036963611946025,
"grad_norm": 0.05153055489063263,
"learning_rate": 6.035805626598465e-05,
"loss": 1.9045,
"step": 118
},
{
"epoch": 0.06139285029097653,
"grad_norm": 0.051311247050762177,
"learning_rate": 6.138107416879796e-05,
"loss": 1.9077,
"step": 120
},
{
"epoch": 0.06241606446249281,
"grad_norm": 0.05084897577762604,
"learning_rate": 6.240409207161125e-05,
"loss": 1.8538,
"step": 122
},
{
"epoch": 0.06343927863400908,
"grad_norm": 0.05961287021636963,
"learning_rate": 6.342710997442456e-05,
"loss": 1.8792,
"step": 124
},
{
"epoch": 0.06446249280552535,
"grad_norm": 0.05775010585784912,
"learning_rate": 6.445012787723786e-05,
"loss": 1.8587,
"step": 126
},
{
"epoch": 0.06548570697704163,
"grad_norm": 0.09344275295734406,
"learning_rate": 6.547314578005116e-05,
"loss": 1.8454,
"step": 128
},
{
"epoch": 0.0665089211485579,
"grad_norm": 0.0748172476887703,
"learning_rate": 6.649616368286446e-05,
"loss": 1.8998,
"step": 130
},
{
"epoch": 0.06753213532007418,
"grad_norm": 0.07188538461923599,
"learning_rate": 6.751918158567774e-05,
"loss": 1.8219,
"step": 132
},
{
"epoch": 0.06855534949159046,
"grad_norm": 0.05799673870205879,
"learning_rate": 6.854219948849106e-05,
"loss": 1.8549,
"step": 134
},
{
"epoch": 0.06957856366310673,
"grad_norm": 0.07886774092912674,
"learning_rate": 6.956521739130436e-05,
"loss": 1.8885,
"step": 136
},
{
"epoch": 0.07060177783462301,
"grad_norm": 0.0599171444773674,
"learning_rate": 7.058823529411765e-05,
"loss": 1.829,
"step": 138
},
{
"epoch": 0.07162499200613928,
"grad_norm": 0.07810111343860626,
"learning_rate": 7.161125319693095e-05,
"loss": 1.8878,
"step": 140
},
{
"epoch": 0.07264820617765556,
"grad_norm": 0.062123704701662064,
"learning_rate": 7.263427109974424e-05,
"loss": 1.8633,
"step": 142
},
{
"epoch": 0.07367142034917183,
"grad_norm": 0.08402098715305328,
"learning_rate": 7.365728900255755e-05,
"loss": 1.8377,
"step": 144
},
{
"epoch": 0.07469463452068811,
"grad_norm": 0.06189502775669098,
"learning_rate": 7.468030690537085e-05,
"loss": 1.8683,
"step": 146
},
{
"epoch": 0.07571784869220438,
"grad_norm": 0.07368986308574677,
"learning_rate": 7.570332480818415e-05,
"loss": 1.8636,
"step": 148
},
{
"epoch": 0.07674106286372066,
"grad_norm": 0.06430894136428833,
"learning_rate": 7.672634271099745e-05,
"loss": 1.8341,
"step": 150
},
{
"epoch": 0.07776427703523693,
"grad_norm": 0.05924483761191368,
"learning_rate": 7.774936061381073e-05,
"loss": 1.9151,
"step": 152
},
{
"epoch": 0.07878749120675321,
"grad_norm": 0.06166929751634598,
"learning_rate": 7.877237851662405e-05,
"loss": 1.8306,
"step": 154
},
{
"epoch": 0.0798107053782695,
"grad_norm": 0.07514499127864838,
"learning_rate": 7.979539641943735e-05,
"loss": 1.8572,
"step": 156
},
{
"epoch": 0.08083391954978576,
"grad_norm": 0.06925056874752045,
"learning_rate": 8.081841432225065e-05,
"loss": 1.8449,
"step": 158
},
{
"epoch": 0.08185713372130204,
"grad_norm": 0.08889607340097427,
"learning_rate": 8.184143222506395e-05,
"loss": 1.8217,
"step": 160
},
{
"epoch": 0.08288034789281831,
"grad_norm": 0.11205849796533585,
"learning_rate": 8.286445012787724e-05,
"loss": 1.7859,
"step": 162
},
{
"epoch": 0.0839035620643346,
"grad_norm": 0.13293609023094177,
"learning_rate": 8.388746803069054e-05,
"loss": 1.8245,
"step": 164
},
{
"epoch": 0.08492677623585086,
"grad_norm": 0.14082959294319153,
"learning_rate": 8.491048593350384e-05,
"loss": 1.8077,
"step": 166
},
{
"epoch": 0.08594999040736714,
"grad_norm": 0.0726478174328804,
"learning_rate": 8.593350383631714e-05,
"loss": 1.8081,
"step": 168
},
{
"epoch": 0.08697320457888341,
"grad_norm": 0.21175715327262878,
"learning_rate": 8.695652173913044e-05,
"loss": 1.8289,
"step": 170
},
{
"epoch": 0.0879964187503997,
"grad_norm": 0.19227363169193268,
"learning_rate": 8.797953964194374e-05,
"loss": 1.8092,
"step": 172
},
{
"epoch": 0.08901963292191598,
"grad_norm": 0.13788004219532013,
"learning_rate": 8.900255754475704e-05,
"loss": 1.7986,
"step": 174
},
{
"epoch": 0.09004284709343224,
"grad_norm": 0.09351494908332825,
"learning_rate": 9.002557544757034e-05,
"loss": 1.8077,
"step": 176
},
{
"epoch": 0.09106606126494853,
"grad_norm": 0.09681002050638199,
"learning_rate": 9.104859335038364e-05,
"loss": 1.794,
"step": 178
},
{
"epoch": 0.0920892754364648,
"grad_norm": 0.061654381453990936,
"learning_rate": 9.207161125319694e-05,
"loss": 1.7935,
"step": 180
},
{
"epoch": 0.09311248960798107,
"grad_norm": 0.06282493472099304,
"learning_rate": 9.309462915601024e-05,
"loss": 1.7758,
"step": 182
},
{
"epoch": 0.09413570377949734,
"grad_norm": 0.08118202537298203,
"learning_rate": 9.411764705882353e-05,
"loss": 1.8209,
"step": 184
},
{
"epoch": 0.09515891795101362,
"grad_norm": 0.0755864828824997,
"learning_rate": 9.514066496163683e-05,
"loss": 1.7672,
"step": 186
},
{
"epoch": 0.09618213212252989,
"grad_norm": 0.07810387760400772,
"learning_rate": 9.616368286445013e-05,
"loss": 1.7655,
"step": 188
},
{
"epoch": 0.09720534629404617,
"grad_norm": 0.08016899228096008,
"learning_rate": 9.718670076726343e-05,
"loss": 1.7818,
"step": 190
},
{
"epoch": 0.09822856046556244,
"grad_norm": 0.07527964562177658,
"learning_rate": 9.820971867007673e-05,
"loss": 1.7386,
"step": 192
},
{
"epoch": 0.09925177463707872,
"grad_norm": 0.08135760575532913,
"learning_rate": 9.923273657289003e-05,
"loss": 1.7678,
"step": 194
},
{
"epoch": 0.100274988808595,
"grad_norm": 0.06465744972229004,
"learning_rate": 0.00010025575447570333,
"loss": 1.8469,
"step": 196
},
{
"epoch": 0.10129820298011127,
"grad_norm": 0.0678311362862587,
"learning_rate": 0.00010127877237851664,
"loss": 1.7856,
"step": 198
},
{
"epoch": 0.10232141715162756,
"grad_norm": 0.06425610929727554,
"learning_rate": 0.00010230179028132993,
"loss": 1.7542,
"step": 200
},
{
"epoch": 0.10334463132314382,
"grad_norm": 0.06820003688335419,
"learning_rate": 0.00010332480818414323,
"loss": 1.783,
"step": 202
},
{
"epoch": 0.1043678454946601,
"grad_norm": 0.0690922886133194,
"learning_rate": 0.00010434782608695653,
"loss": 1.7612,
"step": 204
},
{
"epoch": 0.10539105966617637,
"grad_norm": 0.06488107144832611,
"learning_rate": 0.00010537084398976983,
"loss": 1.7648,
"step": 206
},
{
"epoch": 0.10641427383769266,
"grad_norm": 0.08278009295463562,
"learning_rate": 0.00010639386189258314,
"loss": 1.7661,
"step": 208
},
{
"epoch": 0.10743748800920892,
"grad_norm": 0.08722035586833954,
"learning_rate": 0.00010741687979539642,
"loss": 1.7578,
"step": 210
},
{
"epoch": 0.1084607021807252,
"grad_norm": 0.0737011507153511,
"learning_rate": 0.00010843989769820972,
"loss": 1.7381,
"step": 212
},
{
"epoch": 0.10948391635224147,
"grad_norm": 0.08060843497514725,
"learning_rate": 0.00010946291560102302,
"loss": 1.7967,
"step": 214
},
{
"epoch": 0.11050713052375775,
"grad_norm": 0.10279374569654465,
"learning_rate": 0.00011048593350383631,
"loss": 1.7703,
"step": 216
},
{
"epoch": 0.11153034469527404,
"grad_norm": 0.0777791365981102,
"learning_rate": 0.00011150895140664963,
"loss": 1.8015,
"step": 218
},
{
"epoch": 0.1125535588667903,
"grad_norm": 0.06883997470140457,
"learning_rate": 0.00011253196930946292,
"loss": 1.7731,
"step": 220
},
{
"epoch": 0.11357677303830659,
"grad_norm": 0.06231442466378212,
"learning_rate": 0.00011355498721227622,
"loss": 1.8063,
"step": 222
},
{
"epoch": 0.11459998720982285,
"grad_norm": 0.06607846170663834,
"learning_rate": 0.00011457800511508952,
"loss": 1.7616,
"step": 224
},
{
"epoch": 0.11562320138133914,
"grad_norm": 0.05903138220310211,
"learning_rate": 0.0001156010230179028,
"loss": 1.7993,
"step": 226
},
{
"epoch": 0.1166464155528554,
"grad_norm": 0.07282232493162155,
"learning_rate": 0.00011662404092071613,
"loss": 1.7374,
"step": 228
},
{
"epoch": 0.11766962972437169,
"grad_norm": 0.06793032586574554,
"learning_rate": 0.00011764705882352942,
"loss": 1.7852,
"step": 230
},
{
"epoch": 0.11869284389588795,
"grad_norm": 0.06404048949480057,
"learning_rate": 0.00011867007672634271,
"loss": 1.775,
"step": 232
},
{
"epoch": 0.11971605806740424,
"grad_norm": 0.08423135429620743,
"learning_rate": 0.00011969309462915601,
"loss": 1.779,
"step": 234
},
{
"epoch": 0.1207392722389205,
"grad_norm": 0.0814799889922142,
"learning_rate": 0.0001207161125319693,
"loss": 1.7082,
"step": 236
},
{
"epoch": 0.12176248641043678,
"grad_norm": 0.08876215666532516,
"learning_rate": 0.00012173913043478263,
"loss": 1.7767,
"step": 238
},
{
"epoch": 0.12278570058195307,
"grad_norm": 0.07051345705986023,
"learning_rate": 0.00012276214833759592,
"loss": 1.7181,
"step": 240
},
{
"epoch": 0.12380891475346933,
"grad_norm": 0.07023751735687256,
"learning_rate": 0.00012378516624040922,
"loss": 1.7308,
"step": 242
},
{
"epoch": 0.12483212892498562,
"grad_norm": 0.0754849910736084,
"learning_rate": 0.0001248081841432225,
"loss": 1.7782,
"step": 244
},
{
"epoch": 0.1258553430965019,
"grad_norm": 0.07223635166883469,
"learning_rate": 0.0001258312020460358,
"loss": 1.718,
"step": 246
},
{
"epoch": 0.12687855726801817,
"grad_norm": 0.07007969915866852,
"learning_rate": 0.00012685421994884912,
"loss": 1.7686,
"step": 248
},
{
"epoch": 0.12790177143953443,
"grad_norm": 0.06361662596464157,
"learning_rate": 0.00012787723785166242,
"loss": 1.7217,
"step": 250
},
{
"epoch": 0.1289249856110507,
"grad_norm": 0.08723774552345276,
"learning_rate": 0.00012890025575447572,
"loss": 1.7369,
"step": 252
},
{
"epoch": 0.129948199782567,
"grad_norm": 0.06651702523231506,
"learning_rate": 0.000129923273657289,
"loss": 1.7163,
"step": 254
},
{
"epoch": 0.13097141395408327,
"grad_norm": 0.07153377681970596,
"learning_rate": 0.00013094629156010232,
"loss": 1.7168,
"step": 256
},
{
"epoch": 0.13199462812559953,
"grad_norm": 0.09451760351657867,
"learning_rate": 0.00013196930946291562,
"loss": 1.7182,
"step": 258
},
{
"epoch": 0.1330178422971158,
"grad_norm": 0.08822207897901535,
"learning_rate": 0.00013299232736572892,
"loss": 1.7483,
"step": 260
},
{
"epoch": 0.1340410564686321,
"grad_norm": 0.11073771119117737,
"learning_rate": 0.00013401534526854221,
"loss": 1.7087,
"step": 262
},
{
"epoch": 0.13506427064014837,
"grad_norm": 0.07717689871788025,
"learning_rate": 0.0001350383631713555,
"loss": 1.6943,
"step": 264
},
{
"epoch": 0.13608748481166463,
"grad_norm": 0.09418254345655441,
"learning_rate": 0.0001360613810741688,
"loss": 1.7084,
"step": 266
},
{
"epoch": 0.13711069898318093,
"grad_norm": 0.0922132208943367,
"learning_rate": 0.0001370843989769821,
"loss": 1.7526,
"step": 268
},
{
"epoch": 0.1381339131546972,
"grad_norm": 0.08973314613103867,
"learning_rate": 0.0001381074168797954,
"loss": 1.7049,
"step": 270
},
{
"epoch": 0.13915712732621346,
"grad_norm": 0.0772908478975296,
"learning_rate": 0.0001391304347826087,
"loss": 1.7444,
"step": 272
},
{
"epoch": 0.14018034149772973,
"grad_norm": 0.07179255038499832,
"learning_rate": 0.00014015345268542198,
"loss": 1.7309,
"step": 274
},
{
"epoch": 0.14120355566924603,
"grad_norm": 0.10786614567041397,
"learning_rate": 0.0001411764705882353,
"loss": 1.7413,
"step": 276
},
{
"epoch": 0.1422267698407623,
"grad_norm": 0.0815059244632721,
"learning_rate": 0.0001421994884910486,
"loss": 1.6895,
"step": 278
},
{
"epoch": 0.14324998401227856,
"grad_norm": 0.12658405303955078,
"learning_rate": 0.0001432225063938619,
"loss": 1.7013,
"step": 280
},
{
"epoch": 0.14427319818379483,
"grad_norm": 0.0807737335562706,
"learning_rate": 0.0001442455242966752,
"loss": 1.7378,
"step": 282
},
{
"epoch": 0.14529641235531113,
"grad_norm": 0.09726593643426895,
"learning_rate": 0.00014526854219948848,
"loss": 1.7143,
"step": 284
},
{
"epoch": 0.1463196265268274,
"grad_norm": 0.08326689153909683,
"learning_rate": 0.0001462915601023018,
"loss": 1.7395,
"step": 286
},
{
"epoch": 0.14734284069834366,
"grad_norm": 0.08783421665430069,
"learning_rate": 0.0001473145780051151,
"loss": 1.7466,
"step": 288
},
{
"epoch": 0.14836605486985996,
"grad_norm": 0.0639604702591896,
"learning_rate": 0.0001483375959079284,
"loss": 1.7019,
"step": 290
},
{
"epoch": 0.14938926904137623,
"grad_norm": 0.08028368651866913,
"learning_rate": 0.0001493606138107417,
"loss": 1.7134,
"step": 292
},
{
"epoch": 0.1504124832128925,
"grad_norm": 0.0739947184920311,
"learning_rate": 0.00015038363171355497,
"loss": 1.702,
"step": 294
},
{
"epoch": 0.15143569738440876,
"grad_norm": 0.07335802167654037,
"learning_rate": 0.0001514066496163683,
"loss": 1.7321,
"step": 296
},
{
"epoch": 0.15245891155592506,
"grad_norm": 0.07030144333839417,
"learning_rate": 0.0001524296675191816,
"loss": 1.6654,
"step": 298
},
{
"epoch": 0.15348212572744133,
"grad_norm": 0.07079968601465225,
"learning_rate": 0.0001534526854219949,
"loss": 1.7129,
"step": 300
},
{
"epoch": 0.1545053398989576,
"grad_norm": 0.06605160236358643,
"learning_rate": 0.0001544757033248082,
"loss": 1.713,
"step": 302
},
{
"epoch": 0.15552855407047386,
"grad_norm": 0.08417898416519165,
"learning_rate": 0.00015549872122762147,
"loss": 1.7063,
"step": 304
},
{
"epoch": 0.15655176824199016,
"grad_norm": 0.07255028933286667,
"learning_rate": 0.0001565217391304348,
"loss": 1.742,
"step": 306
},
{
"epoch": 0.15757498241350643,
"grad_norm": 0.06561743468046188,
"learning_rate": 0.0001575447570332481,
"loss": 1.6912,
"step": 308
},
{
"epoch": 0.1585981965850227,
"grad_norm": 0.07030262053012848,
"learning_rate": 0.0001585677749360614,
"loss": 1.7434,
"step": 310
},
{
"epoch": 0.159621410756539,
"grad_norm": 0.076111800968647,
"learning_rate": 0.0001595907928388747,
"loss": 1.6783,
"step": 312
},
{
"epoch": 0.16064462492805526,
"grad_norm": 0.06267083436250687,
"learning_rate": 0.000160613810741688,
"loss": 1.7193,
"step": 314
},
{
"epoch": 0.16166783909957153,
"grad_norm": 0.07638990879058838,
"learning_rate": 0.0001616368286445013,
"loss": 1.7395,
"step": 316
},
{
"epoch": 0.1626910532710878,
"grad_norm": 0.07447683811187744,
"learning_rate": 0.0001626598465473146,
"loss": 1.6574,
"step": 318
},
{
"epoch": 0.1637142674426041,
"grad_norm": 0.07413692772388458,
"learning_rate": 0.0001636828644501279,
"loss": 1.6868,
"step": 320
},
{
"epoch": 0.16473748161412036,
"grad_norm": 0.07566969096660614,
"learning_rate": 0.0001647058823529412,
"loss": 1.779,
"step": 322
},
{
"epoch": 0.16576069578563662,
"grad_norm": 0.09093326330184937,
"learning_rate": 0.0001657289002557545,
"loss": 1.6807,
"step": 324
},
{
"epoch": 0.16678390995715292,
"grad_norm": 0.0930614024400711,
"learning_rate": 0.0001667519181585678,
"loss": 1.7067,
"step": 326
},
{
"epoch": 0.1678071241286692,
"grad_norm": 0.06676892936229706,
"learning_rate": 0.0001677749360613811,
"loss": 1.6609,
"step": 328
},
{
"epoch": 0.16883033830018546,
"grad_norm": 0.08882534503936768,
"learning_rate": 0.00016879795396419439,
"loss": 1.6796,
"step": 330
},
{
"epoch": 0.16985355247170172,
"grad_norm": 0.07226958125829697,
"learning_rate": 0.00016982097186700768,
"loss": 1.7163,
"step": 332
},
{
"epoch": 0.17087676664321802,
"grad_norm": 0.07271122932434082,
"learning_rate": 0.00017084398976982098,
"loss": 1.7585,
"step": 334
},
{
"epoch": 0.1718999808147343,
"grad_norm": 0.08161617070436478,
"learning_rate": 0.00017186700767263428,
"loss": 1.6299,
"step": 336
},
{
"epoch": 0.17292319498625056,
"grad_norm": 0.08419859409332275,
"learning_rate": 0.00017289002557544758,
"loss": 1.6848,
"step": 338
},
{
"epoch": 0.17394640915776682,
"grad_norm": 0.08996909856796265,
"learning_rate": 0.00017391304347826088,
"loss": 1.6582,
"step": 340
},
{
"epoch": 0.17496962332928312,
"grad_norm": 0.09278981387615204,
"learning_rate": 0.00017493606138107418,
"loss": 1.7044,
"step": 342
},
{
"epoch": 0.1759928375007994,
"grad_norm": 0.08387704193592072,
"learning_rate": 0.00017595907928388748,
"loss": 1.6503,
"step": 344
},
{
"epoch": 0.17701605167231566,
"grad_norm": 0.07442387193441391,
"learning_rate": 0.00017698209718670078,
"loss": 1.7058,
"step": 346
},
{
"epoch": 0.17803926584383195,
"grad_norm": 0.06898263841867447,
"learning_rate": 0.00017800511508951408,
"loss": 1.6708,
"step": 348
},
{
"epoch": 0.17906248001534822,
"grad_norm": 0.07982076704502106,
"learning_rate": 0.00017902813299232738,
"loss": 1.6807,
"step": 350
},
{
"epoch": 0.1800856941868645,
"grad_norm": 0.07170634716749191,
"learning_rate": 0.00018005115089514068,
"loss": 1.6753,
"step": 352
},
{
"epoch": 0.18110890835838075,
"grad_norm": 0.07484789937734604,
"learning_rate": 0.00018107416879795398,
"loss": 1.6883,
"step": 354
},
{
"epoch": 0.18213212252989705,
"grad_norm": 0.08390472084283829,
"learning_rate": 0.00018209718670076727,
"loss": 1.6783,
"step": 356
},
{
"epoch": 0.18315533670141332,
"grad_norm": 0.0833701565861702,
"learning_rate": 0.00018312020460358057,
"loss": 1.6804,
"step": 358
},
{
"epoch": 0.1841785508729296,
"grad_norm": 0.07489979267120361,
"learning_rate": 0.00018414322250639387,
"loss": 1.6179,
"step": 360
},
{
"epoch": 0.18520176504444585,
"grad_norm": 0.14307746291160583,
"learning_rate": 0.00018516624040920717,
"loss": 1.6396,
"step": 362
},
{
"epoch": 0.18622497921596215,
"grad_norm": 0.13637496531009674,
"learning_rate": 0.00018618925831202047,
"loss": 1.6425,
"step": 364
},
{
"epoch": 0.18724819338747842,
"grad_norm": 0.13586537539958954,
"learning_rate": 0.00018721227621483377,
"loss": 1.6915,
"step": 366
},
{
"epoch": 0.18827140755899469,
"grad_norm": 0.07892754673957825,
"learning_rate": 0.00018823529411764707,
"loss": 1.6628,
"step": 368
},
{
"epoch": 0.18929462173051098,
"grad_norm": 0.20291955769062042,
"learning_rate": 0.00018925831202046037,
"loss": 1.6572,
"step": 370
},
{
"epoch": 0.19031783590202725,
"grad_norm": 0.3548440933227539,
"learning_rate": 0.00019028132992327367,
"loss": 1.6963,
"step": 372
},
{
"epoch": 0.19134105007354352,
"grad_norm": 0.19051846861839294,
"learning_rate": 0.00019130434782608697,
"loss": 1.6853,
"step": 374
},
{
"epoch": 0.19236426424505979,
"grad_norm": 0.3201465308666229,
"learning_rate": 0.00019232736572890027,
"loss": 1.6549,
"step": 376
},
{
"epoch": 0.19338747841657608,
"grad_norm": 0.1700785905122757,
"learning_rate": 0.00019335038363171357,
"loss": 1.658,
"step": 378
},
{
"epoch": 0.19441069258809235,
"grad_norm": 0.1742287576198578,
"learning_rate": 0.00019437340153452686,
"loss": 1.6644,
"step": 380
},
{
"epoch": 0.19543390675960862,
"grad_norm": 0.0945478230714798,
"learning_rate": 0.00019539641943734016,
"loss": 1.65,
"step": 382
},
{
"epoch": 0.19645712093112488,
"grad_norm": 0.06995284557342529,
"learning_rate": 0.00019641943734015346,
"loss": 1.6608,
"step": 384
},
{
"epoch": 0.19748033510264118,
"grad_norm": 0.07590003311634064,
"learning_rate": 0.00019744245524296676,
"loss": 1.6367,
"step": 386
},
{
"epoch": 0.19850354927415745,
"grad_norm": 0.09830451011657715,
"learning_rate": 0.00019846547314578006,
"loss": 1.6638,
"step": 388
},
{
"epoch": 0.19952676344567372,
"grad_norm": 0.10720949620008469,
"learning_rate": 0.00019948849104859336,
"loss": 1.6571,
"step": 390
},
{
"epoch": 0.20054997761719,
"grad_norm": 0.06915664672851562,
"learning_rate": 0.0001999999910488914,
"loss": 1.669,
"step": 392
},
{
"epoch": 0.20157319178870628,
"grad_norm": 0.04960264638066292,
"learning_rate": 0.00019999991944003202,
"loss": 1.6529,
"step": 394
},
{
"epoch": 0.20259640596022255,
"grad_norm": 0.05139967054128647,
"learning_rate": 0.00019999977622236462,
"loss": 1.6053,
"step": 396
},
{
"epoch": 0.20361962013173882,
"grad_norm": 0.05288904160261154,
"learning_rate": 0.0001999995613959917,
"loss": 1.6905,
"step": 398
},
{
"epoch": 0.2046428343032551,
"grad_norm": 0.056239306926727295,
"learning_rate": 0.00019999927496106707,
"loss": 1.6662,
"step": 400
},
{
"epoch": 0.20566604847477138,
"grad_norm": 0.06484871357679367,
"learning_rate": 0.0001999989169177959,
"loss": 1.6803,
"step": 402
},
{
"epoch": 0.20668926264628765,
"grad_norm": 0.11631152778863907,
"learning_rate": 0.00019999848726643454,
"loss": 1.6389,
"step": 404
},
{
"epoch": 0.20771247681780391,
"grad_norm": 0.06311234086751938,
"learning_rate": 0.00019999798600729064,
"loss": 1.7017,
"step": 406
},
{
"epoch": 0.2087356909893202,
"grad_norm": 0.06155601888895035,
"learning_rate": 0.00019999741314072323,
"loss": 1.7014,
"step": 408
},
{
"epoch": 0.20975890516083648,
"grad_norm": 0.06340397894382477,
"learning_rate": 0.00019999676866714244,
"loss": 1.6735,
"step": 410
},
{
"epoch": 0.21078211933235275,
"grad_norm": 0.06068040430545807,
"learning_rate": 0.00019999605258700983,
"loss": 1.6224,
"step": 412
},
{
"epoch": 0.21180533350386904,
"grad_norm": 0.06651381403207779,
"learning_rate": 0.00019999526490083817,
"loss": 1.6279,
"step": 414
},
{
"epoch": 0.2128285476753853,
"grad_norm": 0.06273658573627472,
"learning_rate": 0.00019999440560919152,
"loss": 1.6591,
"step": 416
},
{
"epoch": 0.21385176184690158,
"grad_norm": 0.06989671289920807,
"learning_rate": 0.00019999347471268516,
"loss": 1.6405,
"step": 418
},
{
"epoch": 0.21487497601841785,
"grad_norm": 0.06204582378268242,
"learning_rate": 0.00019999247221198573,
"loss": 1.6512,
"step": 420
},
{
"epoch": 0.21589819018993414,
"grad_norm": 0.1728357970714569,
"learning_rate": 0.00019999139810781112,
"loss": 1.6332,
"step": 422
},
{
"epoch": 0.2169214043614504,
"grad_norm": 0.0696343332529068,
"learning_rate": 0.00019999025240093044,
"loss": 1.6649,
"step": 424
},
{
"epoch": 0.21794461853296668,
"grad_norm": 0.060923777520656586,
"learning_rate": 0.00019998903509216415,
"loss": 1.6269,
"step": 426
},
{
"epoch": 0.21896783270448295,
"grad_norm": 0.061977677047252655,
"learning_rate": 0.00019998774618238394,
"loss": 1.6636,
"step": 428
},
{
"epoch": 0.21999104687599924,
"grad_norm": 0.07241713255643845,
"learning_rate": 0.0001999863856725128,
"loss": 1.643,
"step": 430
},
{
"epoch": 0.2210142610475155,
"grad_norm": 0.06513350456953049,
"learning_rate": 0.000199984953563525,
"loss": 1.6184,
"step": 432
},
{
"epoch": 0.22203747521903178,
"grad_norm": 0.06109536439180374,
"learning_rate": 0.000199983449856446,
"loss": 1.6734,
"step": 434
},
{
"epoch": 0.22306068939054807,
"grad_norm": 0.09125282615423203,
"learning_rate": 0.0001999818745523526,
"loss": 1.6617,
"step": 436
},
{
"epoch": 0.22408390356206434,
"grad_norm": 0.05963214859366417,
"learning_rate": 0.00019998022765237288,
"loss": 1.648,
"step": 438
},
{
"epoch": 0.2251071177335806,
"grad_norm": 0.18775390088558197,
"learning_rate": 0.00019997850915768613,
"loss": 1.6599,
"step": 440
},
{
"epoch": 0.22613033190509688,
"grad_norm": 0.05968334153294563,
"learning_rate": 0.00019997671906952298,
"loss": 1.6072,
"step": 442
},
{
"epoch": 0.22715354607661317,
"grad_norm": 0.05431201308965683,
"learning_rate": 0.0001999748573891653,
"loss": 1.6315,
"step": 444
},
{
"epoch": 0.22817676024812944,
"grad_norm": 0.05960986390709877,
"learning_rate": 0.00019997292411794618,
"loss": 1.6565,
"step": 446
},
{
"epoch": 0.2291999744196457,
"grad_norm": 0.07451862096786499,
"learning_rate": 0.00019997091925725004,
"loss": 1.6793,
"step": 448
},
{
"epoch": 0.23022318859116198,
"grad_norm": 0.05454723909497261,
"learning_rate": 0.0001999688428085125,
"loss": 1.6055,
"step": 450
},
{
"epoch": 0.23124640276267827,
"grad_norm": 0.05422728881239891,
"learning_rate": 0.00019996669477322055,
"loss": 1.6455,
"step": 452
},
{
"epoch": 0.23226961693419454,
"grad_norm": 0.06064201146364212,
"learning_rate": 0.00019996447515291233,
"loss": 1.5895,
"step": 454
},
{
"epoch": 0.2332928311057108,
"grad_norm": 0.04667961224913597,
"learning_rate": 0.0001999621839491773,
"loss": 1.652,
"step": 456
},
{
"epoch": 0.2343160452772271,
"grad_norm": 0.06072809919714928,
"learning_rate": 0.00019995982116365616,
"loss": 1.6073,
"step": 458
},
{
"epoch": 0.23533925944874337,
"grad_norm": 0.05477429926395416,
"learning_rate": 0.00019995738679804085,
"loss": 1.6412,
"step": 460
},
{
"epoch": 0.23636247362025964,
"grad_norm": 0.08307594060897827,
"learning_rate": 0.00019995488085407462,
"loss": 1.6396,
"step": 462
},
{
"epoch": 0.2373856877917759,
"grad_norm": 0.059893883764743805,
"learning_rate": 0.00019995230333355192,
"loss": 1.6426,
"step": 464
},
{
"epoch": 0.2384089019632922,
"grad_norm": 0.06132538989186287,
"learning_rate": 0.00019994965423831854,
"loss": 1.6133,
"step": 466
},
{
"epoch": 0.23943211613480847,
"grad_norm": 0.07076270133256912,
"learning_rate": 0.00019994693357027138,
"loss": 1.576,
"step": 468
},
{
"epoch": 0.24045533030632474,
"grad_norm": 0.06282426416873932,
"learning_rate": 0.00019994414133135877,
"loss": 1.6373,
"step": 470
},
{
"epoch": 0.241478544477841,
"grad_norm": 0.058667294681072235,
"learning_rate": 0.00019994127752358013,
"loss": 1.619,
"step": 472
},
{
"epoch": 0.2425017586493573,
"grad_norm": 0.08359505236148834,
"learning_rate": 0.00019993834214898626,
"loss": 1.6225,
"step": 474
},
{
"epoch": 0.24352497282087357,
"grad_norm": 0.06758000701665878,
"learning_rate": 0.00019993533520967912,
"loss": 1.5799,
"step": 476
},
{
"epoch": 0.24454818699238984,
"grad_norm": 0.11436283588409424,
"learning_rate": 0.0001999322567078119,
"loss": 1.6385,
"step": 478
},
{
"epoch": 0.24557140116390613,
"grad_norm": 0.05773819610476494,
"learning_rate": 0.00019992910664558915,
"loss": 1.6022,
"step": 480
},
{
"epoch": 0.2465946153354224,
"grad_norm": 0.052521176636219025,
"learning_rate": 0.00019992588502526658,
"loss": 1.6137,
"step": 482
},
{
"epoch": 0.24761782950693867,
"grad_norm": 0.056573059409856796,
"learning_rate": 0.00019992259184915115,
"loss": 1.6065,
"step": 484
},
{
"epoch": 0.24864104367845494,
"grad_norm": 0.05170164257287979,
"learning_rate": 0.00019991922711960102,
"loss": 1.6325,
"step": 486
},
{
"epoch": 0.24966425784997123,
"grad_norm": 0.05951111018657684,
"learning_rate": 0.00019991579083902572,
"loss": 1.6034,
"step": 488
},
{
"epoch": 0.2506874720214875,
"grad_norm": 0.054325833916664124,
"learning_rate": 0.00019991228300988585,
"loss": 1.6102,
"step": 490
},
{
"epoch": 0.2517106861930038,
"grad_norm": 0.07080011814832687,
"learning_rate": 0.0001999087036346934,
"loss": 1.6302,
"step": 492
},
{
"epoch": 0.25273390036452004,
"grad_norm": 0.06116727367043495,
"learning_rate": 0.00019990505271601144,
"loss": 1.6243,
"step": 494
},
{
"epoch": 0.25375711453603633,
"grad_norm": 0.0602283850312233,
"learning_rate": 0.0001999013302564544,
"loss": 1.6024,
"step": 496
},
{
"epoch": 0.2547803287075526,
"grad_norm": 0.06313999742269516,
"learning_rate": 0.0001998975362586879,
"loss": 1.6238,
"step": 498
},
{
"epoch": 0.25580354287906887,
"grad_norm": 0.06217190623283386,
"learning_rate": 0.00019989367072542876,
"loss": 1.6251,
"step": 500
},
{
"epoch": 0.25682675705058516,
"grad_norm": 0.07256064563989639,
"learning_rate": 0.00019988973365944507,
"loss": 1.5929,
"step": 502
},
{
"epoch": 0.2578499712221014,
"grad_norm": 0.062201980501413345,
"learning_rate": 0.00019988572506355606,
"loss": 1.5933,
"step": 504
},
{
"epoch": 0.2588731853936177,
"grad_norm": 0.07168910652399063,
"learning_rate": 0.00019988164494063226,
"loss": 1.6474,
"step": 506
},
{
"epoch": 0.259896399565134,
"grad_norm": 0.056935928761959076,
"learning_rate": 0.00019987749329359548,
"loss": 1.5992,
"step": 508
},
{
"epoch": 0.26091961373665024,
"grad_norm": 0.07088612020015717,
"learning_rate": 0.00019987327012541855,
"loss": 1.5952,
"step": 510
},
{
"epoch": 0.26194282790816653,
"grad_norm": 0.06023348495364189,
"learning_rate": 0.0001998689754391257,
"loss": 1.6064,
"step": 512
},
{
"epoch": 0.2629660420796828,
"grad_norm": 0.05686601996421814,
"learning_rate": 0.0001998646092377923,
"loss": 1.5992,
"step": 514
},
{
"epoch": 0.26398925625119907,
"grad_norm": 0.07028970122337341,
"learning_rate": 0.00019986017152454495,
"loss": 1.5835,
"step": 516
},
{
"epoch": 0.26501247042271536,
"grad_norm": 0.0645250454545021,
"learning_rate": 0.0001998556623025614,
"loss": 1.6055,
"step": 518
},
{
"epoch": 0.2660356845942316,
"grad_norm": 0.0723612904548645,
"learning_rate": 0.00019985108157507067,
"loss": 1.6248,
"step": 520
},
{
"epoch": 0.2670588987657479,
"grad_norm": 0.06222670525312424,
"learning_rate": 0.00019984642934535297,
"loss": 1.6411,
"step": 522
},
{
"epoch": 0.2680821129372642,
"grad_norm": 0.057786975055933,
"learning_rate": 0.00019984170561673976,
"loss": 1.6313,
"step": 524
},
{
"epoch": 0.26910532710878043,
"grad_norm": 0.061039313673973083,
"learning_rate": 0.00019983691039261357,
"loss": 1.5896,
"step": 526
},
{
"epoch": 0.27012854128029673,
"grad_norm": 0.04816308245062828,
"learning_rate": 0.00019983204367640824,
"loss": 1.5986,
"step": 528
},
{
"epoch": 0.271151755451813,
"grad_norm": 0.06095914542675018,
"learning_rate": 0.0001998271054716088,
"loss": 1.5995,
"step": 530
},
{
"epoch": 0.27217496962332927,
"grad_norm": 0.05422305688261986,
"learning_rate": 0.00019982209578175137,
"loss": 1.6047,
"step": 532
},
{
"epoch": 0.27319818379484556,
"grad_norm": 0.05381491780281067,
"learning_rate": 0.0001998170146104234,
"loss": 1.5748,
"step": 534
},
{
"epoch": 0.27422139796636186,
"grad_norm": 0.08168444782495499,
"learning_rate": 0.0001998118619612634,
"loss": 1.5941,
"step": 536
},
{
"epoch": 0.2752446121378781,
"grad_norm": 0.05323650687932968,
"learning_rate": 0.00019980663783796118,
"loss": 1.6015,
"step": 538
},
{
"epoch": 0.2762678263093944,
"grad_norm": 0.08093535900115967,
"learning_rate": 0.0001998013422442577,
"loss": 1.6325,
"step": 540
},
{
"epoch": 0.27729104048091063,
"grad_norm": 0.05909120664000511,
"learning_rate": 0.00019979597518394491,
"loss": 1.6684,
"step": 542
},
{
"epoch": 0.27831425465242693,
"grad_norm": 0.0684690847992897,
"learning_rate": 0.00019979053666086634,
"loss": 1.6682,
"step": 544
},
{
"epoch": 0.2793374688239432,
"grad_norm": 0.05854607746005058,
"learning_rate": 0.00019978502667891625,
"loss": 1.6133,
"step": 546
},
{
"epoch": 0.28036068299545946,
"grad_norm": 0.05019630119204521,
"learning_rate": 0.00019977944524204037,
"loss": 1.5968,
"step": 548
},
{
"epoch": 0.28138389716697576,
"grad_norm": 0.0662982240319252,
"learning_rate": 0.00019977379235423551,
"loss": 1.589,
"step": 550
},
{
"epoch": 0.28240711133849206,
"grad_norm": 0.049058698117733,
"learning_rate": 0.00019976806801954964,
"loss": 1.5979,
"step": 552
},
{
"epoch": 0.2834303255100083,
"grad_norm": 0.058459024876356125,
"learning_rate": 0.00019976227224208183,
"loss": 1.5813,
"step": 554
},
{
"epoch": 0.2844535396815246,
"grad_norm": 0.048455361276865005,
"learning_rate": 0.00019975640502598244,
"loss": 1.5652,
"step": 556
},
{
"epoch": 0.2854767538530409,
"grad_norm": 0.06029395014047623,
"learning_rate": 0.00019975046637545288,
"loss": 1.6166,
"step": 558
},
{
"epoch": 0.28649996802455713,
"grad_norm": 0.05902372673153877,
"learning_rate": 0.00019974445629474574,
"loss": 1.5955,
"step": 560
},
{
"epoch": 0.2875231821960734,
"grad_norm": 0.04898110404610634,
"learning_rate": 0.0001997383747881648,
"loss": 1.5554,
"step": 562
},
{
"epoch": 0.28854639636758966,
"grad_norm": 0.07228821516036987,
"learning_rate": 0.00019973222186006498,
"loss": 1.6178,
"step": 564
},
{
"epoch": 0.28956961053910596,
"grad_norm": 0.07162781804800034,
"learning_rate": 0.00019972599751485226,
"loss": 1.6128,
"step": 566
},
{
"epoch": 0.29059282471062226,
"grad_norm": 0.047708939760923386,
"learning_rate": 0.00019971970175698385,
"loss": 1.5776,
"step": 568
},
{
"epoch": 0.2916160388821385,
"grad_norm": 0.05930710583925247,
"learning_rate": 0.0001997133345909681,
"loss": 1.6095,
"step": 570
},
{
"epoch": 0.2926392530536548,
"grad_norm": 0.057511184364557266,
"learning_rate": 0.00019970689602136438,
"loss": 1.564,
"step": 572
},
{
"epoch": 0.2936624672251711,
"grad_norm": 0.0659165233373642,
"learning_rate": 0.00019970038605278338,
"loss": 1.6057,
"step": 574
},
{
"epoch": 0.2946856813966873,
"grad_norm": 0.0638163760304451,
"learning_rate": 0.00019969380468988677,
"loss": 1.5684,
"step": 576
},
{
"epoch": 0.2957088955682036,
"grad_norm": 0.0477282889187336,
"learning_rate": 0.00019968715193738738,
"loss": 1.5596,
"step": 578
},
{
"epoch": 0.2967321097397199,
"grad_norm": 0.055721577256917953,
"learning_rate": 0.00019968042780004917,
"loss": 1.5854,
"step": 580
},
{
"epoch": 0.29775532391123616,
"grad_norm": 0.05852237716317177,
"learning_rate": 0.00019967363228268724,
"loss": 1.5952,
"step": 582
},
{
"epoch": 0.29877853808275245,
"grad_norm": 0.04583214595913887,
"learning_rate": 0.00019966676539016779,
"loss": 1.5835,
"step": 584
},
{
"epoch": 0.2998017522542687,
"grad_norm": 0.052682552486658096,
"learning_rate": 0.00019965982712740808,
"loss": 1.5932,
"step": 586
},
{
"epoch": 0.300824966425785,
"grad_norm": 0.06101151555776596,
"learning_rate": 0.00019965281749937655,
"loss": 1.661,
"step": 588
},
{
"epoch": 0.3018481805973013,
"grad_norm": 0.052221182733774185,
"learning_rate": 0.0001996457365110927,
"loss": 1.5834,
"step": 590
},
{
"epoch": 0.3028713947688175,
"grad_norm": 0.05288353189826012,
"learning_rate": 0.00019963858416762717,
"loss": 1.561,
"step": 592
},
{
"epoch": 0.3038946089403338,
"grad_norm": 0.05072011053562164,
"learning_rate": 0.00019963136047410166,
"loss": 1.5542,
"step": 594
},
{
"epoch": 0.3049178231118501,
"grad_norm": 0.05482899025082588,
"learning_rate": 0.00019962406543568898,
"loss": 1.6568,
"step": 596
},
{
"epoch": 0.30594103728336636,
"grad_norm": 0.06114513427019119,
"learning_rate": 0.00019961669905761302,
"loss": 1.5619,
"step": 598
},
{
"epoch": 0.30696425145488265,
"grad_norm": 0.14878755807876587,
"learning_rate": 0.00019960926134514873,
"loss": 1.6222,
"step": 600
},
{
"epoch": 0.30798746562639895,
"grad_norm": 0.05369825288653374,
"learning_rate": 0.00019960175230362222,
"loss": 1.574,
"step": 602
},
{
"epoch": 0.3090106797979152,
"grad_norm": 0.04912363365292549,
"learning_rate": 0.00019959417193841063,
"loss": 1.5644,
"step": 604
},
{
"epoch": 0.3100338939694315,
"grad_norm": 0.055376555770635605,
"learning_rate": 0.00019958652025494212,
"loss": 1.5978,
"step": 606
},
{
"epoch": 0.3110571081409477,
"grad_norm": 0.054994821548461914,
"learning_rate": 0.00019957879725869602,
"loss": 1.6327,
"step": 608
},
{
"epoch": 0.312080322312464,
"grad_norm": 0.05939999222755432,
"learning_rate": 0.00019957100295520266,
"loss": 1.5706,
"step": 610
},
{
"epoch": 0.3131035364839803,
"grad_norm": 0.05616987124085426,
"learning_rate": 0.00019956313735004346,
"loss": 1.5932,
"step": 612
},
{
"epoch": 0.31412675065549656,
"grad_norm": 0.10900183767080307,
"learning_rate": 0.00019955520044885087,
"loss": 1.5757,
"step": 614
},
{
"epoch": 0.31514996482701285,
"grad_norm": 1.115419864654541,
"learning_rate": 0.00019954719225730847,
"loss": 1.666,
"step": 616
},
{
"epoch": 0.31617317899852915,
"grad_norm": 0.13737702369689941,
"learning_rate": 0.00019953911278115078,
"loss": 1.6406,
"step": 618
},
{
"epoch": 0.3171963931700454,
"grad_norm": 0.18733379244804382,
"learning_rate": 0.00019953096202616344,
"loss": 1.6465,
"step": 620
},
{
"epoch": 0.3182196073415617,
"grad_norm": 0.513283371925354,
"learning_rate": 0.0001995227399981831,
"loss": 1.6477,
"step": 622
},
{
"epoch": 0.319242821513078,
"grad_norm": 0.30918484926223755,
"learning_rate": 0.0001995144467030975,
"loss": 1.6566,
"step": 624
},
{
"epoch": 0.3202660356845942,
"grad_norm": 0.0951157733798027,
"learning_rate": 0.00019950608214684535,
"loss": 1.6034,
"step": 626
},
{
"epoch": 0.3212892498561105,
"grad_norm": 0.05696268379688263,
"learning_rate": 0.00019949764633541643,
"loss": 1.6518,
"step": 628
},
{
"epoch": 0.32231246402762675,
"grad_norm": 0.06777111440896988,
"learning_rate": 0.00019948913927485146,
"loss": 1.6585,
"step": 630
},
{
"epoch": 0.32333567819914305,
"grad_norm": 0.055656664073467255,
"learning_rate": 0.00019948056097124234,
"loss": 1.5623,
"step": 632
},
{
"epoch": 0.32435889237065935,
"grad_norm": 0.05220302939414978,
"learning_rate": 0.00019947191143073186,
"loss": 1.6067,
"step": 634
},
{
"epoch": 0.3253821065421756,
"grad_norm": 0.05276400223374367,
"learning_rate": 0.00019946319065951382,
"loss": 1.5997,
"step": 636
},
{
"epoch": 0.3264053207136919,
"grad_norm": 0.06689111888408661,
"learning_rate": 0.00019945439866383312,
"loss": 1.5621,
"step": 638
},
{
"epoch": 0.3274285348852082,
"grad_norm": 0.07574088871479034,
"learning_rate": 0.00019944553544998562,
"loss": 1.5873,
"step": 640
},
{
"epoch": 0.3284517490567244,
"grad_norm": 0.1480696201324463,
"learning_rate": 0.0001994366010243181,
"loss": 1.6142,
"step": 642
},
{
"epoch": 0.3294749632282407,
"grad_norm": 0.2425205558538437,
"learning_rate": 0.00019942759539322844,
"loss": 1.6513,
"step": 644
},
{
"epoch": 0.330498177399757,
"grad_norm": 0.10395582765340805,
"learning_rate": 0.00019941851856316548,
"loss": 1.6186,
"step": 646
},
{
"epoch": 0.33152139157127325,
"grad_norm": 0.07959388941526413,
"learning_rate": 0.000199409370540629,
"loss": 1.5954,
"step": 648
},
{
"epoch": 0.33254460574278955,
"grad_norm": 0.08391022682189941,
"learning_rate": 0.00019940015133216985,
"loss": 1.6359,
"step": 650
},
{
"epoch": 0.33356781991430584,
"grad_norm": 0.10863954573869705,
"learning_rate": 0.00019939086094438975,
"loss": 1.5591,
"step": 652
},
{
"epoch": 0.3345910340858221,
"grad_norm": 0.0719527155160904,
"learning_rate": 0.00019938149938394145,
"loss": 1.5536,
"step": 654
},
{
"epoch": 0.3356142482573384,
"grad_norm": 0.054009951651096344,
"learning_rate": 0.0001993720666575287,
"loss": 1.5925,
"step": 656
},
{
"epoch": 0.3366374624288546,
"grad_norm": 0.06805548816919327,
"learning_rate": 0.00019936256277190608,
"loss": 1.6079,
"step": 658
},
{
"epoch": 0.3376606766003709,
"grad_norm": 0.057809535413980484,
"learning_rate": 0.0001993529877338793,
"loss": 1.5569,
"step": 660
},
{
"epoch": 0.3386838907718872,
"grad_norm": 0.05796423181891441,
"learning_rate": 0.0001993433415503049,
"loss": 1.6148,
"step": 662
},
{
"epoch": 0.33970710494340345,
"grad_norm": 0.0450466088950634,
"learning_rate": 0.0001993336242280904,
"loss": 1.6024,
"step": 664
},
{
"epoch": 0.34073031911491974,
"grad_norm": 0.05356905981898308,
"learning_rate": 0.00019932383577419432,
"loss": 1.5696,
"step": 666
},
{
"epoch": 0.34175353328643604,
"grad_norm": 0.04915151000022888,
"learning_rate": 0.00019931397619562597,
"loss": 1.601,
"step": 668
},
{
"epoch": 0.3427767474579523,
"grad_norm": 0.2238396257162094,
"learning_rate": 0.00019930404549944574,
"loss": 1.6144,
"step": 670
},
{
"epoch": 0.3437999616294686,
"grad_norm": 0.07003773748874664,
"learning_rate": 0.00019929404369276488,
"loss": 1.6132,
"step": 672
},
{
"epoch": 0.34482317580098487,
"grad_norm": 0.07609610259532928,
"learning_rate": 0.00019928397078274555,
"loss": 1.5351,
"step": 674
},
{
"epoch": 0.3458463899725011,
"grad_norm": 0.057023849338293076,
"learning_rate": 0.00019927382677660088,
"loss": 1.5643,
"step": 676
},
{
"epoch": 0.3468696041440174,
"grad_norm": 0.0493864081799984,
"learning_rate": 0.0001992636116815948,
"loss": 1.5837,
"step": 678
},
{
"epoch": 0.34789281831553365,
"grad_norm": 0.05028039962053299,
"learning_rate": 0.00019925332550504234,
"loss": 1.6003,
"step": 680
},
{
"epoch": 0.34891603248704994,
"grad_norm": 0.050032299011945724,
"learning_rate": 0.00019924296825430925,
"loss": 1.5583,
"step": 682
},
{
"epoch": 0.34993924665856624,
"grad_norm": 0.04059847444295883,
"learning_rate": 0.00019923253993681225,
"loss": 1.6101,
"step": 684
},
{
"epoch": 0.3509624608300825,
"grad_norm": 0.045728132128715515,
"learning_rate": 0.00019922204056001895,
"loss": 1.5973,
"step": 686
},
{
"epoch": 0.3519856750015988,
"grad_norm": 0.04674302786588669,
"learning_rate": 0.0001992114701314478,
"loss": 1.5785,
"step": 688
},
{
"epoch": 0.35300888917311507,
"grad_norm": 0.04860880225896835,
"learning_rate": 0.00019920082865866818,
"loss": 1.5761,
"step": 690
},
{
"epoch": 0.3540321033446313,
"grad_norm": 0.04689641669392586,
"learning_rate": 0.00019919011614930035,
"loss": 1.6015,
"step": 692
},
{
"epoch": 0.3550553175161476,
"grad_norm": 0.04507840797305107,
"learning_rate": 0.0001991793326110154,
"loss": 1.5762,
"step": 694
},
{
"epoch": 0.3560785316876639,
"grad_norm": 0.04468555748462677,
"learning_rate": 0.00019916847805153526,
"loss": 1.5615,
"step": 696
},
{
"epoch": 0.35710174585918014,
"grad_norm": 0.07028740644454956,
"learning_rate": 0.00019915755247863285,
"loss": 1.6001,
"step": 698
},
{
"epoch": 0.35812496003069644,
"grad_norm": 0.03917892277240753,
"learning_rate": 0.00019914655590013176,
"loss": 1.6153,
"step": 700
},
{
"epoch": 0.3591481742022127,
"grad_norm": 0.06443695724010468,
"learning_rate": 0.0001991354883239066,
"loss": 1.5588,
"step": 702
},
{
"epoch": 0.360171388373729,
"grad_norm": 0.04684121161699295,
"learning_rate": 0.00019912434975788264,
"loss": 1.5726,
"step": 704
},
{
"epoch": 0.36119460254524527,
"grad_norm": 0.04538768157362938,
"learning_rate": 0.00019911314021003613,
"loss": 1.592,
"step": 706
},
{
"epoch": 0.3622178167167615,
"grad_norm": 0.040085602551698685,
"learning_rate": 0.0001991018596883941,
"loss": 1.577,
"step": 708
},
{
"epoch": 0.3632410308882778,
"grad_norm": 0.04734279587864876,
"learning_rate": 0.00019909050820103442,
"loss": 1.6194,
"step": 710
},
{
"epoch": 0.3642642450597941,
"grad_norm": 0.051557011902332306,
"learning_rate": 0.00019907908575608573,
"loss": 1.5776,
"step": 712
},
{
"epoch": 0.36528745923131034,
"grad_norm": 0.042105671018362045,
"learning_rate": 0.00019906759236172752,
"loss": 1.562,
"step": 714
},
{
"epoch": 0.36631067340282664,
"grad_norm": 0.04763809219002724,
"learning_rate": 0.00019905602802619007,
"loss": 1.5727,
"step": 716
},
{
"epoch": 0.36733388757434293,
"grad_norm": 0.05205756798386574,
"learning_rate": 0.00019904439275775452,
"loss": 1.5595,
"step": 718
},
{
"epoch": 0.3683571017458592,
"grad_norm": 0.04210933670401573,
"learning_rate": 0.0001990326865647527,
"loss": 1.5812,
"step": 720
},
{
"epoch": 0.36938031591737547,
"grad_norm": 0.04100721701979637,
"learning_rate": 0.00019902090945556728,
"loss": 1.5492,
"step": 722
},
{
"epoch": 0.3704035300888917,
"grad_norm": 0.04252148047089577,
"learning_rate": 0.0001990090614386318,
"loss": 1.5397,
"step": 724
},
{
"epoch": 0.371426744260408,
"grad_norm": 0.040999703109264374,
"learning_rate": 0.00019899714252243035,
"loss": 1.533,
"step": 726
},
{
"epoch": 0.3724499584319243,
"grad_norm": 0.03823763504624367,
"learning_rate": 0.00019898515271549804,
"loss": 1.5385,
"step": 728
},
{
"epoch": 0.37347317260344054,
"grad_norm": 0.041486915200948715,
"learning_rate": 0.0001989730920264206,
"loss": 1.5975,
"step": 730
},
{
"epoch": 0.37449638677495684,
"grad_norm": 0.042897533625364304,
"learning_rate": 0.00019896096046383456,
"loss": 1.574,
"step": 732
},
{
"epoch": 0.37551960094647313,
"grad_norm": 0.05677172914147377,
"learning_rate": 0.00019894875803642715,
"loss": 1.5564,
"step": 734
},
{
"epoch": 0.37654281511798937,
"grad_norm": 0.0416000559926033,
"learning_rate": 0.00019893648475293648,
"loss": 1.5982,
"step": 736
},
{
"epoch": 0.37756602928950567,
"grad_norm": 0.04389720410108566,
"learning_rate": 0.00019892414062215122,
"loss": 1.5661,
"step": 738
},
{
"epoch": 0.37858924346102196,
"grad_norm": 0.048660341650247574,
"learning_rate": 0.0001989117256529109,
"loss": 1.5554,
"step": 740
},
{
"epoch": 0.3796124576325382,
"grad_norm": 0.04659014940261841,
"learning_rate": 0.00019889923985410576,
"loss": 1.5932,
"step": 742
},
{
"epoch": 0.3806356718040545,
"grad_norm": 0.04693235456943512,
"learning_rate": 0.00019888668323467669,
"loss": 1.5985,
"step": 744
},
{
"epoch": 0.38165888597557074,
"grad_norm": 0.05906931310892105,
"learning_rate": 0.00019887405580361537,
"loss": 1.592,
"step": 746
},
{
"epoch": 0.38268210014708703,
"grad_norm": 0.0707060918211937,
"learning_rate": 0.0001988613575699642,
"loss": 1.5491,
"step": 748
},
{
"epoch": 0.38370531431860333,
"grad_norm": 0.0510844886302948,
"learning_rate": 0.00019884858854281613,
"loss": 1.5433,
"step": 750
},
{
"epoch": 0.38472852849011957,
"grad_norm": 0.058799102902412415,
"learning_rate": 0.00019883574873131503,
"loss": 1.5467,
"step": 752
},
{
"epoch": 0.38575174266163587,
"grad_norm": 0.04918012022972107,
"learning_rate": 0.0001988228381446553,
"loss": 1.5685,
"step": 754
},
{
"epoch": 0.38677495683315216,
"grad_norm": 0.044637810438871384,
"learning_rate": 0.00019880985679208207,
"loss": 1.5767,
"step": 756
},
{
"epoch": 0.3877981710046684,
"grad_norm": 0.052684806287288666,
"learning_rate": 0.0001987968046828911,
"loss": 1.5457,
"step": 758
},
{
"epoch": 0.3888213851761847,
"grad_norm": 0.045015860348939896,
"learning_rate": 0.0001987836818264289,
"loss": 1.5136,
"step": 760
},
{
"epoch": 0.389844599347701,
"grad_norm": 0.0538019984960556,
"learning_rate": 0.0001987704882320926,
"loss": 1.5673,
"step": 762
},
{
"epoch": 0.39086781351921723,
"grad_norm": 0.04201149195432663,
"learning_rate": 0.00019875722390932997,
"loss": 1.5559,
"step": 764
},
{
"epoch": 0.39189102769073353,
"grad_norm": 0.04188109561800957,
"learning_rate": 0.00019874388886763944,
"loss": 1.4982,
"step": 766
},
{
"epoch": 0.39291424186224977,
"grad_norm": 0.0503980815410614,
"learning_rate": 0.00019873048311657007,
"loss": 1.5018,
"step": 768
},
{
"epoch": 0.39393745603376606,
"grad_norm": 0.04854050651192665,
"learning_rate": 0.0001987170066657216,
"loss": 1.5331,
"step": 770
},
{
"epoch": 0.39496067020528236,
"grad_norm": 0.04634295031428337,
"learning_rate": 0.00019870345952474437,
"loss": 1.5304,
"step": 772
},
{
"epoch": 0.3959838843767986,
"grad_norm": 0.04464833438396454,
"learning_rate": 0.0001986898417033393,
"loss": 1.5518,
"step": 774
},
{
"epoch": 0.3970070985483149,
"grad_norm": 0.04434438794851303,
"learning_rate": 0.00019867615321125795,
"loss": 1.5372,
"step": 776
},
{
"epoch": 0.3980303127198312,
"grad_norm": 0.04564082249999046,
"learning_rate": 0.00019866239405830248,
"loss": 1.5373,
"step": 778
},
{
"epoch": 0.39905352689134743,
"grad_norm": 0.042439211159944534,
"learning_rate": 0.00019864856425432574,
"loss": 1.5682,
"step": 780
},
{
"epoch": 0.4000767410628637,
"grad_norm": 0.051853910088539124,
"learning_rate": 0.00019863466380923105,
"loss": 1.5408,
"step": 782
},
{
"epoch": 0.40109995523438,
"grad_norm": 0.04109041020274162,
"learning_rate": 0.00019862069273297232,
"loss": 1.5557,
"step": 784
},
{
"epoch": 0.40212316940589626,
"grad_norm": 0.04249493032693863,
"learning_rate": 0.00019860665103555415,
"loss": 1.5723,
"step": 786
},
{
"epoch": 0.40314638357741256,
"grad_norm": 0.041393015533685684,
"learning_rate": 0.0001985925387270316,
"loss": 1.6034,
"step": 788
},
{
"epoch": 0.4041695977489288,
"grad_norm": 0.03967997431755066,
"learning_rate": 0.00019857835581751037,
"loss": 1.5252,
"step": 790
},
{
"epoch": 0.4051928119204451,
"grad_norm": 0.0383961945772171,
"learning_rate": 0.00019856410231714662,
"loss": 1.5718,
"step": 792
},
{
"epoch": 0.4062160260919614,
"grad_norm": 0.04732939228415489,
"learning_rate": 0.00019854977823614717,
"loss": 1.5473,
"step": 794
},
{
"epoch": 0.40723924026347763,
"grad_norm": 0.04425951838493347,
"learning_rate": 0.00019853538358476932,
"loss": 1.5976,
"step": 796
},
{
"epoch": 0.4082624544349939,
"grad_norm": 0.041833970695734024,
"learning_rate": 0.0001985209183733209,
"loss": 1.6024,
"step": 798
},
{
"epoch": 0.4092856686065102,
"grad_norm": 0.04387862607836723,
"learning_rate": 0.0001985063826121603,
"loss": 1.5384,
"step": 800
},
{
"epoch": 0.41030888277802646,
"grad_norm": 0.04852529242634773,
"learning_rate": 0.00019849177631169643,
"loss": 1.5485,
"step": 802
},
{
"epoch": 0.41133209694954276,
"grad_norm": 0.04267437756061554,
"learning_rate": 0.00019847709948238865,
"loss": 1.5186,
"step": 804
},
{
"epoch": 0.41235531112105905,
"grad_norm": 0.04403737559914589,
"learning_rate": 0.00019846235213474692,
"loss": 1.5374,
"step": 806
},
{
"epoch": 0.4133785252925753,
"grad_norm": 0.04668973386287689,
"learning_rate": 0.00019844753427933164,
"loss": 1.5209,
"step": 808
},
{
"epoch": 0.4144017394640916,
"grad_norm": 0.045447513461112976,
"learning_rate": 0.00019843264592675367,
"loss": 1.5888,
"step": 810
},
{
"epoch": 0.41542495363560783,
"grad_norm": 0.04239337146282196,
"learning_rate": 0.00019841768708767438,
"loss": 1.5866,
"step": 812
},
{
"epoch": 0.4164481678071241,
"grad_norm": 0.04571668431162834,
"learning_rate": 0.0001984026577728057,
"loss": 1.5134,
"step": 814
},
{
"epoch": 0.4174713819786404,
"grad_norm": 0.041478246450424194,
"learning_rate": 0.00019838755799290994,
"loss": 1.5555,
"step": 816
},
{
"epoch": 0.41849459615015666,
"grad_norm": 0.04084784537553787,
"learning_rate": 0.00019837238775879983,
"loss": 1.5847,
"step": 818
},
{
"epoch": 0.41951781032167296,
"grad_norm": 0.0393175333738327,
"learning_rate": 0.00019835714708133862,
"loss": 1.5377,
"step": 820
},
{
"epoch": 0.42054102449318925,
"grad_norm": 0.03987790644168854,
"learning_rate": 0.00019834183597143996,
"loss": 1.5604,
"step": 822
},
{
"epoch": 0.4215642386647055,
"grad_norm": 0.04945560172200203,
"learning_rate": 0.00019832645444006804,
"loss": 1.5239,
"step": 824
},
{
"epoch": 0.4225874528362218,
"grad_norm": 0.042219970375299454,
"learning_rate": 0.00019831100249823733,
"loss": 1.5435,
"step": 826
},
{
"epoch": 0.4236106670077381,
"grad_norm": 0.06793594360351562,
"learning_rate": 0.00019829548015701283,
"loss": 1.5204,
"step": 828
},
{
"epoch": 0.4246338811792543,
"grad_norm": 0.04633813723921776,
"learning_rate": 0.00019827988742750988,
"loss": 1.5494,
"step": 830
},
{
"epoch": 0.4256570953507706,
"grad_norm": 0.041469499468803406,
"learning_rate": 0.0001982642243208943,
"loss": 1.5549,
"step": 832
},
{
"epoch": 0.42668030952228686,
"grad_norm": 0.039512719959020615,
"learning_rate": 0.0001982484908483822,
"loss": 1.5614,
"step": 834
},
{
"epoch": 0.42770352369380316,
"grad_norm": 0.04240869730710983,
"learning_rate": 0.0001982326870212402,
"loss": 1.5597,
"step": 836
},
{
"epoch": 0.42872673786531945,
"grad_norm": 0.04469761997461319,
"learning_rate": 0.00019821681285078522,
"loss": 1.575,
"step": 838
},
{
"epoch": 0.4297499520368357,
"grad_norm": 0.05203311890363693,
"learning_rate": 0.00019820086834838456,
"loss": 1.5144,
"step": 840
},
{
"epoch": 0.430773166208352,
"grad_norm": 0.046044569462537766,
"learning_rate": 0.00019818485352545592,
"loss": 1.5328,
"step": 842
},
{
"epoch": 0.4317963803798683,
"grad_norm": 0.05522793158888817,
"learning_rate": 0.00019816876839346735,
"loss": 1.5266,
"step": 844
},
{
"epoch": 0.4328195945513845,
"grad_norm": 0.04644525796175003,
"learning_rate": 0.00019815261296393715,
"loss": 1.5682,
"step": 846
},
{
"epoch": 0.4338428087229008,
"grad_norm": 0.06290300190448761,
"learning_rate": 0.00019813638724843413,
"loss": 1.5643,
"step": 848
},
{
"epoch": 0.4348660228944171,
"grad_norm": 0.050486985594034195,
"learning_rate": 0.00019812009125857728,
"loss": 1.5491,
"step": 850
},
{
"epoch": 0.43588923706593335,
"grad_norm": 0.05234065279364586,
"learning_rate": 0.000198103725006036,
"loss": 1.5718,
"step": 852
},
{
"epoch": 0.43691245123744965,
"grad_norm": 0.05265431106090546,
"learning_rate": 0.00019808728850253,
"loss": 1.56,
"step": 854
},
{
"epoch": 0.4379356654089659,
"grad_norm": 0.04220706969499588,
"learning_rate": 0.00019807078175982924,
"loss": 1.551,
"step": 856
},
{
"epoch": 0.4389588795804822,
"grad_norm": 0.042153794318437576,
"learning_rate": 0.00019805420478975403,
"loss": 1.5793,
"step": 858
},
{
"epoch": 0.4399820937519985,
"grad_norm": 0.04063679277896881,
"learning_rate": 0.00019803755760417494,
"loss": 1.5404,
"step": 860
},
{
"epoch": 0.4410053079235147,
"grad_norm": 0.04740441218018532,
"learning_rate": 0.0001980208402150128,
"loss": 1.526,
"step": 862
},
{
"epoch": 0.442028522095031,
"grad_norm": 0.04050862789154053,
"learning_rate": 0.0001980040526342388,
"loss": 1.5357,
"step": 864
},
{
"epoch": 0.4430517362665473,
"grad_norm": 0.050952885299921036,
"learning_rate": 0.00019798719487387428,
"loss": 1.5102,
"step": 866
},
{
"epoch": 0.44407495043806355,
"grad_norm": 0.048501502722501755,
"learning_rate": 0.00019797026694599098,
"loss": 1.5637,
"step": 868
},
{
"epoch": 0.44509816460957985,
"grad_norm": 0.03910909220576286,
"learning_rate": 0.0001979532688627107,
"loss": 1.5367,
"step": 870
},
{
"epoch": 0.44612137878109615,
"grad_norm": 0.05638305842876434,
"learning_rate": 0.0001979362006362056,
"loss": 1.5282,
"step": 872
},
{
"epoch": 0.4471445929526124,
"grad_norm": 0.05307792127132416,
"learning_rate": 0.00019791906227869808,
"loss": 1.5467,
"step": 874
},
{
"epoch": 0.4481678071241287,
"grad_norm": 0.04324028640985489,
"learning_rate": 0.0001979018538024607,
"loss": 1.5711,
"step": 876
},
{
"epoch": 0.4491910212956449,
"grad_norm": 0.03858278691768646,
"learning_rate": 0.00019788457521981623,
"loss": 1.5561,
"step": 878
},
{
"epoch": 0.4502142354671612,
"grad_norm": 0.043761543929576874,
"learning_rate": 0.00019786722654313772,
"loss": 1.5187,
"step": 880
},
{
"epoch": 0.4512374496386775,
"grad_norm": 0.08969100564718246,
"learning_rate": 0.00019784980778484834,
"loss": 1.5486,
"step": 882
},
{
"epoch": 0.45226066381019375,
"grad_norm": 0.04808567091822624,
"learning_rate": 0.00019783231895742143,
"loss": 1.5164,
"step": 884
},
{
"epoch": 0.45328387798171005,
"grad_norm": 0.04110665246844292,
"learning_rate": 0.00019781476007338058,
"loss": 1.5177,
"step": 886
},
{
"epoch": 0.45430709215322634,
"grad_norm": 0.050568196922540665,
"learning_rate": 0.00019779713114529947,
"loss": 1.5265,
"step": 888
},
{
"epoch": 0.4553303063247426,
"grad_norm": 0.04753986746072769,
"learning_rate": 0.00019777943218580207,
"loss": 1.5304,
"step": 890
},
{
"epoch": 0.4563535204962589,
"grad_norm": 0.05155970901250839,
"learning_rate": 0.00019776166320756227,
"loss": 1.566,
"step": 892
},
{
"epoch": 0.4573767346677752,
"grad_norm": 0.048765815794467926,
"learning_rate": 0.00019774382422330433,
"loss": 1.5276,
"step": 894
},
{
"epoch": 0.4583999488392914,
"grad_norm": 0.16882531344890594,
"learning_rate": 0.0001977259152458025,
"loss": 1.5074,
"step": 896
},
{
"epoch": 0.4594231630108077,
"grad_norm": 0.04014374688267708,
"learning_rate": 0.00019770793628788122,
"loss": 1.5262,
"step": 898
},
{
"epoch": 0.46044637718232395,
"grad_norm": 0.04874645173549652,
"learning_rate": 0.000197689887362415,
"loss": 1.5158,
"step": 900
},
{
"epoch": 0.46146959135384025,
"grad_norm": 0.049459170550107956,
"learning_rate": 0.00019767176848232846,
"loss": 1.5449,
"step": 902
},
{
"epoch": 0.46249280552535654,
"grad_norm": 0.04516777768731117,
"learning_rate": 0.00019765357966059638,
"loss": 1.5722,
"step": 904
},
{
"epoch": 0.4635160196968728,
"grad_norm": 0.04243026673793793,
"learning_rate": 0.00019763532091024352,
"loss": 1.5562,
"step": 906
},
{
"epoch": 0.4645392338683891,
"grad_norm": 0.04713771492242813,
"learning_rate": 0.00019761699224434475,
"loss": 1.5425,
"step": 908
},
{
"epoch": 0.4655624480399054,
"grad_norm": 0.0495879128575325,
"learning_rate": 0.0001975985936760251,
"loss": 1.5517,
"step": 910
},
{
"epoch": 0.4665856622114216,
"grad_norm": 0.037338342517614365,
"learning_rate": 0.00019758012521845948,
"loss": 1.5923,
"step": 912
},
{
"epoch": 0.4676088763829379,
"grad_norm": 0.044082753360271454,
"learning_rate": 0.000197561586884873,
"loss": 1.5582,
"step": 914
},
{
"epoch": 0.4686320905544542,
"grad_norm": 0.045763563364744186,
"learning_rate": 0.00019754297868854073,
"loss": 1.5435,
"step": 916
},
{
"epoch": 0.46965530472597045,
"grad_norm": 0.04221731796860695,
"learning_rate": 0.00019752430064278777,
"loss": 1.5365,
"step": 918
},
{
"epoch": 0.47067851889748674,
"grad_norm": 0.04800180345773697,
"learning_rate": 0.0001975055527609893,
"loss": 1.5534,
"step": 920
},
{
"epoch": 0.471701733069003,
"grad_norm": 0.05618242546916008,
"learning_rate": 0.00019748673505657046,
"loss": 1.5568,
"step": 922
},
{
"epoch": 0.4727249472405193,
"grad_norm": 0.04696999117732048,
"learning_rate": 0.00019746784754300637,
"loss": 1.5249,
"step": 924
},
{
"epoch": 0.4737481614120356,
"grad_norm": 0.041852448135614395,
"learning_rate": 0.00019744889023382215,
"loss": 1.5415,
"step": 926
},
{
"epoch": 0.4747713755835518,
"grad_norm": 0.04743418097496033,
"learning_rate": 0.00019742986314259299,
"loss": 1.5633,
"step": 928
},
{
"epoch": 0.4757945897550681,
"grad_norm": 0.04543265700340271,
"learning_rate": 0.00019741076628294386,
"loss": 1.5261,
"step": 930
},
{
"epoch": 0.4768178039265844,
"grad_norm": 0.04992993175983429,
"learning_rate": 0.00019739159966854992,
"loss": 1.5175,
"step": 932
},
{
"epoch": 0.47784101809810064,
"grad_norm": 0.05793948844075203,
"learning_rate": 0.00019737236331313608,
"loss": 1.59,
"step": 934
},
{
"epoch": 0.47886423226961694,
"grad_norm": 0.051816169172525406,
"learning_rate": 0.00019735305723047732,
"loss": 1.5008,
"step": 936
},
{
"epoch": 0.47988744644113324,
"grad_norm": 0.04754515737295151,
"learning_rate": 0.0001973336814343985,
"loss": 1.4773,
"step": 938
},
{
"epoch": 0.4809106606126495,
"grad_norm": 0.0393076054751873,
"learning_rate": 0.0001973142359387744,
"loss": 1.5568,
"step": 940
},
{
"epoch": 0.48193387478416577,
"grad_norm": 0.04164562746882439,
"learning_rate": 0.00019729472075752974,
"loss": 1.5319,
"step": 942
},
{
"epoch": 0.482957088955682,
"grad_norm": 0.04371575266122818,
"learning_rate": 0.00019727513590463906,
"loss": 1.5571,
"step": 944
},
{
"epoch": 0.4839803031271983,
"grad_norm": 0.0573207251727581,
"learning_rate": 0.00019725548139412692,
"loss": 1.5372,
"step": 946
},
{
"epoch": 0.4850035172987146,
"grad_norm": 0.04900820180773735,
"learning_rate": 0.00019723575724006767,
"loss": 1.5327,
"step": 948
},
{
"epoch": 0.48602673147023084,
"grad_norm": 0.039241593331098557,
"learning_rate": 0.00019721596345658552,
"loss": 1.5438,
"step": 950
},
{
"epoch": 0.48704994564174714,
"grad_norm": 0.043952930718660355,
"learning_rate": 0.00019719610005785465,
"loss": 1.5577,
"step": 952
},
{
"epoch": 0.48807315981326344,
"grad_norm": 0.038709525018930435,
"learning_rate": 0.0001971761670580989,
"loss": 1.5527,
"step": 954
},
{
"epoch": 0.4890963739847797,
"grad_norm": 0.03867029398679733,
"learning_rate": 0.0001971561644715922,
"loss": 1.5329,
"step": 956
},
{
"epoch": 0.49011958815629597,
"grad_norm": 0.0413273349404335,
"learning_rate": 0.00019713609231265805,
"loss": 1.5415,
"step": 958
},
{
"epoch": 0.49114280232781227,
"grad_norm": 0.03651106357574463,
"learning_rate": 0.00019711595059566998,
"loss": 1.5596,
"step": 960
},
{
"epoch": 0.4921660164993285,
"grad_norm": 0.03891696035861969,
"learning_rate": 0.0001970957393350512,
"loss": 1.5452,
"step": 962
},
{
"epoch": 0.4931892306708448,
"grad_norm": 0.03818392753601074,
"learning_rate": 0.0001970754585452748,
"loss": 1.5821,
"step": 964
},
{
"epoch": 0.49421244484236104,
"grad_norm": 0.03790618106722832,
"learning_rate": 0.0001970551082408636,
"loss": 1.5456,
"step": 966
},
{
"epoch": 0.49523565901387734,
"grad_norm": 0.043467581272125244,
"learning_rate": 0.00019703468843639024,
"loss": 1.4916,
"step": 968
},
{
"epoch": 0.49625887318539363,
"grad_norm": 0.03895978257060051,
"learning_rate": 0.0001970141991464771,
"loss": 1.5529,
"step": 970
},
{
"epoch": 0.4972820873569099,
"grad_norm": 0.03736645728349686,
"learning_rate": 0.0001969936403857963,
"loss": 1.5243,
"step": 972
},
{
"epoch": 0.49830530152842617,
"grad_norm": 0.03589653596282005,
"learning_rate": 0.0001969730121690698,
"loss": 1.5418,
"step": 974
},
{
"epoch": 0.49932851569994247,
"grad_norm": 0.03768768534064293,
"learning_rate": 0.00019695231451106912,
"loss": 1.5114,
"step": 976
},
{
"epoch": 0.5003517298714587,
"grad_norm": 0.04931550845503807,
"learning_rate": 0.00019693154742661575,
"loss": 1.564,
"step": 978
},
{
"epoch": 0.501374944042975,
"grad_norm": 0.04325348883867264,
"learning_rate": 0.0001969107109305807,
"loss": 1.5092,
"step": 980
},
{
"epoch": 0.5023981582144913,
"grad_norm": 0.03987947851419449,
"learning_rate": 0.00019688980503788475,
"loss": 1.5222,
"step": 982
},
{
"epoch": 0.5034213723860076,
"grad_norm": 0.04482003673911095,
"learning_rate": 0.00019686882976349836,
"loss": 1.517,
"step": 984
},
{
"epoch": 0.5044445865575238,
"grad_norm": 0.04025088995695114,
"learning_rate": 0.00019684778512244172,
"loss": 1.5188,
"step": 986
},
{
"epoch": 0.5054678007290401,
"grad_norm": 0.04705490544438362,
"learning_rate": 0.00019682667112978463,
"loss": 1.5266,
"step": 988
},
{
"epoch": 0.5064910149005564,
"grad_norm": 0.0493633933365345,
"learning_rate": 0.0001968054878006466,
"loss": 1.5079,
"step": 990
},
{
"epoch": 0.5075142290720727,
"grad_norm": 0.04063592851161957,
"learning_rate": 0.00019678423515019674,
"loss": 1.5169,
"step": 992
},
{
"epoch": 0.508537443243589,
"grad_norm": 0.04962534457445145,
"learning_rate": 0.00019676291319365387,
"loss": 1.5219,
"step": 994
},
{
"epoch": 0.5095606574151051,
"grad_norm": 0.03995488956570625,
"learning_rate": 0.00019674152194628638,
"loss": 1.5397,
"step": 996
},
{
"epoch": 0.5105838715866214,
"grad_norm": 0.04593009501695633,
"learning_rate": 0.00019672006142341234,
"loss": 1.5616,
"step": 998
},
{
"epoch": 0.5116070857581377,
"grad_norm": 0.04215447977185249,
"learning_rate": 0.00019669853164039933,
"loss": 1.5425,
"step": 1000
},
{
"epoch": 0.512630299929654,
"grad_norm": 0.043728407472372055,
"learning_rate": 0.0001966769326126646,
"loss": 1.5044,
"step": 1002
},
{
"epoch": 0.5136535141011703,
"grad_norm": 0.04384353384375572,
"learning_rate": 0.00019665526435567497,
"loss": 1.5734,
"step": 1004
},
{
"epoch": 0.5146767282726866,
"grad_norm": 0.04542085528373718,
"learning_rate": 0.00019663352688494684,
"loss": 1.5023,
"step": 1006
},
{
"epoch": 0.5156999424442028,
"grad_norm": 0.05727483332157135,
"learning_rate": 0.0001966117202160462,
"loss": 1.5668,
"step": 1008
},
{
"epoch": 0.5167231566157191,
"grad_norm": 0.055995501577854156,
"learning_rate": 0.0001965898443645885,
"loss": 1.5533,
"step": 1010
},
{
"epoch": 0.5177463707872354,
"grad_norm": 0.04521145299077034,
"learning_rate": 0.00019656789934623881,
"loss": 1.5196,
"step": 1012
},
{
"epoch": 0.5187695849587517,
"grad_norm": 0.040051352232694626,
"learning_rate": 0.0001965458851767117,
"loss": 1.5293,
"step": 1014
},
{
"epoch": 0.519792799130268,
"grad_norm": 0.04483609274029732,
"learning_rate": 0.00019652380187177126,
"loss": 1.5028,
"step": 1016
},
{
"epoch": 0.5208160133017842,
"grad_norm": 0.04116397351026535,
"learning_rate": 0.00019650164944723115,
"loss": 1.5272,
"step": 1018
},
{
"epoch": 0.5218392274733005,
"grad_norm": 0.04803440347313881,
"learning_rate": 0.00019647942791895445,
"loss": 1.525,
"step": 1020
},
{
"epoch": 0.5228624416448168,
"grad_norm": 0.05390439182519913,
"learning_rate": 0.00019645713730285366,
"loss": 1.5446,
"step": 1022
},
{
"epoch": 0.5238856558163331,
"grad_norm": 0.04475432634353638,
"learning_rate": 0.00019643477761489096,
"loss": 1.5213,
"step": 1024
},
{
"epoch": 0.5249088699878494,
"grad_norm": 0.04424989968538284,
"learning_rate": 0.00019641234887107778,
"loss": 1.4888,
"step": 1026
},
{
"epoch": 0.5259320841593657,
"grad_norm": 0.049827560782432556,
"learning_rate": 0.00019638985108747515,
"loss": 1.5555,
"step": 1028
},
{
"epoch": 0.5269552983308818,
"grad_norm": 0.04092090204358101,
"learning_rate": 0.0001963672842801934,
"loss": 1.4815,
"step": 1030
},
{
"epoch": 0.5279785125023981,
"grad_norm": 0.052185434848070145,
"learning_rate": 0.00019634464846539246,
"loss": 1.5657,
"step": 1032
},
{
"epoch": 0.5290017266739144,
"grad_norm": 0.04300570487976074,
"learning_rate": 0.00019632194365928153,
"loss": 1.5259,
"step": 1034
},
{
"epoch": 0.5300249408454307,
"grad_norm": 0.04205292835831642,
"learning_rate": 0.00019629916987811926,
"loss": 1.527,
"step": 1036
},
{
"epoch": 0.531048155016947,
"grad_norm": 0.06136661395430565,
"learning_rate": 0.00019627632713821368,
"loss": 1.5541,
"step": 1038
},
{
"epoch": 0.5320713691884632,
"grad_norm": 0.03824898600578308,
"learning_rate": 0.00019625341545592226,
"loss": 1.5496,
"step": 1040
},
{
"epoch": 0.5330945833599795,
"grad_norm": 0.041780851781368256,
"learning_rate": 0.0001962304348476518,
"loss": 1.5283,
"step": 1042
},
{
"epoch": 0.5341177975314958,
"grad_norm": 0.04486005753278732,
"learning_rate": 0.0001962073853298584,
"loss": 1.5312,
"step": 1044
},
{
"epoch": 0.5351410117030121,
"grad_norm": 0.041384853422641754,
"learning_rate": 0.00019618426691904762,
"loss": 1.5011,
"step": 1046
},
{
"epoch": 0.5361642258745284,
"grad_norm": 0.0440378412604332,
"learning_rate": 0.00019616107963177425,
"loss": 1.4855,
"step": 1048
},
{
"epoch": 0.5371874400460447,
"grad_norm": 0.052033115178346634,
"learning_rate": 0.00019613782348464244,
"loss": 1.4811,
"step": 1050
},
{
"epoch": 0.5382106542175609,
"grad_norm": 0.04121650755405426,
"learning_rate": 0.00019611449849430565,
"loss": 1.5653,
"step": 1052
},
{
"epoch": 0.5392338683890772,
"grad_norm": 0.04445752128958702,
"learning_rate": 0.00019609110467746666,
"loss": 1.5098,
"step": 1054
},
{
"epoch": 0.5402570825605935,
"grad_norm": 0.06591064482927322,
"learning_rate": 0.00019606764205087757,
"loss": 1.5304,
"step": 1056
},
{
"epoch": 0.5412802967321098,
"grad_norm": 0.05301080271601677,
"learning_rate": 0.0001960441106313396,
"loss": 1.4871,
"step": 1058
},
{
"epoch": 0.542303510903626,
"grad_norm": 0.040986523032188416,
"learning_rate": 0.0001960205104357034,
"loss": 1.5195,
"step": 1060
},
{
"epoch": 0.5433267250751422,
"grad_norm": 0.03562408685684204,
"learning_rate": 0.00019599684148086878,
"loss": 1.5384,
"step": 1062
},
{
"epoch": 0.5443499392466585,
"grad_norm": 0.04383963719010353,
"learning_rate": 0.00019597310378378476,
"loss": 1.4988,
"step": 1064
},
{
"epoch": 0.5453731534181748,
"grad_norm": 0.06702277064323425,
"learning_rate": 0.00019594929736144976,
"loss": 1.4897,
"step": 1066
},
{
"epoch": 0.5463963675896911,
"grad_norm": 0.0414276085793972,
"learning_rate": 0.00019592542223091118,
"loss": 1.5049,
"step": 1068
},
{
"epoch": 0.5474195817612074,
"grad_norm": 0.0432027168571949,
"learning_rate": 0.00019590147840926577,
"loss": 1.4686,
"step": 1070
},
{
"epoch": 0.5484427959327237,
"grad_norm": 0.044036637991666794,
"learning_rate": 0.00019587746591365941,
"loss": 1.5082,
"step": 1072
},
{
"epoch": 0.5494660101042399,
"grad_norm": 0.04510560259222984,
"learning_rate": 0.0001958533847612872,
"loss": 1.5213,
"step": 1074
},
{
"epoch": 0.5504892242757562,
"grad_norm": 0.04027169942855835,
"learning_rate": 0.00019582923496939337,
"loss": 1.4952,
"step": 1076
},
{
"epoch": 0.5515124384472725,
"grad_norm": 0.08312036097049713,
"learning_rate": 0.00019580501655527133,
"loss": 1.512,
"step": 1078
},
{
"epoch": 0.5525356526187888,
"grad_norm": 0.04634568840265274,
"learning_rate": 0.00019578072953626357,
"loss": 1.5248,
"step": 1080
},
{
"epoch": 0.5535588667903051,
"grad_norm": 0.044149454683065414,
"learning_rate": 0.00019575637392976178,
"loss": 1.4911,
"step": 1082
},
{
"epoch": 0.5545820809618213,
"grad_norm": 0.04358943551778793,
"learning_rate": 0.00019573194975320673,
"loss": 1.5427,
"step": 1084
},
{
"epoch": 0.5556052951333376,
"grad_norm": 0.038042690604925156,
"learning_rate": 0.0001957074570240883,
"loss": 1.5032,
"step": 1086
},
{
"epoch": 0.5566285093048539,
"grad_norm": 0.04171706736087799,
"learning_rate": 0.00019568289575994544,
"loss": 1.493,
"step": 1088
},
{
"epoch": 0.5576517234763702,
"grad_norm": 0.04037075862288475,
"learning_rate": 0.0001956582659783662,
"loss": 1.5334,
"step": 1090
},
{
"epoch": 0.5586749376478864,
"grad_norm": 0.036902882158756256,
"learning_rate": 0.0001956335676969877,
"loss": 1.5093,
"step": 1092
},
{
"epoch": 0.5596981518194027,
"grad_norm": 0.04198329523205757,
"learning_rate": 0.00019560880093349607,
"loss": 1.5069,
"step": 1094
},
{
"epoch": 0.5607213659909189,
"grad_norm": 0.034086357802152634,
"learning_rate": 0.0001955839657056265,
"loss": 1.5101,
"step": 1096
},
{
"epoch": 0.5617445801624352,
"grad_norm": 0.03502487763762474,
"learning_rate": 0.0001955590620311633,
"loss": 1.5305,
"step": 1098
},
{
"epoch": 0.5627677943339515,
"grad_norm": 0.03580254316329956,
"learning_rate": 0.00019553408992793964,
"loss": 1.4984,
"step": 1100
},
{
"epoch": 0.5637910085054678,
"grad_norm": 0.0441250242292881,
"learning_rate": 0.00019550904941383773,
"loss": 1.4956,
"step": 1102
},
{
"epoch": 0.5648142226769841,
"grad_norm": 0.039550572633743286,
"learning_rate": 0.00019548394050678883,
"loss": 1.5041,
"step": 1104
},
{
"epoch": 0.5658374368485003,
"grad_norm": 0.03674033284187317,
"learning_rate": 0.0001954587632247732,
"loss": 1.4694,
"step": 1106
},
{
"epoch": 0.5668606510200166,
"grad_norm": 0.03579515963792801,
"learning_rate": 0.00019543351758581994,
"loss": 1.4789,
"step": 1108
},
{
"epoch": 0.5678838651915329,
"grad_norm": 0.04077816754579544,
"learning_rate": 0.0001954082036080072,
"loss": 1.5221,
"step": 1110
},
{
"epoch": 0.5689070793630492,
"grad_norm": 0.03694437816739082,
"learning_rate": 0.00019538282130946198,
"loss": 1.5273,
"step": 1112
},
{
"epoch": 0.5699302935345655,
"grad_norm": 0.03998146578669548,
"learning_rate": 0.00019535737070836028,
"loss": 1.5426,
"step": 1114
},
{
"epoch": 0.5709535077060818,
"grad_norm": 0.03823567554354668,
"learning_rate": 0.00019533185182292703,
"loss": 1.5264,
"step": 1116
},
{
"epoch": 0.571976721877598,
"grad_norm": 0.03891613706946373,
"learning_rate": 0.000195306264671436,
"loss": 1.5194,
"step": 1118
},
{
"epoch": 0.5729999360491143,
"grad_norm": 0.035352472215890884,
"learning_rate": 0.0001952806092722098,
"loss": 1.5049,
"step": 1120
},
{
"epoch": 0.5740231502206306,
"grad_norm": 0.03947431594133377,
"learning_rate": 0.00019525488564362003,
"loss": 1.5562,
"step": 1122
},
{
"epoch": 0.5750463643921468,
"grad_norm": 0.0398818701505661,
"learning_rate": 0.00019522909380408705,
"loss": 1.5216,
"step": 1124
},
{
"epoch": 0.5760695785636631,
"grad_norm": 0.03842191398143768,
"learning_rate": 0.00019520323377208017,
"loss": 1.5461,
"step": 1126
},
{
"epoch": 0.5770927927351793,
"grad_norm": 0.03299557417631149,
"learning_rate": 0.00019517730556611738,
"loss": 1.4988,
"step": 1128
},
{
"epoch": 0.5781160069066956,
"grad_norm": 0.032452985644340515,
"learning_rate": 0.00019515130920476562,
"loss": 1.4837,
"step": 1130
},
{
"epoch": 0.5791392210782119,
"grad_norm": 0.03567085042595863,
"learning_rate": 0.00019512524470664057,
"loss": 1.5081,
"step": 1132
},
{
"epoch": 0.5801624352497282,
"grad_norm": 0.04303791746497154,
"learning_rate": 0.00019509911209040676,
"loss": 1.517,
"step": 1134
},
{
"epoch": 0.5811856494212445,
"grad_norm": 0.040586575865745544,
"learning_rate": 0.00019507291137477742,
"loss": 1.5494,
"step": 1136
},
{
"epoch": 0.5822088635927608,
"grad_norm": 0.038383904844522476,
"learning_rate": 0.0001950466425785146,
"loss": 1.4641,
"step": 1138
},
{
"epoch": 0.583232077764277,
"grad_norm": 0.0484977550804615,
"learning_rate": 0.0001950203057204291,
"loss": 1.4838,
"step": 1140
},
{
"epoch": 0.5842552919357933,
"grad_norm": 0.03300706669688225,
"learning_rate": 0.00019499390081938046,
"loss": 1.4935,
"step": 1142
},
{
"epoch": 0.5852785061073096,
"grad_norm": 0.041923582553863525,
"learning_rate": 0.00019496742789427683,
"loss": 1.484,
"step": 1144
},
{
"epoch": 0.5863017202788259,
"grad_norm": 0.04476374387741089,
"learning_rate": 0.00019494088696407532,
"loss": 1.5222,
"step": 1146
},
{
"epoch": 0.5873249344503422,
"grad_norm": 0.039443958550691605,
"learning_rate": 0.00019491427804778147,
"loss": 1.4899,
"step": 1148
},
{
"epoch": 0.5883481486218584,
"grad_norm": 0.0458071269094944,
"learning_rate": 0.00019488760116444966,
"loss": 1.5006,
"step": 1150
},
{
"epoch": 0.5893713627933747,
"grad_norm": 0.04912669211626053,
"learning_rate": 0.00019486085633318293,
"loss": 1.5193,
"step": 1152
},
{
"epoch": 0.590394576964891,
"grad_norm": 0.05331273376941681,
"learning_rate": 0.00019483404357313293,
"loss": 1.5115,
"step": 1154
},
{
"epoch": 0.5914177911364072,
"grad_norm": 0.04301870986819267,
"learning_rate": 0.00019480716290349995,
"loss": 1.4997,
"step": 1156
},
{
"epoch": 0.5924410053079235,
"grad_norm": 0.042690206319093704,
"learning_rate": 0.00019478021434353297,
"loss": 1.5014,
"step": 1158
},
{
"epoch": 0.5934642194794398,
"grad_norm": 0.045416899025440216,
"learning_rate": 0.00019475319791252956,
"loss": 1.5287,
"step": 1160
},
{
"epoch": 0.594487433650956,
"grad_norm": 0.04627612978219986,
"learning_rate": 0.0001947261136298358,
"loss": 1.5238,
"step": 1162
},
{
"epoch": 0.5955106478224723,
"grad_norm": 0.0443304218351841,
"learning_rate": 0.00019469896151484654,
"loss": 1.4956,
"step": 1164
},
{
"epoch": 0.5965338619939886,
"grad_norm": 0.042293716222047806,
"learning_rate": 0.00019467174158700504,
"loss": 1.4962,
"step": 1166
},
{
"epoch": 0.5975570761655049,
"grad_norm": 0.035955190658569336,
"learning_rate": 0.0001946444538658032,
"loss": 1.4799,
"step": 1168
},
{
"epoch": 0.5985802903370212,
"grad_norm": 0.04025396704673767,
"learning_rate": 0.00019461709837078145,
"loss": 1.489,
"step": 1170
},
{
"epoch": 0.5996035045085374,
"grad_norm": 0.057371869683265686,
"learning_rate": 0.0001945896751215287,
"loss": 1.4872,
"step": 1172
},
{
"epoch": 0.6006267186800537,
"grad_norm": 0.05806579813361168,
"learning_rate": 0.0001945621841376825,
"loss": 1.5153,
"step": 1174
},
{
"epoch": 0.60164993285157,
"grad_norm": 0.03980225697159767,
"learning_rate": 0.00019453462543892882,
"loss": 1.5093,
"step": 1176
},
{
"epoch": 0.6026731470230863,
"grad_norm": 0.041456956416368484,
"learning_rate": 0.0001945069990450021,
"loss": 1.5115,
"step": 1178
},
{
"epoch": 0.6036963611946026,
"grad_norm": 0.03392681106925011,
"learning_rate": 0.00019447930497568528,
"loss": 1.4863,
"step": 1180
},
{
"epoch": 0.6047195753661189,
"grad_norm": 0.03312285616993904,
"learning_rate": 0.0001944515432508098,
"loss": 1.5321,
"step": 1182
},
{
"epoch": 0.605742789537635,
"grad_norm": 0.03741718456149101,
"learning_rate": 0.00019442371389025552,
"loss": 1.4874,
"step": 1184
},
{
"epoch": 0.6067660037091513,
"grad_norm": 0.03954221308231354,
"learning_rate": 0.00019439581691395067,
"loss": 1.5014,
"step": 1186
},
{
"epoch": 0.6077892178806676,
"grad_norm": 0.03756248950958252,
"learning_rate": 0.00019436785234187205,
"loss": 1.522,
"step": 1188
},
{
"epoch": 0.6088124320521839,
"grad_norm": 0.03895876556634903,
"learning_rate": 0.00019433982019404473,
"loss": 1.5546,
"step": 1190
},
{
"epoch": 0.6098356462237002,
"grad_norm": 0.038288913667201996,
"learning_rate": 0.0001943117204905422,
"loss": 1.4859,
"step": 1192
},
{
"epoch": 0.6108588603952164,
"grad_norm": 0.034622881561517715,
"learning_rate": 0.00019428355325148633,
"loss": 1.5246,
"step": 1194
},
{
"epoch": 0.6118820745667327,
"grad_norm": 0.04585454985499382,
"learning_rate": 0.0001942553184970474,
"loss": 1.5001,
"step": 1196
},
{
"epoch": 0.612905288738249,
"grad_norm": 0.03685140982270241,
"learning_rate": 0.00019422701624744395,
"loss": 1.5114,
"step": 1198
},
{
"epoch": 0.6139285029097653,
"grad_norm": 0.033848248422145844,
"learning_rate": 0.00019419864652294296,
"loss": 1.5047,
"step": 1200
},
{
"epoch": 0.6149517170812816,
"grad_norm": 0.03485368937253952,
"learning_rate": 0.00019417020934385962,
"loss": 1.5412,
"step": 1202
},
{
"epoch": 0.6159749312527979,
"grad_norm": 0.03737105429172516,
"learning_rate": 0.00019414170473055746,
"loss": 1.5014,
"step": 1204
},
{
"epoch": 0.6169981454243141,
"grad_norm": 0.0417652502655983,
"learning_rate": 0.00019411313270344837,
"loss": 1.4963,
"step": 1206
},
{
"epoch": 0.6180213595958304,
"grad_norm": 0.037758734077215195,
"learning_rate": 0.0001940844932829924,
"loss": 1.4935,
"step": 1208
},
{
"epoch": 0.6190445737673467,
"grad_norm": 0.03808191418647766,
"learning_rate": 0.00019405578648969796,
"loss": 1.5181,
"step": 1210
},
{
"epoch": 0.620067787938863,
"grad_norm": 0.03454340249300003,
"learning_rate": 0.00019402701234412162,
"loss": 1.493,
"step": 1212
},
{
"epoch": 0.6210910021103793,
"grad_norm": 0.03708413615822792,
"learning_rate": 0.00019399817086686826,
"loss": 1.4987,
"step": 1214
},
{
"epoch": 0.6221142162818954,
"grad_norm": 0.046957071870565414,
"learning_rate": 0.00019396926207859084,
"loss": 1.473,
"step": 1216
},
{
"epoch": 0.6231374304534117,
"grad_norm": 0.03893362358212471,
"learning_rate": 0.00019394028599999073,
"loss": 1.4915,
"step": 1218
},
{
"epoch": 0.624160644624928,
"grad_norm": 0.04247049614787102,
"learning_rate": 0.0001939112426518173,
"loss": 1.5384,
"step": 1220
},
{
"epoch": 0.6251838587964443,
"grad_norm": 0.036440882831811905,
"learning_rate": 0.00019388213205486822,
"loss": 1.5124,
"step": 1222
},
{
"epoch": 0.6262070729679606,
"grad_norm": 0.037374429404735565,
"learning_rate": 0.00019385295422998921,
"loss": 1.5244,
"step": 1224
},
{
"epoch": 0.6272302871394769,
"grad_norm": 0.0383899062871933,
"learning_rate": 0.00019382370919807419,
"loss": 1.5078,
"step": 1226
},
{
"epoch": 0.6282535013109931,
"grad_norm": 0.03726350888609886,
"learning_rate": 0.0001937943969800652,
"loss": 1.4968,
"step": 1228
},
{
"epoch": 0.6292767154825094,
"grad_norm": 0.037606336176395416,
"learning_rate": 0.0001937650175969524,
"loss": 1.4735,
"step": 1230
},
{
"epoch": 0.6302999296540257,
"grad_norm": 0.03583415970206261,
"learning_rate": 0.000193735571069774,
"loss": 1.4872,
"step": 1232
},
{
"epoch": 0.631323143825542,
"grad_norm": 0.029802750796079636,
"learning_rate": 0.00019370605741961635,
"loss": 1.5037,
"step": 1234
},
{
"epoch": 0.6323463579970583,
"grad_norm": 0.037094760686159134,
"learning_rate": 0.00019367647666761385,
"loss": 1.518,
"step": 1236
},
{
"epoch": 0.6333695721685745,
"grad_norm": 0.03802032023668289,
"learning_rate": 0.00019364682883494893,
"loss": 1.4997,
"step": 1238
},
{
"epoch": 0.6343927863400908,
"grad_norm": 0.03934174031019211,
"learning_rate": 0.00019361711394285202,
"loss": 1.5033,
"step": 1240
},
{
"epoch": 0.6354160005116071,
"grad_norm": 0.03484318405389786,
"learning_rate": 0.00019358733201260169,
"loss": 1.5068,
"step": 1242
},
{
"epoch": 0.6364392146831234,
"grad_norm": 0.03633354604244232,
"learning_rate": 0.00019355748306552442,
"loss": 1.5462,
"step": 1244
},
{
"epoch": 0.6374624288546397,
"grad_norm": 0.05548425391316414,
"learning_rate": 0.00019352756712299468,
"loss": 1.5036,
"step": 1246
},
{
"epoch": 0.638485643026156,
"grad_norm": 0.032225679606199265,
"learning_rate": 0.00019349758420643493,
"loss": 1.5026,
"step": 1248
},
{
"epoch": 0.6395088571976721,
"grad_norm": 0.03236972540616989,
"learning_rate": 0.00019346753433731564,
"loss": 1.5199,
"step": 1250
},
{
"epoch": 0.6405320713691884,
"grad_norm": 0.03576046973466873,
"learning_rate": 0.00019343741753715516,
"loss": 1.5146,
"step": 1252
},
{
"epoch": 0.6415552855407047,
"grad_norm": 0.04308708757162094,
"learning_rate": 0.00019340723382751978,
"loss": 1.5,
"step": 1254
},
{
"epoch": 0.642578499712221,
"grad_norm": 0.035895735025405884,
"learning_rate": 0.0001933769832300237,
"loss": 1.5043,
"step": 1256
},
{
"epoch": 0.6436017138837373,
"grad_norm": 0.03789574280381203,
"learning_rate": 0.00019334666576632906,
"loss": 1.4935,
"step": 1258
},
{
"epoch": 0.6446249280552535,
"grad_norm": 0.03609545901417732,
"learning_rate": 0.00019331628145814587,
"loss": 1.5296,
"step": 1260
},
{
"epoch": 0.6456481422267698,
"grad_norm": 0.0432671383023262,
"learning_rate": 0.00019328583032723193,
"loss": 1.5045,
"step": 1262
},
{
"epoch": 0.6466713563982861,
"grad_norm": 0.038937125355005264,
"learning_rate": 0.000193255312395393,
"loss": 1.4801,
"step": 1264
},
{
"epoch": 0.6476945705698024,
"grad_norm": 0.03925538435578346,
"learning_rate": 0.00019322472768448258,
"loss": 1.4903,
"step": 1266
},
{
"epoch": 0.6487177847413187,
"grad_norm": 0.03581652417778969,
"learning_rate": 0.00019319407621640208,
"loss": 1.471,
"step": 1268
},
{
"epoch": 0.649740998912835,
"grad_norm": 0.03643723577260971,
"learning_rate": 0.00019316335801310063,
"loss": 1.5019,
"step": 1270
},
{
"epoch": 0.6507642130843512,
"grad_norm": 0.03839946910738945,
"learning_rate": 0.0001931325730965752,
"loss": 1.5148,
"step": 1272
},
{
"epoch": 0.6517874272558675,
"grad_norm": 0.04306597262620926,
"learning_rate": 0.00019310172148887054,
"loss": 1.472,
"step": 1274
},
{
"epoch": 0.6528106414273838,
"grad_norm": 0.069839708507061,
"learning_rate": 0.00019307080321207912,
"loss": 1.521,
"step": 1276
},
{
"epoch": 0.6538338555989001,
"grad_norm": 0.05618079751729965,
"learning_rate": 0.00019303981828834113,
"loss": 1.5019,
"step": 1278
},
{
"epoch": 0.6548570697704164,
"grad_norm": 0.04359296336770058,
"learning_rate": 0.00019300876673984462,
"loss": 1.4676,
"step": 1280
},
{
"epoch": 0.6558802839419325,
"grad_norm": 0.038589805364608765,
"learning_rate": 0.00019297764858882514,
"loss": 1.4791,
"step": 1282
},
{
"epoch": 0.6569034981134488,
"grad_norm": 0.0316338986158371,
"learning_rate": 0.00019294646385756612,
"loss": 1.4824,
"step": 1284
},
{
"epoch": 0.6579267122849651,
"grad_norm": 0.03457920625805855,
"learning_rate": 0.00019291521256839858,
"loss": 1.4946,
"step": 1286
},
{
"epoch": 0.6589499264564814,
"grad_norm": 0.04637923464179039,
"learning_rate": 0.00019288389474370117,
"loss": 1.5049,
"step": 1288
},
{
"epoch": 0.6599731406279977,
"grad_norm": 0.05314064025878906,
"learning_rate": 0.0001928525104059003,
"loss": 1.5021,
"step": 1290
},
{
"epoch": 0.660996354799514,
"grad_norm": 0.041335079818964005,
"learning_rate": 0.00019282105957746986,
"loss": 1.4869,
"step": 1292
},
{
"epoch": 0.6620195689710302,
"grad_norm": 0.040912263095378876,
"learning_rate": 0.00019278954228093146,
"loss": 1.5168,
"step": 1294
},
{
"epoch": 0.6630427831425465,
"grad_norm": 0.037110935896635056,
"learning_rate": 0.00019275795853885433,
"loss": 1.4973,
"step": 1296
},
{
"epoch": 0.6640659973140628,
"grad_norm": 0.035204846411943436,
"learning_rate": 0.00019272630837385518,
"loss": 1.5062,
"step": 1298
},
{
"epoch": 0.6650892114855791,
"grad_norm": 0.0464470274746418,
"learning_rate": 0.0001926945918085983,
"loss": 1.5412,
"step": 1300
},
{
"epoch": 0.6661124256570954,
"grad_norm": 0.033444374799728394,
"learning_rate": 0.00019266280886579565,
"loss": 1.4799,
"step": 1302
},
{
"epoch": 0.6671356398286117,
"grad_norm": 0.036789704114198685,
"learning_rate": 0.0001926309595682066,
"loss": 1.5604,
"step": 1304
},
{
"epoch": 0.6681588540001279,
"grad_norm": 0.03726235032081604,
"learning_rate": 0.00019259904393863802,
"loss": 1.5054,
"step": 1306
},
{
"epoch": 0.6691820681716442,
"grad_norm": 0.03499661013484001,
"learning_rate": 0.00019256706199994442,
"loss": 1.5039,
"step": 1308
},
{
"epoch": 0.6702052823431605,
"grad_norm": 0.037414226680994034,
"learning_rate": 0.00019253501377502764,
"loss": 1.4952,
"step": 1310
},
{
"epoch": 0.6712284965146768,
"grad_norm": 0.041186489164829254,
"learning_rate": 0.00019250289928683705,
"loss": 1.519,
"step": 1312
},
{
"epoch": 0.672251710686193,
"grad_norm": 0.050159044563770294,
"learning_rate": 0.0001924707185583695,
"loss": 1.5112,
"step": 1314
},
{
"epoch": 0.6732749248577092,
"grad_norm": 0.05124843865633011,
"learning_rate": 0.0001924384716126692,
"loss": 1.4897,
"step": 1316
},
{
"epoch": 0.6742981390292255,
"grad_norm": 0.03580416738986969,
"learning_rate": 0.00019240615847282788,
"loss": 1.4739,
"step": 1318
},
{
"epoch": 0.6753213532007418,
"grad_norm": 0.03572642430663109,
"learning_rate": 0.00019237377916198458,
"loss": 1.4735,
"step": 1320
},
{
"epoch": 0.6763445673722581,
"grad_norm": 0.04381095990538597,
"learning_rate": 0.00019234133370332578,
"loss": 1.4817,
"step": 1322
},
{
"epoch": 0.6773677815437744,
"grad_norm": 0.03948042169213295,
"learning_rate": 0.00019230882212008528,
"loss": 1.5288,
"step": 1324
},
{
"epoch": 0.6783909957152907,
"grad_norm": 0.04092205688357353,
"learning_rate": 0.00019227624443554425,
"loss": 1.503,
"step": 1326
},
{
"epoch": 0.6794142098868069,
"grad_norm": 0.0372740812599659,
"learning_rate": 0.0001922436006730312,
"loss": 1.5186,
"step": 1328
},
{
"epoch": 0.6804374240583232,
"grad_norm": 0.03410439193248749,
"learning_rate": 0.00019221089085592202,
"loss": 1.5104,
"step": 1330
},
{
"epoch": 0.6814606382298395,
"grad_norm": 0.04406609386205673,
"learning_rate": 0.00019217811500763977,
"loss": 1.497,
"step": 1332
},
{
"epoch": 0.6824838524013558,
"grad_norm": 0.04020300507545471,
"learning_rate": 0.00019214527315165487,
"loss": 1.4589,
"step": 1334
},
{
"epoch": 0.6835070665728721,
"grad_norm": 0.03552987799048424,
"learning_rate": 0.000192112365311485,
"loss": 1.4938,
"step": 1336
},
{
"epoch": 0.6845302807443883,
"grad_norm": 0.035595186054706573,
"learning_rate": 0.00019207939151069515,
"loss": 1.4664,
"step": 1338
},
{
"epoch": 0.6855534949159046,
"grad_norm": 0.030798960477113724,
"learning_rate": 0.00019204635177289743,
"loss": 1.4786,
"step": 1340
},
{
"epoch": 0.6865767090874209,
"grad_norm": 0.03413120657205582,
"learning_rate": 0.00019201324612175123,
"loss": 1.5409,
"step": 1342
},
{
"epoch": 0.6875999232589372,
"grad_norm": 0.03786253184080124,
"learning_rate": 0.0001919800745809631,
"loss": 1.4725,
"step": 1344
},
{
"epoch": 0.6886231374304534,
"grad_norm": 0.0414445661008358,
"learning_rate": 0.00019194683717428687,
"loss": 1.4993,
"step": 1346
},
{
"epoch": 0.6896463516019697,
"grad_norm": 0.0378003790974617,
"learning_rate": 0.00019191353392552344,
"loss": 1.5225,
"step": 1348
},
{
"epoch": 0.6906695657734859,
"grad_norm": 0.0343095101416111,
"learning_rate": 0.0001918801648585209,
"loss": 1.4671,
"step": 1350
},
{
"epoch": 0.6916927799450022,
"grad_norm": 0.03458075597882271,
"learning_rate": 0.0001918467299971744,
"loss": 1.4843,
"step": 1352
},
{
"epoch": 0.6927159941165185,
"grad_norm": 0.03243357688188553,
"learning_rate": 0.00019181322936542635,
"loss": 1.494,
"step": 1354
},
{
"epoch": 0.6937392082880348,
"grad_norm": 0.03002413548529148,
"learning_rate": 0.00019177966298726613,
"loss": 1.5046,
"step": 1356
},
{
"epoch": 0.6947624224595511,
"grad_norm": 0.031211066991090775,
"learning_rate": 0.00019174603088673026,
"loss": 1.4664,
"step": 1358
},
{
"epoch": 0.6957856366310673,
"grad_norm": 0.03740109130740166,
"learning_rate": 0.00019171233308790225,
"loss": 1.4394,
"step": 1360
},
{
"epoch": 0.6968088508025836,
"grad_norm": 0.03566642478108406,
"learning_rate": 0.0001916785696149128,
"loss": 1.4935,
"step": 1362
},
{
"epoch": 0.6978320649740999,
"grad_norm": 0.033135462552309036,
"learning_rate": 0.00019164474049193948,
"loss": 1.5171,
"step": 1364
},
{
"epoch": 0.6988552791456162,
"grad_norm": 0.03240213543176651,
"learning_rate": 0.00019161084574320696,
"loss": 1.4644,
"step": 1366
},
{
"epoch": 0.6998784933171325,
"grad_norm": 0.0337255634367466,
"learning_rate": 0.0001915768853929869,
"loss": 1.4739,
"step": 1368
},
{
"epoch": 0.7009017074886488,
"grad_norm": 0.033216070383787155,
"learning_rate": 0.00019154285946559792,
"loss": 1.4691,
"step": 1370
},
{
"epoch": 0.701924921660165,
"grad_norm": 0.03151748329401016,
"learning_rate": 0.0001915087679854056,
"loss": 1.4882,
"step": 1372
},
{
"epoch": 0.7029481358316813,
"grad_norm": 0.03065643645823002,
"learning_rate": 0.00019147461097682246,
"loss": 1.4608,
"step": 1374
},
{
"epoch": 0.7039713500031975,
"grad_norm": 0.0341670848429203,
"learning_rate": 0.0001914403884643079,
"loss": 1.4714,
"step": 1376
},
{
"epoch": 0.7049945641747138,
"grad_norm": 0.035825930535793304,
"learning_rate": 0.00019140610047236833,
"loss": 1.4752,
"step": 1378
},
{
"epoch": 0.7060177783462301,
"grad_norm": 0.042743559926748276,
"learning_rate": 0.00019137174702555697,
"loss": 1.5077,
"step": 1380
},
{
"epoch": 0.7070409925177463,
"grad_norm": 0.03980020061135292,
"learning_rate": 0.00019133732814847397,
"loss": 1.4813,
"step": 1382
},
{
"epoch": 0.7080642066892626,
"grad_norm": 0.03854946047067642,
"learning_rate": 0.00019130284386576624,
"loss": 1.4623,
"step": 1384
},
{
"epoch": 0.7090874208607789,
"grad_norm": 0.037254948168992996,
"learning_rate": 0.00019126829420212764,
"loss": 1.5247,
"step": 1386
},
{
"epoch": 0.7101106350322952,
"grad_norm": 0.047802574932575226,
"learning_rate": 0.00019123367918229874,
"loss": 1.4989,
"step": 1388
},
{
"epoch": 0.7111338492038115,
"grad_norm": 0.039889827370643616,
"learning_rate": 0.000191198998831067,
"loss": 1.4727,
"step": 1390
},
{
"epoch": 0.7121570633753278,
"grad_norm": 0.03746683895587921,
"learning_rate": 0.0001911642531732666,
"loss": 1.4929,
"step": 1392
},
{
"epoch": 0.713180277546844,
"grad_norm": 0.04323015734553337,
"learning_rate": 0.00019112944223377855,
"loss": 1.4989,
"step": 1394
},
{
"epoch": 0.7142034917183603,
"grad_norm": 0.04086681455373764,
"learning_rate": 0.0001910945660375305,
"loss": 1.4884,
"step": 1396
},
{
"epoch": 0.7152267058898766,
"grad_norm": 0.03528650477528572,
"learning_rate": 0.00019105962460949698,
"loss": 1.4932,
"step": 1398
},
{
"epoch": 0.7162499200613929,
"grad_norm": 0.041061852127313614,
"learning_rate": 0.00019102461797469912,
"loss": 1.5063,
"step": 1400
},
{
"epoch": 0.7172731342329092,
"grad_norm": 0.033481474965810776,
"learning_rate": 0.00019098954615820476,
"loss": 1.4825,
"step": 1402
},
{
"epoch": 0.7182963484044254,
"grad_norm": 0.03925000876188278,
"learning_rate": 0.00019095440918512842,
"loss": 1.513,
"step": 1404
},
{
"epoch": 0.7193195625759417,
"grad_norm": 0.03856325149536133,
"learning_rate": 0.0001909192070806313,
"loss": 1.4907,
"step": 1406
},
{
"epoch": 0.720342776747458,
"grad_norm": 0.03494630753993988,
"learning_rate": 0.00019088393986992124,
"loss": 1.4604,
"step": 1408
},
{
"epoch": 0.7213659909189742,
"grad_norm": 0.03931909799575806,
"learning_rate": 0.00019084860757825268,
"loss": 1.4905,
"step": 1410
},
{
"epoch": 0.7223892050904905,
"grad_norm": 0.03644140437245369,
"learning_rate": 0.00019081321023092668,
"loss": 1.49,
"step": 1412
},
{
"epoch": 0.7234124192620068,
"grad_norm": 0.03480161353945732,
"learning_rate": 0.00019077774785329087,
"loss": 1.5301,
"step": 1414
},
{
"epoch": 0.724435633433523,
"grad_norm": 0.03516329079866409,
"learning_rate": 0.00019074222047073947,
"loss": 1.4801,
"step": 1416
},
{
"epoch": 0.7254588476050393,
"grad_norm": 0.03371971845626831,
"learning_rate": 0.00019070662810871322,
"loss": 1.4724,
"step": 1418
},
{
"epoch": 0.7264820617765556,
"grad_norm": 0.034337956458330154,
"learning_rate": 0.00019067097079269942,
"loss": 1.4726,
"step": 1420
},
{
"epoch": 0.7275052759480719,
"grad_norm": 0.0360429473221302,
"learning_rate": 0.00019063524854823186,
"loss": 1.4856,
"step": 1422
},
{
"epoch": 0.7285284901195882,
"grad_norm": 0.03850055858492851,
"learning_rate": 0.0001905994614008908,
"loss": 1.5022,
"step": 1424
},
{
"epoch": 0.7295517042911044,
"grad_norm": 0.03869333118200302,
"learning_rate": 0.0001905636093763031,
"loss": 1.4949,
"step": 1426
},
{
"epoch": 0.7305749184626207,
"grad_norm": 0.03506360575556755,
"learning_rate": 0.0001905276925001419,
"loss": 1.4617,
"step": 1428
},
{
"epoch": 0.731598132634137,
"grad_norm": 0.033819831907749176,
"learning_rate": 0.00019049171079812692,
"loss": 1.4698,
"step": 1430
},
{
"epoch": 0.7326213468056533,
"grad_norm": 0.03606401011347771,
"learning_rate": 0.00019045566429602424,
"loss": 1.5038,
"step": 1432
},
{
"epoch": 0.7336445609771696,
"grad_norm": 0.04196172207593918,
"learning_rate": 0.00019041955301964632,
"loss": 1.5142,
"step": 1434
},
{
"epoch": 0.7346677751486859,
"grad_norm": 0.03859662637114525,
"learning_rate": 0.00019038337699485208,
"loss": 1.5072,
"step": 1436
},
{
"epoch": 0.735690989320202,
"grad_norm": 0.036224085837602615,
"learning_rate": 0.00019034713624754672,
"loss": 1.5033,
"step": 1438
},
{
"epoch": 0.7367142034917183,
"grad_norm": 0.04655170813202858,
"learning_rate": 0.00019031083080368183,
"loss": 1.5255,
"step": 1440
},
{
"epoch": 0.7377374176632346,
"grad_norm": 0.040406614542007446,
"learning_rate": 0.0001902744606892554,
"loss": 1.5199,
"step": 1442
},
{
"epoch": 0.7387606318347509,
"grad_norm": 0.03488042950630188,
"learning_rate": 0.00019023802593031154,
"loss": 1.5127,
"step": 1444
},
{
"epoch": 0.7397838460062672,
"grad_norm": 0.031517501920461655,
"learning_rate": 0.00019020152655294085,
"loss": 1.4726,
"step": 1446
},
{
"epoch": 0.7408070601777834,
"grad_norm": 0.0331415981054306,
"learning_rate": 0.0001901649625832801,
"loss": 1.473,
"step": 1448
},
{
"epoch": 0.7418302743492997,
"grad_norm": 0.03110121190547943,
"learning_rate": 0.00019012833404751235,
"loss": 1.4693,
"step": 1450
},
{
"epoch": 0.742853488520816,
"grad_norm": 0.03500855341553688,
"learning_rate": 0.00019009164097186684,
"loss": 1.4962,
"step": 1452
},
{
"epoch": 0.7438767026923323,
"grad_norm": 0.03449893742799759,
"learning_rate": 0.0001900548833826191,
"loss": 1.4938,
"step": 1454
},
{
"epoch": 0.7448999168638486,
"grad_norm": 0.03199852257966995,
"learning_rate": 0.0001900180613060908,
"loss": 1.4905,
"step": 1456
},
{
"epoch": 0.7459231310353649,
"grad_norm": 0.03547672927379608,
"learning_rate": 0.00018998117476864984,
"loss": 1.4495,
"step": 1458
},
{
"epoch": 0.7469463452068811,
"grad_norm": 0.03338061273097992,
"learning_rate": 0.00018994422379671016,
"loss": 1.4895,
"step": 1460
},
{
"epoch": 0.7479695593783974,
"grad_norm": 0.036238085478544235,
"learning_rate": 0.00018990720841673207,
"loss": 1.5382,
"step": 1462
},
{
"epoch": 0.7489927735499137,
"grad_norm": 0.03941986709833145,
"learning_rate": 0.0001898701286552218,
"loss": 1.4917,
"step": 1464
},
{
"epoch": 0.75001598772143,
"grad_norm": 0.03612781688570976,
"learning_rate": 0.0001898329845387317,
"loss": 1.4856,
"step": 1466
},
{
"epoch": 0.7510392018929463,
"grad_norm": 0.035338182002305984,
"learning_rate": 0.00018979577609386033,
"loss": 1.4787,
"step": 1468
},
{
"epoch": 0.7520624160644624,
"grad_norm": 0.035387344658374786,
"learning_rate": 0.0001897585033472522,
"loss": 1.489,
"step": 1470
},
{
"epoch": 0.7530856302359787,
"grad_norm": 0.033865489065647125,
"learning_rate": 0.00018972116632559786,
"loss": 1.4958,
"step": 1472
},
{
"epoch": 0.754108844407495,
"grad_norm": 0.03240435943007469,
"learning_rate": 0.000189683765055634,
"loss": 1.48,
"step": 1474
},
{
"epoch": 0.7551320585790113,
"grad_norm": 0.0325872041285038,
"learning_rate": 0.0001896462995641432,
"loss": 1.4685,
"step": 1476
},
{
"epoch": 0.7561552727505276,
"grad_norm": 0.030261578038334846,
"learning_rate": 0.00018960876987795413,
"loss": 1.4985,
"step": 1478
},
{
"epoch": 0.7571784869220439,
"grad_norm": 0.034684158861637115,
"learning_rate": 0.0001895711760239413,
"loss": 1.4869,
"step": 1480
},
{
"epoch": 0.7582017010935601,
"grad_norm": 0.03360000252723694,
"learning_rate": 0.00018953351802902525,
"loss": 1.5089,
"step": 1482
},
{
"epoch": 0.7592249152650764,
"grad_norm": 0.03356654942035675,
"learning_rate": 0.0001894957959201725,
"loss": 1.5119,
"step": 1484
},
{
"epoch": 0.7602481294365927,
"grad_norm": 0.035596925765275955,
"learning_rate": 0.00018945800972439538,
"loss": 1.5242,
"step": 1486
},
{
"epoch": 0.761271343608109,
"grad_norm": 0.03309349715709686,
"learning_rate": 0.00018942015946875215,
"loss": 1.519,
"step": 1488
},
{
"epoch": 0.7622945577796253,
"grad_norm": 0.03727027401328087,
"learning_rate": 0.00018938224518034698,
"loss": 1.4651,
"step": 1490
},
{
"epoch": 0.7633177719511415,
"grad_norm": 0.03802427276968956,
"learning_rate": 0.00018934426688632986,
"loss": 1.4584,
"step": 1492
},
{
"epoch": 0.7643409861226578,
"grad_norm": 0.03257981687784195,
"learning_rate": 0.00018930622461389655,
"loss": 1.4622,
"step": 1494
},
{
"epoch": 0.7653642002941741,
"grad_norm": 0.03339976444840431,
"learning_rate": 0.00018926811839028876,
"loss": 1.4486,
"step": 1496
},
{
"epoch": 0.7663874144656904,
"grad_norm": 0.03176839277148247,
"learning_rate": 0.00018922994824279395,
"loss": 1.478,
"step": 1498
},
{
"epoch": 0.7674106286372067,
"grad_norm": 0.03458357974886894,
"learning_rate": 0.00018919171419874524,
"loss": 1.5167,
"step": 1500
},
{
"epoch": 0.768433842808723,
"grad_norm": 0.037736013531684875,
"learning_rate": 0.00018915341628552166,
"loss": 1.5323,
"step": 1502
},
{
"epoch": 0.7694570569802391,
"grad_norm": 0.03360259160399437,
"learning_rate": 0.00018911505453054786,
"loss": 1.469,
"step": 1504
},
{
"epoch": 0.7704802711517554,
"grad_norm": 0.03466862440109253,
"learning_rate": 0.00018907662896129433,
"loss": 1.5173,
"step": 1506
},
{
"epoch": 0.7715034853232717,
"grad_norm": 0.036147862672805786,
"learning_rate": 0.00018903813960527714,
"loss": 1.4801,
"step": 1508
},
{
"epoch": 0.772526699494788,
"grad_norm": 0.03919236734509468,
"learning_rate": 0.0001889995864900581,
"loss": 1.479,
"step": 1510
},
{
"epoch": 0.7735499136663043,
"grad_norm": 0.03543972223997116,
"learning_rate": 0.0001889609696432446,
"loss": 1.4771,
"step": 1512
},
{
"epoch": 0.7745731278378205,
"grad_norm": 0.04238108918070793,
"learning_rate": 0.00018892228909248978,
"loss": 1.4936,
"step": 1514
},
{
"epoch": 0.7755963420093368,
"grad_norm": 0.035696953535079956,
"learning_rate": 0.00018888354486549237,
"loss": 1.49,
"step": 1516
},
{
"epoch": 0.7766195561808531,
"grad_norm": 0.04000556096434593,
"learning_rate": 0.00018884473698999661,
"loss": 1.5206,
"step": 1518
},
{
"epoch": 0.7776427703523694,
"grad_norm": 0.06562638282775879,
"learning_rate": 0.0001888058654937924,
"loss": 1.4672,
"step": 1520
},
{
"epoch": 0.7786659845238857,
"grad_norm": 0.03467231243848801,
"learning_rate": 0.00018876693040471517,
"loss": 1.5033,
"step": 1522
},
{
"epoch": 0.779689198695402,
"grad_norm": 0.03708554431796074,
"learning_rate": 0.00018872793175064593,
"loss": 1.4606,
"step": 1524
},
{
"epoch": 0.7807124128669182,
"grad_norm": 0.039738163352012634,
"learning_rate": 0.00018868886955951115,
"loss": 1.4506,
"step": 1526
},
{
"epoch": 0.7817356270384345,
"grad_norm": 0.036794066429138184,
"learning_rate": 0.00018864974385928283,
"loss": 1.516,
"step": 1528
},
{
"epoch": 0.7827588412099508,
"grad_norm": 0.037196848541498184,
"learning_rate": 0.0001886105546779784,
"loss": 1.5051,
"step": 1530
},
{
"epoch": 0.7837820553814671,
"grad_norm": 0.03867275267839432,
"learning_rate": 0.00018857130204366084,
"loss": 1.5015,
"step": 1532
},
{
"epoch": 0.7848052695529834,
"grad_norm": 0.03784462809562683,
"learning_rate": 0.00018853198598443852,
"loss": 1.4713,
"step": 1534
},
{
"epoch": 0.7858284837244995,
"grad_norm": 0.04151632636785507,
"learning_rate": 0.00018849260652846519,
"loss": 1.4671,
"step": 1536
},
{
"epoch": 0.7868516978960158,
"grad_norm": 0.04655742272734642,
"learning_rate": 0.00018845316370394005,
"loss": 1.4751,
"step": 1538
},
{
"epoch": 0.7878749120675321,
"grad_norm": 0.037444863468408585,
"learning_rate": 0.00018841365753910765,
"loss": 1.5155,
"step": 1540
},
{
"epoch": 0.7888981262390484,
"grad_norm": 0.04184754192829132,
"learning_rate": 0.0001883740880622579,
"loss": 1.4717,
"step": 1542
},
{
"epoch": 0.7899213404105647,
"grad_norm": 0.042664580047130585,
"learning_rate": 0.00018833445530172605,
"loss": 1.5221,
"step": 1544
},
{
"epoch": 0.790944554582081,
"grad_norm": 0.05149197578430176,
"learning_rate": 0.00018829475928589271,
"loss": 1.4861,
"step": 1546
},
{
"epoch": 0.7919677687535972,
"grad_norm": 0.04174793139100075,
"learning_rate": 0.0001882550000431837,
"loss": 1.4887,
"step": 1548
},
{
"epoch": 0.7929909829251135,
"grad_norm": 0.03560099005699158,
"learning_rate": 0.0001882151776020702,
"loss": 1.5099,
"step": 1550
},
{
"epoch": 0.7940141970966298,
"grad_norm": 0.049874622374773026,
"learning_rate": 0.0001881752919910686,
"loss": 1.4835,
"step": 1552
},
{
"epoch": 0.7950374112681461,
"grad_norm": 0.04354040324687958,
"learning_rate": 0.0001881353432387405,
"loss": 1.4778,
"step": 1554
},
{
"epoch": 0.7960606254396624,
"grad_norm": 0.04164579510688782,
"learning_rate": 0.0001880953313736928,
"loss": 1.4968,
"step": 1556
},
{
"epoch": 0.7970838396111786,
"grad_norm": 0.034870538860559464,
"learning_rate": 0.0001880552564245775,
"loss": 1.4628,
"step": 1558
},
{
"epoch": 0.7981070537826949,
"grad_norm": 0.034135766327381134,
"learning_rate": 0.00018801511842009183,
"loss": 1.4836,
"step": 1560
},
{
"epoch": 0.7991302679542112,
"grad_norm": 0.03587375581264496,
"learning_rate": 0.00018797491738897816,
"loss": 1.4636,
"step": 1562
},
{
"epoch": 0.8001534821257275,
"grad_norm": 0.03559894114732742,
"learning_rate": 0.000187934653360024,
"loss": 1.4874,
"step": 1564
},
{
"epoch": 0.8011766962972438,
"grad_norm": 0.05410682037472725,
"learning_rate": 0.00018789432636206197,
"loss": 1.4701,
"step": 1566
},
{
"epoch": 0.80219991046876,
"grad_norm": 0.046682942658662796,
"learning_rate": 0.00018785393642396976,
"loss": 1.4993,
"step": 1568
},
{
"epoch": 0.8032231246402762,
"grad_norm": 0.03647172451019287,
"learning_rate": 0.00018781348357467013,
"loss": 1.5053,
"step": 1570
},
{
"epoch": 0.8042463388117925,
"grad_norm": 0.035208649933338165,
"learning_rate": 0.00018777296784313095,
"loss": 1.5099,
"step": 1572
},
{
"epoch": 0.8052695529833088,
"grad_norm": 0.03541814163327217,
"learning_rate": 0.00018773238925836507,
"loss": 1.5027,
"step": 1574
},
{
"epoch": 0.8062927671548251,
"grad_norm": 0.04706384614109993,
"learning_rate": 0.0001876917478494303,
"loss": 1.5111,
"step": 1576
},
{
"epoch": 0.8073159813263414,
"grad_norm": 0.042128194123506546,
"learning_rate": 0.00018765104364542955,
"loss": 1.4832,
"step": 1578
},
{
"epoch": 0.8083391954978576,
"grad_norm": 0.033496059477329254,
"learning_rate": 0.00018761027667551063,
"loss": 1.49,
"step": 1580
},
{
"epoch": 0.8093624096693739,
"grad_norm": 0.036655962467193604,
"learning_rate": 0.0001875694469688663,
"loss": 1.4835,
"step": 1582
},
{
"epoch": 0.8103856238408902,
"grad_norm": 0.036248572170734406,
"learning_rate": 0.0001875285545547342,
"loss": 1.5025,
"step": 1584
},
{
"epoch": 0.8114088380124065,
"grad_norm": 0.040282152593135834,
"learning_rate": 0.000187487599462397,
"loss": 1.4776,
"step": 1586
},
{
"epoch": 0.8124320521839228,
"grad_norm": 0.03675289452075958,
"learning_rate": 0.00018744658172118215,
"loss": 1.5036,
"step": 1588
},
{
"epoch": 0.8134552663554391,
"grad_norm": 0.03431113436818123,
"learning_rate": 0.00018740550136046196,
"loss": 1.4701,
"step": 1590
},
{
"epoch": 0.8144784805269553,
"grad_norm": 0.03184695914387703,
"learning_rate": 0.00018736435840965366,
"loss": 1.473,
"step": 1592
},
{
"epoch": 0.8155016946984716,
"grad_norm": 0.031748853623867035,
"learning_rate": 0.00018732315289821921,
"loss": 1.5039,
"step": 1594
},
{
"epoch": 0.8165249088699879,
"grad_norm": 0.034614481031894684,
"learning_rate": 0.00018728188485566544,
"loss": 1.4664,
"step": 1596
},
{
"epoch": 0.8175481230415041,
"grad_norm": 0.0308011993765831,
"learning_rate": 0.0001872405543115439,
"loss": 1.4719,
"step": 1598
},
{
"epoch": 0.8185713372130204,
"grad_norm": 0.031010661274194717,
"learning_rate": 0.00018719916129545093,
"loss": 1.4841,
"step": 1600
},
{
"epoch": 0.8195945513845366,
"grad_norm": 0.03110615722835064,
"learning_rate": 0.0001871577058370276,
"loss": 1.4878,
"step": 1602
},
{
"epoch": 0.8206177655560529,
"grad_norm": 0.030799025669693947,
"learning_rate": 0.00018711618796595972,
"loss": 1.4391,
"step": 1604
},
{
"epoch": 0.8216409797275692,
"grad_norm": 0.029373083263635635,
"learning_rate": 0.00018707460771197774,
"loss": 1.5265,
"step": 1606
},
{
"epoch": 0.8226641938990855,
"grad_norm": 0.03043638914823532,
"learning_rate": 0.0001870329651048568,
"loss": 1.5027,
"step": 1608
},
{
"epoch": 0.8236874080706018,
"grad_norm": 0.0337023101747036,
"learning_rate": 0.00018699126017441672,
"loss": 1.4793,
"step": 1610
},
{
"epoch": 0.8247106222421181,
"grad_norm": 0.03439760580658913,
"learning_rate": 0.0001869494929505219,
"loss": 1.4764,
"step": 1612
},
{
"epoch": 0.8257338364136343,
"grad_norm": 0.03283720836043358,
"learning_rate": 0.00018690766346308145,
"loss": 1.4829,
"step": 1614
},
{
"epoch": 0.8267570505851506,
"grad_norm": 0.030338643118739128,
"learning_rate": 0.00018686577174204885,
"loss": 1.4587,
"step": 1616
},
{
"epoch": 0.8277802647566669,
"grad_norm": 0.03556302934885025,
"learning_rate": 0.00018682381781742245,
"loss": 1.4924,
"step": 1618
},
{
"epoch": 0.8288034789281832,
"grad_norm": 0.032113250344991684,
"learning_rate": 0.00018678180171924485,
"loss": 1.4875,
"step": 1620
},
{
"epoch": 0.8298266930996995,
"grad_norm": 0.1559678167104721,
"learning_rate": 0.00018673972347760338,
"loss": 1.5009,
"step": 1622
},
{
"epoch": 0.8308499072712157,
"grad_norm": 0.06492070108652115,
"learning_rate": 0.00018669758312262976,
"loss": 1.4632,
"step": 1624
},
{
"epoch": 0.831873121442732,
"grad_norm": 0.05882725864648819,
"learning_rate": 0.00018665538068450023,
"loss": 1.472,
"step": 1626
},
{
"epoch": 0.8328963356142483,
"grad_norm": 0.03860605135560036,
"learning_rate": 0.00018661311619343546,
"loss": 1.4662,
"step": 1628
},
{
"epoch": 0.8339195497857645,
"grad_norm": 0.04597290977835655,
"learning_rate": 0.00018657078967970062,
"loss": 1.4706,
"step": 1630
},
{
"epoch": 0.8349427639572808,
"grad_norm": 0.04754943400621414,
"learning_rate": 0.00018652840117360517,
"loss": 1.475,
"step": 1632
},
{
"epoch": 0.8359659781287971,
"grad_norm": 0.03354303911328316,
"learning_rate": 0.0001864859507055031,
"loss": 1.5133,
"step": 1634
},
{
"epoch": 0.8369891923003133,
"grad_norm": 0.042201388627290726,
"learning_rate": 0.0001864434383057927,
"loss": 1.5125,
"step": 1636
},
{
"epoch": 0.8380124064718296,
"grad_norm": 0.0343627855181694,
"learning_rate": 0.00018640086400491658,
"loss": 1.4811,
"step": 1638
},
{
"epoch": 0.8390356206433459,
"grad_norm": 0.03558426350355148,
"learning_rate": 0.00018635822783336174,
"loss": 1.5171,
"step": 1640
},
{
"epoch": 0.8400588348148622,
"grad_norm": 0.03267373517155647,
"learning_rate": 0.00018631552982165944,
"loss": 1.4758,
"step": 1642
},
{
"epoch": 0.8410820489863785,
"grad_norm": 0.03015967085957527,
"learning_rate": 0.00018627277000038533,
"loss": 1.4501,
"step": 1644
},
{
"epoch": 0.8421052631578947,
"grad_norm": 0.03152506798505783,
"learning_rate": 0.0001862299484001591,
"loss": 1.4625,
"step": 1646
},
{
"epoch": 0.843128477329411,
"grad_norm": 0.03820090368390083,
"learning_rate": 0.0001861870650516449,
"loss": 1.5065,
"step": 1648
},
{
"epoch": 0.8441516915009273,
"grad_norm": 0.030817920342087746,
"learning_rate": 0.000186144119985551,
"loss": 1.4814,
"step": 1650
},
{
"epoch": 0.8451749056724436,
"grad_norm": 0.03546105697751045,
"learning_rate": 0.00018610111323262986,
"loss": 1.4554,
"step": 1652
},
{
"epoch": 0.8461981198439599,
"grad_norm": 0.033546384423971176,
"learning_rate": 0.00018605804482367807,
"loss": 1.4379,
"step": 1654
},
{
"epoch": 0.8472213340154762,
"grad_norm": 0.035938508808612823,
"learning_rate": 0.00018601491478953657,
"loss": 1.4931,
"step": 1656
},
{
"epoch": 0.8482445481869924,
"grad_norm": 0.03531987965106964,
"learning_rate": 0.00018597172316109015,
"loss": 1.4483,
"step": 1658
},
{
"epoch": 0.8492677623585086,
"grad_norm": 0.03041314333677292,
"learning_rate": 0.00018592846996926793,
"loss": 1.4541,
"step": 1660
},
{
"epoch": 0.850290976530025,
"grad_norm": 0.03549192473292351,
"learning_rate": 0.00018588515524504295,
"loss": 1.4615,
"step": 1662
},
{
"epoch": 0.8513141907015412,
"grad_norm": 0.03376925736665726,
"learning_rate": 0.0001858417790194325,
"loss": 1.4722,
"step": 1664
},
{
"epoch": 0.8523374048730575,
"grad_norm": 0.03313841298222542,
"learning_rate": 0.00018579834132349772,
"loss": 1.4791,
"step": 1666
},
{
"epoch": 0.8533606190445737,
"grad_norm": 0.033985435962677,
"learning_rate": 0.00018575484218834388,
"loss": 1.4443,
"step": 1668
},
{
"epoch": 0.85438383321609,
"grad_norm": 0.032460469752550125,
"learning_rate": 0.00018571128164512023,
"loss": 1.4988,
"step": 1670
},
{
"epoch": 0.8554070473876063,
"grad_norm": 0.03272455185651779,
"learning_rate": 0.00018566765972501993,
"loss": 1.4659,
"step": 1672
},
{
"epoch": 0.8564302615591226,
"grad_norm": 0.031708747148513794,
"learning_rate": 0.0001856239764592802,
"loss": 1.5007,
"step": 1674
},
{
"epoch": 0.8574534757306389,
"grad_norm": 0.034189220517873764,
"learning_rate": 0.0001855802318791821,
"loss": 1.4423,
"step": 1676
},
{
"epoch": 0.8584766899021552,
"grad_norm": 0.03221631050109863,
"learning_rate": 0.00018553642601605068,
"loss": 1.4701,
"step": 1678
},
{
"epoch": 0.8594999040736714,
"grad_norm": 0.029117561876773834,
"learning_rate": 0.00018549255890125475,
"loss": 1.4769,
"step": 1680
},
{
"epoch": 0.8605231182451877,
"grad_norm": 0.029596133157610893,
"learning_rate": 0.00018544863056620708,
"loss": 1.4635,
"step": 1682
},
{
"epoch": 0.861546332416704,
"grad_norm": 0.030032752081751823,
"learning_rate": 0.00018540464104236425,
"loss": 1.4991,
"step": 1684
},
{
"epoch": 0.8625695465882203,
"grad_norm": 0.03227202966809273,
"learning_rate": 0.00018536059036122667,
"loss": 1.4608,
"step": 1686
},
{
"epoch": 0.8635927607597366,
"grad_norm": 0.03331397473812103,
"learning_rate": 0.0001853164785543385,
"loss": 1.4958,
"step": 1688
},
{
"epoch": 0.8646159749312528,
"grad_norm": 0.033648762851953506,
"learning_rate": 0.00018527230565328778,
"loss": 1.4949,
"step": 1690
},
{
"epoch": 0.865639189102769,
"grad_norm": 0.03504339978098869,
"learning_rate": 0.00018522807168970616,
"loss": 1.439,
"step": 1692
},
{
"epoch": 0.8666624032742853,
"grad_norm": 0.034829430282115936,
"learning_rate": 0.0001851837766952691,
"loss": 1.5001,
"step": 1694
},
{
"epoch": 0.8676856174458016,
"grad_norm": 0.03803844377398491,
"learning_rate": 0.0001851394207016957,
"loss": 1.4905,
"step": 1696
},
{
"epoch": 0.8687088316173179,
"grad_norm": 0.0394139364361763,
"learning_rate": 0.00018509500374074884,
"loss": 1.4537,
"step": 1698
},
{
"epoch": 0.8697320457888342,
"grad_norm": 0.039348065853118896,
"learning_rate": 0.000185050525844235,
"loss": 1.4865,
"step": 1700
},
{
"epoch": 0.8707552599603504,
"grad_norm": 0.03650161996483803,
"learning_rate": 0.00018500598704400428,
"loss": 1.4658,
"step": 1702
},
{
"epoch": 0.8717784741318667,
"grad_norm": 0.03312232345342636,
"learning_rate": 0.00018496138737195036,
"loss": 1.477,
"step": 1704
},
{
"epoch": 0.872801688303383,
"grad_norm": 0.031243184581398964,
"learning_rate": 0.00018491672686001066,
"loss": 1.4983,
"step": 1706
},
{
"epoch": 0.8738249024748993,
"grad_norm": 0.03666044771671295,
"learning_rate": 0.00018487200554016602,
"loss": 1.4606,
"step": 1708
},
{
"epoch": 0.8748481166464156,
"grad_norm": 0.035856928676366806,
"learning_rate": 0.00018482722344444086,
"loss": 1.4808,
"step": 1710
},
{
"epoch": 0.8758713308179318,
"grad_norm": 0.03538081422448158,
"learning_rate": 0.00018478238060490312,
"loss": 1.4734,
"step": 1712
},
{
"epoch": 0.8768945449894481,
"grad_norm": 0.02917349338531494,
"learning_rate": 0.00018473747705366426,
"loss": 1.4947,
"step": 1714
},
{
"epoch": 0.8779177591609644,
"grad_norm": 0.035214658826589584,
"learning_rate": 0.0001846925128228792,
"loss": 1.4773,
"step": 1716
},
{
"epoch": 0.8789409733324807,
"grad_norm": 0.03703998774290085,
"learning_rate": 0.00018464748794474634,
"loss": 1.4704,
"step": 1718
},
{
"epoch": 0.879964187503997,
"grad_norm": 0.03480003774166107,
"learning_rate": 0.0001846024024515075,
"loss": 1.4723,
"step": 1720
},
{
"epoch": 0.8809874016755133,
"grad_norm": 0.04090346395969391,
"learning_rate": 0.00018455725637544785,
"loss": 1.4525,
"step": 1722
},
{
"epoch": 0.8820106158470294,
"grad_norm": 0.042412955313920975,
"learning_rate": 0.00018451204974889596,
"loss": 1.4418,
"step": 1724
},
{
"epoch": 0.8830338300185457,
"grad_norm": 0.03738129511475563,
"learning_rate": 0.00018446678260422385,
"loss": 1.4747,
"step": 1726
},
{
"epoch": 0.884057044190062,
"grad_norm": 0.03728758171200752,
"learning_rate": 0.00018442145497384673,
"loss": 1.5007,
"step": 1728
},
{
"epoch": 0.8850802583615783,
"grad_norm": 0.038157109171152115,
"learning_rate": 0.0001843760668902233,
"loss": 1.4937,
"step": 1730
},
{
"epoch": 0.8861034725330946,
"grad_norm": 0.03238663077354431,
"learning_rate": 0.00018433061838585534,
"loss": 1.4631,
"step": 1732
},
{
"epoch": 0.8871266867046108,
"grad_norm": 0.03741516172885895,
"learning_rate": 0.0001842851094932881,
"loss": 1.4887,
"step": 1734
},
{
"epoch": 0.8881499008761271,
"grad_norm": 0.03934532031416893,
"learning_rate": 0.00018423954024510996,
"loss": 1.4208,
"step": 1736
},
{
"epoch": 0.8891731150476434,
"grad_norm": 0.03238905593752861,
"learning_rate": 0.00018419391067395248,
"loss": 1.4587,
"step": 1738
},
{
"epoch": 0.8901963292191597,
"grad_norm": 0.039086490869522095,
"learning_rate": 0.00018414822081249058,
"loss": 1.4545,
"step": 1740
},
{
"epoch": 0.891219543390676,
"grad_norm": 0.0370473712682724,
"learning_rate": 0.00018410247069344218,
"loss": 1.4473,
"step": 1742
},
{
"epoch": 0.8922427575621923,
"grad_norm": 0.034061599522829056,
"learning_rate": 0.00018405666034956844,
"loss": 1.4831,
"step": 1744
},
{
"epoch": 0.8932659717337085,
"grad_norm": 0.0363328754901886,
"learning_rate": 0.00018401078981367363,
"loss": 1.4729,
"step": 1746
},
{
"epoch": 0.8942891859052248,
"grad_norm": 0.035310424864292145,
"learning_rate": 0.00018396485911860512,
"loss": 1.518,
"step": 1748
},
{
"epoch": 0.8953124000767411,
"grad_norm": 0.03476149961352348,
"learning_rate": 0.00018391886829725334,
"loss": 1.4611,
"step": 1750
},
{
"epoch": 0.8963356142482574,
"grad_norm": 0.03310383856296539,
"learning_rate": 0.00018387281738255185,
"loss": 1.4746,
"step": 1752
},
{
"epoch": 0.8973588284197737,
"grad_norm": 0.0307275652885437,
"learning_rate": 0.00018382670640747714,
"loss": 1.4697,
"step": 1754
},
{
"epoch": 0.8983820425912898,
"grad_norm": 0.028024040162563324,
"learning_rate": 0.00018378053540504873,
"loss": 1.4608,
"step": 1756
},
{
"epoch": 0.8994052567628061,
"grad_norm": 0.029499476775527,
"learning_rate": 0.00018373430440832923,
"loss": 1.4614,
"step": 1758
},
{
"epoch": 0.9004284709343224,
"grad_norm": 0.033067066222429276,
"learning_rate": 0.0001836880134504241,
"loss": 1.479,
"step": 1760
},
{
"epoch": 0.9014516851058387,
"grad_norm": 0.03787175565958023,
"learning_rate": 0.00018364166256448173,
"loss": 1.4712,
"step": 1762
},
{
"epoch": 0.902474899277355,
"grad_norm": 0.02690064162015915,
"learning_rate": 0.0001835952517836935,
"loss": 1.4673,
"step": 1764
},
{
"epoch": 0.9034981134488713,
"grad_norm": 0.026671042665839195,
"learning_rate": 0.00018354878114129367,
"loss": 1.4561,
"step": 1766
},
{
"epoch": 0.9045213276203875,
"grad_norm": 0.03277120366692543,
"learning_rate": 0.00018350225067055925,
"loss": 1.4879,
"step": 1768
},
{
"epoch": 0.9055445417919038,
"grad_norm": 0.03682045266032219,
"learning_rate": 0.00018345566040481028,
"loss": 1.467,
"step": 1770
},
{
"epoch": 0.9065677559634201,
"grad_norm": 0.027602965012192726,
"learning_rate": 0.0001834090103774095,
"loss": 1.4514,
"step": 1772
},
{
"epoch": 0.9075909701349364,
"grad_norm": 0.03043595515191555,
"learning_rate": 0.00018336230062176244,
"loss": 1.4835,
"step": 1774
},
{
"epoch": 0.9086141843064527,
"grad_norm": 0.030672984197735786,
"learning_rate": 0.0001833155311713174,
"loss": 1.492,
"step": 1776
},
{
"epoch": 0.9096373984779689,
"grad_norm": 0.032694920897483826,
"learning_rate": 0.00018326870205956553,
"loss": 1.475,
"step": 1778
},
{
"epoch": 0.9106606126494852,
"grad_norm": 0.031511466950178146,
"learning_rate": 0.00018322181332004056,
"loss": 1.4457,
"step": 1780
},
{
"epoch": 0.9116838268210015,
"grad_norm": 0.03155050054192543,
"learning_rate": 0.00018317486498631899,
"loss": 1.5165,
"step": 1782
},
{
"epoch": 0.9127070409925178,
"grad_norm": 0.03132548928260803,
"learning_rate": 0.00018312785709202002,
"loss": 1.5171,
"step": 1784
},
{
"epoch": 0.913730255164034,
"grad_norm": 0.036277156323194504,
"learning_rate": 0.00018308078967080546,
"loss": 1.4726,
"step": 1786
},
{
"epoch": 0.9147534693355504,
"grad_norm": 0.029615385457873344,
"learning_rate": 0.00018303366275637976,
"loss": 1.448,
"step": 1788
},
{
"epoch": 0.9157766835070665,
"grad_norm": 0.029571905732154846,
"learning_rate": 0.00018298647638248996,
"loss": 1.4629,
"step": 1790
},
{
"epoch": 0.9167998976785828,
"grad_norm": 0.028433986008167267,
"learning_rate": 0.0001829392305829257,
"loss": 1.474,
"step": 1792
},
{
"epoch": 0.9178231118500991,
"grad_norm": 0.034186169505119324,
"learning_rate": 0.0001828919253915191,
"loss": 1.4828,
"step": 1794
},
{
"epoch": 0.9188463260216154,
"grad_norm": 0.03323967382311821,
"learning_rate": 0.00018284456084214496,
"loss": 1.4883,
"step": 1796
},
{
"epoch": 0.9198695401931317,
"grad_norm": 0.03627438098192215,
"learning_rate": 0.00018279713696872047,
"loss": 1.4505,
"step": 1798
},
{
"epoch": 0.9208927543646479,
"grad_norm": 0.037414826452732086,
"learning_rate": 0.0001827496538052053,
"loss": 1.5153,
"step": 1800
},
{
"epoch": 0.9219159685361642,
"grad_norm": 0.036538898944854736,
"learning_rate": 0.00018270211138560162,
"loss": 1.4565,
"step": 1802
},
{
"epoch": 0.9229391827076805,
"grad_norm": 0.034286949783563614,
"learning_rate": 0.00018265450974395403,
"loss": 1.4596,
"step": 1804
},
{
"epoch": 0.9239623968791968,
"grad_norm": 0.03332148864865303,
"learning_rate": 0.0001826068489143495,
"loss": 1.4452,
"step": 1806
},
{
"epoch": 0.9249856110507131,
"grad_norm": 0.030349107459187508,
"learning_rate": 0.00018255912893091743,
"loss": 1.4937,
"step": 1808
},
{
"epoch": 0.9260088252222294,
"grad_norm": 0.030373625457286835,
"learning_rate": 0.00018251134982782952,
"loss": 1.4774,
"step": 1810
},
{
"epoch": 0.9270320393937456,
"grad_norm": 0.03661259636282921,
"learning_rate": 0.00018246351163929991,
"loss": 1.4694,
"step": 1812
},
{
"epoch": 0.9280552535652619,
"grad_norm": 0.036550264805555344,
"learning_rate": 0.00018241561439958495,
"loss": 1.4944,
"step": 1814
},
{
"epoch": 0.9290784677367782,
"grad_norm": 0.03492378070950508,
"learning_rate": 0.0001823676581429833,
"loss": 1.445,
"step": 1816
},
{
"epoch": 0.9301016819082945,
"grad_norm": 0.03306609019637108,
"learning_rate": 0.0001823196429038359,
"loss": 1.4222,
"step": 1818
},
{
"epoch": 0.9311248960798107,
"grad_norm": 0.03200085088610649,
"learning_rate": 0.0001822715687165259,
"loss": 1.467,
"step": 1820
},
{
"epoch": 0.9321481102513269,
"grad_norm": 0.036335378885269165,
"learning_rate": 0.00018222343561547874,
"loss": 1.4693,
"step": 1822
},
{
"epoch": 0.9331713244228432,
"grad_norm": 0.039753127843141556,
"learning_rate": 0.00018217524363516193,
"loss": 1.4594,
"step": 1824
},
{
"epoch": 0.9341945385943595,
"grad_norm": 0.03748109191656113,
"learning_rate": 0.0001821269928100852,
"loss": 1.5014,
"step": 1826
},
{
"epoch": 0.9352177527658758,
"grad_norm": 0.04106932878494263,
"learning_rate": 0.00018207868317480046,
"loss": 1.4823,
"step": 1828
},
{
"epoch": 0.9362409669373921,
"grad_norm": 0.032248884439468384,
"learning_rate": 0.00018203031476390167,
"loss": 1.4697,
"step": 1830
},
{
"epoch": 0.9372641811089084,
"grad_norm": 0.047158315777778625,
"learning_rate": 0.00018198188761202487,
"loss": 1.5449,
"step": 1832
},
{
"epoch": 0.9382873952804246,
"grad_norm": 0.03881628066301346,
"learning_rate": 0.00018193340175384824,
"loss": 1.5129,
"step": 1834
},
{
"epoch": 0.9393106094519409,
"grad_norm": 0.038932789117097855,
"learning_rate": 0.00018188485722409197,
"loss": 1.4508,
"step": 1836
},
{
"epoch": 0.9403338236234572,
"grad_norm": 0.042171675711870193,
"learning_rate": 0.00018183625405751816,
"loss": 1.4976,
"step": 1838
},
{
"epoch": 0.9413570377949735,
"grad_norm": 0.03824607655405998,
"learning_rate": 0.00018178759228893108,
"loss": 1.4759,
"step": 1840
},
{
"epoch": 0.9423802519664898,
"grad_norm": 0.0380014143884182,
"learning_rate": 0.0001817388719531768,
"loss": 1.4765,
"step": 1842
},
{
"epoch": 0.943403466138006,
"grad_norm": 0.03372355177998543,
"learning_rate": 0.00018169009308514344,
"loss": 1.4724,
"step": 1844
},
{
"epoch": 0.9444266803095223,
"grad_norm": 0.03503812104463577,
"learning_rate": 0.00018164125571976098,
"loss": 1.4537,
"step": 1846
},
{
"epoch": 0.9454498944810386,
"grad_norm": 0.03842812776565552,
"learning_rate": 0.00018159235989200132,
"loss": 1.4747,
"step": 1848
},
{
"epoch": 0.9464731086525549,
"grad_norm": 0.03686497360467911,
"learning_rate": 0.0001815434056368782,
"loss": 1.4433,
"step": 1850
},
{
"epoch": 0.9474963228240711,
"grad_norm": 0.03216801956295967,
"learning_rate": 0.00018149439298944717,
"loss": 1.4628,
"step": 1852
},
{
"epoch": 0.9485195369955874,
"grad_norm": 0.04245101660490036,
"learning_rate": 0.0001814453219848057,
"loss": 1.5411,
"step": 1854
},
{
"epoch": 0.9495427511671036,
"grad_norm": 0.041708942502737045,
"learning_rate": 0.0001813961926580929,
"loss": 1.4828,
"step": 1856
},
{
"epoch": 0.9505659653386199,
"grad_norm": 0.038249559700489044,
"learning_rate": 0.0001813470050444898,
"loss": 1.4633,
"step": 1858
},
{
"epoch": 0.9515891795101362,
"grad_norm": 0.03623546287417412,
"learning_rate": 0.00018129775917921905,
"loss": 1.4644,
"step": 1860
},
{
"epoch": 0.9526123936816525,
"grad_norm": 0.03886585682630539,
"learning_rate": 0.00018124845509754505,
"loss": 1.4642,
"step": 1862
},
{
"epoch": 0.9536356078531688,
"grad_norm": 0.03367486968636513,
"learning_rate": 0.00018119909283477394,
"loss": 1.4577,
"step": 1864
},
{
"epoch": 0.954658822024685,
"grad_norm": 0.034619078040122986,
"learning_rate": 0.00018114967242625343,
"loss": 1.4424,
"step": 1866
},
{
"epoch": 0.9556820361962013,
"grad_norm": 0.036260370165109634,
"learning_rate": 0.00018110019390737292,
"loss": 1.4749,
"step": 1868
},
{
"epoch": 0.9567052503677176,
"grad_norm": 0.037158943712711334,
"learning_rate": 0.00018105065731356343,
"loss": 1.4185,
"step": 1870
},
{
"epoch": 0.9577284645392339,
"grad_norm": 0.03858686238527298,
"learning_rate": 0.00018100106268029755,
"loss": 1.5027,
"step": 1872
},
{
"epoch": 0.9587516787107502,
"grad_norm": 0.03699406236410141,
"learning_rate": 0.00018095141004308943,
"loss": 1.4283,
"step": 1874
},
{
"epoch": 0.9597748928822665,
"grad_norm": 0.030941152945160866,
"learning_rate": 0.00018090169943749476,
"loss": 1.4729,
"step": 1876
},
{
"epoch": 0.9607981070537827,
"grad_norm": 0.03944398835301399,
"learning_rate": 0.00018085193089911075,
"loss": 1.4636,
"step": 1878
},
{
"epoch": 0.961821321225299,
"grad_norm": 0.03944871574640274,
"learning_rate": 0.00018080210446357606,
"loss": 1.4458,
"step": 1880
},
{
"epoch": 0.9628445353968152,
"grad_norm": 0.042511675506830215,
"learning_rate": 0.00018075222016657088,
"loss": 1.4868,
"step": 1882
},
{
"epoch": 0.9638677495683315,
"grad_norm": 0.036067429929971695,
"learning_rate": 0.00018070227804381674,
"loss": 1.4681,
"step": 1884
},
{
"epoch": 0.9648909637398478,
"grad_norm": 0.030013304203748703,
"learning_rate": 0.00018065227813107666,
"loss": 1.5088,
"step": 1886
},
{
"epoch": 0.965914177911364,
"grad_norm": 0.030714694410562515,
"learning_rate": 0.000180602220464155,
"loss": 1.4443,
"step": 1888
},
{
"epoch": 0.9669373920828803,
"grad_norm": 0.03553122654557228,
"learning_rate": 0.0001805521050788975,
"loss": 1.4667,
"step": 1890
},
{
"epoch": 0.9679606062543966,
"grad_norm": 0.032518330961465836,
"learning_rate": 0.0001805019320111912,
"loss": 1.4756,
"step": 1892
},
{
"epoch": 0.9689838204259129,
"grad_norm": 0.032445941120386124,
"learning_rate": 0.0001804517012969644,
"loss": 1.474,
"step": 1894
},
{
"epoch": 0.9700070345974292,
"grad_norm": 0.03390254080295563,
"learning_rate": 0.00018040141297218695,
"loss": 1.4477,
"step": 1896
},
{
"epoch": 0.9710302487689455,
"grad_norm": 0.02915276773273945,
"learning_rate": 0.00018035106707286954,
"loss": 1.4784,
"step": 1898
},
{
"epoch": 0.9720534629404617,
"grad_norm": 0.028000080958008766,
"learning_rate": 0.00018030066363506437,
"loss": 1.45,
"step": 1900
}
],
"logging_steps": 2,
"max_steps": 7816,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 7.236992921365381e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}