whiteapple8222's picture
Training in progress, step 1000, checkpoint
4a66d2d verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.4005607850991388,
"eval_steps": 250,
"global_step": 1000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0004005607850991388,
"grad_norm": 2.3918890953063965,
"learning_rate": 2e-05,
"loss": 3.099,
"step": 1
},
{
"epoch": 0.0004005607850991388,
"eval_loss": 3.081904649734497,
"eval_runtime": 32.6252,
"eval_samples_per_second": 32.245,
"eval_steps_per_second": 16.122,
"step": 1
},
{
"epoch": 0.0008011215701982776,
"grad_norm": 1.6500884294509888,
"learning_rate": 4e-05,
"loss": 2.9763,
"step": 2
},
{
"epoch": 0.0012016823552974164,
"grad_norm": 2.728886365890503,
"learning_rate": 6e-05,
"loss": 3.5098,
"step": 3
},
{
"epoch": 0.0016022431403965552,
"grad_norm": 2.935586929321289,
"learning_rate": 8e-05,
"loss": 3.7738,
"step": 4
},
{
"epoch": 0.002002803925495694,
"grad_norm": 2.857252597808838,
"learning_rate": 0.0001,
"loss": 3.2156,
"step": 5
},
{
"epoch": 0.002403364710594833,
"grad_norm": 2.191563129425049,
"learning_rate": 0.00012,
"loss": 2.8823,
"step": 6
},
{
"epoch": 0.0028039254956939716,
"grad_norm": 1.8152596950531006,
"learning_rate": 0.00014,
"loss": 2.8204,
"step": 7
},
{
"epoch": 0.0032044862807931104,
"grad_norm": 2.2765188217163086,
"learning_rate": 0.00016,
"loss": 2.9313,
"step": 8
},
{
"epoch": 0.0036050470658922492,
"grad_norm": 1.715293288230896,
"learning_rate": 0.00018,
"loss": 2.6587,
"step": 9
},
{
"epoch": 0.004005607850991388,
"grad_norm": 2.0164053440093994,
"learning_rate": 0.0002,
"loss": 2.4329,
"step": 10
},
{
"epoch": 0.004406168636090527,
"grad_norm": 1.4467110633850098,
"learning_rate": 0.00019999949650055513,
"loss": 1.8227,
"step": 11
},
{
"epoch": 0.004806729421189666,
"grad_norm": 2.2541091442108154,
"learning_rate": 0.00019999798600729064,
"loss": 2.7455,
"step": 12
},
{
"epoch": 0.0052072902062888045,
"grad_norm": 2.2924540042877197,
"learning_rate": 0.0001999954685354173,
"loss": 2.0061,
"step": 13
},
{
"epoch": 0.005607850991387943,
"grad_norm": 1.9437510967254639,
"learning_rate": 0.00019999194411028594,
"loss": 2.1656,
"step": 14
},
{
"epoch": 0.006008411776487082,
"grad_norm": 1.911136269569397,
"learning_rate": 0.00019998741276738754,
"loss": 2.1187,
"step": 15
},
{
"epoch": 0.006408972561586221,
"grad_norm": 1.9220823049545288,
"learning_rate": 0.0001999818745523526,
"loss": 2.0687,
"step": 16
},
{
"epoch": 0.00680953334668536,
"grad_norm": 1.681922197341919,
"learning_rate": 0.00019997532952095094,
"loss": 2.0196,
"step": 17
},
{
"epoch": 0.0072100941317844985,
"grad_norm": 2.091493844985962,
"learning_rate": 0.00019996777773909093,
"loss": 2.6052,
"step": 18
},
{
"epoch": 0.007610654916883637,
"grad_norm": 1.233519196510315,
"learning_rate": 0.00019995921928281894,
"loss": 2.3185,
"step": 19
},
{
"epoch": 0.008011215701982776,
"grad_norm": 2.18742299079895,
"learning_rate": 0.00019994965423831854,
"loss": 2.0989,
"step": 20
},
{
"epoch": 0.008411776487081914,
"grad_norm": 1.4857062101364136,
"learning_rate": 0.0001999390827019096,
"loss": 2.4048,
"step": 21
},
{
"epoch": 0.008812337272181054,
"grad_norm": 1.6035873889923096,
"learning_rate": 0.00019992750478004738,
"loss": 1.8086,
"step": 22
},
{
"epoch": 0.009212898057280192,
"grad_norm": 1.6860904693603516,
"learning_rate": 0.00019991492058932142,
"loss": 2.3041,
"step": 23
},
{
"epoch": 0.009613458842379331,
"grad_norm": 1.9687587022781372,
"learning_rate": 0.0001999013302564544,
"loss": 1.8581,
"step": 24
},
{
"epoch": 0.01001401962747847,
"grad_norm": 1.0902950763702393,
"learning_rate": 0.0001998867339183008,
"loss": 2.2356,
"step": 25
},
{
"epoch": 0.010414580412577609,
"grad_norm": 1.4099599123001099,
"learning_rate": 0.00019987113172184563,
"loss": 1.9318,
"step": 26
},
{
"epoch": 0.010815141197676747,
"grad_norm": 1.20814049243927,
"learning_rate": 0.00019985452382420275,
"loss": 2.5392,
"step": 27
},
{
"epoch": 0.011215701982775887,
"grad_norm": 1.4848078489303589,
"learning_rate": 0.00019983691039261357,
"loss": 1.9063,
"step": 28
},
{
"epoch": 0.011616262767875024,
"grad_norm": 1.263766884803772,
"learning_rate": 0.00019981829160444514,
"loss": 2.1898,
"step": 29
},
{
"epoch": 0.012016823552974164,
"grad_norm": 1.2411807775497437,
"learning_rate": 0.00019979866764718843,
"loss": 1.9232,
"step": 30
},
{
"epoch": 0.012417384338073302,
"grad_norm": 1.0204557180404663,
"learning_rate": 0.0001997780387184565,
"loss": 1.9399,
"step": 31
},
{
"epoch": 0.012817945123172442,
"grad_norm": 1.5857188701629639,
"learning_rate": 0.00019975640502598244,
"loss": 1.9523,
"step": 32
},
{
"epoch": 0.01321850590827158,
"grad_norm": 1.3558052778244019,
"learning_rate": 0.00019973376678761724,
"loss": 2.32,
"step": 33
},
{
"epoch": 0.01361906669337072,
"grad_norm": 1.187568187713623,
"learning_rate": 0.00019971012423132775,
"loss": 2.2765,
"step": 34
},
{
"epoch": 0.014019627478469857,
"grad_norm": 1.5870120525360107,
"learning_rate": 0.00019968547759519425,
"loss": 2.3662,
"step": 35
},
{
"epoch": 0.014420188263568997,
"grad_norm": 1.1726365089416504,
"learning_rate": 0.00019965982712740808,
"loss": 2.1026,
"step": 36
},
{
"epoch": 0.014820749048668135,
"grad_norm": 1.651104211807251,
"learning_rate": 0.00019963317308626914,
"loss": 2.0235,
"step": 37
},
{
"epoch": 0.015221309833767275,
"grad_norm": 1.4108325242996216,
"learning_rate": 0.0001996055157401834,
"loss": 2.0739,
"step": 38
},
{
"epoch": 0.015621870618866412,
"grad_norm": 1.1008392572402954,
"learning_rate": 0.00019957685536765995,
"loss": 2.2383,
"step": 39
},
{
"epoch": 0.016022431403965552,
"grad_norm": 1.4775562286376953,
"learning_rate": 0.00019954719225730847,
"loss": 1.9138,
"step": 40
},
{
"epoch": 0.016422992189064692,
"grad_norm": 1.7750931978225708,
"learning_rate": 0.00019951652670783615,
"loss": 2.2775,
"step": 41
},
{
"epoch": 0.016823552974163828,
"grad_norm": 1.2284572124481201,
"learning_rate": 0.0001994848590280447,
"loss": 2.579,
"step": 42
},
{
"epoch": 0.017224113759262968,
"grad_norm": 1.8161015510559082,
"learning_rate": 0.00019945218953682734,
"loss": 2.6322,
"step": 43
},
{
"epoch": 0.017624674544362107,
"grad_norm": 1.3858716487884521,
"learning_rate": 0.00019941851856316548,
"loss": 2.241,
"step": 44
},
{
"epoch": 0.018025235329461247,
"grad_norm": 0.994133472442627,
"learning_rate": 0.00019938384644612543,
"loss": 2.0473,
"step": 45
},
{
"epoch": 0.018425796114560383,
"grad_norm": 1.183368444442749,
"learning_rate": 0.00019934817353485501,
"loss": 1.6789,
"step": 46
},
{
"epoch": 0.018826356899659523,
"grad_norm": 1.7817606925964355,
"learning_rate": 0.00019931150018858012,
"loss": 1.7251,
"step": 47
},
{
"epoch": 0.019226917684758663,
"grad_norm": 1.1427100896835327,
"learning_rate": 0.00019927382677660088,
"loss": 1.7527,
"step": 48
},
{
"epoch": 0.019627478469857802,
"grad_norm": 1.077853798866272,
"learning_rate": 0.0001992351536782881,
"loss": 2.4137,
"step": 49
},
{
"epoch": 0.02002803925495694,
"grad_norm": 1.5589914321899414,
"learning_rate": 0.00019919548128307954,
"loss": 1.778,
"step": 50
},
{
"epoch": 0.020428600040056078,
"grad_norm": 1.5429751873016357,
"learning_rate": 0.00019915480999047573,
"loss": 2.0978,
"step": 51
},
{
"epoch": 0.020829160825155218,
"grad_norm": 1.2046990394592285,
"learning_rate": 0.00019911314021003613,
"loss": 1.9611,
"step": 52
},
{
"epoch": 0.021229721610254357,
"grad_norm": 1.5479800701141357,
"learning_rate": 0.00019907047236137498,
"loss": 2.2686,
"step": 53
},
{
"epoch": 0.021630282395353494,
"grad_norm": 1.1960687637329102,
"learning_rate": 0.00019902680687415705,
"loss": 1.8488,
"step": 54
},
{
"epoch": 0.022030843180452633,
"grad_norm": 1.1699072122573853,
"learning_rate": 0.0001989821441880933,
"loss": 1.9985,
"step": 55
},
{
"epoch": 0.022431403965551773,
"grad_norm": 1.3737468719482422,
"learning_rate": 0.00019893648475293648,
"loss": 1.9347,
"step": 56
},
{
"epoch": 0.022831964750650913,
"grad_norm": 1.416020154953003,
"learning_rate": 0.00019888982902847656,
"loss": 2.1075,
"step": 57
},
{
"epoch": 0.02323252553575005,
"grad_norm": 1.4748502969741821,
"learning_rate": 0.00019884217748453623,
"loss": 2.0286,
"step": 58
},
{
"epoch": 0.02363308632084919,
"grad_norm": 1.2293422222137451,
"learning_rate": 0.00019879353060096603,
"loss": 2.0071,
"step": 59
},
{
"epoch": 0.024033647105948328,
"grad_norm": 1.217598795890808,
"learning_rate": 0.00019874388886763944,
"loss": 1.6995,
"step": 60
},
{
"epoch": 0.024434207891047468,
"grad_norm": 1.6704044342041016,
"learning_rate": 0.00019869325278444824,
"loss": 2.1929,
"step": 61
},
{
"epoch": 0.024834768676146604,
"grad_norm": 1.6492418050765991,
"learning_rate": 0.0001986416228612972,
"loss": 2.2252,
"step": 62
},
{
"epoch": 0.025235329461245744,
"grad_norm": 1.2590185403823853,
"learning_rate": 0.00019858899961809905,
"loss": 2.079,
"step": 63
},
{
"epoch": 0.025635890246344883,
"grad_norm": 1.2326877117156982,
"learning_rate": 0.00019853538358476932,
"loss": 1.7934,
"step": 64
},
{
"epoch": 0.026036451031444023,
"grad_norm": 1.123470425605774,
"learning_rate": 0.00019848077530122083,
"loss": 1.9044,
"step": 65
},
{
"epoch": 0.02643701181654316,
"grad_norm": 1.474822998046875,
"learning_rate": 0.00019842517531735838,
"loss": 2.0033,
"step": 66
},
{
"epoch": 0.0268375726016423,
"grad_norm": 1.433060646057129,
"learning_rate": 0.00019836858419307324,
"loss": 1.9392,
"step": 67
},
{
"epoch": 0.02723813338674144,
"grad_norm": 1.131077527999878,
"learning_rate": 0.00019831100249823733,
"loss": 1.98,
"step": 68
},
{
"epoch": 0.02763869417184058,
"grad_norm": 1.2101255655288696,
"learning_rate": 0.00019825243081269774,
"loss": 2.1136,
"step": 69
},
{
"epoch": 0.028039254956939715,
"grad_norm": 1.6935054063796997,
"learning_rate": 0.00019819286972627066,
"loss": 1.9742,
"step": 70
},
{
"epoch": 0.028439815742038854,
"grad_norm": 1.0219398736953735,
"learning_rate": 0.0001981323198387356,
"loss": 2.1198,
"step": 71
},
{
"epoch": 0.028840376527137994,
"grad_norm": 1.2463573217391968,
"learning_rate": 0.00019807078175982924,
"loss": 2.2247,
"step": 72
},
{
"epoch": 0.029240937312237134,
"grad_norm": 1.3553636074066162,
"learning_rate": 0.00019800825610923934,
"loss": 2.235,
"step": 73
},
{
"epoch": 0.02964149809733627,
"grad_norm": 1.2697566747665405,
"learning_rate": 0.00019794474351659852,
"loss": 2.0797,
"step": 74
},
{
"epoch": 0.03004205888243541,
"grad_norm": 1.2558093070983887,
"learning_rate": 0.00019788024462147788,
"loss": 1.5283,
"step": 75
},
{
"epoch": 0.03044261966753455,
"grad_norm": 1.3813128471374512,
"learning_rate": 0.00019781476007338058,
"loss": 2.0476,
"step": 76
},
{
"epoch": 0.03084318045263369,
"grad_norm": 1.2129108905792236,
"learning_rate": 0.00019774829053173526,
"loss": 2.0728,
"step": 77
},
{
"epoch": 0.031243741237732825,
"grad_norm": 1.1871914863586426,
"learning_rate": 0.00019768083666588953,
"loss": 1.8535,
"step": 78
},
{
"epoch": 0.031644302022831965,
"grad_norm": 1.6774837970733643,
"learning_rate": 0.00019761239915510302,
"loss": 2.3511,
"step": 79
},
{
"epoch": 0.032044862807931104,
"grad_norm": 1.4928390979766846,
"learning_rate": 0.00019754297868854073,
"loss": 2.1108,
"step": 80
},
{
"epoch": 0.032445423593030244,
"grad_norm": 1.1172386407852173,
"learning_rate": 0.00019747257596526593,
"loss": 2.1435,
"step": 81
},
{
"epoch": 0.032845984378129384,
"grad_norm": 1.5492007732391357,
"learning_rate": 0.00019740119169423337,
"loss": 1.9114,
"step": 82
},
{
"epoch": 0.03324654516322852,
"grad_norm": 1.2928754091262817,
"learning_rate": 0.00019732882659428177,
"loss": 2.0398,
"step": 83
},
{
"epoch": 0.033647105948327656,
"grad_norm": 1.22319495677948,
"learning_rate": 0.00019725548139412692,
"loss": 1.6426,
"step": 84
},
{
"epoch": 0.034047666733426796,
"grad_norm": 1.3174762725830078,
"learning_rate": 0.00019718115683235417,
"loss": 2.0354,
"step": 85
},
{
"epoch": 0.034448227518525935,
"grad_norm": 0.8051577806472778,
"learning_rate": 0.00019710585365741103,
"loss": 2.3035,
"step": 86
},
{
"epoch": 0.034848788303625075,
"grad_norm": 1.3085061311721802,
"learning_rate": 0.00019702957262759965,
"loss": 1.8958,
"step": 87
},
{
"epoch": 0.035249349088724215,
"grad_norm": 1.5627758502960205,
"learning_rate": 0.00019695231451106912,
"loss": 2.0707,
"step": 88
},
{
"epoch": 0.035649909873823354,
"grad_norm": 1.181349277496338,
"learning_rate": 0.00019687408008580784,
"loss": 1.761,
"step": 89
},
{
"epoch": 0.036050470658922494,
"grad_norm": 1.2541050910949707,
"learning_rate": 0.00019679487013963564,
"loss": 1.9864,
"step": 90
},
{
"epoch": 0.03645103144402163,
"grad_norm": 1.2387609481811523,
"learning_rate": 0.00019671468547019573,
"loss": 1.912,
"step": 91
},
{
"epoch": 0.036851592229120766,
"grad_norm": 1.175809621810913,
"learning_rate": 0.00019663352688494684,
"loss": 2.0847,
"step": 92
},
{
"epoch": 0.037252153014219906,
"grad_norm": 1.1415826082229614,
"learning_rate": 0.0001965513952011551,
"loss": 2.1211,
"step": 93
},
{
"epoch": 0.037652713799319046,
"grad_norm": 1.2264269590377808,
"learning_rate": 0.0001964682912458856,
"loss": 1.8611,
"step": 94
},
{
"epoch": 0.038053274584418185,
"grad_norm": 1.2230157852172852,
"learning_rate": 0.00019638421585599423,
"loss": 2.3288,
"step": 95
},
{
"epoch": 0.038453835369517325,
"grad_norm": 0.8598111271858215,
"learning_rate": 0.00019629916987811926,
"loss": 1.8238,
"step": 96
},
{
"epoch": 0.038854396154616465,
"grad_norm": 1.5519123077392578,
"learning_rate": 0.00019621315416867274,
"loss": 1.8526,
"step": 97
},
{
"epoch": 0.039254956939715605,
"grad_norm": 1.3238226175308228,
"learning_rate": 0.0001961261695938319,
"loss": 2.278,
"step": 98
},
{
"epoch": 0.03965551772481474,
"grad_norm": 1.6846929788589478,
"learning_rate": 0.00019603821702953046,
"loss": 2.0994,
"step": 99
},
{
"epoch": 0.04005607850991388,
"grad_norm": 1.5321959257125854,
"learning_rate": 0.00019594929736144976,
"loss": 1.965,
"step": 100
},
{
"epoch": 0.04045663929501302,
"grad_norm": 1.0697311162948608,
"learning_rate": 0.00019585941148500985,
"loss": 2.1486,
"step": 101
},
{
"epoch": 0.040857200080112156,
"grad_norm": 1.4754281044006348,
"learning_rate": 0.00019576856030536054,
"loss": 1.7525,
"step": 102
},
{
"epoch": 0.041257760865211296,
"grad_norm": 1.2071729898452759,
"learning_rate": 0.00019567674473737218,
"loss": 1.7292,
"step": 103
},
{
"epoch": 0.041658321650310436,
"grad_norm": 1.14888596534729,
"learning_rate": 0.0001955839657056265,
"loss": 2.0676,
"step": 104
},
{
"epoch": 0.042058882435409575,
"grad_norm": 1.120998501777649,
"learning_rate": 0.0001954902241444074,
"loss": 1.7143,
"step": 105
},
{
"epoch": 0.042459443220508715,
"grad_norm": 1.1699857711791992,
"learning_rate": 0.00019539552099769126,
"loss": 2.0488,
"step": 106
},
{
"epoch": 0.04286000400560785,
"grad_norm": 1.3505367040634155,
"learning_rate": 0.00019529985721913778,
"loss": 2.1923,
"step": 107
},
{
"epoch": 0.04326056479070699,
"grad_norm": 1.2390555143356323,
"learning_rate": 0.00019520323377208017,
"loss": 2.0017,
"step": 108
},
{
"epoch": 0.04366112557580613,
"grad_norm": 1.6898435354232788,
"learning_rate": 0.00019510565162951537,
"loss": 2.0393,
"step": 109
},
{
"epoch": 0.04406168636090527,
"grad_norm": 1.3708515167236328,
"learning_rate": 0.00019500711177409454,
"loss": 1.6338,
"step": 110
},
{
"epoch": 0.044462247146004406,
"grad_norm": 1.3160320520401,
"learning_rate": 0.00019490761519811293,
"loss": 1.9489,
"step": 111
},
{
"epoch": 0.044862807931103546,
"grad_norm": 1.2316290140151978,
"learning_rate": 0.00019480716290349995,
"loss": 1.9994,
"step": 112
},
{
"epoch": 0.045263368716202686,
"grad_norm": 1.2488354444503784,
"learning_rate": 0.0001947057559018091,
"loss": 1.9349,
"step": 113
},
{
"epoch": 0.045663929501301825,
"grad_norm": 1.4348840713500977,
"learning_rate": 0.00019460339521420772,
"loss": 1.9112,
"step": 114
},
{
"epoch": 0.04606449028640096,
"grad_norm": 1.389400839805603,
"learning_rate": 0.00019450008187146684,
"loss": 1.9432,
"step": 115
},
{
"epoch": 0.0464650510715001,
"grad_norm": 1.025496006011963,
"learning_rate": 0.00019439581691395067,
"loss": 1.8639,
"step": 116
},
{
"epoch": 0.04686561185659924,
"grad_norm": 0.9544436931610107,
"learning_rate": 0.00019429060139160618,
"loss": 2.0917,
"step": 117
},
{
"epoch": 0.04726617264169838,
"grad_norm": 1.2288682460784912,
"learning_rate": 0.00019418443636395248,
"loss": 1.8996,
"step": 118
},
{
"epoch": 0.04766673342679752,
"grad_norm": 1.1020634174346924,
"learning_rate": 0.00019407732290007023,
"loss": 2.236,
"step": 119
},
{
"epoch": 0.048067294211896656,
"grad_norm": 1.2814069986343384,
"learning_rate": 0.00019396926207859084,
"loss": 2.2541,
"step": 120
},
{
"epoch": 0.048467854996995796,
"grad_norm": 1.3004281520843506,
"learning_rate": 0.00019386025498768558,
"loss": 1.9218,
"step": 121
},
{
"epoch": 0.048868415782094936,
"grad_norm": 1.1537413597106934,
"learning_rate": 0.00019375030272505463,
"loss": 1.8207,
"step": 122
},
{
"epoch": 0.04926897656719407,
"grad_norm": 1.1501256227493286,
"learning_rate": 0.00019363940639791606,
"loss": 1.9654,
"step": 123
},
{
"epoch": 0.04966953735229321,
"grad_norm": 1.511906385421753,
"learning_rate": 0.00019352756712299468,
"loss": 1.9795,
"step": 124
},
{
"epoch": 0.05007009813739235,
"grad_norm": 1.2695921659469604,
"learning_rate": 0.00019341478602651069,
"loss": 1.8491,
"step": 125
},
{
"epoch": 0.05047065892249149,
"grad_norm": 1.4410191774368286,
"learning_rate": 0.00019330106424416852,
"loss": 1.8925,
"step": 126
},
{
"epoch": 0.05087121970759063,
"grad_norm": 1.254278302192688,
"learning_rate": 0.00019318640292114524,
"loss": 1.7206,
"step": 127
},
{
"epoch": 0.05127178049268977,
"grad_norm": 1.2319607734680176,
"learning_rate": 0.00019307080321207912,
"loss": 1.7632,
"step": 128
},
{
"epoch": 0.05167234127778891,
"grad_norm": 1.116660714149475,
"learning_rate": 0.00019295426628105792,
"loss": 1.8059,
"step": 129
},
{
"epoch": 0.052072902062888046,
"grad_norm": 1.2714475393295288,
"learning_rate": 0.00019283679330160726,
"loss": 1.868,
"step": 130
},
{
"epoch": 0.05247346284798718,
"grad_norm": 1.2375353574752808,
"learning_rate": 0.00019271838545667876,
"loss": 2.3638,
"step": 131
},
{
"epoch": 0.05287402363308632,
"grad_norm": 1.0356881618499756,
"learning_rate": 0.00019259904393863802,
"loss": 1.9812,
"step": 132
},
{
"epoch": 0.05327458441818546,
"grad_norm": 1.3008970022201538,
"learning_rate": 0.00019247876994925292,
"loss": 1.6867,
"step": 133
},
{
"epoch": 0.0536751452032846,
"grad_norm": 1.1734683513641357,
"learning_rate": 0.0001923575646996811,
"loss": 2.1067,
"step": 134
},
{
"epoch": 0.05407570598838374,
"grad_norm": 1.2169734239578247,
"learning_rate": 0.00019223542941045817,
"loss": 2.0488,
"step": 135
},
{
"epoch": 0.05447626677348288,
"grad_norm": 1.3238193988800049,
"learning_rate": 0.000192112365311485,
"loss": 1.8312,
"step": 136
},
{
"epoch": 0.05487682755858202,
"grad_norm": 1.255581021308899,
"learning_rate": 0.00019198837364201585,
"loss": 1.8126,
"step": 137
},
{
"epoch": 0.05527738834368116,
"grad_norm": 1.1656538248062134,
"learning_rate": 0.00019186345565064535,
"loss": 1.8241,
"step": 138
},
{
"epoch": 0.05567794912878029,
"grad_norm": 1.4386628866195679,
"learning_rate": 0.00019173761259529633,
"loss": 1.9037,
"step": 139
},
{
"epoch": 0.05607850991387943,
"grad_norm": 1.1676979064941406,
"learning_rate": 0.00019161084574320696,
"loss": 1.9797,
"step": 140
},
{
"epoch": 0.05647907069897857,
"grad_norm": 1.1129257678985596,
"learning_rate": 0.00019148315637091803,
"loss": 1.6362,
"step": 141
},
{
"epoch": 0.05687963148407771,
"grad_norm": 1.295759677886963,
"learning_rate": 0.0001913545457642601,
"loss": 1.7671,
"step": 142
},
{
"epoch": 0.05728019226917685,
"grad_norm": 1.3850711584091187,
"learning_rate": 0.00019122501521834053,
"loss": 2.0576,
"step": 143
},
{
"epoch": 0.05768075305427599,
"grad_norm": 1.1702038049697876,
"learning_rate": 0.0001910945660375305,
"loss": 2.0554,
"step": 144
},
{
"epoch": 0.05808131383937513,
"grad_norm": 1.0817047357559204,
"learning_rate": 0.00019096319953545185,
"loss": 2.0641,
"step": 145
},
{
"epoch": 0.05848187462447427,
"grad_norm": 1.2310954332351685,
"learning_rate": 0.0001908309170349637,
"loss": 1.6907,
"step": 146
},
{
"epoch": 0.0588824354095734,
"grad_norm": 1.5341321229934692,
"learning_rate": 0.00019069771986814947,
"loss": 2.3666,
"step": 147
},
{
"epoch": 0.05928299619467254,
"grad_norm": 1.2268143892288208,
"learning_rate": 0.0001905636093763031,
"loss": 1.9265,
"step": 148
},
{
"epoch": 0.05968355697977168,
"grad_norm": 0.9649202823638916,
"learning_rate": 0.00019042858690991574,
"loss": 2.1886,
"step": 149
},
{
"epoch": 0.06008411776487082,
"grad_norm": 1.3439741134643555,
"learning_rate": 0.00019029265382866214,
"loss": 2.0593,
"step": 150
},
{
"epoch": 0.06048467854996996,
"grad_norm": 1.1927765607833862,
"learning_rate": 0.00019015581150138693,
"loss": 2.1178,
"step": 151
},
{
"epoch": 0.0608852393350691,
"grad_norm": 0.8950446844100952,
"learning_rate": 0.0001900180613060908,
"loss": 2.2191,
"step": 152
},
{
"epoch": 0.06128580012016824,
"grad_norm": 1.1860698461532593,
"learning_rate": 0.0001898794046299167,
"loss": 2.0034,
"step": 153
},
{
"epoch": 0.06168636090526738,
"grad_norm": 1.1506222486495972,
"learning_rate": 0.00018973984286913584,
"loss": 1.901,
"step": 154
},
{
"epoch": 0.06208692169036651,
"grad_norm": 1.6920007467269897,
"learning_rate": 0.00018959937742913359,
"loss": 1.9474,
"step": 155
},
{
"epoch": 0.06248748247546565,
"grad_norm": 1.2259491682052612,
"learning_rate": 0.00018945800972439538,
"loss": 2.31,
"step": 156
},
{
"epoch": 0.0628880432605648,
"grad_norm": 1.2086715698242188,
"learning_rate": 0.0001893157411784924,
"loss": 1.86,
"step": 157
},
{
"epoch": 0.06328860404566393,
"grad_norm": 1.2207906246185303,
"learning_rate": 0.00018917257322406734,
"loss": 1.8438,
"step": 158
},
{
"epoch": 0.06368916483076306,
"grad_norm": 1.1944586038589478,
"learning_rate": 0.00018902850730281992,
"loss": 1.8793,
"step": 159
},
{
"epoch": 0.06408972561586221,
"grad_norm": 1.4343067407608032,
"learning_rate": 0.00018888354486549237,
"loss": 1.9623,
"step": 160
},
{
"epoch": 0.06449028640096134,
"grad_norm": 1.1235885620117188,
"learning_rate": 0.0001887376873718548,
"loss": 2.2875,
"step": 161
},
{
"epoch": 0.06489084718606049,
"grad_norm": 1.3148598670959473,
"learning_rate": 0.00018859093629069058,
"loss": 1.7892,
"step": 162
},
{
"epoch": 0.06529140797115962,
"grad_norm": 1.112668514251709,
"learning_rate": 0.00018844329309978145,
"loss": 2.1598,
"step": 163
},
{
"epoch": 0.06569196875625877,
"grad_norm": 1.1902179718017578,
"learning_rate": 0.00018829475928589271,
"loss": 2.0822,
"step": 164
},
{
"epoch": 0.0660925295413579,
"grad_norm": 1.3088462352752686,
"learning_rate": 0.00018814533634475822,
"loss": 2.2902,
"step": 165
},
{
"epoch": 0.06649309032645705,
"grad_norm": 1.047739028930664,
"learning_rate": 0.00018799502578106534,
"loss": 2.1836,
"step": 166
},
{
"epoch": 0.06689365111155618,
"grad_norm": 1.2094810009002686,
"learning_rate": 0.00018784382910843976,
"loss": 2.0445,
"step": 167
},
{
"epoch": 0.06729421189665531,
"grad_norm": 1.3738199472427368,
"learning_rate": 0.0001876917478494303,
"loss": 1.7812,
"step": 168
},
{
"epoch": 0.06769477268175446,
"grad_norm": 1.4020622968673706,
"learning_rate": 0.00018753878353549357,
"loss": 2.2706,
"step": 169
},
{
"epoch": 0.06809533346685359,
"grad_norm": 1.0181434154510498,
"learning_rate": 0.00018738493770697852,
"loss": 1.759,
"step": 170
},
{
"epoch": 0.06849589425195274,
"grad_norm": 1.2207024097442627,
"learning_rate": 0.0001872302119131109,
"loss": 1.9571,
"step": 171
},
{
"epoch": 0.06889645503705187,
"grad_norm": 0.9442883729934692,
"learning_rate": 0.00018707460771197774,
"loss": 2.1751,
"step": 172
},
{
"epoch": 0.06929701582215102,
"grad_norm": 1.2098711729049683,
"learning_rate": 0.00018691812667051162,
"loss": 1.952,
"step": 173
},
{
"epoch": 0.06969757660725015,
"grad_norm": 1.3631396293640137,
"learning_rate": 0.00018676077036447494,
"loss": 2.0478,
"step": 174
},
{
"epoch": 0.07009813739234928,
"grad_norm": 1.0731444358825684,
"learning_rate": 0.00018660254037844388,
"loss": 1.8923,
"step": 175
},
{
"epoch": 0.07049869817744843,
"grad_norm": 1.225913166999817,
"learning_rate": 0.0001864434383057927,
"loss": 2.1391,
"step": 176
},
{
"epoch": 0.07089925896254756,
"grad_norm": 1.1960365772247314,
"learning_rate": 0.00018628346574867745,
"loss": 2.2535,
"step": 177
},
{
"epoch": 0.07129981974764671,
"grad_norm": 1.1839686632156372,
"learning_rate": 0.00018612262431802007,
"loss": 2.2424,
"step": 178
},
{
"epoch": 0.07170038053274584,
"grad_norm": 1.1837133169174194,
"learning_rate": 0.00018596091563349192,
"loss": 1.9497,
"step": 179
},
{
"epoch": 0.07210094131784499,
"grad_norm": 1.3902225494384766,
"learning_rate": 0.00018579834132349772,
"loss": 2.0473,
"step": 180
},
{
"epoch": 0.07250150210294412,
"grad_norm": 1.141578197479248,
"learning_rate": 0.0001856349030251589,
"loss": 1.6572,
"step": 181
},
{
"epoch": 0.07290206288804325,
"grad_norm": 1.1828128099441528,
"learning_rate": 0.00018547060238429736,
"loss": 1.958,
"step": 182
},
{
"epoch": 0.0733026236731424,
"grad_norm": 1.3398916721343994,
"learning_rate": 0.00018530544105541872,
"loss": 2.0078,
"step": 183
},
{
"epoch": 0.07370318445824153,
"grad_norm": 1.2211058139801025,
"learning_rate": 0.0001851394207016957,
"loss": 1.9945,
"step": 184
},
{
"epoch": 0.07410374524334068,
"grad_norm": 1.2373664379119873,
"learning_rate": 0.00018497254299495146,
"loss": 1.8566,
"step": 185
},
{
"epoch": 0.07450430602843981,
"grad_norm": 1.216086983680725,
"learning_rate": 0.0001848048096156426,
"loss": 1.7227,
"step": 186
},
{
"epoch": 0.07490486681353896,
"grad_norm": 1.2878079414367676,
"learning_rate": 0.00018463622225284242,
"loss": 2.0206,
"step": 187
},
{
"epoch": 0.07530542759863809,
"grad_norm": 0.9321051239967346,
"learning_rate": 0.00018446678260422385,
"loss": 1.784,
"step": 188
},
{
"epoch": 0.07570598838373724,
"grad_norm": 1.0686324834823608,
"learning_rate": 0.00018429649237604217,
"loss": 1.8121,
"step": 189
},
{
"epoch": 0.07610654916883637,
"grad_norm": 1.2810065746307373,
"learning_rate": 0.00018412535328311814,
"loss": 1.9002,
"step": 190
},
{
"epoch": 0.0765071099539355,
"grad_norm": 1.3205995559692383,
"learning_rate": 0.0001839533670488205,
"loss": 2.1796,
"step": 191
},
{
"epoch": 0.07690767073903465,
"grad_norm": 1.2351480722427368,
"learning_rate": 0.00018378053540504873,
"loss": 1.8489,
"step": 192
},
{
"epoch": 0.07730823152413378,
"grad_norm": 1.2140512466430664,
"learning_rate": 0.0001836068600922156,
"loss": 1.9828,
"step": 193
},
{
"epoch": 0.07770879230923293,
"grad_norm": 1.4807522296905518,
"learning_rate": 0.00018343234285922953,
"loss": 1.6552,
"step": 194
},
{
"epoch": 0.07810935309433206,
"grad_norm": 1.23876953125,
"learning_rate": 0.00018325698546347715,
"loss": 2.0277,
"step": 195
},
{
"epoch": 0.07850991387943121,
"grad_norm": 1.4449162483215332,
"learning_rate": 0.00018308078967080546,
"loss": 1.6708,
"step": 196
},
{
"epoch": 0.07891047466453034,
"grad_norm": 1.2765625715255737,
"learning_rate": 0.00018290375725550417,
"loss": 1.8713,
"step": 197
},
{
"epoch": 0.07931103544962947,
"grad_norm": 1.3653018474578857,
"learning_rate": 0.00018272589000028772,
"loss": 1.7254,
"step": 198
},
{
"epoch": 0.07971159623472862,
"grad_norm": 1.4352061748504639,
"learning_rate": 0.0001825471896962774,
"loss": 2.1107,
"step": 199
},
{
"epoch": 0.08011215701982775,
"grad_norm": 1.2856611013412476,
"learning_rate": 0.0001823676581429833,
"loss": 2.0722,
"step": 200
},
{
"epoch": 0.0805127178049269,
"grad_norm": 1.2037805318832397,
"learning_rate": 0.00018218729714828612,
"loss": 1.8017,
"step": 201
},
{
"epoch": 0.08091327859002603,
"grad_norm": 1.1805696487426758,
"learning_rate": 0.00018200610852841913,
"loss": 1.8137,
"step": 202
},
{
"epoch": 0.08131383937512518,
"grad_norm": 1.0390084981918335,
"learning_rate": 0.00018182409410794968,
"loss": 2.0199,
"step": 203
},
{
"epoch": 0.08171440016022431,
"grad_norm": 1.1457184553146362,
"learning_rate": 0.00018164125571976098,
"loss": 1.8555,
"step": 204
},
{
"epoch": 0.08211496094532346,
"grad_norm": 1.3365423679351807,
"learning_rate": 0.00018145759520503358,
"loss": 2.2639,
"step": 205
},
{
"epoch": 0.08251552173042259,
"grad_norm": 1.3933526277542114,
"learning_rate": 0.0001812731144132268,
"loss": 1.5607,
"step": 206
},
{
"epoch": 0.08291608251552172,
"grad_norm": 1.458027720451355,
"learning_rate": 0.0001810878152020602,
"loss": 2.2164,
"step": 207
},
{
"epoch": 0.08331664330062087,
"grad_norm": 1.6003340482711792,
"learning_rate": 0.00018090169943749476,
"loss": 1.9723,
"step": 208
},
{
"epoch": 0.08371720408572,
"grad_norm": 0.9654092788696289,
"learning_rate": 0.00018071476899371414,
"loss": 2.3965,
"step": 209
},
{
"epoch": 0.08411776487081915,
"grad_norm": 1.0213390588760376,
"learning_rate": 0.00018052702575310588,
"loss": 2.2219,
"step": 210
},
{
"epoch": 0.08451832565591828,
"grad_norm": 1.5746159553527832,
"learning_rate": 0.00018033847160624225,
"loss": 1.9594,
"step": 211
},
{
"epoch": 0.08491888644101743,
"grad_norm": 1.3370170593261719,
"learning_rate": 0.00018014910845186153,
"loss": 1.9862,
"step": 212
},
{
"epoch": 0.08531944722611656,
"grad_norm": 1.2249865531921387,
"learning_rate": 0.0001799589381968485,
"loss": 2.0159,
"step": 213
},
{
"epoch": 0.0857200080112157,
"grad_norm": 1.3740154504776,
"learning_rate": 0.00017976796275621555,
"loss": 2.1776,
"step": 214
},
{
"epoch": 0.08612056879631484,
"grad_norm": 1.5516133308410645,
"learning_rate": 0.00017957618405308324,
"loss": 1.917,
"step": 215
},
{
"epoch": 0.08652112958141397,
"grad_norm": 1.3436651229858398,
"learning_rate": 0.00017938360401866093,
"loss": 2.1363,
"step": 216
},
{
"epoch": 0.08692169036651312,
"grad_norm": 1.111444115638733,
"learning_rate": 0.00017919022459222752,
"loss": 2.0363,
"step": 217
},
{
"epoch": 0.08732225115161225,
"grad_norm": 1.0461078882217407,
"learning_rate": 0.00017899604772111163,
"loss": 2.0568,
"step": 218
},
{
"epoch": 0.0877228119367114,
"grad_norm": 1.086348533630371,
"learning_rate": 0.00017880107536067218,
"loss": 2.2919,
"step": 219
},
{
"epoch": 0.08812337272181053,
"grad_norm": 1.2152503728866577,
"learning_rate": 0.00017860530947427875,
"loss": 2.0589,
"step": 220
},
{
"epoch": 0.08852393350690968,
"grad_norm": 1.3051732778549194,
"learning_rate": 0.0001784087520332916,
"loss": 2.1461,
"step": 221
},
{
"epoch": 0.08892449429200881,
"grad_norm": 1.0947463512420654,
"learning_rate": 0.00017821140501704194,
"loss": 1.9385,
"step": 222
},
{
"epoch": 0.08932505507710795,
"grad_norm": 1.056276559829712,
"learning_rate": 0.00017801327041281207,
"loss": 2.5425,
"step": 223
},
{
"epoch": 0.08972561586220709,
"grad_norm": 1.5695077180862427,
"learning_rate": 0.00017781435021581527,
"loss": 2.1143,
"step": 224
},
{
"epoch": 0.09012617664730622,
"grad_norm": 1.2888416051864624,
"learning_rate": 0.0001776146464291757,
"loss": 2.0032,
"step": 225
},
{
"epoch": 0.09052673743240537,
"grad_norm": 1.258169412612915,
"learning_rate": 0.00017741416106390826,
"loss": 2.2619,
"step": 226
},
{
"epoch": 0.0909272982175045,
"grad_norm": 1.469509482383728,
"learning_rate": 0.00017721289613889835,
"loss": 1.8764,
"step": 227
},
{
"epoch": 0.09132785900260365,
"grad_norm": 1.2821156978607178,
"learning_rate": 0.00017701085368088156,
"loss": 2.0395,
"step": 228
},
{
"epoch": 0.09172841978770278,
"grad_norm": 1.2879642248153687,
"learning_rate": 0.00017680803572442318,
"loss": 2.1896,
"step": 229
},
{
"epoch": 0.09212898057280192,
"grad_norm": 1.5035954713821411,
"learning_rate": 0.0001766044443118978,
"loss": 2.0904,
"step": 230
},
{
"epoch": 0.09252954135790106,
"grad_norm": 1.167608618736267,
"learning_rate": 0.00017640008149346866,
"loss": 2.0146,
"step": 231
},
{
"epoch": 0.0929301021430002,
"grad_norm": 1.177674651145935,
"learning_rate": 0.0001761949493270671,
"loss": 1.9045,
"step": 232
},
{
"epoch": 0.09333066292809934,
"grad_norm": 1.1968990564346313,
"learning_rate": 0.0001759890498783717,
"loss": 1.8802,
"step": 233
},
{
"epoch": 0.09373122371319847,
"grad_norm": 1.1479514837265015,
"learning_rate": 0.0001757823852207877,
"loss": 1.9292,
"step": 234
},
{
"epoch": 0.09413178449829762,
"grad_norm": 1.2045358419418335,
"learning_rate": 0.00017557495743542585,
"loss": 1.9526,
"step": 235
},
{
"epoch": 0.09453234528339675,
"grad_norm": 1.094698190689087,
"learning_rate": 0.00017536676861108164,
"loss": 1.8112,
"step": 236
},
{
"epoch": 0.0949329060684959,
"grad_norm": 1.5539491176605225,
"learning_rate": 0.00017515782084421427,
"loss": 2.2441,
"step": 237
},
{
"epoch": 0.09533346685359503,
"grad_norm": 1.2758251428604126,
"learning_rate": 0.0001749481162389254,
"loss": 1.6361,
"step": 238
},
{
"epoch": 0.09573402763869417,
"grad_norm": 0.9369722604751587,
"learning_rate": 0.0001747376569069381,
"loss": 1.8394,
"step": 239
},
{
"epoch": 0.09613458842379331,
"grad_norm": 1.2912318706512451,
"learning_rate": 0.0001745264449675755,
"loss": 1.9176,
"step": 240
},
{
"epoch": 0.09653514920889245,
"grad_norm": 1.3255847692489624,
"learning_rate": 0.00017431448254773944,
"loss": 2.5196,
"step": 241
},
{
"epoch": 0.09693570999399159,
"grad_norm": 1.3182979822158813,
"learning_rate": 0.00017410177178188918,
"loss": 2.0764,
"step": 242
},
{
"epoch": 0.09733627077909073,
"grad_norm": 1.131363034248352,
"learning_rate": 0.00017388831481201977,
"loss": 1.4795,
"step": 243
},
{
"epoch": 0.09773683156418987,
"grad_norm": 1.3598371744155884,
"learning_rate": 0.0001736741137876405,
"loss": 1.9917,
"step": 244
},
{
"epoch": 0.098137392349289,
"grad_norm": 1.2983320951461792,
"learning_rate": 0.00017345917086575332,
"loss": 1.8847,
"step": 245
},
{
"epoch": 0.09853795313438814,
"grad_norm": 1.3627434968948364,
"learning_rate": 0.0001732434882108311,
"loss": 1.9239,
"step": 246
},
{
"epoch": 0.09893851391948728,
"grad_norm": 1.4476374387741089,
"learning_rate": 0.00017302706799479574,
"loss": 1.9497,
"step": 247
},
{
"epoch": 0.09933907470458642,
"grad_norm": 1.002682089805603,
"learning_rate": 0.00017280991239699642,
"loss": 2.3343,
"step": 248
},
{
"epoch": 0.09973963548968556,
"grad_norm": 1.120917558670044,
"learning_rate": 0.00017259202360418762,
"loss": 1.9538,
"step": 249
},
{
"epoch": 0.1001401962747847,
"grad_norm": 1.4410195350646973,
"learning_rate": 0.00017237340381050703,
"loss": 2.1967,
"step": 250
},
{
"epoch": 0.1001401962747847,
"eval_loss": 1.95481538772583,
"eval_runtime": 32.8702,
"eval_samples_per_second": 32.005,
"eval_steps_per_second": 16.002,
"step": 250
},
{
"epoch": 0.10054075705988384,
"grad_norm": 1.3339630365371704,
"learning_rate": 0.00017215405521745357,
"loss": 2.1258,
"step": 251
},
{
"epoch": 0.10094131784498298,
"grad_norm": 1.1717379093170166,
"learning_rate": 0.0001719339800338651,
"loss": 2.0449,
"step": 252
},
{
"epoch": 0.10134187863008212,
"grad_norm": 1.1957095861434937,
"learning_rate": 0.00017171318047589637,
"loss": 1.9656,
"step": 253
},
{
"epoch": 0.10174243941518125,
"grad_norm": 1.501266598701477,
"learning_rate": 0.00017149165876699635,
"loss": 2.459,
"step": 254
},
{
"epoch": 0.10214300020028039,
"grad_norm": 0.8971200585365295,
"learning_rate": 0.00017126941713788632,
"loss": 1.7243,
"step": 255
},
{
"epoch": 0.10254356098537953,
"grad_norm": 1.4776593446731567,
"learning_rate": 0.0001710464578265369,
"loss": 2.0676,
"step": 256
},
{
"epoch": 0.10294412177047867,
"grad_norm": 0.9212047457695007,
"learning_rate": 0.00017082278307814592,
"loss": 1.9708,
"step": 257
},
{
"epoch": 0.10334468255557781,
"grad_norm": 0.9782827496528625,
"learning_rate": 0.00017059839514511565,
"loss": 1.8311,
"step": 258
},
{
"epoch": 0.10374524334067695,
"grad_norm": 1.4289268255233765,
"learning_rate": 0.00017037329628703004,
"loss": 2.2642,
"step": 259
},
{
"epoch": 0.10414580412577609,
"grad_norm": 1.0863878726959229,
"learning_rate": 0.00017014748877063214,
"loss": 1.8184,
"step": 260
},
{
"epoch": 0.10454636491087523,
"grad_norm": 1.4105286598205566,
"learning_rate": 0.00016992097486980107,
"loss": 1.7869,
"step": 261
},
{
"epoch": 0.10494692569597436,
"grad_norm": 1.217506766319275,
"learning_rate": 0.00016969375686552937,
"loss": 2.188,
"step": 262
},
{
"epoch": 0.1053474864810735,
"grad_norm": 1.2353211641311646,
"learning_rate": 0.00016946583704589973,
"loss": 1.9721,
"step": 263
},
{
"epoch": 0.10574804726617264,
"grad_norm": 0.9625990390777588,
"learning_rate": 0.00016923721770606228,
"loss": 1.6519,
"step": 264
},
{
"epoch": 0.10614860805127178,
"grad_norm": 1.0968215465545654,
"learning_rate": 0.00016900790114821122,
"loss": 2.0124,
"step": 265
},
{
"epoch": 0.10654916883637092,
"grad_norm": 1.0629442930221558,
"learning_rate": 0.0001687778896815617,
"loss": 2.3657,
"step": 266
},
{
"epoch": 0.10694972962147006,
"grad_norm": 1.5048670768737793,
"learning_rate": 0.00016854718562232668,
"loss": 2.0883,
"step": 267
},
{
"epoch": 0.1073502904065692,
"grad_norm": 1.4587311744689941,
"learning_rate": 0.00016831579129369346,
"loss": 1.8842,
"step": 268
},
{
"epoch": 0.10775085119166834,
"grad_norm": 1.3808192014694214,
"learning_rate": 0.00016808370902580036,
"loss": 1.9145,
"step": 269
},
{
"epoch": 0.10815141197676748,
"grad_norm": 1.2429652214050293,
"learning_rate": 0.00016785094115571322,
"loss": 2.3074,
"step": 270
},
{
"epoch": 0.10855197276186661,
"grad_norm": 1.3593460321426392,
"learning_rate": 0.00016761749002740193,
"loss": 1.8074,
"step": 271
},
{
"epoch": 0.10895253354696575,
"grad_norm": 1.303536057472229,
"learning_rate": 0.00016738335799171682,
"loss": 1.9934,
"step": 272
},
{
"epoch": 0.10935309433206489,
"grad_norm": 0.9886314272880554,
"learning_rate": 0.00016714854740636478,
"loss": 2.0478,
"step": 273
},
{
"epoch": 0.10975365511716403,
"grad_norm": 1.2797132730484009,
"learning_rate": 0.00016691306063588583,
"loss": 1.6133,
"step": 274
},
{
"epoch": 0.11015421590226317,
"grad_norm": 0.9066633582115173,
"learning_rate": 0.00016667690005162916,
"loss": 2.1399,
"step": 275
},
{
"epoch": 0.11055477668736231,
"grad_norm": 1.3959397077560425,
"learning_rate": 0.00016644006803172924,
"loss": 2.0478,
"step": 276
},
{
"epoch": 0.11095533747246145,
"grad_norm": 1.2920677661895752,
"learning_rate": 0.00016620256696108188,
"loss": 2.0572,
"step": 277
},
{
"epoch": 0.11135589825756058,
"grad_norm": 1.048725962638855,
"learning_rate": 0.00016596439923132017,
"loss": 2.1255,
"step": 278
},
{
"epoch": 0.11175645904265973,
"grad_norm": 1.2046992778778076,
"learning_rate": 0.00016572556724079056,
"loss": 2.0455,
"step": 279
},
{
"epoch": 0.11215701982775886,
"grad_norm": 1.0558348894119263,
"learning_rate": 0.00016548607339452853,
"loss": 1.8228,
"step": 280
},
{
"epoch": 0.112557580612858,
"grad_norm": 1.1831914186477661,
"learning_rate": 0.00016524592010423443,
"loss": 1.7431,
"step": 281
},
{
"epoch": 0.11295814139795714,
"grad_norm": 1.540031909942627,
"learning_rate": 0.00016500510978824926,
"loss": 2.0272,
"step": 282
},
{
"epoch": 0.11335870218305628,
"grad_norm": 1.2096210718154907,
"learning_rate": 0.00016476364487153023,
"loss": 1.77,
"step": 283
},
{
"epoch": 0.11375926296815542,
"grad_norm": 0.9515264630317688,
"learning_rate": 0.0001645215277856263,
"loss": 1.9224,
"step": 284
},
{
"epoch": 0.11415982375325455,
"grad_norm": 0.9934387803077698,
"learning_rate": 0.00016427876096865394,
"loss": 2.0196,
"step": 285
},
{
"epoch": 0.1145603845383537,
"grad_norm": 1.2015880346298218,
"learning_rate": 0.00016403534686527225,
"loss": 1.6666,
"step": 286
},
{
"epoch": 0.11496094532345283,
"grad_norm": 1.052933931350708,
"learning_rate": 0.00016379128792665855,
"loss": 2.2045,
"step": 287
},
{
"epoch": 0.11536150610855198,
"grad_norm": 1.562372088432312,
"learning_rate": 0.00016354658661048364,
"loss": 2.3048,
"step": 288
},
{
"epoch": 0.11576206689365111,
"grad_norm": 1.3387340307235718,
"learning_rate": 0.00016330124538088705,
"loss": 2.1904,
"step": 289
},
{
"epoch": 0.11616262767875025,
"grad_norm": 0.9825554490089417,
"learning_rate": 0.00016305526670845226,
"loss": 2.1263,
"step": 290
},
{
"epoch": 0.11656318846384939,
"grad_norm": 1.253036379814148,
"learning_rate": 0.00016280865307018177,
"loss": 1.7718,
"step": 291
},
{
"epoch": 0.11696374924894853,
"grad_norm": 1.3797401189804077,
"learning_rate": 0.00016256140694947217,
"loss": 1.8047,
"step": 292
},
{
"epoch": 0.11736431003404767,
"grad_norm": 1.1455391645431519,
"learning_rate": 0.00016231353083608912,
"loss": 2.0973,
"step": 293
},
{
"epoch": 0.1177648708191468,
"grad_norm": 1.6505376100540161,
"learning_rate": 0.00016206502722614238,
"loss": 2.0752,
"step": 294
},
{
"epoch": 0.11816543160424595,
"grad_norm": 1.3806008100509644,
"learning_rate": 0.00016181589862206052,
"loss": 2.0264,
"step": 295
},
{
"epoch": 0.11856599238934508,
"grad_norm": 1.1952394247055054,
"learning_rate": 0.0001615661475325658,
"loss": 2.4044,
"step": 296
},
{
"epoch": 0.11896655317444423,
"grad_norm": 1.2005327939987183,
"learning_rate": 0.00016131577647264902,
"loss": 1.9808,
"step": 297
},
{
"epoch": 0.11936711395954336,
"grad_norm": 1.0058878660202026,
"learning_rate": 0.00016106478796354382,
"loss": 2.0834,
"step": 298
},
{
"epoch": 0.1197676747446425,
"grad_norm": 1.2255983352661133,
"learning_rate": 0.0001608131845327018,
"loss": 1.7139,
"step": 299
},
{
"epoch": 0.12016823552974164,
"grad_norm": 1.3495612144470215,
"learning_rate": 0.00016056096871376667,
"loss": 1.6639,
"step": 300
},
{
"epoch": 0.12056879631484077,
"grad_norm": 1.1183319091796875,
"learning_rate": 0.00016030814304654895,
"loss": 1.9833,
"step": 301
},
{
"epoch": 0.12096935709993992,
"grad_norm": 1.3301992416381836,
"learning_rate": 0.00016005471007700031,
"loss": 1.7918,
"step": 302
},
{
"epoch": 0.12136991788503905,
"grad_norm": 1.4026867151260376,
"learning_rate": 0.00015980067235718792,
"loss": 2.1528,
"step": 303
},
{
"epoch": 0.1217704786701382,
"grad_norm": 1.2631279230117798,
"learning_rate": 0.0001595460324452688,
"loss": 2.1449,
"step": 304
},
{
"epoch": 0.12217103945523733,
"grad_norm": 1.7133634090423584,
"learning_rate": 0.00015929079290546408,
"loss": 2.0834,
"step": 305
},
{
"epoch": 0.12257160024033648,
"grad_norm": 0.9135451912879944,
"learning_rate": 0.000159034956308033,
"loss": 2.4581,
"step": 306
},
{
"epoch": 0.12297216102543561,
"grad_norm": 1.2533658742904663,
"learning_rate": 0.00015877852522924732,
"loss": 1.9334,
"step": 307
},
{
"epoch": 0.12337272181053476,
"grad_norm": 1.1332594156265259,
"learning_rate": 0.00015852150225136518,
"loss": 2.1089,
"step": 308
},
{
"epoch": 0.12377328259563389,
"grad_norm": 1.5345416069030762,
"learning_rate": 0.00015826388996260503,
"loss": 2.0917,
"step": 309
},
{
"epoch": 0.12417384338073302,
"grad_norm": 0.9528497457504272,
"learning_rate": 0.00015800569095711982,
"loss": 2.2036,
"step": 310
},
{
"epoch": 0.12457440416583217,
"grad_norm": 1.1167405843734741,
"learning_rate": 0.00015774690783497067,
"loss": 2.1452,
"step": 311
},
{
"epoch": 0.1249749649509313,
"grad_norm": 1.2491390705108643,
"learning_rate": 0.00015748754320210072,
"loss": 1.85,
"step": 312
},
{
"epoch": 0.12537552573603045,
"grad_norm": 0.943801760673523,
"learning_rate": 0.00015722759967030898,
"loss": 1.8039,
"step": 313
},
{
"epoch": 0.1257760865211296,
"grad_norm": 1.1968666315078735,
"learning_rate": 0.0001569670798572239,
"loss": 1.8249,
"step": 314
},
{
"epoch": 0.1261766473062287,
"grad_norm": 0.8715935349464417,
"learning_rate": 0.00015670598638627706,
"loss": 1.9182,
"step": 315
},
{
"epoch": 0.12657720809132786,
"grad_norm": 1.3957470655441284,
"learning_rate": 0.00015644432188667695,
"loss": 2.0424,
"step": 316
},
{
"epoch": 0.126977768876427,
"grad_norm": 1.0456676483154297,
"learning_rate": 0.00015618208899338202,
"loss": 1.821,
"step": 317
},
{
"epoch": 0.12737832966152612,
"grad_norm": 0.8687244057655334,
"learning_rate": 0.0001559192903470747,
"loss": 2.2916,
"step": 318
},
{
"epoch": 0.12777889044662527,
"grad_norm": 1.1597154140472412,
"learning_rate": 0.0001556559285941344,
"loss": 2.2943,
"step": 319
},
{
"epoch": 0.12817945123172442,
"grad_norm": 1.4827574491500854,
"learning_rate": 0.00015539200638661104,
"loss": 2.2057,
"step": 320
},
{
"epoch": 0.12858001201682356,
"grad_norm": 1.2822664976119995,
"learning_rate": 0.00015512752638219835,
"loss": 1.9975,
"step": 321
},
{
"epoch": 0.12898057280192268,
"grad_norm": 1.2990927696228027,
"learning_rate": 0.000154862491244207,
"loss": 1.7885,
"step": 322
},
{
"epoch": 0.12938113358702183,
"grad_norm": 1.2612892389297485,
"learning_rate": 0.0001545969036415379,
"loss": 2.1861,
"step": 323
},
{
"epoch": 0.12978169437212098,
"grad_norm": 0.9105194211006165,
"learning_rate": 0.00015433076624865531,
"loss": 2.0607,
"step": 324
},
{
"epoch": 0.1301822551572201,
"grad_norm": 1.3368383646011353,
"learning_rate": 0.00015406408174555976,
"loss": 2.0913,
"step": 325
},
{
"epoch": 0.13058281594231924,
"grad_norm": 0.7987350225448608,
"learning_rate": 0.00015379685281776125,
"loss": 2.355,
"step": 326
},
{
"epoch": 0.1309833767274184,
"grad_norm": 1.3039599657058716,
"learning_rate": 0.00015352908215625214,
"loss": 2.1028,
"step": 327
},
{
"epoch": 0.13138393751251753,
"grad_norm": 1.3268717527389526,
"learning_rate": 0.00015326077245747999,
"loss": 1.7859,
"step": 328
},
{
"epoch": 0.13178449829761665,
"grad_norm": 1.2853844165802002,
"learning_rate": 0.0001529919264233205,
"loss": 1.7469,
"step": 329
},
{
"epoch": 0.1321850590827158,
"grad_norm": 1.4058549404144287,
"learning_rate": 0.00015272254676105025,
"loss": 1.9602,
"step": 330
},
{
"epoch": 0.13258561986781495,
"grad_norm": 1.1850024461746216,
"learning_rate": 0.00015245263618331945,
"loss": 2.2856,
"step": 331
},
{
"epoch": 0.1329861806529141,
"grad_norm": 1.2287150621414185,
"learning_rate": 0.0001521821974081246,
"loss": 2.0198,
"step": 332
},
{
"epoch": 0.1333867414380132,
"grad_norm": 0.8819483518600464,
"learning_rate": 0.00015191123315878123,
"loss": 2.1921,
"step": 333
},
{
"epoch": 0.13378730222311236,
"grad_norm": 1.0964730978012085,
"learning_rate": 0.0001516397461638962,
"loss": 1.7096,
"step": 334
},
{
"epoch": 0.1341878630082115,
"grad_norm": 0.9685081839561462,
"learning_rate": 0.00015136773915734066,
"loss": 1.8356,
"step": 335
},
{
"epoch": 0.13458842379331062,
"grad_norm": 1.3063654899597168,
"learning_rate": 0.00015109521487822206,
"loss": 1.7442,
"step": 336
},
{
"epoch": 0.13498898457840977,
"grad_norm": 1.117842674255371,
"learning_rate": 0.00015082217607085692,
"loss": 2.0806,
"step": 337
},
{
"epoch": 0.13538954536350892,
"grad_norm": 1.071869969367981,
"learning_rate": 0.000150548625484743,
"loss": 1.6267,
"step": 338
},
{
"epoch": 0.13579010614860806,
"grad_norm": 1.2108855247497559,
"learning_rate": 0.0001502745658745316,
"loss": 1.9425,
"step": 339
},
{
"epoch": 0.13619066693370718,
"grad_norm": 1.4408931732177734,
"learning_rate": 0.00015000000000000001,
"loss": 2.3276,
"step": 340
},
{
"epoch": 0.13659122771880633,
"grad_norm": 1.092643141746521,
"learning_rate": 0.00014972493062602354,
"loss": 1.6824,
"step": 341
},
{
"epoch": 0.13699178850390548,
"grad_norm": 1.1907880306243896,
"learning_rate": 0.0001494493605225477,
"loss": 2.1771,
"step": 342
},
{
"epoch": 0.1373923492890046,
"grad_norm": 1.0870776176452637,
"learning_rate": 0.0001491732924645604,
"loss": 2.1814,
"step": 343
},
{
"epoch": 0.13779291007410374,
"grad_norm": 1.4029686450958252,
"learning_rate": 0.0001488967292320639,
"loss": 1.882,
"step": 344
},
{
"epoch": 0.1381934708592029,
"grad_norm": 0.9258529543876648,
"learning_rate": 0.00014861967361004687,
"loss": 2.2565,
"step": 345
},
{
"epoch": 0.13859403164430203,
"grad_norm": 1.3441503047943115,
"learning_rate": 0.00014834212838845637,
"loss": 1.9838,
"step": 346
},
{
"epoch": 0.13899459242940115,
"grad_norm": 1.1867705583572388,
"learning_rate": 0.00014806409636216973,
"loss": 1.6701,
"step": 347
},
{
"epoch": 0.1393951532145003,
"grad_norm": 1.2288343906402588,
"learning_rate": 0.00014778558033096633,
"loss": 1.932,
"step": 348
},
{
"epoch": 0.13979571399959945,
"grad_norm": 1.100757360458374,
"learning_rate": 0.0001475065830994995,
"loss": 1.6942,
"step": 349
},
{
"epoch": 0.14019627478469857,
"grad_norm": 0.8742533326148987,
"learning_rate": 0.0001472271074772683,
"loss": 1.8398,
"step": 350
},
{
"epoch": 0.1405968355697977,
"grad_norm": 1.299513578414917,
"learning_rate": 0.00014694715627858908,
"loss": 1.5597,
"step": 351
},
{
"epoch": 0.14099739635489686,
"grad_norm": 1.0255597829818726,
"learning_rate": 0.00014666673232256738,
"loss": 2.1195,
"step": 352
},
{
"epoch": 0.141397957139996,
"grad_norm": 0.9807868003845215,
"learning_rate": 0.00014638583843306927,
"loss": 2.0172,
"step": 353
},
{
"epoch": 0.14179851792509512,
"grad_norm": 1.029623031616211,
"learning_rate": 0.00014610447743869314,
"loss": 2.1941,
"step": 354
},
{
"epoch": 0.14219907871019427,
"grad_norm": 1.2617048025131226,
"learning_rate": 0.00014582265217274104,
"loss": 2.0539,
"step": 355
},
{
"epoch": 0.14259963949529342,
"grad_norm": 1.2233887910842896,
"learning_rate": 0.00014554036547319033,
"loss": 2.1597,
"step": 356
},
{
"epoch": 0.14300020028039254,
"grad_norm": 1.778652548789978,
"learning_rate": 0.00014525762018266483,
"loss": 2.6006,
"step": 357
},
{
"epoch": 0.14340076106549168,
"grad_norm": 0.983271062374115,
"learning_rate": 0.0001449744191484066,
"loss": 1.764,
"step": 358
},
{
"epoch": 0.14380132185059083,
"grad_norm": 1.3344851732254028,
"learning_rate": 0.0001446907652222468,
"loss": 1.8017,
"step": 359
},
{
"epoch": 0.14420188263568998,
"grad_norm": 1.0751646757125854,
"learning_rate": 0.00014440666126057744,
"loss": 2.0915,
"step": 360
},
{
"epoch": 0.1446024434207891,
"grad_norm": 1.1397452354431152,
"learning_rate": 0.00014412211012432212,
"loss": 2.232,
"step": 361
},
{
"epoch": 0.14500300420588824,
"grad_norm": 1.3193897008895874,
"learning_rate": 0.00014383711467890774,
"loss": 2.4251,
"step": 362
},
{
"epoch": 0.1454035649909874,
"grad_norm": 1.5369534492492676,
"learning_rate": 0.00014355167779423524,
"loss": 2.1641,
"step": 363
},
{
"epoch": 0.1458041257760865,
"grad_norm": 1.2363543510437012,
"learning_rate": 0.00014326580234465085,
"loss": 2.2259,
"step": 364
},
{
"epoch": 0.14620468656118565,
"grad_norm": 1.2943087816238403,
"learning_rate": 0.00014297949120891718,
"loss": 2.0529,
"step": 365
},
{
"epoch": 0.1466052473462848,
"grad_norm": 1.405510663986206,
"learning_rate": 0.0001426927472701842,
"loss": 1.7095,
"step": 366
},
{
"epoch": 0.14700580813138395,
"grad_norm": 1.560726284980774,
"learning_rate": 0.00014240557341596018,
"loss": 1.8077,
"step": 367
},
{
"epoch": 0.14740636891648307,
"grad_norm": 1.2849904298782349,
"learning_rate": 0.00014211797253808268,
"loss": 1.9614,
"step": 368
},
{
"epoch": 0.1478069297015822,
"grad_norm": 1.205696940422058,
"learning_rate": 0.00014182994753268927,
"loss": 2.1995,
"step": 369
},
{
"epoch": 0.14820749048668136,
"grad_norm": 1.301679253578186,
"learning_rate": 0.00014154150130018866,
"loss": 2.0094,
"step": 370
},
{
"epoch": 0.1486080512717805,
"grad_norm": 1.511448860168457,
"learning_rate": 0.00014125263674523114,
"loss": 1.9612,
"step": 371
},
{
"epoch": 0.14900861205687962,
"grad_norm": 1.176255226135254,
"learning_rate": 0.00014096335677667954,
"loss": 1.6863,
"step": 372
},
{
"epoch": 0.14940917284197877,
"grad_norm": 1.1221837997436523,
"learning_rate": 0.00014067366430758004,
"loss": 2.4452,
"step": 373
},
{
"epoch": 0.14980973362707792,
"grad_norm": 1.1222666501998901,
"learning_rate": 0.00014038356225513248,
"loss": 1.7168,
"step": 374
},
{
"epoch": 0.15021029441217704,
"grad_norm": 1.5494662523269653,
"learning_rate": 0.00014009305354066137,
"loss": 1.9266,
"step": 375
},
{
"epoch": 0.15061085519727618,
"grad_norm": 1.2402633428573608,
"learning_rate": 0.00013980214108958624,
"loss": 1.8818,
"step": 376
},
{
"epoch": 0.15101141598237533,
"grad_norm": 1.3550876379013062,
"learning_rate": 0.0001395108278313922,
"loss": 1.6726,
"step": 377
},
{
"epoch": 0.15141197676747448,
"grad_norm": 0.9974124431610107,
"learning_rate": 0.00013921911669960055,
"loss": 2.3311,
"step": 378
},
{
"epoch": 0.1518125375525736,
"grad_norm": 1.5511209964752197,
"learning_rate": 0.00013892701063173918,
"loss": 1.7031,
"step": 379
},
{
"epoch": 0.15221309833767274,
"grad_norm": 1.1340442895889282,
"learning_rate": 0.00013863451256931287,
"loss": 1.9426,
"step": 380
},
{
"epoch": 0.1526136591227719,
"grad_norm": 1.4769186973571777,
"learning_rate": 0.00013834162545777395,
"loss": 1.9275,
"step": 381
},
{
"epoch": 0.153014219907871,
"grad_norm": 1.0888420343399048,
"learning_rate": 0.0001380483522464923,
"loss": 2.2289,
"step": 382
},
{
"epoch": 0.15341478069297015,
"grad_norm": 1.482541799545288,
"learning_rate": 0.000137754695888726,
"loss": 1.8827,
"step": 383
},
{
"epoch": 0.1538153414780693,
"grad_norm": 1.3306676149368286,
"learning_rate": 0.00013746065934159123,
"loss": 2.2292,
"step": 384
},
{
"epoch": 0.15421590226316845,
"grad_norm": 1.2816635370254517,
"learning_rate": 0.00013716624556603274,
"loss": 1.9847,
"step": 385
},
{
"epoch": 0.15461646304826757,
"grad_norm": 1.5046172142028809,
"learning_rate": 0.0001368714575267941,
"loss": 2.0836,
"step": 386
},
{
"epoch": 0.1550170238333667,
"grad_norm": 0.8114765882492065,
"learning_rate": 0.00013657629819238746,
"loss": 1.9699,
"step": 387
},
{
"epoch": 0.15541758461846586,
"grad_norm": 1.2298461198806763,
"learning_rate": 0.0001362807705350641,
"loss": 2.0324,
"step": 388
},
{
"epoch": 0.15581814540356498,
"grad_norm": 0.8663463592529297,
"learning_rate": 0.00013598487753078425,
"loss": 1.9759,
"step": 389
},
{
"epoch": 0.15621870618866412,
"grad_norm": 1.268539309501648,
"learning_rate": 0.00013568862215918717,
"loss": 1.7154,
"step": 390
},
{
"epoch": 0.15661926697376327,
"grad_norm": 1.3727500438690186,
"learning_rate": 0.00013539200740356118,
"loss": 1.9043,
"step": 391
},
{
"epoch": 0.15701982775886242,
"grad_norm": 1.1378331184387207,
"learning_rate": 0.00013509503625081358,
"loss": 1.7391,
"step": 392
},
{
"epoch": 0.15742038854396154,
"grad_norm": 1.4207416772842407,
"learning_rate": 0.0001347977116914405,
"loss": 2.0419,
"step": 393
},
{
"epoch": 0.15782094932906068,
"grad_norm": 1.185951828956604,
"learning_rate": 0.00013450003671949706,
"loss": 2.2408,
"step": 394
},
{
"epoch": 0.15822151011415983,
"grad_norm": 1.1980212926864624,
"learning_rate": 0.00013420201433256689,
"loss": 2.0313,
"step": 395
},
{
"epoch": 0.15862207089925895,
"grad_norm": 1.2082370519638062,
"learning_rate": 0.00013390364753173206,
"loss": 2.6664,
"step": 396
},
{
"epoch": 0.1590226316843581,
"grad_norm": 1.188747525215149,
"learning_rate": 0.00013360493932154302,
"loss": 1.9586,
"step": 397
},
{
"epoch": 0.15942319246945724,
"grad_norm": 1.4110333919525146,
"learning_rate": 0.00013330589270998808,
"loss": 2.0768,
"step": 398
},
{
"epoch": 0.1598237532545564,
"grad_norm": 1.0672770738601685,
"learning_rate": 0.00013300651070846333,
"loss": 1.8173,
"step": 399
},
{
"epoch": 0.1602243140396555,
"grad_norm": 1.4332971572875977,
"learning_rate": 0.00013270679633174218,
"loss": 2.2275,
"step": 400
},
{
"epoch": 0.16062487482475465,
"grad_norm": 1.1819405555725098,
"learning_rate": 0.00013240675259794507,
"loss": 1.949,
"step": 401
},
{
"epoch": 0.1610254356098538,
"grad_norm": 1.5723814964294434,
"learning_rate": 0.00013210638252850908,
"loss": 1.8285,
"step": 402
},
{
"epoch": 0.16142599639495295,
"grad_norm": 1.25686776638031,
"learning_rate": 0.00013180568914815752,
"loss": 1.5269,
"step": 403
},
{
"epoch": 0.16182655718005207,
"grad_norm": 0.9914525151252747,
"learning_rate": 0.0001315046754848693,
"loss": 1.7253,
"step": 404
},
{
"epoch": 0.1622271179651512,
"grad_norm": 1.1965891122817993,
"learning_rate": 0.0001312033445698487,
"loss": 1.9786,
"step": 405
},
{
"epoch": 0.16262767875025036,
"grad_norm": 1.2011706829071045,
"learning_rate": 0.00013090169943749476,
"loss": 1.7513,
"step": 406
},
{
"epoch": 0.16302823953534948,
"grad_norm": 1.2285467386245728,
"learning_rate": 0.00013059974312537053,
"loss": 2.1979,
"step": 407
},
{
"epoch": 0.16342880032044863,
"grad_norm": 1.1085333824157715,
"learning_rate": 0.00013029747867417276,
"loss": 2.4344,
"step": 408
},
{
"epoch": 0.16382936110554777,
"grad_norm": 1.0605233907699585,
"learning_rate": 0.00012999490912770107,
"loss": 1.9061,
"step": 409
},
{
"epoch": 0.16422992189064692,
"grad_norm": 1.0893604755401611,
"learning_rate": 0.0001296920375328275,
"loss": 2.0987,
"step": 410
},
{
"epoch": 0.16463048267574604,
"grad_norm": 1.1180847883224487,
"learning_rate": 0.0001293888669394656,
"loss": 1.7212,
"step": 411
},
{
"epoch": 0.16503104346084518,
"grad_norm": 1.000549077987671,
"learning_rate": 0.0001290854004005399,
"loss": 1.8383,
"step": 412
},
{
"epoch": 0.16543160424594433,
"grad_norm": 1.3807919025421143,
"learning_rate": 0.0001287816409719551,
"loss": 2.2079,
"step": 413
},
{
"epoch": 0.16583216503104345,
"grad_norm": 0.9979914426803589,
"learning_rate": 0.00012847759171256523,
"loss": 2.0307,
"step": 414
},
{
"epoch": 0.1662327258161426,
"grad_norm": 1.1787306070327759,
"learning_rate": 0.00012817325568414297,
"loss": 1.9271,
"step": 415
},
{
"epoch": 0.16663328660124174,
"grad_norm": 1.1426745653152466,
"learning_rate": 0.0001278686359513488,
"loss": 2.2186,
"step": 416
},
{
"epoch": 0.1670338473863409,
"grad_norm": 1.1593252420425415,
"learning_rate": 0.0001275637355816999,
"loss": 1.8586,
"step": 417
},
{
"epoch": 0.16743440817144,
"grad_norm": 1.3415766954421997,
"learning_rate": 0.0001272585576455398,
"loss": 2.0948,
"step": 418
},
{
"epoch": 0.16783496895653915,
"grad_norm": 1.2215737104415894,
"learning_rate": 0.0001269531052160068,
"loss": 2.1967,
"step": 419
},
{
"epoch": 0.1682355297416383,
"grad_norm": 0.9574311375617981,
"learning_rate": 0.00012664738136900348,
"loss": 1.7765,
"step": 420
},
{
"epoch": 0.16863609052673742,
"grad_norm": 1.449532151222229,
"learning_rate": 0.00012634138918316568,
"loss": 2.0815,
"step": 421
},
{
"epoch": 0.16903665131183657,
"grad_norm": 1.1745095252990723,
"learning_rate": 0.0001260351317398312,
"loss": 1.9517,
"step": 422
},
{
"epoch": 0.1694372120969357,
"grad_norm": 1.4526337385177612,
"learning_rate": 0.00012572861212300918,
"loss": 2.3156,
"step": 423
},
{
"epoch": 0.16983777288203486,
"grad_norm": 1.08524751663208,
"learning_rate": 0.00012542183341934872,
"loss": 1.7148,
"step": 424
},
{
"epoch": 0.17023833366713398,
"grad_norm": 1.2243317365646362,
"learning_rate": 0.0001251147987181079,
"loss": 1.8422,
"step": 425
},
{
"epoch": 0.17063889445223313,
"grad_norm": 1.26092529296875,
"learning_rate": 0.0001248075111111229,
"loss": 1.7454,
"step": 426
},
{
"epoch": 0.17103945523733227,
"grad_norm": 1.1354914903640747,
"learning_rate": 0.0001244999736927764,
"loss": 1.9414,
"step": 427
},
{
"epoch": 0.1714400160224314,
"grad_norm": 1.1014437675476074,
"learning_rate": 0.00012419218955996676,
"loss": 2.2449,
"step": 428
},
{
"epoch": 0.17184057680753054,
"grad_norm": 0.9806610941886902,
"learning_rate": 0.0001238841618120769,
"loss": 1.9453,
"step": 429
},
{
"epoch": 0.17224113759262968,
"grad_norm": 1.0223315954208374,
"learning_rate": 0.00012357589355094275,
"loss": 1.9353,
"step": 430
},
{
"epoch": 0.17264169837772883,
"grad_norm": 1.0702928304672241,
"learning_rate": 0.00012326738788082223,
"loss": 2.0656,
"step": 431
},
{
"epoch": 0.17304225916282795,
"grad_norm": 1.4158331155776978,
"learning_rate": 0.0001229586479083641,
"loss": 2.2464,
"step": 432
},
{
"epoch": 0.1734428199479271,
"grad_norm": 1.3966501951217651,
"learning_rate": 0.00012264967674257646,
"loss": 1.6745,
"step": 433
},
{
"epoch": 0.17384338073302624,
"grad_norm": 1.1722941398620605,
"learning_rate": 0.00012234047749479544,
"loss": 1.8942,
"step": 434
},
{
"epoch": 0.1742439415181254,
"grad_norm": 1.2961608171463013,
"learning_rate": 0.00012203105327865407,
"loss": 1.5352,
"step": 435
},
{
"epoch": 0.1746445023032245,
"grad_norm": 1.529359221458435,
"learning_rate": 0.00012172140721005079,
"loss": 1.8913,
"step": 436
},
{
"epoch": 0.17504506308832365,
"grad_norm": 1.3748160600662231,
"learning_rate": 0.00012141154240711805,
"loss": 1.8422,
"step": 437
},
{
"epoch": 0.1754456238734228,
"grad_norm": 1.4000550508499146,
"learning_rate": 0.000121101461990191,
"loss": 2.3207,
"step": 438
},
{
"epoch": 0.17584618465852192,
"grad_norm": 1.587672472000122,
"learning_rate": 0.00012079116908177593,
"loss": 2.0086,
"step": 439
},
{
"epoch": 0.17624674544362107,
"grad_norm": 0.9797202348709106,
"learning_rate": 0.00012048066680651908,
"loss": 1.6867,
"step": 440
},
{
"epoch": 0.1766473062287202,
"grad_norm": 1.153014063835144,
"learning_rate": 0.00012016995829117488,
"loss": 1.8881,
"step": 441
},
{
"epoch": 0.17704786701381936,
"grad_norm": 1.408564805984497,
"learning_rate": 0.00011985904666457455,
"loss": 2.1962,
"step": 442
},
{
"epoch": 0.17744842779891848,
"grad_norm": 1.1497104167938232,
"learning_rate": 0.00011954793505759483,
"loss": 1.8189,
"step": 443
},
{
"epoch": 0.17784898858401763,
"grad_norm": 0.9838416576385498,
"learning_rate": 0.00011923662660312611,
"loss": 2.0812,
"step": 444
},
{
"epoch": 0.17824954936911677,
"grad_norm": 1.144004464149475,
"learning_rate": 0.00011892512443604102,
"loss": 1.9226,
"step": 445
},
{
"epoch": 0.1786501101542159,
"grad_norm": 1.2132439613342285,
"learning_rate": 0.00011861343169316301,
"loss": 2.155,
"step": 446
},
{
"epoch": 0.17905067093931504,
"grad_norm": 1.1353342533111572,
"learning_rate": 0.00011830155151323446,
"loss": 2.1583,
"step": 447
},
{
"epoch": 0.17945123172441418,
"grad_norm": 1.1002765893936157,
"learning_rate": 0.00011798948703688539,
"loss": 1.6904,
"step": 448
},
{
"epoch": 0.17985179250951333,
"grad_norm": 1.1224271059036255,
"learning_rate": 0.00011767724140660157,
"loss": 1.7376,
"step": 449
},
{
"epoch": 0.18025235329461245,
"grad_norm": 1.2441977262496948,
"learning_rate": 0.00011736481776669306,
"loss": 2.3133,
"step": 450
},
{
"epoch": 0.1806529140797116,
"grad_norm": 0.9376555681228638,
"learning_rate": 0.0001170522192632624,
"loss": 1.7441,
"step": 451
},
{
"epoch": 0.18105347486481074,
"grad_norm": 1.1824123859405518,
"learning_rate": 0.00011673944904417308,
"loss": 1.8746,
"step": 452
},
{
"epoch": 0.18145403564990986,
"grad_norm": 1.1003307104110718,
"learning_rate": 0.00011642651025901772,
"loss": 1.7693,
"step": 453
},
{
"epoch": 0.181854596435009,
"grad_norm": 1.5860931873321533,
"learning_rate": 0.00011611340605908642,
"loss": 2.4127,
"step": 454
},
{
"epoch": 0.18225515722010815,
"grad_norm": 1.332067608833313,
"learning_rate": 0.000115800139597335,
"loss": 1.9801,
"step": 455
},
{
"epoch": 0.1826557180052073,
"grad_norm": 1.4606534242630005,
"learning_rate": 0.00011548671402835325,
"loss": 1.9786,
"step": 456
},
{
"epoch": 0.18305627879030642,
"grad_norm": 1.221954345703125,
"learning_rate": 0.00011517313250833317,
"loss": 1.7502,
"step": 457
},
{
"epoch": 0.18345683957540557,
"grad_norm": 1.2147043943405151,
"learning_rate": 0.00011485939819503717,
"loss": 1.8452,
"step": 458
},
{
"epoch": 0.1838574003605047,
"grad_norm": 0.9856990575790405,
"learning_rate": 0.00011454551424776637,
"loss": 2.2753,
"step": 459
},
{
"epoch": 0.18425796114560383,
"grad_norm": 1.21888267993927,
"learning_rate": 0.00011423148382732853,
"loss": 1.6949,
"step": 460
},
{
"epoch": 0.18465852193070298,
"grad_norm": 1.1283800601959229,
"learning_rate": 0.00011391731009600654,
"loss": 2.1301,
"step": 461
},
{
"epoch": 0.18505908271580213,
"grad_norm": 1.1772786378860474,
"learning_rate": 0.00011360299621752644,
"loss": 1.8366,
"step": 462
},
{
"epoch": 0.18545964350090127,
"grad_norm": 1.0780948400497437,
"learning_rate": 0.00011328854535702543,
"loss": 1.4893,
"step": 463
},
{
"epoch": 0.1858602042860004,
"grad_norm": 1.0804064273834229,
"learning_rate": 0.00011297396068102017,
"loss": 1.9614,
"step": 464
},
{
"epoch": 0.18626076507109954,
"grad_norm": 0.8003168106079102,
"learning_rate": 0.00011265924535737493,
"loss": 1.6996,
"step": 465
},
{
"epoch": 0.18666132585619868,
"grad_norm": 1.1143856048583984,
"learning_rate": 0.00011234440255526948,
"loss": 1.8898,
"step": 466
},
{
"epoch": 0.1870618866412978,
"grad_norm": 1.1231998205184937,
"learning_rate": 0.00011202943544516736,
"loss": 1.9706,
"step": 467
},
{
"epoch": 0.18746244742639695,
"grad_norm": 1.4141852855682373,
"learning_rate": 0.00011171434719878384,
"loss": 1.7378,
"step": 468
},
{
"epoch": 0.1878630082114961,
"grad_norm": 1.2133177518844604,
"learning_rate": 0.00011139914098905406,
"loss": 1.9353,
"step": 469
},
{
"epoch": 0.18826356899659524,
"grad_norm": 1.365179419517517,
"learning_rate": 0.00011108381999010111,
"loss": 2.4052,
"step": 470
},
{
"epoch": 0.18866412978169436,
"grad_norm": 1.2176018953323364,
"learning_rate": 0.00011076838737720392,
"loss": 1.7026,
"step": 471
},
{
"epoch": 0.1890646905667935,
"grad_norm": 1.1604026556015015,
"learning_rate": 0.00011045284632676536,
"loss": 1.9168,
"step": 472
},
{
"epoch": 0.18946525135189266,
"grad_norm": 1.3729729652404785,
"learning_rate": 0.00011013720001628035,
"loss": 2.1561,
"step": 473
},
{
"epoch": 0.1898658121369918,
"grad_norm": 1.2137882709503174,
"learning_rate": 0.00010982145162430373,
"loss": 2.3844,
"step": 474
},
{
"epoch": 0.19026637292209092,
"grad_norm": 0.9215346574783325,
"learning_rate": 0.00010950560433041826,
"loss": 1.8599,
"step": 475
},
{
"epoch": 0.19066693370719007,
"grad_norm": 1.2497369050979614,
"learning_rate": 0.00010918966131520277,
"loss": 1.829,
"step": 476
},
{
"epoch": 0.1910674944922892,
"grad_norm": 1.2438724040985107,
"learning_rate": 0.00010887362576019981,
"loss": 1.907,
"step": 477
},
{
"epoch": 0.19146805527738833,
"grad_norm": 0.7997929453849792,
"learning_rate": 0.00010855750084788398,
"loss": 1.8881,
"step": 478
},
{
"epoch": 0.19186861606248748,
"grad_norm": 1.239675760269165,
"learning_rate": 0.00010824128976162964,
"loss": 2.1791,
"step": 479
},
{
"epoch": 0.19226917684758663,
"grad_norm": 1.1958426237106323,
"learning_rate": 0.00010792499568567884,
"loss": 2.1612,
"step": 480
},
{
"epoch": 0.19266973763268577,
"grad_norm": 1.2139252424240112,
"learning_rate": 0.00010760862180510951,
"loss": 1.7926,
"step": 481
},
{
"epoch": 0.1930702984177849,
"grad_norm": 1.43398118019104,
"learning_rate": 0.0001072921713058031,
"loss": 1.7103,
"step": 482
},
{
"epoch": 0.19347085920288404,
"grad_norm": 0.9710477590560913,
"learning_rate": 0.00010697564737441252,
"loss": 1.8695,
"step": 483
},
{
"epoch": 0.19387141998798318,
"grad_norm": 1.897813320159912,
"learning_rate": 0.00010665905319833041,
"loss": 2.1622,
"step": 484
},
{
"epoch": 0.1942719807730823,
"grad_norm": 1.0615317821502686,
"learning_rate": 0.00010634239196565646,
"loss": 2.4186,
"step": 485
},
{
"epoch": 0.19467254155818145,
"grad_norm": 1.06027090549469,
"learning_rate": 0.00010602566686516586,
"loss": 2.3492,
"step": 486
},
{
"epoch": 0.1950731023432806,
"grad_norm": 1.3258793354034424,
"learning_rate": 0.00010570888108627681,
"loss": 2.0268,
"step": 487
},
{
"epoch": 0.19547366312837974,
"grad_norm": 1.344404935836792,
"learning_rate": 0.00010539203781901861,
"loss": 2.1251,
"step": 488
},
{
"epoch": 0.19587422391347886,
"grad_norm": 1.5554450750350952,
"learning_rate": 0.00010507514025399943,
"loss": 1.6521,
"step": 489
},
{
"epoch": 0.196274784698578,
"grad_norm": 1.1709057092666626,
"learning_rate": 0.00010475819158237425,
"loss": 1.9683,
"step": 490
},
{
"epoch": 0.19667534548367716,
"grad_norm": 1.4332846403121948,
"learning_rate": 0.00010444119499581261,
"loss": 1.8998,
"step": 491
},
{
"epoch": 0.19707590626877627,
"grad_norm": 1.3177927732467651,
"learning_rate": 0.00010412415368646673,
"loss": 1.6543,
"step": 492
},
{
"epoch": 0.19747646705387542,
"grad_norm": 1.3712493181228638,
"learning_rate": 0.00010380707084693901,
"loss": 1.995,
"step": 493
},
{
"epoch": 0.19787702783897457,
"grad_norm": 1.2591536045074463,
"learning_rate": 0.00010348994967025012,
"loss": 1.8751,
"step": 494
},
{
"epoch": 0.19827758862407371,
"grad_norm": 1.3137924671173096,
"learning_rate": 0.00010317279334980678,
"loss": 1.8033,
"step": 495
},
{
"epoch": 0.19867814940917283,
"grad_norm": 1.0903714895248413,
"learning_rate": 0.00010285560507936961,
"loss": 1.7871,
"step": 496
},
{
"epoch": 0.19907871019427198,
"grad_norm": 1.3193492889404297,
"learning_rate": 0.00010253838805302104,
"loss": 2.0395,
"step": 497
},
{
"epoch": 0.19947927097937113,
"grad_norm": 1.2793503999710083,
"learning_rate": 0.00010222114546513295,
"loss": 2.0782,
"step": 498
},
{
"epoch": 0.19987983176447024,
"grad_norm": 1.292874813079834,
"learning_rate": 0.00010190388051033466,
"loss": 1.8531,
"step": 499
},
{
"epoch": 0.2002803925495694,
"grad_norm": 1.136939525604248,
"learning_rate": 0.00010158659638348081,
"loss": 2.0028,
"step": 500
},
{
"epoch": 0.2002803925495694,
"eval_loss": 1.9258522987365723,
"eval_runtime": 32.6934,
"eval_samples_per_second": 32.178,
"eval_steps_per_second": 16.089,
"step": 500
},
{
"epoch": 0.20068095333466854,
"grad_norm": 1.1813311576843262,
"learning_rate": 0.00010126929627961896,
"loss": 1.8187,
"step": 501
},
{
"epoch": 0.20108151411976768,
"grad_norm": 1.4287338256835938,
"learning_rate": 0.00010095198339395769,
"loss": 2.1428,
"step": 502
},
{
"epoch": 0.2014820749048668,
"grad_norm": 1.4133274555206299,
"learning_rate": 0.0001006346609218342,
"loss": 2.0651,
"step": 503
},
{
"epoch": 0.20188263568996595,
"grad_norm": 1.1729075908660889,
"learning_rate": 0.00010031733205868224,
"loss": 1.9114,
"step": 504
},
{
"epoch": 0.2022831964750651,
"grad_norm": 1.2264689207077026,
"learning_rate": 0.0001,
"loss": 1.7613,
"step": 505
},
{
"epoch": 0.20268375726016424,
"grad_norm": 1.221653938293457,
"learning_rate": 9.968266794131777e-05,
"loss": 1.9397,
"step": 506
},
{
"epoch": 0.20308431804526336,
"grad_norm": 1.4370172023773193,
"learning_rate": 9.936533907816584e-05,
"loss": 1.7342,
"step": 507
},
{
"epoch": 0.2034848788303625,
"grad_norm": 1.2867745161056519,
"learning_rate": 9.904801660604234e-05,
"loss": 2.0417,
"step": 508
},
{
"epoch": 0.20388543961546166,
"grad_norm": 1.123342514038086,
"learning_rate": 9.873070372038105e-05,
"loss": 2.0177,
"step": 509
},
{
"epoch": 0.20428600040056077,
"grad_norm": 1.1308317184448242,
"learning_rate": 9.84134036165192e-05,
"loss": 1.6625,
"step": 510
},
{
"epoch": 0.20468656118565992,
"grad_norm": 1.1609257459640503,
"learning_rate": 9.809611948966533e-05,
"loss": 1.7069,
"step": 511
},
{
"epoch": 0.20508712197075907,
"grad_norm": 1.5887484550476074,
"learning_rate": 9.777885453486706e-05,
"loss": 1.9354,
"step": 512
},
{
"epoch": 0.20548768275585821,
"grad_norm": 1.1795055866241455,
"learning_rate": 9.746161194697895e-05,
"loss": 1.9799,
"step": 513
},
{
"epoch": 0.20588824354095733,
"grad_norm": 1.4398356676101685,
"learning_rate": 9.71443949206304e-05,
"loss": 2.0568,
"step": 514
},
{
"epoch": 0.20628880432605648,
"grad_norm": 1.0796585083007812,
"learning_rate": 9.682720665019325e-05,
"loss": 1.9268,
"step": 515
},
{
"epoch": 0.20668936511115563,
"grad_norm": 1.221709132194519,
"learning_rate": 9.651005032974994e-05,
"loss": 1.894,
"step": 516
},
{
"epoch": 0.20708992589625475,
"grad_norm": 1.182438850402832,
"learning_rate": 9.619292915306101e-05,
"loss": 2.5103,
"step": 517
},
{
"epoch": 0.2074904866813539,
"grad_norm": 1.056098222732544,
"learning_rate": 9.587584631353329e-05,
"loss": 1.6194,
"step": 518
},
{
"epoch": 0.20789104746645304,
"grad_norm": 1.1654489040374756,
"learning_rate": 9.55588050041874e-05,
"loss": 2.2766,
"step": 519
},
{
"epoch": 0.20829160825155218,
"grad_norm": 1.1800388097763062,
"learning_rate": 9.524180841762577e-05,
"loss": 1.8447,
"step": 520
},
{
"epoch": 0.2086921690366513,
"grad_norm": 1.13406503200531,
"learning_rate": 9.492485974600059e-05,
"loss": 2.1673,
"step": 521
},
{
"epoch": 0.20909272982175045,
"grad_norm": 1.5622755289077759,
"learning_rate": 9.460796218098143e-05,
"loss": 1.8394,
"step": 522
},
{
"epoch": 0.2094932906068496,
"grad_norm": 0.9142590165138245,
"learning_rate": 9.42911189137232e-05,
"loss": 2.2018,
"step": 523
},
{
"epoch": 0.20989385139194872,
"grad_norm": 0.903015673160553,
"learning_rate": 9.397433313483416e-05,
"loss": 2.114,
"step": 524
},
{
"epoch": 0.21029441217704786,
"grad_norm": 0.8409507870674133,
"learning_rate": 9.365760803434355e-05,
"loss": 2.3348,
"step": 525
},
{
"epoch": 0.210694972962147,
"grad_norm": 0.9404178261756897,
"learning_rate": 9.334094680166962e-05,
"loss": 2.0019,
"step": 526
},
{
"epoch": 0.21109553374724616,
"grad_norm": 1.1843013763427734,
"learning_rate": 9.302435262558747e-05,
"loss": 2.2249,
"step": 527
},
{
"epoch": 0.21149609453234527,
"grad_norm": 1.0555171966552734,
"learning_rate": 9.270782869419694e-05,
"loss": 2.0795,
"step": 528
},
{
"epoch": 0.21189665531744442,
"grad_norm": 1.2614086866378784,
"learning_rate": 9.239137819489047e-05,
"loss": 1.9405,
"step": 529
},
{
"epoch": 0.21229721610254357,
"grad_norm": 1.073330044746399,
"learning_rate": 9.207500431432115e-05,
"loss": 1.8541,
"step": 530
},
{
"epoch": 0.2126977768876427,
"grad_norm": 1.0380101203918457,
"learning_rate": 9.175871023837042e-05,
"loss": 2.0226,
"step": 531
},
{
"epoch": 0.21309833767274183,
"grad_norm": 1.1019260883331299,
"learning_rate": 9.144249915211605e-05,
"loss": 1.926,
"step": 532
},
{
"epoch": 0.21349889845784098,
"grad_norm": 0.8795924782752991,
"learning_rate": 9.112637423980021e-05,
"loss": 2.1929,
"step": 533
},
{
"epoch": 0.21389945924294013,
"grad_norm": 1.0507018566131592,
"learning_rate": 9.081033868479727e-05,
"loss": 2.4226,
"step": 534
},
{
"epoch": 0.21430002002803925,
"grad_norm": 1.4842091798782349,
"learning_rate": 9.049439566958175e-05,
"loss": 2.2066,
"step": 535
},
{
"epoch": 0.2147005808131384,
"grad_norm": 1.1068452596664429,
"learning_rate": 9.01785483756963e-05,
"loss": 1.7091,
"step": 536
},
{
"epoch": 0.21510114159823754,
"grad_norm": 1.0645016431808472,
"learning_rate": 8.986279998371966e-05,
"loss": 1.7829,
"step": 537
},
{
"epoch": 0.21550170238333669,
"grad_norm": 1.3148471117019653,
"learning_rate": 8.954715367323468e-05,
"loss": 1.7944,
"step": 538
},
{
"epoch": 0.2159022631684358,
"grad_norm": 1.3371472358703613,
"learning_rate": 8.92316126227961e-05,
"loss": 1.8282,
"step": 539
},
{
"epoch": 0.21630282395353495,
"grad_norm": 1.3502963781356812,
"learning_rate": 8.891618000989891e-05,
"loss": 1.7832,
"step": 540
},
{
"epoch": 0.2167033847386341,
"grad_norm": 1.118548035621643,
"learning_rate": 8.860085901094595e-05,
"loss": 1.8526,
"step": 541
},
{
"epoch": 0.21710394552373322,
"grad_norm": 1.3419970273971558,
"learning_rate": 8.828565280121617e-05,
"loss": 2.0795,
"step": 542
},
{
"epoch": 0.21750450630883236,
"grad_norm": 1.457339882850647,
"learning_rate": 8.797056455483266e-05,
"loss": 1.9845,
"step": 543
},
{
"epoch": 0.2179050670939315,
"grad_norm": 1.106742024421692,
"learning_rate": 8.765559744473053e-05,
"loss": 1.6308,
"step": 544
},
{
"epoch": 0.21830562787903066,
"grad_norm": 1.377302885055542,
"learning_rate": 8.734075464262507e-05,
"loss": 2.0252,
"step": 545
},
{
"epoch": 0.21870618866412977,
"grad_norm": 1.283003807067871,
"learning_rate": 8.702603931897982e-05,
"loss": 1.7937,
"step": 546
},
{
"epoch": 0.21910674944922892,
"grad_norm": 1.376645803451538,
"learning_rate": 8.67114546429746e-05,
"loss": 2.0404,
"step": 547
},
{
"epoch": 0.21950731023432807,
"grad_norm": 1.074034571647644,
"learning_rate": 8.639700378247361e-05,
"loss": 1.896,
"step": 548
},
{
"epoch": 0.2199078710194272,
"grad_norm": 1.4838452339172363,
"learning_rate": 8.608268990399349e-05,
"loss": 1.7861,
"step": 549
},
{
"epoch": 0.22030843180452633,
"grad_norm": 0.964583158493042,
"learning_rate": 8.57685161726715e-05,
"loss": 2.0498,
"step": 550
},
{
"epoch": 0.22070899258962548,
"grad_norm": 1.6249324083328247,
"learning_rate": 8.545448575223368e-05,
"loss": 2.1178,
"step": 551
},
{
"epoch": 0.22110955337472463,
"grad_norm": 1.135626196861267,
"learning_rate": 8.514060180496285e-05,
"loss": 2.0566,
"step": 552
},
{
"epoch": 0.22151011415982375,
"grad_norm": 1.0067975521087646,
"learning_rate": 8.482686749166686e-05,
"loss": 2.0416,
"step": 553
},
{
"epoch": 0.2219106749449229,
"grad_norm": 1.3207542896270752,
"learning_rate": 8.451328597164679e-05,
"loss": 2.0966,
"step": 554
},
{
"epoch": 0.22231123573002204,
"grad_norm": 1.0738505125045776,
"learning_rate": 8.4199860402665e-05,
"loss": 1.8144,
"step": 555
},
{
"epoch": 0.22271179651512116,
"grad_norm": 1.1242963075637817,
"learning_rate": 8.38865939409136e-05,
"loss": 2.0862,
"step": 556
},
{
"epoch": 0.2231123573002203,
"grad_norm": 1.2468533515930176,
"learning_rate": 8.357348974098231e-05,
"loss": 2.3297,
"step": 557
},
{
"epoch": 0.22351291808531945,
"grad_norm": 1.1325228214263916,
"learning_rate": 8.326055095582694e-05,
"loss": 1.9616,
"step": 558
},
{
"epoch": 0.2239134788704186,
"grad_norm": 1.0064939260482788,
"learning_rate": 8.294778073673762e-05,
"loss": 1.8283,
"step": 559
},
{
"epoch": 0.22431403965551772,
"grad_norm": 1.3716778755187988,
"learning_rate": 8.263518223330697e-05,
"loss": 2.1096,
"step": 560
},
{
"epoch": 0.22471460044061686,
"grad_norm": 1.0072152614593506,
"learning_rate": 8.232275859339841e-05,
"loss": 2.2765,
"step": 561
},
{
"epoch": 0.225115161225716,
"grad_norm": 1.425732135772705,
"learning_rate": 8.201051296311462e-05,
"loss": 2.0824,
"step": 562
},
{
"epoch": 0.22551572201081513,
"grad_norm": 1.3738203048706055,
"learning_rate": 8.169844848676554e-05,
"loss": 1.8802,
"step": 563
},
{
"epoch": 0.22591628279591427,
"grad_norm": 1.0857584476470947,
"learning_rate": 8.1386568306837e-05,
"loss": 1.5459,
"step": 564
},
{
"epoch": 0.22631684358101342,
"grad_norm": 1.4477343559265137,
"learning_rate": 8.107487556395901e-05,
"loss": 1.5841,
"step": 565
},
{
"epoch": 0.22671740436611257,
"grad_norm": 1.384574294090271,
"learning_rate": 8.076337339687394e-05,
"loss": 1.6207,
"step": 566
},
{
"epoch": 0.2271179651512117,
"grad_norm": 1.2884398698806763,
"learning_rate": 8.045206494240521e-05,
"loss": 2.282,
"step": 567
},
{
"epoch": 0.22751852593631083,
"grad_norm": 1.4047175645828247,
"learning_rate": 8.014095333542548e-05,
"loss": 1.8804,
"step": 568
},
{
"epoch": 0.22791908672140998,
"grad_norm": 1.2710424661636353,
"learning_rate": 7.983004170882518e-05,
"loss": 1.8136,
"step": 569
},
{
"epoch": 0.2283196475065091,
"grad_norm": 0.9432646036148071,
"learning_rate": 7.951933319348095e-05,
"loss": 1.728,
"step": 570
},
{
"epoch": 0.22872020829160825,
"grad_norm": 1.2438580989837646,
"learning_rate": 7.920883091822408e-05,
"loss": 1.6693,
"step": 571
},
{
"epoch": 0.2291207690767074,
"grad_norm": 1.2749730348587036,
"learning_rate": 7.889853800980904e-05,
"loss": 1.9896,
"step": 572
},
{
"epoch": 0.22952132986180654,
"grad_norm": 1.1855494976043701,
"learning_rate": 7.858845759288198e-05,
"loss": 1.9782,
"step": 573
},
{
"epoch": 0.22992189064690566,
"grad_norm": 1.0309183597564697,
"learning_rate": 7.827859278994925e-05,
"loss": 2.2048,
"step": 574
},
{
"epoch": 0.2303224514320048,
"grad_norm": 1.5033015012741089,
"learning_rate": 7.796894672134594e-05,
"loss": 1.8239,
"step": 575
},
{
"epoch": 0.23072301221710395,
"grad_norm": 1.5223544836044312,
"learning_rate": 7.765952250520459e-05,
"loss": 2.2928,
"step": 576
},
{
"epoch": 0.2311235730022031,
"grad_norm": 1.1659106016159058,
"learning_rate": 7.735032325742355e-05,
"loss": 2.1531,
"step": 577
},
{
"epoch": 0.23152413378730222,
"grad_norm": 1.3849096298217773,
"learning_rate": 7.704135209163589e-05,
"loss": 1.8511,
"step": 578
},
{
"epoch": 0.23192469457240136,
"grad_norm": 1.0666840076446533,
"learning_rate": 7.673261211917776e-05,
"loss": 1.6983,
"step": 579
},
{
"epoch": 0.2323252553575005,
"grad_norm": 1.1079344749450684,
"learning_rate": 7.642410644905726e-05,
"loss": 1.7489,
"step": 580
},
{
"epoch": 0.23272581614259963,
"grad_norm": 1.0207995176315308,
"learning_rate": 7.611583818792311e-05,
"loss": 1.5285,
"step": 581
},
{
"epoch": 0.23312637692769878,
"grad_norm": 1.23712158203125,
"learning_rate": 7.580781044003324e-05,
"loss": 1.8596,
"step": 582
},
{
"epoch": 0.23352693771279792,
"grad_norm": 1.4562586545944214,
"learning_rate": 7.550002630722366e-05,
"loss": 2.0293,
"step": 583
},
{
"epoch": 0.23392749849789707,
"grad_norm": 1.3066940307617188,
"learning_rate": 7.519248888887716e-05,
"loss": 2.2537,
"step": 584
},
{
"epoch": 0.2343280592829962,
"grad_norm": 1.1663978099822998,
"learning_rate": 7.488520128189209e-05,
"loss": 2.3066,
"step": 585
},
{
"epoch": 0.23472862006809533,
"grad_norm": 1.2235394716262817,
"learning_rate": 7.457816658065134e-05,
"loss": 2.4259,
"step": 586
},
{
"epoch": 0.23512918085319448,
"grad_norm": 1.4642695188522339,
"learning_rate": 7.427138787699086e-05,
"loss": 2.106,
"step": 587
},
{
"epoch": 0.2355297416382936,
"grad_norm": 1.1755717992782593,
"learning_rate": 7.39648682601688e-05,
"loss": 1.8165,
"step": 588
},
{
"epoch": 0.23593030242339275,
"grad_norm": 1.2367725372314453,
"learning_rate": 7.365861081683433e-05,
"loss": 2.2718,
"step": 589
},
{
"epoch": 0.2363308632084919,
"grad_norm": 1.0610246658325195,
"learning_rate": 7.335261863099651e-05,
"loss": 2.0477,
"step": 590
},
{
"epoch": 0.23673142399359104,
"grad_norm": 1.3351134061813354,
"learning_rate": 7.304689478399323e-05,
"loss": 2.0486,
"step": 591
},
{
"epoch": 0.23713198477869016,
"grad_norm": 1.0682804584503174,
"learning_rate": 7.274144235446023e-05,
"loss": 2.0759,
"step": 592
},
{
"epoch": 0.2375325455637893,
"grad_norm": 1.245468020439148,
"learning_rate": 7.243626441830009e-05,
"loss": 2.0305,
"step": 593
},
{
"epoch": 0.23793310634888845,
"grad_norm": 1.3784598112106323,
"learning_rate": 7.213136404865124e-05,
"loss": 2.1201,
"step": 594
},
{
"epoch": 0.23833366713398757,
"grad_norm": 1.378495693206787,
"learning_rate": 7.182674431585704e-05,
"loss": 1.9075,
"step": 595
},
{
"epoch": 0.23873422791908672,
"grad_norm": 1.284844994544983,
"learning_rate": 7.152240828743477e-05,
"loss": 2.0786,
"step": 596
},
{
"epoch": 0.23913478870418586,
"grad_norm": 1.0804252624511719,
"learning_rate": 7.12183590280449e-05,
"loss": 2.1232,
"step": 597
},
{
"epoch": 0.239535349489285,
"grad_norm": 1.2147008180618286,
"learning_rate": 7.09145995994601e-05,
"loss": 1.9367,
"step": 598
},
{
"epoch": 0.23993591027438413,
"grad_norm": 1.0713460445404053,
"learning_rate": 7.061113306053443e-05,
"loss": 1.8186,
"step": 599
},
{
"epoch": 0.24033647105948328,
"grad_norm": 1.237535834312439,
"learning_rate": 7.030796246717255e-05,
"loss": 2.1206,
"step": 600
},
{
"epoch": 0.24073703184458242,
"grad_norm": 1.4354087114334106,
"learning_rate": 7.000509087229895e-05,
"loss": 2.1741,
"step": 601
},
{
"epoch": 0.24113759262968154,
"grad_norm": 1.2553120851516724,
"learning_rate": 6.970252132582728e-05,
"loss": 1.8268,
"step": 602
},
{
"epoch": 0.2415381534147807,
"grad_norm": 1.192887783050537,
"learning_rate": 6.940025687462952e-05,
"loss": 1.6772,
"step": 603
},
{
"epoch": 0.24193871419987983,
"grad_norm": 1.409682035446167,
"learning_rate": 6.909830056250527e-05,
"loss": 2.1226,
"step": 604
},
{
"epoch": 0.24233927498497898,
"grad_norm": 1.2333297729492188,
"learning_rate": 6.87966554301513e-05,
"loss": 2.201,
"step": 605
},
{
"epoch": 0.2427398357700781,
"grad_norm": 1.374571442604065,
"learning_rate": 6.849532451513074e-05,
"loss": 1.7252,
"step": 606
},
{
"epoch": 0.24314039655517725,
"grad_norm": 1.368024230003357,
"learning_rate": 6.819431085184251e-05,
"loss": 1.9868,
"step": 607
},
{
"epoch": 0.2435409573402764,
"grad_norm": 1.25948965549469,
"learning_rate": 6.789361747149093e-05,
"loss": 1.5042,
"step": 608
},
{
"epoch": 0.24394151812537554,
"grad_norm": 0.9880972504615784,
"learning_rate": 6.759324740205495e-05,
"loss": 1.8699,
"step": 609
},
{
"epoch": 0.24434207891047466,
"grad_norm": 1.016918420791626,
"learning_rate": 6.729320366825784e-05,
"loss": 1.7774,
"step": 610
},
{
"epoch": 0.2447426396955738,
"grad_norm": 1.3218151330947876,
"learning_rate": 6.699348929153668e-05,
"loss": 2.0521,
"step": 611
},
{
"epoch": 0.24514320048067295,
"grad_norm": 1.3361955881118774,
"learning_rate": 6.669410729001193e-05,
"loss": 1.9939,
"step": 612
},
{
"epoch": 0.24554376126577207,
"grad_norm": 1.083174705505371,
"learning_rate": 6.639506067845697e-05,
"loss": 1.9702,
"step": 613
},
{
"epoch": 0.24594432205087122,
"grad_norm": 1.1985284090042114,
"learning_rate": 6.609635246826794e-05,
"loss": 1.3102,
"step": 614
},
{
"epoch": 0.24634488283597036,
"grad_norm": 0.9790166020393372,
"learning_rate": 6.579798566743314e-05,
"loss": 2.3672,
"step": 615
},
{
"epoch": 0.2467454436210695,
"grad_norm": 1.3507696390151978,
"learning_rate": 6.549996328050296e-05,
"loss": 2.1707,
"step": 616
},
{
"epoch": 0.24714600440616863,
"grad_norm": 1.266715168952942,
"learning_rate": 6.52022883085595e-05,
"loss": 2.0666,
"step": 617
},
{
"epoch": 0.24754656519126778,
"grad_norm": 1.0168043375015259,
"learning_rate": 6.490496374918647e-05,
"loss": 2.1277,
"step": 618
},
{
"epoch": 0.24794712597636692,
"grad_norm": 0.8772782683372498,
"learning_rate": 6.460799259643884e-05,
"loss": 2.0311,
"step": 619
},
{
"epoch": 0.24834768676146604,
"grad_norm": 1.4147906303405762,
"learning_rate": 6.431137784081282e-05,
"loss": 2.038,
"step": 620
},
{
"epoch": 0.2487482475465652,
"grad_norm": 1.2074676752090454,
"learning_rate": 6.401512246921576e-05,
"loss": 2.0425,
"step": 621
},
{
"epoch": 0.24914880833166433,
"grad_norm": 1.3896255493164062,
"learning_rate": 6.371922946493591e-05,
"loss": 1.9922,
"step": 622
},
{
"epoch": 0.24954936911676348,
"grad_norm": 1.2395154237747192,
"learning_rate": 6.342370180761256e-05,
"loss": 1.7281,
"step": 623
},
{
"epoch": 0.2499499299018626,
"grad_norm": 1.0640504360198975,
"learning_rate": 6.312854247320595e-05,
"loss": 1.9935,
"step": 624
},
{
"epoch": 0.25035049068696175,
"grad_norm": 1.1218020915985107,
"learning_rate": 6.283375443396726e-05,
"loss": 2.1524,
"step": 625
},
{
"epoch": 0.2507510514720609,
"grad_norm": 1.1880977153778076,
"learning_rate": 6.25393406584088e-05,
"loss": 2.1081,
"step": 626
},
{
"epoch": 0.25115161225716004,
"grad_norm": 1.4129986763000488,
"learning_rate": 6.224530411127403e-05,
"loss": 1.6092,
"step": 627
},
{
"epoch": 0.2515521730422592,
"grad_norm": 0.9830234050750732,
"learning_rate": 6.19516477535077e-05,
"loss": 2.0331,
"step": 628
},
{
"epoch": 0.2519527338273583,
"grad_norm": 0.9556760787963867,
"learning_rate": 6.165837454222608e-05,
"loss": 1.9718,
"step": 629
},
{
"epoch": 0.2523532946124574,
"grad_norm": 1.383091926574707,
"learning_rate": 6.136548743068713e-05,
"loss": 1.8912,
"step": 630
},
{
"epoch": 0.25275385539755657,
"grad_norm": 1.5158681869506836,
"learning_rate": 6.107298936826086e-05,
"loss": 2.0875,
"step": 631
},
{
"epoch": 0.2531544161826557,
"grad_norm": 1.7562272548675537,
"learning_rate": 6.078088330039945e-05,
"loss": 2.3275,
"step": 632
},
{
"epoch": 0.25355497696775486,
"grad_norm": 0.8706815838813782,
"learning_rate": 6.048917216860781e-05,
"loss": 2.0608,
"step": 633
},
{
"epoch": 0.253955537752854,
"grad_norm": 1.0561360120773315,
"learning_rate": 6.019785891041381e-05,
"loss": 1.7948,
"step": 634
},
{
"epoch": 0.25435609853795316,
"grad_norm": 1.234235405921936,
"learning_rate": 5.9906946459338656e-05,
"loss": 1.8081,
"step": 635
},
{
"epoch": 0.25475665932305225,
"grad_norm": 1.2104214429855347,
"learning_rate": 5.9616437744867535e-05,
"loss": 1.9068,
"step": 636
},
{
"epoch": 0.2551572201081514,
"grad_norm": 1.253570556640625,
"learning_rate": 5.9326335692419995e-05,
"loss": 1.9251,
"step": 637
},
{
"epoch": 0.25555778089325054,
"grad_norm": 1.436930775642395,
"learning_rate": 5.9036643223320475e-05,
"loss": 2.2092,
"step": 638
},
{
"epoch": 0.2559583416783497,
"grad_norm": 1.1123769283294678,
"learning_rate": 5.8747363254768894e-05,
"loss": 1.9229,
"step": 639
},
{
"epoch": 0.25635890246344883,
"grad_norm": 1.2202420234680176,
"learning_rate": 5.845849869981137e-05,
"loss": 1.8872,
"step": 640
},
{
"epoch": 0.256759463248548,
"grad_norm": 1.117271900177002,
"learning_rate": 5.817005246731073e-05,
"loss": 1.9297,
"step": 641
},
{
"epoch": 0.2571600240336471,
"grad_norm": 1.3900742530822754,
"learning_rate": 5.788202746191734e-05,
"loss": 1.7656,
"step": 642
},
{
"epoch": 0.2575605848187462,
"grad_norm": 1.078987956047058,
"learning_rate": 5.759442658403985e-05,
"loss": 1.7871,
"step": 643
},
{
"epoch": 0.25796114560384537,
"grad_norm": 1.2144380807876587,
"learning_rate": 5.7307252729815833e-05,
"loss": 1.9427,
"step": 644
},
{
"epoch": 0.2583617063889445,
"grad_norm": 1.1583763360977173,
"learning_rate": 5.702050879108284e-05,
"loss": 1.9859,
"step": 645
},
{
"epoch": 0.25876226717404366,
"grad_norm": 1.332306146621704,
"learning_rate": 5.6734197655349156e-05,
"loss": 1.9659,
"step": 646
},
{
"epoch": 0.2591628279591428,
"grad_norm": 1.2533334493637085,
"learning_rate": 5.6448322205764794e-05,
"loss": 1.9083,
"step": 647
},
{
"epoch": 0.25956338874424195,
"grad_norm": 1.2647238969802856,
"learning_rate": 5.616288532109225e-05,
"loss": 1.9254,
"step": 648
},
{
"epoch": 0.2599639495293411,
"grad_norm": 1.1436094045639038,
"learning_rate": 5.5877889875677845e-05,
"loss": 1.5689,
"step": 649
},
{
"epoch": 0.2603645103144402,
"grad_norm": 1.4330624341964722,
"learning_rate": 5.559333873942259e-05,
"loss": 1.6846,
"step": 650
},
{
"epoch": 0.26076507109953934,
"grad_norm": 1.3361932039260864,
"learning_rate": 5.530923477775323e-05,
"loss": 1.7006,
"step": 651
},
{
"epoch": 0.2611656318846385,
"grad_norm": 1.4416998624801636,
"learning_rate": 5.5025580851593436e-05,
"loss": 1.8457,
"step": 652
},
{
"epoch": 0.26156619266973763,
"grad_norm": 1.2435954809188843,
"learning_rate": 5.474237981733521e-05,
"loss": 1.9747,
"step": 653
},
{
"epoch": 0.2619667534548368,
"grad_norm": 1.2450804710388184,
"learning_rate": 5.445963452680973e-05,
"loss": 2.5783,
"step": 654
},
{
"epoch": 0.2623673142399359,
"grad_norm": 1.004050374031067,
"learning_rate": 5.417734782725896e-05,
"loss": 2.3425,
"step": 655
},
{
"epoch": 0.26276787502503507,
"grad_norm": 1.2414133548736572,
"learning_rate": 5.38955225613069e-05,
"loss": 1.9871,
"step": 656
},
{
"epoch": 0.26316843581013416,
"grad_norm": 1.286872148513794,
"learning_rate": 5.361416156693075e-05,
"loss": 1.7411,
"step": 657
},
{
"epoch": 0.2635689965952333,
"grad_norm": 1.1407065391540527,
"learning_rate": 5.333326767743263e-05,
"loss": 1.9724,
"step": 658
},
{
"epoch": 0.26396955738033245,
"grad_norm": 1.1503363847732544,
"learning_rate": 5.305284372141095e-05,
"loss": 2.1432,
"step": 659
},
{
"epoch": 0.2643701181654316,
"grad_norm": 1.2722294330596924,
"learning_rate": 5.277289252273174e-05,
"loss": 1.5037,
"step": 660
},
{
"epoch": 0.26477067895053075,
"grad_norm": 1.1083940267562866,
"learning_rate": 5.249341690050051e-05,
"loss": 2.3438,
"step": 661
},
{
"epoch": 0.2651712397356299,
"grad_norm": 1.5369712114334106,
"learning_rate": 5.221441966903371e-05,
"loss": 2.329,
"step": 662
},
{
"epoch": 0.26557180052072904,
"grad_norm": 1.0785249471664429,
"learning_rate": 5.193590363783028e-05,
"loss": 1.7842,
"step": 663
},
{
"epoch": 0.2659723613058282,
"grad_norm": 1.3311983346939087,
"learning_rate": 5.1657871611543605e-05,
"loss": 2.1037,
"step": 664
},
{
"epoch": 0.2663729220909273,
"grad_norm": 1.1631267070770264,
"learning_rate": 5.138032638995315e-05,
"loss": 1.7456,
"step": 665
},
{
"epoch": 0.2667734828760264,
"grad_norm": 1.1494488716125488,
"learning_rate": 5.110327076793613e-05,
"loss": 1.8558,
"step": 666
},
{
"epoch": 0.26717404366112557,
"grad_norm": 1.537941575050354,
"learning_rate": 5.082670753543961e-05,
"loss": 2.1186,
"step": 667
},
{
"epoch": 0.2675746044462247,
"grad_norm": 1.0077687501907349,
"learning_rate": 5.055063947745233e-05,
"loss": 1.9591,
"step": 668
},
{
"epoch": 0.26797516523132386,
"grad_norm": 1.5373225212097168,
"learning_rate": 5.027506937397652e-05,
"loss": 1.9692,
"step": 669
},
{
"epoch": 0.268375726016423,
"grad_norm": 1.1492283344268799,
"learning_rate": 5.000000000000002e-05,
"loss": 1.8725,
"step": 670
},
{
"epoch": 0.26877628680152216,
"grad_norm": 1.2667278051376343,
"learning_rate": 4.972543412546842e-05,
"loss": 1.73,
"step": 671
},
{
"epoch": 0.26917684758662125,
"grad_norm": 1.2694602012634277,
"learning_rate": 4.945137451525707e-05,
"loss": 1.8786,
"step": 672
},
{
"epoch": 0.2695774083717204,
"grad_norm": 1.0184561014175415,
"learning_rate": 4.9177823929143106e-05,
"loss": 1.8922,
"step": 673
},
{
"epoch": 0.26997796915681954,
"grad_norm": 1.1676651239395142,
"learning_rate": 4.890478512177795e-05,
"loss": 1.8485,
"step": 674
},
{
"epoch": 0.2703785299419187,
"grad_norm": 1.315293550491333,
"learning_rate": 4.8632260842659393e-05,
"loss": 2.0489,
"step": 675
},
{
"epoch": 0.27077909072701783,
"grad_norm": 1.356357455253601,
"learning_rate": 4.836025383610382e-05,
"loss": 1.8123,
"step": 676
},
{
"epoch": 0.271179651512117,
"grad_norm": 0.9495698809623718,
"learning_rate": 4.808876684121881e-05,
"loss": 2.4272,
"step": 677
},
{
"epoch": 0.27158021229721613,
"grad_norm": 1.4226418733596802,
"learning_rate": 4.7817802591875426e-05,
"loss": 1.7676,
"step": 678
},
{
"epoch": 0.2719807730823152,
"grad_norm": 1.260791301727295,
"learning_rate": 4.754736381668057e-05,
"loss": 1.8301,
"step": 679
},
{
"epoch": 0.27238133386741437,
"grad_norm": 1.0678155422210693,
"learning_rate": 4.727745323894976e-05,
"loss": 1.9342,
"step": 680
},
{
"epoch": 0.2727818946525135,
"grad_norm": 1.5142219066619873,
"learning_rate": 4.700807357667952e-05,
"loss": 1.8998,
"step": 681
},
{
"epoch": 0.27318245543761266,
"grad_norm": 1.2509609460830688,
"learning_rate": 4.673922754252002e-05,
"loss": 2.0971,
"step": 682
},
{
"epoch": 0.2735830162227118,
"grad_norm": 1.1951498985290527,
"learning_rate": 4.647091784374785e-05,
"loss": 2.3055,
"step": 683
},
{
"epoch": 0.27398357700781095,
"grad_norm": 0.9153628945350647,
"learning_rate": 4.620314718223876e-05,
"loss": 1.591,
"step": 684
},
{
"epoch": 0.2743841377929101,
"grad_norm": 1.1153326034545898,
"learning_rate": 4.593591825444028e-05,
"loss": 1.9545,
"step": 685
},
{
"epoch": 0.2747846985780092,
"grad_norm": 1.1780824661254883,
"learning_rate": 4.566923375134472e-05,
"loss": 1.9943,
"step": 686
},
{
"epoch": 0.27518525936310834,
"grad_norm": 1.0134261846542358,
"learning_rate": 4.5403096358462095e-05,
"loss": 2.4141,
"step": 687
},
{
"epoch": 0.2755858201482075,
"grad_norm": 1.2110122442245483,
"learning_rate": 4.513750875579303e-05,
"loss": 1.7741,
"step": 688
},
{
"epoch": 0.27598638093330663,
"grad_norm": 1.648859977722168,
"learning_rate": 4.487247361780169e-05,
"loss": 1.9172,
"step": 689
},
{
"epoch": 0.2763869417184058,
"grad_norm": 1.2855784893035889,
"learning_rate": 4.4607993613388976e-05,
"loss": 2.0266,
"step": 690
},
{
"epoch": 0.2767875025035049,
"grad_norm": 1.065869927406311,
"learning_rate": 4.434407140586565e-05,
"loss": 1.9958,
"step": 691
},
{
"epoch": 0.27718806328860407,
"grad_norm": 1.305423617362976,
"learning_rate": 4.4080709652925336e-05,
"loss": 1.5812,
"step": 692
},
{
"epoch": 0.27758862407370316,
"grad_norm": 1.1796246767044067,
"learning_rate": 4.3817911006617986e-05,
"loss": 1.9753,
"step": 693
},
{
"epoch": 0.2779891848588023,
"grad_norm": 1.5826653242111206,
"learning_rate": 4.355567811332311e-05,
"loss": 1.8844,
"step": 694
},
{
"epoch": 0.27838974564390145,
"grad_norm": 1.1710503101348877,
"learning_rate": 4.329401361372294e-05,
"loss": 1.8832,
"step": 695
},
{
"epoch": 0.2787903064290006,
"grad_norm": 1.4755582809448242,
"learning_rate": 4.3032920142776125e-05,
"loss": 1.7222,
"step": 696
},
{
"epoch": 0.27919086721409975,
"grad_norm": 1.193444013595581,
"learning_rate": 4.277240032969105e-05,
"loss": 1.9375,
"step": 697
},
{
"epoch": 0.2795914279991989,
"grad_norm": 1.4020665884017944,
"learning_rate": 4.251245679789928e-05,
"loss": 1.5222,
"step": 698
},
{
"epoch": 0.27999198878429804,
"grad_norm": 1.6775052547454834,
"learning_rate": 4.225309216502933e-05,
"loss": 2.3145,
"step": 699
},
{
"epoch": 0.28039254956939713,
"grad_norm": 1.5346165895462036,
"learning_rate": 4.19943090428802e-05,
"loss": 1.8626,
"step": 700
},
{
"epoch": 0.2807931103544963,
"grad_norm": 1.1919195652008057,
"learning_rate": 4.173611003739498e-05,
"loss": 1.8406,
"step": 701
},
{
"epoch": 0.2811936711395954,
"grad_norm": 1.2349470853805542,
"learning_rate": 4.147849774863488e-05,
"loss": 1.5201,
"step": 702
},
{
"epoch": 0.28159423192469457,
"grad_norm": 1.1646391153335571,
"learning_rate": 4.12214747707527e-05,
"loss": 2.0479,
"step": 703
},
{
"epoch": 0.2819947927097937,
"grad_norm": 1.2893829345703125,
"learning_rate": 4.096504369196704e-05,
"loss": 1.9092,
"step": 704
},
{
"epoch": 0.28239535349489286,
"grad_norm": 0.9188634753227234,
"learning_rate": 4.070920709453597e-05,
"loss": 2.0369,
"step": 705
},
{
"epoch": 0.282795914279992,
"grad_norm": 1.1880841255187988,
"learning_rate": 4.045396755473121e-05,
"loss": 2.2927,
"step": 706
},
{
"epoch": 0.2831964750650911,
"grad_norm": 1.1953880786895752,
"learning_rate": 4.019932764281211e-05,
"loss": 1.9961,
"step": 707
},
{
"epoch": 0.28359703585019025,
"grad_norm": 1.1551967859268188,
"learning_rate": 3.994528992299971e-05,
"loss": 2.1265,
"step": 708
},
{
"epoch": 0.2839975966352894,
"grad_norm": 1.254237413406372,
"learning_rate": 3.969185695345105e-05,
"loss": 1.621,
"step": 709
},
{
"epoch": 0.28439815742038854,
"grad_norm": 1.0534310340881348,
"learning_rate": 3.943903128623335e-05,
"loss": 1.9361,
"step": 710
},
{
"epoch": 0.2847987182054877,
"grad_norm": 1.0625948905944824,
"learning_rate": 3.918681546729822e-05,
"loss": 1.5379,
"step": 711
},
{
"epoch": 0.28519927899058684,
"grad_norm": 1.3185557126998901,
"learning_rate": 3.893521203645618e-05,
"loss": 1.6286,
"step": 712
},
{
"epoch": 0.285599839775686,
"grad_norm": 1.3816808462142944,
"learning_rate": 3.8684223527351025e-05,
"loss": 1.7597,
"step": 713
},
{
"epoch": 0.2860004005607851,
"grad_norm": 1.2613497972488403,
"learning_rate": 3.843385246743417e-05,
"loss": 1.7095,
"step": 714
},
{
"epoch": 0.2864009613458842,
"grad_norm": 1.2452985048294067,
"learning_rate": 3.8184101377939476e-05,
"loss": 1.8584,
"step": 715
},
{
"epoch": 0.28680152213098337,
"grad_norm": 1.6921788454055786,
"learning_rate": 3.7934972773857634e-05,
"loss": 2.0267,
"step": 716
},
{
"epoch": 0.2872020829160825,
"grad_norm": 1.1087079048156738,
"learning_rate": 3.7686469163910885e-05,
"loss": 1.7955,
"step": 717
},
{
"epoch": 0.28760264370118166,
"grad_norm": 1.2214350700378418,
"learning_rate": 3.7438593050527845e-05,
"loss": 1.9203,
"step": 718
},
{
"epoch": 0.2880032044862808,
"grad_norm": 1.0154850482940674,
"learning_rate": 3.719134692981826e-05,
"loss": 2.2155,
"step": 719
},
{
"epoch": 0.28840376527137995,
"grad_norm": 1.018591046333313,
"learning_rate": 3.694473329154778e-05,
"loss": 1.7213,
"step": 720
},
{
"epoch": 0.28880432605647904,
"grad_norm": 1.3071078062057495,
"learning_rate": 3.669875461911297e-05,
"loss": 2.0372,
"step": 721
},
{
"epoch": 0.2892048868415782,
"grad_norm": 1.1150556802749634,
"learning_rate": 3.645341338951639e-05,
"loss": 2.0046,
"step": 722
},
{
"epoch": 0.28960544762667734,
"grad_norm": 1.0924551486968994,
"learning_rate": 3.62087120733415e-05,
"loss": 1.8254,
"step": 723
},
{
"epoch": 0.2900060084117765,
"grad_norm": 1.1697030067443848,
"learning_rate": 3.5964653134727776e-05,
"loss": 1.9609,
"step": 724
},
{
"epoch": 0.29040656919687563,
"grad_norm": 0.9879043698310852,
"learning_rate": 3.5721239031346066e-05,
"loss": 1.9276,
"step": 725
},
{
"epoch": 0.2908071299819748,
"grad_norm": 1.258763313293457,
"learning_rate": 3.547847221437372e-05,
"loss": 1.6585,
"step": 726
},
{
"epoch": 0.2912076907670739,
"grad_norm": 1.2708990573883057,
"learning_rate": 3.523635512846981e-05,
"loss": 1.9371,
"step": 727
},
{
"epoch": 0.291608251552173,
"grad_norm": 0.940768301486969,
"learning_rate": 3.4994890211750754e-05,
"loss": 2.1374,
"step": 728
},
{
"epoch": 0.29200881233727216,
"grad_norm": 1.2373859882354736,
"learning_rate": 3.47540798957656e-05,
"loss": 2.1544,
"step": 729
},
{
"epoch": 0.2924093731223713,
"grad_norm": 1.1635286808013916,
"learning_rate": 3.45139266054715e-05,
"loss": 1.668,
"step": 730
},
{
"epoch": 0.29280993390747045,
"grad_norm": 1.1330822706222534,
"learning_rate": 3.4274432759209453e-05,
"loss": 1.9928,
"step": 731
},
{
"epoch": 0.2932104946925696,
"grad_norm": 1.0662771463394165,
"learning_rate": 3.4035600768679855e-05,
"loss": 1.4355,
"step": 732
},
{
"epoch": 0.29361105547766875,
"grad_norm": 1.1696702241897583,
"learning_rate": 3.379743303891815e-05,
"loss": 1.8776,
"step": 733
},
{
"epoch": 0.2940116162627679,
"grad_norm": 1.2918500900268555,
"learning_rate": 3.3559931968270753e-05,
"loss": 2.0556,
"step": 734
},
{
"epoch": 0.29441217704786704,
"grad_norm": 1.015869379043579,
"learning_rate": 3.332309994837085e-05,
"loss": 1.7284,
"step": 735
},
{
"epoch": 0.29481273783296613,
"grad_norm": 1.0896291732788086,
"learning_rate": 3.308693936411421e-05,
"loss": 2.1261,
"step": 736
},
{
"epoch": 0.2952132986180653,
"grad_norm": 1.082911729812622,
"learning_rate": 3.2851452593635266e-05,
"loss": 2.0187,
"step": 737
},
{
"epoch": 0.2956138594031644,
"grad_norm": 1.2486083507537842,
"learning_rate": 3.2616642008283213e-05,
"loss": 1.7642,
"step": 738
},
{
"epoch": 0.29601442018826357,
"grad_norm": 1.1814159154891968,
"learning_rate": 3.238250997259808e-05,
"loss": 2.1014,
"step": 739
},
{
"epoch": 0.2964149809733627,
"grad_norm": 1.3076002597808838,
"learning_rate": 3.21490588442868e-05,
"loss": 1.7214,
"step": 740
},
{
"epoch": 0.29681554175846186,
"grad_norm": 1.1683399677276611,
"learning_rate": 3.191629097419966e-05,
"loss": 1.9912,
"step": 741
},
{
"epoch": 0.297216102543561,
"grad_norm": 1.4244028329849243,
"learning_rate": 3.1684208706306574e-05,
"loss": 2.1868,
"step": 742
},
{
"epoch": 0.2976166633286601,
"grad_norm": 1.08429753780365,
"learning_rate": 3.1452814377673346e-05,
"loss": 1.876,
"step": 743
},
{
"epoch": 0.29801722411375925,
"grad_norm": 1.1976194381713867,
"learning_rate": 3.1222110318438304e-05,
"loss": 1.9932,
"step": 744
},
{
"epoch": 0.2984177848988584,
"grad_norm": 0.9979033470153809,
"learning_rate": 3.099209885178882e-05,
"loss": 1.9214,
"step": 745
},
{
"epoch": 0.29881834568395754,
"grad_norm": 1.1685912609100342,
"learning_rate": 3.076278229393773e-05,
"loss": 2.2045,
"step": 746
},
{
"epoch": 0.2992189064690567,
"grad_norm": 1.0432329177856445,
"learning_rate": 3.053416295410026e-05,
"loss": 1.9744,
"step": 747
},
{
"epoch": 0.29961946725415584,
"grad_norm": 1.3380476236343384,
"learning_rate": 3.030624313447067e-05,
"loss": 2.2597,
"step": 748
},
{
"epoch": 0.300020028039255,
"grad_norm": 1.0045922994613647,
"learning_rate": 3.0079025130198935e-05,
"loss": 1.844,
"step": 749
},
{
"epoch": 0.3004205888243541,
"grad_norm": 1.2213352918624878,
"learning_rate": 2.9852511229367865e-05,
"loss": 1.7077,
"step": 750
},
{
"epoch": 0.3004205888243541,
"eval_loss": 1.9114018678665161,
"eval_runtime": 32.8226,
"eval_samples_per_second": 32.051,
"eval_steps_per_second": 16.026,
"step": 750
},
{
"epoch": 0.3008211496094532,
"grad_norm": 1.2960726022720337,
"learning_rate": 2.962670371296996e-05,
"loss": 2.397,
"step": 751
},
{
"epoch": 0.30122171039455237,
"grad_norm": 1.0980877876281738,
"learning_rate": 2.9401604854884357e-05,
"loss": 2.2021,
"step": 752
},
{
"epoch": 0.3016222711796515,
"grad_norm": 1.016575813293457,
"learning_rate": 2.91772169218541e-05,
"loss": 1.7718,
"step": 753
},
{
"epoch": 0.30202283196475066,
"grad_norm": 1.2587579488754272,
"learning_rate": 2.8953542173463133e-05,
"loss": 1.8161,
"step": 754
},
{
"epoch": 0.3024233927498498,
"grad_norm": 1.3101757764816284,
"learning_rate": 2.8730582862113742e-05,
"loss": 1.6577,
"step": 755
},
{
"epoch": 0.30282395353494895,
"grad_norm": 1.2363008260726929,
"learning_rate": 2.8508341233003654e-05,
"loss": 2.0945,
"step": 756
},
{
"epoch": 0.30322451432004804,
"grad_norm": 1.2293903827667236,
"learning_rate": 2.828681952410366e-05,
"loss": 1.8652,
"step": 757
},
{
"epoch": 0.3036250751051472,
"grad_norm": 1.2226698398590088,
"learning_rate": 2.8066019966134904e-05,
"loss": 2.0198,
"step": 758
},
{
"epoch": 0.30402563589024634,
"grad_norm": 1.4481924772262573,
"learning_rate": 2.7845944782546453e-05,
"loss": 1.861,
"step": 759
},
{
"epoch": 0.3044261966753455,
"grad_norm": 1.2004791498184204,
"learning_rate": 2.7626596189492983e-05,
"loss": 2.192,
"step": 760
},
{
"epoch": 0.30482675746044463,
"grad_norm": 1.1224644184112549,
"learning_rate": 2.7407976395812418e-05,
"loss": 2.108,
"step": 761
},
{
"epoch": 0.3052273182455438,
"grad_norm": 1.3192898035049438,
"learning_rate": 2.719008760300359e-05,
"loss": 1.7614,
"step": 762
},
{
"epoch": 0.3056278790306429,
"grad_norm": 1.3838907480239868,
"learning_rate": 2.6972932005204267e-05,
"loss": 1.9876,
"step": 763
},
{
"epoch": 0.306028439815742,
"grad_norm": 1.4712343215942383,
"learning_rate": 2.6756511789168925e-05,
"loss": 2.0119,
"step": 764
},
{
"epoch": 0.30642900060084116,
"grad_norm": 1.0120660066604614,
"learning_rate": 2.654082913424668e-05,
"loss": 2.0213,
"step": 765
},
{
"epoch": 0.3068295613859403,
"grad_norm": 1.2688238620758057,
"learning_rate": 2.6325886212359498e-05,
"loss": 1.6258,
"step": 766
},
{
"epoch": 0.30723012217103945,
"grad_norm": 1.1377239227294922,
"learning_rate": 2.6111685187980262e-05,
"loss": 1.7486,
"step": 767
},
{
"epoch": 0.3076306829561386,
"grad_norm": 1.1769511699676514,
"learning_rate": 2.589822821811083e-05,
"loss": 1.6603,
"step": 768
},
{
"epoch": 0.30803124374123775,
"grad_norm": 0.9931359887123108,
"learning_rate": 2.5685517452260567e-05,
"loss": 2.4301,
"step": 769
},
{
"epoch": 0.3084318045263369,
"grad_norm": 1.1136318445205688,
"learning_rate": 2.5473555032424533e-05,
"loss": 1.8987,
"step": 770
},
{
"epoch": 0.308832365311436,
"grad_norm": 0.8127551674842834,
"learning_rate": 2.5262343093061936e-05,
"loss": 1.9156,
"step": 771
},
{
"epoch": 0.30923292609653513,
"grad_norm": 1.2461220026016235,
"learning_rate": 2.5051883761074614e-05,
"loss": 1.9488,
"step": 772
},
{
"epoch": 0.3096334868816343,
"grad_norm": 1.610859990119934,
"learning_rate": 2.4842179155785737e-05,
"loss": 2.0289,
"step": 773
},
{
"epoch": 0.3100340476667334,
"grad_norm": 1.442642092704773,
"learning_rate": 2.4633231388918378e-05,
"loss": 1.9204,
"step": 774
},
{
"epoch": 0.31043460845183257,
"grad_norm": 0.9093045592308044,
"learning_rate": 2.4425042564574184e-05,
"loss": 1.9993,
"step": 775
},
{
"epoch": 0.3108351692369317,
"grad_norm": 0.9235540628433228,
"learning_rate": 2.4217614779212315e-05,
"loss": 1.7345,
"step": 776
},
{
"epoch": 0.31123573002203087,
"grad_norm": 1.202626347541809,
"learning_rate": 2.4010950121628318e-05,
"loss": 1.8141,
"step": 777
},
{
"epoch": 0.31163629080712996,
"grad_norm": 0.9984288811683655,
"learning_rate": 2.3805050672932928e-05,
"loss": 1.8233,
"step": 778
},
{
"epoch": 0.3120368515922291,
"grad_norm": 1.394755482673645,
"learning_rate": 2.3599918506531337e-05,
"loss": 1.8879,
"step": 779
},
{
"epoch": 0.31243741237732825,
"grad_norm": 1.5760648250579834,
"learning_rate": 2.339555568810221e-05,
"loss": 1.9124,
"step": 780
},
{
"epoch": 0.3128379731624274,
"grad_norm": 0.9450803995132446,
"learning_rate": 2.3191964275576805e-05,
"loss": 2.068,
"step": 781
},
{
"epoch": 0.31323853394752654,
"grad_norm": 1.023253083229065,
"learning_rate": 2.2989146319118425e-05,
"loss": 2.166,
"step": 782
},
{
"epoch": 0.3136390947326257,
"grad_norm": 1.1726493835449219,
"learning_rate": 2.2787103861101655e-05,
"loss": 1.9661,
"step": 783
},
{
"epoch": 0.31403965551772484,
"grad_norm": 1.1509053707122803,
"learning_rate": 2.2585838936091754e-05,
"loss": 2.0738,
"step": 784
},
{
"epoch": 0.3144402163028239,
"grad_norm": 1.0587100982666016,
"learning_rate": 2.2385353570824308e-05,
"loss": 1.8896,
"step": 785
},
{
"epoch": 0.3148407770879231,
"grad_norm": 1.2842707633972168,
"learning_rate": 2.2185649784184746e-05,
"loss": 1.9896,
"step": 786
},
{
"epoch": 0.3152413378730222,
"grad_norm": 1.2250515222549438,
"learning_rate": 2.198672958718796e-05,
"loss": 2.0292,
"step": 787
},
{
"epoch": 0.31564189865812137,
"grad_norm": 0.9199315309524536,
"learning_rate": 2.178859498295809e-05,
"loss": 2.0993,
"step": 788
},
{
"epoch": 0.3160424594432205,
"grad_norm": 1.379782795906067,
"learning_rate": 2.159124796670843e-05,
"loss": 1.9658,
"step": 789
},
{
"epoch": 0.31644302022831966,
"grad_norm": 1.297568440437317,
"learning_rate": 2.139469052572127e-05,
"loss": 1.8385,
"step": 790
},
{
"epoch": 0.3168435810134188,
"grad_norm": 1.2404309511184692,
"learning_rate": 2.119892463932781e-05,
"loss": 2.0786,
"step": 791
},
{
"epoch": 0.3172441417985179,
"grad_norm": 0.8843573331832886,
"learning_rate": 2.1003952278888382e-05,
"loss": 2.226,
"step": 792
},
{
"epoch": 0.31764470258361704,
"grad_norm": 1.3021501302719116,
"learning_rate": 2.0809775407772503e-05,
"loss": 2.0633,
"step": 793
},
{
"epoch": 0.3180452633687162,
"grad_norm": 1.5391861200332642,
"learning_rate": 2.0616395981339075e-05,
"loss": 2.0367,
"step": 794
},
{
"epoch": 0.31844582415381534,
"grad_norm": 1.118991732597351,
"learning_rate": 2.042381594691678e-05,
"loss": 1.94,
"step": 795
},
{
"epoch": 0.3188463849389145,
"grad_norm": 1.1585220098495483,
"learning_rate": 2.0232037243784475e-05,
"loss": 1.8611,
"step": 796
},
{
"epoch": 0.31924694572401363,
"grad_norm": 1.1401135921478271,
"learning_rate": 2.0041061803151508e-05,
"loss": 1.8363,
"step": 797
},
{
"epoch": 0.3196475065091128,
"grad_norm": 1.2198765277862549,
"learning_rate": 1.985089154813846e-05,
"loss": 2.0256,
"step": 798
},
{
"epoch": 0.3200480672942119,
"grad_norm": 1.0418367385864258,
"learning_rate": 1.9661528393757744e-05,
"loss": 1.9346,
"step": 799
},
{
"epoch": 0.320448628079311,
"grad_norm": 1.1914174556732178,
"learning_rate": 1.947297424689414e-05,
"loss": 2.2865,
"step": 800
},
{
"epoch": 0.32084918886441016,
"grad_norm": 1.3711671829223633,
"learning_rate": 1.9285231006285853e-05,
"loss": 2.423,
"step": 801
},
{
"epoch": 0.3212497496495093,
"grad_norm": 1.17362380027771,
"learning_rate": 1.9098300562505266e-05,
"loss": 1.5378,
"step": 802
},
{
"epoch": 0.32165031043460846,
"grad_norm": 0.9338995814323425,
"learning_rate": 1.8912184797939803e-05,
"loss": 2.2084,
"step": 803
},
{
"epoch": 0.3220508712197076,
"grad_norm": 1.059409499168396,
"learning_rate": 1.8726885586773212e-05,
"loss": 2.3115,
"step": 804
},
{
"epoch": 0.32245143200480675,
"grad_norm": 1.0587588548660278,
"learning_rate": 1.854240479496643e-05,
"loss": 1.6772,
"step": 805
},
{
"epoch": 0.3228519927899059,
"grad_norm": 1.5739694833755493,
"learning_rate": 1.835874428023905e-05,
"loss": 1.8971,
"step": 806
},
{
"epoch": 0.323252553575005,
"grad_norm": 1.096549391746521,
"learning_rate": 1.817590589205035e-05,
"loss": 1.9728,
"step": 807
},
{
"epoch": 0.32365311436010413,
"grad_norm": 1.1896045207977295,
"learning_rate": 1.7993891471580893e-05,
"loss": 1.4642,
"step": 808
},
{
"epoch": 0.3240536751452033,
"grad_norm": 1.2677874565124512,
"learning_rate": 1.7812702851713904e-05,
"loss": 1.9893,
"step": 809
},
{
"epoch": 0.3244542359303024,
"grad_norm": 1.2322319746017456,
"learning_rate": 1.763234185701673e-05,
"loss": 1.6648,
"step": 810
},
{
"epoch": 0.3248547967154016,
"grad_norm": 1.4150607585906982,
"learning_rate": 1.74528103037226e-05,
"loss": 1.8747,
"step": 811
},
{
"epoch": 0.3252553575005007,
"grad_norm": 1.1502705812454224,
"learning_rate": 1.7274109999712295e-05,
"loss": 1.8867,
"step": 812
},
{
"epoch": 0.32565591828559987,
"grad_norm": 1.0201531648635864,
"learning_rate": 1.7096242744495837e-05,
"loss": 1.9702,
"step": 813
},
{
"epoch": 0.32605647907069896,
"grad_norm": 1.109731912612915,
"learning_rate": 1.6919210329194533e-05,
"loss": 1.852,
"step": 814
},
{
"epoch": 0.3264570398557981,
"grad_norm": 1.1922898292541504,
"learning_rate": 1.6743014536522873e-05,
"loss": 1.7001,
"step": 815
},
{
"epoch": 0.32685760064089725,
"grad_norm": 0.8632221221923828,
"learning_rate": 1.6567657140770475e-05,
"loss": 1.7701,
"step": 816
},
{
"epoch": 0.3272581614259964,
"grad_norm": 1.106311321258545,
"learning_rate": 1.6393139907784404e-05,
"loss": 2.1824,
"step": 817
},
{
"epoch": 0.32765872221109554,
"grad_norm": 1.2513147592544556,
"learning_rate": 1.621946459495127e-05,
"loss": 2.1743,
"step": 818
},
{
"epoch": 0.3280592829961947,
"grad_norm": 1.183262825012207,
"learning_rate": 1.6046632951179508e-05,
"loss": 1.7933,
"step": 819
},
{
"epoch": 0.32845984378129384,
"grad_norm": 1.4637755155563354,
"learning_rate": 1.587464671688187e-05,
"loss": 1.4244,
"step": 820
},
{
"epoch": 0.3288604045663929,
"grad_norm": 1.0762394666671753,
"learning_rate": 1.5703507623957848e-05,
"loss": 1.9548,
"step": 821
},
{
"epoch": 0.3292609653514921,
"grad_norm": 1.5148048400878906,
"learning_rate": 1.553321739577619e-05,
"loss": 1.8027,
"step": 822
},
{
"epoch": 0.3296615261365912,
"grad_norm": 1.4003595113754272,
"learning_rate": 1.5363777747157572e-05,
"loss": 1.6786,
"step": 823
},
{
"epoch": 0.33006208692169037,
"grad_norm": 1.1532552242279053,
"learning_rate": 1.5195190384357404e-05,
"loss": 2.0791,
"step": 824
},
{
"epoch": 0.3304626477067895,
"grad_norm": 1.2315119504928589,
"learning_rate": 1.5027457005048573e-05,
"loss": 1.8975,
"step": 825
},
{
"epoch": 0.33086320849188866,
"grad_norm": 1.5469486713409424,
"learning_rate": 1.4860579298304312e-05,
"loss": 1.9729,
"step": 826
},
{
"epoch": 0.3312637692769878,
"grad_norm": 1.1972731351852417,
"learning_rate": 1.4694558944581293e-05,
"loss": 1.7436,
"step": 827
},
{
"epoch": 0.3316643300620869,
"grad_norm": 1.2163808345794678,
"learning_rate": 1.4529397615702656e-05,
"loss": 1.9608,
"step": 828
},
{
"epoch": 0.33206489084718604,
"grad_norm": 1.1661227941513062,
"learning_rate": 1.4365096974841108e-05,
"loss": 1.9195,
"step": 829
},
{
"epoch": 0.3324654516322852,
"grad_norm": 1.3404620885849,
"learning_rate": 1.4201658676502294e-05,
"loss": 1.9545,
"step": 830
},
{
"epoch": 0.33286601241738434,
"grad_norm": 1.3196473121643066,
"learning_rate": 1.4039084366508092e-05,
"loss": 1.789,
"step": 831
},
{
"epoch": 0.3332665732024835,
"grad_norm": 1.4525930881500244,
"learning_rate": 1.3877375681979943e-05,
"loss": 1.9036,
"step": 832
},
{
"epoch": 0.33366713398758263,
"grad_norm": 0.9184648990631104,
"learning_rate": 1.3716534251322544e-05,
"loss": 1.9158,
"step": 833
},
{
"epoch": 0.3340676947726818,
"grad_norm": 1.1989598274230957,
"learning_rate": 1.3556561694207338e-05,
"loss": 1.6822,
"step": 834
},
{
"epoch": 0.33446825555778087,
"grad_norm": 0.9898660182952881,
"learning_rate": 1.339745962155613e-05,
"loss": 2.1256,
"step": 835
},
{
"epoch": 0.33486881634288,
"grad_norm": 1.368600606918335,
"learning_rate": 1.3239229635525074e-05,
"loss": 1.592,
"step": 836
},
{
"epoch": 0.33526937712797916,
"grad_norm": 1.3661975860595703,
"learning_rate": 1.3081873329488392e-05,
"loss": 1.865,
"step": 837
},
{
"epoch": 0.3356699379130783,
"grad_norm": 0.9782090187072754,
"learning_rate": 1.2925392288022298e-05,
"loss": 1.8389,
"step": 838
},
{
"epoch": 0.33607049869817746,
"grad_norm": 1.5394399166107178,
"learning_rate": 1.2769788086889134e-05,
"loss": 2.0711,
"step": 839
},
{
"epoch": 0.3364710594832766,
"grad_norm": 1.2607556581497192,
"learning_rate": 1.2615062293021507e-05,
"loss": 2.0338,
"step": 840
},
{
"epoch": 0.33687162026837575,
"grad_norm": 1.4436510801315308,
"learning_rate": 1.2461216464506454e-05,
"loss": 2.074,
"step": 841
},
{
"epoch": 0.33727218105347484,
"grad_norm": 1.4884815216064453,
"learning_rate": 1.230825215056971e-05,
"loss": 2.0801,
"step": 842
},
{
"epoch": 0.337672741838574,
"grad_norm": 0.985197126865387,
"learning_rate": 1.2156170891560258e-05,
"loss": 2.1941,
"step": 843
},
{
"epoch": 0.33807330262367313,
"grad_norm": 1.5094271898269653,
"learning_rate": 1.2004974218934695e-05,
"loss": 2.1544,
"step": 844
},
{
"epoch": 0.3384738634087723,
"grad_norm": 1.4975275993347168,
"learning_rate": 1.1854663655241805e-05,
"loss": 2.3323,
"step": 845
},
{
"epoch": 0.3388744241938714,
"grad_norm": 1.178804636001587,
"learning_rate": 1.1705240714107302e-05,
"loss": 2.0121,
"step": 846
},
{
"epoch": 0.3392749849789706,
"grad_norm": 1.1911643743515015,
"learning_rate": 1.1556706900218572e-05,
"loss": 2.2518,
"step": 847
},
{
"epoch": 0.3396755457640697,
"grad_norm": 1.2257444858551025,
"learning_rate": 1.1409063709309442e-05,
"loss": 1.9825,
"step": 848
},
{
"epoch": 0.3400761065491688,
"grad_norm": 1.2943917512893677,
"learning_rate": 1.126231262814521e-05,
"loss": 1.9012,
"step": 849
},
{
"epoch": 0.34047666733426796,
"grad_norm": 1.4267241954803467,
"learning_rate": 1.1116455134507664e-05,
"loss": 1.9565,
"step": 850
},
{
"epoch": 0.3408772281193671,
"grad_norm": 1.4165103435516357,
"learning_rate": 1.0971492697180096e-05,
"loss": 1.9873,
"step": 851
},
{
"epoch": 0.34127778890446625,
"grad_norm": 0.963735044002533,
"learning_rate": 1.0827426775932658e-05,
"loss": 1.9228,
"step": 852
},
{
"epoch": 0.3416783496895654,
"grad_norm": 1.302394986152649,
"learning_rate": 1.068425882150762e-05,
"loss": 1.8196,
"step": 853
},
{
"epoch": 0.34207891047466454,
"grad_norm": 1.6307578086853027,
"learning_rate": 1.054199027560463e-05,
"loss": 2.2207,
"step": 854
},
{
"epoch": 0.3424794712597637,
"grad_norm": 1.1663156747817993,
"learning_rate": 1.0400622570866425e-05,
"loss": 1.3852,
"step": 855
},
{
"epoch": 0.3428800320448628,
"grad_norm": 1.391180157661438,
"learning_rate": 1.026015713086418e-05,
"loss": 2.0258,
"step": 856
},
{
"epoch": 0.34328059282996193,
"grad_norm": 0.9974930882453918,
"learning_rate": 1.0120595370083318e-05,
"loss": 2.1458,
"step": 857
},
{
"epoch": 0.3436811536150611,
"grad_norm": 1.4504176378250122,
"learning_rate": 9.98193869390922e-06,
"loss": 2.5077,
"step": 858
},
{
"epoch": 0.3440817144001602,
"grad_norm": 1.6607308387756348,
"learning_rate": 9.844188498613116e-06,
"loss": 1.9936,
"step": 859
},
{
"epoch": 0.34448227518525937,
"grad_norm": 1.0695178508758545,
"learning_rate": 9.707346171337894e-06,
"loss": 1.5378,
"step": 860
},
{
"epoch": 0.3448828359703585,
"grad_norm": 1.1689550876617432,
"learning_rate": 9.57141309008428e-06,
"loss": 2.1762,
"step": 861
},
{
"epoch": 0.34528339675545766,
"grad_norm": 1.247942566871643,
"learning_rate": 9.436390623696911e-06,
"loss": 2.2111,
"step": 862
},
{
"epoch": 0.34568395754055675,
"grad_norm": 1.3530837297439575,
"learning_rate": 9.302280131850539e-06,
"loss": 2.2161,
"step": 863
},
{
"epoch": 0.3460845183256559,
"grad_norm": 0.9715630412101746,
"learning_rate": 9.16908296503628e-06,
"loss": 1.8675,
"step": 864
},
{
"epoch": 0.34648507911075505,
"grad_norm": 1.2928553819656372,
"learning_rate": 9.036800464548157e-06,
"loss": 1.98,
"step": 865
},
{
"epoch": 0.3468856398958542,
"grad_norm": 1.3303308486938477,
"learning_rate": 8.905433962469489e-06,
"loss": 2.0134,
"step": 866
},
{
"epoch": 0.34728620068095334,
"grad_norm": 1.1572000980377197,
"learning_rate": 8.774984781659467e-06,
"loss": 1.9953,
"step": 867
},
{
"epoch": 0.3476867614660525,
"grad_norm": 1.1844559907913208,
"learning_rate": 8.645454235739903e-06,
"loss": 2.4239,
"step": 868
},
{
"epoch": 0.34808732225115163,
"grad_norm": 0.9763182401657104,
"learning_rate": 8.516843629081984e-06,
"loss": 2.2392,
"step": 869
},
{
"epoch": 0.3484878830362508,
"grad_norm": 1.408148169517517,
"learning_rate": 8.38915425679304e-06,
"loss": 1.792,
"step": 870
},
{
"epoch": 0.34888844382134987,
"grad_norm": 1.2217282056808472,
"learning_rate": 8.262387404703653e-06,
"loss": 1.5025,
"step": 871
},
{
"epoch": 0.349289004606449,
"grad_norm": 1.0182693004608154,
"learning_rate": 8.13654434935467e-06,
"loss": 1.684,
"step": 872
},
{
"epoch": 0.34968956539154816,
"grad_norm": 1.0316119194030762,
"learning_rate": 8.011626357984181e-06,
"loss": 1.9877,
"step": 873
},
{
"epoch": 0.3500901261766473,
"grad_norm": 1.3248041868209839,
"learning_rate": 7.887634688515e-06,
"loss": 2.0565,
"step": 874
},
{
"epoch": 0.35049068696174646,
"grad_norm": 1.2190947532653809,
"learning_rate": 7.764570589541875e-06,
"loss": 1.9459,
"step": 875
},
{
"epoch": 0.3508912477468456,
"grad_norm": 1.182137131690979,
"learning_rate": 7.642435300318907e-06,
"loss": 2.0345,
"step": 876
},
{
"epoch": 0.35129180853194475,
"grad_norm": 1.1659443378448486,
"learning_rate": 7.521230050747086e-06,
"loss": 2.1023,
"step": 877
},
{
"epoch": 0.35169236931704384,
"grad_norm": 1.0156196355819702,
"learning_rate": 7.400956061361974e-06,
"loss": 1.8653,
"step": 878
},
{
"epoch": 0.352092930102143,
"grad_norm": 1.0992286205291748,
"learning_rate": 7.281614543321269e-06,
"loss": 1.8927,
"step": 879
},
{
"epoch": 0.35249349088724213,
"grad_norm": 1.1435526609420776,
"learning_rate": 7.163206698392744e-06,
"loss": 2.099,
"step": 880
},
{
"epoch": 0.3528940516723413,
"grad_norm": 1.0202122926712036,
"learning_rate": 7.045733718942094e-06,
"loss": 1.8585,
"step": 881
},
{
"epoch": 0.3532946124574404,
"grad_norm": 1.4951303005218506,
"learning_rate": 6.929196787920899e-06,
"loss": 1.695,
"step": 882
},
{
"epoch": 0.3536951732425396,
"grad_norm": 1.1155850887298584,
"learning_rate": 6.813597078854772e-06,
"loss": 1.8532,
"step": 883
},
{
"epoch": 0.3540957340276387,
"grad_norm": 1.260799527168274,
"learning_rate": 6.698935755831492e-06,
"loss": 1.9453,
"step": 884
},
{
"epoch": 0.3544962948127378,
"grad_norm": 1.116297960281372,
"learning_rate": 6.585213973489335e-06,
"loss": 2.0739,
"step": 885
},
{
"epoch": 0.35489685559783696,
"grad_norm": 1.355909824371338,
"learning_rate": 6.472432877005341e-06,
"loss": 2.0201,
"step": 886
},
{
"epoch": 0.3552974163829361,
"grad_norm": 1.2766674757003784,
"learning_rate": 6.360593602083942e-06,
"loss": 1.8345,
"step": 887
},
{
"epoch": 0.35569797716803525,
"grad_norm": 1.0762203931808472,
"learning_rate": 6.2496972749453766e-06,
"loss": 1.8632,
"step": 888
},
{
"epoch": 0.3560985379531344,
"grad_norm": 1.4779655933380127,
"learning_rate": 6.139745012314424e-06,
"loss": 1.8136,
"step": 889
},
{
"epoch": 0.35649909873823354,
"grad_norm": 1.2446835041046143,
"learning_rate": 6.030737921409169e-06,
"loss": 2.1419,
"step": 890
},
{
"epoch": 0.3568996595233327,
"grad_norm": 1.3997869491577148,
"learning_rate": 5.922677099929786e-06,
"loss": 1.5943,
"step": 891
},
{
"epoch": 0.3573002203084318,
"grad_norm": 1.0760163068771362,
"learning_rate": 5.8155636360475385e-06,
"loss": 1.7411,
"step": 892
},
{
"epoch": 0.35770078109353093,
"grad_norm": 1.466942548751831,
"learning_rate": 5.709398608393835e-06,
"loss": 1.5269,
"step": 893
},
{
"epoch": 0.3581013418786301,
"grad_norm": 1.3610737323760986,
"learning_rate": 5.604183086049342e-06,
"loss": 2.1299,
"step": 894
},
{
"epoch": 0.3585019026637292,
"grad_norm": 1.5304806232452393,
"learning_rate": 5.499918128533155e-06,
"loss": 2.0253,
"step": 895
},
{
"epoch": 0.35890246344882837,
"grad_norm": 0.9878894090652466,
"learning_rate": 5.396604785792281e-06,
"loss": 1.9011,
"step": 896
},
{
"epoch": 0.3593030242339275,
"grad_norm": 1.012338638305664,
"learning_rate": 5.294244098190926e-06,
"loss": 1.8002,
"step": 897
},
{
"epoch": 0.35970358501902666,
"grad_norm": 1.3276349306106567,
"learning_rate": 5.192837096500058e-06,
"loss": 2.0386,
"step": 898
},
{
"epoch": 0.36010414580412575,
"grad_norm": 1.223771572113037,
"learning_rate": 5.092384801887074e-06,
"loss": 2.1836,
"step": 899
},
{
"epoch": 0.3605047065892249,
"grad_norm": 0.9753492474555969,
"learning_rate": 4.992888225905468e-06,
"loss": 1.6111,
"step": 900
},
{
"epoch": 0.36090526737432405,
"grad_norm": 0.7910905480384827,
"learning_rate": 4.8943483704846475e-06,
"loss": 1.903,
"step": 901
},
{
"epoch": 0.3613058281594232,
"grad_norm": 1.2467719316482544,
"learning_rate": 4.796766227919857e-06,
"loss": 1.9438,
"step": 902
},
{
"epoch": 0.36170638894452234,
"grad_norm": 1.1500917673110962,
"learning_rate": 4.700142780862205e-06,
"loss": 1.9579,
"step": 903
},
{
"epoch": 0.3621069497296215,
"grad_norm": 1.1933996677398682,
"learning_rate": 4.604479002308737e-06,
"loss": 2.3006,
"step": 904
},
{
"epoch": 0.36250751051472063,
"grad_norm": 1.1259969472885132,
"learning_rate": 4.509775855592613e-06,
"loss": 1.998,
"step": 905
},
{
"epoch": 0.3629080712998197,
"grad_norm": 1.130823016166687,
"learning_rate": 4.416034294373472e-06,
"loss": 1.8769,
"step": 906
},
{
"epoch": 0.36330863208491887,
"grad_norm": 1.02981698513031,
"learning_rate": 4.323255262627846e-06,
"loss": 1.9655,
"step": 907
},
{
"epoch": 0.363709192870018,
"grad_norm": 0.9289757609367371,
"learning_rate": 4.231439694639483e-06,
"loss": 2.0099,
"step": 908
},
{
"epoch": 0.36410975365511716,
"grad_norm": 1.203212857246399,
"learning_rate": 4.140588514990162e-06,
"loss": 1.9931,
"step": 909
},
{
"epoch": 0.3645103144402163,
"grad_norm": 1.1724556684494019,
"learning_rate": 4.050702638550275e-06,
"loss": 1.662,
"step": 910
},
{
"epoch": 0.36491087522531546,
"grad_norm": 1.212730050086975,
"learning_rate": 3.961782970469563e-06,
"loss": 2.0693,
"step": 911
},
{
"epoch": 0.3653114360104146,
"grad_norm": 0.8744038939476013,
"learning_rate": 3.873830406168111e-06,
"loss": 1.9265,
"step": 912
},
{
"epoch": 0.3657119967955137,
"grad_norm": 1.2729175090789795,
"learning_rate": 3.7868458313272904e-06,
"loss": 2.1908,
"step": 913
},
{
"epoch": 0.36611255758061284,
"grad_norm": 1.3766783475875854,
"learning_rate": 3.7008301218807716e-06,
"loss": 1.9221,
"step": 914
},
{
"epoch": 0.366513118365712,
"grad_norm": 1.1976795196533203,
"learning_rate": 3.615784144005796e-06,
"loss": 2.0681,
"step": 915
},
{
"epoch": 0.36691367915081113,
"grad_norm": 1.2433587312698364,
"learning_rate": 3.5317087541144377e-06,
"loss": 1.7831,
"step": 916
},
{
"epoch": 0.3673142399359103,
"grad_norm": 1.328249216079712,
"learning_rate": 3.448604798844912e-06,
"loss": 1.9766,
"step": 917
},
{
"epoch": 0.3677148007210094,
"grad_norm": 0.9622407555580139,
"learning_rate": 3.3664731150531482e-06,
"loss": 1.8986,
"step": 918
},
{
"epoch": 0.3681153615061086,
"grad_norm": 1.2278088331222534,
"learning_rate": 3.2853145298042953e-06,
"loss": 1.9074,
"step": 919
},
{
"epoch": 0.36851592229120766,
"grad_norm": 1.2879400253295898,
"learning_rate": 3.2051298603643753e-06,
"loss": 2.0471,
"step": 920
},
{
"epoch": 0.3689164830763068,
"grad_norm": 1.2228416204452515,
"learning_rate": 3.1259199141921435e-06,
"loss": 1.8044,
"step": 921
},
{
"epoch": 0.36931704386140596,
"grad_norm": 1.045020341873169,
"learning_rate": 3.047685488930874e-06,
"loss": 1.8997,
"step": 922
},
{
"epoch": 0.3697176046465051,
"grad_norm": 0.9093934893608093,
"learning_rate": 2.970427372400353e-06,
"loss": 1.651,
"step": 923
},
{
"epoch": 0.37011816543160425,
"grad_norm": 1.3145874738693237,
"learning_rate": 2.894146342588977e-06,
"loss": 2.1705,
"step": 924
},
{
"epoch": 0.3705187262167034,
"grad_norm": 1.227515697479248,
"learning_rate": 2.818843167645835e-06,
"loss": 2.4005,
"step": 925
},
{
"epoch": 0.37091928700180254,
"grad_norm": 0.9281034469604492,
"learning_rate": 2.744518605873092e-06,
"loss": 2.142,
"step": 926
},
{
"epoch": 0.37131984778690164,
"grad_norm": 1.3893688917160034,
"learning_rate": 2.6711734057182415e-06,
"loss": 1.9674,
"step": 927
},
{
"epoch": 0.3717204085720008,
"grad_norm": 1.1843217611312866,
"learning_rate": 2.5988083057666533e-06,
"loss": 2.0467,
"step": 928
},
{
"epoch": 0.37212096935709993,
"grad_norm": 1.1903239488601685,
"learning_rate": 2.5274240347340717e-06,
"loss": 1.9037,
"step": 929
},
{
"epoch": 0.3725215301421991,
"grad_norm": 1.1729291677474976,
"learning_rate": 2.4570213114592954e-06,
"loss": 1.9407,
"step": 930
},
{
"epoch": 0.3729220909272982,
"grad_norm": 1.4715547561645508,
"learning_rate": 2.3876008448969976e-06,
"loss": 2.1013,
"step": 931
},
{
"epoch": 0.37332265171239737,
"grad_norm": 1.2586127519607544,
"learning_rate": 2.3191633341104856e-06,
"loss": 1.8211,
"step": 932
},
{
"epoch": 0.3737232124974965,
"grad_norm": 1.3188387155532837,
"learning_rate": 2.2517094682647397e-06,
"loss": 1.7527,
"step": 933
},
{
"epoch": 0.3741237732825956,
"grad_norm": 1.0343830585479736,
"learning_rate": 2.1852399266194314e-06,
"loss": 1.4433,
"step": 934
},
{
"epoch": 0.37452433406769475,
"grad_norm": 1.2205039262771606,
"learning_rate": 2.119755378522137e-06,
"loss": 1.6247,
"step": 935
},
{
"epoch": 0.3749248948527939,
"grad_norm": 1.367773175239563,
"learning_rate": 2.05525648340148e-06,
"loss": 1.9733,
"step": 936
},
{
"epoch": 0.37532545563789305,
"grad_norm": 1.1995794773101807,
"learning_rate": 1.9917438907606556e-06,
"loss": 2.2017,
"step": 937
},
{
"epoch": 0.3757260164229922,
"grad_norm": 1.0902953147888184,
"learning_rate": 1.9292182401707603e-06,
"loss": 1.6807,
"step": 938
},
{
"epoch": 0.37612657720809134,
"grad_norm": 1.0587186813354492,
"learning_rate": 1.8676801612643957e-06,
"loss": 1.7358,
"step": 939
},
{
"epoch": 0.3765271379931905,
"grad_norm": 1.0900659561157227,
"learning_rate": 1.8071302737293295e-06,
"loss": 1.9882,
"step": 940
},
{
"epoch": 0.37692769877828963,
"grad_norm": 1.2947819232940674,
"learning_rate": 1.747569187302267e-06,
"loss": 1.8446,
"step": 941
},
{
"epoch": 0.3773282595633887,
"grad_norm": 0.9651957750320435,
"learning_rate": 1.6889975017626903e-06,
"loss": 1.9641,
"step": 942
},
{
"epoch": 0.37772882034848787,
"grad_norm": 1.0325901508331299,
"learning_rate": 1.6314158069267948e-06,
"loss": 1.9663,
"step": 943
},
{
"epoch": 0.378129381133587,
"grad_norm": 1.0935128927230835,
"learning_rate": 1.574824682641629e-06,
"loss": 1.6413,
"step": 944
},
{
"epoch": 0.37852994191868616,
"grad_norm": 1.0227198600769043,
"learning_rate": 1.5192246987791981e-06,
"loss": 1.8915,
"step": 945
},
{
"epoch": 0.3789305027037853,
"grad_norm": 1.019073486328125,
"learning_rate": 1.4646164152307018e-06,
"loss": 1.8609,
"step": 946
},
{
"epoch": 0.37933106348888446,
"grad_norm": 1.2001268863677979,
"learning_rate": 1.411000381900951e-06,
"loss": 1.95,
"step": 947
},
{
"epoch": 0.3797316242739836,
"grad_norm": 1.1757829189300537,
"learning_rate": 1.3583771387028265e-06,
"loss": 1.6127,
"step": 948
},
{
"epoch": 0.3801321850590827,
"grad_norm": 0.9995384216308594,
"learning_rate": 1.3067472155517735e-06,
"loss": 1.7688,
"step": 949
},
{
"epoch": 0.38053274584418184,
"grad_norm": 0.9961532354354858,
"learning_rate": 1.2561111323605712e-06,
"loss": 1.7373,
"step": 950
},
{
"epoch": 0.380933306629281,
"grad_norm": 1.0140630006790161,
"learning_rate": 1.2064693990339936e-06,
"loss": 1.901,
"step": 951
},
{
"epoch": 0.38133386741438013,
"grad_norm": 1.1691038608551025,
"learning_rate": 1.157822515463758e-06,
"loss": 1.8546,
"step": 952
},
{
"epoch": 0.3817344281994793,
"grad_norm": 1.1697040796279907,
"learning_rate": 1.1101709715234386e-06,
"loss": 1.7615,
"step": 953
},
{
"epoch": 0.3821349889845784,
"grad_norm": 1.594868540763855,
"learning_rate": 1.0635152470635512e-06,
"loss": 1.9847,
"step": 954
},
{
"epoch": 0.3825355497696776,
"grad_norm": 1.151361107826233,
"learning_rate": 1.0178558119067315e-06,
"loss": 2.078,
"step": 955
},
{
"epoch": 0.38293611055477667,
"grad_norm": 0.9073551297187805,
"learning_rate": 9.731931258429638e-07,
"loss": 1.9805,
"step": 956
},
{
"epoch": 0.3833366713398758,
"grad_norm": 1.2325596809387207,
"learning_rate": 9.295276386250274e-07,
"loss": 2.0565,
"step": 957
},
{
"epoch": 0.38373723212497496,
"grad_norm": 1.184550166130066,
"learning_rate": 8.868597899638898e-07,
"loss": 2.1695,
"step": 958
},
{
"epoch": 0.3841377929100741,
"grad_norm": 1.1788114309310913,
"learning_rate": 8.451900095242881e-07,
"loss": 2.0578,
"step": 959
},
{
"epoch": 0.38453835369517325,
"grad_norm": 1.1700098514556885,
"learning_rate": 8.04518716920466e-07,
"loss": 1.8347,
"step": 960
},
{
"epoch": 0.3849389144802724,
"grad_norm": 0.935607373714447,
"learning_rate": 7.648463217118984e-07,
"loss": 1.8536,
"step": 961
},
{
"epoch": 0.38533947526537154,
"grad_norm": 0.9701693058013916,
"learning_rate": 7.261732233991513e-07,
"loss": 1.6937,
"step": 962
},
{
"epoch": 0.38574003605047064,
"grad_norm": 1.195351243019104,
"learning_rate": 6.884998114198959e-07,
"loss": 2.1418,
"step": 963
},
{
"epoch": 0.3861405968355698,
"grad_norm": 1.6363762617111206,
"learning_rate": 6.518264651449779e-07,
"loss": 2.2872,
"step": 964
},
{
"epoch": 0.38654115762066893,
"grad_norm": 1.2114075422286987,
"learning_rate": 6.161535538745878e-07,
"loss": 2.1418,
"step": 965
},
{
"epoch": 0.3869417184057681,
"grad_norm": 1.7314434051513672,
"learning_rate": 5.814814368345412e-07,
"loss": 2.0097,
"step": 966
},
{
"epoch": 0.3873422791908672,
"grad_norm": 1.5273650884628296,
"learning_rate": 5.478104631726711e-07,
"loss": 2.0629,
"step": 967
},
{
"epoch": 0.38774283997596637,
"grad_norm": 1.0178067684173584,
"learning_rate": 5.151409719553079e-07,
"loss": 1.5887,
"step": 968
},
{
"epoch": 0.3881434007610655,
"grad_norm": 0.971847653388977,
"learning_rate": 4.834732921638719e-07,
"loss": 1.8772,
"step": 969
},
{
"epoch": 0.3885439615461646,
"grad_norm": 1.0141364336013794,
"learning_rate": 4.5280774269154115e-07,
"loss": 1.7437,
"step": 970
},
{
"epoch": 0.38894452233126375,
"grad_norm": 1.1142873764038086,
"learning_rate": 4.2314463234005565e-07,
"loss": 2.1158,
"step": 971
},
{
"epoch": 0.3893450831163629,
"grad_norm": 0.7133710384368896,
"learning_rate": 3.9448425981661876e-07,
"loss": 1.8973,
"step": 972
},
{
"epoch": 0.38974564390146205,
"grad_norm": 1.245099663734436,
"learning_rate": 3.6682691373086665e-07,
"loss": 1.9652,
"step": 973
},
{
"epoch": 0.3901462046865612,
"grad_norm": 1.2889657020568848,
"learning_rate": 3.401728725919373e-07,
"loss": 1.7035,
"step": 974
},
{
"epoch": 0.39054676547166034,
"grad_norm": 1.1345562934875488,
"learning_rate": 3.145224048057727e-07,
"loss": 1.6353,
"step": 975
},
{
"epoch": 0.3909473262567595,
"grad_norm": 1.1244771480560303,
"learning_rate": 2.898757686722542e-07,
"loss": 1.9095,
"step": 976
},
{
"epoch": 0.3913478870418586,
"grad_norm": 1.1727226972579956,
"learning_rate": 2.6623321238277157e-07,
"loss": 1.8767,
"step": 977
},
{
"epoch": 0.3917484478269577,
"grad_norm": 1.2802989482879639,
"learning_rate": 2.4359497401758024e-07,
"loss": 1.5799,
"step": 978
},
{
"epoch": 0.39214900861205687,
"grad_norm": 1.4607635736465454,
"learning_rate": 2.219612815434924e-07,
"loss": 2.0009,
"step": 979
},
{
"epoch": 0.392549569397156,
"grad_norm": 1.410239815711975,
"learning_rate": 2.0133235281156736e-07,
"loss": 2.0732,
"step": 980
},
{
"epoch": 0.39295013018225516,
"grad_norm": 0.9666495323181152,
"learning_rate": 1.817083955548693e-07,
"loss": 1.8358,
"step": 981
},
{
"epoch": 0.3933506909673543,
"grad_norm": 1.4496511220932007,
"learning_rate": 1.630896073864352e-07,
"loss": 1.8643,
"step": 982
},
{
"epoch": 0.39375125175245346,
"grad_norm": 1.2983746528625488,
"learning_rate": 1.4547617579725449e-07,
"loss": 1.9004,
"step": 983
},
{
"epoch": 0.39415181253755255,
"grad_norm": 1.286615014076233,
"learning_rate": 1.2886827815440372e-07,
"loss": 1.8282,
"step": 984
},
{
"epoch": 0.3945523733226517,
"grad_norm": 1.1125391721725464,
"learning_rate": 1.1326608169920372e-07,
"loss": 1.9587,
"step": 985
},
{
"epoch": 0.39495293410775084,
"grad_norm": 1.1754589080810547,
"learning_rate": 9.866974354560965e-08,
"loss": 1.8011,
"step": 986
},
{
"epoch": 0.39535349489285,
"grad_norm": 1.0687789916992188,
"learning_rate": 8.507941067859016e-08,
"loss": 1.8824,
"step": 987
},
{
"epoch": 0.39575405567794913,
"grad_norm": 1.156052589416504,
"learning_rate": 7.249521995263964e-08,
"loss": 1.8151,
"step": 988
},
{
"epoch": 0.3961546164630483,
"grad_norm": 1.0500197410583496,
"learning_rate": 6.09172980904238e-08,
"loss": 1.6616,
"step": 989
},
{
"epoch": 0.39655517724814743,
"grad_norm": 0.9670491218566895,
"learning_rate": 5.0345761681491746e-08,
"loss": 1.671,
"step": 990
},
{
"epoch": 0.3969557380332465,
"grad_norm": 1.1478677988052368,
"learning_rate": 4.078071718107701e-08,
"loss": 1.779,
"step": 991
},
{
"epoch": 0.39735629881834567,
"grad_norm": 1.2338886260986328,
"learning_rate": 3.2222260909087196e-08,
"loss": 1.8303,
"step": 992
},
{
"epoch": 0.3977568596034448,
"grad_norm": 0.9074128270149231,
"learning_rate": 2.4670479049082597e-08,
"loss": 1.8302,
"step": 993
},
{
"epoch": 0.39815742038854396,
"grad_norm": 1.1111611127853394,
"learning_rate": 1.81254476474213e-08,
"loss": 1.9118,
"step": 994
},
{
"epoch": 0.3985579811736431,
"grad_norm": 1.4943230152130127,
"learning_rate": 1.2587232612493172e-08,
"loss": 2.1329,
"step": 995
},
{
"epoch": 0.39895854195874225,
"grad_norm": 0.8442416191101074,
"learning_rate": 8.055889714064791e-09,
"loss": 1.832,
"step": 996
},
{
"epoch": 0.3993591027438414,
"grad_norm": 1.6363670825958252,
"learning_rate": 4.531464582713252e-09,
"loss": 2.3091,
"step": 997
},
{
"epoch": 0.3997596635289405,
"grad_norm": 1.186084270477295,
"learning_rate": 2.0139927093487664e-09,
"loss": 1.5448,
"step": 998
},
{
"epoch": 0.40016022431403964,
"grad_norm": 1.3950694799423218,
"learning_rate": 5.034994448926967e-10,
"loss": 1.846,
"step": 999
},
{
"epoch": 0.4005607850991388,
"grad_norm": 1.2405130863189697,
"learning_rate": 0.0,
"loss": 1.6719,
"step": 1000
},
{
"epoch": 0.4005607850991388,
"eval_loss": 1.906521201133728,
"eval_runtime": 32.8778,
"eval_samples_per_second": 31.997,
"eval_steps_per_second": 15.999,
"step": 1000
}
],
"logging_steps": 1,
"max_steps": 1000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 250,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.08473960997847e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}