sedrickkeh's picture
End of training
75bf175 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9914040114613183,
"eval_steps": 500,
"global_step": 348,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008595988538681949,
"grad_norm": 5.747197151184082,
"learning_rate": 2.8571428571428575e-07,
"loss": 0.7965,
"step": 1
},
{
"epoch": 0.017191977077363897,
"grad_norm": 6.075042247772217,
"learning_rate": 5.714285714285715e-07,
"loss": 0.8341,
"step": 2
},
{
"epoch": 0.025787965616045846,
"grad_norm": 6.132170677185059,
"learning_rate": 8.571428571428572e-07,
"loss": 0.832,
"step": 3
},
{
"epoch": 0.034383954154727794,
"grad_norm": 5.923718452453613,
"learning_rate": 1.142857142857143e-06,
"loss": 0.8212,
"step": 4
},
{
"epoch": 0.04297994269340974,
"grad_norm": 5.929561614990234,
"learning_rate": 1.4285714285714286e-06,
"loss": 0.8191,
"step": 5
},
{
"epoch": 0.05157593123209169,
"grad_norm": 5.550649166107178,
"learning_rate": 1.7142857142857145e-06,
"loss": 0.8166,
"step": 6
},
{
"epoch": 0.06017191977077364,
"grad_norm": 4.391837120056152,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.7911,
"step": 7
},
{
"epoch": 0.06876790830945559,
"grad_norm": 3.968264579772949,
"learning_rate": 2.285714285714286e-06,
"loss": 0.7448,
"step": 8
},
{
"epoch": 0.07736389684813753,
"grad_norm": 2.4340338706970215,
"learning_rate": 2.571428571428571e-06,
"loss": 0.7445,
"step": 9
},
{
"epoch": 0.08595988538681948,
"grad_norm": 2.141514301300049,
"learning_rate": 2.8571428571428573e-06,
"loss": 0.7332,
"step": 10
},
{
"epoch": 0.09455587392550144,
"grad_norm": 2.0045785903930664,
"learning_rate": 3.142857142857143e-06,
"loss": 0.7288,
"step": 11
},
{
"epoch": 0.10315186246418338,
"grad_norm": 2.2864484786987305,
"learning_rate": 3.428571428571429e-06,
"loss": 0.7309,
"step": 12
},
{
"epoch": 0.11174785100286533,
"grad_norm": 3.447997808456421,
"learning_rate": 3.7142857142857146e-06,
"loss": 0.6892,
"step": 13
},
{
"epoch": 0.12034383954154727,
"grad_norm": 3.6240851879119873,
"learning_rate": 4.000000000000001e-06,
"loss": 0.7057,
"step": 14
},
{
"epoch": 0.12893982808022922,
"grad_norm": 3.2834715843200684,
"learning_rate": 4.2857142857142855e-06,
"loss": 0.6638,
"step": 15
},
{
"epoch": 0.13753581661891118,
"grad_norm": 3.148726224899292,
"learning_rate": 4.571428571428572e-06,
"loss": 0.67,
"step": 16
},
{
"epoch": 0.14613180515759314,
"grad_norm": 2.258441209793091,
"learning_rate": 4.857142857142858e-06,
"loss": 0.6436,
"step": 17
},
{
"epoch": 0.15472779369627507,
"grad_norm": 1.8464128971099854,
"learning_rate": 5.142857142857142e-06,
"loss": 0.6459,
"step": 18
},
{
"epoch": 0.16332378223495703,
"grad_norm": 1.3139716386795044,
"learning_rate": 5.428571428571429e-06,
"loss": 0.6182,
"step": 19
},
{
"epoch": 0.17191977077363896,
"grad_norm": 1.0839262008666992,
"learning_rate": 5.7142857142857145e-06,
"loss": 0.6076,
"step": 20
},
{
"epoch": 0.18051575931232092,
"grad_norm": 1.0999958515167236,
"learning_rate": 6e-06,
"loss": 0.5989,
"step": 21
},
{
"epoch": 0.18911174785100288,
"grad_norm": 1.2038193941116333,
"learning_rate": 6.285714285714286e-06,
"loss": 0.6457,
"step": 22
},
{
"epoch": 0.1977077363896848,
"grad_norm": 1.0150710344314575,
"learning_rate": 6.571428571428572e-06,
"loss": 0.623,
"step": 23
},
{
"epoch": 0.20630372492836677,
"grad_norm": 0.790824830532074,
"learning_rate": 6.857142857142858e-06,
"loss": 0.5953,
"step": 24
},
{
"epoch": 0.2148997134670487,
"grad_norm": 0.7728639841079712,
"learning_rate": 7.1428571428571436e-06,
"loss": 0.5728,
"step": 25
},
{
"epoch": 0.22349570200573066,
"grad_norm": 1.012149691581726,
"learning_rate": 7.428571428571429e-06,
"loss": 0.5886,
"step": 26
},
{
"epoch": 0.23209169054441262,
"grad_norm": 0.9006128311157227,
"learning_rate": 7.714285714285716e-06,
"loss": 0.5881,
"step": 27
},
{
"epoch": 0.24068767908309455,
"grad_norm": 0.6326665282249451,
"learning_rate": 8.000000000000001e-06,
"loss": 0.5619,
"step": 28
},
{
"epoch": 0.2492836676217765,
"grad_norm": 0.7062392234802246,
"learning_rate": 8.285714285714287e-06,
"loss": 0.548,
"step": 29
},
{
"epoch": 0.25787965616045844,
"grad_norm": 0.9393576979637146,
"learning_rate": 8.571428571428571e-06,
"loss": 0.5796,
"step": 30
},
{
"epoch": 0.2664756446991404,
"grad_norm": 0.6972165107727051,
"learning_rate": 8.857142857142858e-06,
"loss": 0.5615,
"step": 31
},
{
"epoch": 0.27507163323782235,
"grad_norm": 0.6017346382141113,
"learning_rate": 9.142857142857144e-06,
"loss": 0.5623,
"step": 32
},
{
"epoch": 0.2836676217765043,
"grad_norm": 0.7082251906394958,
"learning_rate": 9.42857142857143e-06,
"loss": 0.5391,
"step": 33
},
{
"epoch": 0.2922636103151863,
"grad_norm": 0.7304179668426514,
"learning_rate": 9.714285714285715e-06,
"loss": 0.5259,
"step": 34
},
{
"epoch": 0.3008595988538682,
"grad_norm": 0.6440004110336304,
"learning_rate": 1e-05,
"loss": 0.5335,
"step": 35
},
{
"epoch": 0.30945558739255014,
"grad_norm": 0.6165776252746582,
"learning_rate": 9.999748146823376e-06,
"loss": 0.5574,
"step": 36
},
{
"epoch": 0.31805157593123207,
"grad_norm": 0.7019054293632507,
"learning_rate": 9.99899261266551e-06,
"loss": 0.5442,
"step": 37
},
{
"epoch": 0.32664756446991405,
"grad_norm": 0.5840720534324646,
"learning_rate": 9.997733473639876e-06,
"loss": 0.5429,
"step": 38
},
{
"epoch": 0.335243553008596,
"grad_norm": 0.5142186880111694,
"learning_rate": 9.995970856593739e-06,
"loss": 0.5571,
"step": 39
},
{
"epoch": 0.3438395415472779,
"grad_norm": 0.6258730292320251,
"learning_rate": 9.993704939095376e-06,
"loss": 0.5273,
"step": 40
},
{
"epoch": 0.3524355300859599,
"grad_norm": 0.6754815578460693,
"learning_rate": 9.9909359494162e-06,
"loss": 0.5099,
"step": 41
},
{
"epoch": 0.36103151862464183,
"grad_norm": 0.502207338809967,
"learning_rate": 9.987664166507749e-06,
"loss": 0.5502,
"step": 42
},
{
"epoch": 0.36962750716332377,
"grad_norm": 0.4611126184463501,
"learning_rate": 9.983889919973586e-06,
"loss": 0.5349,
"step": 43
},
{
"epoch": 0.37822349570200575,
"grad_norm": 0.5253018140792847,
"learning_rate": 9.979613590036108e-06,
"loss": 0.4949,
"step": 44
},
{
"epoch": 0.3868194842406877,
"grad_norm": 0.4556678533554077,
"learning_rate": 9.974835607498224e-06,
"loss": 0.5359,
"step": 45
},
{
"epoch": 0.3954154727793696,
"grad_norm": 0.4698951542377472,
"learning_rate": 9.969556453699966e-06,
"loss": 0.548,
"step": 46
},
{
"epoch": 0.4040114613180516,
"grad_norm": 0.5016157031059265,
"learning_rate": 9.963776660469996e-06,
"loss": 0.5027,
"step": 47
},
{
"epoch": 0.41260744985673353,
"grad_norm": 0.42699918150901794,
"learning_rate": 9.957496810072027e-06,
"loss": 0.501,
"step": 48
},
{
"epoch": 0.42120343839541546,
"grad_norm": 0.41390037536621094,
"learning_rate": 9.95071753514617e-06,
"loss": 0.491,
"step": 49
},
{
"epoch": 0.4297994269340974,
"grad_norm": 0.5382051467895508,
"learning_rate": 9.943439518645193e-06,
"loss": 0.5121,
"step": 50
},
{
"epoch": 0.4383954154727794,
"grad_norm": 0.46307608485221863,
"learning_rate": 9.935663493765726e-06,
"loss": 0.5101,
"step": 51
},
{
"epoch": 0.4469914040114613,
"grad_norm": 0.4218026101589203,
"learning_rate": 9.9273902438744e-06,
"loss": 0.4958,
"step": 52
},
{
"epoch": 0.45558739255014324,
"grad_norm": 0.44447770714759827,
"learning_rate": 9.918620602428916e-06,
"loss": 0.5011,
"step": 53
},
{
"epoch": 0.46418338108882523,
"grad_norm": 0.5056214332580566,
"learning_rate": 9.909355452894098e-06,
"loss": 0.5046,
"step": 54
},
{
"epoch": 0.47277936962750716,
"grad_norm": 0.45448923110961914,
"learning_rate": 9.899595728652883e-06,
"loss": 0.4908,
"step": 55
},
{
"epoch": 0.4813753581661891,
"grad_norm": 0.4472047984600067,
"learning_rate": 9.889342412912296e-06,
"loss": 0.5107,
"step": 56
},
{
"epoch": 0.4899713467048711,
"grad_norm": 0.47882989048957825,
"learning_rate": 9.878596538604388e-06,
"loss": 0.5082,
"step": 57
},
{
"epoch": 0.498567335243553,
"grad_norm": 0.43798163533210754,
"learning_rate": 9.867359188282193e-06,
"loss": 0.4987,
"step": 58
},
{
"epoch": 0.5071633237822349,
"grad_norm": 0.49822425842285156,
"learning_rate": 9.855631494010661e-06,
"loss": 0.5024,
"step": 59
},
{
"epoch": 0.5157593123209169,
"grad_norm": 0.42473432421684265,
"learning_rate": 9.843414637252615e-06,
"loss": 0.5184,
"step": 60
},
{
"epoch": 0.5243553008595988,
"grad_norm": 0.400786817073822,
"learning_rate": 9.830709848749727e-06,
"loss": 0.4978,
"step": 61
},
{
"epoch": 0.5329512893982808,
"grad_norm": 0.45118945837020874,
"learning_rate": 9.817518408398536e-06,
"loss": 0.498,
"step": 62
},
{
"epoch": 0.5415472779369628,
"grad_norm": 0.4021996557712555,
"learning_rate": 9.803841645121505e-06,
"loss": 0.5096,
"step": 63
},
{
"epoch": 0.5501432664756447,
"grad_norm": 0.4471394717693329,
"learning_rate": 9.78968093673314e-06,
"loss": 0.5044,
"step": 64
},
{
"epoch": 0.5587392550143266,
"grad_norm": 0.4328402876853943,
"learning_rate": 9.775037709801206e-06,
"loss": 0.4973,
"step": 65
},
{
"epoch": 0.5673352435530086,
"grad_norm": 0.43772783875465393,
"learning_rate": 9.759913439502982e-06,
"loss": 0.4952,
"step": 66
},
{
"epoch": 0.5759312320916905,
"grad_norm": 0.4489048719406128,
"learning_rate": 9.74430964947668e-06,
"loss": 0.5195,
"step": 67
},
{
"epoch": 0.5845272206303725,
"grad_norm": 0.4349672496318817,
"learning_rate": 9.728227911667934e-06,
"loss": 0.4932,
"step": 68
},
{
"epoch": 0.5931232091690545,
"grad_norm": 0.38588646054267883,
"learning_rate": 9.711669846171443e-06,
"loss": 0.5393,
"step": 69
},
{
"epoch": 0.6017191977077364,
"grad_norm": 0.44539695978164673,
"learning_rate": 9.694637121067764e-06,
"loss": 0.5033,
"step": 70
},
{
"epoch": 0.6103151862464183,
"grad_norm": 0.40802961587905884,
"learning_rate": 9.677131452255272e-06,
"loss": 0.4878,
"step": 71
},
{
"epoch": 0.6189111747851003,
"grad_norm": 0.38218384981155396,
"learning_rate": 9.659154603277283e-06,
"loss": 0.4909,
"step": 72
},
{
"epoch": 0.6275071633237822,
"grad_norm": 0.4387964606285095,
"learning_rate": 9.640708385144403e-06,
"loss": 0.5093,
"step": 73
},
{
"epoch": 0.6361031518624641,
"grad_norm": 0.3596954643726349,
"learning_rate": 9.62179465615209e-06,
"loss": 0.4795,
"step": 74
},
{
"epoch": 0.6446991404011462,
"grad_norm": 0.43439266085624695,
"learning_rate": 9.602415321693434e-06,
"loss": 0.497,
"step": 75
},
{
"epoch": 0.6532951289398281,
"grad_norm": 0.38166651129722595,
"learning_rate": 9.582572334067213e-06,
"loss": 0.4862,
"step": 76
},
{
"epoch": 0.66189111747851,
"grad_norm": 0.39165735244750977,
"learning_rate": 9.562267692281212e-06,
"loss": 0.4929,
"step": 77
},
{
"epoch": 0.670487106017192,
"grad_norm": 0.4982180893421173,
"learning_rate": 9.541503441850844e-06,
"loss": 0.5172,
"step": 78
},
{
"epoch": 0.6790830945558739,
"grad_norm": 0.4101959764957428,
"learning_rate": 9.520281674593084e-06,
"loss": 0.4954,
"step": 79
},
{
"epoch": 0.6876790830945558,
"grad_norm": 0.38996198773384094,
"learning_rate": 9.498604528415731e-06,
"loss": 0.4895,
"step": 80
},
{
"epoch": 0.6962750716332379,
"grad_norm": 0.4096653163433075,
"learning_rate": 9.476474187102033e-06,
"loss": 0.5077,
"step": 81
},
{
"epoch": 0.7048710601719198,
"grad_norm": 0.4310821294784546,
"learning_rate": 9.453892880090696e-06,
"loss": 0.4814,
"step": 82
},
{
"epoch": 0.7134670487106017,
"grad_norm": 0.4181893467903137,
"learning_rate": 9.430862882251279e-06,
"loss": 0.5289,
"step": 83
},
{
"epoch": 0.7220630372492837,
"grad_norm": 0.3983473479747772,
"learning_rate": 9.40738651365503e-06,
"loss": 0.4819,
"step": 84
},
{
"epoch": 0.7306590257879656,
"grad_norm": 0.4769289791584015,
"learning_rate": 9.38346613934115e-06,
"loss": 0.4947,
"step": 85
},
{
"epoch": 0.7392550143266475,
"grad_norm": 0.410977303981781,
"learning_rate": 9.359104169078541e-06,
"loss": 0.4791,
"step": 86
},
{
"epoch": 0.7478510028653295,
"grad_norm": 0.4102720618247986,
"learning_rate": 9.334303057123044e-06,
"loss": 0.5152,
"step": 87
},
{
"epoch": 0.7564469914040115,
"grad_norm": 0.455049067735672,
"learning_rate": 9.309065301970193e-06,
"loss": 0.4753,
"step": 88
},
{
"epoch": 0.7650429799426934,
"grad_norm": 0.40964728593826294,
"learning_rate": 9.283393446103506e-06,
"loss": 0.4941,
"step": 89
},
{
"epoch": 0.7736389684813754,
"grad_norm": 0.385990172624588,
"learning_rate": 9.257290075738365e-06,
"loss": 0.4721,
"step": 90
},
{
"epoch": 0.7822349570200573,
"grad_norm": 0.4511207342147827,
"learning_rate": 9.23075782056147e-06,
"loss": 0.518,
"step": 91
},
{
"epoch": 0.7908309455587392,
"grad_norm": 0.4198009967803955,
"learning_rate": 9.20379935346592e-06,
"loss": 0.4704,
"step": 92
},
{
"epoch": 0.7994269340974212,
"grad_norm": 0.38073253631591797,
"learning_rate": 9.176417390281944e-06,
"loss": 0.4877,
"step": 93
},
{
"epoch": 0.8080229226361032,
"grad_norm": 0.43187421560287476,
"learning_rate": 9.148614689503307e-06,
"loss": 0.4797,
"step": 94
},
{
"epoch": 0.8166189111747851,
"grad_norm": 0.38959866762161255,
"learning_rate": 9.120394052009412e-06,
"loss": 0.4865,
"step": 95
},
{
"epoch": 0.8252148997134671,
"grad_norm": 0.4486367702484131,
"learning_rate": 9.091758320783139e-06,
"loss": 0.4914,
"step": 96
},
{
"epoch": 0.833810888252149,
"grad_norm": 0.4046187698841095,
"learning_rate": 9.062710380624439e-06,
"loss": 0.4992,
"step": 97
},
{
"epoch": 0.8424068767908309,
"grad_norm": 0.4316481053829193,
"learning_rate": 9.033253157859715e-06,
"loss": 0.4893,
"step": 98
},
{
"epoch": 0.8510028653295129,
"grad_norm": 0.3982710838317871,
"learning_rate": 9.003389620047012e-06,
"loss": 0.4754,
"step": 99
},
{
"epoch": 0.8595988538681948,
"grad_norm": 0.42544639110565186,
"learning_rate": 8.973122775677078e-06,
"loss": 0.4834,
"step": 100
},
{
"epoch": 0.8681948424068768,
"grad_norm": 0.39306387305259705,
"learning_rate": 8.942455673870278e-06,
"loss": 0.5025,
"step": 101
},
{
"epoch": 0.8767908309455588,
"grad_norm": 0.40097400546073914,
"learning_rate": 8.91139140406941e-06,
"loss": 0.5103,
"step": 102
},
{
"epoch": 0.8853868194842407,
"grad_norm": 0.3771343231201172,
"learning_rate": 8.879933095728485e-06,
"loss": 0.5046,
"step": 103
},
{
"epoch": 0.8939828080229226,
"grad_norm": 0.37679076194763184,
"learning_rate": 8.848083917997463e-06,
"loss": 0.4816,
"step": 104
},
{
"epoch": 0.9025787965616046,
"grad_norm": 0.38239219784736633,
"learning_rate": 8.815847079402972e-06,
"loss": 0.4676,
"step": 105
},
{
"epoch": 0.9111747851002865,
"grad_norm": 0.40575355291366577,
"learning_rate": 8.783225827525098e-06,
"loss": 0.5098,
"step": 106
},
{
"epoch": 0.9197707736389685,
"grad_norm": 0.39208582043647766,
"learning_rate": 8.750223448670204e-06,
"loss": 0.4679,
"step": 107
},
{
"epoch": 0.9283667621776505,
"grad_norm": 0.423259437084198,
"learning_rate": 8.716843267539868e-06,
"loss": 0.4644,
"step": 108
},
{
"epoch": 0.9369627507163324,
"grad_norm": 0.4522409737110138,
"learning_rate": 8.683088646895955e-06,
"loss": 0.4935,
"step": 109
},
{
"epoch": 0.9455587392550143,
"grad_norm": 0.431192010641098,
"learning_rate": 8.648962987221837e-06,
"loss": 0.5205,
"step": 110
},
{
"epoch": 0.9541547277936963,
"grad_norm": 0.4031922221183777,
"learning_rate": 8.614469726379833e-06,
"loss": 0.4996,
"step": 111
},
{
"epoch": 0.9627507163323782,
"grad_norm": 0.4050145745277405,
"learning_rate": 8.579612339264867e-06,
"loss": 0.4852,
"step": 112
},
{
"epoch": 0.9713467048710601,
"grad_norm": 0.4875716269016266,
"learning_rate": 8.544394337454409e-06,
"loss": 0.5126,
"step": 113
},
{
"epoch": 0.9799426934097422,
"grad_norm": 0.4137539565563202,
"learning_rate": 8.508819268854713e-06,
"loss": 0.4874,
"step": 114
},
{
"epoch": 0.9885386819484241,
"grad_norm": 0.42686036229133606,
"learning_rate": 8.472890717343391e-06,
"loss": 0.4895,
"step": 115
},
{
"epoch": 0.997134670487106,
"grad_norm": 0.44086194038391113,
"learning_rate": 8.436612302408376e-06,
"loss": 0.4695,
"step": 116
},
{
"epoch": 1.005730659025788,
"grad_norm": 0.8157400488853455,
"learning_rate": 8.399987678783285e-06,
"loss": 0.8043,
"step": 117
},
{
"epoch": 1.0143266475644699,
"grad_norm": 0.4437851011753082,
"learning_rate": 8.36302053607924e-06,
"loss": 0.4217,
"step": 118
},
{
"epoch": 1.0229226361031518,
"grad_norm": 0.5067012906074524,
"learning_rate": 8.325714598413169e-06,
"loss": 0.4721,
"step": 119
},
{
"epoch": 1.0315186246418337,
"grad_norm": 0.4572240710258484,
"learning_rate": 8.288073624032634e-06,
"loss": 0.4278,
"step": 120
},
{
"epoch": 1.0401146131805157,
"grad_norm": 0.4881412088871002,
"learning_rate": 8.250101404937223e-06,
"loss": 0.4646,
"step": 121
},
{
"epoch": 1.0487106017191976,
"grad_norm": 0.4639354348182678,
"learning_rate": 8.211801766496537e-06,
"loss": 0.4598,
"step": 122
},
{
"epoch": 1.0573065902578798,
"grad_norm": 0.4976268410682678,
"learning_rate": 8.17317856706482e-06,
"loss": 0.4918,
"step": 123
},
{
"epoch": 1.0659025787965617,
"grad_norm": 0.43885278701782227,
"learning_rate": 8.13423569759226e-06,
"loss": 0.4576,
"step": 124
},
{
"epoch": 1.0744985673352436,
"grad_norm": 0.48174867033958435,
"learning_rate": 8.094977081233006e-06,
"loss": 0.4343,
"step": 125
},
{
"epoch": 1.0830945558739256,
"grad_norm": 0.4119454622268677,
"learning_rate": 8.055406672949957e-06,
"loss": 0.4509,
"step": 126
},
{
"epoch": 1.0916905444126075,
"grad_norm": 0.41485852003097534,
"learning_rate": 8.015528459116321e-06,
"loss": 0.4196,
"step": 127
},
{
"epoch": 1.1002865329512894,
"grad_norm": 0.44501936435699463,
"learning_rate": 7.975346457114034e-06,
"loss": 0.4399,
"step": 128
},
{
"epoch": 1.1088825214899714,
"grad_norm": 0.3820144534111023,
"learning_rate": 7.934864714929036e-06,
"loss": 0.4639,
"step": 129
},
{
"epoch": 1.1174785100286533,
"grad_norm": 0.44919779896736145,
"learning_rate": 7.894087310743468e-06,
"loss": 0.4241,
"step": 130
},
{
"epoch": 1.1260744985673352,
"grad_norm": 0.41555774211883545,
"learning_rate": 7.853018352524845e-06,
"loss": 0.4385,
"step": 131
},
{
"epoch": 1.1346704871060171,
"grad_norm": 0.40297067165374756,
"learning_rate": 7.811661977612202e-06,
"loss": 0.435,
"step": 132
},
{
"epoch": 1.143266475644699,
"grad_norm": 0.41622859239578247,
"learning_rate": 7.770022352299294e-06,
"loss": 0.4305,
"step": 133
},
{
"epoch": 1.151862464183381,
"grad_norm": 0.3918144106864929,
"learning_rate": 7.728103671414889e-06,
"loss": 0.4522,
"step": 134
},
{
"epoch": 1.1604584527220632,
"grad_norm": 0.4661077857017517,
"learning_rate": 7.685910157900158e-06,
"loss": 0.4598,
"step": 135
},
{
"epoch": 1.1690544412607449,
"grad_norm": 0.3704371750354767,
"learning_rate": 7.643446062383273e-06,
"loss": 0.4211,
"step": 136
},
{
"epoch": 1.177650429799427,
"grad_norm": 0.4247897267341614,
"learning_rate": 7.600715662751166e-06,
"loss": 0.4335,
"step": 137
},
{
"epoch": 1.186246418338109,
"grad_norm": 0.43391960859298706,
"learning_rate": 7.557723263718596e-06,
"loss": 0.4732,
"step": 138
},
{
"epoch": 1.1948424068767909,
"grad_norm": 0.40003055334091187,
"learning_rate": 7.514473196394467e-06,
"loss": 0.4331,
"step": 139
},
{
"epoch": 1.2034383954154728,
"grad_norm": 0.3629187345504761,
"learning_rate": 7.470969817845518e-06,
"loss": 0.4601,
"step": 140
},
{
"epoch": 1.2120343839541547,
"grad_norm": 0.40369415283203125,
"learning_rate": 7.427217510657383e-06,
"loss": 0.4504,
"step": 141
},
{
"epoch": 1.2206303724928367,
"grad_norm": 0.4233240485191345,
"learning_rate": 7.383220682493081e-06,
"loss": 0.4594,
"step": 142
},
{
"epoch": 1.2292263610315186,
"grad_norm": 0.4018312990665436,
"learning_rate": 7.338983765648985e-06,
"loss": 0.4548,
"step": 143
},
{
"epoch": 1.2378223495702005,
"grad_norm": 0.3926844596862793,
"learning_rate": 7.294511216608308e-06,
"loss": 0.4707,
"step": 144
},
{
"epoch": 1.2464183381088825,
"grad_norm": 0.38275760412216187,
"learning_rate": 7.249807515592149e-06,
"loss": 0.4711,
"step": 145
},
{
"epoch": 1.2550143266475644,
"grad_norm": 0.4131011664867401,
"learning_rate": 7.2048771661081515e-06,
"loss": 0.4853,
"step": 146
},
{
"epoch": 1.2636103151862463,
"grad_norm": 0.3691644072532654,
"learning_rate": 7.159724694496815e-06,
"loss": 0.421,
"step": 147
},
{
"epoch": 1.2722063037249285,
"grad_norm": 0.3719673752784729,
"learning_rate": 7.114354649475499e-06,
"loss": 0.4529,
"step": 148
},
{
"epoch": 1.2808022922636102,
"grad_norm": 0.4382984936237335,
"learning_rate": 7.068771601680191e-06,
"loss": 0.4756,
"step": 149
},
{
"epoch": 1.2893982808022924,
"grad_norm": 0.36114269495010376,
"learning_rate": 7.022980143205046e-06,
"loss": 0.4224,
"step": 150
},
{
"epoch": 1.2979942693409743,
"grad_norm": 0.42413899302482605,
"learning_rate": 6.976984887139775e-06,
"loss": 0.4511,
"step": 151
},
{
"epoch": 1.3065902578796562,
"grad_norm": 0.4103688895702362,
"learning_rate": 6.930790467104916e-06,
"loss": 0.4779,
"step": 152
},
{
"epoch": 1.3151862464183381,
"grad_norm": 0.3579988181591034,
"learning_rate": 6.884401536785045e-06,
"loss": 0.4497,
"step": 153
},
{
"epoch": 1.32378223495702,
"grad_norm": 0.3847670257091522,
"learning_rate": 6.837822769459942e-06,
"loss": 0.4283,
"step": 154
},
{
"epoch": 1.332378223495702,
"grad_norm": 0.4245608150959015,
"learning_rate": 6.791058857533814e-06,
"loss": 0.4659,
"step": 155
},
{
"epoch": 1.340974212034384,
"grad_norm": 0.36232906579971313,
"learning_rate": 6.744114512062571e-06,
"loss": 0.4412,
"step": 156
},
{
"epoch": 1.3495702005730659,
"grad_norm": 0.3985491693019867,
"learning_rate": 6.696994462279223e-06,
"loss": 0.4613,
"step": 157
},
{
"epoch": 1.3581661891117478,
"grad_norm": 0.3694462776184082,
"learning_rate": 6.6497034551174585e-06,
"loss": 0.4623,
"step": 158
},
{
"epoch": 1.3667621776504297,
"grad_norm": 0.3082883656024933,
"learning_rate": 6.602246254733431e-06,
"loss": 0.4083,
"step": 159
},
{
"epoch": 1.3753581661891117,
"grad_norm": 0.38306036591529846,
"learning_rate": 6.554627642025807e-06,
"loss": 0.4636,
"step": 160
},
{
"epoch": 1.3839541547277938,
"grad_norm": 0.38210171461105347,
"learning_rate": 6.506852414154138e-06,
"loss": 0.4563,
"step": 161
},
{
"epoch": 1.3925501432664755,
"grad_norm": 0.34896981716156006,
"learning_rate": 6.4589253840555856e-06,
"loss": 0.4217,
"step": 162
},
{
"epoch": 1.4011461318051577,
"grad_norm": 0.3992280662059784,
"learning_rate": 6.41085137996006e-06,
"loss": 0.4688,
"step": 163
},
{
"epoch": 1.4097421203438396,
"grad_norm": 0.37131842970848083,
"learning_rate": 6.362635244903818e-06,
"loss": 0.4451,
"step": 164
},
{
"epoch": 1.4183381088825215,
"grad_norm": 0.32125821709632874,
"learning_rate": 6.314281836241573e-06,
"loss": 0.4463,
"step": 165
},
{
"epoch": 1.4269340974212035,
"grad_norm": 0.4050236940383911,
"learning_rate": 6.265796025157154e-06,
"loss": 0.4563,
"step": 166
},
{
"epoch": 1.4355300859598854,
"grad_norm": 0.3889450132846832,
"learning_rate": 6.217182696172776e-06,
"loss": 0.4566,
"step": 167
},
{
"epoch": 1.4441260744985673,
"grad_norm": 0.34171196818351746,
"learning_rate": 6.168446746656973e-06,
"loss": 0.4309,
"step": 168
},
{
"epoch": 1.4527220630372493,
"grad_norm": 0.45121294260025024,
"learning_rate": 6.119593086331225e-06,
"loss": 0.4769,
"step": 169
},
{
"epoch": 1.4613180515759312,
"grad_norm": 0.3910422623157501,
"learning_rate": 6.070626636775349e-06,
"loss": 0.4454,
"step": 170
},
{
"epoch": 1.4699140401146131,
"grad_norm": 0.33223018050193787,
"learning_rate": 6.021552330931693e-06,
"loss": 0.3993,
"step": 171
},
{
"epoch": 1.478510028653295,
"grad_norm": 0.3539504110813141,
"learning_rate": 5.972375112608182e-06,
"loss": 0.4473,
"step": 172
},
{
"epoch": 1.487106017191977,
"grad_norm": 0.40885159373283386,
"learning_rate": 5.923099935980278e-06,
"loss": 0.4942,
"step": 173
},
{
"epoch": 1.4957020057306591,
"grad_norm": 0.3294084370136261,
"learning_rate": 5.8737317650918905e-06,
"loss": 0.397,
"step": 174
},
{
"epoch": 1.5042979942693409,
"grad_norm": 0.3853004276752472,
"learning_rate": 5.824275573355278e-06,
"loss": 0.4691,
"step": 175
},
{
"epoch": 1.512893982808023,
"grad_norm": 0.3500480353832245,
"learning_rate": 5.7747363430500395e-06,
"loss": 0.4546,
"step": 176
},
{
"epoch": 1.5214899713467047,
"grad_norm": 0.38218215107917786,
"learning_rate": 5.725119064821185e-06,
"loss": 0.4805,
"step": 177
},
{
"epoch": 1.5300859598853869,
"grad_norm": 0.3564338982105255,
"learning_rate": 5.675428737176367e-06,
"loss": 0.4405,
"step": 178
},
{
"epoch": 1.5386819484240688,
"grad_norm": 0.3546433746814728,
"learning_rate": 5.625670365982332e-06,
"loss": 0.442,
"step": 179
},
{
"epoch": 1.5472779369627507,
"grad_norm": 0.35302734375,
"learning_rate": 5.575848963960621e-06,
"loss": 0.4039,
"step": 180
},
{
"epoch": 1.5558739255014327,
"grad_norm": 0.4053371846675873,
"learning_rate": 5.525969550182577e-06,
"loss": 0.498,
"step": 181
},
{
"epoch": 1.5644699140401146,
"grad_norm": 0.33448362350463867,
"learning_rate": 5.4760371495637256e-06,
"loss": 0.4272,
"step": 182
},
{
"epoch": 1.5730659025787965,
"grad_norm": 0.37541133165359497,
"learning_rate": 5.426056792357552e-06,
"loss": 0.4401,
"step": 183
},
{
"epoch": 1.5816618911174785,
"grad_norm": 0.3447186350822449,
"learning_rate": 5.376033513648743e-06,
"loss": 0.4199,
"step": 184
},
{
"epoch": 1.5902578796561606,
"grad_norm": 0.3608490228652954,
"learning_rate": 5.325972352845965e-06,
"loss": 0.4473,
"step": 185
},
{
"epoch": 1.5988538681948423,
"grad_norm": 0.34037309885025024,
"learning_rate": 5.2758783531741655e-06,
"loss": 0.4591,
"step": 186
},
{
"epoch": 1.6074498567335245,
"grad_norm": 0.36120274662971497,
"learning_rate": 5.225756561166521e-06,
"loss": 0.428,
"step": 187
},
{
"epoch": 1.6160458452722062,
"grad_norm": 0.3702341616153717,
"learning_rate": 5.175612026156045e-06,
"loss": 0.438,
"step": 188
},
{
"epoch": 1.6246418338108883,
"grad_norm": 0.34017178416252136,
"learning_rate": 5.125449799766916e-06,
"loss": 0.4753,
"step": 189
},
{
"epoch": 1.63323782234957,
"grad_norm": 0.3360693156719208,
"learning_rate": 5.075274935405554e-06,
"loss": 0.4399,
"step": 190
},
{
"epoch": 1.6418338108882522,
"grad_norm": 0.33825618028640747,
"learning_rate": 5.025092487751552e-06,
"loss": 0.451,
"step": 191
},
{
"epoch": 1.6504297994269341,
"grad_norm": 0.3170571029186249,
"learning_rate": 4.974907512248451e-06,
"loss": 0.3764,
"step": 192
},
{
"epoch": 1.659025787965616,
"grad_norm": 0.36524495482444763,
"learning_rate": 4.924725064594448e-06,
"loss": 0.4459,
"step": 193
},
{
"epoch": 1.667621776504298,
"grad_norm": 0.3204471170902252,
"learning_rate": 4.874550200233085e-06,
"loss": 0.4188,
"step": 194
},
{
"epoch": 1.67621776504298,
"grad_norm": 0.3678494989871979,
"learning_rate": 4.824387973843957e-06,
"loss": 0.4506,
"step": 195
},
{
"epoch": 1.6848137535816619,
"grad_norm": 0.3527798056602478,
"learning_rate": 4.7742434388334815e-06,
"loss": 0.4807,
"step": 196
},
{
"epoch": 1.6934097421203438,
"grad_norm": 0.37601473927497864,
"learning_rate": 4.724121646825838e-06,
"loss": 0.4642,
"step": 197
},
{
"epoch": 1.702005730659026,
"grad_norm": 0.4321448504924774,
"learning_rate": 4.674027647154037e-06,
"loss": 0.4613,
"step": 198
},
{
"epoch": 1.7106017191977076,
"grad_norm": 0.3331563472747803,
"learning_rate": 4.623966486351257e-06,
"loss": 0.4371,
"step": 199
},
{
"epoch": 1.7191977077363898,
"grad_norm": 0.3661264479160309,
"learning_rate": 4.573943207642452e-06,
"loss": 0.4628,
"step": 200
},
{
"epoch": 1.7277936962750715,
"grad_norm": 0.3599276542663574,
"learning_rate": 4.523962850436276e-06,
"loss": 0.4143,
"step": 201
},
{
"epoch": 1.7363896848137537,
"grad_norm": 0.3709213137626648,
"learning_rate": 4.474030449817423e-06,
"loss": 0.4316,
"step": 202
},
{
"epoch": 1.7449856733524354,
"grad_norm": 0.3640238344669342,
"learning_rate": 4.424151036039381e-06,
"loss": 0.4617,
"step": 203
},
{
"epoch": 1.7535816618911175,
"grad_norm": 0.3609802722930908,
"learning_rate": 4.3743296340176694e-06,
"loss": 0.4421,
"step": 204
},
{
"epoch": 1.7621776504297995,
"grad_norm": 0.3474278748035431,
"learning_rate": 4.3245712628236356e-06,
"loss": 0.4204,
"step": 205
},
{
"epoch": 1.7707736389684814,
"grad_norm": 0.35864928364753723,
"learning_rate": 4.274880935178817e-06,
"loss": 0.4534,
"step": 206
},
{
"epoch": 1.7793696275071633,
"grad_norm": 0.3199174702167511,
"learning_rate": 4.225263656949961e-06,
"loss": 0.4412,
"step": 207
},
{
"epoch": 1.7879656160458453,
"grad_norm": 0.32814860343933105,
"learning_rate": 4.175724426644724e-06,
"loss": 0.4271,
"step": 208
},
{
"epoch": 1.7965616045845272,
"grad_norm": 0.3326827585697174,
"learning_rate": 4.12626823490811e-06,
"loss": 0.4582,
"step": 209
},
{
"epoch": 1.8051575931232091,
"grad_norm": 0.39253976941108704,
"learning_rate": 4.076900064019721e-06,
"loss": 0.4524,
"step": 210
},
{
"epoch": 1.8137535816618913,
"grad_norm": 0.3864801228046417,
"learning_rate": 4.02762488739182e-06,
"loss": 0.4727,
"step": 211
},
{
"epoch": 1.822349570200573,
"grad_norm": 0.3414667248725891,
"learning_rate": 3.978447669068309e-06,
"loss": 0.4599,
"step": 212
},
{
"epoch": 1.8309455587392551,
"grad_norm": 0.37089967727661133,
"learning_rate": 3.929373363224654e-06,
"loss": 0.4654,
"step": 213
},
{
"epoch": 1.8395415472779368,
"grad_norm": 0.3420522212982178,
"learning_rate": 3.8804069136687775e-06,
"loss": 0.4318,
"step": 214
},
{
"epoch": 1.848137535816619,
"grad_norm": 0.33635398745536804,
"learning_rate": 3.8315532533430285e-06,
"loss": 0.4289,
"step": 215
},
{
"epoch": 1.8567335243553007,
"grad_norm": 0.3261006474494934,
"learning_rate": 3.7828173038272266e-06,
"loss": 0.4113,
"step": 216
},
{
"epoch": 1.8653295128939829,
"grad_norm": 0.3770403265953064,
"learning_rate": 3.7342039748428473e-06,
"loss": 0.4499,
"step": 217
},
{
"epoch": 1.8739255014326648,
"grad_norm": 0.3917461931705475,
"learning_rate": 3.685718163758427e-06,
"loss": 0.4387,
"step": 218
},
{
"epoch": 1.8825214899713467,
"grad_norm": 0.32100600004196167,
"learning_rate": 3.6373647550961834e-06,
"loss": 0.3887,
"step": 219
},
{
"epoch": 1.8911174785100286,
"grad_norm": 0.33872732520103455,
"learning_rate": 3.5891486200399413e-06,
"loss": 0.4313,
"step": 220
},
{
"epoch": 1.8997134670487106,
"grad_norm": 0.3846088647842407,
"learning_rate": 3.5410746159444165e-06,
"loss": 0.4799,
"step": 221
},
{
"epoch": 1.9083094555873925,
"grad_norm": 0.36191555857658386,
"learning_rate": 3.4931475858458634e-06,
"loss": 0.4047,
"step": 222
},
{
"epoch": 1.9169054441260744,
"grad_norm": 0.4302296042442322,
"learning_rate": 3.445372357974194e-06,
"loss": 0.4871,
"step": 223
},
{
"epoch": 1.9255014326647566,
"grad_norm": 0.3543616235256195,
"learning_rate": 3.397753745266571e-06,
"loss": 0.438,
"step": 224
},
{
"epoch": 1.9340974212034383,
"grad_norm": 0.36250874400138855,
"learning_rate": 3.350296544882543e-06,
"loss": 0.4273,
"step": 225
},
{
"epoch": 1.9426934097421205,
"grad_norm": 0.3882717490196228,
"learning_rate": 3.303005537720778e-06,
"loss": 0.4446,
"step": 226
},
{
"epoch": 1.9512893982808022,
"grad_norm": 0.3337433934211731,
"learning_rate": 3.255885487937431e-06,
"loss": 0.4275,
"step": 227
},
{
"epoch": 1.9598853868194843,
"grad_norm": 0.34410232305526733,
"learning_rate": 3.2089411424661864e-06,
"loss": 0.464,
"step": 228
},
{
"epoch": 1.968481375358166,
"grad_norm": 0.3260536193847656,
"learning_rate": 3.1621772305400603e-06,
"loss": 0.4146,
"step": 229
},
{
"epoch": 1.9770773638968482,
"grad_norm": 0.3726963698863983,
"learning_rate": 3.1155984632149565e-06,
"loss": 0.4816,
"step": 230
},
{
"epoch": 1.9856733524355301,
"grad_norm": 0.3261895477771759,
"learning_rate": 3.0692095328950843e-06,
"loss": 0.4516,
"step": 231
},
{
"epoch": 1.994269340974212,
"grad_norm": 0.321073055267334,
"learning_rate": 3.023015112860228e-06,
"loss": 0.4141,
"step": 232
},
{
"epoch": 2.002865329512894,
"grad_norm": 0.6429646015167236,
"learning_rate": 2.977019856794955e-06,
"loss": 0.7025,
"step": 233
},
{
"epoch": 2.011461318051576,
"grad_norm": 0.36016204953193665,
"learning_rate": 2.93122839831981e-06,
"loss": 0.4015,
"step": 234
},
{
"epoch": 2.020057306590258,
"grad_norm": 0.3868311047554016,
"learning_rate": 2.8856453505245018e-06,
"loss": 0.4401,
"step": 235
},
{
"epoch": 2.0286532951289398,
"grad_norm": 0.39288440346717834,
"learning_rate": 2.840275305503186e-06,
"loss": 0.4529,
"step": 236
},
{
"epoch": 2.037249283667622,
"grad_norm": 0.33372241258621216,
"learning_rate": 2.7951228338918506e-06,
"loss": 0.4231,
"step": 237
},
{
"epoch": 2.0458452722063036,
"grad_norm": 0.3850330114364624,
"learning_rate": 2.7501924844078538e-06,
"loss": 0.4044,
"step": 238
},
{
"epoch": 2.054441260744986,
"grad_norm": 0.36762574315071106,
"learning_rate": 2.7054887833916933e-06,
"loss": 0.4045,
"step": 239
},
{
"epoch": 2.0630372492836675,
"grad_norm": 0.3655935227870941,
"learning_rate": 2.6610162343510183e-06,
"loss": 0.3959,
"step": 240
},
{
"epoch": 2.0716332378223496,
"grad_norm": 0.3435436189174652,
"learning_rate": 2.616779317506921e-06,
"loss": 0.3721,
"step": 241
},
{
"epoch": 2.0802292263610314,
"grad_norm": 0.3302423357963562,
"learning_rate": 2.572782489342617e-06,
"loss": 0.3893,
"step": 242
},
{
"epoch": 2.0888252148997135,
"grad_norm": 0.33585667610168457,
"learning_rate": 2.5290301821544826e-06,
"loss": 0.3965,
"step": 243
},
{
"epoch": 2.097421203438395,
"grad_norm": 0.35413309931755066,
"learning_rate": 2.4855268036055346e-06,
"loss": 0.4346,
"step": 244
},
{
"epoch": 2.1060171919770774,
"grad_norm": 0.3453938364982605,
"learning_rate": 2.4422767362814045e-06,
"loss": 0.4318,
"step": 245
},
{
"epoch": 2.1146131805157595,
"grad_norm": 0.31205686926841736,
"learning_rate": 2.3992843372488357e-06,
"loss": 0.4011,
"step": 246
},
{
"epoch": 2.1232091690544412,
"grad_norm": 0.35008689761161804,
"learning_rate": 2.3565539376167295e-06,
"loss": 0.4418,
"step": 247
},
{
"epoch": 2.1318051575931234,
"grad_norm": 0.3320912718772888,
"learning_rate": 2.3140898420998425e-06,
"loss": 0.4234,
"step": 248
},
{
"epoch": 2.140401146131805,
"grad_norm": 0.3085114359855652,
"learning_rate": 2.271896328585114e-06,
"loss": 0.387,
"step": 249
},
{
"epoch": 2.1489971346704873,
"grad_norm": 0.3297877311706543,
"learning_rate": 2.2299776477007073e-06,
"loss": 0.4192,
"step": 250
},
{
"epoch": 2.157593123209169,
"grad_norm": 0.3377752900123596,
"learning_rate": 2.1883380223878004e-06,
"loss": 0.4018,
"step": 251
},
{
"epoch": 2.166189111747851,
"grad_norm": 0.3176102638244629,
"learning_rate": 2.1469816474751566e-06,
"loss": 0.3956,
"step": 252
},
{
"epoch": 2.174785100286533,
"grad_norm": 0.309105783700943,
"learning_rate": 2.105912689256533e-06,
"loss": 0.3773,
"step": 253
},
{
"epoch": 2.183381088825215,
"grad_norm": 0.3327156901359558,
"learning_rate": 2.0651352850709656e-06,
"loss": 0.4409,
"step": 254
},
{
"epoch": 2.1919770773638967,
"grad_norm": 0.31827932596206665,
"learning_rate": 2.0246535428859652e-06,
"loss": 0.4296,
"step": 255
},
{
"epoch": 2.200573065902579,
"grad_norm": 0.33633846044540405,
"learning_rate": 1.984471540883679e-06,
"loss": 0.4369,
"step": 256
},
{
"epoch": 2.2091690544412605,
"grad_norm": 0.3031711280345917,
"learning_rate": 1.9445933270500444e-06,
"loss": 0.3433,
"step": 257
},
{
"epoch": 2.2177650429799427,
"grad_norm": 0.3334784209728241,
"learning_rate": 1.905022918766995e-06,
"loss": 0.4239,
"step": 258
},
{
"epoch": 2.226361031518625,
"grad_norm": 0.3104681074619293,
"learning_rate": 1.8657643024077431e-06,
"loss": 0.4355,
"step": 259
},
{
"epoch": 2.2349570200573066,
"grad_norm": 0.30309274792671204,
"learning_rate": 1.8268214329351797e-06,
"loss": 0.4045,
"step": 260
},
{
"epoch": 2.2435530085959887,
"grad_norm": 0.2949390113353729,
"learning_rate": 1.7881982335034625e-06,
"loss": 0.3742,
"step": 261
},
{
"epoch": 2.2521489971346704,
"grad_norm": 0.32407474517822266,
"learning_rate": 1.7498985950627794e-06,
"loss": 0.4242,
"step": 262
},
{
"epoch": 2.2607449856733526,
"grad_norm": 0.313754677772522,
"learning_rate": 1.7119263759673677e-06,
"loss": 0.3957,
"step": 263
},
{
"epoch": 2.2693409742120343,
"grad_norm": 0.3055744767189026,
"learning_rate": 1.6742854015868349e-06,
"loss": 0.3981,
"step": 264
},
{
"epoch": 2.2779369627507164,
"grad_norm": 0.30761680006980896,
"learning_rate": 1.6369794639207626e-06,
"loss": 0.4348,
"step": 265
},
{
"epoch": 2.286532951289398,
"grad_norm": 0.3119281232357025,
"learning_rate": 1.6000123212167158e-06,
"loss": 0.3941,
"step": 266
},
{
"epoch": 2.2951289398280803,
"grad_norm": 0.30965036153793335,
"learning_rate": 1.5633876975916261e-06,
"loss": 0.4339,
"step": 267
},
{
"epoch": 2.303724928366762,
"grad_norm": 0.29678910970687866,
"learning_rate": 1.5271092826566108e-06,
"loss": 0.4,
"step": 268
},
{
"epoch": 2.312320916905444,
"grad_norm": 0.3296194076538086,
"learning_rate": 1.4911807311452874e-06,
"loss": 0.4163,
"step": 269
},
{
"epoch": 2.3209169054441263,
"grad_norm": 0.29097187519073486,
"learning_rate": 1.4556056625455922e-06,
"loss": 0.4162,
"step": 270
},
{
"epoch": 2.329512893982808,
"grad_norm": 0.336285799741745,
"learning_rate": 1.4203876607351347e-06,
"loss": 0.4214,
"step": 271
},
{
"epoch": 2.3381088825214897,
"grad_norm": 0.3088236451148987,
"learning_rate": 1.3855302736201686e-06,
"loss": 0.4294,
"step": 272
},
{
"epoch": 2.346704871060172,
"grad_norm": 0.27219104766845703,
"learning_rate": 1.3510370127781635e-06,
"loss": 0.3729,
"step": 273
},
{
"epoch": 2.355300859598854,
"grad_norm": 0.2987208068370819,
"learning_rate": 1.3169113531040462e-06,
"loss": 0.4273,
"step": 274
},
{
"epoch": 2.3638968481375358,
"grad_norm": 0.2788269519805908,
"learning_rate": 1.2831567324601325e-06,
"loss": 0.3505,
"step": 275
},
{
"epoch": 2.372492836676218,
"grad_norm": 0.3208795189857483,
"learning_rate": 1.2497765513297976e-06,
"loss": 0.4641,
"step": 276
},
{
"epoch": 2.3810888252148996,
"grad_norm": 0.2891055941581726,
"learning_rate": 1.2167741724749026e-06,
"loss": 0.3852,
"step": 277
},
{
"epoch": 2.3896848137535818,
"grad_norm": 0.300936758518219,
"learning_rate": 1.1841529205970281e-06,
"loss": 0.4323,
"step": 278
},
{
"epoch": 2.3982808022922635,
"grad_norm": 0.30507004261016846,
"learning_rate": 1.1519160820025382e-06,
"loss": 0.4053,
"step": 279
},
{
"epoch": 2.4068767908309456,
"grad_norm": 0.3024282157421112,
"learning_rate": 1.1200669042715163e-06,
"loss": 0.4121,
"step": 280
},
{
"epoch": 2.4154727793696273,
"grad_norm": 0.30080166459083557,
"learning_rate": 1.0886085959305915e-06,
"loss": 0.4147,
"step": 281
},
{
"epoch": 2.4240687679083095,
"grad_norm": 0.28306689858436584,
"learning_rate": 1.057544326129723e-06,
"loss": 0.3836,
"step": 282
},
{
"epoch": 2.432664756446991,
"grad_norm": 0.2936306297779083,
"learning_rate": 1.026877224322923e-06,
"loss": 0.4371,
"step": 283
},
{
"epoch": 2.4412607449856734,
"grad_norm": 0.265400230884552,
"learning_rate": 9.966103799529891e-07,
"loss": 0.3852,
"step": 284
},
{
"epoch": 2.4498567335243555,
"grad_norm": 0.30174750089645386,
"learning_rate": 9.66746842140287e-07,
"loss": 0.423,
"step": 285
},
{
"epoch": 2.458452722063037,
"grad_norm": 0.3249377906322479,
"learning_rate": 9.372896193755621e-07,
"loss": 0.4351,
"step": 286
},
{
"epoch": 2.4670487106017194,
"grad_norm": 0.2958242893218994,
"learning_rate": 9.082416792168608e-07,
"loss": 0.4118,
"step": 287
},
{
"epoch": 2.475644699140401,
"grad_norm": 0.302644819021225,
"learning_rate": 8.7960594799059e-07,
"loss": 0.444,
"step": 288
},
{
"epoch": 2.4842406876790832,
"grad_norm": 0.2721085846424103,
"learning_rate": 8.513853104966951e-07,
"loss": 0.3655,
"step": 289
},
{
"epoch": 2.492836676217765,
"grad_norm": 0.31600409746170044,
"learning_rate": 8.235826097180566e-07,
"loss": 0.4361,
"step": 290
},
{
"epoch": 2.501432664756447,
"grad_norm": 0.2974003553390503,
"learning_rate": 7.962006465340821e-07,
"loss": 0.3849,
"step": 291
},
{
"epoch": 2.510028653295129,
"grad_norm": 0.28211110830307007,
"learning_rate": 7.692421794385313e-07,
"loss": 0.3915,
"step": 292
},
{
"epoch": 2.518624641833811,
"grad_norm": 0.2826594412326813,
"learning_rate": 7.427099242616348e-07,
"loss": 0.4066,
"step": 293
},
{
"epoch": 2.5272206303724927,
"grad_norm": 0.3020119071006775,
"learning_rate": 7.166065538964955e-07,
"loss": 0.4405,
"step": 294
},
{
"epoch": 2.535816618911175,
"grad_norm": 0.2853386700153351,
"learning_rate": 6.909346980298093e-07,
"loss": 0.415,
"step": 295
},
{
"epoch": 2.544412607449857,
"grad_norm": 0.3004297614097595,
"learning_rate": 6.656969428769567e-07,
"loss": 0.4509,
"step": 296
},
{
"epoch": 2.5530085959885387,
"grad_norm": 0.2878156900405884,
"learning_rate": 6.408958309214597e-07,
"loss": 0.4249,
"step": 297
},
{
"epoch": 2.5616045845272204,
"grad_norm": 0.2895064055919647,
"learning_rate": 6.165338606588517e-07,
"loss": 0.3721,
"step": 298
},
{
"epoch": 2.5702005730659025,
"grad_norm": 0.29499444365501404,
"learning_rate": 5.926134863449712e-07,
"loss": 0.4474,
"step": 299
},
{
"epoch": 2.5787965616045847,
"grad_norm": 0.27489638328552246,
"learning_rate": 5.691371177487215e-07,
"loss": 0.3842,
"step": 300
},
{
"epoch": 2.5873925501432664,
"grad_norm": 0.2753602862358093,
"learning_rate": 5.461071199093048e-07,
"loss": 0.3994,
"step": 301
},
{
"epoch": 2.5959885386819486,
"grad_norm": 0.3153139650821686,
"learning_rate": 5.235258128979676e-07,
"loss": 0.4518,
"step": 302
},
{
"epoch": 2.6045845272206303,
"grad_norm": 0.3056911528110504,
"learning_rate": 5.0139547158427e-07,
"loss": 0.3921,
"step": 303
},
{
"epoch": 2.6131805157593124,
"grad_norm": 0.26982367038726807,
"learning_rate": 4.797183254069176e-07,
"loss": 0.3952,
"step": 304
},
{
"epoch": 2.621776504297994,
"grad_norm": 0.3166239857673645,
"learning_rate": 4.5849655814915683e-07,
"loss": 0.492,
"step": 305
},
{
"epoch": 2.6303724928366763,
"grad_norm": 0.27470794320106506,
"learning_rate": 4.3773230771879004e-07,
"loss": 0.3918,
"step": 306
},
{
"epoch": 2.6389684813753584,
"grad_norm": 0.2945537865161896,
"learning_rate": 4.1742766593278974e-07,
"loss": 0.4398,
"step": 307
},
{
"epoch": 2.64756446991404,
"grad_norm": 0.286775141954422,
"learning_rate": 3.9758467830656623e-07,
"loss": 0.3765,
"step": 308
},
{
"epoch": 2.656160458452722,
"grad_norm": 0.3127121925354004,
"learning_rate": 3.782053438479094e-07,
"loss": 0.43,
"step": 309
},
{
"epoch": 2.664756446991404,
"grad_norm": 0.2938476502895355,
"learning_rate": 3.5929161485559694e-07,
"loss": 0.4025,
"step": 310
},
{
"epoch": 2.673352435530086,
"grad_norm": 0.2913784980773926,
"learning_rate": 3.4084539672271764e-07,
"loss": 0.4572,
"step": 311
},
{
"epoch": 2.681948424068768,
"grad_norm": 0.26683560013771057,
"learning_rate": 3.228685477447291e-07,
"loss": 0.4115,
"step": 312
},
{
"epoch": 2.6905444126074496,
"grad_norm": 0.2884276509284973,
"learning_rate": 3.0536287893223603e-07,
"loss": 0.4167,
"step": 313
},
{
"epoch": 2.6991404011461317,
"grad_norm": 0.2850170135498047,
"learning_rate": 2.883301538285582e-07,
"loss": 0.4454,
"step": 314
},
{
"epoch": 2.707736389684814,
"grad_norm": 0.28306451439857483,
"learning_rate": 2.717720883320685e-07,
"loss": 0.4136,
"step": 315
},
{
"epoch": 2.7163323782234956,
"grad_norm": 0.3082982003688812,
"learning_rate": 2.556903505233216e-07,
"loss": 0.4219,
"step": 316
},
{
"epoch": 2.7249283667621778,
"grad_norm": 0.29846885800361633,
"learning_rate": 2.4008656049701875e-07,
"loss": 0.3945,
"step": 317
},
{
"epoch": 2.7335243553008595,
"grad_norm": 0.2880510091781616,
"learning_rate": 2.2496229019879635e-07,
"loss": 0.4336,
"step": 318
},
{
"epoch": 2.7421203438395416,
"grad_norm": 0.2777482867240906,
"learning_rate": 2.1031906326685946e-07,
"loss": 0.3909,
"step": 319
},
{
"epoch": 2.7507163323782233,
"grad_norm": 0.2989678680896759,
"learning_rate": 1.9615835487849677e-07,
"loss": 0.4362,
"step": 320
},
{
"epoch": 2.7593123209169055,
"grad_norm": 0.29559576511383057,
"learning_rate": 1.824815916014644e-07,
"loss": 0.4362,
"step": 321
},
{
"epoch": 2.7679083094555876,
"grad_norm": 0.2814064919948578,
"learning_rate": 1.6929015125027314e-07,
"loss": 0.3635,
"step": 322
},
{
"epoch": 2.7765042979942693,
"grad_norm": 0.2957613468170166,
"learning_rate": 1.5658536274738623e-07,
"loss": 0.4292,
"step": 323
},
{
"epoch": 2.785100286532951,
"grad_norm": 0.31237927079200745,
"learning_rate": 1.443685059893396e-07,
"loss": 0.4446,
"step": 324
},
{
"epoch": 2.793696275071633,
"grad_norm": 0.2811686396598816,
"learning_rate": 1.3264081171780797e-07,
"loss": 0.384,
"step": 325
},
{
"epoch": 2.8022922636103154,
"grad_norm": 0.29446473717689514,
"learning_rate": 1.2140346139561277e-07,
"loss": 0.4052,
"step": 326
},
{
"epoch": 2.810888252148997,
"grad_norm": 0.28350237011909485,
"learning_rate": 1.1065758708770468e-07,
"loss": 0.4163,
"step": 327
},
{
"epoch": 2.819484240687679,
"grad_norm": 0.28208962082862854,
"learning_rate": 1.004042713471165e-07,
"loss": 0.4426,
"step": 328
},
{
"epoch": 2.828080229226361,
"grad_norm": 0.2744838297367096,
"learning_rate": 9.064454710590253e-08,
"loss": 0.376,
"step": 329
},
{
"epoch": 2.836676217765043,
"grad_norm": 0.27180764079093933,
"learning_rate": 8.137939757108526e-08,
"loss": 0.4431,
"step": 330
},
{
"epoch": 2.845272206303725,
"grad_norm": 0.33907318115234375,
"learning_rate": 7.260975612560173e-08,
"loss": 0.4333,
"step": 331
},
{
"epoch": 2.853868194842407,
"grad_norm": 0.3016175925731659,
"learning_rate": 6.433650623427379e-08,
"loss": 0.4161,
"step": 332
},
{
"epoch": 2.862464183381089,
"grad_norm": 0.26725831627845764,
"learning_rate": 5.6560481354807625e-08,
"loss": 0.3748,
"step": 333
},
{
"epoch": 2.871060171919771,
"grad_norm": 0.29347124695777893,
"learning_rate": 4.928246485383148e-08,
"loss": 0.4343,
"step": 334
},
{
"epoch": 2.8796561604584525,
"grad_norm": 0.2599901556968689,
"learning_rate": 4.250318992797375e-08,
"loss": 0.3932,
"step": 335
},
{
"epoch": 2.8882521489971347,
"grad_norm": 0.27541452646255493,
"learning_rate": 3.622333953000601e-08,
"loss": 0.4059,
"step": 336
},
{
"epoch": 2.896848137535817,
"grad_norm": 0.28929024934768677,
"learning_rate": 3.0443546300035764e-08,
"loss": 0.3882,
"step": 337
},
{
"epoch": 2.9054441260744985,
"grad_norm": 0.29858601093292236,
"learning_rate": 2.516439250177749e-08,
"loss": 0.4215,
"step": 338
},
{
"epoch": 2.9140401146131802,
"grad_norm": 0.3101257085800171,
"learning_rate": 2.038640996389285e-08,
"loss": 0.4216,
"step": 339
},
{
"epoch": 2.9226361031518624,
"grad_norm": 0.28121909499168396,
"learning_rate": 1.6110080026414123e-08,
"loss": 0.394,
"step": 340
},
{
"epoch": 2.9312320916905446,
"grad_norm": 0.3071954548358917,
"learning_rate": 1.2335833492252425e-08,
"loss": 0.4546,
"step": 341
},
{
"epoch": 2.9398280802292263,
"grad_norm": 0.27462613582611084,
"learning_rate": 9.06405058380022e-09,
"loss": 0.362,
"step": 342
},
{
"epoch": 2.9484240687679084,
"grad_norm": 0.2850578725337982,
"learning_rate": 6.295060904623618e-09,
"loss": 0.4202,
"step": 343
},
{
"epoch": 2.95702005730659,
"grad_norm": 0.30015918612480164,
"learning_rate": 4.02914340626226e-09,
"loss": 0.4216,
"step": 344
},
{
"epoch": 2.9656160458452723,
"grad_norm": 0.25679218769073486,
"learning_rate": 2.2665263601240328e-09,
"loss": 0.3732,
"step": 345
},
{
"epoch": 2.974212034383954,
"grad_norm": 0.28618451952934265,
"learning_rate": 1.0073873344895735e-09,
"loss": 0.4443,
"step": 346
},
{
"epoch": 2.982808022922636,
"grad_norm": 0.27473634481430054,
"learning_rate": 2.5185317662490547e-10,
"loss": 0.4047,
"step": 347
},
{
"epoch": 2.9914040114613183,
"grad_norm": 0.29134419560432434,
"learning_rate": 0.0,
"loss": 0.4076,
"step": 348
},
{
"epoch": 2.9914040114613183,
"step": 348,
"total_flos": 514952320516096.0,
"train_loss": 0.4717323488031311,
"train_runtime": 7432.2875,
"train_samples_per_second": 4.497,
"train_steps_per_second": 0.047
}
],
"logging_steps": 1.0,
"max_steps": 348,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 514952320516096.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}