diaenra's picture
Training in progress, epoch 2, checkpoint
4abed97 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 354,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005649717514124294,
"grad_norm": 0.5776382684707642,
"learning_rate": 1e-05,
"loss": 88.641,
"step": 1
},
{
"epoch": 0.011299435028248588,
"grad_norm": 0.5559790730476379,
"learning_rate": 2e-05,
"loss": 88.6713,
"step": 2
},
{
"epoch": 0.01694915254237288,
"grad_norm": 0.5598555207252502,
"learning_rate": 3e-05,
"loss": 88.7191,
"step": 3
},
{
"epoch": 0.022598870056497175,
"grad_norm": 0.5563465356826782,
"learning_rate": 4e-05,
"loss": 88.6407,
"step": 4
},
{
"epoch": 0.02824858757062147,
"grad_norm": 0.6338562965393066,
"learning_rate": 5e-05,
"loss": 88.64,
"step": 5
},
{
"epoch": 0.03389830508474576,
"grad_norm": 0.5904527902603149,
"learning_rate": 6e-05,
"loss": 88.6842,
"step": 6
},
{
"epoch": 0.03954802259887006,
"grad_norm": 0.629975438117981,
"learning_rate": 7e-05,
"loss": 88.6572,
"step": 7
},
{
"epoch": 0.04519774011299435,
"grad_norm": 0.6180629730224609,
"learning_rate": 8e-05,
"loss": 88.6373,
"step": 8
},
{
"epoch": 0.05084745762711865,
"grad_norm": 0.6689253449440002,
"learning_rate": 9e-05,
"loss": 88.5833,
"step": 9
},
{
"epoch": 0.05649717514124294,
"grad_norm": 0.676450788974762,
"learning_rate": 0.0001,
"loss": 88.6067,
"step": 10
},
{
"epoch": 0.062146892655367235,
"grad_norm": 0.6853811740875244,
"learning_rate": 9.999791493378921e-05,
"loss": 88.6227,
"step": 11
},
{
"epoch": 0.06779661016949153,
"grad_norm": 0.6194440126419067,
"learning_rate": 9.999165990905683e-05,
"loss": 88.6231,
"step": 12
},
{
"epoch": 0.07344632768361582,
"grad_norm": 0.6684823632240295,
"learning_rate": 9.998123544748852e-05,
"loss": 88.6185,
"step": 13
},
{
"epoch": 0.07909604519774012,
"grad_norm": 0.6482928991317749,
"learning_rate": 9.996664241851197e-05,
"loss": 88.6342,
"step": 14
},
{
"epoch": 0.0847457627118644,
"grad_norm": 0.7201379537582397,
"learning_rate": 9.994788203922447e-05,
"loss": 88.5714,
"step": 15
},
{
"epoch": 0.0903954802259887,
"grad_norm": 0.6155017614364624,
"learning_rate": 9.992495587429129e-05,
"loss": 88.5541,
"step": 16
},
{
"epoch": 0.096045197740113,
"grad_norm": 0.6636160016059875,
"learning_rate": 9.989786583581535e-05,
"loss": 88.5598,
"step": 17
},
{
"epoch": 0.1016949152542373,
"grad_norm": 0.6538222432136536,
"learning_rate": 9.986661418317759e-05,
"loss": 88.5863,
"step": 18
},
{
"epoch": 0.10734463276836158,
"grad_norm": 0.5915122628211975,
"learning_rate": 9.98312035228486e-05,
"loss": 88.6102,
"step": 19
},
{
"epoch": 0.11299435028248588,
"grad_norm": 0.6704850792884827,
"learning_rate": 9.979163680817124e-05,
"loss": 88.5026,
"step": 20
},
{
"epoch": 0.11864406779661017,
"grad_norm": 0.6887699961662292,
"learning_rate": 9.97479173391143e-05,
"loss": 88.5315,
"step": 21
},
{
"epoch": 0.12429378531073447,
"grad_norm": 0.7467436790466309,
"learning_rate": 9.97000487619973e-05,
"loss": 88.5097,
"step": 22
},
{
"epoch": 0.12994350282485875,
"grad_norm": 0.750785231590271,
"learning_rate": 9.964803506918634e-05,
"loss": 88.5409,
"step": 23
},
{
"epoch": 0.13559322033898305,
"grad_norm": 0.7216200828552246,
"learning_rate": 9.959188059876115e-05,
"loss": 88.4787,
"step": 24
},
{
"epoch": 0.14124293785310735,
"grad_norm": 0.7141463160514832,
"learning_rate": 9.953159003415328e-05,
"loss": 88.5226,
"step": 25
},
{
"epoch": 0.14689265536723164,
"grad_norm": 0.7239276766777039,
"learning_rate": 9.946716840375551e-05,
"loss": 88.4626,
"step": 26
},
{
"epoch": 0.15254237288135594,
"grad_norm": 0.7370103001594543,
"learning_rate": 9.939862108050243e-05,
"loss": 88.5769,
"step": 27
},
{
"epoch": 0.15819209039548024,
"grad_norm": 0.6700708270072937,
"learning_rate": 9.932595378142233e-05,
"loss": 88.558,
"step": 28
},
{
"epoch": 0.1638418079096045,
"grad_norm": 0.7646386027336121,
"learning_rate": 9.924917256716042e-05,
"loss": 88.4619,
"step": 29
},
{
"epoch": 0.1694915254237288,
"grad_norm": 0.6728429198265076,
"learning_rate": 9.916828384147331e-05,
"loss": 88.5065,
"step": 30
},
{
"epoch": 0.1751412429378531,
"grad_norm": 0.7492191195487976,
"learning_rate": 9.908329435069495e-05,
"loss": 88.4889,
"step": 31
},
{
"epoch": 0.1807909604519774,
"grad_norm": 0.7224460244178772,
"learning_rate": 9.899421118317398e-05,
"loss": 88.4961,
"step": 32
},
{
"epoch": 0.1864406779661017,
"grad_norm": 0.7774916887283325,
"learning_rate": 9.890104176868247e-05,
"loss": 88.6111,
"step": 33
},
{
"epoch": 0.192090395480226,
"grad_norm": 0.7639403939247131,
"learning_rate": 9.880379387779637e-05,
"loss": 88.4938,
"step": 34
},
{
"epoch": 0.1977401129943503,
"grad_norm": 0.683748185634613,
"learning_rate": 9.87024756212473e-05,
"loss": 88.5265,
"step": 35
},
{
"epoch": 0.2033898305084746,
"grad_norm": 0.7794637680053711,
"learning_rate": 9.859709544924624e-05,
"loss": 88.5168,
"step": 36
},
{
"epoch": 0.20903954802259886,
"grad_norm": 0.7606369256973267,
"learning_rate": 9.848766215077858e-05,
"loss": 88.4687,
"step": 37
},
{
"epoch": 0.21468926553672316,
"grad_norm": 0.7200310826301575,
"learning_rate": 9.837418485287127e-05,
"loss": 88.494,
"step": 38
},
{
"epoch": 0.22033898305084745,
"grad_norm": 0.804512619972229,
"learning_rate": 9.825667301983148e-05,
"loss": 88.4699,
"step": 39
},
{
"epoch": 0.22598870056497175,
"grad_norm": 0.7734663486480713,
"learning_rate": 9.813513645245729e-05,
"loss": 88.4918,
"step": 40
},
{
"epoch": 0.23163841807909605,
"grad_norm": 0.8824536204338074,
"learning_rate": 9.800958528722036e-05,
"loss": 88.5269,
"step": 41
},
{
"epoch": 0.23728813559322035,
"grad_norm": 0.8647343516349792,
"learning_rate": 9.78800299954203e-05,
"loss": 88.4415,
"step": 42
},
{
"epoch": 0.24293785310734464,
"grad_norm": 0.8488788604736328,
"learning_rate": 9.774648138231163e-05,
"loss": 88.5083,
"step": 43
},
{
"epoch": 0.24858757062146894,
"grad_norm": 1.1741865873336792,
"learning_rate": 9.760895058620235e-05,
"loss": 88.4342,
"step": 44
},
{
"epoch": 0.2542372881355932,
"grad_norm": 0.6116179823875427,
"learning_rate": 9.746744907752509e-05,
"loss": 88.5531,
"step": 45
},
{
"epoch": 0.2598870056497175,
"grad_norm": 0.6405426859855652,
"learning_rate": 9.732198865788047e-05,
"loss": 88.5101,
"step": 46
},
{
"epoch": 0.2655367231638418,
"grad_norm": 0.6234097480773926,
"learning_rate": 9.71725814590527e-05,
"loss": 88.5438,
"step": 47
},
{
"epoch": 0.2711864406779661,
"grad_norm": 0.6477924585342407,
"learning_rate": 9.701923994199784e-05,
"loss": 88.4566,
"step": 48
},
{
"epoch": 0.2768361581920904,
"grad_norm": 0.5889535546302795,
"learning_rate": 9.686197689580456e-05,
"loss": 88.515,
"step": 49
},
{
"epoch": 0.2824858757062147,
"grad_norm": 0.7161465883255005,
"learning_rate": 9.67008054366274e-05,
"loss": 88.4243,
"step": 50
},
{
"epoch": 0.288135593220339,
"grad_norm": 0.6408765316009521,
"learning_rate": 9.653573900659292e-05,
"loss": 88.4917,
"step": 51
},
{
"epoch": 0.2937853107344633,
"grad_norm": 0.7543685436248779,
"learning_rate": 9.636679137267852e-05,
"loss": 88.443,
"step": 52
},
{
"epoch": 0.2994350282485876,
"grad_norm": 0.71647709608078,
"learning_rate": 9.619397662556435e-05,
"loss": 88.4699,
"step": 53
},
{
"epoch": 0.3050847457627119,
"grad_norm": 0.7122645974159241,
"learning_rate": 9.601730917845797e-05,
"loss": 88.401,
"step": 54
},
{
"epoch": 0.3107344632768362,
"grad_norm": 0.7030540704727173,
"learning_rate": 9.583680376589241e-05,
"loss": 88.4259,
"step": 55
},
{
"epoch": 0.3163841807909605,
"grad_norm": 0.7593639492988586,
"learning_rate": 9.56524754424971e-05,
"loss": 88.4487,
"step": 56
},
{
"epoch": 0.3220338983050847,
"grad_norm": 0.7722213268280029,
"learning_rate": 9.546433958174238e-05,
"loss": 88.43,
"step": 57
},
{
"epoch": 0.327683615819209,
"grad_norm": 0.7288438677787781,
"learning_rate": 9.527241187465734e-05,
"loss": 88.3441,
"step": 58
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.7370777130126953,
"learning_rate": 9.507670832852102e-05,
"loss": 88.4693,
"step": 59
},
{
"epoch": 0.3389830508474576,
"grad_norm": 0.6613349318504333,
"learning_rate": 9.487724526552753e-05,
"loss": 88.3618,
"step": 60
},
{
"epoch": 0.3446327683615819,
"grad_norm": 0.6312256455421448,
"learning_rate": 9.467403932142452e-05,
"loss": 88.3936,
"step": 61
},
{
"epoch": 0.3502824858757062,
"grad_norm": 0.7118995189666748,
"learning_rate": 9.446710744412595e-05,
"loss": 88.4331,
"step": 62
},
{
"epoch": 0.3559322033898305,
"grad_norm": 0.6945487260818481,
"learning_rate": 9.425646689229842e-05,
"loss": 88.3982,
"step": 63
},
{
"epoch": 0.3615819209039548,
"grad_norm": 0.617613673210144,
"learning_rate": 9.404213523392183e-05,
"loss": 88.4616,
"step": 64
},
{
"epoch": 0.3672316384180791,
"grad_norm": 0.6954861879348755,
"learning_rate": 9.38241303448241e-05,
"loss": 88.3863,
"step": 65
},
{
"epoch": 0.3728813559322034,
"grad_norm": 0.6992458701133728,
"learning_rate": 9.360247040719039e-05,
"loss": 88.3965,
"step": 66
},
{
"epoch": 0.3785310734463277,
"grad_norm": 0.6800311207771301,
"learning_rate": 9.337717390804652e-05,
"loss": 88.3868,
"step": 67
},
{
"epoch": 0.384180790960452,
"grad_norm": 0.7097700238227844,
"learning_rate": 9.314825963771723e-05,
"loss": 88.3632,
"step": 68
},
{
"epoch": 0.3898305084745763,
"grad_norm": 0.7166731953620911,
"learning_rate": 9.29157466882589e-05,
"loss": 88.3418,
"step": 69
},
{
"epoch": 0.3954802259887006,
"grad_norm": 0.7072133421897888,
"learning_rate": 9.267965445186733e-05,
"loss": 88.3753,
"step": 70
},
{
"epoch": 0.4011299435028249,
"grad_norm": 0.7065291404724121,
"learning_rate": 9.24400026192603e-05,
"loss": 88.3499,
"step": 71
},
{
"epoch": 0.4067796610169492,
"grad_norm": 0.8113076090812683,
"learning_rate": 9.219681117803536e-05,
"loss": 88.3214,
"step": 72
},
{
"epoch": 0.4124293785310734,
"grad_norm": 0.7646364569664001,
"learning_rate": 9.195010041100275e-05,
"loss": 88.3064,
"step": 73
},
{
"epoch": 0.4180790960451977,
"grad_norm": 0.756248950958252,
"learning_rate": 9.16998908944939e-05,
"loss": 88.3968,
"step": 74
},
{
"epoch": 0.423728813559322,
"grad_norm": 0.805596113204956,
"learning_rate": 9.14462034966451e-05,
"loss": 88.3718,
"step": 75
},
{
"epoch": 0.4293785310734463,
"grad_norm": 0.6950631737709045,
"learning_rate": 9.118905937565722e-05,
"loss": 88.3981,
"step": 76
},
{
"epoch": 0.4350282485875706,
"grad_norm": 0.7553988695144653,
"learning_rate": 9.092847997803097e-05,
"loss": 88.3488,
"step": 77
},
{
"epoch": 0.4406779661016949,
"grad_norm": 0.7791416645050049,
"learning_rate": 9.066448703677828e-05,
"loss": 88.4197,
"step": 78
},
{
"epoch": 0.4463276836158192,
"grad_norm": 0.7216358184814453,
"learning_rate": 9.039710256960957e-05,
"loss": 88.3813,
"step": 79
},
{
"epoch": 0.4519774011299435,
"grad_norm": 0.8204500079154968,
"learning_rate": 9.012634887709754e-05,
"loss": 88.2346,
"step": 80
},
{
"epoch": 0.4576271186440678,
"grad_norm": 0.843241274356842,
"learning_rate": 8.985224854081726e-05,
"loss": 88.4055,
"step": 81
},
{
"epoch": 0.4632768361581921,
"grad_norm": 0.7781232595443726,
"learning_rate": 8.957482442146272e-05,
"loss": 88.372,
"step": 82
},
{
"epoch": 0.4689265536723164,
"grad_norm": 0.8177736401557922,
"learning_rate": 8.929409965694016e-05,
"loss": 88.3409,
"step": 83
},
{
"epoch": 0.4745762711864407,
"grad_norm": 0.8297736644744873,
"learning_rate": 8.901009766043847e-05,
"loss": 88.3283,
"step": 84
},
{
"epoch": 0.480225988700565,
"grad_norm": 0.8678383231163025,
"learning_rate": 8.872284211847629e-05,
"loss": 88.3416,
"step": 85
},
{
"epoch": 0.4858757062146893,
"grad_norm": 0.8668798804283142,
"learning_rate": 8.84323569889266e-05,
"loss": 88.2483,
"step": 86
},
{
"epoch": 0.4915254237288136,
"grad_norm": 0.9378306269645691,
"learning_rate": 8.813866649901856e-05,
"loss": 88.3618,
"step": 87
},
{
"epoch": 0.4971751412429379,
"grad_norm": 1.1833090782165527,
"learning_rate": 8.784179514331682e-05,
"loss": 88.2735,
"step": 88
},
{
"epoch": 0.5028248587570622,
"grad_norm": 0.6026833057403564,
"learning_rate": 8.75417676816787e-05,
"loss": 88.4478,
"step": 89
},
{
"epoch": 0.5084745762711864,
"grad_norm": 0.6131640672683716,
"learning_rate": 8.72386091371891e-05,
"loss": 88.4419,
"step": 90
},
{
"epoch": 0.5141242937853108,
"grad_norm": 0.6138200759887695,
"learning_rate": 8.693234479407353e-05,
"loss": 88.4078,
"step": 91
},
{
"epoch": 0.519774011299435,
"grad_norm": 0.6851401329040527,
"learning_rate": 8.662300019558931e-05,
"loss": 88.3743,
"step": 92
},
{
"epoch": 0.5254237288135594,
"grad_norm": 0.6595472693443298,
"learning_rate": 8.631060114189525e-05,
"loss": 88.387,
"step": 93
},
{
"epoch": 0.5310734463276836,
"grad_norm": 0.6642423272132874,
"learning_rate": 8.59951736878998e-05,
"loss": 88.2821,
"step": 94
},
{
"epoch": 0.536723163841808,
"grad_norm": 0.7108228206634521,
"learning_rate": 8.5676744141088e-05,
"loss": 88.2668,
"step": 95
},
{
"epoch": 0.5423728813559322,
"grad_norm": 0.7303377985954285,
"learning_rate": 8.535533905932738e-05,
"loss": 88.2737,
"step": 96
},
{
"epoch": 0.5480225988700564,
"grad_norm": 0.6971437931060791,
"learning_rate": 8.503098524865301e-05,
"loss": 88.342,
"step": 97
},
{
"epoch": 0.5536723163841808,
"grad_norm": 0.7270698547363281,
"learning_rate": 8.47037097610317e-05,
"loss": 88.2374,
"step": 98
},
{
"epoch": 0.559322033898305,
"grad_norm": 0.6642720103263855,
"learning_rate": 8.43735398921059e-05,
"loss": 88.2445,
"step": 99
},
{
"epoch": 0.5649717514124294,
"grad_norm": 0.6921072602272034,
"learning_rate": 8.404050317891711e-05,
"loss": 88.3058,
"step": 100
},
{
"epoch": 0.5706214689265536,
"grad_norm": 0.7314338088035583,
"learning_rate": 8.370462739760923e-05,
"loss": 88.1885,
"step": 101
},
{
"epoch": 0.576271186440678,
"grad_norm": 0.7479543089866638,
"learning_rate": 8.336594056111197e-05,
"loss": 88.3319,
"step": 102
},
{
"epoch": 0.5819209039548022,
"grad_norm": 0.6370671987533569,
"learning_rate": 8.30244709168045e-05,
"loss": 88.3088,
"step": 103
},
{
"epoch": 0.5875706214689266,
"grad_norm": 0.68263179063797,
"learning_rate": 8.268024694415947e-05,
"loss": 88.216,
"step": 104
},
{
"epoch": 0.5932203389830508,
"grad_norm": 0.7159000635147095,
"learning_rate": 8.233329735236789e-05,
"loss": 88.2724,
"step": 105
},
{
"epoch": 0.5988700564971752,
"grad_norm": 0.6509236097335815,
"learning_rate": 8.198365107794457e-05,
"loss": 88.2867,
"step": 106
},
{
"epoch": 0.6045197740112994,
"grad_norm": 0.6526631712913513,
"learning_rate": 8.163133728231482e-05,
"loss": 88.3246,
"step": 107
},
{
"epoch": 0.6101694915254238,
"grad_norm": 0.7395955920219421,
"learning_rate": 8.127638534938227e-05,
"loss": 88.276,
"step": 108
},
{
"epoch": 0.615819209039548,
"grad_norm": 0.6584829688072205,
"learning_rate": 8.09188248830782e-05,
"loss": 88.1927,
"step": 109
},
{
"epoch": 0.6214689265536724,
"grad_norm": 0.8070241808891296,
"learning_rate": 8.055868570489247e-05,
"loss": 88.1449,
"step": 110
},
{
"epoch": 0.6271186440677966,
"grad_norm": 0.6944290995597839,
"learning_rate": 8.019599785138635e-05,
"loss": 88.2262,
"step": 111
},
{
"epoch": 0.632768361581921,
"grad_norm": 0.7627915740013123,
"learning_rate": 7.983079157168736e-05,
"loss": 88.2257,
"step": 112
},
{
"epoch": 0.6384180790960452,
"grad_norm": 0.7771816849708557,
"learning_rate": 7.946309732496647e-05,
"loss": 88.211,
"step": 113
},
{
"epoch": 0.6440677966101694,
"grad_norm": 0.766257107257843,
"learning_rate": 7.909294577789766e-05,
"loss": 88.1717,
"step": 114
},
{
"epoch": 0.6497175141242938,
"grad_norm": 0.7782958149909973,
"learning_rate": 7.872036780210026e-05,
"loss": 88.15,
"step": 115
},
{
"epoch": 0.655367231638418,
"grad_norm": 0.7616683840751648,
"learning_rate": 7.834539447156424e-05,
"loss": 88.1597,
"step": 116
},
{
"epoch": 0.6610169491525424,
"grad_norm": 0.6816295981407166,
"learning_rate": 7.796805706005843e-05,
"loss": 88.1752,
"step": 117
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.7385973334312439,
"learning_rate": 7.75883870385223e-05,
"loss": 88.1828,
"step": 118
},
{
"epoch": 0.672316384180791,
"grad_norm": 0.7695619463920593,
"learning_rate": 7.72064160724412e-05,
"loss": 88.2801,
"step": 119
},
{
"epoch": 0.6779661016949152,
"grad_norm": 0.7640277147293091,
"learning_rate": 7.682217601920529e-05,
"loss": 88.2095,
"step": 120
},
{
"epoch": 0.6836158192090396,
"grad_norm": 0.7877722382545471,
"learning_rate": 7.643569892545267e-05,
"loss": 88.16,
"step": 121
},
{
"epoch": 0.6892655367231638,
"grad_norm": 0.7659584283828735,
"learning_rate": 7.604701702439651e-05,
"loss": 88.1962,
"step": 122
},
{
"epoch": 0.6949152542372882,
"grad_norm": 0.8098394870758057,
"learning_rate": 7.565616273313678e-05,
"loss": 88.1656,
"step": 123
},
{
"epoch": 0.7005649717514124,
"grad_norm": 0.7991154193878174,
"learning_rate": 7.526316864995647e-05,
"loss": 88.1524,
"step": 124
},
{
"epoch": 0.7062146892655368,
"grad_norm": 0.7656694650650024,
"learning_rate": 7.486806755160297e-05,
"loss": 88.1187,
"step": 125
},
{
"epoch": 0.711864406779661,
"grad_norm": 0.7287772297859192,
"learning_rate": 7.447089239055428e-05,
"loss": 88.1673,
"step": 126
},
{
"epoch": 0.7175141242937854,
"grad_norm": 0.8145878911018372,
"learning_rate": 7.407167629227072e-05,
"loss": 88.1032,
"step": 127
},
{
"epoch": 0.7231638418079096,
"grad_norm": 0.857018768787384,
"learning_rate": 7.367045255243216e-05,
"loss": 88.0358,
"step": 128
},
{
"epoch": 0.7288135593220338,
"grad_norm": 0.8250066041946411,
"learning_rate": 7.326725463416117e-05,
"loss": 88.2404,
"step": 129
},
{
"epoch": 0.7344632768361582,
"grad_norm": 0.9477486610412598,
"learning_rate": 7.286211616523193e-05,
"loss": 88.2214,
"step": 130
},
{
"epoch": 0.7401129943502824,
"grad_norm": 0.971202552318573,
"learning_rate": 7.245507093526574e-05,
"loss": 88.214,
"step": 131
},
{
"epoch": 0.7457627118644068,
"grad_norm": 1.0824439525604248,
"learning_rate": 7.204615289291283e-05,
"loss": 88.1631,
"step": 132
},
{
"epoch": 0.751412429378531,
"grad_norm": 0.7381979823112488,
"learning_rate": 7.163539614302088e-05,
"loss": 88.2446,
"step": 133
},
{
"epoch": 0.7570621468926554,
"grad_norm": 0.6837661266326904,
"learning_rate": 7.122283494379076e-05,
"loss": 88.2198,
"step": 134
},
{
"epoch": 0.7627118644067796,
"grad_norm": 0.6421964764595032,
"learning_rate": 7.080850370391907e-05,
"loss": 88.2727,
"step": 135
},
{
"epoch": 0.768361581920904,
"grad_norm": 0.6602257490158081,
"learning_rate": 7.039243697972857e-05,
"loss": 88.1771,
"step": 136
},
{
"epoch": 0.7740112994350282,
"grad_norm": 0.7069459557533264,
"learning_rate": 6.997466947228596e-05,
"loss": 88.1327,
"step": 137
},
{
"epoch": 0.7796610169491526,
"grad_norm": 0.6446311473846436,
"learning_rate": 6.95552360245078e-05,
"loss": 88.2014,
"step": 138
},
{
"epoch": 0.7853107344632768,
"grad_norm": 0.6669993996620178,
"learning_rate": 6.91341716182545e-05,
"loss": 88.1836,
"step": 139
},
{
"epoch": 0.7909604519774012,
"grad_norm": 0.7162554264068604,
"learning_rate": 6.871151137141266e-05,
"loss": 88.0806,
"step": 140
},
{
"epoch": 0.7966101694915254,
"grad_norm": 0.6205349564552307,
"learning_rate": 6.828729053496629e-05,
"loss": 88.1998,
"step": 141
},
{
"epoch": 0.8022598870056498,
"grad_norm": 0.6284624934196472,
"learning_rate": 6.786154449005665e-05,
"loss": 88.0922,
"step": 142
},
{
"epoch": 0.807909604519774,
"grad_norm": 0.6776576638221741,
"learning_rate": 6.743430874503143e-05,
"loss": 88.104,
"step": 143
},
{
"epoch": 0.8135593220338984,
"grad_norm": 0.6751952767372131,
"learning_rate": 6.700561893248332e-05,
"loss": 88.1876,
"step": 144
},
{
"epoch": 0.8192090395480226,
"grad_norm": 0.6897196769714355,
"learning_rate": 6.6575510806278e-05,
"loss": 88.1415,
"step": 145
},
{
"epoch": 0.8248587570621468,
"grad_norm": 0.6813532114028931,
"learning_rate": 6.614402023857232e-05,
"loss": 88.0379,
"step": 146
},
{
"epoch": 0.8305084745762712,
"grad_norm": 0.7052000164985657,
"learning_rate": 6.57111832168224e-05,
"loss": 88.0754,
"step": 147
},
{
"epoch": 0.8361581920903954,
"grad_norm": 0.6268584132194519,
"learning_rate": 6.527703584078218e-05,
"loss": 88.0336,
"step": 148
},
{
"epoch": 0.8418079096045198,
"grad_norm": 0.6451026201248169,
"learning_rate": 6.484161431949267e-05,
"loss": 88.1128,
"step": 149
},
{
"epoch": 0.847457627118644,
"grad_norm": 0.6330875158309937,
"learning_rate": 6.440495496826189e-05,
"loss": 88.0844,
"step": 150
},
{
"epoch": 0.8531073446327684,
"grad_norm": 0.5872354507446289,
"learning_rate": 6.39670942056362e-05,
"loss": 88.082,
"step": 151
},
{
"epoch": 0.8587570621468926,
"grad_norm": 0.6449838280677795,
"learning_rate": 6.352806855036287e-05,
"loss": 88.0767,
"step": 152
},
{
"epoch": 0.864406779661017,
"grad_norm": 0.6349008083343506,
"learning_rate": 6.308791461834426e-05,
"loss": 88.0604,
"step": 153
},
{
"epoch": 0.8700564971751412,
"grad_norm": 0.6187208890914917,
"learning_rate": 6.264666911958404e-05,
"loss": 88.1468,
"step": 154
},
{
"epoch": 0.8757062146892656,
"grad_norm": 0.6470857262611389,
"learning_rate": 6.22043688551254e-05,
"loss": 88.0885,
"step": 155
},
{
"epoch": 0.8813559322033898,
"grad_norm": 0.6183845400810242,
"learning_rate": 6.17610507139818e-05,
"loss": 88.0303,
"step": 156
},
{
"epoch": 0.8870056497175142,
"grad_norm": 0.6772004961967468,
"learning_rate": 6.13167516700603e-05,
"loss": 88.111,
"step": 157
},
{
"epoch": 0.8926553672316384,
"grad_norm": 0.6499753594398499,
"learning_rate": 6.0871508779077856e-05,
"loss": 88.0266,
"step": 158
},
{
"epoch": 0.8983050847457628,
"grad_norm": 0.6717754602432251,
"learning_rate": 6.04253591754708e-05,
"loss": 88.0712,
"step": 159
},
{
"epoch": 0.903954802259887,
"grad_norm": 0.6724316477775574,
"learning_rate": 5.9978340069297647e-05,
"loss": 87.9934,
"step": 160
},
{
"epoch": 0.9096045197740112,
"grad_norm": 0.6548104286193848,
"learning_rate": 5.9530488743135746e-05,
"loss": 88.0043,
"step": 161
},
{
"epoch": 0.9152542372881356,
"grad_norm": 0.6596248745918274,
"learning_rate": 5.908184254897182e-05,
"loss": 88.0424,
"step": 162
},
{
"epoch": 0.9209039548022598,
"grad_norm": 0.6875431537628174,
"learning_rate": 5.863243890508668e-05,
"loss": 88.0799,
"step": 163
},
{
"epoch": 0.9265536723163842,
"grad_norm": 0.6975457668304443,
"learning_rate": 5.8182315292934406e-05,
"loss": 88.0774,
"step": 164
},
{
"epoch": 0.9322033898305084,
"grad_norm": 0.6618639230728149,
"learning_rate": 5.773150925401641e-05,
"loss": 88.124,
"step": 165
},
{
"epoch": 0.9378531073446328,
"grad_norm": 0.6853637099266052,
"learning_rate": 5.7280058386750255e-05,
"loss": 87.9972,
"step": 166
},
{
"epoch": 0.943502824858757,
"grad_norm": 0.7230448126792908,
"learning_rate": 5.68280003433339e-05,
"loss": 87.9713,
"step": 167
},
{
"epoch": 0.9491525423728814,
"grad_norm": 0.7414244413375854,
"learning_rate": 5.63753728266054e-05,
"loss": 88.0774,
"step": 168
},
{
"epoch": 0.9548022598870056,
"grad_norm": 0.766106367111206,
"learning_rate": 5.592221358689843e-05,
"loss": 88.0479,
"step": 169
},
{
"epoch": 0.96045197740113,
"grad_norm": 0.7827813029289246,
"learning_rate": 5.546856041889373e-05,
"loss": 88.1115,
"step": 170
},
{
"epoch": 0.9661016949152542,
"grad_norm": 0.693665623664856,
"learning_rate": 5.5014451158466975e-05,
"loss": 88.0148,
"step": 171
},
{
"epoch": 0.9717514124293786,
"grad_norm": 0.7761551737785339,
"learning_rate": 5.4559923679533173e-05,
"loss": 87.9896,
"step": 172
},
{
"epoch": 0.9774011299435028,
"grad_norm": 0.7685063481330872,
"learning_rate": 5.410501589088785e-05,
"loss": 87.9791,
"step": 173
},
{
"epoch": 0.9830508474576272,
"grad_norm": 0.8432329297065735,
"learning_rate": 5.364976573304538e-05,
"loss": 88.1228,
"step": 174
},
{
"epoch": 0.9887005649717514,
"grad_norm": 0.9350976347923279,
"learning_rate": 5.319421117507462e-05,
"loss": 87.9292,
"step": 175
},
{
"epoch": 0.9943502824858758,
"grad_norm": 1.2149473428726196,
"learning_rate": 5.273839021143218e-05,
"loss": 87.9655,
"step": 176
},
{
"epoch": 1.0,
"grad_norm": 0.6291913986206055,
"learning_rate": 5.22823408587937e-05,
"loss": 88.1277,
"step": 177
},
{
"epoch": 1.0,
"eval_loss": 11.00737476348877,
"eval_runtime": 0.6468,
"eval_samples_per_second": 460.76,
"eval_steps_per_second": 115.963,
"step": 177
},
{
"epoch": 1.0056497175141244,
"grad_norm": 0.6629582047462463,
"learning_rate": 5.182610115288295e-05,
"loss": 88.1835,
"step": 178
},
{
"epoch": 1.0112994350282485,
"grad_norm": 0.6663825511932373,
"learning_rate": 5.136970914529975e-05,
"loss": 88.162,
"step": 179
},
{
"epoch": 1.0169491525423728,
"grad_norm": 0.6589259505271912,
"learning_rate": 5.091320290034625e-05,
"loss": 88.1961,
"step": 180
},
{
"epoch": 1.0225988700564972,
"grad_norm": 0.5902798175811768,
"learning_rate": 5.045662049185229e-05,
"loss": 88.0916,
"step": 181
},
{
"epoch": 1.0282485875706215,
"grad_norm": 0.5283370018005371,
"learning_rate": 5e-05,
"loss": 88.119,
"step": 182
},
{
"epoch": 1.0338983050847457,
"grad_norm": 0.5795127749443054,
"learning_rate": 4.9543379508147716e-05,
"loss": 88.1366,
"step": 183
},
{
"epoch": 1.03954802259887,
"grad_norm": 0.6025692820549011,
"learning_rate": 4.9086797099653756e-05,
"loss": 88.0548,
"step": 184
},
{
"epoch": 1.0451977401129944,
"grad_norm": 0.5939972996711731,
"learning_rate": 4.863029085470026e-05,
"loss": 87.9877,
"step": 185
},
{
"epoch": 1.0508474576271187,
"grad_norm": 0.5899671316146851,
"learning_rate": 4.817389884711705e-05,
"loss": 88.0826,
"step": 186
},
{
"epoch": 1.0564971751412429,
"grad_norm": 0.5887870788574219,
"learning_rate": 4.771765914120631e-05,
"loss": 88.1155,
"step": 187
},
{
"epoch": 1.0621468926553672,
"grad_norm": 0.5916518568992615,
"learning_rate": 4.726160978856782e-05,
"loss": 88.0002,
"step": 188
},
{
"epoch": 1.0677966101694916,
"grad_norm": 0.6661847233772278,
"learning_rate": 4.6805788824925386e-05,
"loss": 88.0365,
"step": 189
},
{
"epoch": 1.073446327683616,
"grad_norm": 0.5642584562301636,
"learning_rate": 4.6350234266954626e-05,
"loss": 88.0447,
"step": 190
},
{
"epoch": 1.07909604519774,
"grad_norm": 0.5734567046165466,
"learning_rate": 4.589498410911215e-05,
"loss": 88.0759,
"step": 191
},
{
"epoch": 1.0847457627118644,
"grad_norm": 0.5504806041717529,
"learning_rate": 4.5440076320466825e-05,
"loss": 88.0262,
"step": 192
},
{
"epoch": 1.0903954802259888,
"grad_norm": 0.5161648392677307,
"learning_rate": 4.4985548841533036e-05,
"loss": 88.0113,
"step": 193
},
{
"epoch": 1.0960451977401129,
"grad_norm": 0.5934569239616394,
"learning_rate": 4.4531439581106295e-05,
"loss": 88.1235,
"step": 194
},
{
"epoch": 1.1016949152542372,
"grad_norm": 0.5542126297950745,
"learning_rate": 4.4077786413101595e-05,
"loss": 88.0276,
"step": 195
},
{
"epoch": 1.1073446327683616,
"grad_norm": 0.6331348419189453,
"learning_rate": 4.362462717339461e-05,
"loss": 88.0501,
"step": 196
},
{
"epoch": 1.112994350282486,
"grad_norm": 0.5662806630134583,
"learning_rate": 4.317199965666613e-05,
"loss": 88.0157,
"step": 197
},
{
"epoch": 1.11864406779661,
"grad_norm": 0.5534279942512512,
"learning_rate": 4.271994161324977e-05,
"loss": 87.9114,
"step": 198
},
{
"epoch": 1.1242937853107344,
"grad_norm": 0.5692214369773865,
"learning_rate": 4.22684907459836e-05,
"loss": 87.9938,
"step": 199
},
{
"epoch": 1.1299435028248588,
"grad_norm": 0.6049486994743347,
"learning_rate": 4.1817684707065605e-05,
"loss": 87.9819,
"step": 200
},
{
"epoch": 1.1355932203389831,
"grad_norm": 0.5856545567512512,
"learning_rate": 4.1367561094913335e-05,
"loss": 87.9492,
"step": 201
},
{
"epoch": 1.1412429378531073,
"grad_norm": 0.5690354704856873,
"learning_rate": 4.0918157451028185e-05,
"loss": 87.9628,
"step": 202
},
{
"epoch": 1.1468926553672316,
"grad_norm": 0.6273289322853088,
"learning_rate": 4.0469511256864265e-05,
"loss": 88.0531,
"step": 203
},
{
"epoch": 1.152542372881356,
"grad_norm": 0.6032590866088867,
"learning_rate": 4.002165993070237e-05,
"loss": 87.8896,
"step": 204
},
{
"epoch": 1.1581920903954803,
"grad_norm": 0.5722517371177673,
"learning_rate": 3.957464082452922e-05,
"loss": 87.9297,
"step": 205
},
{
"epoch": 1.1638418079096045,
"grad_norm": 0.6684175729751587,
"learning_rate": 3.9128491220922156e-05,
"loss": 88.0202,
"step": 206
},
{
"epoch": 1.1694915254237288,
"grad_norm": 0.6134036779403687,
"learning_rate": 3.8683248329939716e-05,
"loss": 87.9706,
"step": 207
},
{
"epoch": 1.1751412429378532,
"grad_norm": 0.6217228174209595,
"learning_rate": 3.823894928601822e-05,
"loss": 88.0366,
"step": 208
},
{
"epoch": 1.1807909604519775,
"grad_norm": 0.577599287033081,
"learning_rate": 3.7795631144874604e-05,
"loss": 88.0188,
"step": 209
},
{
"epoch": 1.1864406779661016,
"grad_norm": 0.6666799783706665,
"learning_rate": 3.735333088041596e-05,
"loss": 88.0513,
"step": 210
},
{
"epoch": 1.192090395480226,
"grad_norm": 0.5872676968574524,
"learning_rate": 3.6912085381655734e-05,
"loss": 88.0103,
"step": 211
},
{
"epoch": 1.1977401129943503,
"grad_norm": 0.5996030569076538,
"learning_rate": 3.6471931449637124e-05,
"loss": 87.9581,
"step": 212
},
{
"epoch": 1.2033898305084745,
"grad_norm": 0.6568124890327454,
"learning_rate": 3.60329057943638e-05,
"loss": 87.9921,
"step": 213
},
{
"epoch": 1.2090395480225988,
"grad_norm": 0.7044454216957092,
"learning_rate": 3.5595045031738125e-05,
"loss": 88.0731,
"step": 214
},
{
"epoch": 1.2146892655367232,
"grad_norm": 0.7266508936882019,
"learning_rate": 3.515838568050736e-05,
"loss": 88.0184,
"step": 215
},
{
"epoch": 1.2203389830508475,
"grad_norm": 0.7107569575309753,
"learning_rate": 3.472296415921783e-05,
"loss": 88.0427,
"step": 216
},
{
"epoch": 1.2259887005649717,
"grad_norm": 0.7413809299468994,
"learning_rate": 3.428881678317763e-05,
"loss": 87.9618,
"step": 217
},
{
"epoch": 1.231638418079096,
"grad_norm": 0.7287234663963318,
"learning_rate": 3.38559797614277e-05,
"loss": 88.007,
"step": 218
},
{
"epoch": 1.2372881355932204,
"grad_norm": 0.8537057042121887,
"learning_rate": 3.3424489193722013e-05,
"loss": 88.055,
"step": 219
},
{
"epoch": 1.2429378531073447,
"grad_norm": 0.8392075300216675,
"learning_rate": 3.2994381067516696e-05,
"loss": 88.0221,
"step": 220
},
{
"epoch": 1.2485875706214689,
"grad_norm": 1.036270022392273,
"learning_rate": 3.256569125496858e-05,
"loss": 87.9944,
"step": 221
},
{
"epoch": 1.2542372881355932,
"grad_norm": 0.633751392364502,
"learning_rate": 3.2138455509943366e-05,
"loss": 88.1435,
"step": 222
},
{
"epoch": 1.2598870056497176,
"grad_norm": 0.5413132309913635,
"learning_rate": 3.171270946503373e-05,
"loss": 88.1694,
"step": 223
},
{
"epoch": 1.2655367231638417,
"grad_norm": 0.5542528629302979,
"learning_rate": 3.128848862858734e-05,
"loss": 88.1215,
"step": 224
},
{
"epoch": 1.271186440677966,
"grad_norm": 0.5940180420875549,
"learning_rate": 3.086582838174551e-05,
"loss": 88.0598,
"step": 225
},
{
"epoch": 1.2768361581920904,
"grad_norm": 0.5067227482795715,
"learning_rate": 3.0444763975492208e-05,
"loss": 88.0876,
"step": 226
},
{
"epoch": 1.2824858757062148,
"grad_norm": 0.5656532049179077,
"learning_rate": 3.0025330527714046e-05,
"loss": 88.0915,
"step": 227
},
{
"epoch": 1.288135593220339,
"grad_norm": 0.52092045545578,
"learning_rate": 2.9607563020271446e-05,
"loss": 88.0636,
"step": 228
},
{
"epoch": 1.2937853107344632,
"grad_norm": 0.5944317579269409,
"learning_rate": 2.9191496296080935e-05,
"loss": 87.999,
"step": 229
},
{
"epoch": 1.2994350282485876,
"grad_norm": 0.5355740189552307,
"learning_rate": 2.8777165056209256e-05,
"loss": 88.0768,
"step": 230
},
{
"epoch": 1.305084745762712,
"grad_norm": 0.5801308155059814,
"learning_rate": 2.836460385697911e-05,
"loss": 88.016,
"step": 231
},
{
"epoch": 1.310734463276836,
"grad_norm": 0.5269469618797302,
"learning_rate": 2.7953847107087172e-05,
"loss": 87.9684,
"step": 232
},
{
"epoch": 1.3163841807909604,
"grad_norm": 0.5054383873939514,
"learning_rate": 2.754492906473425e-05,
"loss": 88.1685,
"step": 233
},
{
"epoch": 1.3220338983050848,
"grad_norm": 0.6001875400543213,
"learning_rate": 2.7137883834768073e-05,
"loss": 88.0379,
"step": 234
},
{
"epoch": 1.327683615819209,
"grad_norm": 0.5223692059516907,
"learning_rate": 2.6732745365838828e-05,
"loss": 87.9794,
"step": 235
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.6370963454246521,
"learning_rate": 2.6329547447567836e-05,
"loss": 87.9989,
"step": 236
},
{
"epoch": 1.3389830508474576,
"grad_norm": 0.49859458208084106,
"learning_rate": 2.5928323707729306e-05,
"loss": 88.0401,
"step": 237
},
{
"epoch": 1.344632768361582,
"grad_norm": 0.544282078742981,
"learning_rate": 2.5529107609445733e-05,
"loss": 88.0129,
"step": 238
},
{
"epoch": 1.3502824858757063,
"grad_norm": 0.6058323979377747,
"learning_rate": 2.513193244839704e-05,
"loss": 88.0901,
"step": 239
},
{
"epoch": 1.3559322033898304,
"grad_norm": 0.5392006635665894,
"learning_rate": 2.4736831350043536e-05,
"loss": 88.0176,
"step": 240
},
{
"epoch": 1.3615819209039548,
"grad_norm": 0.5761566758155823,
"learning_rate": 2.4343837266863246e-05,
"loss": 88.0378,
"step": 241
},
{
"epoch": 1.3672316384180792,
"grad_norm": 0.5357884764671326,
"learning_rate": 2.3952982975603496e-05,
"loss": 88.0098,
"step": 242
},
{
"epoch": 1.3728813559322033,
"grad_norm": 0.5242740511894226,
"learning_rate": 2.356430107454733e-05,
"loss": 88.0394,
"step": 243
},
{
"epoch": 1.3785310734463276,
"grad_norm": 0.5258982181549072,
"learning_rate": 2.3177823980794727e-05,
"loss": 87.9584,
"step": 244
},
{
"epoch": 1.384180790960452,
"grad_norm": 0.6452914476394653,
"learning_rate": 2.279358392755882e-05,
"loss": 88.0219,
"step": 245
},
{
"epoch": 1.3898305084745763,
"grad_norm": 0.5566428899765015,
"learning_rate": 2.24116129614777e-05,
"loss": 87.9679,
"step": 246
},
{
"epoch": 1.3954802259887007,
"grad_norm": 0.5655139088630676,
"learning_rate": 2.2031942939941592e-05,
"loss": 87.9924,
"step": 247
},
{
"epoch": 1.4011299435028248,
"grad_norm": 0.6263113617897034,
"learning_rate": 2.1654605528435773e-05,
"loss": 87.919,
"step": 248
},
{
"epoch": 1.4067796610169492,
"grad_norm": 0.5700308084487915,
"learning_rate": 2.127963219789974e-05,
"loss": 87.9079,
"step": 249
},
{
"epoch": 1.4124293785310735,
"grad_norm": 0.5626516938209534,
"learning_rate": 2.090705422210237e-05,
"loss": 87.9963,
"step": 250
},
{
"epoch": 1.4180790960451977,
"grad_norm": 0.5786779522895813,
"learning_rate": 2.0536902675033548e-05,
"loss": 87.9655,
"step": 251
},
{
"epoch": 1.423728813559322,
"grad_norm": 0.6087446212768555,
"learning_rate": 2.0169208428312647e-05,
"loss": 87.9886,
"step": 252
},
{
"epoch": 1.4293785310734464,
"grad_norm": 0.627742350101471,
"learning_rate": 1.980400214861367e-05,
"loss": 88.072,
"step": 253
},
{
"epoch": 1.4350282485875705,
"grad_norm": 0.6009258031845093,
"learning_rate": 1.9441314295107537e-05,
"loss": 87.9974,
"step": 254
},
{
"epoch": 1.4406779661016949,
"grad_norm": 0.5660704374313354,
"learning_rate": 1.90811751169218e-05,
"loss": 87.9457,
"step": 255
},
{
"epoch": 1.4463276836158192,
"grad_norm": 0.6332747340202332,
"learning_rate": 1.8723614650617723e-05,
"loss": 88.0142,
"step": 256
},
{
"epoch": 1.4519774011299436,
"grad_norm": 0.653758704662323,
"learning_rate": 1.8368662717685187e-05,
"loss": 88.0092,
"step": 257
},
{
"epoch": 1.457627118644068,
"grad_norm": 0.574155330657959,
"learning_rate": 1.801634892205545e-05,
"loss": 87.9217,
"step": 258
},
{
"epoch": 1.463276836158192,
"grad_norm": 0.6903696060180664,
"learning_rate": 1.766670264763213e-05,
"loss": 87.9605,
"step": 259
},
{
"epoch": 1.4689265536723164,
"grad_norm": 0.6985341310501099,
"learning_rate": 1.7319753055840553e-05,
"loss": 88.1013,
"step": 260
},
{
"epoch": 1.4745762711864407,
"grad_norm": 0.6917281150817871,
"learning_rate": 1.697552908319553e-05,
"loss": 87.9394,
"step": 261
},
{
"epoch": 1.4802259887005649,
"grad_norm": 0.7425939440727234,
"learning_rate": 1.6634059438888033e-05,
"loss": 87.9243,
"step": 262
},
{
"epoch": 1.4858757062146892,
"grad_norm": 0.7640402317047119,
"learning_rate": 1.6295372602390767e-05,
"loss": 87.9382,
"step": 263
},
{
"epoch": 1.4915254237288136,
"grad_norm": 0.8300817608833313,
"learning_rate": 1.5959496821082905e-05,
"loss": 87.8827,
"step": 264
},
{
"epoch": 1.497175141242938,
"grad_norm": 1.0759795904159546,
"learning_rate": 1.562646010789411e-05,
"loss": 87.8789,
"step": 265
},
{
"epoch": 1.5028248587570623,
"grad_norm": 0.6373040080070496,
"learning_rate": 1.5296290238968303e-05,
"loss": 88.0713,
"step": 266
},
{
"epoch": 1.5084745762711864,
"grad_norm": 0.5541619062423706,
"learning_rate": 1.496901475134701e-05,
"loss": 88.1348,
"step": 267
},
{
"epoch": 1.5141242937853108,
"grad_norm": 0.5655987858772278,
"learning_rate": 1.4644660940672627e-05,
"loss": 88.1458,
"step": 268
},
{
"epoch": 1.5197740112994351,
"grad_norm": 0.5608534216880798,
"learning_rate": 1.4323255858912011e-05,
"loss": 88.1328,
"step": 269
},
{
"epoch": 1.5254237288135593,
"grad_norm": 0.5184534192085266,
"learning_rate": 1.4004826312100216e-05,
"loss": 88.049,
"step": 270
},
{
"epoch": 1.5310734463276836,
"grad_norm": 0.5682441592216492,
"learning_rate": 1.3689398858104751e-05,
"loss": 88.0269,
"step": 271
},
{
"epoch": 1.536723163841808,
"grad_norm": 0.46682822704315186,
"learning_rate": 1.337699980441069e-05,
"loss": 88.0322,
"step": 272
},
{
"epoch": 1.542372881355932,
"grad_norm": 0.55782550573349,
"learning_rate": 1.3067655205926488e-05,
"loss": 87.9584,
"step": 273
},
{
"epoch": 1.5480225988700564,
"grad_norm": 0.549993097782135,
"learning_rate": 1.2761390862810907e-05,
"loss": 87.9824,
"step": 274
},
{
"epoch": 1.5536723163841808,
"grad_norm": 0.49282097816467285,
"learning_rate": 1.2458232318321305e-05,
"loss": 87.9938,
"step": 275
},
{
"epoch": 1.559322033898305,
"grad_norm": 0.5615452527999878,
"learning_rate": 1.2158204856683176e-05,
"loss": 87.9862,
"step": 276
},
{
"epoch": 1.5649717514124295,
"grad_norm": 0.5333657264709473,
"learning_rate": 1.1861333500981448e-05,
"loss": 87.9579,
"step": 277
},
{
"epoch": 1.5706214689265536,
"grad_norm": 0.5501605868339539,
"learning_rate": 1.1567643011073392e-05,
"loss": 87.8844,
"step": 278
},
{
"epoch": 1.576271186440678,
"grad_norm": 0.5416379570960999,
"learning_rate": 1.127715788152372e-05,
"loss": 87.9542,
"step": 279
},
{
"epoch": 1.5819209039548023,
"grad_norm": 0.5352757573127747,
"learning_rate": 1.0989902339561553e-05,
"loss": 87.9788,
"step": 280
},
{
"epoch": 1.5875706214689265,
"grad_norm": 0.514640748500824,
"learning_rate": 1.0705900343059855e-05,
"loss": 87.9644,
"step": 281
},
{
"epoch": 1.5932203389830508,
"grad_norm": 0.5264514088630676,
"learning_rate": 1.0425175578537299e-05,
"loss": 88.0876,
"step": 282
},
{
"epoch": 1.5988700564971752,
"grad_norm": 0.5024112462997437,
"learning_rate": 1.0147751459182736e-05,
"loss": 87.9844,
"step": 283
},
{
"epoch": 1.6045197740112993,
"grad_norm": 0.48817285895347595,
"learning_rate": 9.873651122902472e-06,
"loss": 88.0376,
"step": 284
},
{
"epoch": 1.6101694915254239,
"grad_norm": 0.6276935935020447,
"learning_rate": 9.602897430390457e-06,
"loss": 87.933,
"step": 285
},
{
"epoch": 1.615819209039548,
"grad_norm": 0.5100238919258118,
"learning_rate": 9.335512963221732e-06,
"loss": 88.0798,
"step": 286
},
{
"epoch": 1.6214689265536724,
"grad_norm": 0.534130871295929,
"learning_rate": 9.071520021969027e-06,
"loss": 88.1008,
"step": 287
},
{
"epoch": 1.6271186440677967,
"grad_norm": 0.5745760798454285,
"learning_rate": 8.810940624342785e-06,
"loss": 87.9179,
"step": 288
},
{
"epoch": 1.6327683615819208,
"grad_norm": 0.5422095656394958,
"learning_rate": 8.553796503354899e-06,
"loss": 87.9494,
"step": 289
},
{
"epoch": 1.6384180790960452,
"grad_norm": 0.5460516214370728,
"learning_rate": 8.30010910550611e-06,
"loss": 87.9545,
"step": 290
},
{
"epoch": 1.6440677966101696,
"grad_norm": 0.6041167974472046,
"learning_rate": 8.049899588997244e-06,
"loss": 87.9521,
"step": 291
},
{
"epoch": 1.6497175141242937,
"grad_norm": 0.5654332637786865,
"learning_rate": 7.803188821964652e-06,
"loss": 87.9232,
"step": 292
},
{
"epoch": 1.655367231638418,
"grad_norm": 0.5767014026641846,
"learning_rate": 7.559997380739714e-06,
"loss": 87.8771,
"step": 293
},
{
"epoch": 1.6610169491525424,
"grad_norm": 0.5665624737739563,
"learning_rate": 7.320345548132679e-06,
"loss": 88.0009,
"step": 294
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.5585253238677979,
"learning_rate": 7.084253311741101e-06,
"loss": 87.9623,
"step": 295
},
{
"epoch": 1.672316384180791,
"grad_norm": 0.6449034214019775,
"learning_rate": 6.851740362282788e-06,
"loss": 87.968,
"step": 296
},
{
"epoch": 1.6779661016949152,
"grad_norm": 0.6085680723190308,
"learning_rate": 6.622826091953482e-06,
"loss": 88.0431,
"step": 297
},
{
"epoch": 1.6836158192090396,
"grad_norm": 0.7078083753585815,
"learning_rate": 6.397529592809614e-06,
"loss": 87.9145,
"step": 298
},
{
"epoch": 1.689265536723164,
"grad_norm": 0.5888929963111877,
"learning_rate": 6.1758696551758976e-06,
"loss": 87.974,
"step": 299
},
{
"epoch": 1.694915254237288,
"grad_norm": 0.6576589941978455,
"learning_rate": 5.957864766078186e-06,
"loss": 87.9155,
"step": 300
},
{
"epoch": 1.7005649717514124,
"grad_norm": 0.6700954437255859,
"learning_rate": 5.743533107701593e-06,
"loss": 87.9094,
"step": 301
},
{
"epoch": 1.7062146892655368,
"grad_norm": 0.6426727175712585,
"learning_rate": 5.532892555874059e-06,
"loss": 87.9853,
"step": 302
},
{
"epoch": 1.711864406779661,
"grad_norm": 0.6309840679168701,
"learning_rate": 5.325960678575498e-06,
"loss": 87.997,
"step": 303
},
{
"epoch": 1.7175141242937855,
"grad_norm": 0.6771166920661926,
"learning_rate": 5.122754734472496e-06,
"loss": 87.9336,
"step": 304
},
{
"epoch": 1.7231638418079096,
"grad_norm": 0.7140227556228638,
"learning_rate": 4.92329167147898e-06,
"loss": 87.9128,
"step": 305
},
{
"epoch": 1.7288135593220337,
"grad_norm": 0.7067782282829285,
"learning_rate": 4.727588125342669e-06,
"loss": 88.008,
"step": 306
},
{
"epoch": 1.7344632768361583,
"grad_norm": 0.746557891368866,
"learning_rate": 4.535660418257631e-06,
"loss": 87.9927,
"step": 307
},
{
"epoch": 1.7401129943502824,
"grad_norm": 0.8619289994239807,
"learning_rate": 4.3475245575029185e-06,
"loss": 87.8356,
"step": 308
},
{
"epoch": 1.7457627118644068,
"grad_norm": 1.2444101572036743,
"learning_rate": 4.163196234107603e-06,
"loss": 87.9559,
"step": 309
},
{
"epoch": 1.7514124293785311,
"grad_norm": 0.6213842034339905,
"learning_rate": 3.982690821542035e-06,
"loss": 88.1411,
"step": 310
},
{
"epoch": 1.7570621468926553,
"grad_norm": 0.5701696872711182,
"learning_rate": 3.8060233744356633e-06,
"loss": 88.1261,
"step": 311
},
{
"epoch": 1.7627118644067796,
"grad_norm": 0.5259557366371155,
"learning_rate": 3.6332086273214827e-06,
"loss": 88.1058,
"step": 312
},
{
"epoch": 1.768361581920904,
"grad_norm": 0.561783492565155,
"learning_rate": 3.464260993407098e-06,
"loss": 87.9936,
"step": 313
},
{
"epoch": 1.774011299435028,
"grad_norm": 0.5012845993041992,
"learning_rate": 3.299194563372604e-06,
"loss": 88.0487,
"step": 314
},
{
"epoch": 1.7796610169491527,
"grad_norm": 0.5462010502815247,
"learning_rate": 3.1380231041954366e-06,
"loss": 87.9775,
"step": 315
},
{
"epoch": 1.7853107344632768,
"grad_norm": 0.5348891615867615,
"learning_rate": 2.9807600580021634e-06,
"loss": 88.0445,
"step": 316
},
{
"epoch": 1.7909604519774012,
"grad_norm": 0.5476884245872498,
"learning_rate": 2.827418540947313e-06,
"loss": 87.9698,
"step": 317
},
{
"epoch": 1.7966101694915255,
"grad_norm": 0.5374354124069214,
"learning_rate": 2.6780113421195298e-06,
"loss": 87.9377,
"step": 318
},
{
"epoch": 1.8022598870056497,
"grad_norm": 0.5628228783607483,
"learning_rate": 2.532550922474897e-06,
"loss": 87.8776,
"step": 319
},
{
"epoch": 1.807909604519774,
"grad_norm": 0.5416897535324097,
"learning_rate": 2.3910494137976523e-06,
"loss": 87.9337,
"step": 320
},
{
"epoch": 1.8135593220338984,
"grad_norm": 0.590156614780426,
"learning_rate": 2.253518617688377e-06,
"loss": 87.9672,
"step": 321
},
{
"epoch": 1.8192090395480225,
"grad_norm": 0.5673385858535767,
"learning_rate": 2.1199700045797077e-06,
"loss": 87.9425,
"step": 322
},
{
"epoch": 1.8248587570621468,
"grad_norm": 0.5306013226509094,
"learning_rate": 1.9904147127796646e-06,
"loss": 87.9447,
"step": 323
},
{
"epoch": 1.8305084745762712,
"grad_norm": 0.5657864809036255,
"learning_rate": 1.864863547542711e-06,
"loss": 87.9126,
"step": 324
},
{
"epoch": 1.8361581920903953,
"grad_norm": 0.5366300344467163,
"learning_rate": 1.7433269801685303e-06,
"loss": 87.9557,
"step": 325
},
{
"epoch": 1.84180790960452,
"grad_norm": 0.5389237403869629,
"learning_rate": 1.6258151471287396e-06,
"loss": 87.9312,
"step": 326
},
{
"epoch": 1.847457627118644,
"grad_norm": 0.5478444695472717,
"learning_rate": 1.5123378492214291e-06,
"loss": 87.9573,
"step": 327
},
{
"epoch": 1.8531073446327684,
"grad_norm": 0.5147649645805359,
"learning_rate": 1.4029045507537697e-06,
"loss": 88.0396,
"step": 328
},
{
"epoch": 1.8587570621468927,
"grad_norm": 0.5677837133407593,
"learning_rate": 1.297524378752696e-06,
"loss": 88.0318,
"step": 329
},
{
"epoch": 1.8644067796610169,
"grad_norm": 0.5549485087394714,
"learning_rate": 1.196206122203647e-06,
"loss": 88.005,
"step": 330
},
{
"epoch": 1.8700564971751412,
"grad_norm": 0.5245904922485352,
"learning_rate": 1.0989582313175374e-06,
"loss": 88.0128,
"step": 331
},
{
"epoch": 1.8757062146892656,
"grad_norm": 0.5384089946746826,
"learning_rate": 1.005788816826031e-06,
"loss": 87.8896,
"step": 332
},
{
"epoch": 1.8813559322033897,
"grad_norm": 0.5512574911117554,
"learning_rate": 9.167056493050496e-07,
"loss": 87.996,
"step": 333
},
{
"epoch": 1.8870056497175143,
"grad_norm": 0.6043868064880371,
"learning_rate": 8.317161585266964e-07,
"loss": 87.9643,
"step": 334
},
{
"epoch": 1.8926553672316384,
"grad_norm": 0.617608368396759,
"learning_rate": 7.508274328395848e-07,
"loss": 87.9645,
"step": 335
},
{
"epoch": 1.8983050847457628,
"grad_norm": 0.5591124892234802,
"learning_rate": 6.74046218577673e-07,
"loss": 87.9943,
"step": 336
},
{
"epoch": 1.9039548022598871,
"grad_norm": 0.5672664046287537,
"learning_rate": 6.013789194975749e-07,
"loss": 87.9743,
"step": 337
},
{
"epoch": 1.9096045197740112,
"grad_norm": 0.5669750571250916,
"learning_rate": 5.328315962444874e-07,
"loss": 87.8982,
"step": 338
},
{
"epoch": 1.9152542372881356,
"grad_norm": 0.5600360035896301,
"learning_rate": 4.684099658467223e-07,
"loss": 87.8697,
"step": 339
},
{
"epoch": 1.92090395480226,
"grad_norm": 0.5569552183151245,
"learning_rate": 4.0811940123886004e-07,
"loss": 88.0418,
"step": 340
},
{
"epoch": 1.926553672316384,
"grad_norm": 0.549552857875824,
"learning_rate": 3.5196493081366967e-07,
"loss": 87.9701,
"step": 341
},
{
"epoch": 1.9322033898305084,
"grad_norm": 0.6111685037612915,
"learning_rate": 2.9995123800270476e-07,
"loss": 88.1107,
"step": 342
},
{
"epoch": 1.9378531073446328,
"grad_norm": 0.6287999153137207,
"learning_rate": 2.5208266088569966e-07,
"loss": 87.9764,
"step": 343
},
{
"epoch": 1.943502824858757,
"grad_norm": 0.5528276562690735,
"learning_rate": 2.083631918287643e-07,
"loss": 87.9886,
"step": 344
},
{
"epoch": 1.9491525423728815,
"grad_norm": 0.6393068432807922,
"learning_rate": 1.6879647715140611e-07,
"loss": 88.0424,
"step": 345
},
{
"epoch": 1.9548022598870056,
"grad_norm": 0.6431833505630493,
"learning_rate": 1.333858168224178e-07,
"loss": 88.0173,
"step": 346
},
{
"epoch": 1.96045197740113,
"grad_norm": 0.6915642023086548,
"learning_rate": 1.0213416418465294e-07,
"loss": 87.9516,
"step": 347
},
{
"epoch": 1.9661016949152543,
"grad_norm": 0.6479020714759827,
"learning_rate": 7.5044125708712e-08,
"loss": 87.9935,
"step": 348
},
{
"epoch": 1.9717514124293785,
"grad_norm": 0.7263698577880859,
"learning_rate": 5.2117960775543986e-08,
"loss": 87.9126,
"step": 349
},
{
"epoch": 1.9774011299435028,
"grad_norm": 0.6763975024223328,
"learning_rate": 3.3357581488030475e-08,
"loss": 87.993,
"step": 350
},
{
"epoch": 1.9830508474576272,
"grad_norm": 0.8059737682342529,
"learning_rate": 1.8764552511485457e-08,
"loss": 88.038,
"step": 351
},
{
"epoch": 1.9887005649717513,
"grad_norm": 0.7300512790679932,
"learning_rate": 8.340090943176338e-09,
"loss": 88.0236,
"step": 352
},
{
"epoch": 1.9943502824858759,
"grad_norm": 0.9957327842712402,
"learning_rate": 2.0850662108051755e-09,
"loss": 87.9029,
"step": 353
},
{
"epoch": 2.0,
"grad_norm": 0.6088115572929382,
"learning_rate": 0.0,
"loss": 87.8932,
"step": 354
},
{
"epoch": 2.0,
"eval_loss": 10.999165534973145,
"eval_runtime": 0.674,
"eval_samples_per_second": 442.111,
"eval_steps_per_second": 111.27,
"step": 354
}
],
"logging_steps": 1,
"max_steps": 354,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3718069616640.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}