{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 354, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005649717514124294, "grad_norm": 0.5776382684707642, "learning_rate": 1e-05, "loss": 88.641, "step": 1 }, { "epoch": 0.011299435028248588, "grad_norm": 0.5559790730476379, "learning_rate": 2e-05, "loss": 88.6713, "step": 2 }, { "epoch": 0.01694915254237288, "grad_norm": 0.5598555207252502, "learning_rate": 3e-05, "loss": 88.7191, "step": 3 }, { "epoch": 0.022598870056497175, "grad_norm": 0.5563465356826782, "learning_rate": 4e-05, "loss": 88.6407, "step": 4 }, { "epoch": 0.02824858757062147, "grad_norm": 0.6338562965393066, "learning_rate": 5e-05, "loss": 88.64, "step": 5 }, { "epoch": 0.03389830508474576, "grad_norm": 0.5904527902603149, "learning_rate": 6e-05, "loss": 88.6842, "step": 6 }, { "epoch": 0.03954802259887006, "grad_norm": 0.629975438117981, "learning_rate": 7e-05, "loss": 88.6572, "step": 7 }, { "epoch": 0.04519774011299435, "grad_norm": 0.6180629730224609, "learning_rate": 8e-05, "loss": 88.6373, "step": 8 }, { "epoch": 0.05084745762711865, "grad_norm": 0.6689253449440002, "learning_rate": 9e-05, "loss": 88.5833, "step": 9 }, { "epoch": 0.05649717514124294, "grad_norm": 0.676450788974762, "learning_rate": 0.0001, "loss": 88.6067, "step": 10 }, { "epoch": 0.062146892655367235, "grad_norm": 0.6853811740875244, "learning_rate": 9.999791493378921e-05, "loss": 88.6227, "step": 11 }, { "epoch": 0.06779661016949153, "grad_norm": 0.6194440126419067, "learning_rate": 9.999165990905683e-05, "loss": 88.6231, "step": 12 }, { "epoch": 0.07344632768361582, "grad_norm": 0.6684823632240295, "learning_rate": 9.998123544748852e-05, "loss": 88.6185, "step": 13 }, { "epoch": 0.07909604519774012, "grad_norm": 0.6482928991317749, "learning_rate": 9.996664241851197e-05, "loss": 88.6342, "step": 14 }, { "epoch": 0.0847457627118644, "grad_norm": 0.7201379537582397, "learning_rate": 9.994788203922447e-05, "loss": 88.5714, "step": 15 }, { "epoch": 0.0903954802259887, "grad_norm": 0.6155017614364624, "learning_rate": 9.992495587429129e-05, "loss": 88.5541, "step": 16 }, { "epoch": 0.096045197740113, "grad_norm": 0.6636160016059875, "learning_rate": 9.989786583581535e-05, "loss": 88.5598, "step": 17 }, { "epoch": 0.1016949152542373, "grad_norm": 0.6538222432136536, "learning_rate": 9.986661418317759e-05, "loss": 88.5863, "step": 18 }, { "epoch": 0.10734463276836158, "grad_norm": 0.5915122628211975, "learning_rate": 9.98312035228486e-05, "loss": 88.6102, "step": 19 }, { "epoch": 0.11299435028248588, "grad_norm": 0.6704850792884827, "learning_rate": 9.979163680817124e-05, "loss": 88.5026, "step": 20 }, { "epoch": 0.11864406779661017, "grad_norm": 0.6887699961662292, "learning_rate": 9.97479173391143e-05, "loss": 88.5315, "step": 21 }, { "epoch": 0.12429378531073447, "grad_norm": 0.7467436790466309, "learning_rate": 9.97000487619973e-05, "loss": 88.5097, "step": 22 }, { "epoch": 0.12994350282485875, "grad_norm": 0.750785231590271, "learning_rate": 9.964803506918634e-05, "loss": 88.5409, "step": 23 }, { "epoch": 0.13559322033898305, "grad_norm": 0.7216200828552246, "learning_rate": 9.959188059876115e-05, "loss": 88.4787, "step": 24 }, { "epoch": 0.14124293785310735, "grad_norm": 0.7141463160514832, "learning_rate": 9.953159003415328e-05, "loss": 88.5226, "step": 25 }, { "epoch": 0.14689265536723164, "grad_norm": 0.7239276766777039, "learning_rate": 9.946716840375551e-05, "loss": 88.4626, "step": 26 }, { "epoch": 0.15254237288135594, "grad_norm": 0.7370103001594543, "learning_rate": 9.939862108050243e-05, "loss": 88.5769, "step": 27 }, { "epoch": 0.15819209039548024, "grad_norm": 0.6700708270072937, "learning_rate": 9.932595378142233e-05, "loss": 88.558, "step": 28 }, { "epoch": 0.1638418079096045, "grad_norm": 0.7646386027336121, "learning_rate": 9.924917256716042e-05, "loss": 88.4619, "step": 29 }, { "epoch": 0.1694915254237288, "grad_norm": 0.6728429198265076, "learning_rate": 9.916828384147331e-05, "loss": 88.5065, "step": 30 }, { "epoch": 0.1751412429378531, "grad_norm": 0.7492191195487976, "learning_rate": 9.908329435069495e-05, "loss": 88.4889, "step": 31 }, { "epoch": 0.1807909604519774, "grad_norm": 0.7224460244178772, "learning_rate": 9.899421118317398e-05, "loss": 88.4961, "step": 32 }, { "epoch": 0.1864406779661017, "grad_norm": 0.7774916887283325, "learning_rate": 9.890104176868247e-05, "loss": 88.6111, "step": 33 }, { "epoch": 0.192090395480226, "grad_norm": 0.7639403939247131, "learning_rate": 9.880379387779637e-05, "loss": 88.4938, "step": 34 }, { "epoch": 0.1977401129943503, "grad_norm": 0.683748185634613, "learning_rate": 9.87024756212473e-05, "loss": 88.5265, "step": 35 }, { "epoch": 0.2033898305084746, "grad_norm": 0.7794637680053711, "learning_rate": 9.859709544924624e-05, "loss": 88.5168, "step": 36 }, { "epoch": 0.20903954802259886, "grad_norm": 0.7606369256973267, "learning_rate": 9.848766215077858e-05, "loss": 88.4687, "step": 37 }, { "epoch": 0.21468926553672316, "grad_norm": 0.7200310826301575, "learning_rate": 9.837418485287127e-05, "loss": 88.494, "step": 38 }, { "epoch": 0.22033898305084745, "grad_norm": 0.804512619972229, "learning_rate": 9.825667301983148e-05, "loss": 88.4699, "step": 39 }, { "epoch": 0.22598870056497175, "grad_norm": 0.7734663486480713, "learning_rate": 9.813513645245729e-05, "loss": 88.4918, "step": 40 }, { "epoch": 0.23163841807909605, "grad_norm": 0.8824536204338074, "learning_rate": 9.800958528722036e-05, "loss": 88.5269, "step": 41 }, { "epoch": 0.23728813559322035, "grad_norm": 0.8647343516349792, "learning_rate": 9.78800299954203e-05, "loss": 88.4415, "step": 42 }, { "epoch": 0.24293785310734464, "grad_norm": 0.8488788604736328, "learning_rate": 9.774648138231163e-05, "loss": 88.5083, "step": 43 }, { "epoch": 0.24858757062146894, "grad_norm": 1.1741865873336792, "learning_rate": 9.760895058620235e-05, "loss": 88.4342, "step": 44 }, { "epoch": 0.2542372881355932, "grad_norm": 0.6116179823875427, "learning_rate": 9.746744907752509e-05, "loss": 88.5531, "step": 45 }, { "epoch": 0.2598870056497175, "grad_norm": 0.6405426859855652, "learning_rate": 9.732198865788047e-05, "loss": 88.5101, "step": 46 }, { "epoch": 0.2655367231638418, "grad_norm": 0.6234097480773926, "learning_rate": 9.71725814590527e-05, "loss": 88.5438, "step": 47 }, { "epoch": 0.2711864406779661, "grad_norm": 0.6477924585342407, "learning_rate": 9.701923994199784e-05, "loss": 88.4566, "step": 48 }, { "epoch": 0.2768361581920904, "grad_norm": 0.5889535546302795, "learning_rate": 9.686197689580456e-05, "loss": 88.515, "step": 49 }, { "epoch": 0.2824858757062147, "grad_norm": 0.7161465883255005, "learning_rate": 9.67008054366274e-05, "loss": 88.4243, "step": 50 }, { "epoch": 0.288135593220339, "grad_norm": 0.6408765316009521, "learning_rate": 9.653573900659292e-05, "loss": 88.4917, "step": 51 }, { "epoch": 0.2937853107344633, "grad_norm": 0.7543685436248779, "learning_rate": 9.636679137267852e-05, "loss": 88.443, "step": 52 }, { "epoch": 0.2994350282485876, "grad_norm": 0.71647709608078, "learning_rate": 9.619397662556435e-05, "loss": 88.4699, "step": 53 }, { "epoch": 0.3050847457627119, "grad_norm": 0.7122645974159241, "learning_rate": 9.601730917845797e-05, "loss": 88.401, "step": 54 }, { "epoch": 0.3107344632768362, "grad_norm": 0.7030540704727173, "learning_rate": 9.583680376589241e-05, "loss": 88.4259, "step": 55 }, { "epoch": 0.3163841807909605, "grad_norm": 0.7593639492988586, "learning_rate": 9.56524754424971e-05, "loss": 88.4487, "step": 56 }, { "epoch": 0.3220338983050847, "grad_norm": 0.7722213268280029, "learning_rate": 9.546433958174238e-05, "loss": 88.43, "step": 57 }, { "epoch": 0.327683615819209, "grad_norm": 0.7288438677787781, "learning_rate": 9.527241187465734e-05, "loss": 88.3441, "step": 58 }, { "epoch": 0.3333333333333333, "grad_norm": 0.7370777130126953, "learning_rate": 9.507670832852102e-05, "loss": 88.4693, "step": 59 }, { "epoch": 0.3389830508474576, "grad_norm": 0.6613349318504333, "learning_rate": 9.487724526552753e-05, "loss": 88.3618, "step": 60 }, { "epoch": 0.3446327683615819, "grad_norm": 0.6312256455421448, "learning_rate": 9.467403932142452e-05, "loss": 88.3936, "step": 61 }, { "epoch": 0.3502824858757062, "grad_norm": 0.7118995189666748, "learning_rate": 9.446710744412595e-05, "loss": 88.4331, "step": 62 }, { "epoch": 0.3559322033898305, "grad_norm": 0.6945487260818481, "learning_rate": 9.425646689229842e-05, "loss": 88.3982, "step": 63 }, { "epoch": 0.3615819209039548, "grad_norm": 0.617613673210144, "learning_rate": 9.404213523392183e-05, "loss": 88.4616, "step": 64 }, { "epoch": 0.3672316384180791, "grad_norm": 0.6954861879348755, "learning_rate": 9.38241303448241e-05, "loss": 88.3863, "step": 65 }, { "epoch": 0.3728813559322034, "grad_norm": 0.6992458701133728, "learning_rate": 9.360247040719039e-05, "loss": 88.3965, "step": 66 }, { "epoch": 0.3785310734463277, "grad_norm": 0.6800311207771301, "learning_rate": 9.337717390804652e-05, "loss": 88.3868, "step": 67 }, { "epoch": 0.384180790960452, "grad_norm": 0.7097700238227844, "learning_rate": 9.314825963771723e-05, "loss": 88.3632, "step": 68 }, { "epoch": 0.3898305084745763, "grad_norm": 0.7166731953620911, "learning_rate": 9.29157466882589e-05, "loss": 88.3418, "step": 69 }, { "epoch": 0.3954802259887006, "grad_norm": 0.7072133421897888, "learning_rate": 9.267965445186733e-05, "loss": 88.3753, "step": 70 }, { "epoch": 0.4011299435028249, "grad_norm": 0.7065291404724121, "learning_rate": 9.24400026192603e-05, "loss": 88.3499, "step": 71 }, { "epoch": 0.4067796610169492, "grad_norm": 0.8113076090812683, "learning_rate": 9.219681117803536e-05, "loss": 88.3214, "step": 72 }, { "epoch": 0.4124293785310734, "grad_norm": 0.7646364569664001, "learning_rate": 9.195010041100275e-05, "loss": 88.3064, "step": 73 }, { "epoch": 0.4180790960451977, "grad_norm": 0.756248950958252, "learning_rate": 9.16998908944939e-05, "loss": 88.3968, "step": 74 }, { "epoch": 0.423728813559322, "grad_norm": 0.805596113204956, "learning_rate": 9.14462034966451e-05, "loss": 88.3718, "step": 75 }, { "epoch": 0.4293785310734463, "grad_norm": 0.6950631737709045, "learning_rate": 9.118905937565722e-05, "loss": 88.3981, "step": 76 }, { "epoch": 0.4350282485875706, "grad_norm": 0.7553988695144653, "learning_rate": 9.092847997803097e-05, "loss": 88.3488, "step": 77 }, { "epoch": 0.4406779661016949, "grad_norm": 0.7791416645050049, "learning_rate": 9.066448703677828e-05, "loss": 88.4197, "step": 78 }, { "epoch": 0.4463276836158192, "grad_norm": 0.7216358184814453, "learning_rate": 9.039710256960957e-05, "loss": 88.3813, "step": 79 }, { "epoch": 0.4519774011299435, "grad_norm": 0.8204500079154968, "learning_rate": 9.012634887709754e-05, "loss": 88.2346, "step": 80 }, { "epoch": 0.4576271186440678, "grad_norm": 0.843241274356842, "learning_rate": 8.985224854081726e-05, "loss": 88.4055, "step": 81 }, { "epoch": 0.4632768361581921, "grad_norm": 0.7781232595443726, "learning_rate": 8.957482442146272e-05, "loss": 88.372, "step": 82 }, { "epoch": 0.4689265536723164, "grad_norm": 0.8177736401557922, "learning_rate": 8.929409965694016e-05, "loss": 88.3409, "step": 83 }, { "epoch": 0.4745762711864407, "grad_norm": 0.8297736644744873, "learning_rate": 8.901009766043847e-05, "loss": 88.3283, "step": 84 }, { "epoch": 0.480225988700565, "grad_norm": 0.8678383231163025, "learning_rate": 8.872284211847629e-05, "loss": 88.3416, "step": 85 }, { "epoch": 0.4858757062146893, "grad_norm": 0.8668798804283142, "learning_rate": 8.84323569889266e-05, "loss": 88.2483, "step": 86 }, { "epoch": 0.4915254237288136, "grad_norm": 0.9378306269645691, "learning_rate": 8.813866649901856e-05, "loss": 88.3618, "step": 87 }, { "epoch": 0.4971751412429379, "grad_norm": 1.1833090782165527, "learning_rate": 8.784179514331682e-05, "loss": 88.2735, "step": 88 }, { "epoch": 0.5028248587570622, "grad_norm": 0.6026833057403564, "learning_rate": 8.75417676816787e-05, "loss": 88.4478, "step": 89 }, { "epoch": 0.5084745762711864, "grad_norm": 0.6131640672683716, "learning_rate": 8.72386091371891e-05, "loss": 88.4419, "step": 90 }, { "epoch": 0.5141242937853108, "grad_norm": 0.6138200759887695, "learning_rate": 8.693234479407353e-05, "loss": 88.4078, "step": 91 }, { "epoch": 0.519774011299435, "grad_norm": 0.6851401329040527, "learning_rate": 8.662300019558931e-05, "loss": 88.3743, "step": 92 }, { "epoch": 0.5254237288135594, "grad_norm": 0.6595472693443298, "learning_rate": 8.631060114189525e-05, "loss": 88.387, "step": 93 }, { "epoch": 0.5310734463276836, "grad_norm": 0.6642423272132874, "learning_rate": 8.59951736878998e-05, "loss": 88.2821, "step": 94 }, { "epoch": 0.536723163841808, "grad_norm": 0.7108228206634521, "learning_rate": 8.5676744141088e-05, "loss": 88.2668, "step": 95 }, { "epoch": 0.5423728813559322, "grad_norm": 0.7303377985954285, "learning_rate": 8.535533905932738e-05, "loss": 88.2737, "step": 96 }, { "epoch": 0.5480225988700564, "grad_norm": 0.6971437931060791, "learning_rate": 8.503098524865301e-05, "loss": 88.342, "step": 97 }, { "epoch": 0.5536723163841808, "grad_norm": 0.7270698547363281, "learning_rate": 8.47037097610317e-05, "loss": 88.2374, "step": 98 }, { "epoch": 0.559322033898305, "grad_norm": 0.6642720103263855, "learning_rate": 8.43735398921059e-05, "loss": 88.2445, "step": 99 }, { "epoch": 0.5649717514124294, "grad_norm": 0.6921072602272034, "learning_rate": 8.404050317891711e-05, "loss": 88.3058, "step": 100 }, { "epoch": 0.5706214689265536, "grad_norm": 0.7314338088035583, "learning_rate": 8.370462739760923e-05, "loss": 88.1885, "step": 101 }, { "epoch": 0.576271186440678, "grad_norm": 0.7479543089866638, "learning_rate": 8.336594056111197e-05, "loss": 88.3319, "step": 102 }, { "epoch": 0.5819209039548022, "grad_norm": 0.6370671987533569, "learning_rate": 8.30244709168045e-05, "loss": 88.3088, "step": 103 }, { "epoch": 0.5875706214689266, "grad_norm": 0.68263179063797, "learning_rate": 8.268024694415947e-05, "loss": 88.216, "step": 104 }, { "epoch": 0.5932203389830508, "grad_norm": 0.7159000635147095, "learning_rate": 8.233329735236789e-05, "loss": 88.2724, "step": 105 }, { "epoch": 0.5988700564971752, "grad_norm": 0.6509236097335815, "learning_rate": 8.198365107794457e-05, "loss": 88.2867, "step": 106 }, { "epoch": 0.6045197740112994, "grad_norm": 0.6526631712913513, "learning_rate": 8.163133728231482e-05, "loss": 88.3246, "step": 107 }, { "epoch": 0.6101694915254238, "grad_norm": 0.7395955920219421, "learning_rate": 8.127638534938227e-05, "loss": 88.276, "step": 108 }, { "epoch": 0.615819209039548, "grad_norm": 0.6584829688072205, "learning_rate": 8.09188248830782e-05, "loss": 88.1927, "step": 109 }, { "epoch": 0.6214689265536724, "grad_norm": 0.8070241808891296, "learning_rate": 8.055868570489247e-05, "loss": 88.1449, "step": 110 }, { "epoch": 0.6271186440677966, "grad_norm": 0.6944290995597839, "learning_rate": 8.019599785138635e-05, "loss": 88.2262, "step": 111 }, { "epoch": 0.632768361581921, "grad_norm": 0.7627915740013123, "learning_rate": 7.983079157168736e-05, "loss": 88.2257, "step": 112 }, { "epoch": 0.6384180790960452, "grad_norm": 0.7771816849708557, "learning_rate": 7.946309732496647e-05, "loss": 88.211, "step": 113 }, { "epoch": 0.6440677966101694, "grad_norm": 0.766257107257843, "learning_rate": 7.909294577789766e-05, "loss": 88.1717, "step": 114 }, { "epoch": 0.6497175141242938, "grad_norm": 0.7782958149909973, "learning_rate": 7.872036780210026e-05, "loss": 88.15, "step": 115 }, { "epoch": 0.655367231638418, "grad_norm": 0.7616683840751648, "learning_rate": 7.834539447156424e-05, "loss": 88.1597, "step": 116 }, { "epoch": 0.6610169491525424, "grad_norm": 0.6816295981407166, "learning_rate": 7.796805706005843e-05, "loss": 88.1752, "step": 117 }, { "epoch": 0.6666666666666666, "grad_norm": 0.7385973334312439, "learning_rate": 7.75883870385223e-05, "loss": 88.1828, "step": 118 }, { "epoch": 0.672316384180791, "grad_norm": 0.7695619463920593, "learning_rate": 7.72064160724412e-05, "loss": 88.2801, "step": 119 }, { "epoch": 0.6779661016949152, "grad_norm": 0.7640277147293091, "learning_rate": 7.682217601920529e-05, "loss": 88.2095, "step": 120 }, { "epoch": 0.6836158192090396, "grad_norm": 0.7877722382545471, "learning_rate": 7.643569892545267e-05, "loss": 88.16, "step": 121 }, { "epoch": 0.6892655367231638, "grad_norm": 0.7659584283828735, "learning_rate": 7.604701702439651e-05, "loss": 88.1962, "step": 122 }, { "epoch": 0.6949152542372882, "grad_norm": 0.8098394870758057, "learning_rate": 7.565616273313678e-05, "loss": 88.1656, "step": 123 }, { "epoch": 0.7005649717514124, "grad_norm": 0.7991154193878174, "learning_rate": 7.526316864995647e-05, "loss": 88.1524, "step": 124 }, { "epoch": 0.7062146892655368, "grad_norm": 0.7656694650650024, "learning_rate": 7.486806755160297e-05, "loss": 88.1187, "step": 125 }, { "epoch": 0.711864406779661, "grad_norm": 0.7287772297859192, "learning_rate": 7.447089239055428e-05, "loss": 88.1673, "step": 126 }, { "epoch": 0.7175141242937854, "grad_norm": 0.8145878911018372, "learning_rate": 7.407167629227072e-05, "loss": 88.1032, "step": 127 }, { "epoch": 0.7231638418079096, "grad_norm": 0.857018768787384, "learning_rate": 7.367045255243216e-05, "loss": 88.0358, "step": 128 }, { "epoch": 0.7288135593220338, "grad_norm": 0.8250066041946411, "learning_rate": 7.326725463416117e-05, "loss": 88.2404, "step": 129 }, { "epoch": 0.7344632768361582, "grad_norm": 0.9477486610412598, "learning_rate": 7.286211616523193e-05, "loss": 88.2214, "step": 130 }, { "epoch": 0.7401129943502824, "grad_norm": 0.971202552318573, "learning_rate": 7.245507093526574e-05, "loss": 88.214, "step": 131 }, { "epoch": 0.7457627118644068, "grad_norm": 1.0824439525604248, "learning_rate": 7.204615289291283e-05, "loss": 88.1631, "step": 132 }, { "epoch": 0.751412429378531, "grad_norm": 0.7381979823112488, "learning_rate": 7.163539614302088e-05, "loss": 88.2446, "step": 133 }, { "epoch": 0.7570621468926554, "grad_norm": 0.6837661266326904, "learning_rate": 7.122283494379076e-05, "loss": 88.2198, "step": 134 }, { "epoch": 0.7627118644067796, "grad_norm": 0.6421964764595032, "learning_rate": 7.080850370391907e-05, "loss": 88.2727, "step": 135 }, { "epoch": 0.768361581920904, "grad_norm": 0.6602257490158081, "learning_rate": 7.039243697972857e-05, "loss": 88.1771, "step": 136 }, { "epoch": 0.7740112994350282, "grad_norm": 0.7069459557533264, "learning_rate": 6.997466947228596e-05, "loss": 88.1327, "step": 137 }, { "epoch": 0.7796610169491526, "grad_norm": 0.6446311473846436, "learning_rate": 6.95552360245078e-05, "loss": 88.2014, "step": 138 }, { "epoch": 0.7853107344632768, "grad_norm": 0.6669993996620178, "learning_rate": 6.91341716182545e-05, "loss": 88.1836, "step": 139 }, { "epoch": 0.7909604519774012, "grad_norm": 0.7162554264068604, "learning_rate": 6.871151137141266e-05, "loss": 88.0806, "step": 140 }, { "epoch": 0.7966101694915254, "grad_norm": 0.6205349564552307, "learning_rate": 6.828729053496629e-05, "loss": 88.1998, "step": 141 }, { "epoch": 0.8022598870056498, "grad_norm": 0.6284624934196472, "learning_rate": 6.786154449005665e-05, "loss": 88.0922, "step": 142 }, { "epoch": 0.807909604519774, "grad_norm": 0.6776576638221741, "learning_rate": 6.743430874503143e-05, "loss": 88.104, "step": 143 }, { "epoch": 0.8135593220338984, "grad_norm": 0.6751952767372131, "learning_rate": 6.700561893248332e-05, "loss": 88.1876, "step": 144 }, { "epoch": 0.8192090395480226, "grad_norm": 0.6897196769714355, "learning_rate": 6.6575510806278e-05, "loss": 88.1415, "step": 145 }, { "epoch": 0.8248587570621468, "grad_norm": 0.6813532114028931, "learning_rate": 6.614402023857232e-05, "loss": 88.0379, "step": 146 }, { "epoch": 0.8305084745762712, "grad_norm": 0.7052000164985657, "learning_rate": 6.57111832168224e-05, "loss": 88.0754, "step": 147 }, { "epoch": 0.8361581920903954, "grad_norm": 0.6268584132194519, "learning_rate": 6.527703584078218e-05, "loss": 88.0336, "step": 148 }, { "epoch": 0.8418079096045198, "grad_norm": 0.6451026201248169, "learning_rate": 6.484161431949267e-05, "loss": 88.1128, "step": 149 }, { "epoch": 0.847457627118644, "grad_norm": 0.6330875158309937, "learning_rate": 6.440495496826189e-05, "loss": 88.0844, "step": 150 }, { "epoch": 0.8531073446327684, "grad_norm": 0.5872354507446289, "learning_rate": 6.39670942056362e-05, "loss": 88.082, "step": 151 }, { "epoch": 0.8587570621468926, "grad_norm": 0.6449838280677795, "learning_rate": 6.352806855036287e-05, "loss": 88.0767, "step": 152 }, { "epoch": 0.864406779661017, "grad_norm": 0.6349008083343506, "learning_rate": 6.308791461834426e-05, "loss": 88.0604, "step": 153 }, { "epoch": 0.8700564971751412, "grad_norm": 0.6187208890914917, "learning_rate": 6.264666911958404e-05, "loss": 88.1468, "step": 154 }, { "epoch": 0.8757062146892656, "grad_norm": 0.6470857262611389, "learning_rate": 6.22043688551254e-05, "loss": 88.0885, "step": 155 }, { "epoch": 0.8813559322033898, "grad_norm": 0.6183845400810242, "learning_rate": 6.17610507139818e-05, "loss": 88.0303, "step": 156 }, { "epoch": 0.8870056497175142, "grad_norm": 0.6772004961967468, "learning_rate": 6.13167516700603e-05, "loss": 88.111, "step": 157 }, { "epoch": 0.8926553672316384, "grad_norm": 0.6499753594398499, "learning_rate": 6.0871508779077856e-05, "loss": 88.0266, "step": 158 }, { "epoch": 0.8983050847457628, "grad_norm": 0.6717754602432251, "learning_rate": 6.04253591754708e-05, "loss": 88.0712, "step": 159 }, { "epoch": 0.903954802259887, "grad_norm": 0.6724316477775574, "learning_rate": 5.9978340069297647e-05, "loss": 87.9934, "step": 160 }, { "epoch": 0.9096045197740112, "grad_norm": 0.6548104286193848, "learning_rate": 5.9530488743135746e-05, "loss": 88.0043, "step": 161 }, { "epoch": 0.9152542372881356, "grad_norm": 0.6596248745918274, "learning_rate": 5.908184254897182e-05, "loss": 88.0424, "step": 162 }, { "epoch": 0.9209039548022598, "grad_norm": 0.6875431537628174, "learning_rate": 5.863243890508668e-05, "loss": 88.0799, "step": 163 }, { "epoch": 0.9265536723163842, "grad_norm": 0.6975457668304443, "learning_rate": 5.8182315292934406e-05, "loss": 88.0774, "step": 164 }, { "epoch": 0.9322033898305084, "grad_norm": 0.6618639230728149, "learning_rate": 5.773150925401641e-05, "loss": 88.124, "step": 165 }, { "epoch": 0.9378531073446328, "grad_norm": 0.6853637099266052, "learning_rate": 5.7280058386750255e-05, "loss": 87.9972, "step": 166 }, { "epoch": 0.943502824858757, "grad_norm": 0.7230448126792908, "learning_rate": 5.68280003433339e-05, "loss": 87.9713, "step": 167 }, { "epoch": 0.9491525423728814, "grad_norm": 0.7414244413375854, "learning_rate": 5.63753728266054e-05, "loss": 88.0774, "step": 168 }, { "epoch": 0.9548022598870056, "grad_norm": 0.766106367111206, "learning_rate": 5.592221358689843e-05, "loss": 88.0479, "step": 169 }, { "epoch": 0.96045197740113, "grad_norm": 0.7827813029289246, "learning_rate": 5.546856041889373e-05, "loss": 88.1115, "step": 170 }, { "epoch": 0.9661016949152542, "grad_norm": 0.693665623664856, "learning_rate": 5.5014451158466975e-05, "loss": 88.0148, "step": 171 }, { "epoch": 0.9717514124293786, "grad_norm": 0.7761551737785339, "learning_rate": 5.4559923679533173e-05, "loss": 87.9896, "step": 172 }, { "epoch": 0.9774011299435028, "grad_norm": 0.7685063481330872, "learning_rate": 5.410501589088785e-05, "loss": 87.9791, "step": 173 }, { "epoch": 0.9830508474576272, "grad_norm": 0.8432329297065735, "learning_rate": 5.364976573304538e-05, "loss": 88.1228, "step": 174 }, { "epoch": 0.9887005649717514, "grad_norm": 0.9350976347923279, "learning_rate": 5.319421117507462e-05, "loss": 87.9292, "step": 175 }, { "epoch": 0.9943502824858758, "grad_norm": 1.2149473428726196, "learning_rate": 5.273839021143218e-05, "loss": 87.9655, "step": 176 }, { "epoch": 1.0, "grad_norm": 0.6291913986206055, "learning_rate": 5.22823408587937e-05, "loss": 88.1277, "step": 177 }, { "epoch": 1.0, "eval_loss": 11.00737476348877, "eval_runtime": 0.6468, "eval_samples_per_second": 460.76, "eval_steps_per_second": 115.963, "step": 177 }, { "epoch": 1.0056497175141244, "grad_norm": 0.6629582047462463, "learning_rate": 5.182610115288295e-05, "loss": 88.1835, "step": 178 }, { "epoch": 1.0112994350282485, "grad_norm": 0.6663825511932373, "learning_rate": 5.136970914529975e-05, "loss": 88.162, "step": 179 }, { "epoch": 1.0169491525423728, "grad_norm": 0.6589259505271912, "learning_rate": 5.091320290034625e-05, "loss": 88.1961, "step": 180 }, { "epoch": 1.0225988700564972, "grad_norm": 0.5902798175811768, "learning_rate": 5.045662049185229e-05, "loss": 88.0916, "step": 181 }, { "epoch": 1.0282485875706215, "grad_norm": 0.5283370018005371, "learning_rate": 5e-05, "loss": 88.119, "step": 182 }, { "epoch": 1.0338983050847457, "grad_norm": 0.5795127749443054, "learning_rate": 4.9543379508147716e-05, "loss": 88.1366, "step": 183 }, { "epoch": 1.03954802259887, "grad_norm": 0.6025692820549011, "learning_rate": 4.9086797099653756e-05, "loss": 88.0548, "step": 184 }, { "epoch": 1.0451977401129944, "grad_norm": 0.5939972996711731, "learning_rate": 4.863029085470026e-05, "loss": 87.9877, "step": 185 }, { "epoch": 1.0508474576271187, "grad_norm": 0.5899671316146851, "learning_rate": 4.817389884711705e-05, "loss": 88.0826, "step": 186 }, { "epoch": 1.0564971751412429, "grad_norm": 0.5887870788574219, "learning_rate": 4.771765914120631e-05, "loss": 88.1155, "step": 187 }, { "epoch": 1.0621468926553672, "grad_norm": 0.5916518568992615, "learning_rate": 4.726160978856782e-05, "loss": 88.0002, "step": 188 }, { "epoch": 1.0677966101694916, "grad_norm": 0.6661847233772278, "learning_rate": 4.6805788824925386e-05, "loss": 88.0365, "step": 189 }, { "epoch": 1.073446327683616, "grad_norm": 0.5642584562301636, "learning_rate": 4.6350234266954626e-05, "loss": 88.0447, "step": 190 }, { "epoch": 1.07909604519774, "grad_norm": 0.5734567046165466, "learning_rate": 4.589498410911215e-05, "loss": 88.0759, "step": 191 }, { "epoch": 1.0847457627118644, "grad_norm": 0.5504806041717529, "learning_rate": 4.5440076320466825e-05, "loss": 88.0262, "step": 192 }, { "epoch": 1.0903954802259888, "grad_norm": 0.5161648392677307, "learning_rate": 4.4985548841533036e-05, "loss": 88.0113, "step": 193 }, { "epoch": 1.0960451977401129, "grad_norm": 0.5934569239616394, "learning_rate": 4.4531439581106295e-05, "loss": 88.1235, "step": 194 }, { "epoch": 1.1016949152542372, "grad_norm": 0.5542126297950745, "learning_rate": 4.4077786413101595e-05, "loss": 88.0276, "step": 195 }, { "epoch": 1.1073446327683616, "grad_norm": 0.6331348419189453, "learning_rate": 4.362462717339461e-05, "loss": 88.0501, "step": 196 }, { "epoch": 1.112994350282486, "grad_norm": 0.5662806630134583, "learning_rate": 4.317199965666613e-05, "loss": 88.0157, "step": 197 }, { "epoch": 1.11864406779661, "grad_norm": 0.5534279942512512, "learning_rate": 4.271994161324977e-05, "loss": 87.9114, "step": 198 }, { "epoch": 1.1242937853107344, "grad_norm": 0.5692214369773865, "learning_rate": 4.22684907459836e-05, "loss": 87.9938, "step": 199 }, { "epoch": 1.1299435028248588, "grad_norm": 0.6049486994743347, "learning_rate": 4.1817684707065605e-05, "loss": 87.9819, "step": 200 }, { "epoch": 1.1355932203389831, "grad_norm": 0.5856545567512512, "learning_rate": 4.1367561094913335e-05, "loss": 87.9492, "step": 201 }, { "epoch": 1.1412429378531073, "grad_norm": 0.5690354704856873, "learning_rate": 4.0918157451028185e-05, "loss": 87.9628, "step": 202 }, { "epoch": 1.1468926553672316, "grad_norm": 0.6273289322853088, "learning_rate": 4.0469511256864265e-05, "loss": 88.0531, "step": 203 }, { "epoch": 1.152542372881356, "grad_norm": 0.6032590866088867, "learning_rate": 4.002165993070237e-05, "loss": 87.8896, "step": 204 }, { "epoch": 1.1581920903954803, "grad_norm": 0.5722517371177673, "learning_rate": 3.957464082452922e-05, "loss": 87.9297, "step": 205 }, { "epoch": 1.1638418079096045, "grad_norm": 0.6684175729751587, "learning_rate": 3.9128491220922156e-05, "loss": 88.0202, "step": 206 }, { "epoch": 1.1694915254237288, "grad_norm": 0.6134036779403687, "learning_rate": 3.8683248329939716e-05, "loss": 87.9706, "step": 207 }, { "epoch": 1.1751412429378532, "grad_norm": 0.6217228174209595, "learning_rate": 3.823894928601822e-05, "loss": 88.0366, "step": 208 }, { "epoch": 1.1807909604519775, "grad_norm": 0.577599287033081, "learning_rate": 3.7795631144874604e-05, "loss": 88.0188, "step": 209 }, { "epoch": 1.1864406779661016, "grad_norm": 0.6666799783706665, "learning_rate": 3.735333088041596e-05, "loss": 88.0513, "step": 210 }, { "epoch": 1.192090395480226, "grad_norm": 0.5872676968574524, "learning_rate": 3.6912085381655734e-05, "loss": 88.0103, "step": 211 }, { "epoch": 1.1977401129943503, "grad_norm": 0.5996030569076538, "learning_rate": 3.6471931449637124e-05, "loss": 87.9581, "step": 212 }, { "epoch": 1.2033898305084745, "grad_norm": 0.6568124890327454, "learning_rate": 3.60329057943638e-05, "loss": 87.9921, "step": 213 }, { "epoch": 1.2090395480225988, "grad_norm": 0.7044454216957092, "learning_rate": 3.5595045031738125e-05, "loss": 88.0731, "step": 214 }, { "epoch": 1.2146892655367232, "grad_norm": 0.7266508936882019, "learning_rate": 3.515838568050736e-05, "loss": 88.0184, "step": 215 }, { "epoch": 1.2203389830508475, "grad_norm": 0.7107569575309753, "learning_rate": 3.472296415921783e-05, "loss": 88.0427, "step": 216 }, { "epoch": 1.2259887005649717, "grad_norm": 0.7413809299468994, "learning_rate": 3.428881678317763e-05, "loss": 87.9618, "step": 217 }, { "epoch": 1.231638418079096, "grad_norm": 0.7287234663963318, "learning_rate": 3.38559797614277e-05, "loss": 88.007, "step": 218 }, { "epoch": 1.2372881355932204, "grad_norm": 0.8537057042121887, "learning_rate": 3.3424489193722013e-05, "loss": 88.055, "step": 219 }, { "epoch": 1.2429378531073447, "grad_norm": 0.8392075300216675, "learning_rate": 3.2994381067516696e-05, "loss": 88.0221, "step": 220 }, { "epoch": 1.2485875706214689, "grad_norm": 1.036270022392273, "learning_rate": 3.256569125496858e-05, "loss": 87.9944, "step": 221 }, { "epoch": 1.2542372881355932, "grad_norm": 0.633751392364502, "learning_rate": 3.2138455509943366e-05, "loss": 88.1435, "step": 222 }, { "epoch": 1.2598870056497176, "grad_norm": 0.5413132309913635, "learning_rate": 3.171270946503373e-05, "loss": 88.1694, "step": 223 }, { "epoch": 1.2655367231638417, "grad_norm": 0.5542528629302979, "learning_rate": 3.128848862858734e-05, "loss": 88.1215, "step": 224 }, { "epoch": 1.271186440677966, "grad_norm": 0.5940180420875549, "learning_rate": 3.086582838174551e-05, "loss": 88.0598, "step": 225 }, { "epoch": 1.2768361581920904, "grad_norm": 0.5067227482795715, "learning_rate": 3.0444763975492208e-05, "loss": 88.0876, "step": 226 }, { "epoch": 1.2824858757062148, "grad_norm": 0.5656532049179077, "learning_rate": 3.0025330527714046e-05, "loss": 88.0915, "step": 227 }, { "epoch": 1.288135593220339, "grad_norm": 0.52092045545578, "learning_rate": 2.9607563020271446e-05, "loss": 88.0636, "step": 228 }, { "epoch": 1.2937853107344632, "grad_norm": 0.5944317579269409, "learning_rate": 2.9191496296080935e-05, "loss": 87.999, "step": 229 }, { "epoch": 1.2994350282485876, "grad_norm": 0.5355740189552307, "learning_rate": 2.8777165056209256e-05, "loss": 88.0768, "step": 230 }, { "epoch": 1.305084745762712, "grad_norm": 0.5801308155059814, "learning_rate": 2.836460385697911e-05, "loss": 88.016, "step": 231 }, { "epoch": 1.310734463276836, "grad_norm": 0.5269469618797302, "learning_rate": 2.7953847107087172e-05, "loss": 87.9684, "step": 232 }, { "epoch": 1.3163841807909604, "grad_norm": 0.5054383873939514, "learning_rate": 2.754492906473425e-05, "loss": 88.1685, "step": 233 }, { "epoch": 1.3220338983050848, "grad_norm": 0.6001875400543213, "learning_rate": 2.7137883834768073e-05, "loss": 88.0379, "step": 234 }, { "epoch": 1.327683615819209, "grad_norm": 0.5223692059516907, "learning_rate": 2.6732745365838828e-05, "loss": 87.9794, "step": 235 }, { "epoch": 1.3333333333333333, "grad_norm": 0.6370963454246521, "learning_rate": 2.6329547447567836e-05, "loss": 87.9989, "step": 236 }, { "epoch": 1.3389830508474576, "grad_norm": 0.49859458208084106, "learning_rate": 2.5928323707729306e-05, "loss": 88.0401, "step": 237 }, { "epoch": 1.344632768361582, "grad_norm": 0.544282078742981, "learning_rate": 2.5529107609445733e-05, "loss": 88.0129, "step": 238 }, { "epoch": 1.3502824858757063, "grad_norm": 0.6058323979377747, "learning_rate": 2.513193244839704e-05, "loss": 88.0901, "step": 239 }, { "epoch": 1.3559322033898304, "grad_norm": 0.5392006635665894, "learning_rate": 2.4736831350043536e-05, "loss": 88.0176, "step": 240 }, { "epoch": 1.3615819209039548, "grad_norm": 0.5761566758155823, "learning_rate": 2.4343837266863246e-05, "loss": 88.0378, "step": 241 }, { "epoch": 1.3672316384180792, "grad_norm": 0.5357884764671326, "learning_rate": 2.3952982975603496e-05, "loss": 88.0098, "step": 242 }, { "epoch": 1.3728813559322033, "grad_norm": 0.5242740511894226, "learning_rate": 2.356430107454733e-05, "loss": 88.0394, "step": 243 }, { "epoch": 1.3785310734463276, "grad_norm": 0.5258982181549072, "learning_rate": 2.3177823980794727e-05, "loss": 87.9584, "step": 244 }, { "epoch": 1.384180790960452, "grad_norm": 0.6452914476394653, "learning_rate": 2.279358392755882e-05, "loss": 88.0219, "step": 245 }, { "epoch": 1.3898305084745763, "grad_norm": 0.5566428899765015, "learning_rate": 2.24116129614777e-05, "loss": 87.9679, "step": 246 }, { "epoch": 1.3954802259887007, "grad_norm": 0.5655139088630676, "learning_rate": 2.2031942939941592e-05, "loss": 87.9924, "step": 247 }, { "epoch": 1.4011299435028248, "grad_norm": 0.6263113617897034, "learning_rate": 2.1654605528435773e-05, "loss": 87.919, "step": 248 }, { "epoch": 1.4067796610169492, "grad_norm": 0.5700308084487915, "learning_rate": 2.127963219789974e-05, "loss": 87.9079, "step": 249 }, { "epoch": 1.4124293785310735, "grad_norm": 0.5626516938209534, "learning_rate": 2.090705422210237e-05, "loss": 87.9963, "step": 250 }, { "epoch": 1.4180790960451977, "grad_norm": 0.5786779522895813, "learning_rate": 2.0536902675033548e-05, "loss": 87.9655, "step": 251 }, { "epoch": 1.423728813559322, "grad_norm": 0.6087446212768555, "learning_rate": 2.0169208428312647e-05, "loss": 87.9886, "step": 252 }, { "epoch": 1.4293785310734464, "grad_norm": 0.627742350101471, "learning_rate": 1.980400214861367e-05, "loss": 88.072, "step": 253 }, { "epoch": 1.4350282485875705, "grad_norm": 0.6009258031845093, "learning_rate": 1.9441314295107537e-05, "loss": 87.9974, "step": 254 }, { "epoch": 1.4406779661016949, "grad_norm": 0.5660704374313354, "learning_rate": 1.90811751169218e-05, "loss": 87.9457, "step": 255 }, { "epoch": 1.4463276836158192, "grad_norm": 0.6332747340202332, "learning_rate": 1.8723614650617723e-05, "loss": 88.0142, "step": 256 }, { "epoch": 1.4519774011299436, "grad_norm": 0.653758704662323, "learning_rate": 1.8368662717685187e-05, "loss": 88.0092, "step": 257 }, { "epoch": 1.457627118644068, "grad_norm": 0.574155330657959, "learning_rate": 1.801634892205545e-05, "loss": 87.9217, "step": 258 }, { "epoch": 1.463276836158192, "grad_norm": 0.6903696060180664, "learning_rate": 1.766670264763213e-05, "loss": 87.9605, "step": 259 }, { "epoch": 1.4689265536723164, "grad_norm": 0.6985341310501099, "learning_rate": 1.7319753055840553e-05, "loss": 88.1013, "step": 260 }, { "epoch": 1.4745762711864407, "grad_norm": 0.6917281150817871, "learning_rate": 1.697552908319553e-05, "loss": 87.9394, "step": 261 }, { "epoch": 1.4802259887005649, "grad_norm": 0.7425939440727234, "learning_rate": 1.6634059438888033e-05, "loss": 87.9243, "step": 262 }, { "epoch": 1.4858757062146892, "grad_norm": 0.7640402317047119, "learning_rate": 1.6295372602390767e-05, "loss": 87.9382, "step": 263 }, { "epoch": 1.4915254237288136, "grad_norm": 0.8300817608833313, "learning_rate": 1.5959496821082905e-05, "loss": 87.8827, "step": 264 }, { "epoch": 1.497175141242938, "grad_norm": 1.0759795904159546, "learning_rate": 1.562646010789411e-05, "loss": 87.8789, "step": 265 }, { "epoch": 1.5028248587570623, "grad_norm": 0.6373040080070496, "learning_rate": 1.5296290238968303e-05, "loss": 88.0713, "step": 266 }, { "epoch": 1.5084745762711864, "grad_norm": 0.5541619062423706, "learning_rate": 1.496901475134701e-05, "loss": 88.1348, "step": 267 }, { "epoch": 1.5141242937853108, "grad_norm": 0.5655987858772278, "learning_rate": 1.4644660940672627e-05, "loss": 88.1458, "step": 268 }, { "epoch": 1.5197740112994351, "grad_norm": 0.5608534216880798, "learning_rate": 1.4323255858912011e-05, "loss": 88.1328, "step": 269 }, { "epoch": 1.5254237288135593, "grad_norm": 0.5184534192085266, "learning_rate": 1.4004826312100216e-05, "loss": 88.049, "step": 270 }, { "epoch": 1.5310734463276836, "grad_norm": 0.5682441592216492, "learning_rate": 1.3689398858104751e-05, "loss": 88.0269, "step": 271 }, { "epoch": 1.536723163841808, "grad_norm": 0.46682822704315186, "learning_rate": 1.337699980441069e-05, "loss": 88.0322, "step": 272 }, { "epoch": 1.542372881355932, "grad_norm": 0.55782550573349, "learning_rate": 1.3067655205926488e-05, "loss": 87.9584, "step": 273 }, { "epoch": 1.5480225988700564, "grad_norm": 0.549993097782135, "learning_rate": 1.2761390862810907e-05, "loss": 87.9824, "step": 274 }, { "epoch": 1.5536723163841808, "grad_norm": 0.49282097816467285, "learning_rate": 1.2458232318321305e-05, "loss": 87.9938, "step": 275 }, { "epoch": 1.559322033898305, "grad_norm": 0.5615452527999878, "learning_rate": 1.2158204856683176e-05, "loss": 87.9862, "step": 276 }, { "epoch": 1.5649717514124295, "grad_norm": 0.5333657264709473, "learning_rate": 1.1861333500981448e-05, "loss": 87.9579, "step": 277 }, { "epoch": 1.5706214689265536, "grad_norm": 0.5501605868339539, "learning_rate": 1.1567643011073392e-05, "loss": 87.8844, "step": 278 }, { "epoch": 1.576271186440678, "grad_norm": 0.5416379570960999, "learning_rate": 1.127715788152372e-05, "loss": 87.9542, "step": 279 }, { "epoch": 1.5819209039548023, "grad_norm": 0.5352757573127747, "learning_rate": 1.0989902339561553e-05, "loss": 87.9788, "step": 280 }, { "epoch": 1.5875706214689265, "grad_norm": 0.514640748500824, "learning_rate": 1.0705900343059855e-05, "loss": 87.9644, "step": 281 }, { "epoch": 1.5932203389830508, "grad_norm": 0.5264514088630676, "learning_rate": 1.0425175578537299e-05, "loss": 88.0876, "step": 282 }, { "epoch": 1.5988700564971752, "grad_norm": 0.5024112462997437, "learning_rate": 1.0147751459182736e-05, "loss": 87.9844, "step": 283 }, { "epoch": 1.6045197740112993, "grad_norm": 0.48817285895347595, "learning_rate": 9.873651122902472e-06, "loss": 88.0376, "step": 284 }, { "epoch": 1.6101694915254239, "grad_norm": 0.6276935935020447, "learning_rate": 9.602897430390457e-06, "loss": 87.933, "step": 285 }, { "epoch": 1.615819209039548, "grad_norm": 0.5100238919258118, "learning_rate": 9.335512963221732e-06, "loss": 88.0798, "step": 286 }, { "epoch": 1.6214689265536724, "grad_norm": 0.534130871295929, "learning_rate": 9.071520021969027e-06, "loss": 88.1008, "step": 287 }, { "epoch": 1.6271186440677967, "grad_norm": 0.5745760798454285, "learning_rate": 8.810940624342785e-06, "loss": 87.9179, "step": 288 }, { "epoch": 1.6327683615819208, "grad_norm": 0.5422095656394958, "learning_rate": 8.553796503354899e-06, "loss": 87.9494, "step": 289 }, { "epoch": 1.6384180790960452, "grad_norm": 0.5460516214370728, "learning_rate": 8.30010910550611e-06, "loss": 87.9545, "step": 290 }, { "epoch": 1.6440677966101696, "grad_norm": 0.6041167974472046, "learning_rate": 8.049899588997244e-06, "loss": 87.9521, "step": 291 }, { "epoch": 1.6497175141242937, "grad_norm": 0.5654332637786865, "learning_rate": 7.803188821964652e-06, "loss": 87.9232, "step": 292 }, { "epoch": 1.655367231638418, "grad_norm": 0.5767014026641846, "learning_rate": 7.559997380739714e-06, "loss": 87.8771, "step": 293 }, { "epoch": 1.6610169491525424, "grad_norm": 0.5665624737739563, "learning_rate": 7.320345548132679e-06, "loss": 88.0009, "step": 294 }, { "epoch": 1.6666666666666665, "grad_norm": 0.5585253238677979, "learning_rate": 7.084253311741101e-06, "loss": 87.9623, "step": 295 }, { "epoch": 1.672316384180791, "grad_norm": 0.6449034214019775, "learning_rate": 6.851740362282788e-06, "loss": 87.968, "step": 296 }, { "epoch": 1.6779661016949152, "grad_norm": 0.6085680723190308, "learning_rate": 6.622826091953482e-06, "loss": 88.0431, "step": 297 }, { "epoch": 1.6836158192090396, "grad_norm": 0.7078083753585815, "learning_rate": 6.397529592809614e-06, "loss": 87.9145, "step": 298 }, { "epoch": 1.689265536723164, "grad_norm": 0.5888929963111877, "learning_rate": 6.1758696551758976e-06, "loss": 87.974, "step": 299 }, { "epoch": 1.694915254237288, "grad_norm": 0.6576589941978455, "learning_rate": 5.957864766078186e-06, "loss": 87.9155, "step": 300 }, { "epoch": 1.7005649717514124, "grad_norm": 0.6700954437255859, "learning_rate": 5.743533107701593e-06, "loss": 87.9094, "step": 301 }, { "epoch": 1.7062146892655368, "grad_norm": 0.6426727175712585, "learning_rate": 5.532892555874059e-06, "loss": 87.9853, "step": 302 }, { "epoch": 1.711864406779661, "grad_norm": 0.6309840679168701, "learning_rate": 5.325960678575498e-06, "loss": 87.997, "step": 303 }, { "epoch": 1.7175141242937855, "grad_norm": 0.6771166920661926, "learning_rate": 5.122754734472496e-06, "loss": 87.9336, "step": 304 }, { "epoch": 1.7231638418079096, "grad_norm": 0.7140227556228638, "learning_rate": 4.92329167147898e-06, "loss": 87.9128, "step": 305 }, { "epoch": 1.7288135593220337, "grad_norm": 0.7067782282829285, "learning_rate": 4.727588125342669e-06, "loss": 88.008, "step": 306 }, { "epoch": 1.7344632768361583, "grad_norm": 0.746557891368866, "learning_rate": 4.535660418257631e-06, "loss": 87.9927, "step": 307 }, { "epoch": 1.7401129943502824, "grad_norm": 0.8619289994239807, "learning_rate": 4.3475245575029185e-06, "loss": 87.8356, "step": 308 }, { "epoch": 1.7457627118644068, "grad_norm": 1.2444101572036743, "learning_rate": 4.163196234107603e-06, "loss": 87.9559, "step": 309 }, { "epoch": 1.7514124293785311, "grad_norm": 0.6213842034339905, "learning_rate": 3.982690821542035e-06, "loss": 88.1411, "step": 310 }, { "epoch": 1.7570621468926553, "grad_norm": 0.5701696872711182, "learning_rate": 3.8060233744356633e-06, "loss": 88.1261, "step": 311 }, { "epoch": 1.7627118644067796, "grad_norm": 0.5259557366371155, "learning_rate": 3.6332086273214827e-06, "loss": 88.1058, "step": 312 }, { "epoch": 1.768361581920904, "grad_norm": 0.561783492565155, "learning_rate": 3.464260993407098e-06, "loss": 87.9936, "step": 313 }, { "epoch": 1.774011299435028, "grad_norm": 0.5012845993041992, "learning_rate": 3.299194563372604e-06, "loss": 88.0487, "step": 314 }, { "epoch": 1.7796610169491527, "grad_norm": 0.5462010502815247, "learning_rate": 3.1380231041954366e-06, "loss": 87.9775, "step": 315 }, { "epoch": 1.7853107344632768, "grad_norm": 0.5348891615867615, "learning_rate": 2.9807600580021634e-06, "loss": 88.0445, "step": 316 }, { "epoch": 1.7909604519774012, "grad_norm": 0.5476884245872498, "learning_rate": 2.827418540947313e-06, "loss": 87.9698, "step": 317 }, { "epoch": 1.7966101694915255, "grad_norm": 0.5374354124069214, "learning_rate": 2.6780113421195298e-06, "loss": 87.9377, "step": 318 }, { "epoch": 1.8022598870056497, "grad_norm": 0.5628228783607483, "learning_rate": 2.532550922474897e-06, "loss": 87.8776, "step": 319 }, { "epoch": 1.807909604519774, "grad_norm": 0.5416897535324097, "learning_rate": 2.3910494137976523e-06, "loss": 87.9337, "step": 320 }, { "epoch": 1.8135593220338984, "grad_norm": 0.590156614780426, "learning_rate": 2.253518617688377e-06, "loss": 87.9672, "step": 321 }, { "epoch": 1.8192090395480225, "grad_norm": 0.5673385858535767, "learning_rate": 2.1199700045797077e-06, "loss": 87.9425, "step": 322 }, { "epoch": 1.8248587570621468, "grad_norm": 0.5306013226509094, "learning_rate": 1.9904147127796646e-06, "loss": 87.9447, "step": 323 }, { "epoch": 1.8305084745762712, "grad_norm": 0.5657864809036255, "learning_rate": 1.864863547542711e-06, "loss": 87.9126, "step": 324 }, { "epoch": 1.8361581920903953, "grad_norm": 0.5366300344467163, "learning_rate": 1.7433269801685303e-06, "loss": 87.9557, "step": 325 }, { "epoch": 1.84180790960452, "grad_norm": 0.5389237403869629, "learning_rate": 1.6258151471287396e-06, "loss": 87.9312, "step": 326 }, { "epoch": 1.847457627118644, "grad_norm": 0.5478444695472717, "learning_rate": 1.5123378492214291e-06, "loss": 87.9573, "step": 327 }, { "epoch": 1.8531073446327684, "grad_norm": 0.5147649645805359, "learning_rate": 1.4029045507537697e-06, "loss": 88.0396, "step": 328 }, { "epoch": 1.8587570621468927, "grad_norm": 0.5677837133407593, "learning_rate": 1.297524378752696e-06, "loss": 88.0318, "step": 329 }, { "epoch": 1.8644067796610169, "grad_norm": 0.5549485087394714, "learning_rate": 1.196206122203647e-06, "loss": 88.005, "step": 330 }, { "epoch": 1.8700564971751412, "grad_norm": 0.5245904922485352, "learning_rate": 1.0989582313175374e-06, "loss": 88.0128, "step": 331 }, { "epoch": 1.8757062146892656, "grad_norm": 0.5384089946746826, "learning_rate": 1.005788816826031e-06, "loss": 87.8896, "step": 332 }, { "epoch": 1.8813559322033897, "grad_norm": 0.5512574911117554, "learning_rate": 9.167056493050496e-07, "loss": 87.996, "step": 333 }, { "epoch": 1.8870056497175143, "grad_norm": 0.6043868064880371, "learning_rate": 8.317161585266964e-07, "loss": 87.9643, "step": 334 }, { "epoch": 1.8926553672316384, "grad_norm": 0.617608368396759, "learning_rate": 7.508274328395848e-07, "loss": 87.9645, "step": 335 }, { "epoch": 1.8983050847457628, "grad_norm": 0.5591124892234802, "learning_rate": 6.74046218577673e-07, "loss": 87.9943, "step": 336 }, { "epoch": 1.9039548022598871, "grad_norm": 0.5672664046287537, "learning_rate": 6.013789194975749e-07, "loss": 87.9743, "step": 337 }, { "epoch": 1.9096045197740112, "grad_norm": 0.5669750571250916, "learning_rate": 5.328315962444874e-07, "loss": 87.8982, "step": 338 }, { "epoch": 1.9152542372881356, "grad_norm": 0.5600360035896301, "learning_rate": 4.684099658467223e-07, "loss": 87.8697, "step": 339 }, { "epoch": 1.92090395480226, "grad_norm": 0.5569552183151245, "learning_rate": 4.0811940123886004e-07, "loss": 88.0418, "step": 340 }, { "epoch": 1.926553672316384, "grad_norm": 0.549552857875824, "learning_rate": 3.5196493081366967e-07, "loss": 87.9701, "step": 341 }, { "epoch": 1.9322033898305084, "grad_norm": 0.6111685037612915, "learning_rate": 2.9995123800270476e-07, "loss": 88.1107, "step": 342 }, { "epoch": 1.9378531073446328, "grad_norm": 0.6287999153137207, "learning_rate": 2.5208266088569966e-07, "loss": 87.9764, "step": 343 }, { "epoch": 1.943502824858757, "grad_norm": 0.5528276562690735, "learning_rate": 2.083631918287643e-07, "loss": 87.9886, "step": 344 }, { "epoch": 1.9491525423728815, "grad_norm": 0.6393068432807922, "learning_rate": 1.6879647715140611e-07, "loss": 88.0424, "step": 345 }, { "epoch": 1.9548022598870056, "grad_norm": 0.6431833505630493, "learning_rate": 1.333858168224178e-07, "loss": 88.0173, "step": 346 }, { "epoch": 1.96045197740113, "grad_norm": 0.6915642023086548, "learning_rate": 1.0213416418465294e-07, "loss": 87.9516, "step": 347 }, { "epoch": 1.9661016949152543, "grad_norm": 0.6479020714759827, "learning_rate": 7.5044125708712e-08, "loss": 87.9935, "step": 348 }, { "epoch": 1.9717514124293785, "grad_norm": 0.7263698577880859, "learning_rate": 5.2117960775543986e-08, "loss": 87.9126, "step": 349 }, { "epoch": 1.9774011299435028, "grad_norm": 0.6763975024223328, "learning_rate": 3.3357581488030475e-08, "loss": 87.993, "step": 350 }, { "epoch": 1.9830508474576272, "grad_norm": 0.8059737682342529, "learning_rate": 1.8764552511485457e-08, "loss": 88.038, "step": 351 }, { "epoch": 1.9887005649717513, "grad_norm": 0.7300512790679932, "learning_rate": 8.340090943176338e-09, "loss": 88.0236, "step": 352 }, { "epoch": 1.9943502824858759, "grad_norm": 0.9957327842712402, "learning_rate": 2.0850662108051755e-09, "loss": 87.9029, "step": 353 }, { "epoch": 2.0, "grad_norm": 0.6088115572929382, "learning_rate": 0.0, "loss": 87.8932, "step": 354 }, { "epoch": 2.0, "eval_loss": 10.999165534973145, "eval_runtime": 0.674, "eval_samples_per_second": 442.111, "eval_steps_per_second": 111.27, "step": 354 } ], "logging_steps": 1, "max_steps": 354, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3718069616640.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }