{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 527, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003795066413662239, "grad_norm": 58.99103546142578, "learning_rate": 3.7735849056603773e-06, "loss": 11.9758, "step": 2 }, { "epoch": 0.007590132827324478, "grad_norm": 60.11953353881836, "learning_rate": 7.547169811320755e-06, "loss": 10.2142, "step": 4 }, { "epoch": 0.011385199240986717, "grad_norm": 15.779712677001953, "learning_rate": 1.1320754716981132e-05, "loss": 6.4991, "step": 6 }, { "epoch": 0.015180265654648957, "grad_norm": 19.793678283691406, "learning_rate": 1.509433962264151e-05, "loss": 5.2038, "step": 8 }, { "epoch": 0.018975332068311195, "grad_norm": 23.680116653442383, "learning_rate": 1.8867924528301888e-05, "loss": 4.8628, "step": 10 }, { "epoch": 0.022770398481973434, "grad_norm": 12.569281578063965, "learning_rate": 2.2641509433962265e-05, "loss": 4.6113, "step": 12 }, { "epoch": 0.026565464895635674, "grad_norm": 16.667444229125977, "learning_rate": 2.641509433962264e-05, "loss": 4.5817, "step": 14 }, { "epoch": 0.030360531309297913, "grad_norm": 9.480690956115723, "learning_rate": 3.018867924528302e-05, "loss": 4.4784, "step": 16 }, { "epoch": 0.03415559772296015, "grad_norm": 10.323396682739258, "learning_rate": 3.39622641509434e-05, "loss": 4.5034, "step": 18 }, { "epoch": 0.03795066413662239, "grad_norm": 6.275529384613037, "learning_rate": 3.7735849056603776e-05, "loss": 4.3612, "step": 20 }, { "epoch": 0.04174573055028463, "grad_norm": 18.33019256591797, "learning_rate": 4.150943396226415e-05, "loss": 4.3324, "step": 22 }, { "epoch": 0.04554079696394687, "grad_norm": 5.673777103424072, "learning_rate": 4.528301886792453e-05, "loss": 4.2084, "step": 24 }, { "epoch": 0.04933586337760911, "grad_norm": 5.893975734710693, "learning_rate": 4.9056603773584906e-05, "loss": 4.2536, "step": 26 }, { "epoch": 0.05313092979127135, "grad_norm": 7.583183765411377, "learning_rate": 5.283018867924528e-05, "loss": 4.2715, "step": 28 }, { "epoch": 0.056925996204933584, "grad_norm": 29.12498664855957, "learning_rate": 5.660377358490566e-05, "loss": 4.2941, "step": 30 }, { "epoch": 0.06072106261859583, "grad_norm": 13.786356925964355, "learning_rate": 6.037735849056604e-05, "loss": 4.1344, "step": 32 }, { "epoch": 0.06451612903225806, "grad_norm": 9.63448715209961, "learning_rate": 6.415094339622641e-05, "loss": 4.016, "step": 34 }, { "epoch": 0.0683111954459203, "grad_norm": 24.18341636657715, "learning_rate": 6.79245283018868e-05, "loss": 4.2109, "step": 36 }, { "epoch": 0.07210626185958255, "grad_norm": 10.357343673706055, "learning_rate": 7.169811320754717e-05, "loss": 4.0338, "step": 38 }, { "epoch": 0.07590132827324478, "grad_norm": 6.5294060707092285, "learning_rate": 7.547169811320755e-05, "loss": 4.1091, "step": 40 }, { "epoch": 0.07969639468690702, "grad_norm": 24.691822052001953, "learning_rate": 7.924528301886794e-05, "loss": 3.8535, "step": 42 }, { "epoch": 0.08349146110056926, "grad_norm": 5.220446586608887, "learning_rate": 8.30188679245283e-05, "loss": 3.9548, "step": 44 }, { "epoch": 0.0872865275142315, "grad_norm": 10.426956176757812, "learning_rate": 8.679245283018869e-05, "loss": 4.0897, "step": 46 }, { "epoch": 0.09108159392789374, "grad_norm": 7.928496360778809, "learning_rate": 9.056603773584906e-05, "loss": 3.8346, "step": 48 }, { "epoch": 0.09487666034155598, "grad_norm": 4.195712566375732, "learning_rate": 9.433962264150944e-05, "loss": 3.9885, "step": 50 }, { "epoch": 0.09867172675521822, "grad_norm": 8.46008014678955, "learning_rate": 9.811320754716981e-05, "loss": 3.9167, "step": 52 }, { "epoch": 0.10246679316888045, "grad_norm": 11.128891944885254, "learning_rate": 9.999975375283309e-05, "loss": 3.8611, "step": 54 }, { "epoch": 0.1062618595825427, "grad_norm": 7.3787031173706055, "learning_rate": 9.999778379005078e-05, "loss": 3.9224, "step": 56 }, { "epoch": 0.11005692599620494, "grad_norm": 9.250240325927734, "learning_rate": 9.99938439421016e-05, "loss": 3.9686, "step": 58 }, { "epoch": 0.11385199240986717, "grad_norm": 3.8329930305480957, "learning_rate": 9.99879343642134e-05, "loss": 3.8928, "step": 60 }, { "epoch": 0.11764705882352941, "grad_norm": 4.931491374969482, "learning_rate": 9.99800552892203e-05, "loss": 3.9112, "step": 62 }, { "epoch": 0.12144212523719165, "grad_norm": 4.240759372711182, "learning_rate": 9.997020702755353e-05, "loss": 3.7947, "step": 64 }, { "epoch": 0.1252371916508539, "grad_norm": 2.899198532104492, "learning_rate": 9.995838996722914e-05, "loss": 3.6796, "step": 66 }, { "epoch": 0.12903225806451613, "grad_norm": 3.4333019256591797, "learning_rate": 9.994460457383284e-05, "loss": 3.9, "step": 68 }, { "epoch": 0.13282732447817835, "grad_norm": 3.4416115283966064, "learning_rate": 9.992885139050154e-05, "loss": 3.6634, "step": 70 }, { "epoch": 0.1366223908918406, "grad_norm": 5.570509433746338, "learning_rate": 9.991113103790198e-05, "loss": 3.6589, "step": 72 }, { "epoch": 0.14041745730550284, "grad_norm": 4.77843713760376, "learning_rate": 9.98914442142063e-05, "loss": 3.5929, "step": 74 }, { "epoch": 0.1442125237191651, "grad_norm": 3.334336996078491, "learning_rate": 9.986979169506453e-05, "loss": 3.6783, "step": 76 }, { "epoch": 0.14800759013282733, "grad_norm": 6.237847805023193, "learning_rate": 9.9846174333574e-05, "loss": 3.8635, "step": 78 }, { "epoch": 0.15180265654648956, "grad_norm": 5.3761887550354, "learning_rate": 9.982059306024577e-05, "loss": 3.7552, "step": 80 }, { "epoch": 0.1555977229601518, "grad_norm": 3.021200656890869, "learning_rate": 9.979304888296792e-05, "loss": 3.7418, "step": 82 }, { "epoch": 0.15939278937381404, "grad_norm": 4.499217510223389, "learning_rate": 9.976354288696588e-05, "loss": 3.5939, "step": 84 }, { "epoch": 0.16318785578747627, "grad_norm": 2.8771920204162598, "learning_rate": 9.973207623475965e-05, "loss": 3.7072, "step": 86 }, { "epoch": 0.16698292220113853, "grad_norm": 5.786422252655029, "learning_rate": 9.9698650166118e-05, "loss": 3.6476, "step": 88 }, { "epoch": 0.17077798861480076, "grad_norm": 4.563845157623291, "learning_rate": 9.966326599800967e-05, "loss": 3.6845, "step": 90 }, { "epoch": 0.174573055028463, "grad_norm": 7.240761756896973, "learning_rate": 9.962592512455138e-05, "loss": 3.4461, "step": 92 }, { "epoch": 0.17836812144212524, "grad_norm": 7.457889556884766, "learning_rate": 9.958662901695303e-05, "loss": 3.6345, "step": 94 }, { "epoch": 0.18216318785578747, "grad_norm": 6.383795261383057, "learning_rate": 9.954537922345961e-05, "loss": 3.4164, "step": 96 }, { "epoch": 0.1859582542694497, "grad_norm": 4.423782825469971, "learning_rate": 9.950217736929029e-05, "loss": 3.4743, "step": 98 }, { "epoch": 0.18975332068311196, "grad_norm": 4.3913469314575195, "learning_rate": 9.945702515657434e-05, "loss": 3.4078, "step": 100 }, { "epoch": 0.1935483870967742, "grad_norm": 4.806511878967285, "learning_rate": 9.940992436428409e-05, "loss": 3.49, "step": 102 }, { "epoch": 0.19734345351043645, "grad_norm": 4.089972496032715, "learning_rate": 9.936087684816486e-05, "loss": 3.6553, "step": 104 }, { "epoch": 0.20113851992409867, "grad_norm": 4.213963985443115, "learning_rate": 9.930988454066177e-05, "loss": 3.3105, "step": 106 }, { "epoch": 0.2049335863377609, "grad_norm": 5.072576999664307, "learning_rate": 9.92569494508437e-05, "loss": 3.3807, "step": 108 }, { "epoch": 0.20872865275142316, "grad_norm": 3.9879724979400635, "learning_rate": 9.920207366432402e-05, "loss": 3.3832, "step": 110 }, { "epoch": 0.2125237191650854, "grad_norm": 3.9012928009033203, "learning_rate": 9.914525934317855e-05, "loss": 3.4129, "step": 112 }, { "epoch": 0.21631878557874762, "grad_norm": 3.9006423950195312, "learning_rate": 9.908650872586029e-05, "loss": 3.3788, "step": 114 }, { "epoch": 0.22011385199240988, "grad_norm": 4.585041046142578, "learning_rate": 9.90258241271112e-05, "loss": 3.2844, "step": 116 }, { "epoch": 0.2239089184060721, "grad_norm": 3.21873140335083, "learning_rate": 9.896320793787106e-05, "loss": 3.2972, "step": 118 }, { "epoch": 0.22770398481973433, "grad_norm": 4.861288547515869, "learning_rate": 9.889866262518331e-05, "loss": 3.3681, "step": 120 }, { "epoch": 0.2314990512333966, "grad_norm": 3.7863895893096924, "learning_rate": 9.883219073209772e-05, "loss": 3.2602, "step": 122 }, { "epoch": 0.23529411764705882, "grad_norm": 5.968108177185059, "learning_rate": 9.876379487757034e-05, "loss": 3.2957, "step": 124 }, { "epoch": 0.23908918406072105, "grad_norm": 4.3137993812561035, "learning_rate": 9.869347775636015e-05, "loss": 3.211, "step": 126 }, { "epoch": 0.2428842504743833, "grad_norm": 6.66726016998291, "learning_rate": 9.862124213892304e-05, "loss": 3.253, "step": 128 }, { "epoch": 0.24667931688804554, "grad_norm": 3.351065158843994, "learning_rate": 9.85470908713026e-05, "loss": 3.1391, "step": 130 }, { "epoch": 0.2504743833017078, "grad_norm": 5.007400035858154, "learning_rate": 9.847102687501797e-05, "loss": 3.2324, "step": 132 }, { "epoch": 0.25426944971537, "grad_norm": 3.5997180938720703, "learning_rate": 9.839305314694873e-05, "loss": 3.1649, "step": 134 }, { "epoch": 0.25806451612903225, "grad_norm": 5.135311126708984, "learning_rate": 9.831317275921685e-05, "loss": 3.1186, "step": 136 }, { "epoch": 0.2618595825426945, "grad_norm": 4.529127597808838, "learning_rate": 9.823138885906566e-05, "loss": 3.0933, "step": 138 }, { "epoch": 0.2656546489563567, "grad_norm": 7.555715084075928, "learning_rate": 9.814770466873585e-05, "loss": 3.1694, "step": 140 }, { "epoch": 0.269449715370019, "grad_norm": 6.438807487487793, "learning_rate": 9.806212348533838e-05, "loss": 3.0059, "step": 142 }, { "epoch": 0.2732447817836812, "grad_norm": 3.579599380493164, "learning_rate": 9.797464868072488e-05, "loss": 3.0959, "step": 144 }, { "epoch": 0.27703984819734345, "grad_norm": 2.4115488529205322, "learning_rate": 9.788528370135443e-05, "loss": 3.0124, "step": 146 }, { "epoch": 0.2808349146110057, "grad_norm": 4.941457748413086, "learning_rate": 9.77940320681581e-05, "loss": 3.0013, "step": 148 }, { "epoch": 0.2846299810246679, "grad_norm": 3.0970795154571533, "learning_rate": 9.77008973764e-05, "loss": 2.9156, "step": 150 }, { "epoch": 0.2884250474383302, "grad_norm": 3.7954447269439697, "learning_rate": 9.760588329553571e-05, "loss": 2.9065, "step": 152 }, { "epoch": 0.2922201138519924, "grad_norm": 6.178630352020264, "learning_rate": 9.750899356906775e-05, "loss": 2.9992, "step": 154 }, { "epoch": 0.29601518026565465, "grad_norm": 4.201640605926514, "learning_rate": 9.741023201439803e-05, "loss": 2.9465, "step": 156 }, { "epoch": 0.2998102466793169, "grad_norm": 6.396300792694092, "learning_rate": 9.730960252267743e-05, "loss": 2.9635, "step": 158 }, { "epoch": 0.3036053130929791, "grad_norm": 3.5059707164764404, "learning_rate": 9.720710905865256e-05, "loss": 2.9565, "step": 160 }, { "epoch": 0.30740037950664134, "grad_norm": 3.6058707237243652, "learning_rate": 9.710275566050951e-05, "loss": 2.9546, "step": 162 }, { "epoch": 0.3111954459203036, "grad_norm": 3.369931697845459, "learning_rate": 9.699654643971472e-05, "loss": 2.9414, "step": 164 }, { "epoch": 0.31499051233396586, "grad_norm": 3.4260523319244385, "learning_rate": 9.688848558085306e-05, "loss": 2.8259, "step": 166 }, { "epoch": 0.3187855787476281, "grad_norm": 2.820384979248047, "learning_rate": 9.677857734146289e-05, "loss": 2.8689, "step": 168 }, { "epoch": 0.3225806451612903, "grad_norm": 6.160562992095947, "learning_rate": 9.666682605186835e-05, "loss": 2.8823, "step": 170 }, { "epoch": 0.32637571157495254, "grad_norm": 8.138015747070312, "learning_rate": 9.655323611500875e-05, "loss": 2.9995, "step": 172 }, { "epoch": 0.3301707779886148, "grad_norm": 8.836729049682617, "learning_rate": 9.643781200626511e-05, "loss": 2.7727, "step": 174 }, { "epoch": 0.33396584440227706, "grad_norm": 5.794134616851807, "learning_rate": 9.632055827328382e-05, "loss": 2.8787, "step": 176 }, { "epoch": 0.3377609108159393, "grad_norm": 8.271780967712402, "learning_rate": 9.620147953579737e-05, "loss": 2.9325, "step": 178 }, { "epoch": 0.3415559772296015, "grad_norm": 7.904753684997559, "learning_rate": 9.608058048544251e-05, "loss": 2.9486, "step": 180 }, { "epoch": 0.34535104364326374, "grad_norm": 5.133846759796143, "learning_rate": 9.595786588557532e-05, "loss": 2.8692, "step": 182 }, { "epoch": 0.349146110056926, "grad_norm": 4.022572040557861, "learning_rate": 9.583334057108346e-05, "loss": 2.7638, "step": 184 }, { "epoch": 0.35294117647058826, "grad_norm": 6.008692264556885, "learning_rate": 9.570700944819584e-05, "loss": 2.9448, "step": 186 }, { "epoch": 0.3567362428842505, "grad_norm": 4.208110809326172, "learning_rate": 9.557887749428913e-05, "loss": 2.7578, "step": 188 }, { "epoch": 0.3605313092979127, "grad_norm": 3.0022525787353516, "learning_rate": 9.544894975769186e-05, "loss": 2.7091, "step": 190 }, { "epoch": 0.36432637571157495, "grad_norm": 2.1430158615112305, "learning_rate": 9.531723135748529e-05, "loss": 2.6014, "step": 192 }, { "epoch": 0.3681214421252372, "grad_norm": 3.903315782546997, "learning_rate": 9.518372748330194e-05, "loss": 2.7102, "step": 194 }, { "epoch": 0.3719165085388994, "grad_norm": 4.798587322235107, "learning_rate": 9.504844339512095e-05, "loss": 2.5969, "step": 196 }, { "epoch": 0.3757115749525617, "grad_norm": 2.9411423206329346, "learning_rate": 9.4911384423061e-05, "loss": 2.7626, "step": 198 }, { "epoch": 0.3795066413662239, "grad_norm": 2.5336225032806396, "learning_rate": 9.477255596717012e-05, "loss": 2.6405, "step": 200 }, { "epoch": 0.38330170777988615, "grad_norm": 2.6455540657043457, "learning_rate": 9.463196349721308e-05, "loss": 2.699, "step": 202 }, { "epoch": 0.3870967741935484, "grad_norm": 3.346395969390869, "learning_rate": 9.448961255245584e-05, "loss": 2.4129, "step": 204 }, { "epoch": 0.3908918406072106, "grad_norm": 4.412266254425049, "learning_rate": 9.434550874144728e-05, "loss": 2.5733, "step": 206 }, { "epoch": 0.3946869070208729, "grad_norm": 7.3123579025268555, "learning_rate": 9.419965774179824e-05, "loss": 2.6668, "step": 208 }, { "epoch": 0.3984819734345351, "grad_norm": 7.745405197143555, "learning_rate": 9.405206529995785e-05, "loss": 2.8233, "step": 210 }, { "epoch": 0.40227703984819735, "grad_norm": 6.307023048400879, "learning_rate": 9.39027372309871e-05, "loss": 2.649, "step": 212 }, { "epoch": 0.4060721062618596, "grad_norm": 3.971792697906494, "learning_rate": 9.375167941832973e-05, "loss": 2.7105, "step": 214 }, { "epoch": 0.4098671726755218, "grad_norm": 2.6765499114990234, "learning_rate": 9.359889781358042e-05, "loss": 2.5993, "step": 216 }, { "epoch": 0.41366223908918404, "grad_norm": 2.9960570335388184, "learning_rate": 9.344439843625034e-05, "loss": 2.6444, "step": 218 }, { "epoch": 0.4174573055028463, "grad_norm": 4.103157997131348, "learning_rate": 9.32881873735299e-05, "loss": 2.6523, "step": 220 }, { "epoch": 0.42125237191650855, "grad_norm": 3.6402695178985596, "learning_rate": 9.313027078004903e-05, "loss": 2.5287, "step": 222 }, { "epoch": 0.4250474383301708, "grad_norm": 5.530549049377441, "learning_rate": 9.297065487763462e-05, "loss": 2.6326, "step": 224 }, { "epoch": 0.428842504743833, "grad_norm": 4.8087310791015625, "learning_rate": 9.280934595506536e-05, "loss": 2.4784, "step": 226 }, { "epoch": 0.43263757115749524, "grad_norm": 6.805646896362305, "learning_rate": 9.264635036782405e-05, "loss": 2.5274, "step": 228 }, { "epoch": 0.4364326375711575, "grad_norm": 4.533606052398682, "learning_rate": 9.248167453784711e-05, "loss": 2.5572, "step": 230 }, { "epoch": 0.44022770398481975, "grad_norm": 4.55278205871582, "learning_rate": 9.231532495327165e-05, "loss": 2.5945, "step": 232 }, { "epoch": 0.444022770398482, "grad_norm": 2.897782802581787, "learning_rate": 9.21473081681797e-05, "loss": 2.6323, "step": 234 }, { "epoch": 0.4478178368121442, "grad_norm": 3.3823184967041016, "learning_rate": 9.197763080234019e-05, "loss": 2.4394, "step": 236 }, { "epoch": 0.45161290322580644, "grad_norm": 4.916838645935059, "learning_rate": 9.180629954094792e-05, "loss": 2.514, "step": 238 }, { "epoch": 0.45540796963946867, "grad_norm": 4.822211265563965, "learning_rate": 9.163332113436032e-05, "loss": 2.439, "step": 240 }, { "epoch": 0.45920303605313095, "grad_norm": 4.708934783935547, "learning_rate": 9.145870239783142e-05, "loss": 2.5608, "step": 242 }, { "epoch": 0.4629981024667932, "grad_norm": 2.962392568588257, "learning_rate": 9.128245021124334e-05, "loss": 2.5684, "step": 244 }, { "epoch": 0.4667931688804554, "grad_norm": 2.4564027786254883, "learning_rate": 9.110457151883523e-05, "loss": 2.4745, "step": 246 }, { "epoch": 0.47058823529411764, "grad_norm": 2.94209623336792, "learning_rate": 9.092507332892968e-05, "loss": 2.4497, "step": 248 }, { "epoch": 0.47438330170777987, "grad_norm": 2.5642683506011963, "learning_rate": 9.07439627136566e-05, "loss": 2.4282, "step": 250 }, { "epoch": 0.4781783681214421, "grad_norm": 2.714101791381836, "learning_rate": 9.056124680867457e-05, "loss": 2.3797, "step": 252 }, { "epoch": 0.4819734345351044, "grad_norm": 2.010376453399658, "learning_rate": 9.037693281288969e-05, "loss": 2.4963, "step": 254 }, { "epoch": 0.4857685009487666, "grad_norm": 2.4395999908447266, "learning_rate": 9.019102798817197e-05, "loss": 2.4386, "step": 256 }, { "epoch": 0.48956356736242884, "grad_norm": 2.391928195953369, "learning_rate": 9.000353965906917e-05, "loss": 2.5203, "step": 258 }, { "epoch": 0.49335863377609107, "grad_norm": 2.7171072959899902, "learning_rate": 8.981447521251831e-05, "loss": 2.4105, "step": 260 }, { "epoch": 0.4971537001897533, "grad_norm": 3.4616568088531494, "learning_rate": 8.962384209755452e-05, "loss": 2.2693, "step": 262 }, { "epoch": 0.5009487666034156, "grad_norm": 3.0641255378723145, "learning_rate": 8.943164782501765e-05, "loss": 2.4696, "step": 264 }, { "epoch": 0.5047438330170778, "grad_norm": 2.5761353969573975, "learning_rate": 8.923789996725624e-05, "loss": 2.3747, "step": 266 }, { "epoch": 0.50853889943074, "grad_norm": 3.255844831466675, "learning_rate": 8.904260615782927e-05, "loss": 2.3592, "step": 268 }, { "epoch": 0.5123339658444023, "grad_norm": 3.119816303253174, "learning_rate": 8.884577409120535e-05, "loss": 2.404, "step": 270 }, { "epoch": 0.5161290322580645, "grad_norm": 4.314058780670166, "learning_rate": 8.864741152245963e-05, "loss": 2.3595, "step": 272 }, { "epoch": 0.5199240986717267, "grad_norm": 3.1923677921295166, "learning_rate": 8.84475262669681e-05, "loss": 2.4839, "step": 274 }, { "epoch": 0.523719165085389, "grad_norm": 5.698940753936768, "learning_rate": 8.824612620009987e-05, "loss": 2.5254, "step": 276 }, { "epoch": 0.5275142314990512, "grad_norm": 4.198439598083496, "learning_rate": 8.804321925690672e-05, "loss": 2.4509, "step": 278 }, { "epoch": 0.5313092979127134, "grad_norm": 3.021700620651245, "learning_rate": 8.783881343181055e-05, "loss": 2.3725, "step": 280 }, { "epoch": 0.5351043643263758, "grad_norm": 3.3499674797058105, "learning_rate": 8.763291677828838e-05, "loss": 2.2391, "step": 282 }, { "epoch": 0.538899430740038, "grad_norm": 3.2401561737060547, "learning_rate": 8.742553740855506e-05, "loss": 2.4657, "step": 284 }, { "epoch": 0.5426944971537002, "grad_norm": 2.584604263305664, "learning_rate": 8.721668349324364e-05, "loss": 2.472, "step": 286 }, { "epoch": 0.5464895635673624, "grad_norm": 2.86765456199646, "learning_rate": 8.700636326108342e-05, "loss": 2.5246, "step": 288 }, { "epoch": 0.5502846299810247, "grad_norm": 3.742485284805298, "learning_rate": 8.679458499857582e-05, "loss": 2.3875, "step": 290 }, { "epoch": 0.5540796963946869, "grad_norm": 2.3582913875579834, "learning_rate": 8.658135704966786e-05, "loss": 2.3662, "step": 292 }, { "epoch": 0.5578747628083491, "grad_norm": 2.188462495803833, "learning_rate": 8.636668781542336e-05, "loss": 2.2918, "step": 294 }, { "epoch": 0.5616698292220114, "grad_norm": 3.121817111968994, "learning_rate": 8.615058575369202e-05, "loss": 2.3115, "step": 296 }, { "epoch": 0.5654648956356736, "grad_norm": 3.2461938858032227, "learning_rate": 8.593305937877614e-05, "loss": 2.3089, "step": 298 }, { "epoch": 0.5692599620493358, "grad_norm": 2.2377259731292725, "learning_rate": 8.571411726109519e-05, "loss": 2.4239, "step": 300 }, { "epoch": 0.573055028462998, "grad_norm": 2.2074265480041504, "learning_rate": 8.549376802684812e-05, "loss": 2.4559, "step": 302 }, { "epoch": 0.5768500948766604, "grad_norm": 2.276872396469116, "learning_rate": 8.527202035767349e-05, "loss": 2.4151, "step": 304 }, { "epoch": 0.5806451612903226, "grad_norm": 2.8724167346954346, "learning_rate": 8.504888299030747e-05, "loss": 2.3258, "step": 306 }, { "epoch": 0.5844402277039848, "grad_norm": 3.028040647506714, "learning_rate": 8.482436471623951e-05, "loss": 2.3436, "step": 308 }, { "epoch": 0.5882352941176471, "grad_norm": 2.2716915607452393, "learning_rate": 8.459847438136605e-05, "loss": 2.207, "step": 310 }, { "epoch": 0.5920303605313093, "grad_norm": 2.3537518978118896, "learning_rate": 8.437122088564198e-05, "loss": 2.2379, "step": 312 }, { "epoch": 0.5958254269449715, "grad_norm": 2.7615747451782227, "learning_rate": 8.414261318272996e-05, "loss": 2.353, "step": 314 }, { "epoch": 0.5996204933586338, "grad_norm": 2.191373586654663, "learning_rate": 8.391266027964771e-05, "loss": 2.406, "step": 316 }, { "epoch": 0.603415559772296, "grad_norm": 2.1172335147857666, "learning_rate": 8.368137123641302e-05, "loss": 2.287, "step": 318 }, { "epoch": 0.6072106261859582, "grad_norm": 2.387768030166626, "learning_rate": 8.344875516568695e-05, "loss": 2.3713, "step": 320 }, { "epoch": 0.6110056925996205, "grad_norm": 2.8590333461761475, "learning_rate": 8.321482123241464e-05, "loss": 2.2492, "step": 322 }, { "epoch": 0.6148007590132827, "grad_norm": 2.2510037422180176, "learning_rate": 8.297957865346437e-05, "loss": 2.3075, "step": 324 }, { "epoch": 0.618595825426945, "grad_norm": 2.747525930404663, "learning_rate": 8.274303669726426e-05, "loss": 2.089, "step": 326 }, { "epoch": 0.6223908918406073, "grad_norm": 2.4523816108703613, "learning_rate": 8.250520468343722e-05, "loss": 2.2978, "step": 328 }, { "epoch": 0.6261859582542695, "grad_norm": 2.1454994678497314, "learning_rate": 8.226609198243372e-05, "loss": 2.3123, "step": 330 }, { "epoch": 0.6299810246679317, "grad_norm": 2.733635663986206, "learning_rate": 8.202570801516256e-05, "loss": 2.2186, "step": 332 }, { "epoch": 0.6337760910815939, "grad_norm": 2.4969663619995117, "learning_rate": 8.178406225261981e-05, "loss": 2.2239, "step": 334 }, { "epoch": 0.6375711574952562, "grad_norm": 3.0612945556640625, "learning_rate": 8.15411642155155e-05, "loss": 2.2033, "step": 336 }, { "epoch": 0.6413662239089184, "grad_norm": 3.4063143730163574, "learning_rate": 8.129702347389865e-05, "loss": 2.3211, "step": 338 }, { "epoch": 0.6451612903225806, "grad_norm": 2.2077441215515137, "learning_rate": 8.105164964678009e-05, "loss": 2.5048, "step": 340 }, { "epoch": 0.6489563567362429, "grad_norm": 2.7867558002471924, "learning_rate": 8.080505240175363e-05, "loss": 2.3034, "step": 342 }, { "epoch": 0.6527514231499051, "grad_norm": 2.510138511657715, "learning_rate": 8.055724145461495e-05, "loss": 2.1329, "step": 344 }, { "epoch": 0.6565464895635673, "grad_norm": 2.863159418106079, "learning_rate": 8.030822656897902e-05, "loss": 2.2401, "step": 346 }, { "epoch": 0.6603415559772297, "grad_norm": 2.664177179336548, "learning_rate": 8.005801755589532e-05, "loss": 2.1028, "step": 348 }, { "epoch": 0.6641366223908919, "grad_norm": 2.095933675765991, "learning_rate": 7.980662427346127e-05, "loss": 1.9901, "step": 350 }, { "epoch": 0.6679316888045541, "grad_norm": 2.5369749069213867, "learning_rate": 7.955405662643384e-05, "loss": 2.0314, "step": 352 }, { "epoch": 0.6717267552182163, "grad_norm": 2.400635242462158, "learning_rate": 7.930032456583931e-05, "loss": 2.2097, "step": 354 }, { "epoch": 0.6755218216318786, "grad_norm": 2.1800007820129395, "learning_rate": 7.904543808858127e-05, "loss": 2.1188, "step": 356 }, { "epoch": 0.6793168880455408, "grad_norm": 2.3743057250976562, "learning_rate": 7.878940723704664e-05, "loss": 2.1496, "step": 358 }, { "epoch": 0.683111954459203, "grad_norm": 2.9855995178222656, "learning_rate": 7.853224209871007e-05, "loss": 2.2357, "step": 360 }, { "epoch": 0.6869070208728653, "grad_norm": 2.130502700805664, "learning_rate": 7.82739528057365e-05, "loss": 2.1077, "step": 362 }, { "epoch": 0.6907020872865275, "grad_norm": 2.2894301414489746, "learning_rate": 7.801454953458193e-05, "loss": 2.1231, "step": 364 }, { "epoch": 0.6944971537001897, "grad_norm": 2.3042802810668945, "learning_rate": 7.775404250559249e-05, "loss": 2.3711, "step": 366 }, { "epoch": 0.698292220113852, "grad_norm": 2.517493724822998, "learning_rate": 7.749244198260175e-05, "loss": 2.2039, "step": 368 }, { "epoch": 0.7020872865275142, "grad_norm": 2.50030255317688, "learning_rate": 7.722975827252638e-05, "loss": 2.3868, "step": 370 }, { "epoch": 0.7058823529411765, "grad_norm": 2.252354621887207, "learning_rate": 7.696600172495997e-05, "loss": 2.2204, "step": 372 }, { "epoch": 0.7096774193548387, "grad_norm": 2.182471990585327, "learning_rate": 7.670118273176534e-05, "loss": 2.2011, "step": 374 }, { "epoch": 0.713472485768501, "grad_norm": 2.4250662326812744, "learning_rate": 7.643531172666513e-05, "loss": 2.2251, "step": 376 }, { "epoch": 0.7172675521821632, "grad_norm": 2.91322922706604, "learning_rate": 7.616839918483061e-05, "loss": 2.2859, "step": 378 }, { "epoch": 0.7210626185958254, "grad_norm": 2.963437080383301, "learning_rate": 7.590045562246902e-05, "loss": 2.1708, "step": 380 }, { "epoch": 0.7248576850094877, "grad_norm": 3.132415771484375, "learning_rate": 7.563149159640929e-05, "loss": 2.1548, "step": 382 }, { "epoch": 0.7286527514231499, "grad_norm": 2.5823440551757812, "learning_rate": 7.5361517703686e-05, "loss": 2.1451, "step": 384 }, { "epoch": 0.7324478178368121, "grad_norm": 2.6822919845581055, "learning_rate": 7.509054458112202e-05, "loss": 2.3556, "step": 386 }, { "epoch": 0.7362428842504743, "grad_norm": 2.1513512134552, "learning_rate": 7.481858290490917e-05, "loss": 2.1135, "step": 388 }, { "epoch": 0.7400379506641366, "grad_norm": 2.6822762489318848, "learning_rate": 7.45456433901879e-05, "loss": 2.2004, "step": 390 }, { "epoch": 0.7438330170777988, "grad_norm": 2.8601059913635254, "learning_rate": 7.427173679062484e-05, "loss": 2.187, "step": 392 }, { "epoch": 0.7476280834914611, "grad_norm": 2.706089496612549, "learning_rate": 7.399687389798933e-05, "loss": 2.2509, "step": 394 }, { "epoch": 0.7514231499051234, "grad_norm": 3.9849417209625244, "learning_rate": 7.372106554172802e-05, "loss": 2.1557, "step": 396 }, { "epoch": 0.7552182163187856, "grad_norm": 2.7597997188568115, "learning_rate": 7.344432258853841e-05, "loss": 2.1147, "step": 398 }, { "epoch": 0.7590132827324478, "grad_norm": 2.172595500946045, "learning_rate": 7.316665594194053e-05, "loss": 2.2463, "step": 400 }, { "epoch": 0.7628083491461101, "grad_norm": 1.9581433534622192, "learning_rate": 7.288807654184747e-05, "loss": 2.1798, "step": 402 }, { "epoch": 0.7666034155597723, "grad_norm": 2.4689390659332275, "learning_rate": 7.260859536413429e-05, "loss": 2.1782, "step": 404 }, { "epoch": 0.7703984819734345, "grad_norm": 3.047780752182007, "learning_rate": 7.232822342020557e-05, "loss": 2.0098, "step": 406 }, { "epoch": 0.7741935483870968, "grad_norm": 3.0638253688812256, "learning_rate": 7.204697175656165e-05, "loss": 2.2135, "step": 408 }, { "epoch": 0.777988614800759, "grad_norm": 2.14133882522583, "learning_rate": 7.176485145436325e-05, "loss": 2.1974, "step": 410 }, { "epoch": 0.7817836812144212, "grad_norm": 3.42946720123291, "learning_rate": 7.148187362899505e-05, "loss": 2.2132, "step": 412 }, { "epoch": 0.7855787476280834, "grad_norm": 2.9244472980499268, "learning_rate": 7.119804942962762e-05, "loss": 1.9828, "step": 414 }, { "epoch": 0.7893738140417458, "grad_norm": 2.4446115493774414, "learning_rate": 7.091339003877826e-05, "loss": 2.2205, "step": 416 }, { "epoch": 0.793168880455408, "grad_norm": 2.8077917098999023, "learning_rate": 7.062790667187029e-05, "loss": 2.0921, "step": 418 }, { "epoch": 0.7969639468690702, "grad_norm": 2.168386697769165, "learning_rate": 7.034161057679127e-05, "loss": 2.1198, "step": 420 }, { "epoch": 0.8007590132827325, "grad_norm": 2.11982798576355, "learning_rate": 7.005451303344979e-05, "loss": 2.0766, "step": 422 }, { "epoch": 0.8045540796963947, "grad_norm": 2.1477482318878174, "learning_rate": 6.976662535333107e-05, "loss": 2.0088, "step": 424 }, { "epoch": 0.8083491461100569, "grad_norm": 2.2626473903656006, "learning_rate": 6.947795887905127e-05, "loss": 1.974, "step": 426 }, { "epoch": 0.8121442125237192, "grad_norm": 2.2598705291748047, "learning_rate": 6.918852498391063e-05, "loss": 2.0463, "step": 428 }, { "epoch": 0.8159392789373814, "grad_norm": 2.132086992263794, "learning_rate": 6.889833507144532e-05, "loss": 2.2507, "step": 430 }, { "epoch": 0.8197343453510436, "grad_norm": 2.307889223098755, "learning_rate": 6.860740057497823e-05, "loss": 2.1272, "step": 432 }, { "epoch": 0.8235294117647058, "grad_norm": 2.638432264328003, "learning_rate": 6.831573295716837e-05, "loss": 2.1772, "step": 434 }, { "epoch": 0.8273244781783681, "grad_norm": 2.552748680114746, "learning_rate": 6.802334370955941e-05, "loss": 2.0331, "step": 436 }, { "epoch": 0.8311195445920304, "grad_norm": 2.7252237796783447, "learning_rate": 6.773024435212678e-05, "loss": 2.1394, "step": 438 }, { "epoch": 0.8349146110056926, "grad_norm": 2.4146969318389893, "learning_rate": 6.743644643282388e-05, "loss": 2.2107, "step": 440 }, { "epoch": 0.8387096774193549, "grad_norm": 2.2211036682128906, "learning_rate": 6.714196152712704e-05, "loss": 2.2715, "step": 442 }, { "epoch": 0.8425047438330171, "grad_norm": 2.484332323074341, "learning_rate": 6.684680123757949e-05, "loss": 2.0996, "step": 444 }, { "epoch": 0.8462998102466793, "grad_norm": 2.478273868560791, "learning_rate": 6.65509771933342e-05, "loss": 1.9356, "step": 446 }, { "epoch": 0.8500948766603416, "grad_norm": 2.3699731826782227, "learning_rate": 6.625450104969572e-05, "loss": 2.1316, "step": 448 }, { "epoch": 0.8538899430740038, "grad_norm": 2.419955253601074, "learning_rate": 6.595738448766095e-05, "loss": 2.1218, "step": 450 }, { "epoch": 0.857685009487666, "grad_norm": 2.293102979660034, "learning_rate": 6.565963921345895e-05, "loss": 2.0795, "step": 452 }, { "epoch": 0.8614800759013282, "grad_norm": 2.706804037094116, "learning_rate": 6.536127695808964e-05, "loss": 2.0764, "step": 454 }, { "epoch": 0.8652751423149905, "grad_norm": 2.3462438583374023, "learning_rate": 6.506230947686172e-05, "loss": 2.0065, "step": 456 }, { "epoch": 0.8690702087286527, "grad_norm": 2.636672258377075, "learning_rate": 6.47627485489294e-05, "loss": 2.1865, "step": 458 }, { "epoch": 0.872865275142315, "grad_norm": 2.1746952533721924, "learning_rate": 6.446260597682839e-05, "loss": 1.9698, "step": 460 }, { "epoch": 0.8766603415559773, "grad_norm": 2.1193501949310303, "learning_rate": 6.416189358601088e-05, "loss": 1.9559, "step": 462 }, { "epoch": 0.8804554079696395, "grad_norm": 2.4427428245544434, "learning_rate": 6.386062322437954e-05, "loss": 1.9442, "step": 464 }, { "epoch": 0.8842504743833017, "grad_norm": 2.3021249771118164, "learning_rate": 6.355880676182086e-05, "loss": 2.1573, "step": 466 }, { "epoch": 0.888045540796964, "grad_norm": 2.224827527999878, "learning_rate": 6.325645608973735e-05, "loss": 2.0547, "step": 468 }, { "epoch": 0.8918406072106262, "grad_norm": 2.451848030090332, "learning_rate": 6.295358312057914e-05, "loss": 2.0589, "step": 470 }, { "epoch": 0.8956356736242884, "grad_norm": 2.2723612785339355, "learning_rate": 6.26501997873745e-05, "loss": 1.9392, "step": 472 }, { "epoch": 0.8994307400379506, "grad_norm": 2.340785503387451, "learning_rate": 6.234631804325981e-05, "loss": 2.1117, "step": 474 }, { "epoch": 0.9032258064516129, "grad_norm": 2.963451385498047, "learning_rate": 6.204194986100857e-05, "loss": 2.0476, "step": 476 }, { "epoch": 0.9070208728652751, "grad_norm": 2.2216341495513916, "learning_rate": 6.173710723255966e-05, "loss": 2.154, "step": 478 }, { "epoch": 0.9108159392789373, "grad_norm": 1.8229964971542358, "learning_rate": 6.143180216854487e-05, "loss": 1.8867, "step": 480 }, { "epoch": 0.9146110056925996, "grad_norm": 2.198394775390625, "learning_rate": 6.112604669781572e-05, "loss": 2.0622, "step": 482 }, { "epoch": 0.9184060721062619, "grad_norm": 1.7389253377914429, "learning_rate": 6.081985286696949e-05, "loss": 2.1547, "step": 484 }, { "epoch": 0.9222011385199241, "grad_norm": 2.4822840690612793, "learning_rate": 6.051323273987463e-05, "loss": 1.9973, "step": 486 }, { "epoch": 0.9259962049335864, "grad_norm": 2.384826183319092, "learning_rate": 6.020619839719538e-05, "loss": 2.0659, "step": 488 }, { "epoch": 0.9297912713472486, "grad_norm": 1.8725968599319458, "learning_rate": 5.989876193591589e-05, "loss": 2.082, "step": 490 }, { "epoch": 0.9335863377609108, "grad_norm": 2.7569420337677, "learning_rate": 5.959093546886356e-05, "loss": 2.0654, "step": 492 }, { "epoch": 0.937381404174573, "grad_norm": 1.96099853515625, "learning_rate": 5.928273112423177e-05, "loss": 1.9152, "step": 494 }, { "epoch": 0.9411764705882353, "grad_norm": 2.6206164360046387, "learning_rate": 5.897416104510211e-05, "loss": 2.0493, "step": 496 }, { "epoch": 0.9449715370018975, "grad_norm": 1.8731147050857544, "learning_rate": 5.866523738896587e-05, "loss": 2.0, "step": 498 }, { "epoch": 0.9487666034155597, "grad_norm": 2.0109100341796875, "learning_rate": 5.835597232724511e-05, "loss": 2.0603, "step": 500 }, { "epoch": 0.952561669829222, "grad_norm": 2.412135362625122, "learning_rate": 5.804637804481306e-05, "loss": 2.0034, "step": 502 }, { "epoch": 0.9563567362428842, "grad_norm": 1.894960880279541, "learning_rate": 5.773646673951406e-05, "loss": 2.0708, "step": 504 }, { "epoch": 0.9601518026565465, "grad_norm": 3.9883475303649902, "learning_rate": 5.742625062168303e-05, "loss": 2.1407, "step": 506 }, { "epoch": 0.9639468690702088, "grad_norm": 2.3427770137786865, "learning_rate": 5.7115741913664264e-05, "loss": 1.9919, "step": 508 }, { "epoch": 0.967741935483871, "grad_norm": 2.0752058029174805, "learning_rate": 5.680495284933e-05, "loss": 2.0542, "step": 510 }, { "epoch": 0.9715370018975332, "grad_norm": 1.9299733638763428, "learning_rate": 5.649389567359831e-05, "loss": 2.0434, "step": 512 }, { "epoch": 0.9753320683111955, "grad_norm": 2.1606550216674805, "learning_rate": 5.6182582641950764e-05, "loss": 1.9045, "step": 514 }, { "epoch": 0.9791271347248577, "grad_norm": 1.9549524784088135, "learning_rate": 5.58710260199495e-05, "loss": 1.9951, "step": 516 }, { "epoch": 0.9829222011385199, "grad_norm": 1.8001773357391357, "learning_rate": 5.555923808275395e-05, "loss": 1.8519, "step": 518 }, { "epoch": 0.9867172675521821, "grad_norm": 2.168572187423706, "learning_rate": 5.5247231114637256e-05, "loss": 1.8828, "step": 520 }, { "epoch": 0.9905123339658444, "grad_norm": 2.0529568195343018, "learning_rate": 5.4935017408502274e-05, "loss": 2.0863, "step": 522 }, { "epoch": 0.9943074003795066, "grad_norm": 2.078700304031372, "learning_rate": 5.462260926539722e-05, "loss": 2.0463, "step": 524 }, { "epoch": 0.9981024667931688, "grad_norm": 2.1271212100982666, "learning_rate": 5.431001899403098e-05, "loss": 2.0645, "step": 526 } ], "logging_steps": 2, "max_steps": 1054, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 527, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.912185436248474e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }