{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 1250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0032, "grad_norm": 60.66320037841797, "learning_rate": 3.1746031746031746e-06, "loss": 6.1881, "step": 2 }, { "epoch": 0.0064, "grad_norm": 42.748294830322266, "learning_rate": 6.349206349206349e-06, "loss": 5.7668, "step": 4 }, { "epoch": 0.0096, "grad_norm": 13.887235641479492, "learning_rate": 9.523809523809523e-06, "loss": 5.0841, "step": 6 }, { "epoch": 0.0128, "grad_norm": 12.488709449768066, "learning_rate": 1.2698412698412699e-05, "loss": 4.7192, "step": 8 }, { "epoch": 0.016, "grad_norm": 7.208029270172119, "learning_rate": 1.5873015873015872e-05, "loss": 4.5679, "step": 10 }, { "epoch": 0.0192, "grad_norm": 7.34755277633667, "learning_rate": 1.9047619047619046e-05, "loss": 4.368, "step": 12 }, { "epoch": 0.0224, "grad_norm": 7.489256858825684, "learning_rate": 2.2222222222222223e-05, "loss": 4.1398, "step": 14 }, { "epoch": 0.0256, "grad_norm": 4.802424430847168, "learning_rate": 2.5396825396825397e-05, "loss": 4.0532, "step": 16 }, { "epoch": 0.0288, "grad_norm": 4.138615131378174, "learning_rate": 2.857142857142857e-05, "loss": 3.7559, "step": 18 }, { "epoch": 0.032, "grad_norm": 5.362161636352539, "learning_rate": 3.1746031746031745e-05, "loss": 3.4854, "step": 20 }, { "epoch": 0.0352, "grad_norm": 3.878138303756714, "learning_rate": 3.492063492063492e-05, "loss": 3.5869, "step": 22 }, { "epoch": 0.0384, "grad_norm": 3.480282783508301, "learning_rate": 3.809523809523809e-05, "loss": 3.2734, "step": 24 }, { "epoch": 0.0416, "grad_norm": 3.527137279510498, "learning_rate": 4.126984126984127e-05, "loss": 3.1484, "step": 26 }, { "epoch": 0.0448, "grad_norm": 3.094705820083618, "learning_rate": 4.4444444444444447e-05, "loss": 3.0356, "step": 28 }, { "epoch": 0.048, "grad_norm": 4.064608573913574, "learning_rate": 4.761904761904762e-05, "loss": 3.0157, "step": 30 }, { "epoch": 0.0512, "grad_norm": 3.166187047958374, "learning_rate": 5.0793650793650794e-05, "loss": 2.9229, "step": 32 }, { "epoch": 0.0544, "grad_norm": 2.909856081008911, "learning_rate": 5.396825396825397e-05, "loss": 2.8386, "step": 34 }, { "epoch": 0.0576, "grad_norm": 3.2195472717285156, "learning_rate": 5.714285714285714e-05, "loss": 2.7819, "step": 36 }, { "epoch": 0.0608, "grad_norm": 2.603515625, "learning_rate": 6.0317460317460316e-05, "loss": 2.7116, "step": 38 }, { "epoch": 0.064, "grad_norm": 3.353161096572876, "learning_rate": 6.349206349206349e-05, "loss": 2.7137, "step": 40 }, { "epoch": 0.0672, "grad_norm": 2.7278943061828613, "learning_rate": 6.666666666666667e-05, "loss": 2.6213, "step": 42 }, { "epoch": 0.0704, "grad_norm": 3.275580883026123, "learning_rate": 6.984126984126984e-05, "loss": 2.5585, "step": 44 }, { "epoch": 0.0736, "grad_norm": 2.934300422668457, "learning_rate": 7.301587301587302e-05, "loss": 2.6126, "step": 46 }, { "epoch": 0.0768, "grad_norm": 2.483461380004883, "learning_rate": 7.619047619047618e-05, "loss": 2.4616, "step": 48 }, { "epoch": 0.08, "grad_norm": 2.5167787075042725, "learning_rate": 7.936507936507937e-05, "loss": 2.4316, "step": 50 }, { "epoch": 0.0832, "grad_norm": 2.211185932159424, "learning_rate": 8.253968253968255e-05, "loss": 2.63, "step": 52 }, { "epoch": 0.0864, "grad_norm": 3.2666075229644775, "learning_rate": 8.571428571428571e-05, "loss": 2.4892, "step": 54 }, { "epoch": 0.0896, "grad_norm": 2.849605083465576, "learning_rate": 8.888888888888889e-05, "loss": 2.4584, "step": 56 }, { "epoch": 0.0928, "grad_norm": 3.0103588104248047, "learning_rate": 9.206349206349206e-05, "loss": 2.2766, "step": 58 }, { "epoch": 0.096, "grad_norm": 2.306534767150879, "learning_rate": 9.523809523809524e-05, "loss": 2.4357, "step": 60 }, { "epoch": 0.0992, "grad_norm": 2.3400485515594482, "learning_rate": 9.841269841269841e-05, "loss": 2.4118, "step": 62 }, { "epoch": 0.1024, "grad_norm": 2.583407163619995, "learning_rate": 9.99998248790669e-05, "loss": 2.2326, "step": 64 }, { "epoch": 0.1056, "grad_norm": 2.57265567779541, "learning_rate": 9.999842391896222e-05, "loss": 2.2923, "step": 66 }, { "epoch": 0.1088, "grad_norm": 2.307471990585327, "learning_rate": 9.999562203800676e-05, "loss": 2.2235, "step": 68 }, { "epoch": 0.112, "grad_norm": 2.357084035873413, "learning_rate": 9.999141931470729e-05, "loss": 2.2896, "step": 70 }, { "epoch": 0.1152, "grad_norm": 2.6181581020355225, "learning_rate": 9.998581586682116e-05, "loss": 2.3015, "step": 72 }, { "epoch": 0.1184, "grad_norm": 2.253117799758911, "learning_rate": 9.997881185135307e-05, "loss": 2.1824, "step": 74 }, { "epoch": 0.1216, "grad_norm": 2.546729326248169, "learning_rate": 9.997040746455062e-05, "loss": 2.1502, "step": 76 }, { "epoch": 0.1248, "grad_norm": 2.4144699573516846, "learning_rate": 9.996060294189887e-05, "loss": 2.3715, "step": 78 }, { "epoch": 0.128, "grad_norm": 2.3093016147613525, "learning_rate": 9.994939855811362e-05, "loss": 2.2753, "step": 80 }, { "epoch": 0.1312, "grad_norm": 2.5628409385681152, "learning_rate": 9.993679462713395e-05, "loss": 2.3152, "step": 82 }, { "epoch": 0.1344, "grad_norm": 2.549136161804199, "learning_rate": 9.992279150211314e-05, "loss": 2.1171, "step": 84 }, { "epoch": 0.1376, "grad_norm": 2.4570610523223877, "learning_rate": 9.990738957540896e-05, "loss": 2.2414, "step": 86 }, { "epoch": 0.1408, "grad_norm": 2.256564140319824, "learning_rate": 9.989058927857263e-05, "loss": 2.1324, "step": 88 }, { "epoch": 0.144, "grad_norm": 2.818751811981201, "learning_rate": 9.987239108233668e-05, "loss": 2.184, "step": 90 }, { "epoch": 0.1472, "grad_norm": 2.432871103286743, "learning_rate": 9.985279549660185e-05, "loss": 2.1899, "step": 92 }, { "epoch": 0.1504, "grad_norm": 2.1021323204040527, "learning_rate": 9.983180307042274e-05, "loss": 2.1064, "step": 94 }, { "epoch": 0.1536, "grad_norm": 2.7487058639526367, "learning_rate": 9.980941439199246e-05, "loss": 2.2197, "step": 96 }, { "epoch": 0.1568, "grad_norm": 2.82835054397583, "learning_rate": 9.97856300886261e-05, "loss": 2.2048, "step": 98 }, { "epoch": 0.16, "grad_norm": 2.25872802734375, "learning_rate": 9.976045082674319e-05, "loss": 2.1002, "step": 100 }, { "epoch": 0.1632, "grad_norm": 2.040614366531372, "learning_rate": 9.973387731184902e-05, "loss": 2.1031, "step": 102 }, { "epoch": 0.1664, "grad_norm": 2.437248706817627, "learning_rate": 9.97059102885149e-05, "loss": 2.1416, "step": 104 }, { "epoch": 0.1696, "grad_norm": 2.0928878784179688, "learning_rate": 9.967655054035727e-05, "loss": 2.1576, "step": 106 }, { "epoch": 0.1728, "grad_norm": 2.2243545055389404, "learning_rate": 9.964579889001569e-05, "loss": 1.9863, "step": 108 }, { "epoch": 0.176, "grad_norm": 2.1860439777374268, "learning_rate": 9.961365619912989e-05, "loss": 2.0016, "step": 110 }, { "epoch": 0.1792, "grad_norm": 2.527122735977173, "learning_rate": 9.95801233683156e-05, "loss": 2.1272, "step": 112 }, { "epoch": 0.1824, "grad_norm": 1.8613876104354858, "learning_rate": 9.954520133713924e-05, "loss": 2.2001, "step": 114 }, { "epoch": 0.1856, "grad_norm": 2.115910530090332, "learning_rate": 9.950889108409172e-05, "loss": 2.0871, "step": 116 }, { "epoch": 0.1888, "grad_norm": 2.361309051513672, "learning_rate": 9.947119362656092e-05, "loss": 2.017, "step": 118 }, { "epoch": 0.192, "grad_norm": 2.09470272064209, "learning_rate": 9.94321100208032e-05, "loss": 2.1847, "step": 120 }, { "epoch": 0.1952, "grad_norm": 1.9747451543807983, "learning_rate": 9.939164136191384e-05, "loss": 2.324, "step": 122 }, { "epoch": 0.1984, "grad_norm": 1.8229223489761353, "learning_rate": 9.934978878379636e-05, "loss": 2.1454, "step": 124 }, { "epoch": 0.2016, "grad_norm": 1.9113378524780273, "learning_rate": 9.930655345913071e-05, "loss": 2.0096, "step": 126 }, { "epoch": 0.2048, "grad_norm": 2.385289192199707, "learning_rate": 9.926193659934043e-05, "loss": 2.1029, "step": 128 }, { "epoch": 0.208, "grad_norm": 2.101463794708252, "learning_rate": 9.921593945455869e-05, "loss": 2.0172, "step": 130 }, { "epoch": 0.2112, "grad_norm": 2.2676024436950684, "learning_rate": 9.916856331359335e-05, "loss": 1.9966, "step": 132 }, { "epoch": 0.2144, "grad_norm": 2.0168168544769287, "learning_rate": 9.911980950389067e-05, "loss": 2.1807, "step": 134 }, { "epoch": 0.2176, "grad_norm": 2.1054186820983887, "learning_rate": 9.906967939149831e-05, "loss": 1.9759, "step": 136 }, { "epoch": 0.2208, "grad_norm": 2.3354573249816895, "learning_rate": 9.901817438102695e-05, "loss": 1.995, "step": 138 }, { "epoch": 0.224, "grad_norm": 2.2721822261810303, "learning_rate": 9.896529591561093e-05, "loss": 2.2239, "step": 140 }, { "epoch": 0.2272, "grad_norm": 1.9209738969802856, "learning_rate": 9.891104547686782e-05, "loss": 2.0051, "step": 142 }, { "epoch": 0.2304, "grad_norm": 1.978259801864624, "learning_rate": 9.8855424584857e-05, "loss": 2.0367, "step": 144 }, { "epoch": 0.2336, "grad_norm": 1.9169765710830688, "learning_rate": 9.879843479803691e-05, "loss": 2.1009, "step": 146 }, { "epoch": 0.2368, "grad_norm": 1.8380109071731567, "learning_rate": 9.874007771322151e-05, "loss": 2.1456, "step": 148 }, { "epoch": 0.24, "grad_norm": 2.1143693923950195, "learning_rate": 9.868035496553546e-05, "loss": 1.925, "step": 150 }, { "epoch": 0.2432, "grad_norm": 1.8774141073226929, "learning_rate": 9.86192682283684e-05, "loss": 1.9616, "step": 152 }, { "epoch": 0.2464, "grad_norm": 2.3532581329345703, "learning_rate": 9.855681921332793e-05, "loss": 2.0289, "step": 154 }, { "epoch": 0.2496, "grad_norm": 2.1421797275543213, "learning_rate": 9.849300967019175e-05, "loss": 2.0153, "step": 156 }, { "epoch": 0.2528, "grad_norm": 2.0029852390289307, "learning_rate": 9.84278413868586e-05, "loss": 2.0726, "step": 158 }, { "epoch": 0.256, "grad_norm": 2.0344998836517334, "learning_rate": 9.836131618929819e-05, "loss": 2.0215, "step": 160 }, { "epoch": 0.2592, "grad_norm": 1.8781356811523438, "learning_rate": 9.82934359415e-05, "loss": 2.0622, "step": 162 }, { "epoch": 0.2624, "grad_norm": 1.9795514345169067, "learning_rate": 9.822420254542108e-05, "loss": 2.0249, "step": 164 }, { "epoch": 0.2656, "grad_norm": 2.012881278991699, "learning_rate": 9.815361794093272e-05, "loss": 1.9815, "step": 166 }, { "epoch": 0.2688, "grad_norm": 2.264941453933716, "learning_rate": 9.808168410576617e-05, "loss": 2.0232, "step": 168 }, { "epoch": 0.272, "grad_norm": 2.4006729125976562, "learning_rate": 9.800840305545715e-05, "loss": 2.0844, "step": 170 }, { "epoch": 0.2752, "grad_norm": 2.0443308353424072, "learning_rate": 9.793377684328939e-05, "loss": 2.2302, "step": 172 }, { "epoch": 0.2784, "grad_norm": 2.164515972137451, "learning_rate": 9.785780756023714e-05, "loss": 1.9808, "step": 174 }, { "epoch": 0.2816, "grad_norm": 1.9512875080108643, "learning_rate": 9.778049733490655e-05, "loss": 2.0968, "step": 176 }, { "epoch": 0.2848, "grad_norm": 1.9964834451675415, "learning_rate": 9.770184833347606e-05, "loss": 1.9889, "step": 178 }, { "epoch": 0.288, "grad_norm": 1.9380826950073242, "learning_rate": 9.762186275963563e-05, "loss": 1.9766, "step": 180 }, { "epoch": 0.2912, "grad_norm": 1.943260669708252, "learning_rate": 9.754054285452506e-05, "loss": 1.9298, "step": 182 }, { "epoch": 0.2944, "grad_norm": 2.1821844577789307, "learning_rate": 9.745789089667121e-05, "loss": 2.1202, "step": 184 }, { "epoch": 0.2976, "grad_norm": 1.7526299953460693, "learning_rate": 9.737390920192408e-05, "loss": 2.0635, "step": 186 }, { "epoch": 0.3008, "grad_norm": 2.229520082473755, "learning_rate": 9.7288600123392e-05, "loss": 1.9582, "step": 188 }, { "epoch": 0.304, "grad_norm": 2.3614768981933594, "learning_rate": 9.720196605137565e-05, "loss": 2.0278, "step": 190 }, { "epoch": 0.3072, "grad_norm": 2.1270534992218018, "learning_rate": 9.71140094133011e-05, "loss": 2.1036, "step": 192 }, { "epoch": 0.3104, "grad_norm": 2.2983131408691406, "learning_rate": 9.702473267365182e-05, "loss": 2.0558, "step": 194 }, { "epoch": 0.3136, "grad_norm": 1.9561504125595093, "learning_rate": 9.693413833389956e-05, "loss": 1.9173, "step": 196 }, { "epoch": 0.3168, "grad_norm": 2.234160900115967, "learning_rate": 9.684222893243431e-05, "loss": 2.1188, "step": 198 }, { "epoch": 0.32, "grad_norm": 1.883965015411377, "learning_rate": 9.674900704449324e-05, "loss": 1.9584, "step": 200 }, { "epoch": 0.3232, "grad_norm": 1.7237235307693481, "learning_rate": 9.665447528208836e-05, "loss": 1.9351, "step": 202 }, { "epoch": 0.3264, "grad_norm": 2.0437498092651367, "learning_rate": 9.655863629393351e-05, "loss": 1.9079, "step": 204 }, { "epoch": 0.3296, "grad_norm": 2.014540195465088, "learning_rate": 9.64614927653701e-05, "loss": 1.8612, "step": 206 }, { "epoch": 0.3328, "grad_norm": 2.379439115524292, "learning_rate": 9.636304741829181e-05, "loss": 1.9976, "step": 208 }, { "epoch": 0.336, "grad_norm": 1.962538242340088, "learning_rate": 9.626330301106837e-05, "loss": 1.932, "step": 210 }, { "epoch": 0.3392, "grad_norm": 1.862244725227356, "learning_rate": 9.616226233846828e-05, "loss": 1.8992, "step": 212 }, { "epoch": 0.3424, "grad_norm": 1.7304776906967163, "learning_rate": 9.605992823158046e-05, "loss": 2.0777, "step": 214 }, { "epoch": 0.3456, "grad_norm": 2.2403054237365723, "learning_rate": 9.595630355773501e-05, "loss": 1.8658, "step": 216 }, { "epoch": 0.3488, "grad_norm": 3.3899903297424316, "learning_rate": 9.585139122042274e-05, "loss": 1.9963, "step": 218 }, { "epoch": 0.352, "grad_norm": 2.261810064315796, "learning_rate": 9.574519415921396e-05, "loss": 1.947, "step": 220 }, { "epoch": 0.3552, "grad_norm": 2.2053134441375732, "learning_rate": 9.5637715349676e-05, "loss": 2.0544, "step": 222 }, { "epoch": 0.3584, "grad_norm": 1.871773362159729, "learning_rate": 9.552895780328987e-05, "loss": 1.8976, "step": 224 }, { "epoch": 0.3616, "grad_norm": 1.6700202226638794, "learning_rate": 9.541892456736595e-05, "loss": 2.1166, "step": 226 }, { "epoch": 0.3648, "grad_norm": 1.9986639022827148, "learning_rate": 9.530761872495849e-05, "loss": 1.9311, "step": 228 }, { "epoch": 0.368, "grad_norm": 2.288973331451416, "learning_rate": 9.519504339477932e-05, "loss": 1.98, "step": 230 }, { "epoch": 0.3712, "grad_norm": 2.177896738052368, "learning_rate": 9.508120173111039e-05, "loss": 1.862, "step": 232 }, { "epoch": 0.3744, "grad_norm": 1.9860484600067139, "learning_rate": 9.496609692371548e-05, "loss": 1.9192, "step": 234 }, { "epoch": 0.3776, "grad_norm": 1.924127221107483, "learning_rate": 9.484973219775074e-05, "loss": 1.871, "step": 236 }, { "epoch": 0.3808, "grad_norm": 1.9022867679595947, "learning_rate": 9.473211081367436e-05, "loss": 1.9067, "step": 238 }, { "epoch": 0.384, "grad_norm": 1.7447446584701538, "learning_rate": 9.46132360671552e-05, "loss": 1.8984, "step": 240 }, { "epoch": 0.3872, "grad_norm": 2.809067487716675, "learning_rate": 9.449311128898049e-05, "loss": 1.8327, "step": 242 }, { "epoch": 0.3904, "grad_norm": 1.9946494102478027, "learning_rate": 9.437173984496246e-05, "loss": 1.9735, "step": 244 }, { "epoch": 0.3936, "grad_norm": 1.8834348917007446, "learning_rate": 9.424912513584401e-05, "loss": 2.0294, "step": 246 }, { "epoch": 0.3968, "grad_norm": 1.9426389932632446, "learning_rate": 9.412527059720352e-05, "loss": 1.9919, "step": 248 }, { "epoch": 0.4, "grad_norm": 1.823935627937317, "learning_rate": 9.400017969935848e-05, "loss": 1.8907, "step": 250 }, { "epoch": 0.4032, "grad_norm": 2.1048786640167236, "learning_rate": 9.387385594726829e-05, "loss": 1.8855, "step": 252 }, { "epoch": 0.4064, "grad_norm": 1.9253580570220947, "learning_rate": 9.374630288043614e-05, "loss": 2.0577, "step": 254 }, { "epoch": 0.4096, "grad_norm": 1.785396695137024, "learning_rate": 9.361752407280965e-05, "loss": 1.9675, "step": 256 }, { "epoch": 0.4128, "grad_norm": 1.9203846454620361, "learning_rate": 9.348752313268093e-05, "loss": 1.8934, "step": 258 }, { "epoch": 0.416, "grad_norm": 1.986392855644226, "learning_rate": 9.335630370258533e-05, "loss": 1.9838, "step": 260 }, { "epoch": 0.4192, "grad_norm": 1.953905463218689, "learning_rate": 9.322386945919946e-05, "loss": 1.7604, "step": 262 }, { "epoch": 0.4224, "grad_norm": 1.7314627170562744, "learning_rate": 9.309022411323816e-05, "loss": 2.0328, "step": 264 }, { "epoch": 0.4256, "grad_norm": 1.6745048761367798, "learning_rate": 9.295537140935049e-05, "loss": 1.9734, "step": 266 }, { "epoch": 0.4288, "grad_norm": 1.8622961044311523, "learning_rate": 9.281931512601485e-05, "loss": 1.9509, "step": 268 }, { "epoch": 0.432, "grad_norm": 2.014514684677124, "learning_rate": 9.26820590754331e-05, "loss": 1.8272, "step": 270 }, { "epoch": 0.4352, "grad_norm": 2.118647336959839, "learning_rate": 9.254360710342371e-05, "loss": 1.8347, "step": 272 }, { "epoch": 0.4384, "grad_norm": 2.04239821434021, "learning_rate": 9.240396308931407e-05, "loss": 1.8675, "step": 274 }, { "epoch": 0.4416, "grad_norm": 1.951341152191162, "learning_rate": 9.226313094583173e-05, "loss": 1.9559, "step": 276 }, { "epoch": 0.4448, "grad_norm": 1.7053275108337402, "learning_rate": 9.212111461899479e-05, "loss": 2.0715, "step": 278 }, { "epoch": 0.448, "grad_norm": 1.7789607048034668, "learning_rate": 9.197791808800135e-05, "loss": 1.89, "step": 280 }, { "epoch": 0.4512, "grad_norm": 1.8625364303588867, "learning_rate": 9.183354536511803e-05, "loss": 1.9809, "step": 282 }, { "epoch": 0.4544, "grad_norm": 1.6965309381484985, "learning_rate": 9.168800049556747e-05, "loss": 1.8365, "step": 284 }, { "epoch": 0.4576, "grad_norm": 2.1207497119903564, "learning_rate": 9.154128755741509e-05, "loss": 1.8314, "step": 286 }, { "epoch": 0.4608, "grad_norm": 1.8182010650634766, "learning_rate": 9.139341066145472e-05, "loss": 1.8906, "step": 288 }, { "epoch": 0.464, "grad_norm": 1.977777361869812, "learning_rate": 9.124437395109353e-05, "loss": 1.8562, "step": 290 }, { "epoch": 0.4672, "grad_norm": 1.9953404664993286, "learning_rate": 9.109418160223585e-05, "loss": 1.8364, "step": 292 }, { "epoch": 0.4704, "grad_norm": 1.9941433668136597, "learning_rate": 9.094283782316619e-05, "loss": 1.7585, "step": 294 }, { "epoch": 0.4736, "grad_norm": 1.9799609184265137, "learning_rate": 9.079034685443133e-05, "loss": 1.8669, "step": 296 }, { "epoch": 0.4768, "grad_norm": 1.755238652229309, "learning_rate": 9.063671296872149e-05, "loss": 1.8001, "step": 298 }, { "epoch": 0.48, "grad_norm": 2.059305429458618, "learning_rate": 9.048194047075069e-05, "loss": 1.9259, "step": 300 }, { "epoch": 0.4832, "grad_norm": 1.7116378545761108, "learning_rate": 9.032603369713596e-05, "loss": 1.6954, "step": 302 }, { "epoch": 0.4864, "grad_norm": 2.472815990447998, "learning_rate": 9.016899701627604e-05, "loss": 1.8413, "step": 304 }, { "epoch": 0.4896, "grad_norm": 1.8934400081634521, "learning_rate": 9.00108348282288e-05, "loss": 1.9545, "step": 306 }, { "epoch": 0.4928, "grad_norm": 2.147753953933716, "learning_rate": 8.985155156458811e-05, "loss": 1.7679, "step": 308 }, { "epoch": 0.496, "grad_norm": 2.2302675247192383, "learning_rate": 8.969115168835954e-05, "loss": 1.8257, "step": 310 }, { "epoch": 0.4992, "grad_norm": 1.6578640937805176, "learning_rate": 8.952963969383538e-05, "loss": 1.7151, "step": 312 }, { "epoch": 0.5024, "grad_norm": 1.754835844039917, "learning_rate": 8.93670201064687e-05, "loss": 2.0074, "step": 314 }, { "epoch": 0.5056, "grad_norm": 2.130150556564331, "learning_rate": 8.920329748274649e-05, "loss": 1.8657, "step": 316 }, { "epoch": 0.5088, "grad_norm": 1.7068381309509277, "learning_rate": 8.903847641006218e-05, "loss": 1.8955, "step": 318 }, { "epoch": 0.512, "grad_norm": 2.0879528522491455, "learning_rate": 8.887256150658684e-05, "loss": 1.7092, "step": 320 }, { "epoch": 0.5152, "grad_norm": 1.8985047340393066, "learning_rate": 8.870555742113998e-05, "loss": 1.8091, "step": 322 }, { "epoch": 0.5184, "grad_norm": 1.7577992677688599, "learning_rate": 8.85374688330592e-05, "loss": 1.8895, "step": 324 }, { "epoch": 0.5216, "grad_norm": 1.8277013301849365, "learning_rate": 8.836830045206911e-05, "loss": 1.8192, "step": 326 }, { "epoch": 0.5248, "grad_norm": 1.8492199182510376, "learning_rate": 8.81980570181494e-05, "loss": 2.0282, "step": 328 }, { "epoch": 0.528, "grad_norm": 1.8850246667861938, "learning_rate": 8.802674330140192e-05, "loss": 1.7955, "step": 330 }, { "epoch": 0.5312, "grad_norm": 1.7965402603149414, "learning_rate": 8.785436410191714e-05, "loss": 1.8271, "step": 332 }, { "epoch": 0.5344, "grad_norm": 2.0495541095733643, "learning_rate": 8.76809242496396e-05, "loss": 1.9308, "step": 334 }, { "epoch": 0.5376, "grad_norm": 1.8388515710830688, "learning_rate": 8.750642860423262e-05, "loss": 1.8831, "step": 336 }, { "epoch": 0.5408, "grad_norm": 2.2101669311523438, "learning_rate": 8.733088205494205e-05, "loss": 1.9837, "step": 338 }, { "epoch": 0.544, "grad_norm": 1.7564021348953247, "learning_rate": 8.715428952045936e-05, "loss": 2.0114, "step": 340 }, { "epoch": 0.5472, "grad_norm": 2.0515785217285156, "learning_rate": 8.697665594878382e-05, "loss": 1.7574, "step": 342 }, { "epoch": 0.5504, "grad_norm": 2.1503772735595703, "learning_rate": 8.679798631708375e-05, "loss": 1.9549, "step": 344 }, { "epoch": 0.5536, "grad_norm": 1.6707327365875244, "learning_rate": 8.661828563155727e-05, "loss": 1.9318, "step": 346 }, { "epoch": 0.5568, "grad_norm": 1.9014642238616943, "learning_rate": 8.643755892729179e-05, "loss": 1.9853, "step": 348 }, { "epoch": 0.56, "grad_norm": 1.9820547103881836, "learning_rate": 8.625581126812312e-05, "loss": 1.8178, "step": 350 }, { "epoch": 0.5632, "grad_norm": 2.810029983520508, "learning_rate": 8.607304774649349e-05, "loss": 2.0081, "step": 352 }, { "epoch": 0.5664, "grad_norm": 1.8511972427368164, "learning_rate": 8.588927348330887e-05, "loss": 1.7794, "step": 354 }, { "epoch": 0.5696, "grad_norm": 1.954455852508545, "learning_rate": 8.57044936277955e-05, "loss": 1.9215, "step": 356 }, { "epoch": 0.5728, "grad_norm": 1.8836822509765625, "learning_rate": 8.551871335735565e-05, "loss": 1.7449, "step": 358 }, { "epoch": 0.576, "grad_norm": 1.8966975212097168, "learning_rate": 8.533193787742251e-05, "loss": 1.7689, "step": 360 }, { "epoch": 0.5792, "grad_norm": 1.7771093845367432, "learning_rate": 8.51441724213143e-05, "loss": 1.8151, "step": 362 }, { "epoch": 0.5824, "grad_norm": 1.880419135093689, "learning_rate": 8.495542225008771e-05, "loss": 1.805, "step": 364 }, { "epoch": 0.5856, "grad_norm": 1.820349097251892, "learning_rate": 8.476569265239046e-05, "loss": 1.758, "step": 366 }, { "epoch": 0.5888, "grad_norm": 1.984392523765564, "learning_rate": 8.457498894431311e-05, "loss": 1.7321, "step": 368 }, { "epoch": 0.592, "grad_norm": 1.710229516029358, "learning_rate": 8.438331646924013e-05, "loss": 1.7819, "step": 370 }, { "epoch": 0.5952, "grad_norm": 1.736141324043274, "learning_rate": 8.419068059770011e-05, "loss": 1.8351, "step": 372 }, { "epoch": 0.5984, "grad_norm": 1.6661279201507568, "learning_rate": 8.399708672721539e-05, "loss": 1.803, "step": 374 }, { "epoch": 0.6016, "grad_norm": 4.828789710998535, "learning_rate": 8.380254028215076e-05, "loss": 1.8539, "step": 376 }, { "epoch": 0.6048, "grad_norm": 2.078886032104492, "learning_rate": 8.360704671356145e-05, "loss": 1.7976, "step": 378 }, { "epoch": 0.608, "grad_norm": 1.720009684562683, "learning_rate": 8.341061149904045e-05, "loss": 1.9524, "step": 380 }, { "epoch": 0.6112, "grad_norm": 1.935594081878662, "learning_rate": 8.321324014256504e-05, "loss": 1.8671, "step": 382 }, { "epoch": 0.6144, "grad_norm": 1.868320345878601, "learning_rate": 8.30149381743425e-05, "loss": 1.8896, "step": 384 }, { "epoch": 0.6176, "grad_norm": 2.0732314586639404, "learning_rate": 8.28157111506552e-05, "loss": 1.8446, "step": 386 }, { "epoch": 0.6208, "grad_norm": 1.5798280239105225, "learning_rate": 8.261556465370493e-05, "loss": 1.9207, "step": 388 }, { "epoch": 0.624, "grad_norm": 1.6934467554092407, "learning_rate": 8.24145042914565e-05, "loss": 1.7548, "step": 390 }, { "epoch": 0.6272, "grad_norm": 1.7732023000717163, "learning_rate": 8.221253569748055e-05, "loss": 1.7041, "step": 392 }, { "epoch": 0.6304, "grad_norm": 1.9565222263336182, "learning_rate": 8.200966453079575e-05, "loss": 1.8865, "step": 394 }, { "epoch": 0.6336, "grad_norm": 1.7031235694885254, "learning_rate": 8.180589647571023e-05, "loss": 2.0219, "step": 396 }, { "epoch": 0.6368, "grad_norm": 1.8705931901931763, "learning_rate": 8.16012372416623e-05, "loss": 1.7774, "step": 398 }, { "epoch": 0.64, "grad_norm": 1.7355400323867798, "learning_rate": 8.13956925630605e-05, "loss": 1.7273, "step": 400 }, { "epoch": 0.6432, "grad_norm": 1.7146542072296143, "learning_rate": 8.118926819912287e-05, "loss": 1.8275, "step": 402 }, { "epoch": 0.6464, "grad_norm": 1.8502819538116455, "learning_rate": 8.098196993371565e-05, "loss": 1.856, "step": 404 }, { "epoch": 0.6496, "grad_norm": 1.6460517644882202, "learning_rate": 8.077380357519115e-05, "loss": 1.7826, "step": 406 }, { "epoch": 0.6528, "grad_norm": 1.6977733373641968, "learning_rate": 8.056477495622511e-05, "loss": 2.0396, "step": 408 }, { "epoch": 0.656, "grad_norm": 2.395606756210327, "learning_rate": 8.035488993365312e-05, "loss": 1.755, "step": 410 }, { "epoch": 0.6592, "grad_norm": 1.6800931692123413, "learning_rate": 8.014415438830667e-05, "loss": 1.9174, "step": 412 }, { "epoch": 0.6624, "grad_norm": 1.940741777420044, "learning_rate": 7.993257422484826e-05, "loss": 1.7259, "step": 414 }, { "epoch": 0.6656, "grad_norm": 1.6088985204696655, "learning_rate": 7.972015537160602e-05, "loss": 1.9236, "step": 416 }, { "epoch": 0.6688, "grad_norm": 1.77496337890625, "learning_rate": 7.950690378040758e-05, "loss": 1.9956, "step": 418 }, { "epoch": 0.672, "grad_norm": 2.08013653755188, "learning_rate": 7.929282542641325e-05, "loss": 1.71, "step": 420 }, { "epoch": 0.6752, "grad_norm": 1.9645555019378662, "learning_rate": 7.907792630794876e-05, "loss": 1.6586, "step": 422 }, { "epoch": 0.6784, "grad_norm": 2.035111904144287, "learning_rate": 7.886221244633703e-05, "loss": 1.8481, "step": 424 }, { "epoch": 0.6816, "grad_norm": 1.617519736289978, "learning_rate": 7.864568988572947e-05, "loss": 1.8787, "step": 426 }, { "epoch": 0.6848, "grad_norm": 1.9266173839569092, "learning_rate": 7.842836469293673e-05, "loss": 1.7332, "step": 428 }, { "epoch": 0.688, "grad_norm": 1.6716456413269043, "learning_rate": 7.821024295725865e-05, "loss": 1.8147, "step": 430 }, { "epoch": 0.6912, "grad_norm": 1.9675475358963013, "learning_rate": 7.79913307903136e-05, "loss": 1.77, "step": 432 }, { "epoch": 0.6944, "grad_norm": 2.048152208328247, "learning_rate": 7.777163432586734e-05, "loss": 1.7438, "step": 434 }, { "epoch": 0.6976, "grad_norm": 1.7210822105407715, "learning_rate": 7.755115971966104e-05, "loss": 1.7988, "step": 436 }, { "epoch": 0.7008, "grad_norm": 2.126711845397949, "learning_rate": 7.732991314923891e-05, "loss": 1.7376, "step": 438 }, { "epoch": 0.704, "grad_norm": 1.7960891723632812, "learning_rate": 7.710790081377502e-05, "loss": 1.7875, "step": 440 }, { "epoch": 0.7072, "grad_norm": 1.6610071659088135, "learning_rate": 7.688512893389964e-05, "loss": 1.7334, "step": 442 }, { "epoch": 0.7104, "grad_norm": 1.6998896598815918, "learning_rate": 7.666160375152496e-05, "loss": 1.886, "step": 444 }, { "epoch": 0.7136, "grad_norm": 1.6629440784454346, "learning_rate": 7.643733152967019e-05, "loss": 1.786, "step": 446 }, { "epoch": 0.7168, "grad_norm": 1.6910452842712402, "learning_rate": 7.621231855228604e-05, "loss": 2.0343, "step": 448 }, { "epoch": 0.72, "grad_norm": 1.9952099323272705, "learning_rate": 7.598657112407865e-05, "loss": 1.7571, "step": 450 }, { "epoch": 0.7232, "grad_norm": 1.7345885038375854, "learning_rate": 7.576009557033304e-05, "loss": 2.0908, "step": 452 }, { "epoch": 0.7264, "grad_norm": 1.6344877481460571, "learning_rate": 7.553289823673568e-05, "loss": 1.8395, "step": 454 }, { "epoch": 0.7296, "grad_norm": 2.138115406036377, "learning_rate": 7.530498548919693e-05, "loss": 1.7072, "step": 456 }, { "epoch": 0.7328, "grad_norm": 1.9216474294662476, "learning_rate": 7.507636371367246e-05, "loss": 1.6516, "step": 458 }, { "epoch": 0.736, "grad_norm": 1.4932810068130493, "learning_rate": 7.484703931598445e-05, "loss": 1.9351, "step": 460 }, { "epoch": 0.7392, "grad_norm": 1.8183472156524658, "learning_rate": 7.461701872164204e-05, "loss": 1.8441, "step": 462 }, { "epoch": 0.7424, "grad_norm": 1.5970336198806763, "learning_rate": 7.438630837566133e-05, "loss": 1.8145, "step": 464 }, { "epoch": 0.7456, "grad_norm": 1.7351387739181519, "learning_rate": 7.415491474238475e-05, "loss": 1.8858, "step": 466 }, { "epoch": 0.7488, "grad_norm": 1.6989448070526123, "learning_rate": 7.39228443053e-05, "loss": 1.8566, "step": 468 }, { "epoch": 0.752, "grad_norm": 1.8217098712921143, "learning_rate": 7.369010356685833e-05, "loss": 1.692, "step": 470 }, { "epoch": 0.7552, "grad_norm": 1.7833845615386963, "learning_rate": 7.345669904829237e-05, "loss": 1.8145, "step": 472 }, { "epoch": 0.7584, "grad_norm": 1.7113256454467773, "learning_rate": 7.32226372894334e-05, "loss": 1.907, "step": 474 }, { "epoch": 0.7616, "grad_norm": 1.66838800907135, "learning_rate": 7.298792484852808e-05, "loss": 1.8243, "step": 476 }, { "epoch": 0.7648, "grad_norm": 1.8057668209075928, "learning_rate": 7.27525683020548e-05, "loss": 1.6788, "step": 478 }, { "epoch": 0.768, "grad_norm": 1.7563303709030151, "learning_rate": 7.251657424453928e-05, "loss": 2.0148, "step": 480 }, { "epoch": 0.7712, "grad_norm": 1.75275719165802, "learning_rate": 7.227994928836988e-05, "loss": 1.7584, "step": 482 }, { "epoch": 0.7744, "grad_norm": 1.6364191770553589, "learning_rate": 7.204270006361228e-05, "loss": 1.9348, "step": 484 }, { "epoch": 0.7776, "grad_norm": 1.7930974960327148, "learning_rate": 7.180483321782374e-05, "loss": 1.9014, "step": 486 }, { "epoch": 0.7808, "grad_norm": 1.8914506435394287, "learning_rate": 7.156635541586682e-05, "loss": 1.7977, "step": 488 }, { "epoch": 0.784, "grad_norm": 1.7024521827697754, "learning_rate": 7.132727333972265e-05, "loss": 1.6993, "step": 490 }, { "epoch": 0.7872, "grad_norm": 1.7870112657546997, "learning_rate": 7.108759368830371e-05, "loss": 1.6965, "step": 492 }, { "epoch": 0.7904, "grad_norm": 1.763691782951355, "learning_rate": 7.084732317726611e-05, "loss": 1.7948, "step": 494 }, { "epoch": 0.7936, "grad_norm": 1.683468222618103, "learning_rate": 7.060646853882145e-05, "loss": 1.9145, "step": 496 }, { "epoch": 0.7968, "grad_norm": 1.9888768196105957, "learning_rate": 7.036503652154812e-05, "loss": 1.8192, "step": 498 }, { "epoch": 0.8, "grad_norm": 1.5705928802490234, "learning_rate": 7.012303389020234e-05, "loss": 1.7831, "step": 500 }, { "epoch": 0.8032, "grad_norm": 1.860660433769226, "learning_rate": 6.988046742552845e-05, "loss": 1.7904, "step": 502 }, { "epoch": 0.8064, "grad_norm": 1.8895405530929565, "learning_rate": 6.963734392406907e-05, "loss": 1.8645, "step": 504 }, { "epoch": 0.8096, "grad_norm": 1.74190354347229, "learning_rate": 6.93936701979746e-05, "loss": 1.8455, "step": 506 }, { "epoch": 0.8128, "grad_norm": 1.9230369329452515, "learning_rate": 6.914945307481228e-05, "loss": 1.8388, "step": 508 }, { "epoch": 0.816, "grad_norm": 1.5093566179275513, "learning_rate": 6.890469939737506e-05, "loss": 1.752, "step": 510 }, { "epoch": 0.8192, "grad_norm": 1.5916728973388672, "learning_rate": 6.865941602348966e-05, "loss": 1.7105, "step": 512 }, { "epoch": 0.8224, "grad_norm": 1.7378982305526733, "learning_rate": 6.841360982582463e-05, "loss": 1.9789, "step": 514 }, { "epoch": 0.8256, "grad_norm": 1.7520698308944702, "learning_rate": 6.816728769169757e-05, "loss": 1.7566, "step": 516 }, { "epoch": 0.8288, "grad_norm": 1.8129826784133911, "learning_rate": 6.792045652288234e-05, "loss": 1.8551, "step": 518 }, { "epoch": 0.832, "grad_norm": 1.9102818965911865, "learning_rate": 6.767312323541555e-05, "loss": 1.7726, "step": 520 }, { "epoch": 0.8352, "grad_norm": 1.5088154077529907, "learning_rate": 6.742529475940284e-05, "loss": 1.6381, "step": 522 }, { "epoch": 0.8384, "grad_norm": 1.7010055780410767, "learning_rate": 6.717697803882467e-05, "loss": 1.8741, "step": 524 }, { "epoch": 0.8416, "grad_norm": 1.6840184926986694, "learning_rate": 6.692818003134184e-05, "loss": 1.8617, "step": 526 }, { "epoch": 0.8448, "grad_norm": 1.7205629348754883, "learning_rate": 6.667890770810035e-05, "loss": 1.7349, "step": 528 }, { "epoch": 0.848, "grad_norm": 1.520727515220642, "learning_rate": 6.64291680535363e-05, "loss": 1.749, "step": 530 }, { "epoch": 0.8512, "grad_norm": 1.5941743850708008, "learning_rate": 6.617896806518005e-05, "loss": 1.7076, "step": 532 }, { "epoch": 0.8544, "grad_norm": 1.7745941877365112, "learning_rate": 6.592831475346018e-05, "loss": 1.792, "step": 534 }, { "epoch": 0.8576, "grad_norm": 1.5072052478790283, "learning_rate": 6.56772151415071e-05, "loss": 1.6149, "step": 536 }, { "epoch": 0.8608, "grad_norm": 1.6202104091644287, "learning_rate": 6.542567626495619e-05, "loss": 1.756, "step": 538 }, { "epoch": 0.864, "grad_norm": 1.4974113702774048, "learning_rate": 6.517370517175081e-05, "loss": 1.7919, "step": 540 }, { "epoch": 0.8672, "grad_norm": 1.653824806213379, "learning_rate": 6.492130892194461e-05, "loss": 2.0103, "step": 542 }, { "epoch": 0.8704, "grad_norm": 1.683524489402771, "learning_rate": 6.466849458750394e-05, "loss": 2.0337, "step": 544 }, { "epoch": 0.8736, "grad_norm": 1.5982547998428345, "learning_rate": 6.441526925210949e-05, "loss": 1.8919, "step": 546 }, { "epoch": 0.8768, "grad_norm": 1.838497519493103, "learning_rate": 6.416164001095799e-05, "loss": 1.7648, "step": 548 }, { "epoch": 0.88, "grad_norm": 1.524348258972168, "learning_rate": 6.390761397056328e-05, "loss": 1.6804, "step": 550 }, { "epoch": 0.8832, "grad_norm": 1.6498512029647827, "learning_rate": 6.365319824855727e-05, "loss": 1.6334, "step": 552 }, { "epoch": 0.8864, "grad_norm": 1.5689668655395508, "learning_rate": 6.339839997349045e-05, "loss": 1.9048, "step": 554 }, { "epoch": 0.8896, "grad_norm": 1.7050296068191528, "learning_rate": 6.314322628463219e-05, "loss": 1.6864, "step": 556 }, { "epoch": 0.8928, "grad_norm": 2.038351535797119, "learning_rate": 6.288768433177068e-05, "loss": 1.7531, "step": 558 }, { "epoch": 0.896, "grad_norm": 1.7489795684814453, "learning_rate": 6.26317812750126e-05, "loss": 1.8467, "step": 560 }, { "epoch": 0.8992, "grad_norm": 1.6068861484527588, "learning_rate": 6.237552428458256e-05, "loss": 1.8459, "step": 562 }, { "epoch": 0.9024, "grad_norm": 1.616613745689392, "learning_rate": 6.21189205406221e-05, "loss": 1.8173, "step": 564 }, { "epoch": 0.9056, "grad_norm": 1.6885602474212646, "learning_rate": 6.186197723298855e-05, "loss": 1.8358, "step": 566 }, { "epoch": 0.9088, "grad_norm": 1.688711404800415, "learning_rate": 6.160470156105362e-05, "loss": 1.6996, "step": 568 }, { "epoch": 0.912, "grad_norm": 1.74298894405365, "learning_rate": 6.134710073350156e-05, "loss": 1.722, "step": 570 }, { "epoch": 0.9152, "grad_norm": 1.6249070167541504, "learning_rate": 6.108918196812734e-05, "loss": 1.7909, "step": 572 }, { "epoch": 0.9184, "grad_norm": 1.659416675567627, "learning_rate": 6.083095249163424e-05, "loss": 1.6625, "step": 574 }, { "epoch": 0.9216, "grad_norm": 1.6332143545150757, "learning_rate": 6.057241953943154e-05, "loss": 1.8297, "step": 576 }, { "epoch": 0.9248, "grad_norm": 1.6717133522033691, "learning_rate": 6.031359035543158e-05, "loss": 2.0601, "step": 578 }, { "epoch": 0.928, "grad_norm": 1.8736896514892578, "learning_rate": 6.005447219184702e-05, "loss": 1.8117, "step": 580 }, { "epoch": 0.9312, "grad_norm": 1.6602182388305664, "learning_rate": 5.9795072308987485e-05, "loss": 1.7275, "step": 582 }, { "epoch": 0.9344, "grad_norm": 1.6776071786880493, "learning_rate": 5.9535397975056154e-05, "loss": 1.8988, "step": 584 }, { "epoch": 0.9376, "grad_norm": 1.588109016418457, "learning_rate": 5.927545646594617e-05, "loss": 1.7716, "step": 586 }, { "epoch": 0.9408, "grad_norm": 1.6331814527511597, "learning_rate": 5.901525506503668e-05, "loss": 1.8081, "step": 588 }, { "epoch": 0.944, "grad_norm": 1.7309777736663818, "learning_rate": 5.87548010629889e-05, "loss": 1.8243, "step": 590 }, { "epoch": 0.9472, "grad_norm": 1.6374008655548096, "learning_rate": 5.8494101757541676e-05, "loss": 1.7065, "step": 592 }, { "epoch": 0.9504, "grad_norm": 1.6291025876998901, "learning_rate": 5.8233164453307156e-05, "loss": 1.8013, "step": 594 }, { "epoch": 0.9536, "grad_norm": 1.7339948415756226, "learning_rate": 5.797199646156596e-05, "loss": 1.7998, "step": 596 }, { "epoch": 0.9568, "grad_norm": 1.581697940826416, "learning_rate": 5.7710605100062485e-05, "loss": 1.645, "step": 598 }, { "epoch": 0.96, "grad_norm": 1.713205337524414, "learning_rate": 5.7448997692799764e-05, "loss": 1.9092, "step": 600 }, { "epoch": 0.9632, "grad_norm": 1.7023775577545166, "learning_rate": 5.718718156983428e-05, "loss": 1.7403, "step": 602 }, { "epoch": 0.9664, "grad_norm": 1.606632947921753, "learning_rate": 5.69251640670706e-05, "loss": 1.679, "step": 604 }, { "epoch": 0.9696, "grad_norm": 1.5328476428985596, "learning_rate": 5.6662952526055793e-05, "loss": 1.7899, "step": 606 }, { "epoch": 0.9728, "grad_norm": 1.5965962409973145, "learning_rate": 5.6400554293773744e-05, "loss": 1.7776, "step": 608 }, { "epoch": 0.976, "grad_norm": 1.5174623727798462, "learning_rate": 5.61379767224393e-05, "loss": 1.6602, "step": 610 }, { "epoch": 0.9792, "grad_norm": 1.6876877546310425, "learning_rate": 5.587522716929228e-05, "loss": 1.6656, "step": 612 }, { "epoch": 0.9824, "grad_norm": 1.5483810901641846, "learning_rate": 5.561231299639127e-05, "loss": 1.6531, "step": 614 }, { "epoch": 0.9856, "grad_norm": 1.464625597000122, "learning_rate": 5.534924157040745e-05, "loss": 1.8967, "step": 616 }, { "epoch": 0.9888, "grad_norm": 1.7587417364120483, "learning_rate": 5.508602026241807e-05, "loss": 1.6637, "step": 618 }, { "epoch": 0.992, "grad_norm": 1.5783720016479492, "learning_rate": 5.482265644769998e-05, "loss": 1.7628, "step": 620 }, { "epoch": 0.9952, "grad_norm": 1.602127194404602, "learning_rate": 5.4559157505522985e-05, "loss": 1.7458, "step": 622 }, { "epoch": 0.9984, "grad_norm": 1.8909751176834106, "learning_rate": 5.429553081894304e-05, "loss": 1.6952, "step": 624 }, { "epoch": 1.0016, "grad_norm": 1.5928688049316406, "learning_rate": 5.4031783774595455e-05, "loss": 1.5991, "step": 626 }, { "epoch": 1.0048, "grad_norm": 1.487565517425537, "learning_rate": 5.3767923762487824e-05, "loss": 1.4893, "step": 628 }, { "epoch": 1.008, "grad_norm": 1.4564417600631714, "learning_rate": 5.3503958175793055e-05, "loss": 1.4741, "step": 630 }, { "epoch": 1.0112, "grad_norm": 1.4703936576843262, "learning_rate": 5.323989441064216e-05, "loss": 1.5821, "step": 632 }, { "epoch": 1.0144, "grad_norm": 1.746761441230774, "learning_rate": 5.2975739865917074e-05, "loss": 1.565, "step": 634 }, { "epoch": 1.0176, "grad_norm": 1.7585957050323486, "learning_rate": 5.271150194304326e-05, "loss": 1.5653, "step": 636 }, { "epoch": 1.0208, "grad_norm": 1.5630764961242676, "learning_rate": 5.244718804578246e-05, "loss": 1.496, "step": 638 }, { "epoch": 1.024, "grad_norm": 1.6863924264907837, "learning_rate": 5.218280558002506e-05, "loss": 1.4683, "step": 640 }, { "epoch": 1.0272, "grad_norm": 1.4588528871536255, "learning_rate": 5.191836195358278e-05, "loss": 1.513, "step": 642 }, { "epoch": 1.0304, "grad_norm": 1.4486134052276611, "learning_rate": 5.165386457598099e-05, "loss": 1.484, "step": 644 }, { "epoch": 1.0336, "grad_norm": 1.5040372610092163, "learning_rate": 5.13893208582511e-05, "loss": 1.4945, "step": 646 }, { "epoch": 1.0368, "grad_norm": 1.556773066520691, "learning_rate": 5.1124738212722966e-05, "loss": 1.395, "step": 648 }, { "epoch": 1.04, "grad_norm": 1.5296416282653809, "learning_rate": 5.086012405281717e-05, "loss": 1.3816, "step": 650 }, { "epoch": 1.0432, "grad_norm": 1.5270073413848877, "learning_rate": 5.0595485792837305e-05, "loss": 1.5622, "step": 652 }, { "epoch": 1.0464, "grad_norm": 1.4216368198394775, "learning_rate": 5.033083084776222e-05, "loss": 1.4719, "step": 654 }, { "epoch": 1.0496, "grad_norm": 1.6059627532958984, "learning_rate": 5.0066166633038305e-05, "loss": 1.5124, "step": 656 }, { "epoch": 1.0528, "grad_norm": 1.7300755977630615, "learning_rate": 4.980150056437163e-05, "loss": 1.4899, "step": 658 }, { "epoch": 1.056, "grad_norm": 1.4066635370254517, "learning_rate": 4.9536840057520224e-05, "loss": 1.4709, "step": 660 }, { "epoch": 1.0592, "grad_norm": 1.563008189201355, "learning_rate": 4.927219252808631e-05, "loss": 1.4385, "step": 662 }, { "epoch": 1.0624, "grad_norm": 1.6098390817642212, "learning_rate": 4.900756539130846e-05, "loss": 1.4528, "step": 664 }, { "epoch": 1.0656, "grad_norm": 1.5592678785324097, "learning_rate": 4.874296606185387e-05, "loss": 1.5697, "step": 666 }, { "epoch": 1.0688, "grad_norm": 1.7256783246994019, "learning_rate": 4.847840195361058e-05, "loss": 1.5878, "step": 668 }, { "epoch": 1.072, "grad_norm": 3.1409246921539307, "learning_rate": 4.821388047947979e-05, "loss": 1.4804, "step": 670 }, { "epoch": 1.0752, "grad_norm": 1.415878415107727, "learning_rate": 4.7949409051168085e-05, "loss": 1.5137, "step": 672 }, { "epoch": 1.0784, "grad_norm": 1.6788074970245361, "learning_rate": 4.768499507897981e-05, "loss": 1.4662, "step": 674 }, { "epoch": 1.0816, "grad_norm": 1.3528187274932861, "learning_rate": 4.742064597160948e-05, "loss": 1.4879, "step": 676 }, { "epoch": 1.0848, "grad_norm": 1.5209320783615112, "learning_rate": 4.715636913593404e-05, "loss": 1.5457, "step": 678 }, { "epoch": 1.088, "grad_norm": 1.7500444650650024, "learning_rate": 4.689217197680554e-05, "loss": 1.4667, "step": 680 }, { "epoch": 1.0912, "grad_norm": 1.6211199760437012, "learning_rate": 4.6628061896843474e-05, "loss": 1.5392, "step": 682 }, { "epoch": 1.0944, "grad_norm": 1.584383249282837, "learning_rate": 4.6364046296227484e-05, "loss": 1.4909, "step": 684 }, { "epoch": 1.0976, "grad_norm": 1.7815152406692505, "learning_rate": 4.6100132572489915e-05, "loss": 1.654, "step": 686 }, { "epoch": 1.1008, "grad_norm": 1.3984824419021606, "learning_rate": 4.5836328120308674e-05, "loss": 1.5179, "step": 688 }, { "epoch": 1.104, "grad_norm": 2.673772096633911, "learning_rate": 4.5572640331299875e-05, "loss": 1.5183, "step": 690 }, { "epoch": 1.1072, "grad_norm": 1.3696187734603882, "learning_rate": 4.530907659381086e-05, "loss": 1.4193, "step": 692 }, { "epoch": 1.1104, "grad_norm": 1.5257173776626587, "learning_rate": 4.504564429271311e-05, "loss": 1.4448, "step": 694 }, { "epoch": 1.1136, "grad_norm": 1.4060217142105103, "learning_rate": 4.478235080919536e-05, "loss": 1.4528, "step": 696 }, { "epoch": 1.1168, "grad_norm": 1.7510145902633667, "learning_rate": 4.451920352055678e-05, "loss": 1.4895, "step": 698 }, { "epoch": 1.12, "grad_norm": 1.639930009841919, "learning_rate": 4.425620980000026e-05, "loss": 1.5675, "step": 700 }, { "epoch": 1.1232, "grad_norm": 1.6045302152633667, "learning_rate": 4.39933770164258e-05, "loss": 1.546, "step": 702 }, { "epoch": 1.1264, "grad_norm": 1.6283916234970093, "learning_rate": 4.373071253422408e-05, "loss": 1.5913, "step": 704 }, { "epoch": 1.1296, "grad_norm": 1.594617247581482, "learning_rate": 4.346822371307009e-05, "loss": 1.5827, "step": 706 }, { "epoch": 1.1328, "grad_norm": 1.6242777109146118, "learning_rate": 4.320591790771691e-05, "loss": 1.493, "step": 708 }, { "epoch": 1.1360000000000001, "grad_norm": 1.445251226425171, "learning_rate": 4.294380246778966e-05, "loss": 1.4685, "step": 710 }, { "epoch": 1.1392, "grad_norm": 1.4812084436416626, "learning_rate": 4.2681884737579524e-05, "loss": 1.4637, "step": 712 }, { "epoch": 1.1424, "grad_norm": 1.327941656112671, "learning_rate": 4.242017205583805e-05, "loss": 1.4974, "step": 714 }, { "epoch": 1.1456, "grad_norm": 1.5381219387054443, "learning_rate": 4.215867175557142e-05, "loss": 1.4419, "step": 716 }, { "epoch": 1.1488, "grad_norm": 1.447100281715393, "learning_rate": 4.189739116383506e-05, "loss": 1.5027, "step": 718 }, { "epoch": 1.152, "grad_norm": 1.7003172636032104, "learning_rate": 4.163633760152834e-05, "loss": 1.4394, "step": 720 }, { "epoch": 1.1552, "grad_norm": 1.5542171001434326, "learning_rate": 4.137551838318936e-05, "loss": 1.502, "step": 722 }, { "epoch": 1.1584, "grad_norm": 1.4329487085342407, "learning_rate": 4.1114940816790135e-05, "loss": 1.4303, "step": 724 }, { "epoch": 1.1616, "grad_norm": 1.5292679071426392, "learning_rate": 4.08546122035317e-05, "loss": 1.4335, "step": 726 }, { "epoch": 1.1648, "grad_norm": 1.5414072275161743, "learning_rate": 4.059453983763967e-05, "loss": 1.4574, "step": 728 }, { "epoch": 1.168, "grad_norm": 1.5525455474853516, "learning_rate": 4.03347310061597e-05, "loss": 1.3842, "step": 730 }, { "epoch": 1.1712, "grad_norm": 1.444132685661316, "learning_rate": 4.007519298875347e-05, "loss": 1.4233, "step": 732 }, { "epoch": 1.1743999999999999, "grad_norm": 1.6456663608551025, "learning_rate": 3.98159330574946e-05, "loss": 1.4416, "step": 734 }, { "epoch": 1.1776, "grad_norm": 1.5266261100769043, "learning_rate": 3.955695847666494e-05, "loss": 1.4292, "step": 736 }, { "epoch": 1.1808, "grad_norm": 1.6392107009887695, "learning_rate": 3.929827650255104e-05, "loss": 1.5965, "step": 738 }, { "epoch": 1.184, "grad_norm": 1.4020483493804932, "learning_rate": 3.903989438324077e-05, "loss": 1.3887, "step": 740 }, { "epoch": 1.1872, "grad_norm": 1.5396273136138916, "learning_rate": 3.878181935842033e-05, "loss": 1.5077, "step": 742 }, { "epoch": 1.1904, "grad_norm": 1.5113873481750488, "learning_rate": 3.85240586591713e-05, "loss": 1.4624, "step": 744 }, { "epoch": 1.1936, "grad_norm": 1.4673871994018555, "learning_rate": 3.8266619507768126e-05, "loss": 1.4441, "step": 746 }, { "epoch": 1.1968, "grad_norm": 1.387566328048706, "learning_rate": 3.800950911747565e-05, "loss": 1.4103, "step": 748 }, { "epoch": 1.2, "grad_norm": 1.4239872694015503, "learning_rate": 3.775273469234712e-05, "loss": 1.5861, "step": 750 }, { "epoch": 1.2032, "grad_norm": 1.672402024269104, "learning_rate": 3.749630342702221e-05, "loss": 1.5554, "step": 752 }, { "epoch": 1.2064, "grad_norm": 1.3757636547088623, "learning_rate": 3.724022250652557e-05, "loss": 1.4742, "step": 754 }, { "epoch": 1.2096, "grad_norm": 1.5252386331558228, "learning_rate": 3.698449910606536e-05, "loss": 1.4546, "step": 756 }, { "epoch": 1.2128, "grad_norm": 1.351670503616333, "learning_rate": 3.672914039083233e-05, "loss": 1.4842, "step": 758 }, { "epoch": 1.216, "grad_norm": 2.256891965866089, "learning_rate": 3.6474153515799e-05, "loss": 1.4753, "step": 760 }, { "epoch": 1.2192, "grad_norm": 1.6337233781814575, "learning_rate": 3.6219545625519145e-05, "loss": 1.51, "step": 762 }, { "epoch": 1.2224, "grad_norm": 1.6732077598571777, "learning_rate": 3.596532385392772e-05, "loss": 1.6077, "step": 764 }, { "epoch": 1.2256, "grad_norm": 1.410620927810669, "learning_rate": 3.5711495324140845e-05, "loss": 1.4561, "step": 766 }, { "epoch": 1.2288000000000001, "grad_norm": 1.5092262029647827, "learning_rate": 3.54580671482563e-05, "loss": 1.4808, "step": 768 }, { "epoch": 1.232, "grad_norm": 1.5939708948135376, "learning_rate": 3.520504642715424e-05, "loss": 1.568, "step": 770 }, { "epoch": 1.2352, "grad_norm": 1.4955201148986816, "learning_rate": 3.495244025029822e-05, "loss": 1.4955, "step": 772 }, { "epoch": 1.2384, "grad_norm": 1.6155612468719482, "learning_rate": 3.470025569553653e-05, "loss": 1.4821, "step": 774 }, { "epoch": 1.2416, "grad_norm": 1.5372660160064697, "learning_rate": 3.444849982890393e-05, "loss": 1.4093, "step": 776 }, { "epoch": 1.2448, "grad_norm": 1.7753242254257202, "learning_rate": 3.4197179704423656e-05, "loss": 1.4489, "step": 778 }, { "epoch": 1.248, "grad_norm": 1.5369523763656616, "learning_rate": 3.39463023639097e-05, "loss": 1.469, "step": 780 }, { "epoch": 1.2511999999999999, "grad_norm": 1.5201616287231445, "learning_rate": 3.36958748367696e-05, "loss": 1.4533, "step": 782 }, { "epoch": 1.2544, "grad_norm": 1.6345765590667725, "learning_rate": 3.3445904139807405e-05, "loss": 1.5058, "step": 784 }, { "epoch": 1.2576, "grad_norm": 1.4603197574615479, "learning_rate": 3.319639727702716e-05, "loss": 1.4791, "step": 786 }, { "epoch": 1.2608, "grad_norm": 1.395509958267212, "learning_rate": 3.2947361239436525e-05, "loss": 1.3728, "step": 788 }, { "epoch": 1.264, "grad_norm": 1.625909447669983, "learning_rate": 3.2698803004851026e-05, "loss": 1.4991, "step": 790 }, { "epoch": 1.2671999999999999, "grad_norm": 1.4244285821914673, "learning_rate": 3.245072953769844e-05, "loss": 1.5196, "step": 792 }, { "epoch": 1.2704, "grad_norm": 1.4113078117370605, "learning_rate": 3.2203147788823764e-05, "loss": 1.4194, "step": 794 }, { "epoch": 1.2736, "grad_norm": 1.588931918144226, "learning_rate": 3.1956064695294305e-05, "loss": 1.4783, "step": 796 }, { "epoch": 1.2768, "grad_norm": 1.6504685878753662, "learning_rate": 3.170948718020546e-05, "loss": 1.4376, "step": 798 }, { "epoch": 1.28, "grad_norm": 1.408836007118225, "learning_rate": 3.1463422152486674e-05, "loss": 1.4438, "step": 800 }, { "epoch": 1.2832, "grad_norm": 1.6479829549789429, "learning_rate": 3.121787650670781e-05, "loss": 1.5115, "step": 802 }, { "epoch": 1.2864, "grad_norm": 1.4651159048080444, "learning_rate": 3.097285712288605e-05, "loss": 1.4111, "step": 804 }, { "epoch": 1.2896, "grad_norm": 1.4425824880599976, "learning_rate": 3.072837086629305e-05, "loss": 1.4359, "step": 806 }, { "epoch": 1.2928, "grad_norm": 1.4235820770263672, "learning_rate": 3.0484424587262655e-05, "loss": 1.4132, "step": 808 }, { "epoch": 1.296, "grad_norm": 1.471458911895752, "learning_rate": 3.024102512099889e-05, "loss": 1.4653, "step": 810 }, { "epoch": 1.2992, "grad_norm": 1.6857844591140747, "learning_rate": 2.9998179287384485e-05, "loss": 1.5749, "step": 812 }, { "epoch": 1.3024, "grad_norm": 1.4331309795379639, "learning_rate": 2.975589389078975e-05, "loss": 1.4682, "step": 814 }, { "epoch": 1.3056, "grad_norm": 1.3838329315185547, "learning_rate": 2.9514175719881966e-05, "loss": 1.4477, "step": 816 }, { "epoch": 1.3088, "grad_norm": 1.350062370300293, "learning_rate": 2.9273031547435114e-05, "loss": 1.4243, "step": 818 }, { "epoch": 1.312, "grad_norm": 1.5023717880249023, "learning_rate": 2.9032468130140168e-05, "loss": 1.4972, "step": 820 }, { "epoch": 1.3152, "grad_norm": 1.5788859128952026, "learning_rate": 2.8792492208415768e-05, "loss": 1.5411, "step": 822 }, { "epoch": 1.3184, "grad_norm": 1.4530489444732666, "learning_rate": 2.8553110506219283e-05, "loss": 1.4582, "step": 824 }, { "epoch": 1.3216, "grad_norm": 1.3994284868240356, "learning_rate": 2.831432973085848e-05, "loss": 1.4381, "step": 826 }, { "epoch": 1.3248, "grad_norm": 1.51462984085083, "learning_rate": 2.8076156572803635e-05, "loss": 1.4712, "step": 828 }, { "epoch": 1.328, "grad_norm": 1.505707859992981, "learning_rate": 2.783859770549996e-05, "loss": 1.4613, "step": 830 }, { "epoch": 1.3312, "grad_norm": 1.4707581996917725, "learning_rate": 2.760165978518067e-05, "loss": 1.4202, "step": 832 }, { "epoch": 1.3344, "grad_norm": 1.5572878122329712, "learning_rate": 2.7365349450680466e-05, "loss": 1.4835, "step": 834 }, { "epoch": 1.3376000000000001, "grad_norm": 1.6469281911849976, "learning_rate": 2.7129673323249604e-05, "loss": 1.4834, "step": 836 }, { "epoch": 1.3408, "grad_norm": 1.6217268705368042, "learning_rate": 2.689463800636824e-05, "loss": 1.4905, "step": 838 }, { "epoch": 1.3439999999999999, "grad_norm": 1.3879499435424805, "learning_rate": 2.6660250085561457e-05, "loss": 1.4575, "step": 840 }, { "epoch": 1.3472, "grad_norm": 1.436532735824585, "learning_rate": 2.6426516128214807e-05, "loss": 1.5241, "step": 842 }, { "epoch": 1.3504, "grad_norm": 1.4625749588012695, "learning_rate": 2.619344268339021e-05, "loss": 1.4646, "step": 844 }, { "epoch": 1.3536000000000001, "grad_norm": 1.2639025449752808, "learning_rate": 2.5961036281642493e-05, "loss": 1.4009, "step": 846 }, { "epoch": 1.3568, "grad_norm": 1.3335165977478027, "learning_rate": 2.572930343483637e-05, "loss": 1.3794, "step": 848 }, { "epoch": 1.3599999999999999, "grad_norm": 1.357528805732727, "learning_rate": 2.54982506359641e-05, "loss": 1.4393, "step": 850 }, { "epoch": 1.3632, "grad_norm": 1.277764916419983, "learning_rate": 2.526788435896339e-05, "loss": 1.4483, "step": 852 }, { "epoch": 1.3664, "grad_norm": 1.3506453037261963, "learning_rate": 2.5038211058536133e-05, "loss": 1.4466, "step": 854 }, { "epoch": 1.3696, "grad_norm": 1.5710474252700806, "learning_rate": 2.4809237169967458e-05, "loss": 1.553, "step": 856 }, { "epoch": 1.3728, "grad_norm": 1.3960719108581543, "learning_rate": 2.4580969108945533e-05, "loss": 1.4599, "step": 858 }, { "epoch": 1.376, "grad_norm": 1.4087730646133423, "learning_rate": 2.435341327138168e-05, "loss": 1.4237, "step": 860 }, { "epoch": 1.3792, "grad_norm": 1.4425933361053467, "learning_rate": 2.4126576033231208e-05, "loss": 1.3405, "step": 862 }, { "epoch": 1.3824, "grad_norm": 1.4608137607574463, "learning_rate": 2.3900463750314834e-05, "loss": 1.4638, "step": 864 }, { "epoch": 1.3856, "grad_norm": 1.4107086658477783, "learning_rate": 2.3675082758140475e-05, "loss": 1.5278, "step": 866 }, { "epoch": 1.3888, "grad_norm": 1.3733059167861938, "learning_rate": 2.3450439371725825e-05, "loss": 1.5054, "step": 868 }, { "epoch": 1.392, "grad_norm": 1.3773467540740967, "learning_rate": 2.3226539885421343e-05, "loss": 1.4151, "step": 870 }, { "epoch": 1.3952, "grad_norm": 1.4656720161437988, "learning_rate": 2.3003390572734006e-05, "loss": 1.4811, "step": 872 }, { "epoch": 1.3984, "grad_norm": 1.3903800249099731, "learning_rate": 2.2780997686151378e-05, "loss": 1.4274, "step": 874 }, { "epoch": 1.4016, "grad_norm": 1.3573490381240845, "learning_rate": 2.255936745696652e-05, "loss": 1.4141, "step": 876 }, { "epoch": 1.4048, "grad_norm": 1.312809944152832, "learning_rate": 2.2338506095103334e-05, "loss": 1.431, "step": 878 }, { "epoch": 1.408, "grad_norm": 1.3963521718978882, "learning_rate": 2.2118419788942672e-05, "loss": 1.4246, "step": 880 }, { "epoch": 1.4112, "grad_norm": 1.5214847326278687, "learning_rate": 2.189911470514881e-05, "loss": 1.4489, "step": 882 }, { "epoch": 1.4144, "grad_norm": 1.5130972862243652, "learning_rate": 2.1680596988496705e-05, "loss": 1.4718, "step": 884 }, { "epoch": 1.4176, "grad_norm": 1.4655705690383911, "learning_rate": 2.1462872761699905e-05, "loss": 1.4782, "step": 886 }, { "epoch": 1.4208, "grad_norm": 1.4640536308288574, "learning_rate": 2.1245948125238867e-05, "loss": 1.4003, "step": 888 }, { "epoch": 1.424, "grad_norm": 1.50718092918396, "learning_rate": 2.1029829157190117e-05, "loss": 1.43, "step": 890 }, { "epoch": 1.4272, "grad_norm": 1.3067643642425537, "learning_rate": 2.081452191305587e-05, "loss": 1.4523, "step": 892 }, { "epoch": 1.4304000000000001, "grad_norm": 1.4971873760223389, "learning_rate": 2.06000324255945e-05, "loss": 1.4485, "step": 894 }, { "epoch": 1.4336, "grad_norm": 1.4533696174621582, "learning_rate": 2.0386366704651315e-05, "loss": 1.4637, "step": 896 }, { "epoch": 1.4368, "grad_norm": 1.3518776893615723, "learning_rate": 2.0173530736990304e-05, "loss": 1.3545, "step": 898 }, { "epoch": 1.44, "grad_norm": 1.395410180091858, "learning_rate": 1.9961530486126327e-05, "loss": 1.383, "step": 900 }, { "epoch": 1.4432, "grad_norm": 2.61722731590271, "learning_rate": 1.9750371892158103e-05, "loss": 1.6465, "step": 902 }, { "epoch": 1.4464000000000001, "grad_norm": 1.403191089630127, "learning_rate": 1.9540060871601646e-05, "loss": 1.4416, "step": 904 }, { "epoch": 1.4496, "grad_norm": 1.3970694541931152, "learning_rate": 1.933060331722457e-05, "loss": 1.5657, "step": 906 }, { "epoch": 1.4527999999999999, "grad_norm": 1.3750842809677124, "learning_rate": 1.9122005097881014e-05, "loss": 1.5444, "step": 908 }, { "epoch": 1.456, "grad_norm": 1.5135127305984497, "learning_rate": 1.8914272058347088e-05, "loss": 1.5246, "step": 910 }, { "epoch": 1.4592, "grad_norm": 1.4489692449569702, "learning_rate": 1.8707410019157196e-05, "loss": 1.4398, "step": 912 }, { "epoch": 1.4624, "grad_norm": 1.508467435836792, "learning_rate": 1.8501424776440907e-05, "loss": 1.4185, "step": 914 }, { "epoch": 1.4656, "grad_norm": 1.3195210695266724, "learning_rate": 1.829632210176061e-05, "loss": 1.4815, "step": 916 }, { "epoch": 1.4687999999999999, "grad_norm": 1.3782057762145996, "learning_rate": 1.809210774194971e-05, "loss": 1.4895, "step": 918 }, { "epoch": 1.472, "grad_norm": 1.4828996658325195, "learning_rate": 1.7888787418951645e-05, "loss": 1.4907, "step": 920 }, { "epoch": 1.4752, "grad_norm": 1.409317970275879, "learning_rate": 1.7686366829659628e-05, "loss": 1.4705, "step": 922 }, { "epoch": 1.4784, "grad_norm": 1.3606739044189453, "learning_rate": 1.74848516457569e-05, "loss": 1.4784, "step": 924 }, { "epoch": 1.4816, "grad_norm": 1.4307705163955688, "learning_rate": 1.72842475135579e-05, "loss": 1.4639, "step": 926 }, { "epoch": 1.4848, "grad_norm": 1.4825451374053955, "learning_rate": 1.7084560053850024e-05, "loss": 1.4771, "step": 928 }, { "epoch": 1.488, "grad_norm": 1.5933488607406616, "learning_rate": 1.6885794861736183e-05, "loss": 1.3967, "step": 930 }, { "epoch": 1.4912, "grad_norm": 1.4622035026550293, "learning_rate": 1.668795750647796e-05, "loss": 1.3764, "step": 932 }, { "epoch": 1.4944, "grad_norm": 1.387617588043213, "learning_rate": 1.6491053531339607e-05, "loss": 1.4615, "step": 934 }, { "epoch": 1.4976, "grad_norm": 1.3928184509277344, "learning_rate": 1.62950884534327e-05, "loss": 1.3788, "step": 936 }, { "epoch": 1.5008, "grad_norm": 1.4277182817459106, "learning_rate": 1.6100067763561626e-05, "loss": 1.3684, "step": 938 }, { "epoch": 1.504, "grad_norm": 1.519103765487671, "learning_rate": 1.5905996926069628e-05, "loss": 1.4659, "step": 940 }, { "epoch": 1.5072, "grad_norm": 1.3705520629882812, "learning_rate": 1.5712881378685755e-05, "loss": 1.4145, "step": 942 }, { "epoch": 1.5104, "grad_norm": 1.437618374824524, "learning_rate": 1.5520726532372537e-05, "loss": 1.3951, "step": 944 }, { "epoch": 1.5135999999999998, "grad_norm": 1.406826376914978, "learning_rate": 1.532953777117429e-05, "loss": 1.509, "step": 946 }, { "epoch": 1.5168, "grad_norm": 1.4625985622406006, "learning_rate": 1.5139320452066313e-05, "loss": 1.4415, "step": 948 }, { "epoch": 1.52, "grad_norm": 1.500941514968872, "learning_rate": 1.4950079904804759e-05, "loss": 1.4751, "step": 950 }, { "epoch": 1.5232, "grad_norm": 1.4515104293823242, "learning_rate": 1.4761821431777373e-05, "loss": 1.441, "step": 952 }, { "epoch": 1.5264, "grad_norm": 1.3307709693908691, "learning_rate": 1.4574550307854817e-05, "loss": 1.4566, "step": 954 }, { "epoch": 1.5295999999999998, "grad_norm": 1.393490195274353, "learning_rate": 1.4388271780242929e-05, "loss": 1.4483, "step": 956 }, { "epoch": 1.5328, "grad_norm": 1.3786524534225464, "learning_rate": 1.4202991068335697e-05, "loss": 1.4522, "step": 958 }, { "epoch": 1.536, "grad_norm": 1.4812992811203003, "learning_rate": 1.4018713363569035e-05, "loss": 1.5197, "step": 960 }, { "epoch": 1.5392000000000001, "grad_norm": 1.5414302349090576, "learning_rate": 1.3835443829275268e-05, "loss": 1.4215, "step": 962 }, { "epoch": 1.5424, "grad_norm": 1.420955777168274, "learning_rate": 1.365318760053848e-05, "loss": 1.4357, "step": 964 }, { "epoch": 1.5455999999999999, "grad_norm": 1.322597861289978, "learning_rate": 1.3471949784050702e-05, "loss": 1.3364, "step": 966 }, { "epoch": 1.5488, "grad_norm": 1.356268048286438, "learning_rate": 1.3291735457968701e-05, "loss": 1.4199, "step": 968 }, { "epoch": 1.552, "grad_norm": 1.3691470623016357, "learning_rate": 1.3112549671771796e-05, "loss": 1.425, "step": 970 }, { "epoch": 1.5552000000000001, "grad_norm": 1.4271376132965088, "learning_rate": 1.2934397446120306e-05, "loss": 1.4198, "step": 972 }, { "epoch": 1.5584, "grad_norm": 1.37627375125885, "learning_rate": 1.2757283772714957e-05, "loss": 1.395, "step": 974 }, { "epoch": 1.5615999999999999, "grad_norm": 1.3378015756607056, "learning_rate": 1.2581213614156928e-05, "loss": 1.3637, "step": 976 }, { "epoch": 1.5648, "grad_norm": 1.5486069917678833, "learning_rate": 1.2406191903808844e-05, "loss": 1.4776, "step": 978 }, { "epoch": 1.568, "grad_norm": 1.4717572927474976, "learning_rate": 1.2232223545656552e-05, "loss": 1.4642, "step": 980 }, { "epoch": 1.5712000000000002, "grad_norm": 1.5726338624954224, "learning_rate": 1.205931341417173e-05, "loss": 1.4629, "step": 982 }, { "epoch": 1.5744, "grad_norm": 1.359563946723938, "learning_rate": 1.1887466354175253e-05, "loss": 1.4208, "step": 984 }, { "epoch": 1.5776, "grad_norm": 1.390755295753479, "learning_rate": 1.1716687180701474e-05, "loss": 1.4227, "step": 986 }, { "epoch": 1.5808, "grad_norm": 1.516890048980713, "learning_rate": 1.1546980678863361e-05, "loss": 1.4247, "step": 988 }, { "epoch": 1.584, "grad_norm": 1.5359797477722168, "learning_rate": 1.1378351603718312e-05, "loss": 1.3898, "step": 990 }, { "epoch": 1.5872000000000002, "grad_norm": 1.3802939653396606, "learning_rate": 1.1210804680135022e-05, "loss": 1.3354, "step": 992 }, { "epoch": 1.5904, "grad_norm": 1.4020977020263672, "learning_rate": 1.1044344602661034e-05, "loss": 1.4493, "step": 994 }, { "epoch": 1.5936, "grad_norm": 1.373717188835144, "learning_rate": 1.0878976035391252e-05, "loss": 1.3899, "step": 996 }, { "epoch": 1.5968, "grad_norm": 1.3707786798477173, "learning_rate": 1.0714703611837201e-05, "loss": 1.4054, "step": 998 }, { "epoch": 1.6, "grad_norm": 1.388121485710144, "learning_rate": 1.0551531934797243e-05, "loss": 1.3393, "step": 1000 }, { "epoch": 1.6032, "grad_norm": 1.5597281455993652, "learning_rate": 1.0389465576227558e-05, "loss": 1.4488, "step": 1002 }, { "epoch": 1.6064, "grad_norm": 1.2360092401504517, "learning_rate": 1.0228509077114146e-05, "loss": 1.4802, "step": 1004 }, { "epoch": 1.6096, "grad_norm": 1.3134233951568604, "learning_rate": 1.0068666947345456e-05, "loss": 1.3538, "step": 1006 }, { "epoch": 1.6128, "grad_norm": 1.334416151046753, "learning_rate": 9.909943665586102e-06, "loss": 1.3917, "step": 1008 }, { "epoch": 1.616, "grad_norm": 1.3590480089187622, "learning_rate": 9.752343679151399e-06, "loss": 1.4165, "step": 1010 }, { "epoch": 1.6192, "grad_norm": 1.4533120393753052, "learning_rate": 9.595871403882661e-06, "loss": 1.5106, "step": 1012 }, { "epoch": 1.6223999999999998, "grad_norm": 1.5530134439468384, "learning_rate": 9.440531224023552e-06, "loss": 1.4217, "step": 1014 }, { "epoch": 1.6256, "grad_norm": 1.4636127948760986, "learning_rate": 9.286327492097196e-06, "loss": 1.3306, "step": 1016 }, { "epoch": 1.6288, "grad_norm": 1.4135422706604004, "learning_rate": 9.133264528784274e-06, "loss": 1.425, "step": 1018 }, { "epoch": 1.6320000000000001, "grad_norm": 1.4650404453277588, "learning_rate": 8.981346622801905e-06, "loss": 1.5058, "step": 1020 }, { "epoch": 1.6352, "grad_norm": 1.4439706802368164, "learning_rate": 8.830578030783493e-06, "loss": 1.4755, "step": 1022 }, { "epoch": 1.6383999999999999, "grad_norm": 1.7018922567367554, "learning_rate": 8.680962977159502e-06, "loss": 1.457, "step": 1024 }, { "epoch": 1.6416, "grad_norm": 5.544206619262695, "learning_rate": 8.53250565403903e-06, "loss": 1.5578, "step": 1026 }, { "epoch": 1.6448, "grad_norm": 1.4300087690353394, "learning_rate": 8.385210221092382e-06, "loss": 1.4479, "step": 1028 }, { "epoch": 1.6480000000000001, "grad_norm": 1.5086474418640137, "learning_rate": 8.239080805434513e-06, "loss": 1.4618, "step": 1030 }, { "epoch": 1.6512, "grad_norm": 1.4398646354675293, "learning_rate": 8.094121501509399e-06, "loss": 1.4416, "step": 1032 }, { "epoch": 1.6543999999999999, "grad_norm": 1.3153340816497803, "learning_rate": 7.950336370975304e-06, "loss": 1.3792, "step": 1034 }, { "epoch": 1.6576, "grad_norm": 1.351747751235962, "learning_rate": 7.80772944259096e-06, "loss": 1.4545, "step": 1036 }, { "epoch": 1.6608, "grad_norm": 1.4315369129180908, "learning_rate": 7.666304712102695e-06, "loss": 1.3891, "step": 1038 }, { "epoch": 1.6640000000000001, "grad_norm": 1.2921706438064575, "learning_rate": 7.526066142132521e-06, "loss": 1.4527, "step": 1040 }, { "epoch": 1.6672, "grad_norm": 1.3649907112121582, "learning_rate": 7.3870176620670194e-06, "loss": 1.3692, "step": 1042 }, { "epoch": 1.6703999999999999, "grad_norm": 1.3974645137786865, "learning_rate": 7.249163167947287e-06, "loss": 1.3702, "step": 1044 }, { "epoch": 1.6736, "grad_norm": 1.3394696712493896, "learning_rate": 7.1125065223598076e-06, "loss": 1.3851, "step": 1046 }, { "epoch": 1.6768, "grad_norm": 1.3612364530563354, "learning_rate": 6.9770515543281455e-06, "loss": 1.4355, "step": 1048 }, { "epoch": 1.6800000000000002, "grad_norm": 1.442084789276123, "learning_rate": 6.842802059205727e-06, "loss": 1.4225, "step": 1050 }, { "epoch": 1.6832, "grad_norm": 1.2821780443191528, "learning_rate": 6.709761798569442e-06, "loss": 1.4098, "step": 1052 }, { "epoch": 1.6864, "grad_norm": 1.3669836521148682, "learning_rate": 6.577934500114335e-06, "loss": 1.4238, "step": 1054 }, { "epoch": 1.6896, "grad_norm": 1.3460770845413208, "learning_rate": 6.44732385754902e-06, "loss": 1.4161, "step": 1056 }, { "epoch": 1.6928, "grad_norm": 1.488351821899414, "learning_rate": 6.3179335304923095e-06, "loss": 1.4265, "step": 1058 }, { "epoch": 1.696, "grad_norm": 1.3682546615600586, "learning_rate": 6.189767144370645e-06, "loss": 1.408, "step": 1060 }, { "epoch": 1.6992, "grad_norm": 1.5109198093414307, "learning_rate": 6.062828290316469e-06, "loss": 1.4179, "step": 1062 }, { "epoch": 1.7024, "grad_norm": 1.4661554098129272, "learning_rate": 5.937120525067641e-06, "loss": 1.3816, "step": 1064 }, { "epoch": 1.7056, "grad_norm": 1.3422075510025024, "learning_rate": 5.812647370867763e-06, "loss": 1.4014, "step": 1066 }, { "epoch": 1.7088, "grad_norm": 1.3991550207138062, "learning_rate": 5.689412315367543e-06, "loss": 1.4443, "step": 1068 }, { "epoch": 1.712, "grad_norm": 1.3040037155151367, "learning_rate": 5.567418811526981e-06, "loss": 1.3615, "step": 1070 }, { "epoch": 1.7151999999999998, "grad_norm": 1.3308135271072388, "learning_rate": 5.4466702775186785e-06, "loss": 1.3662, "step": 1072 }, { "epoch": 1.7184, "grad_norm": 1.3251948356628418, "learning_rate": 5.327170096632089e-06, "loss": 1.4263, "step": 1074 }, { "epoch": 1.7216, "grad_norm": 1.219191074371338, "learning_rate": 5.208921617178641e-06, "loss": 1.4676, "step": 1076 }, { "epoch": 1.7248, "grad_norm": 1.364389181137085, "learning_rate": 5.091928152397984e-06, "loss": 1.4124, "step": 1078 }, { "epoch": 1.728, "grad_norm": 1.4420322179794312, "learning_rate": 4.976192980365124e-06, "loss": 1.4189, "step": 1080 }, { "epoch": 1.7311999999999999, "grad_norm": 1.3646230697631836, "learning_rate": 4.861719343898613e-06, "loss": 1.4043, "step": 1082 }, { "epoch": 1.7344, "grad_norm": 1.4223638772964478, "learning_rate": 4.748510450469623e-06, "loss": 1.4538, "step": 1084 }, { "epoch": 1.7376, "grad_norm": 1.3192670345306396, "learning_rate": 4.63656947211214e-06, "loss": 1.4187, "step": 1086 }, { "epoch": 1.7408000000000001, "grad_norm": 1.3329640626907349, "learning_rate": 4.525899545334023e-06, "loss": 1.3402, "step": 1088 }, { "epoch": 1.744, "grad_norm": 1.5323829650878906, "learning_rate": 4.416503771029201e-06, "loss": 1.6182, "step": 1090 }, { "epoch": 1.7471999999999999, "grad_norm": 1.3993905782699585, "learning_rate": 4.308385214390709e-06, "loss": 1.4359, "step": 1092 }, { "epoch": 1.7504, "grad_norm": 1.3519290685653687, "learning_rate": 4.2015469048248375e-06, "loss": 1.4073, "step": 1094 }, { "epoch": 1.7536, "grad_norm": 1.3193116188049316, "learning_rate": 4.095991835866275e-06, "loss": 1.3293, "step": 1096 }, { "epoch": 1.7568000000000001, "grad_norm": 1.3401769399642944, "learning_rate": 3.99172296509418e-06, "loss": 1.4281, "step": 1098 }, { "epoch": 1.76, "grad_norm": 1.3036984205245972, "learning_rate": 3.888743214049346e-06, "loss": 1.4516, "step": 1100 }, { "epoch": 1.7631999999999999, "grad_norm": 1.2876367568969727, "learning_rate": 3.7870554681523287e-06, "loss": 1.3857, "step": 1102 }, { "epoch": 1.7664, "grad_norm": 1.286058783531189, "learning_rate": 3.6866625766226293e-06, "loss": 1.3421, "step": 1104 }, { "epoch": 1.7696, "grad_norm": 1.2945473194122314, "learning_rate": 3.587567352398796e-06, "loss": 1.4383, "step": 1106 }, { "epoch": 1.7728000000000002, "grad_norm": 1.3773088455200195, "learning_rate": 3.489772572059674e-06, "loss": 1.4287, "step": 1108 }, { "epoch": 1.776, "grad_norm": 1.2338217496871948, "learning_rate": 3.393280975746588e-06, "loss": 1.3083, "step": 1110 }, { "epoch": 1.7792, "grad_norm": 2.751504421234131, "learning_rate": 3.2980952670865317e-06, "loss": 1.4268, "step": 1112 }, { "epoch": 1.7824, "grad_norm": 1.335714340209961, "learning_rate": 3.2042181131164528e-06, "loss": 1.4319, "step": 1114 }, { "epoch": 1.7856, "grad_norm": 1.2379279136657715, "learning_rate": 3.11165214420851e-06, "loss": 1.3696, "step": 1116 }, { "epoch": 1.7888, "grad_norm": 1.3956067562103271, "learning_rate": 3.020399953996389e-06, "loss": 1.4996, "step": 1118 }, { "epoch": 1.792, "grad_norm": 1.3509949445724487, "learning_rate": 2.9304640993025988e-06, "loss": 1.4443, "step": 1120 }, { "epoch": 1.7952, "grad_norm": 1.2706594467163086, "learning_rate": 2.8418471000668523e-06, "loss": 1.3773, "step": 1122 }, { "epoch": 1.7984, "grad_norm": 1.4019887447357178, "learning_rate": 2.7545514392754437e-06, "loss": 1.4795, "step": 1124 }, { "epoch": 1.8016, "grad_norm": 1.3648223876953125, "learning_rate": 2.6685795628917266e-06, "loss": 1.3566, "step": 1126 }, { "epoch": 1.8048, "grad_norm": 1.5225048065185547, "learning_rate": 2.5839338797875036e-06, "loss": 1.4291, "step": 1128 }, { "epoch": 1.808, "grad_norm": 1.4786689281463623, "learning_rate": 2.500616761675578e-06, "loss": 1.6036, "step": 1130 }, { "epoch": 1.8112, "grad_norm": 1.439139723777771, "learning_rate": 2.41863054304331e-06, "loss": 1.445, "step": 1132 }, { "epoch": 1.8144, "grad_norm": 1.322020411491394, "learning_rate": 2.3379775210871648e-06, "loss": 1.4561, "step": 1134 }, { "epoch": 1.8176, "grad_norm": 1.3266148567199707, "learning_rate": 2.2586599556483734e-06, "loss": 1.4347, "step": 1136 }, { "epoch": 1.8208, "grad_norm": 1.3589297533035278, "learning_rate": 2.180680069149621e-06, "loss": 1.3969, "step": 1138 }, { "epoch": 1.8239999999999998, "grad_norm": 1.3703463077545166, "learning_rate": 2.104040046532768e-06, "loss": 1.4253, "step": 1140 }, { "epoch": 1.8272, "grad_norm": 1.2915222644805908, "learning_rate": 2.0287420351976063e-06, "loss": 1.4272, "step": 1142 }, { "epoch": 1.8304, "grad_norm": 1.2881819009780884, "learning_rate": 1.954788144941727e-06, "loss": 1.3726, "step": 1144 }, { "epoch": 1.8336000000000001, "grad_norm": 1.2942179441452026, "learning_rate": 1.8821804479013772e-06, "loss": 1.3269, "step": 1146 }, { "epoch": 1.8368, "grad_norm": 1.4517358541488647, "learning_rate": 1.81092097849343e-06, "loss": 1.4521, "step": 1148 }, { "epoch": 1.8399999999999999, "grad_norm": 1.3010282516479492, "learning_rate": 1.7410117333583498e-06, "loss": 1.3935, "step": 1150 }, { "epoch": 1.8432, "grad_norm": 1.5645641088485718, "learning_rate": 1.6724546713042577e-06, "loss": 1.3637, "step": 1152 }, { "epoch": 1.8464, "grad_norm": 1.4219244718551636, "learning_rate": 1.6052517132520651e-06, "loss": 1.4567, "step": 1154 }, { "epoch": 1.8496000000000001, "grad_norm": 1.4337780475616455, "learning_rate": 1.5394047421816327e-06, "loss": 1.429, "step": 1156 }, { "epoch": 1.8528, "grad_norm": 1.561288595199585, "learning_rate": 1.4749156030790024e-06, "loss": 1.4846, "step": 1158 }, { "epoch": 1.8559999999999999, "grad_norm": 1.3074960708618164, "learning_rate": 1.4117861028847267e-06, "loss": 1.3928, "step": 1160 }, { "epoch": 1.8592, "grad_norm": 1.285032033920288, "learning_rate": 1.3500180104432325e-06, "loss": 1.365, "step": 1162 }, { "epoch": 1.8624, "grad_norm": 1.2760525941848755, "learning_rate": 1.2896130564532427e-06, "loss": 1.4376, "step": 1164 }, { "epoch": 1.8656000000000001, "grad_norm": 1.3905929327011108, "learning_rate": 1.2305729334192994e-06, "loss": 1.4799, "step": 1166 }, { "epoch": 1.8688, "grad_norm": 1.388978362083435, "learning_rate": 1.1728992956043238e-06, "loss": 1.4397, "step": 1168 }, { "epoch": 1.8719999999999999, "grad_norm": 3.00641131401062, "learning_rate": 1.1165937589833087e-06, "loss": 1.5248, "step": 1170 }, { "epoch": 1.8752, "grad_norm": 1.3412961959838867, "learning_rate": 1.061657901197971e-06, "loss": 1.4096, "step": 1172 }, { "epoch": 1.8784, "grad_norm": 1.2458962202072144, "learning_rate": 1.008093261512616e-06, "loss": 1.4463, "step": 1174 }, { "epoch": 1.8816000000000002, "grad_norm": 1.331132411956787, "learning_rate": 9.559013407709595e-07, "loss": 1.3897, "step": 1176 }, { "epoch": 1.8848, "grad_norm": 1.3351112604141235, "learning_rate": 9.050836013541009e-07, "loss": 1.4563, "step": 1178 }, { "epoch": 1.888, "grad_norm": 1.434446930885315, "learning_rate": 8.55641467139534e-07, "loss": 1.4178, "step": 1180 }, { "epoch": 1.8912, "grad_norm": 1.281200885772705, "learning_rate": 8.075763234612622e-07, "loss": 1.4287, "step": 1182 }, { "epoch": 1.8944, "grad_norm": 1.7233554124832153, "learning_rate": 7.60889517070984e-07, "loss": 1.4914, "step": 1184 }, { "epoch": 1.8976, "grad_norm": 1.3849607706069946, "learning_rate": 7.155823561003361e-07, "loss": 1.3976, "step": 1186 }, { "epoch": 1.9008, "grad_norm": 1.3818809986114502, "learning_rate": 6.716561100242658e-07, "loss": 1.4095, "step": 1188 }, { "epoch": 1.904, "grad_norm": 1.2812706232070923, "learning_rate": 6.291120096254433e-07, "loss": 1.3447, "step": 1190 }, { "epoch": 1.9072, "grad_norm": 1.4153764247894287, "learning_rate": 5.879512469598058e-07, "loss": 1.3582, "step": 1192 }, { "epoch": 1.9104, "grad_norm": 1.609390377998352, "learning_rate": 5.481749753231124e-07, "loss": 1.3982, "step": 1194 }, { "epoch": 1.9136, "grad_norm": 1.3431967496871948, "learning_rate": 5.097843092186583e-07, "loss": 1.4785, "step": 1196 }, { "epoch": 1.9167999999999998, "grad_norm": 1.350616216659546, "learning_rate": 4.7278032432604425e-07, "loss": 1.3739, "step": 1198 }, { "epoch": 1.92, "grad_norm": 1.213797688484192, "learning_rate": 4.371640574710345e-07, "loss": 1.4042, "step": 1200 }, { "epoch": 1.9232, "grad_norm": 1.3762907981872559, "learning_rate": 4.0293650659650184e-07, "loss": 1.3671, "step": 1202 }, { "epoch": 1.9264000000000001, "grad_norm": 1.3090656995773315, "learning_rate": 3.7009863073446673e-07, "loss": 1.426, "step": 1204 }, { "epoch": 1.9296, "grad_norm": 1.3925652503967285, "learning_rate": 3.386513499792354e-07, "loss": 1.4414, "step": 1206 }, { "epoch": 1.9327999999999999, "grad_norm": 1.2840951681137085, "learning_rate": 3.0859554546160965e-07, "loss": 1.297, "step": 1208 }, { "epoch": 1.936, "grad_norm": 1.2935904264450073, "learning_rate": 2.7993205932420053e-07, "loss": 1.3802, "step": 1210 }, { "epoch": 1.9392, "grad_norm": 1.2855424880981445, "learning_rate": 2.5266169469783105e-07, "loss": 1.5116, "step": 1212 }, { "epoch": 1.9424000000000001, "grad_norm": 1.345445990562439, "learning_rate": 2.2678521567903176e-07, "loss": 1.4102, "step": 1214 }, { "epoch": 1.9456, "grad_norm": 1.389737606048584, "learning_rate": 2.023033473086411e-07, "loss": 1.4212, "step": 1216 }, { "epoch": 1.9487999999999999, "grad_norm": 1.4330286979675293, "learning_rate": 1.7921677555147177e-07, "loss": 1.4581, "step": 1218 }, { "epoch": 1.952, "grad_norm": 1.428533673286438, "learning_rate": 1.5752614727712057e-07, "loss": 1.3911, "step": 1220 }, { "epoch": 1.9552, "grad_norm": 1.358577847480774, "learning_rate": 1.3723207024180507e-07, "loss": 1.4441, "step": 1222 }, { "epoch": 1.9584000000000001, "grad_norm": 1.422744631767273, "learning_rate": 1.1833511307136613e-07, "loss": 1.4378, "step": 1224 }, { "epoch": 1.9616, "grad_norm": 1.3814959526062012, "learning_rate": 1.0083580524531955e-07, "loss": 1.4551, "step": 1226 }, { "epoch": 1.9647999999999999, "grad_norm": 1.2662824392318726, "learning_rate": 8.473463708202345e-08, "loss": 1.4284, "step": 1228 }, { "epoch": 1.968, "grad_norm": 1.2959048748016357, "learning_rate": 7.003205972494486e-08, "loss": 1.3193, "step": 1230 }, { "epoch": 1.9712, "grad_norm": 1.3156046867370605, "learning_rate": 5.672848513000873e-08, "loss": 1.4021, "step": 1232 }, { "epoch": 1.9744000000000002, "grad_norm": 1.3085626363754272, "learning_rate": 4.482428605407374e-08, "loss": 1.3919, "step": 1234 }, { "epoch": 1.9776, "grad_norm": 1.3836755752563477, "learning_rate": 3.431979604445745e-08, "loss": 1.4616, "step": 1236 }, { "epoch": 1.9808, "grad_norm": 1.3475788831710815, "learning_rate": 2.521530942962702e-08, "loss": 1.4426, "step": 1238 }, { "epoch": 1.984, "grad_norm": 1.4258958101272583, "learning_rate": 1.7511081310922495e-08, "loss": 1.3884, "step": 1240 }, { "epoch": 1.9872, "grad_norm": 1.2867543697357178, "learning_rate": 1.1207327555429192e-08, "loss": 1.3472, "step": 1242 }, { "epoch": 1.9904, "grad_norm": 1.4145276546478271, "learning_rate": 6.304224789910329e-09, "loss": 1.4185, "step": 1244 }, { "epoch": 1.9936, "grad_norm": 1.2391281127929688, "learning_rate": 2.801910395877627e-09, "loss": 1.3197, "step": 1246 }, { "epoch": 1.9968, "grad_norm": 1.4041131734848022, "learning_rate": 7.004825057277398e-10, "loss": 1.4755, "step": 1248 }, { "epoch": 2.0, "grad_norm": 2.3858015537261963, "learning_rate": 0.0, "loss": 1.3312, "step": 1250 } ], "logging_steps": 2, "max_steps": 1250, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 625, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.1133643907229286e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }