diff --git "a/checkpoint-1250/trainer_state.json" "b/checkpoint-1250/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1250/trainer_state.json" @@ -0,0 +1,4408 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 1250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032, + "grad_norm": 60.66320037841797, + "learning_rate": 3.1746031746031746e-06, + "loss": 6.1881, + "step": 2 + }, + { + "epoch": 0.0064, + "grad_norm": 42.748294830322266, + "learning_rate": 6.349206349206349e-06, + "loss": 5.7668, + "step": 4 + }, + { + "epoch": 0.0096, + "grad_norm": 13.887235641479492, + "learning_rate": 9.523809523809523e-06, + "loss": 5.0841, + "step": 6 + }, + { + "epoch": 0.0128, + "grad_norm": 12.488709449768066, + "learning_rate": 1.2698412698412699e-05, + "loss": 4.7192, + "step": 8 + }, + { + "epoch": 0.016, + "grad_norm": 7.208029270172119, + "learning_rate": 1.5873015873015872e-05, + "loss": 4.5679, + "step": 10 + }, + { + "epoch": 0.0192, + "grad_norm": 7.34755277633667, + "learning_rate": 1.9047619047619046e-05, + "loss": 4.368, + "step": 12 + }, + { + "epoch": 0.0224, + "grad_norm": 7.489256858825684, + "learning_rate": 2.2222222222222223e-05, + "loss": 4.1398, + "step": 14 + }, + { + "epoch": 0.0256, + "grad_norm": 4.802424430847168, + "learning_rate": 2.5396825396825397e-05, + "loss": 4.0532, + "step": 16 + }, + { + "epoch": 0.0288, + "grad_norm": 4.138615131378174, + "learning_rate": 2.857142857142857e-05, + "loss": 3.7559, + "step": 18 + }, + { + "epoch": 0.032, + "grad_norm": 5.362161636352539, + "learning_rate": 3.1746031746031745e-05, + "loss": 3.4854, + "step": 20 + }, + { + "epoch": 0.0352, + "grad_norm": 3.878138303756714, + "learning_rate": 3.492063492063492e-05, + "loss": 3.5869, + "step": 22 + }, + { + "epoch": 0.0384, + "grad_norm": 3.480282783508301, + "learning_rate": 3.809523809523809e-05, + "loss": 3.2734, + "step": 24 + }, + { + "epoch": 0.0416, + "grad_norm": 3.527137279510498, + "learning_rate": 4.126984126984127e-05, + "loss": 3.1484, + "step": 26 + }, + { + "epoch": 0.0448, + "grad_norm": 3.094705820083618, + "learning_rate": 4.4444444444444447e-05, + "loss": 3.0356, + "step": 28 + }, + { + "epoch": 0.048, + "grad_norm": 4.064608573913574, + "learning_rate": 4.761904761904762e-05, + "loss": 3.0157, + "step": 30 + }, + { + "epoch": 0.0512, + "grad_norm": 3.166187047958374, + "learning_rate": 5.0793650793650794e-05, + "loss": 2.9229, + "step": 32 + }, + { + "epoch": 0.0544, + "grad_norm": 2.909856081008911, + "learning_rate": 5.396825396825397e-05, + "loss": 2.8386, + "step": 34 + }, + { + "epoch": 0.0576, + "grad_norm": 3.2195472717285156, + "learning_rate": 5.714285714285714e-05, + "loss": 2.7819, + "step": 36 + }, + { + "epoch": 0.0608, + "grad_norm": 2.603515625, + "learning_rate": 6.0317460317460316e-05, + "loss": 2.7116, + "step": 38 + }, + { + "epoch": 0.064, + "grad_norm": 3.353161096572876, + "learning_rate": 6.349206349206349e-05, + "loss": 2.7137, + "step": 40 + }, + { + "epoch": 0.0672, + "grad_norm": 2.7278943061828613, + "learning_rate": 6.666666666666667e-05, + "loss": 2.6213, + "step": 42 + }, + { + "epoch": 0.0704, + "grad_norm": 3.275580883026123, + "learning_rate": 6.984126984126984e-05, + "loss": 2.5585, + "step": 44 + }, + { + "epoch": 0.0736, + "grad_norm": 2.934300422668457, + "learning_rate": 7.301587301587302e-05, + "loss": 2.6126, + "step": 46 + }, + { + "epoch": 0.0768, + "grad_norm": 2.483461380004883, + "learning_rate": 7.619047619047618e-05, + "loss": 2.4616, + "step": 48 + }, + { + "epoch": 0.08, + "grad_norm": 2.5167787075042725, + "learning_rate": 7.936507936507937e-05, + "loss": 2.4316, + "step": 50 + }, + { + "epoch": 0.0832, + "grad_norm": 2.211185932159424, + "learning_rate": 8.253968253968255e-05, + "loss": 2.63, + "step": 52 + }, + { + "epoch": 0.0864, + "grad_norm": 3.2666075229644775, + "learning_rate": 8.571428571428571e-05, + "loss": 2.4892, + "step": 54 + }, + { + "epoch": 0.0896, + "grad_norm": 2.849605083465576, + "learning_rate": 8.888888888888889e-05, + "loss": 2.4584, + "step": 56 + }, + { + "epoch": 0.0928, + "grad_norm": 3.0103588104248047, + "learning_rate": 9.206349206349206e-05, + "loss": 2.2766, + "step": 58 + }, + { + "epoch": 0.096, + "grad_norm": 2.306534767150879, + "learning_rate": 9.523809523809524e-05, + "loss": 2.4357, + "step": 60 + }, + { + "epoch": 0.0992, + "grad_norm": 2.3400485515594482, + "learning_rate": 9.841269841269841e-05, + "loss": 2.4118, + "step": 62 + }, + { + "epoch": 0.1024, + "grad_norm": 2.583407163619995, + "learning_rate": 9.99998248790669e-05, + "loss": 2.2326, + "step": 64 + }, + { + "epoch": 0.1056, + "grad_norm": 2.57265567779541, + "learning_rate": 9.999842391896222e-05, + "loss": 2.2923, + "step": 66 + }, + { + "epoch": 0.1088, + "grad_norm": 2.307471990585327, + "learning_rate": 9.999562203800676e-05, + "loss": 2.2235, + "step": 68 + }, + { + "epoch": 0.112, + "grad_norm": 2.357084035873413, + "learning_rate": 9.999141931470729e-05, + "loss": 2.2896, + "step": 70 + }, + { + "epoch": 0.1152, + "grad_norm": 2.6181581020355225, + "learning_rate": 9.998581586682116e-05, + "loss": 2.3015, + "step": 72 + }, + { + "epoch": 0.1184, + "grad_norm": 2.253117799758911, + "learning_rate": 9.997881185135307e-05, + "loss": 2.1824, + "step": 74 + }, + { + "epoch": 0.1216, + "grad_norm": 2.546729326248169, + "learning_rate": 9.997040746455062e-05, + "loss": 2.1502, + "step": 76 + }, + { + "epoch": 0.1248, + "grad_norm": 2.4144699573516846, + "learning_rate": 9.996060294189887e-05, + "loss": 2.3715, + "step": 78 + }, + { + "epoch": 0.128, + "grad_norm": 2.3093016147613525, + "learning_rate": 9.994939855811362e-05, + "loss": 2.2753, + "step": 80 + }, + { + "epoch": 0.1312, + "grad_norm": 2.5628409385681152, + "learning_rate": 9.993679462713395e-05, + "loss": 2.3152, + "step": 82 + }, + { + "epoch": 0.1344, + "grad_norm": 2.549136161804199, + "learning_rate": 9.992279150211314e-05, + "loss": 2.1171, + "step": 84 + }, + { + "epoch": 0.1376, + "grad_norm": 2.4570610523223877, + "learning_rate": 9.990738957540896e-05, + "loss": 2.2414, + "step": 86 + }, + { + "epoch": 0.1408, + "grad_norm": 2.256564140319824, + "learning_rate": 9.989058927857263e-05, + "loss": 2.1324, + "step": 88 + }, + { + "epoch": 0.144, + "grad_norm": 2.818751811981201, + "learning_rate": 9.987239108233668e-05, + "loss": 2.184, + "step": 90 + }, + { + "epoch": 0.1472, + "grad_norm": 2.432871103286743, + "learning_rate": 9.985279549660185e-05, + "loss": 2.1899, + "step": 92 + }, + { + "epoch": 0.1504, + "grad_norm": 2.1021323204040527, + "learning_rate": 9.983180307042274e-05, + "loss": 2.1064, + "step": 94 + }, + { + "epoch": 0.1536, + "grad_norm": 2.7487058639526367, + "learning_rate": 9.980941439199246e-05, + "loss": 2.2197, + "step": 96 + }, + { + "epoch": 0.1568, + "grad_norm": 2.82835054397583, + "learning_rate": 9.97856300886261e-05, + "loss": 2.2048, + "step": 98 + }, + { + "epoch": 0.16, + "grad_norm": 2.25872802734375, + "learning_rate": 9.976045082674319e-05, + "loss": 2.1002, + "step": 100 + }, + { + "epoch": 0.1632, + "grad_norm": 2.040614366531372, + "learning_rate": 9.973387731184902e-05, + "loss": 2.1031, + "step": 102 + }, + { + "epoch": 0.1664, + "grad_norm": 2.437248706817627, + "learning_rate": 9.97059102885149e-05, + "loss": 2.1416, + "step": 104 + }, + { + "epoch": 0.1696, + "grad_norm": 2.0928878784179688, + "learning_rate": 9.967655054035727e-05, + "loss": 2.1576, + "step": 106 + }, + { + "epoch": 0.1728, + "grad_norm": 2.2243545055389404, + "learning_rate": 9.964579889001569e-05, + "loss": 1.9863, + "step": 108 + }, + { + "epoch": 0.176, + "grad_norm": 2.1860439777374268, + "learning_rate": 9.961365619912989e-05, + "loss": 2.0016, + "step": 110 + }, + { + "epoch": 0.1792, + "grad_norm": 2.527122735977173, + "learning_rate": 9.95801233683156e-05, + "loss": 2.1272, + "step": 112 + }, + { + "epoch": 0.1824, + "grad_norm": 1.8613876104354858, + "learning_rate": 9.954520133713924e-05, + "loss": 2.2001, + "step": 114 + }, + { + "epoch": 0.1856, + "grad_norm": 2.115910530090332, + "learning_rate": 9.950889108409172e-05, + "loss": 2.0871, + "step": 116 + }, + { + "epoch": 0.1888, + "grad_norm": 2.361309051513672, + "learning_rate": 9.947119362656092e-05, + "loss": 2.017, + "step": 118 + }, + { + "epoch": 0.192, + "grad_norm": 2.09470272064209, + "learning_rate": 9.94321100208032e-05, + "loss": 2.1847, + "step": 120 + }, + { + "epoch": 0.1952, + "grad_norm": 1.9747451543807983, + "learning_rate": 9.939164136191384e-05, + "loss": 2.324, + "step": 122 + }, + { + "epoch": 0.1984, + "grad_norm": 1.8229223489761353, + "learning_rate": 9.934978878379636e-05, + "loss": 2.1454, + "step": 124 + }, + { + "epoch": 0.2016, + "grad_norm": 1.9113378524780273, + "learning_rate": 9.930655345913071e-05, + "loss": 2.0096, + "step": 126 + }, + { + "epoch": 0.2048, + "grad_norm": 2.385289192199707, + "learning_rate": 9.926193659934043e-05, + "loss": 2.1029, + "step": 128 + }, + { + "epoch": 0.208, + "grad_norm": 2.101463794708252, + "learning_rate": 9.921593945455869e-05, + "loss": 2.0172, + "step": 130 + }, + { + "epoch": 0.2112, + "grad_norm": 2.2676024436950684, + "learning_rate": 9.916856331359335e-05, + "loss": 1.9966, + "step": 132 + }, + { + "epoch": 0.2144, + "grad_norm": 2.0168168544769287, + "learning_rate": 9.911980950389067e-05, + "loss": 2.1807, + "step": 134 + }, + { + "epoch": 0.2176, + "grad_norm": 2.1054186820983887, + "learning_rate": 9.906967939149831e-05, + "loss": 1.9759, + "step": 136 + }, + { + "epoch": 0.2208, + "grad_norm": 2.3354573249816895, + "learning_rate": 9.901817438102695e-05, + "loss": 1.995, + "step": 138 + }, + { + "epoch": 0.224, + "grad_norm": 2.2721822261810303, + "learning_rate": 9.896529591561093e-05, + "loss": 2.2239, + "step": 140 + }, + { + "epoch": 0.2272, + "grad_norm": 1.9209738969802856, + "learning_rate": 9.891104547686782e-05, + "loss": 2.0051, + "step": 142 + }, + { + "epoch": 0.2304, + "grad_norm": 1.978259801864624, + "learning_rate": 9.8855424584857e-05, + "loss": 2.0367, + "step": 144 + }, + { + "epoch": 0.2336, + "grad_norm": 1.9169765710830688, + "learning_rate": 9.879843479803691e-05, + "loss": 2.1009, + "step": 146 + }, + { + "epoch": 0.2368, + "grad_norm": 1.8380109071731567, + "learning_rate": 9.874007771322151e-05, + "loss": 2.1456, + "step": 148 + }, + { + "epoch": 0.24, + "grad_norm": 2.1143693923950195, + "learning_rate": 9.868035496553546e-05, + "loss": 1.925, + "step": 150 + }, + { + "epoch": 0.2432, + "grad_norm": 1.8774141073226929, + "learning_rate": 9.86192682283684e-05, + "loss": 1.9616, + "step": 152 + }, + { + "epoch": 0.2464, + "grad_norm": 2.3532581329345703, + "learning_rate": 9.855681921332793e-05, + "loss": 2.0289, + "step": 154 + }, + { + "epoch": 0.2496, + "grad_norm": 2.1421797275543213, + "learning_rate": 9.849300967019175e-05, + "loss": 2.0153, + "step": 156 + }, + { + "epoch": 0.2528, + "grad_norm": 2.0029852390289307, + "learning_rate": 9.84278413868586e-05, + "loss": 2.0726, + "step": 158 + }, + { + "epoch": 0.256, + "grad_norm": 2.0344998836517334, + "learning_rate": 9.836131618929819e-05, + "loss": 2.0215, + "step": 160 + }, + { + "epoch": 0.2592, + "grad_norm": 1.8781356811523438, + "learning_rate": 9.82934359415e-05, + "loss": 2.0622, + "step": 162 + }, + { + "epoch": 0.2624, + "grad_norm": 1.9795514345169067, + "learning_rate": 9.822420254542108e-05, + "loss": 2.0249, + "step": 164 + }, + { + "epoch": 0.2656, + "grad_norm": 2.012881278991699, + "learning_rate": 9.815361794093272e-05, + "loss": 1.9815, + "step": 166 + }, + { + "epoch": 0.2688, + "grad_norm": 2.264941453933716, + "learning_rate": 9.808168410576617e-05, + "loss": 2.0232, + "step": 168 + }, + { + "epoch": 0.272, + "grad_norm": 2.4006729125976562, + "learning_rate": 9.800840305545715e-05, + "loss": 2.0844, + "step": 170 + }, + { + "epoch": 0.2752, + "grad_norm": 2.0443308353424072, + "learning_rate": 9.793377684328939e-05, + "loss": 2.2302, + "step": 172 + }, + { + "epoch": 0.2784, + "grad_norm": 2.164515972137451, + "learning_rate": 9.785780756023714e-05, + "loss": 1.9808, + "step": 174 + }, + { + "epoch": 0.2816, + "grad_norm": 1.9512875080108643, + "learning_rate": 9.778049733490655e-05, + "loss": 2.0968, + "step": 176 + }, + { + "epoch": 0.2848, + "grad_norm": 1.9964834451675415, + "learning_rate": 9.770184833347606e-05, + "loss": 1.9889, + "step": 178 + }, + { + "epoch": 0.288, + "grad_norm": 1.9380826950073242, + "learning_rate": 9.762186275963563e-05, + "loss": 1.9766, + "step": 180 + }, + { + "epoch": 0.2912, + "grad_norm": 1.943260669708252, + "learning_rate": 9.754054285452506e-05, + "loss": 1.9298, + "step": 182 + }, + { + "epoch": 0.2944, + "grad_norm": 2.1821844577789307, + "learning_rate": 9.745789089667121e-05, + "loss": 2.1202, + "step": 184 + }, + { + "epoch": 0.2976, + "grad_norm": 1.7526299953460693, + "learning_rate": 9.737390920192408e-05, + "loss": 2.0635, + "step": 186 + }, + { + "epoch": 0.3008, + "grad_norm": 2.229520082473755, + "learning_rate": 9.7288600123392e-05, + "loss": 1.9582, + "step": 188 + }, + { + "epoch": 0.304, + "grad_norm": 2.3614768981933594, + "learning_rate": 9.720196605137565e-05, + "loss": 2.0278, + "step": 190 + }, + { + "epoch": 0.3072, + "grad_norm": 2.1270534992218018, + "learning_rate": 9.71140094133011e-05, + "loss": 2.1036, + "step": 192 + }, + { + "epoch": 0.3104, + "grad_norm": 2.2983131408691406, + "learning_rate": 9.702473267365182e-05, + "loss": 2.0558, + "step": 194 + }, + { + "epoch": 0.3136, + "grad_norm": 1.9561504125595093, + "learning_rate": 9.693413833389956e-05, + "loss": 1.9173, + "step": 196 + }, + { + "epoch": 0.3168, + "grad_norm": 2.234160900115967, + "learning_rate": 9.684222893243431e-05, + "loss": 2.1188, + "step": 198 + }, + { + "epoch": 0.32, + "grad_norm": 1.883965015411377, + "learning_rate": 9.674900704449324e-05, + "loss": 1.9584, + "step": 200 + }, + { + "epoch": 0.3232, + "grad_norm": 1.7237235307693481, + "learning_rate": 9.665447528208836e-05, + "loss": 1.9351, + "step": 202 + }, + { + "epoch": 0.3264, + "grad_norm": 2.0437498092651367, + "learning_rate": 9.655863629393351e-05, + "loss": 1.9079, + "step": 204 + }, + { + "epoch": 0.3296, + "grad_norm": 2.014540195465088, + "learning_rate": 9.64614927653701e-05, + "loss": 1.8612, + "step": 206 + }, + { + "epoch": 0.3328, + "grad_norm": 2.379439115524292, + "learning_rate": 9.636304741829181e-05, + "loss": 1.9976, + "step": 208 + }, + { + "epoch": 0.336, + "grad_norm": 1.962538242340088, + "learning_rate": 9.626330301106837e-05, + "loss": 1.932, + "step": 210 + }, + { + "epoch": 0.3392, + "grad_norm": 1.862244725227356, + "learning_rate": 9.616226233846828e-05, + "loss": 1.8992, + "step": 212 + }, + { + "epoch": 0.3424, + "grad_norm": 1.7304776906967163, + "learning_rate": 9.605992823158046e-05, + "loss": 2.0777, + "step": 214 + }, + { + "epoch": 0.3456, + "grad_norm": 2.2403054237365723, + "learning_rate": 9.595630355773501e-05, + "loss": 1.8658, + "step": 216 + }, + { + "epoch": 0.3488, + "grad_norm": 3.3899903297424316, + "learning_rate": 9.585139122042274e-05, + "loss": 1.9963, + "step": 218 + }, + { + "epoch": 0.352, + "grad_norm": 2.261810064315796, + "learning_rate": 9.574519415921396e-05, + "loss": 1.947, + "step": 220 + }, + { + "epoch": 0.3552, + "grad_norm": 2.2053134441375732, + "learning_rate": 9.5637715349676e-05, + "loss": 2.0544, + "step": 222 + }, + { + "epoch": 0.3584, + "grad_norm": 1.871773362159729, + "learning_rate": 9.552895780328987e-05, + "loss": 1.8976, + "step": 224 + }, + { + "epoch": 0.3616, + "grad_norm": 1.6700202226638794, + "learning_rate": 9.541892456736595e-05, + "loss": 2.1166, + "step": 226 + }, + { + "epoch": 0.3648, + "grad_norm": 1.9986639022827148, + "learning_rate": 9.530761872495849e-05, + "loss": 1.9311, + "step": 228 + }, + { + "epoch": 0.368, + "grad_norm": 2.288973331451416, + "learning_rate": 9.519504339477932e-05, + "loss": 1.98, + "step": 230 + }, + { + "epoch": 0.3712, + "grad_norm": 2.177896738052368, + "learning_rate": 9.508120173111039e-05, + "loss": 1.862, + "step": 232 + }, + { + "epoch": 0.3744, + "grad_norm": 1.9860484600067139, + "learning_rate": 9.496609692371548e-05, + "loss": 1.9192, + "step": 234 + }, + { + "epoch": 0.3776, + "grad_norm": 1.924127221107483, + "learning_rate": 9.484973219775074e-05, + "loss": 1.871, + "step": 236 + }, + { + "epoch": 0.3808, + "grad_norm": 1.9022867679595947, + "learning_rate": 9.473211081367436e-05, + "loss": 1.9067, + "step": 238 + }, + { + "epoch": 0.384, + "grad_norm": 1.7447446584701538, + "learning_rate": 9.46132360671552e-05, + "loss": 1.8984, + "step": 240 + }, + { + "epoch": 0.3872, + "grad_norm": 2.809067487716675, + "learning_rate": 9.449311128898049e-05, + "loss": 1.8327, + "step": 242 + }, + { + "epoch": 0.3904, + "grad_norm": 1.9946494102478027, + "learning_rate": 9.437173984496246e-05, + "loss": 1.9735, + "step": 244 + }, + { + "epoch": 0.3936, + "grad_norm": 1.8834348917007446, + "learning_rate": 9.424912513584401e-05, + "loss": 2.0294, + "step": 246 + }, + { + "epoch": 0.3968, + "grad_norm": 1.9426389932632446, + "learning_rate": 9.412527059720352e-05, + "loss": 1.9919, + "step": 248 + }, + { + "epoch": 0.4, + "grad_norm": 1.823935627937317, + "learning_rate": 9.400017969935848e-05, + "loss": 1.8907, + "step": 250 + }, + { + "epoch": 0.4032, + "grad_norm": 2.1048786640167236, + "learning_rate": 9.387385594726829e-05, + "loss": 1.8855, + "step": 252 + }, + { + "epoch": 0.4064, + "grad_norm": 1.9253580570220947, + "learning_rate": 9.374630288043614e-05, + "loss": 2.0577, + "step": 254 + }, + { + "epoch": 0.4096, + "grad_norm": 1.785396695137024, + "learning_rate": 9.361752407280965e-05, + "loss": 1.9675, + "step": 256 + }, + { + "epoch": 0.4128, + "grad_norm": 1.9203846454620361, + "learning_rate": 9.348752313268093e-05, + "loss": 1.8934, + "step": 258 + }, + { + "epoch": 0.416, + "grad_norm": 1.986392855644226, + "learning_rate": 9.335630370258533e-05, + "loss": 1.9838, + "step": 260 + }, + { + "epoch": 0.4192, + "grad_norm": 1.953905463218689, + "learning_rate": 9.322386945919946e-05, + "loss": 1.7604, + "step": 262 + }, + { + "epoch": 0.4224, + "grad_norm": 1.7314627170562744, + "learning_rate": 9.309022411323816e-05, + "loss": 2.0328, + "step": 264 + }, + { + "epoch": 0.4256, + "grad_norm": 1.6745048761367798, + "learning_rate": 9.295537140935049e-05, + "loss": 1.9734, + "step": 266 + }, + { + "epoch": 0.4288, + "grad_norm": 1.8622961044311523, + "learning_rate": 9.281931512601485e-05, + "loss": 1.9509, + "step": 268 + }, + { + "epoch": 0.432, + "grad_norm": 2.014514684677124, + "learning_rate": 9.26820590754331e-05, + "loss": 1.8272, + "step": 270 + }, + { + "epoch": 0.4352, + "grad_norm": 2.118647336959839, + "learning_rate": 9.254360710342371e-05, + "loss": 1.8347, + "step": 272 + }, + { + "epoch": 0.4384, + "grad_norm": 2.04239821434021, + "learning_rate": 9.240396308931407e-05, + "loss": 1.8675, + "step": 274 + }, + { + "epoch": 0.4416, + "grad_norm": 1.951341152191162, + "learning_rate": 9.226313094583173e-05, + "loss": 1.9559, + "step": 276 + }, + { + "epoch": 0.4448, + "grad_norm": 1.7053275108337402, + "learning_rate": 9.212111461899479e-05, + "loss": 2.0715, + "step": 278 + }, + { + "epoch": 0.448, + "grad_norm": 1.7789607048034668, + "learning_rate": 9.197791808800135e-05, + "loss": 1.89, + "step": 280 + }, + { + "epoch": 0.4512, + "grad_norm": 1.8625364303588867, + "learning_rate": 9.183354536511803e-05, + "loss": 1.9809, + "step": 282 + }, + { + "epoch": 0.4544, + "grad_norm": 1.6965309381484985, + "learning_rate": 9.168800049556747e-05, + "loss": 1.8365, + "step": 284 + }, + { + "epoch": 0.4576, + "grad_norm": 2.1207497119903564, + "learning_rate": 9.154128755741509e-05, + "loss": 1.8314, + "step": 286 + }, + { + "epoch": 0.4608, + "grad_norm": 1.8182010650634766, + "learning_rate": 9.139341066145472e-05, + "loss": 1.8906, + "step": 288 + }, + { + "epoch": 0.464, + "grad_norm": 1.977777361869812, + "learning_rate": 9.124437395109353e-05, + "loss": 1.8562, + "step": 290 + }, + { + "epoch": 0.4672, + "grad_norm": 1.9953404664993286, + "learning_rate": 9.109418160223585e-05, + "loss": 1.8364, + "step": 292 + }, + { + "epoch": 0.4704, + "grad_norm": 1.9941433668136597, + "learning_rate": 9.094283782316619e-05, + "loss": 1.7585, + "step": 294 + }, + { + "epoch": 0.4736, + "grad_norm": 1.9799609184265137, + "learning_rate": 9.079034685443133e-05, + "loss": 1.8669, + "step": 296 + }, + { + "epoch": 0.4768, + "grad_norm": 1.755238652229309, + "learning_rate": 9.063671296872149e-05, + "loss": 1.8001, + "step": 298 + }, + { + "epoch": 0.48, + "grad_norm": 2.059305429458618, + "learning_rate": 9.048194047075069e-05, + "loss": 1.9259, + "step": 300 + }, + { + "epoch": 0.4832, + "grad_norm": 1.7116378545761108, + "learning_rate": 9.032603369713596e-05, + "loss": 1.6954, + "step": 302 + }, + { + "epoch": 0.4864, + "grad_norm": 2.472815990447998, + "learning_rate": 9.016899701627604e-05, + "loss": 1.8413, + "step": 304 + }, + { + "epoch": 0.4896, + "grad_norm": 1.8934400081634521, + "learning_rate": 9.00108348282288e-05, + "loss": 1.9545, + "step": 306 + }, + { + "epoch": 0.4928, + "grad_norm": 2.147753953933716, + "learning_rate": 8.985155156458811e-05, + "loss": 1.7679, + "step": 308 + }, + { + "epoch": 0.496, + "grad_norm": 2.2302675247192383, + "learning_rate": 8.969115168835954e-05, + "loss": 1.8257, + "step": 310 + }, + { + "epoch": 0.4992, + "grad_norm": 1.6578640937805176, + "learning_rate": 8.952963969383538e-05, + "loss": 1.7151, + "step": 312 + }, + { + "epoch": 0.5024, + "grad_norm": 1.754835844039917, + "learning_rate": 8.93670201064687e-05, + "loss": 2.0074, + "step": 314 + }, + { + "epoch": 0.5056, + "grad_norm": 2.130150556564331, + "learning_rate": 8.920329748274649e-05, + "loss": 1.8657, + "step": 316 + }, + { + "epoch": 0.5088, + "grad_norm": 1.7068381309509277, + "learning_rate": 8.903847641006218e-05, + "loss": 1.8955, + "step": 318 + }, + { + "epoch": 0.512, + "grad_norm": 2.0879528522491455, + "learning_rate": 8.887256150658684e-05, + "loss": 1.7092, + "step": 320 + }, + { + "epoch": 0.5152, + "grad_norm": 1.8985047340393066, + "learning_rate": 8.870555742113998e-05, + "loss": 1.8091, + "step": 322 + }, + { + "epoch": 0.5184, + "grad_norm": 1.7577992677688599, + "learning_rate": 8.85374688330592e-05, + "loss": 1.8895, + "step": 324 + }, + { + "epoch": 0.5216, + "grad_norm": 1.8277013301849365, + "learning_rate": 8.836830045206911e-05, + "loss": 1.8192, + "step": 326 + }, + { + "epoch": 0.5248, + "grad_norm": 1.8492199182510376, + "learning_rate": 8.81980570181494e-05, + "loss": 2.0282, + "step": 328 + }, + { + "epoch": 0.528, + "grad_norm": 1.8850246667861938, + "learning_rate": 8.802674330140192e-05, + "loss": 1.7955, + "step": 330 + }, + { + "epoch": 0.5312, + "grad_norm": 1.7965402603149414, + "learning_rate": 8.785436410191714e-05, + "loss": 1.8271, + "step": 332 + }, + { + "epoch": 0.5344, + "grad_norm": 2.0495541095733643, + "learning_rate": 8.76809242496396e-05, + "loss": 1.9308, + "step": 334 + }, + { + "epoch": 0.5376, + "grad_norm": 1.8388515710830688, + "learning_rate": 8.750642860423262e-05, + "loss": 1.8831, + "step": 336 + }, + { + "epoch": 0.5408, + "grad_norm": 2.2101669311523438, + "learning_rate": 8.733088205494205e-05, + "loss": 1.9837, + "step": 338 + }, + { + "epoch": 0.544, + "grad_norm": 1.7564021348953247, + "learning_rate": 8.715428952045936e-05, + "loss": 2.0114, + "step": 340 + }, + { + "epoch": 0.5472, + "grad_norm": 2.0515785217285156, + "learning_rate": 8.697665594878382e-05, + "loss": 1.7574, + "step": 342 + }, + { + "epoch": 0.5504, + "grad_norm": 2.1503772735595703, + "learning_rate": 8.679798631708375e-05, + "loss": 1.9549, + "step": 344 + }, + { + "epoch": 0.5536, + "grad_norm": 1.6707327365875244, + "learning_rate": 8.661828563155727e-05, + "loss": 1.9318, + "step": 346 + }, + { + "epoch": 0.5568, + "grad_norm": 1.9014642238616943, + "learning_rate": 8.643755892729179e-05, + "loss": 1.9853, + "step": 348 + }, + { + "epoch": 0.56, + "grad_norm": 1.9820547103881836, + "learning_rate": 8.625581126812312e-05, + "loss": 1.8178, + "step": 350 + }, + { + "epoch": 0.5632, + "grad_norm": 2.810029983520508, + "learning_rate": 8.607304774649349e-05, + "loss": 2.0081, + "step": 352 + }, + { + "epoch": 0.5664, + "grad_norm": 1.8511972427368164, + "learning_rate": 8.588927348330887e-05, + "loss": 1.7794, + "step": 354 + }, + { + "epoch": 0.5696, + "grad_norm": 1.954455852508545, + "learning_rate": 8.57044936277955e-05, + "loss": 1.9215, + "step": 356 + }, + { + "epoch": 0.5728, + "grad_norm": 1.8836822509765625, + "learning_rate": 8.551871335735565e-05, + "loss": 1.7449, + "step": 358 + }, + { + "epoch": 0.576, + "grad_norm": 1.8966975212097168, + "learning_rate": 8.533193787742251e-05, + "loss": 1.7689, + "step": 360 + }, + { + "epoch": 0.5792, + "grad_norm": 1.7771093845367432, + "learning_rate": 8.51441724213143e-05, + "loss": 1.8151, + "step": 362 + }, + { + "epoch": 0.5824, + "grad_norm": 1.880419135093689, + "learning_rate": 8.495542225008771e-05, + "loss": 1.805, + "step": 364 + }, + { + "epoch": 0.5856, + "grad_norm": 1.820349097251892, + "learning_rate": 8.476569265239046e-05, + "loss": 1.758, + "step": 366 + }, + { + "epoch": 0.5888, + "grad_norm": 1.984392523765564, + "learning_rate": 8.457498894431311e-05, + "loss": 1.7321, + "step": 368 + }, + { + "epoch": 0.592, + "grad_norm": 1.710229516029358, + "learning_rate": 8.438331646924013e-05, + "loss": 1.7819, + "step": 370 + }, + { + "epoch": 0.5952, + "grad_norm": 1.736141324043274, + "learning_rate": 8.419068059770011e-05, + "loss": 1.8351, + "step": 372 + }, + { + "epoch": 0.5984, + "grad_norm": 1.6661279201507568, + "learning_rate": 8.399708672721539e-05, + "loss": 1.803, + "step": 374 + }, + { + "epoch": 0.6016, + "grad_norm": 4.828789710998535, + "learning_rate": 8.380254028215076e-05, + "loss": 1.8539, + "step": 376 + }, + { + "epoch": 0.6048, + "grad_norm": 2.078886032104492, + "learning_rate": 8.360704671356145e-05, + "loss": 1.7976, + "step": 378 + }, + { + "epoch": 0.608, + "grad_norm": 1.720009684562683, + "learning_rate": 8.341061149904045e-05, + "loss": 1.9524, + "step": 380 + }, + { + "epoch": 0.6112, + "grad_norm": 1.935594081878662, + "learning_rate": 8.321324014256504e-05, + "loss": 1.8671, + "step": 382 + }, + { + "epoch": 0.6144, + "grad_norm": 1.868320345878601, + "learning_rate": 8.30149381743425e-05, + "loss": 1.8896, + "step": 384 + }, + { + "epoch": 0.6176, + "grad_norm": 2.0732314586639404, + "learning_rate": 8.28157111506552e-05, + "loss": 1.8446, + "step": 386 + }, + { + "epoch": 0.6208, + "grad_norm": 1.5798280239105225, + "learning_rate": 8.261556465370493e-05, + "loss": 1.9207, + "step": 388 + }, + { + "epoch": 0.624, + "grad_norm": 1.6934467554092407, + "learning_rate": 8.24145042914565e-05, + "loss": 1.7548, + "step": 390 + }, + { + "epoch": 0.6272, + "grad_norm": 1.7732023000717163, + "learning_rate": 8.221253569748055e-05, + "loss": 1.7041, + "step": 392 + }, + { + "epoch": 0.6304, + "grad_norm": 1.9565222263336182, + "learning_rate": 8.200966453079575e-05, + "loss": 1.8865, + "step": 394 + }, + { + "epoch": 0.6336, + "grad_norm": 1.7031235694885254, + "learning_rate": 8.180589647571023e-05, + "loss": 2.0219, + "step": 396 + }, + { + "epoch": 0.6368, + "grad_norm": 1.8705931901931763, + "learning_rate": 8.16012372416623e-05, + "loss": 1.7774, + "step": 398 + }, + { + "epoch": 0.64, + "grad_norm": 1.7355400323867798, + "learning_rate": 8.13956925630605e-05, + "loss": 1.7273, + "step": 400 + }, + { + "epoch": 0.6432, + "grad_norm": 1.7146542072296143, + "learning_rate": 8.118926819912287e-05, + "loss": 1.8275, + "step": 402 + }, + { + "epoch": 0.6464, + "grad_norm": 1.8502819538116455, + "learning_rate": 8.098196993371565e-05, + "loss": 1.856, + "step": 404 + }, + { + "epoch": 0.6496, + "grad_norm": 1.6460517644882202, + "learning_rate": 8.077380357519115e-05, + "loss": 1.7826, + "step": 406 + }, + { + "epoch": 0.6528, + "grad_norm": 1.6977733373641968, + "learning_rate": 8.056477495622511e-05, + "loss": 2.0396, + "step": 408 + }, + { + "epoch": 0.656, + "grad_norm": 2.395606756210327, + "learning_rate": 8.035488993365312e-05, + "loss": 1.755, + "step": 410 + }, + { + "epoch": 0.6592, + "grad_norm": 1.6800931692123413, + "learning_rate": 8.014415438830667e-05, + "loss": 1.9174, + "step": 412 + }, + { + "epoch": 0.6624, + "grad_norm": 1.940741777420044, + "learning_rate": 7.993257422484826e-05, + "loss": 1.7259, + "step": 414 + }, + { + "epoch": 0.6656, + "grad_norm": 1.6088985204696655, + "learning_rate": 7.972015537160602e-05, + "loss": 1.9236, + "step": 416 + }, + { + "epoch": 0.6688, + "grad_norm": 1.77496337890625, + "learning_rate": 7.950690378040758e-05, + "loss": 1.9956, + "step": 418 + }, + { + "epoch": 0.672, + "grad_norm": 2.08013653755188, + "learning_rate": 7.929282542641325e-05, + "loss": 1.71, + "step": 420 + }, + { + "epoch": 0.6752, + "grad_norm": 1.9645555019378662, + "learning_rate": 7.907792630794876e-05, + "loss": 1.6586, + "step": 422 + }, + { + "epoch": 0.6784, + "grad_norm": 2.035111904144287, + "learning_rate": 7.886221244633703e-05, + "loss": 1.8481, + "step": 424 + }, + { + "epoch": 0.6816, + "grad_norm": 1.617519736289978, + "learning_rate": 7.864568988572947e-05, + "loss": 1.8787, + "step": 426 + }, + { + "epoch": 0.6848, + "grad_norm": 1.9266173839569092, + "learning_rate": 7.842836469293673e-05, + "loss": 1.7332, + "step": 428 + }, + { + "epoch": 0.688, + "grad_norm": 1.6716456413269043, + "learning_rate": 7.821024295725865e-05, + "loss": 1.8147, + "step": 430 + }, + { + "epoch": 0.6912, + "grad_norm": 1.9675475358963013, + "learning_rate": 7.79913307903136e-05, + "loss": 1.77, + "step": 432 + }, + { + "epoch": 0.6944, + "grad_norm": 2.048152208328247, + "learning_rate": 7.777163432586734e-05, + "loss": 1.7438, + "step": 434 + }, + { + "epoch": 0.6976, + "grad_norm": 1.7210822105407715, + "learning_rate": 7.755115971966104e-05, + "loss": 1.7988, + "step": 436 + }, + { + "epoch": 0.7008, + "grad_norm": 2.126711845397949, + "learning_rate": 7.732991314923891e-05, + "loss": 1.7376, + "step": 438 + }, + { + "epoch": 0.704, + "grad_norm": 1.7960891723632812, + "learning_rate": 7.710790081377502e-05, + "loss": 1.7875, + "step": 440 + }, + { + "epoch": 0.7072, + "grad_norm": 1.6610071659088135, + "learning_rate": 7.688512893389964e-05, + "loss": 1.7334, + "step": 442 + }, + { + "epoch": 0.7104, + "grad_norm": 1.6998896598815918, + "learning_rate": 7.666160375152496e-05, + "loss": 1.886, + "step": 444 + }, + { + "epoch": 0.7136, + "grad_norm": 1.6629440784454346, + "learning_rate": 7.643733152967019e-05, + "loss": 1.786, + "step": 446 + }, + { + "epoch": 0.7168, + "grad_norm": 1.6910452842712402, + "learning_rate": 7.621231855228604e-05, + "loss": 2.0343, + "step": 448 + }, + { + "epoch": 0.72, + "grad_norm": 1.9952099323272705, + "learning_rate": 7.598657112407865e-05, + "loss": 1.7571, + "step": 450 + }, + { + "epoch": 0.7232, + "grad_norm": 1.7345885038375854, + "learning_rate": 7.576009557033304e-05, + "loss": 2.0908, + "step": 452 + }, + { + "epoch": 0.7264, + "grad_norm": 1.6344877481460571, + "learning_rate": 7.553289823673568e-05, + "loss": 1.8395, + "step": 454 + }, + { + "epoch": 0.7296, + "grad_norm": 2.138115406036377, + "learning_rate": 7.530498548919693e-05, + "loss": 1.7072, + "step": 456 + }, + { + "epoch": 0.7328, + "grad_norm": 1.9216474294662476, + "learning_rate": 7.507636371367246e-05, + "loss": 1.6516, + "step": 458 + }, + { + "epoch": 0.736, + "grad_norm": 1.4932810068130493, + "learning_rate": 7.484703931598445e-05, + "loss": 1.9351, + "step": 460 + }, + { + "epoch": 0.7392, + "grad_norm": 1.8183472156524658, + "learning_rate": 7.461701872164204e-05, + "loss": 1.8441, + "step": 462 + }, + { + "epoch": 0.7424, + "grad_norm": 1.5970336198806763, + "learning_rate": 7.438630837566133e-05, + "loss": 1.8145, + "step": 464 + }, + { + "epoch": 0.7456, + "grad_norm": 1.7351387739181519, + "learning_rate": 7.415491474238475e-05, + "loss": 1.8858, + "step": 466 + }, + { + "epoch": 0.7488, + "grad_norm": 1.6989448070526123, + "learning_rate": 7.39228443053e-05, + "loss": 1.8566, + "step": 468 + }, + { + "epoch": 0.752, + "grad_norm": 1.8217098712921143, + "learning_rate": 7.369010356685833e-05, + "loss": 1.692, + "step": 470 + }, + { + "epoch": 0.7552, + "grad_norm": 1.7833845615386963, + "learning_rate": 7.345669904829237e-05, + "loss": 1.8145, + "step": 472 + }, + { + "epoch": 0.7584, + "grad_norm": 1.7113256454467773, + "learning_rate": 7.32226372894334e-05, + "loss": 1.907, + "step": 474 + }, + { + "epoch": 0.7616, + "grad_norm": 1.66838800907135, + "learning_rate": 7.298792484852808e-05, + "loss": 1.8243, + "step": 476 + }, + { + "epoch": 0.7648, + "grad_norm": 1.8057668209075928, + "learning_rate": 7.27525683020548e-05, + "loss": 1.6788, + "step": 478 + }, + { + "epoch": 0.768, + "grad_norm": 1.7563303709030151, + "learning_rate": 7.251657424453928e-05, + "loss": 2.0148, + "step": 480 + }, + { + "epoch": 0.7712, + "grad_norm": 1.75275719165802, + "learning_rate": 7.227994928836988e-05, + "loss": 1.7584, + "step": 482 + }, + { + "epoch": 0.7744, + "grad_norm": 1.6364191770553589, + "learning_rate": 7.204270006361228e-05, + "loss": 1.9348, + "step": 484 + }, + { + "epoch": 0.7776, + "grad_norm": 1.7930974960327148, + "learning_rate": 7.180483321782374e-05, + "loss": 1.9014, + "step": 486 + }, + { + "epoch": 0.7808, + "grad_norm": 1.8914506435394287, + "learning_rate": 7.156635541586682e-05, + "loss": 1.7977, + "step": 488 + }, + { + "epoch": 0.784, + "grad_norm": 1.7024521827697754, + "learning_rate": 7.132727333972265e-05, + "loss": 1.6993, + "step": 490 + }, + { + "epoch": 0.7872, + "grad_norm": 1.7870112657546997, + "learning_rate": 7.108759368830371e-05, + "loss": 1.6965, + "step": 492 + }, + { + "epoch": 0.7904, + "grad_norm": 1.763691782951355, + "learning_rate": 7.084732317726611e-05, + "loss": 1.7948, + "step": 494 + }, + { + "epoch": 0.7936, + "grad_norm": 1.683468222618103, + "learning_rate": 7.060646853882145e-05, + "loss": 1.9145, + "step": 496 + }, + { + "epoch": 0.7968, + "grad_norm": 1.9888768196105957, + "learning_rate": 7.036503652154812e-05, + "loss": 1.8192, + "step": 498 + }, + { + "epoch": 0.8, + "grad_norm": 1.5705928802490234, + "learning_rate": 7.012303389020234e-05, + "loss": 1.7831, + "step": 500 + }, + { + "epoch": 0.8032, + "grad_norm": 1.860660433769226, + "learning_rate": 6.988046742552845e-05, + "loss": 1.7904, + "step": 502 + }, + { + "epoch": 0.8064, + "grad_norm": 1.8895405530929565, + "learning_rate": 6.963734392406907e-05, + "loss": 1.8645, + "step": 504 + }, + { + "epoch": 0.8096, + "grad_norm": 1.74190354347229, + "learning_rate": 6.93936701979746e-05, + "loss": 1.8455, + "step": 506 + }, + { + "epoch": 0.8128, + "grad_norm": 1.9230369329452515, + "learning_rate": 6.914945307481228e-05, + "loss": 1.8388, + "step": 508 + }, + { + "epoch": 0.816, + "grad_norm": 1.5093566179275513, + "learning_rate": 6.890469939737506e-05, + "loss": 1.752, + "step": 510 + }, + { + "epoch": 0.8192, + "grad_norm": 1.5916728973388672, + "learning_rate": 6.865941602348966e-05, + "loss": 1.7105, + "step": 512 + }, + { + "epoch": 0.8224, + "grad_norm": 1.7378982305526733, + "learning_rate": 6.841360982582463e-05, + "loss": 1.9789, + "step": 514 + }, + { + "epoch": 0.8256, + "grad_norm": 1.7520698308944702, + "learning_rate": 6.816728769169757e-05, + "loss": 1.7566, + "step": 516 + }, + { + "epoch": 0.8288, + "grad_norm": 1.8129826784133911, + "learning_rate": 6.792045652288234e-05, + "loss": 1.8551, + "step": 518 + }, + { + "epoch": 0.832, + "grad_norm": 1.9102818965911865, + "learning_rate": 6.767312323541555e-05, + "loss": 1.7726, + "step": 520 + }, + { + "epoch": 0.8352, + "grad_norm": 1.5088154077529907, + "learning_rate": 6.742529475940284e-05, + "loss": 1.6381, + "step": 522 + }, + { + "epoch": 0.8384, + "grad_norm": 1.7010055780410767, + "learning_rate": 6.717697803882467e-05, + "loss": 1.8741, + "step": 524 + }, + { + "epoch": 0.8416, + "grad_norm": 1.6840184926986694, + "learning_rate": 6.692818003134184e-05, + "loss": 1.8617, + "step": 526 + }, + { + "epoch": 0.8448, + "grad_norm": 1.7205629348754883, + "learning_rate": 6.667890770810035e-05, + "loss": 1.7349, + "step": 528 + }, + { + "epoch": 0.848, + "grad_norm": 1.520727515220642, + "learning_rate": 6.64291680535363e-05, + "loss": 1.749, + "step": 530 + }, + { + "epoch": 0.8512, + "grad_norm": 1.5941743850708008, + "learning_rate": 6.617896806518005e-05, + "loss": 1.7076, + "step": 532 + }, + { + "epoch": 0.8544, + "grad_norm": 1.7745941877365112, + "learning_rate": 6.592831475346018e-05, + "loss": 1.792, + "step": 534 + }, + { + "epoch": 0.8576, + "grad_norm": 1.5072052478790283, + "learning_rate": 6.56772151415071e-05, + "loss": 1.6149, + "step": 536 + }, + { + "epoch": 0.8608, + "grad_norm": 1.6202104091644287, + "learning_rate": 6.542567626495619e-05, + "loss": 1.756, + "step": 538 + }, + { + "epoch": 0.864, + "grad_norm": 1.4974113702774048, + "learning_rate": 6.517370517175081e-05, + "loss": 1.7919, + "step": 540 + }, + { + "epoch": 0.8672, + "grad_norm": 1.653824806213379, + "learning_rate": 6.492130892194461e-05, + "loss": 2.0103, + "step": 542 + }, + { + "epoch": 0.8704, + "grad_norm": 1.683524489402771, + "learning_rate": 6.466849458750394e-05, + "loss": 2.0337, + "step": 544 + }, + { + "epoch": 0.8736, + "grad_norm": 1.5982547998428345, + "learning_rate": 6.441526925210949e-05, + "loss": 1.8919, + "step": 546 + }, + { + "epoch": 0.8768, + "grad_norm": 1.838497519493103, + "learning_rate": 6.416164001095799e-05, + "loss": 1.7648, + "step": 548 + }, + { + "epoch": 0.88, + "grad_norm": 1.524348258972168, + "learning_rate": 6.390761397056328e-05, + "loss": 1.6804, + "step": 550 + }, + { + "epoch": 0.8832, + "grad_norm": 1.6498512029647827, + "learning_rate": 6.365319824855727e-05, + "loss": 1.6334, + "step": 552 + }, + { + "epoch": 0.8864, + "grad_norm": 1.5689668655395508, + "learning_rate": 6.339839997349045e-05, + "loss": 1.9048, + "step": 554 + }, + { + "epoch": 0.8896, + "grad_norm": 1.7050296068191528, + "learning_rate": 6.314322628463219e-05, + "loss": 1.6864, + "step": 556 + }, + { + "epoch": 0.8928, + "grad_norm": 2.038351535797119, + "learning_rate": 6.288768433177068e-05, + "loss": 1.7531, + "step": 558 + }, + { + "epoch": 0.896, + "grad_norm": 1.7489795684814453, + "learning_rate": 6.26317812750126e-05, + "loss": 1.8467, + "step": 560 + }, + { + "epoch": 0.8992, + "grad_norm": 1.6068861484527588, + "learning_rate": 6.237552428458256e-05, + "loss": 1.8459, + "step": 562 + }, + { + "epoch": 0.9024, + "grad_norm": 1.616613745689392, + "learning_rate": 6.21189205406221e-05, + "loss": 1.8173, + "step": 564 + }, + { + "epoch": 0.9056, + "grad_norm": 1.6885602474212646, + "learning_rate": 6.186197723298855e-05, + "loss": 1.8358, + "step": 566 + }, + { + "epoch": 0.9088, + "grad_norm": 1.688711404800415, + "learning_rate": 6.160470156105362e-05, + "loss": 1.6996, + "step": 568 + }, + { + "epoch": 0.912, + "grad_norm": 1.74298894405365, + "learning_rate": 6.134710073350156e-05, + "loss": 1.722, + "step": 570 + }, + { + "epoch": 0.9152, + "grad_norm": 1.6249070167541504, + "learning_rate": 6.108918196812734e-05, + "loss": 1.7909, + "step": 572 + }, + { + "epoch": 0.9184, + "grad_norm": 1.659416675567627, + "learning_rate": 6.083095249163424e-05, + "loss": 1.6625, + "step": 574 + }, + { + "epoch": 0.9216, + "grad_norm": 1.6332143545150757, + "learning_rate": 6.057241953943154e-05, + "loss": 1.8297, + "step": 576 + }, + { + "epoch": 0.9248, + "grad_norm": 1.6717133522033691, + "learning_rate": 6.031359035543158e-05, + "loss": 2.0601, + "step": 578 + }, + { + "epoch": 0.928, + "grad_norm": 1.8736896514892578, + "learning_rate": 6.005447219184702e-05, + "loss": 1.8117, + "step": 580 + }, + { + "epoch": 0.9312, + "grad_norm": 1.6602182388305664, + "learning_rate": 5.9795072308987485e-05, + "loss": 1.7275, + "step": 582 + }, + { + "epoch": 0.9344, + "grad_norm": 1.6776071786880493, + "learning_rate": 5.9535397975056154e-05, + "loss": 1.8988, + "step": 584 + }, + { + "epoch": 0.9376, + "grad_norm": 1.588109016418457, + "learning_rate": 5.927545646594617e-05, + "loss": 1.7716, + "step": 586 + }, + { + "epoch": 0.9408, + "grad_norm": 1.6331814527511597, + "learning_rate": 5.901525506503668e-05, + "loss": 1.8081, + "step": 588 + }, + { + "epoch": 0.944, + "grad_norm": 1.7309777736663818, + "learning_rate": 5.87548010629889e-05, + "loss": 1.8243, + "step": 590 + }, + { + "epoch": 0.9472, + "grad_norm": 1.6374008655548096, + "learning_rate": 5.8494101757541676e-05, + "loss": 1.7065, + "step": 592 + }, + { + "epoch": 0.9504, + "grad_norm": 1.6291025876998901, + "learning_rate": 5.8233164453307156e-05, + "loss": 1.8013, + "step": 594 + }, + { + "epoch": 0.9536, + "grad_norm": 1.7339948415756226, + "learning_rate": 5.797199646156596e-05, + "loss": 1.7998, + "step": 596 + }, + { + "epoch": 0.9568, + "grad_norm": 1.581697940826416, + "learning_rate": 5.7710605100062485e-05, + "loss": 1.645, + "step": 598 + }, + { + "epoch": 0.96, + "grad_norm": 1.713205337524414, + "learning_rate": 5.7448997692799764e-05, + "loss": 1.9092, + "step": 600 + }, + { + "epoch": 0.9632, + "grad_norm": 1.7023775577545166, + "learning_rate": 5.718718156983428e-05, + "loss": 1.7403, + "step": 602 + }, + { + "epoch": 0.9664, + "grad_norm": 1.606632947921753, + "learning_rate": 5.69251640670706e-05, + "loss": 1.679, + "step": 604 + }, + { + "epoch": 0.9696, + "grad_norm": 1.5328476428985596, + "learning_rate": 5.6662952526055793e-05, + "loss": 1.7899, + "step": 606 + }, + { + "epoch": 0.9728, + "grad_norm": 1.5965962409973145, + "learning_rate": 5.6400554293773744e-05, + "loss": 1.7776, + "step": 608 + }, + { + "epoch": 0.976, + "grad_norm": 1.5174623727798462, + "learning_rate": 5.61379767224393e-05, + "loss": 1.6602, + "step": 610 + }, + { + "epoch": 0.9792, + "grad_norm": 1.6876877546310425, + "learning_rate": 5.587522716929228e-05, + "loss": 1.6656, + "step": 612 + }, + { + "epoch": 0.9824, + "grad_norm": 1.5483810901641846, + "learning_rate": 5.561231299639127e-05, + "loss": 1.6531, + "step": 614 + }, + { + "epoch": 0.9856, + "grad_norm": 1.464625597000122, + "learning_rate": 5.534924157040745e-05, + "loss": 1.8967, + "step": 616 + }, + { + "epoch": 0.9888, + "grad_norm": 1.7587417364120483, + "learning_rate": 5.508602026241807e-05, + "loss": 1.6637, + "step": 618 + }, + { + "epoch": 0.992, + "grad_norm": 1.5783720016479492, + "learning_rate": 5.482265644769998e-05, + "loss": 1.7628, + "step": 620 + }, + { + "epoch": 0.9952, + "grad_norm": 1.602127194404602, + "learning_rate": 5.4559157505522985e-05, + "loss": 1.7458, + "step": 622 + }, + { + "epoch": 0.9984, + "grad_norm": 1.8909751176834106, + "learning_rate": 5.429553081894304e-05, + "loss": 1.6952, + "step": 624 + }, + { + "epoch": 1.0016, + "grad_norm": 1.5928688049316406, + "learning_rate": 5.4031783774595455e-05, + "loss": 1.5991, + "step": 626 + }, + { + "epoch": 1.0048, + "grad_norm": 1.487565517425537, + "learning_rate": 5.3767923762487824e-05, + "loss": 1.4893, + "step": 628 + }, + { + "epoch": 1.008, + "grad_norm": 1.4564417600631714, + "learning_rate": 5.3503958175793055e-05, + "loss": 1.4741, + "step": 630 + }, + { + "epoch": 1.0112, + "grad_norm": 1.4703936576843262, + "learning_rate": 5.323989441064216e-05, + "loss": 1.5821, + "step": 632 + }, + { + "epoch": 1.0144, + "grad_norm": 1.746761441230774, + "learning_rate": 5.2975739865917074e-05, + "loss": 1.565, + "step": 634 + }, + { + "epoch": 1.0176, + "grad_norm": 1.7585957050323486, + "learning_rate": 5.271150194304326e-05, + "loss": 1.5653, + "step": 636 + }, + { + "epoch": 1.0208, + "grad_norm": 1.5630764961242676, + "learning_rate": 5.244718804578246e-05, + "loss": 1.496, + "step": 638 + }, + { + "epoch": 1.024, + "grad_norm": 1.6863924264907837, + "learning_rate": 5.218280558002506e-05, + "loss": 1.4683, + "step": 640 + }, + { + "epoch": 1.0272, + "grad_norm": 1.4588528871536255, + "learning_rate": 5.191836195358278e-05, + "loss": 1.513, + "step": 642 + }, + { + "epoch": 1.0304, + "grad_norm": 1.4486134052276611, + "learning_rate": 5.165386457598099e-05, + "loss": 1.484, + "step": 644 + }, + { + "epoch": 1.0336, + "grad_norm": 1.5040372610092163, + "learning_rate": 5.13893208582511e-05, + "loss": 1.4945, + "step": 646 + }, + { + "epoch": 1.0368, + "grad_norm": 1.556773066520691, + "learning_rate": 5.1124738212722966e-05, + "loss": 1.395, + "step": 648 + }, + { + "epoch": 1.04, + "grad_norm": 1.5296416282653809, + "learning_rate": 5.086012405281717e-05, + "loss": 1.3816, + "step": 650 + }, + { + "epoch": 1.0432, + "grad_norm": 1.5270073413848877, + "learning_rate": 5.0595485792837305e-05, + "loss": 1.5622, + "step": 652 + }, + { + "epoch": 1.0464, + "grad_norm": 1.4216368198394775, + "learning_rate": 5.033083084776222e-05, + "loss": 1.4719, + "step": 654 + }, + { + "epoch": 1.0496, + "grad_norm": 1.6059627532958984, + "learning_rate": 5.0066166633038305e-05, + "loss": 1.5124, + "step": 656 + }, + { + "epoch": 1.0528, + "grad_norm": 1.7300755977630615, + "learning_rate": 4.980150056437163e-05, + "loss": 1.4899, + "step": 658 + }, + { + "epoch": 1.056, + "grad_norm": 1.4066635370254517, + "learning_rate": 4.9536840057520224e-05, + "loss": 1.4709, + "step": 660 + }, + { + "epoch": 1.0592, + "grad_norm": 1.563008189201355, + "learning_rate": 4.927219252808631e-05, + "loss": 1.4385, + "step": 662 + }, + { + "epoch": 1.0624, + "grad_norm": 1.6098390817642212, + "learning_rate": 4.900756539130846e-05, + "loss": 1.4528, + "step": 664 + }, + { + "epoch": 1.0656, + "grad_norm": 1.5592678785324097, + "learning_rate": 4.874296606185387e-05, + "loss": 1.5697, + "step": 666 + }, + { + "epoch": 1.0688, + "grad_norm": 1.7256783246994019, + "learning_rate": 4.847840195361058e-05, + "loss": 1.5878, + "step": 668 + }, + { + "epoch": 1.072, + "grad_norm": 3.1409246921539307, + "learning_rate": 4.821388047947979e-05, + "loss": 1.4804, + "step": 670 + }, + { + "epoch": 1.0752, + "grad_norm": 1.415878415107727, + "learning_rate": 4.7949409051168085e-05, + "loss": 1.5137, + "step": 672 + }, + { + "epoch": 1.0784, + "grad_norm": 1.6788074970245361, + "learning_rate": 4.768499507897981e-05, + "loss": 1.4662, + "step": 674 + }, + { + "epoch": 1.0816, + "grad_norm": 1.3528187274932861, + "learning_rate": 4.742064597160948e-05, + "loss": 1.4879, + "step": 676 + }, + { + "epoch": 1.0848, + "grad_norm": 1.5209320783615112, + "learning_rate": 4.715636913593404e-05, + "loss": 1.5457, + "step": 678 + }, + { + "epoch": 1.088, + "grad_norm": 1.7500444650650024, + "learning_rate": 4.689217197680554e-05, + "loss": 1.4667, + "step": 680 + }, + { + "epoch": 1.0912, + "grad_norm": 1.6211199760437012, + "learning_rate": 4.6628061896843474e-05, + "loss": 1.5392, + "step": 682 + }, + { + "epoch": 1.0944, + "grad_norm": 1.584383249282837, + "learning_rate": 4.6364046296227484e-05, + "loss": 1.4909, + "step": 684 + }, + { + "epoch": 1.0976, + "grad_norm": 1.7815152406692505, + "learning_rate": 4.6100132572489915e-05, + "loss": 1.654, + "step": 686 + }, + { + "epoch": 1.1008, + "grad_norm": 1.3984824419021606, + "learning_rate": 4.5836328120308674e-05, + "loss": 1.5179, + "step": 688 + }, + { + "epoch": 1.104, + "grad_norm": 2.673772096633911, + "learning_rate": 4.5572640331299875e-05, + "loss": 1.5183, + "step": 690 + }, + { + "epoch": 1.1072, + "grad_norm": 1.3696187734603882, + "learning_rate": 4.530907659381086e-05, + "loss": 1.4193, + "step": 692 + }, + { + "epoch": 1.1104, + "grad_norm": 1.5257173776626587, + "learning_rate": 4.504564429271311e-05, + "loss": 1.4448, + "step": 694 + }, + { + "epoch": 1.1136, + "grad_norm": 1.4060217142105103, + "learning_rate": 4.478235080919536e-05, + "loss": 1.4528, + "step": 696 + }, + { + "epoch": 1.1168, + "grad_norm": 1.7510145902633667, + "learning_rate": 4.451920352055678e-05, + "loss": 1.4895, + "step": 698 + }, + { + "epoch": 1.12, + "grad_norm": 1.639930009841919, + "learning_rate": 4.425620980000026e-05, + "loss": 1.5675, + "step": 700 + }, + { + "epoch": 1.1232, + "grad_norm": 1.6045302152633667, + "learning_rate": 4.39933770164258e-05, + "loss": 1.546, + "step": 702 + }, + { + "epoch": 1.1264, + "grad_norm": 1.6283916234970093, + "learning_rate": 4.373071253422408e-05, + "loss": 1.5913, + "step": 704 + }, + { + "epoch": 1.1296, + "grad_norm": 1.594617247581482, + "learning_rate": 4.346822371307009e-05, + "loss": 1.5827, + "step": 706 + }, + { + "epoch": 1.1328, + "grad_norm": 1.6242777109146118, + "learning_rate": 4.320591790771691e-05, + "loss": 1.493, + "step": 708 + }, + { + "epoch": 1.1360000000000001, + "grad_norm": 1.445251226425171, + "learning_rate": 4.294380246778966e-05, + "loss": 1.4685, + "step": 710 + }, + { + "epoch": 1.1392, + "grad_norm": 1.4812084436416626, + "learning_rate": 4.2681884737579524e-05, + "loss": 1.4637, + "step": 712 + }, + { + "epoch": 1.1424, + "grad_norm": 1.327941656112671, + "learning_rate": 4.242017205583805e-05, + "loss": 1.4974, + "step": 714 + }, + { + "epoch": 1.1456, + "grad_norm": 1.5381219387054443, + "learning_rate": 4.215867175557142e-05, + "loss": 1.4419, + "step": 716 + }, + { + "epoch": 1.1488, + "grad_norm": 1.447100281715393, + "learning_rate": 4.189739116383506e-05, + "loss": 1.5027, + "step": 718 + }, + { + "epoch": 1.152, + "grad_norm": 1.7003172636032104, + "learning_rate": 4.163633760152834e-05, + "loss": 1.4394, + "step": 720 + }, + { + "epoch": 1.1552, + "grad_norm": 1.5542171001434326, + "learning_rate": 4.137551838318936e-05, + "loss": 1.502, + "step": 722 + }, + { + "epoch": 1.1584, + "grad_norm": 1.4329487085342407, + "learning_rate": 4.1114940816790135e-05, + "loss": 1.4303, + "step": 724 + }, + { + "epoch": 1.1616, + "grad_norm": 1.5292679071426392, + "learning_rate": 4.08546122035317e-05, + "loss": 1.4335, + "step": 726 + }, + { + "epoch": 1.1648, + "grad_norm": 1.5414072275161743, + "learning_rate": 4.059453983763967e-05, + "loss": 1.4574, + "step": 728 + }, + { + "epoch": 1.168, + "grad_norm": 1.5525455474853516, + "learning_rate": 4.03347310061597e-05, + "loss": 1.3842, + "step": 730 + }, + { + "epoch": 1.1712, + "grad_norm": 1.444132685661316, + "learning_rate": 4.007519298875347e-05, + "loss": 1.4233, + "step": 732 + }, + { + "epoch": 1.1743999999999999, + "grad_norm": 1.6456663608551025, + "learning_rate": 3.98159330574946e-05, + "loss": 1.4416, + "step": 734 + }, + { + "epoch": 1.1776, + "grad_norm": 1.5266261100769043, + "learning_rate": 3.955695847666494e-05, + "loss": 1.4292, + "step": 736 + }, + { + "epoch": 1.1808, + "grad_norm": 1.6392107009887695, + "learning_rate": 3.929827650255104e-05, + "loss": 1.5965, + "step": 738 + }, + { + "epoch": 1.184, + "grad_norm": 1.4020483493804932, + "learning_rate": 3.903989438324077e-05, + "loss": 1.3887, + "step": 740 + }, + { + "epoch": 1.1872, + "grad_norm": 1.5396273136138916, + "learning_rate": 3.878181935842033e-05, + "loss": 1.5077, + "step": 742 + }, + { + "epoch": 1.1904, + "grad_norm": 1.5113873481750488, + "learning_rate": 3.85240586591713e-05, + "loss": 1.4624, + "step": 744 + }, + { + "epoch": 1.1936, + "grad_norm": 1.4673871994018555, + "learning_rate": 3.8266619507768126e-05, + "loss": 1.4441, + "step": 746 + }, + { + "epoch": 1.1968, + "grad_norm": 1.387566328048706, + "learning_rate": 3.800950911747565e-05, + "loss": 1.4103, + "step": 748 + }, + { + "epoch": 1.2, + "grad_norm": 1.4239872694015503, + "learning_rate": 3.775273469234712e-05, + "loss": 1.5861, + "step": 750 + }, + { + "epoch": 1.2032, + "grad_norm": 1.672402024269104, + "learning_rate": 3.749630342702221e-05, + "loss": 1.5554, + "step": 752 + }, + { + "epoch": 1.2064, + "grad_norm": 1.3757636547088623, + "learning_rate": 3.724022250652557e-05, + "loss": 1.4742, + "step": 754 + }, + { + "epoch": 1.2096, + "grad_norm": 1.5252386331558228, + "learning_rate": 3.698449910606536e-05, + "loss": 1.4546, + "step": 756 + }, + { + "epoch": 1.2128, + "grad_norm": 1.351670503616333, + "learning_rate": 3.672914039083233e-05, + "loss": 1.4842, + "step": 758 + }, + { + "epoch": 1.216, + "grad_norm": 2.256891965866089, + "learning_rate": 3.6474153515799e-05, + "loss": 1.4753, + "step": 760 + }, + { + "epoch": 1.2192, + "grad_norm": 1.6337233781814575, + "learning_rate": 3.6219545625519145e-05, + "loss": 1.51, + "step": 762 + }, + { + "epoch": 1.2224, + "grad_norm": 1.6732077598571777, + "learning_rate": 3.596532385392772e-05, + "loss": 1.6077, + "step": 764 + }, + { + "epoch": 1.2256, + "grad_norm": 1.410620927810669, + "learning_rate": 3.5711495324140845e-05, + "loss": 1.4561, + "step": 766 + }, + { + "epoch": 1.2288000000000001, + "grad_norm": 1.5092262029647827, + "learning_rate": 3.54580671482563e-05, + "loss": 1.4808, + "step": 768 + }, + { + "epoch": 1.232, + "grad_norm": 1.5939708948135376, + "learning_rate": 3.520504642715424e-05, + "loss": 1.568, + "step": 770 + }, + { + "epoch": 1.2352, + "grad_norm": 1.4955201148986816, + "learning_rate": 3.495244025029822e-05, + "loss": 1.4955, + "step": 772 + }, + { + "epoch": 1.2384, + "grad_norm": 1.6155612468719482, + "learning_rate": 3.470025569553653e-05, + "loss": 1.4821, + "step": 774 + }, + { + "epoch": 1.2416, + "grad_norm": 1.5372660160064697, + "learning_rate": 3.444849982890393e-05, + "loss": 1.4093, + "step": 776 + }, + { + "epoch": 1.2448, + "grad_norm": 1.7753242254257202, + "learning_rate": 3.4197179704423656e-05, + "loss": 1.4489, + "step": 778 + }, + { + "epoch": 1.248, + "grad_norm": 1.5369523763656616, + "learning_rate": 3.39463023639097e-05, + "loss": 1.469, + "step": 780 + }, + { + "epoch": 1.2511999999999999, + "grad_norm": 1.5201616287231445, + "learning_rate": 3.36958748367696e-05, + "loss": 1.4533, + "step": 782 + }, + { + "epoch": 1.2544, + "grad_norm": 1.6345765590667725, + "learning_rate": 3.3445904139807405e-05, + "loss": 1.5058, + "step": 784 + }, + { + "epoch": 1.2576, + "grad_norm": 1.4603197574615479, + "learning_rate": 3.319639727702716e-05, + "loss": 1.4791, + "step": 786 + }, + { + "epoch": 1.2608, + "grad_norm": 1.395509958267212, + "learning_rate": 3.2947361239436525e-05, + "loss": 1.3728, + "step": 788 + }, + { + "epoch": 1.264, + "grad_norm": 1.625909447669983, + "learning_rate": 3.2698803004851026e-05, + "loss": 1.4991, + "step": 790 + }, + { + "epoch": 1.2671999999999999, + "grad_norm": 1.4244285821914673, + "learning_rate": 3.245072953769844e-05, + "loss": 1.5196, + "step": 792 + }, + { + "epoch": 1.2704, + "grad_norm": 1.4113078117370605, + "learning_rate": 3.2203147788823764e-05, + "loss": 1.4194, + "step": 794 + }, + { + "epoch": 1.2736, + "grad_norm": 1.588931918144226, + "learning_rate": 3.1956064695294305e-05, + "loss": 1.4783, + "step": 796 + }, + { + "epoch": 1.2768, + "grad_norm": 1.6504685878753662, + "learning_rate": 3.170948718020546e-05, + "loss": 1.4376, + "step": 798 + }, + { + "epoch": 1.28, + "grad_norm": 1.408836007118225, + "learning_rate": 3.1463422152486674e-05, + "loss": 1.4438, + "step": 800 + }, + { + "epoch": 1.2832, + "grad_norm": 1.6479829549789429, + "learning_rate": 3.121787650670781e-05, + "loss": 1.5115, + "step": 802 + }, + { + "epoch": 1.2864, + "grad_norm": 1.4651159048080444, + "learning_rate": 3.097285712288605e-05, + "loss": 1.4111, + "step": 804 + }, + { + "epoch": 1.2896, + "grad_norm": 1.4425824880599976, + "learning_rate": 3.072837086629305e-05, + "loss": 1.4359, + "step": 806 + }, + { + "epoch": 1.2928, + "grad_norm": 1.4235820770263672, + "learning_rate": 3.0484424587262655e-05, + "loss": 1.4132, + "step": 808 + }, + { + "epoch": 1.296, + "grad_norm": 1.471458911895752, + "learning_rate": 3.024102512099889e-05, + "loss": 1.4653, + "step": 810 + }, + { + "epoch": 1.2992, + "grad_norm": 1.6857844591140747, + "learning_rate": 2.9998179287384485e-05, + "loss": 1.5749, + "step": 812 + }, + { + "epoch": 1.3024, + "grad_norm": 1.4331309795379639, + "learning_rate": 2.975589389078975e-05, + "loss": 1.4682, + "step": 814 + }, + { + "epoch": 1.3056, + "grad_norm": 1.3838329315185547, + "learning_rate": 2.9514175719881966e-05, + "loss": 1.4477, + "step": 816 + }, + { + "epoch": 1.3088, + "grad_norm": 1.350062370300293, + "learning_rate": 2.9273031547435114e-05, + "loss": 1.4243, + "step": 818 + }, + { + "epoch": 1.312, + "grad_norm": 1.5023717880249023, + "learning_rate": 2.9032468130140168e-05, + "loss": 1.4972, + "step": 820 + }, + { + "epoch": 1.3152, + "grad_norm": 1.5788859128952026, + "learning_rate": 2.8792492208415768e-05, + "loss": 1.5411, + "step": 822 + }, + { + "epoch": 1.3184, + "grad_norm": 1.4530489444732666, + "learning_rate": 2.8553110506219283e-05, + "loss": 1.4582, + "step": 824 + }, + { + "epoch": 1.3216, + "grad_norm": 1.3994284868240356, + "learning_rate": 2.831432973085848e-05, + "loss": 1.4381, + "step": 826 + }, + { + "epoch": 1.3248, + "grad_norm": 1.51462984085083, + "learning_rate": 2.8076156572803635e-05, + "loss": 1.4712, + "step": 828 + }, + { + "epoch": 1.328, + "grad_norm": 1.505707859992981, + "learning_rate": 2.783859770549996e-05, + "loss": 1.4613, + "step": 830 + }, + { + "epoch": 1.3312, + "grad_norm": 1.4707581996917725, + "learning_rate": 2.760165978518067e-05, + "loss": 1.4202, + "step": 832 + }, + { + "epoch": 1.3344, + "grad_norm": 1.5572878122329712, + "learning_rate": 2.7365349450680466e-05, + "loss": 1.4835, + "step": 834 + }, + { + "epoch": 1.3376000000000001, + "grad_norm": 1.6469281911849976, + "learning_rate": 2.7129673323249604e-05, + "loss": 1.4834, + "step": 836 + }, + { + "epoch": 1.3408, + "grad_norm": 1.6217268705368042, + "learning_rate": 2.689463800636824e-05, + "loss": 1.4905, + "step": 838 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 1.3879499435424805, + "learning_rate": 2.6660250085561457e-05, + "loss": 1.4575, + "step": 840 + }, + { + "epoch": 1.3472, + "grad_norm": 1.436532735824585, + "learning_rate": 2.6426516128214807e-05, + "loss": 1.5241, + "step": 842 + }, + { + "epoch": 1.3504, + "grad_norm": 1.4625749588012695, + "learning_rate": 2.619344268339021e-05, + "loss": 1.4646, + "step": 844 + }, + { + "epoch": 1.3536000000000001, + "grad_norm": 1.2639025449752808, + "learning_rate": 2.5961036281642493e-05, + "loss": 1.4009, + "step": 846 + }, + { + "epoch": 1.3568, + "grad_norm": 1.3335165977478027, + "learning_rate": 2.572930343483637e-05, + "loss": 1.3794, + "step": 848 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 1.357528805732727, + "learning_rate": 2.54982506359641e-05, + "loss": 1.4393, + "step": 850 + }, + { + "epoch": 1.3632, + "grad_norm": 1.277764916419983, + "learning_rate": 2.526788435896339e-05, + "loss": 1.4483, + "step": 852 + }, + { + "epoch": 1.3664, + "grad_norm": 1.3506453037261963, + "learning_rate": 2.5038211058536133e-05, + "loss": 1.4466, + "step": 854 + }, + { + "epoch": 1.3696, + "grad_norm": 1.5710474252700806, + "learning_rate": 2.4809237169967458e-05, + "loss": 1.553, + "step": 856 + }, + { + "epoch": 1.3728, + "grad_norm": 1.3960719108581543, + "learning_rate": 2.4580969108945533e-05, + "loss": 1.4599, + "step": 858 + }, + { + "epoch": 1.376, + "grad_norm": 1.4087730646133423, + "learning_rate": 2.435341327138168e-05, + "loss": 1.4237, + "step": 860 + }, + { + "epoch": 1.3792, + "grad_norm": 1.4425933361053467, + "learning_rate": 2.4126576033231208e-05, + "loss": 1.3405, + "step": 862 + }, + { + "epoch": 1.3824, + "grad_norm": 1.4608137607574463, + "learning_rate": 2.3900463750314834e-05, + "loss": 1.4638, + "step": 864 + }, + { + "epoch": 1.3856, + "grad_norm": 1.4107086658477783, + "learning_rate": 2.3675082758140475e-05, + "loss": 1.5278, + "step": 866 + }, + { + "epoch": 1.3888, + "grad_norm": 1.3733059167861938, + "learning_rate": 2.3450439371725825e-05, + "loss": 1.5054, + "step": 868 + }, + { + "epoch": 1.392, + "grad_norm": 1.3773467540740967, + "learning_rate": 2.3226539885421343e-05, + "loss": 1.4151, + "step": 870 + }, + { + "epoch": 1.3952, + "grad_norm": 1.4656720161437988, + "learning_rate": 2.3003390572734006e-05, + "loss": 1.4811, + "step": 872 + }, + { + "epoch": 1.3984, + "grad_norm": 1.3903800249099731, + "learning_rate": 2.2780997686151378e-05, + "loss": 1.4274, + "step": 874 + }, + { + "epoch": 1.4016, + "grad_norm": 1.3573490381240845, + "learning_rate": 2.255936745696652e-05, + "loss": 1.4141, + "step": 876 + }, + { + "epoch": 1.4048, + "grad_norm": 1.312809944152832, + "learning_rate": 2.2338506095103334e-05, + "loss": 1.431, + "step": 878 + }, + { + "epoch": 1.408, + "grad_norm": 1.3963521718978882, + "learning_rate": 2.2118419788942672e-05, + "loss": 1.4246, + "step": 880 + }, + { + "epoch": 1.4112, + "grad_norm": 1.5214847326278687, + "learning_rate": 2.189911470514881e-05, + "loss": 1.4489, + "step": 882 + }, + { + "epoch": 1.4144, + "grad_norm": 1.5130972862243652, + "learning_rate": 2.1680596988496705e-05, + "loss": 1.4718, + "step": 884 + }, + { + "epoch": 1.4176, + "grad_norm": 1.4655705690383911, + "learning_rate": 2.1462872761699905e-05, + "loss": 1.4782, + "step": 886 + }, + { + "epoch": 1.4208, + "grad_norm": 1.4640536308288574, + "learning_rate": 2.1245948125238867e-05, + "loss": 1.4003, + "step": 888 + }, + { + "epoch": 1.424, + "grad_norm": 1.50718092918396, + "learning_rate": 2.1029829157190117e-05, + "loss": 1.43, + "step": 890 + }, + { + "epoch": 1.4272, + "grad_norm": 1.3067643642425537, + "learning_rate": 2.081452191305587e-05, + "loss": 1.4523, + "step": 892 + }, + { + "epoch": 1.4304000000000001, + "grad_norm": 1.4971873760223389, + "learning_rate": 2.06000324255945e-05, + "loss": 1.4485, + "step": 894 + }, + { + "epoch": 1.4336, + "grad_norm": 1.4533696174621582, + "learning_rate": 2.0386366704651315e-05, + "loss": 1.4637, + "step": 896 + }, + { + "epoch": 1.4368, + "grad_norm": 1.3518776893615723, + "learning_rate": 2.0173530736990304e-05, + "loss": 1.3545, + "step": 898 + }, + { + "epoch": 1.44, + "grad_norm": 1.395410180091858, + "learning_rate": 1.9961530486126327e-05, + "loss": 1.383, + "step": 900 + }, + { + "epoch": 1.4432, + "grad_norm": 2.61722731590271, + "learning_rate": 1.9750371892158103e-05, + "loss": 1.6465, + "step": 902 + }, + { + "epoch": 1.4464000000000001, + "grad_norm": 1.403191089630127, + "learning_rate": 1.9540060871601646e-05, + "loss": 1.4416, + "step": 904 + }, + { + "epoch": 1.4496, + "grad_norm": 1.3970694541931152, + "learning_rate": 1.933060331722457e-05, + "loss": 1.5657, + "step": 906 + }, + { + "epoch": 1.4527999999999999, + "grad_norm": 1.3750842809677124, + "learning_rate": 1.9122005097881014e-05, + "loss": 1.5444, + "step": 908 + }, + { + "epoch": 1.456, + "grad_norm": 1.5135127305984497, + "learning_rate": 1.8914272058347088e-05, + "loss": 1.5246, + "step": 910 + }, + { + "epoch": 1.4592, + "grad_norm": 1.4489692449569702, + "learning_rate": 1.8707410019157196e-05, + "loss": 1.4398, + "step": 912 + }, + { + "epoch": 1.4624, + "grad_norm": 1.508467435836792, + "learning_rate": 1.8501424776440907e-05, + "loss": 1.4185, + "step": 914 + }, + { + "epoch": 1.4656, + "grad_norm": 1.3195210695266724, + "learning_rate": 1.829632210176061e-05, + "loss": 1.4815, + "step": 916 + }, + { + "epoch": 1.4687999999999999, + "grad_norm": 1.3782057762145996, + "learning_rate": 1.809210774194971e-05, + "loss": 1.4895, + "step": 918 + }, + { + "epoch": 1.472, + "grad_norm": 1.4828996658325195, + "learning_rate": 1.7888787418951645e-05, + "loss": 1.4907, + "step": 920 + }, + { + "epoch": 1.4752, + "grad_norm": 1.409317970275879, + "learning_rate": 1.7686366829659628e-05, + "loss": 1.4705, + "step": 922 + }, + { + "epoch": 1.4784, + "grad_norm": 1.3606739044189453, + "learning_rate": 1.74848516457569e-05, + "loss": 1.4784, + "step": 924 + }, + { + "epoch": 1.4816, + "grad_norm": 1.4307705163955688, + "learning_rate": 1.72842475135579e-05, + "loss": 1.4639, + "step": 926 + }, + { + "epoch": 1.4848, + "grad_norm": 1.4825451374053955, + "learning_rate": 1.7084560053850024e-05, + "loss": 1.4771, + "step": 928 + }, + { + "epoch": 1.488, + "grad_norm": 1.5933488607406616, + "learning_rate": 1.6885794861736183e-05, + "loss": 1.3967, + "step": 930 + }, + { + "epoch": 1.4912, + "grad_norm": 1.4622035026550293, + "learning_rate": 1.668795750647796e-05, + "loss": 1.3764, + "step": 932 + }, + { + "epoch": 1.4944, + "grad_norm": 1.387617588043213, + "learning_rate": 1.6491053531339607e-05, + "loss": 1.4615, + "step": 934 + }, + { + "epoch": 1.4976, + "grad_norm": 1.3928184509277344, + "learning_rate": 1.62950884534327e-05, + "loss": 1.3788, + "step": 936 + }, + { + "epoch": 1.5008, + "grad_norm": 1.4277182817459106, + "learning_rate": 1.6100067763561626e-05, + "loss": 1.3684, + "step": 938 + }, + { + "epoch": 1.504, + "grad_norm": 1.519103765487671, + "learning_rate": 1.5905996926069628e-05, + "loss": 1.4659, + "step": 940 + }, + { + "epoch": 1.5072, + "grad_norm": 1.3705520629882812, + "learning_rate": 1.5712881378685755e-05, + "loss": 1.4145, + "step": 942 + }, + { + "epoch": 1.5104, + "grad_norm": 1.437618374824524, + "learning_rate": 1.5520726532372537e-05, + "loss": 1.3951, + "step": 944 + }, + { + "epoch": 1.5135999999999998, + "grad_norm": 1.406826376914978, + "learning_rate": 1.532953777117429e-05, + "loss": 1.509, + "step": 946 + }, + { + "epoch": 1.5168, + "grad_norm": 1.4625985622406006, + "learning_rate": 1.5139320452066313e-05, + "loss": 1.4415, + "step": 948 + }, + { + "epoch": 1.52, + "grad_norm": 1.500941514968872, + "learning_rate": 1.4950079904804759e-05, + "loss": 1.4751, + "step": 950 + }, + { + "epoch": 1.5232, + "grad_norm": 1.4515104293823242, + "learning_rate": 1.4761821431777373e-05, + "loss": 1.441, + "step": 952 + }, + { + "epoch": 1.5264, + "grad_norm": 1.3307709693908691, + "learning_rate": 1.4574550307854817e-05, + "loss": 1.4566, + "step": 954 + }, + { + "epoch": 1.5295999999999998, + "grad_norm": 1.393490195274353, + "learning_rate": 1.4388271780242929e-05, + "loss": 1.4483, + "step": 956 + }, + { + "epoch": 1.5328, + "grad_norm": 1.3786524534225464, + "learning_rate": 1.4202991068335697e-05, + "loss": 1.4522, + "step": 958 + }, + { + "epoch": 1.536, + "grad_norm": 1.4812992811203003, + "learning_rate": 1.4018713363569035e-05, + "loss": 1.5197, + "step": 960 + }, + { + "epoch": 1.5392000000000001, + "grad_norm": 1.5414302349090576, + "learning_rate": 1.3835443829275268e-05, + "loss": 1.4215, + "step": 962 + }, + { + "epoch": 1.5424, + "grad_norm": 1.420955777168274, + "learning_rate": 1.365318760053848e-05, + "loss": 1.4357, + "step": 964 + }, + { + "epoch": 1.5455999999999999, + "grad_norm": 1.322597861289978, + "learning_rate": 1.3471949784050702e-05, + "loss": 1.3364, + "step": 966 + }, + { + "epoch": 1.5488, + "grad_norm": 1.356268048286438, + "learning_rate": 1.3291735457968701e-05, + "loss": 1.4199, + "step": 968 + }, + { + "epoch": 1.552, + "grad_norm": 1.3691470623016357, + "learning_rate": 1.3112549671771796e-05, + "loss": 1.425, + "step": 970 + }, + { + "epoch": 1.5552000000000001, + "grad_norm": 1.4271376132965088, + "learning_rate": 1.2934397446120306e-05, + "loss": 1.4198, + "step": 972 + }, + { + "epoch": 1.5584, + "grad_norm": 1.37627375125885, + "learning_rate": 1.2757283772714957e-05, + "loss": 1.395, + "step": 974 + }, + { + "epoch": 1.5615999999999999, + "grad_norm": 1.3378015756607056, + "learning_rate": 1.2581213614156928e-05, + "loss": 1.3637, + "step": 976 + }, + { + "epoch": 1.5648, + "grad_norm": 1.5486069917678833, + "learning_rate": 1.2406191903808844e-05, + "loss": 1.4776, + "step": 978 + }, + { + "epoch": 1.568, + "grad_norm": 1.4717572927474976, + "learning_rate": 1.2232223545656552e-05, + "loss": 1.4642, + "step": 980 + }, + { + "epoch": 1.5712000000000002, + "grad_norm": 1.5726338624954224, + "learning_rate": 1.205931341417173e-05, + "loss": 1.4629, + "step": 982 + }, + { + "epoch": 1.5744, + "grad_norm": 1.359563946723938, + "learning_rate": 1.1887466354175253e-05, + "loss": 1.4208, + "step": 984 + }, + { + "epoch": 1.5776, + "grad_norm": 1.390755295753479, + "learning_rate": 1.1716687180701474e-05, + "loss": 1.4227, + "step": 986 + }, + { + "epoch": 1.5808, + "grad_norm": 1.516890048980713, + "learning_rate": 1.1546980678863361e-05, + "loss": 1.4247, + "step": 988 + }, + { + "epoch": 1.584, + "grad_norm": 1.5359797477722168, + "learning_rate": 1.1378351603718312e-05, + "loss": 1.3898, + "step": 990 + }, + { + "epoch": 1.5872000000000002, + "grad_norm": 1.3802939653396606, + "learning_rate": 1.1210804680135022e-05, + "loss": 1.3354, + "step": 992 + }, + { + "epoch": 1.5904, + "grad_norm": 1.4020977020263672, + "learning_rate": 1.1044344602661034e-05, + "loss": 1.4493, + "step": 994 + }, + { + "epoch": 1.5936, + "grad_norm": 1.373717188835144, + "learning_rate": 1.0878976035391252e-05, + "loss": 1.3899, + "step": 996 + }, + { + "epoch": 1.5968, + "grad_norm": 1.3707786798477173, + "learning_rate": 1.0714703611837201e-05, + "loss": 1.4054, + "step": 998 + }, + { + "epoch": 1.6, + "grad_norm": 1.388121485710144, + "learning_rate": 1.0551531934797243e-05, + "loss": 1.3393, + "step": 1000 + }, + { + "epoch": 1.6032, + "grad_norm": 1.5597281455993652, + "learning_rate": 1.0389465576227558e-05, + "loss": 1.4488, + "step": 1002 + }, + { + "epoch": 1.6064, + "grad_norm": 1.2360092401504517, + "learning_rate": 1.0228509077114146e-05, + "loss": 1.4802, + "step": 1004 + }, + { + "epoch": 1.6096, + "grad_norm": 1.3134233951568604, + "learning_rate": 1.0068666947345456e-05, + "loss": 1.3538, + "step": 1006 + }, + { + "epoch": 1.6128, + "grad_norm": 1.334416151046753, + "learning_rate": 9.909943665586102e-06, + "loss": 1.3917, + "step": 1008 + }, + { + "epoch": 1.616, + "grad_norm": 1.3590480089187622, + "learning_rate": 9.752343679151399e-06, + "loss": 1.4165, + "step": 1010 + }, + { + "epoch": 1.6192, + "grad_norm": 1.4533120393753052, + "learning_rate": 9.595871403882661e-06, + "loss": 1.5106, + "step": 1012 + }, + { + "epoch": 1.6223999999999998, + "grad_norm": 1.5530134439468384, + "learning_rate": 9.440531224023552e-06, + "loss": 1.4217, + "step": 1014 + }, + { + "epoch": 1.6256, + "grad_norm": 1.4636127948760986, + "learning_rate": 9.286327492097196e-06, + "loss": 1.3306, + "step": 1016 + }, + { + "epoch": 1.6288, + "grad_norm": 1.4135422706604004, + "learning_rate": 9.133264528784274e-06, + "loss": 1.425, + "step": 1018 + }, + { + "epoch": 1.6320000000000001, + "grad_norm": 1.4650404453277588, + "learning_rate": 8.981346622801905e-06, + "loss": 1.5058, + "step": 1020 + }, + { + "epoch": 1.6352, + "grad_norm": 1.4439706802368164, + "learning_rate": 8.830578030783493e-06, + "loss": 1.4755, + "step": 1022 + }, + { + "epoch": 1.6383999999999999, + "grad_norm": 1.7018922567367554, + "learning_rate": 8.680962977159502e-06, + "loss": 1.457, + "step": 1024 + }, + { + "epoch": 1.6416, + "grad_norm": 5.544206619262695, + "learning_rate": 8.53250565403903e-06, + "loss": 1.5578, + "step": 1026 + }, + { + "epoch": 1.6448, + "grad_norm": 1.4300087690353394, + "learning_rate": 8.385210221092382e-06, + "loss": 1.4479, + "step": 1028 + }, + { + "epoch": 1.6480000000000001, + "grad_norm": 1.5086474418640137, + "learning_rate": 8.239080805434513e-06, + "loss": 1.4618, + "step": 1030 + }, + { + "epoch": 1.6512, + "grad_norm": 1.4398646354675293, + "learning_rate": 8.094121501509399e-06, + "loss": 1.4416, + "step": 1032 + }, + { + "epoch": 1.6543999999999999, + "grad_norm": 1.3153340816497803, + "learning_rate": 7.950336370975304e-06, + "loss": 1.3792, + "step": 1034 + }, + { + "epoch": 1.6576, + "grad_norm": 1.351747751235962, + "learning_rate": 7.80772944259096e-06, + "loss": 1.4545, + "step": 1036 + }, + { + "epoch": 1.6608, + "grad_norm": 1.4315369129180908, + "learning_rate": 7.666304712102695e-06, + "loss": 1.3891, + "step": 1038 + }, + { + "epoch": 1.6640000000000001, + "grad_norm": 1.2921706438064575, + "learning_rate": 7.526066142132521e-06, + "loss": 1.4527, + "step": 1040 + }, + { + "epoch": 1.6672, + "grad_norm": 1.3649907112121582, + "learning_rate": 7.3870176620670194e-06, + "loss": 1.3692, + "step": 1042 + }, + { + "epoch": 1.6703999999999999, + "grad_norm": 1.3974645137786865, + "learning_rate": 7.249163167947287e-06, + "loss": 1.3702, + "step": 1044 + }, + { + "epoch": 1.6736, + "grad_norm": 1.3394696712493896, + "learning_rate": 7.1125065223598076e-06, + "loss": 1.3851, + "step": 1046 + }, + { + "epoch": 1.6768, + "grad_norm": 1.3612364530563354, + "learning_rate": 6.9770515543281455e-06, + "loss": 1.4355, + "step": 1048 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 1.442084789276123, + "learning_rate": 6.842802059205727e-06, + "loss": 1.4225, + "step": 1050 + }, + { + "epoch": 1.6832, + "grad_norm": 1.2821780443191528, + "learning_rate": 6.709761798569442e-06, + "loss": 1.4098, + "step": 1052 + }, + { + "epoch": 1.6864, + "grad_norm": 1.3669836521148682, + "learning_rate": 6.577934500114335e-06, + "loss": 1.4238, + "step": 1054 + }, + { + "epoch": 1.6896, + "grad_norm": 1.3460770845413208, + "learning_rate": 6.44732385754902e-06, + "loss": 1.4161, + "step": 1056 + }, + { + "epoch": 1.6928, + "grad_norm": 1.488351821899414, + "learning_rate": 6.3179335304923095e-06, + "loss": 1.4265, + "step": 1058 + }, + { + "epoch": 1.696, + "grad_norm": 1.3682546615600586, + "learning_rate": 6.189767144370645e-06, + "loss": 1.408, + "step": 1060 + }, + { + "epoch": 1.6992, + "grad_norm": 1.5109198093414307, + "learning_rate": 6.062828290316469e-06, + "loss": 1.4179, + "step": 1062 + }, + { + "epoch": 1.7024, + "grad_norm": 1.4661554098129272, + "learning_rate": 5.937120525067641e-06, + "loss": 1.3816, + "step": 1064 + }, + { + "epoch": 1.7056, + "grad_norm": 1.3422075510025024, + "learning_rate": 5.812647370867763e-06, + "loss": 1.4014, + "step": 1066 + }, + { + "epoch": 1.7088, + "grad_norm": 1.3991550207138062, + "learning_rate": 5.689412315367543e-06, + "loss": 1.4443, + "step": 1068 + }, + { + "epoch": 1.712, + "grad_norm": 1.3040037155151367, + "learning_rate": 5.567418811526981e-06, + "loss": 1.3615, + "step": 1070 + }, + { + "epoch": 1.7151999999999998, + "grad_norm": 1.3308135271072388, + "learning_rate": 5.4466702775186785e-06, + "loss": 1.3662, + "step": 1072 + }, + { + "epoch": 1.7184, + "grad_norm": 1.3251948356628418, + "learning_rate": 5.327170096632089e-06, + "loss": 1.4263, + "step": 1074 + }, + { + "epoch": 1.7216, + "grad_norm": 1.219191074371338, + "learning_rate": 5.208921617178641e-06, + "loss": 1.4676, + "step": 1076 + }, + { + "epoch": 1.7248, + "grad_norm": 1.364389181137085, + "learning_rate": 5.091928152397984e-06, + "loss": 1.4124, + "step": 1078 + }, + { + "epoch": 1.728, + "grad_norm": 1.4420322179794312, + "learning_rate": 4.976192980365124e-06, + "loss": 1.4189, + "step": 1080 + }, + { + "epoch": 1.7311999999999999, + "grad_norm": 1.3646230697631836, + "learning_rate": 4.861719343898613e-06, + "loss": 1.4043, + "step": 1082 + }, + { + "epoch": 1.7344, + "grad_norm": 1.4223638772964478, + "learning_rate": 4.748510450469623e-06, + "loss": 1.4538, + "step": 1084 + }, + { + "epoch": 1.7376, + "grad_norm": 1.3192670345306396, + "learning_rate": 4.63656947211214e-06, + "loss": 1.4187, + "step": 1086 + }, + { + "epoch": 1.7408000000000001, + "grad_norm": 1.3329640626907349, + "learning_rate": 4.525899545334023e-06, + "loss": 1.3402, + "step": 1088 + }, + { + "epoch": 1.744, + "grad_norm": 1.5323829650878906, + "learning_rate": 4.416503771029201e-06, + "loss": 1.6182, + "step": 1090 + }, + { + "epoch": 1.7471999999999999, + "grad_norm": 1.3993905782699585, + "learning_rate": 4.308385214390709e-06, + "loss": 1.4359, + "step": 1092 + }, + { + "epoch": 1.7504, + "grad_norm": 1.3519290685653687, + "learning_rate": 4.2015469048248375e-06, + "loss": 1.4073, + "step": 1094 + }, + { + "epoch": 1.7536, + "grad_norm": 1.3193116188049316, + "learning_rate": 4.095991835866275e-06, + "loss": 1.3293, + "step": 1096 + }, + { + "epoch": 1.7568000000000001, + "grad_norm": 1.3401769399642944, + "learning_rate": 3.99172296509418e-06, + "loss": 1.4281, + "step": 1098 + }, + { + "epoch": 1.76, + "grad_norm": 1.3036984205245972, + "learning_rate": 3.888743214049346e-06, + "loss": 1.4516, + "step": 1100 + }, + { + "epoch": 1.7631999999999999, + "grad_norm": 1.2876367568969727, + "learning_rate": 3.7870554681523287e-06, + "loss": 1.3857, + "step": 1102 + }, + { + "epoch": 1.7664, + "grad_norm": 1.286058783531189, + "learning_rate": 3.6866625766226293e-06, + "loss": 1.3421, + "step": 1104 + }, + { + "epoch": 1.7696, + "grad_norm": 1.2945473194122314, + "learning_rate": 3.587567352398796e-06, + "loss": 1.4383, + "step": 1106 + }, + { + "epoch": 1.7728000000000002, + "grad_norm": 1.3773088455200195, + "learning_rate": 3.489772572059674e-06, + "loss": 1.4287, + "step": 1108 + }, + { + "epoch": 1.776, + "grad_norm": 1.2338217496871948, + "learning_rate": 3.393280975746588e-06, + "loss": 1.3083, + "step": 1110 + }, + { + "epoch": 1.7792, + "grad_norm": 2.751504421234131, + "learning_rate": 3.2980952670865317e-06, + "loss": 1.4268, + "step": 1112 + }, + { + "epoch": 1.7824, + "grad_norm": 1.335714340209961, + "learning_rate": 3.2042181131164528e-06, + "loss": 1.4319, + "step": 1114 + }, + { + "epoch": 1.7856, + "grad_norm": 1.2379279136657715, + "learning_rate": 3.11165214420851e-06, + "loss": 1.3696, + "step": 1116 + }, + { + "epoch": 1.7888, + "grad_norm": 1.3956067562103271, + "learning_rate": 3.020399953996389e-06, + "loss": 1.4996, + "step": 1118 + }, + { + "epoch": 1.792, + "grad_norm": 1.3509949445724487, + "learning_rate": 2.9304640993025988e-06, + "loss": 1.4443, + "step": 1120 + }, + { + "epoch": 1.7952, + "grad_norm": 1.2706594467163086, + "learning_rate": 2.8418471000668523e-06, + "loss": 1.3773, + "step": 1122 + }, + { + "epoch": 1.7984, + "grad_norm": 1.4019887447357178, + "learning_rate": 2.7545514392754437e-06, + "loss": 1.4795, + "step": 1124 + }, + { + "epoch": 1.8016, + "grad_norm": 1.3648223876953125, + "learning_rate": 2.6685795628917266e-06, + "loss": 1.3566, + "step": 1126 + }, + { + "epoch": 1.8048, + "grad_norm": 1.5225048065185547, + "learning_rate": 2.5839338797875036e-06, + "loss": 1.4291, + "step": 1128 + }, + { + "epoch": 1.808, + "grad_norm": 1.4786689281463623, + "learning_rate": 2.500616761675578e-06, + "loss": 1.6036, + "step": 1130 + }, + { + "epoch": 1.8112, + "grad_norm": 1.439139723777771, + "learning_rate": 2.41863054304331e-06, + "loss": 1.445, + "step": 1132 + }, + { + "epoch": 1.8144, + "grad_norm": 1.322020411491394, + "learning_rate": 2.3379775210871648e-06, + "loss": 1.4561, + "step": 1134 + }, + { + "epoch": 1.8176, + "grad_norm": 1.3266148567199707, + "learning_rate": 2.2586599556483734e-06, + "loss": 1.4347, + "step": 1136 + }, + { + "epoch": 1.8208, + "grad_norm": 1.3589297533035278, + "learning_rate": 2.180680069149621e-06, + "loss": 1.3969, + "step": 1138 + }, + { + "epoch": 1.8239999999999998, + "grad_norm": 1.3703463077545166, + "learning_rate": 2.104040046532768e-06, + "loss": 1.4253, + "step": 1140 + }, + { + "epoch": 1.8272, + "grad_norm": 1.2915222644805908, + "learning_rate": 2.0287420351976063e-06, + "loss": 1.4272, + "step": 1142 + }, + { + "epoch": 1.8304, + "grad_norm": 1.2881819009780884, + "learning_rate": 1.954788144941727e-06, + "loss": 1.3726, + "step": 1144 + }, + { + "epoch": 1.8336000000000001, + "grad_norm": 1.2942179441452026, + "learning_rate": 1.8821804479013772e-06, + "loss": 1.3269, + "step": 1146 + }, + { + "epoch": 1.8368, + "grad_norm": 1.4517358541488647, + "learning_rate": 1.81092097849343e-06, + "loss": 1.4521, + "step": 1148 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 1.3010282516479492, + "learning_rate": 1.7410117333583498e-06, + "loss": 1.3935, + "step": 1150 + }, + { + "epoch": 1.8432, + "grad_norm": 1.5645641088485718, + "learning_rate": 1.6724546713042577e-06, + "loss": 1.3637, + "step": 1152 + }, + { + "epoch": 1.8464, + "grad_norm": 1.4219244718551636, + "learning_rate": 1.6052517132520651e-06, + "loss": 1.4567, + "step": 1154 + }, + { + "epoch": 1.8496000000000001, + "grad_norm": 1.4337780475616455, + "learning_rate": 1.5394047421816327e-06, + "loss": 1.429, + "step": 1156 + }, + { + "epoch": 1.8528, + "grad_norm": 1.561288595199585, + "learning_rate": 1.4749156030790024e-06, + "loss": 1.4846, + "step": 1158 + }, + { + "epoch": 1.8559999999999999, + "grad_norm": 1.3074960708618164, + "learning_rate": 1.4117861028847267e-06, + "loss": 1.3928, + "step": 1160 + }, + { + "epoch": 1.8592, + "grad_norm": 1.285032033920288, + "learning_rate": 1.3500180104432325e-06, + "loss": 1.365, + "step": 1162 + }, + { + "epoch": 1.8624, + "grad_norm": 1.2760525941848755, + "learning_rate": 1.2896130564532427e-06, + "loss": 1.4376, + "step": 1164 + }, + { + "epoch": 1.8656000000000001, + "grad_norm": 1.3905929327011108, + "learning_rate": 1.2305729334192994e-06, + "loss": 1.4799, + "step": 1166 + }, + { + "epoch": 1.8688, + "grad_norm": 1.388978362083435, + "learning_rate": 1.1728992956043238e-06, + "loss": 1.4397, + "step": 1168 + }, + { + "epoch": 1.8719999999999999, + "grad_norm": 3.00641131401062, + "learning_rate": 1.1165937589833087e-06, + "loss": 1.5248, + "step": 1170 + }, + { + "epoch": 1.8752, + "grad_norm": 1.3412961959838867, + "learning_rate": 1.061657901197971e-06, + "loss": 1.4096, + "step": 1172 + }, + { + "epoch": 1.8784, + "grad_norm": 1.2458962202072144, + "learning_rate": 1.008093261512616e-06, + "loss": 1.4463, + "step": 1174 + }, + { + "epoch": 1.8816000000000002, + "grad_norm": 1.331132411956787, + "learning_rate": 9.559013407709595e-07, + "loss": 1.3897, + "step": 1176 + }, + { + "epoch": 1.8848, + "grad_norm": 1.3351112604141235, + "learning_rate": 9.050836013541009e-07, + "loss": 1.4563, + "step": 1178 + }, + { + "epoch": 1.888, + "grad_norm": 1.434446930885315, + "learning_rate": 8.55641467139534e-07, + "loss": 1.4178, + "step": 1180 + }, + { + "epoch": 1.8912, + "grad_norm": 1.281200885772705, + "learning_rate": 8.075763234612622e-07, + "loss": 1.4287, + "step": 1182 + }, + { + "epoch": 1.8944, + "grad_norm": 1.7233554124832153, + "learning_rate": 7.60889517070984e-07, + "loss": 1.4914, + "step": 1184 + }, + { + "epoch": 1.8976, + "grad_norm": 1.3849607706069946, + "learning_rate": 7.155823561003361e-07, + "loss": 1.3976, + "step": 1186 + }, + { + "epoch": 1.9008, + "grad_norm": 1.3818809986114502, + "learning_rate": 6.716561100242658e-07, + "loss": 1.4095, + "step": 1188 + }, + { + "epoch": 1.904, + "grad_norm": 1.2812706232070923, + "learning_rate": 6.291120096254433e-07, + "loss": 1.3447, + "step": 1190 + }, + { + "epoch": 1.9072, + "grad_norm": 1.4153764247894287, + "learning_rate": 5.879512469598058e-07, + "loss": 1.3582, + "step": 1192 + }, + { + "epoch": 1.9104, + "grad_norm": 1.609390377998352, + "learning_rate": 5.481749753231124e-07, + "loss": 1.3982, + "step": 1194 + }, + { + "epoch": 1.9136, + "grad_norm": 1.3431967496871948, + "learning_rate": 5.097843092186583e-07, + "loss": 1.4785, + "step": 1196 + }, + { + "epoch": 1.9167999999999998, + "grad_norm": 1.350616216659546, + "learning_rate": 4.7278032432604425e-07, + "loss": 1.3739, + "step": 1198 + }, + { + "epoch": 1.92, + "grad_norm": 1.213797688484192, + "learning_rate": 4.371640574710345e-07, + "loss": 1.4042, + "step": 1200 + }, + { + "epoch": 1.9232, + "grad_norm": 1.3762907981872559, + "learning_rate": 4.0293650659650184e-07, + "loss": 1.3671, + "step": 1202 + }, + { + "epoch": 1.9264000000000001, + "grad_norm": 1.3090656995773315, + "learning_rate": 3.7009863073446673e-07, + "loss": 1.426, + "step": 1204 + }, + { + "epoch": 1.9296, + "grad_norm": 1.3925652503967285, + "learning_rate": 3.386513499792354e-07, + "loss": 1.4414, + "step": 1206 + }, + { + "epoch": 1.9327999999999999, + "grad_norm": 1.2840951681137085, + "learning_rate": 3.0859554546160965e-07, + "loss": 1.297, + "step": 1208 + }, + { + "epoch": 1.936, + "grad_norm": 1.2935904264450073, + "learning_rate": 2.7993205932420053e-07, + "loss": 1.3802, + "step": 1210 + }, + { + "epoch": 1.9392, + "grad_norm": 1.2855424880981445, + "learning_rate": 2.5266169469783105e-07, + "loss": 1.5116, + "step": 1212 + }, + { + "epoch": 1.9424000000000001, + "grad_norm": 1.345445990562439, + "learning_rate": 2.2678521567903176e-07, + "loss": 1.4102, + "step": 1214 + }, + { + "epoch": 1.9456, + "grad_norm": 1.389737606048584, + "learning_rate": 2.023033473086411e-07, + "loss": 1.4212, + "step": 1216 + }, + { + "epoch": 1.9487999999999999, + "grad_norm": 1.4330286979675293, + "learning_rate": 1.7921677555147177e-07, + "loss": 1.4581, + "step": 1218 + }, + { + "epoch": 1.952, + "grad_norm": 1.428533673286438, + "learning_rate": 1.5752614727712057e-07, + "loss": 1.3911, + "step": 1220 + }, + { + "epoch": 1.9552, + "grad_norm": 1.358577847480774, + "learning_rate": 1.3723207024180507e-07, + "loss": 1.4441, + "step": 1222 + }, + { + "epoch": 1.9584000000000001, + "grad_norm": 1.422744631767273, + "learning_rate": 1.1833511307136613e-07, + "loss": 1.4378, + "step": 1224 + }, + { + "epoch": 1.9616, + "grad_norm": 1.3814959526062012, + "learning_rate": 1.0083580524531955e-07, + "loss": 1.4551, + "step": 1226 + }, + { + "epoch": 1.9647999999999999, + "grad_norm": 1.2662824392318726, + "learning_rate": 8.473463708202345e-08, + "loss": 1.4284, + "step": 1228 + }, + { + "epoch": 1.968, + "grad_norm": 1.2959048748016357, + "learning_rate": 7.003205972494486e-08, + "loss": 1.3193, + "step": 1230 + }, + { + "epoch": 1.9712, + "grad_norm": 1.3156046867370605, + "learning_rate": 5.672848513000873e-08, + "loss": 1.4021, + "step": 1232 + }, + { + "epoch": 1.9744000000000002, + "grad_norm": 1.3085626363754272, + "learning_rate": 4.482428605407374e-08, + "loss": 1.3919, + "step": 1234 + }, + { + "epoch": 1.9776, + "grad_norm": 1.3836755752563477, + "learning_rate": 3.431979604445745e-08, + "loss": 1.4616, + "step": 1236 + }, + { + "epoch": 1.9808, + "grad_norm": 1.3475788831710815, + "learning_rate": 2.521530942962702e-08, + "loss": 1.4426, + "step": 1238 + }, + { + "epoch": 1.984, + "grad_norm": 1.4258958101272583, + "learning_rate": 1.7511081310922495e-08, + "loss": 1.3884, + "step": 1240 + }, + { + "epoch": 1.9872, + "grad_norm": 1.2867543697357178, + "learning_rate": 1.1207327555429192e-08, + "loss": 1.3472, + "step": 1242 + }, + { + "epoch": 1.9904, + "grad_norm": 1.4145276546478271, + "learning_rate": 6.304224789910329e-09, + "loss": 1.4185, + "step": 1244 + }, + { + "epoch": 1.9936, + "grad_norm": 1.2391281127929688, + "learning_rate": 2.801910395877627e-09, + "loss": 1.3197, + "step": 1246 + }, + { + "epoch": 1.9968, + "grad_norm": 1.4041131734848022, + "learning_rate": 7.004825057277398e-10, + "loss": 1.4755, + "step": 1248 + }, + { + "epoch": 2.0, + "grad_norm": 2.3858015537261963, + "learning_rate": 0.0, + "loss": 1.3312, + "step": 1250 + } + ], + "logging_steps": 2, + "max_steps": 1250, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 625, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.1133643907229286e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}