|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 625,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.0032, "grad_norm": 60.66320037841797, "learning_rate": 3.1746031746031746e-06, "loss": 6.1881, "step": 2 },
    { "epoch": 0.0064, "grad_norm": 42.748294830322266, "learning_rate": 6.349206349206349e-06, "loss": 5.7668, "step": 4 },
    { "epoch": 0.0096, "grad_norm": 13.887235641479492, "learning_rate": 9.523809523809523e-06, "loss": 5.0841, "step": 6 },
    { "epoch": 0.0128, "grad_norm": 12.488709449768066, "learning_rate": 1.2698412698412699e-05, "loss": 4.7192, "step": 8 },
    { "epoch": 0.016, "grad_norm": 7.208029270172119, "learning_rate": 1.5873015873015872e-05, "loss": 4.5679, "step": 10 },
    { "epoch": 0.0192, "grad_norm": 7.34755277633667, "learning_rate": 1.9047619047619046e-05, "loss": 4.368, "step": 12 },
    { "epoch": 0.0224, "grad_norm": 7.489256858825684, "learning_rate": 2.2222222222222223e-05, "loss": 4.1398, "step": 14 },
    { "epoch": 0.0256, "grad_norm": 4.802424430847168, "learning_rate": 2.5396825396825397e-05, "loss": 4.0532, "step": 16 },
    { "epoch": 0.0288, "grad_norm": 4.138615131378174, "learning_rate": 2.857142857142857e-05, "loss": 3.7559, "step": 18 },
    { "epoch": 0.032, "grad_norm": 5.362161636352539, "learning_rate": 3.1746031746031745e-05, "loss": 3.4854, "step": 20 },
    { "epoch": 0.0352, "grad_norm": 3.878138303756714, "learning_rate": 3.492063492063492e-05, "loss": 3.5869, "step": 22 },
    { "epoch": 0.0384, "grad_norm": 3.480282783508301, "learning_rate": 3.809523809523809e-05, "loss": 3.2734, "step": 24 },
    { "epoch": 0.0416, "grad_norm": 3.527137279510498, "learning_rate": 4.126984126984127e-05, "loss": 3.1484, "step": 26 },
    { "epoch": 0.0448, "grad_norm": 3.094705820083618, "learning_rate": 4.4444444444444447e-05, "loss": 3.0356, "step": 28 },
    { "epoch": 0.048, "grad_norm": 4.064608573913574, "learning_rate": 4.761904761904762e-05, "loss": 3.0157, "step": 30 },
    { "epoch": 0.0512, "grad_norm": 3.166187047958374, "learning_rate": 5.0793650793650794e-05, "loss": 2.9229, "step": 32 },
    { "epoch": 0.0544, "grad_norm": 2.909856081008911, "learning_rate": 5.396825396825397e-05, "loss": 2.8386, "step": 34 },
    { "epoch": 0.0576, "grad_norm": 3.2195472717285156, "learning_rate": 5.714285714285714e-05, "loss": 2.7819, "step": 36 },
    { "epoch": 0.0608, "grad_norm": 2.603515625, "learning_rate": 6.0317460317460316e-05, "loss": 2.7116, "step": 38 },
    { "epoch": 0.064, "grad_norm": 3.353161096572876, "learning_rate": 6.349206349206349e-05, "loss": 2.7137, "step": 40 },
    { "epoch": 0.0672, "grad_norm": 2.7278943061828613, "learning_rate": 6.666666666666667e-05, "loss": 2.6213, "step": 42 },
    { "epoch": 0.0704, "grad_norm": 3.275580883026123, "learning_rate": 6.984126984126984e-05, "loss": 2.5585, "step": 44 },
    { "epoch": 0.0736, "grad_norm": 2.934300422668457, "learning_rate": 7.301587301587302e-05, "loss": 2.6126, "step": 46 },
    { "epoch": 0.0768, "grad_norm": 2.483461380004883, "learning_rate": 7.619047619047618e-05, "loss": 2.4616, "step": 48 },
    { "epoch": 0.08, "grad_norm": 2.5167787075042725, "learning_rate": 7.936507936507937e-05, "loss": 2.4316, "step": 50 },
    { "epoch": 0.0832, "grad_norm": 2.211185932159424, "learning_rate": 8.253968253968255e-05, "loss": 2.63, "step": 52 },
    { "epoch": 0.0864, "grad_norm": 3.2666075229644775, "learning_rate": 8.571428571428571e-05, "loss": 2.4892, "step": 54 },
    { "epoch": 0.0896, "grad_norm": 2.849605083465576, "learning_rate": 8.888888888888889e-05, "loss": 2.4584, "step": 56 },
    { "epoch": 0.0928, "grad_norm": 3.0103588104248047, "learning_rate": 9.206349206349206e-05, "loss": 2.2766, "step": 58 },
    { "epoch": 0.096, "grad_norm": 2.306534767150879, "learning_rate": 9.523809523809524e-05, "loss": 2.4357, "step": 60 },
    { "epoch": 0.0992, "grad_norm": 2.3400485515594482, "learning_rate": 9.841269841269841e-05, "loss": 2.4118, "step": 62 },
    { "epoch": 0.1024, "grad_norm": 2.583407163619995, "learning_rate": 9.99998248790669e-05, "loss": 2.2326, "step": 64 },
    { "epoch": 0.1056, "grad_norm": 2.57265567779541, "learning_rate": 9.999842391896222e-05, "loss": 2.2923, "step": 66 },
    { "epoch": 0.1088, "grad_norm": 2.307471990585327, "learning_rate": 9.999562203800676e-05, "loss": 2.2235, "step": 68 },
    { "epoch": 0.112, "grad_norm": 2.357084035873413, "learning_rate": 9.999141931470729e-05, "loss": 2.2896, "step": 70 },
    { "epoch": 0.1152, "grad_norm": 2.6181581020355225, "learning_rate": 9.998581586682116e-05, "loss": 2.3015, "step": 72 },
    { "epoch": 0.1184, "grad_norm": 2.253117799758911, "learning_rate": 9.997881185135307e-05, "loss": 2.1824, "step": 74 },
    { "epoch": 0.1216, "grad_norm": 2.546729326248169, "learning_rate": 9.997040746455062e-05, "loss": 2.1502, "step": 76 },
    { "epoch": 0.1248, "grad_norm": 2.4144699573516846, "learning_rate": 9.996060294189887e-05, "loss": 2.3715, "step": 78 },
    { "epoch": 0.128, "grad_norm": 2.3093016147613525, "learning_rate": 9.994939855811362e-05, "loss": 2.2753, "step": 80 },
    { "epoch": 0.1312, "grad_norm": 2.5628409385681152, "learning_rate": 9.993679462713395e-05, "loss": 2.3152, "step": 82 },
    { "epoch": 0.1344, "grad_norm": 2.549136161804199, "learning_rate": 9.992279150211314e-05, "loss": 2.1171, "step": 84 },
    { "epoch": 0.1376, "grad_norm": 2.4570610523223877, "learning_rate": 9.990738957540896e-05, "loss": 2.2414, "step": 86 },
    { "epoch": 0.1408, "grad_norm": 2.256564140319824, "learning_rate": 9.989058927857263e-05, "loss": 2.1324, "step": 88 },
    { "epoch": 0.144, "grad_norm": 2.818751811981201, "learning_rate": 9.987239108233668e-05, "loss": 2.184, "step": 90 },
    { "epoch": 0.1472, "grad_norm": 2.432871103286743, "learning_rate": 9.985279549660185e-05, "loss": 2.1899, "step": 92 },
    { "epoch": 0.1504, "grad_norm": 2.1021323204040527, "learning_rate": 9.983180307042274e-05, "loss": 2.1064, "step": 94 },
    { "epoch": 0.1536, "grad_norm": 2.7487058639526367, "learning_rate": 9.980941439199246e-05, "loss": 2.2197, "step": 96 },
    { "epoch": 0.1568, "grad_norm": 2.82835054397583, "learning_rate": 9.97856300886261e-05, "loss": 2.2048, "step": 98 },
    { "epoch": 0.16, "grad_norm": 2.25872802734375, "learning_rate": 9.976045082674319e-05, "loss": 2.1002, "step": 100 },
    { "epoch": 0.1632, "grad_norm": 2.040614366531372, "learning_rate": 9.973387731184902e-05, "loss": 2.1031, "step": 102 },
    { "epoch": 0.1664, "grad_norm": 2.437248706817627, "learning_rate": 9.97059102885149e-05, "loss": 2.1416, "step": 104 },
    { "epoch": 0.1696, "grad_norm": 2.0928878784179688, "learning_rate": 9.967655054035727e-05, "loss": 2.1576, "step": 106 },
    { "epoch": 0.1728, "grad_norm": 2.2243545055389404, "learning_rate": 9.964579889001569e-05, "loss": 1.9863, "step": 108 },
    { "epoch": 0.176, "grad_norm": 2.1860439777374268, "learning_rate": 9.961365619912989e-05, "loss": 2.0016, "step": 110 },
    { "epoch": 0.1792, "grad_norm": 2.527122735977173, "learning_rate": 9.95801233683156e-05, "loss": 2.1272, "step": 112 },
    { "epoch": 0.1824, "grad_norm": 1.8613876104354858, "learning_rate": 9.954520133713924e-05, "loss": 2.2001, "step": 114 },
    { "epoch": 0.1856, "grad_norm": 2.115910530090332, "learning_rate": 9.950889108409172e-05, "loss": 2.0871, "step": 116 },
    { "epoch": 0.1888, "grad_norm": 2.361309051513672, "learning_rate": 9.947119362656092e-05, "loss": 2.017, "step": 118 },
    { "epoch": 0.192, "grad_norm": 2.09470272064209, "learning_rate": 9.94321100208032e-05, "loss": 2.1847, "step": 120 },
    { "epoch": 0.1952, "grad_norm": 1.9747451543807983, "learning_rate": 9.939164136191384e-05, "loss": 2.324, "step": 122 },
    { "epoch": 0.1984, "grad_norm": 1.8229223489761353, "learning_rate": 9.934978878379636e-05, "loss": 2.1454, "step": 124 },
    { "epoch": 0.2016, "grad_norm": 1.9113378524780273, "learning_rate": 9.930655345913071e-05, "loss": 2.0096, "step": 126 },
    { "epoch": 0.2048, "grad_norm": 2.385289192199707, "learning_rate": 9.926193659934043e-05, "loss": 2.1029, "step": 128 },
    { "epoch": 0.208, "grad_norm": 2.101463794708252, "learning_rate": 9.921593945455869e-05, "loss": 2.0172, "step": 130 },
    { "epoch": 0.2112, "grad_norm": 2.2676024436950684, "learning_rate": 9.916856331359335e-05, "loss": 1.9966, "step": 132 },
    { "epoch": 0.2144, "grad_norm": 2.0168168544769287, "learning_rate": 9.911980950389067e-05, "loss": 2.1807, "step": 134 },
    { "epoch": 0.2176, "grad_norm": 2.1054186820983887, "learning_rate": 9.906967939149831e-05, "loss": 1.9759, "step": 136 },
    { "epoch": 0.2208, "grad_norm": 2.3354573249816895, "learning_rate": 9.901817438102695e-05, "loss": 1.995, "step": 138 },
    { "epoch": 0.224, "grad_norm": 2.2721822261810303, "learning_rate": 9.896529591561093e-05, "loss": 2.2239, "step": 140 },
    { "epoch": 0.2272, "grad_norm": 1.9209738969802856, "learning_rate": 9.891104547686782e-05, "loss": 2.0051, "step": 142 },
    { "epoch": 0.2304, "grad_norm": 1.978259801864624, "learning_rate": 9.8855424584857e-05, "loss": 2.0367, "step": 144 },
    { "epoch": 0.2336, "grad_norm": 1.9169765710830688, "learning_rate": 9.879843479803691e-05, "loss": 2.1009, "step": 146 },
    { "epoch": 0.2368, "grad_norm": 1.8380109071731567, "learning_rate": 9.874007771322151e-05, "loss": 2.1456, "step": 148 },
    { "epoch": 0.24, "grad_norm": 2.1143693923950195, "learning_rate": 9.868035496553546e-05, "loss": 1.925, "step": 150 },
    { "epoch": 0.2432, "grad_norm": 1.8774141073226929, "learning_rate": 9.86192682283684e-05, "loss": 1.9616, "step": 152 },
    { "epoch": 0.2464, "grad_norm": 2.3532581329345703, "learning_rate": 9.855681921332793e-05, "loss": 2.0289, "step": 154 },
    { "epoch": 0.2496, "grad_norm": 2.1421797275543213, "learning_rate": 9.849300967019175e-05, "loss": 2.0153, "step": 156 },
    { "epoch": 0.2528, "grad_norm": 2.0029852390289307, "learning_rate": 9.84278413868586e-05, "loss": 2.0726, "step": 158 },
    { "epoch": 0.256, "grad_norm": 2.0344998836517334, "learning_rate": 9.836131618929819e-05, "loss": 2.0215, "step": 160 },
    { "epoch": 0.2592, "grad_norm": 1.8781356811523438, "learning_rate": 9.82934359415e-05, "loss": 2.0622, "step": 162 },
    { "epoch": 0.2624, "grad_norm": 1.9795514345169067, "learning_rate": 9.822420254542108e-05, "loss": 2.0249, "step": 164 },
    { "epoch": 0.2656, "grad_norm": 2.012881278991699, "learning_rate": 9.815361794093272e-05, "loss": 1.9815, "step": 166 },
    { "epoch": 0.2688, "grad_norm": 2.264941453933716, "learning_rate": 9.808168410576617e-05, "loss": 2.0232, "step": 168 },
    { "epoch": 0.272, "grad_norm": 2.4006729125976562, "learning_rate": 9.800840305545715e-05, "loss": 2.0844, "step": 170 },
    { "epoch": 0.2752, "grad_norm": 2.0443308353424072, "learning_rate": 9.793377684328939e-05, "loss": 2.2302, "step": 172 },
    { "epoch": 0.2784, "grad_norm": 2.164515972137451, "learning_rate": 9.785780756023714e-05, "loss": 1.9808, "step": 174 },
    { "epoch": 0.2816, "grad_norm": 1.9512875080108643, "learning_rate": 9.778049733490655e-05, "loss": 2.0968, "step": 176 },
    { "epoch": 0.2848, "grad_norm": 1.9964834451675415, "learning_rate": 9.770184833347606e-05, "loss": 1.9889, "step": 178 },
    { "epoch": 0.288, "grad_norm": 1.9380826950073242, "learning_rate": 9.762186275963563e-05, "loss": 1.9766, "step": 180 },
    { "epoch": 0.2912, "grad_norm": 1.943260669708252, "learning_rate": 9.754054285452506e-05, "loss": 1.9298, "step": 182 },
    { "epoch": 0.2944, "grad_norm": 2.1821844577789307, "learning_rate": 9.745789089667121e-05, "loss": 2.1202, "step": 184 },
    { "epoch": 0.2976, "grad_norm": 1.7526299953460693, "learning_rate": 9.737390920192408e-05, "loss": 2.0635, "step": 186 },
    { "epoch": 0.3008, "grad_norm": 2.229520082473755, "learning_rate": 9.7288600123392e-05, "loss": 1.9582, "step": 188 },
    { "epoch": 0.304, "grad_norm": 2.3614768981933594, "learning_rate": 9.720196605137565e-05, "loss": 2.0278, "step": 190 },
    { "epoch": 0.3072, "grad_norm": 2.1270534992218018, "learning_rate": 9.71140094133011e-05, "loss": 2.1036, "step": 192 },
    { "epoch": 0.3104, "grad_norm": 2.2983131408691406, "learning_rate": 9.702473267365182e-05, "loss": 2.0558, "step": 194 },
    { "epoch": 0.3136, "grad_norm": 1.9561504125595093, "learning_rate": 9.693413833389956e-05, "loss": 1.9173, "step": 196 },
    { "epoch": 0.3168, "grad_norm": 2.234160900115967, "learning_rate": 9.684222893243431e-05, "loss": 2.1188, "step": 198 },
    { "epoch": 0.32, "grad_norm": 1.883965015411377, "learning_rate": 9.674900704449324e-05, "loss": 1.9584, "step": 200 },
    { "epoch": 0.3232, "grad_norm": 1.7237235307693481, "learning_rate": 9.665447528208836e-05, "loss": 1.9351, "step": 202 },
    { "epoch": 0.3264, "grad_norm": 2.0437498092651367, "learning_rate": 9.655863629393351e-05, "loss": 1.9079, "step": 204 },
    { "epoch": 0.3296, "grad_norm": 2.014540195465088, "learning_rate": 9.64614927653701e-05, "loss": 1.8612, "step": 206 },
    { "epoch": 0.3328, "grad_norm": 2.379439115524292, "learning_rate": 9.636304741829181e-05, "loss": 1.9976, "step": 208 },
    { "epoch": 0.336, "grad_norm": 1.962538242340088, "learning_rate": 9.626330301106837e-05, "loss": 1.932, "step": 210 },
    { "epoch": 0.3392, "grad_norm": 1.862244725227356, "learning_rate": 9.616226233846828e-05, "loss": 1.8992, "step": 212 },
    { "epoch": 0.3424, "grad_norm": 1.7304776906967163, "learning_rate": 9.605992823158046e-05, "loss": 2.0777, "step": 214 },
    { "epoch": 0.3456, "grad_norm": 2.2403054237365723, "learning_rate": 9.595630355773501e-05, "loss": 1.8658, "step": 216 },
    { "epoch": 0.3488, "grad_norm": 3.3899903297424316, "learning_rate": 9.585139122042274e-05, "loss": 1.9963, "step": 218 },
    { "epoch": 0.352, "grad_norm": 2.261810064315796, "learning_rate": 9.574519415921396e-05, "loss": 1.947, "step": 220 },
    { "epoch": 0.3552, "grad_norm": 2.2053134441375732, "learning_rate": 9.5637715349676e-05, "loss": 2.0544, "step": 222 },
    { "epoch": 0.3584, "grad_norm": 1.871773362159729, "learning_rate": 9.552895780328987e-05, "loss": 1.8976, "step": 224 },
    { "epoch": 0.3616, "grad_norm": 1.6700202226638794, "learning_rate": 9.541892456736595e-05, "loss": 2.1166, "step": 226 },
    { "epoch": 0.3648, "grad_norm": 1.9986639022827148, "learning_rate": 9.530761872495849e-05, "loss": 1.9311, "step": 228 },
    { "epoch": 0.368, "grad_norm": 2.288973331451416, "learning_rate": 9.519504339477932e-05, "loss": 1.98, "step": 230 },
    { "epoch": 0.3712, "grad_norm": 2.177896738052368, "learning_rate": 9.508120173111039e-05, "loss": 1.862, "step": 232 },
    { "epoch": 0.3744, "grad_norm": 1.9860484600067139, "learning_rate": 9.496609692371548e-05, "loss": 1.9192, "step": 234 },
    { "epoch": 0.3776, "grad_norm": 1.924127221107483, "learning_rate": 9.484973219775074e-05, "loss": 1.871, "step": 236 },
    { "epoch": 0.3808, "grad_norm": 1.9022867679595947, "learning_rate": 9.473211081367436e-05, "loss": 1.9067, "step": 238 },
    { "epoch": 0.384, "grad_norm": 1.7447446584701538, "learning_rate": 9.46132360671552e-05, "loss": 1.8984, "step": 240 },
    { "epoch": 0.3872, "grad_norm": 2.809067487716675, "learning_rate": 9.449311128898049e-05, "loss": 1.8327, "step": 242 },
    { "epoch": 0.3904, "grad_norm": 1.9946494102478027, "learning_rate": 9.437173984496246e-05, "loss": 1.9735, "step": 244 },
    { "epoch": 0.3936, "grad_norm": 1.8834348917007446, "learning_rate": 9.424912513584401e-05, "loss": 2.0294, "step": 246 },
    { "epoch": 0.3968, "grad_norm": 1.9426389932632446, "learning_rate": 9.412527059720352e-05, "loss": 1.9919, "step": 248 },
    { "epoch": 0.4, "grad_norm": 1.823935627937317, "learning_rate": 9.400017969935848e-05, "loss": 1.8907, "step": 250 },
    { "epoch": 0.4032, "grad_norm": 2.1048786640167236, "learning_rate": 9.387385594726829e-05, "loss": 1.8855, "step": 252 },
    { "epoch": 0.4064, "grad_norm": 1.9253580570220947, "learning_rate": 9.374630288043614e-05, "loss": 2.0577, "step": 254 },
    { "epoch": 0.4096, "grad_norm": 1.785396695137024, "learning_rate": 9.361752407280965e-05, "loss": 1.9675, "step": 256 },
    { "epoch": 0.4128, "grad_norm": 1.9203846454620361, "learning_rate": 9.348752313268093e-05, "loss": 1.8934, "step": 258 },
    { "epoch": 0.416, "grad_norm": 1.986392855644226, "learning_rate": 9.335630370258533e-05, "loss": 1.9838, "step": 260 },
    { "epoch": 0.4192, "grad_norm": 1.953905463218689, "learning_rate": 9.322386945919946e-05, "loss": 1.7604, "step": 262 },
    { "epoch": 0.4224, "grad_norm": 1.7314627170562744, "learning_rate": 9.309022411323816e-05, "loss": 2.0328, "step": 264 },
    { "epoch": 0.4256, "grad_norm": 1.6745048761367798, "learning_rate": 9.295537140935049e-05, "loss": 1.9734, "step": 266 },
    { "epoch": 0.4288, "grad_norm": 1.8622961044311523, "learning_rate": 9.281931512601485e-05, "loss": 1.9509, "step": 268 },
    { "epoch": 0.432, "grad_norm": 2.014514684677124, "learning_rate": 9.26820590754331e-05, "loss": 1.8272, "step": 270 },
    { "epoch": 0.4352, "grad_norm": 2.118647336959839, "learning_rate": 9.254360710342371e-05, "loss": 1.8347, "step": 272 },
    { "epoch": 0.4384, "grad_norm": 2.04239821434021, "learning_rate": 9.240396308931407e-05, "loss": 1.8675, "step": 274 },
    { "epoch": 0.4416, "grad_norm": 1.951341152191162, "learning_rate": 9.226313094583173e-05, "loss": 1.9559, "step": 276 },
    { "epoch": 0.4448, "grad_norm": 1.7053275108337402, "learning_rate": 9.212111461899479e-05, "loss": 2.0715, "step": 278 },
    { "epoch": 0.448, "grad_norm": 1.7789607048034668, "learning_rate": 9.197791808800135e-05, "loss": 1.89, "step": 280 },
    { "epoch": 0.4512, "grad_norm": 1.8625364303588867, "learning_rate": 9.183354536511803e-05, "loss": 1.9809, "step": 282 },
    { "epoch": 0.4544, "grad_norm": 1.6965309381484985, "learning_rate": 9.168800049556747e-05, "loss": 1.8365, "step": 284 },
    { "epoch": 0.4576, "grad_norm": 2.1207497119903564, "learning_rate": 9.154128755741509e-05, "loss": 1.8314, "step": 286 },
    { "epoch": 0.4608, "grad_norm": 1.8182010650634766, "learning_rate": 9.139341066145472e-05, "loss": 1.8906, "step": 288 },
    { "epoch": 0.464, "grad_norm": 1.977777361869812, "learning_rate": 9.124437395109353e-05, "loss": 1.8562, "step": 290 },
    { "epoch": 0.4672, "grad_norm": 1.9953404664993286, "learning_rate": 9.109418160223585e-05, "loss": 1.8364, "step": 292 },
    { "epoch": 0.4704, "grad_norm": 1.9941433668136597, "learning_rate": 9.094283782316619e-05, "loss": 1.7585, "step": 294 },
    { "epoch": 0.4736, "grad_norm": 1.9799609184265137, "learning_rate": 9.079034685443133e-05, "loss": 1.8669, "step": 296 },
    { "epoch": 0.4768, "grad_norm": 1.755238652229309, "learning_rate": 9.063671296872149e-05, "loss": 1.8001, "step": 298 },
    { "epoch": 0.48, "grad_norm": 2.059305429458618, "learning_rate": 9.048194047075069e-05, "loss": 1.9259, "step": 300 },
    { "epoch": 0.4832, "grad_norm": 1.7116378545761108, "learning_rate": 9.032603369713596e-05, "loss": 1.6954, "step": 302 },
    { "epoch": 0.4864, "grad_norm": 2.472815990447998, "learning_rate": 9.016899701627604e-05, "loss": 1.8413, "step": 304 },
    { "epoch": 0.4896, "grad_norm": 1.8934400081634521, "learning_rate": 9.00108348282288e-05, "loss": 1.9545, "step": 306 },
    { "epoch": 0.4928, "grad_norm": 2.147753953933716, "learning_rate": 8.985155156458811e-05, "loss": 1.7679, "step": 308 },
    { "epoch": 0.496, "grad_norm": 2.2302675247192383, "learning_rate": 8.969115168835954e-05, "loss": 1.8257, "step": 310 },
    { "epoch": 0.4992, "grad_norm": 1.6578640937805176, "learning_rate": 8.952963969383538e-05, "loss": 1.7151, "step": 312 },
    { "epoch": 0.5024, "grad_norm": 1.754835844039917, "learning_rate": 8.93670201064687e-05, "loss": 2.0074, "step": 314 },
    { "epoch": 0.5056, "grad_norm": 2.130150556564331, "learning_rate": 8.920329748274649e-05, "loss": 1.8657, "step": 316 },
    { "epoch": 0.5088, "grad_norm": 1.7068381309509277, "learning_rate": 8.903847641006218e-05, "loss": 1.8955, "step": 318 },
    { "epoch": 0.512, "grad_norm": 2.0879528522491455, "learning_rate": 8.887256150658684e-05, "loss": 1.7092, "step": 320 },
    { "epoch": 0.5152, "grad_norm": 1.8985047340393066, "learning_rate": 8.870555742113998e-05, "loss": 1.8091, "step": 322 },
    { "epoch": 0.5184, "grad_norm": 1.7577992677688599, "learning_rate": 8.85374688330592e-05, "loss": 1.8895, "step": 324 },
    { "epoch": 0.5216, "grad_norm": 1.8277013301849365, "learning_rate": 8.836830045206911e-05, "loss": 1.8192, "step": 326 },
    { "epoch": 0.5248, "grad_norm": 1.8492199182510376, "learning_rate": 8.81980570181494e-05, "loss": 2.0282, "step": 328 },
    { "epoch": 0.528, "grad_norm": 1.8850246667861938, "learning_rate": 8.802674330140192e-05, "loss": 1.7955, "step": 330 },
    { "epoch": 0.5312, "grad_norm": 1.7965402603149414, "learning_rate": 8.785436410191714e-05, "loss": 1.8271, "step": 332 },
    { "epoch": 0.5344, "grad_norm": 2.0495541095733643, "learning_rate": 8.76809242496396e-05, "loss": 1.9308, "step": 334 },
    { "epoch": 0.5376, "grad_norm": 1.8388515710830688, "learning_rate": 8.750642860423262e-05, "loss": 1.8831, "step": 336 },
    { "epoch": 0.5408, "grad_norm": 2.2101669311523438, "learning_rate": 8.733088205494205e-05, "loss": 1.9837, "step": 338 },
    { "epoch": 0.544, "grad_norm": 1.7564021348953247, "learning_rate": 8.715428952045936e-05, "loss": 2.0114, "step": 340 },
    { "epoch": 0.5472, "grad_norm": 2.0515785217285156, "learning_rate": 8.697665594878382e-05, "loss": 1.7574, "step": 342 },
    { "epoch": 0.5504, "grad_norm": 2.1503772735595703, "learning_rate": 8.679798631708375e-05, "loss": 1.9549, "step": 344 },
    { "epoch": 0.5536, "grad_norm": 1.6707327365875244, "learning_rate": 8.661828563155727e-05, "loss": 1.9318, "step": 346 },
    { "epoch": 0.5568, "grad_norm": 1.9014642238616943, "learning_rate": 8.643755892729179e-05, "loss": 1.9853, "step": 348 },
    { "epoch": 0.56, "grad_norm": 1.9820547103881836, "learning_rate": 8.625581126812312e-05, "loss": 1.8178, "step": 350 },
    { "epoch": 0.5632, "grad_norm": 2.810029983520508, "learning_rate": 8.607304774649349e-05, "loss": 2.0081, "step": 352 },
    { "epoch": 0.5664, "grad_norm": 1.8511972427368164, "learning_rate": 8.588927348330887e-05, "loss": 1.7794, "step": 354 },
    { "epoch": 0.5696, "grad_norm": 1.954455852508545, "learning_rate": 8.57044936277955e-05, "loss": 1.9215, "step": 356 },
    { "epoch": 0.5728, "grad_norm": 1.8836822509765625, "learning_rate": 8.551871335735565e-05, "loss": 1.7449, "step": 358 },
    { "epoch": 0.576, "grad_norm": 1.8966975212097168, "learning_rate": 8.533193787742251e-05, "loss": 1.7689, "step": 360 },
    { "epoch": 0.5792, "grad_norm": 1.7771093845367432, "learning_rate": 8.51441724213143e-05, "loss": 1.8151, "step": 362 },
    { "epoch": 0.5824, "grad_norm": 1.880419135093689, "learning_rate": 8.495542225008771e-05, "loss": 1.805, "step": 364 },
    { "epoch": 0.5856, "grad_norm": 1.820349097251892, "learning_rate": 8.476569265239046e-05, "loss": 1.758, "step": 366 },
    { "epoch": 0.5888, "grad_norm": 1.984392523765564, "learning_rate": 8.457498894431311e-05, "loss": 1.7321, "step": 368 },
    { "epoch": 0.592, "grad_norm": 1.710229516029358, "learning_rate": 8.438331646924013e-05, "loss": 1.7819, "step": 370 },
    { "epoch": 0.5952, "grad_norm": 1.736141324043274, "learning_rate": 8.419068059770011e-05, "loss": 1.8351, "step": 372 },
    { "epoch": 0.5984, "grad_norm": 1.6661279201507568, "learning_rate": 8.399708672721539e-05, "loss": 1.803, "step": 374 },
    { "epoch": 0.6016, "grad_norm": 4.828789710998535, "learning_rate": 8.380254028215076e-05, "loss": 1.8539, "step": 376 },
    { "epoch": 0.6048, "grad_norm": 2.078886032104492, "learning_rate": 8.360704671356145e-05, "loss": 1.7976, "step": 378 },
    { "epoch": 0.608, "grad_norm": 1.720009684562683, "learning_rate": 8.341061149904045e-05, "loss": 1.9524, "step": 380 },
    { "epoch": 0.6112, "grad_norm": 1.935594081878662, "learning_rate": 8.321324014256504e-05, "loss": 1.8671, "step": 382 },
    { "epoch": 0.6144, "grad_norm": 1.868320345878601, "learning_rate": 8.30149381743425e-05, "loss": 1.8896, "step": 384 },
    { "epoch": 0.6176, "grad_norm": 2.0732314586639404, "learning_rate": 8.28157111506552e-05, "loss": 1.8446, "step": 386 },
    { "epoch": 0.6208, "grad_norm": 1.5798280239105225, "learning_rate": 8.261556465370493e-05, "loss": 1.9207, "step": 388 },
    { "epoch": 0.624, "grad_norm": 1.6934467554092407, "learning_rate": 8.24145042914565e-05, "loss": 1.7548, "step": 390 },
    { "epoch": 0.6272, "grad_norm": 1.7732023000717163, "learning_rate": 8.221253569748055e-05, "loss": 1.7041, "step": 392 },
    { "epoch": 0.6304, "grad_norm": 1.9565222263336182, "learning_rate": 8.200966453079575e-05, "loss": 1.8865, "step": 394 },
    { "epoch": 0.6336, "grad_norm": 1.7031235694885254, "learning_rate": 8.180589647571023e-05, "loss": 2.0219, "step": 396 },
    { "epoch": 0.6368, "grad_norm": 1.8705931901931763, "learning_rate": 8.16012372416623e-05, "loss": 1.7774, "step": 398 },
    { "epoch": 0.64, "grad_norm": 1.7355400323867798, "learning_rate": 8.13956925630605e-05, "loss": 1.7273, "step": 400 },
    { "epoch": 0.6432, "grad_norm": 1.7146542072296143, "learning_rate": 8.118926819912287e-05, "loss": 1.8275, "step": 402 },
    { "epoch": 0.6464, "grad_norm": 1.8502819538116455, "learning_rate": 8.098196993371565e-05, "loss": 1.856, "step": 404 },
    { "epoch": 0.6496, "grad_norm": 1.6460517644882202, "learning_rate": 8.077380357519115e-05, "loss": 1.7826, "step": 406 },
    { "epoch": 0.6528, "grad_norm": 1.6977733373641968, "learning_rate": 8.056477495622511e-05, "loss": 2.0396, "step": 408 },
    { "epoch": 0.656, "grad_norm": 2.395606756210327, "learning_rate": 8.035488993365312e-05, "loss": 1.755, "step": 410 },
    { "epoch": 0.6592, "grad_norm": 1.6800931692123413, "learning_rate": 8.014415438830667e-05, "loss": 1.9174, "step": 412 },
    { "epoch": 0.6624, "grad_norm": 1.940741777420044, "learning_rate": 7.993257422484826e-05, "loss": 1.7259, "step": 414 },
    { "epoch": 0.6656, "grad_norm": 1.6088985204696655, "learning_rate": 7.972015537160602e-05, "loss": 1.9236, "step": 416 },
    { "epoch": 0.6688, "grad_norm": 1.77496337890625, "learning_rate": 7.950690378040758e-05, "loss": 1.9956, "step": 418 },
    { "epoch": 0.672, "grad_norm": 2.08013653755188, "learning_rate": 7.929282542641325e-05, "loss": 1.71, "step": 420 },
    { "epoch": 0.6752, "grad_norm": 1.9645555019378662, "learning_rate": 7.907792630794876e-05, "loss": 1.6586, "step": 422 },
    { "epoch": 0.6784, "grad_norm": 2.035111904144287, "learning_rate": 7.886221244633703e-05, "loss": 1.8481, "step": 424 },
    { "epoch": 0.6816, "grad_norm": 1.617519736289978, "learning_rate": 7.864568988572947e-05, "loss": 1.8787, "step": 426 },
    { "epoch": 0.6848, "grad_norm": 1.9266173839569092, "learning_rate": 7.842836469293673e-05, "loss": 1.7332, "step": 428 },
    { "epoch": 0.688, "grad_norm": 1.6716456413269043, "learning_rate": 7.821024295725865e-05, "loss": 1.8147, "step": 430 },
    { "epoch": 0.6912, "grad_norm": 1.9675475358963013, "learning_rate": 7.79913307903136e-05, "loss": 1.77, "step": 432 },
    { "epoch": 0.6944, "grad_norm": 2.048152208328247, "learning_rate": 7.777163432586734e-05, "loss": 1.7438, "step": 434 },
    { "epoch": 0.6976, "grad_norm": 1.7210822105407715, "learning_rate": 7.755115971966104e-05, "loss": 1.7988, "step": 436 },
    { "epoch": 0.7008, "grad_norm": 2.126711845397949, "learning_rate": 7.732991314923891e-05, "loss": 1.7376, "step": 438 },
    { "epoch": 0.704, "grad_norm": 1.7960891723632812, "learning_rate": 7.710790081377502e-05, "loss": 1.7875, "step": 440 },
    { "epoch": 0.7072, "grad_norm": 1.6610071659088135, "learning_rate": 7.688512893389964e-05, "loss": 1.7334, "step": 442 },
    { "epoch": 0.7104, "grad_norm": 1.6998896598815918, "learning_rate": 7.666160375152496e-05, "loss": 1.886, "step": 444 },
    { "epoch": 0.7136, "grad_norm": 1.6629440784454346, "learning_rate": 7.643733152967019e-05, "loss": 1.786, "step": 446 },
    { "epoch": 0.7168, "grad_norm": 1.6910452842712402, "learning_rate": 7.621231855228604e-05, "loss": 2.0343, "step": 448 },
    { "epoch": 0.72, "grad_norm": 1.9952099323272705, "learning_rate": 7.598657112407865e-05, "loss": 1.7571, "step": 450 },
    { "epoch": 0.7232, "grad_norm": 1.7345885038375854, "learning_rate": 7.576009557033304e-05, "loss": 2.0908, "step": 452 },
    { "epoch": 0.7264, "grad_norm": 1.6344877481460571, "learning_rate": 7.553289823673568e-05, "loss": 1.8395, "step": 454 },
    { "epoch": 0.7296, "grad_norm": 2.138115406036377, "learning_rate": 7.530498548919693e-05, "loss": 1.7072, "step": 456 },
    { "epoch": 0.7328, "grad_norm": 1.9216474294662476, "learning_rate": 7.507636371367246e-05, "loss": 1.6516, "step": 458 },
    { "epoch": 0.736, "grad_norm": 1.4932810068130493, "learning_rate": 7.484703931598445e-05, "loss": 1.9351, "step": 460 },
    { "epoch": 0.7392, "grad_norm": 1.8183472156524658, "learning_rate": 7.461701872164204e-05, "loss": 1.8441, "step": 462 },
    { "epoch": 0.7424, "grad_norm": 1.5970336198806763, "learning_rate": 7.438630837566133e-05, "loss": 1.8145, "step": 464 },
    { "epoch": 0.7456, "grad_norm": 1.7351387739181519, "learning_rate": 7.415491474238475e-05, "loss": 1.8858, "step": 466 },
    { "epoch": 0.7488, "grad_norm": 1.6989448070526123, "learning_rate": 7.39228443053e-05, "loss": 1.8566, "step": 468 },
    { "epoch": 0.752, "grad_norm": 1.8217098712921143, "learning_rate": 7.369010356685833e-05, "loss": 1.692, "step": 470 },
    { "epoch": 0.7552, "grad_norm": 1.7833845615386963, "learning_rate": 7.345669904829237e-05, "loss": 1.8145, "step": 472 },
    { "epoch": 0.7584, "grad_norm": 1.7113256454467773, "learning_rate": 7.32226372894334e-05, "loss": 1.907, "step": 474 },
    { "epoch": 0.7616, "grad_norm": 1.66838800907135, "learning_rate": 7.298792484852808e-05, "loss": 1.8243, "step": 476 },
    { "epoch": 0.7648, "grad_norm": 1.8057668209075928, "learning_rate": 7.27525683020548e-05, "loss": 1.6788, "step": 478 },
    { "epoch": 0.768, "grad_norm": 1.7563303709030151, "learning_rate": 7.251657424453928e-05, "loss": 2.0148, "step": 480 },
    { "epoch": 0.7712, "grad_norm": 1.75275719165802, "learning_rate": 7.227994928836988e-05, "loss": 1.7584, "step": 482 },
    { "epoch": 0.7744, "grad_norm": 1.6364191770553589, "learning_rate": 7.204270006361228e-05, "loss": 1.9348, "step": 484 },
    { "epoch": 0.7776, "grad_norm": 1.7930974960327148, "learning_rate": 7.180483321782374e-05, "loss": 1.9014, "step": 486 },
    { "epoch": 0.7808, "grad_norm": 1.8914506435394287, "learning_rate": 7.156635541586682e-05, "loss": 1.7977, "step": 488 },
    { "epoch": 0.784, "grad_norm": 1.7024521827697754, "learning_rate": 7.132727333972265e-05, "loss": 1.6993, "step": 490 },
    { "epoch": 0.7872, "grad_norm": 1.7870112657546997, "learning_rate": 7.108759368830371e-05, "loss": 1.6965, "step": 492 },
    { "epoch": 0.7904, "grad_norm": 1.763691782951355, "learning_rate": 7.084732317726611e-05, "loss": 1.7948, "step": 494 },
    { "epoch": 0.7936, "grad_norm": 1.683468222618103, "learning_rate": 7.060646853882145e-05, "loss": 1.9145, "step": 496 },
    { "epoch": 0.7968, "grad_norm": 1.9888768196105957, "learning_rate": 7.036503652154812e-05, "loss": 1.8192, "step": 498 },
    { "epoch": 0.8, "grad_norm": 1.5705928802490234, "learning_rate": 7.012303389020234e-05, "loss": 1.7831, "step": 500 },
    { "epoch": 0.8032, "grad_norm": 1.860660433769226, "learning_rate": 6.988046742552845e-05, "loss": 1.7904, "step": 502 },
    { "epoch": 0.8064, "grad_norm": 1.8895405530929565, "learning_rate": 6.963734392406907e-05, "loss": 1.8645, "step": 504 },
    { "epoch": 0.8096, "grad_norm": 1.74190354347229, "learning_rate": 6.93936701979746e-05, "loss": 1.8455, "step": 506 },
    { "epoch": 0.8128, "grad_norm": 1.9230369329452515, "learning_rate": 6.914945307481228e-05, "loss": 1.8388, "step": 508 },
    { "epoch": 0.816, "grad_norm": 1.5093566179275513, "learning_rate": 6.890469939737506e-05, "loss": 1.752, "step": 510 },
    { "epoch": 0.8192, "grad_norm": 1.5916728973388672, "learning_rate": 6.865941602348966e-05, "loss": 1.7105, "step": 512 },
    { "epoch": 0.8224, "grad_norm": 1.7378982305526733, "learning_rate": 6.841360982582463e-05, "loss": 1.9789, "step": 514 },
    { "epoch": 0.8256, "grad_norm": 1.7520698308944702, "learning_rate": 6.816728769169757e-05, "loss": 1.7566, "step": 516 },
    { "epoch": 0.8288, "grad_norm": 1.8129826784133911, "learning_rate": 6.792045652288234e-05, "loss": 1.8551, "step": 518 },
    { "epoch": 0.832, "grad_norm": 1.9102818965911865, "learning_rate": 6.767312323541555e-05, "loss": 1.7726, "step": 520 },
    { "epoch": 0.8352, "grad_norm": 1.5088154077529907, "learning_rate": 6.742529475940284e-05, "loss": 1.6381, "step": 522 },
    { "epoch": 0.8384, "grad_norm": 1.7010055780410767, "learning_rate": 6.717697803882467e-05, "loss": 1.8741, "step": 524 },
    { "epoch": 0.8416, "grad_norm": 1.6840184926986694, "learning_rate": 6.692818003134184e-05, "loss": 1.8617, "step": 526 },
    { "epoch": 0.8448, "grad_norm": 1.7205629348754883, "learning_rate": 6.667890770810035e-05, "loss": 1.7349, "step": 528 },
    { "epoch": 0.848, "grad_norm": 1.520727515220642, "learning_rate": 6.64291680535363e-05, "loss": 1.749, "step": 530 },
    { "epoch": 0.8512, "grad_norm": 1.5941743850708008, "learning_rate": 6.617896806518005e-05, "loss": 1.7076, "step": 532 },
    { "epoch": 0.8544, "grad_norm": 1.7745941877365112, "learning_rate": 6.592831475346018e-05, "loss": 1.792, "step": 534 },
    { "epoch": 0.8576, "grad_norm": 1.5072052478790283, "learning_rate": 6.56772151415071e-05, "loss": 1.6149, "step": 536 },
    { "epoch": 0.8608, "grad_norm": 1.6202104091644287, "learning_rate": 6.542567626495619e-05, "loss": 1.756, "step": 538 },
    { "epoch": 0.864, "grad_norm": 1.4974113702774048, "learning_rate": 6.517370517175081e-05, "loss": 1.7919, "step": 540 },
    { "epoch": 0.8672, "grad_norm": 1.653824806213379, "learning_rate": 6.492130892194461e-05, "loss": 2.0103, "step": 542 },
    { "epoch": 0.8704, "grad_norm": 1.683524489402771, "learning_rate": 6.466849458750394e-05, "loss": 2.0337, "step": 544 },
    { "epoch": 0.8736, "grad_norm": 1.5982547998428345, "learning_rate": 6.441526925210949e-05, "loss": 1.8919, "step": 546 },
    { "epoch": 0.8768, "grad_norm": 1.838497519493103, "learning_rate": 6.416164001095799e-05, "loss": 1.7648, "step": 548 },
    { "epoch": 0.88, "grad_norm": 1.524348258972168, "learning_rate": 6.390761397056328e-05, "loss": 1.6804, "step": 550 },
    { "epoch": 0.8832, "grad_norm": 1.6498512029647827, "learning_rate": 6.365319824855727e-05, "loss": 1.6334, "step": 552 },
    { "epoch": 0.8864, "grad_norm": 1.5689668655395508, "learning_rate": 6.339839997349045e-05, "loss": 1.9048, "step": 554 },
    { "epoch": 0.8896, "grad_norm": 1.7050296068191528, "learning_rate": 6.314322628463219e-05, "loss": 1.6864, "step": 556 },
    { "epoch": 0.8928, "grad_norm": 2.038351535797119, "learning_rate": 6.288768433177068e-05, "loss": 1.7531, "step": 558 },
    { "epoch": 0.896, "grad_norm": 1.7489795684814453, "learning_rate": 6.26317812750126e-05, "loss": 1.8467, "step": 560 },
    { "epoch": 0.8992, "grad_norm": 1.6068861484527588, "learning_rate": 6.237552428458256e-05, "loss": 1.8459, "step": 562 },
    { "epoch": 0.9024, "grad_norm": 1.616613745689392, "learning_rate": 6.21189205406221e-05, "loss": 1.8173, "step": 564 },
    { "epoch": 0.9056, "grad_norm": 1.6885602474212646, "learning_rate": 6.186197723298855e-05, "loss": 1.8358, "step": 566 },
    { "epoch": 0.9088, "grad_norm": 1.688711404800415, "learning_rate": 6.160470156105362e-05, "loss": 1.6996, "step": 568 },
    { "epoch": 0.912, "grad_norm": 1.74298894405365, "learning_rate": 6.134710073350156e-05, "loss": 1.722, "step": 570 },
    { "epoch": 0.9152, "grad_norm": 1.6249070167541504, "learning_rate": 6.108918196812734e-05, "loss": 1.7909, "step": 572 },
    { "epoch": 0.9184, "grad_norm": 1.659416675567627, "learning_rate": 6.083095249163424e-05, "loss": 1.6625, "step": 574 },
    { "epoch": 0.9216, "grad_norm": 1.6332143545150757, "learning_rate": 6.057241953943154e-05, "loss": 1.8297, "step": 576 },
    { "epoch": 0.9248, "grad_norm": 1.6717133522033691, "learning_rate": 6.031359035543158e-05, "loss": 2.0601, "step": 578 },
    { "epoch": 0.928, "grad_norm": 1.8736896514892578, "learning_rate": 6.005447219184702e-05, "loss": 1.8117, "step": 580 },
    { "epoch": 0.9312, "grad_norm": 1.6602182388305664, "learning_rate": 5.9795072308987485e-05, "loss": 1.7275, "step": 582 },
    { "epoch": 0.9344, "grad_norm": 1.6776071786880493, "learning_rate": 5.9535397975056154e-05, "loss": 1.8988, "step": 584 },
    { "epoch": 0.9376, "grad_norm": 1.588109016418457, "learning_rate": 5.927545646594617e-05, "loss": 1.7716, "step": 586 },
    { "epoch": 0.9408, "grad_norm": 1.6331814527511597, "learning_rate": 5.901525506503668e-05, "loss": 1.8081, "step": 588 },
    { "epoch": 0.944, "grad_norm": 1.7309777736663818, "learning_rate": 5.87548010629889e-05, "loss": 1.8243, "step": 590 },
    { "epoch": 0.9472, "grad_norm": 1.6374008655548096, "learning_rate": 5.8494101757541676e-05, "loss": 1.7065, "step": 592 },
    { "epoch": 0.9504, "grad_norm": 1.6291025876998901, "learning_rate": 5.8233164453307156e-05, "loss": 1.8013, "step": 594 },
    { "epoch": 0.9536, "grad_norm": 1.7339948415756226, "learning_rate": 5.797199646156596e-05, "loss": 1.7998, "step": 596 },
    { "epoch": 0.9568, "grad_norm": 1.581697940826416, "learning_rate": 5.7710605100062485e-05, "loss": 1.645, "step": 598 },
    { "epoch": 0.96, "grad_norm": 1.713205337524414, "learning_rate": 5.7448997692799764e-05, "loss": 1.9092, "step": 600 },
    { "epoch": 0.9632, "grad_norm": 1.7023775577545166, "learning_rate": 5.718718156983428e-05, "loss": 1.7403, "step": 602 },
    { "epoch": 0.9664, "grad_norm": 1.606632947921753, "learning_rate": 5.69251640670706e-05, "loss": 1.679, "step": 604 },
    { "epoch": 0.9696, "grad_norm": 1.5328476428985596, "learning_rate": 5.6662952526055793e-05, "loss": 1.7899, "step": 606 },
    { "epoch": 0.9728, "grad_norm": 1.5965962409973145, "learning_rate": 5.6400554293773744e-05, "loss": 1.7776, "step": 608 },
    { "epoch": 0.976, "grad_norm": 1.5174623727798462, "learning_rate": 5.61379767224393e-05, "loss": 1.6602, "step": 610 },
    { "epoch": 0.9792, "grad_norm": 1.6876877546310425, "learning_rate": 5.587522716929228e-05, "loss": 1.6656, "step": 612 },
    { "epoch": 0.9824, "grad_norm": 1.5483810901641846, "learning_rate": 5.561231299639127e-05, "loss": 1.6531, "step": 614 },
    { "epoch": 0.9856, "grad_norm": 1.464625597000122, "learning_rate": 5.534924157040745e-05, "loss": 1.8967, "step": 616 },
    { "epoch": 0.9888, "grad_norm": 1.7587417364120483, "learning_rate": 5.508602026241807e-05, "loss": 1.6637, "step": 618 },
    { "epoch": 0.992, "grad_norm": 1.5783720016479492, "learning_rate": 5.482265644769998e-05, "loss": 1.7628, "step": 620 },
    { "epoch": 0.9952, "grad_norm": 1.602127194404602, "learning_rate": 5.4559157505522985e-05, "loss": 1.7458, "step": 622 },
    { "epoch": 0.9984, "grad_norm": 1.8909751176834106, "learning_rate": 5.429553081894304e-05, "loss": 1.6952, "step": 624 }
  ],
  "logging_steps": 2,
  "max_steps": 1250,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 625,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.0566821953614643e+17,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}