|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 30.0, |
|
"eval_steps": 0, |
|
"global_step": 235740, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.06362942224484602, |
|
"grad_norm": 1662.35009765625, |
|
"learning_rate": 4.86e-07, |
|
"loss": 92.5416, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.12725884448969205, |
|
"grad_norm": 288.2401428222656, |
|
"learning_rate": 9.86e-07, |
|
"loss": 20.6659, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.19088826673453804, |
|
"grad_norm": 56.13795852661133, |
|
"learning_rate": 1.4860000000000003e-06, |
|
"loss": 14.7631, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.2545176889793841, |
|
"grad_norm": 102.28019714355469, |
|
"learning_rate": 1.986e-06, |
|
"loss": 14.3025, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.31814711122423006, |
|
"grad_norm": 155.62403869628906, |
|
"learning_rate": 2.486e-06, |
|
"loss": 13.5257, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.3817765334690761, |
|
"grad_norm": 210.75811767578125, |
|
"learning_rate": 2.986e-06, |
|
"loss": 12.8666, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.4454059557139221, |
|
"grad_norm": 256.039306640625, |
|
"learning_rate": 3.4860000000000006e-06, |
|
"loss": 12.397, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.5090353779587682, |
|
"grad_norm": 227.79017639160156, |
|
"learning_rate": 3.9860000000000005e-06, |
|
"loss": 12.2718, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.5726648002036141, |
|
"grad_norm": 307.928955078125, |
|
"learning_rate": 4.486000000000001e-06, |
|
"loss": 11.539, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.6362942224484601, |
|
"grad_norm": 199.85580444335938, |
|
"learning_rate": 4.986e-06, |
|
"loss": 11.1145, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.6999236446933061, |
|
"grad_norm": 236.899169921875, |
|
"learning_rate": 5.4860000000000005e-06, |
|
"loss": 11.1232, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.7635530669381522, |
|
"grad_norm": 265.123046875, |
|
"learning_rate": 5.986000000000001e-06, |
|
"loss": 10.6021, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.8271824891829982, |
|
"grad_norm": 254.1043701171875, |
|
"learning_rate": 6.486e-06, |
|
"loss": 10.4115, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.8908119114278442, |
|
"grad_norm": 172.3489990234375, |
|
"learning_rate": 6.9860000000000005e-06, |
|
"loss": 10.4529, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.9544413336726902, |
|
"grad_norm": 374.72003173828125, |
|
"learning_rate": 7.486000000000001e-06, |
|
"loss": 10.1329, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 1.0180707559175364, |
|
"grad_norm": 320.3682556152344, |
|
"learning_rate": 7.985e-06, |
|
"loss": 10.1367, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.0817001781623823, |
|
"grad_norm": 297.0594787597656, |
|
"learning_rate": 8.485000000000001e-06, |
|
"loss": 9.5914, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 1.1453296004072282, |
|
"grad_norm": 266.2686767578125, |
|
"learning_rate": 8.985000000000001e-06, |
|
"loss": 9.2799, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 1.2089590226520743, |
|
"grad_norm": 168.0514373779297, |
|
"learning_rate": 9.485000000000002e-06, |
|
"loss": 9.266, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 1.2725884448969205, |
|
"grad_norm": 213.7965545654297, |
|
"learning_rate": 9.985000000000002e-06, |
|
"loss": 9.1661, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.3362178671417664, |
|
"grad_norm": 189.05682373046875, |
|
"learning_rate": 9.978515105874015e-06, |
|
"loss": 8.954, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 1.3998472893866123, |
|
"grad_norm": 230.05084228515625, |
|
"learning_rate": 9.956365730486402e-06, |
|
"loss": 8.9562, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 1.4634767116314584, |
|
"grad_norm": 314.4221496582031, |
|
"learning_rate": 9.934304952600337e-06, |
|
"loss": 9.4717, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 1.5271061338763046, |
|
"grad_norm": 190.9048614501953, |
|
"learning_rate": 9.912155577212723e-06, |
|
"loss": 8.6758, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 1.5907355561211505, |
|
"grad_norm": 3140.1875, |
|
"learning_rate": 9.89000620182511e-06, |
|
"loss": 8.87, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 1.6543649783659964, |
|
"grad_norm": 396.64117431640625, |
|
"learning_rate": 9.867856826437496e-06, |
|
"loss": 8.5826, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 1.7179944006108423, |
|
"grad_norm": 171.70077514648438, |
|
"learning_rate": 9.845707451049881e-06, |
|
"loss": 8.4827, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 1.7816238228556884, |
|
"grad_norm": 269.8551940917969, |
|
"learning_rate": 9.823558075662267e-06, |
|
"loss": 8.5306, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 1.8452532451005346, |
|
"grad_norm": 255.013671875, |
|
"learning_rate": 9.801408700274653e-06, |
|
"loss": 8.182, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 1.9088826673453805, |
|
"grad_norm": 194.22486877441406, |
|
"learning_rate": 9.77925932488704e-06, |
|
"loss": 8.3592, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.9725120895902264, |
|
"grad_norm": 149.85800170898438, |
|
"learning_rate": 9.757109949499426e-06, |
|
"loss": 8.3879, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 2.0361415118350727, |
|
"grad_norm": 156.6005401611328, |
|
"learning_rate": 9.735004872862585e-06, |
|
"loss": 7.4399, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 2.0997709340799187, |
|
"grad_norm": 286.58648681640625, |
|
"learning_rate": 9.712855497474972e-06, |
|
"loss": 7.0406, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 2.1634003563247646, |
|
"grad_norm": 242.3479461669922, |
|
"learning_rate": 9.690706122087358e-06, |
|
"loss": 6.89, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 2.2270297785696105, |
|
"grad_norm": 180.5225372314453, |
|
"learning_rate": 9.668556746699744e-06, |
|
"loss": 6.8651, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 2.2906592008144564, |
|
"grad_norm": 223.84552001953125, |
|
"learning_rate": 9.64640737131213e-06, |
|
"loss": 6.8461, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 2.3542886230593028, |
|
"grad_norm": 233.3303680419922, |
|
"learning_rate": 9.624257995924515e-06, |
|
"loss": 6.7663, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 2.4179180453041487, |
|
"grad_norm": 237.0810546875, |
|
"learning_rate": 9.602108620536902e-06, |
|
"loss": 6.9313, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 2.4815474675489946, |
|
"grad_norm": 176.5728302001953, |
|
"learning_rate": 9.579959245149288e-06, |
|
"loss": 6.9688, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 2.545176889793841, |
|
"grad_norm": 184.43077087402344, |
|
"learning_rate": 9.557809869761674e-06, |
|
"loss": 6.7821, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 2.608806312038687, |
|
"grad_norm": 182.1748809814453, |
|
"learning_rate": 9.535660494374059e-06, |
|
"loss": 6.9468, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 2.6724357342835328, |
|
"grad_norm": 232.06759643554688, |
|
"learning_rate": 9.51355541773722e-06, |
|
"loss": 6.731, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 2.7360651565283787, |
|
"grad_norm": 169.12734985351562, |
|
"learning_rate": 9.491406042349606e-06, |
|
"loss": 6.649, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 2.7996945787732246, |
|
"grad_norm": 153.9056854248047, |
|
"learning_rate": 9.469256666961992e-06, |
|
"loss": 6.7055, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 2.8633240010180705, |
|
"grad_norm": 252.30517578125, |
|
"learning_rate": 9.447107291574379e-06, |
|
"loss": 6.7744, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 2.926953423262917, |
|
"grad_norm": 182.51229858398438, |
|
"learning_rate": 9.424957916186765e-06, |
|
"loss": 6.9481, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 2.9905828455077628, |
|
"grad_norm": 213.7582244873047, |
|
"learning_rate": 9.40280854079915e-06, |
|
"loss": 6.5967, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 3.0542122677526087, |
|
"grad_norm": 187.1132049560547, |
|
"learning_rate": 9.380659165411536e-06, |
|
"loss": 5.7351, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 3.117841689997455, |
|
"grad_norm": 157.81378173828125, |
|
"learning_rate": 9.358509790023921e-06, |
|
"loss": 5.4125, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 3.181471112242301, |
|
"grad_norm": 448.2672424316406, |
|
"learning_rate": 9.336360414636309e-06, |
|
"loss": 5.4095, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 3.245100534487147, |
|
"grad_norm": 170.9069061279297, |
|
"learning_rate": 9.314211039248694e-06, |
|
"loss": 5.4253, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 3.3087299567319928, |
|
"grad_norm": 186.37034606933594, |
|
"learning_rate": 9.29206166386108e-06, |
|
"loss": 5.3774, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 3.3723593789768387, |
|
"grad_norm": 134.44960021972656, |
|
"learning_rate": 9.269912288473466e-06, |
|
"loss": 5.5277, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 3.435988801221685, |
|
"grad_norm": 268.1274108886719, |
|
"learning_rate": 9.247807211836627e-06, |
|
"loss": 5.4516, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 3.499618223466531, |
|
"grad_norm": 248.0684814453125, |
|
"learning_rate": 9.225657836449013e-06, |
|
"loss": 5.322, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 3.563247645711377, |
|
"grad_norm": 214.72317504882812, |
|
"learning_rate": 9.203508461061398e-06, |
|
"loss": 5.5531, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 3.626877067956223, |
|
"grad_norm": 153.9894256591797, |
|
"learning_rate": 9.181359085673784e-06, |
|
"loss": 5.5238, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 3.690506490201069, |
|
"grad_norm": 174.88331604003906, |
|
"learning_rate": 9.159209710286171e-06, |
|
"loss": 5.5992, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 3.754135912445915, |
|
"grad_norm": 301.410888671875, |
|
"learning_rate": 9.137104633649332e-06, |
|
"loss": 5.5351, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 3.817765334690761, |
|
"grad_norm": 201.53282165527344, |
|
"learning_rate": 9.114955258261718e-06, |
|
"loss": 5.3985, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 3.881394756935607, |
|
"grad_norm": 212.6214141845703, |
|
"learning_rate": 9.092805882874104e-06, |
|
"loss": 5.4313, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 3.945024179180453, |
|
"grad_norm": 177.44863891601562, |
|
"learning_rate": 9.07065650748649e-06, |
|
"loss": 5.4173, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 4.008653601425299, |
|
"grad_norm": 160.0504150390625, |
|
"learning_rate": 9.04855143084965e-06, |
|
"loss": 5.2333, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 4.0722830236701455, |
|
"grad_norm": 150.31857299804688, |
|
"learning_rate": 9.026446354212812e-06, |
|
"loss": 4.3352, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 4.135912445914991, |
|
"grad_norm": 124.97169494628906, |
|
"learning_rate": 9.004296978825197e-06, |
|
"loss": 4.3442, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 4.199541868159837, |
|
"grad_norm": 215.25157165527344, |
|
"learning_rate": 8.982147603437585e-06, |
|
"loss": 4.3288, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 4.263171290404683, |
|
"grad_norm": 148.4134521484375, |
|
"learning_rate": 8.95999822804997e-06, |
|
"loss": 4.367, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 4.326800712649529, |
|
"grad_norm": 204.40850830078125, |
|
"learning_rate": 8.93789315141313e-06, |
|
"loss": 4.4607, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 4.390430134894375, |
|
"grad_norm": 164.64273071289062, |
|
"learning_rate": 8.915743776025517e-06, |
|
"loss": 4.4461, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 4.454059557139221, |
|
"grad_norm": 204.80953979492188, |
|
"learning_rate": 8.893594400637903e-06, |
|
"loss": 4.6218, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 4.517688979384067, |
|
"grad_norm": 185.70278930664062, |
|
"learning_rate": 8.871445025250289e-06, |
|
"loss": 4.4249, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 4.581318401628913, |
|
"grad_norm": 202.91989135742188, |
|
"learning_rate": 8.849295649862674e-06, |
|
"loss": 4.4129, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 4.64494782387376, |
|
"grad_norm": 164.02198791503906, |
|
"learning_rate": 8.82714627447506e-06, |
|
"loss": 4.4065, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 4.7085772461186055, |
|
"grad_norm": 155.7901153564453, |
|
"learning_rate": 8.804996899087447e-06, |
|
"loss": 4.5452, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 4.772206668363451, |
|
"grad_norm": 194.26280212402344, |
|
"learning_rate": 8.782847523699833e-06, |
|
"loss": 4.5411, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 4.835836090608297, |
|
"grad_norm": 168.18798828125, |
|
"learning_rate": 8.760698148312218e-06, |
|
"loss": 4.5423, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 4.899465512853143, |
|
"grad_norm": 136.41905212402344, |
|
"learning_rate": 8.738548772924604e-06, |
|
"loss": 4.4942, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 4.963094935097989, |
|
"grad_norm": 141.8522491455078, |
|
"learning_rate": 8.71639939753699e-06, |
|
"loss": 4.5332, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 5.026724357342835, |
|
"grad_norm": 149.42271423339844, |
|
"learning_rate": 8.694250022149377e-06, |
|
"loss": 4.0759, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 5.090353779587681, |
|
"grad_norm": 139.2994842529297, |
|
"learning_rate": 8.672100646761763e-06, |
|
"loss": 3.6274, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 5.153983201832528, |
|
"grad_norm": 140.65269470214844, |
|
"learning_rate": 8.649951271374148e-06, |
|
"loss": 3.6795, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 5.217612624077374, |
|
"grad_norm": 139.22752380371094, |
|
"learning_rate": 8.627801895986534e-06, |
|
"loss": 3.6741, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 5.28124204632222, |
|
"grad_norm": 93.71381378173828, |
|
"learning_rate": 8.60565252059892e-06, |
|
"loss": 3.7396, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 5.3448714685670655, |
|
"grad_norm": 118.81936645507812, |
|
"learning_rate": 8.583503145211307e-06, |
|
"loss": 3.6839, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 5.408500890811911, |
|
"grad_norm": 143.53829956054688, |
|
"learning_rate": 8.561353769823692e-06, |
|
"loss": 3.732, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 5.472130313056757, |
|
"grad_norm": 152.01527404785156, |
|
"learning_rate": 8.539248693186852e-06, |
|
"loss": 3.6557, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 5.535759735301603, |
|
"grad_norm": 159.16392517089844, |
|
"learning_rate": 8.517143616550015e-06, |
|
"loss": 3.6925, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 5.599389157546449, |
|
"grad_norm": 143.2123260498047, |
|
"learning_rate": 8.4949942411624e-06, |
|
"loss": 3.7149, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 5.663018579791295, |
|
"grad_norm": 136.5101318359375, |
|
"learning_rate": 8.472844865774786e-06, |
|
"loss": 3.6744, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 5.726648002036142, |
|
"grad_norm": 156.95541381835938, |
|
"learning_rate": 8.450695490387172e-06, |
|
"loss": 3.7669, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 5.790277424280988, |
|
"grad_norm": 137.13330078125, |
|
"learning_rate": 8.428546114999557e-06, |
|
"loss": 3.651, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 5.853906846525834, |
|
"grad_norm": 149.19625854492188, |
|
"learning_rate": 8.406396739611945e-06, |
|
"loss": 3.721, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 5.91753626877068, |
|
"grad_norm": 193.83432006835938, |
|
"learning_rate": 8.384291662975104e-06, |
|
"loss": 3.7012, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 5.9811656910155255, |
|
"grad_norm": 149.3867950439453, |
|
"learning_rate": 8.362186586338266e-06, |
|
"loss": 3.7294, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 6.0447951132603714, |
|
"grad_norm": 144.5869140625, |
|
"learning_rate": 8.340037210950653e-06, |
|
"loss": 3.2432, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 6.108424535505217, |
|
"grad_norm": 138.15234375, |
|
"learning_rate": 8.317887835563039e-06, |
|
"loss": 3.0295, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 6.172053957750063, |
|
"grad_norm": 544.6531372070312, |
|
"learning_rate": 8.295738460175424e-06, |
|
"loss": 3.0364, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 6.23568337999491, |
|
"grad_norm": 124.35468292236328, |
|
"learning_rate": 8.273633383538585e-06, |
|
"loss": 3.0687, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 6.299312802239756, |
|
"grad_norm": 93.38568878173828, |
|
"learning_rate": 8.251484008150971e-06, |
|
"loss": 3.064, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 6.362942224484602, |
|
"grad_norm": 192.03231811523438, |
|
"learning_rate": 8.229334632763357e-06, |
|
"loss": 3.112, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 6.426571646729448, |
|
"grad_norm": 107.92765808105469, |
|
"learning_rate": 8.207185257375742e-06, |
|
"loss": 3.1438, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 6.490201068974294, |
|
"grad_norm": 124.23885345458984, |
|
"learning_rate": 8.185080180738904e-06, |
|
"loss": 3.0733, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 6.55383049121914, |
|
"grad_norm": 154.87612915039062, |
|
"learning_rate": 8.162930805351291e-06, |
|
"loss": 3.1719, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 6.6174599134639855, |
|
"grad_norm": 134.2186737060547, |
|
"learning_rate": 8.140781429963675e-06, |
|
"loss": 3.1355, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 6.6810893357088315, |
|
"grad_norm": 173.08433532714844, |
|
"learning_rate": 8.11863205457606e-06, |
|
"loss": 3.1612, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 6.744718757953677, |
|
"grad_norm": 179.25296020507812, |
|
"learning_rate": 8.096482679188448e-06, |
|
"loss": 3.1938, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 6.808348180198524, |
|
"grad_norm": 138.08518981933594, |
|
"learning_rate": 8.074333303800833e-06, |
|
"loss": 3.1375, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 6.87197760244337, |
|
"grad_norm": 106.96342468261719, |
|
"learning_rate": 8.052183928413219e-06, |
|
"loss": 3.1969, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 6.935607024688216, |
|
"grad_norm": 127.7270278930664, |
|
"learning_rate": 8.030034553025605e-06, |
|
"loss": 3.2214, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 6.999236446933062, |
|
"grad_norm": 151.88905334472656, |
|
"learning_rate": 8.007885177637992e-06, |
|
"loss": 3.1364, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 7.062865869177908, |
|
"grad_norm": 146.13461303710938, |
|
"learning_rate": 7.985735802250378e-06, |
|
"loss": 2.63, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 7.126495291422754, |
|
"grad_norm": 158.6125030517578, |
|
"learning_rate": 7.963586426862763e-06, |
|
"loss": 2.5451, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 7.1901247136676, |
|
"grad_norm": 136.17828369140625, |
|
"learning_rate": 7.941481350225924e-06, |
|
"loss": 2.644, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 7.2537541359124456, |
|
"grad_norm": 183.11447143554688, |
|
"learning_rate": 7.91933197483831e-06, |
|
"loss": 2.6482, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 7.317383558157292, |
|
"grad_norm": 125.30079650878906, |
|
"learning_rate": 7.897182599450696e-06, |
|
"loss": 2.6017, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 7.381012980402138, |
|
"grad_norm": 104.10094451904297, |
|
"learning_rate": 7.875033224063083e-06, |
|
"loss": 2.6626, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 7.444642402646984, |
|
"grad_norm": 153.14060974121094, |
|
"learning_rate": 7.852883848675467e-06, |
|
"loss": 2.6698, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 7.50827182489183, |
|
"grad_norm": 80.38119506835938, |
|
"learning_rate": 7.830734473287854e-06, |
|
"loss": 2.6595, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 7.571901247136676, |
|
"grad_norm": 139.31524658203125, |
|
"learning_rate": 7.80858509790024e-06, |
|
"loss": 2.6683, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 7.635530669381522, |
|
"grad_norm": 135.78240966796875, |
|
"learning_rate": 7.786480021263401e-06, |
|
"loss": 2.7187, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 7.699160091626368, |
|
"grad_norm": 109.59832000732422, |
|
"learning_rate": 7.764330645875787e-06, |
|
"loss": 2.6213, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 7.762789513871214, |
|
"grad_norm": 143.305908203125, |
|
"learning_rate": 7.742181270488172e-06, |
|
"loss": 2.7119, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 7.82641893611606, |
|
"grad_norm": 147.27064514160156, |
|
"learning_rate": 7.72003189510056e-06, |
|
"loss": 2.739, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 7.8900483583609065, |
|
"grad_norm": 109.4032211303711, |
|
"learning_rate": 7.697882519712945e-06, |
|
"loss": 2.686, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 7.953677780605752, |
|
"grad_norm": 111.08818054199219, |
|
"learning_rate": 7.675733144325331e-06, |
|
"loss": 2.7295, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 8.017307202850597, |
|
"grad_norm": 80.8994369506836, |
|
"learning_rate": 7.653583768937717e-06, |
|
"loss": 2.6062, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 8.080936625095443, |
|
"grad_norm": 132.42283630371094, |
|
"learning_rate": 7.631434393550102e-06, |
|
"loss": 2.2272, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 8.144566047340291, |
|
"grad_norm": 105.58837127685547, |
|
"learning_rate": 7.6093293169132635e-06, |
|
"loss": 2.2692, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 8.208195469585137, |
|
"grad_norm": 165.8797149658203, |
|
"learning_rate": 7.58717994152565e-06, |
|
"loss": 2.3135, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 8.271824891829983, |
|
"grad_norm": 103.73261260986328, |
|
"learning_rate": 7.5650305661380356e-06, |
|
"loss": 2.2546, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 8.335454314074829, |
|
"grad_norm": 100.5468521118164, |
|
"learning_rate": 7.542881190750422e-06, |
|
"loss": 2.2882, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 8.399083736319675, |
|
"grad_norm": 124.30194854736328, |
|
"learning_rate": 7.520731815362808e-06, |
|
"loss": 2.2749, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 8.46271315856452, |
|
"grad_norm": 124.07736206054688, |
|
"learning_rate": 7.498582439975194e-06, |
|
"loss": 2.363, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 8.526342580809366, |
|
"grad_norm": 110.9386978149414, |
|
"learning_rate": 7.47643306458758e-06, |
|
"loss": 2.2923, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 8.589972003054212, |
|
"grad_norm": 129.3117218017578, |
|
"learning_rate": 7.4542836891999645e-06, |
|
"loss": 2.3275, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 8.653601425299058, |
|
"grad_norm": 111.8931884765625, |
|
"learning_rate": 7.432134313812351e-06, |
|
"loss": 2.3738, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 8.717230847543904, |
|
"grad_norm": 118.7526626586914, |
|
"learning_rate": 7.409984938424737e-06, |
|
"loss": 2.3416, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 8.78086026978875, |
|
"grad_norm": 149.440673828125, |
|
"learning_rate": 7.387835563037123e-06, |
|
"loss": 2.3851, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 8.844489692033596, |
|
"grad_norm": 122.81755828857422, |
|
"learning_rate": 7.365730486400284e-06, |
|
"loss": 2.3356, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 8.908119114278442, |
|
"grad_norm": 132.1360626220703, |
|
"learning_rate": 7.34358111101267e-06, |
|
"loss": 2.3598, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 8.971748536523288, |
|
"grad_norm": 125.38104248046875, |
|
"learning_rate": 7.3214317356250565e-06, |
|
"loss": 2.4272, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 9.035377958768134, |
|
"grad_norm": 94.84292602539062, |
|
"learning_rate": 7.299326658988217e-06, |
|
"loss": 2.141, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 9.09900738101298, |
|
"grad_norm": 108.36376190185547, |
|
"learning_rate": 7.2771772836006025e-06, |
|
"loss": 2.001, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 9.162636803257826, |
|
"grad_norm": 120.51274108886719, |
|
"learning_rate": 7.255027908212989e-06, |
|
"loss": 2.014, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 9.226266225502673, |
|
"grad_norm": 76.73661041259766, |
|
"learning_rate": 7.232878532825375e-06, |
|
"loss": 1.9826, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 9.28989564774752, |
|
"grad_norm": 93.48287200927734, |
|
"learning_rate": 7.210729157437761e-06, |
|
"loss": 1.995, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 9.353525069992365, |
|
"grad_norm": 87.56092071533203, |
|
"learning_rate": 7.188579782050147e-06, |
|
"loss": 2.0097, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 9.417154492237211, |
|
"grad_norm": 128.68373107910156, |
|
"learning_rate": 7.166430406662532e-06, |
|
"loss": 2.0412, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 9.480783914482057, |
|
"grad_norm": 101.52668762207031, |
|
"learning_rate": 7.144281031274919e-06, |
|
"loss": 2.0144, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 9.544413336726903, |
|
"grad_norm": 90.50218963623047, |
|
"learning_rate": 7.12217595463808e-06, |
|
"loss": 2.0653, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 9.608042758971749, |
|
"grad_norm": 113.8707046508789, |
|
"learning_rate": 7.100026579250465e-06, |
|
"loss": 2.022, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 9.671672181216595, |
|
"grad_norm": 78.54847717285156, |
|
"learning_rate": 7.077921502613627e-06, |
|
"loss": 2.0327, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 9.73530160346144, |
|
"grad_norm": 131.4427947998047, |
|
"learning_rate": 7.055772127226013e-06, |
|
"loss": 2.0596, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 9.798931025706286, |
|
"grad_norm": 120.61900329589844, |
|
"learning_rate": 7.033667050589174e-06, |
|
"loss": 2.0761, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 9.862560447951132, |
|
"grad_norm": 84.29814147949219, |
|
"learning_rate": 7.01151767520156e-06, |
|
"loss": 2.1245, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 9.926189870195978, |
|
"grad_norm": 91.78532409667969, |
|
"learning_rate": 6.989368299813946e-06, |
|
"loss": 2.1062, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 9.989819292440824, |
|
"grad_norm": 111.85667419433594, |
|
"learning_rate": 6.9672189244263324e-06, |
|
"loss": 2.1186, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 10.05344871468567, |
|
"grad_norm": 97.98519897460938, |
|
"learning_rate": 6.945113847789493e-06, |
|
"loss": 1.8283, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 10.117078136930516, |
|
"grad_norm": 80.28434753417969, |
|
"learning_rate": 6.9229644724018785e-06, |
|
"loss": 1.7627, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 10.180707559175362, |
|
"grad_norm": 99.89539337158203, |
|
"learning_rate": 6.900859395765041e-06, |
|
"loss": 1.7775, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 10.244336981420208, |
|
"grad_norm": 87.49510955810547, |
|
"learning_rate": 6.878710020377426e-06, |
|
"loss": 1.7865, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 10.307966403665056, |
|
"grad_norm": 87.29383850097656, |
|
"learning_rate": 6.856560644989811e-06, |
|
"loss": 1.8018, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 10.371595825909901, |
|
"grad_norm": 88.82074737548828, |
|
"learning_rate": 6.834411269602198e-06, |
|
"loss": 1.7851, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 10.435225248154747, |
|
"grad_norm": 90.42290496826172, |
|
"learning_rate": 6.812261894214583e-06, |
|
"loss": 1.8085, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 10.498854670399593, |
|
"grad_norm": 85.89569091796875, |
|
"learning_rate": 6.7901125188269704e-06, |
|
"loss": 1.8293, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 10.56248409264444, |
|
"grad_norm": 89.20499420166016, |
|
"learning_rate": 6.767963143439355e-06, |
|
"loss": 1.8549, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 10.626113514889285, |
|
"grad_norm": 193.05775451660156, |
|
"learning_rate": 6.745813768051741e-06, |
|
"loss": 1.8531, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 10.689742937134131, |
|
"grad_norm": 106.58789825439453, |
|
"learning_rate": 6.723664392664127e-06, |
|
"loss": 1.8538, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 10.753372359378977, |
|
"grad_norm": 136.8468780517578, |
|
"learning_rate": 6.701515017276513e-06, |
|
"loss": 1.8814, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 10.817001781623823, |
|
"grad_norm": 128.12271118164062, |
|
"learning_rate": 6.679365641888899e-06, |
|
"loss": 1.8576, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 10.880631203868669, |
|
"grad_norm": 70.90370178222656, |
|
"learning_rate": 6.657216266501285e-06, |
|
"loss": 1.8516, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 10.944260626113515, |
|
"grad_norm": 77.27445220947266, |
|
"learning_rate": 6.635066891113671e-06, |
|
"loss": 1.8555, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 11.00789004835836, |
|
"grad_norm": 108.38621520996094, |
|
"learning_rate": 6.612917515726057e-06, |
|
"loss": 1.8631, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 11.071519470603207, |
|
"grad_norm": 145.12940979003906, |
|
"learning_rate": 6.590768140338443e-06, |
|
"loss": 1.6189, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 11.135148892848052, |
|
"grad_norm": 115.5062484741211, |
|
"learning_rate": 6.568618764950829e-06, |
|
"loss": 1.6143, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 11.198778315092898, |
|
"grad_norm": 71.71438598632812, |
|
"learning_rate": 6.546469389563215e-06, |
|
"loss": 1.6246, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 11.262407737337744, |
|
"grad_norm": 89.9764633178711, |
|
"learning_rate": 6.5243200141756004e-06, |
|
"loss": 1.5997, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 11.32603715958259, |
|
"grad_norm": 80.51982879638672, |
|
"learning_rate": 6.502170638787987e-06, |
|
"loss": 1.646, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 11.389666581827438, |
|
"grad_norm": 87.14283752441406, |
|
"learning_rate": 6.4800212634003725e-06, |
|
"loss": 1.6323, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 11.453296004072284, |
|
"grad_norm": 76.05656433105469, |
|
"learning_rate": 6.457871888012759e-06, |
|
"loss": 1.6623, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 11.51692542631713, |
|
"grad_norm": 84.08787536621094, |
|
"learning_rate": 6.435722512625145e-06, |
|
"loss": 1.6544, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 11.580554848561976, |
|
"grad_norm": 113.19395446777344, |
|
"learning_rate": 6.413573137237531e-06, |
|
"loss": 1.6671, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 11.644184270806822, |
|
"grad_norm": 92.68965911865234, |
|
"learning_rate": 6.391423761849917e-06, |
|
"loss": 1.6742, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 11.707813693051667, |
|
"grad_norm": 116.95278930664062, |
|
"learning_rate": 6.369274386462302e-06, |
|
"loss": 1.6409, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 11.771443115296513, |
|
"grad_norm": 77.6058120727539, |
|
"learning_rate": 6.347213608576238e-06, |
|
"loss": 1.6504, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 11.83507253754136, |
|
"grad_norm": 74.96102142333984, |
|
"learning_rate": 6.3251085319394e-06, |
|
"loss": 1.6791, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 11.898701959786205, |
|
"grad_norm": 95.83757781982422, |
|
"learning_rate": 6.302959156551785e-06, |
|
"loss": 1.6923, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 11.962331382031051, |
|
"grad_norm": 114.55757141113281, |
|
"learning_rate": 6.280809781164172e-06, |
|
"loss": 1.697, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 12.025960804275897, |
|
"grad_norm": 59.73118591308594, |
|
"learning_rate": 6.258660405776557e-06, |
|
"loss": 1.6136, |
|
"step": 94500 |
|
}, |
|
{ |
|
"epoch": 12.089590226520743, |
|
"grad_norm": 86.23199462890625, |
|
"learning_rate": 6.236511030388943e-06, |
|
"loss": 1.4437, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 12.153219648765589, |
|
"grad_norm": 71.51868438720703, |
|
"learning_rate": 6.2143616550013295e-06, |
|
"loss": 1.49, |
|
"step": 95500 |
|
}, |
|
{ |
|
"epoch": 12.216849071010435, |
|
"grad_norm": 96.19779205322266, |
|
"learning_rate": 6.192212279613715e-06, |
|
"loss": 1.4567, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 12.28047849325528, |
|
"grad_norm": 79.43608093261719, |
|
"learning_rate": 6.170062904226102e-06, |
|
"loss": 1.5007, |
|
"step": 96500 |
|
}, |
|
{ |
|
"epoch": 12.344107915500127, |
|
"grad_norm": 79.2935791015625, |
|
"learning_rate": 6.147913528838487e-06, |
|
"loss": 1.4826, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 12.407737337744972, |
|
"grad_norm": 144.53054809570312, |
|
"learning_rate": 6.125764153450873e-06, |
|
"loss": 1.4668, |
|
"step": 97500 |
|
}, |
|
{ |
|
"epoch": 12.47136675998982, |
|
"grad_norm": 105.31471252441406, |
|
"learning_rate": 6.103659076814035e-06, |
|
"loss": 1.5009, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 12.534996182234666, |
|
"grad_norm": 79.45948028564453, |
|
"learning_rate": 6.08150970142642e-06, |
|
"loss": 1.5008, |
|
"step": 98500 |
|
}, |
|
{ |
|
"epoch": 12.598625604479512, |
|
"grad_norm": 100.81867218017578, |
|
"learning_rate": 6.059360326038807e-06, |
|
"loss": 1.5336, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 12.662255026724358, |
|
"grad_norm": 94.66363525390625, |
|
"learning_rate": 6.037210950651192e-06, |
|
"loss": 1.5057, |
|
"step": 99500 |
|
}, |
|
{ |
|
"epoch": 12.725884448969204, |
|
"grad_norm": 73.030517578125, |
|
"learning_rate": 6.0150615752635775e-06, |
|
"loss": 1.5081, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 12.78951387121405, |
|
"grad_norm": 67.0549545288086, |
|
"learning_rate": 5.99295649862674e-06, |
|
"loss": 1.5402, |
|
"step": 100500 |
|
}, |
|
{ |
|
"epoch": 12.853143293458896, |
|
"grad_norm": 91.37773895263672, |
|
"learning_rate": 5.970807123239125e-06, |
|
"loss": 1.5519, |
|
"step": 101000 |
|
}, |
|
{ |
|
"epoch": 12.916772715703742, |
|
"grad_norm": 87.36595153808594, |
|
"learning_rate": 5.948657747851511e-06, |
|
"loss": 1.5171, |
|
"step": 101500 |
|
}, |
|
{ |
|
"epoch": 12.980402137948587, |
|
"grad_norm": 82.45221710205078, |
|
"learning_rate": 5.926508372463897e-06, |
|
"loss": 1.5249, |
|
"step": 102000 |
|
}, |
|
{ |
|
"epoch": 13.044031560193433, |
|
"grad_norm": 67.87359619140625, |
|
"learning_rate": 5.904358997076283e-06, |
|
"loss": 1.4117, |
|
"step": 102500 |
|
}, |
|
{ |
|
"epoch": 13.10766098243828, |
|
"grad_norm": 77.75003814697266, |
|
"learning_rate": 5.882209621688669e-06, |
|
"loss": 1.3524, |
|
"step": 103000 |
|
}, |
|
{ |
|
"epoch": 13.171290404683125, |
|
"grad_norm": 103.19142150878906, |
|
"learning_rate": 5.860060246301055e-06, |
|
"loss": 1.3564, |
|
"step": 103500 |
|
}, |
|
{ |
|
"epoch": 13.234919826927971, |
|
"grad_norm": 82.8349380493164, |
|
"learning_rate": 5.837999468414991e-06, |
|
"loss": 1.3483, |
|
"step": 104000 |
|
}, |
|
{ |
|
"epoch": 13.298549249172817, |
|
"grad_norm": 83.94813537597656, |
|
"learning_rate": 5.815850093027378e-06, |
|
"loss": 1.386, |
|
"step": 104500 |
|
}, |
|
{ |
|
"epoch": 13.362178671417663, |
|
"grad_norm": 80.00110626220703, |
|
"learning_rate": 5.793700717639763e-06, |
|
"loss": 1.3723, |
|
"step": 105000 |
|
}, |
|
{ |
|
"epoch": 13.425808093662509, |
|
"grad_norm": 79.54706573486328, |
|
"learning_rate": 5.771551342252149e-06, |
|
"loss": 1.3933, |
|
"step": 105500 |
|
}, |
|
{ |
|
"epoch": 13.489437515907355, |
|
"grad_norm": 118.33966827392578, |
|
"learning_rate": 5.749401966864535e-06, |
|
"loss": 1.3672, |
|
"step": 106000 |
|
}, |
|
{ |
|
"epoch": 13.553066938152202, |
|
"grad_norm": 148.68141174316406, |
|
"learning_rate": 5.727252591476921e-06, |
|
"loss": 1.3796, |
|
"step": 106500 |
|
}, |
|
{ |
|
"epoch": 13.616696360397048, |
|
"grad_norm": 81.23079681396484, |
|
"learning_rate": 5.705103216089307e-06, |
|
"loss": 1.3637, |
|
"step": 107000 |
|
}, |
|
{ |
|
"epoch": 13.680325782641894, |
|
"grad_norm": 118.37026977539062, |
|
"learning_rate": 5.682953840701693e-06, |
|
"loss": 1.4061, |
|
"step": 107500 |
|
}, |
|
{ |
|
"epoch": 13.74395520488674, |
|
"grad_norm": 87.67139434814453, |
|
"learning_rate": 5.660804465314078e-06, |
|
"loss": 1.3897, |
|
"step": 108000 |
|
}, |
|
{ |
|
"epoch": 13.807584627131586, |
|
"grad_norm": 76.84065246582031, |
|
"learning_rate": 5.638655089926465e-06, |
|
"loss": 1.4342, |
|
"step": 108500 |
|
}, |
|
{ |
|
"epoch": 13.871214049376432, |
|
"grad_norm": 83.0779037475586, |
|
"learning_rate": 5.61650571453885e-06, |
|
"loss": 1.3821, |
|
"step": 109000 |
|
}, |
|
{ |
|
"epoch": 13.934843471621278, |
|
"grad_norm": 63.323001861572266, |
|
"learning_rate": 5.594400637902012e-06, |
|
"loss": 1.411, |
|
"step": 109500 |
|
}, |
|
{ |
|
"epoch": 13.998472893866124, |
|
"grad_norm": 75.757080078125, |
|
"learning_rate": 5.572295561265173e-06, |
|
"loss": 1.4214, |
|
"step": 110000 |
|
}, |
|
{ |
|
"epoch": 14.06210231611097, |
|
"grad_norm": 47.76633071899414, |
|
"learning_rate": 5.550146185877559e-06, |
|
"loss": 1.2551, |
|
"step": 110500 |
|
}, |
|
{ |
|
"epoch": 14.125731738355816, |
|
"grad_norm": 67.52932739257812, |
|
"learning_rate": 5.528041109240719e-06, |
|
"loss": 1.2366, |
|
"step": 111000 |
|
}, |
|
{ |
|
"epoch": 14.189361160600662, |
|
"grad_norm": 77.91776275634766, |
|
"learning_rate": 5.505891733853106e-06, |
|
"loss": 1.2553, |
|
"step": 111500 |
|
}, |
|
{ |
|
"epoch": 14.252990582845507, |
|
"grad_norm": 74.56119537353516, |
|
"learning_rate": 5.4837423584654914e-06, |
|
"loss": 1.2553, |
|
"step": 112000 |
|
}, |
|
{ |
|
"epoch": 14.316620005090353, |
|
"grad_norm": 70.80554962158203, |
|
"learning_rate": 5.461592983077878e-06, |
|
"loss": 1.2624, |
|
"step": 112500 |
|
}, |
|
{ |
|
"epoch": 14.3802494273352, |
|
"grad_norm": 72.7087631225586, |
|
"learning_rate": 5.4394436076902635e-06, |
|
"loss": 1.2771, |
|
"step": 113000 |
|
}, |
|
{ |
|
"epoch": 14.443878849580045, |
|
"grad_norm": 81.98471069335938, |
|
"learning_rate": 5.41729423230265e-06, |
|
"loss": 1.2744, |
|
"step": 113500 |
|
}, |
|
{ |
|
"epoch": 14.507508271824891, |
|
"grad_norm": 71.72978973388672, |
|
"learning_rate": 5.395189155665811e-06, |
|
"loss": 1.2616, |
|
"step": 114000 |
|
}, |
|
{ |
|
"epoch": 14.571137694069737, |
|
"grad_norm": 73.07415771484375, |
|
"learning_rate": 5.373039780278196e-06, |
|
"loss": 1.2744, |
|
"step": 114500 |
|
}, |
|
{ |
|
"epoch": 14.634767116314585, |
|
"grad_norm": 46.78715133666992, |
|
"learning_rate": 5.350890404890583e-06, |
|
"loss": 1.2705, |
|
"step": 115000 |
|
}, |
|
{ |
|
"epoch": 14.69839653855943, |
|
"grad_norm": 80.48126220703125, |
|
"learning_rate": 5.328741029502968e-06, |
|
"loss": 1.3005, |
|
"step": 115500 |
|
}, |
|
{ |
|
"epoch": 14.762025960804277, |
|
"grad_norm": 78.11446380615234, |
|
"learning_rate": 5.306591654115354e-06, |
|
"loss": 1.3013, |
|
"step": 116000 |
|
}, |
|
{ |
|
"epoch": 14.825655383049122, |
|
"grad_norm": 113.7435302734375, |
|
"learning_rate": 5.28444227872774e-06, |
|
"loss": 1.298, |
|
"step": 116500 |
|
}, |
|
{ |
|
"epoch": 14.889284805293968, |
|
"grad_norm": 58.536346435546875, |
|
"learning_rate": 5.262292903340126e-06, |
|
"loss": 1.2972, |
|
"step": 117000 |
|
}, |
|
{ |
|
"epoch": 14.952914227538814, |
|
"grad_norm": 85.87594604492188, |
|
"learning_rate": 5.240143527952512e-06, |
|
"loss": 1.277, |
|
"step": 117500 |
|
}, |
|
{ |
|
"epoch": 15.01654364978366, |
|
"grad_norm": 61.39375305175781, |
|
"learning_rate": 5.217994152564898e-06, |
|
"loss": 1.2718, |
|
"step": 118000 |
|
}, |
|
{ |
|
"epoch": 15.080173072028506, |
|
"grad_norm": 64.70631408691406, |
|
"learning_rate": 5.1958447771772836e-06, |
|
"loss": 1.1697, |
|
"step": 118500 |
|
}, |
|
{ |
|
"epoch": 15.143802494273352, |
|
"grad_norm": 81.51799774169922, |
|
"learning_rate": 5.17369540178967e-06, |
|
"loss": 1.1819, |
|
"step": 119000 |
|
}, |
|
{ |
|
"epoch": 15.207431916518198, |
|
"grad_norm": 81.38251495361328, |
|
"learning_rate": 5.151546026402056e-06, |
|
"loss": 1.1916, |
|
"step": 119500 |
|
}, |
|
{ |
|
"epoch": 15.271061338763044, |
|
"grad_norm": 87.31340789794922, |
|
"learning_rate": 5.129396651014442e-06, |
|
"loss": 1.1829, |
|
"step": 120000 |
|
}, |
|
{ |
|
"epoch": 15.33469076100789, |
|
"grad_norm": 67.25629425048828, |
|
"learning_rate": 5.107247275626828e-06, |
|
"loss": 1.1632, |
|
"step": 120500 |
|
}, |
|
{ |
|
"epoch": 15.398320183252736, |
|
"grad_norm": 56.04712677001953, |
|
"learning_rate": 5.085097900239213e-06, |
|
"loss": 1.1809, |
|
"step": 121000 |
|
}, |
|
{ |
|
"epoch": 15.461949605497582, |
|
"grad_norm": 66.33815002441406, |
|
"learning_rate": 5.0629928236023755e-06, |
|
"loss": 1.1913, |
|
"step": 121500 |
|
}, |
|
{ |
|
"epoch": 15.525579027742427, |
|
"grad_norm": 69.98699951171875, |
|
"learning_rate": 5.04084344821476e-06, |
|
"loss": 1.1916, |
|
"step": 122000 |
|
}, |
|
{ |
|
"epoch": 15.589208449987273, |
|
"grad_norm": 70.65410614013672, |
|
"learning_rate": 5.018694072827147e-06, |
|
"loss": 1.1969, |
|
"step": 122500 |
|
}, |
|
{ |
|
"epoch": 15.65283787223212, |
|
"grad_norm": 68.66796875, |
|
"learning_rate": 4.996544697439532e-06, |
|
"loss": 1.1929, |
|
"step": 123000 |
|
}, |
|
{ |
|
"epoch": 15.716467294476967, |
|
"grad_norm": 68.35984802246094, |
|
"learning_rate": 4.974439620802694e-06, |
|
"loss": 1.2086, |
|
"step": 123500 |
|
}, |
|
{ |
|
"epoch": 15.780096716721813, |
|
"grad_norm": 64.63552856445312, |
|
"learning_rate": 4.952290245415079e-06, |
|
"loss": 1.1864, |
|
"step": 124000 |
|
}, |
|
{ |
|
"epoch": 15.843726138966659, |
|
"grad_norm": 59.172645568847656, |
|
"learning_rate": 4.930140870027466e-06, |
|
"loss": 1.2068, |
|
"step": 124500 |
|
}, |
|
{ |
|
"epoch": 15.907355561211505, |
|
"grad_norm": 64.2562255859375, |
|
"learning_rate": 4.907991494639851e-06, |
|
"loss": 1.2253, |
|
"step": 125000 |
|
}, |
|
{ |
|
"epoch": 15.97098498345635, |
|
"grad_norm": 61.80392837524414, |
|
"learning_rate": 4.885842119252238e-06, |
|
"loss": 1.1963, |
|
"step": 125500 |
|
}, |
|
{ |
|
"epoch": 16.034614405701195, |
|
"grad_norm": 86.79552459716797, |
|
"learning_rate": 4.8636927438646234e-06, |
|
"loss": 1.1585, |
|
"step": 126000 |
|
}, |
|
{ |
|
"epoch": 16.098243827946042, |
|
"grad_norm": 81.79373931884766, |
|
"learning_rate": 4.841543368477009e-06, |
|
"loss": 1.0834, |
|
"step": 126500 |
|
}, |
|
{ |
|
"epoch": 16.161873250190887, |
|
"grad_norm": 60.24835205078125, |
|
"learning_rate": 4.8193939930893955e-06, |
|
"loss": 1.0937, |
|
"step": 127000 |
|
}, |
|
{ |
|
"epoch": 16.225502672435734, |
|
"grad_norm": 74.93160247802734, |
|
"learning_rate": 4.797244617701781e-06, |
|
"loss": 1.0995, |
|
"step": 127500 |
|
}, |
|
{ |
|
"epoch": 16.289132094680582, |
|
"grad_norm": 75.08971405029297, |
|
"learning_rate": 4.775095242314168e-06, |
|
"loss": 1.0787, |
|
"step": 128000 |
|
}, |
|
{ |
|
"epoch": 16.352761516925426, |
|
"grad_norm": 66.41687774658203, |
|
"learning_rate": 4.752990165677328e-06, |
|
"loss": 1.1217, |
|
"step": 128500 |
|
}, |
|
{ |
|
"epoch": 16.416390939170274, |
|
"grad_norm": 68.62983703613281, |
|
"learning_rate": 4.730840790289714e-06, |
|
"loss": 1.1185, |
|
"step": 129000 |
|
}, |
|
{ |
|
"epoch": 16.480020361415118, |
|
"grad_norm": 67.19387817382812, |
|
"learning_rate": 4.7086914149021e-06, |
|
"loss": 1.1203, |
|
"step": 129500 |
|
}, |
|
{ |
|
"epoch": 16.543649783659966, |
|
"grad_norm": 84.18933868408203, |
|
"learning_rate": 4.686542039514486e-06, |
|
"loss": 1.1201, |
|
"step": 130000 |
|
}, |
|
{ |
|
"epoch": 16.60727920590481, |
|
"grad_norm": 56.41159439086914, |
|
"learning_rate": 4.664392664126872e-06, |
|
"loss": 1.125, |
|
"step": 130500 |
|
}, |
|
{ |
|
"epoch": 16.670908628149657, |
|
"grad_norm": 90.81446075439453, |
|
"learning_rate": 4.642376184991584e-06, |
|
"loss": 1.1214, |
|
"step": 131000 |
|
}, |
|
{ |
|
"epoch": 16.7345380503945, |
|
"grad_norm": 66.80113220214844, |
|
"learning_rate": 4.62022680960397e-06, |
|
"loss": 1.1228, |
|
"step": 131500 |
|
}, |
|
{ |
|
"epoch": 16.79816747263935, |
|
"grad_norm": 68.8161849975586, |
|
"learning_rate": 4.598077434216355e-06, |
|
"loss": 1.1381, |
|
"step": 132000 |
|
}, |
|
{ |
|
"epoch": 16.861796894884193, |
|
"grad_norm": 80.11527252197266, |
|
"learning_rate": 4.575928058828742e-06, |
|
"loss": 1.1414, |
|
"step": 132500 |
|
}, |
|
{ |
|
"epoch": 16.92542631712904, |
|
"grad_norm": 64.7822036743164, |
|
"learning_rate": 4.553778683441127e-06, |
|
"loss": 1.123, |
|
"step": 133000 |
|
}, |
|
{ |
|
"epoch": 16.989055739373885, |
|
"grad_norm": 65.32453918457031, |
|
"learning_rate": 4.531629308053514e-06, |
|
"loss": 1.1003, |
|
"step": 133500 |
|
}, |
|
{ |
|
"epoch": 17.052685161618733, |
|
"grad_norm": 91.62205505371094, |
|
"learning_rate": 4.5094799326658994e-06, |
|
"loss": 1.0447, |
|
"step": 134000 |
|
}, |
|
{ |
|
"epoch": 17.116314583863577, |
|
"grad_norm": 49.810699462890625, |
|
"learning_rate": 4.487330557278285e-06, |
|
"loss": 1.036, |
|
"step": 134500 |
|
}, |
|
{ |
|
"epoch": 17.179944006108425, |
|
"grad_norm": 64.78093719482422, |
|
"learning_rate": 4.465181181890671e-06, |
|
"loss": 1.0264, |
|
"step": 135000 |
|
}, |
|
{ |
|
"epoch": 17.24357342835327, |
|
"grad_norm": 61.86587905883789, |
|
"learning_rate": 4.443031806503057e-06, |
|
"loss": 1.0375, |
|
"step": 135500 |
|
}, |
|
{ |
|
"epoch": 17.307202850598117, |
|
"grad_norm": 57.83167266845703, |
|
"learning_rate": 4.420882431115443e-06, |
|
"loss": 1.0509, |
|
"step": 136000 |
|
}, |
|
{ |
|
"epoch": 17.370832272842964, |
|
"grad_norm": 81.06472778320312, |
|
"learning_rate": 4.398733055727829e-06, |
|
"loss": 1.0452, |
|
"step": 136500 |
|
}, |
|
{ |
|
"epoch": 17.43446169508781, |
|
"grad_norm": 77.34078979492188, |
|
"learning_rate": 4.376583680340215e-06, |
|
"loss": 1.0519, |
|
"step": 137000 |
|
}, |
|
{ |
|
"epoch": 17.498091117332656, |
|
"grad_norm": 75.93341064453125, |
|
"learning_rate": 4.3544343049526005e-06, |
|
"loss": 1.0498, |
|
"step": 137500 |
|
}, |
|
{ |
|
"epoch": 17.5617205395775, |
|
"grad_norm": 56.93536376953125, |
|
"learning_rate": 4.332284929564987e-06, |
|
"loss": 1.0514, |
|
"step": 138000 |
|
}, |
|
{ |
|
"epoch": 17.625349961822348, |
|
"grad_norm": 63.56499481201172, |
|
"learning_rate": 4.310179852928148e-06, |
|
"loss": 1.054, |
|
"step": 138500 |
|
}, |
|
{ |
|
"epoch": 17.688979384067192, |
|
"grad_norm": 71.97218322753906, |
|
"learning_rate": 4.288030477540534e-06, |
|
"loss": 1.0457, |
|
"step": 139000 |
|
}, |
|
{ |
|
"epoch": 17.75260880631204, |
|
"grad_norm": 63.93644332885742, |
|
"learning_rate": 4.265925400903695e-06, |
|
"loss": 1.0582, |
|
"step": 139500 |
|
}, |
|
{ |
|
"epoch": 17.816238228556884, |
|
"grad_norm": 60.03602981567383, |
|
"learning_rate": 4.243776025516081e-06, |
|
"loss": 1.0566, |
|
"step": 140000 |
|
}, |
|
{ |
|
"epoch": 17.87986765080173, |
|
"grad_norm": 66.63105010986328, |
|
"learning_rate": 4.221626650128467e-06, |
|
"loss": 1.0644, |
|
"step": 140500 |
|
}, |
|
{ |
|
"epoch": 17.943497073046576, |
|
"grad_norm": 66.42560577392578, |
|
"learning_rate": 4.199477274740853e-06, |
|
"loss": 1.0579, |
|
"step": 141000 |
|
}, |
|
{ |
|
"epoch": 18.007126495291423, |
|
"grad_norm": 58.22215270996094, |
|
"learning_rate": 4.1773278993532385e-06, |
|
"loss": 1.0647, |
|
"step": 141500 |
|
}, |
|
{ |
|
"epoch": 18.070755917536268, |
|
"grad_norm": 63.40778350830078, |
|
"learning_rate": 4.155178523965624e-06, |
|
"loss": 0.9704, |
|
"step": 142000 |
|
}, |
|
{ |
|
"epoch": 18.134385339781115, |
|
"grad_norm": 67.59272003173828, |
|
"learning_rate": 4.1330291485780105e-06, |
|
"loss": 0.9787, |
|
"step": 142500 |
|
}, |
|
{ |
|
"epoch": 18.19801476202596, |
|
"grad_norm": 58.86367416381836, |
|
"learning_rate": 4.110879773190396e-06, |
|
"loss": 0.9875, |
|
"step": 143000 |
|
}, |
|
{ |
|
"epoch": 18.261644184270807, |
|
"grad_norm": 72.68678283691406, |
|
"learning_rate": 4.088730397802782e-06, |
|
"loss": 0.987, |
|
"step": 143500 |
|
}, |
|
{ |
|
"epoch": 18.32527360651565, |
|
"grad_norm": 69.95580291748047, |
|
"learning_rate": 4.066581022415168e-06, |
|
"loss": 0.9834, |
|
"step": 144000 |
|
}, |
|
{ |
|
"epoch": 18.3889030287605, |
|
"grad_norm": 63.809104919433594, |
|
"learning_rate": 4.044431647027554e-06, |
|
"loss": 0.999, |
|
"step": 144500 |
|
}, |
|
{ |
|
"epoch": 18.452532451005347, |
|
"grad_norm": 76.64576721191406, |
|
"learning_rate": 4.02228227163994e-06, |
|
"loss": 0.9872, |
|
"step": 145000 |
|
}, |
|
{ |
|
"epoch": 18.51616187325019, |
|
"grad_norm": 54.77001953125, |
|
"learning_rate": 4.000177195003101e-06, |
|
"loss": 0.9851, |
|
"step": 145500 |
|
}, |
|
{ |
|
"epoch": 18.57979129549504, |
|
"grad_norm": 67.22696685791016, |
|
"learning_rate": 3.978027819615487e-06, |
|
"loss": 0.9986, |
|
"step": 146000 |
|
}, |
|
{ |
|
"epoch": 18.643420717739883, |
|
"grad_norm": 69.88746643066406, |
|
"learning_rate": 3.955878444227873e-06, |
|
"loss": 0.9853, |
|
"step": 146500 |
|
}, |
|
{ |
|
"epoch": 18.70705013998473, |
|
"grad_norm": 66.42214965820312, |
|
"learning_rate": 3.933729068840259e-06, |
|
"loss": 0.9973, |
|
"step": 147000 |
|
}, |
|
{ |
|
"epoch": 18.770679562229574, |
|
"grad_norm": 75.5511245727539, |
|
"learning_rate": 3.9116682909541955e-06, |
|
"loss": 0.988, |
|
"step": 147500 |
|
}, |
|
{ |
|
"epoch": 18.834308984474422, |
|
"grad_norm": 73.2605209350586, |
|
"learning_rate": 3.889518915566581e-06, |
|
"loss": 0.999, |
|
"step": 148000 |
|
}, |
|
{ |
|
"epoch": 18.897938406719266, |
|
"grad_norm": 63.08274841308594, |
|
"learning_rate": 3.8673695401789675e-06, |
|
"loss": 0.9899, |
|
"step": 148500 |
|
}, |
|
{ |
|
"epoch": 18.961567828964114, |
|
"grad_norm": 98.51166534423828, |
|
"learning_rate": 3.845220164791353e-06, |
|
"loss": 1.0053, |
|
"step": 149000 |
|
}, |
|
{ |
|
"epoch": 19.025197251208958, |
|
"grad_norm": 67.6368408203125, |
|
"learning_rate": 3.823070789403739e-06, |
|
"loss": 0.9802, |
|
"step": 149500 |
|
}, |
|
{ |
|
"epoch": 19.088826673453806, |
|
"grad_norm": 71.1702880859375, |
|
"learning_rate": 3.800921414016125e-06, |
|
"loss": 0.9301, |
|
"step": 150000 |
|
}, |
|
{ |
|
"epoch": 19.15245609569865, |
|
"grad_norm": 74.88888549804688, |
|
"learning_rate": 3.778772038628511e-06, |
|
"loss": 0.9295, |
|
"step": 150500 |
|
}, |
|
{ |
|
"epoch": 19.216085517943498, |
|
"grad_norm": 49.797691345214844, |
|
"learning_rate": 3.756622663240897e-06, |
|
"loss": 0.9334, |
|
"step": 151000 |
|
}, |
|
{ |
|
"epoch": 19.27971494018834, |
|
"grad_norm": 49.12934875488281, |
|
"learning_rate": 3.734473287853283e-06, |
|
"loss": 0.9503, |
|
"step": 151500 |
|
}, |
|
{ |
|
"epoch": 19.34334436243319, |
|
"grad_norm": 47.530452728271484, |
|
"learning_rate": 3.712323912465669e-06, |
|
"loss": 0.9161, |
|
"step": 152000 |
|
}, |
|
{ |
|
"epoch": 19.406973784678033, |
|
"grad_norm": 69.1083984375, |
|
"learning_rate": 3.6901745370780546e-06, |
|
"loss": 0.9433, |
|
"step": 152500 |
|
}, |
|
{ |
|
"epoch": 19.47060320692288, |
|
"grad_norm": 62.554718017578125, |
|
"learning_rate": 3.6680251616904407e-06, |
|
"loss": 0.9376, |
|
"step": 153000 |
|
}, |
|
{ |
|
"epoch": 19.53423262916773, |
|
"grad_norm": 55.151100158691406, |
|
"learning_rate": 3.645920085053602e-06, |
|
"loss": 0.9274, |
|
"step": 153500 |
|
}, |
|
{ |
|
"epoch": 19.597862051412573, |
|
"grad_norm": 60.6050910949707, |
|
"learning_rate": 3.623770709665988e-06, |
|
"loss": 0.9414, |
|
"step": 154000 |
|
}, |
|
{ |
|
"epoch": 19.66149147365742, |
|
"grad_norm": 55.62131118774414, |
|
"learning_rate": 3.6016213342783736e-06, |
|
"loss": 0.94, |
|
"step": 154500 |
|
}, |
|
{ |
|
"epoch": 19.725120895902265, |
|
"grad_norm": 59.69659423828125, |
|
"learning_rate": 3.5794719588907597e-06, |
|
"loss": 0.9344, |
|
"step": 155000 |
|
}, |
|
{ |
|
"epoch": 19.788750318147112, |
|
"grad_norm": 46.444984436035156, |
|
"learning_rate": 3.557366882253921e-06, |
|
"loss": 0.9464, |
|
"step": 155500 |
|
}, |
|
{ |
|
"epoch": 19.852379740391957, |
|
"grad_norm": 63.4849739074707, |
|
"learning_rate": 3.535217506866307e-06, |
|
"loss": 0.9583, |
|
"step": 156000 |
|
}, |
|
{ |
|
"epoch": 19.916009162636804, |
|
"grad_norm": 69.28148651123047, |
|
"learning_rate": 3.5130681314786926e-06, |
|
"loss": 0.953, |
|
"step": 156500 |
|
}, |
|
{ |
|
"epoch": 19.97963858488165, |
|
"grad_norm": 127.65802764892578, |
|
"learning_rate": 3.4909187560910782e-06, |
|
"loss": 0.9481, |
|
"step": 157000 |
|
}, |
|
{ |
|
"epoch": 20.043268007126496, |
|
"grad_norm": 64.03028106689453, |
|
"learning_rate": 3.46881367945424e-06, |
|
"loss": 0.8982, |
|
"step": 157500 |
|
}, |
|
{ |
|
"epoch": 20.10689742937134, |
|
"grad_norm": 68.30170440673828, |
|
"learning_rate": 3.446664304066625e-06, |
|
"loss": 0.8974, |
|
"step": 158000 |
|
}, |
|
{ |
|
"epoch": 20.170526851616188, |
|
"grad_norm": 44.01250457763672, |
|
"learning_rate": 3.424514928679011e-06, |
|
"loss": 0.9022, |
|
"step": 158500 |
|
}, |
|
{ |
|
"epoch": 20.234156273861032, |
|
"grad_norm": 65.26950073242188, |
|
"learning_rate": 3.4023655532913972e-06, |
|
"loss": 0.8923, |
|
"step": 159000 |
|
}, |
|
{ |
|
"epoch": 20.29778569610588, |
|
"grad_norm": 46.552730560302734, |
|
"learning_rate": 3.380260476654559e-06, |
|
"loss": 0.8935, |
|
"step": 159500 |
|
}, |
|
{ |
|
"epoch": 20.361415118350724, |
|
"grad_norm": 59.73283767700195, |
|
"learning_rate": 3.358111101266944e-06, |
|
"loss": 0.8917, |
|
"step": 160000 |
|
}, |
|
{ |
|
"epoch": 20.42504454059557, |
|
"grad_norm": 84.19660186767578, |
|
"learning_rate": 3.33596172587933e-06, |
|
"loss": 0.9021, |
|
"step": 160500 |
|
}, |
|
{ |
|
"epoch": 20.488673962840416, |
|
"grad_norm": 48.705833435058594, |
|
"learning_rate": 3.3138123504917162e-06, |
|
"loss": 0.8978, |
|
"step": 161000 |
|
}, |
|
{ |
|
"epoch": 20.552303385085263, |
|
"grad_norm": 44.999656677246094, |
|
"learning_rate": 3.2916629751041023e-06, |
|
"loss": 0.9078, |
|
"step": 161500 |
|
}, |
|
{ |
|
"epoch": 20.61593280733011, |
|
"grad_norm": 62.21163558959961, |
|
"learning_rate": 3.2695135997164883e-06, |
|
"loss": 0.903, |
|
"step": 162000 |
|
}, |
|
{ |
|
"epoch": 20.679562229574955, |
|
"grad_norm": 61.40314483642578, |
|
"learning_rate": 3.247408523079649e-06, |
|
"loss": 0.8989, |
|
"step": 162500 |
|
}, |
|
{ |
|
"epoch": 20.743191651819803, |
|
"grad_norm": 55.93895721435547, |
|
"learning_rate": 3.2252591476920352e-06, |
|
"loss": 0.9023, |
|
"step": 163000 |
|
}, |
|
{ |
|
"epoch": 20.806821074064647, |
|
"grad_norm": 64.3861312866211, |
|
"learning_rate": 3.2031097723044213e-06, |
|
"loss": 0.8918, |
|
"step": 163500 |
|
}, |
|
{ |
|
"epoch": 20.870450496309495, |
|
"grad_norm": 92.62686157226562, |
|
"learning_rate": 3.1809603969168073e-06, |
|
"loss": 0.8968, |
|
"step": 164000 |
|
}, |
|
{ |
|
"epoch": 20.93407991855434, |
|
"grad_norm": 58.37923049926758, |
|
"learning_rate": 3.1588110215291934e-06, |
|
"loss": 0.8977, |
|
"step": 164500 |
|
}, |
|
{ |
|
"epoch": 20.997709340799187, |
|
"grad_norm": 49.36125564575195, |
|
"learning_rate": 3.136661646141579e-06, |
|
"loss": 0.9035, |
|
"step": 165000 |
|
}, |
|
{ |
|
"epoch": 21.06133876304403, |
|
"grad_norm": 69.00907135009766, |
|
"learning_rate": 3.114512270753965e-06, |
|
"loss": 0.8347, |
|
"step": 165500 |
|
}, |
|
{ |
|
"epoch": 21.12496818528888, |
|
"grad_norm": 56.581600189208984, |
|
"learning_rate": 3.0924071941171263e-06, |
|
"loss": 0.8415, |
|
"step": 166000 |
|
}, |
|
{ |
|
"epoch": 21.188597607533723, |
|
"grad_norm": 56.39949417114258, |
|
"learning_rate": 3.0702578187295124e-06, |
|
"loss": 0.8472, |
|
"step": 166500 |
|
}, |
|
{ |
|
"epoch": 21.25222702977857, |
|
"grad_norm": 58.36819839477539, |
|
"learning_rate": 3.048108443341898e-06, |
|
"loss": 0.8663, |
|
"step": 167000 |
|
}, |
|
{ |
|
"epoch": 21.315856452023414, |
|
"grad_norm": 59.97529602050781, |
|
"learning_rate": 3.025959067954284e-06, |
|
"loss": 0.8633, |
|
"step": 167500 |
|
}, |
|
{ |
|
"epoch": 21.379485874268262, |
|
"grad_norm": 52.96846008300781, |
|
"learning_rate": 3.00380969256667e-06, |
|
"loss": 0.8569, |
|
"step": 168000 |
|
}, |
|
{ |
|
"epoch": 21.443115296513106, |
|
"grad_norm": 52.254085540771484, |
|
"learning_rate": 2.981660317179056e-06, |
|
"loss": 0.8529, |
|
"step": 168500 |
|
}, |
|
{ |
|
"epoch": 21.506744718757954, |
|
"grad_norm": 80.7918472290039, |
|
"learning_rate": 2.959555240542217e-06, |
|
"loss": 0.8485, |
|
"step": 169000 |
|
}, |
|
{ |
|
"epoch": 21.570374141002798, |
|
"grad_norm": 59.65958023071289, |
|
"learning_rate": 2.9374058651546026e-06, |
|
"loss": 0.8759, |
|
"step": 169500 |
|
}, |
|
{ |
|
"epoch": 21.634003563247646, |
|
"grad_norm": 48.33919906616211, |
|
"learning_rate": 2.9152564897669886e-06, |
|
"loss": 0.8667, |
|
"step": 170000 |
|
}, |
|
{ |
|
"epoch": 21.697632985492493, |
|
"grad_norm": 68.83987426757812, |
|
"learning_rate": 2.8931071143793747e-06, |
|
"loss": 0.8615, |
|
"step": 170500 |
|
}, |
|
{ |
|
"epoch": 21.761262407737338, |
|
"grad_norm": 42.605552673339844, |
|
"learning_rate": 2.8709577389917607e-06, |
|
"loss": 0.8623, |
|
"step": 171000 |
|
}, |
|
{ |
|
"epoch": 21.824891829982185, |
|
"grad_norm": 57.37046432495117, |
|
"learning_rate": 2.8488083636041464e-06, |
|
"loss": 0.8613, |
|
"step": 171500 |
|
}, |
|
{ |
|
"epoch": 21.88852125222703, |
|
"grad_norm": 66.89559173583984, |
|
"learning_rate": 2.8266589882165324e-06, |
|
"loss": 0.8515, |
|
"step": 172000 |
|
}, |
|
{ |
|
"epoch": 21.952150674471877, |
|
"grad_norm": 53.939571380615234, |
|
"learning_rate": 2.8045096128289184e-06, |
|
"loss": 0.8615, |
|
"step": 172500 |
|
}, |
|
{ |
|
"epoch": 22.01578009671672, |
|
"grad_norm": 61.67373275756836, |
|
"learning_rate": 2.7824045361920797e-06, |
|
"loss": 0.8457, |
|
"step": 173000 |
|
}, |
|
{ |
|
"epoch": 22.07940951896157, |
|
"grad_norm": 71.31520080566406, |
|
"learning_rate": 2.7602551608044653e-06, |
|
"loss": 0.8106, |
|
"step": 173500 |
|
}, |
|
{ |
|
"epoch": 22.143038941206413, |
|
"grad_norm": 44.70698165893555, |
|
"learning_rate": 2.7381057854168514e-06, |
|
"loss": 0.8109, |
|
"step": 174000 |
|
}, |
|
{ |
|
"epoch": 22.20666836345126, |
|
"grad_norm": 43.95622253417969, |
|
"learning_rate": 2.7159564100292374e-06, |
|
"loss": 0.8108, |
|
"step": 174500 |
|
}, |
|
{ |
|
"epoch": 22.270297785696105, |
|
"grad_norm": 55.156822204589844, |
|
"learning_rate": 2.6938513333923987e-06, |
|
"loss": 0.8197, |
|
"step": 175000 |
|
}, |
|
{ |
|
"epoch": 22.333927207940953, |
|
"grad_norm": 85.59542846679688, |
|
"learning_rate": 2.6717019580047843e-06, |
|
"loss": 0.8165, |
|
"step": 175500 |
|
}, |
|
{ |
|
"epoch": 22.397556630185797, |
|
"grad_norm": 65.07913208007812, |
|
"learning_rate": 2.6495525826171704e-06, |
|
"loss": 0.8289, |
|
"step": 176000 |
|
}, |
|
{ |
|
"epoch": 22.461186052430644, |
|
"grad_norm": 65.89120483398438, |
|
"learning_rate": 2.6274032072295564e-06, |
|
"loss": 0.8288, |
|
"step": 176500 |
|
}, |
|
{ |
|
"epoch": 22.52481547467549, |
|
"grad_norm": 55.914939880371094, |
|
"learning_rate": 2.6052981305927177e-06, |
|
"loss": 0.8145, |
|
"step": 177000 |
|
}, |
|
{ |
|
"epoch": 22.588444896920336, |
|
"grad_norm": 80.44625854492188, |
|
"learning_rate": 2.5831487552051033e-06, |
|
"loss": 0.8249, |
|
"step": 177500 |
|
}, |
|
{ |
|
"epoch": 22.65207431916518, |
|
"grad_norm": 65.78691101074219, |
|
"learning_rate": 2.5609993798174894e-06, |
|
"loss": 0.8218, |
|
"step": 178000 |
|
}, |
|
{ |
|
"epoch": 22.715703741410028, |
|
"grad_norm": 41.24105453491211, |
|
"learning_rate": 2.5388500044298754e-06, |
|
"loss": 0.8284, |
|
"step": 178500 |
|
}, |
|
{ |
|
"epoch": 22.779333163654876, |
|
"grad_norm": 64.46809387207031, |
|
"learning_rate": 2.5167892265438115e-06, |
|
"loss": 0.833, |
|
"step": 179000 |
|
}, |
|
{ |
|
"epoch": 22.84296258589972, |
|
"grad_norm": 56.47655487060547, |
|
"learning_rate": 2.4946398511561976e-06, |
|
"loss": 0.8176, |
|
"step": 179500 |
|
}, |
|
{ |
|
"epoch": 22.906592008144568, |
|
"grad_norm": 71.83133697509766, |
|
"learning_rate": 2.4724904757685836e-06, |
|
"loss": 0.8431, |
|
"step": 180000 |
|
}, |
|
{ |
|
"epoch": 22.97022143038941, |
|
"grad_norm": 53.66551971435547, |
|
"learning_rate": 2.450385399131745e-06, |
|
"loss": 0.8234, |
|
"step": 180500 |
|
}, |
|
{ |
|
"epoch": 23.03385085263426, |
|
"grad_norm": 78.90325927734375, |
|
"learning_rate": 2.4282360237441305e-06, |
|
"loss": 0.7998, |
|
"step": 181000 |
|
}, |
|
{ |
|
"epoch": 23.097480274879103, |
|
"grad_norm": 64.31370544433594, |
|
"learning_rate": 2.4060866483565166e-06, |
|
"loss": 0.7821, |
|
"step": 181500 |
|
}, |
|
{ |
|
"epoch": 23.16110969712395, |
|
"grad_norm": 48.68030548095703, |
|
"learning_rate": 2.3839372729689026e-06, |
|
"loss": 0.7914, |
|
"step": 182000 |
|
}, |
|
{ |
|
"epoch": 23.224739119368795, |
|
"grad_norm": 52.06983184814453, |
|
"learning_rate": 2.3617878975812882e-06, |
|
"loss": 0.7851, |
|
"step": 182500 |
|
}, |
|
{ |
|
"epoch": 23.288368541613643, |
|
"grad_norm": 50.310157775878906, |
|
"learning_rate": 2.3396385221936743e-06, |
|
"loss": 0.7797, |
|
"step": 183000 |
|
}, |
|
{ |
|
"epoch": 23.351997963858487, |
|
"grad_norm": 52.41871643066406, |
|
"learning_rate": 2.3174891468060603e-06, |
|
"loss": 0.7931, |
|
"step": 183500 |
|
}, |
|
{ |
|
"epoch": 23.415627386103335, |
|
"grad_norm": 88.78260040283203, |
|
"learning_rate": 2.295339771418446e-06, |
|
"loss": 0.7912, |
|
"step": 184000 |
|
}, |
|
{ |
|
"epoch": 23.47925680834818, |
|
"grad_norm": 62.528663635253906, |
|
"learning_rate": 2.273190396030832e-06, |
|
"loss": 0.7876, |
|
"step": 184500 |
|
}, |
|
{ |
|
"epoch": 23.542886230593027, |
|
"grad_norm": 46.27097702026367, |
|
"learning_rate": 2.251041020643218e-06, |
|
"loss": 0.7954, |
|
"step": 185000 |
|
}, |
|
{ |
|
"epoch": 23.60651565283787, |
|
"grad_norm": 50.20694351196289, |
|
"learning_rate": 2.228891645255604e-06, |
|
"loss": 0.7946, |
|
"step": 185500 |
|
}, |
|
{ |
|
"epoch": 23.67014507508272, |
|
"grad_norm": 56.892765045166016, |
|
"learning_rate": 2.20674226986799e-06, |
|
"loss": 0.7782, |
|
"step": 186000 |
|
}, |
|
{ |
|
"epoch": 23.733774497327563, |
|
"grad_norm": 41.52644729614258, |
|
"learning_rate": 2.184637193231151e-06, |
|
"loss": 0.7952, |
|
"step": 186500 |
|
}, |
|
{ |
|
"epoch": 23.79740391957241, |
|
"grad_norm": 58.025516510009766, |
|
"learning_rate": 2.162487817843537e-06, |
|
"loss": 0.8015, |
|
"step": 187000 |
|
}, |
|
{ |
|
"epoch": 23.861033341817258, |
|
"grad_norm": 48.62569046020508, |
|
"learning_rate": 2.140338442455923e-06, |
|
"loss": 0.7977, |
|
"step": 187500 |
|
}, |
|
{ |
|
"epoch": 23.924662764062102, |
|
"grad_norm": 46.91473388671875, |
|
"learning_rate": 2.1181890670683087e-06, |
|
"loss": 0.7875, |
|
"step": 188000 |
|
}, |
|
{ |
|
"epoch": 23.98829218630695, |
|
"grad_norm": 52.42847442626953, |
|
"learning_rate": 2.09608399043147e-06, |
|
"loss": 0.7935, |
|
"step": 188500 |
|
}, |
|
{ |
|
"epoch": 24.051921608551794, |
|
"grad_norm": 76.6783676147461, |
|
"learning_rate": 2.073934615043856e-06, |
|
"loss": 0.7617, |
|
"step": 189000 |
|
}, |
|
{ |
|
"epoch": 24.11555103079664, |
|
"grad_norm": 67.17424011230469, |
|
"learning_rate": 2.0517852396562417e-06, |
|
"loss": 0.7625, |
|
"step": 189500 |
|
}, |
|
{ |
|
"epoch": 24.179180453041486, |
|
"grad_norm": 50.021053314208984, |
|
"learning_rate": 2.0296358642686277e-06, |
|
"loss": 0.7514, |
|
"step": 190000 |
|
}, |
|
{ |
|
"epoch": 24.242809875286333, |
|
"grad_norm": 53.048465728759766, |
|
"learning_rate": 2.0074864888810137e-06, |
|
"loss": 0.7662, |
|
"step": 190500 |
|
}, |
|
{ |
|
"epoch": 24.306439297531178, |
|
"grad_norm": 67.73706817626953, |
|
"learning_rate": 1.9854257109949503e-06, |
|
"loss": 0.7692, |
|
"step": 191000 |
|
}, |
|
{ |
|
"epoch": 24.370068719776025, |
|
"grad_norm": 57.47793960571289, |
|
"learning_rate": 1.9632763356073363e-06, |
|
"loss": 0.7733, |
|
"step": 191500 |
|
}, |
|
{ |
|
"epoch": 24.43369814202087, |
|
"grad_norm": 59.039405822753906, |
|
"learning_rate": 1.941126960219722e-06, |
|
"loss": 0.7561, |
|
"step": 192000 |
|
}, |
|
{ |
|
"epoch": 24.497327564265717, |
|
"grad_norm": 46.10505676269531, |
|
"learning_rate": 1.918977584832108e-06, |
|
"loss": 0.7577, |
|
"step": 192500 |
|
}, |
|
{ |
|
"epoch": 24.56095698651056, |
|
"grad_norm": 85.20184326171875, |
|
"learning_rate": 1.8968282094444936e-06, |
|
"loss": 0.7687, |
|
"step": 193000 |
|
}, |
|
{ |
|
"epoch": 24.62458640875541, |
|
"grad_norm": 53.42023849487305, |
|
"learning_rate": 1.8746788340568796e-06, |
|
"loss": 0.7647, |
|
"step": 193500 |
|
}, |
|
{ |
|
"epoch": 24.688215831000253, |
|
"grad_norm": 63.070919036865234, |
|
"learning_rate": 1.852573757420041e-06, |
|
"loss": 0.7717, |
|
"step": 194000 |
|
}, |
|
{ |
|
"epoch": 24.7518452532451, |
|
"grad_norm": 59.02709197998047, |
|
"learning_rate": 1.830424382032427e-06, |
|
"loss": 0.761, |
|
"step": 194500 |
|
}, |
|
{ |
|
"epoch": 24.815474675489945, |
|
"grad_norm": 47.43505859375, |
|
"learning_rate": 1.8082750066448126e-06, |
|
"loss": 0.7661, |
|
"step": 195000 |
|
}, |
|
{ |
|
"epoch": 24.879104097734793, |
|
"grad_norm": 79.37848663330078, |
|
"learning_rate": 1.7861256312571986e-06, |
|
"loss": 0.7446, |
|
"step": 195500 |
|
}, |
|
{ |
|
"epoch": 24.94273351997964, |
|
"grad_norm": 51.29045104980469, |
|
"learning_rate": 1.7639762558695847e-06, |
|
"loss": 0.7659, |
|
"step": 196000 |
|
}, |
|
{ |
|
"epoch": 25.006362942224484, |
|
"grad_norm": 43.27066421508789, |
|
"learning_rate": 1.7418711792327458e-06, |
|
"loss": 0.7559, |
|
"step": 196500 |
|
}, |
|
{ |
|
"epoch": 25.069992364469332, |
|
"grad_norm": 45.82556915283203, |
|
"learning_rate": 1.7197218038451316e-06, |
|
"loss": 0.7183, |
|
"step": 197000 |
|
}, |
|
{ |
|
"epoch": 25.133621786714176, |
|
"grad_norm": 61.48518753051758, |
|
"learning_rate": 1.6975724284575176e-06, |
|
"loss": 0.7399, |
|
"step": 197500 |
|
}, |
|
{ |
|
"epoch": 25.197251208959024, |
|
"grad_norm": 56.30770492553711, |
|
"learning_rate": 1.6754230530699037e-06, |
|
"loss": 0.7308, |
|
"step": 198000 |
|
}, |
|
{ |
|
"epoch": 25.260880631203868, |
|
"grad_norm": 77.96941375732422, |
|
"learning_rate": 1.6532736776822895e-06, |
|
"loss": 0.733, |
|
"step": 198500 |
|
}, |
|
{ |
|
"epoch": 25.324510053448716, |
|
"grad_norm": 75.03907775878906, |
|
"learning_rate": 1.6311243022946753e-06, |
|
"loss": 0.746, |
|
"step": 199000 |
|
}, |
|
{ |
|
"epoch": 25.38813947569356, |
|
"grad_norm": 49.624366760253906, |
|
"learning_rate": 1.6089749269070614e-06, |
|
"loss": 0.7274, |
|
"step": 199500 |
|
}, |
|
{ |
|
"epoch": 25.451768897938408, |
|
"grad_norm": 54.31991195678711, |
|
"learning_rate": 1.5868255515194472e-06, |
|
"loss": 0.7358, |
|
"step": 200000 |
|
}, |
|
{ |
|
"epoch": 25.51539832018325, |
|
"grad_norm": 57.31879425048828, |
|
"learning_rate": 1.5646761761318333e-06, |
|
"loss": 0.7468, |
|
"step": 200500 |
|
}, |
|
{ |
|
"epoch": 25.5790277424281, |
|
"grad_norm": 51.115596771240234, |
|
"learning_rate": 1.5425710994949943e-06, |
|
"loss": 0.734, |
|
"step": 201000 |
|
}, |
|
{ |
|
"epoch": 25.642657164672944, |
|
"grad_norm": 68.8400650024414, |
|
"learning_rate": 1.5204660228581556e-06, |
|
"loss": 0.7493, |
|
"step": 201500 |
|
}, |
|
{ |
|
"epoch": 25.70628658691779, |
|
"grad_norm": 40.318153381347656, |
|
"learning_rate": 1.4983166474705415e-06, |
|
"loss": 0.7263, |
|
"step": 202000 |
|
}, |
|
{ |
|
"epoch": 25.769916009162635, |
|
"grad_norm": 63.84051513671875, |
|
"learning_rate": 1.4761672720829273e-06, |
|
"loss": 0.7355, |
|
"step": 202500 |
|
}, |
|
{ |
|
"epoch": 25.833545431407483, |
|
"grad_norm": 61.53810501098633, |
|
"learning_rate": 1.4540178966953133e-06, |
|
"loss": 0.745, |
|
"step": 203000 |
|
}, |
|
{ |
|
"epoch": 25.897174853652327, |
|
"grad_norm": 51.513187408447266, |
|
"learning_rate": 1.4318685213076994e-06, |
|
"loss": 0.7301, |
|
"step": 203500 |
|
}, |
|
{ |
|
"epoch": 25.960804275897175, |
|
"grad_norm": 65.9397201538086, |
|
"learning_rate": 1.4097191459200852e-06, |
|
"loss": 0.7457, |
|
"step": 204000 |
|
}, |
|
{ |
|
"epoch": 26.024433698142023, |
|
"grad_norm": 59.864540100097656, |
|
"learning_rate": 1.3875697705324713e-06, |
|
"loss": 0.7072, |
|
"step": 204500 |
|
}, |
|
{ |
|
"epoch": 26.088063120386867, |
|
"grad_norm": 52.43559646606445, |
|
"learning_rate": 1.3654203951448569e-06, |
|
"loss": 0.7212, |
|
"step": 205000 |
|
}, |
|
{ |
|
"epoch": 26.151692542631714, |
|
"grad_norm": 45.977447509765625, |
|
"learning_rate": 1.343315318508018e-06, |
|
"loss": 0.7186, |
|
"step": 205500 |
|
}, |
|
{ |
|
"epoch": 26.21532196487656, |
|
"grad_norm": 48.874996185302734, |
|
"learning_rate": 1.321165943120404e-06, |
|
"loss": 0.7225, |
|
"step": 206000 |
|
}, |
|
{ |
|
"epoch": 26.278951387121406, |
|
"grad_norm": 57.50956344604492, |
|
"learning_rate": 1.29901656773279e-06, |
|
"loss": 0.7065, |
|
"step": 206500 |
|
}, |
|
{ |
|
"epoch": 26.34258080936625, |
|
"grad_norm": 64.73326110839844, |
|
"learning_rate": 1.2768671923451759e-06, |
|
"loss": 0.7153, |
|
"step": 207000 |
|
}, |
|
{ |
|
"epoch": 26.406210231611098, |
|
"grad_norm": 53.96969223022461, |
|
"learning_rate": 1.254762115708337e-06, |
|
"loss": 0.72, |
|
"step": 207500 |
|
}, |
|
{ |
|
"epoch": 26.469839653855942, |
|
"grad_norm": 56.118255615234375, |
|
"learning_rate": 1.232612740320723e-06, |
|
"loss": 0.7074, |
|
"step": 208000 |
|
}, |
|
{ |
|
"epoch": 26.53346907610079, |
|
"grad_norm": 57.4562873840332, |
|
"learning_rate": 1.2104633649331088e-06, |
|
"loss": 0.7117, |
|
"step": 208500 |
|
}, |
|
{ |
|
"epoch": 26.597098498345634, |
|
"grad_norm": 60.367919921875, |
|
"learning_rate": 1.1883139895454949e-06, |
|
"loss": 0.7206, |
|
"step": 209000 |
|
}, |
|
{ |
|
"epoch": 26.66072792059048, |
|
"grad_norm": 55.18882369995117, |
|
"learning_rate": 1.166164614157881e-06, |
|
"loss": 0.7132, |
|
"step": 209500 |
|
}, |
|
{ |
|
"epoch": 26.724357342835326, |
|
"grad_norm": 48.3643798828125, |
|
"learning_rate": 1.144059537521042e-06, |
|
"loss": 0.7199, |
|
"step": 210000 |
|
}, |
|
{ |
|
"epoch": 26.787986765080174, |
|
"grad_norm": 50.825225830078125, |
|
"learning_rate": 1.1219101621334278e-06, |
|
"loss": 0.7102, |
|
"step": 210500 |
|
}, |
|
{ |
|
"epoch": 26.851616187325018, |
|
"grad_norm": 36.502899169921875, |
|
"learning_rate": 1.0997607867458139e-06, |
|
"loss": 0.7155, |
|
"step": 211000 |
|
}, |
|
{ |
|
"epoch": 26.915245609569865, |
|
"grad_norm": 58.10041809082031, |
|
"learning_rate": 1.0776114113581997e-06, |
|
"loss": 0.7057, |
|
"step": 211500 |
|
}, |
|
{ |
|
"epoch": 26.97887503181471, |
|
"grad_norm": 41.11175537109375, |
|
"learning_rate": 1.055506334721361e-06, |
|
"loss": 0.7191, |
|
"step": 212000 |
|
}, |
|
{ |
|
"epoch": 27.042504454059557, |
|
"grad_norm": 56.8629150390625, |
|
"learning_rate": 1.0333569593337468e-06, |
|
"loss": 0.6942, |
|
"step": 212500 |
|
}, |
|
{ |
|
"epoch": 27.106133876304405, |
|
"grad_norm": 43.03855514526367, |
|
"learning_rate": 1.011251882696908e-06, |
|
"loss": 0.6924, |
|
"step": 213000 |
|
}, |
|
{ |
|
"epoch": 27.16976329854925, |
|
"grad_norm": 41.03914260864258, |
|
"learning_rate": 9.89102507309294e-07, |
|
"loss": 0.7025, |
|
"step": 213500 |
|
}, |
|
{ |
|
"epoch": 27.233392720794097, |
|
"grad_norm": 44.40423583984375, |
|
"learning_rate": 9.6695313192168e-07, |
|
"loss": 0.6911, |
|
"step": 214000 |
|
}, |
|
{ |
|
"epoch": 27.29702214303894, |
|
"grad_norm": 48.28982925415039, |
|
"learning_rate": 9.448037565340658e-07, |
|
"loss": 0.6955, |
|
"step": 214500 |
|
}, |
|
{ |
|
"epoch": 27.36065156528379, |
|
"grad_norm": 58.85805130004883, |
|
"learning_rate": 9.226543811464518e-07, |
|
"loss": 0.6875, |
|
"step": 215000 |
|
}, |
|
{ |
|
"epoch": 27.424280987528633, |
|
"grad_norm": 58.64131164550781, |
|
"learning_rate": 9.005050057588377e-07, |
|
"loss": 0.698, |
|
"step": 215500 |
|
}, |
|
{ |
|
"epoch": 27.48791040977348, |
|
"grad_norm": 54.10153579711914, |
|
"learning_rate": 8.783999291219989e-07, |
|
"loss": 0.7054, |
|
"step": 216000 |
|
}, |
|
{ |
|
"epoch": 27.551539832018324, |
|
"grad_norm": 37.60294723510742, |
|
"learning_rate": 8.562505537343847e-07, |
|
"loss": 0.6968, |
|
"step": 216500 |
|
}, |
|
{ |
|
"epoch": 27.615169254263172, |
|
"grad_norm": 46.13175964355469, |
|
"learning_rate": 8.341011783467707e-07, |
|
"loss": 0.7044, |
|
"step": 217000 |
|
}, |
|
{ |
|
"epoch": 27.678798676508016, |
|
"grad_norm": 49.83407211303711, |
|
"learning_rate": 8.119518029591567e-07, |
|
"loss": 0.6946, |
|
"step": 217500 |
|
}, |
|
{ |
|
"epoch": 27.742428098752864, |
|
"grad_norm": 62.65508270263672, |
|
"learning_rate": 7.898024275715425e-07, |
|
"loss": 0.6865, |
|
"step": 218000 |
|
}, |
|
{ |
|
"epoch": 27.806057520997708, |
|
"grad_norm": 64.78981018066406, |
|
"learning_rate": 7.676530521839285e-07, |
|
"loss": 0.6974, |
|
"step": 218500 |
|
}, |
|
{ |
|
"epoch": 27.869686943242556, |
|
"grad_norm": 55.65605926513672, |
|
"learning_rate": 7.455479755470895e-07, |
|
"loss": 0.698, |
|
"step": 219000 |
|
}, |
|
{ |
|
"epoch": 27.9333163654874, |
|
"grad_norm": 51.40291976928711, |
|
"learning_rate": 7.233986001594756e-07, |
|
"loss": 0.6943, |
|
"step": 219500 |
|
}, |
|
{ |
|
"epoch": 27.996945787732248, |
|
"grad_norm": 52.821475982666016, |
|
"learning_rate": 7.012492247718615e-07, |
|
"loss": 0.6985, |
|
"step": 220000 |
|
}, |
|
{ |
|
"epoch": 28.06057520997709, |
|
"grad_norm": 105.65634155273438, |
|
"learning_rate": 6.790998493842474e-07, |
|
"loss": 0.6785, |
|
"step": 220500 |
|
}, |
|
{ |
|
"epoch": 28.12420463222194, |
|
"grad_norm": 66.97595977783203, |
|
"learning_rate": 6.569504739966333e-07, |
|
"loss": 0.6842, |
|
"step": 221000 |
|
}, |
|
{ |
|
"epoch": 28.187834054466787, |
|
"grad_norm": 77.8376693725586, |
|
"learning_rate": 6.348010986090193e-07, |
|
"loss": 0.6832, |
|
"step": 221500 |
|
}, |
|
{ |
|
"epoch": 28.25146347671163, |
|
"grad_norm": 68.83918762207031, |
|
"learning_rate": 6.126517232214052e-07, |
|
"loss": 0.6863, |
|
"step": 222000 |
|
}, |
|
{ |
|
"epoch": 28.31509289895648, |
|
"grad_norm": 49.16581344604492, |
|
"learning_rate": 5.905023478337911e-07, |
|
"loss": 0.6806, |
|
"step": 222500 |
|
}, |
|
{ |
|
"epoch": 28.378722321201323, |
|
"grad_norm": 58.93035888671875, |
|
"learning_rate": 5.683972711969523e-07, |
|
"loss": 0.6897, |
|
"step": 223000 |
|
}, |
|
{ |
|
"epoch": 28.44235174344617, |
|
"grad_norm": 57.476531982421875, |
|
"learning_rate": 5.462478958093382e-07, |
|
"loss": 0.6975, |
|
"step": 223500 |
|
}, |
|
{ |
|
"epoch": 28.505981165691015, |
|
"grad_norm": 56.7477912902832, |
|
"learning_rate": 5.240985204217242e-07, |
|
"loss": 0.6802, |
|
"step": 224000 |
|
}, |
|
{ |
|
"epoch": 28.569610587935863, |
|
"grad_norm": 45.01617431640625, |
|
"learning_rate": 5.0194914503411e-07, |
|
"loss": 0.6836, |
|
"step": 224500 |
|
}, |
|
{ |
|
"epoch": 28.633240010180707, |
|
"grad_norm": 49.77652359008789, |
|
"learning_rate": 4.79799769646496e-07, |
|
"loss": 0.6849, |
|
"step": 225000 |
|
}, |
|
{ |
|
"epoch": 28.696869432425554, |
|
"grad_norm": 52.57892990112305, |
|
"learning_rate": 4.57650394258882e-07, |
|
"loss": 0.6781, |
|
"step": 225500 |
|
}, |
|
{ |
|
"epoch": 28.7604988546704, |
|
"grad_norm": 64.97437286376953, |
|
"learning_rate": 4.3550101887126787e-07, |
|
"loss": 0.6761, |
|
"step": 226000 |
|
}, |
|
{ |
|
"epoch": 28.824128276915246, |
|
"grad_norm": 52.035160064697266, |
|
"learning_rate": 4.133516434836538e-07, |
|
"loss": 0.6762, |
|
"step": 226500 |
|
}, |
|
{ |
|
"epoch": 28.88775769916009, |
|
"grad_norm": 57.393035888671875, |
|
"learning_rate": 3.912022680960397e-07, |
|
"loss": 0.6781, |
|
"step": 227000 |
|
}, |
|
{ |
|
"epoch": 28.951387121404938, |
|
"grad_norm": 49.78774642944336, |
|
"learning_rate": 3.691414902099761e-07, |
|
"loss": 0.682, |
|
"step": 227500 |
|
}, |
|
{ |
|
"epoch": 29.015016543649782, |
|
"grad_norm": 47.4661750793457, |
|
"learning_rate": 3.4699211482236206e-07, |
|
"loss": 0.6742, |
|
"step": 228000 |
|
}, |
|
{ |
|
"epoch": 29.07864596589463, |
|
"grad_norm": 69.56925964355469, |
|
"learning_rate": 3.248870381855232e-07, |
|
"loss": 0.6595, |
|
"step": 228500 |
|
}, |
|
{ |
|
"epoch": 29.142275388139474, |
|
"grad_norm": 49.844520568847656, |
|
"learning_rate": 3.027376627979091e-07, |
|
"loss": 0.683, |
|
"step": 229000 |
|
}, |
|
{ |
|
"epoch": 29.20590481038432, |
|
"grad_norm": 58.6362419128418, |
|
"learning_rate": 2.8058828741029506e-07, |
|
"loss": 0.6721, |
|
"step": 229500 |
|
}, |
|
{ |
|
"epoch": 29.26953423262917, |
|
"grad_norm": 44.214717864990234, |
|
"learning_rate": 2.5843891202268095e-07, |
|
"loss": 0.669, |
|
"step": 230000 |
|
}, |
|
{ |
|
"epoch": 29.333163654874014, |
|
"grad_norm": 50.08256530761719, |
|
"learning_rate": 2.3628953663506691e-07, |
|
"loss": 0.683, |
|
"step": 230500 |
|
}, |
|
{ |
|
"epoch": 29.39679307711886, |
|
"grad_norm": 51.15972900390625, |
|
"learning_rate": 2.1414016124745283e-07, |
|
"loss": 0.6652, |
|
"step": 231000 |
|
}, |
|
{ |
|
"epoch": 29.460422499363705, |
|
"grad_norm": 47.7255859375, |
|
"learning_rate": 1.92035084610614e-07, |
|
"loss": 0.671, |
|
"step": 231500 |
|
}, |
|
{ |
|
"epoch": 29.524051921608553, |
|
"grad_norm": 45.42967987060547, |
|
"learning_rate": 1.6988570922299992e-07, |
|
"loss": 0.6662, |
|
"step": 232000 |
|
}, |
|
{ |
|
"epoch": 29.587681343853397, |
|
"grad_norm": 47.5881462097168, |
|
"learning_rate": 1.4773633383538586e-07, |
|
"loss": 0.6665, |
|
"step": 232500 |
|
}, |
|
{ |
|
"epoch": 29.651310766098245, |
|
"grad_norm": 71.63655090332031, |
|
"learning_rate": 1.2558695844777177e-07, |
|
"loss": 0.6718, |
|
"step": 233000 |
|
}, |
|
{ |
|
"epoch": 29.71494018834309, |
|
"grad_norm": 45.697998046875, |
|
"learning_rate": 1.0343758306015771e-07, |
|
"loss": 0.6657, |
|
"step": 233500 |
|
}, |
|
{ |
|
"epoch": 29.778569610587937, |
|
"grad_norm": 58.17982864379883, |
|
"learning_rate": 8.128820767254363e-08, |
|
"loss": 0.6677, |
|
"step": 234000 |
|
}, |
|
{ |
|
"epoch": 29.84219903283278, |
|
"grad_norm": 46.64686965942383, |
|
"learning_rate": 5.9138832284929565e-08, |
|
"loss": 0.6732, |
|
"step": 234500 |
|
}, |
|
{ |
|
"epoch": 29.90582845507763, |
|
"grad_norm": 52.96521759033203, |
|
"learning_rate": 3.69894568973155e-08, |
|
"loss": 0.6687, |
|
"step": 235000 |
|
}, |
|
{ |
|
"epoch": 29.969457877322473, |
|
"grad_norm": 44.063079833984375, |
|
"learning_rate": 1.4840081509701428e-08, |
|
"loss": 0.6732, |
|
"step": 235500 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 235740, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 30, |
|
"save_steps": 10000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 64, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|