{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.0,
  "eval_steps": 500,
  "global_step": 63788,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.006270771932024832,
      "grad_norm": 92.5528793334961,
      "learning_rate": 3.041229032763756e-07,
      "loss": 11.9622,
      "step": 100
    },
    {
      "epoch": 0.012541543864049664,
      "grad_norm": 85.98809814453125,
      "learning_rate": 6.176516695406804e-07,
      "loss": 11.265,
      "step": 200
    },
    {
      "epoch": 0.018812315796074498,
      "grad_norm": 82.95439910888672,
      "learning_rate": 9.311804358049851e-07,
      "loss": 10.5195,
      "step": 300
    },
    {
      "epoch": 0.025083087728099328,
      "grad_norm": 74.84368896484375,
      "learning_rate": 1.2447092020692899e-06,
      "loss": 9.4744,
      "step": 400
    },
    {
      "epoch": 0.03135385966012416,
      "grad_norm": 71.96393585205078,
      "learning_rate": 1.5582379683335947e-06,
      "loss": 8.4815,
      "step": 500
    },
    {
      "epoch": 0.03135385966012416,
      "eval_loss": 8.621713638305664,
      "eval_runtime": 232.9735,
      "eval_samples_per_second": 547.582,
      "eval_steps_per_second": 17.114,
      "step": 500
    },
    {
      "epoch": 0.037624631592148995,
      "grad_norm": 82.08424377441406,
      "learning_rate": 1.8717667345978996e-06,
      "loss": 7.6105,
      "step": 600
    },
    {
      "epoch": 0.04389540352417383,
      "grad_norm": 82.3794937133789,
      "learning_rate": 2.1852955008622044e-06,
      "loss": 6.8023,
      "step": 700
    },
    {
      "epoch": 0.050166175456198656,
      "grad_norm": 42.44660186767578,
      "learning_rate": 2.4988242671265088e-06,
      "loss": 6.1258,
      "step": 800
    },
    {
      "epoch": 0.05643694738822349,
      "grad_norm": 42.748497009277344,
      "learning_rate": 2.812353033390814e-06,
      "loss": 5.5032,
      "step": 900
    },
    {
      "epoch": 0.06270771932024832,
      "grad_norm": 49.40472412109375,
      "learning_rate": 3.1258817996551187e-06,
      "loss": 5.0397,
      "step": 1000
    },
    {
      "epoch": 0.06270771932024832,
      "eval_loss": 5.194935321807861,
      "eval_runtime": 248.6107,
      "eval_samples_per_second": 513.14,
      "eval_steps_per_second": 16.037,
      "step": 1000
    },
    {
      "epoch": 0.06897849125227315,
      "grad_norm": 54.32060623168945,
      "learning_rate": 3.439410565919423e-06,
      "loss": 4.6909,
      "step": 1100
    },
    {
      "epoch": 0.07524926318429799,
      "grad_norm": 50.955474853515625,
      "learning_rate": 3.7529393321837283e-06,
      "loss": 4.5716,
      "step": 1200
    },
    {
      "epoch": 0.08152003511632282,
      "grad_norm": 42.99276351928711,
      "learning_rate": 4.066468098448033e-06,
      "loss": 4.3983,
      "step": 1300
    },
    {
      "epoch": 0.08779080704834766,
      "grad_norm": 56.20285415649414,
      "learning_rate": 4.379996864712338e-06,
      "loss": 4.2073,
      "step": 1400
    },
    {
      "epoch": 0.09406157898037248,
      "grad_norm": 47.72187805175781,
      "learning_rate": 4.693525630976643e-06,
      "loss": 4.2164,
      "step": 1500
    },
    {
      "epoch": 0.09406157898037248,
      "eval_loss": 4.142153263092041,
      "eval_runtime": 255.6642,
      "eval_samples_per_second": 498.983,
      "eval_steps_per_second": 15.595,
      "step": 1500
    },
    {
      "epoch": 0.10033235091239731,
      "grad_norm": 53.83956527709961,
      "learning_rate": 5.0070543972409465e-06,
      "loss": 4.0921,
      "step": 1600
    },
    {
      "epoch": 0.10660312284442215,
      "grad_norm": 35.98961639404297,
      "learning_rate": 5.320583163505252e-06,
      "loss": 4.1785,
      "step": 1700
    },
    {
      "epoch": 0.11287389477644698,
      "grad_norm": 90.68247985839844,
      "learning_rate": 5.634111929769557e-06,
      "loss": 4.0503,
      "step": 1800
    },
    {
      "epoch": 0.11914466670847182,
      "grad_norm": 64.07307434082031,
      "learning_rate": 5.947640696033862e-06,
      "loss": 3.8969,
      "step": 1900
    },
    {
      "epoch": 0.12541543864049665,
      "grad_norm": 58.8675537109375,
      "learning_rate": 6.2611694622981665e-06,
      "loss": 3.8538,
      "step": 2000
    },
    {
      "epoch": 0.12541543864049665,
      "eval_loss": 3.910861015319824,
      "eval_runtime": 257.3903,
      "eval_samples_per_second": 495.636,
      "eval_steps_per_second": 15.49,
      "step": 2000
    },
    {
      "epoch": 0.13168621057252147,
      "grad_norm": 58.32042694091797,
      "learning_rate": 6.574698228562471e-06,
      "loss": 3.872,
      "step": 2100
    },
    {
      "epoch": 0.1379569825045463,
      "grad_norm": 38.084102630615234,
      "learning_rate": 6.885091707164133e-06,
      "loss": 3.851,
      "step": 2200
    },
    {
      "epoch": 0.14422775443657115,
      "grad_norm": 62.679237365722656,
      "learning_rate": 7.198620473428438e-06,
      "loss": 3.6301,
      "step": 2300
    },
    {
      "epoch": 0.15049852636859598,
      "grad_norm": 60.0799446105957,
      "learning_rate": 7.512149239692742e-06,
      "loss": 3.5202,
      "step": 2400
    },
    {
      "epoch": 0.1567692983006208,
      "grad_norm": 30.11835289001465,
      "learning_rate": 7.825678005957047e-06,
      "loss": 3.6759,
      "step": 2500
    },
    {
      "epoch": 0.1567692983006208,
      "eval_loss": 3.638855218887329,
      "eval_runtime": 259.3655,
      "eval_samples_per_second": 491.862,
      "eval_steps_per_second": 15.372,
      "step": 2500
    },
    {
      "epoch": 0.16304007023264563,
      "grad_norm": 72.76881408691406,
      "learning_rate": 8.139206772221352e-06,
      "loss": 3.4106,
      "step": 2600
    },
    {
      "epoch": 0.16931084216467046,
      "grad_norm": 89.71743774414062,
      "learning_rate": 8.452735538485657e-06,
      "loss": 3.69,
      "step": 2700
    },
    {
      "epoch": 0.17558161409669532,
      "grad_norm": 81.42703247070312,
      "learning_rate": 8.766264304749962e-06,
      "loss": 3.6336,
      "step": 2800
    },
    {
      "epoch": 0.18185238602872014,
      "grad_norm": 84.91463470458984,
      "learning_rate": 9.079793071014266e-06,
      "loss": 3.4715,
      "step": 2900
    },
    {
      "epoch": 0.18812315796074497,
      "grad_norm": 50.145713806152344,
      "learning_rate": 9.393321837278571e-06,
      "loss": 3.2166,
      "step": 3000
    },
    {
      "epoch": 0.18812315796074497,
      "eval_loss": 3.273871898651123,
      "eval_runtime": 257.8101,
      "eval_samples_per_second": 494.829,
      "eval_steps_per_second": 15.465,
      "step": 3000
    },
    {
      "epoch": 0.1943939298927698,
      "grad_norm": 62.13496780395508,
      "learning_rate": 9.706850603542876e-06,
      "loss": 3.3844,
      "step": 3100
    },
    {
      "epoch": 0.20066470182479462,
      "grad_norm": 74.26377868652344,
      "learning_rate": 1.002037936980718e-05,
      "loss": 3.4449,
      "step": 3200
    },
    {
      "epoch": 0.20693547375681948,
      "grad_norm": 112.08758544921875,
      "learning_rate": 1.0333908136071484e-05,
      "loss": 3.0811,
      "step": 3300
    },
    {
      "epoch": 0.2132062456888443,
      "grad_norm": 40.37177276611328,
      "learning_rate": 1.0644301614673146e-05,
      "loss": 3.2777,
      "step": 3400
    },
    {
      "epoch": 0.21947701762086913,
      "grad_norm": 65.6463623046875,
      "learning_rate": 1.0957830380937451e-05,
      "loss": 2.9505,
      "step": 3500
    },
    {
      "epoch": 0.21947701762086913,
      "eval_loss": 3.0864851474761963,
      "eval_runtime": 252.3303,
      "eval_samples_per_second": 505.575,
      "eval_steps_per_second": 15.801,
      "step": 3500
    },
    {
      "epoch": 0.22574778955289396,
      "grad_norm": 103.7303466796875,
      "learning_rate": 1.1271359147201758e-05,
      "loss": 3.1534,
      "step": 3600
    },
    {
      "epoch": 0.23201856148491878,
      "grad_norm": 145.92767333984375,
      "learning_rate": 1.158488791346606e-05,
      "loss": 2.9669,
      "step": 3700
    },
    {
      "epoch": 0.23828933341694364,
      "grad_norm": 78.69353485107422,
      "learning_rate": 1.1898416679730367e-05,
      "loss": 2.9416,
      "step": 3800
    },
    {
      "epoch": 0.24456010534896847,
      "grad_norm": 55.99378204345703,
      "learning_rate": 1.221194544599467e-05,
      "loss": 2.9637,
      "step": 3900
    },
    {
      "epoch": 0.2508308772809933,
      "grad_norm": 104.32599639892578,
      "learning_rate": 1.2525474212258977e-05,
      "loss": 2.9322,
      "step": 4000
    },
    {
      "epoch": 0.2508308772809933,
      "eval_loss": 2.844682455062866,
      "eval_runtime": 252.142,
      "eval_samples_per_second": 505.953,
      "eval_steps_per_second": 15.813,
      "step": 4000
    },
    {
      "epoch": 0.25710164921301815,
      "grad_norm": 68.12931823730469,
      "learning_rate": 1.283900297852328e-05,
      "loss": 2.6926,
      "step": 4100
    },
    {
      "epoch": 0.26337242114504295,
      "grad_norm": 208.65591430664062,
      "learning_rate": 1.3152531744787585e-05,
      "loss": 2.9353,
      "step": 4200
    },
    {
      "epoch": 0.2696431930770678,
      "grad_norm": 91.44706726074219,
      "learning_rate": 1.3466060511051891e-05,
      "loss": 2.635,
      "step": 4300
    },
    {
      "epoch": 0.2759139650090926,
      "grad_norm": 92.03852081298828,
      "learning_rate": 1.3779589277316194e-05,
      "loss": 2.5692,
      "step": 4400
    },
    {
      "epoch": 0.28218473694111745,
      "grad_norm": 138.34088134765625,
      "learning_rate": 1.4089982755917857e-05,
      "loss": 3.0283,
      "step": 4500
    },
    {
      "epoch": 0.28218473694111745,
      "eval_loss": 2.9033422470092773,
      "eval_runtime": 248.9921,
      "eval_samples_per_second": 512.354,
      "eval_steps_per_second": 16.013,
      "step": 4500
    },
    {
      "epoch": 0.2884555088731423,
      "grad_norm": 57.657936096191406,
      "learning_rate": 1.4403511522182162e-05,
      "loss": 2.5804,
      "step": 4600
    },
    {
      "epoch": 0.2947262808051671,
      "grad_norm": 52.86611557006836,
      "learning_rate": 1.4717040288446466e-05,
      "loss": 3.1374,
      "step": 4700
    },
    {
      "epoch": 0.30099705273719196,
      "grad_norm": 80.26192474365234,
      "learning_rate": 1.5030569054710771e-05,
      "loss": 2.8479,
      "step": 4800
    },
    {
      "epoch": 0.30726782466921676,
      "grad_norm": 7.4570465087890625,
      "learning_rate": 1.5344097820975076e-05,
      "loss": 2.6809,
      "step": 4900
    },
    {
      "epoch": 0.3135385966012416,
      "grad_norm": 4.543123722076416,
      "learning_rate": 1.5657626587239382e-05,
      "loss": 2.8267,
      "step": 5000
    },
    {
      "epoch": 0.3135385966012416,
      "eval_loss": 2.6946306228637695,
      "eval_runtime": 249.1581,
      "eval_samples_per_second": 512.012,
      "eval_steps_per_second": 16.002,
      "step": 5000
    },
    {
      "epoch": 0.31980936853326647,
      "grad_norm": 29.848108291625977,
      "learning_rate": 1.5971155353503685e-05,
      "loss": 2.7341,
      "step": 5100
    },
    {
      "epoch": 0.32608014046529127,
      "grad_norm": 139.14234924316406,
      "learning_rate": 1.6284684119767992e-05,
      "loss": 2.8157,
      "step": 5200
    },
    {
      "epoch": 0.3323509123973161,
      "grad_norm": 65.15583038330078,
      "learning_rate": 1.659507759836965e-05,
      "loss": 2.5867,
      "step": 5300
    },
    {
      "epoch": 0.3386216843293409,
      "grad_norm": 76.32029724121094,
      "learning_rate": 1.6908606364633958e-05,
      "loss": 2.8622,
      "step": 5400
    },
    {
      "epoch": 0.3448924562613658,
      "grad_norm": 85.2268295288086,
      "learning_rate": 1.722213513089826e-05,
      "loss": 2.9063,
      "step": 5500
    },
    {
      "epoch": 0.3448924562613658,
      "eval_loss": 2.6115071773529053,
      "eval_runtime": 249.5988,
      "eval_samples_per_second": 511.108,
      "eval_steps_per_second": 15.974,
      "step": 5500
    },
    {
      "epoch": 0.35116322819339063,
      "grad_norm": 21.968101501464844,
      "learning_rate": 1.7535663897162567e-05,
      "loss": 2.1514,
      "step": 5600
    },
    {
      "epoch": 0.35743400012541543,
      "grad_norm": 159.9650421142578,
      "learning_rate": 1.784919266342687e-05,
      "loss": 2.3755,
      "step": 5700
    },
    {
      "epoch": 0.3637047720574403,
      "grad_norm": 53.702919006347656,
      "learning_rate": 1.8162721429691173e-05,
      "loss": 2.5055,
      "step": 5800
    },
    {
      "epoch": 0.3699755439894651,
      "grad_norm": 16.580570220947266,
      "learning_rate": 1.847625019595548e-05,
      "loss": 3.3237,
      "step": 5900
    },
    {
      "epoch": 0.37624631592148994,
      "grad_norm": 77.9209213256836,
      "learning_rate": 1.8789778962219786e-05,
      "loss": 2.561,
      "step": 6000
    },
    {
      "epoch": 0.37624631592148994,
      "eval_loss": 2.7512075901031494,
      "eval_runtime": 252.3158,
      "eval_samples_per_second": 505.605,
      "eval_steps_per_second": 15.802,
      "step": 6000
    },
    {
      "epoch": 0.3825170878535148,
      "grad_norm": 181.68307495117188,
      "learning_rate": 1.910330772848409e-05,
      "loss": 2.4351,
      "step": 6100
    },
    {
      "epoch": 0.3887878597855396,
      "grad_norm": 405.9890441894531,
      "learning_rate": 1.9416836494748396e-05,
      "loss": 2.8472,
      "step": 6200
    },
    {
      "epoch": 0.39505863171756445,
      "grad_norm": 65.42109680175781,
      "learning_rate": 1.9730365261012702e-05,
      "loss": 2.76,
      "step": 6300
    },
    {
      "epoch": 0.40132940364958924,
      "grad_norm": 1.5880606174468994,
      "learning_rate": 1.999512271595046e-05,
      "loss": 2.1947,
      "step": 6400
    },
    {
      "epoch": 0.4076001755816141,
      "grad_norm": 147.59877014160156,
      "learning_rate": 1.9960284972739466e-05,
      "loss": 2.6409,
      "step": 6500
    },
    {
      "epoch": 0.4076001755816141,
      "eval_loss": 2.536679267883301,
      "eval_runtime": 247.9993,
      "eval_samples_per_second": 514.405,
      "eval_steps_per_second": 16.077,
      "step": 6500
    },
    {
      "epoch": 0.41387094751363895,
      "grad_norm": 147.17579650878906,
      "learning_rate": 1.9925447229528472e-05,
      "loss": 2.7262,
      "step": 6600
    },
    {
      "epoch": 0.42014171944566375,
      "grad_norm": 42.11772155761719,
      "learning_rate": 1.989060948631748e-05,
      "loss": 2.7781,
      "step": 6700
    },
    {
      "epoch": 0.4264124913776886,
      "grad_norm": 194.7137451171875,
      "learning_rate": 1.985577174310648e-05,
      "loss": 2.4718,
      "step": 6800
    },
    {
      "epoch": 0.4326832633097134,
      "grad_norm": 63.2336540222168,
      "learning_rate": 1.9820933999895488e-05,
      "loss": 2.567,
      "step": 6900
    },
    {
      "epoch": 0.43895403524173826,
      "grad_norm": 122.4419174194336,
      "learning_rate": 1.9786096256684494e-05,
      "loss": 2.4215,
      "step": 7000
    },
    {
      "epoch": 0.43895403524173826,
      "eval_loss": 2.340890407562256,
      "eval_runtime": 249.0613,
      "eval_samples_per_second": 512.211,
      "eval_steps_per_second": 16.008,
      "step": 7000
    },
    {
      "epoch": 0.4452248071737631,
      "grad_norm": 173.85031127929688,
      "learning_rate": 1.97512585134735e-05,
      "loss": 1.9308,
      "step": 7100
    },
    {
      "epoch": 0.4514955791057879,
      "grad_norm": 200.34971618652344,
      "learning_rate": 1.9716420770262504e-05,
      "loss": 2.1232,
      "step": 7200
    },
    {
      "epoch": 0.45776635103781277,
      "grad_norm": 208.45030212402344,
      "learning_rate": 1.968158302705151e-05,
      "loss": 2.421,
      "step": 7300
    },
    {
      "epoch": 0.46403712296983757,
      "grad_norm": 148.36253356933594,
      "learning_rate": 1.9646745283840513e-05,
      "loss": 2.3232,
      "step": 7400
    },
    {
      "epoch": 0.4703078949018624,
      "grad_norm": 24.392248153686523,
      "learning_rate": 1.961190754062952e-05,
      "loss": 2.8543,
      "step": 7500
    },
    {
      "epoch": 0.4703078949018624,
      "eval_loss": 2.3705639839172363,
      "eval_runtime": 247.4495,
      "eval_samples_per_second": 515.548,
      "eval_steps_per_second": 16.112,
      "step": 7500
    },
    {
      "epoch": 0.4765786668338873,
      "grad_norm": 217.60328674316406,
      "learning_rate": 1.9577069797418526e-05,
      "loss": 2.4276,
      "step": 7600
    },
    {
      "epoch": 0.4828494387659121,
      "grad_norm": 91.55315399169922,
      "learning_rate": 1.9542232054207532e-05,
      "loss": 2.4507,
      "step": 7700
    },
    {
      "epoch": 0.48912021069793693,
      "grad_norm": 246.22488403320312,
      "learning_rate": 1.9507394310996535e-05,
      "loss": 2.1963,
      "step": 7800
    },
    {
      "epoch": 0.49539098262996173,
      "grad_norm": 76.7205810546875,
      "learning_rate": 1.947255656778554e-05,
      "loss": 2.4247,
      "step": 7900
    },
    {
      "epoch": 0.5016617545619866,
      "grad_norm": 0.794611394405365,
      "learning_rate": 1.9437718824574544e-05,
      "loss": 2.1948,
      "step": 8000
    },
    {
      "epoch": 0.5016617545619866,
      "eval_loss": 2.5728752613067627,
      "eval_runtime": 248.1179,
      "eval_samples_per_second": 514.159,
      "eval_steps_per_second": 16.069,
      "step": 8000
    },
    {
      "epoch": 0.5079325264940114,
      "grad_norm": 75.7978744506836,
      "learning_rate": 1.940288108136355e-05,
      "loss": 2.4069,
      "step": 8100
    },
    {
      "epoch": 0.5142032984260363,
      "grad_norm": 81.46521759033203,
      "learning_rate": 1.9368043338152557e-05,
      "loss": 2.4328,
      "step": 8200
    },
    {
      "epoch": 0.520474070358061,
      "grad_norm": 153.08226013183594,
      "learning_rate": 1.9333205594941563e-05,
      "loss": 2.2198,
      "step": 8300
    },
    {
      "epoch": 0.5267448422900859,
      "grad_norm": 1.353060245513916,
      "learning_rate": 1.9298367851730566e-05,
      "loss": 2.1746,
      "step": 8400
    },
    {
      "epoch": 0.5330156142221107,
      "grad_norm": 298.2365417480469,
      "learning_rate": 1.9263878485951682e-05,
      "loss": 2.2618,
      "step": 8500
    },
    {
      "epoch": 0.5330156142221107,
      "eval_loss": 2.345949411392212,
      "eval_runtime": 249.317,
      "eval_samples_per_second": 511.686,
      "eval_steps_per_second": 15.992,
      "step": 8500
    },
    {
      "epoch": 0.5392863861541356,
      "grad_norm": 409.5244140625,
      "learning_rate": 1.922904074274069e-05,
      "loss": 2.3909,
      "step": 8600
    },
    {
      "epoch": 0.5455571580861605,
      "grad_norm": 0.6597223877906799,
      "learning_rate": 1.919420299952969e-05,
      "loss": 2.035,
      "step": 8700
    },
    {
      "epoch": 0.5518279300181852,
      "grad_norm": 365.05914306640625,
      "learning_rate": 1.9159365256318698e-05,
      "loss": 2.2626,
      "step": 8800
    },
    {
      "epoch": 0.55809870195021,
      "grad_norm": 103.37579345703125,
      "learning_rate": 1.91245275131077e-05,
      "loss": 2.1541,
      "step": 8900
    },
    {
      "epoch": 0.5643694738822349,
      "grad_norm": 4.599234104156494,
      "learning_rate": 1.9089689769896707e-05,
      "loss": 1.9424,
      "step": 9000
    },
    {
      "epoch": 0.5643694738822349,
      "eval_loss": 2.1624536514282227,
      "eval_runtime": 248.0391,
      "eval_samples_per_second": 514.322,
      "eval_steps_per_second": 16.074,
      "step": 9000
    },
    {
      "epoch": 0.5706402458142598,
      "grad_norm": 0.6885708570480347,
      "learning_rate": 1.9054852026685714e-05,
      "loss": 2.5152,
      "step": 9100
    },
    {
      "epoch": 0.5769110177462846,
      "grad_norm": 103.164794921875,
      "learning_rate": 1.9020014283474716e-05,
      "loss": 2.0462,
      "step": 9200
    },
    {
      "epoch": 0.5831817896783094,
      "grad_norm": 0.7507800459861755,
      "learning_rate": 1.8985176540263723e-05,
      "loss": 1.6124,
      "step": 9300
    },
    {
      "epoch": 0.5894525616103342,
      "grad_norm": 38.5381965637207,
      "learning_rate": 1.895033879705273e-05,
      "loss": 2.2236,
      "step": 9400
    },
    {
      "epoch": 0.5957233335423591,
      "grad_norm": 374.18011474609375,
      "learning_rate": 1.8915501053841735e-05,
      "loss": 2.4706,
      "step": 9500
    },
    {
      "epoch": 0.5957233335423591,
      "eval_loss": 2.0568950176239014,
      "eval_runtime": 250.2023,
      "eval_samples_per_second": 509.875,
      "eval_steps_per_second": 15.935,
      "step": 9500
    },
    {
      "epoch": 0.6019941054743839,
      "grad_norm": 115.00419616699219,
      "learning_rate": 1.888066331063074e-05,
      "loss": 2.4612,
      "step": 9600
    },
    {
      "epoch": 0.6082648774064088,
      "grad_norm": 302.7066955566406,
      "learning_rate": 1.8845825567419745e-05,
      "loss": 2.2784,
      "step": 9700
    },
    {
      "epoch": 0.6145356493384335,
      "grad_norm": 0.18385061621665955,
      "learning_rate": 1.8810987824208748e-05,
      "loss": 1.9335,
      "step": 9800
    },
    {
      "epoch": 0.6208064212704584,
      "grad_norm": 9.742902755737305,
      "learning_rate": 1.8776150080997754e-05,
      "loss": 2.3779,
      "step": 9900
    },
    {
      "epoch": 0.6270771932024832,
      "grad_norm": 12.202372550964355,
      "learning_rate": 1.874131233778676e-05,
      "loss": 1.6778,
      "step": 10000
    },
    {
      "epoch": 0.6270771932024832,
      "eval_loss": 2.112342596054077,
      "eval_runtime": 247.5759,
      "eval_samples_per_second": 515.284,
      "eval_steps_per_second": 16.104,
      "step": 10000
    },
    {
      "epoch": 0.6333479651345081,
      "grad_norm": 47.51719284057617,
      "learning_rate": 1.8706474594575767e-05,
      "loss": 2.4721,
      "step": 10100
    },
    {
      "epoch": 0.6396187370665329,
      "grad_norm": 330.02703857421875,
      "learning_rate": 1.867163685136477e-05,
      "loss": 1.7822,
      "step": 10200
    },
    {
      "epoch": 0.6458895089985577,
      "grad_norm": 110.14346313476562,
      "learning_rate": 1.8636799108153776e-05,
      "loss": 2.077,
      "step": 10300
    },
    {
      "epoch": 0.6521602809305825,
      "grad_norm": 28.561458587646484,
      "learning_rate": 1.860196136494278e-05,
      "loss": 1.9223,
      "step": 10400
    },
    {
      "epoch": 0.6584310528626074,
      "grad_norm": 14.915325164794922,
      "learning_rate": 1.8567123621731785e-05,
      "loss": 2.3513,
      "step": 10500
    },
    {
      "epoch": 0.6584310528626074,
      "eval_loss": 1.8402663469314575,
      "eval_runtime": 246.8028,
      "eval_samples_per_second": 516.899,
      "eval_steps_per_second": 16.155,
      "step": 10500
    },
    {
      "epoch": 0.6647018247946322,
      "grad_norm": 142.3553009033203,
      "learning_rate": 1.853228587852079e-05,
      "loss": 2.1387,
      "step": 10600
    },
    {
      "epoch": 0.6709725967266571,
      "grad_norm": 2.4230360984802246,
      "learning_rate": 1.8497448135309798e-05,
      "loss": 2.1853,
      "step": 10700
    },
    {
      "epoch": 0.6772433686586818,
      "grad_norm": 85.05690002441406,
      "learning_rate": 1.84626103920988e-05,
      "loss": 1.8715,
      "step": 10800
    },
    {
      "epoch": 0.6835141405907067,
      "grad_norm": 88.9746322631836,
      "learning_rate": 1.8427772648887807e-05,
      "loss": 1.8581,
      "step": 10900
    },
    {
      "epoch": 0.6897849125227316,
      "grad_norm": 191.67779541015625,
      "learning_rate": 1.839293490567681e-05,
      "loss": 2.0076,
      "step": 11000
    },
    {
      "epoch": 0.6897849125227316,
      "eval_loss": 2.00632643699646,
      "eval_runtime": 246.7856,
      "eval_samples_per_second": 516.934,
      "eval_steps_per_second": 16.156,
      "step": 11000
    },
    {
      "epoch": 0.6960556844547564,
      "grad_norm": 157.76986694335938,
      "learning_rate": 1.8358097162465817e-05,
      "loss": 2.3144,
      "step": 11100
    },
    {
      "epoch": 0.7023264563867813,
      "grad_norm": 52.53676223754883,
      "learning_rate": 1.8323259419254823e-05,
      "loss": 2.0942,
      "step": 11200
    },
    {
      "epoch": 0.708597228318806,
      "grad_norm": 61.30582046508789,
      "learning_rate": 1.828842167604383e-05,
      "loss": 1.9117,
      "step": 11300
    },
    {
      "epoch": 0.7148680002508309,
      "grad_norm": 146.37437438964844,
      "learning_rate": 1.8253583932832832e-05,
      "loss": 2.2214,
      "step": 11400
    },
    {
      "epoch": 0.7211387721828557,
      "grad_norm": 214.81398010253906,
      "learning_rate": 1.821874618962184e-05,
      "loss": 1.9678,
      "step": 11500
    },
    {
      "epoch": 0.7211387721828557,
      "eval_loss": 1.9028793573379517,
      "eval_runtime": 244.7222,
      "eval_samples_per_second": 521.293,
      "eval_steps_per_second": 16.292,
      "step": 11500
    },
    {
      "epoch": 0.7274095441148806,
      "grad_norm": 5.435591220855713,
      "learning_rate": 1.818390844641084e-05,
      "loss": 1.7459,
      "step": 11600
    },
    {
      "epoch": 0.7336803160469054,
      "grad_norm": 107.97034454345703,
      "learning_rate": 1.8149070703199848e-05,
      "loss": 2.0616,
      "step": 11700
    },
    {
      "epoch": 0.7399510879789302,
      "grad_norm": 63.21007537841797,
      "learning_rate": 1.8114232959988854e-05,
      "loss": 1.6169,
      "step": 11800
    },
    {
      "epoch": 0.746221859910955,
      "grad_norm": 113.56210327148438,
      "learning_rate": 1.8079743594209967e-05,
      "loss": 1.5674,
      "step": 11900
    },
    {
      "epoch": 0.7524926318429799,
      "grad_norm": 107.1183090209961,
      "learning_rate": 1.8044905850998973e-05,
      "loss": 1.4956,
      "step": 12000
    },
    {
      "epoch": 0.7524926318429799,
      "eval_loss": 1.8266816139221191,
      "eval_runtime": 244.3373,
      "eval_samples_per_second": 522.114,
      "eval_steps_per_second": 16.318,
      "step": 12000
    },
    {
      "epoch": 0.7587634037750047,
      "grad_norm": 151.79904174804688,
      "learning_rate": 1.801006810778798e-05,
      "loss": 2.3816,
      "step": 12100
    },
    {
      "epoch": 0.7650341757070296,
      "grad_norm": 323.1309814453125,
      "learning_rate": 1.7975230364576983e-05,
      "loss": 2.2387,
      "step": 12200
    },
    {
      "epoch": 0.7713049476390543,
      "grad_norm": 4.0979743003845215,
      "learning_rate": 1.794039262136599e-05,
      "loss": 1.4625,
      "step": 12300
    },
    {
      "epoch": 0.7775757195710792,
      "grad_norm": 126.16666412353516,
      "learning_rate": 1.7905554878154995e-05,
      "loss": 2.028,
      "step": 12400
    },
    {
      "epoch": 0.783846491503104,
      "grad_norm": 42.80760955810547,
      "learning_rate": 1.7870717134944e-05,
      "loss": 2.151,
      "step": 12500
    },
    {
      "epoch": 0.783846491503104,
      "eval_loss": 1.7581337690353394,
      "eval_runtime": 244.6209,
      "eval_samples_per_second": 521.509,
      "eval_steps_per_second": 16.299,
      "step": 12500
    },
    {
      "epoch": 0.7901172634351289,
      "grad_norm": 0.3076690435409546,
      "learning_rate": 1.7835879391733005e-05,
      "loss": 1.6896,
      "step": 12600
    },
    {
      "epoch": 0.7963880353671537,
      "grad_norm": 1.4938758611679077,
      "learning_rate": 1.780104164852201e-05,
      "loss": 1.8526,
      "step": 12700
    },
    {
      "epoch": 0.8026588072991785,
      "grad_norm": 208.20004272460938,
      "learning_rate": 1.7766203905311014e-05,
      "loss": 1.9745,
      "step": 12800
    },
    {
      "epoch": 0.8089295792312033,
      "grad_norm": 14.515748023986816,
      "learning_rate": 1.773136616210002e-05,
      "loss": 2.1042,
      "step": 12900
    },
    {
      "epoch": 0.8152003511632282,
      "grad_norm": 170.497314453125,
      "learning_rate": 1.7696528418889027e-05,
      "loss": 1.83,
      "step": 13000
    },
    {
      "epoch": 0.8152003511632282,
      "eval_loss": 1.5666632652282715,
      "eval_runtime": 244.9634,
      "eval_samples_per_second": 520.78,
      "eval_steps_per_second": 16.276,
      "step": 13000
    },
    {
      "epoch": 0.821471123095253,
      "grad_norm": 37.14794158935547,
      "learning_rate": 1.7661690675678033e-05,
      "loss": 1.7451,
      "step": 13100
    },
    {
      "epoch": 0.8277418950272779,
      "grad_norm": 97.6008529663086,
      "learning_rate": 1.7626852932467036e-05,
      "loss": 1.568,
      "step": 13200
    },
    {
      "epoch": 0.8340126669593027,
      "grad_norm": 1.4752888679504395,
      "learning_rate": 1.7592015189256042e-05,
      "loss": 1.4432,
      "step": 13300
    },
    {
      "epoch": 0.8402834388913275,
      "grad_norm": 100.85454559326172,
      "learning_rate": 1.7557177446045045e-05,
      "loss": 1.9172,
      "step": 13400
    },
    {
      "epoch": 0.8465542108233524,
      "grad_norm": 169.63970947265625,
      "learning_rate": 1.752233970283405e-05,
      "loss": 1.9438,
      "step": 13500
    },
    {
      "epoch": 0.8465542108233524,
      "eval_loss": 1.6055145263671875,
      "eval_runtime": 239.684,
      "eval_samples_per_second": 532.251,
      "eval_steps_per_second": 16.634,
      "step": 13500
    },
    {
      "epoch": 0.8528249827553772,
      "grad_norm": 145.4659881591797,
      "learning_rate": 1.7487501959623058e-05,
      "loss": 1.6488,
      "step": 13600
    },
    {
      "epoch": 0.8590957546874021,
      "grad_norm": 9.112565994262695,
      "learning_rate": 1.7452664216412064e-05,
      "loss": 1.8166,
      "step": 13700
    },
    {
      "epoch": 0.8653665266194268,
      "grad_norm": 122.40379333496094,
      "learning_rate": 1.7417826473201067e-05,
      "loss": 1.5929,
      "step": 13800
    },
    {
      "epoch": 0.8716372985514517,
      "grad_norm": 1.4977953433990479,
      "learning_rate": 1.7382988729990073e-05,
      "loss": 1.2476,
      "step": 13900
    },
    {
      "epoch": 0.8779080704834765,
      "grad_norm": 264.9580078125,
      "learning_rate": 1.7348150986779076e-05,
      "loss": 1.5236,
      "step": 14000
    },
    {
      "epoch": 0.8779080704834765,
      "eval_loss": 1.8921126127243042,
      "eval_runtime": 242.2787,
      "eval_samples_per_second": 526.551,
      "eval_steps_per_second": 16.456,
      "step": 14000
    },
    {
      "epoch": 0.8841788424155014,
      "grad_norm": 0.0032478359062224627,
      "learning_rate": 1.7313661621000193e-05,
      "loss": 1.6538,
      "step": 14100
    },
    {
      "epoch": 0.8904496143475262,
      "grad_norm": 169.41224670410156,
      "learning_rate": 1.72788238777892e-05,
      "loss": 1.8689,
      "step": 14200
    },
    {
      "epoch": 0.896720386279551,
      "grad_norm": 91.79679107666016,
      "learning_rate": 1.7243986134578202e-05,
      "loss": 1.0831,
      "step": 14300
    },
    {
      "epoch": 0.9029911582115758,
      "grad_norm": 1.378010869026184,
      "learning_rate": 1.7209148391367208e-05,
      "loss": 1.7765,
      "step": 14400
    },
    {
      "epoch": 0.9092619301436007,
      "grad_norm": 86.2571792602539,
      "learning_rate": 1.7174310648156215e-05,
      "loss": 1.3548,
      "step": 14500
    },
    {
      "epoch": 0.9092619301436007,
      "eval_loss": 1.668320894241333,
      "eval_runtime": 242.597,
      "eval_samples_per_second": 525.86,
      "eval_steps_per_second": 16.435,
      "step": 14500
    },
    {
      "epoch": 0.9155327020756255,
      "grad_norm": 42.63466262817383,
      "learning_rate": 1.7139472904945218e-05,
      "loss": 1.7792,
      "step": 14600
    },
    {
      "epoch": 0.9218034740076504,
      "grad_norm": 31.874799728393555,
      "learning_rate": 1.7104635161734224e-05,
      "loss": 1.73,
      "step": 14700
    },
    {
      "epoch": 0.9280742459396751,
      "grad_norm": 288.0302734375,
      "learning_rate": 1.7069797418523227e-05,
      "loss": 1.5979,
      "step": 14800
    },
    {
      "epoch": 0.9343450178717,
      "grad_norm": 76.91877746582031,
      "learning_rate": 1.7034959675312233e-05,
      "loss": 1.3678,
      "step": 14900
    },
    {
      "epoch": 0.9406157898037248,
      "grad_norm": 153.2476348876953,
      "learning_rate": 1.700012193210124e-05,
      "loss": 2.0664,
      "step": 15000
    },
    {
      "epoch": 0.9406157898037248,
      "eval_loss": 1.5160768032073975,
      "eval_runtime": 241.7632,
      "eval_samples_per_second": 527.673,
      "eval_steps_per_second": 16.491,
      "step": 15000
    },
    {
      "epoch": 0.9468865617357497,
      "grad_norm": 204.87367248535156,
      "learning_rate": 1.6965284188890246e-05,
      "loss": 1.4472,
      "step": 15100
    },
    {
      "epoch": 0.9531573336677746,
      "grad_norm": 107.19727325439453,
      "learning_rate": 1.693044644567925e-05,
      "loss": 1.447,
      "step": 15200
    },
    {
      "epoch": 0.9594281055997993,
      "grad_norm": 0.9635588526725769,
      "learning_rate": 1.6895608702468255e-05,
      "loss": 1.7261,
      "step": 15300
    },
    {
      "epoch": 0.9656988775318242,
      "grad_norm": 21.72879981994629,
      "learning_rate": 1.686077095925726e-05,
      "loss": 1.4881,
      "step": 15400
    },
    {
      "epoch": 0.971969649463849,
      "grad_norm": 3.110539197921753,
      "learning_rate": 1.6825933216046268e-05,
      "loss": 1.313,
      "step": 15500
    },
    {
      "epoch": 0.971969649463849,
      "eval_loss": 1.6226599216461182,
      "eval_runtime": 241.336,
      "eval_samples_per_second": 528.607,
      "eval_steps_per_second": 16.521,
      "step": 15500
    },
    {
      "epoch": 0.9782404213958739,
      "grad_norm": 4.804477691650391,
      "learning_rate": 1.679109547283527e-05,
      "loss": 1.4587,
      "step": 15600
    },
    {
      "epoch": 0.9845111933278987,
      "grad_norm": 159.54579162597656,
      "learning_rate": 1.6756257729624277e-05,
      "loss": 2.0982,
      "step": 15700
    },
    {
      "epoch": 0.9907819652599235,
      "grad_norm": 0.04496179521083832,
      "learning_rate": 1.672141998641328e-05,
      "loss": 1.4854,
      "step": 15800
    },
    {
      "epoch": 0.9970527371919483,
      "grad_norm": 178.064453125,
      "learning_rate": 1.6686582243202286e-05,
      "loss": 1.343,
      "step": 15900
    },
    {
      "epoch": 1.0033235091239732,
      "grad_norm": 60.21414566040039,
      "learning_rate": 1.6651744499991293e-05,
      "loss": 1.1795,
      "step": 16000
    },
    {
      "epoch": 1.0033235091239732,
      "eval_loss": 1.5639160871505737,
      "eval_runtime": 239.9545,
      "eval_samples_per_second": 531.651,
      "eval_steps_per_second": 16.616,
      "step": 16000
    },
    {
      "epoch": 1.009594281055998,
      "grad_norm": 28.01744842529297,
      "learning_rate": 1.66169067567803e-05,
      "loss": 1.4001,
      "step": 16100
    },
    {
      "epoch": 1.0158650529880229,
      "grad_norm": 0.9447069764137268,
      "learning_rate": 1.6582069013569302e-05,
      "loss": 1.3867,
      "step": 16200
    },
    {
      "epoch": 1.0221358249200476,
      "grad_norm": 271.91583251953125,
      "learning_rate": 1.654723127035831e-05,
      "loss": 1.5191,
      "step": 16300
    },
    {
      "epoch": 1.0284065968520726,
      "grad_norm": 50.53108596801758,
      "learning_rate": 1.651239352714731e-05,
      "loss": 1.4693,
      "step": 16400
    },
    {
      "epoch": 1.0346773687840973,
      "grad_norm": 37.87648010253906,
      "learning_rate": 1.6477555783936318e-05,
      "loss": 1.628,
      "step": 16500
    },
    {
      "epoch": 1.0346773687840973,
      "eval_loss": 1.4715627431869507,
      "eval_runtime": 243.7121,
      "eval_samples_per_second": 523.454,
      "eval_steps_per_second": 16.359,
      "step": 16500
    },
    {
      "epoch": 1.040948140716122,
      "grad_norm": 0.5571967363357544,
      "learning_rate": 1.6442718040725324e-05,
      "loss": 1.0041,
      "step": 16600
    },
    {
      "epoch": 1.047218912648147,
      "grad_norm": 410.87158203125,
      "learning_rate": 1.6408228674946437e-05,
      "loss": 1.7728,
      "step": 16700
    },
    {
      "epoch": 1.0534896845801718,
      "grad_norm": 0.04839416220784187,
      "learning_rate": 1.6373390931735443e-05,
      "loss": 1.5586,
      "step": 16800
    },
    {
      "epoch": 1.0597604565121967,
      "grad_norm": 15.377680778503418,
      "learning_rate": 1.633855318852445e-05,
      "loss": 1.7229,
      "step": 16900
    },
    {
      "epoch": 1.0660312284442215,
      "grad_norm": 156.4866943359375,
      "learning_rate": 1.6304063822745562e-05,
      "loss": 1.5556,
      "step": 17000
    },
    {
      "epoch": 1.0660312284442215,
      "eval_loss": 1.467575192451477,
      "eval_runtime": 244.9859,
      "eval_samples_per_second": 520.732,
      "eval_steps_per_second": 16.274,
      "step": 17000
    },
    {
      "epoch": 1.0723020003762462,
      "grad_norm": 0.4987052083015442,
      "learning_rate": 1.626922607953457e-05,
      "loss": 1.2529,
      "step": 17100
    },
    {
      "epoch": 1.0785727723082712,
      "grad_norm": 0.12283490598201752,
      "learning_rate": 1.6234388336323575e-05,
      "loss": 1.4787,
      "step": 17200
    },
    {
      "epoch": 1.084843544240296,
      "grad_norm": 0.2928747534751892,
      "learning_rate": 1.6199550593112578e-05,
      "loss": 1.1947,
      "step": 17300
    },
    {
      "epoch": 1.091114316172321,
      "grad_norm": 0.06402698904275894,
      "learning_rate": 1.6164712849901584e-05,
      "loss": 1.3014,
      "step": 17400
    },
    {
      "epoch": 1.0973850881043457,
      "grad_norm": 38.54865646362305,
      "learning_rate": 1.612987510669059e-05,
      "loss": 1.3743,
      "step": 17500
    },
    {
      "epoch": 1.0973850881043457,
      "eval_loss": 1.4624249935150146,
      "eval_runtime": 242.5321,
      "eval_samples_per_second": 526.0,
      "eval_steps_per_second": 16.439,
      "step": 17500
    },
    {
      "epoch": 1.1036558600363704,
      "grad_norm": 4.770035266876221,
      "learning_rate": 1.6095037363479597e-05,
      "loss": 1.3397,
      "step": 17600
    },
    {
      "epoch": 1.1099266319683954,
      "grad_norm": 73.70013427734375,
      "learning_rate": 1.60601996202686e-05,
      "loss": 1.3062,
      "step": 17700
    },
    {
      "epoch": 1.11619740390042,
      "grad_norm": 0.7905834317207336,
      "learning_rate": 1.6025361877057606e-05,
      "loss": 1.3288,
      "step": 17800
    },
    {
      "epoch": 1.122468175832445,
      "grad_norm": 85.46574401855469,
      "learning_rate": 1.599052413384661e-05,
      "loss": 2.0002,
      "step": 17900
    },
    {
      "epoch": 1.1287389477644698,
      "grad_norm": 101.38238525390625,
      "learning_rate": 1.5955686390635616e-05,
      "loss": 2.0294,
      "step": 18000
    },
    {
      "epoch": 1.1287389477644698,
      "eval_loss": 1.4184610843658447,
      "eval_runtime": 243.202,
      "eval_samples_per_second": 524.552,
      "eval_steps_per_second": 16.394,
      "step": 18000
    },
    {
      "epoch": 1.1350097196964946,
      "grad_norm": 177.34451293945312,
      "learning_rate": 1.5920848647424622e-05,
      "loss": 1.5053,
      "step": 18100
    },
    {
      "epoch": 1.1412804916285195,
      "grad_norm": 0.12398409098386765,
      "learning_rate": 1.5886010904213628e-05,
      "loss": 1.3657,
      "step": 18200
    },
    {
      "epoch": 1.1475512635605443,
      "grad_norm": 1.1212390661239624,
      "learning_rate": 1.585117316100263e-05,
      "loss": 1.3877,
      "step": 18300
    },
    {
      "epoch": 1.1538220354925692,
      "grad_norm": 132.34060668945312,
      "learning_rate": 1.5816335417791638e-05,
      "loss": 1.9034,
      "step": 18400
    },
    {
      "epoch": 1.160092807424594,
      "grad_norm": 8.030499458312988,
      "learning_rate": 1.578149767458064e-05,
      "loss": 1.4001,
      "step": 18500
    },
    {
      "epoch": 1.160092807424594,
      "eval_loss": 1.3812620639801025,
      "eval_runtime": 242.7932,
      "eval_samples_per_second": 525.435,
      "eval_steps_per_second": 16.421,
      "step": 18500
    },
    {
      "epoch": 1.1663635793566187,
      "grad_norm": 100.8308334350586,
      "learning_rate": 1.5746659931369647e-05,
      "loss": 1.7503,
      "step": 18600
    },
    {
      "epoch": 1.1726343512886437,
      "grad_norm": 57.332176208496094,
      "learning_rate": 1.5711822188158653e-05,
      "loss": 1.1482,
      "step": 18700
    },
    {
      "epoch": 1.1789051232206684,
      "grad_norm": 0.38618066906929016,
      "learning_rate": 1.567698444494766e-05,
      "loss": 1.0958,
      "step": 18800
    },
    {
      "epoch": 1.1851758951526934,
      "grad_norm": 29.31690216064453,
      "learning_rate": 1.5642146701736662e-05,
      "loss": 1.2657,
      "step": 18900
    },
    {
      "epoch": 1.1914466670847181,
      "grad_norm": 98.16004180908203,
      "learning_rate": 1.560730895852567e-05,
      "loss": 1.3721,
      "step": 19000
    },
    {
      "epoch": 1.1914466670847181,
      "eval_loss": 1.4701639413833618,
      "eval_runtime": 242.7602,
      "eval_samples_per_second": 525.506,
      "eval_steps_per_second": 16.424,
      "step": 19000
    },
    {
      "epoch": 1.1977174390167429,
      "grad_norm": 18.174930572509766,
      "learning_rate": 1.5572471215314672e-05,
      "loss": 1.2361,
      "step": 19100
    },
    {
      "epoch": 1.2039882109487678,
      "grad_norm": 18.77554702758789,
      "learning_rate": 1.5537633472103678e-05,
      "loss": 1.003,
      "step": 19200
    },
    {
      "epoch": 1.2102589828807926,
      "grad_norm": 105.063720703125,
      "learning_rate": 1.5502795728892684e-05,
      "loss": 1.3677,
      "step": 19300
    },
    {
      "epoch": 1.2165297548128176,
      "grad_norm": 65.42724609375,
      "learning_rate": 1.546795798568169e-05,
      "loss": 1.668,
      "step": 19400
    },
    {
      "epoch": 1.2228005267448423,
      "grad_norm": 57.190792083740234,
      "learning_rate": 1.5433120242470694e-05,
      "loss": 1.2026,
      "step": 19500
    },
    {
      "epoch": 1.2228005267448423,
      "eval_loss": 1.3641443252563477,
      "eval_runtime": 244.6626,
      "eval_samples_per_second": 521.42,
      "eval_steps_per_second": 16.296,
      "step": 19500
    },
    {
      "epoch": 1.229071298676867,
      "grad_norm": 291.0449523925781,
      "learning_rate": 1.53982824992597e-05,
      "loss": 1.1754,
      "step": 19600
    },
    {
      "epoch": 1.235342070608892,
      "grad_norm": 0.6484419703483582,
      "learning_rate": 1.5363444756048703e-05,
      "loss": 1.3196,
      "step": 19700
    },
    {
      "epoch": 1.2416128425409168,
      "grad_norm": 10.18918514251709,
      "learning_rate": 1.532860701283771e-05,
      "loss": 1.4766,
      "step": 19800
    },
    {
      "epoch": 1.2478836144729417,
      "grad_norm": 0.408495157957077,
      "learning_rate": 1.5293769269626716e-05,
      "loss": 1.389,
      "step": 19900
    },
    {
      "epoch": 1.2541543864049665,
      "grad_norm": 1.5292593240737915,
      "learning_rate": 1.525893152641572e-05,
      "loss": 1.6974,
      "step": 20000
    },
    {
      "epoch": 1.2541543864049665,
      "eval_loss": 1.3344130516052246,
      "eval_runtime": 243.5732,
      "eval_samples_per_second": 523.752,
      "eval_steps_per_second": 16.369,
      "step": 20000
    },
    {
      "epoch": 1.2604251583369912,
      "grad_norm": 0.13364413380622864,
      "learning_rate": 1.5224093783204725e-05,
      "loss": 1.5036,
      "step": 20100
    },
    {
      "epoch": 1.2666959302690162,
      "grad_norm": 68.8973617553711,
      "learning_rate": 1.518925603999373e-05,
      "loss": 1.1728,
      "step": 20200
    },
    {
      "epoch": 1.272966702201041,
      "grad_norm": 2.0211031436920166,
      "learning_rate": 1.5154418296782736e-05,
      "loss": 1.6058,
      "step": 20300
    },
    {
      "epoch": 1.2792374741330659,
      "grad_norm": 16.78483009338379,
      "learning_rate": 1.511958055357174e-05,
      "loss": 1.5191,
      "step": 20400
    },
    {
      "epoch": 1.2855082460650906,
      "grad_norm": 36.06229019165039,
      "learning_rate": 1.5084742810360747e-05,
      "loss": 1.4516,
      "step": 20500
    },
    {
      "epoch": 1.2855082460650906,
      "eval_loss": 1.320965051651001,
      "eval_runtime": 237.9506,
      "eval_samples_per_second": 536.128,
      "eval_steps_per_second": 16.756,
      "step": 20500
    },
    {
      "epoch": 1.2917790179971154,
      "grad_norm": 0.42136240005493164,
      "learning_rate": 1.504990506714975e-05,
      "loss": 1.3485,
      "step": 20600
    },
    {
      "epoch": 1.2980497899291403,
      "grad_norm": 69.18399810791016,
      "learning_rate": 1.5015067323938756e-05,
      "loss": 1.2598,
      "step": 20700
    },
    {
      "epoch": 1.304320561861165,
      "grad_norm": 0.9956406354904175,
      "learning_rate": 1.4980229580727761e-05,
      "loss": 1.5871,
      "step": 20800
    },
    {
      "epoch": 1.31059133379319,
      "grad_norm": 296.9071044921875,
      "learning_rate": 1.4945391837516767e-05,
      "loss": 1.1965,
      "step": 20900
    },
    {
      "epoch": 1.3168621057252148,
      "grad_norm": 135.63108825683594,
      "learning_rate": 1.4910554094305772e-05,
      "loss": 1.3983,
      "step": 21000
    },
    {
      "epoch": 1.3168621057252148,
      "eval_loss": 1.2516661882400513,
      "eval_runtime": 243.3539,
      "eval_samples_per_second": 524.224,
      "eval_steps_per_second": 16.384,
      "step": 21000
    },
    {
      "epoch": 1.3231328776572395,
      "grad_norm": 0.29125073552131653,
      "learning_rate": 1.4875716351094778e-05,
      "loss": 1.2605,
      "step": 21100
    },
    {
      "epoch": 1.3294036495892645,
      "grad_norm": 120.13431549072266,
      "learning_rate": 1.4840878607883781e-05,
      "loss": 1.5629,
      "step": 21200
    },
    {
      "epoch": 1.3356744215212892,
      "grad_norm": 0.6574529409408569,
      "learning_rate": 1.4806040864672787e-05,
      "loss": 1.0668,
      "step": 21300
    },
    {
      "epoch": 1.3419451934533142,
      "grad_norm": 0.08501740545034409,
      "learning_rate": 1.4771203121461792e-05,
      "loss": 1.1879,
      "step": 21400
    },
    {
      "epoch": 1.348215965385339,
      "grad_norm": 0.06920505315065384,
      "learning_rate": 1.4736365378250798e-05,
      "loss": 1.132,
      "step": 21500
    },
    {
      "epoch": 1.348215965385339,
      "eval_loss": 1.3881497383117676,
      "eval_runtime": 239.3274,
      "eval_samples_per_second": 533.044,
      "eval_steps_per_second": 16.659,
      "step": 21500
    },
    {
      "epoch": 1.3544867373173637,
      "grad_norm": 119.1258773803711,
      "learning_rate": 1.4701876012471915e-05,
      "loss": 1.7231,
      "step": 21600
    },
    {
      "epoch": 1.3607575092493887,
      "grad_norm": 219.5289764404297,
      "learning_rate": 1.4667038269260918e-05,
      "loss": 1.7636,
      "step": 21700
    },
    {
      "epoch": 1.3670282811814134,
      "grad_norm": 27.880413055419922,
      "learning_rate": 1.4632548903482034e-05,
      "loss": 1.1193,
      "step": 21800
    },
    {
      "epoch": 1.3732990531134384,
      "grad_norm": 5.331712245941162,
      "learning_rate": 1.459771116027104e-05,
      "loss": 1.4662,
      "step": 21900
    },
    {
      "epoch": 1.379569825045463,
      "grad_norm": 53.37089538574219,
      "learning_rate": 1.4562873417060043e-05,
      "loss": 2.0394,
      "step": 22000
    },
    {
      "epoch": 1.379569825045463,
      "eval_loss": 1.1926569938659668,
      "eval_runtime": 241.0069,
      "eval_samples_per_second": 529.329,
      "eval_steps_per_second": 16.543,
      "step": 22000
    },
    {
      "epoch": 1.3858405969774878,
      "grad_norm": 22.226316452026367,
      "learning_rate": 1.452803567384905e-05,
      "loss": 1.1535,
      "step": 22100
    },
    {
      "epoch": 1.3921113689095128,
      "grad_norm": 2.272599458694458,
      "learning_rate": 1.4493197930638054e-05,
      "loss": 1.4592,
      "step": 22200
    },
    {
      "epoch": 1.3983821408415376,
      "grad_norm": 25.961870193481445,
      "learning_rate": 1.445836018742706e-05,
      "loss": 1.276,
      "step": 22300
    },
    {
      "epoch": 1.4046529127735625,
      "grad_norm": 73.93904113769531,
      "learning_rate": 1.4423522444216065e-05,
      "loss": 1.2984,
      "step": 22400
    },
    {
      "epoch": 1.4109236847055873,
      "grad_norm": 122.38665771484375,
      "learning_rate": 1.438868470100507e-05,
      "loss": 0.9741,
      "step": 22500
    },
    {
      "epoch": 1.4109236847055873,
      "eval_loss": 1.2707290649414062,
      "eval_runtime": 241.8021,
      "eval_samples_per_second": 527.588,
      "eval_steps_per_second": 16.489,
      "step": 22500
    },
    {
      "epoch": 1.417194456637612,
      "grad_norm": 402.4999084472656,
      "learning_rate": 1.4353846957794076e-05,
      "loss": 1.4253,
      "step": 22600
    },
    {
      "epoch": 1.423465228569637,
      "grad_norm": 0.6434441208839417,
      "learning_rate": 1.4319009214583079e-05,
      "loss": 1.0769,
      "step": 22700
    },
    {
      "epoch": 1.4297360005016617,
      "grad_norm": 43.16348648071289,
      "learning_rate": 1.4284171471372085e-05,
      "loss": 0.8276,
      "step": 22800
    },
    {
      "epoch": 1.4360067724336867,
      "grad_norm": 91.6303482055664,
      "learning_rate": 1.424933372816109e-05,
      "loss": 1.2689,
      "step": 22900
    },
    {
      "epoch": 1.4422775443657114,
      "grad_norm": 123.81659698486328,
      "learning_rate": 1.4214495984950096e-05,
      "loss": 1.4817,
      "step": 23000
    },
    {
      "epoch": 1.4422775443657114,
      "eval_loss": 1.2094941139221191,
      "eval_runtime": 235.8497,
      "eval_samples_per_second": 540.904,
      "eval_steps_per_second": 16.905,
      "step": 23000
    },
    {
      "epoch": 1.4485483162977362,
      "grad_norm": 33.1621208190918,
      "learning_rate": 1.4179658241739101e-05,
      "loss": 1.1522,
      "step": 23100
    },
    {
      "epoch": 1.4548190882297611,
      "grad_norm": 0.6552605628967285,
      "learning_rate": 1.4144820498528107e-05,
      "loss": 0.8978,
      "step": 23200
    },
    {
      "epoch": 1.4610898601617859,
      "grad_norm": 1.6054786443710327,
      "learning_rate": 1.410998275531711e-05,
      "loss": 1.015,
      "step": 23300
    },
    {
      "epoch": 1.4673606320938108,
      "grad_norm": 30.116901397705078,
      "learning_rate": 1.4075145012106117e-05,
      "loss": 1.0351,
      "step": 23400
    },
    {
      "epoch": 1.4736314040258356,
      "grad_norm": 74.74423217773438,
      "learning_rate": 1.4040307268895121e-05,
      "loss": 1.3959,
      "step": 23500
    },
    {
      "epoch": 1.4736314040258356,
      "eval_loss": 1.1969189643859863,
      "eval_runtime": 238.5287,
      "eval_samples_per_second": 534.829,
      "eval_steps_per_second": 16.715,
      "step": 23500
    },
    {
      "epoch": 1.4799021759578603,
      "grad_norm": 4.446337699890137,
      "learning_rate": 1.4005469525684128e-05,
      "loss": 1.2879,
      "step": 23600
    },
    {
      "epoch": 1.4861729478898853,
      "grad_norm": 132.61671447753906,
      "learning_rate": 1.3970631782473132e-05,
      "loss": 1.0651,
      "step": 23700
    },
    {
      "epoch": 1.49244371982191,
      "grad_norm": 64.33197784423828,
      "learning_rate": 1.3935794039262139e-05,
      "loss": 1.1601,
      "step": 23800
    },
    {
      "epoch": 1.498714491753935,
      "grad_norm": 0.5995836853981018,
      "learning_rate": 1.3900956296051142e-05,
      "loss": 1.0034,
      "step": 23900
    },
    {
      "epoch": 1.5049852636859598,
      "grad_norm": 0.1931271255016327,
      "learning_rate": 1.3866118552840148e-05,
      "loss": 1.3386,
      "step": 24000
    },
    {
      "epoch": 1.5049852636859598,
      "eval_loss": 1.1590368747711182,
      "eval_runtime": 241.3271,
      "eval_samples_per_second": 528.627,
      "eval_steps_per_second": 16.521,
      "step": 24000
    },
    {
      "epoch": 1.5112560356179845,
      "grad_norm": 38.5876579284668,
      "learning_rate": 1.3831280809629153e-05,
      "loss": 1.142,
      "step": 24100
    },
    {
      "epoch": 1.5175268075500095,
      "grad_norm": 8.049750328063965,
      "learning_rate": 1.3796443066418159e-05,
      "loss": 1.3495,
      "step": 24200
    },
    {
      "epoch": 1.5237975794820342,
      "grad_norm": 32.30927658081055,
      "learning_rate": 1.3761605323207164e-05,
      "loss": 0.9993,
      "step": 24300
    },
    {
      "epoch": 1.5300683514140592,
      "grad_norm": 0.47087952494621277,
      "learning_rate": 1.372676757999617e-05,
      "loss": 0.9363,
      "step": 24400
    },
    {
      "epoch": 1.536339123346084,
      "grad_norm": 160.12139892578125,
      "learning_rate": 1.3691929836785175e-05,
      "loss": 1.4402,
      "step": 24500
    },
    {
      "epoch": 1.536339123346084,
      "eval_loss": 1.2178274393081665,
      "eval_runtime": 241.2634,
      "eval_samples_per_second": 528.767,
      "eval_steps_per_second": 16.526,
      "step": 24500
    },
    {
      "epoch": 1.5426098952781087,
      "grad_norm": 43.729827880859375,
      "learning_rate": 1.3657092093574181e-05,
      "loss": 1.0648,
      "step": 24600
    },
    {
      "epoch": 1.5488806672101336,
      "grad_norm": 16.396068572998047,
      "learning_rate": 1.3622254350363184e-05,
      "loss": 1.5102,
      "step": 24700
    },
    {
      "epoch": 1.5551514391421584,
      "grad_norm": 280.9241027832031,
      "learning_rate": 1.358741660715219e-05,
      "loss": 1.3415,
      "step": 24800
    },
    {
      "epoch": 1.5614222110741833,
      "grad_norm": 0.3944130539894104,
      "learning_rate": 1.3552578863941195e-05,
      "loss": 0.7441,
      "step": 24900
    },
    {
      "epoch": 1.567692983006208,
      "grad_norm": 242.84613037109375,
      "learning_rate": 1.3517741120730201e-05,
      "loss": 0.901,
      "step": 25000
    },
    {
      "epoch": 1.567692983006208,
      "eval_loss": 1.1982382535934448,
      "eval_runtime": 241.5382,
      "eval_samples_per_second": 528.165,
      "eval_steps_per_second": 16.507,
      "step": 25000
    },
    {
      "epoch": 1.5739637549382328,
      "grad_norm": 61.62953567504883,
      "learning_rate": 1.3482903377519206e-05,
      "loss": 1.3147,
      "step": 25100
    },
    {
      "epoch": 1.5802345268702578,
      "grad_norm": 2.465519905090332,
      "learning_rate": 1.3448065634308212e-05,
      "loss": 0.971,
      "step": 25200
    },
    {
      "epoch": 1.5865052988022825,
      "grad_norm": 184.7733612060547,
      "learning_rate": 1.3413227891097215e-05,
      "loss": 0.9988,
      "step": 25300
    },
    {
      "epoch": 1.5927760707343075,
      "grad_norm": 221.9571533203125,
      "learning_rate": 1.337839014788622e-05,
      "loss": 1.1445,
      "step": 25400
    },
    {
      "epoch": 1.5990468426663322,
      "grad_norm": 14.548208236694336,
      "learning_rate": 1.3343552404675226e-05,
      "loss": 1.1018,
      "step": 25500
    },
    {
      "epoch": 1.5990468426663322,
      "eval_loss": 1.142329454421997,
      "eval_runtime": 238.9747,
      "eval_samples_per_second": 533.831,
      "eval_steps_per_second": 16.684,
      "step": 25500
    },
    {
      "epoch": 1.605317614598357,
      "grad_norm": 0.4988707900047302,
      "learning_rate": 1.330871466146423e-05,
      "loss": 1.0902,
      "step": 25600
    },
    {
      "epoch": 1.611588386530382,
      "grad_norm": 213.1658477783203,
      "learning_rate": 1.3273876918253237e-05,
      "loss": 1.2577,
      "step": 25700
    },
    {
      "epoch": 1.6178591584624067,
      "grad_norm": 74.17716217041016,
      "learning_rate": 1.3239039175042242e-05,
      "loss": 1.2005,
      "step": 25800
    },
    {
      "epoch": 1.6241299303944317,
      "grad_norm": 196.46742248535156,
      "learning_rate": 1.3204201431831248e-05,
      "loss": 1.2839,
      "step": 25900
    },
    {
      "epoch": 1.6304007023264564,
      "grad_norm": 264.5187072753906,
      "learning_rate": 1.3169363688620251e-05,
      "loss": 1.4122,
      "step": 26000
    },
    {
      "epoch": 1.6304007023264564,
      "eval_loss": 1.1125129461288452,
      "eval_runtime": 238.2144,
      "eval_samples_per_second": 535.534,
      "eval_steps_per_second": 16.737,
      "step": 26000
    },
    {
      "epoch": 1.6366714742584811,
      "grad_norm": 0.5429248213768005,
      "learning_rate": 1.3134525945409257e-05,
      "loss": 0.7832,
      "step": 26100
    },
    {
      "epoch": 1.642942246190506,
      "grad_norm": 0.07243086397647858,
      "learning_rate": 1.3099688202198262e-05,
      "loss": 1.3278,
      "step": 26200
    },
    {
      "epoch": 1.6492130181225308,
      "grad_norm": 176.74636840820312,
      "learning_rate": 1.3064850458987268e-05,
      "loss": 1.2055,
      "step": 26300
    },
    {
      "epoch": 1.6554837900545558,
      "grad_norm": 1.1564711332321167,
      "learning_rate": 1.3030012715776273e-05,
      "loss": 1.5814,
      "step": 26400
    },
    {
      "epoch": 1.6617545619865806,
      "grad_norm": 0.3095082640647888,
      "learning_rate": 1.299517497256528e-05,
      "loss": 1.0393,
      "step": 26500
    },
    {
      "epoch": 1.6617545619865806,
      "eval_loss": 1.0945708751678467,
      "eval_runtime": 240.297,
      "eval_samples_per_second": 530.893,
      "eval_steps_per_second": 16.592,
      "step": 26500
    },
    {
      "epoch": 1.6680253339186053,
      "grad_norm": 0.8863621354103088,
      "learning_rate": 1.2960337229354282e-05,
      "loss": 1.4531,
      "step": 26600
    },
    {
      "epoch": 1.6742961058506303,
      "grad_norm": 0.15211889147758484,
      "learning_rate": 1.2925499486143289e-05,
      "loss": 1.4162,
      "step": 26700
    },
    {
      "epoch": 1.680566877782655,
      "grad_norm": 0.271015465259552,
      "learning_rate": 1.2890661742932293e-05,
      "loss": 0.8498,
      "step": 26800
    },
    {
      "epoch": 1.68683764971468,
      "grad_norm": 1.462451457977295,
      "learning_rate": 1.285617237715341e-05,
      "loss": 1.1318,
      "step": 26900
    },
    {
      "epoch": 1.6931084216467047,
      "grad_norm": 1.1144922971725464,
      "learning_rate": 1.2821334633942416e-05,
      "loss": 1.3287,
      "step": 27000
    },
    {
      "epoch": 1.6931084216467047,
      "eval_loss": 1.0439221858978271,
      "eval_runtime": 239.1496,
      "eval_samples_per_second": 533.44,
      "eval_steps_per_second": 16.672,
      "step": 27000
    },
    {
      "epoch": 1.6993791935787295,
      "grad_norm": 1.3803671598434448,
      "learning_rate": 1.2786496890731419e-05,
      "loss": 1.0886,
      "step": 27100
    },
    {
      "epoch": 1.7056499655107544,
      "grad_norm": 51.79226303100586,
      "learning_rate": 1.2752007524952535e-05,
      "loss": 0.8991,
      "step": 27200
    },
    {
      "epoch": 1.7119207374427792,
      "grad_norm": 17.195894241333008,
      "learning_rate": 1.2717169781741541e-05,
      "loss": 0.7563,
      "step": 27300
    },
    {
      "epoch": 1.7181915093748041,
      "grad_norm": 0.548939049243927,
      "learning_rate": 1.2682332038530544e-05,
      "loss": 0.9284,
      "step": 27400
    },
    {
      "epoch": 1.7244622813068289,
      "grad_norm": 3.179530620574951,
      "learning_rate": 1.264749429531955e-05,
      "loss": 1.3388,
      "step": 27500
    },
    {
      "epoch": 1.7244622813068289,
      "eval_loss": 1.0940054655075073,
      "eval_runtime": 239.4702,
      "eval_samples_per_second": 532.726,
      "eval_steps_per_second": 16.649,
      "step": 27500
    },
    {
      "epoch": 1.7307330532388536,
      "grad_norm": 0.8089356422424316,
      "learning_rate": 1.2612656552108555e-05,
      "loss": 1.2951,
      "step": 27600
    },
    {
      "epoch": 1.7370038251708786,
      "grad_norm": 698.0848388671875,
      "learning_rate": 1.2577818808897562e-05,
      "loss": 0.9789,
      "step": 27700
    },
    {
      "epoch": 1.7432745971029033,
      "grad_norm": 156.7066192626953,
      "learning_rate": 1.2542981065686566e-05,
      "loss": 1.2898,
      "step": 27800
    },
    {
      "epoch": 1.7495453690349283,
      "grad_norm": 59.603519439697266,
      "learning_rate": 1.2508143322475569e-05,
      "loss": 0.9915,
      "step": 27900
    },
    {
      "epoch": 1.755816140966953,
      "grad_norm": 5.36550760269165,
      "learning_rate": 1.2473305579264575e-05,
      "loss": 1.5349,
      "step": 28000
    },
    {
      "epoch": 1.755816140966953,
      "eval_loss": 1.0266426801681519,
      "eval_runtime": 240.254,
      "eval_samples_per_second": 530.988,
      "eval_steps_per_second": 16.595,
      "step": 28000
    },
    {
      "epoch": 1.7620869128989778,
      "grad_norm": 3.0849006175994873,
      "learning_rate": 1.243846783605358e-05,
      "loss": 1.124,
      "step": 28100
    },
    {
      "epoch": 1.7683576848310028,
      "grad_norm": 2.890775442123413,
      "learning_rate": 1.2403630092842586e-05,
      "loss": 0.809,
      "step": 28200
    },
    {
      "epoch": 1.7746284567630275,
      "grad_norm": 0.6994801163673401,
      "learning_rate": 1.2368792349631591e-05,
      "loss": 0.9617,
      "step": 28300
    },
    {
      "epoch": 1.7808992286950525,
      "grad_norm": 14.703944206237793,
      "learning_rate": 1.2333954606420597e-05,
      "loss": 1.3061,
      "step": 28400
    },
    {
      "epoch": 1.7871700006270772,
      "grad_norm": 188.39633178710938,
      "learning_rate": 1.2299116863209602e-05,
      "loss": 1.1323,
      "step": 28500
    },
    {
      "epoch": 1.7871700006270772,
      "eval_loss": 1.0488332509994507,
      "eval_runtime": 240.6796,
      "eval_samples_per_second": 530.049,
      "eval_steps_per_second": 16.566,
      "step": 28500
    },
    {
      "epoch": 1.793440772559102,
      "grad_norm": 12.853857040405273,
      "learning_rate": 1.2264279119998608e-05,
      "loss": 1.2991,
      "step": 28600
    },
    {
      "epoch": 1.7997115444911267,
      "grad_norm": 17.315292358398438,
      "learning_rate": 1.2229441376787611e-05,
      "loss": 0.8708,
      "step": 28700
    },
    {
      "epoch": 1.8059823164231517,
      "grad_norm": 24.514192581176758,
      "learning_rate": 1.2194603633576618e-05,
      "loss": 0.7493,
      "step": 28800
    },
    {
      "epoch": 1.8122530883551766,
      "grad_norm": 17.776947021484375,
      "learning_rate": 1.2159765890365622e-05,
      "loss": 1.004,
      "step": 28900
    },
    {
      "epoch": 1.8185238602872014,
      "grad_norm": 154.2757110595703,
      "learning_rate": 1.2124928147154629e-05,
      "loss": 1.1477,
      "step": 29000
    },
    {
      "epoch": 1.8185238602872014,
      "eval_loss": 1.0206255912780762,
      "eval_runtime": 238.9764,
      "eval_samples_per_second": 533.827,
      "eval_steps_per_second": 16.684,
      "step": 29000
    },
    {
      "epoch": 1.824794632219226,
      "grad_norm": 174.9512939453125,
      "learning_rate": 1.2090090403943633e-05,
      "loss": 1.1826,
      "step": 29100
    },
    {
      "epoch": 1.8310654041512509,
      "grad_norm": 251.60848999023438,
      "learning_rate": 1.205525266073264e-05,
      "loss": 1.0961,
      "step": 29200
    },
    {
      "epoch": 1.8373361760832758,
      "grad_norm": 15.37478256225586,
      "learning_rate": 1.2020414917521643e-05,
|
"loss": 1.4743, |
|
"step": 29300 |
|
}, |
|
{ |
|
"epoch": 1.8436069480153008, |
|
"grad_norm": 17.250076293945312, |
|
"learning_rate": 1.1985577174310649e-05, |
|
"loss": 0.8413, |
|
"step": 29400 |
|
}, |
|
{ |
|
"epoch": 1.8498777199473255, |
|
"grad_norm": 0.08943232893943787, |
|
"learning_rate": 1.1950739431099654e-05, |
|
"loss": 1.2623, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 1.8498777199473255, |
|
"eval_loss": 1.004668951034546, |
|
"eval_runtime": 241.047, |
|
"eval_samples_per_second": 529.241, |
|
"eval_steps_per_second": 16.54, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 1.8561484918793503, |
|
"grad_norm": 66.96379089355469, |
|
"learning_rate": 1.191590168788866e-05, |
|
"loss": 0.8486, |
|
"step": 29600 |
|
}, |
|
{ |
|
"epoch": 1.862419263811375, |
|
"grad_norm": 62.850799560546875, |
|
"learning_rate": 1.1881063944677665e-05, |
|
"loss": 1.4481, |
|
"step": 29700 |
|
}, |
|
{ |
|
"epoch": 1.8686900357434, |
|
"grad_norm": 1.5179458856582642, |
|
"learning_rate": 1.1846226201466671e-05, |
|
"loss": 1.2704, |
|
"step": 29800 |
|
}, |
|
{ |
|
"epoch": 1.874960807675425, |
|
"grad_norm": 0.09656574577093124, |
|
"learning_rate": 1.1811388458255676e-05, |
|
"loss": 1.1913, |
|
"step": 29900 |
|
}, |
|
{ |
|
"epoch": 1.8812315796074497, |
|
"grad_norm": 0.12182077020406723, |
|
"learning_rate": 1.1776550715044682e-05, |
|
"loss": 0.9369, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 1.8812315796074497, |
|
"eval_loss": 1.0277103185653687, |
|
"eval_runtime": 240.7265, |
|
"eval_samples_per_second": 529.946, |
|
"eval_steps_per_second": 16.562, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 1.8875023515394744, |
|
"grad_norm": 171.4630126953125, |
|
"learning_rate": 1.1741712971833685e-05, |
|
"loss": 1.2427, |
|
"step": 30100 |
|
}, |
|
{ |
|
"epoch": 1.8937731234714992, |
|
"grad_norm": 14.272507667541504, |
|
"learning_rate": 1.1706875228622691e-05, |
|
"loss": 1.0576, |
|
"step": 30200 |
|
}, |
|
{ |
|
"epoch": 1.9000438954035241, |
|
"grad_norm": 8.003202438354492, |
|
"learning_rate": 1.1672037485411696e-05, |
|
"loss": 0.9188, |
|
"step": 30300 |
|
}, |
|
{ |
|
"epoch": 1.906314667335549, |
|
"grad_norm": 72.2535629272461, |
|
"learning_rate": 1.1637199742200702e-05, |
|
"loss": 1.3227, |
|
"step": 30400 |
|
}, |
|
{ |
|
"epoch": 1.9125854392675739, |
|
"grad_norm": 60.970176696777344, |
|
"learning_rate": 1.1602361998989707e-05, |
|
"loss": 1.4614, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 1.9125854392675739, |
|
"eval_loss": 1.0549676418304443, |
|
"eval_runtime": 232.0087, |
|
"eval_samples_per_second": 549.859, |
|
"eval_steps_per_second": 17.185, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 1.9188562111995986, |
|
"grad_norm": 113.54409790039062, |
|
"learning_rate": 1.1567524255778713e-05, |
|
"loss": 1.2316, |
|
"step": 30600 |
|
}, |
|
{ |
|
"epoch": 1.9251269831316233, |
|
"grad_norm": 1.6219086647033691, |
|
"learning_rate": 1.1532686512567716e-05, |
|
"loss": 0.9487, |
|
"step": 30700 |
|
}, |
|
{ |
|
"epoch": 1.9313977550636483, |
|
"grad_norm": 74.66547393798828, |
|
"learning_rate": 1.1497848769356722e-05, |
|
"loss": 1.1651, |
|
"step": 30800 |
|
}, |
|
{ |
|
"epoch": 1.9376685269956733, |
|
"grad_norm": 0.036245282739400864, |
|
"learning_rate": 1.1463011026145727e-05, |
|
"loss": 1.1622, |
|
"step": 30900 |
|
}, |
|
{ |
|
"epoch": 1.943939298927698, |
|
"grad_norm": 1.6117188930511475, |
|
"learning_rate": 1.1428173282934732e-05, |
|
"loss": 1.1801, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 1.943939298927698, |
|
"eval_loss": 0.9981088042259216, |
|
"eval_runtime": 241.2373, |
|
"eval_samples_per_second": 528.824, |
|
"eval_steps_per_second": 16.527, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 1.9502100708597228, |
|
"grad_norm": 4.923341751098633, |
|
"learning_rate": 1.1393335539723738e-05, |
|
"loss": 0.8798, |
|
"step": 31100 |
|
}, |
|
{ |
|
"epoch": 1.9564808427917475, |
|
"grad_norm": 214.46116638183594, |
|
"learning_rate": 1.1358497796512741e-05, |
|
"loss": 0.7196, |
|
"step": 31200 |
|
}, |
|
{ |
|
"epoch": 1.9627516147237725, |
|
"grad_norm": 16.161603927612305, |
|
"learning_rate": 1.1323660053301749e-05, |
|
"loss": 1.2003, |
|
"step": 31300 |
|
}, |
|
{ |
|
"epoch": 1.9690223866557974, |
|
"grad_norm": 249.83189392089844, |
|
"learning_rate": 1.1289170687522864e-05, |
|
"loss": 1.1823, |
|
"step": 31400 |
|
}, |
|
{ |
|
"epoch": 1.9752931585878222, |
|
"grad_norm": 18.310449600219727, |
|
"learning_rate": 1.1254332944311868e-05, |
|
"loss": 1.1453, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 1.9752931585878222, |
|
"eval_loss": 1.0320409536361694, |
|
"eval_runtime": 237.3095, |
|
"eval_samples_per_second": 537.576, |
|
"eval_steps_per_second": 16.801, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 1.981563930519847, |
|
"grad_norm": 221.7801513671875, |
|
"learning_rate": 1.1219495201100875e-05, |
|
"loss": 1.4751, |
|
"step": 31600 |
|
}, |
|
{ |
|
"epoch": 1.9878347024518717, |
|
"grad_norm": 20.95890235900879, |
|
"learning_rate": 1.1184657457889878e-05, |
|
"loss": 0.8502, |
|
"step": 31700 |
|
}, |
|
{ |
|
"epoch": 1.9941054743838966, |
|
"grad_norm": 2.7732744216918945, |
|
"learning_rate": 1.1149819714678884e-05, |
|
"loss": 0.8757, |
|
"step": 31800 |
|
}, |
|
{ |
|
"epoch": 2.0003762463159216, |
|
"grad_norm": 1.1170719861984253, |
|
"learning_rate": 1.1114981971467889e-05, |
|
"loss": 1.0489, |
|
"step": 31900 |
|
}, |
|
{ |
|
"epoch": 2.0066470182479463, |
|
"grad_norm": 31.308385848999023, |
|
"learning_rate": 1.1080144228256895e-05, |
|
"loss": 1.4672, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 2.0066470182479463, |
|
"eval_loss": 1.0570933818817139, |
|
"eval_runtime": 236.2248, |
|
"eval_samples_per_second": 540.045, |
|
"eval_steps_per_second": 16.878, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 2.012917790179971, |
|
"grad_norm": 0.2743261754512787, |
|
"learning_rate": 1.10453064850459e-05, |
|
"loss": 0.9474, |
|
"step": 32100 |
|
}, |
|
{ |
|
"epoch": 2.019188562111996, |
|
"grad_norm": 2.2496840953826904, |
|
"learning_rate": 1.1010468741834906e-05, |
|
"loss": 0.8037, |
|
"step": 32200 |
|
}, |
|
{ |
|
"epoch": 2.025459334044021, |
|
"grad_norm": 32.999935150146484, |
|
"learning_rate": 1.0975630998623909e-05, |
|
"loss": 0.9782, |
|
"step": 32300 |
|
}, |
|
{ |
|
"epoch": 2.0317301059760458, |
|
"grad_norm": 19.94236183166504, |
|
"learning_rate": 1.0940793255412915e-05, |
|
"loss": 0.6943, |
|
"step": 32400 |
|
}, |
|
{ |
|
"epoch": 2.0380008779080705, |
|
"grad_norm": 0.7693130373954773, |
|
"learning_rate": 1.090595551220192e-05, |
|
"loss": 1.0097, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 2.0380008779080705, |
|
"eval_loss": 0.9797225594520569, |
|
"eval_runtime": 237.8696, |
|
"eval_samples_per_second": 536.311, |
|
"eval_steps_per_second": 16.761, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 2.0442716498400952, |
|
"grad_norm": 156.60507202148438, |
|
"learning_rate": 1.0871117768990926e-05, |
|
"loss": 0.9067, |
|
"step": 32600 |
|
}, |
|
{ |
|
"epoch": 2.05054242177212, |
|
"grad_norm": 45.05233383178711, |
|
"learning_rate": 1.083628002577993e-05, |
|
"loss": 1.09, |
|
"step": 32700 |
|
}, |
|
{ |
|
"epoch": 2.056813193704145, |
|
"grad_norm": 0.9790059328079224, |
|
"learning_rate": 1.0801442282568937e-05, |
|
"loss": 0.8464, |
|
"step": 32800 |
|
}, |
|
{ |
|
"epoch": 2.06308396563617, |
|
"grad_norm": 311.8387145996094, |
|
"learning_rate": 1.0766604539357942e-05, |
|
"loss": 0.9359, |
|
"step": 32900 |
|
}, |
|
{ |
|
"epoch": 2.0693547375681947, |
|
"grad_norm": 2.4389493465423584, |
|
"learning_rate": 1.0731766796146948e-05, |
|
"loss": 0.813, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 2.0693547375681947, |
|
"eval_loss": 0.990721583366394, |
|
"eval_runtime": 239.867, |
|
"eval_samples_per_second": 531.845, |
|
"eval_steps_per_second": 16.622, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 2.0756255095002194, |
|
"grad_norm": 40.27507781982422, |
|
"learning_rate": 1.0696929052935951e-05, |
|
"loss": 0.8738, |
|
"step": 33100 |
|
}, |
|
{ |
|
"epoch": 2.081896281432244, |
|
"grad_norm": 0.029316190630197525, |
|
"learning_rate": 1.0662091309724957e-05, |
|
"loss": 0.8178, |
|
"step": 33200 |
|
}, |
|
{ |
|
"epoch": 2.0881670533642693, |
|
"grad_norm": 0.06512907892465591, |
|
"learning_rate": 1.0627253566513962e-05, |
|
"loss": 1.1704, |
|
"step": 33300 |
|
}, |
|
{ |
|
"epoch": 2.094437825296294, |
|
"grad_norm": 14.495019912719727, |
|
"learning_rate": 1.0592415823302968e-05, |
|
"loss": 1.0073, |
|
"step": 33400 |
|
}, |
|
{ |
|
"epoch": 2.100708597228319, |
|
"grad_norm": 85.92517852783203, |
|
"learning_rate": 1.0557578080091973e-05, |
|
"loss": 1.1849, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 2.100708597228319, |
|
"eval_loss": 0.9582126140594482, |
|
"eval_runtime": 238.5255, |
|
"eval_samples_per_second": 534.836, |
|
"eval_steps_per_second": 16.715, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 2.1069793691603436, |
|
"grad_norm": 0.8284154534339905, |
|
"learning_rate": 1.0522740336880976e-05, |
|
"loss": 0.7795, |
|
"step": 33600 |
|
}, |
|
{ |
|
"epoch": 2.1132501410923683, |
|
"grad_norm": 3.656404972076416, |
|
"learning_rate": 1.0487902593669982e-05, |
|
"loss": 0.7688, |
|
"step": 33700 |
|
}, |
|
{ |
|
"epoch": 2.1195209130243935, |
|
"grad_norm": 0.08456479012966156, |
|
"learning_rate": 1.0453064850458987e-05, |
|
"loss": 0.9465, |
|
"step": 33800 |
|
}, |
|
{ |
|
"epoch": 2.1257916849564182, |
|
"grad_norm": 27.962339401245117, |
|
"learning_rate": 1.0418227107247993e-05, |
|
"loss": 1.0883, |
|
"step": 33900 |
|
}, |
|
{ |
|
"epoch": 2.132062456888443, |
|
"grad_norm": 37.31398010253906, |
|
"learning_rate": 1.0383389364036998e-05, |
|
"loss": 0.7711, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 2.132062456888443, |
|
"eval_loss": 0.955656886100769, |
|
"eval_runtime": 237.3977, |
|
"eval_samples_per_second": 537.377, |
|
"eval_steps_per_second": 16.795, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 2.1383332288204677, |
|
"grad_norm": 3.700526714324951, |
|
"learning_rate": 1.0348551620826004e-05, |
|
"loss": 0.9767, |
|
"step": 34100 |
|
}, |
|
{ |
|
"epoch": 2.1446040007524925, |
|
"grad_norm": 111.15718841552734, |
|
"learning_rate": 1.0313713877615009e-05, |
|
"loss": 0.6702, |
|
"step": 34200 |
|
}, |
|
{ |
|
"epoch": 2.1508747726845177, |
|
"grad_norm": 0.5821614861488342, |
|
"learning_rate": 1.0278876134404015e-05, |
|
"loss": 0.9444, |
|
"step": 34300 |
|
}, |
|
{ |
|
"epoch": 2.1571455446165424, |
|
"grad_norm": 20.9290771484375, |
|
"learning_rate": 1.0244038391193018e-05, |
|
"loss": 0.8741, |
|
"step": 34400 |
|
}, |
|
{ |
|
"epoch": 2.163416316548567, |
|
"grad_norm": 52.165771484375, |
|
"learning_rate": 1.0209200647982025e-05, |
|
"loss": 1.0717, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 2.163416316548567, |
|
"eval_loss": 0.9526209831237793, |
|
"eval_runtime": 235.6861, |
|
"eval_samples_per_second": 541.279, |
|
"eval_steps_per_second": 16.917, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 2.169687088480592, |
|
"grad_norm": 0.01671871915459633, |
|
"learning_rate": 1.017436290477103e-05, |
|
"loss": 0.8584, |
|
"step": 34600 |
|
}, |
|
{ |
|
"epoch": 2.1759578604126166, |
|
"grad_norm": 12.125747680664062, |
|
"learning_rate": 1.0139525161560035e-05, |
|
"loss": 0.8926, |
|
"step": 34700 |
|
}, |
|
{ |
|
"epoch": 2.182228632344642, |
|
"grad_norm": 114.18839263916016, |
|
"learning_rate": 1.010468741834904e-05, |
|
"loss": 0.8567, |
|
"step": 34800 |
|
}, |
|
{ |
|
"epoch": 2.1884994042766666, |
|
"grad_norm": 0.2531642019748688, |
|
"learning_rate": 1.0069849675138046e-05, |
|
"loss": 0.71, |
|
"step": 34900 |
|
}, |
|
{ |
|
"epoch": 2.1947701762086913, |
|
"grad_norm": 160.1878662109375, |
|
"learning_rate": 1.0035360309359161e-05, |
|
"loss": 1.1285, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 2.1947701762086913, |
|
"eval_loss": 0.958905816078186, |
|
"eval_runtime": 235.6565, |
|
"eval_samples_per_second": 541.347, |
|
"eval_steps_per_second": 16.919, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 2.201040948140716, |
|
"grad_norm": 42.54741287231445, |
|
"learning_rate": 1.0000522566148166e-05, |
|
"loss": 0.8999, |
|
"step": 35100 |
|
}, |
|
{ |
|
"epoch": 2.207311720072741, |
|
"grad_norm": 291.0119323730469, |
|
"learning_rate": 9.96568482293717e-06, |
|
"loss": 0.8459, |
|
"step": 35200 |
|
}, |
|
{ |
|
"epoch": 2.213582492004766, |
|
"grad_norm": 3.8935604095458984, |
|
"learning_rate": 9.930847079726175e-06, |
|
"loss": 1.0608, |
|
"step": 35300 |
|
}, |
|
{ |
|
"epoch": 2.2198532639367907, |
|
"grad_norm": 73.73111724853516, |
|
"learning_rate": 9.896009336515181e-06, |
|
"loss": 0.6115, |
|
"step": 35400 |
|
}, |
|
{ |
|
"epoch": 2.2261240358688155, |
|
"grad_norm": 137.14573669433594, |
|
"learning_rate": 9.861171593304186e-06, |
|
"loss": 1.2468, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 2.2261240358688155, |
|
"eval_loss": 0.9768953323364258, |
|
"eval_runtime": 237.6341, |
|
"eval_samples_per_second": 536.842, |
|
"eval_steps_per_second": 16.778, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 2.23239480780084, |
|
"grad_norm": 72.25751495361328, |
|
"learning_rate": 9.826333850093192e-06, |
|
"loss": 0.9987, |
|
"step": 35600 |
|
}, |
|
{ |
|
"epoch": 2.238665579732865, |
|
"grad_norm": 310.7902526855469, |
|
"learning_rate": 9.791496106882197e-06, |
|
"loss": 0.9186, |
|
"step": 35700 |
|
}, |
|
{ |
|
"epoch": 2.24493635166489, |
|
"grad_norm": 0.11791533976793289, |
|
"learning_rate": 9.756658363671202e-06, |
|
"loss": 1.0505, |
|
"step": 35800 |
|
}, |
|
{ |
|
"epoch": 2.251207123596915, |
|
"grad_norm": 43.25834274291992, |
|
"learning_rate": 9.721820620460208e-06, |
|
"loss": 0.6253, |
|
"step": 35900 |
|
}, |
|
{ |
|
"epoch": 2.2574778955289396, |
|
"grad_norm": 29.648263931274414, |
|
"learning_rate": 9.686982877249213e-06, |
|
"loss": 0.6523, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 2.2574778955289396, |
|
"eval_loss": 0.9501162171363831, |
|
"eval_runtime": 238.1223, |
|
"eval_samples_per_second": 535.742, |
|
"eval_steps_per_second": 16.743, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 2.2637486674609644, |
|
"grad_norm": 5.313396453857422, |
|
"learning_rate": 9.652145134038217e-06, |
|
"loss": 0.8252, |
|
"step": 36100 |
|
}, |
|
{ |
|
"epoch": 2.270019439392989, |
|
"grad_norm": 0.04373766854405403, |
|
"learning_rate": 9.617307390827224e-06, |
|
"loss": 0.9793, |
|
"step": 36200 |
|
}, |
|
{ |
|
"epoch": 2.2762902113250143, |
|
"grad_norm": 118.00153350830078, |
|
"learning_rate": 9.582469647616228e-06, |
|
"loss": 0.8845, |
|
"step": 36300 |
|
}, |
|
{ |
|
"epoch": 2.282560983257039, |
|
"grad_norm": 99.67394256591797, |
|
"learning_rate": 9.547631904405233e-06, |
|
"loss": 1.0121, |
|
"step": 36400 |
|
}, |
|
{ |
|
"epoch": 2.288831755189064, |
|
"grad_norm": 0.7632407546043396, |
|
"learning_rate": 9.51279416119424e-06, |
|
"loss": 0.9849, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 2.288831755189064, |
|
"eval_loss": 0.9245060086250305, |
|
"eval_runtime": 237.5388, |
|
"eval_samples_per_second": 537.058, |
|
"eval_steps_per_second": 16.785, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 2.2951025271210885, |
|
"grad_norm": 0.21792149543762207, |
|
"learning_rate": 9.477956417983244e-06, |
|
"loss": 1.2937, |
|
"step": 36600 |
|
}, |
|
{ |
|
"epoch": 2.3013732990531133, |
|
"grad_norm": 161.54714965820312, |
|
"learning_rate": 9.443118674772248e-06, |
|
"loss": 1.0484, |
|
"step": 36700 |
|
}, |
|
{ |
|
"epoch": 2.3076440709851385, |
|
"grad_norm": 1.5865380764007568, |
|
"learning_rate": 9.408280931561255e-06, |
|
"loss": 0.8801, |
|
"step": 36800 |
|
}, |
|
{ |
|
"epoch": 2.313914842917163, |
|
"grad_norm": 52.73973846435547, |
|
"learning_rate": 9.37344318835026e-06, |
|
"loss": 0.7552, |
|
"step": 36900 |
|
}, |
|
{ |
|
"epoch": 2.320185614849188, |
|
"grad_norm": 72.2259750366211, |
|
"learning_rate": 9.338605445139266e-06, |
|
"loss": 0.7641, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 2.320185614849188, |
|
"eval_loss": 0.9280443787574768, |
|
"eval_runtime": 234.7895, |
|
"eval_samples_per_second": 543.346, |
|
"eval_steps_per_second": 16.981, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 2.3264563867812127, |
|
"grad_norm": 161.67674255371094, |
|
"learning_rate": 9.30376770192827e-06, |
|
"loss": 0.883, |
|
"step": 37100 |
|
}, |
|
{ |
|
"epoch": 2.3327271587132374, |
|
"grad_norm": 0.07621905952692032, |
|
"learning_rate": 9.269278336149385e-06, |
|
"loss": 0.77, |
|
"step": 37200 |
|
}, |
|
{ |
|
"epoch": 2.3389979306452626, |
|
"grad_norm": 0.2586478292942047, |
|
"learning_rate": 9.234440592938391e-06, |
|
"loss": 1.2699, |
|
"step": 37300 |
|
}, |
|
{ |
|
"epoch": 2.3452687025772874, |
|
"grad_norm": 79.81159973144531, |
|
"learning_rate": 9.199602849727396e-06, |
|
"loss": 0.8766, |
|
"step": 37400 |
|
}, |
|
{ |
|
"epoch": 2.351539474509312, |
|
"grad_norm": 7.059108257293701, |
|
"learning_rate": 9.1647651065164e-06, |
|
"loss": 1.1154, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 2.351539474509312, |
|
"eval_loss": 0.962340772151947, |
|
"eval_runtime": 238.8795, |
|
"eval_samples_per_second": 534.043, |
|
"eval_steps_per_second": 16.69, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 2.357810246441337, |
|
"grad_norm": 1.4081709384918213, |
|
"learning_rate": 9.129927363305405e-06, |
|
"loss": 1.0634, |
|
"step": 37600 |
|
}, |
|
{ |
|
"epoch": 2.3640810183733616, |
|
"grad_norm": 0.605450451374054, |
|
"learning_rate": 9.09508962009441e-06, |
|
"loss": 0.8822, |
|
"step": 37700 |
|
}, |
|
{ |
|
"epoch": 2.370351790305387, |
|
"grad_norm": 1.7804793119430542, |
|
"learning_rate": 9.060251876883416e-06, |
|
"loss": 0.839, |
|
"step": 37800 |
|
}, |
|
{ |
|
"epoch": 2.3766225622374115, |
|
"grad_norm": 0.285157710313797, |
|
"learning_rate": 9.025414133672421e-06, |
|
"loss": 0.684, |
|
"step": 37900 |
|
}, |
|
{ |
|
"epoch": 2.3828933341694363, |
|
"grad_norm": 1.6291695833206177, |
|
"learning_rate": 8.990576390461425e-06, |
|
"loss": 0.8051, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 2.3828933341694363, |
|
"eval_loss": 0.9198396801948547, |
|
"eval_runtime": 235.4699, |
|
"eval_samples_per_second": 541.776, |
|
"eval_steps_per_second": 16.932, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 2.389164106101461, |
|
"grad_norm": 0.22198112308979034, |
|
"learning_rate": 8.955738647250432e-06, |
|
"loss": 0.9585, |
|
"step": 38100 |
|
}, |
|
{ |
|
"epoch": 2.3954348780334858, |
|
"grad_norm": 0.15497685968875885, |
|
"learning_rate": 8.920900904039436e-06, |
|
"loss": 0.7156, |
|
"step": 38200 |
|
}, |
|
{ |
|
"epoch": 2.401705649965511, |
|
"grad_norm": 3.716522216796875, |
|
"learning_rate": 8.886063160828443e-06, |
|
"loss": 0.5271, |
|
"step": 38300 |
|
}, |
|
{ |
|
"epoch": 2.4079764218975357, |
|
"grad_norm": 211.54660034179688, |
|
"learning_rate": 8.851225417617447e-06, |
|
"loss": 0.805, |
|
"step": 38400 |
|
}, |
|
{ |
|
"epoch": 2.4142471938295604, |
|
"grad_norm": 104.68868255615234, |
|
"learning_rate": 8.816387674406452e-06, |
|
"loss": 0.7898, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 2.4142471938295604, |
|
"eval_loss": 0.8785400986671448, |
|
"eval_runtime": 236.3653, |
|
"eval_samples_per_second": 539.724, |
|
"eval_steps_per_second": 16.868, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 2.420517965761585, |
|
"grad_norm": 117.63562774658203, |
|
"learning_rate": 8.781549931195458e-06, |
|
"loss": 0.6935, |
|
"step": 38600 |
|
}, |
|
{ |
|
"epoch": 2.42678873769361, |
|
"grad_norm": 6.395357131958008, |
|
"learning_rate": 8.746712187984463e-06, |
|
"loss": 0.8011, |
|
"step": 38700 |
|
}, |
|
{ |
|
"epoch": 2.433059509625635, |
|
"grad_norm": 146.0078582763672, |
|
"learning_rate": 8.711874444773468e-06, |
|
"loss": 0.9812, |
|
"step": 38800 |
|
}, |
|
{ |
|
"epoch": 2.43933028155766, |
|
"grad_norm": 0.07249762117862701, |
|
"learning_rate": 8.677036701562474e-06, |
|
"loss": 0.4427, |
|
"step": 38900 |
|
}, |
|
{ |
|
"epoch": 2.4456010534896846, |
|
"grad_norm": 113.86747741699219, |
|
"learning_rate": 8.642198958351479e-06, |
|
"loss": 0.492, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 2.4456010534896846, |
|
"eval_loss": 0.9312570095062256, |
|
"eval_runtime": 235.7259, |
|
"eval_samples_per_second": 541.188, |
|
"eval_steps_per_second": 16.914, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 2.4518718254217093, |
|
"grad_norm": 174.28895568847656, |
|
"learning_rate": 8.607361215140483e-06, |
|
"loss": 0.47, |
|
"step": 39100 |
|
}, |
|
{ |
|
"epoch": 2.458142597353734, |
|
"grad_norm": 1.0906648635864258, |
|
"learning_rate": 8.5728718493616e-06, |
|
"loss": 1.1876, |
|
"step": 39200 |
|
}, |
|
{ |
|
"epoch": 2.4644133692857593, |
|
"grad_norm": 1.2390027046203613, |
|
"learning_rate": 8.538034106150604e-06, |
|
"loss": 0.5778, |
|
"step": 39300 |
|
}, |
|
{ |
|
"epoch": 2.470684141217784, |
|
"grad_norm": 8.68694019317627, |
|
"learning_rate": 8.503196362939609e-06, |
|
"loss": 0.6763, |
|
"step": 39400 |
|
}, |
|
{ |
|
"epoch": 2.4769549131498088, |
|
"grad_norm": 0.0290305744856596, |
|
"learning_rate": 8.468358619728615e-06, |
|
"loss": 0.6896, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 2.4769549131498088, |
|
"eval_loss": 0.8978257179260254, |
|
"eval_runtime": 238.5786, |
|
"eval_samples_per_second": 534.717, |
|
"eval_steps_per_second": 16.711, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 2.4832256850818335, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.43386925394973e-06, |
|
"loss": 0.8905, |
|
"step": 39600 |
|
}, |
|
{ |
|
"epoch": 2.4894964570138582, |
|
"grad_norm": 0.6685202121734619, |
|
"learning_rate": 8.399031510738736e-06, |
|
"loss": 0.7845, |
|
"step": 39700 |
|
}, |
|
{ |
|
"epoch": 2.4957672289458834, |
|
"grad_norm": 0.6609179377555847, |
|
"learning_rate": 8.36419376752774e-06, |
|
"loss": 0.8691, |
|
"step": 39800 |
|
}, |
|
{ |
|
"epoch": 2.502038000877908, |
|
"grad_norm": 0.44005250930786133, |
|
"learning_rate": 8.329356024316745e-06, |
|
"loss": 0.55, |
|
"step": 39900 |
|
}, |
|
{ |
|
"epoch": 2.508308772809933, |
|
"grad_norm": 191.84471130371094, |
|
"learning_rate": 8.294518281105752e-06, |
|
"loss": 0.6978, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 2.508308772809933, |
|
"eval_loss": 0.9054428935050964, |
|
"eval_runtime": 236.0808, |
|
"eval_samples_per_second": 540.374, |
|
"eval_steps_per_second": 16.888, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 2.5145795447419577, |
|
"grad_norm": 20.063995361328125, |
|
"learning_rate": 8.259680537894755e-06, |
|
"loss": 0.6378, |
|
"step": 40100 |
|
}, |
|
{ |
|
"epoch": 2.5208503166739824, |
|
"grad_norm": 1.4460866451263428, |
|
"learning_rate": 8.224842794683761e-06, |
|
"loss": 0.895, |
|
"step": 40200 |
|
}, |
|
{ |
|
"epoch": 2.527121088606007, |
|
"grad_norm": 0.06669195741415024, |
|
"learning_rate": 8.190353428904876e-06, |
|
"loss": 0.9683, |
|
"step": 40300 |
|
}, |
|
{ |
|
"epoch": 2.5333918605380323, |
|
"grad_norm": 80.40859985351562, |
|
"learning_rate": 8.155515685693882e-06, |
|
"loss": 0.9373, |
|
"step": 40400 |
|
}, |
|
{ |
|
"epoch": 2.539662632470057, |
|
"grad_norm": 0.014817653223872185, |
|
"learning_rate": 8.120677942482887e-06, |
|
"loss": 0.7406, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 2.539662632470057, |
|
"eval_loss": 0.912805438041687, |
|
"eval_runtime": 230.5789, |
|
"eval_samples_per_second": 553.268, |
|
"eval_steps_per_second": 17.291, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 2.545933404402082, |
|
"grad_norm": 41.673622131347656, |
|
"learning_rate": 8.085840199271891e-06, |
|
"loss": 0.8917, |
|
"step": 40600 |
|
}, |
|
{ |
|
"epoch": 2.5522041763341066, |
|
"grad_norm": 213.1597900390625, |
|
"learning_rate": 8.051002456060898e-06, |
|
"loss": 1.0552, |
|
"step": 40700 |
|
}, |
|
{ |
|
"epoch": 2.5584749482661318, |
|
"grad_norm": 65.40398406982422, |
|
"learning_rate": 8.016164712849902e-06, |
|
"loss": 0.5281, |
|
"step": 40800 |
|
}, |
|
{ |
|
"epoch": 2.5647457201981565, |
|
"grad_norm": 4.673154830932617, |
|
"learning_rate": 7.981326969638907e-06, |
|
"loss": 0.9064, |
|
"step": 40900 |
|
}, |
|
{ |
|
"epoch": 2.5710164921301812, |
|
"grad_norm": 187.15573120117188, |
|
"learning_rate": 7.946489226427913e-06, |
|
"loss": 0.6886, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 2.5710164921301812, |
|
"eval_loss": 0.9048876166343689, |
|
"eval_runtime": 269.6795, |
|
"eval_samples_per_second": 473.05, |
|
"eval_steps_per_second": 14.784, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 2.577287264062206, |
|
"grad_norm": 0.14457735419273376, |
|
"learning_rate": 7.911651483216918e-06, |
|
"loss": 0.7166, |
|
"step": 41100 |
|
}, |
|
{ |
|
"epoch": 2.5835580359942307, |
|
"grad_norm": 126.45314025878906, |
|
"learning_rate": 7.876813740005922e-06, |
|
"loss": 0.8343, |
|
"step": 41200 |
|
}, |
|
{ |
|
"epoch": 2.5898288079262555, |
|
"grad_norm": 0.15031389892101288, |
|
"learning_rate": 7.841975996794929e-06, |
|
"loss": 0.9468, |
|
"step": 41300 |
|
}, |
|
{ |
|
"epoch": 2.5960995798582807, |
|
"grad_norm": 0.14378446340560913, |
|
"learning_rate": 7.807138253583933e-06, |
|
"loss": 0.8529, |
|
"step": 41400 |
|
}, |
|
{ |
|
"epoch": 2.6023703517903054, |
|
"grad_norm": 0.031118595972657204, |
|
"learning_rate": 7.772300510372938e-06, |
|
"loss": 0.8092, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 2.6023703517903054, |
|
"eval_loss": 0.8954480886459351, |
|
"eval_runtime": 246.3441, |
|
"eval_samples_per_second": 517.861, |
|
"eval_steps_per_second": 16.185, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 2.60864112372233, |
|
"grad_norm": 17.187223434448242, |
|
"learning_rate": 7.737462767161944e-06, |
|
"loss": 0.8501, |
|
"step": 41600 |
|
}, |
|
{ |
|
"epoch": 2.614911895654355, |
|
"grad_norm": 3.00113844871521, |
|
"learning_rate": 7.702625023950949e-06, |
|
"loss": 0.9877, |
|
"step": 41700 |
|
}, |
|
{ |
|
"epoch": 2.62118266758638, |
|
"grad_norm": 0.45281580090522766, |
|
"learning_rate": 7.667787280739954e-06, |
|
"loss": 0.8592, |
|
"step": 41800 |
|
}, |
|
{ |
|
"epoch": 2.627453439518405, |
|
"grad_norm": 79.49444580078125, |
|
"learning_rate": 7.63294953752896e-06, |
|
"loss": 0.8632, |
|
"step": 41900 |
|
}, |
|
{ |
|
"epoch": 2.6337242114504296, |
|
"grad_norm": 0.05600200593471527, |
|
"learning_rate": 7.598111794317965e-06, |
|
"loss": 0.6766, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 2.6337242114504296, |
|
"eval_loss": 0.8706979751586914, |
|
"eval_runtime": 245.6205, |
|
"eval_samples_per_second": 519.387, |
|
"eval_steps_per_second": 16.232, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 2.6399949833824543, |
|
"grad_norm": 20.844148635864258, |
|
"learning_rate": 7.56327405110697e-06, |
|
"loss": 0.7587, |
|
"step": 42100 |
|
}, |
|
{ |
|
"epoch": 2.646265755314479, |
|
"grad_norm": 0.24995607137680054, |
|
"learning_rate": 7.528436307895976e-06, |
|
"loss": 0.8949, |
|
"step": 42200 |
|
}, |
|
{ |
|
"epoch": 2.652536527246504, |
|
"grad_norm": 80.21415710449219, |
|
"learning_rate": 7.49359856468498e-06, |
|
"loss": 0.4173, |
|
"step": 42300 |
|
}, |
|
{ |
|
"epoch": 2.658807299178529, |
|
"grad_norm": 24.900297164916992, |
|
"learning_rate": 7.458760821473986e-06, |
|
"loss": 0.5995, |
|
"step": 42400 |
|
}, |
|
{ |
|
"epoch": 2.6650780711105537, |
|
"grad_norm": 231.90145874023438, |
|
"learning_rate": 7.423923078262991e-06, |
|
"loss": 0.8157, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 2.6650780711105537, |
|
"eval_loss": 0.8680915236473083, |
|
"eval_runtime": 245.7882, |
|
"eval_samples_per_second": 519.032, |
|
"eval_steps_per_second": 16.221, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 2.6713488430425785, |
|
"grad_norm": 0.030076002702116966, |
|
"learning_rate": 7.389085335051997e-06, |
|
"loss": 0.92, |
|
"step": 42600 |
|
}, |
|
{ |
|
"epoch": 2.677619614974603, |
|
"grad_norm": 391.046875, |
|
"learning_rate": 7.354247591841001e-06, |
|
"loss": 0.9118, |
|
"step": 42700 |
|
}, |
|
{ |
|
"epoch": 2.6838903869066284, |
|
"grad_norm": 0.29524192214012146, |
|
"learning_rate": 7.319409848630006e-06, |
|
"loss": 0.7446, |
|
"step": 42800 |
|
}, |
|
{ |
|
"epoch": 2.690161158838653, |
|
"grad_norm": 0.06050710007548332, |
|
"learning_rate": 7.284572105419011e-06, |
|
"loss": 0.6835, |
|
"step": 42900 |
|
}, |
|
{ |
|
"epoch": 2.696431930770678, |
|
"grad_norm": 0.3519326150417328, |
|
"learning_rate": 7.249734362208016e-06, |
|
"loss": 0.6157, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 2.696431930770678, |
|
"eval_loss": 0.8691079020500183, |
|
"eval_runtime": 245.1929, |
|
"eval_samples_per_second": 520.292, |
|
"eval_steps_per_second": 16.261, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 2.7027027027027026, |
|
"grad_norm": 3.0073323249816895, |
|
"learning_rate": 7.214896618997022e-06, |
|
"loss": 0.5423, |
|
"step": 43100 |
|
}, |
|
{ |
|
"epoch": 2.7089734746347274, |
|
"grad_norm": 47.103782653808594, |
|
"learning_rate": 7.180058875786027e-06, |
|
"loss": 0.8098, |
|
"step": 43200 |
|
}, |
|
{ |
|
"epoch": 2.715244246566752, |
|
"grad_norm": 1.3290644884109497, |
|
"learning_rate": 7.145221132575032e-06, |
|
"loss": 0.8908, |
|
"step": 43300 |
|
}, |
|
{ |
|
"epoch": 2.7215150184987773, |
|
"grad_norm": 51.733924865722656, |
|
"learning_rate": 7.110383389364037e-06, |
|
"loss": 1.1275, |
|
"step": 43400 |
|
}, |
|
{ |
|
"epoch": 2.727785790430802, |
|
"grad_norm": 7.54064416885376, |
|
"learning_rate": 7.075545646153043e-06, |
|
"loss": 1.0345, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 2.727785790430802, |
|
"eval_loss": 0.8884279131889343, |
|
"eval_runtime": 250.7463, |
|
"eval_samples_per_second": 508.769, |
|
"eval_steps_per_second": 15.901, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 2.734056562362827, |
|
"grad_norm": 0.2361198216676712, |
|
"learning_rate": 7.0407079029420475e-06, |
|
"loss": 0.6198, |
|
"step": 43600 |
|
}, |
|
{ |
|
"epoch": 2.7403273342948515, |
|
"grad_norm": 0.045945364981889725, |
|
"learning_rate": 7.005870159731053e-06, |
|
"loss": 0.8315, |
|
"step": 43700 |
|
}, |
|
{ |
|
"epoch": 2.7465981062268767, |
|
"grad_norm": 1.2798868417739868, |
|
"learning_rate": 6.9710324165200584e-06, |
|
"loss": 0.9317, |
|
"step": 43800 |
|
}, |
|
{ |
|
"epoch": 2.7528688781589015, |
|
"grad_norm": 0.2944384217262268, |
|
"learning_rate": 6.936194673309063e-06, |
|
"loss": 0.516, |
|
"step": 43900 |
|
}, |
|
{ |
|
"epoch": 2.759139650090926, |
|
"grad_norm": 0.38825371861457825, |
|
"learning_rate": 6.9013569300980686e-06, |
|
"loss": 0.8229, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 2.759139650090926, |
|
"eval_loss": 0.8659059405326843, |
|
"eval_runtime": 250.2562, |
|
"eval_samples_per_second": 509.766, |
|
"eval_steps_per_second": 15.932, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 2.765410422022951, |
|
"grad_norm": 121.3291015625, |
|
"learning_rate": 6.866519186887074e-06, |
|
"loss": 0.7989, |
|
"step": 44100 |
|
}, |
|
{ |
|
"epoch": 2.7716811939549757, |
|
"grad_norm": 0.05258101224899292, |
|
"learning_rate": 6.8316814436760795e-06, |
|
"loss": 0.9291, |
|
"step": 44200 |
|
}, |
|
{ |
|
"epoch": 2.7779519658870004, |
|
"grad_norm": 13.635845184326172, |
|
"learning_rate": 6.796843700465084e-06, |
|
"loss": 0.5954, |
|
"step": 44300 |
|
}, |
|
{ |
|
"epoch": 2.7842227378190256, |
|
"grad_norm": 0.01324045192450285, |
|
"learning_rate": 6.76200595725409e-06, |
|
"loss": 0.8537, |
|
"step": 44400 |
|
}, |
|
{ |
|
"epoch": 2.7904935097510504, |
|
"grad_norm": 0.1794157326221466, |
|
"learning_rate": 6.727168214043095e-06, |
|
"loss": 0.9506, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 2.7904935097510504, |
|
"eval_loss": 0.8657113909721375, |
|
"eval_runtime": 251.0944, |
|
"eval_samples_per_second": 508.064, |
|
"eval_steps_per_second": 15.878, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 2.796764281683075, |
|
"grad_norm": 1.5337361097335815, |
|
"learning_rate": 6.6923304708321e-06, |
|
"loss": 0.5789, |
|
"step": 44600 |
|
}, |
|
{ |
|
"epoch": 2.8030350536151, |
|
"grad_norm": 67.04114532470703, |
|
"learning_rate": 6.657492727621105e-06, |
|
"loss": 0.4861, |
|
"step": 44700 |
|
}, |
|
{ |
|
"epoch": 2.809305825547125, |
|
"grad_norm": 0.7064642310142517, |
|
"learning_rate": 6.622654984410111e-06, |
|
"loss": 0.9614, |
|
"step": 44800 |
|
}, |
|
{ |
|
"epoch": 2.81557659747915, |
|
"grad_norm": 182.1068572998047, |
|
"learning_rate": 6.587817241199116e-06, |
|
"loss": 1.0069, |
|
"step": 44900 |
|
}, |
|
{ |
|
"epoch": 2.8218473694111745, |
|
"grad_norm": 11.14926528930664, |
|
"learning_rate": 6.552979497988121e-06, |
|
"loss": 0.5599, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 2.8218473694111745, |
|
"eval_loss": 0.8618975281715393, |
|
"eval_runtime": 253.2257, |
|
"eval_samples_per_second": 503.788, |
|
"eval_steps_per_second": 15.745, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 2.8281181413431993, |
|
"grad_norm": 3.852113723754883, |
|
"learning_rate": 6.5181417547771264e-06, |
|
"loss": 1.3747, |
|
"step": 45100 |
|
}, |
|
{ |
|
"epoch": 2.834388913275224, |
|
"grad_norm": 0.024370471015572548, |
|
"learning_rate": 6.483304011566132e-06, |
|
"loss": 0.5638, |
|
"step": 45200 |
|
}, |
|
{ |
|
"epoch": 2.8406596852072488, |
|
"grad_norm": 30.42238998413086, |
|
"learning_rate": 6.4484662683551366e-06, |
|
"loss": 1.2095, |
|
"step": 45300 |
|
}, |
|
{ |
|
"epoch": 2.846930457139274, |
|
"grad_norm": 54.890380859375, |
|
"learning_rate": 6.413628525144142e-06, |
|
"loss": 0.7364, |
|
"step": 45400 |
|
}, |
|
{ |
|
"epoch": 2.8532012290712987, |
|
"grad_norm": 0.05865807831287384, |
|
"learning_rate": 6.3787907819331475e-06, |
|
"loss": 0.5692, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 2.8532012290712987, |
|
"eval_loss": 0.8817957043647766, |
|
"eval_runtime": 250.5213, |
|
"eval_samples_per_second": 509.226, |
|
"eval_steps_per_second": 15.915, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 2.8594720010033234, |
|
"grad_norm": 0.23342262208461761, |
|
"learning_rate": 6.343953038722153e-06, |
|
"loss": 0.8848, |
|
"step": 45600 |
|
}, |
|
{ |
|
"epoch": 2.865742772935348, |
|
"grad_norm": 0.24238887429237366, |
|
"learning_rate": 6.309115295511157e-06, |
|
"loss": 0.9063, |
|
"step": 45700 |
|
}, |
|
{ |
|
"epoch": 2.8720135448673734, |
|
"grad_norm": 303.49761962890625, |
|
"learning_rate": 6.274277552300162e-06, |
|
"loss": 0.8675, |
|
"step": 45800 |
|
}, |
|
{ |
|
"epoch": 2.878284316799398, |
|
"grad_norm": 27.475610733032227, |
|
"learning_rate": 6.239439809089167e-06, |
|
"loss": 0.9703, |
|
"step": 45900 |
|
}, |
|
{ |
|
"epoch": 2.884555088731423, |
|
"grad_norm": 0.12018956989049911, |
|
"learning_rate": 6.2046020658781725e-06, |
|
"loss": 0.6657, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 2.884555088731423, |
|
"eval_loss": 0.842439591884613, |
|
"eval_runtime": 250.5638, |
|
"eval_samples_per_second": 509.14, |
|
"eval_steps_per_second": 15.912, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 2.8908258606634476, |
|
"grad_norm": 36.39583969116211, |
|
"learning_rate": 6.169764322667178e-06, |
|
"loss": 0.6564, |
|
"step": 46100 |
|
}, |
|
{ |
|
"epoch": 2.8970966325954723, |
|
"grad_norm": 6.755324840545654, |
|
"learning_rate": 6.135274956888293e-06, |
|
"loss": 0.7945, |
|
"step": 46200 |
|
}, |
|
{ |
|
"epoch": 2.903367404527497, |
|
"grad_norm": 0.24825870990753174, |
|
"learning_rate": 6.100437213677298e-06, |
|
"loss": 0.6341, |
|
"step": 46300 |
|
}, |
|
{ |
|
"epoch": 2.9096381764595223, |
|
"grad_norm": 0.06013401225209236, |
|
"learning_rate": 6.0655994704663035e-06, |
|
"loss": 1.042, |
|
"step": 46400 |
|
}, |
|
{ |
|
"epoch": 2.915908948391547, |
|
"grad_norm": 14.515037536621094, |
|
"learning_rate": 6.030761727255309e-06, |
|
"loss": 1.0812, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 2.915908948391547, |
|
"eval_loss": 0.8509716987609863, |
|
"eval_runtime": 247.7079, |
|
"eval_samples_per_second": 515.01, |
|
"eval_steps_per_second": 16.096, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 2.9221797203235718, |
|
"grad_norm": 0.9338593482971191, |
|
"learning_rate": 5.995923984044314e-06, |
|
"loss": 0.9787, |
|
"step": 46600 |
|
}, |
|
{ |
|
"epoch": 2.9284504922555965, |
|
"grad_norm": 241.10589599609375, |
|
"learning_rate": 5.961086240833319e-06, |
|
"loss": 0.8732, |
|
"step": 46700 |
|
}, |
|
{ |
|
"epoch": 2.9347212641876217, |
|
"grad_norm": 119.96747589111328, |
|
"learning_rate": 5.926248497622325e-06, |
|
"loss": 1.1872, |
|
"step": 46800 |
|
}, |
|
{ |
|
"epoch": 2.9409920361196464, |
|
"grad_norm": 28.35833740234375, |
|
"learning_rate": 5.89141075441133e-06, |
|
"loss": 0.989, |
|
"step": 46900 |
|
}, |
|
{ |
|
"epoch": 2.947262808051671, |
|
"grad_norm": 0.007068769074976444, |
|
"learning_rate": 5.856573011200335e-06, |
|
"loss": 0.874, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 2.947262808051671, |
|
"eval_loss": 0.8214829564094543, |
|
"eval_runtime": 243.5295, |
|
"eval_samples_per_second": 523.846, |
|
"eval_steps_per_second": 16.372, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 2.953533579983696, |
|
"grad_norm": 0.0442727729678154, |
|
"learning_rate": 5.82173526798934e-06, |
|
"loss": 1.0229, |
|
"step": 47100 |
|
}, |
|
{ |
|
"epoch": 2.9598043519157207, |
|
"grad_norm": 0.031402587890625, |
|
"learning_rate": 5.786897524778346e-06, |
|
"loss": 0.9888, |
|
"step": 47200 |
|
}, |
|
{ |
|
"epoch": 2.9660751238477454, |
|
"grad_norm": 2.0282115936279297, |
|
"learning_rate": 5.75205978156735e-06, |
|
"loss": 0.4883, |
|
"step": 47300 |
|
}, |
|
{ |
|
"epoch": 2.9723458957797706, |
|
"grad_norm": 7.441370487213135, |
|
"learning_rate": 5.717222038356356e-06, |
|
"loss": 0.7474, |
|
"step": 47400 |
|
}, |
|
{ |
|
"epoch": 2.9786166677117953, |
|
"grad_norm": 20.524629592895508, |
|
"learning_rate": 5.682384295145361e-06, |
|
"loss": 0.7615, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 2.9786166677117953, |
|
"eval_loss": 0.8217635750770569, |
|
"eval_runtime": 249.4571, |
|
"eval_samples_per_second": 511.399, |
|
"eval_steps_per_second": 15.983, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 2.98488743964382, |
|
"grad_norm": 0.4798177182674408, |
|
"learning_rate": 5.647546551934367e-06, |
|
"loss": 0.6208, |
|
"step": 47600 |
|
}, |
|
{ |
|
"epoch": 2.991158211575845, |
|
"grad_norm": 112.3564224243164, |
|
"learning_rate": 5.6127088087233715e-06, |
|
"loss": 0.8332, |
|
"step": 47700 |
|
}, |
|
{ |
|
"epoch": 2.99742898350787, |
|
"grad_norm": 52.40660095214844, |
|
"learning_rate": 5.577871065512377e-06, |
|
"loss": 0.6734, |
|
"step": 47800 |
|
}, |
|
{ |
|
"epoch": 3.0036997554398948, |
|
"grad_norm": 0.9568219184875488, |
|
"learning_rate": 5.5430333223013825e-06, |
|
"loss": 0.5095, |
|
"step": 47900 |
|
}, |
|
{ |
|
"epoch": 3.0099705273719195, |
|
"grad_norm": 0.40387988090515137, |
|
"learning_rate": 5.508195579090387e-06, |
|
"loss": 0.7709, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 3.0099705273719195, |
|
"eval_loss": 0.8220009803771973, |
|
"eval_runtime": 248.6927, |
|
"eval_samples_per_second": 512.97, |
|
"eval_steps_per_second": 16.032, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 3.0162412993039442, |
|
"grad_norm": 192.66201782226562, |
|
"learning_rate": 5.473357835879393e-06, |
|
"loss": 0.5449, |
|
"step": 48100 |
|
}, |
|
{ |
|
"epoch": 3.022512071235969, |
|
"grad_norm": 0.026696085929870605, |
|
"learning_rate": 5.438520092668398e-06, |
|
"loss": 0.772, |
|
"step": 48200 |
|
}, |
|
{ |
|
"epoch": 3.028782843167994, |
|
"grad_norm": 7.1632232666015625, |
|
"learning_rate": 5.403682349457403e-06, |
|
"loss": 0.8582, |
|
"step": 48300 |
|
}, |
|
{ |
|
"epoch": 3.035053615100019, |
|
"grad_norm": 4.1231584548950195, |
|
"learning_rate": 5.369192983678517e-06, |
|
"loss": 0.5742, |
|
"step": 48400 |
|
}, |
|
{ |
|
"epoch": 3.0413243870320437, |
|
"grad_norm": 0.08916144073009491, |
|
"learning_rate": 5.334355240467523e-06, |
|
"loss": 0.5584, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 3.0413243870320437, |
|
"eval_loss": 0.8492663502693176, |
|
"eval_runtime": 247.0758, |
|
"eval_samples_per_second": 516.327, |
|
"eval_steps_per_second": 16.137, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 3.0475951589640684, |
|
"grad_norm": 15.887138366699219, |
|
"learning_rate": 5.299517497256527e-06, |
|
"loss": 0.9766, |
|
"step": 48600 |
|
}, |
|
{ |
|
"epoch": 3.053865930896093, |
|
"grad_norm": 3.1666200160980225, |
|
"learning_rate": 5.264679754045533e-06, |
|
"loss": 0.6473, |
|
"step": 48700 |
|
}, |
|
{ |
|
"epoch": 3.0601367028281183, |
|
"grad_norm": 4.730705261230469, |
|
"learning_rate": 5.229842010834538e-06, |
|
"loss": 0.5861, |
|
"step": 48800 |
|
}, |
|
{ |
|
"epoch": 3.066407474760143, |
|
"grad_norm": 0.19111567735671997, |
|
"learning_rate": 5.195004267623544e-06, |
|
"loss": 0.6377, |
|
"step": 48900 |
|
}, |
|
{ |
|
"epoch": 3.072678246692168, |
|
"grad_norm": 0.17477057874202728, |
|
"learning_rate": 5.1601665244125485e-06, |
|
"loss": 0.8393, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 3.072678246692168, |
|
"eval_loss": 0.8429604768753052, |
|
"eval_runtime": 246.6172, |
|
"eval_samples_per_second": 517.288, |
|
"eval_steps_per_second": 16.167, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 3.0789490186241926, |
|
"grad_norm": 0.47240251302719116, |
|
"learning_rate": 5.125328781201554e-06, |
|
"loss": 0.8385, |
|
"step": 49100 |
|
}, |
|
{ |
|
"epoch": 3.0852197905562173, |
|
"grad_norm": 1.6392873525619507, |
|
"learning_rate": 5.0904910379905595e-06, |
|
"loss": 0.5523, |
|
"step": 49200 |
|
}, |
|
{ |
|
"epoch": 3.0914905624882425, |
|
"grad_norm": 0.08180980384349823, |
|
"learning_rate": 5.055653294779564e-06, |
|
"loss": 0.6217, |
|
"step": 49300 |
|
}, |
|
{ |
|
"epoch": 3.0977613344202672, |
|
"grad_norm": 10.683464050292969, |
|
"learning_rate": 5.02081555156857e-06, |
|
"loss": 0.5515, |
|
"step": 49400 |
|
}, |
|
{ |
|
"epoch": 3.104032106352292, |
|
"grad_norm": 154.55838012695312, |
|
"learning_rate": 4.985977808357575e-06, |
|
"loss": 0.851, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 3.104032106352292, |
|
"eval_loss": 0.8000255227088928, |
|
"eval_runtime": 245.3365, |
|
"eval_samples_per_second": 519.988, |
|
"eval_steps_per_second": 16.251, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 3.1103028782843167, |
|
"grad_norm": 0.106838159263134, |
|
"learning_rate": 4.95114006514658e-06, |
|
"loss": 0.9247, |
|
"step": 49600 |
|
}, |
|
{ |
|
"epoch": 3.1165736502163415, |
|
"grad_norm": 0.03634607046842575, |
|
"learning_rate": 4.916302321935585e-06, |
|
"loss": 0.655, |
|
"step": 49700 |
|
}, |
|
{ |
|
"epoch": 3.1228444221483667, |
|
"grad_norm": 118.1080322265625, |
|
"learning_rate": 4.881464578724591e-06, |
|
"loss": 0.4979, |
|
"step": 49800 |
|
}, |
|
{ |
|
"epoch": 3.1291151940803914, |
|
"grad_norm": 0.2726267874240875, |
|
"learning_rate": 4.846626835513596e-06, |
|
"loss": 0.7521, |
|
"step": 49900 |
|
}, |
|
{ |
|
"epoch": 3.135385966012416, |
|
"grad_norm": 0.031166499480605125, |
|
"learning_rate": 4.811789092302601e-06, |
|
"loss": 0.53, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 3.135385966012416, |
|
"eval_loss": 0.8105431795120239, |
|
"eval_runtime": 248.1106, |
|
"eval_samples_per_second": 514.174, |
|
"eval_steps_per_second": 16.069, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 3.141656737944441, |
|
"grad_norm": 88.85710144042969, |
|
"learning_rate": 4.776951349091606e-06, |
|
"loss": 0.5943, |
|
"step": 50100 |
|
}, |
|
{ |
|
"epoch": 3.1479275098764656, |
|
"grad_norm": 11.926735877990723, |
|
"learning_rate": 4.742113605880612e-06, |
|
"loss": 0.4659, |
|
"step": 50200 |
|
}, |
|
{ |
|
"epoch": 3.154198281808491, |
|
"grad_norm": 17.817556381225586, |
|
"learning_rate": 4.7072758626696165e-06, |
|
"loss": 0.4843, |
|
"step": 50300 |
|
}, |
|
{ |
|
"epoch": 3.1604690537405156, |
|
"grad_norm": 95.25701904296875, |
|
"learning_rate": 4.672438119458621e-06, |
|
"loss": 0.7577, |
|
"step": 50400 |
|
}, |
|
{ |
|
"epoch": 3.1667398256725403, |
|
"grad_norm": 0.007618566509336233, |
|
"learning_rate": 4.637600376247627e-06, |
|
"loss": 0.3448, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 3.1667398256725403, |
|
"eval_loss": 0.8055439591407776, |
|
"eval_runtime": 246.7777, |
|
"eval_samples_per_second": 516.951, |
|
"eval_steps_per_second": 16.156, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 3.173010597604565, |
|
"grad_norm": 45.008056640625, |
|
"learning_rate": 4.602762633036632e-06, |
|
"loss": 0.8392, |
|
"step": 50600 |
|
}, |
|
{ |
|
"epoch": 3.17928136953659, |
|
"grad_norm": 0.11749571561813354, |
|
"learning_rate": 4.567924889825638e-06, |
|
"loss": 0.75, |
|
"step": 50700 |
|
}, |
|
{ |
|
"epoch": 3.185552141468615, |
|
"grad_norm": 0.04399213567376137, |
|
"learning_rate": 4.533087146614642e-06, |
|
"loss": 0.5195, |
|
"step": 50800 |
|
}, |
|
{ |
|
"epoch": 3.1918229134006397, |
|
"grad_norm": 0.1250951737165451, |
|
"learning_rate": 4.498249403403648e-06, |
|
"loss": 0.617, |
|
"step": 50900 |
|
}, |
|
{ |
|
"epoch": 3.1980936853326645, |
|
"grad_norm": 213.82589721679688, |
|
"learning_rate": 4.463411660192653e-06, |
|
"loss": 0.6892, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 3.1980936853326645, |
|
"eval_loss": 0.8293086290359497, |
|
"eval_runtime": 244.3828, |
|
"eval_samples_per_second": 522.017, |
|
"eval_steps_per_second": 16.315, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 3.204364457264689, |
|
"grad_norm": 12.81237506866455, |
|
"learning_rate": 4.428573916981658e-06, |
|
"loss": 0.497, |
|
"step": 51100 |
|
}, |
|
{ |
|
"epoch": 3.210635229196714, |
|
"grad_norm": 0.06836537271738052, |
|
"learning_rate": 4.393736173770663e-06, |
|
"loss": 0.6793, |
|
"step": 51200 |
|
}, |
|
{ |
|
"epoch": 3.216906001128739, |
|
"grad_norm": 0.30741751194000244, |
|
"learning_rate": 4.358898430559669e-06, |
|
"loss": 0.7251, |
|
"step": 51300 |
|
}, |
|
{ |
|
"epoch": 3.223176773060764, |
|
"grad_norm": 299.8288269042969, |
|
"learning_rate": 4.324060687348674e-06, |
|
"loss": 0.6471, |
|
"step": 51400 |
|
}, |
|
{ |
|
"epoch": 3.2294475449927886, |
|
"grad_norm": 39.92329406738281, |
|
"learning_rate": 4.289222944137679e-06, |
|
"loss": 0.775, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 3.2294475449927886, |
|
"eval_loss": 0.8012564778327942, |
|
"eval_runtime": 245.7294, |
|
"eval_samples_per_second": 519.157, |
|
"eval_steps_per_second": 16.225, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 3.2357183169248134, |
|
"grad_norm": 7.386813640594482, |
|
"learning_rate": 4.2543852009266845e-06, |
|
"loss": 0.7289, |
|
"step": 51600 |
|
}, |
|
{ |
|
"epoch": 3.241989088856838, |
|
"grad_norm": 0.8339570760726929, |
|
"learning_rate": 4.21954745771569e-06, |
|
"loss": 0.6894, |
|
"step": 51700 |
|
}, |
|
{ |
|
"epoch": 3.2482598607888633, |
|
"grad_norm": 103.696533203125, |
|
"learning_rate": 4.184709714504695e-06, |
|
"loss": 0.5677, |
|
"step": 51800 |
|
}, |
|
{ |
|
"epoch": 3.254530632720888, |
|
"grad_norm": 157.94912719726562, |
|
"learning_rate": 4.149871971293699e-06, |
|
"loss": 0.317, |
|
"step": 51900 |
|
}, |
|
{ |
|
"epoch": 3.260801404652913, |
|
"grad_norm": 0.6201029419898987, |
|
"learning_rate": 4.115034228082705e-06, |
|
"loss": 0.5376, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 3.260801404652913, |
|
"eval_loss": 0.785252034664154, |
|
"eval_runtime": 247.2797, |
|
"eval_samples_per_second": 515.902, |
|
"eval_steps_per_second": 16.123, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 3.2670721765849375, |
|
"grad_norm": 5.9356913566589355, |
|
"learning_rate": 4.08019648487171e-06, |
|
"loss": 0.4582, |
|
"step": 52100 |
|
}, |
|
{ |
|
"epoch": 3.2733429485169623, |
|
"grad_norm": 99.29075622558594, |
|
"learning_rate": 4.045358741660716e-06, |
|
"loss": 0.8505, |
|
"step": 52200 |
|
}, |
|
{ |
|
"epoch": 3.279613720448987, |
|
"grad_norm": 7.142418384552002, |
|
"learning_rate": 4.01052099844972e-06, |
|
"loss": 0.6236, |
|
"step": 52300 |
|
}, |
|
{ |
|
"epoch": 3.285884492381012, |
|
"grad_norm": 0.18595051765441895, |
|
"learning_rate": 3.975683255238726e-06, |
|
"loss": 0.7388, |
|
"step": 52400 |
|
}, |
|
{ |
|
"epoch": 3.292155264313037, |
|
"grad_norm": 0.26398783922195435, |
|
"learning_rate": 3.940845512027731e-06, |
|
"loss": 0.7061, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 3.292155264313037, |
|
"eval_loss": 0.786342203617096, |
|
"eval_runtime": 246.6236, |
|
"eval_samples_per_second": 517.274, |
|
"eval_steps_per_second": 16.166, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 3.2984260362450617, |
|
"grad_norm": 0.15353605151176453, |
|
"learning_rate": 3.906007768816736e-06, |
|
"loss": 0.5411, |
|
"step": 52600 |
|
}, |
|
{ |
|
"epoch": 3.3046968081770864, |
|
"grad_norm": 0.6905626654624939, |
|
"learning_rate": 3.8711700256057415e-06, |
|
"loss": 0.9511, |
|
"step": 52700 |
|
}, |
|
{ |
|
"epoch": 3.3109675801091116, |
|
"grad_norm": 120.66680145263672, |
|
"learning_rate": 3.836332282394747e-06, |
|
"loss": 0.5364, |
|
"step": 52800 |
|
}, |
|
{ |
|
"epoch": 3.3172383520411364, |
|
"grad_norm": 22.492393493652344, |
|
"learning_rate": 3.801494539183752e-06, |
|
"loss": 0.5795, |
|
"step": 52900 |
|
}, |
|
{ |
|
"epoch": 3.323509123973161, |
|
"grad_norm": 11.335774421691895, |
|
"learning_rate": 3.766656795972757e-06, |
|
"loss": 0.5305, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 3.323509123973161, |
|
"eval_loss": 0.787602961063385, |
|
"eval_runtime": 250.8532, |
|
"eval_samples_per_second": 508.552, |
|
"eval_steps_per_second": 15.894, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 3.329779895905186, |
|
"grad_norm": 0.07559686154127121, |
|
"learning_rate": 3.7318190527617626e-06, |
|
"loss": 0.8051, |
|
"step": 53100 |
|
}, |
|
{ |
|
"epoch": 3.3360506678372106, |
|
"grad_norm": 0.06827156990766525, |
|
"learning_rate": 3.6969813095507677e-06, |
|
"loss": 0.5342, |
|
"step": 53200 |
|
}, |
|
{ |
|
"epoch": 3.3423214397692353, |
|
"grad_norm": 1.358184576034546, |
|
"learning_rate": 3.662143566339773e-06, |
|
"loss": 0.4567, |
|
"step": 53300 |
|
}, |
|
{ |
|
"epoch": 3.3485922117012605, |
|
"grad_norm": 58.48233413696289, |
|
"learning_rate": 3.627305823128778e-06, |
|
"loss": 0.9751, |
|
"step": 53400 |
|
}, |
|
{ |
|
"epoch": 3.3548629836332853, |
|
"grad_norm": 0.13244691491127014, |
|
"learning_rate": 3.592468079917783e-06, |
|
"loss": 0.4413, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 3.3548629836332853, |
|
"eval_loss": 0.8008161783218384, |
|
"eval_runtime": 248.8641, |
|
"eval_samples_per_second": 512.617, |
|
"eval_steps_per_second": 16.021, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 3.36113375556531, |
|
"grad_norm": 5.010788917541504, |
|
"learning_rate": 3.5576303367067884e-06, |
|
"loss": 0.6011, |
|
"step": 53600 |
|
}, |
|
{ |
|
"epoch": 3.3674045274973348, |
|
"grad_norm": 0.032868873327970505, |
|
"learning_rate": 3.5227925934957935e-06, |
|
"loss": 0.4708, |
|
"step": 53700 |
|
}, |
|
{ |
|
"epoch": 3.37367529942936, |
|
"grad_norm": 2.3022570610046387, |
|
"learning_rate": 3.488303227716909e-06, |
|
"loss": 0.6167, |
|
"step": 53800 |
|
}, |
|
{ |
|
"epoch": 3.3799460713613847, |
|
"grad_norm": 0.7494950890541077, |
|
"learning_rate": 3.453465484505914e-06, |
|
"loss": 0.7653, |
|
"step": 53900 |
|
}, |
|
{ |
|
"epoch": 3.3862168432934094, |
|
"grad_norm": 1.9640907049179077, |
|
"learning_rate": 3.4186277412949194e-06, |
|
"loss": 0.7781, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 3.3862168432934094, |
|
"eval_loss": 0.7897498607635498, |
|
"eval_runtime": 250.7968, |
|
"eval_samples_per_second": 508.667, |
|
"eval_steps_per_second": 15.897, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 3.392487615225434, |
|
"grad_norm": 95.03298950195312, |
|
"learning_rate": 3.3837899980839245e-06, |
|
"loss": 0.9323, |
|
"step": 54100 |
|
}, |
|
{ |
|
"epoch": 3.398758387157459, |
|
"grad_norm": 1.3489042520523071, |
|
"learning_rate": 3.3489522548729296e-06, |
|
"loss": 0.6003, |
|
"step": 54200 |
|
}, |
|
{ |
|
"epoch": 3.4050291590894837, |
|
"grad_norm": 1.4920170307159424, |
|
"learning_rate": 3.314114511661935e-06, |
|
"loss": 0.5268, |
|
"step": 54300 |
|
}, |
|
{ |
|
"epoch": 3.411299931021509, |
|
"grad_norm": 71.25545501708984, |
|
"learning_rate": 3.27927676845094e-06, |
|
"loss": 0.6639, |
|
"step": 54400 |
|
}, |
|
{ |
|
"epoch": 3.4175707029535336, |
|
"grad_norm": 1.5343536138534546, |
|
"learning_rate": 3.2444390252399456e-06, |
|
"loss": 0.388, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 3.4175707029535336, |
|
"eval_loss": 0.7854874730110168, |
|
"eval_runtime": 247.7117, |
|
"eval_samples_per_second": 515.002, |
|
"eval_steps_per_second": 16.095, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 3.4238414748855583, |
|
"grad_norm": 0.22106263041496277, |
|
"learning_rate": 3.2096012820289502e-06, |
|
"loss": 0.7258, |
|
"step": 54600 |
|
}, |
|
{ |
|
"epoch": 3.430112246817583, |
|
"grad_norm": 0.10803945362567902, |
|
"learning_rate": 3.1747635388179553e-06, |
|
"loss": 0.6475, |
|
"step": 54700 |
|
}, |
|
{ |
|
"epoch": 3.4363830187496083, |
|
"grad_norm": 89.42733764648438, |
|
"learning_rate": 3.139925795606961e-06, |
|
"loss": 0.795, |
|
"step": 54800 |
|
}, |
|
{ |
|
"epoch": 3.442653790681633, |
|
"grad_norm": 0.15668845176696777, |
|
"learning_rate": 3.105088052395966e-06, |
|
"loss": 0.4978, |
|
"step": 54900 |
|
}, |
|
{ |
|
"epoch": 3.4489245626136578, |
|
"grad_norm": 60.56550216674805, |
|
"learning_rate": 3.070250309184971e-06, |
|
"loss": 0.6259, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 3.4489245626136578, |
|
"eval_loss": 0.7704712748527527, |
|
"eval_runtime": 250.1048, |
|
"eval_samples_per_second": 510.074, |
|
"eval_steps_per_second": 15.941, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 3.4551953345456825, |
|
"grad_norm": 0.28135305643081665, |
|
"learning_rate": 3.0354125659739764e-06, |
|
"loss": 0.791, |
|
"step": 55100 |
|
}, |
|
{ |
|
"epoch": 3.4614661064777072, |
|
"grad_norm": 120.33629608154297, |
|
"learning_rate": 3.0005748227629815e-06, |
|
"loss": 0.7602, |
|
"step": 55200 |
|
}, |
|
{ |
|
"epoch": 3.467736878409732, |
|
"grad_norm": 0.6213288903236389, |
|
"learning_rate": 2.965737079551987e-06, |
|
"loss": 0.2236, |
|
"step": 55300 |
|
}, |
|
{ |
|
"epoch": 3.474007650341757, |
|
"grad_norm": 0.051405176520347595, |
|
"learning_rate": 2.930899336340992e-06, |
|
"loss": 0.5577, |
|
"step": 55400 |
|
}, |
|
{ |
|
"epoch": 3.480278422273782, |
|
"grad_norm": 6.140790939331055, |
|
"learning_rate": 2.8960615931299975e-06, |
|
"loss": 0.4214, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 3.480278422273782, |
|
"eval_loss": 0.768252432346344, |
|
"eval_runtime": 248.6626, |
|
"eval_samples_per_second": 513.032, |
|
"eval_steps_per_second": 16.034, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 3.4865491942058067, |
|
"grad_norm": 0.051673661917448044, |
|
"learning_rate": 2.8612238499190026e-06, |
|
"loss": 0.7335, |
|
"step": 55600 |
|
}, |
|
{ |
|
"epoch": 3.4928199661378314, |
|
"grad_norm": 5.123118877410889, |
|
"learning_rate": 2.8263861067080077e-06, |
|
"loss": 0.7536, |
|
"step": 55700 |
|
}, |
|
{ |
|
"epoch": 3.4990907380698566, |
|
"grad_norm": 0.7104228734970093, |
|
"learning_rate": 2.791548363497013e-06, |
|
"loss": 0.4577, |
|
"step": 55800 |
|
}, |
|
{ |
|
"epoch": 3.5053615100018813, |
|
"grad_norm": 49.410400390625, |
|
"learning_rate": 2.7567106202860182e-06, |
|
"loss": 0.5869, |
|
"step": 55900 |
|
}, |
|
{ |
|
"epoch": 3.511632281933906, |
|
"grad_norm": 0.0593554824590683, |
|
"learning_rate": 2.7218728770750237e-06, |
|
"loss": 0.8563, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 3.511632281933906, |
|
"eval_loss": 0.7587498426437378, |
|
"eval_runtime": 247.0433, |
|
"eval_samples_per_second": 516.395, |
|
"eval_steps_per_second": 16.139, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 3.517903053865931, |
|
"grad_norm": 8.727328300476074, |
|
"learning_rate": 2.6870351338640284e-06, |
|
"loss": 0.9291, |
|
"step": 56100 |
|
}, |
|
{ |
|
"epoch": 3.5241738257979556, |
|
"grad_norm": 0.023664651438593864, |
|
"learning_rate": 2.6521973906530334e-06, |
|
"loss": 0.4387, |
|
"step": 56200 |
|
}, |
|
{ |
|
"epoch": 3.5304445977299803, |
|
"grad_norm": 2.834498405456543, |
|
"learning_rate": 2.617359647442039e-06, |
|
"loss": 0.4491, |
|
"step": 56300 |
|
}, |
|
{ |
|
"epoch": 3.5367153696620055, |
|
"grad_norm": 1.9824761152267456, |
|
"learning_rate": 2.582870281663154e-06, |
|
"loss": 0.506, |
|
"step": 56400 |
|
}, |
|
{ |
|
"epoch": 3.5429861415940302, |
|
"grad_norm": 0.7142437100410461, |
|
"learning_rate": 2.5480325384521594e-06, |
|
"loss": 0.6626, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 3.5429861415940302, |
|
"eval_loss": 0.7634491920471191, |
|
"eval_runtime": 247.6797, |
|
"eval_samples_per_second": 515.068, |
|
"eval_steps_per_second": 16.097, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 3.549256913526055, |
|
"grad_norm": 0.030130065977573395, |
|
"learning_rate": 2.5131947952411645e-06, |
|
"loss": 0.8654, |
|
"step": 56600 |
|
}, |
|
{ |
|
"epoch": 3.5555276854580797, |
|
"grad_norm": 0.757265031337738, |
|
"learning_rate": 2.4783570520301695e-06, |
|
"loss": 0.4455, |
|
"step": 56700 |
|
}, |
|
{ |
|
"epoch": 3.561798457390105, |
|
"grad_norm": 130.99807739257812, |
|
"learning_rate": 2.443519308819175e-06, |
|
"loss": 0.4593, |
|
"step": 56800 |
|
}, |
|
{ |
|
"epoch": 3.5680692293221297, |
|
"grad_norm": 86.36803436279297, |
|
"learning_rate": 2.40868156560818e-06, |
|
"loss": 0.878, |
|
"step": 56900 |
|
}, |
|
{ |
|
"epoch": 3.5743400012541544, |
|
"grad_norm": 0.8545703887939453, |
|
"learning_rate": 2.373843822397185e-06, |
|
"loss": 0.3737, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 3.5743400012541544, |
|
"eval_loss": 0.7617383599281311, |
|
"eval_runtime": 246.3862, |
|
"eval_samples_per_second": 517.773, |
|
"eval_steps_per_second": 16.182, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 3.580610773186179, |
|
"grad_norm": 100.52796173095703, |
|
"learning_rate": 2.3390060791861902e-06, |
|
"loss": 0.377, |
|
"step": 57100 |
|
}, |
|
{ |
|
"epoch": 3.586881545118204, |
|
"grad_norm": 31.44060516357422, |
|
"learning_rate": 2.3041683359751957e-06, |
|
"loss": 0.6894, |
|
"step": 57200 |
|
}, |
|
{ |
|
"epoch": 3.5931523170502286, |
|
"grad_norm": 0.2915436625480652, |
|
"learning_rate": 2.2693305927642008e-06, |
|
"loss": 0.6635, |
|
"step": 57300 |
|
}, |
|
{ |
|
"epoch": 3.599423088982254, |
|
"grad_norm": 0.009617321193218231, |
|
"learning_rate": 2.2344928495532063e-06, |
|
"loss": 0.9224, |
|
"step": 57400 |
|
}, |
|
{ |
|
"epoch": 3.6056938609142786, |
|
"grad_norm": 0.21305809915065765, |
|
"learning_rate": 2.1996551063422113e-06, |
|
"loss": 0.635, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 3.6056938609142786, |
|
"eval_loss": 0.7668555974960327, |
|
"eval_runtime": 248.1105, |
|
"eval_samples_per_second": 514.174, |
|
"eval_steps_per_second": 16.069, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 3.6119646328463033, |
|
"grad_norm": 11.654231071472168, |
|
"learning_rate": 2.164817363131217e-06, |
|
"loss": 0.6797, |
|
"step": 57600 |
|
}, |
|
{ |
|
"epoch": 3.618235404778328, |
|
"grad_norm": 1.0893511772155762, |
|
"learning_rate": 2.1299796199202215e-06, |
|
"loss": 0.9814, |
|
"step": 57700 |
|
}, |
|
{ |
|
"epoch": 3.6245061767103532, |
|
"grad_norm": 0.3305797278881073, |
|
"learning_rate": 2.095141876709227e-06, |
|
"loss": 0.9893, |
|
"step": 57800 |
|
}, |
|
{ |
|
"epoch": 3.630776948642378, |
|
"grad_norm": 0.10635466873645782, |
|
"learning_rate": 2.060304133498232e-06, |
|
"loss": 0.6753, |
|
"step": 57900 |
|
}, |
|
{ |
|
"epoch": 3.6370477205744027, |
|
"grad_norm": 0.09898664057254791, |
|
"learning_rate": 2.0254663902872375e-06, |
|
"loss": 0.8349, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 3.6370477205744027, |
|
"eval_loss": 0.7500940561294556, |
|
"eval_runtime": 252.1601, |
|
"eval_samples_per_second": 505.917, |
|
"eval_steps_per_second": 15.811, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 3.6433184925064275, |
|
"grad_norm": 0.05218241736292839, |
|
"learning_rate": 1.9906286470762426e-06, |
|
"loss": 0.8523, |
|
"step": 58100 |
|
}, |
|
{ |
|
"epoch": 3.649589264438452, |
|
"grad_norm": 0.648098886013031, |
|
"learning_rate": 1.9557909038652477e-06, |
|
"loss": 0.2962, |
|
"step": 58200 |
|
}, |
|
{ |
|
"epoch": 3.655860036370477, |
|
"grad_norm": 19.993263244628906, |
|
"learning_rate": 1.920953160654253e-06, |
|
"loss": 0.6585, |
|
"step": 58300 |
|
}, |
|
{ |
|
"epoch": 3.662130808302502, |
|
"grad_norm": 549.2650146484375, |
|
"learning_rate": 1.886115417443258e-06, |
|
"loss": 1.0247, |
|
"step": 58400 |
|
}, |
|
{ |
|
"epoch": 3.668401580234527, |
|
"grad_norm": 53.44794464111328, |
|
"learning_rate": 1.8516260516643734e-06, |
|
"loss": 0.8638, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 3.668401580234527, |
|
"eval_loss": 0.7576786279678345, |
|
"eval_runtime": 251.8307, |
|
"eval_samples_per_second": 506.578, |
|
"eval_steps_per_second": 15.832, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 3.6746723521665516, |
|
"grad_norm": 0.11638414114713669, |
|
"learning_rate": 1.8167883084533785e-06, |
|
"loss": 0.9456, |
|
"step": 58600 |
|
}, |
|
{ |
|
"epoch": 3.6809431240985764, |
|
"grad_norm": 0.16805872321128845, |
|
"learning_rate": 1.7819505652423837e-06, |
|
"loss": 0.5401, |
|
"step": 58700 |
|
}, |
|
{ |
|
"epoch": 3.6872138960306016, |
|
"grad_norm": 161.84934997558594, |
|
"learning_rate": 1.747112822031389e-06, |
|
"loss": 0.6602, |
|
"step": 58800 |
|
}, |
|
{ |
|
"epoch": 3.6934846679626263, |
|
"grad_norm": 0.16537758708000183, |
|
"learning_rate": 1.7122750788203943e-06, |
|
"loss": 0.7543, |
|
"step": 58900 |
|
}, |
|
{ |
|
"epoch": 3.699755439894651, |
|
"grad_norm": 82.06924438476562, |
|
"learning_rate": 1.6774373356093992e-06, |
|
"loss": 0.7893, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 3.699755439894651, |
|
"eval_loss": 0.7599766850471497, |
|
"eval_runtime": 255.6784, |
|
"eval_samples_per_second": 498.955, |
|
"eval_steps_per_second": 15.594, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 3.706026211826676, |
|
"grad_norm": 123.94532012939453, |
|
"learning_rate": 1.6425995923984044e-06, |
|
"loss": 0.7746, |
|
"step": 59100 |
|
}, |
|
{ |
|
"epoch": 3.7122969837587005, |
|
"grad_norm": 0.06561436504125595, |
|
"learning_rate": 1.6077618491874097e-06, |
|
"loss": 0.6539, |
|
"step": 59200 |
|
}, |
|
{ |
|
"epoch": 3.7185677556907253, |
|
"grad_norm": 243.56668090820312, |
|
"learning_rate": 1.572924105976415e-06, |
|
"loss": 0.8083, |
|
"step": 59300 |
|
}, |
|
{ |
|
"epoch": 3.7248385276227505, |
|
"grad_norm": 0.13773566484451294, |
|
"learning_rate": 1.5380863627654203e-06, |
|
"loss": 0.3429, |
|
"step": 59400 |
|
}, |
|
{ |
|
"epoch": 3.731109299554775, |
|
"grad_norm": 412.2792053222656, |
|
"learning_rate": 1.5032486195544256e-06, |
|
"loss": 0.5005, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 3.731109299554775, |
|
"eval_loss": 0.7445316314697266, |
|
"eval_runtime": 251.276, |
|
"eval_samples_per_second": 507.697, |
|
"eval_steps_per_second": 15.867, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 3.7373800714868, |
|
"grad_norm": 10.323953628540039, |
|
"learning_rate": 1.4684108763434306e-06, |
|
"loss": 0.6238, |
|
"step": 59600 |
|
}, |
|
{ |
|
"epoch": 3.7436508434188247, |
|
"grad_norm": 34.32875061035156, |
|
"learning_rate": 1.4335731331324357e-06, |
|
"loss": 0.4343, |
|
"step": 59700 |
|
}, |
|
{ |
|
"epoch": 3.74992161535085, |
|
"grad_norm": 0.08429472893476486, |
|
"learning_rate": 1.398735389921441e-06, |
|
"loss": 0.8189, |
|
"step": 59800 |
|
}, |
|
{ |
|
"epoch": 3.7561923872828746, |
|
"grad_norm": 68.88423156738281, |
|
"learning_rate": 1.3638976467104463e-06, |
|
"loss": 0.6272, |
|
"step": 59900 |
|
}, |
|
{ |
|
"epoch": 3.7624631592148994, |
|
"grad_norm": 0.1870589703321457, |
|
"learning_rate": 1.3290599034994513e-06, |
|
"loss": 0.2982, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 3.7624631592148994, |
|
"eval_loss": 0.7597461342811584, |
|
"eval_runtime": 254.4738, |
|
"eval_samples_per_second": 501.317, |
|
"eval_steps_per_second": 15.668, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 3.768733931146924, |
|
"grad_norm": 0.051242515444755554, |
|
"learning_rate": 1.2942221602884566e-06, |
|
"loss": 0.7028, |
|
"step": 60100 |
|
}, |
|
{ |
|
"epoch": 3.775004703078949, |
|
"grad_norm": 187.53872680664062, |
|
"learning_rate": 1.2593844170774619e-06, |
|
"loss": 0.9447, |
|
"step": 60200 |
|
}, |
|
{ |
|
"epoch": 3.7812754750109736, |
|
"grad_norm": 64.70340728759766, |
|
"learning_rate": 1.224546673866467e-06, |
|
"loss": 0.6175, |
|
"step": 60300 |
|
}, |
|
{ |
|
"epoch": 3.787546246942999, |
|
"grad_norm": 0.8817376494407654, |
|
"learning_rate": 1.1897089306554722e-06, |
|
"loss": 0.5856, |
|
"step": 60400 |
|
}, |
|
{ |
|
"epoch": 3.7938170188750235, |
|
"grad_norm": 88.64114379882812, |
|
"learning_rate": 1.1548711874444775e-06, |
|
"loss": 0.8249, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 3.7938170188750235, |
|
"eval_loss": 0.750523030757904, |
|
"eval_runtime": 252.8744, |
|
"eval_samples_per_second": 504.488, |
|
"eval_steps_per_second": 15.767, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 3.8000877908070483, |
|
"grad_norm": 0.041767679154872894, |
|
"learning_rate": 1.1203818216655927e-06, |
|
"loss": 0.6617, |
|
"step": 60600 |
|
}, |
|
{ |
|
"epoch": 3.806358562739073, |
|
"grad_norm": 74.78905487060547, |
|
"learning_rate": 1.085544078454598e-06, |
|
"loss": 0.5767, |
|
"step": 60700 |
|
}, |
|
{ |
|
"epoch": 3.812629334671098, |
|
"grad_norm": 0.11142675578594208, |
|
"learning_rate": 1.050706335243603e-06, |
|
"loss": 1.0094, |
|
"step": 60800 |
|
}, |
|
{ |
|
"epoch": 3.818900106603123, |
|
"grad_norm": 92.60441589355469, |
|
"learning_rate": 1.0158685920326083e-06, |
|
"loss": 0.471, |
|
"step": 60900 |
|
}, |
|
{ |
|
"epoch": 3.8251708785351477, |
|
"grad_norm": 42.58308410644531, |
|
"learning_rate": 9.810308488216134e-07, |
|
"loss": 0.6313, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 3.8251708785351477, |
|
"eval_loss": 0.7488948702812195, |
|
"eval_runtime": 251.0082, |
|
"eval_samples_per_second": 508.238, |
|
"eval_steps_per_second": 15.884, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 3.8314416504671724, |
|
"grad_norm": 46.6805305480957, |
|
"learning_rate": 9.461931056106186e-07, |
|
"loss": 0.6545, |
|
"step": 61100 |
|
}, |
|
{ |
|
"epoch": 3.837712422399197, |
|
"grad_norm": 0.06978940218687057, |
|
"learning_rate": 9.113553623996238e-07, |
|
"loss": 0.699, |
|
"step": 61200 |
|
}, |
|
{ |
|
"epoch": 3.843983194331222, |
|
"grad_norm": 0.933862030506134, |
|
"learning_rate": 8.76517619188629e-07, |
|
"loss": 0.6272, |
|
"step": 61300 |
|
}, |
|
{ |
|
"epoch": 3.850253966263247, |
|
"grad_norm": 44.13498306274414, |
|
"learning_rate": 8.416798759776343e-07, |
|
"loss": 0.7375, |
|
"step": 61400 |
|
}, |
|
{ |
|
"epoch": 3.856524738195272, |
|
"grad_norm": 3.0953245162963867, |
|
"learning_rate": 8.068421327666394e-07, |
|
"loss": 0.4213, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 3.856524738195272, |
|
"eval_loss": 0.7490043640136719, |
|
"eval_runtime": 251.9621, |
|
"eval_samples_per_second": 506.314, |
|
"eval_steps_per_second": 15.824, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 3.8627955101272966, |
|
"grad_norm": 15.084046363830566, |
|
"learning_rate": 7.720043895556446e-07, |
|
"loss": 0.6631, |
|
"step": 61600 |
|
}, |
|
{ |
|
"epoch": 3.8690662820593213, |
|
"grad_norm": 34.3710823059082, |
|
"learning_rate": 7.371666463446499e-07, |
|
"loss": 0.552, |
|
"step": 61700 |
|
}, |
|
{ |
|
"epoch": 3.8753370539913465, |
|
"grad_norm": 0.2596281170845032, |
|
"learning_rate": 7.023289031336551e-07, |
|
"loss": 0.7041, |
|
"step": 61800 |
|
}, |
|
{ |
|
"epoch": 3.8816078259233713, |
|
"grad_norm": 0.04028361290693283, |
|
"learning_rate": 6.674911599226603e-07, |
|
"loss": 0.8457, |
|
"step": 61900 |
|
}, |
|
{ |
|
"epoch": 3.887878597855396, |
|
"grad_norm": 0.2941274344921112, |
|
"learning_rate": 6.326534167116654e-07, |
|
"loss": 0.8104, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 3.887878597855396, |
|
"eval_loss": 0.7476946115493774, |
|
"eval_runtime": 276.1611, |
|
"eval_samples_per_second": 461.948, |
|
"eval_steps_per_second": 14.437, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 3.8941493697874208, |
|
"grad_norm": 51.24428939819336, |
|
"learning_rate": 5.978156735006706e-07, |
|
"loss": 0.4494, |
|
"step": 62100 |
|
}, |
|
{ |
|
"epoch": 3.9004201417194455, |
|
"grad_norm": 89.3067855834961, |
|
"learning_rate": 5.629779302896759e-07, |
|
"loss": 0.6947, |
|
"step": 62200 |
|
}, |
|
{ |
|
"epoch": 3.9066909136514703, |
|
"grad_norm": 0.06883756071329117, |
|
"learning_rate": 5.281401870786811e-07, |
|
"loss": 0.8061, |
|
"step": 62300 |
|
}, |
|
{ |
|
"epoch": 3.9129616855834954, |
|
"grad_norm": 0.8000829219818115, |
|
"learning_rate": 4.933024438676863e-07, |
|
"loss": 0.416, |
|
"step": 62400 |
|
}, |
|
{ |
|
"epoch": 3.91923245751552, |
|
"grad_norm": 119.61589813232422, |
|
"learning_rate": 4.5846470065669146e-07, |
|
"loss": 0.7359, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 3.91923245751552, |
|
"eval_loss": 0.7468039989471436, |
|
"eval_runtime": 257.4303, |
|
"eval_samples_per_second": 495.559, |
|
"eval_steps_per_second": 15.488, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 3.925503229447545, |
|
"grad_norm": 0.29899609088897705, |
|
"learning_rate": 4.2362695744569673e-07, |
|
"loss": 0.7408, |
|
"step": 62600 |
|
}, |
|
{ |
|
"epoch": 3.9317740013795697, |
|
"grad_norm": 112.43661499023438, |
|
"learning_rate": 3.887892142347019e-07, |
|
"loss": 0.6255, |
|
"step": 62700 |
|
}, |
|
{ |
|
"epoch": 3.938044773311595, |
|
"grad_norm": 474.4875793457031, |
|
"learning_rate": 3.5395147102370713e-07, |
|
"loss": 0.7865, |
|
"step": 62800 |
|
}, |
|
{ |
|
"epoch": 3.9443155452436196, |
|
"grad_norm": 0.3225669860839844, |
|
"learning_rate": 3.191137278127123e-07, |
|
"loss": 0.4879, |
|
"step": 62900 |
|
}, |
|
{ |
|
"epoch": 3.9505863171756443, |
|
"grad_norm": 7.089817047119141, |
|
"learning_rate": 2.8427598460171753e-07, |
|
"loss": 0.5196, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 3.9505863171756443, |
|
"eval_loss": 0.7484961748123169, |
|
"eval_runtime": 258.5356, |
|
"eval_samples_per_second": 493.441, |
|
"eval_steps_per_second": 15.421, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 3.956857089107669, |
|
"grad_norm": 0.09167669713497162, |
|
"learning_rate": 2.494382413907227e-07, |
|
"loss": 0.5683, |
|
"step": 63100 |
|
}, |
|
{ |
|
"epoch": 3.963127861039694, |
|
"grad_norm": 12.482440948486328, |
|
"learning_rate": 2.1460049817972793e-07, |
|
"loss": 0.5141, |
|
"step": 63200 |
|
}, |
|
{ |
|
"epoch": 3.9693986329717186, |
|
"grad_norm": 8.954193115234375, |
|
"learning_rate": 1.8011113240084312e-07, |
|
"loss": 0.6068, |
|
"step": 63300 |
|
}, |
|
{ |
|
"epoch": 3.9756694049037438, |
|
"grad_norm": 219.27337646484375, |
|
"learning_rate": 1.452733891898483e-07, |
|
"loss": 0.5929, |
|
"step": 63400 |
|
}, |
|
{ |
|
"epoch": 3.9819401768357685, |
|
"grad_norm": 1.6949673891067505, |
|
"learning_rate": 1.104356459788535e-07, |
|
"loss": 0.7513, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 3.9819401768357685, |
|
"eval_loss": 0.7482015490531921, |
|
"eval_runtime": 257.9174, |
|
"eval_samples_per_second": 494.623, |
|
"eval_steps_per_second": 15.458, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 3.9882109487677933, |
|
"grad_norm": 0.34383705258369446, |
|
"learning_rate": 7.55979027678587e-08, |
|
"loss": 0.5053, |
|
"step": 63600 |
|
}, |
|
{ |
|
"epoch": 3.994481720699818, |
|
"grad_norm": 0.20212756097316742, |
|
"learning_rate": 4.0760159556863914e-08, |
|
"loss": 0.5707, |
|
"step": 63700 |
|
} |
|
  ],
  "logging_steps": 100,
  "max_steps": 63788,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}