|
{ |
|
"best_metric": 0.3695652173913043, |
|
"best_model_checkpoint": "swinv2-tiny-patch4-window8-256-DMAE-da3-colab/checkpoint-1485", |
|
"epoch": 117.33333333333333, |
|
"eval_steps": 500, |
|
"global_step": 2640, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.4444444444444444, |
|
"grad_norm": 3.825187921524048, |
|
"learning_rate": 0.0009962121212121213, |
|
"loss": 1.5683, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.8888888888888888, |
|
"grad_norm": 5.65834903717041, |
|
"learning_rate": 0.0009924242424242424, |
|
"loss": 1.3523, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.9777777777777777, |
|
"eval_accuracy": 0.32608695652173914, |
|
"eval_loss": 1.4024076461791992, |
|
"eval_runtime": 1.0691, |
|
"eval_samples_per_second": 43.028, |
|
"eval_steps_per_second": 2.806, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 1.3333333333333333, |
|
"grad_norm": 10.152283668518066, |
|
"learning_rate": 0.0009886363636363636, |
|
"loss": 1.2778, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 1.7777777777777777, |
|
"grad_norm": 3.099470376968384, |
|
"learning_rate": 0.000984848484848485, |
|
"loss": 1.3805, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.2608695652173913, |
|
"eval_loss": 1.3775243759155273, |
|
"eval_runtime": 0.9313, |
|
"eval_samples_per_second": 49.394, |
|
"eval_steps_per_second": 3.221, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 2.2222222222222223, |
|
"grad_norm": 4.7406511306762695, |
|
"learning_rate": 0.0009810606060606062, |
|
"loss": 1.3708, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 2.6666666666666665, |
|
"grad_norm": 3.888740301132202, |
|
"learning_rate": 0.0009772727272727272, |
|
"loss": 1.3221, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 2.977777777777778, |
|
"eval_accuracy": 0.30434782608695654, |
|
"eval_loss": 1.4418785572052002, |
|
"eval_runtime": 0.8857, |
|
"eval_samples_per_second": 51.938, |
|
"eval_steps_per_second": 3.387, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 3.111111111111111, |
|
"grad_norm": 2.308217763900757, |
|
"learning_rate": 0.0009734848484848485, |
|
"loss": 1.2355, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 3.5555555555555554, |
|
"grad_norm": 5.7985053062438965, |
|
"learning_rate": 0.0009696969696969698, |
|
"loss": 1.2409, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 110.87601470947266, |
|
"learning_rate": 0.000965909090909091, |
|
"loss": 1.297, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.32608695652173914, |
|
"eval_loss": 1.3581539392471313, |
|
"eval_runtime": 0.8843, |
|
"eval_samples_per_second": 52.019, |
|
"eval_steps_per_second": 3.393, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 4.444444444444445, |
|
"grad_norm": 4.244718551635742, |
|
"learning_rate": 0.0009621212121212122, |
|
"loss": 1.3056, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 4.888888888888889, |
|
"grad_norm": 4.412840843200684, |
|
"learning_rate": 0.0009583333333333334, |
|
"loss": 1.353, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 4.977777777777778, |
|
"eval_accuracy": 0.34782608695652173, |
|
"eval_loss": 1.3405784368515015, |
|
"eval_runtime": 0.8733, |
|
"eval_samples_per_second": 52.672, |
|
"eval_steps_per_second": 3.435, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 5.333333333333333, |
|
"grad_norm": 2.0132997035980225, |
|
"learning_rate": 0.0009545454545454546, |
|
"loss": 1.3115, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 5.777777777777778, |
|
"grad_norm": 3.7579846382141113, |
|
"learning_rate": 0.0009507575757575758, |
|
"loss": 1.2627, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.15217391304347827, |
|
"eval_loss": 1.3823662996292114, |
|
"eval_runtime": 0.8973, |
|
"eval_samples_per_second": 51.264, |
|
"eval_steps_per_second": 3.343, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 6.222222222222222, |
|
"grad_norm": 3.731177806854248, |
|
"learning_rate": 0.000946969696969697, |
|
"loss": 1.2545, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 6.666666666666667, |
|
"grad_norm": 3.8037898540496826, |
|
"learning_rate": 0.0009431818181818183, |
|
"loss": 1.3006, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 6.977777777777778, |
|
"eval_accuracy": 0.15217391304347827, |
|
"eval_loss": 1.4008022546768188, |
|
"eval_runtime": 1.1693, |
|
"eval_samples_per_second": 39.34, |
|
"eval_steps_per_second": 2.566, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 7.111111111111111, |
|
"grad_norm": 2.627021074295044, |
|
"learning_rate": 0.0009393939393939394, |
|
"loss": 1.2558, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 7.555555555555555, |
|
"grad_norm": 2.487823247909546, |
|
"learning_rate": 0.0009356060606060606, |
|
"loss": 1.216, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 2.525559902191162, |
|
"learning_rate": 0.0009318181818181818, |
|
"loss": 1.2438, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.32608695652173914, |
|
"eval_loss": 1.3769112825393677, |
|
"eval_runtime": 0.9024, |
|
"eval_samples_per_second": 50.973, |
|
"eval_steps_per_second": 3.324, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 8.444444444444445, |
|
"grad_norm": 9.093722343444824, |
|
"learning_rate": 0.000928030303030303, |
|
"loss": 1.2023, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 8.88888888888889, |
|
"grad_norm": 7.364065170288086, |
|
"learning_rate": 0.0009242424242424242, |
|
"loss": 1.222, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 8.977777777777778, |
|
"eval_accuracy": 0.30434782608695654, |
|
"eval_loss": 1.421162486076355, |
|
"eval_runtime": 0.8938, |
|
"eval_samples_per_second": 51.468, |
|
"eval_steps_per_second": 3.357, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 9.333333333333334, |
|
"grad_norm": 3.667402744293213, |
|
"learning_rate": 0.0009204545454545455, |
|
"loss": 1.2186, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 9.777777777777779, |
|
"grad_norm": 10.466683387756348, |
|
"learning_rate": 0.0009166666666666666, |
|
"loss": 1.2221, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.2391304347826087, |
|
"eval_loss": 1.4223273992538452, |
|
"eval_runtime": 0.8601, |
|
"eval_samples_per_second": 53.48, |
|
"eval_steps_per_second": 3.488, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 10.222222222222221, |
|
"grad_norm": 13.964035987854004, |
|
"learning_rate": 0.0009128787878787878, |
|
"loss": 1.2394, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 10.666666666666666, |
|
"grad_norm": 4.716994762420654, |
|
"learning_rate": 0.0009090909090909091, |
|
"loss": 1.2262, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 10.977777777777778, |
|
"eval_accuracy": 0.2608695652173913, |
|
"eval_loss": 1.4154268503189087, |
|
"eval_runtime": 1.2346, |
|
"eval_samples_per_second": 37.258, |
|
"eval_steps_per_second": 2.43, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 11.11111111111111, |
|
"grad_norm": 11.025979995727539, |
|
"learning_rate": 0.0009053030303030303, |
|
"loss": 1.2196, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 11.555555555555555, |
|
"grad_norm": 22.928346633911133, |
|
"learning_rate": 0.0009015151515151515, |
|
"loss": 1.2131, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"grad_norm": 7.0689263343811035, |
|
"learning_rate": 0.0008977272727272727, |
|
"loss": 1.2381, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.2391304347826087, |
|
"eval_loss": 1.3327127695083618, |
|
"eval_runtime": 0.884, |
|
"eval_samples_per_second": 52.039, |
|
"eval_steps_per_second": 3.394, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 12.444444444444445, |
|
"grad_norm": 3.511734962463379, |
|
"learning_rate": 0.000893939393939394, |
|
"loss": 1.1634, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 12.88888888888889, |
|
"grad_norm": 33.742347717285156, |
|
"learning_rate": 0.0008901515151515151, |
|
"loss": 1.227, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 12.977777777777778, |
|
"eval_accuracy": 0.2826086956521739, |
|
"eval_loss": 1.288680076599121, |
|
"eval_runtime": 0.8658, |
|
"eval_samples_per_second": 53.132, |
|
"eval_steps_per_second": 3.465, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 13.333333333333334, |
|
"grad_norm": 6.175626277923584, |
|
"learning_rate": 0.0008863636363636364, |
|
"loss": 1.2082, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 13.777777777777779, |
|
"grad_norm": 11.057406425476074, |
|
"learning_rate": 0.0008825757575757576, |
|
"loss": 1.2158, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 0.2608695652173913, |
|
"eval_loss": 1.3465280532836914, |
|
"eval_runtime": 0.872, |
|
"eval_samples_per_second": 52.749, |
|
"eval_steps_per_second": 3.44, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 14.222222222222221, |
|
"grad_norm": 37.48932647705078, |
|
"learning_rate": 0.0008787878787878789, |
|
"loss": 1.2026, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 14.666666666666666, |
|
"grad_norm": 4.4914960861206055, |
|
"learning_rate": 0.000875, |
|
"loss": 1.2174, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 14.977777777777778, |
|
"eval_accuracy": 0.30434782608695654, |
|
"eval_loss": 1.34762704372406, |
|
"eval_runtime": 1.1749, |
|
"eval_samples_per_second": 39.152, |
|
"eval_steps_per_second": 2.553, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 15.11111111111111, |
|
"grad_norm": 3.3884503841400146, |
|
"learning_rate": 0.0008712121212121212, |
|
"loss": 1.1947, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 15.555555555555555, |
|
"grad_norm": 3.722648859024048, |
|
"learning_rate": 0.0008674242424242425, |
|
"loss": 1.2054, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"grad_norm": 15.185981750488281, |
|
"learning_rate": 0.0008636363636363636, |
|
"loss": 1.1767, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.1956521739130435, |
|
"eval_loss": 1.4023534059524536, |
|
"eval_runtime": 0.8914, |
|
"eval_samples_per_second": 51.601, |
|
"eval_steps_per_second": 3.365, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 16.444444444444443, |
|
"grad_norm": 4.847630500793457, |
|
"learning_rate": 0.0008598484848484849, |
|
"loss": 1.1721, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 16.88888888888889, |
|
"grad_norm": 3.731781482696533, |
|
"learning_rate": 0.0008560606060606061, |
|
"loss": 1.2067, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 16.977777777777778, |
|
"eval_accuracy": 0.17391304347826086, |
|
"eval_loss": 1.3664109706878662, |
|
"eval_runtime": 0.8715, |
|
"eval_samples_per_second": 52.783, |
|
"eval_steps_per_second": 3.442, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 17.333333333333332, |
|
"grad_norm": 3.2619216442108154, |
|
"learning_rate": 0.0008522727272727273, |
|
"loss": 1.1866, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 17.77777777777778, |
|
"grad_norm": 11.475706100463867, |
|
"learning_rate": 0.0008484848484848485, |
|
"loss": 1.2303, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_accuracy": 0.2826086956521739, |
|
"eval_loss": 1.4259557723999023, |
|
"eval_runtime": 0.8714, |
|
"eval_samples_per_second": 52.786, |
|
"eval_steps_per_second": 3.443, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 18.22222222222222, |
|
"grad_norm": 18.98031997680664, |
|
"learning_rate": 0.0008446969696969698, |
|
"loss": 1.1939, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 18.666666666666668, |
|
"grad_norm": 2.759476900100708, |
|
"learning_rate": 0.000840909090909091, |
|
"loss": 1.222, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 18.977777777777778, |
|
"eval_accuracy": 0.17391304347826086, |
|
"eval_loss": 1.480705976486206, |
|
"eval_runtime": 1.1937, |
|
"eval_samples_per_second": 38.535, |
|
"eval_steps_per_second": 2.513, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 19.11111111111111, |
|
"grad_norm": 4.408325672149658, |
|
"learning_rate": 0.0008371212121212122, |
|
"loss": 1.1725, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 19.555555555555557, |
|
"grad_norm": 2.795828104019165, |
|
"learning_rate": 0.0008333333333333334, |
|
"loss": 1.1577, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"grad_norm": 8.652217864990234, |
|
"learning_rate": 0.0008295454545454546, |
|
"loss": 1.2026, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_accuracy": 0.17391304347826086, |
|
"eval_loss": 1.3851475715637207, |
|
"eval_runtime": 0.8768, |
|
"eval_samples_per_second": 52.465, |
|
"eval_steps_per_second": 3.422, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 20.444444444444443, |
|
"grad_norm": 4.27825927734375, |
|
"learning_rate": 0.0008257575757575758, |
|
"loss": 1.1934, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 20.88888888888889, |
|
"grad_norm": 3.146332263946533, |
|
"learning_rate": 0.000821969696969697, |
|
"loss": 1.2185, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 20.977777777777778, |
|
"eval_accuracy": 0.2608695652173913, |
|
"eval_loss": 1.3213552236557007, |
|
"eval_runtime": 0.889, |
|
"eval_samples_per_second": 51.741, |
|
"eval_steps_per_second": 3.374, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 21.333333333333332, |
|
"grad_norm": 6.067198753356934, |
|
"learning_rate": 0.0008181818181818183, |
|
"loss": 1.2416, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 21.77777777777778, |
|
"grad_norm": 1.9041274785995483, |
|
"learning_rate": 0.0008143939393939394, |
|
"loss": 1.2773, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"eval_accuracy": 0.1956521739130435, |
|
"eval_loss": 1.4403716325759888, |
|
"eval_runtime": 0.9305, |
|
"eval_samples_per_second": 49.436, |
|
"eval_steps_per_second": 3.224, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 22.22222222222222, |
|
"grad_norm": 3.5751218795776367, |
|
"learning_rate": 0.0008106060606060606, |
|
"loss": 1.2544, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 22.666666666666668, |
|
"grad_norm": 6.662022590637207, |
|
"learning_rate": 0.0008068181818181818, |
|
"loss": 1.227, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 22.977777777777778, |
|
"eval_accuracy": 0.2391304347826087, |
|
"eval_loss": 1.453503966331482, |
|
"eval_runtime": 1.2139, |
|
"eval_samples_per_second": 37.894, |
|
"eval_steps_per_second": 2.471, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 23.11111111111111, |
|
"grad_norm": 2.344249725341797, |
|
"learning_rate": 0.000803030303030303, |
|
"loss": 1.1912, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 23.555555555555557, |
|
"grad_norm": 5.093134880065918, |
|
"learning_rate": 0.0007992424242424242, |
|
"loss": 1.211, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"grad_norm": 3.9864282608032227, |
|
"learning_rate": 0.0007954545454545455, |
|
"loss": 1.2032, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"eval_accuracy": 0.30434782608695654, |
|
"eval_loss": 1.3966683149337769, |
|
"eval_runtime": 0.8732, |
|
"eval_samples_per_second": 52.682, |
|
"eval_steps_per_second": 3.436, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 24.444444444444443, |
|
"grad_norm": 5.3367438316345215, |
|
"learning_rate": 0.0007916666666666666, |
|
"loss": 1.2051, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 24.88888888888889, |
|
"grad_norm": 3.129544973373413, |
|
"learning_rate": 0.0007878787878787878, |
|
"loss": 1.2223, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 24.977777777777778, |
|
"eval_accuracy": 0.32608695652173914, |
|
"eval_loss": 1.408994436264038, |
|
"eval_runtime": 0.8542, |
|
"eval_samples_per_second": 53.849, |
|
"eval_steps_per_second": 3.512, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 25.333333333333332, |
|
"grad_norm": 2.691143751144409, |
|
"learning_rate": 0.0007840909090909091, |
|
"loss": 1.2041, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 25.77777777777778, |
|
"grad_norm": 9.64910888671875, |
|
"learning_rate": 0.0007803030303030303, |
|
"loss": 1.2527, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"eval_accuracy": 0.2608695652173913, |
|
"eval_loss": 1.4858431816101074, |
|
"eval_runtime": 0.9084, |
|
"eval_samples_per_second": 50.639, |
|
"eval_steps_per_second": 3.303, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 26.22222222222222, |
|
"grad_norm": 2.8862998485565186, |
|
"learning_rate": 0.0007765151515151515, |
|
"loss": 1.1968, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 26.666666666666668, |
|
"grad_norm": 2.620485544204712, |
|
"learning_rate": 0.0007727272727272727, |
|
"loss": 1.2203, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 26.977777777777778, |
|
"eval_accuracy": 0.17391304347826086, |
|
"eval_loss": 1.4366178512573242, |
|
"eval_runtime": 0.888, |
|
"eval_samples_per_second": 51.8, |
|
"eval_steps_per_second": 3.378, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 27.11111111111111, |
|
"grad_norm": 1.672736406326294, |
|
"learning_rate": 0.000768939393939394, |
|
"loss": 1.182, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 27.555555555555557, |
|
"grad_norm": 4.047964096069336, |
|
"learning_rate": 0.0007651515151515151, |
|
"loss": 1.1824, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"grad_norm": 7.370471477508545, |
|
"learning_rate": 0.0007613636363636364, |
|
"loss": 1.1993, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"eval_accuracy": 0.2608695652173913, |
|
"eval_loss": 1.4055887460708618, |
|
"eval_runtime": 0.8703, |
|
"eval_samples_per_second": 52.856, |
|
"eval_steps_per_second": 3.447, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 28.444444444444443, |
|
"grad_norm": 2.3139352798461914, |
|
"learning_rate": 0.0007575757575757576, |
|
"loss": 1.1908, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 28.88888888888889, |
|
"grad_norm": 2.4274730682373047, |
|
"learning_rate": 0.0007537878787878788, |
|
"loss": 1.2014, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 28.977777777777778, |
|
"eval_accuracy": 0.30434782608695654, |
|
"eval_loss": 1.3755403757095337, |
|
"eval_runtime": 0.8706, |
|
"eval_samples_per_second": 52.837, |
|
"eval_steps_per_second": 3.446, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 29.333333333333332, |
|
"grad_norm": 2.012920618057251, |
|
"learning_rate": 0.00075, |
|
"loss": 1.2134, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 29.77777777777778, |
|
"grad_norm": 3.179375171661377, |
|
"learning_rate": 0.0007462121212121212, |
|
"loss": 1.2027, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"eval_accuracy": 0.2608695652173913, |
|
"eval_loss": 1.457945466041565, |
|
"eval_runtime": 1.1362, |
|
"eval_samples_per_second": 40.485, |
|
"eval_steps_per_second": 2.64, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 30.22222222222222, |
|
"grad_norm": 3.6860008239746094, |
|
"learning_rate": 0.0007424242424242425, |
|
"loss": 1.2086, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 30.666666666666668, |
|
"grad_norm": 1.5310014486312866, |
|
"learning_rate": 0.0007386363636363636, |
|
"loss": 1.1961, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 30.977777777777778, |
|
"eval_accuracy": 0.2608695652173913, |
|
"eval_loss": 1.4524133205413818, |
|
"eval_runtime": 0.9718, |
|
"eval_samples_per_second": 47.335, |
|
"eval_steps_per_second": 3.087, |
|
"step": 697 |
|
}, |
|
{ |
|
"epoch": 31.11111111111111, |
|
"grad_norm": 24.489850997924805, |
|
"learning_rate": 0.0007348484848484849, |
|
"loss": 1.2059, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 31.555555555555557, |
|
"grad_norm": 2.539658308029175, |
|
"learning_rate": 0.0007310606060606061, |
|
"loss": 1.1874, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"grad_norm": 2.7552974224090576, |
|
"learning_rate": 0.0007272727272727273, |
|
"loss": 1.1939, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"eval_accuracy": 0.2391304347826087, |
|
"eval_loss": 1.448819875717163, |
|
"eval_runtime": 0.8862, |
|
"eval_samples_per_second": 51.905, |
|
"eval_steps_per_second": 3.385, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 32.44444444444444, |
|
"grad_norm": 3.2986652851104736, |
|
"learning_rate": 0.0007234848484848485, |
|
"loss": 1.2196, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 32.888888888888886, |
|
"grad_norm": 2.5736711025238037, |
|
"learning_rate": 0.0007196969696969698, |
|
"loss": 1.1889, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 32.977777777777774, |
|
"eval_accuracy": 0.15217391304347827, |
|
"eval_loss": 1.456831693649292, |
|
"eval_runtime": 0.8853, |
|
"eval_samples_per_second": 51.958, |
|
"eval_steps_per_second": 3.389, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 33.333333333333336, |
|
"grad_norm": 5.722994327545166, |
|
"learning_rate": 0.0007159090909090909, |
|
"loss": 1.178, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 33.77777777777778, |
|
"grad_norm": 1.9777653217315674, |
|
"learning_rate": 0.0007121212121212122, |
|
"loss": 1.1871, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 34.0, |
|
"eval_accuracy": 0.32608695652173914, |
|
"eval_loss": 1.3814184665679932, |
|
"eval_runtime": 0.8797, |
|
"eval_samples_per_second": 52.292, |
|
"eval_steps_per_second": 3.41, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 34.22222222222222, |
|
"grad_norm": 2.5516679286956787, |
|
"learning_rate": 0.0007083333333333334, |
|
"loss": 1.2329, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 34.666666666666664, |
|
"grad_norm": 3.3046047687530518, |
|
"learning_rate": 0.0007045454545454546, |
|
"loss": 1.1778, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 34.977777777777774, |
|
"eval_accuracy": 0.13043478260869565, |
|
"eval_loss": 1.44027578830719, |
|
"eval_runtime": 1.196, |
|
"eval_samples_per_second": 38.461, |
|
"eval_steps_per_second": 2.508, |
|
"step": 787 |
|
}, |
|
{ |
|
"epoch": 35.111111111111114, |
|
"grad_norm": 1.8858623504638672, |
|
"learning_rate": 0.0007007575757575758, |
|
"loss": 1.2006, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 35.55555555555556, |
|
"grad_norm": 5.07556676864624, |
|
"learning_rate": 0.000696969696969697, |
|
"loss": 1.2925, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 36.0, |
|
"grad_norm": 5.501034259796143, |
|
"learning_rate": 0.0006931818181818183, |
|
"loss": 1.2404, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 36.0, |
|
"eval_accuracy": 0.1956521739130435, |
|
"eval_loss": 1.4436728954315186, |
|
"eval_runtime": 0.9177, |
|
"eval_samples_per_second": 50.125, |
|
"eval_steps_per_second": 3.269, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 36.44444444444444, |
|
"grad_norm": 2.563979387283325, |
|
"learning_rate": 0.0006893939393939394, |
|
"loss": 1.222, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 36.888888888888886, |
|
"grad_norm": 2.562971591949463, |
|
"learning_rate": 0.0006856060606060606, |
|
"loss": 1.197, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 36.977777777777774, |
|
"eval_accuracy": 0.21739130434782608, |
|
"eval_loss": 1.476518988609314, |
|
"eval_runtime": 0.8797, |
|
"eval_samples_per_second": 52.292, |
|
"eval_steps_per_second": 3.41, |
|
"step": 832 |
|
}, |
|
{ |
|
"epoch": 37.333333333333336, |
|
"grad_norm": 4.266129016876221, |
|
"learning_rate": 0.0006818181818181818, |
|
"loss": 1.1733, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 37.77777777777778, |
|
"grad_norm": 2.0012102127075195, |
|
"learning_rate": 0.000678030303030303, |
|
"loss": 1.2161, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 38.0, |
|
"eval_accuracy": 0.2391304347826087, |
|
"eval_loss": 1.3720359802246094, |
|
"eval_runtime": 0.8896, |
|
"eval_samples_per_second": 51.708, |
|
"eval_steps_per_second": 3.372, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 38.22222222222222, |
|
"grad_norm": 5.82732629776001, |
|
"learning_rate": 0.0006742424242424242, |
|
"loss": 1.194, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 38.666666666666664, |
|
"grad_norm": 3.146446466445923, |
|
"learning_rate": 0.0006704545454545455, |
|
"loss": 1.221, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 38.977777777777774, |
|
"eval_accuracy": 0.34782608695652173, |
|
"eval_loss": 1.3749516010284424, |
|
"eval_runtime": 1.1844, |
|
"eval_samples_per_second": 38.839, |
|
"eval_steps_per_second": 2.533, |
|
"step": 877 |
|
}, |
|
{ |
|
"epoch": 39.111111111111114, |
|
"grad_norm": 2.4704267978668213, |
|
"learning_rate": 0.0006666666666666666, |
|
"loss": 1.2082, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 39.55555555555556, |
|
"grad_norm": 3.377894878387451, |
|
"learning_rate": 0.0006628787878787878, |
|
"loss": 1.2203, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"grad_norm": 3.2576770782470703, |
|
"learning_rate": 0.0006590909090909091, |
|
"loss": 1.229, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"eval_accuracy": 0.2391304347826087, |
|
"eval_loss": 1.3404773473739624, |
|
"eval_runtime": 0.876, |
|
"eval_samples_per_second": 52.511, |
|
"eval_steps_per_second": 3.425, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 40.44444444444444, |
|
"grad_norm": 2.0540192127227783, |
|
"learning_rate": 0.0006553030303030303, |
|
"loss": 1.2222, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 40.888888888888886, |
|
"grad_norm": 2.336094617843628, |
|
"learning_rate": 0.0006515151515151515, |
|
"loss": 1.2046, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 40.977777777777774, |
|
"eval_accuracy": 0.2608695652173913, |
|
"eval_loss": 1.4231040477752686, |
|
"eval_runtime": 0.899, |
|
"eval_samples_per_second": 51.168, |
|
"eval_steps_per_second": 3.337, |
|
"step": 922 |
|
}, |
|
{ |
|
"epoch": 41.333333333333336, |
|
"grad_norm": 3.401913642883301, |
|
"learning_rate": 0.0006477272727272727, |
|
"loss": 1.2028, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 41.77777777777778, |
|
"grad_norm": 1.3715537786483765, |
|
"learning_rate": 0.000643939393939394, |
|
"loss": 1.2077, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 42.0, |
|
"eval_accuracy": 0.2391304347826087, |
|
"eval_loss": 1.4383732080459595, |
|
"eval_runtime": 1.0424, |
|
"eval_samples_per_second": 44.128, |
|
"eval_steps_per_second": 2.878, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 42.22222222222222, |
|
"grad_norm": 3.548417806625366, |
|
"learning_rate": 0.0006401515151515151, |
|
"loss": 1.2126, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 42.666666666666664, |
|
"grad_norm": 15.91876220703125, |
|
"learning_rate": 0.0006363636363636364, |
|
"loss": 1.1865, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 42.977777777777774, |
|
"eval_accuracy": 0.2608695652173913, |
|
"eval_loss": 1.4346174001693726, |
|
"eval_runtime": 0.9529, |
|
"eval_samples_per_second": 48.275, |
|
"eval_steps_per_second": 3.148, |
|
"step": 967 |
|
}, |
|
{ |
|
"epoch": 43.111111111111114, |
|
"grad_norm": 3.9566547870635986, |
|
"learning_rate": 0.0006325757575757576, |
|
"loss": 1.175, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 43.55555555555556, |
|
"grad_norm": 2.891098976135254, |
|
"learning_rate": 0.0006287878787878788, |
|
"loss": 1.2044, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 44.0, |
|
"grad_norm": 1.5360569953918457, |
|
"learning_rate": 0.000625, |
|
"loss": 1.1882, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 44.0, |
|
"eval_accuracy": 0.2826086956521739, |
|
"eval_loss": 1.367881417274475, |
|
"eval_runtime": 0.8863, |
|
"eval_samples_per_second": 51.903, |
|
"eval_steps_per_second": 3.385, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 44.44444444444444, |
|
"grad_norm": 2.5285489559173584, |
|
"learning_rate": 0.0006212121212121212, |
|
"loss": 1.1957, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 44.888888888888886, |
|
"grad_norm": 4.244635105133057, |
|
"learning_rate": 0.0006174242424242425, |
|
"loss": 1.2528, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 44.977777777777774, |
|
"eval_accuracy": 0.21739130434782608, |
|
"eval_loss": 1.3451467752456665, |
|
"eval_runtime": 0.8705, |
|
"eval_samples_per_second": 52.845, |
|
"eval_steps_per_second": 3.446, |
|
"step": 1012 |
|
}, |
|
{ |
|
"epoch": 45.333333333333336, |
|
"grad_norm": 1.46609365940094, |
|
"learning_rate": 0.0006136363636363636, |
|
"loss": 1.2534, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 45.77777777777778, |
|
"grad_norm": 9.328937530517578, |
|
"learning_rate": 0.0006098484848484849, |
|
"loss": 1.1836, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 46.0, |
|
"eval_accuracy": 0.2391304347826087, |
|
"eval_loss": 1.4912604093551636, |
|
"eval_runtime": 0.8907, |
|
"eval_samples_per_second": 51.643, |
|
"eval_steps_per_second": 3.368, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 46.22222222222222, |
|
"grad_norm": 2.901005744934082, |
|
"learning_rate": 0.0006060606060606061, |
|
"loss": 1.2226, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 46.666666666666664, |
|
"grad_norm": 4.646663188934326, |
|
"learning_rate": 0.0006022727272727273, |
|
"loss": 1.2009, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 46.977777777777774, |
|
"eval_accuracy": 0.32608695652173914, |
|
"eval_loss": 1.4841315746307373, |
|
"eval_runtime": 1.1901, |
|
"eval_samples_per_second": 38.652, |
|
"eval_steps_per_second": 2.521, |
|
"step": 1057 |
|
}, |
|
{ |
|
"epoch": 47.111111111111114, |
|
"grad_norm": 2.45768141746521, |
|
"learning_rate": 0.0005984848484848485, |
|
"loss": 1.219, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 47.55555555555556, |
|
"grad_norm": 5.484715938568115, |
|
"learning_rate": 0.0005946969696969698, |
|
"loss": 1.2043, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 48.0, |
|
"grad_norm": 62.28512954711914, |
|
"learning_rate": 0.0005909090909090909, |
|
"loss": 1.203, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 48.0, |
|
"eval_accuracy": 0.30434782608695654, |
|
"eval_loss": 1.4326200485229492, |
|
"eval_runtime": 0.8718, |
|
"eval_samples_per_second": 52.767, |
|
"eval_steps_per_second": 3.441, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 48.44444444444444, |
|
"grad_norm": 2.838622808456421, |
|
"learning_rate": 0.0005871212121212122, |
|
"loss": 1.2071, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 48.888888888888886, |
|
"grad_norm": 4.207474708557129, |
|
"learning_rate": 0.0005833333333333334, |
|
"loss": 1.1679, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 48.977777777777774, |
|
"eval_accuracy": 0.30434782608695654, |
|
"eval_loss": 1.3934518098831177, |
|
"eval_runtime": 0.857, |
|
"eval_samples_per_second": 53.673, |
|
"eval_steps_per_second": 3.5, |
|
"step": 1102 |
|
}, |
|
{ |
|
"epoch": 49.333333333333336, |
|
"grad_norm": 2.942615270614624, |
|
"learning_rate": 0.0005795454545454545, |
|
"loss": 1.1719, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 49.77777777777778, |
|
"grad_norm": 54.94511032104492, |
|
"learning_rate": 0.0005757575757575758, |
|
"loss": 1.179, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 50.0, |
|
"eval_accuracy": 0.1956521739130435, |
|
"eval_loss": 1.4185277223587036, |
|
"eval_runtime": 0.8854, |
|
"eval_samples_per_second": 51.953, |
|
"eval_steps_per_second": 3.388, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 50.22222222222222, |
|
"grad_norm": 11.695096969604492, |
|
"learning_rate": 0.000571969696969697, |
|
"loss": 1.1624, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 50.666666666666664, |
|
"grad_norm": 1.823878288269043, |
|
"learning_rate": 0.0005681818181818183, |
|
"loss": 1.1687, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 50.977777777777774, |
|
"eval_accuracy": 0.2826086956521739, |
|
"eval_loss": 1.3686347007751465, |
|
"eval_runtime": 1.2456, |
|
"eval_samples_per_second": 36.929, |
|
"eval_steps_per_second": 2.408, |
|
"step": 1147 |
|
}, |
|
{ |
|
"epoch": 51.111111111111114, |
|
"grad_norm": 17.766942977905273, |
|
"learning_rate": 0.0005643939393939394, |
|
"loss": 1.17, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 51.55555555555556, |
|
"grad_norm": 4.572807788848877, |
|
"learning_rate": 0.0005606060606060606, |
|
"loss": 1.1363, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 52.0, |
|
"grad_norm": 7.582568168640137, |
|
"learning_rate": 0.0005568181818181818, |
|
"loss": 1.1779, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 52.0, |
|
"eval_accuracy": 0.1956521739130435, |
|
"eval_loss": 1.4319127798080444, |
|
"eval_runtime": 0.895, |
|
"eval_samples_per_second": 51.396, |
|
"eval_steps_per_second": 3.352, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 52.44444444444444, |
|
"grad_norm": 2.6896812915802, |
|
"learning_rate": 0.000553030303030303, |
|
"loss": 1.1698, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 52.888888888888886, |
|
"grad_norm": 3.666240692138672, |
|
"learning_rate": 0.0005492424242424242, |
|
"loss": 1.1566, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 52.977777777777774, |
|
"eval_accuracy": 0.1956521739130435, |
|
"eval_loss": 1.3800519704818726, |
|
"eval_runtime": 0.9305, |
|
"eval_samples_per_second": 49.434, |
|
"eval_steps_per_second": 3.224, |
|
"step": 1192 |
|
}, |
|
{ |
|
"epoch": 53.333333333333336, |
|
"grad_norm": 4.159182071685791, |
|
"learning_rate": 0.0005454545454545455, |
|
"loss": 1.1785, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 53.77777777777778, |
|
"grad_norm": 5.972134590148926, |
|
"learning_rate": 0.0005416666666666666, |
|
"loss": 1.192, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 54.0, |
|
"eval_accuracy": 0.21739130434782608, |
|
"eval_loss": 1.3745651245117188, |
|
"eval_runtime": 1.1774, |
|
"eval_samples_per_second": 39.07, |
|
"eval_steps_per_second": 2.548, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 54.22222222222222, |
|
"grad_norm": 5.2233428955078125, |
|
"learning_rate": 0.0005378787878787878, |
|
"loss": 1.1768, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 54.666666666666664, |
|
"grad_norm": 5.244997501373291, |
|
"learning_rate": 0.0005340909090909091, |
|
"loss": 1.1803, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 54.977777777777774, |
|
"eval_accuracy": 0.1956521739130435, |
|
"eval_loss": 1.4016964435577393, |
|
"eval_runtime": 0.8875, |
|
"eval_samples_per_second": 51.83, |
|
"eval_steps_per_second": 3.38, |
|
"step": 1237 |
|
}, |
|
{ |
|
"epoch": 55.111111111111114, |
|
"grad_norm": 4.16229772567749, |
|
"learning_rate": 0.0005303030303030302, |
|
"loss": 1.1548, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 55.55555555555556, |
|
"grad_norm": 3.8485047817230225, |
|
"learning_rate": 0.0005265151515151515, |
|
"loss": 1.1629, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 56.0, |
|
"grad_norm": 3.398857593536377, |
|
"learning_rate": 0.0005227272727272727, |
|
"loss": 1.194, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 56.0, |
|
"eval_accuracy": 0.1956521739130435, |
|
"eval_loss": 1.4288326501846313, |
|
"eval_runtime": 0.8835, |
|
"eval_samples_per_second": 52.063, |
|
"eval_steps_per_second": 3.395, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 56.44444444444444, |
|
"grad_norm": 2.9012982845306396, |
|
"learning_rate": 0.000518939393939394, |
|
"loss": 1.1283, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 56.888888888888886, |
|
"grad_norm": 2.652462959289551, |
|
"learning_rate": 0.0005151515151515151, |
|
"loss": 1.1486, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 56.977777777777774, |
|
"eval_accuracy": 0.30434782608695654, |
|
"eval_loss": 1.392043113708496, |
|
"eval_runtime": 1.1872, |
|
"eval_samples_per_second": 38.747, |
|
"eval_steps_per_second": 2.527, |
|
"step": 1282 |
|
}, |
|
{ |
|
"epoch": 57.333333333333336, |
|
"grad_norm": 16.2806453704834, |
|
"learning_rate": 0.0005113636363636364, |
|
"loss": 1.154, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 57.77777777777778, |
|
"grad_norm": 2.9445173740386963, |
|
"learning_rate": 0.0005075757575757576, |
|
"loss": 1.1429, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 58.0, |
|
"eval_accuracy": 0.2391304347826087, |
|
"eval_loss": 1.461561918258667, |
|
"eval_runtime": 0.8733, |
|
"eval_samples_per_second": 52.675, |
|
"eval_steps_per_second": 3.435, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 58.22222222222222, |
|
"grad_norm": 4.916953086853027, |
|
"learning_rate": 0.0005037878787878788, |
|
"loss": 1.1694, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 58.666666666666664, |
|
"grad_norm": 5.632236957550049, |
|
"learning_rate": 0.0005, |
|
"loss": 1.1655, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 58.977777777777774, |
|
"eval_accuracy": 0.21739130434782608, |
|
"eval_loss": 1.4119428396224976, |
|
"eval_runtime": 0.8592, |
|
"eval_samples_per_second": 53.535, |
|
"eval_steps_per_second": 3.491, |
|
"step": 1327 |
|
}, |
|
{ |
|
"epoch": 59.111111111111114, |
|
"grad_norm": 10.658681869506836, |
|
"learning_rate": 0.0004962121212121212, |
|
"loss": 1.148, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 59.55555555555556, |
|
"grad_norm": 4.212299823760986, |
|
"learning_rate": 0.0004924242424242425, |
|
"loss": 1.1508, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 60.0, |
|
"grad_norm": 4.59652853012085, |
|
"learning_rate": 0.0004886363636363636, |
|
"loss": 1.1697, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 60.0, |
|
"eval_accuracy": 0.2608695652173913, |
|
"eval_loss": 1.3811644315719604, |
|
"eval_runtime": 0.9127, |
|
"eval_samples_per_second": 50.4, |
|
"eval_steps_per_second": 3.287, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 60.44444444444444, |
|
"grad_norm": 3.656782865524292, |
|
"learning_rate": 0.0004848484848484849, |
|
"loss": 1.1312, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 60.888888888888886, |
|
"grad_norm": 9.660019874572754, |
|
"learning_rate": 0.0004810606060606061, |
|
"loss": 1.1898, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 60.977777777777774, |
|
"eval_accuracy": 0.2391304347826087, |
|
"eval_loss": 1.4008588790893555, |
|
"eval_runtime": 1.1831, |
|
"eval_samples_per_second": 38.883, |
|
"eval_steps_per_second": 2.536, |
|
"step": 1372 |
|
}, |
|
{ |
|
"epoch": 61.333333333333336, |
|
"grad_norm": 5.424683570861816, |
|
"learning_rate": 0.0004772727272727273, |
|
"loss": 1.2188, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 61.77777777777778, |
|
"grad_norm": 3.420642375946045, |
|
"learning_rate": 0.0004734848484848485, |
|
"loss": 1.1882, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 62.0, |
|
"eval_accuracy": 0.2391304347826087, |
|
"eval_loss": 1.422127604484558, |
|
"eval_runtime": 1.0461, |
|
"eval_samples_per_second": 43.972, |
|
"eval_steps_per_second": 2.868, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 62.22222222222222, |
|
"grad_norm": 37.69420623779297, |
|
"learning_rate": 0.0004696969696969697, |
|
"loss": 1.1428, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 62.666666666666664, |
|
"grad_norm": 6.073638439178467, |
|
"learning_rate": 0.0004659090909090909, |
|
"loss": 1.134, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 62.977777777777774, |
|
"eval_accuracy": 0.2608695652173913, |
|
"eval_loss": 1.618972897529602, |
|
"eval_runtime": 0.888, |
|
"eval_samples_per_second": 51.801, |
|
"eval_steps_per_second": 3.378, |
|
"step": 1417 |
|
}, |
|
{ |
|
"epoch": 63.111111111111114, |
|
"grad_norm": 14.145001411437988, |
|
"learning_rate": 0.0004621212121212121, |
|
"loss": 1.177, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 63.55555555555556, |
|
"grad_norm": 3.415473222732544, |
|
"learning_rate": 0.0004583333333333333, |
|
"loss": 1.1739, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 64.0, |
|
"grad_norm": 9.372400283813477, |
|
"learning_rate": 0.00045454545454545455, |
|
"loss": 1.1748, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 64.0, |
|
"eval_accuracy": 0.2391304347826087, |
|
"eval_loss": 1.4336298704147339, |
|
"eval_runtime": 0.9015, |
|
"eval_samples_per_second": 51.025, |
|
"eval_steps_per_second": 3.328, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 64.44444444444444, |
|
"grad_norm": 3.4770920276641846, |
|
"learning_rate": 0.00045075757575757577, |
|
"loss": 1.1419, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 64.88888888888889, |
|
"grad_norm": 17.83055877685547, |
|
"learning_rate": 0.000446969696969697, |
|
"loss": 1.1439, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 64.97777777777777, |
|
"eval_accuracy": 0.1956521739130435, |
|
"eval_loss": 1.3744150400161743, |
|
"eval_runtime": 1.1623, |
|
"eval_samples_per_second": 39.576, |
|
"eval_steps_per_second": 2.581, |
|
"step": 1462 |
|
}, |
|
{ |
|
"epoch": 65.33333333333333, |
|
"grad_norm": 5.167716026306152, |
|
"learning_rate": 0.0004431818181818182, |
|
"loss": 1.1155, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 65.77777777777777, |
|
"grad_norm": 2.334927558898926, |
|
"learning_rate": 0.0004393939393939394, |
|
"loss": 1.1585, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 66.0, |
|
"eval_accuracy": 0.3695652173913043, |
|
"eval_loss": 1.3992067575454712, |
|
"eval_runtime": 0.8747, |
|
"eval_samples_per_second": 52.591, |
|
"eval_steps_per_second": 3.43, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 66.22222222222223, |
|
"grad_norm": 4.668221473693848, |
|
"learning_rate": 0.0004356060606060606, |
|
"loss": 1.136, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 66.66666666666667, |
|
"grad_norm": 2.9979135990142822, |
|
"learning_rate": 0.0004318181818181818, |
|
"loss": 1.1344, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 66.97777777777777, |
|
"eval_accuracy": 0.2391304347826087, |
|
"eval_loss": 1.3951774835586548, |
|
"eval_runtime": 0.8935, |
|
"eval_samples_per_second": 51.481, |
|
"eval_steps_per_second": 3.357, |
|
"step": 1507 |
|
}, |
|
{ |
|
"epoch": 67.11111111111111, |
|
"grad_norm": 2.9879891872406006, |
|
"learning_rate": 0.00042803030303030303, |
|
"loss": 1.1615, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 67.55555555555556, |
|
"grad_norm": 3.322258710861206, |
|
"learning_rate": 0.00042424242424242425, |
|
"loss": 1.1635, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 68.0, |
|
"grad_norm": 4.408764362335205, |
|
"learning_rate": 0.0004204545454545455, |
|
"loss": 1.1374, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 68.0, |
|
"eval_accuracy": 0.21739130434782608, |
|
"eval_loss": 1.3666102886199951, |
|
"eval_runtime": 0.8532, |
|
"eval_samples_per_second": 53.917, |
|
"eval_steps_per_second": 3.516, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 68.44444444444444, |
|
"grad_norm": 18.494497299194336, |
|
"learning_rate": 0.0004166666666666667, |
|
"loss": 1.126, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 68.88888888888889, |
|
"grad_norm": 6.762816905975342, |
|
"learning_rate": 0.0004128787878787879, |
|
"loss": 1.1252, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 68.97777777777777, |
|
"eval_accuracy": 0.2826086956521739, |
|
"eval_loss": 1.3704602718353271, |
|
"eval_runtime": 1.2029, |
|
"eval_samples_per_second": 38.24, |
|
"eval_steps_per_second": 2.494, |
|
"step": 1552 |
|
}, |
|
{ |
|
"epoch": 69.33333333333333, |
|
"grad_norm": 4.585610389709473, |
|
"learning_rate": 0.00040909090909090913, |
|
"loss": 1.1272, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 69.77777777777777, |
|
"grad_norm": 10.724448204040527, |
|
"learning_rate": 0.0004053030303030303, |
|
"loss": 1.1339, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 70.0, |
|
"eval_accuracy": 0.2826086956521739, |
|
"eval_loss": 1.3982820510864258, |
|
"eval_runtime": 1.0724, |
|
"eval_samples_per_second": 42.893, |
|
"eval_steps_per_second": 2.797, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 70.22222222222223, |
|
"grad_norm": 5.506129741668701, |
|
"learning_rate": 0.0004015151515151515, |
|
"loss": 1.1491, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 70.66666666666667, |
|
"grad_norm": 17.69223976135254, |
|
"learning_rate": 0.00039772727272727274, |
|
"loss": 1.1344, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 70.97777777777777, |
|
"eval_accuracy": 0.30434782608695654, |
|
"eval_loss": 1.3792437314987183, |
|
"eval_runtime": 0.8913, |
|
"eval_samples_per_second": 51.609, |
|
"eval_steps_per_second": 3.366, |
|
"step": 1597 |
|
}, |
|
{ |
|
"epoch": 71.11111111111111, |
|
"grad_norm": 10.686148643493652, |
|
"learning_rate": 0.0003939393939393939, |
|
"loss": 1.1495, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 71.55555555555556, |
|
"grad_norm": 2.353846549987793, |
|
"learning_rate": 0.0003901515151515151, |
|
"loss": 1.1566, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 72.0, |
|
"grad_norm": 3.250394821166992, |
|
"learning_rate": 0.00038636363636363635, |
|
"loss": 1.1343, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 72.0, |
|
"eval_accuracy": 0.2826086956521739, |
|
"eval_loss": 1.4466689825057983, |
|
"eval_runtime": 0.891, |
|
"eval_samples_per_second": 51.629, |
|
"eval_steps_per_second": 3.367, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 72.44444444444444, |
|
"grad_norm": 6.152634143829346, |
|
"learning_rate": 0.00038257575757575757, |
|
"loss": 1.1417, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 72.88888888888889, |
|
"grad_norm": 93.5090560913086, |
|
"learning_rate": 0.0003787878787878788, |
|
"loss": 1.1555, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 72.97777777777777, |
|
"eval_accuracy": 0.21739130434782608, |
|
"eval_loss": 1.4822701215744019, |
|
"eval_runtime": 1.1509, |
|
"eval_samples_per_second": 39.968, |
|
"eval_steps_per_second": 2.607, |
|
"step": 1642 |
|
}, |
|
{ |
|
"epoch": 73.33333333333333, |
|
"grad_norm": 90.8385238647461, |
|
"learning_rate": 0.000375, |
|
"loss": 1.1227, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 73.77777777777777, |
|
"grad_norm": 3.00873064994812, |
|
"learning_rate": 0.00037121212121212123, |
|
"loss": 1.1329, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 74.0, |
|
"eval_accuracy": 0.15217391304347827, |
|
"eval_loss": 1.5136324167251587, |
|
"eval_runtime": 1.0615, |
|
"eval_samples_per_second": 43.334, |
|
"eval_steps_per_second": 2.826, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 74.22222222222223, |
|
"grad_norm": 9.816000938415527, |
|
"learning_rate": 0.00036742424242424245, |
|
"loss": 1.1719, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 74.66666666666667, |
|
"grad_norm": 2.1428558826446533, |
|
"learning_rate": 0.00036363636363636367, |
|
"loss": 1.1513, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 74.97777777777777, |
|
"eval_accuracy": 0.2391304347826087, |
|
"eval_loss": 1.479099988937378, |
|
"eval_runtime": 0.8852, |
|
"eval_samples_per_second": 51.967, |
|
"eval_steps_per_second": 3.389, |
|
"step": 1687 |
|
}, |
|
{ |
|
"epoch": 75.11111111111111, |
|
"grad_norm": 4.258810997009277, |
|
"learning_rate": 0.0003598484848484849, |
|
"loss": 1.1449, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 75.55555555555556, |
|
"grad_norm": 3.7126126289367676, |
|
"learning_rate": 0.0003560606060606061, |
|
"loss": 1.1289, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 76.0, |
|
"grad_norm": 4.913658142089844, |
|
"learning_rate": 0.0003522727272727273, |
|
"loss": 1.1278, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 76.0, |
|
"eval_accuracy": 0.2608695652173913, |
|
"eval_loss": 1.4527482986450195, |
|
"eval_runtime": 0.8695, |
|
"eval_samples_per_second": 52.906, |
|
"eval_steps_per_second": 3.45, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 76.44444444444444, |
|
"grad_norm": 5.169142723083496, |
|
"learning_rate": 0.0003484848484848485, |
|
"loss": 1.1212, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 76.88888888888889, |
|
"grad_norm": 6.800394058227539, |
|
"learning_rate": 0.0003446969696969697, |
|
"loss": 1.0956, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 76.97777777777777, |
|
"eval_accuracy": 0.2391304347826087, |
|
"eval_loss": 1.4839533567428589, |
|
"eval_runtime": 0.9059, |
|
"eval_samples_per_second": 50.779, |
|
"eval_steps_per_second": 3.312, |
|
"step": 1732 |
|
}, |
|
{ |
|
"epoch": 77.33333333333333, |
|
"grad_norm": 4.102919578552246, |
|
"learning_rate": 0.0003409090909090909, |
|
"loss": 1.1466, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 77.77777777777777, |
|
"grad_norm": 4.560825824737549, |
|
"learning_rate": 0.0003371212121212121, |
|
"loss": 1.1131, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 78.0, |
|
"eval_accuracy": 0.21739130434782608, |
|
"eval_loss": 1.4900346994400024, |
|
"eval_runtime": 1.1932, |
|
"eval_samples_per_second": 38.552, |
|
"eval_steps_per_second": 2.514, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 78.22222222222223, |
|
"grad_norm": 1.923434853553772, |
|
"learning_rate": 0.0003333333333333333, |
|
"loss": 1.1285, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 78.66666666666667, |
|
"grad_norm": 4.646895885467529, |
|
"learning_rate": 0.00032954545454545454, |
|
"loss": 1.1376, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 78.97777777777777, |
|
"eval_accuracy": 0.21739130434782608, |
|
"eval_loss": 1.5395020246505737, |
|
"eval_runtime": 0.8752, |
|
"eval_samples_per_second": 52.557, |
|
"eval_steps_per_second": 3.428, |
|
"step": 1777 |
|
}, |
|
{ |
|
"epoch": 79.11111111111111, |
|
"grad_norm": 8.433492660522461, |
|
"learning_rate": 0.00032575757575757576, |
|
"loss": 1.1072, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 79.55555555555556, |
|
"grad_norm": 5.669383525848389, |
|
"learning_rate": 0.000321969696969697, |
|
"loss": 1.1135, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 80.0, |
|
"grad_norm": 18.24361801147461, |
|
"learning_rate": 0.0003181818181818182, |
|
"loss": 1.0883, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 80.0, |
|
"eval_accuracy": 0.1956521739130435, |
|
"eval_loss": 1.5037870407104492, |
|
"eval_runtime": 0.8647, |
|
"eval_samples_per_second": 53.198, |
|
"eval_steps_per_second": 3.469, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 80.44444444444444, |
|
"grad_norm": 8.68807315826416, |
|
"learning_rate": 0.0003143939393939394, |
|
"loss": 1.0899, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 80.88888888888889, |
|
"grad_norm": 39.473899841308594, |
|
"learning_rate": 0.0003106060606060606, |
|
"loss": 1.1017, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 80.97777777777777, |
|
"eval_accuracy": 0.1956521739130435, |
|
"eval_loss": 1.5392367839813232, |
|
"eval_runtime": 0.8702, |
|
"eval_samples_per_second": 52.864, |
|
"eval_steps_per_second": 3.448, |
|
"step": 1822 |
|
}, |
|
{ |
|
"epoch": 81.33333333333333, |
|
"grad_norm": 3.8075191974639893, |
|
"learning_rate": 0.0003068181818181818, |
|
"loss": 1.0607, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 81.77777777777777, |
|
"grad_norm": 4.356723308563232, |
|
"learning_rate": 0.00030303030303030303, |
|
"loss": 1.1608, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 82.0, |
|
"eval_accuracy": 0.21739130434782608, |
|
"eval_loss": 1.4875361919403076, |
|
"eval_runtime": 1.2071, |
|
"eval_samples_per_second": 38.106, |
|
"eval_steps_per_second": 2.485, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 82.22222222222223, |
|
"grad_norm": 4.517791748046875, |
|
"learning_rate": 0.00029924242424242425, |
|
"loss": 1.1144, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 82.66666666666667, |
|
"grad_norm": 4.995427131652832, |
|
"learning_rate": 0.00029545454545454547, |
|
"loss": 1.1308, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 82.97777777777777, |
|
"eval_accuracy": 0.1956521739130435, |
|
"eval_loss": 1.5079646110534668, |
|
"eval_runtime": 0.8614, |
|
"eval_samples_per_second": 53.403, |
|
"eval_steps_per_second": 3.483, |
|
"step": 1867 |
|
}, |
|
{ |
|
"epoch": 83.11111111111111, |
|
"grad_norm": 4.479354381561279, |
|
"learning_rate": 0.0002916666666666667, |
|
"loss": 1.0821, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 83.55555555555556, |
|
"grad_norm": 4.680623531341553, |
|
"learning_rate": 0.0002878787878787879, |
|
"loss": 1.0904, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 84.0, |
|
"grad_norm": 11.278671264648438, |
|
"learning_rate": 0.00028409090909090913, |
|
"loss": 1.1382, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 84.0, |
|
"eval_accuracy": 0.17391304347826086, |
|
"eval_loss": 1.4835433959960938, |
|
"eval_runtime": 0.8759, |
|
"eval_samples_per_second": 52.52, |
|
"eval_steps_per_second": 3.425, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 84.44444444444444, |
|
"grad_norm": 4.505593776702881, |
|
"learning_rate": 0.0002803030303030303, |
|
"loss": 1.0869, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 84.88888888888889, |
|
"grad_norm": 3.064387083053589, |
|
"learning_rate": 0.0002765151515151515, |
|
"loss": 1.1195, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 84.97777777777777, |
|
"eval_accuracy": 0.1956521739130435, |
|
"eval_loss": 1.4076049327850342, |
|
"eval_runtime": 0.865, |
|
"eval_samples_per_second": 53.179, |
|
"eval_steps_per_second": 3.468, |
|
"step": 1912 |
|
}, |
|
{ |
|
"epoch": 85.33333333333333, |
|
"grad_norm": 4.745396137237549, |
|
"learning_rate": 0.00027272727272727274, |
|
"loss": 1.1153, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 85.77777777777777, |
|
"grad_norm": 14.576072692871094, |
|
"learning_rate": 0.0002689393939393939, |
|
"loss": 1.1149, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 86.0, |
|
"eval_accuracy": 0.17391304347826086, |
|
"eval_loss": 1.4840431213378906, |
|
"eval_runtime": 1.1314, |
|
"eval_samples_per_second": 40.656, |
|
"eval_steps_per_second": 2.651, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 86.22222222222223, |
|
"grad_norm": 4.840237617492676, |
|
"learning_rate": 0.0002651515151515151, |
|
"loss": 1.1314, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 86.66666666666667, |
|
"grad_norm": 7.4581756591796875, |
|
"learning_rate": 0.00026136363636363634, |
|
"loss": 1.1344, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 86.97777777777777, |
|
"eval_accuracy": 0.1956521739130435, |
|
"eval_loss": 1.473250150680542, |
|
"eval_runtime": 1.086, |
|
"eval_samples_per_second": 42.357, |
|
"eval_steps_per_second": 2.762, |
|
"step": 1957 |
|
}, |
|
{ |
|
"epoch": 87.11111111111111, |
|
"grad_norm": 4.788184642791748, |
|
"learning_rate": 0.00025757575757575756, |
|
"loss": 1.0985, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 87.55555555555556, |
|
"grad_norm": 7.187996864318848, |
|
"learning_rate": 0.0002537878787878788, |
|
"loss": 1.1018, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 88.0, |
|
"grad_norm": 6.393486976623535, |
|
"learning_rate": 0.00025, |
|
"loss": 1.1268, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 88.0, |
|
"eval_accuracy": 0.2391304347826087, |
|
"eval_loss": 1.4446380138397217, |
|
"eval_runtime": 0.877, |
|
"eval_samples_per_second": 52.451, |
|
"eval_steps_per_second": 3.421, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 88.44444444444444, |
|
"grad_norm": 19.996049880981445, |
|
"learning_rate": 0.0002462121212121212, |
|
"loss": 1.115, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 88.88888888888889, |
|
"grad_norm": 11.436882019042969, |
|
"learning_rate": 0.00024242424242424245, |
|
"loss": 1.1267, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 88.97777777777777, |
|
"eval_accuracy": 0.21739130434782608, |
|
"eval_loss": 1.4359674453735352, |
|
"eval_runtime": 0.8812, |
|
"eval_samples_per_second": 52.199, |
|
"eval_steps_per_second": 3.404, |
|
"step": 2002 |
|
}, |
|
{ |
|
"epoch": 89.33333333333333, |
|
"grad_norm": 7.150528430938721, |
|
"learning_rate": 0.00023863636363636364, |
|
"loss": 1.1335, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 89.77777777777777, |
|
"grad_norm": 142.4888153076172, |
|
"learning_rate": 0.00023484848484848486, |
|
"loss": 1.1034, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 90.0, |
|
"eval_accuracy": 0.15217391304347827, |
|
"eval_loss": 1.4328769445419312, |
|
"eval_runtime": 0.8938, |
|
"eval_samples_per_second": 51.465, |
|
"eval_steps_per_second": 3.356, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 90.22222222222223, |
|
"grad_norm": 13.855992317199707, |
|
"learning_rate": 0.00023106060606060605, |
|
"loss": 1.0987, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 90.66666666666667, |
|
"grad_norm": 4.523609638214111, |
|
"learning_rate": 0.00022727272727272727, |
|
"loss": 1.1113, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 90.97777777777777, |
|
"eval_accuracy": 0.17391304347826086, |
|
"eval_loss": 1.4670028686523438, |
|
"eval_runtime": 1.1907, |
|
"eval_samples_per_second": 38.631, |
|
"eval_steps_per_second": 2.519, |
|
"step": 2047 |
|
}, |
|
{ |
|
"epoch": 91.11111111111111, |
|
"grad_norm": 8.413890838623047, |
|
"learning_rate": 0.0002234848484848485, |
|
"loss": 1.0848, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 91.55555555555556, |
|
"grad_norm": 2.8552653789520264, |
|
"learning_rate": 0.0002196969696969697, |
|
"loss": 1.0788, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 92.0, |
|
"grad_norm": 22.520193099975586, |
|
"learning_rate": 0.0002159090909090909, |
|
"loss": 1.0957, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 92.0, |
|
"eval_accuracy": 0.2391304347826087, |
|
"eval_loss": 1.4802157878875732, |
|
"eval_runtime": 0.8728, |
|
"eval_samples_per_second": 52.706, |
|
"eval_steps_per_second": 3.437, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 92.44444444444444, |
|
"grad_norm": 17.830493927001953, |
|
"learning_rate": 0.00021212121212121213, |
|
"loss": 1.122, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 92.88888888888889, |
|
"grad_norm": 4.072429656982422, |
|
"learning_rate": 0.00020833333333333335, |
|
"loss": 1.1227, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 92.97777777777777, |
|
"eval_accuracy": 0.17391304347826086, |
|
"eval_loss": 1.4715131521224976, |
|
"eval_runtime": 0.8786, |
|
"eval_samples_per_second": 52.357, |
|
"eval_steps_per_second": 3.415, |
|
"step": 2092 |
|
}, |
|
{ |
|
"epoch": 93.33333333333333, |
|
"grad_norm": 4.416510581970215, |
|
"learning_rate": 0.00020454545454545457, |
|
"loss": 1.0755, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 93.77777777777777, |
|
"grad_norm": 19.229928970336914, |
|
"learning_rate": 0.00020075757575757576, |
|
"loss": 1.1083, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 94.0, |
|
"eval_accuracy": 0.1956521739130435, |
|
"eval_loss": 1.4812626838684082, |
|
"eval_runtime": 0.8656, |
|
"eval_samples_per_second": 53.14, |
|
"eval_steps_per_second": 3.466, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 94.22222222222223, |
|
"grad_norm": 4.718553066253662, |
|
"learning_rate": 0.00019696969696969695, |
|
"loss": 1.0757, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 94.66666666666667, |
|
"grad_norm": 5.6447601318359375, |
|
"learning_rate": 0.00019318181818181817, |
|
"loss": 1.0583, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 94.97777777777777, |
|
"eval_accuracy": 0.1956521739130435, |
|
"eval_loss": 1.520257830619812, |
|
"eval_runtime": 1.1597, |
|
"eval_samples_per_second": 39.665, |
|
"eval_steps_per_second": 2.587, |
|
"step": 2137 |
|
}, |
|
{ |
|
"epoch": 95.11111111111111, |
|
"grad_norm": 5.209888935089111, |
|
"learning_rate": 0.0001893939393939394, |
|
"loss": 1.098, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 95.55555555555556, |
|
"grad_norm": 7.493114471435547, |
|
"learning_rate": 0.00018560606060606061, |
|
"loss": 1.0796, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 96.0, |
|
"grad_norm": 11.232746124267578, |
|
"learning_rate": 0.00018181818181818183, |
|
"loss": 1.093, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 96.0, |
|
"eval_accuracy": 0.17391304347826086, |
|
"eval_loss": 1.5394465923309326, |
|
"eval_runtime": 0.8817, |
|
"eval_samples_per_second": 52.171, |
|
"eval_steps_per_second": 3.402, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 96.44444444444444, |
|
"grad_norm": 9.968954086303711, |
|
"learning_rate": 0.00017803030303030305, |
|
"loss": 1.0958, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 96.88888888888889, |
|
"grad_norm": 15.155268669128418, |
|
"learning_rate": 0.00017424242424242425, |
|
"loss": 1.0809, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 96.97777777777777, |
|
"eval_accuracy": 0.17391304347826086, |
|
"eval_loss": 1.4620193243026733, |
|
"eval_runtime": 0.8656, |
|
"eval_samples_per_second": 53.142, |
|
"eval_steps_per_second": 3.466, |
|
"step": 2182 |
|
}, |
|
{ |
|
"epoch": 97.33333333333333, |
|
"grad_norm": 10.245019912719727, |
|
"learning_rate": 0.00017045454545454544, |
|
"loss": 1.0822, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 97.77777777777777, |
|
"grad_norm": 56.624778747558594, |
|
"learning_rate": 0.00016666666666666666, |
|
"loss": 1.0888, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 98.0, |
|
"eval_accuracy": 0.17391304347826086, |
|
"eval_loss": 1.4407347440719604, |
|
"eval_runtime": 0.8737, |
|
"eval_samples_per_second": 52.651, |
|
"eval_steps_per_second": 3.434, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 98.22222222222223, |
|
"grad_norm": 16.125377655029297, |
|
"learning_rate": 0.00016287878787878788, |
|
"loss": 1.0803, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 98.66666666666667, |
|
"grad_norm": 4.7502546310424805, |
|
"learning_rate": 0.0001590909090909091, |
|
"loss": 1.1292, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 98.97777777777777, |
|
"eval_accuracy": 0.1956521739130435, |
|
"eval_loss": 1.4577943086624146, |
|
"eval_runtime": 1.2602, |
|
"eval_samples_per_second": 36.503, |
|
"eval_steps_per_second": 2.381, |
|
"step": 2227 |
|
}, |
|
{ |
|
"epoch": 99.11111111111111, |
|
"grad_norm": 12.011200904846191, |
|
"learning_rate": 0.0001553030303030303, |
|
"loss": 1.076, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 99.55555555555556, |
|
"grad_norm": 10.061297416687012, |
|
"learning_rate": 0.00015151515151515152, |
|
"loss": 1.0789, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 100.0, |
|
"grad_norm": 7.683816432952881, |
|
"learning_rate": 0.00014772727272727274, |
|
"loss": 1.0754, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 100.0, |
|
"eval_accuracy": 0.17391304347826086, |
|
"eval_loss": 1.5031030178070068, |
|
"eval_runtime": 0.8791, |
|
"eval_samples_per_second": 52.328, |
|
"eval_steps_per_second": 3.413, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 100.44444444444444, |
|
"grad_norm": 10.789087295532227, |
|
"learning_rate": 0.00014393939393939396, |
|
"loss": 1.0662, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 100.88888888888889, |
|
"grad_norm": 14.93548583984375, |
|
"learning_rate": 0.00014015151515151515, |
|
"loss": 1.0817, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 100.97777777777777, |
|
"eval_accuracy": 0.21739130434782608, |
|
"eval_loss": 1.4460813999176025, |
|
"eval_runtime": 0.8764, |
|
"eval_samples_per_second": 52.489, |
|
"eval_steps_per_second": 3.423, |
|
"step": 2272 |
|
}, |
|
{ |
|
"epoch": 101.33333333333333, |
|
"grad_norm": 7.292104244232178, |
|
"learning_rate": 0.00013636363636363637, |
|
"loss": 1.1087, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 101.77777777777777, |
|
"grad_norm": 21.396413803100586, |
|
"learning_rate": 0.00013257575757575756, |
|
"loss": 1.0671, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 102.0, |
|
"eval_accuracy": 0.2391304347826087, |
|
"eval_loss": 1.4722661972045898, |
|
"eval_runtime": 0.8883, |
|
"eval_samples_per_second": 51.786, |
|
"eval_steps_per_second": 3.377, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 102.22222222222223, |
|
"grad_norm": 7.56445837020874, |
|
"learning_rate": 0.00012878787878787878, |
|
"loss": 1.0837, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 102.66666666666667, |
|
"grad_norm": 12.246611595153809, |
|
"learning_rate": 0.000125, |
|
"loss": 1.0815, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 102.97777777777777, |
|
"eval_accuracy": 0.1956521739130435, |
|
"eval_loss": 1.4988662004470825, |
|
"eval_runtime": 1.1771, |
|
"eval_samples_per_second": 39.079, |
|
"eval_steps_per_second": 2.549, |
|
"step": 2317 |
|
}, |
|
{ |
|
"epoch": 103.11111111111111, |
|
"grad_norm": 10.69598388671875, |
|
"learning_rate": 0.00012121212121212122, |
|
"loss": 1.0852, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 103.55555555555556, |
|
"grad_norm": 9.647980690002441, |
|
"learning_rate": 0.00011742424242424243, |
|
"loss": 1.076, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 104.0, |
|
"grad_norm": 38.92795944213867, |
|
"learning_rate": 0.00011363636363636364, |
|
"loss": 1.0967, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 104.0, |
|
"eval_accuracy": 0.21739130434782608, |
|
"eval_loss": 1.465432047843933, |
|
"eval_runtime": 0.8692, |
|
"eval_samples_per_second": 52.922, |
|
"eval_steps_per_second": 3.451, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 104.44444444444444, |
|
"grad_norm": 9.53869915008545, |
|
"learning_rate": 0.00010984848484848486, |
|
"loss": 1.0838, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 104.88888888888889, |
|
"grad_norm": 5.269750118255615, |
|
"learning_rate": 0.00010606060606060606, |
|
"loss": 1.091, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 104.97777777777777, |
|
"eval_accuracy": 0.21739130434782608, |
|
"eval_loss": 1.4559190273284912, |
|
"eval_runtime": 0.8756, |
|
"eval_samples_per_second": 52.535, |
|
"eval_steps_per_second": 3.426, |
|
"step": 2362 |
|
}, |
|
{ |
|
"epoch": 105.33333333333333, |
|
"grad_norm": 12.788183212280273, |
|
"learning_rate": 0.00010227272727272728, |
|
"loss": 1.1085, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 105.77777777777777, |
|
"grad_norm": 7.0092058181762695, |
|
"learning_rate": 9.848484848484848e-05, |
|
"loss": 1.0895, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 106.0, |
|
"eval_accuracy": 0.2826086956521739, |
|
"eval_loss": 1.4221450090408325, |
|
"eval_runtime": 1.1631, |
|
"eval_samples_per_second": 39.549, |
|
"eval_steps_per_second": 2.579, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 106.22222222222223, |
|
"grad_norm": 12.933279037475586, |
|
"learning_rate": 9.46969696969697e-05, |
|
"loss": 1.1548, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 106.66666666666667, |
|
"grad_norm": 4.65310001373291, |
|
"learning_rate": 9.090909090909092e-05, |
|
"loss": 1.0847, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 106.97777777777777, |
|
"eval_accuracy": 0.2826086956521739, |
|
"eval_loss": 1.4292521476745605, |
|
"eval_runtime": 0.8992, |
|
"eval_samples_per_second": 51.156, |
|
"eval_steps_per_second": 3.336, |
|
"step": 2407 |
|
}, |
|
{ |
|
"epoch": 107.11111111111111, |
|
"grad_norm": 7.354497909545898, |
|
"learning_rate": 8.712121212121212e-05, |
|
"loss": 1.0996, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 107.55555555555556, |
|
"grad_norm": 5.702237129211426, |
|
"learning_rate": 8.333333333333333e-05, |
|
"loss": 1.0883, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 108.0, |
|
"grad_norm": 4.873330116271973, |
|
"learning_rate": 7.954545454545455e-05, |
|
"loss": 1.102, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 108.0, |
|
"eval_accuracy": 0.2391304347826087, |
|
"eval_loss": 1.4582384824752808, |
|
"eval_runtime": 0.9053, |
|
"eval_samples_per_second": 50.81, |
|
"eval_steps_per_second": 3.314, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 108.44444444444444, |
|
"grad_norm": 78.408447265625, |
|
"learning_rate": 7.575757575757576e-05, |
|
"loss": 1.1048, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 108.88888888888889, |
|
"grad_norm": 6.651626110076904, |
|
"learning_rate": 7.196969696969698e-05, |
|
"loss": 1.0404, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 108.97777777777777, |
|
"eval_accuracy": 0.21739130434782608, |
|
"eval_loss": 1.4655812978744507, |
|
"eval_runtime": 0.8722, |
|
"eval_samples_per_second": 52.741, |
|
"eval_steps_per_second": 3.44, |
|
"step": 2452 |
|
}, |
|
{ |
|
"epoch": 109.33333333333333, |
|
"grad_norm": 10.666740417480469, |
|
"learning_rate": 6.818181818181818e-05, |
|
"loss": 1.0799, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 109.77777777777777, |
|
"grad_norm": 6.093382358551025, |
|
"learning_rate": 6.439393939393939e-05, |
|
"loss": 1.0488, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 110.0, |
|
"eval_accuracy": 0.21739130434782608, |
|
"eval_loss": 1.489004373550415, |
|
"eval_runtime": 1.0926, |
|
"eval_samples_per_second": 42.101, |
|
"eval_steps_per_second": 2.746, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 110.22222222222223, |
|
"grad_norm": 5.012928009033203, |
|
"learning_rate": 6.060606060606061e-05, |
|
"loss": 1.1006, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 110.66666666666667, |
|
"grad_norm": 8.174079895019531, |
|
"learning_rate": 5.681818181818182e-05, |
|
"loss": 1.0966, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 110.97777777777777, |
|
"eval_accuracy": 0.21739130434782608, |
|
"eval_loss": 1.4631787538528442, |
|
"eval_runtime": 0.8917, |
|
"eval_samples_per_second": 51.586, |
|
"eval_steps_per_second": 3.364, |
|
"step": 2497 |
|
}, |
|
{ |
|
"epoch": 111.11111111111111, |
|
"grad_norm": 4.650646686553955, |
|
"learning_rate": 5.303030303030303e-05, |
|
"loss": 1.0629, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 111.55555555555556, |
|
"grad_norm": 5.132219314575195, |
|
"learning_rate": 4.924242424242424e-05, |
|
"loss": 1.0835, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 112.0, |
|
"grad_norm": 8.792770385742188, |
|
"learning_rate": 4.545454545454546e-05, |
|
"loss": 1.0901, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 112.0, |
|
"eval_accuracy": 0.21739130434782608, |
|
"eval_loss": 1.4494850635528564, |
|
"eval_runtime": 0.872, |
|
"eval_samples_per_second": 52.75, |
|
"eval_steps_per_second": 3.44, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 112.44444444444444, |
|
"grad_norm": 6.029526710510254, |
|
"learning_rate": 4.1666666666666665e-05, |
|
"loss": 1.0764, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 112.88888888888889, |
|
"grad_norm": 5.1245927810668945, |
|
"learning_rate": 3.787878787878788e-05, |
|
"loss": 1.1008, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 112.97777777777777, |
|
"eval_accuracy": 0.21739130434782608, |
|
"eval_loss": 1.4332908391952515, |
|
"eval_runtime": 0.9004, |
|
"eval_samples_per_second": 51.09, |
|
"eval_steps_per_second": 3.332, |
|
"step": 2542 |
|
}, |
|
{ |
|
"epoch": 113.33333333333333, |
|
"grad_norm": 11.329251289367676, |
|
"learning_rate": 3.409090909090909e-05, |
|
"loss": 1.0763, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 113.77777777777777, |
|
"grad_norm": 3.865112066268921, |
|
"learning_rate": 3.0303030303030306e-05, |
|
"loss": 1.0884, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 114.0, |
|
"eval_accuracy": 0.21739130434782608, |
|
"eval_loss": 1.4406064748764038, |
|
"eval_runtime": 0.9189, |
|
"eval_samples_per_second": 50.057, |
|
"eval_steps_per_second": 3.265, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 114.22222222222223, |
|
"grad_norm": 7.575568675994873, |
|
"learning_rate": 2.6515151515151516e-05, |
|
"loss": 1.088, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 114.66666666666667, |
|
"grad_norm": 7.7806620597839355, |
|
"learning_rate": 2.272727272727273e-05, |
|
"loss": 1.0889, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 114.97777777777777, |
|
"eval_accuracy": 0.21739130434782608, |
|
"eval_loss": 1.447421908378601, |
|
"eval_runtime": 0.9635, |
|
"eval_samples_per_second": 47.742, |
|
"eval_steps_per_second": 3.114, |
|
"step": 2587 |
|
}, |
|
{ |
|
"epoch": 115.11111111111111, |
|
"grad_norm": 20.770198822021484, |
|
"learning_rate": 1.893939393939394e-05, |
|
"loss": 1.0754, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 115.55555555555556, |
|
"grad_norm": 5.939628601074219, |
|
"learning_rate": 1.5151515151515153e-05, |
|
"loss": 1.07, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 116.0, |
|
"grad_norm": 35.29814147949219, |
|
"learning_rate": 1.1363636363636365e-05, |
|
"loss": 1.0729, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 116.0, |
|
"eval_accuracy": 0.21739130434782608, |
|
"eval_loss": 1.4561296701431274, |
|
"eval_runtime": 0.899, |
|
"eval_samples_per_second": 51.169, |
|
"eval_steps_per_second": 3.337, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 116.44444444444444, |
|
"grad_norm": 24.017641067504883, |
|
"learning_rate": 7.5757575757575764e-06, |
|
"loss": 1.0589, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 116.88888888888889, |
|
"grad_norm": 14.974956512451172, |
|
"learning_rate": 3.7878787878787882e-06, |
|
"loss": 1.0671, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 116.97777777777777, |
|
"eval_accuracy": 0.21739130434782608, |
|
"eval_loss": 1.4538123607635498, |
|
"eval_runtime": 0.8903, |
|
"eval_samples_per_second": 51.667, |
|
"eval_steps_per_second": 3.37, |
|
"step": 2632 |
|
}, |
|
{ |
|
"epoch": 117.33333333333333, |
|
"grad_norm": 8.030067443847656, |
|
"learning_rate": 0.0, |
|
"loss": 1.0937, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 117.33333333333333, |
|
"eval_accuracy": 0.21739130434782608, |
|
"eval_loss": 1.453188180923462, |
|
"eval_runtime": 1.73, |
|
"eval_samples_per_second": 26.589, |
|
"eval_steps_per_second": 1.734, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 117.33333333333333, |
|
"step": 2640, |
|
"total_flos": 5.466852859010089e+18, |
|
"train_loss": 1.1631801536588957, |
|
"train_runtime": 5282.7516, |
|
"train_samples_per_second": 32.529, |
|
"train_steps_per_second": 0.5 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 2640, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 120, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.466852859010089e+18, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|