diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,9745 @@ +{ + "best_metric": 0.2633333333333333, + "best_model_checkpoint": "swinv2-small-patch4-window16-256-mineral\\checkpoint-6693", + "epoch": 480.0, + "global_step": 9000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.53, + "learning_rate": 5.555555555555556e-07, + "loss": 5.6941, + "step": 10 + }, + { + "epoch": 0.96, + "eval_accuracy": 0.005, + "eval_loss": 5.692106246948242, + "eval_runtime": 9.9033, + "eval_samples_per_second": 60.586, + "eval_steps_per_second": 1.919, + "step": 18 + }, + { + "epoch": 1.07, + "learning_rate": 1.1111111111111112e-06, + "loss": 5.6939, + "step": 20 + }, + { + "epoch": 1.6, + "learning_rate": 1.6666666666666667e-06, + "loss": 5.6886, + "step": 30 + }, + { + "epoch": 1.97, + "eval_accuracy": 0.005, + "eval_loss": 5.682541370391846, + "eval_runtime": 4.1429, + "eval_samples_per_second": 144.826, + "eval_steps_per_second": 4.586, + "step": 37 + }, + { + "epoch": 2.13, + "learning_rate": 2.2222222222222225e-06, + "loss": 5.6844, + "step": 40 + }, + { + "epoch": 2.67, + "learning_rate": 2.777777777777778e-06, + "loss": 5.6735, + "step": 50 + }, + { + "epoch": 2.99, + "eval_accuracy": 0.005, + "eval_loss": 5.669071674346924, + "eval_runtime": 4.142, + "eval_samples_per_second": 144.858, + "eval_steps_per_second": 4.587, + "step": 56 + }, + { + "epoch": 3.2, + "learning_rate": 3.3333333333333333e-06, + "loss": 5.6534, + "step": 60 + }, + { + "epoch": 3.73, + "learning_rate": 3.888888888888889e-06, + "loss": 5.6521, + "step": 70 + }, + { + "epoch": 4.0, + "eval_accuracy": 0.0033333333333333335, + "eval_loss": 5.654940605163574, + "eval_runtime": 4.1572, + "eval_samples_per_second": 144.326, + "eval_steps_per_second": 4.57, + "step": 75 + }, + { + "epoch": 4.27, + "learning_rate": 4.444444444444445e-06, + "loss": 5.6431, + "step": 80 + }, + { + "epoch": 4.8, + "learning_rate": 5e-06, + "loss": 5.6394, + "step": 90 + }, + { + "epoch": 4.96, + "eval_accuracy": 0.0033333333333333335, + "eval_loss": 5.641611099243164, + "eval_runtime": 4.1535, + "eval_samples_per_second": 144.457, + "eval_steps_per_second": 4.574, + "step": 93 + }, + { + "epoch": 5.33, + "learning_rate": 5.555555555555556e-06, + "loss": 5.5941, + "step": 100 + }, + { + "epoch": 5.87, + "learning_rate": 6.111111111111111e-06, + "loss": 5.6078, + "step": 110 + }, + { + "epoch": 5.97, + "eval_accuracy": 0.0033333333333333335, + "eval_loss": 5.627758979797363, + "eval_runtime": 4.1893, + "eval_samples_per_second": 143.22, + "eval_steps_per_second": 4.535, + "step": 112 + }, + { + "epoch": 6.4, + "learning_rate": 6.666666666666667e-06, + "loss": 5.5762, + "step": 120 + }, + { + "epoch": 6.93, + "learning_rate": 7.222222222222222e-06, + "loss": 5.5743, + "step": 130 + }, + { + "epoch": 6.99, + "eval_accuracy": 0.0016666666666666668, + "eval_loss": 5.612813472747803, + "eval_runtime": 4.224, + "eval_samples_per_second": 142.046, + "eval_steps_per_second": 4.498, + "step": 131 + }, + { + "epoch": 7.47, + "learning_rate": 7.777777777777777e-06, + "loss": 5.5413, + "step": 140 + }, + { + "epoch": 8.0, + "learning_rate": 8.333333333333334e-06, + "loss": 5.5509, + "step": 150 + }, + { + "epoch": 8.0, + "eval_accuracy": 0.0016666666666666668, + "eval_loss": 5.591813087463379, + "eval_runtime": 4.1747, + "eval_samples_per_second": 143.723, + "eval_steps_per_second": 4.551, + "step": 150 + }, + { + "epoch": 8.53, + "learning_rate": 8.88888888888889e-06, + "loss": 5.5115, + "step": 160 + }, + { + "epoch": 8.96, + "eval_accuracy": 0.006666666666666667, + "eval_loss": 5.569559097290039, + "eval_runtime": 4.1739, + "eval_samples_per_second": 143.75, + "eval_steps_per_second": 4.552, + "step": 168 + }, + { + "epoch": 9.07, + "learning_rate": 9.444444444444445e-06, + "loss": 5.4912, + "step": 170 + }, + { + "epoch": 9.6, + "learning_rate": 1e-05, + "loss": 5.4411, + "step": 180 + }, + { + "epoch": 9.97, + "eval_accuracy": 0.01, + "eval_loss": 5.5439839363098145, + "eval_runtime": 4.1644, + "eval_samples_per_second": 144.078, + "eval_steps_per_second": 4.562, + "step": 187 + }, + { + "epoch": 10.13, + "learning_rate": 1.0555555555555555e-05, + "loss": 5.3942, + "step": 190 + }, + { + "epoch": 10.67, + "learning_rate": 1.1111111111111112e-05, + "loss": 5.3335, + "step": 200 + }, + { + "epoch": 10.99, + "eval_accuracy": 0.016666666666666666, + "eval_loss": 5.513484001159668, + "eval_runtime": 4.1932, + "eval_samples_per_second": 143.088, + "eval_steps_per_second": 4.531, + "step": 206 + }, + { + "epoch": 11.2, + "learning_rate": 1.1666666666666668e-05, + "loss": 5.2998, + "step": 210 + }, + { + "epoch": 11.73, + "learning_rate": 1.2222222222222222e-05, + "loss": 5.2413, + "step": 220 + }, + { + "epoch": 12.0, + "eval_accuracy": 0.021666666666666667, + "eval_loss": 5.464037895202637, + "eval_runtime": 4.2106, + "eval_samples_per_second": 142.496, + "eval_steps_per_second": 4.512, + "step": 225 + }, + { + "epoch": 12.27, + "learning_rate": 1.2777777777777777e-05, + "loss": 5.2175, + "step": 230 + }, + { + "epoch": 12.8, + "learning_rate": 1.3333333333333333e-05, + "loss": 5.1738, + "step": 240 + }, + { + "epoch": 12.96, + "eval_accuracy": 0.03333333333333333, + "eval_loss": 5.408351421356201, + "eval_runtime": 4.1793, + "eval_samples_per_second": 143.566, + "eval_steps_per_second": 4.546, + "step": 243 + }, + { + "epoch": 13.33, + "learning_rate": 1.388888888888889e-05, + "loss": 5.0966, + "step": 250 + }, + { + "epoch": 13.87, + "learning_rate": 1.4444444444444444e-05, + "loss": 5.0222, + "step": 260 + }, + { + "epoch": 13.97, + "eval_accuracy": 0.045, + "eval_loss": 5.3320746421813965, + "eval_runtime": 4.1797, + "eval_samples_per_second": 143.55, + "eval_steps_per_second": 4.546, + "step": 262 + }, + { + "epoch": 14.4, + "learning_rate": 1.5e-05, + "loss": 4.913, + "step": 270 + }, + { + "epoch": 14.93, + "learning_rate": 1.5555555555555555e-05, + "loss": 4.8594, + "step": 280 + }, + { + "epoch": 14.99, + "eval_accuracy": 0.05333333333333334, + "eval_loss": 5.248490333557129, + "eval_runtime": 4.1704, + "eval_samples_per_second": 143.871, + "eval_steps_per_second": 4.556, + "step": 281 + }, + { + "epoch": 15.47, + "learning_rate": 1.6111111111111115e-05, + "loss": 4.7591, + "step": 290 + }, + { + "epoch": 16.0, + "learning_rate": 1.6666666666666667e-05, + "loss": 4.7441, + "step": 300 + }, + { + "epoch": 16.0, + "eval_accuracy": 0.065, + "eval_loss": 5.150908946990967, + "eval_runtime": 4.1485, + "eval_samples_per_second": 144.629, + "eval_steps_per_second": 4.58, + "step": 300 + }, + { + "epoch": 16.53, + "learning_rate": 1.7222222222222224e-05, + "loss": 4.5946, + "step": 310 + }, + { + "epoch": 16.96, + "eval_accuracy": 0.07166666666666667, + "eval_loss": 5.070082187652588, + "eval_runtime": 4.1543, + "eval_samples_per_second": 144.43, + "eval_steps_per_second": 4.574, + "step": 318 + }, + { + "epoch": 17.07, + "learning_rate": 1.777777777777778e-05, + "loss": 4.4995, + "step": 320 + }, + { + "epoch": 17.6, + "learning_rate": 1.8333333333333333e-05, + "loss": 4.3382, + "step": 330 + }, + { + "epoch": 17.97, + "eval_accuracy": 0.08666666666666667, + "eval_loss": 4.976734161376953, + "eval_runtime": 4.158, + "eval_samples_per_second": 144.301, + "eval_steps_per_second": 4.57, + "step": 337 + }, + { + "epoch": 18.13, + "learning_rate": 1.888888888888889e-05, + "loss": 4.3477, + "step": 340 + }, + { + "epoch": 18.67, + "learning_rate": 1.9444444444444445e-05, + "loss": 4.2008, + "step": 350 + }, + { + "epoch": 18.99, + "eval_accuracy": 0.105, + "eval_loss": 4.862234115600586, + "eval_runtime": 4.1637, + "eval_samples_per_second": 144.103, + "eval_steps_per_second": 4.563, + "step": 356 + }, + { + "epoch": 19.2, + "learning_rate": 2e-05, + "loss": 4.0491, + "step": 360 + }, + { + "epoch": 19.73, + "learning_rate": 2.0555555555555555e-05, + "loss": 4.0563, + "step": 370 + }, + { + "epoch": 20.0, + "eval_accuracy": 0.10333333333333333, + "eval_loss": 4.772627830505371, + "eval_runtime": 4.1762, + "eval_samples_per_second": 143.672, + "eval_steps_per_second": 4.55, + "step": 375 + }, + { + "epoch": 20.27, + "learning_rate": 2.111111111111111e-05, + "loss": 3.8101, + "step": 380 + }, + { + "epoch": 20.8, + "learning_rate": 2.1666666666666667e-05, + "loss": 3.8064, + "step": 390 + }, + { + "epoch": 20.96, + "eval_accuracy": 0.115, + "eval_loss": 4.689815044403076, + "eval_runtime": 4.151, + "eval_samples_per_second": 144.542, + "eval_steps_per_second": 4.577, + "step": 393 + }, + { + "epoch": 21.33, + "learning_rate": 2.2222222222222223e-05, + "loss": 3.6083, + "step": 400 + }, + { + "epoch": 21.87, + "learning_rate": 2.277777777777778e-05, + "loss": 3.5584, + "step": 410 + }, + { + "epoch": 21.97, + "eval_accuracy": 0.125, + "eval_loss": 4.599685192108154, + "eval_runtime": 4.1649, + "eval_samples_per_second": 144.062, + "eval_steps_per_second": 4.562, + "step": 412 + }, + { + "epoch": 22.4, + "learning_rate": 2.3333333333333336e-05, + "loss": 3.4029, + "step": 420 + }, + { + "epoch": 22.93, + "learning_rate": 2.3888888888888892e-05, + "loss": 3.3377, + "step": 430 + }, + { + "epoch": 22.99, + "eval_accuracy": 0.13666666666666666, + "eval_loss": 4.484786510467529, + "eval_runtime": 4.165, + "eval_samples_per_second": 144.057, + "eval_steps_per_second": 4.562, + "step": 431 + }, + { + "epoch": 23.47, + "learning_rate": 2.4444444444444445e-05, + "loss": 3.1633, + "step": 440 + }, + { + "epoch": 24.0, + "learning_rate": 2.5e-05, + "loss": 3.1119, + "step": 450 + }, + { + "epoch": 24.0, + "eval_accuracy": 0.15333333333333332, + "eval_loss": 4.405167102813721, + "eval_runtime": 4.1901, + "eval_samples_per_second": 143.196, + "eval_steps_per_second": 4.535, + "step": 450 + }, + { + "epoch": 24.53, + "learning_rate": 2.5555555555555554e-05, + "loss": 2.8686, + "step": 460 + }, + { + "epoch": 24.96, + "eval_accuracy": 0.15, + "eval_loss": 4.37052583694458, + "eval_runtime": 4.1543, + "eval_samples_per_second": 144.427, + "eval_steps_per_second": 4.574, + "step": 468 + }, + { + "epoch": 25.07, + "learning_rate": 2.6111111111111114e-05, + "loss": 3.0138, + "step": 470 + }, + { + "epoch": 25.6, + "learning_rate": 2.6666666666666667e-05, + "loss": 2.7649, + "step": 480 + }, + { + "epoch": 25.97, + "eval_accuracy": 0.165, + "eval_loss": 4.297973155975342, + "eval_runtime": 4.1559, + "eval_samples_per_second": 144.373, + "eval_steps_per_second": 4.572, + "step": 487 + }, + { + "epoch": 26.13, + "learning_rate": 2.7222222222222223e-05, + "loss": 2.6887, + "step": 490 + }, + { + "epoch": 26.67, + "learning_rate": 2.777777777777778e-05, + "loss": 2.5698, + "step": 500 + }, + { + "epoch": 26.99, + "eval_accuracy": 0.17666666666666667, + "eval_loss": 4.236337184906006, + "eval_runtime": 4.1581, + "eval_samples_per_second": 144.296, + "eval_steps_per_second": 4.569, + "step": 506 + }, + { + "epoch": 27.2, + "learning_rate": 2.8333333333333335e-05, + "loss": 2.5301, + "step": 510 + }, + { + "epoch": 27.73, + "learning_rate": 2.8888888888888888e-05, + "loss": 2.4344, + "step": 520 + }, + { + "epoch": 28.0, + "eval_accuracy": 0.17666666666666667, + "eval_loss": 4.17328405380249, + "eval_runtime": 4.1589, + "eval_samples_per_second": 144.269, + "eval_steps_per_second": 4.569, + "step": 525 + }, + { + "epoch": 28.27, + "learning_rate": 2.9444444444444448e-05, + "loss": 2.2843, + "step": 530 + }, + { + "epoch": 28.8, + "learning_rate": 3e-05, + "loss": 2.2186, + "step": 540 + }, + { + "epoch": 28.96, + "eval_accuracy": 0.17333333333333334, + "eval_loss": 4.178333282470703, + "eval_runtime": 4.1547, + "eval_samples_per_second": 144.413, + "eval_steps_per_second": 4.573, + "step": 543 + }, + { + "epoch": 29.33, + "learning_rate": 3.055555555555556e-05, + "loss": 2.1025, + "step": 550 + }, + { + "epoch": 29.87, + "learning_rate": 3.111111111111111e-05, + "loss": 2.0227, + "step": 560 + }, + { + "epoch": 29.97, + "eval_accuracy": 0.18, + "eval_loss": 4.13058614730835, + "eval_runtime": 4.1618, + "eval_samples_per_second": 144.168, + "eval_steps_per_second": 4.565, + "step": 562 + }, + { + "epoch": 30.4, + "learning_rate": 3.1666666666666666e-05, + "loss": 1.8851, + "step": 570 + }, + { + "epoch": 30.93, + "learning_rate": 3.222222222222223e-05, + "loss": 1.9153, + "step": 580 + }, + { + "epoch": 30.99, + "eval_accuracy": 0.175, + "eval_loss": 4.094816207885742, + "eval_runtime": 4.1706, + "eval_samples_per_second": 143.864, + "eval_steps_per_second": 4.556, + "step": 581 + }, + { + "epoch": 31.47, + "learning_rate": 3.277777777777778e-05, + "loss": 1.7959, + "step": 590 + }, + { + "epoch": 32.0, + "learning_rate": 3.3333333333333335e-05, + "loss": 1.7363, + "step": 600 + }, + { + "epoch": 32.0, + "eval_accuracy": 0.17833333333333334, + "eval_loss": 4.061172008514404, + "eval_runtime": 4.1527, + "eval_samples_per_second": 144.483, + "eval_steps_per_second": 4.575, + "step": 600 + }, + { + "epoch": 32.53, + "learning_rate": 3.388888888888889e-05, + "loss": 1.6171, + "step": 610 + }, + { + "epoch": 32.96, + "eval_accuracy": 0.185, + "eval_loss": 4.020925521850586, + "eval_runtime": 4.1794, + "eval_samples_per_second": 143.562, + "eval_steps_per_second": 4.546, + "step": 618 + }, + { + "epoch": 33.07, + "learning_rate": 3.444444444444445e-05, + "loss": 1.5284, + "step": 620 + }, + { + "epoch": 33.6, + "learning_rate": 3.5e-05, + "loss": 1.4865, + "step": 630 + }, + { + "epoch": 33.97, + "eval_accuracy": 0.185, + "eval_loss": 4.019384860992432, + "eval_runtime": 4.1555, + "eval_samples_per_second": 144.386, + "eval_steps_per_second": 4.572, + "step": 637 + }, + { + "epoch": 34.13, + "learning_rate": 3.555555555555556e-05, + "loss": 1.4216, + "step": 640 + }, + { + "epoch": 34.67, + "learning_rate": 3.611111111111111e-05, + "loss": 1.3194, + "step": 650 + }, + { + "epoch": 34.99, + "eval_accuracy": 0.205, + "eval_loss": 3.988067626953125, + "eval_runtime": 4.1907, + "eval_samples_per_second": 143.174, + "eval_steps_per_second": 4.534, + "step": 656 + }, + { + "epoch": 35.2, + "learning_rate": 3.6666666666666666e-05, + "loss": 1.3247, + "step": 660 + }, + { + "epoch": 35.73, + "learning_rate": 3.722222222222222e-05, + "loss": 1.2811, + "step": 670 + }, + { + "epoch": 36.0, + "eval_accuracy": 0.215, + "eval_loss": 3.9861950874328613, + "eval_runtime": 4.2139, + "eval_samples_per_second": 142.387, + "eval_steps_per_second": 4.509, + "step": 675 + }, + { + "epoch": 36.27, + "learning_rate": 3.777777777777778e-05, + "loss": 1.1968, + "step": 680 + }, + { + "epoch": 36.8, + "learning_rate": 3.8333333333333334e-05, + "loss": 1.1703, + "step": 690 + }, + { + "epoch": 36.96, + "eval_accuracy": 0.20333333333333334, + "eval_loss": 3.9904768466949463, + "eval_runtime": 4.165, + "eval_samples_per_second": 144.057, + "eval_steps_per_second": 4.562, + "step": 693 + }, + { + "epoch": 37.33, + "learning_rate": 3.888888888888889e-05, + "loss": 1.1136, + "step": 700 + }, + { + "epoch": 37.87, + "learning_rate": 3.944444444444445e-05, + "loss": 1.114, + "step": 710 + }, + { + "epoch": 37.97, + "eval_accuracy": 0.21333333333333335, + "eval_loss": 3.951385021209717, + "eval_runtime": 4.1675, + "eval_samples_per_second": 143.97, + "eval_steps_per_second": 4.559, + "step": 712 + }, + { + "epoch": 38.4, + "learning_rate": 4e-05, + "loss": 1.0194, + "step": 720 + }, + { + "epoch": 38.93, + "learning_rate": 4.055555555555556e-05, + "loss": 0.9645, + "step": 730 + }, + { + "epoch": 38.99, + "eval_accuracy": 0.20666666666666667, + "eval_loss": 3.9677815437316895, + "eval_runtime": 4.1678, + "eval_samples_per_second": 143.96, + "eval_steps_per_second": 4.559, + "step": 731 + }, + { + "epoch": 39.47, + "learning_rate": 4.111111111111111e-05, + "loss": 0.9037, + "step": 740 + }, + { + "epoch": 40.0, + "learning_rate": 4.166666666666667e-05, + "loss": 0.8976, + "step": 750 + }, + { + "epoch": 40.0, + "eval_accuracy": 0.21666666666666667, + "eval_loss": 3.987384080886841, + "eval_runtime": 4.1794, + "eval_samples_per_second": 143.56, + "eval_steps_per_second": 4.546, + "step": 750 + }, + { + "epoch": 40.53, + "learning_rate": 4.222222222222222e-05, + "loss": 0.8147, + "step": 760 + }, + { + "epoch": 40.96, + "eval_accuracy": 0.20833333333333334, + "eval_loss": 3.925668954849243, + "eval_runtime": 4.2286, + "eval_samples_per_second": 141.892, + "eval_steps_per_second": 4.493, + "step": 768 + }, + { + "epoch": 41.07, + "learning_rate": 4.277777777777778e-05, + "loss": 0.8449, + "step": 770 + }, + { + "epoch": 41.6, + "learning_rate": 4.3333333333333334e-05, + "loss": 0.7239, + "step": 780 + }, + { + "epoch": 41.97, + "eval_accuracy": 0.22166666666666668, + "eval_loss": 3.9394490718841553, + "eval_runtime": 4.2239, + "eval_samples_per_second": 142.05, + "eval_steps_per_second": 4.498, + "step": 787 + }, + { + "epoch": 42.13, + "learning_rate": 4.388888888888889e-05, + "loss": 0.7847, + "step": 790 + }, + { + "epoch": 42.67, + "learning_rate": 4.4444444444444447e-05, + "loss": 0.7732, + "step": 800 + }, + { + "epoch": 42.99, + "eval_accuracy": 0.215, + "eval_loss": 3.9472830295562744, + "eval_runtime": 4.1769, + "eval_samples_per_second": 143.646, + "eval_steps_per_second": 4.549, + "step": 806 + }, + { + "epoch": 43.2, + "learning_rate": 4.5e-05, + "loss": 0.7159, + "step": 810 + }, + { + "epoch": 43.73, + "learning_rate": 4.555555555555556e-05, + "loss": 0.7009, + "step": 820 + }, + { + "epoch": 44.0, + "eval_accuracy": 0.215, + "eval_loss": 3.946096897125244, + "eval_runtime": 4.1968, + "eval_samples_per_second": 142.965, + "eval_steps_per_second": 4.527, + "step": 825 + }, + { + "epoch": 44.27, + "learning_rate": 4.6111111111111115e-05, + "loss": 0.663, + "step": 830 + }, + { + "epoch": 44.8, + "learning_rate": 4.666666666666667e-05, + "loss": 0.5945, + "step": 840 + }, + { + "epoch": 44.96, + "eval_accuracy": 0.21333333333333335, + "eval_loss": 4.0206685066223145, + "eval_runtime": 4.207, + "eval_samples_per_second": 142.62, + "eval_steps_per_second": 4.516, + "step": 843 + }, + { + "epoch": 45.33, + "learning_rate": 4.722222222222222e-05, + "loss": 0.5903, + "step": 850 + }, + { + "epoch": 45.87, + "learning_rate": 4.7777777777777784e-05, + "loss": 0.555, + "step": 860 + }, + { + "epoch": 45.97, + "eval_accuracy": 0.20833333333333334, + "eval_loss": 4.035262107849121, + "eval_runtime": 4.1806, + "eval_samples_per_second": 143.522, + "eval_steps_per_second": 4.545, + "step": 862 + }, + { + "epoch": 46.4, + "learning_rate": 4.8333333333333334e-05, + "loss": 0.5768, + "step": 870 + }, + { + "epoch": 46.93, + "learning_rate": 4.888888888888889e-05, + "loss": 0.5241, + "step": 880 + }, + { + "epoch": 46.99, + "eval_accuracy": 0.21666666666666667, + "eval_loss": 4.023153305053711, + "eval_runtime": 4.1743, + "eval_samples_per_second": 143.736, + "eval_steps_per_second": 4.552, + "step": 881 + }, + { + "epoch": 47.47, + "learning_rate": 4.9444444444444446e-05, + "loss": 0.5324, + "step": 890 + }, + { + "epoch": 48.0, + "learning_rate": 5e-05, + "loss": 0.4789, + "step": 900 + }, + { + "epoch": 48.0, + "eval_accuracy": 0.22, + "eval_loss": 4.002644062042236, + "eval_runtime": 4.1605, + "eval_samples_per_second": 144.215, + "eval_steps_per_second": 4.567, + "step": 900 + }, + { + "epoch": 48.53, + "learning_rate": 4.9938271604938276e-05, + "loss": 0.4284, + "step": 910 + }, + { + "epoch": 48.96, + "eval_accuracy": 0.22, + "eval_loss": 4.0031256675720215, + "eval_runtime": 4.1788, + "eval_samples_per_second": 143.582, + "eval_steps_per_second": 4.547, + "step": 918 + }, + { + "epoch": 49.07, + "learning_rate": 4.987654320987655e-05, + "loss": 0.5321, + "step": 920 + }, + { + "epoch": 49.6, + "learning_rate": 4.981481481481482e-05, + "loss": 0.4701, + "step": 930 + }, + { + "epoch": 49.97, + "eval_accuracy": 0.215, + "eval_loss": 4.057220935821533, + "eval_runtime": 4.1731, + "eval_samples_per_second": 143.779, + "eval_steps_per_second": 4.553, + "step": 937 + }, + { + "epoch": 50.13, + "learning_rate": 4.9753086419753084e-05, + "loss": 0.491, + "step": 940 + }, + { + "epoch": 50.67, + "learning_rate": 4.969135802469136e-05, + "loss": 0.4501, + "step": 950 + }, + { + "epoch": 50.99, + "eval_accuracy": 0.215, + "eval_loss": 4.087738513946533, + "eval_runtime": 4.1876, + "eval_samples_per_second": 143.279, + "eval_steps_per_second": 4.537, + "step": 956 + }, + { + "epoch": 51.2, + "learning_rate": 4.962962962962963e-05, + "loss": 0.4453, + "step": 960 + }, + { + "epoch": 51.73, + "learning_rate": 4.9567901234567905e-05, + "loss": 0.3966, + "step": 970 + }, + { + "epoch": 52.0, + "eval_accuracy": 0.21666666666666667, + "eval_loss": 4.020733833312988, + "eval_runtime": 4.1529, + "eval_samples_per_second": 144.478, + "eval_steps_per_second": 4.575, + "step": 975 + }, + { + "epoch": 52.27, + "learning_rate": 4.950617283950618e-05, + "loss": 0.4295, + "step": 980 + }, + { + "epoch": 52.8, + "learning_rate": 4.9444444444444446e-05, + "loss": 0.3564, + "step": 990 + }, + { + "epoch": 52.96, + "eval_accuracy": 0.215, + "eval_loss": 4.082664966583252, + "eval_runtime": 4.1603, + "eval_samples_per_second": 144.219, + "eval_steps_per_second": 4.567, + "step": 993 + }, + { + "epoch": 53.33, + "learning_rate": 4.938271604938271e-05, + "loss": 0.4013, + "step": 1000 + }, + { + "epoch": 53.87, + "learning_rate": 4.932098765432099e-05, + "loss": 0.3472, + "step": 1010 + }, + { + "epoch": 53.97, + "eval_accuracy": 0.235, + "eval_loss": 4.090172290802002, + "eval_runtime": 4.1532, + "eval_samples_per_second": 144.465, + "eval_steps_per_second": 4.575, + "step": 1012 + }, + { + "epoch": 54.4, + "learning_rate": 4.925925925925926e-05, + "loss": 0.383, + "step": 1020 + }, + { + "epoch": 54.93, + "learning_rate": 4.9197530864197535e-05, + "loss": 0.3731, + "step": 1030 + }, + { + "epoch": 54.99, + "eval_accuracy": 0.24166666666666667, + "eval_loss": 4.095271587371826, + "eval_runtime": 4.2172, + "eval_samples_per_second": 142.273, + "eval_steps_per_second": 4.505, + "step": 1031 + }, + { + "epoch": 55.47, + "learning_rate": 4.913580246913581e-05, + "loss": 0.3237, + "step": 1040 + }, + { + "epoch": 56.0, + "learning_rate": 4.9074074074074075e-05, + "loss": 0.3161, + "step": 1050 + }, + { + "epoch": 56.0, + "eval_accuracy": 0.20333333333333334, + "eval_loss": 4.165963172912598, + "eval_runtime": 4.2197, + "eval_samples_per_second": 142.19, + "eval_steps_per_second": 4.503, + "step": 1050 + }, + { + "epoch": 56.53, + "learning_rate": 4.901234567901235e-05, + "loss": 0.3352, + "step": 1060 + }, + { + "epoch": 56.96, + "eval_accuracy": 0.22166666666666668, + "eval_loss": 4.115335464477539, + "eval_runtime": 4.1813, + "eval_samples_per_second": 143.495, + "eval_steps_per_second": 4.544, + "step": 1068 + }, + { + "epoch": 57.07, + "learning_rate": 4.8950617283950616e-05, + "loss": 0.3455, + "step": 1070 + }, + { + "epoch": 57.6, + "learning_rate": 4.888888888888889e-05, + "loss": 0.3317, + "step": 1080 + }, + { + "epoch": 57.97, + "eval_accuracy": 0.21666666666666667, + "eval_loss": 4.1095662117004395, + "eval_runtime": 4.2026, + "eval_samples_per_second": 142.768, + "eval_steps_per_second": 4.521, + "step": 1087 + }, + { + "epoch": 58.13, + "learning_rate": 4.8827160493827164e-05, + "loss": 0.3594, + "step": 1090 + }, + { + "epoch": 58.67, + "learning_rate": 4.876543209876544e-05, + "loss": 0.294, + "step": 1100 + }, + { + "epoch": 58.99, + "eval_accuracy": 0.215, + "eval_loss": 4.185626029968262, + "eval_runtime": 4.176, + "eval_samples_per_second": 143.679, + "eval_steps_per_second": 4.55, + "step": 1106 + }, + { + "epoch": 59.2, + "learning_rate": 4.8703703703703704e-05, + "loss": 0.3059, + "step": 1110 + }, + { + "epoch": 59.73, + "learning_rate": 4.864197530864198e-05, + "loss": 0.3299, + "step": 1120 + }, + { + "epoch": 60.0, + "eval_accuracy": 0.22333333333333333, + "eval_loss": 4.1475830078125, + "eval_runtime": 4.1797, + "eval_samples_per_second": 143.552, + "eval_steps_per_second": 4.546, + "step": 1125 + }, + { + "epoch": 60.27, + "learning_rate": 4.858024691358025e-05, + "loss": 0.2965, + "step": 1130 + }, + { + "epoch": 60.8, + "learning_rate": 4.851851851851852e-05, + "loss": 0.2847, + "step": 1140 + }, + { + "epoch": 60.96, + "eval_accuracy": 0.225, + "eval_loss": 4.204588890075684, + "eval_runtime": 4.1808, + "eval_samples_per_second": 143.515, + "eval_steps_per_second": 4.545, + "step": 1143 + }, + { + "epoch": 61.33, + "learning_rate": 4.845679012345679e-05, + "loss": 0.3176, + "step": 1150 + }, + { + "epoch": 61.87, + "learning_rate": 4.8395061728395067e-05, + "loss": 0.2924, + "step": 1160 + }, + { + "epoch": 61.97, + "eval_accuracy": 0.21833333333333332, + "eval_loss": 4.156820297241211, + "eval_runtime": 4.1645, + "eval_samples_per_second": 144.075, + "eval_steps_per_second": 4.562, + "step": 1162 + }, + { + "epoch": 62.4, + "learning_rate": 4.8333333333333334e-05, + "loss": 0.255, + "step": 1170 + }, + { + "epoch": 62.93, + "learning_rate": 4.827160493827161e-05, + "loss": 0.2818, + "step": 1180 + }, + { + "epoch": 62.99, + "eval_accuracy": 0.23333333333333334, + "eval_loss": 4.151925563812256, + "eval_runtime": 4.175, + "eval_samples_per_second": 143.711, + "eval_steps_per_second": 4.551, + "step": 1181 + }, + { + "epoch": 63.47, + "learning_rate": 4.820987654320988e-05, + "loss": 0.2618, + "step": 1190 + }, + { + "epoch": 64.0, + "learning_rate": 4.814814814814815e-05, + "loss": 0.2698, + "step": 1200 + }, + { + "epoch": 64.0, + "eval_accuracy": 0.215, + "eval_loss": 4.227489471435547, + "eval_runtime": 4.1793, + "eval_samples_per_second": 143.565, + "eval_steps_per_second": 4.546, + "step": 1200 + }, + { + "epoch": 64.53, + "learning_rate": 4.808641975308642e-05, + "loss": 0.2579, + "step": 1210 + }, + { + "epoch": 64.96, + "eval_accuracy": 0.235, + "eval_loss": 4.162615776062012, + "eval_runtime": 4.1756, + "eval_samples_per_second": 143.693, + "eval_steps_per_second": 4.55, + "step": 1218 + }, + { + "epoch": 65.07, + "learning_rate": 4.8024691358024696e-05, + "loss": 0.2555, + "step": 1220 + }, + { + "epoch": 65.6, + "learning_rate": 4.796296296296296e-05, + "loss": 0.2597, + "step": 1230 + }, + { + "epoch": 65.97, + "eval_accuracy": 0.22166666666666668, + "eval_loss": 4.227728843688965, + "eval_runtime": 4.1826, + "eval_samples_per_second": 143.452, + "eval_steps_per_second": 4.543, + "step": 1237 + }, + { + "epoch": 66.13, + "learning_rate": 4.7901234567901237e-05, + "loss": 0.1989, + "step": 1240 + }, + { + "epoch": 66.67, + "learning_rate": 4.783950617283951e-05, + "loss": 0.2443, + "step": 1250 + }, + { + "epoch": 66.99, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.192920207977295, + "eval_runtime": 4.1844, + "eval_samples_per_second": 143.39, + "eval_steps_per_second": 4.541, + "step": 1256 + }, + { + "epoch": 67.2, + "learning_rate": 4.7777777777777784e-05, + "loss": 0.2686, + "step": 1260 + }, + { + "epoch": 67.73, + "learning_rate": 4.771604938271605e-05, + "loss": 0.2532, + "step": 1270 + }, + { + "epoch": 68.0, + "eval_accuracy": 0.22333333333333333, + "eval_loss": 4.277872085571289, + "eval_runtime": 4.1836, + "eval_samples_per_second": 143.417, + "eval_steps_per_second": 4.542, + "step": 1275 + }, + { + "epoch": 68.27, + "learning_rate": 4.7654320987654325e-05, + "loss": 0.2647, + "step": 1280 + }, + { + "epoch": 68.8, + "learning_rate": 4.759259259259259e-05, + "loss": 0.2305, + "step": 1290 + }, + { + "epoch": 68.96, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.244070053100586, + "eval_runtime": 4.1785, + "eval_samples_per_second": 143.591, + "eval_steps_per_second": 4.547, + "step": 1293 + }, + { + "epoch": 69.33, + "learning_rate": 4.7530864197530866e-05, + "loss": 0.2347, + "step": 1300 + }, + { + "epoch": 69.87, + "learning_rate": 4.746913580246914e-05, + "loss": 0.2423, + "step": 1310 + }, + { + "epoch": 69.97, + "eval_accuracy": 0.22166666666666668, + "eval_loss": 4.25825309753418, + "eval_runtime": 4.1881, + "eval_samples_per_second": 143.264, + "eval_steps_per_second": 4.537, + "step": 1312 + }, + { + "epoch": 70.4, + "learning_rate": 4.740740740740741e-05, + "loss": 0.2167, + "step": 1320 + }, + { + "epoch": 70.93, + "learning_rate": 4.734567901234569e-05, + "loss": 0.222, + "step": 1330 + }, + { + "epoch": 70.99, + "eval_accuracy": 0.23, + "eval_loss": 4.293475151062012, + "eval_runtime": 4.178, + "eval_samples_per_second": 143.61, + "eval_steps_per_second": 4.548, + "step": 1331 + }, + { + "epoch": 71.47, + "learning_rate": 4.7283950617283954e-05, + "loss": 0.2781, + "step": 1340 + }, + { + "epoch": 72.0, + "learning_rate": 4.722222222222222e-05, + "loss": 0.2096, + "step": 1350 + }, + { + "epoch": 72.0, + "eval_accuracy": 0.23, + "eval_loss": 4.271422863006592, + "eval_runtime": 4.1808, + "eval_samples_per_second": 143.513, + "eval_steps_per_second": 4.545, + "step": 1350 + }, + { + "epoch": 72.53, + "learning_rate": 4.7160493827160495e-05, + "loss": 0.1776, + "step": 1360 + }, + { + "epoch": 72.96, + "eval_accuracy": 0.225, + "eval_loss": 4.234805583953857, + "eval_runtime": 4.1972, + "eval_samples_per_second": 142.953, + "eval_steps_per_second": 4.527, + "step": 1368 + }, + { + "epoch": 73.07, + "learning_rate": 4.709876543209877e-05, + "loss": 0.1977, + "step": 1370 + }, + { + "epoch": 73.6, + "learning_rate": 4.703703703703704e-05, + "loss": 0.2009, + "step": 1380 + }, + { + "epoch": 73.97, + "eval_accuracy": 0.22833333333333333, + "eval_loss": 4.292957305908203, + "eval_runtime": 4.1773, + "eval_samples_per_second": 143.633, + "eval_steps_per_second": 4.548, + "step": 1387 + }, + { + "epoch": 74.13, + "learning_rate": 4.6975308641975316e-05, + "loss": 0.2292, + "step": 1390 + }, + { + "epoch": 74.67, + "learning_rate": 4.691358024691358e-05, + "loss": 0.2087, + "step": 1400 + }, + { + "epoch": 74.99, + "eval_accuracy": 0.235, + "eval_loss": 4.307061672210693, + "eval_runtime": 4.182, + "eval_samples_per_second": 143.472, + "eval_steps_per_second": 4.543, + "step": 1406 + }, + { + "epoch": 75.2, + "learning_rate": 4.685185185185185e-05, + "loss": 0.2235, + "step": 1410 + }, + { + "epoch": 75.73, + "learning_rate": 4.6790123456790124e-05, + "loss": 0.1818, + "step": 1420 + }, + { + "epoch": 76.0, + "eval_accuracy": 0.235, + "eval_loss": 4.296043872833252, + "eval_runtime": 4.1959, + "eval_samples_per_second": 142.996, + "eval_steps_per_second": 4.528, + "step": 1425 + }, + { + "epoch": 76.27, + "learning_rate": 4.67283950617284e-05, + "loss": 0.2019, + "step": 1430 + }, + { + "epoch": 76.8, + "learning_rate": 4.666666666666667e-05, + "loss": 0.2236, + "step": 1440 + }, + { + "epoch": 76.96, + "eval_accuracy": 0.24, + "eval_loss": 4.290974140167236, + "eval_runtime": 4.1784, + "eval_samples_per_second": 143.596, + "eval_steps_per_second": 4.547, + "step": 1443 + }, + { + "epoch": 77.33, + "learning_rate": 4.6604938271604945e-05, + "loss": 0.2008, + "step": 1450 + }, + { + "epoch": 77.87, + "learning_rate": 4.654320987654321e-05, + "loss": 0.1802, + "step": 1460 + }, + { + "epoch": 77.97, + "eval_accuracy": 0.25, + "eval_loss": 4.289625644683838, + "eval_runtime": 4.224, + "eval_samples_per_second": 142.044, + "eval_steps_per_second": 4.498, + "step": 1462 + }, + { + "epoch": 78.4, + "learning_rate": 4.648148148148148e-05, + "loss": 0.2296, + "step": 1470 + }, + { + "epoch": 78.93, + "learning_rate": 4.641975308641975e-05, + "loss": 0.2037, + "step": 1480 + }, + { + "epoch": 78.99, + "eval_accuracy": 0.245, + "eval_loss": 4.331364154815674, + "eval_runtime": 4.2445, + "eval_samples_per_second": 141.359, + "eval_steps_per_second": 4.476, + "step": 1481 + }, + { + "epoch": 79.47, + "learning_rate": 4.635802469135803e-05, + "loss": 0.1697, + "step": 1490 + }, + { + "epoch": 80.0, + "learning_rate": 4.62962962962963e-05, + "loss": 0.1912, + "step": 1500 + }, + { + "epoch": 80.0, + "eval_accuracy": 0.23333333333333334, + "eval_loss": 4.261221408843994, + "eval_runtime": 4.1986, + "eval_samples_per_second": 142.905, + "eval_steps_per_second": 4.525, + "step": 1500 + }, + { + "epoch": 80.53, + "learning_rate": 4.623456790123457e-05, + "loss": 0.2305, + "step": 1510 + }, + { + "epoch": 80.96, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.279019832611084, + "eval_runtime": 4.1994, + "eval_samples_per_second": 142.879, + "eval_steps_per_second": 4.524, + "step": 1518 + }, + { + "epoch": 81.07, + "learning_rate": 4.617283950617284e-05, + "loss": 0.1808, + "step": 1520 + }, + { + "epoch": 81.6, + "learning_rate": 4.6111111111111115e-05, + "loss": 0.2188, + "step": 1530 + }, + { + "epoch": 81.97, + "eval_accuracy": 0.22166666666666668, + "eval_loss": 4.306877613067627, + "eval_runtime": 4.1818, + "eval_samples_per_second": 143.477, + "eval_steps_per_second": 4.543, + "step": 1537 + }, + { + "epoch": 82.13, + "learning_rate": 4.604938271604938e-05, + "loss": 0.2251, + "step": 1540 + }, + { + "epoch": 82.67, + "learning_rate": 4.5987654320987656e-05, + "loss": 0.1639, + "step": 1550 + }, + { + "epoch": 82.99, + "eval_accuracy": 0.21833333333333332, + "eval_loss": 4.353877067565918, + "eval_runtime": 4.2313, + "eval_samples_per_second": 141.801, + "eval_steps_per_second": 4.49, + "step": 1556 + }, + { + "epoch": 83.2, + "learning_rate": 4.592592592592593e-05, + "loss": 0.1407, + "step": 1560 + }, + { + "epoch": 83.73, + "learning_rate": 4.58641975308642e-05, + "loss": 0.1741, + "step": 1570 + }, + { + "epoch": 84.0, + "eval_accuracy": 0.225, + "eval_loss": 4.321075916290283, + "eval_runtime": 4.2392, + "eval_samples_per_second": 141.536, + "eval_steps_per_second": 4.482, + "step": 1575 + }, + { + "epoch": 84.27, + "learning_rate": 4.580246913580247e-05, + "loss": 0.1926, + "step": 1580 + }, + { + "epoch": 84.8, + "learning_rate": 4.5740740740740745e-05, + "loss": 0.1937, + "step": 1590 + }, + { + "epoch": 84.96, + "eval_accuracy": 0.21166666666666667, + "eval_loss": 4.357635498046875, + "eval_runtime": 4.1967, + "eval_samples_per_second": 142.97, + "eval_steps_per_second": 4.527, + "step": 1593 + }, + { + "epoch": 85.33, + "learning_rate": 4.567901234567901e-05, + "loss": 0.1932, + "step": 1600 + }, + { + "epoch": 85.87, + "learning_rate": 4.5617283950617285e-05, + "loss": 0.1712, + "step": 1610 + }, + { + "epoch": 85.97, + "eval_accuracy": 0.22333333333333333, + "eval_loss": 4.3434247970581055, + "eval_runtime": 4.2166, + "eval_samples_per_second": 142.294, + "eval_steps_per_second": 4.506, + "step": 1612 + }, + { + "epoch": 86.4, + "learning_rate": 4.555555555555556e-05, + "loss": 0.1574, + "step": 1620 + }, + { + "epoch": 86.93, + "learning_rate": 4.5493827160493826e-05, + "loss": 0.1665, + "step": 1630 + }, + { + "epoch": 86.99, + "eval_accuracy": 0.21166666666666667, + "eval_loss": 4.334897041320801, + "eval_runtime": 4.1946, + "eval_samples_per_second": 143.042, + "eval_steps_per_second": 4.53, + "step": 1631 + }, + { + "epoch": 87.47, + "learning_rate": 4.54320987654321e-05, + "loss": 0.1565, + "step": 1640 + }, + { + "epoch": 88.0, + "learning_rate": 4.5370370370370374e-05, + "loss": 0.1846, + "step": 1650 + }, + { + "epoch": 88.0, + "eval_accuracy": 0.235, + "eval_loss": 4.417025566101074, + "eval_runtime": 4.202, + "eval_samples_per_second": 142.79, + "eval_steps_per_second": 4.522, + "step": 1650 + }, + { + "epoch": 88.53, + "learning_rate": 4.530864197530865e-05, + "loss": 0.1827, + "step": 1660 + }, + { + "epoch": 88.96, + "eval_accuracy": 0.23, + "eval_loss": 4.335045337677002, + "eval_runtime": 4.1833, + "eval_samples_per_second": 143.428, + "eval_steps_per_second": 4.542, + "step": 1668 + }, + { + "epoch": 89.07, + "learning_rate": 4.5246913580246914e-05, + "loss": 0.1504, + "step": 1670 + }, + { + "epoch": 89.6, + "learning_rate": 4.518518518518519e-05, + "loss": 0.1591, + "step": 1680 + }, + { + "epoch": 89.97, + "eval_accuracy": 0.215, + "eval_loss": 4.339655876159668, + "eval_runtime": 4.1906, + "eval_samples_per_second": 143.178, + "eval_steps_per_second": 4.534, + "step": 1687 + }, + { + "epoch": 90.13, + "learning_rate": 4.5123456790123455e-05, + "loss": 0.1328, + "step": 1690 + }, + { + "epoch": 90.67, + "learning_rate": 4.506172839506173e-05, + "loss": 0.1508, + "step": 1700 + }, + { + "epoch": 90.99, + "eval_accuracy": 0.23166666666666666, + "eval_loss": 4.3272809982299805, + "eval_runtime": 4.1836, + "eval_samples_per_second": 143.418, + "eval_steps_per_second": 4.542, + "step": 1706 + }, + { + "epoch": 91.2, + "learning_rate": 4.5e-05, + "loss": 0.1493, + "step": 1710 + }, + { + "epoch": 91.73, + "learning_rate": 4.493827160493828e-05, + "loss": 0.1808, + "step": 1720 + }, + { + "epoch": 92.0, + "eval_accuracy": 0.23166666666666666, + "eval_loss": 4.3314714431762695, + "eval_runtime": 4.1951, + "eval_samples_per_second": 143.024, + "eval_steps_per_second": 4.529, + "step": 1725 + }, + { + "epoch": 92.27, + "learning_rate": 4.4876543209876544e-05, + "loss": 0.1551, + "step": 1730 + }, + { + "epoch": 92.8, + "learning_rate": 4.481481481481482e-05, + "loss": 0.17, + "step": 1740 + }, + { + "epoch": 92.96, + "eval_accuracy": 0.24, + "eval_loss": 4.275998592376709, + "eval_runtime": 4.1877, + "eval_samples_per_second": 143.278, + "eval_steps_per_second": 4.537, + "step": 1743 + }, + { + "epoch": 93.33, + "learning_rate": 4.4753086419753084e-05, + "loss": 0.1676, + "step": 1750 + }, + { + "epoch": 93.87, + "learning_rate": 4.469135802469136e-05, + "loss": 0.14, + "step": 1760 + }, + { + "epoch": 93.97, + "eval_accuracy": 0.23333333333333334, + "eval_loss": 4.314431190490723, + "eval_runtime": 4.222, + "eval_samples_per_second": 142.112, + "eval_steps_per_second": 4.5, + "step": 1762 + }, + { + "epoch": 94.4, + "learning_rate": 4.462962962962963e-05, + "loss": 0.1526, + "step": 1770 + }, + { + "epoch": 94.93, + "learning_rate": 4.4567901234567906e-05, + "loss": 0.1734, + "step": 1780 + }, + { + "epoch": 94.99, + "eval_accuracy": 0.22833333333333333, + "eval_loss": 4.366744041442871, + "eval_runtime": 4.2453, + "eval_samples_per_second": 141.334, + "eval_steps_per_second": 4.476, + "step": 1781 + }, + { + "epoch": 95.47, + "learning_rate": 4.450617283950618e-05, + "loss": 0.1472, + "step": 1790 + }, + { + "epoch": 96.0, + "learning_rate": 4.4444444444444447e-05, + "loss": 0.1593, + "step": 1800 + }, + { + "epoch": 96.0, + "eval_accuracy": 0.225, + "eval_loss": 4.390308380126953, + "eval_runtime": 4.2118, + "eval_samples_per_second": 142.456, + "eval_steps_per_second": 4.511, + "step": 1800 + }, + { + "epoch": 96.53, + "learning_rate": 4.4382716049382714e-05, + "loss": 0.1523, + "step": 1810 + }, + { + "epoch": 96.96, + "eval_accuracy": 0.24, + "eval_loss": 4.331397533416748, + "eval_runtime": 4.1925, + "eval_samples_per_second": 143.113, + "eval_steps_per_second": 4.532, + "step": 1818 + }, + { + "epoch": 97.07, + "learning_rate": 4.432098765432099e-05, + "loss": 0.1665, + "step": 1820 + }, + { + "epoch": 97.6, + "learning_rate": 4.425925925925926e-05, + "loss": 0.1599, + "step": 1830 + }, + { + "epoch": 97.97, + "eval_accuracy": 0.23, + "eval_loss": 4.411539077758789, + "eval_runtime": 4.1869, + "eval_samples_per_second": 143.306, + "eval_steps_per_second": 4.538, + "step": 1837 + }, + { + "epoch": 98.13, + "learning_rate": 4.4197530864197535e-05, + "loss": 0.1319, + "step": 1840 + }, + { + "epoch": 98.67, + "learning_rate": 4.413580246913581e-05, + "loss": 0.1352, + "step": 1850 + }, + { + "epoch": 98.99, + "eval_accuracy": 0.24666666666666667, + "eval_loss": 4.362613201141357, + "eval_runtime": 4.181, + "eval_samples_per_second": 143.506, + "eval_steps_per_second": 4.544, + "step": 1856 + }, + { + "epoch": 99.2, + "learning_rate": 4.4074074074074076e-05, + "loss": 0.1625, + "step": 1860 + }, + { + "epoch": 99.73, + "learning_rate": 4.401234567901234e-05, + "loss": 0.1406, + "step": 1870 + }, + { + "epoch": 100.0, + "eval_accuracy": 0.23833333333333334, + "eval_loss": 4.355536937713623, + "eval_runtime": 4.2021, + "eval_samples_per_second": 142.787, + "eval_steps_per_second": 4.522, + "step": 1875 + }, + { + "epoch": 100.27, + "learning_rate": 4.3950617283950617e-05, + "loss": 0.1404, + "step": 1880 + }, + { + "epoch": 100.8, + "learning_rate": 4.388888888888889e-05, + "loss": 0.1486, + "step": 1890 + }, + { + "epoch": 100.96, + "eval_accuracy": 0.23833333333333334, + "eval_loss": 4.311557769775391, + "eval_runtime": 4.1983, + "eval_samples_per_second": 142.914, + "eval_steps_per_second": 4.526, + "step": 1893 + }, + { + "epoch": 101.33, + "learning_rate": 4.3827160493827164e-05, + "loss": 0.1458, + "step": 1900 + }, + { + "epoch": 101.87, + "learning_rate": 4.376543209876544e-05, + "loss": 0.149, + "step": 1910 + }, + { + "epoch": 101.97, + "eval_accuracy": 0.23, + "eval_loss": 4.389366149902344, + "eval_runtime": 4.2353, + "eval_samples_per_second": 141.666, + "eval_steps_per_second": 4.486, + "step": 1912 + }, + { + "epoch": 102.4, + "learning_rate": 4.3703703703703705e-05, + "loss": 0.1317, + "step": 1920 + }, + { + "epoch": 102.93, + "learning_rate": 4.364197530864197e-05, + "loss": 0.115, + "step": 1930 + }, + { + "epoch": 102.99, + "eval_accuracy": 0.22333333333333333, + "eval_loss": 4.3754754066467285, + "eval_runtime": 4.2516, + "eval_samples_per_second": 141.122, + "eval_steps_per_second": 4.469, + "step": 1931 + }, + { + "epoch": 103.47, + "learning_rate": 4.3580246913580246e-05, + "loss": 0.1476, + "step": 1940 + }, + { + "epoch": 104.0, + "learning_rate": 4.351851851851852e-05, + "loss": 0.1301, + "step": 1950 + }, + { + "epoch": 104.0, + "eval_accuracy": 0.23166666666666666, + "eval_loss": 4.376509189605713, + "eval_runtime": 4.2304, + "eval_samples_per_second": 141.83, + "eval_steps_per_second": 4.491, + "step": 1950 + }, + { + "epoch": 104.53, + "learning_rate": 4.345679012345679e-05, + "loss": 0.1429, + "step": 1960 + }, + { + "epoch": 104.96, + "eval_accuracy": 0.235, + "eval_loss": 4.402740955352783, + "eval_runtime": 4.1986, + "eval_samples_per_second": 142.905, + "eval_steps_per_second": 4.525, + "step": 1968 + }, + { + "epoch": 105.07, + "learning_rate": 4.339506172839507e-05, + "loss": 0.123, + "step": 1970 + }, + { + "epoch": 105.6, + "learning_rate": 4.3333333333333334e-05, + "loss": 0.1209, + "step": 1980 + }, + { + "epoch": 105.97, + "eval_accuracy": 0.23166666666666666, + "eval_loss": 4.380291938781738, + "eval_runtime": 4.2686, + "eval_samples_per_second": 140.56, + "eval_steps_per_second": 4.451, + "step": 1987 + }, + { + "epoch": 106.13, + "learning_rate": 4.327160493827161e-05, + "loss": 0.1189, + "step": 1990 + }, + { + "epoch": 106.67, + "learning_rate": 4.3209876543209875e-05, + "loss": 0.1287, + "step": 2000 + }, + { + "epoch": 106.99, + "eval_accuracy": 0.24666666666666667, + "eval_loss": 4.323451995849609, + "eval_runtime": 5.6461, + "eval_samples_per_second": 106.268, + "eval_steps_per_second": 3.365, + "step": 2006 + }, + { + "epoch": 107.2, + "learning_rate": 4.314814814814815e-05, + "loss": 0.1318, + "step": 2010 + }, + { + "epoch": 107.73, + "learning_rate": 4.308641975308642e-05, + "loss": 0.1318, + "step": 2020 + }, + { + "epoch": 108.0, + "eval_accuracy": 0.24, + "eval_loss": 4.348374843597412, + "eval_runtime": 4.2065, + "eval_samples_per_second": 142.635, + "eval_steps_per_second": 4.517, + "step": 2025 + }, + { + "epoch": 108.27, + "learning_rate": 4.3024691358024696e-05, + "loss": 0.1328, + "step": 2030 + }, + { + "epoch": 108.8, + "learning_rate": 4.296296296296296e-05, + "loss": 0.1136, + "step": 2040 + }, + { + "epoch": 108.96, + "eval_accuracy": 0.225, + "eval_loss": 4.39766263961792, + "eval_runtime": 4.2019, + "eval_samples_per_second": 142.792, + "eval_steps_per_second": 4.522, + "step": 2043 + }, + { + "epoch": 109.33, + "learning_rate": 4.290123456790124e-05, + "loss": 0.1218, + "step": 2050 + }, + { + "epoch": 109.87, + "learning_rate": 4.283950617283951e-05, + "loss": 0.1326, + "step": 2060 + }, + { + "epoch": 109.97, + "eval_accuracy": 0.22666666666666666, + "eval_loss": 4.397821426391602, + "eval_runtime": 4.1836, + "eval_samples_per_second": 143.416, + "eval_steps_per_second": 4.542, + "step": 2062 + }, + { + "epoch": 110.4, + "learning_rate": 4.277777777777778e-05, + "loss": 0.1223, + "step": 2070 + }, + { + "epoch": 110.93, + "learning_rate": 4.271604938271605e-05, + "loss": 0.1415, + "step": 2080 + }, + { + "epoch": 110.99, + "eval_accuracy": 0.23833333333333334, + "eval_loss": 4.321408748626709, + "eval_runtime": 4.1873, + "eval_samples_per_second": 143.29, + "eval_steps_per_second": 4.538, + "step": 2081 + }, + { + "epoch": 111.47, + "learning_rate": 4.2654320987654325e-05, + "loss": 0.1281, + "step": 2090 + }, + { + "epoch": 112.0, + "learning_rate": 4.259259259259259e-05, + "loss": 0.1229, + "step": 2100 + }, + { + "epoch": 112.0, + "eval_accuracy": 0.24666666666666667, + "eval_loss": 4.369870662689209, + "eval_runtime": 4.2352, + "eval_samples_per_second": 141.67, + "eval_steps_per_second": 4.486, + "step": 2100 + }, + { + "epoch": 112.53, + "learning_rate": 4.2530864197530866e-05, + "loss": 0.1004, + "step": 2110 + }, + { + "epoch": 112.96, + "eval_accuracy": 0.25833333333333336, + "eval_loss": 4.382783889770508, + "eval_runtime": 4.2396, + "eval_samples_per_second": 141.524, + "eval_steps_per_second": 4.482, + "step": 2118 + }, + { + "epoch": 113.07, + "learning_rate": 4.246913580246914e-05, + "loss": 0.1238, + "step": 2120 + }, + { + "epoch": 113.6, + "learning_rate": 4.240740740740741e-05, + "loss": 0.0961, + "step": 2130 + }, + { + "epoch": 113.97, + "eval_accuracy": 0.25166666666666665, + "eval_loss": 4.356354713439941, + "eval_runtime": 4.2272, + "eval_samples_per_second": 141.937, + "eval_steps_per_second": 4.495, + "step": 2137 + }, + { + "epoch": 114.13, + "learning_rate": 4.234567901234568e-05, + "loss": 0.1181, + "step": 2140 + }, + { + "epoch": 114.67, + "learning_rate": 4.2283950617283955e-05, + "loss": 0.1132, + "step": 2150 + }, + { + "epoch": 114.99, + "eval_accuracy": 0.25333333333333335, + "eval_loss": 4.338386058807373, + "eval_runtime": 4.2183, + "eval_samples_per_second": 142.239, + "eval_steps_per_second": 4.504, + "step": 2156 + }, + { + "epoch": 115.2, + "learning_rate": 4.222222222222222e-05, + "loss": 0.1232, + "step": 2160 + }, + { + "epoch": 115.73, + "learning_rate": 4.2160493827160495e-05, + "loss": 0.1166, + "step": 2170 + }, + { + "epoch": 116.0, + "eval_accuracy": 0.24166666666666667, + "eval_loss": 4.415248394012451, + "eval_runtime": 4.233, + "eval_samples_per_second": 141.743, + "eval_steps_per_second": 4.489, + "step": 2175 + }, + { + "epoch": 116.27, + "learning_rate": 4.209876543209877e-05, + "loss": 0.129, + "step": 2180 + }, + { + "epoch": 116.8, + "learning_rate": 4.203703703703704e-05, + "loss": 0.1193, + "step": 2190 + }, + { + "epoch": 116.96, + "eval_accuracy": 0.24166666666666667, + "eval_loss": 4.363390922546387, + "eval_runtime": 4.2449, + "eval_samples_per_second": 141.345, + "eval_steps_per_second": 4.476, + "step": 2193 + }, + { + "epoch": 117.33, + "learning_rate": 4.197530864197531e-05, + "loss": 0.1348, + "step": 2200 + }, + { + "epoch": 117.87, + "learning_rate": 4.1913580246913584e-05, + "loss": 0.096, + "step": 2210 + }, + { + "epoch": 117.97, + "eval_accuracy": 0.235, + "eval_loss": 4.382623672485352, + "eval_runtime": 4.236, + "eval_samples_per_second": 141.643, + "eval_steps_per_second": 4.485, + "step": 2212 + }, + { + "epoch": 118.4, + "learning_rate": 4.185185185185185e-05, + "loss": 0.1375, + "step": 2220 + }, + { + "epoch": 118.93, + "learning_rate": 4.1790123456790124e-05, + "loss": 0.1158, + "step": 2230 + }, + { + "epoch": 118.99, + "eval_accuracy": 0.235, + "eval_loss": 4.452427387237549, + "eval_runtime": 4.1862, + "eval_samples_per_second": 143.327, + "eval_steps_per_second": 4.539, + "step": 2231 + }, + { + "epoch": 119.47, + "learning_rate": 4.17283950617284e-05, + "loss": 0.0863, + "step": 2240 + }, + { + "epoch": 120.0, + "learning_rate": 4.166666666666667e-05, + "loss": 0.099, + "step": 2250 + }, + { + "epoch": 120.0, + "eval_accuracy": 0.22333333333333333, + "eval_loss": 4.497795581817627, + "eval_runtime": 4.2433, + "eval_samples_per_second": 141.399, + "eval_steps_per_second": 4.478, + "step": 2250 + }, + { + "epoch": 120.53, + "learning_rate": 4.1604938271604946e-05, + "loss": 0.1065, + "step": 2260 + }, + { + "epoch": 120.96, + "eval_accuracy": 0.24, + "eval_loss": 4.412367343902588, + "eval_runtime": 4.2487, + "eval_samples_per_second": 141.22, + "eval_steps_per_second": 4.472, + "step": 2268 + }, + { + "epoch": 121.07, + "learning_rate": 4.154320987654321e-05, + "loss": 0.1202, + "step": 2270 + }, + { + "epoch": 121.6, + "learning_rate": 4.148148148148148e-05, + "loss": 0.129, + "step": 2280 + }, + { + "epoch": 121.97, + "eval_accuracy": 0.235, + "eval_loss": 4.381356239318848, + "eval_runtime": 4.2226, + "eval_samples_per_second": 142.092, + "eval_steps_per_second": 4.5, + "step": 2287 + }, + { + "epoch": 122.13, + "learning_rate": 4.1419753086419754e-05, + "loss": 0.1152, + "step": 2290 + }, + { + "epoch": 122.67, + "learning_rate": 4.135802469135803e-05, + "loss": 0.1047, + "step": 2300 + }, + { + "epoch": 122.99, + "eval_accuracy": 0.24666666666666667, + "eval_loss": 4.366286754608154, + "eval_runtime": 4.1923, + "eval_samples_per_second": 143.119, + "eval_steps_per_second": 4.532, + "step": 2306 + }, + { + "epoch": 123.2, + "learning_rate": 4.12962962962963e-05, + "loss": 0.1075, + "step": 2310 + }, + { + "epoch": 123.73, + "learning_rate": 4.1234567901234575e-05, + "loss": 0.101, + "step": 2320 + }, + { + "epoch": 124.0, + "eval_accuracy": 0.23, + "eval_loss": 4.511256694793701, + "eval_runtime": 4.2032, + "eval_samples_per_second": 142.748, + "eval_steps_per_second": 4.52, + "step": 2325 + }, + { + "epoch": 124.27, + "learning_rate": 4.117283950617284e-05, + "loss": 0.1111, + "step": 2330 + }, + { + "epoch": 124.8, + "learning_rate": 4.111111111111111e-05, + "loss": 0.1076, + "step": 2340 + }, + { + "epoch": 124.96, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.455278396606445, + "eval_runtime": 4.1874, + "eval_samples_per_second": 143.286, + "eval_steps_per_second": 4.537, + "step": 2343 + }, + { + "epoch": 125.33, + "learning_rate": 4.104938271604938e-05, + "loss": 0.112, + "step": 2350 + }, + { + "epoch": 125.87, + "learning_rate": 4.0987654320987657e-05, + "loss": 0.1135, + "step": 2360 + }, + { + "epoch": 125.97, + "eval_accuracy": 0.23, + "eval_loss": 4.435062885284424, + "eval_runtime": 4.2228, + "eval_samples_per_second": 142.087, + "eval_steps_per_second": 4.499, + "step": 2362 + }, + { + "epoch": 126.4, + "learning_rate": 4.092592592592593e-05, + "loss": 0.0839, + "step": 2370 + }, + { + "epoch": 126.93, + "learning_rate": 4.0864197530864204e-05, + "loss": 0.1066, + "step": 2380 + }, + { + "epoch": 126.99, + "eval_accuracy": 0.235, + "eval_loss": 4.4874396324157715, + "eval_runtime": 4.1945, + "eval_samples_per_second": 143.044, + "eval_steps_per_second": 4.53, + "step": 2381 + }, + { + "epoch": 127.47, + "learning_rate": 4.080246913580247e-05, + "loss": 0.1007, + "step": 2390 + }, + { + "epoch": 128.0, + "learning_rate": 4.074074074074074e-05, + "loss": 0.1256, + "step": 2400 + }, + { + "epoch": 128.0, + "eval_accuracy": 0.23333333333333334, + "eval_loss": 4.463526725769043, + "eval_runtime": 4.2437, + "eval_samples_per_second": 141.387, + "eval_steps_per_second": 4.477, + "step": 2400 + }, + { + "epoch": 128.53, + "learning_rate": 4.067901234567901e-05, + "loss": 0.0932, + "step": 2410 + }, + { + "epoch": 128.96, + "eval_accuracy": 0.23833333333333334, + "eval_loss": 4.457594394683838, + "eval_runtime": 4.2373, + "eval_samples_per_second": 141.599, + "eval_steps_per_second": 4.484, + "step": 2418 + }, + { + "epoch": 129.07, + "learning_rate": 4.0617283950617286e-05, + "loss": 0.1158, + "step": 2420 + }, + { + "epoch": 129.6, + "learning_rate": 4.055555555555556e-05, + "loss": 0.1189, + "step": 2430 + }, + { + "epoch": 129.97, + "eval_accuracy": 0.22666666666666666, + "eval_loss": 4.577010154724121, + "eval_runtime": 4.2164, + "eval_samples_per_second": 142.301, + "eval_steps_per_second": 4.506, + "step": 2437 + }, + { + "epoch": 130.13, + "learning_rate": 4.049382716049383e-05, + "loss": 0.116, + "step": 2440 + }, + { + "epoch": 130.67, + "learning_rate": 4.04320987654321e-05, + "loss": 0.1096, + "step": 2450 + }, + { + "epoch": 130.99, + "eval_accuracy": 0.23166666666666666, + "eval_loss": 4.492093563079834, + "eval_runtime": 4.2174, + "eval_samples_per_second": 142.267, + "eval_steps_per_second": 4.505, + "step": 2456 + }, + { + "epoch": 131.2, + "learning_rate": 4.0370370370370374e-05, + "loss": 0.1004, + "step": 2460 + }, + { + "epoch": 131.73, + "learning_rate": 4.030864197530864e-05, + "loss": 0.0791, + "step": 2470 + }, + { + "epoch": 132.0, + "eval_accuracy": 0.22666666666666666, + "eval_loss": 4.508973598480225, + "eval_runtime": 4.207, + "eval_samples_per_second": 142.618, + "eval_steps_per_second": 4.516, + "step": 2475 + }, + { + "epoch": 132.27, + "learning_rate": 4.0246913580246915e-05, + "loss": 0.0935, + "step": 2480 + }, + { + "epoch": 132.8, + "learning_rate": 4.018518518518519e-05, + "loss": 0.1152, + "step": 2490 + }, + { + "epoch": 132.96, + "eval_accuracy": 0.24166666666666667, + "eval_loss": 4.457157611846924, + "eval_runtime": 4.1941, + "eval_samples_per_second": 143.058, + "eval_steps_per_second": 4.53, + "step": 2493 + }, + { + "epoch": 133.33, + "learning_rate": 4.012345679012346e-05, + "loss": 0.091, + "step": 2500 + }, + { + "epoch": 133.87, + "learning_rate": 4.006172839506173e-05, + "loss": 0.1264, + "step": 2510 + }, + { + "epoch": 133.97, + "eval_accuracy": 0.25, + "eval_loss": 4.510921001434326, + "eval_runtime": 4.2057, + "eval_samples_per_second": 142.663, + "eval_steps_per_second": 4.518, + "step": 2512 + }, + { + "epoch": 134.4, + "learning_rate": 4e-05, + "loss": 0.0749, + "step": 2520 + }, + { + "epoch": 134.93, + "learning_rate": 3.993827160493827e-05, + "loss": 0.1009, + "step": 2530 + }, + { + "epoch": 134.99, + "eval_accuracy": 0.22833333333333333, + "eval_loss": 4.523574352264404, + "eval_runtime": 4.1971, + "eval_samples_per_second": 142.957, + "eval_steps_per_second": 4.527, + "step": 2531 + }, + { + "epoch": 135.47, + "learning_rate": 3.9876543209876544e-05, + "loss": 0.0964, + "step": 2540 + }, + { + "epoch": 136.0, + "learning_rate": 3.981481481481482e-05, + "loss": 0.0956, + "step": 2550 + }, + { + "epoch": 136.0, + "eval_accuracy": 0.245, + "eval_loss": 4.478328227996826, + "eval_runtime": 4.1949, + "eval_samples_per_second": 143.029, + "eval_steps_per_second": 4.529, + "step": 2550 + }, + { + "epoch": 136.53, + "learning_rate": 3.975308641975309e-05, + "loss": 0.0919, + "step": 2560 + }, + { + "epoch": 136.96, + "eval_accuracy": 0.24666666666666667, + "eval_loss": 4.5484466552734375, + "eval_runtime": 4.2364, + "eval_samples_per_second": 141.629, + "eval_steps_per_second": 4.485, + "step": 2568 + }, + { + "epoch": 137.07, + "learning_rate": 3.969135802469136e-05, + "loss": 0.1154, + "step": 2570 + }, + { + "epoch": 137.6, + "learning_rate": 3.962962962962963e-05, + "loss": 0.1042, + "step": 2580 + }, + { + "epoch": 137.97, + "eval_accuracy": 0.24333333333333335, + "eval_loss": 4.542301654815674, + "eval_runtime": 4.2535, + "eval_samples_per_second": 141.061, + "eval_steps_per_second": 4.467, + "step": 2587 + }, + { + "epoch": 138.13, + "learning_rate": 3.9567901234567906e-05, + "loss": 0.0859, + "step": 2590 + }, + { + "epoch": 138.67, + "learning_rate": 3.950617283950617e-05, + "loss": 0.1039, + "step": 2600 + }, + { + "epoch": 138.99, + "eval_accuracy": 0.245, + "eval_loss": 4.491814136505127, + "eval_runtime": 4.1944, + "eval_samples_per_second": 143.049, + "eval_steps_per_second": 4.53, + "step": 2606 + }, + { + "epoch": 139.2, + "learning_rate": 3.944444444444445e-05, + "loss": 0.0846, + "step": 2610 + }, + { + "epoch": 139.73, + "learning_rate": 3.938271604938272e-05, + "loss": 0.094, + "step": 2620 + }, + { + "epoch": 140.0, + "eval_accuracy": 0.24666666666666667, + "eval_loss": 4.545647144317627, + "eval_runtime": 4.1844, + "eval_samples_per_second": 143.39, + "eval_steps_per_second": 4.541, + "step": 2625 + }, + { + "epoch": 140.27, + "learning_rate": 3.932098765432099e-05, + "loss": 0.0929, + "step": 2630 + }, + { + "epoch": 140.8, + "learning_rate": 3.925925925925926e-05, + "loss": 0.1056, + "step": 2640 + }, + { + "epoch": 140.96, + "eval_accuracy": 0.245, + "eval_loss": 4.521935939788818, + "eval_runtime": 4.1914, + "eval_samples_per_second": 143.151, + "eval_steps_per_second": 4.533, + "step": 2643 + }, + { + "epoch": 141.33, + "learning_rate": 3.9197530864197535e-05, + "loss": 0.1025, + "step": 2650 + }, + { + "epoch": 141.87, + "learning_rate": 3.91358024691358e-05, + "loss": 0.0918, + "step": 2660 + }, + { + "epoch": 141.97, + "eval_accuracy": 0.245, + "eval_loss": 4.52545166015625, + "eval_runtime": 4.2013, + "eval_samples_per_second": 142.812, + "eval_steps_per_second": 4.522, + "step": 2662 + }, + { + "epoch": 142.4, + "learning_rate": 3.9074074074074076e-05, + "loss": 0.098, + "step": 2670 + }, + { + "epoch": 142.93, + "learning_rate": 3.901234567901234e-05, + "loss": 0.0877, + "step": 2680 + }, + { + "epoch": 142.99, + "eval_accuracy": 0.23833333333333334, + "eval_loss": 4.492305278778076, + "eval_runtime": 4.1813, + "eval_samples_per_second": 143.497, + "eval_steps_per_second": 4.544, + "step": 2681 + }, + { + "epoch": 143.47, + "learning_rate": 3.895061728395062e-05, + "loss": 0.0936, + "step": 2690 + }, + { + "epoch": 144.0, + "learning_rate": 3.888888888888889e-05, + "loss": 0.105, + "step": 2700 + }, + { + "epoch": 144.0, + "eval_accuracy": 0.235, + "eval_loss": 4.535154819488525, + "eval_runtime": 4.1912, + "eval_samples_per_second": 143.158, + "eval_steps_per_second": 4.533, + "step": 2700 + }, + { + "epoch": 144.53, + "learning_rate": 3.8827160493827165e-05, + "loss": 0.0892, + "step": 2710 + }, + { + "epoch": 144.96, + "eval_accuracy": 0.245, + "eval_loss": 4.471460819244385, + "eval_runtime": 4.1911, + "eval_samples_per_second": 143.161, + "eval_steps_per_second": 4.533, + "step": 2718 + }, + { + "epoch": 145.07, + "learning_rate": 3.876543209876544e-05, + "loss": 0.1041, + "step": 2720 + }, + { + "epoch": 145.6, + "learning_rate": 3.8703703703703705e-05, + "loss": 0.0963, + "step": 2730 + }, + { + "epoch": 145.97, + "eval_accuracy": 0.245, + "eval_loss": 4.506024360656738, + "eval_runtime": 4.1827, + "eval_samples_per_second": 143.449, + "eval_steps_per_second": 4.543, + "step": 2737 + }, + { + "epoch": 146.13, + "learning_rate": 3.864197530864197e-05, + "loss": 0.089, + "step": 2740 + }, + { + "epoch": 146.67, + "learning_rate": 3.8580246913580246e-05, + "loss": 0.095, + "step": 2750 + }, + { + "epoch": 146.99, + "eval_accuracy": 0.24333333333333335, + "eval_loss": 4.559337615966797, + "eval_runtime": 4.1884, + "eval_samples_per_second": 143.254, + "eval_steps_per_second": 4.536, + "step": 2756 + }, + { + "epoch": 147.2, + "learning_rate": 3.851851851851852e-05, + "loss": 0.0978, + "step": 2760 + }, + { + "epoch": 147.73, + "learning_rate": 3.8456790123456794e-05, + "loss": 0.0997, + "step": 2770 + }, + { + "epoch": 148.0, + "eval_accuracy": 0.24, + "eval_loss": 4.580421447753906, + "eval_runtime": 4.2171, + "eval_samples_per_second": 142.278, + "eval_steps_per_second": 4.505, + "step": 2775 + }, + { + "epoch": 148.27, + "learning_rate": 3.839506172839507e-05, + "loss": 0.0689, + "step": 2780 + }, + { + "epoch": 148.8, + "learning_rate": 3.8333333333333334e-05, + "loss": 0.0839, + "step": 2790 + }, + { + "epoch": 148.96, + "eval_accuracy": 0.23, + "eval_loss": 4.59170389175415, + "eval_runtime": 4.2419, + "eval_samples_per_second": 141.446, + "eval_steps_per_second": 4.479, + "step": 2793 + }, + { + "epoch": 149.33, + "learning_rate": 3.82716049382716e-05, + "loss": 0.1028, + "step": 2800 + }, + { + "epoch": 149.87, + "learning_rate": 3.8209876543209875e-05, + "loss": 0.0924, + "step": 2810 + }, + { + "epoch": 149.97, + "eval_accuracy": 0.22666666666666666, + "eval_loss": 4.593113422393799, + "eval_runtime": 4.2519, + "eval_samples_per_second": 141.113, + "eval_steps_per_second": 4.469, + "step": 2812 + }, + { + "epoch": 150.4, + "learning_rate": 3.814814814814815e-05, + "loss": 0.0694, + "step": 2820 + }, + { + "epoch": 150.93, + "learning_rate": 3.808641975308642e-05, + "loss": 0.0781, + "step": 2830 + }, + { + "epoch": 150.99, + "eval_accuracy": 0.23166666666666666, + "eval_loss": 4.578421115875244, + "eval_runtime": 4.2293, + "eval_samples_per_second": 141.867, + "eval_steps_per_second": 4.492, + "step": 2831 + }, + { + "epoch": 151.47, + "learning_rate": 3.80246913580247e-05, + "loss": 0.1092, + "step": 2840 + }, + { + "epoch": 152.0, + "learning_rate": 3.7962962962962964e-05, + "loss": 0.0986, + "step": 2850 + }, + { + "epoch": 152.0, + "eval_accuracy": 0.22833333333333333, + "eval_loss": 4.654634475708008, + "eval_runtime": 4.2497, + "eval_samples_per_second": 141.185, + "eval_steps_per_second": 4.471, + "step": 2850 + }, + { + "epoch": 152.53, + "learning_rate": 3.790123456790123e-05, + "loss": 0.0823, + "step": 2860 + }, + { + "epoch": 152.96, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.598492622375488, + "eval_runtime": 4.242, + "eval_samples_per_second": 141.443, + "eval_steps_per_second": 4.479, + "step": 2868 + }, + { + "epoch": 153.07, + "learning_rate": 3.7839506172839504e-05, + "loss": 0.0913, + "step": 2870 + }, + { + "epoch": 153.6, + "learning_rate": 3.777777777777778e-05, + "loss": 0.0887, + "step": 2880 + }, + { + "epoch": 153.97, + "eval_accuracy": 0.23, + "eval_loss": 4.614808559417725, + "eval_runtime": 4.2174, + "eval_samples_per_second": 142.267, + "eval_steps_per_second": 4.505, + "step": 2887 + }, + { + "epoch": 154.13, + "learning_rate": 3.771604938271605e-05, + "loss": 0.0787, + "step": 2890 + }, + { + "epoch": 154.67, + "learning_rate": 3.7654320987654326e-05, + "loss": 0.0671, + "step": 2900 + }, + { + "epoch": 154.99, + "eval_accuracy": 0.23333333333333334, + "eval_loss": 4.63968563079834, + "eval_runtime": 4.1904, + "eval_samples_per_second": 143.186, + "eval_steps_per_second": 4.534, + "step": 2906 + }, + { + "epoch": 155.2, + "learning_rate": 3.759259259259259e-05, + "loss": 0.104, + "step": 2910 + }, + { + "epoch": 155.73, + "learning_rate": 3.7530864197530867e-05, + "loss": 0.0897, + "step": 2920 + }, + { + "epoch": 156.0, + "eval_accuracy": 0.235, + "eval_loss": 4.583400249481201, + "eval_runtime": 4.1921, + "eval_samples_per_second": 143.126, + "eval_steps_per_second": 4.532, + "step": 2925 + }, + { + "epoch": 156.27, + "learning_rate": 3.7469135802469134e-05, + "loss": 0.0795, + "step": 2930 + }, + { + "epoch": 156.8, + "learning_rate": 3.740740740740741e-05, + "loss": 0.093, + "step": 2940 + }, + { + "epoch": 156.96, + "eval_accuracy": 0.24333333333333335, + "eval_loss": 4.539726257324219, + "eval_runtime": 4.203, + "eval_samples_per_second": 142.754, + "eval_steps_per_second": 4.521, + "step": 2943 + }, + { + "epoch": 157.33, + "learning_rate": 3.734567901234568e-05, + "loss": 0.0903, + "step": 2950 + }, + { + "epoch": 157.87, + "learning_rate": 3.7283950617283955e-05, + "loss": 0.0973, + "step": 2960 + }, + { + "epoch": 157.97, + "eval_accuracy": 0.23333333333333334, + "eval_loss": 4.55323600769043, + "eval_runtime": 4.2018, + "eval_samples_per_second": 142.796, + "eval_steps_per_second": 4.522, + "step": 2962 + }, + { + "epoch": 158.4, + "learning_rate": 3.722222222222222e-05, + "loss": 0.064, + "step": 2970 + }, + { + "epoch": 158.93, + "learning_rate": 3.7160493827160496e-05, + "loss": 0.1001, + "step": 2980 + }, + { + "epoch": 158.99, + "eval_accuracy": 0.24, + "eval_loss": 4.58270263671875, + "eval_runtime": 4.2074, + "eval_samples_per_second": 142.605, + "eval_steps_per_second": 4.516, + "step": 2981 + }, + { + "epoch": 159.47, + "learning_rate": 3.709876543209877e-05, + "loss": 0.0788, + "step": 2990 + }, + { + "epoch": 160.0, + "learning_rate": 3.7037037037037037e-05, + "loss": 0.0884, + "step": 3000 + }, + { + "epoch": 160.0, + "eval_accuracy": 0.235, + "eval_loss": 4.572762966156006, + "eval_runtime": 4.217, + "eval_samples_per_second": 142.282, + "eval_steps_per_second": 4.506, + "step": 3000 + }, + { + "epoch": 160.53, + "learning_rate": 3.697530864197531e-05, + "loss": 0.084, + "step": 3010 + }, + { + "epoch": 160.96, + "eval_accuracy": 0.235, + "eval_loss": 4.654175758361816, + "eval_runtime": 4.2104, + "eval_samples_per_second": 142.503, + "eval_steps_per_second": 4.513, + "step": 3018 + }, + { + "epoch": 161.07, + "learning_rate": 3.6913580246913584e-05, + "loss": 0.0773, + "step": 3020 + }, + { + "epoch": 161.6, + "learning_rate": 3.685185185185185e-05, + "loss": 0.0902, + "step": 3030 + }, + { + "epoch": 161.97, + "eval_accuracy": 0.24166666666666667, + "eval_loss": 4.636648178100586, + "eval_runtime": 4.2002, + "eval_samples_per_second": 142.849, + "eval_steps_per_second": 4.524, + "step": 3037 + }, + { + "epoch": 162.13, + "learning_rate": 3.6790123456790125e-05, + "loss": 0.0819, + "step": 3040 + }, + { + "epoch": 162.67, + "learning_rate": 3.67283950617284e-05, + "loss": 0.0944, + "step": 3050 + }, + { + "epoch": 162.99, + "eval_accuracy": 0.23833333333333334, + "eval_loss": 4.5957183837890625, + "eval_runtime": 4.2207, + "eval_samples_per_second": 142.157, + "eval_steps_per_second": 4.502, + "step": 3056 + }, + { + "epoch": 163.2, + "learning_rate": 3.6666666666666666e-05, + "loss": 0.0792, + "step": 3060 + }, + { + "epoch": 163.73, + "learning_rate": 3.660493827160494e-05, + "loss": 0.0828, + "step": 3070 + }, + { + "epoch": 164.0, + "eval_accuracy": 0.23, + "eval_loss": 4.652061462402344, + "eval_runtime": 4.2215, + "eval_samples_per_second": 142.13, + "eval_steps_per_second": 4.501, + "step": 3075 + }, + { + "epoch": 164.27, + "learning_rate": 3.654320987654321e-05, + "loss": 0.0729, + "step": 3080 + }, + { + "epoch": 164.8, + "learning_rate": 3.648148148148148e-05, + "loss": 0.0812, + "step": 3090 + }, + { + "epoch": 164.96, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.676053047180176, + "eval_runtime": 4.2058, + "eval_samples_per_second": 142.66, + "eval_steps_per_second": 4.518, + "step": 3093 + }, + { + "epoch": 165.33, + "learning_rate": 3.6419753086419754e-05, + "loss": 0.0723, + "step": 3100 + }, + { + "epoch": 165.87, + "learning_rate": 3.635802469135803e-05, + "loss": 0.0817, + "step": 3110 + }, + { + "epoch": 165.97, + "eval_accuracy": 0.225, + "eval_loss": 4.627193927764893, + "eval_runtime": 4.22, + "eval_samples_per_second": 142.182, + "eval_steps_per_second": 4.502, + "step": 3112 + }, + { + "epoch": 166.4, + "learning_rate": 3.62962962962963e-05, + "loss": 0.0718, + "step": 3120 + }, + { + "epoch": 166.93, + "learning_rate": 3.623456790123457e-05, + "loss": 0.07, + "step": 3130 + }, + { + "epoch": 166.99, + "eval_accuracy": 0.24333333333333335, + "eval_loss": 4.653589725494385, + "eval_runtime": 4.255, + "eval_samples_per_second": 141.01, + "eval_steps_per_second": 4.465, + "step": 3131 + }, + { + "epoch": 167.47, + "learning_rate": 3.617283950617284e-05, + "loss": 0.0672, + "step": 3140 + }, + { + "epoch": 168.0, + "learning_rate": 3.611111111111111e-05, + "loss": 0.0746, + "step": 3150 + }, + { + "epoch": 168.0, + "eval_accuracy": 0.245, + "eval_loss": 4.567090034484863, + "eval_runtime": 4.2691, + "eval_samples_per_second": 140.544, + "eval_steps_per_second": 4.451, + "step": 3150 + }, + { + "epoch": 168.53, + "learning_rate": 3.604938271604938e-05, + "loss": 0.0782, + "step": 3160 + }, + { + "epoch": 168.96, + "eval_accuracy": 0.24, + "eval_loss": 4.591490745544434, + "eval_runtime": 4.2101, + "eval_samples_per_second": 142.516, + "eval_steps_per_second": 4.513, + "step": 3168 + }, + { + "epoch": 169.07, + "learning_rate": 3.598765432098766e-05, + "loss": 0.0667, + "step": 3170 + }, + { + "epoch": 169.6, + "learning_rate": 3.592592592592593e-05, + "loss": 0.0677, + "step": 3180 + }, + { + "epoch": 169.97, + "eval_accuracy": 0.24333333333333335, + "eval_loss": 4.637347221374512, + "eval_runtime": 4.2173, + "eval_samples_per_second": 142.272, + "eval_steps_per_second": 4.505, + "step": 3187 + }, + { + "epoch": 170.13, + "learning_rate": 3.5864197530864205e-05, + "loss": 0.0839, + "step": 3190 + }, + { + "epoch": 170.67, + "learning_rate": 3.580246913580247e-05, + "loss": 0.0626, + "step": 3200 + }, + { + "epoch": 170.99, + "eval_accuracy": 0.25833333333333336, + "eval_loss": 4.672304630279541, + "eval_runtime": 4.2092, + "eval_samples_per_second": 142.546, + "eval_steps_per_second": 4.514, + "step": 3206 + }, + { + "epoch": 171.2, + "learning_rate": 3.574074074074074e-05, + "loss": 0.076, + "step": 3210 + }, + { + "epoch": 171.73, + "learning_rate": 3.567901234567901e-05, + "loss": 0.0697, + "step": 3220 + }, + { + "epoch": 172.0, + "eval_accuracy": 0.245, + "eval_loss": 4.681668281555176, + "eval_runtime": 4.1945, + "eval_samples_per_second": 143.044, + "eval_steps_per_second": 4.53, + "step": 3225 + }, + { + "epoch": 172.27, + "learning_rate": 3.5617283950617286e-05, + "loss": 0.0826, + "step": 3230 + }, + { + "epoch": 172.8, + "learning_rate": 3.555555555555556e-05, + "loss": 0.077, + "step": 3240 + }, + { + "epoch": 172.96, + "eval_accuracy": 0.23, + "eval_loss": 4.679342269897461, + "eval_runtime": 4.2134, + "eval_samples_per_second": 142.403, + "eval_steps_per_second": 4.509, + "step": 3243 + }, + { + "epoch": 173.33, + "learning_rate": 3.5493827160493834e-05, + "loss": 0.0766, + "step": 3250 + }, + { + "epoch": 173.87, + "learning_rate": 3.54320987654321e-05, + "loss": 0.068, + "step": 3260 + }, + { + "epoch": 173.97, + "eval_accuracy": 0.24166666666666667, + "eval_loss": 4.711019039154053, + "eval_runtime": 4.211, + "eval_samples_per_second": 142.484, + "eval_steps_per_second": 4.512, + "step": 3262 + }, + { + "epoch": 174.4, + "learning_rate": 3.537037037037037e-05, + "loss": 0.0587, + "step": 3270 + }, + { + "epoch": 174.93, + "learning_rate": 3.530864197530864e-05, + "loss": 0.0875, + "step": 3280 + }, + { + "epoch": 174.99, + "eval_accuracy": 0.24333333333333335, + "eval_loss": 4.701217174530029, + "eval_runtime": 4.2016, + "eval_samples_per_second": 142.802, + "eval_steps_per_second": 4.522, + "step": 3281 + }, + { + "epoch": 175.47, + "learning_rate": 3.5246913580246915e-05, + "loss": 0.0765, + "step": 3290 + }, + { + "epoch": 176.0, + "learning_rate": 3.518518518518519e-05, + "loss": 0.0787, + "step": 3300 + }, + { + "epoch": 176.0, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.711310863494873, + "eval_runtime": 4.2282, + "eval_samples_per_second": 141.904, + "eval_steps_per_second": 4.494, + "step": 3300 + }, + { + "epoch": 176.53, + "learning_rate": 3.512345679012346e-05, + "loss": 0.0779, + "step": 3310 + }, + { + "epoch": 176.96, + "eval_accuracy": 0.24, + "eval_loss": 4.699758529663086, + "eval_runtime": 4.1993, + "eval_samples_per_second": 142.882, + "eval_steps_per_second": 4.525, + "step": 3318 + }, + { + "epoch": 177.07, + "learning_rate": 3.506172839506173e-05, + "loss": 0.0632, + "step": 3320 + }, + { + "epoch": 177.6, + "learning_rate": 3.5e-05, + "loss": 0.0823, + "step": 3330 + }, + { + "epoch": 177.97, + "eval_accuracy": 0.24, + "eval_loss": 4.709224224090576, + "eval_runtime": 4.2154, + "eval_samples_per_second": 142.335, + "eval_steps_per_second": 4.507, + "step": 3337 + }, + { + "epoch": 178.13, + "learning_rate": 3.493827160493827e-05, + "loss": 0.0819, + "step": 3340 + }, + { + "epoch": 178.67, + "learning_rate": 3.4876543209876545e-05, + "loss": 0.0685, + "step": 3350 + }, + { + "epoch": 178.99, + "eval_accuracy": 0.245, + "eval_loss": 4.676272869110107, + "eval_runtime": 4.2277, + "eval_samples_per_second": 141.922, + "eval_steps_per_second": 4.494, + "step": 3356 + }, + { + "epoch": 179.2, + "learning_rate": 3.481481481481482e-05, + "loss": 0.0592, + "step": 3360 + }, + { + "epoch": 179.73, + "learning_rate": 3.475308641975309e-05, + "loss": 0.0698, + "step": 3370 + }, + { + "epoch": 180.0, + "eval_accuracy": 0.25666666666666665, + "eval_loss": 4.718149185180664, + "eval_runtime": 4.2111, + "eval_samples_per_second": 142.479, + "eval_steps_per_second": 4.512, + "step": 3375 + }, + { + "epoch": 180.27, + "learning_rate": 3.469135802469136e-05, + "loss": 0.074, + "step": 3380 + }, + { + "epoch": 180.8, + "learning_rate": 3.4629629629629626e-05, + "loss": 0.0924, + "step": 3390 + }, + { + "epoch": 180.96, + "eval_accuracy": 0.24833333333333332, + "eval_loss": 4.715093612670898, + "eval_runtime": 4.2612, + "eval_samples_per_second": 140.804, + "eval_steps_per_second": 4.459, + "step": 3393 + }, + { + "epoch": 181.33, + "learning_rate": 3.45679012345679e-05, + "loss": 0.0792, + "step": 3400 + }, + { + "epoch": 181.87, + "learning_rate": 3.4506172839506174e-05, + "loss": 0.084, + "step": 3410 + }, + { + "epoch": 181.97, + "eval_accuracy": 0.24166666666666667, + "eval_loss": 4.723077774047852, + "eval_runtime": 4.2506, + "eval_samples_per_second": 141.157, + "eval_steps_per_second": 4.47, + "step": 3412 + }, + { + "epoch": 182.4, + "learning_rate": 3.444444444444445e-05, + "loss": 0.08, + "step": 3420 + }, + { + "epoch": 182.93, + "learning_rate": 3.438271604938272e-05, + "loss": 0.0508, + "step": 3430 + }, + { + "epoch": 182.99, + "eval_accuracy": 0.23166666666666666, + "eval_loss": 4.685625076293945, + "eval_runtime": 4.2741, + "eval_samples_per_second": 140.38, + "eval_steps_per_second": 4.445, + "step": 3431 + }, + { + "epoch": 183.47, + "learning_rate": 3.432098765432099e-05, + "loss": 0.0604, + "step": 3440 + }, + { + "epoch": 184.0, + "learning_rate": 3.425925925925926e-05, + "loss": 0.0637, + "step": 3450 + }, + { + "epoch": 184.0, + "eval_accuracy": 0.24166666666666667, + "eval_loss": 4.704137325286865, + "eval_runtime": 4.274, + "eval_samples_per_second": 140.382, + "eval_steps_per_second": 4.445, + "step": 3450 + }, + { + "epoch": 184.53, + "learning_rate": 3.419753086419753e-05, + "loss": 0.06, + "step": 3460 + }, + { + "epoch": 184.96, + "eval_accuracy": 0.24, + "eval_loss": 4.720521450042725, + "eval_runtime": 4.2381, + "eval_samples_per_second": 141.572, + "eval_steps_per_second": 4.483, + "step": 3468 + }, + { + "epoch": 185.07, + "learning_rate": 3.41358024691358e-05, + "loss": 0.062, + "step": 3470 + }, + { + "epoch": 185.6, + "learning_rate": 3.4074074074074077e-05, + "loss": 0.0659, + "step": 3480 + }, + { + "epoch": 185.97, + "eval_accuracy": 0.24333333333333335, + "eval_loss": 4.725123405456543, + "eval_runtime": 4.2202, + "eval_samples_per_second": 142.172, + "eval_steps_per_second": 4.502, + "step": 3487 + }, + { + "epoch": 186.13, + "learning_rate": 3.401234567901235e-05, + "loss": 0.0788, + "step": 3490 + }, + { + "epoch": 186.67, + "learning_rate": 3.395061728395062e-05, + "loss": 0.0842, + "step": 3500 + }, + { + "epoch": 186.99, + "eval_accuracy": 0.23, + "eval_loss": 4.721489906311035, + "eval_runtime": 4.232, + "eval_samples_per_second": 141.778, + "eval_steps_per_second": 4.49, + "step": 3506 + }, + { + "epoch": 187.2, + "learning_rate": 3.388888888888889e-05, + "loss": 0.0773, + "step": 3510 + }, + { + "epoch": 187.73, + "learning_rate": 3.3827160493827165e-05, + "loss": 0.0733, + "step": 3520 + }, + { + "epoch": 188.0, + "eval_accuracy": 0.24, + "eval_loss": 4.706781387329102, + "eval_runtime": 4.2434, + "eval_samples_per_second": 141.395, + "eval_steps_per_second": 4.478, + "step": 3525 + }, + { + "epoch": 188.27, + "learning_rate": 3.376543209876543e-05, + "loss": 0.0722, + "step": 3530 + }, + { + "epoch": 188.8, + "learning_rate": 3.3703703703703706e-05, + "loss": 0.0647, + "step": 3540 + }, + { + "epoch": 188.96, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.759402751922607, + "eval_runtime": 4.2349, + "eval_samples_per_second": 141.679, + "eval_steps_per_second": 4.486, + "step": 3543 + }, + { + "epoch": 189.33, + "learning_rate": 3.364197530864198e-05, + "loss": 0.0714, + "step": 3550 + }, + { + "epoch": 189.87, + "learning_rate": 3.3580246913580247e-05, + "loss": 0.0569, + "step": 3560 + }, + { + "epoch": 189.97, + "eval_accuracy": 0.22333333333333333, + "eval_loss": 4.783107280731201, + "eval_runtime": 4.2157, + "eval_samples_per_second": 142.324, + "eval_steps_per_second": 4.507, + "step": 3562 + }, + { + "epoch": 190.4, + "learning_rate": 3.351851851851852e-05, + "loss": 0.0673, + "step": 3570 + }, + { + "epoch": 190.93, + "learning_rate": 3.3456790123456794e-05, + "loss": 0.0883, + "step": 3580 + }, + { + "epoch": 190.99, + "eval_accuracy": 0.235, + "eval_loss": 4.721207618713379, + "eval_runtime": 4.2273, + "eval_samples_per_second": 141.935, + "eval_steps_per_second": 4.495, + "step": 3581 + }, + { + "epoch": 191.47, + "learning_rate": 3.339506172839506e-05, + "loss": 0.0704, + "step": 3590 + }, + { + "epoch": 192.0, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.0622, + "step": 3600 + }, + { + "epoch": 192.0, + "eval_accuracy": 0.24166666666666667, + "eval_loss": 4.6877546310424805, + "eval_runtime": 4.2265, + "eval_samples_per_second": 141.962, + "eval_steps_per_second": 4.495, + "step": 3600 + }, + { + "epoch": 192.53, + "learning_rate": 3.327160493827161e-05, + "loss": 0.057, + "step": 3610 + }, + { + "epoch": 192.96, + "eval_accuracy": 0.24666666666666667, + "eval_loss": 4.665402889251709, + "eval_runtime": 4.2317, + "eval_samples_per_second": 141.788, + "eval_steps_per_second": 4.49, + "step": 3618 + }, + { + "epoch": 193.07, + "learning_rate": 3.3209876543209876e-05, + "loss": 0.0563, + "step": 3620 + }, + { + "epoch": 193.6, + "learning_rate": 3.314814814814815e-05, + "loss": 0.0654, + "step": 3630 + }, + { + "epoch": 193.97, + "eval_accuracy": 0.25166666666666665, + "eval_loss": 4.635808944702148, + "eval_runtime": 4.2421, + "eval_samples_per_second": 141.438, + "eval_steps_per_second": 4.479, + "step": 3637 + }, + { + "epoch": 194.13, + "learning_rate": 3.308641975308642e-05, + "loss": 0.0771, + "step": 3640 + }, + { + "epoch": 194.67, + "learning_rate": 3.30246913580247e-05, + "loss": 0.0868, + "step": 3650 + }, + { + "epoch": 194.99, + "eval_accuracy": 0.23333333333333334, + "eval_loss": 4.662071704864502, + "eval_runtime": 4.2363, + "eval_samples_per_second": 141.634, + "eval_steps_per_second": 4.485, + "step": 3656 + }, + { + "epoch": 195.2, + "learning_rate": 3.2962962962962964e-05, + "loss": 0.0592, + "step": 3660 + }, + { + "epoch": 195.73, + "learning_rate": 3.290123456790124e-05, + "loss": 0.0789, + "step": 3670 + }, + { + "epoch": 196.0, + "eval_accuracy": 0.23333333333333334, + "eval_loss": 4.698493003845215, + "eval_runtime": 4.2392, + "eval_samples_per_second": 141.537, + "eval_steps_per_second": 4.482, + "step": 3675 + }, + { + "epoch": 196.27, + "learning_rate": 3.2839506172839505e-05, + "loss": 0.0683, + "step": 3680 + }, + { + "epoch": 196.8, + "learning_rate": 3.277777777777778e-05, + "loss": 0.0657, + "step": 3690 + }, + { + "epoch": 196.96, + "eval_accuracy": 0.25666666666666665, + "eval_loss": 4.663649559020996, + "eval_runtime": 4.2398, + "eval_samples_per_second": 141.517, + "eval_steps_per_second": 4.481, + "step": 3693 + }, + { + "epoch": 197.33, + "learning_rate": 3.271604938271605e-05, + "loss": 0.057, + "step": 3700 + }, + { + "epoch": 197.87, + "learning_rate": 3.2654320987654326e-05, + "loss": 0.0648, + "step": 3710 + }, + { + "epoch": 197.97, + "eval_accuracy": 0.24666666666666667, + "eval_loss": 4.7698283195495605, + "eval_runtime": 4.2294, + "eval_samples_per_second": 141.864, + "eval_steps_per_second": 4.492, + "step": 3712 + }, + { + "epoch": 198.4, + "learning_rate": 3.25925925925926e-05, + "loss": 0.0577, + "step": 3720 + }, + { + "epoch": 198.93, + "learning_rate": 3.253086419753087e-05, + "loss": 0.0635, + "step": 3730 + }, + { + "epoch": 198.99, + "eval_accuracy": 0.24166666666666667, + "eval_loss": 4.722621440887451, + "eval_runtime": 4.2242, + "eval_samples_per_second": 142.038, + "eval_steps_per_second": 4.498, + "step": 3731 + }, + { + "epoch": 199.47, + "learning_rate": 3.2469135802469134e-05, + "loss": 0.0563, + "step": 3740 + }, + { + "epoch": 200.0, + "learning_rate": 3.240740740740741e-05, + "loss": 0.0637, + "step": 3750 + }, + { + "epoch": 200.0, + "eval_accuracy": 0.245, + "eval_loss": 4.748103618621826, + "eval_runtime": 4.2435, + "eval_samples_per_second": 141.393, + "eval_steps_per_second": 4.477, + "step": 3750 + }, + { + "epoch": 200.53, + "learning_rate": 3.234567901234568e-05, + "loss": 0.0665, + "step": 3760 + }, + { + "epoch": 200.96, + "eval_accuracy": 0.24833333333333332, + "eval_loss": 4.778894901275635, + "eval_runtime": 4.2222, + "eval_samples_per_second": 142.106, + "eval_steps_per_second": 4.5, + "step": 3768 + }, + { + "epoch": 201.07, + "learning_rate": 3.2283950617283955e-05, + "loss": 0.0649, + "step": 3770 + }, + { + "epoch": 201.6, + "learning_rate": 3.222222222222223e-05, + "loss": 0.0799, + "step": 3780 + }, + { + "epoch": 201.97, + "eval_accuracy": 0.235, + "eval_loss": 4.701383590698242, + "eval_runtime": 4.2577, + "eval_samples_per_second": 140.92, + "eval_steps_per_second": 4.462, + "step": 3787 + }, + { + "epoch": 202.13, + "learning_rate": 3.216049382716049e-05, + "loss": 0.049, + "step": 3790 + }, + { + "epoch": 202.67, + "learning_rate": 3.209876543209876e-05, + "loss": 0.064, + "step": 3800 + }, + { + "epoch": 202.99, + "eval_accuracy": 0.24166666666666667, + "eval_loss": 4.752817153930664, + "eval_runtime": 4.2702, + "eval_samples_per_second": 140.507, + "eval_steps_per_second": 4.449, + "step": 3806 + }, + { + "epoch": 203.2, + "learning_rate": 3.203703703703704e-05, + "loss": 0.0554, + "step": 3810 + }, + { + "epoch": 203.73, + "learning_rate": 3.197530864197531e-05, + "loss": 0.0772, + "step": 3820 + }, + { + "epoch": 204.0, + "eval_accuracy": 0.23833333333333334, + "eval_loss": 4.740113735198975, + "eval_runtime": 4.2544, + "eval_samples_per_second": 141.031, + "eval_steps_per_second": 4.466, + "step": 3825 + }, + { + "epoch": 204.27, + "learning_rate": 3.1913580246913585e-05, + "loss": 0.0625, + "step": 3830 + }, + { + "epoch": 204.8, + "learning_rate": 3.185185185185185e-05, + "loss": 0.0438, + "step": 3840 + }, + { + "epoch": 204.96, + "eval_accuracy": 0.24166666666666667, + "eval_loss": 4.76779317855835, + "eval_runtime": 4.2439, + "eval_samples_per_second": 141.379, + "eval_steps_per_second": 4.477, + "step": 3843 + }, + { + "epoch": 205.33, + "learning_rate": 3.1790123456790125e-05, + "loss": 0.0716, + "step": 3850 + }, + { + "epoch": 205.87, + "learning_rate": 3.172839506172839e-05, + "loss": 0.0766, + "step": 3860 + }, + { + "epoch": 205.97, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.71795654296875, + "eval_runtime": 4.2289, + "eval_samples_per_second": 141.881, + "eval_steps_per_second": 4.493, + "step": 3862 + }, + { + "epoch": 206.4, + "learning_rate": 3.1666666666666666e-05, + "loss": 0.08, + "step": 3870 + }, + { + "epoch": 206.93, + "learning_rate": 3.160493827160494e-05, + "loss": 0.0687, + "step": 3880 + }, + { + "epoch": 206.99, + "eval_accuracy": 0.24333333333333335, + "eval_loss": 4.7058281898498535, + "eval_runtime": 4.2979, + "eval_samples_per_second": 139.602, + "eval_steps_per_second": 4.421, + "step": 3881 + }, + { + "epoch": 207.47, + "learning_rate": 3.1543209876543214e-05, + "loss": 0.0686, + "step": 3890 + }, + { + "epoch": 208.0, + "learning_rate": 3.148148148148148e-05, + "loss": 0.0801, + "step": 3900 + }, + { + "epoch": 208.0, + "eval_accuracy": 0.235, + "eval_loss": 4.7583746910095215, + "eval_runtime": 4.3035, + "eval_samples_per_second": 139.423, + "eval_steps_per_second": 4.415, + "step": 3900 + }, + { + "epoch": 208.53, + "learning_rate": 3.1419753086419755e-05, + "loss": 0.0772, + "step": 3910 + }, + { + "epoch": 208.96, + "eval_accuracy": 0.24333333333333335, + "eval_loss": 4.73037576675415, + "eval_runtime": 4.265, + "eval_samples_per_second": 140.681, + "eval_steps_per_second": 4.455, + "step": 3918 + }, + { + "epoch": 209.07, + "learning_rate": 3.135802469135803e-05, + "loss": 0.0504, + "step": 3920 + }, + { + "epoch": 209.6, + "learning_rate": 3.1296296296296295e-05, + "loss": 0.0663, + "step": 3930 + }, + { + "epoch": 209.97, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.693957805633545, + "eval_runtime": 4.2721, + "eval_samples_per_second": 140.445, + "eval_steps_per_second": 4.447, + "step": 3937 + }, + { + "epoch": 210.13, + "learning_rate": 3.123456790123457e-05, + "loss": 0.0497, + "step": 3940 + }, + { + "epoch": 210.67, + "learning_rate": 3.117283950617284e-05, + "loss": 0.0529, + "step": 3950 + }, + { + "epoch": 210.99, + "eval_accuracy": 0.235, + "eval_loss": 4.694019317626953, + "eval_runtime": 4.2603, + "eval_samples_per_second": 140.834, + "eval_steps_per_second": 4.46, + "step": 3956 + }, + { + "epoch": 211.2, + "learning_rate": 3.111111111111111e-05, + "loss": 0.0559, + "step": 3960 + }, + { + "epoch": 211.73, + "learning_rate": 3.1049382716049384e-05, + "loss": 0.0568, + "step": 3970 + }, + { + "epoch": 212.0, + "eval_accuracy": 0.235, + "eval_loss": 4.733299255371094, + "eval_runtime": 4.2654, + "eval_samples_per_second": 140.666, + "eval_steps_per_second": 4.454, + "step": 3975 + }, + { + "epoch": 212.27, + "learning_rate": 3.098765432098766e-05, + "loss": 0.057, + "step": 3980 + }, + { + "epoch": 212.8, + "learning_rate": 3.0925925925925924e-05, + "loss": 0.0697, + "step": 3990 + }, + { + "epoch": 212.96, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.667250156402588, + "eval_runtime": 4.2885, + "eval_samples_per_second": 139.909, + "eval_steps_per_second": 4.43, + "step": 3993 + }, + { + "epoch": 213.33, + "learning_rate": 3.08641975308642e-05, + "loss": 0.0402, + "step": 4000 + }, + { + "epoch": 213.87, + "learning_rate": 3.080246913580247e-05, + "loss": 0.0394, + "step": 4010 + }, + { + "epoch": 213.97, + "eval_accuracy": 0.245, + "eval_loss": 4.673309326171875, + "eval_runtime": 4.2825, + "eval_samples_per_second": 140.105, + "eval_steps_per_second": 4.437, + "step": 4012 + }, + { + "epoch": 214.4, + "learning_rate": 3.074074074074074e-05, + "loss": 0.0398, + "step": 4020 + }, + { + "epoch": 214.93, + "learning_rate": 3.067901234567901e-05, + "loss": 0.0625, + "step": 4030 + }, + { + "epoch": 214.99, + "eval_accuracy": 0.225, + "eval_loss": 4.738312244415283, + "eval_runtime": 4.3054, + "eval_samples_per_second": 139.359, + "eval_steps_per_second": 4.413, + "step": 4031 + }, + { + "epoch": 215.47, + "learning_rate": 3.061728395061729e-05, + "loss": 0.0626, + "step": 4040 + }, + { + "epoch": 216.0, + "learning_rate": 3.055555555555556e-05, + "loss": 0.0588, + "step": 4050 + }, + { + "epoch": 216.0, + "eval_accuracy": 0.24, + "eval_loss": 4.767359256744385, + "eval_runtime": 4.3344, + "eval_samples_per_second": 138.426, + "eval_steps_per_second": 4.383, + "step": 4050 + }, + { + "epoch": 216.53, + "learning_rate": 3.0493827160493827e-05, + "loss": 0.0594, + "step": 4060 + }, + { + "epoch": 216.96, + "eval_accuracy": 0.24166666666666667, + "eval_loss": 4.687304496765137, + "eval_runtime": 4.285, + "eval_samples_per_second": 140.024, + "eval_steps_per_second": 4.434, + "step": 4068 + }, + { + "epoch": 217.07, + "learning_rate": 3.0432098765432098e-05, + "loss": 0.0721, + "step": 4070 + }, + { + "epoch": 217.6, + "learning_rate": 3.037037037037037e-05, + "loss": 0.0451, + "step": 4080 + }, + { + "epoch": 217.97, + "eval_accuracy": 0.24333333333333335, + "eval_loss": 4.671844959259033, + "eval_runtime": 4.2925, + "eval_samples_per_second": 139.78, + "eval_steps_per_second": 4.426, + "step": 4087 + }, + { + "epoch": 218.13, + "learning_rate": 3.0308641975308642e-05, + "loss": 0.0445, + "step": 4090 + }, + { + "epoch": 218.67, + "learning_rate": 3.0246913580246916e-05, + "loss": 0.047, + "step": 4100 + }, + { + "epoch": 218.99, + "eval_accuracy": 0.22833333333333333, + "eval_loss": 4.714609146118164, + "eval_runtime": 4.2883, + "eval_samples_per_second": 139.914, + "eval_steps_per_second": 4.431, + "step": 4106 + }, + { + "epoch": 219.2, + "learning_rate": 3.018518518518519e-05, + "loss": 0.0648, + "step": 4110 + }, + { + "epoch": 219.73, + "learning_rate": 3.012345679012346e-05, + "loss": 0.0445, + "step": 4120 + }, + { + "epoch": 220.0, + "eval_accuracy": 0.22833333333333333, + "eval_loss": 4.717392921447754, + "eval_runtime": 4.2776, + "eval_samples_per_second": 140.266, + "eval_steps_per_second": 4.442, + "step": 4125 + }, + { + "epoch": 220.27, + "learning_rate": 3.0061728395061727e-05, + "loss": 0.0438, + "step": 4130 + }, + { + "epoch": 220.8, + "learning_rate": 3e-05, + "loss": 0.0746, + "step": 4140 + }, + { + "epoch": 220.96, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.670175075531006, + "eval_runtime": 4.2998, + "eval_samples_per_second": 139.542, + "eval_steps_per_second": 4.419, + "step": 4143 + }, + { + "epoch": 221.33, + "learning_rate": 2.993827160493827e-05, + "loss": 0.0617, + "step": 4150 + }, + { + "epoch": 221.87, + "learning_rate": 2.9876543209876545e-05, + "loss": 0.0697, + "step": 4160 + }, + { + "epoch": 221.97, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.646183967590332, + "eval_runtime": 4.3381, + "eval_samples_per_second": 138.308, + "eval_steps_per_second": 4.38, + "step": 4162 + }, + { + "epoch": 222.4, + "learning_rate": 2.981481481481482e-05, + "loss": 0.0866, + "step": 4170 + }, + { + "epoch": 222.93, + "learning_rate": 2.975308641975309e-05, + "loss": 0.0562, + "step": 4180 + }, + { + "epoch": 222.99, + "eval_accuracy": 0.23333333333333334, + "eval_loss": 4.695559024810791, + "eval_runtime": 4.3517, + "eval_samples_per_second": 137.876, + "eval_steps_per_second": 4.366, + "step": 4181 + }, + { + "epoch": 223.47, + "learning_rate": 2.9691358024691356e-05, + "loss": 0.0986, + "step": 4190 + }, + { + "epoch": 224.0, + "learning_rate": 2.962962962962963e-05, + "loss": 0.047, + "step": 4200 + }, + { + "epoch": 224.0, + "eval_accuracy": 0.23833333333333334, + "eval_loss": 4.727797985076904, + "eval_runtime": 4.3288, + "eval_samples_per_second": 138.607, + "eval_steps_per_second": 4.389, + "step": 4200 + }, + { + "epoch": 224.53, + "learning_rate": 2.95679012345679e-05, + "loss": 0.0612, + "step": 4210 + }, + { + "epoch": 224.96, + "eval_accuracy": 0.235, + "eval_loss": 4.730659484863281, + "eval_runtime": 4.3287, + "eval_samples_per_second": 138.611, + "eval_steps_per_second": 4.389, + "step": 4218 + }, + { + "epoch": 225.07, + "learning_rate": 2.9506172839506174e-05, + "loss": 0.048, + "step": 4220 + }, + { + "epoch": 225.6, + "learning_rate": 2.9444444444444448e-05, + "loss": 0.0625, + "step": 4230 + }, + { + "epoch": 225.97, + "eval_accuracy": 0.25666666666666665, + "eval_loss": 4.667015075683594, + "eval_runtime": 4.3594, + "eval_samples_per_second": 137.632, + "eval_steps_per_second": 4.358, + "step": 4237 + }, + { + "epoch": 226.13, + "learning_rate": 2.9382716049382718e-05, + "loss": 0.0558, + "step": 4240 + }, + { + "epoch": 226.67, + "learning_rate": 2.9320987654320992e-05, + "loss": 0.0739, + "step": 4250 + }, + { + "epoch": 226.99, + "eval_accuracy": 0.23166666666666666, + "eval_loss": 4.710987091064453, + "eval_runtime": 4.3562, + "eval_samples_per_second": 137.734, + "eval_steps_per_second": 4.362, + "step": 4256 + }, + { + "epoch": 227.2, + "learning_rate": 2.925925925925926e-05, + "loss": 0.054, + "step": 4260 + }, + { + "epoch": 227.73, + "learning_rate": 2.919753086419753e-05, + "loss": 0.0637, + "step": 4270 + }, + { + "epoch": 228.0, + "eval_accuracy": 0.22, + "eval_loss": 4.703871726989746, + "eval_runtime": 4.3307, + "eval_samples_per_second": 138.545, + "eval_steps_per_second": 4.387, + "step": 4275 + }, + { + "epoch": 228.27, + "learning_rate": 2.9135802469135803e-05, + "loss": 0.0491, + "step": 4280 + }, + { + "epoch": 228.8, + "learning_rate": 2.9074074074074077e-05, + "loss": 0.0461, + "step": 4290 + }, + { + "epoch": 228.96, + "eval_accuracy": 0.22666666666666666, + "eval_loss": 4.711916923522949, + "eval_runtime": 4.3352, + "eval_samples_per_second": 138.402, + "eval_steps_per_second": 4.383, + "step": 4293 + }, + { + "epoch": 229.33, + "learning_rate": 2.9012345679012347e-05, + "loss": 0.0544, + "step": 4300 + }, + { + "epoch": 229.87, + "learning_rate": 2.895061728395062e-05, + "loss": 0.0506, + "step": 4310 + }, + { + "epoch": 229.97, + "eval_accuracy": 0.23, + "eval_loss": 4.7098541259765625, + "eval_runtime": 4.3688, + "eval_samples_per_second": 137.339, + "eval_steps_per_second": 4.349, + "step": 4312 + }, + { + "epoch": 230.4, + "learning_rate": 2.8888888888888888e-05, + "loss": 0.0511, + "step": 4320 + }, + { + "epoch": 230.93, + "learning_rate": 2.882716049382716e-05, + "loss": 0.0412, + "step": 4330 + }, + { + "epoch": 230.99, + "eval_accuracy": 0.23166666666666666, + "eval_loss": 4.671385288238525, + "eval_runtime": 4.3317, + "eval_samples_per_second": 138.515, + "eval_steps_per_second": 4.386, + "step": 4331 + }, + { + "epoch": 231.47, + "learning_rate": 2.8765432098765432e-05, + "loss": 0.0681, + "step": 4340 + }, + { + "epoch": 232.0, + "learning_rate": 2.8703703703703706e-05, + "loss": 0.057, + "step": 4350 + }, + { + "epoch": 232.0, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.692080497741699, + "eval_runtime": 4.3407, + "eval_samples_per_second": 138.227, + "eval_steps_per_second": 4.377, + "step": 4350 + }, + { + "epoch": 232.53, + "learning_rate": 2.8641975308641977e-05, + "loss": 0.0402, + "step": 4360 + }, + { + "epoch": 232.96, + "eval_accuracy": 0.23166666666666666, + "eval_loss": 4.754528045654297, + "eval_runtime": 4.3356, + "eval_samples_per_second": 138.389, + "eval_steps_per_second": 4.382, + "step": 4368 + }, + { + "epoch": 233.07, + "learning_rate": 2.858024691358025e-05, + "loss": 0.0766, + "step": 4370 + }, + { + "epoch": 233.6, + "learning_rate": 2.851851851851852e-05, + "loss": 0.058, + "step": 4380 + }, + { + "epoch": 233.97, + "eval_accuracy": 0.225, + "eval_loss": 4.7573161125183105, + "eval_runtime": 4.353, + "eval_samples_per_second": 137.837, + "eval_steps_per_second": 4.365, + "step": 4387 + }, + { + "epoch": 234.13, + "learning_rate": 2.8456790123456788e-05, + "loss": 0.0749, + "step": 4390 + }, + { + "epoch": 234.67, + "learning_rate": 2.839506172839506e-05, + "loss": 0.0661, + "step": 4400 + }, + { + "epoch": 234.99, + "eval_accuracy": 0.22833333333333333, + "eval_loss": 4.6800408363342285, + "eval_runtime": 4.3414, + "eval_samples_per_second": 138.205, + "eval_steps_per_second": 4.377, + "step": 4406 + }, + { + "epoch": 235.2, + "learning_rate": 2.8333333333333335e-05, + "loss": 0.0613, + "step": 4410 + }, + { + "epoch": 235.73, + "learning_rate": 2.8271604938271606e-05, + "loss": 0.0613, + "step": 4420 + }, + { + "epoch": 236.0, + "eval_accuracy": 0.24333333333333335, + "eval_loss": 4.653302192687988, + "eval_runtime": 4.3257, + "eval_samples_per_second": 138.707, + "eval_steps_per_second": 4.392, + "step": 4425 + }, + { + "epoch": 236.27, + "learning_rate": 2.820987654320988e-05, + "loss": 0.0555, + "step": 4430 + }, + { + "epoch": 236.8, + "learning_rate": 2.814814814814815e-05, + "loss": 0.0462, + "step": 4440 + }, + { + "epoch": 236.96, + "eval_accuracy": 0.22833333333333333, + "eval_loss": 4.674839496612549, + "eval_runtime": 4.3407, + "eval_samples_per_second": 138.228, + "eval_steps_per_second": 4.377, + "step": 4443 + }, + { + "epoch": 237.33, + "learning_rate": 2.8086419753086424e-05, + "loss": 0.0742, + "step": 4450 + }, + { + "epoch": 237.87, + "learning_rate": 2.802469135802469e-05, + "loss": 0.0494, + "step": 4460 + }, + { + "epoch": 237.97, + "eval_accuracy": 0.23, + "eval_loss": 4.687388896942139, + "eval_runtime": 4.3685, + "eval_samples_per_second": 137.346, + "eval_steps_per_second": 4.349, + "step": 4462 + }, + { + "epoch": 238.4, + "learning_rate": 2.7962962962962965e-05, + "loss": 0.0617, + "step": 4470 + }, + { + "epoch": 238.93, + "learning_rate": 2.7901234567901235e-05, + "loss": 0.0643, + "step": 4480 + }, + { + "epoch": 238.99, + "eval_accuracy": 0.23333333333333334, + "eval_loss": 4.729123592376709, + "eval_runtime": 4.3628, + "eval_samples_per_second": 137.527, + "eval_steps_per_second": 4.355, + "step": 4481 + }, + { + "epoch": 239.47, + "learning_rate": 2.783950617283951e-05, + "loss": 0.0557, + "step": 4490 + }, + { + "epoch": 240.0, + "learning_rate": 2.777777777777778e-05, + "loss": 0.0422, + "step": 4500 + }, + { + "epoch": 240.0, + "eval_accuracy": 0.23, + "eval_loss": 4.7088117599487305, + "eval_runtime": 4.3986, + "eval_samples_per_second": 136.407, + "eval_steps_per_second": 4.32, + "step": 4500 + }, + { + "epoch": 240.53, + "learning_rate": 2.7716049382716053e-05, + "loss": 0.0376, + "step": 4510 + }, + { + "epoch": 240.96, + "eval_accuracy": 0.225, + "eval_loss": 4.74221658706665, + "eval_runtime": 4.4146, + "eval_samples_per_second": 135.913, + "eval_steps_per_second": 4.304, + "step": 4518 + }, + { + "epoch": 241.07, + "learning_rate": 2.765432098765432e-05, + "loss": 0.0343, + "step": 4520 + }, + { + "epoch": 241.6, + "learning_rate": 2.7592592592592594e-05, + "loss": 0.0696, + "step": 4530 + }, + { + "epoch": 241.97, + "eval_accuracy": 0.22833333333333333, + "eval_loss": 4.801132678985596, + "eval_runtime": 4.3976, + "eval_samples_per_second": 136.44, + "eval_steps_per_second": 4.321, + "step": 4537 + }, + { + "epoch": 242.13, + "learning_rate": 2.7530864197530864e-05, + "loss": 0.0603, + "step": 4540 + }, + { + "epoch": 242.67, + "learning_rate": 2.7469135802469138e-05, + "loss": 0.0609, + "step": 4550 + }, + { + "epoch": 242.99, + "eval_accuracy": 0.22166666666666668, + "eval_loss": 4.801338195800781, + "eval_runtime": 4.3836, + "eval_samples_per_second": 136.873, + "eval_steps_per_second": 4.334, + "step": 4556 + }, + { + "epoch": 243.2, + "learning_rate": 2.7407407407407408e-05, + "loss": 0.0484, + "step": 4560 + }, + { + "epoch": 243.73, + "learning_rate": 2.7345679012345682e-05, + "loss": 0.0637, + "step": 4570 + }, + { + "epoch": 244.0, + "eval_accuracy": 0.225, + "eval_loss": 4.760260105133057, + "eval_runtime": 4.4032, + "eval_samples_per_second": 136.265, + "eval_steps_per_second": 4.315, + "step": 4575 + }, + { + "epoch": 244.27, + "learning_rate": 2.7283950617283956e-05, + "loss": 0.0489, + "step": 4580 + }, + { + "epoch": 244.8, + "learning_rate": 2.7222222222222223e-05, + "loss": 0.0529, + "step": 4590 + }, + { + "epoch": 244.96, + "eval_accuracy": 0.22333333333333333, + "eval_loss": 4.789524078369141, + "eval_runtime": 4.388, + "eval_samples_per_second": 136.736, + "eval_steps_per_second": 4.33, + "step": 4593 + }, + { + "epoch": 245.33, + "learning_rate": 2.7160493827160493e-05, + "loss": 0.0617, + "step": 4600 + }, + { + "epoch": 245.87, + "learning_rate": 2.7098765432098767e-05, + "loss": 0.0603, + "step": 4610 + }, + { + "epoch": 245.97, + "eval_accuracy": 0.235, + "eval_loss": 4.763910293579102, + "eval_runtime": 4.3989, + "eval_samples_per_second": 136.397, + "eval_steps_per_second": 4.319, + "step": 4612 + }, + { + "epoch": 246.4, + "learning_rate": 2.7037037037037037e-05, + "loss": 0.0558, + "step": 4620 + }, + { + "epoch": 246.93, + "learning_rate": 2.697530864197531e-05, + "loss": 0.0365, + "step": 4630 + }, + { + "epoch": 246.99, + "eval_accuracy": 0.24333333333333335, + "eval_loss": 4.728492259979248, + "eval_runtime": 4.3868, + "eval_samples_per_second": 136.774, + "eval_steps_per_second": 4.331, + "step": 4631 + }, + { + "epoch": 247.47, + "learning_rate": 2.6913580246913585e-05, + "loss": 0.0862, + "step": 4640 + }, + { + "epoch": 248.0, + "learning_rate": 2.6851851851851855e-05, + "loss": 0.0732, + "step": 4650 + }, + { + "epoch": 248.0, + "eval_accuracy": 0.22833333333333333, + "eval_loss": 4.725191116333008, + "eval_runtime": 4.4162, + "eval_samples_per_second": 135.863, + "eval_steps_per_second": 4.302, + "step": 4650 + }, + { + "epoch": 248.53, + "learning_rate": 2.6790123456790122e-05, + "loss": 0.0709, + "step": 4660 + }, + { + "epoch": 248.96, + "eval_accuracy": 0.23, + "eval_loss": 4.761960506439209, + "eval_runtime": 4.3965, + "eval_samples_per_second": 136.472, + "eval_steps_per_second": 4.322, + "step": 4668 + }, + { + "epoch": 249.07, + "learning_rate": 2.6728395061728396e-05, + "loss": 0.0463, + "step": 4670 + }, + { + "epoch": 249.6, + "learning_rate": 2.6666666666666667e-05, + "loss": 0.0485, + "step": 4680 + }, + { + "epoch": 249.97, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.752857208251953, + "eval_runtime": 4.404, + "eval_samples_per_second": 136.24, + "eval_steps_per_second": 4.314, + "step": 4687 + }, + { + "epoch": 250.13, + "learning_rate": 2.660493827160494e-05, + "loss": 0.0453, + "step": 4690 + }, + { + "epoch": 250.67, + "learning_rate": 2.654320987654321e-05, + "loss": 0.0449, + "step": 4700 + }, + { + "epoch": 250.99, + "eval_accuracy": 0.24166666666666667, + "eval_loss": 4.800561428070068, + "eval_runtime": 4.4195, + "eval_samples_per_second": 135.761, + "eval_steps_per_second": 4.299, + "step": 4706 + }, + { + "epoch": 251.2, + "learning_rate": 2.6481481481481485e-05, + "loss": 0.0629, + "step": 4710 + }, + { + "epoch": 251.73, + "learning_rate": 2.641975308641975e-05, + "loss": 0.0506, + "step": 4720 + }, + { + "epoch": 252.0, + "eval_accuracy": 0.23333333333333334, + "eval_loss": 4.802790641784668, + "eval_runtime": 4.4472, + "eval_samples_per_second": 134.916, + "eval_steps_per_second": 4.272, + "step": 4725 + }, + { + "epoch": 252.27, + "learning_rate": 2.6358024691358025e-05, + "loss": 0.038, + "step": 4730 + }, + { + "epoch": 252.8, + "learning_rate": 2.6296296296296296e-05, + "loss": 0.0455, + "step": 4740 + }, + { + "epoch": 252.96, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.777773380279541, + "eval_runtime": 4.4493, + "eval_samples_per_second": 134.853, + "eval_steps_per_second": 4.27, + "step": 4743 + }, + { + "epoch": 253.33, + "learning_rate": 2.623456790123457e-05, + "loss": 0.0418, + "step": 4750 + }, + { + "epoch": 253.87, + "learning_rate": 2.617283950617284e-05, + "loss": 0.0594, + "step": 4760 + }, + { + "epoch": 253.97, + "eval_accuracy": 0.23833333333333334, + "eval_loss": 4.743904113769531, + "eval_runtime": 4.4293, + "eval_samples_per_second": 135.461, + "eval_steps_per_second": 4.29, + "step": 4762 + }, + { + "epoch": 254.4, + "learning_rate": 2.6111111111111114e-05, + "loss": 0.0511, + "step": 4770 + }, + { + "epoch": 254.93, + "learning_rate": 2.6049382716049388e-05, + "loss": 0.0551, + "step": 4780 + }, + { + "epoch": 254.99, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.806947708129883, + "eval_runtime": 4.4446, + "eval_samples_per_second": 134.995, + "eval_steps_per_second": 4.275, + "step": 4781 + }, + { + "epoch": 255.47, + "learning_rate": 2.5987654320987655e-05, + "loss": 0.0512, + "step": 4790 + }, + { + "epoch": 256.0, + "learning_rate": 2.5925925925925925e-05, + "loss": 0.0435, + "step": 4800 + }, + { + "epoch": 256.0, + "eval_accuracy": 0.23833333333333334, + "eval_loss": 4.817090034484863, + "eval_runtime": 4.463, + "eval_samples_per_second": 134.44, + "eval_steps_per_second": 4.257, + "step": 4800 + }, + { + "epoch": 256.53, + "learning_rate": 2.58641975308642e-05, + "loss": 0.042, + "step": 4810 + }, + { + "epoch": 256.96, + "eval_accuracy": 0.23833333333333334, + "eval_loss": 4.796122074127197, + "eval_runtime": 4.4296, + "eval_samples_per_second": 135.453, + "eval_steps_per_second": 4.289, + "step": 4818 + }, + { + "epoch": 257.07, + "learning_rate": 2.580246913580247e-05, + "loss": 0.0614, + "step": 4820 + }, + { + "epoch": 257.6, + "learning_rate": 2.5740740740740743e-05, + "loss": 0.0403, + "step": 4830 + }, + { + "epoch": 257.97, + "eval_accuracy": 0.23833333333333334, + "eval_loss": 4.817234516143799, + "eval_runtime": 4.4455, + "eval_samples_per_second": 134.967, + "eval_steps_per_second": 4.274, + "step": 4837 + }, + { + "epoch": 258.13, + "learning_rate": 2.5679012345679017e-05, + "loss": 0.0428, + "step": 4840 + }, + { + "epoch": 258.67, + "learning_rate": 2.5617283950617287e-05, + "loss": 0.0524, + "step": 4850 + }, + { + "epoch": 258.99, + "eval_accuracy": 0.23, + "eval_loss": 4.8536577224731445, + "eval_runtime": 4.4559, + "eval_samples_per_second": 134.654, + "eval_steps_per_second": 4.264, + "step": 4856 + }, + { + "epoch": 259.2, + "learning_rate": 2.5555555555555554e-05, + "loss": 0.0495, + "step": 4860 + }, + { + "epoch": 259.73, + "learning_rate": 2.5493827160493828e-05, + "loss": 0.0461, + "step": 4870 + }, + { + "epoch": 260.0, + "eval_accuracy": 0.22833333333333333, + "eval_loss": 4.769797325134277, + "eval_runtime": 4.4781, + "eval_samples_per_second": 133.984, + "eval_steps_per_second": 4.243, + "step": 4875 + }, + { + "epoch": 260.27, + "learning_rate": 2.5432098765432098e-05, + "loss": 0.0547, + "step": 4880 + }, + { + "epoch": 260.8, + "learning_rate": 2.5370370370370372e-05, + "loss": 0.05, + "step": 4890 + }, + { + "epoch": 260.96, + "eval_accuracy": 0.24833333333333332, + "eval_loss": 4.805800437927246, + "eval_runtime": 4.4587, + "eval_samples_per_second": 134.567, + "eval_steps_per_second": 4.261, + "step": 4893 + }, + { + "epoch": 261.33, + "learning_rate": 2.5308641975308646e-05, + "loss": 0.0463, + "step": 4900 + }, + { + "epoch": 261.87, + "learning_rate": 2.5246913580246916e-05, + "loss": 0.0545, + "step": 4910 + }, + { + "epoch": 261.97, + "eval_accuracy": 0.23333333333333334, + "eval_loss": 4.839805603027344, + "eval_runtime": 5.2865, + "eval_samples_per_second": 113.497, + "eval_steps_per_second": 3.594, + "step": 4912 + }, + { + "epoch": 262.4, + "learning_rate": 2.5185185185185183e-05, + "loss": 0.066, + "step": 4920 + }, + { + "epoch": 262.93, + "learning_rate": 2.5123456790123457e-05, + "loss": 0.0405, + "step": 4930 + }, + { + "epoch": 262.99, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.82278299331665, + "eval_runtime": 4.4772, + "eval_samples_per_second": 134.011, + "eval_steps_per_second": 4.244, + "step": 4931 + }, + { + "epoch": 263.47, + "learning_rate": 2.5061728395061727e-05, + "loss": 0.0466, + "step": 4940 + }, + { + "epoch": 264.0, + "learning_rate": 2.5e-05, + "loss": 0.0615, + "step": 4950 + }, + { + "epoch": 264.0, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.839545726776123, + "eval_runtime": 4.4638, + "eval_samples_per_second": 134.416, + "eval_steps_per_second": 4.257, + "step": 4950 + }, + { + "epoch": 264.53, + "learning_rate": 2.4938271604938275e-05, + "loss": 0.0381, + "step": 4960 + }, + { + "epoch": 264.96, + "eval_accuracy": 0.22333333333333333, + "eval_loss": 4.823088645935059, + "eval_runtime": 4.4558, + "eval_samples_per_second": 134.657, + "eval_steps_per_second": 4.264, + "step": 4968 + }, + { + "epoch": 265.07, + "learning_rate": 2.4876543209876542e-05, + "loss": 0.0609, + "step": 4970 + }, + { + "epoch": 265.6, + "learning_rate": 2.4814814814814816e-05, + "loss": 0.0464, + "step": 4980 + }, + { + "epoch": 265.97, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.818026065826416, + "eval_runtime": 4.4544, + "eval_samples_per_second": 134.698, + "eval_steps_per_second": 4.265, + "step": 4987 + }, + { + "epoch": 266.13, + "learning_rate": 2.475308641975309e-05, + "loss": 0.0454, + "step": 4990 + }, + { + "epoch": 266.67, + "learning_rate": 2.4691358024691357e-05, + "loss": 0.058, + "step": 5000 + }, + { + "epoch": 266.99, + "eval_accuracy": 0.235, + "eval_loss": 4.87436056137085, + "eval_runtime": 4.4992, + "eval_samples_per_second": 133.357, + "eval_steps_per_second": 4.223, + "step": 5006 + }, + { + "epoch": 267.2, + "learning_rate": 2.462962962962963e-05, + "loss": 0.044, + "step": 5010 + }, + { + "epoch": 267.73, + "learning_rate": 2.4567901234567904e-05, + "loss": 0.0553, + "step": 5020 + }, + { + "epoch": 268.0, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.886570453643799, + "eval_runtime": 4.4931, + "eval_samples_per_second": 133.538, + "eval_steps_per_second": 4.229, + "step": 5025 + }, + { + "epoch": 268.27, + "learning_rate": 2.4506172839506175e-05, + "loss": 0.0536, + "step": 5030 + }, + { + "epoch": 268.8, + "learning_rate": 2.4444444444444445e-05, + "loss": 0.0505, + "step": 5040 + }, + { + "epoch": 268.96, + "eval_accuracy": 0.24, + "eval_loss": 4.853390216827393, + "eval_runtime": 4.4705, + "eval_samples_per_second": 134.214, + "eval_steps_per_second": 4.25, + "step": 5043 + }, + { + "epoch": 269.33, + "learning_rate": 2.438271604938272e-05, + "loss": 0.0269, + "step": 5050 + }, + { + "epoch": 269.87, + "learning_rate": 2.432098765432099e-05, + "loss": 0.049, + "step": 5060 + }, + { + "epoch": 269.97, + "eval_accuracy": 0.23333333333333334, + "eval_loss": 4.870242118835449, + "eval_runtime": 4.4681, + "eval_samples_per_second": 134.285, + "eval_steps_per_second": 4.252, + "step": 5062 + }, + { + "epoch": 270.4, + "learning_rate": 2.425925925925926e-05, + "loss": 0.0464, + "step": 5070 + }, + { + "epoch": 270.93, + "learning_rate": 2.4197530864197533e-05, + "loss": 0.0444, + "step": 5080 + }, + { + "epoch": 270.99, + "eval_accuracy": 0.22666666666666666, + "eval_loss": 4.871464729309082, + "eval_runtime": 4.4942, + "eval_samples_per_second": 133.505, + "eval_steps_per_second": 4.228, + "step": 5081 + }, + { + "epoch": 271.47, + "learning_rate": 2.4135802469135804e-05, + "loss": 0.0433, + "step": 5090 + }, + { + "epoch": 272.0, + "learning_rate": 2.4074074074074074e-05, + "loss": 0.0457, + "step": 5100 + }, + { + "epoch": 272.0, + "eval_accuracy": 0.225, + "eval_loss": 4.827383518218994, + "eval_runtime": 4.4878, + "eval_samples_per_second": 133.697, + "eval_steps_per_second": 4.234, + "step": 5100 + }, + { + "epoch": 272.53, + "learning_rate": 2.4012345679012348e-05, + "loss": 0.0546, + "step": 5110 + }, + { + "epoch": 272.96, + "eval_accuracy": 0.225, + "eval_loss": 4.844120502471924, + "eval_runtime": 4.5115, + "eval_samples_per_second": 132.993, + "eval_steps_per_second": 4.211, + "step": 5118 + }, + { + "epoch": 273.07, + "learning_rate": 2.3950617283950618e-05, + "loss": 0.0427, + "step": 5120 + }, + { + "epoch": 273.6, + "learning_rate": 2.3888888888888892e-05, + "loss": 0.0378, + "step": 5130 + }, + { + "epoch": 273.97, + "eval_accuracy": 0.225, + "eval_loss": 4.822915077209473, + "eval_runtime": 4.4872, + "eval_samples_per_second": 133.713, + "eval_steps_per_second": 4.234, + "step": 5137 + }, + { + "epoch": 274.13, + "learning_rate": 2.3827160493827162e-05, + "loss": 0.0603, + "step": 5140 + }, + { + "epoch": 274.67, + "learning_rate": 2.3765432098765433e-05, + "loss": 0.0374, + "step": 5150 + }, + { + "epoch": 274.99, + "eval_accuracy": 0.22166666666666668, + "eval_loss": 4.805349349975586, + "eval_runtime": 4.5038, + "eval_samples_per_second": 133.22, + "eval_steps_per_second": 4.219, + "step": 5156 + }, + { + "epoch": 275.2, + "learning_rate": 2.3703703703703707e-05, + "loss": 0.0384, + "step": 5160 + }, + { + "epoch": 275.73, + "learning_rate": 2.3641975308641977e-05, + "loss": 0.047, + "step": 5170 + }, + { + "epoch": 276.0, + "eval_accuracy": 0.23333333333333334, + "eval_loss": 4.861939907073975, + "eval_runtime": 4.5272, + "eval_samples_per_second": 132.531, + "eval_steps_per_second": 4.197, + "step": 5175 + }, + { + "epoch": 276.27, + "learning_rate": 2.3580246913580247e-05, + "loss": 0.0352, + "step": 5180 + }, + { + "epoch": 276.8, + "learning_rate": 2.351851851851852e-05, + "loss": 0.0526, + "step": 5190 + }, + { + "epoch": 276.96, + "eval_accuracy": 0.24166666666666667, + "eval_loss": 4.87931489944458, + "eval_runtime": 4.5397, + "eval_samples_per_second": 132.166, + "eval_steps_per_second": 4.185, + "step": 5193 + }, + { + "epoch": 277.33, + "learning_rate": 2.345679012345679e-05, + "loss": 0.0406, + "step": 5200 + }, + { + "epoch": 277.87, + "learning_rate": 2.3395061728395062e-05, + "loss": 0.0503, + "step": 5210 + }, + { + "epoch": 277.97, + "eval_accuracy": 0.22833333333333333, + "eval_loss": 4.9059576988220215, + "eval_runtime": 4.5301, + "eval_samples_per_second": 132.446, + "eval_steps_per_second": 4.194, + "step": 5212 + }, + { + "epoch": 278.4, + "learning_rate": 2.3333333333333336e-05, + "loss": 0.043, + "step": 5220 + }, + { + "epoch": 278.93, + "learning_rate": 2.3271604938271606e-05, + "loss": 0.0414, + "step": 5230 + }, + { + "epoch": 278.99, + "eval_accuracy": 0.24, + "eval_loss": 4.86867094039917, + "eval_runtime": 4.4935, + "eval_samples_per_second": 133.527, + "eval_steps_per_second": 4.228, + "step": 5231 + }, + { + "epoch": 279.47, + "learning_rate": 2.3209876543209877e-05, + "loss": 0.0561, + "step": 5240 + }, + { + "epoch": 280.0, + "learning_rate": 2.314814814814815e-05, + "loss": 0.0361, + "step": 5250 + }, + { + "epoch": 280.0, + "eval_accuracy": 0.24, + "eval_loss": 4.853731155395508, + "eval_runtime": 4.5103, + "eval_samples_per_second": 133.029, + "eval_steps_per_second": 4.213, + "step": 5250 + }, + { + "epoch": 280.53, + "learning_rate": 2.308641975308642e-05, + "loss": 0.0449, + "step": 5260 + }, + { + "epoch": 280.96, + "eval_accuracy": 0.23833333333333334, + "eval_loss": 4.8204240798950195, + "eval_runtime": 4.5205, + "eval_samples_per_second": 132.729, + "eval_steps_per_second": 4.203, + "step": 5268 + }, + { + "epoch": 281.07, + "learning_rate": 2.302469135802469e-05, + "loss": 0.0527, + "step": 5270 + }, + { + "epoch": 281.6, + "learning_rate": 2.2962962962962965e-05, + "loss": 0.0596, + "step": 5280 + }, + { + "epoch": 281.97, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.803044319152832, + "eval_runtime": 4.5651, + "eval_samples_per_second": 131.433, + "eval_steps_per_second": 4.162, + "step": 5287 + }, + { + "epoch": 282.13, + "learning_rate": 2.2901234567901235e-05, + "loss": 0.056, + "step": 5290 + }, + { + "epoch": 282.67, + "learning_rate": 2.2839506172839506e-05, + "loss": 0.0494, + "step": 5300 + }, + { + "epoch": 282.99, + "eval_accuracy": 0.24833333333333332, + "eval_loss": 4.8059892654418945, + "eval_runtime": 4.5556, + "eval_samples_per_second": 131.706, + "eval_steps_per_second": 4.171, + "step": 5306 + }, + { + "epoch": 283.2, + "learning_rate": 2.277777777777778e-05, + "loss": 0.0311, + "step": 5310 + }, + { + "epoch": 283.73, + "learning_rate": 2.271604938271605e-05, + "loss": 0.0483, + "step": 5320 + }, + { + "epoch": 284.0, + "eval_accuracy": 0.235, + "eval_loss": 4.7877960205078125, + "eval_runtime": 4.5534, + "eval_samples_per_second": 131.77, + "eval_steps_per_second": 4.173, + "step": 5325 + }, + { + "epoch": 284.27, + "learning_rate": 2.2654320987654324e-05, + "loss": 0.0606, + "step": 5330 + }, + { + "epoch": 284.8, + "learning_rate": 2.2592592592592594e-05, + "loss": 0.0338, + "step": 5340 + }, + { + "epoch": 284.96, + "eval_accuracy": 0.23833333333333334, + "eval_loss": 4.825440406799316, + "eval_runtime": 4.5446, + "eval_samples_per_second": 132.024, + "eval_steps_per_second": 4.181, + "step": 5343 + }, + { + "epoch": 285.33, + "learning_rate": 2.2530864197530865e-05, + "loss": 0.0454, + "step": 5350 + }, + { + "epoch": 285.87, + "learning_rate": 2.246913580246914e-05, + "loss": 0.0319, + "step": 5360 + }, + { + "epoch": 285.97, + "eval_accuracy": 0.23833333333333334, + "eval_loss": 4.826366424560547, + "eval_runtime": 4.5558, + "eval_samples_per_second": 131.701, + "eval_steps_per_second": 4.171, + "step": 5362 + }, + { + "epoch": 286.4, + "learning_rate": 2.240740740740741e-05, + "loss": 0.0414, + "step": 5370 + }, + { + "epoch": 286.93, + "learning_rate": 2.234567901234568e-05, + "loss": 0.0454, + "step": 5380 + }, + { + "epoch": 286.99, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.842591762542725, + "eval_runtime": 4.55, + "eval_samples_per_second": 131.868, + "eval_steps_per_second": 4.176, + "step": 5381 + }, + { + "epoch": 287.47, + "learning_rate": 2.2283950617283953e-05, + "loss": 0.0426, + "step": 5390 + }, + { + "epoch": 288.0, + "learning_rate": 2.2222222222222223e-05, + "loss": 0.0409, + "step": 5400 + }, + { + "epoch": 288.0, + "eval_accuracy": 0.24833333333333332, + "eval_loss": 4.819784164428711, + "eval_runtime": 4.5681, + "eval_samples_per_second": 131.345, + "eval_steps_per_second": 4.159, + "step": 5400 + }, + { + "epoch": 288.53, + "learning_rate": 2.2160493827160494e-05, + "loss": 0.0435, + "step": 5410 + }, + { + "epoch": 288.96, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.8339433670043945, + "eval_runtime": 4.5995, + "eval_samples_per_second": 130.45, + "eval_steps_per_second": 4.131, + "step": 5418 + }, + { + "epoch": 289.07, + "learning_rate": 2.2098765432098767e-05, + "loss": 0.0452, + "step": 5420 + }, + { + "epoch": 289.6, + "learning_rate": 2.2037037037037038e-05, + "loss": 0.0498, + "step": 5430 + }, + { + "epoch": 289.97, + "eval_accuracy": 0.225, + "eval_loss": 4.838677883148193, + "eval_runtime": 4.6181, + "eval_samples_per_second": 129.923, + "eval_steps_per_second": 4.114, + "step": 5437 + }, + { + "epoch": 290.13, + "learning_rate": 2.1975308641975308e-05, + "loss": 0.0467, + "step": 5440 + }, + { + "epoch": 290.67, + "learning_rate": 2.1913580246913582e-05, + "loss": 0.0447, + "step": 5450 + }, + { + "epoch": 290.99, + "eval_accuracy": 0.23, + "eval_loss": 4.834191799163818, + "eval_runtime": 4.5905, + "eval_samples_per_second": 130.706, + "eval_steps_per_second": 4.139, + "step": 5456 + }, + { + "epoch": 291.2, + "learning_rate": 2.1851851851851852e-05, + "loss": 0.0441, + "step": 5460 + }, + { + "epoch": 291.73, + "learning_rate": 2.1790123456790123e-05, + "loss": 0.0402, + "step": 5470 + }, + { + "epoch": 292.0, + "eval_accuracy": 0.23333333333333334, + "eval_loss": 4.849569320678711, + "eval_runtime": 4.5792, + "eval_samples_per_second": 131.028, + "eval_steps_per_second": 4.149, + "step": 5475 + }, + { + "epoch": 292.27, + "learning_rate": 2.1728395061728397e-05, + "loss": 0.0314, + "step": 5480 + }, + { + "epoch": 292.8, + "learning_rate": 2.1666666666666667e-05, + "loss": 0.0366, + "step": 5490 + }, + { + "epoch": 292.96, + "eval_accuracy": 0.23166666666666666, + "eval_loss": 4.867130279541016, + "eval_runtime": 4.5608, + "eval_samples_per_second": 131.555, + "eval_steps_per_second": 4.166, + "step": 5493 + }, + { + "epoch": 293.33, + "learning_rate": 2.1604938271604937e-05, + "loss": 0.0388, + "step": 5500 + }, + { + "epoch": 293.87, + "learning_rate": 2.154320987654321e-05, + "loss": 0.0369, + "step": 5510 + }, + { + "epoch": 293.97, + "eval_accuracy": 0.24666666666666667, + "eval_loss": 4.836596488952637, + "eval_runtime": 4.583, + "eval_samples_per_second": 130.917, + "eval_steps_per_second": 4.146, + "step": 5512 + }, + { + "epoch": 294.4, + "learning_rate": 2.148148148148148e-05, + "loss": 0.0651, + "step": 5520 + }, + { + "epoch": 294.93, + "learning_rate": 2.1419753086419755e-05, + "loss": 0.0361, + "step": 5530 + }, + { + "epoch": 294.99, + "eval_accuracy": 0.24333333333333335, + "eval_loss": 4.799242973327637, + "eval_runtime": 4.6324, + "eval_samples_per_second": 129.522, + "eval_steps_per_second": 4.102, + "step": 5531 + }, + { + "epoch": 295.47, + "learning_rate": 2.1358024691358026e-05, + "loss": 0.0549, + "step": 5540 + }, + { + "epoch": 296.0, + "learning_rate": 2.1296296296296296e-05, + "loss": 0.0448, + "step": 5550 + }, + { + "epoch": 296.0, + "eval_accuracy": 0.22666666666666666, + "eval_loss": 4.848645210266113, + "eval_runtime": 4.6425, + "eval_samples_per_second": 129.241, + "eval_steps_per_second": 4.093, + "step": 5550 + }, + { + "epoch": 296.53, + "learning_rate": 2.123456790123457e-05, + "loss": 0.055, + "step": 5560 + }, + { + "epoch": 296.96, + "eval_accuracy": 0.22666666666666666, + "eval_loss": 4.897942066192627, + "eval_runtime": 4.5933, + "eval_samples_per_second": 130.624, + "eval_steps_per_second": 4.136, + "step": 5568 + }, + { + "epoch": 297.07, + "learning_rate": 2.117283950617284e-05, + "loss": 0.0427, + "step": 5570 + }, + { + "epoch": 297.6, + "learning_rate": 2.111111111111111e-05, + "loss": 0.0585, + "step": 5580 + }, + { + "epoch": 297.97, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.866022109985352, + "eval_runtime": 4.6056, + "eval_samples_per_second": 130.276, + "eval_steps_per_second": 4.125, + "step": 5587 + }, + { + "epoch": 298.13, + "learning_rate": 2.1049382716049385e-05, + "loss": 0.0478, + "step": 5590 + }, + { + "epoch": 298.67, + "learning_rate": 2.0987654320987655e-05, + "loss": 0.0477, + "step": 5600 + }, + { + "epoch": 298.99, + "eval_accuracy": 0.24333333333333335, + "eval_loss": 4.8717451095581055, + "eval_runtime": 4.6241, + "eval_samples_per_second": 129.756, + "eval_steps_per_second": 4.109, + "step": 5606 + }, + { + "epoch": 299.2, + "learning_rate": 2.0925925925925925e-05, + "loss": 0.0515, + "step": 5610 + }, + { + "epoch": 299.73, + "learning_rate": 2.08641975308642e-05, + "loss": 0.0247, + "step": 5620 + }, + { + "epoch": 300.0, + "eval_accuracy": 0.22833333333333333, + "eval_loss": 4.883845806121826, + "eval_runtime": 4.6098, + "eval_samples_per_second": 130.158, + "eval_steps_per_second": 4.122, + "step": 5625 + }, + { + "epoch": 300.27, + "learning_rate": 2.0802469135802473e-05, + "loss": 0.0438, + "step": 5630 + }, + { + "epoch": 300.8, + "learning_rate": 2.074074074074074e-05, + "loss": 0.047, + "step": 5640 + }, + { + "epoch": 300.96, + "eval_accuracy": 0.23833333333333334, + "eval_loss": 4.824845314025879, + "eval_runtime": 4.5984, + "eval_samples_per_second": 130.482, + "eval_steps_per_second": 4.132, + "step": 5643 + }, + { + "epoch": 301.33, + "learning_rate": 2.0679012345679014e-05, + "loss": 0.0497, + "step": 5650 + }, + { + "epoch": 301.87, + "learning_rate": 2.0617283950617287e-05, + "loss": 0.0608, + "step": 5660 + }, + { + "epoch": 301.97, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.832959175109863, + "eval_runtime": 4.6004, + "eval_samples_per_second": 130.422, + "eval_steps_per_second": 4.13, + "step": 5662 + }, + { + "epoch": 302.4, + "learning_rate": 2.0555555555555555e-05, + "loss": 0.0413, + "step": 5670 + }, + { + "epoch": 302.93, + "learning_rate": 2.0493827160493828e-05, + "loss": 0.0417, + "step": 5680 + }, + { + "epoch": 302.99, + "eval_accuracy": 0.23166666666666666, + "eval_loss": 4.823630332946777, + "eval_runtime": 4.5991, + "eval_samples_per_second": 130.462, + "eval_steps_per_second": 4.131, + "step": 5681 + }, + { + "epoch": 303.47, + "learning_rate": 2.0432098765432102e-05, + "loss": 0.0329, + "step": 5690 + }, + { + "epoch": 304.0, + "learning_rate": 2.037037037037037e-05, + "loss": 0.0494, + "step": 5700 + }, + { + "epoch": 304.0, + "eval_accuracy": 0.23833333333333334, + "eval_loss": 4.807046413421631, + "eval_runtime": 4.6151, + "eval_samples_per_second": 130.008, + "eval_steps_per_second": 4.117, + "step": 5700 + }, + { + "epoch": 304.53, + "learning_rate": 2.0308641975308643e-05, + "loss": 0.0316, + "step": 5710 + }, + { + "epoch": 304.96, + "eval_accuracy": 0.22666666666666666, + "eval_loss": 4.821282386779785, + "eval_runtime": 4.6056, + "eval_samples_per_second": 130.275, + "eval_steps_per_second": 4.125, + "step": 5718 + }, + { + "epoch": 305.07, + "learning_rate": 2.0246913580246917e-05, + "loss": 0.0473, + "step": 5720 + }, + { + "epoch": 305.6, + "learning_rate": 2.0185185185185187e-05, + "loss": 0.0421, + "step": 5730 + }, + { + "epoch": 305.97, + "eval_accuracy": 0.23166666666666666, + "eval_loss": 4.863409996032715, + "eval_runtime": 4.6418, + "eval_samples_per_second": 129.259, + "eval_steps_per_second": 4.093, + "step": 5737 + }, + { + "epoch": 306.13, + "learning_rate": 2.0123456790123457e-05, + "loss": 0.0382, + "step": 5740 + }, + { + "epoch": 306.67, + "learning_rate": 2.006172839506173e-05, + "loss": 0.0411, + "step": 5750 + }, + { + "epoch": 306.99, + "eval_accuracy": 0.24, + "eval_loss": 4.877004623413086, + "eval_runtime": 4.6583, + "eval_samples_per_second": 128.802, + "eval_steps_per_second": 4.079, + "step": 5756 + }, + { + "epoch": 307.2, + "learning_rate": 2e-05, + "loss": 0.042, + "step": 5760 + }, + { + "epoch": 307.73, + "learning_rate": 1.9938271604938272e-05, + "loss": 0.0404, + "step": 5770 + }, + { + "epoch": 308.0, + "eval_accuracy": 0.23833333333333334, + "eval_loss": 4.9029860496521, + "eval_runtime": 4.6363, + "eval_samples_per_second": 129.412, + "eval_steps_per_second": 4.098, + "step": 5775 + }, + { + "epoch": 308.27, + "learning_rate": 1.9876543209876546e-05, + "loss": 0.0401, + "step": 5780 + }, + { + "epoch": 308.8, + "learning_rate": 1.9814814814814816e-05, + "loss": 0.0397, + "step": 5790 + }, + { + "epoch": 308.96, + "eval_accuracy": 0.23833333333333334, + "eval_loss": 4.943274974822998, + "eval_runtime": 4.6219, + "eval_samples_per_second": 129.817, + "eval_steps_per_second": 4.111, + "step": 5793 + }, + { + "epoch": 309.33, + "learning_rate": 1.9753086419753087e-05, + "loss": 0.0467, + "step": 5800 + }, + { + "epoch": 309.87, + "learning_rate": 1.969135802469136e-05, + "loss": 0.053, + "step": 5810 + }, + { + "epoch": 309.97, + "eval_accuracy": 0.23333333333333334, + "eval_loss": 4.930065631866455, + "eval_runtime": 4.6916, + "eval_samples_per_second": 127.887, + "eval_steps_per_second": 4.05, + "step": 5812 + }, + { + "epoch": 310.4, + "learning_rate": 1.962962962962963e-05, + "loss": 0.0459, + "step": 5820 + }, + { + "epoch": 310.93, + "learning_rate": 1.95679012345679e-05, + "loss": 0.0303, + "step": 5830 + }, + { + "epoch": 310.99, + "eval_accuracy": 0.22833333333333333, + "eval_loss": 4.8961286544799805, + "eval_runtime": 4.6651, + "eval_samples_per_second": 128.614, + "eval_steps_per_second": 4.073, + "step": 5831 + }, + { + "epoch": 311.47, + "learning_rate": 1.950617283950617e-05, + "loss": 0.0314, + "step": 5840 + }, + { + "epoch": 312.0, + "learning_rate": 1.9444444444444445e-05, + "loss": 0.0369, + "step": 5850 + }, + { + "epoch": 312.0, + "eval_accuracy": 0.24333333333333335, + "eval_loss": 4.856044292449951, + "eval_runtime": 4.6488, + "eval_samples_per_second": 129.067, + "eval_steps_per_second": 4.087, + "step": 5850 + }, + { + "epoch": 312.53, + "learning_rate": 1.938271604938272e-05, + "loss": 0.0423, + "step": 5860 + }, + { + "epoch": 312.96, + "eval_accuracy": 0.225, + "eval_loss": 4.917734146118164, + "eval_runtime": 4.6571, + "eval_samples_per_second": 128.835, + "eval_steps_per_second": 4.08, + "step": 5868 + }, + { + "epoch": 313.07, + "learning_rate": 1.9320987654320986e-05, + "loss": 0.03, + "step": 5870 + }, + { + "epoch": 313.6, + "learning_rate": 1.925925925925926e-05, + "loss": 0.0343, + "step": 5880 + }, + { + "epoch": 313.97, + "eval_accuracy": 0.22333333333333333, + "eval_loss": 4.892765522003174, + "eval_runtime": 4.6848, + "eval_samples_per_second": 128.074, + "eval_steps_per_second": 4.056, + "step": 5887 + }, + { + "epoch": 314.13, + "learning_rate": 1.9197530864197534e-05, + "loss": 0.0299, + "step": 5890 + }, + { + "epoch": 314.67, + "learning_rate": 1.91358024691358e-05, + "loss": 0.0216, + "step": 5900 + }, + { + "epoch": 314.99, + "eval_accuracy": 0.23, + "eval_loss": 4.895847320556641, + "eval_runtime": 4.6722, + "eval_samples_per_second": 128.42, + "eval_steps_per_second": 4.067, + "step": 5906 + }, + { + "epoch": 315.2, + "learning_rate": 1.9074074074074075e-05, + "loss": 0.0604, + "step": 5910 + }, + { + "epoch": 315.73, + "learning_rate": 1.901234567901235e-05, + "loss": 0.0287, + "step": 5920 + }, + { + "epoch": 316.0, + "eval_accuracy": 0.235, + "eval_loss": 4.880258083343506, + "eval_runtime": 4.6757, + "eval_samples_per_second": 128.323, + "eval_steps_per_second": 4.064, + "step": 5925 + }, + { + "epoch": 316.27, + "learning_rate": 1.8950617283950615e-05, + "loss": 0.0269, + "step": 5930 + }, + { + "epoch": 316.8, + "learning_rate": 1.888888888888889e-05, + "loss": 0.0286, + "step": 5940 + }, + { + "epoch": 316.96, + "eval_accuracy": 0.23, + "eval_loss": 4.86151123046875, + "eval_runtime": 4.6507, + "eval_samples_per_second": 129.012, + "eval_steps_per_second": 4.085, + "step": 5943 + }, + { + "epoch": 317.33, + "learning_rate": 1.8827160493827163e-05, + "loss": 0.0478, + "step": 5950 + }, + { + "epoch": 317.87, + "learning_rate": 1.8765432098765433e-05, + "loss": 0.0304, + "step": 5960 + }, + { + "epoch": 317.97, + "eval_accuracy": 0.23166666666666666, + "eval_loss": 4.873566150665283, + "eval_runtime": 4.6681, + "eval_samples_per_second": 128.532, + "eval_steps_per_second": 4.07, + "step": 5962 + }, + { + "epoch": 318.4, + "learning_rate": 1.8703703703703704e-05, + "loss": 0.0346, + "step": 5970 + }, + { + "epoch": 318.93, + "learning_rate": 1.8641975308641977e-05, + "loss": 0.0486, + "step": 5980 + }, + { + "epoch": 318.99, + "eval_accuracy": 0.22333333333333333, + "eval_loss": 4.882538318634033, + "eval_runtime": 4.6432, + "eval_samples_per_second": 129.222, + "eval_steps_per_second": 4.092, + "step": 5981 + }, + { + "epoch": 319.47, + "learning_rate": 1.8580246913580248e-05, + "loss": 0.0391, + "step": 5990 + }, + { + "epoch": 320.0, + "learning_rate": 1.8518518518518518e-05, + "loss": 0.0404, + "step": 6000 + }, + { + "epoch": 320.0, + "eval_accuracy": 0.22833333333333333, + "eval_loss": 4.861847400665283, + "eval_runtime": 4.6741, + "eval_samples_per_second": 128.368, + "eval_steps_per_second": 4.065, + "step": 6000 + }, + { + "epoch": 320.53, + "learning_rate": 1.8456790123456792e-05, + "loss": 0.0439, + "step": 6010 + }, + { + "epoch": 320.96, + "eval_accuracy": 0.23, + "eval_loss": 4.884802341461182, + "eval_runtime": 4.66, + "eval_samples_per_second": 128.756, + "eval_steps_per_second": 4.077, + "step": 6018 + }, + { + "epoch": 321.07, + "learning_rate": 1.8395061728395062e-05, + "loss": 0.0387, + "step": 6020 + }, + { + "epoch": 321.6, + "learning_rate": 1.8333333333333333e-05, + "loss": 0.0428, + "step": 6030 + }, + { + "epoch": 321.97, + "eval_accuracy": 0.22666666666666666, + "eval_loss": 4.897517681121826, + "eval_runtime": 4.6751, + "eval_samples_per_second": 128.34, + "eval_steps_per_second": 4.064, + "step": 6037 + }, + { + "epoch": 322.13, + "learning_rate": 1.8271604938271607e-05, + "loss": 0.0261, + "step": 6040 + }, + { + "epoch": 322.67, + "learning_rate": 1.8209876543209877e-05, + "loss": 0.0498, + "step": 6050 + }, + { + "epoch": 322.99, + "eval_accuracy": 0.23833333333333334, + "eval_loss": 4.8614420890808105, + "eval_runtime": 4.6819, + "eval_samples_per_second": 128.153, + "eval_steps_per_second": 4.058, + "step": 6056 + }, + { + "epoch": 323.2, + "learning_rate": 1.814814814814815e-05, + "loss": 0.0403, + "step": 6060 + }, + { + "epoch": 323.73, + "learning_rate": 1.808641975308642e-05, + "loss": 0.0314, + "step": 6070 + }, + { + "epoch": 324.0, + "eval_accuracy": 0.235, + "eval_loss": 4.871830463409424, + "eval_runtime": 4.6693, + "eval_samples_per_second": 128.5, + "eval_steps_per_second": 4.069, + "step": 6075 + }, + { + "epoch": 324.27, + "learning_rate": 1.802469135802469e-05, + "loss": 0.0365, + "step": 6080 + }, + { + "epoch": 324.8, + "learning_rate": 1.7962962962962965e-05, + "loss": 0.0334, + "step": 6090 + }, + { + "epoch": 324.96, + "eval_accuracy": 0.23833333333333334, + "eval_loss": 4.902100563049316, + "eval_runtime": 4.7082, + "eval_samples_per_second": 127.436, + "eval_steps_per_second": 4.035, + "step": 6093 + }, + { + "epoch": 325.33, + "learning_rate": 1.7901234567901236e-05, + "loss": 0.0446, + "step": 6100 + }, + { + "epoch": 325.87, + "learning_rate": 1.7839506172839506e-05, + "loss": 0.0431, + "step": 6110 + }, + { + "epoch": 325.97, + "eval_accuracy": 0.22833333333333333, + "eval_loss": 4.897326946258545, + "eval_runtime": 4.7241, + "eval_samples_per_second": 127.008, + "eval_steps_per_second": 4.022, + "step": 6112 + }, + { + "epoch": 326.4, + "learning_rate": 1.777777777777778e-05, + "loss": 0.0316, + "step": 6120 + }, + { + "epoch": 326.93, + "learning_rate": 1.771604938271605e-05, + "loss": 0.0473, + "step": 6130 + }, + { + "epoch": 326.99, + "eval_accuracy": 0.24, + "eval_loss": 4.867129802703857, + "eval_runtime": 4.6999, + "eval_samples_per_second": 127.662, + "eval_steps_per_second": 4.043, + "step": 6131 + }, + { + "epoch": 327.47, + "learning_rate": 1.765432098765432e-05, + "loss": 0.0365, + "step": 6140 + }, + { + "epoch": 328.0, + "learning_rate": 1.7592592592592595e-05, + "loss": 0.0348, + "step": 6150 + }, + { + "epoch": 328.0, + "eval_accuracy": 0.23333333333333334, + "eval_loss": 4.905031204223633, + "eval_runtime": 4.6993, + "eval_samples_per_second": 127.679, + "eval_steps_per_second": 4.043, + "step": 6150 + }, + { + "epoch": 328.53, + "learning_rate": 1.7530864197530865e-05, + "loss": 0.0718, + "step": 6160 + }, + { + "epoch": 328.96, + "eval_accuracy": 0.24166666666666667, + "eval_loss": 4.886887073516846, + "eval_runtime": 4.7046, + "eval_samples_per_second": 127.535, + "eval_steps_per_second": 4.039, + "step": 6168 + }, + { + "epoch": 329.07, + "learning_rate": 1.7469135802469135e-05, + "loss": 0.0418, + "step": 6170 + }, + { + "epoch": 329.6, + "learning_rate": 1.740740740740741e-05, + "loss": 0.0387, + "step": 6180 + }, + { + "epoch": 329.97, + "eval_accuracy": 0.245, + "eval_loss": 4.855226039886475, + "eval_runtime": 4.7051, + "eval_samples_per_second": 127.52, + "eval_steps_per_second": 4.038, + "step": 6187 + }, + { + "epoch": 330.13, + "learning_rate": 1.734567901234568e-05, + "loss": 0.032, + "step": 6190 + }, + { + "epoch": 330.67, + "learning_rate": 1.728395061728395e-05, + "loss": 0.0335, + "step": 6200 + }, + { + "epoch": 330.99, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.893190860748291, + "eval_runtime": 4.7105, + "eval_samples_per_second": 127.376, + "eval_steps_per_second": 4.034, + "step": 6206 + }, + { + "epoch": 331.2, + "learning_rate": 1.7222222222222224e-05, + "loss": 0.0271, + "step": 6210 + }, + { + "epoch": 331.73, + "learning_rate": 1.7160493827160494e-05, + "loss": 0.0355, + "step": 6220 + }, + { + "epoch": 332.0, + "eval_accuracy": 0.245, + "eval_loss": 4.919488906860352, + "eval_runtime": 4.7258, + "eval_samples_per_second": 126.963, + "eval_steps_per_second": 4.02, + "step": 6225 + }, + { + "epoch": 332.27, + "learning_rate": 1.7098765432098765e-05, + "loss": 0.038, + "step": 6230 + }, + { + "epoch": 332.8, + "learning_rate": 1.7037037037037038e-05, + "loss": 0.0407, + "step": 6240 + }, + { + "epoch": 332.96, + "eval_accuracy": 0.23333333333333334, + "eval_loss": 4.9162702560424805, + "eval_runtime": 4.7131, + "eval_samples_per_second": 127.305, + "eval_steps_per_second": 4.031, + "step": 6243 + }, + { + "epoch": 333.33, + "learning_rate": 1.697530864197531e-05, + "loss": 0.0434, + "step": 6250 + }, + { + "epoch": 333.87, + "learning_rate": 1.6913580246913582e-05, + "loss": 0.0471, + "step": 6260 + }, + { + "epoch": 333.97, + "eval_accuracy": 0.225, + "eval_loss": 4.885989189147949, + "eval_runtime": 4.7137, + "eval_samples_per_second": 127.289, + "eval_steps_per_second": 4.031, + "step": 6262 + }, + { + "epoch": 334.4, + "learning_rate": 1.6851851851851853e-05, + "loss": 0.0336, + "step": 6270 + }, + { + "epoch": 334.93, + "learning_rate": 1.6790123456790123e-05, + "loss": 0.0334, + "step": 6280 + }, + { + "epoch": 334.99, + "eval_accuracy": 0.235, + "eval_loss": 4.894328594207764, + "eval_runtime": 4.6988, + "eval_samples_per_second": 127.693, + "eval_steps_per_second": 4.044, + "step": 6281 + }, + { + "epoch": 335.47, + "learning_rate": 1.6728395061728397e-05, + "loss": 0.0441, + "step": 6290 + }, + { + "epoch": 336.0, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.0301, + "step": 6300 + }, + { + "epoch": 336.0, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.922252178192139, + "eval_runtime": 4.7453, + "eval_samples_per_second": 126.441, + "eval_steps_per_second": 4.004, + "step": 6300 + }, + { + "epoch": 336.53, + "learning_rate": 1.6604938271604938e-05, + "loss": 0.0281, + "step": 6310 + }, + { + "epoch": 336.96, + "eval_accuracy": 0.24333333333333335, + "eval_loss": 4.910050392150879, + "eval_runtime": 4.7284, + "eval_samples_per_second": 126.892, + "eval_steps_per_second": 4.018, + "step": 6318 + }, + { + "epoch": 337.07, + "learning_rate": 1.654320987654321e-05, + "loss": 0.0365, + "step": 6320 + }, + { + "epoch": 337.6, + "learning_rate": 1.6481481481481482e-05, + "loss": 0.0305, + "step": 6330 + }, + { + "epoch": 337.97, + "eval_accuracy": 0.24, + "eval_loss": 4.889711856842041, + "eval_runtime": 4.7203, + "eval_samples_per_second": 127.111, + "eval_steps_per_second": 4.025, + "step": 6337 + }, + { + "epoch": 338.13, + "learning_rate": 1.6419753086419752e-05, + "loss": 0.0542, + "step": 6340 + }, + { + "epoch": 338.67, + "learning_rate": 1.6358024691358026e-05, + "loss": 0.0505, + "step": 6350 + }, + { + "epoch": 338.99, + "eval_accuracy": 0.24166666666666667, + "eval_loss": 4.929032802581787, + "eval_runtime": 4.7098, + "eval_samples_per_second": 127.395, + "eval_steps_per_second": 4.034, + "step": 6356 + }, + { + "epoch": 339.2, + "learning_rate": 1.62962962962963e-05, + "loss": 0.0414, + "step": 6360 + }, + { + "epoch": 339.73, + "learning_rate": 1.6234567901234567e-05, + "loss": 0.024, + "step": 6370 + }, + { + "epoch": 340.0, + "eval_accuracy": 0.23333333333333334, + "eval_loss": 4.944223880767822, + "eval_runtime": 4.7276, + "eval_samples_per_second": 126.914, + "eval_steps_per_second": 4.019, + "step": 6375 + }, + { + "epoch": 340.27, + "learning_rate": 1.617283950617284e-05, + "loss": 0.0267, + "step": 6380 + }, + { + "epoch": 340.8, + "learning_rate": 1.6111111111111115e-05, + "loss": 0.0504, + "step": 6390 + }, + { + "epoch": 340.96, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.918275833129883, + "eval_runtime": 4.709, + "eval_samples_per_second": 127.416, + "eval_steps_per_second": 4.035, + "step": 6393 + }, + { + "epoch": 341.33, + "learning_rate": 1.604938271604938e-05, + "loss": 0.0413, + "step": 6400 + }, + { + "epoch": 341.87, + "learning_rate": 1.5987654320987655e-05, + "loss": 0.0259, + "step": 6410 + }, + { + "epoch": 341.97, + "eval_accuracy": 0.235, + "eval_loss": 4.883179187774658, + "eval_runtime": 4.734, + "eval_samples_per_second": 126.742, + "eval_steps_per_second": 4.014, + "step": 6412 + }, + { + "epoch": 342.4, + "learning_rate": 1.5925925925925926e-05, + "loss": 0.0338, + "step": 6420 + }, + { + "epoch": 342.93, + "learning_rate": 1.5864197530864196e-05, + "loss": 0.0313, + "step": 6430 + }, + { + "epoch": 342.99, + "eval_accuracy": 0.23166666666666666, + "eval_loss": 4.895809650421143, + "eval_runtime": 4.7766, + "eval_samples_per_second": 125.612, + "eval_steps_per_second": 3.978, + "step": 6431 + }, + { + "epoch": 343.47, + "learning_rate": 1.580246913580247e-05, + "loss": 0.0213, + "step": 6440 + }, + { + "epoch": 344.0, + "learning_rate": 1.574074074074074e-05, + "loss": 0.0293, + "step": 6450 + }, + { + "epoch": 344.0, + "eval_accuracy": 0.24333333333333335, + "eval_loss": 4.897879123687744, + "eval_runtime": 4.8088, + "eval_samples_per_second": 124.771, + "eval_steps_per_second": 3.951, + "step": 6450 + }, + { + "epoch": 344.53, + "learning_rate": 1.5679012345679014e-05, + "loss": 0.0427, + "step": 6460 + }, + { + "epoch": 344.96, + "eval_accuracy": 0.24166666666666667, + "eval_loss": 4.9055495262146, + "eval_runtime": 4.8414, + "eval_samples_per_second": 123.93, + "eval_steps_per_second": 3.924, + "step": 6468 + }, + { + "epoch": 345.07, + "learning_rate": 1.5617283950617285e-05, + "loss": 0.0344, + "step": 6470 + }, + { + "epoch": 345.6, + "learning_rate": 1.5555555555555555e-05, + "loss": 0.0399, + "step": 6480 + }, + { + "epoch": 345.97, + "eval_accuracy": 0.24333333333333335, + "eval_loss": 4.895743370056152, + "eval_runtime": 4.7399, + "eval_samples_per_second": 126.584, + "eval_steps_per_second": 4.009, + "step": 6487 + }, + { + "epoch": 346.13, + "learning_rate": 1.549382716049383e-05, + "loss": 0.0253, + "step": 6490 + }, + { + "epoch": 346.67, + "learning_rate": 1.54320987654321e-05, + "loss": 0.0273, + "step": 6500 + }, + { + "epoch": 346.99, + "eval_accuracy": 0.24, + "eval_loss": 4.8988518714904785, + "eval_runtime": 4.7852, + "eval_samples_per_second": 125.387, + "eval_steps_per_second": 3.971, + "step": 6506 + }, + { + "epoch": 347.2, + "learning_rate": 1.537037037037037e-05, + "loss": 0.0261, + "step": 6510 + }, + { + "epoch": 347.73, + "learning_rate": 1.5308641975308643e-05, + "loss": 0.0388, + "step": 6520 + }, + { + "epoch": 348.0, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.9087018966674805, + "eval_runtime": 4.7683, + "eval_samples_per_second": 125.83, + "eval_steps_per_second": 3.985, + "step": 6525 + }, + { + "epoch": 348.27, + "learning_rate": 1.5246913580246914e-05, + "loss": 0.0323, + "step": 6530 + }, + { + "epoch": 348.8, + "learning_rate": 1.5185185185185186e-05, + "loss": 0.0306, + "step": 6540 + }, + { + "epoch": 348.96, + "eval_accuracy": 0.22833333333333333, + "eval_loss": 4.926441669464111, + "eval_runtime": 4.7772, + "eval_samples_per_second": 125.598, + "eval_steps_per_second": 3.977, + "step": 6543 + }, + { + "epoch": 349.33, + "learning_rate": 1.5123456790123458e-05, + "loss": 0.0249, + "step": 6550 + }, + { + "epoch": 349.87, + "learning_rate": 1.506172839506173e-05, + "loss": 0.0411, + "step": 6560 + }, + { + "epoch": 349.97, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.921908855438232, + "eval_runtime": 4.7531, + "eval_samples_per_second": 126.234, + "eval_steps_per_second": 3.997, + "step": 6562 + }, + { + "epoch": 350.4, + "learning_rate": 1.5e-05, + "loss": 0.031, + "step": 6570 + }, + { + "epoch": 350.93, + "learning_rate": 1.4938271604938272e-05, + "loss": 0.0394, + "step": 6580 + }, + { + "epoch": 350.99, + "eval_accuracy": 0.24, + "eval_loss": 4.89980936050415, + "eval_runtime": 4.7836, + "eval_samples_per_second": 125.428, + "eval_steps_per_second": 3.972, + "step": 6581 + }, + { + "epoch": 351.47, + "learning_rate": 1.4876543209876545e-05, + "loss": 0.0465, + "step": 6590 + }, + { + "epoch": 352.0, + "learning_rate": 1.4814814814814815e-05, + "loss": 0.0507, + "step": 6600 + }, + { + "epoch": 352.0, + "eval_accuracy": 0.23166666666666666, + "eval_loss": 4.930387496948242, + "eval_runtime": 4.8303, + "eval_samples_per_second": 124.216, + "eval_steps_per_second": 3.934, + "step": 6600 + }, + { + "epoch": 352.53, + "learning_rate": 1.4753086419753087e-05, + "loss": 0.0263, + "step": 6610 + }, + { + "epoch": 352.96, + "eval_accuracy": 0.23, + "eval_loss": 4.923248767852783, + "eval_runtime": 4.822, + "eval_samples_per_second": 124.429, + "eval_steps_per_second": 3.94, + "step": 6618 + }, + { + "epoch": 353.07, + "learning_rate": 1.4691358024691359e-05, + "loss": 0.0355, + "step": 6620 + }, + { + "epoch": 353.6, + "learning_rate": 1.462962962962963e-05, + "loss": 0.0395, + "step": 6630 + }, + { + "epoch": 353.97, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.92411470413208, + "eval_runtime": 4.7975, + "eval_samples_per_second": 125.066, + "eval_steps_per_second": 3.96, + "step": 6637 + }, + { + "epoch": 354.13, + "learning_rate": 1.4567901234567902e-05, + "loss": 0.0258, + "step": 6640 + }, + { + "epoch": 354.67, + "learning_rate": 1.4506172839506174e-05, + "loss": 0.0394, + "step": 6650 + }, + { + "epoch": 354.99, + "eval_accuracy": 0.24333333333333335, + "eval_loss": 4.926273345947266, + "eval_runtime": 4.7995, + "eval_samples_per_second": 125.013, + "eval_steps_per_second": 3.959, + "step": 6656 + }, + { + "epoch": 355.2, + "learning_rate": 1.4444444444444444e-05, + "loss": 0.0345, + "step": 6660 + }, + { + "epoch": 355.73, + "learning_rate": 1.4382716049382716e-05, + "loss": 0.0391, + "step": 6670 + }, + { + "epoch": 356.0, + "eval_accuracy": 0.26, + "eval_loss": 4.927285671234131, + "eval_runtime": 4.819, + "eval_samples_per_second": 124.507, + "eval_steps_per_second": 3.943, + "step": 6675 + }, + { + "epoch": 356.27, + "learning_rate": 1.4320987654320988e-05, + "loss": 0.0274, + "step": 6680 + }, + { + "epoch": 356.8, + "learning_rate": 1.425925925925926e-05, + "loss": 0.0647, + "step": 6690 + }, + { + "epoch": 356.96, + "eval_accuracy": 0.2633333333333333, + "eval_loss": 4.903398513793945, + "eval_runtime": 4.8546, + "eval_samples_per_second": 123.593, + "eval_steps_per_second": 3.914, + "step": 6693 + }, + { + "epoch": 357.33, + "learning_rate": 1.419753086419753e-05, + "loss": 0.0427, + "step": 6700 + }, + { + "epoch": 357.87, + "learning_rate": 1.4135802469135803e-05, + "loss": 0.038, + "step": 6710 + }, + { + "epoch": 357.97, + "eval_accuracy": 0.24666666666666667, + "eval_loss": 4.891026496887207, + "eval_runtime": 4.8918, + "eval_samples_per_second": 122.655, + "eval_steps_per_second": 3.884, + "step": 6712 + }, + { + "epoch": 358.4, + "learning_rate": 1.4074074074074075e-05, + "loss": 0.0254, + "step": 6720 + }, + { + "epoch": 358.93, + "learning_rate": 1.4012345679012345e-05, + "loss": 0.0368, + "step": 6730 + }, + { + "epoch": 358.99, + "eval_accuracy": 0.245, + "eval_loss": 4.883033275604248, + "eval_runtime": 4.8388, + "eval_samples_per_second": 123.997, + "eval_steps_per_second": 3.927, + "step": 6731 + }, + { + "epoch": 359.47, + "learning_rate": 1.3950617283950617e-05, + "loss": 0.0278, + "step": 6740 + }, + { + "epoch": 360.0, + "learning_rate": 1.388888888888889e-05, + "loss": 0.0308, + "step": 6750 + }, + { + "epoch": 360.0, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.886683940887451, + "eval_runtime": 4.8656, + "eval_samples_per_second": 123.316, + "eval_steps_per_second": 3.905, + "step": 6750 + }, + { + "epoch": 360.53, + "learning_rate": 1.382716049382716e-05, + "loss": 0.0346, + "step": 6760 + }, + { + "epoch": 360.96, + "eval_accuracy": 0.24333333333333335, + "eval_loss": 4.86568021774292, + "eval_runtime": 4.8606, + "eval_samples_per_second": 123.442, + "eval_steps_per_second": 3.909, + "step": 6768 + }, + { + "epoch": 361.07, + "learning_rate": 1.3765432098765432e-05, + "loss": 0.0254, + "step": 6770 + }, + { + "epoch": 361.6, + "learning_rate": 1.3703703703703704e-05, + "loss": 0.0279, + "step": 6780 + }, + { + "epoch": 361.97, + "eval_accuracy": 0.24, + "eval_loss": 4.8677825927734375, + "eval_runtime": 4.8214, + "eval_samples_per_second": 124.446, + "eval_steps_per_second": 3.941, + "step": 6787 + }, + { + "epoch": 362.13, + "learning_rate": 1.3641975308641978e-05, + "loss": 0.0367, + "step": 6790 + }, + { + "epoch": 362.67, + "learning_rate": 1.3580246913580247e-05, + "loss": 0.0443, + "step": 6800 + }, + { + "epoch": 362.99, + "eval_accuracy": 0.24333333333333335, + "eval_loss": 4.872296333312988, + "eval_runtime": 4.8453, + "eval_samples_per_second": 123.832, + "eval_steps_per_second": 3.921, + "step": 6806 + }, + { + "epoch": 363.2, + "learning_rate": 1.3518518518518519e-05, + "loss": 0.0224, + "step": 6810 + }, + { + "epoch": 363.73, + "learning_rate": 1.3456790123456793e-05, + "loss": 0.027, + "step": 6820 + }, + { + "epoch": 364.0, + "eval_accuracy": 0.24333333333333335, + "eval_loss": 4.875555515289307, + "eval_runtime": 4.8571, + "eval_samples_per_second": 123.529, + "eval_steps_per_second": 3.912, + "step": 6825 + }, + { + "epoch": 364.27, + "learning_rate": 1.3395061728395061e-05, + "loss": 0.0394, + "step": 6830 + }, + { + "epoch": 364.8, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.0447, + "step": 6840 + }, + { + "epoch": 364.96, + "eval_accuracy": 0.235, + "eval_loss": 4.874227046966553, + "eval_runtime": 4.8623, + "eval_samples_per_second": 123.397, + "eval_steps_per_second": 3.908, + "step": 6843 + }, + { + "epoch": 365.33, + "learning_rate": 1.3271604938271605e-05, + "loss": 0.0402, + "step": 6850 + }, + { + "epoch": 365.87, + "learning_rate": 1.3209876543209876e-05, + "loss": 0.028, + "step": 6860 + }, + { + "epoch": 365.97, + "eval_accuracy": 0.235, + "eval_loss": 4.904233455657959, + "eval_runtime": 4.8676, + "eval_samples_per_second": 123.264, + "eval_steps_per_second": 3.903, + "step": 6862 + }, + { + "epoch": 366.4, + "learning_rate": 1.3148148148148148e-05, + "loss": 0.0306, + "step": 6870 + }, + { + "epoch": 366.93, + "learning_rate": 1.308641975308642e-05, + "loss": 0.0483, + "step": 6880 + }, + { + "epoch": 366.99, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.908579349517822, + "eval_runtime": 4.8554, + "eval_samples_per_second": 123.573, + "eval_steps_per_second": 3.913, + "step": 6881 + }, + { + "epoch": 367.47, + "learning_rate": 1.3024691358024694e-05, + "loss": 0.0321, + "step": 6890 + }, + { + "epoch": 368.0, + "learning_rate": 1.2962962962962962e-05, + "loss": 0.034, + "step": 6900 + }, + { + "epoch": 368.0, + "eval_accuracy": 0.24, + "eval_loss": 4.888582229614258, + "eval_runtime": 4.9075, + "eval_samples_per_second": 122.261, + "eval_steps_per_second": 3.872, + "step": 6900 + }, + { + "epoch": 368.53, + "learning_rate": 1.2901234567901235e-05, + "loss": 0.0363, + "step": 6910 + }, + { + "epoch": 368.96, + "eval_accuracy": 0.24666666666666667, + "eval_loss": 4.877806186676025, + "eval_runtime": 4.9146, + "eval_samples_per_second": 122.085, + "eval_steps_per_second": 3.866, + "step": 6918 + }, + { + "epoch": 369.07, + "learning_rate": 1.2839506172839508e-05, + "loss": 0.0401, + "step": 6920 + }, + { + "epoch": 369.6, + "learning_rate": 1.2777777777777777e-05, + "loss": 0.0417, + "step": 6930 + }, + { + "epoch": 369.97, + "eval_accuracy": 0.24166666666666667, + "eval_loss": 4.905084609985352, + "eval_runtime": 4.8859, + "eval_samples_per_second": 122.802, + "eval_steps_per_second": 3.889, + "step": 6937 + }, + { + "epoch": 370.13, + "learning_rate": 1.2716049382716049e-05, + "loss": 0.0338, + "step": 6940 + }, + { + "epoch": 370.67, + "learning_rate": 1.2654320987654323e-05, + "loss": 0.0326, + "step": 6950 + }, + { + "epoch": 370.99, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.911184310913086, + "eval_runtime": 4.8779, + "eval_samples_per_second": 123.003, + "eval_steps_per_second": 3.895, + "step": 6956 + }, + { + "epoch": 371.2, + "learning_rate": 1.2592592592592592e-05, + "loss": 0.0261, + "step": 6960 + }, + { + "epoch": 371.73, + "learning_rate": 1.2530864197530864e-05, + "loss": 0.028, + "step": 6970 + }, + { + "epoch": 372.0, + "eval_accuracy": 0.23333333333333334, + "eval_loss": 4.911579608917236, + "eval_runtime": 4.906, + "eval_samples_per_second": 122.299, + "eval_steps_per_second": 3.873, + "step": 6975 + }, + { + "epoch": 372.27, + "learning_rate": 1.2469135802469137e-05, + "loss": 0.0196, + "step": 6980 + }, + { + "epoch": 372.8, + "learning_rate": 1.2407407407407408e-05, + "loss": 0.0343, + "step": 6990 + }, + { + "epoch": 372.96, + "eval_accuracy": 0.245, + "eval_loss": 4.910400390625, + "eval_runtime": 4.8963, + "eval_samples_per_second": 122.541, + "eval_steps_per_second": 3.88, + "step": 6993 + }, + { + "epoch": 373.33, + "learning_rate": 1.2345679012345678e-05, + "loss": 0.0246, + "step": 7000 + }, + { + "epoch": 373.87, + "learning_rate": 1.2283950617283952e-05, + "loss": 0.0229, + "step": 7010 + }, + { + "epoch": 373.97, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.940114974975586, + "eval_runtime": 4.8862, + "eval_samples_per_second": 122.794, + "eval_steps_per_second": 3.888, + "step": 7012 + }, + { + "epoch": 374.4, + "learning_rate": 1.2222222222222222e-05, + "loss": 0.0235, + "step": 7020 + }, + { + "epoch": 374.93, + "learning_rate": 1.2160493827160495e-05, + "loss": 0.0337, + "step": 7030 + }, + { + "epoch": 374.99, + "eval_accuracy": 0.245, + "eval_loss": 4.934114933013916, + "eval_runtime": 4.9143, + "eval_samples_per_second": 122.092, + "eval_steps_per_second": 3.866, + "step": 7031 + }, + { + "epoch": 375.47, + "learning_rate": 1.2098765432098767e-05, + "loss": 0.0334, + "step": 7040 + }, + { + "epoch": 376.0, + "learning_rate": 1.2037037037037037e-05, + "loss": 0.0356, + "step": 7050 + }, + { + "epoch": 376.0, + "eval_accuracy": 0.23166666666666666, + "eval_loss": 4.933629035949707, + "eval_runtime": 4.9706, + "eval_samples_per_second": 120.709, + "eval_steps_per_second": 3.822, + "step": 7050 + }, + { + "epoch": 376.53, + "learning_rate": 1.1975308641975309e-05, + "loss": 0.029, + "step": 7060 + }, + { + "epoch": 376.96, + "eval_accuracy": 0.23333333333333334, + "eval_loss": 4.913231372833252, + "eval_runtime": 4.888, + "eval_samples_per_second": 122.75, + "eval_steps_per_second": 3.887, + "step": 7068 + }, + { + "epoch": 377.07, + "learning_rate": 1.1913580246913581e-05, + "loss": 0.0348, + "step": 7070 + }, + { + "epoch": 377.6, + "learning_rate": 1.1851851851851853e-05, + "loss": 0.0272, + "step": 7080 + }, + { + "epoch": 377.97, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.910186290740967, + "eval_runtime": 4.9298, + "eval_samples_per_second": 121.709, + "eval_steps_per_second": 3.854, + "step": 7087 + }, + { + "epoch": 378.13, + "learning_rate": 1.1790123456790124e-05, + "loss": 0.0269, + "step": 7090 + }, + { + "epoch": 378.67, + "learning_rate": 1.1728395061728396e-05, + "loss": 0.0256, + "step": 7100 + }, + { + "epoch": 378.99, + "eval_accuracy": 0.23166666666666666, + "eval_loss": 4.925504207611084, + "eval_runtime": 4.948, + "eval_samples_per_second": 121.26, + "eval_steps_per_second": 3.84, + "step": 7106 + }, + { + "epoch": 379.2, + "learning_rate": 1.1666666666666668e-05, + "loss": 0.0315, + "step": 7110 + }, + { + "epoch": 379.73, + "learning_rate": 1.1604938271604938e-05, + "loss": 0.0276, + "step": 7120 + }, + { + "epoch": 380.0, + "eval_accuracy": 0.22666666666666666, + "eval_loss": 4.928166389465332, + "eval_runtime": 4.9278, + "eval_samples_per_second": 121.757, + "eval_steps_per_second": 3.856, + "step": 7125 + }, + { + "epoch": 380.27, + "learning_rate": 1.154320987654321e-05, + "loss": 0.0286, + "step": 7130 + }, + { + "epoch": 380.8, + "learning_rate": 1.1481481481481482e-05, + "loss": 0.026, + "step": 7140 + }, + { + "epoch": 380.96, + "eval_accuracy": 0.22, + "eval_loss": 4.952660083770752, + "eval_runtime": 4.9502, + "eval_samples_per_second": 121.208, + "eval_steps_per_second": 3.838, + "step": 7143 + }, + { + "epoch": 381.33, + "learning_rate": 1.1419753086419753e-05, + "loss": 0.031, + "step": 7150 + }, + { + "epoch": 381.87, + "learning_rate": 1.1358024691358025e-05, + "loss": 0.0385, + "step": 7160 + }, + { + "epoch": 381.97, + "eval_accuracy": 0.22166666666666668, + "eval_loss": 4.941068649291992, + "eval_runtime": 4.9601, + "eval_samples_per_second": 120.966, + "eval_steps_per_second": 3.831, + "step": 7162 + }, + { + "epoch": 382.4, + "learning_rate": 1.1296296296296297e-05, + "loss": 0.0269, + "step": 7170 + }, + { + "epoch": 382.93, + "learning_rate": 1.123456790123457e-05, + "loss": 0.026, + "step": 7180 + }, + { + "epoch": 382.99, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.952951908111572, + "eval_runtime": 4.9497, + "eval_samples_per_second": 121.22, + "eval_steps_per_second": 3.839, + "step": 7181 + }, + { + "epoch": 383.47, + "learning_rate": 1.117283950617284e-05, + "loss": 0.0262, + "step": 7190 + }, + { + "epoch": 384.0, + "learning_rate": 1.1111111111111112e-05, + "loss": 0.0444, + "step": 7200 + }, + { + "epoch": 384.0, + "eval_accuracy": 0.23833333333333334, + "eval_loss": 4.9387335777282715, + "eval_runtime": 4.9515, + "eval_samples_per_second": 121.176, + "eval_steps_per_second": 3.837, + "step": 7200 + }, + { + "epoch": 384.53, + "learning_rate": 1.1049382716049384e-05, + "loss": 0.0369, + "step": 7210 + }, + { + "epoch": 384.96, + "eval_accuracy": 0.23333333333333334, + "eval_loss": 4.904226303100586, + "eval_runtime": 4.9498, + "eval_samples_per_second": 121.217, + "eval_steps_per_second": 3.839, + "step": 7218 + }, + { + "epoch": 385.07, + "learning_rate": 1.0987654320987654e-05, + "loss": 0.0258, + "step": 7220 + }, + { + "epoch": 385.6, + "learning_rate": 1.0925925925925926e-05, + "loss": 0.0203, + "step": 7230 + }, + { + "epoch": 385.97, + "eval_accuracy": 0.23, + "eval_loss": 4.886034965515137, + "eval_runtime": 4.9753, + "eval_samples_per_second": 120.596, + "eval_steps_per_second": 3.819, + "step": 7237 + }, + { + "epoch": 386.13, + "learning_rate": 1.0864197530864198e-05, + "loss": 0.0332, + "step": 7240 + }, + { + "epoch": 386.67, + "learning_rate": 1.0802469135802469e-05, + "loss": 0.0238, + "step": 7250 + }, + { + "epoch": 386.99, + "eval_accuracy": 0.23333333333333334, + "eval_loss": 4.877529144287109, + "eval_runtime": 4.9631, + "eval_samples_per_second": 120.891, + "eval_steps_per_second": 3.828, + "step": 7256 + }, + { + "epoch": 387.2, + "learning_rate": 1.074074074074074e-05, + "loss": 0.0266, + "step": 7260 + }, + { + "epoch": 387.73, + "learning_rate": 1.0679012345679013e-05, + "loss": 0.0315, + "step": 7270 + }, + { + "epoch": 388.0, + "eval_accuracy": 0.23333333333333334, + "eval_loss": 4.864088535308838, + "eval_runtime": 4.9534, + "eval_samples_per_second": 121.128, + "eval_steps_per_second": 3.836, + "step": 7275 + }, + { + "epoch": 388.27, + "learning_rate": 1.0617283950617285e-05, + "loss": 0.0208, + "step": 7280 + }, + { + "epoch": 388.8, + "learning_rate": 1.0555555555555555e-05, + "loss": 0.0349, + "step": 7290 + }, + { + "epoch": 388.96, + "eval_accuracy": 0.24666666666666667, + "eval_loss": 4.86765718460083, + "eval_runtime": 4.9642, + "eval_samples_per_second": 120.866, + "eval_steps_per_second": 3.827, + "step": 7293 + }, + { + "epoch": 389.33, + "learning_rate": 1.0493827160493827e-05, + "loss": 0.0336, + "step": 7300 + }, + { + "epoch": 389.87, + "learning_rate": 1.04320987654321e-05, + "loss": 0.038, + "step": 7310 + }, + { + "epoch": 389.97, + "eval_accuracy": 0.24, + "eval_loss": 4.868815898895264, + "eval_runtime": 4.9771, + "eval_samples_per_second": 120.552, + "eval_steps_per_second": 3.817, + "step": 7312 + }, + { + "epoch": 390.4, + "learning_rate": 1.037037037037037e-05, + "loss": 0.0188, + "step": 7320 + }, + { + "epoch": 390.93, + "learning_rate": 1.0308641975308644e-05, + "loss": 0.0301, + "step": 7330 + }, + { + "epoch": 390.99, + "eval_accuracy": 0.245, + "eval_loss": 4.8932037353515625, + "eval_runtime": 4.9575, + "eval_samples_per_second": 121.03, + "eval_steps_per_second": 3.833, + "step": 7331 + }, + { + "epoch": 391.47, + "learning_rate": 1.0246913580246914e-05, + "loss": 0.0418, + "step": 7340 + }, + { + "epoch": 392.0, + "learning_rate": 1.0185185185185185e-05, + "loss": 0.0363, + "step": 7350 + }, + { + "epoch": 392.0, + "eval_accuracy": 0.24166666666666667, + "eval_loss": 4.902304172515869, + "eval_runtime": 4.9867, + "eval_samples_per_second": 120.321, + "eval_steps_per_second": 3.81, + "step": 7350 + }, + { + "epoch": 392.53, + "learning_rate": 1.0123456790123458e-05, + "loss": 0.0329, + "step": 7360 + }, + { + "epoch": 392.96, + "eval_accuracy": 0.24, + "eval_loss": 4.882464408874512, + "eval_runtime": 4.9782, + "eval_samples_per_second": 120.525, + "eval_steps_per_second": 3.817, + "step": 7368 + }, + { + "epoch": 393.07, + "learning_rate": 1.0061728395061729e-05, + "loss": 0.0255, + "step": 7370 + }, + { + "epoch": 393.6, + "learning_rate": 1e-05, + "loss": 0.0174, + "step": 7380 + }, + { + "epoch": 393.97, + "eval_accuracy": 0.24, + "eval_loss": 4.87109375, + "eval_runtime": 4.9821, + "eval_samples_per_second": 120.431, + "eval_steps_per_second": 3.814, + "step": 7387 + }, + { + "epoch": 394.13, + "learning_rate": 9.938271604938273e-06, + "loss": 0.0256, + "step": 7390 + }, + { + "epoch": 394.67, + "learning_rate": 9.876543209876543e-06, + "loss": 0.0284, + "step": 7400 + }, + { + "epoch": 394.99, + "eval_accuracy": 0.24333333333333335, + "eval_loss": 4.876201152801514, + "eval_runtime": 5.0563, + "eval_samples_per_second": 118.663, + "eval_steps_per_second": 3.758, + "step": 7406 + }, + { + "epoch": 395.2, + "learning_rate": 9.814814814814815e-06, + "loss": 0.0458, + "step": 7410 + }, + { + "epoch": 395.73, + "learning_rate": 9.753086419753086e-06, + "loss": 0.0178, + "step": 7420 + }, + { + "epoch": 396.0, + "eval_accuracy": 0.24166666666666667, + "eval_loss": 4.868426322937012, + "eval_runtime": 5.0279, + "eval_samples_per_second": 119.333, + "eval_steps_per_second": 3.779, + "step": 7425 + }, + { + "epoch": 396.27, + "learning_rate": 9.69135802469136e-06, + "loss": 0.0154, + "step": 7430 + }, + { + "epoch": 396.8, + "learning_rate": 9.62962962962963e-06, + "loss": 0.0359, + "step": 7440 + }, + { + "epoch": 396.96, + "eval_accuracy": 0.245, + "eval_loss": 4.865981578826904, + "eval_runtime": 5.0157, + "eval_samples_per_second": 119.625, + "eval_steps_per_second": 3.788, + "step": 7443 + }, + { + "epoch": 397.33, + "learning_rate": 9.5679012345679e-06, + "loss": 0.0295, + "step": 7450 + }, + { + "epoch": 397.87, + "learning_rate": 9.506172839506174e-06, + "loss": 0.029, + "step": 7460 + }, + { + "epoch": 397.97, + "eval_accuracy": 0.24333333333333335, + "eval_loss": 4.879904270172119, + "eval_runtime": 5.0461, + "eval_samples_per_second": 118.904, + "eval_steps_per_second": 3.765, + "step": 7462 + }, + { + "epoch": 398.4, + "learning_rate": 9.444444444444445e-06, + "loss": 0.0368, + "step": 7470 + }, + { + "epoch": 398.93, + "learning_rate": 9.382716049382717e-06, + "loss": 0.0227, + "step": 7480 + }, + { + "epoch": 398.99, + "eval_accuracy": 0.25, + "eval_loss": 4.884500503540039, + "eval_runtime": 5.007, + "eval_samples_per_second": 119.833, + "eval_steps_per_second": 3.795, + "step": 7481 + }, + { + "epoch": 399.47, + "learning_rate": 9.320987654320989e-06, + "loss": 0.0338, + "step": 7490 + }, + { + "epoch": 400.0, + "learning_rate": 9.259259259259259e-06, + "loss": 0.0135, + "step": 7500 + }, + { + "epoch": 400.0, + "eval_accuracy": 0.23833333333333334, + "eval_loss": 4.889830112457275, + "eval_runtime": 5.0086, + "eval_samples_per_second": 119.793, + "eval_steps_per_second": 3.793, + "step": 7500 + }, + { + "epoch": 400.53, + "learning_rate": 9.197530864197531e-06, + "loss": 0.0297, + "step": 7510 + }, + { + "epoch": 400.96, + "eval_accuracy": 0.23833333333333334, + "eval_loss": 4.896719455718994, + "eval_runtime": 5.0281, + "eval_samples_per_second": 119.329, + "eval_steps_per_second": 3.779, + "step": 7518 + }, + { + "epoch": 401.07, + "learning_rate": 9.135802469135803e-06, + "loss": 0.0364, + "step": 7520 + }, + { + "epoch": 401.6, + "learning_rate": 9.074074074074075e-06, + "loss": 0.0263, + "step": 7530 + }, + { + "epoch": 401.97, + "eval_accuracy": 0.23333333333333334, + "eval_loss": 4.888412952423096, + "eval_runtime": 5.0155, + "eval_samples_per_second": 119.63, + "eval_steps_per_second": 3.788, + "step": 7537 + }, + { + "epoch": 402.13, + "learning_rate": 9.012345679012346e-06, + "loss": 0.035, + "step": 7540 + }, + { + "epoch": 402.67, + "learning_rate": 8.950617283950618e-06, + "loss": 0.0386, + "step": 7550 + }, + { + "epoch": 402.99, + "eval_accuracy": 0.24, + "eval_loss": 4.871886730194092, + "eval_runtime": 4.9851, + "eval_samples_per_second": 120.358, + "eval_steps_per_second": 3.811, + "step": 7556 + }, + { + "epoch": 403.2, + "learning_rate": 8.88888888888889e-06, + "loss": 0.0339, + "step": 7560 + }, + { + "epoch": 403.73, + "learning_rate": 8.82716049382716e-06, + "loss": 0.0298, + "step": 7570 + }, + { + "epoch": 404.0, + "eval_accuracy": 0.24333333333333335, + "eval_loss": 4.860945701599121, + "eval_runtime": 5.0553, + "eval_samples_per_second": 118.688, + "eval_steps_per_second": 3.758, + "step": 7575 + }, + { + "epoch": 404.27, + "learning_rate": 8.765432098765432e-06, + "loss": 0.0318, + "step": 7580 + }, + { + "epoch": 404.8, + "learning_rate": 8.703703703703705e-06, + "loss": 0.0232, + "step": 7590 + }, + { + "epoch": 404.96, + "eval_accuracy": 0.24833333333333332, + "eval_loss": 4.860249996185303, + "eval_runtime": 5.0445, + "eval_samples_per_second": 118.942, + "eval_steps_per_second": 3.767, + "step": 7593 + }, + { + "epoch": 405.33, + "learning_rate": 8.641975308641975e-06, + "loss": 0.0367, + "step": 7600 + }, + { + "epoch": 405.87, + "learning_rate": 8.580246913580247e-06, + "loss": 0.0232, + "step": 7610 + }, + { + "epoch": 405.97, + "eval_accuracy": 0.24666666666666667, + "eval_loss": 4.866739749908447, + "eval_runtime": 5.0222, + "eval_samples_per_second": 119.47, + "eval_steps_per_second": 3.783, + "step": 7612 + }, + { + "epoch": 406.4, + "learning_rate": 8.518518518518519e-06, + "loss": 0.0223, + "step": 7620 + }, + { + "epoch": 406.93, + "learning_rate": 8.456790123456791e-06, + "loss": 0.032, + "step": 7630 + }, + { + "epoch": 406.99, + "eval_accuracy": 0.24833333333333332, + "eval_loss": 4.8684258460998535, + "eval_runtime": 5.0294, + "eval_samples_per_second": 119.298, + "eval_steps_per_second": 3.778, + "step": 7631 + }, + { + "epoch": 407.47, + "learning_rate": 8.395061728395062e-06, + "loss": 0.0272, + "step": 7640 + }, + { + "epoch": 408.0, + "learning_rate": 8.333333333333334e-06, + "loss": 0.0306, + "step": 7650 + }, + { + "epoch": 408.0, + "eval_accuracy": 0.24333333333333335, + "eval_loss": 4.875509262084961, + "eval_runtime": 5.0495, + "eval_samples_per_second": 118.823, + "eval_steps_per_second": 3.763, + "step": 7650 + }, + { + "epoch": 408.53, + "learning_rate": 8.271604938271606e-06, + "loss": 0.0299, + "step": 7660 + }, + { + "epoch": 408.96, + "eval_accuracy": 0.245, + "eval_loss": 4.868679046630859, + "eval_runtime": 5.094, + "eval_samples_per_second": 117.785, + "eval_steps_per_second": 3.73, + "step": 7668 + }, + { + "epoch": 409.07, + "learning_rate": 8.209876543209876e-06, + "loss": 0.0402, + "step": 7670 + }, + { + "epoch": 409.6, + "learning_rate": 8.14814814814815e-06, + "loss": 0.0307, + "step": 7680 + }, + { + "epoch": 409.97, + "eval_accuracy": 0.24, + "eval_loss": 4.872376441955566, + "eval_runtime": 5.0699, + "eval_samples_per_second": 118.345, + "eval_steps_per_second": 3.748, + "step": 7687 + }, + { + "epoch": 410.13, + "learning_rate": 8.08641975308642e-06, + "loss": 0.0315, + "step": 7690 + }, + { + "epoch": 410.67, + "learning_rate": 8.02469135802469e-06, + "loss": 0.0304, + "step": 7700 + }, + { + "epoch": 410.99, + "eval_accuracy": 0.25, + "eval_loss": 4.879815101623535, + "eval_runtime": 5.0824, + "eval_samples_per_second": 118.054, + "eval_steps_per_second": 3.738, + "step": 7706 + }, + { + "epoch": 411.2, + "learning_rate": 7.962962962962963e-06, + "loss": 0.039, + "step": 7710 + }, + { + "epoch": 411.73, + "learning_rate": 7.901234567901235e-06, + "loss": 0.0293, + "step": 7720 + }, + { + "epoch": 412.0, + "eval_accuracy": 0.24833333333333332, + "eval_loss": 4.890056133270264, + "eval_runtime": 5.1118, + "eval_samples_per_second": 117.376, + "eval_steps_per_second": 3.717, + "step": 7725 + }, + { + "epoch": 412.27, + "learning_rate": 7.839506172839507e-06, + "loss": 0.0276, + "step": 7730 + }, + { + "epoch": 412.8, + "learning_rate": 7.777777777777777e-06, + "loss": 0.0273, + "step": 7740 + }, + { + "epoch": 412.96, + "eval_accuracy": 0.24, + "eval_loss": 4.902527332305908, + "eval_runtime": 5.0871, + "eval_samples_per_second": 117.946, + "eval_steps_per_second": 3.735, + "step": 7743 + }, + { + "epoch": 413.33, + "learning_rate": 7.71604938271605e-06, + "loss": 0.0334, + "step": 7750 + }, + { + "epoch": 413.87, + "learning_rate": 7.654320987654322e-06, + "loss": 0.0184, + "step": 7760 + }, + { + "epoch": 413.97, + "eval_accuracy": 0.24, + "eval_loss": 4.886964321136475, + "eval_runtime": 5.0738, + "eval_samples_per_second": 118.254, + "eval_steps_per_second": 3.745, + "step": 7762 + }, + { + "epoch": 414.4, + "learning_rate": 7.592592592592593e-06, + "loss": 0.0227, + "step": 7770 + }, + { + "epoch": 414.93, + "learning_rate": 7.530864197530865e-06, + "loss": 0.0377, + "step": 7780 + }, + { + "epoch": 414.99, + "eval_accuracy": 0.24166666666666667, + "eval_loss": 4.890088081359863, + "eval_runtime": 5.0985, + "eval_samples_per_second": 117.682, + "eval_steps_per_second": 3.727, + "step": 7781 + }, + { + "epoch": 415.47, + "learning_rate": 7.469135802469136e-06, + "loss": 0.0284, + "step": 7790 + }, + { + "epoch": 416.0, + "learning_rate": 7.4074074074074075e-06, + "loss": 0.0278, + "step": 7800 + }, + { + "epoch": 416.0, + "eval_accuracy": 0.24166666666666667, + "eval_loss": 4.889472484588623, + "eval_runtime": 5.0982, + "eval_samples_per_second": 117.689, + "eval_steps_per_second": 3.727, + "step": 7800 + }, + { + "epoch": 416.53, + "learning_rate": 7.3456790123456796e-06, + "loss": 0.0345, + "step": 7810 + }, + { + "epoch": 416.96, + "eval_accuracy": 0.25333333333333335, + "eval_loss": 4.904553413391113, + "eval_runtime": 5.0835, + "eval_samples_per_second": 118.028, + "eval_steps_per_second": 3.738, + "step": 7818 + }, + { + "epoch": 417.07, + "learning_rate": 7.283950617283951e-06, + "loss": 0.0376, + "step": 7820 + }, + { + "epoch": 417.6, + "learning_rate": 7.222222222222222e-06, + "loss": 0.0301, + "step": 7830 + }, + { + "epoch": 417.97, + "eval_accuracy": 0.24833333333333332, + "eval_loss": 4.900204181671143, + "eval_runtime": 5.1232, + "eval_samples_per_second": 117.114, + "eval_steps_per_second": 3.709, + "step": 7837 + }, + { + "epoch": 418.13, + "learning_rate": 7.160493827160494e-06, + "loss": 0.0204, + "step": 7840 + }, + { + "epoch": 418.67, + "learning_rate": 7.098765432098765e-06, + "loss": 0.0159, + "step": 7850 + }, + { + "epoch": 418.99, + "eval_accuracy": 0.245, + "eval_loss": 4.89817476272583, + "eval_runtime": 5.1276, + "eval_samples_per_second": 117.015, + "eval_steps_per_second": 3.705, + "step": 7856 + }, + { + "epoch": 419.2, + "learning_rate": 7.0370370370370375e-06, + "loss": 0.0217, + "step": 7860 + }, + { + "epoch": 419.73, + "learning_rate": 6.975308641975309e-06, + "loss": 0.0203, + "step": 7870 + }, + { + "epoch": 420.0, + "eval_accuracy": 0.24833333333333332, + "eval_loss": 4.900780200958252, + "eval_runtime": 5.1291, + "eval_samples_per_second": 116.98, + "eval_steps_per_second": 3.704, + "step": 7875 + }, + { + "epoch": 420.27, + "learning_rate": 6.91358024691358e-06, + "loss": 0.0295, + "step": 7880 + }, + { + "epoch": 420.8, + "learning_rate": 6.851851851851852e-06, + "loss": 0.0182, + "step": 7890 + }, + { + "epoch": 420.96, + "eval_accuracy": 0.24666666666666667, + "eval_loss": 4.9113335609436035, + "eval_runtime": 5.1351, + "eval_samples_per_second": 116.842, + "eval_steps_per_second": 3.7, + "step": 7893 + }, + { + "epoch": 421.33, + "learning_rate": 6.790123456790123e-06, + "loss": 0.0321, + "step": 7900 + }, + { + "epoch": 421.87, + "learning_rate": 6.728395061728396e-06, + "loss": 0.0258, + "step": 7910 + }, + { + "epoch": 421.97, + "eval_accuracy": 0.25, + "eval_loss": 4.918017387390137, + "eval_runtime": 5.1337, + "eval_samples_per_second": 116.875, + "eval_steps_per_second": 3.701, + "step": 7912 + }, + { + "epoch": 422.4, + "learning_rate": 6.666666666666667e-06, + "loss": 0.0277, + "step": 7920 + }, + { + "epoch": 422.93, + "learning_rate": 6.604938271604938e-06, + "loss": 0.0266, + "step": 7930 + }, + { + "epoch": 422.99, + "eval_accuracy": 0.24333333333333335, + "eval_loss": 4.913443565368652, + "eval_runtime": 5.1296, + "eval_samples_per_second": 116.969, + "eval_steps_per_second": 3.704, + "step": 7931 + }, + { + "epoch": 423.47, + "learning_rate": 6.54320987654321e-06, + "loss": 0.037, + "step": 7940 + }, + { + "epoch": 424.0, + "learning_rate": 6.481481481481481e-06, + "loss": 0.0304, + "step": 7950 + }, + { + "epoch": 424.0, + "eval_accuracy": 0.24166666666666667, + "eval_loss": 4.900519847869873, + "eval_runtime": 5.1632, + "eval_samples_per_second": 116.207, + "eval_steps_per_second": 3.68, + "step": 7950 + }, + { + "epoch": 424.53, + "learning_rate": 6.419753086419754e-06, + "loss": 0.0247, + "step": 7960 + }, + { + "epoch": 424.96, + "eval_accuracy": 0.24166666666666667, + "eval_loss": 4.893669605255127, + "eval_runtime": 5.2111, + "eval_samples_per_second": 115.139, + "eval_steps_per_second": 3.646, + "step": 7968 + }, + { + "epoch": 425.07, + "learning_rate": 6.3580246913580246e-06, + "loss": 0.0335, + "step": 7970 + }, + { + "epoch": 425.6, + "learning_rate": 6.296296296296296e-06, + "loss": 0.0493, + "step": 7980 + }, + { + "epoch": 425.97, + "eval_accuracy": 0.245, + "eval_loss": 4.883533954620361, + "eval_runtime": 5.1746, + "eval_samples_per_second": 115.951, + "eval_steps_per_second": 3.672, + "step": 7987 + }, + { + "epoch": 426.13, + "learning_rate": 6.234567901234569e-06, + "loss": 0.0157, + "step": 7990 + }, + { + "epoch": 426.67, + "learning_rate": 6.172839506172839e-06, + "loss": 0.0286, + "step": 8000 + }, + { + "epoch": 426.99, + "eval_accuracy": 0.24, + "eval_loss": 4.896754741668701, + "eval_runtime": 5.1903, + "eval_samples_per_second": 115.6, + "eval_steps_per_second": 3.661, + "step": 8006 + }, + { + "epoch": 427.2, + "learning_rate": 6.111111111111111e-06, + "loss": 0.0249, + "step": 8010 + }, + { + "epoch": 427.73, + "learning_rate": 6.049382716049383e-06, + "loss": 0.0228, + "step": 8020 + }, + { + "epoch": 428.0, + "eval_accuracy": 0.23833333333333334, + "eval_loss": 4.906573295593262, + "eval_runtime": 5.2526, + "eval_samples_per_second": 114.23, + "eval_steps_per_second": 3.617, + "step": 8025 + }, + { + "epoch": 428.27, + "learning_rate": 5.9876543209876546e-06, + "loss": 0.0324, + "step": 8030 + }, + { + "epoch": 428.8, + "learning_rate": 5.925925925925927e-06, + "loss": 0.0362, + "step": 8040 + }, + { + "epoch": 428.96, + "eval_accuracy": 0.245, + "eval_loss": 4.903099536895752, + "eval_runtime": 5.2207, + "eval_samples_per_second": 114.927, + "eval_steps_per_second": 3.639, + "step": 8043 + }, + { + "epoch": 429.33, + "learning_rate": 5.864197530864198e-06, + "loss": 0.0213, + "step": 8050 + }, + { + "epoch": 429.87, + "learning_rate": 5.802469135802469e-06, + "loss": 0.0244, + "step": 8060 + }, + { + "epoch": 429.97, + "eval_accuracy": 0.24666666666666667, + "eval_loss": 4.899692535400391, + "eval_runtime": 5.2551, + "eval_samples_per_second": 114.174, + "eval_steps_per_second": 3.616, + "step": 8062 + }, + { + "epoch": 430.4, + "learning_rate": 5.740740740740741e-06, + "loss": 0.0405, + "step": 8070 + }, + { + "epoch": 430.93, + "learning_rate": 5.6790123456790125e-06, + "loss": 0.0204, + "step": 8080 + }, + { + "epoch": 430.99, + "eval_accuracy": 0.24333333333333335, + "eval_loss": 4.905935287475586, + "eval_runtime": 5.2136, + "eval_samples_per_second": 115.084, + "eval_steps_per_second": 3.644, + "step": 8081 + }, + { + "epoch": 431.47, + "learning_rate": 5.617283950617285e-06, + "loss": 0.0175, + "step": 8090 + }, + { + "epoch": 432.0, + "learning_rate": 5.555555555555556e-06, + "loss": 0.0344, + "step": 8100 + }, + { + "epoch": 432.0, + "eval_accuracy": 0.24333333333333335, + "eval_loss": 4.905205726623535, + "eval_runtime": 5.2045, + "eval_samples_per_second": 115.286, + "eval_steps_per_second": 3.651, + "step": 8100 + }, + { + "epoch": 432.53, + "learning_rate": 5.493827160493827e-06, + "loss": 0.0252, + "step": 8110 + }, + { + "epoch": 432.96, + "eval_accuracy": 0.24333333333333335, + "eval_loss": 4.897459983825684, + "eval_runtime": 5.1933, + "eval_samples_per_second": 115.533, + "eval_steps_per_second": 3.659, + "step": 8118 + }, + { + "epoch": 433.07, + "learning_rate": 5.432098765432099e-06, + "loss": 0.0185, + "step": 8120 + }, + { + "epoch": 433.6, + "learning_rate": 5.37037037037037e-06, + "loss": 0.0242, + "step": 8130 + }, + { + "epoch": 433.97, + "eval_accuracy": 0.24666666666666667, + "eval_loss": 4.896070957183838, + "eval_runtime": 5.2428, + "eval_samples_per_second": 114.443, + "eval_steps_per_second": 3.624, + "step": 8137 + }, + { + "epoch": 434.13, + "learning_rate": 5.3086419753086425e-06, + "loss": 0.0372, + "step": 8140 + }, + { + "epoch": 434.67, + "learning_rate": 5.246913580246914e-06, + "loss": 0.0135, + "step": 8150 + }, + { + "epoch": 434.99, + "eval_accuracy": 0.24666666666666667, + "eval_loss": 4.908581733703613, + "eval_runtime": 5.248, + "eval_samples_per_second": 114.329, + "eval_steps_per_second": 3.62, + "step": 8156 + }, + { + "epoch": 435.2, + "learning_rate": 5.185185185185185e-06, + "loss": 0.0322, + "step": 8160 + }, + { + "epoch": 435.73, + "learning_rate": 5.123456790123457e-06, + "loss": 0.0296, + "step": 8170 + }, + { + "epoch": 436.0, + "eval_accuracy": 0.24166666666666667, + "eval_loss": 4.9134602546691895, + "eval_runtime": 5.2357, + "eval_samples_per_second": 114.598, + "eval_steps_per_second": 3.629, + "step": 8175 + }, + { + "epoch": 436.27, + "learning_rate": 5.061728395061729e-06, + "loss": 0.0164, + "step": 8180 + }, + { + "epoch": 436.8, + "learning_rate": 5e-06, + "loss": 0.0432, + "step": 8190 + }, + { + "epoch": 436.96, + "eval_accuracy": 0.24333333333333335, + "eval_loss": 4.907933712005615, + "eval_runtime": 5.268, + "eval_samples_per_second": 113.896, + "eval_steps_per_second": 3.607, + "step": 8193 + }, + { + "epoch": 437.33, + "learning_rate": 4.938271604938272e-06, + "loss": 0.017, + "step": 8200 + }, + { + "epoch": 437.87, + "learning_rate": 4.876543209876543e-06, + "loss": 0.0242, + "step": 8210 + }, + { + "epoch": 437.97, + "eval_accuracy": 0.24, + "eval_loss": 4.898138046264648, + "eval_runtime": 5.2464, + "eval_samples_per_second": 114.365, + "eval_steps_per_second": 3.622, + "step": 8212 + }, + { + "epoch": 438.4, + "learning_rate": 4.814814814814815e-06, + "loss": 0.026, + "step": 8220 + }, + { + "epoch": 438.93, + "learning_rate": 4.753086419753087e-06, + "loss": 0.0227, + "step": 8230 + }, + { + "epoch": 438.99, + "eval_accuracy": 0.24, + "eval_loss": 4.8857102394104, + "eval_runtime": 5.264, + "eval_samples_per_second": 113.981, + "eval_steps_per_second": 3.609, + "step": 8231 + }, + { + "epoch": 439.47, + "learning_rate": 4.691358024691358e-06, + "loss": 0.0309, + "step": 8240 + }, + { + "epoch": 440.0, + "learning_rate": 4.6296296296296296e-06, + "loss": 0.021, + "step": 8250 + }, + { + "epoch": 440.0, + "eval_accuracy": 0.23833333333333334, + "eval_loss": 4.887371063232422, + "eval_runtime": 5.2856, + "eval_samples_per_second": 113.516, + "eval_steps_per_second": 3.595, + "step": 8250 + }, + { + "epoch": 440.53, + "learning_rate": 4.567901234567902e-06, + "loss": 0.0244, + "step": 8260 + }, + { + "epoch": 440.96, + "eval_accuracy": 0.24, + "eval_loss": 4.884664058685303, + "eval_runtime": 5.2546, + "eval_samples_per_second": 114.187, + "eval_steps_per_second": 3.616, + "step": 8268 + }, + { + "epoch": 441.07, + "learning_rate": 4.506172839506173e-06, + "loss": 0.0484, + "step": 8270 + }, + { + "epoch": 441.6, + "learning_rate": 4.444444444444445e-06, + "loss": 0.0234, + "step": 8280 + }, + { + "epoch": 441.97, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.896438121795654, + "eval_runtime": 5.2769, + "eval_samples_per_second": 113.703, + "eval_steps_per_second": 3.601, + "step": 8287 + }, + { + "epoch": 442.13, + "learning_rate": 4.382716049382716e-06, + "loss": 0.0293, + "step": 8290 + }, + { + "epoch": 442.67, + "learning_rate": 4.3209876543209875e-06, + "loss": 0.0278, + "step": 8300 + }, + { + "epoch": 442.99, + "eval_accuracy": 0.23833333333333334, + "eval_loss": 4.916093349456787, + "eval_runtime": 5.2536, + "eval_samples_per_second": 114.207, + "eval_steps_per_second": 3.617, + "step": 8306 + }, + { + "epoch": 443.2, + "learning_rate": 4.2592592592592596e-06, + "loss": 0.0224, + "step": 8310 + }, + { + "epoch": 443.73, + "learning_rate": 4.197530864197531e-06, + "loss": 0.0322, + "step": 8320 + }, + { + "epoch": 444.0, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.921158790588379, + "eval_runtime": 5.2757, + "eval_samples_per_second": 113.729, + "eval_steps_per_second": 3.601, + "step": 8325 + }, + { + "epoch": 444.27, + "learning_rate": 4.135802469135803e-06, + "loss": 0.0201, + "step": 8330 + }, + { + "epoch": 444.8, + "learning_rate": 4.074074074074075e-06, + "loss": 0.038, + "step": 8340 + }, + { + "epoch": 444.96, + "eval_accuracy": 0.24, + "eval_loss": 4.925096035003662, + "eval_runtime": 5.3078, + "eval_samples_per_second": 113.041, + "eval_steps_per_second": 3.58, + "step": 8343 + }, + { + "epoch": 445.33, + "learning_rate": 4.012345679012345e-06, + "loss": 0.035, + "step": 8350 + }, + { + "epoch": 445.87, + "learning_rate": 3.9506172839506175e-06, + "loss": 0.0327, + "step": 8360 + }, + { + "epoch": 445.97, + "eval_accuracy": 0.24, + "eval_loss": 4.933958530426025, + "eval_runtime": 5.2639, + "eval_samples_per_second": 113.985, + "eval_steps_per_second": 3.61, + "step": 8362 + }, + { + "epoch": 446.4, + "learning_rate": 3.888888888888889e-06, + "loss": 0.0226, + "step": 8370 + }, + { + "epoch": 446.93, + "learning_rate": 3.827160493827161e-06, + "loss": 0.0256, + "step": 8380 + }, + { + "epoch": 446.99, + "eval_accuracy": 0.24166666666666667, + "eval_loss": 4.924610614776611, + "eval_runtime": 5.2666, + "eval_samples_per_second": 113.926, + "eval_steps_per_second": 3.608, + "step": 8381 + }, + { + "epoch": 447.47, + "learning_rate": 3.7654320987654325e-06, + "loss": 0.027, + "step": 8390 + }, + { + "epoch": 448.0, + "learning_rate": 3.7037037037037037e-06, + "loss": 0.0327, + "step": 8400 + }, + { + "epoch": 448.0, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.929351329803467, + "eval_runtime": 5.2953, + "eval_samples_per_second": 113.308, + "eval_steps_per_second": 3.588, + "step": 8400 + }, + { + "epoch": 448.53, + "learning_rate": 3.6419753086419754e-06, + "loss": 0.0246, + "step": 8410 + }, + { + "epoch": 448.96, + "eval_accuracy": 0.24166666666666667, + "eval_loss": 4.931128025054932, + "eval_runtime": 5.3144, + "eval_samples_per_second": 112.901, + "eval_steps_per_second": 3.575, + "step": 8418 + }, + { + "epoch": 449.07, + "learning_rate": 3.580246913580247e-06, + "loss": 0.027, + "step": 8420 + }, + { + "epoch": 449.6, + "learning_rate": 3.5185185185185187e-06, + "loss": 0.0239, + "step": 8430 + }, + { + "epoch": 449.97, + "eval_accuracy": 0.23833333333333334, + "eval_loss": 4.92203950881958, + "eval_runtime": 5.2899, + "eval_samples_per_second": 113.424, + "eval_steps_per_second": 3.592, + "step": 8437 + }, + { + "epoch": 450.13, + "learning_rate": 3.45679012345679e-06, + "loss": 0.0313, + "step": 8440 + }, + { + "epoch": 450.67, + "learning_rate": 3.3950617283950617e-06, + "loss": 0.0219, + "step": 8450 + }, + { + "epoch": 450.99, + "eval_accuracy": 0.24, + "eval_loss": 4.920533180236816, + "eval_runtime": 5.4385, + "eval_samples_per_second": 110.325, + "eval_steps_per_second": 3.494, + "step": 8456 + }, + { + "epoch": 451.2, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.0214, + "step": 8460 + }, + { + "epoch": 451.73, + "learning_rate": 3.271604938271605e-06, + "loss": 0.0287, + "step": 8470 + }, + { + "epoch": 452.0, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.924895763397217, + "eval_runtime": 5.3449, + "eval_samples_per_second": 112.256, + "eval_steps_per_second": 3.555, + "step": 8475 + }, + { + "epoch": 452.27, + "learning_rate": 3.209876543209877e-06, + "loss": 0.0226, + "step": 8480 + }, + { + "epoch": 452.8, + "learning_rate": 3.148148148148148e-06, + "loss": 0.0244, + "step": 8490 + }, + { + "epoch": 452.96, + "eval_accuracy": 0.24, + "eval_loss": 4.927518844604492, + "eval_runtime": 5.3234, + "eval_samples_per_second": 112.711, + "eval_steps_per_second": 3.569, + "step": 8493 + }, + { + "epoch": 453.33, + "learning_rate": 3.0864197530864196e-06, + "loss": 0.0215, + "step": 8500 + }, + { + "epoch": 453.87, + "learning_rate": 3.0246913580246917e-06, + "loss": 0.0222, + "step": 8510 + }, + { + "epoch": 453.97, + "eval_accuracy": 0.24166666666666667, + "eval_loss": 4.932238578796387, + "eval_runtime": 5.3268, + "eval_samples_per_second": 112.637, + "eval_steps_per_second": 3.567, + "step": 8512 + }, + { + "epoch": 454.4, + "learning_rate": 2.9629629629629633e-06, + "loss": 0.0269, + "step": 8520 + }, + { + "epoch": 454.93, + "learning_rate": 2.9012345679012346e-06, + "loss": 0.0277, + "step": 8530 + }, + { + "epoch": 454.99, + "eval_accuracy": 0.23833333333333334, + "eval_loss": 4.931756019592285, + "eval_runtime": 5.3343, + "eval_samples_per_second": 112.48, + "eval_steps_per_second": 3.562, + "step": 8531 + }, + { + "epoch": 455.47, + "learning_rate": 2.8395061728395062e-06, + "loss": 0.0355, + "step": 8540 + }, + { + "epoch": 456.0, + "learning_rate": 2.777777777777778e-06, + "loss": 0.0315, + "step": 8550 + }, + { + "epoch": 456.0, + "eval_accuracy": 0.23833333333333334, + "eval_loss": 4.929112911224365, + "eval_runtime": 5.4018, + "eval_samples_per_second": 111.075, + "eval_steps_per_second": 3.517, + "step": 8550 + }, + { + "epoch": 456.53, + "learning_rate": 2.7160493827160496e-06, + "loss": 0.021, + "step": 8560 + }, + { + "epoch": 456.96, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.929343223571777, + "eval_runtime": 5.3767, + "eval_samples_per_second": 111.594, + "eval_steps_per_second": 3.534, + "step": 8568 + }, + { + "epoch": 457.07, + "learning_rate": 2.6543209876543212e-06, + "loss": 0.0232, + "step": 8570 + }, + { + "epoch": 457.6, + "learning_rate": 2.5925925925925925e-06, + "loss": 0.0288, + "step": 8580 + }, + { + "epoch": 457.97, + "eval_accuracy": 0.23333333333333334, + "eval_loss": 4.923261642456055, + "eval_runtime": 5.3734, + "eval_samples_per_second": 111.661, + "eval_steps_per_second": 3.536, + "step": 8587 + }, + { + "epoch": 458.13, + "learning_rate": 2.5308641975308646e-06, + "loss": 0.0381, + "step": 8590 + }, + { + "epoch": 458.67, + "learning_rate": 2.469135802469136e-06, + "loss": 0.0229, + "step": 8600 + }, + { + "epoch": 458.99, + "eval_accuracy": 0.23833333333333334, + "eval_loss": 4.923562049865723, + "eval_runtime": 5.3963, + "eval_samples_per_second": 111.186, + "eval_steps_per_second": 3.521, + "step": 8606 + }, + { + "epoch": 459.2, + "learning_rate": 2.4074074074074075e-06, + "loss": 0.0308, + "step": 8610 + }, + { + "epoch": 459.73, + "learning_rate": 2.345679012345679e-06, + "loss": 0.0257, + "step": 8620 + }, + { + "epoch": 460.0, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.922508716583252, + "eval_runtime": 5.4351, + "eval_samples_per_second": 110.394, + "eval_steps_per_second": 3.496, + "step": 8625 + }, + { + "epoch": 460.27, + "learning_rate": 2.283950617283951e-06, + "loss": 0.0193, + "step": 8630 + }, + { + "epoch": 460.8, + "learning_rate": 2.2222222222222225e-06, + "loss": 0.0291, + "step": 8640 + }, + { + "epoch": 460.96, + "eval_accuracy": 0.23833333333333334, + "eval_loss": 4.922194004058838, + "eval_runtime": 5.3746, + "eval_samples_per_second": 111.636, + "eval_steps_per_second": 3.535, + "step": 8643 + }, + { + "epoch": 461.33, + "learning_rate": 2.1604938271604937e-06, + "loss": 0.0292, + "step": 8650 + }, + { + "epoch": 461.87, + "learning_rate": 2.0987654320987654e-06, + "loss": 0.0325, + "step": 8660 + }, + { + "epoch": 461.97, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.921637535095215, + "eval_runtime": 5.38, + "eval_samples_per_second": 111.524, + "eval_steps_per_second": 3.532, + "step": 8662 + }, + { + "epoch": 462.4, + "learning_rate": 2.0370370370370375e-06, + "loss": 0.0292, + "step": 8670 + }, + { + "epoch": 462.93, + "learning_rate": 1.9753086419753087e-06, + "loss": 0.0268, + "step": 8680 + }, + { + "epoch": 462.99, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.92024564743042, + "eval_runtime": 5.3601, + "eval_samples_per_second": 111.939, + "eval_steps_per_second": 3.545, + "step": 8681 + }, + { + "epoch": 463.47, + "learning_rate": 1.9135802469135804e-06, + "loss": 0.0268, + "step": 8690 + }, + { + "epoch": 464.0, + "learning_rate": 1.8518518518518519e-06, + "loss": 0.0156, + "step": 8700 + }, + { + "epoch": 464.0, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.91749906539917, + "eval_runtime": 5.3813, + "eval_samples_per_second": 111.497, + "eval_steps_per_second": 3.531, + "step": 8700 + }, + { + "epoch": 464.53, + "learning_rate": 1.7901234567901235e-06, + "loss": 0.0196, + "step": 8710 + }, + { + "epoch": 464.96, + "eval_accuracy": 0.23333333333333334, + "eval_loss": 4.914690017700195, + "eval_runtime": 5.3651, + "eval_samples_per_second": 111.834, + "eval_steps_per_second": 3.541, + "step": 8718 + }, + { + "epoch": 465.07, + "learning_rate": 1.728395061728395e-06, + "loss": 0.0199, + "step": 8720 + }, + { + "epoch": 465.6, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.0448, + "step": 8730 + }, + { + "epoch": 465.97, + "eval_accuracy": 0.23333333333333334, + "eval_loss": 4.910024642944336, + "eval_runtime": 5.359, + "eval_samples_per_second": 111.962, + "eval_steps_per_second": 3.545, + "step": 8737 + }, + { + "epoch": 466.13, + "learning_rate": 1.6049382716049385e-06, + "loss": 0.021, + "step": 8740 + }, + { + "epoch": 466.67, + "learning_rate": 1.5432098765432098e-06, + "loss": 0.0232, + "step": 8750 + }, + { + "epoch": 466.99, + "eval_accuracy": 0.23333333333333334, + "eval_loss": 4.908828258514404, + "eval_runtime": 5.3987, + "eval_samples_per_second": 111.138, + "eval_steps_per_second": 3.519, + "step": 8756 + }, + { + "epoch": 467.2, + "learning_rate": 1.4814814814814817e-06, + "loss": 0.0224, + "step": 8760 + }, + { + "epoch": 467.73, + "learning_rate": 1.4197530864197531e-06, + "loss": 0.0274, + "step": 8770 + }, + { + "epoch": 468.0, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.909602165222168, + "eval_runtime": 5.4071, + "eval_samples_per_second": 110.966, + "eval_steps_per_second": 3.514, + "step": 8775 + }, + { + "epoch": 468.27, + "learning_rate": 1.3580246913580248e-06, + "loss": 0.0344, + "step": 8780 + }, + { + "epoch": 468.8, + "learning_rate": 1.2962962962962962e-06, + "loss": 0.029, + "step": 8790 + }, + { + "epoch": 468.96, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.910549640655518, + "eval_runtime": 5.3868, + "eval_samples_per_second": 111.383, + "eval_steps_per_second": 3.527, + "step": 8793 + }, + { + "epoch": 469.33, + "learning_rate": 1.234567901234568e-06, + "loss": 0.0265, + "step": 8800 + }, + { + "epoch": 469.87, + "learning_rate": 1.1728395061728396e-06, + "loss": 0.0337, + "step": 8810 + }, + { + "epoch": 469.97, + "eval_accuracy": 0.235, + "eval_loss": 4.912467956542969, + "eval_runtime": 5.3985, + "eval_samples_per_second": 111.143, + "eval_steps_per_second": 3.52, + "step": 8812 + }, + { + "epoch": 470.4, + "learning_rate": 1.1111111111111112e-06, + "loss": 0.0298, + "step": 8820 + }, + { + "epoch": 470.93, + "learning_rate": 1.0493827160493827e-06, + "loss": 0.0178, + "step": 8830 + }, + { + "epoch": 470.99, + "eval_accuracy": 0.235, + "eval_loss": 4.91201639175415, + "eval_runtime": 5.399, + "eval_samples_per_second": 111.132, + "eval_steps_per_second": 3.519, + "step": 8831 + }, + { + "epoch": 471.47, + "learning_rate": 9.876543209876544e-07, + "loss": 0.0179, + "step": 8840 + }, + { + "epoch": 472.0, + "learning_rate": 9.259259259259259e-07, + "loss": 0.0286, + "step": 8850 + }, + { + "epoch": 472.0, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.912468910217285, + "eval_runtime": 5.4913, + "eval_samples_per_second": 109.263, + "eval_steps_per_second": 3.46, + "step": 8850 + }, + { + "epoch": 472.53, + "learning_rate": 8.641975308641975e-07, + "loss": 0.0159, + "step": 8860 + }, + { + "epoch": 472.96, + "eval_accuracy": 0.23666666666666666, + "eval_loss": 4.910217761993408, + "eval_runtime": 5.4334, + "eval_samples_per_second": 110.428, + "eval_steps_per_second": 3.497, + "step": 8868 + }, + { + "epoch": 473.07, + "learning_rate": 8.024691358024693e-07, + "loss": 0.0247, + "step": 8870 + }, + { + "epoch": 473.6, + "learning_rate": 7.407407407407408e-07, + "loss": 0.0318, + "step": 8880 + }, + { + "epoch": 473.97, + "eval_accuracy": 0.23833333333333334, + "eval_loss": 4.9116291999816895, + "eval_runtime": 5.4254, + "eval_samples_per_second": 110.59, + "eval_steps_per_second": 3.502, + "step": 8887 + }, + { + "epoch": 474.13, + "learning_rate": 6.790123456790124e-07, + "loss": 0.0311, + "step": 8890 + }, + { + "epoch": 474.67, + "learning_rate": 6.17283950617284e-07, + "loss": 0.0302, + "step": 8900 + }, + { + "epoch": 474.99, + "eval_accuracy": 0.24, + "eval_loss": 4.911314487457275, + "eval_runtime": 5.4263, + "eval_samples_per_second": 110.573, + "eval_steps_per_second": 3.501, + "step": 8906 + }, + { + "epoch": 475.2, + "learning_rate": 5.555555555555556e-07, + "loss": 0.0305, + "step": 8910 + }, + { + "epoch": 475.73, + "learning_rate": 4.938271604938272e-07, + "loss": 0.0184, + "step": 8920 + }, + { + "epoch": 476.0, + "eval_accuracy": 0.24, + "eval_loss": 4.911987781524658, + "eval_runtime": 5.4618, + "eval_samples_per_second": 109.854, + "eval_steps_per_second": 3.479, + "step": 8925 + }, + { + "epoch": 476.27, + "learning_rate": 4.3209876543209875e-07, + "loss": 0.0415, + "step": 8930 + }, + { + "epoch": 476.8, + "learning_rate": 3.703703703703704e-07, + "loss": 0.025, + "step": 8940 + }, + { + "epoch": 476.96, + "eval_accuracy": 0.24, + "eval_loss": 4.912769794464111, + "eval_runtime": 5.4549, + "eval_samples_per_second": 109.993, + "eval_steps_per_second": 3.483, + "step": 8943 + }, + { + "epoch": 477.33, + "learning_rate": 3.08641975308642e-07, + "loss": 0.013, + "step": 8950 + }, + { + "epoch": 477.87, + "learning_rate": 2.469135802469136e-07, + "loss": 0.027, + "step": 8960 + }, + { + "epoch": 477.97, + "eval_accuracy": 0.24, + "eval_loss": 4.912613868713379, + "eval_runtime": 5.4968, + "eval_samples_per_second": 109.154, + "eval_steps_per_second": 3.457, + "step": 8962 + }, + { + "epoch": 478.4, + "learning_rate": 1.851851851851852e-07, + "loss": 0.0156, + "step": 8970 + }, + { + "epoch": 478.93, + "learning_rate": 1.234567901234568e-07, + "loss": 0.0298, + "step": 8980 + }, + { + "epoch": 478.99, + "eval_accuracy": 0.24, + "eval_loss": 4.913006782531738, + "eval_runtime": 5.4672, + "eval_samples_per_second": 109.745, + "eval_steps_per_second": 3.475, + "step": 8981 + }, + { + "epoch": 479.47, + "learning_rate": 6.17283950617284e-08, + "loss": 0.0213, + "step": 8990 + }, + { + "epoch": 480.0, + "learning_rate": 0.0, + "loss": 0.0349, + "step": 9000 + }, + { + "epoch": 480.0, + "eval_accuracy": 0.24, + "eval_loss": 4.9129838943481445, + "eval_runtime": 5.4668, + "eval_samples_per_second": 109.753, + "eval_steps_per_second": 3.476, + "step": 9000 + }, + { + "epoch": 480.0, + "step": 9000, + "total_flos": 6.6744785965028475e+19, + "train_loss": 0.3827814753833744, + "train_runtime": 27227.0796, + "train_samples_per_second": 44.019, + "train_steps_per_second": 0.331 + } + ], + "max_steps": 9000, + "num_train_epochs": 500, + "total_flos": 6.6744785965028475e+19, + "trial_name": null, + "trial_params": null +}