{ "best_metric": 5.664193153381348, "best_model_checkpoint": "output_nf_3/checkpoint-1737", "epoch": 200.0, "eval_steps": 500, "global_step": 1800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "learning_rate": 9.95e-07, "loss": 12.1762, "step": 9 }, { "epoch": 1.0, "eval_accuracy": 0.002932551319648094, "eval_loss": 11.818068504333496, "eval_runtime": 3.6297, "eval_samples_per_second": 0.276, "eval_steps_per_second": 0.276, "step": 9 }, { "epoch": 2.0, "learning_rate": 9.9e-07, "loss": 11.6538, "step": 18 }, { "epoch": 2.0, "eval_accuracy": 0.0009775171065493646, "eval_loss": 11.346792221069336, "eval_runtime": 3.7372, "eval_samples_per_second": 0.268, "eval_steps_per_second": 0.268, "step": 18 }, { "epoch": 3.0, "learning_rate": 9.849999999999999e-07, "loss": 11.2876, "step": 27 }, { "epoch": 3.0, "eval_accuracy": 0.0009775171065493646, "eval_loss": 10.912660598754883, "eval_runtime": 4.2343, "eval_samples_per_second": 0.236, "eval_steps_per_second": 0.236, "step": 27 }, { "epoch": 4.0, "learning_rate": 9.8e-07, "loss": 10.8664, "step": 36 }, { "epoch": 4.0, "eval_accuracy": 0.0009775171065493646, "eval_loss": 10.516217231750488, "eval_runtime": 3.4285, "eval_samples_per_second": 0.292, "eval_steps_per_second": 0.292, "step": 36 }, { "epoch": 5.0, "learning_rate": 9.75e-07, "loss": 10.6184, "step": 45 }, { "epoch": 5.0, "eval_accuracy": 0.0019550342130987292, "eval_loss": 10.181687355041504, "eval_runtime": 3.9045, "eval_samples_per_second": 0.256, "eval_steps_per_second": 0.256, "step": 45 }, { "epoch": 6.0, "learning_rate": 9.7e-07, "loss": 10.3132, "step": 54 }, { "epoch": 6.0, "eval_accuracy": 0.0009775171065493646, "eval_loss": 9.876521110534668, "eval_runtime": 3.4438, "eval_samples_per_second": 0.29, "eval_steps_per_second": 0.29, "step": 54 }, { "epoch": 7.0, "learning_rate": 9.649999999999999e-07, "loss": 9.9087, "step": 63 }, { "epoch": 7.0, "eval_accuracy": 0.0009775171065493646, "eval_loss": 9.607266426086426, "eval_runtime": 3.7078, "eval_samples_per_second": 0.27, "eval_steps_per_second": 0.27, "step": 63 }, { "epoch": 8.0, "learning_rate": 9.6e-07, "loss": 9.6953, "step": 72 }, { "epoch": 8.0, "eval_accuracy": 0.0, "eval_loss": 9.324377059936523, "eval_runtime": 3.5648, "eval_samples_per_second": 0.281, "eval_steps_per_second": 0.281, "step": 72 }, { "epoch": 9.0, "learning_rate": 9.55e-07, "loss": 9.3741, "step": 81 }, { "epoch": 9.0, "eval_accuracy": 0.0, "eval_loss": 9.031828880310059, "eval_runtime": 3.42, "eval_samples_per_second": 0.292, "eval_steps_per_second": 0.292, "step": 81 }, { "epoch": 10.0, "learning_rate": 9.499999999999999e-07, "loss": 9.2045, "step": 90 }, { "epoch": 10.0, "eval_accuracy": 0.0, "eval_loss": 8.746149063110352, "eval_runtime": 3.746, "eval_samples_per_second": 0.267, "eval_steps_per_second": 0.267, "step": 90 }, { "epoch": 11.0, "learning_rate": 9.45e-07, "loss": 8.9079, "step": 99 }, { "epoch": 11.0, "eval_accuracy": 0.0019550342130987292, "eval_loss": 8.47550106048584, "eval_runtime": 3.3518, "eval_samples_per_second": 0.298, "eval_steps_per_second": 0.298, "step": 99 }, { "epoch": 12.0, "learning_rate": 9.399999999999999e-07, "loss": 8.7047, "step": 108 }, { "epoch": 12.0, "eval_accuracy": 0.011730205278592375, "eval_loss": 8.222208976745605, "eval_runtime": 3.4648, "eval_samples_per_second": 0.289, "eval_steps_per_second": 0.289, "step": 108 }, { "epoch": 13.0, "learning_rate": 9.35e-07, "loss": 8.4622, "step": 117 }, { "epoch": 13.0, "eval_accuracy": 0.03128054740957967, "eval_loss": 7.995096206665039, "eval_runtime": 3.5365, "eval_samples_per_second": 0.283, "eval_steps_per_second": 0.283, "step": 117 }, { "epoch": 14.0, "learning_rate": 9.3e-07, "loss": 8.2649, "step": 126 }, { "epoch": 14.0, "eval_accuracy": 0.06549364613880743, "eval_loss": 7.805217266082764, "eval_runtime": 3.456, "eval_samples_per_second": 0.289, "eval_steps_per_second": 0.289, "step": 126 }, { "epoch": 15.0, "learning_rate": 9.25e-07, "loss": 8.043, "step": 135 }, { "epoch": 15.0, "eval_accuracy": 0.10654936461388075, "eval_loss": 7.649549961090088, "eval_runtime": 3.7554, "eval_samples_per_second": 0.266, "eval_steps_per_second": 0.266, "step": 135 }, { "epoch": 16.0, "learning_rate": 9.2e-07, "loss": 7.9092, "step": 144 }, { "epoch": 16.0, "eval_accuracy": 0.1378299120234604, "eval_loss": 7.516005516052246, "eval_runtime": 4.0508, "eval_samples_per_second": 0.247, "eval_steps_per_second": 0.247, "step": 144 }, { "epoch": 17.0, "learning_rate": 9.15e-07, "loss": 7.7103, "step": 153 }, { "epoch": 17.0, "eval_accuracy": 0.16911045943304007, "eval_loss": 7.388144016265869, "eval_runtime": 3.4529, "eval_samples_per_second": 0.29, "eval_steps_per_second": 0.29, "step": 153 }, { "epoch": 18.0, "learning_rate": 9.1e-07, "loss": 7.5701, "step": 162 }, { "epoch": 18.0, "eval_accuracy": 0.18181818181818182, "eval_loss": 7.269326686859131, "eval_runtime": 3.467, "eval_samples_per_second": 0.288, "eval_steps_per_second": 0.288, "step": 162 }, { "epoch": 19.0, "learning_rate": 9.05e-07, "loss": 7.4483, "step": 171 }, { "epoch": 19.0, "eval_accuracy": 0.19941348973607037, "eval_loss": 7.154385089874268, "eval_runtime": 4.1166, "eval_samples_per_second": 0.243, "eval_steps_per_second": 0.243, "step": 171 }, { "epoch": 20.0, "learning_rate": 9e-07, "loss": 7.314, "step": 180 }, { "epoch": 20.0, "eval_accuracy": 0.21603128054740958, "eval_loss": 7.046892166137695, "eval_runtime": 5.2069, "eval_samples_per_second": 0.192, "eval_steps_per_second": 0.192, "step": 180 }, { "epoch": 21.0, "learning_rate": 8.95e-07, "loss": 7.2101, "step": 189 }, { "epoch": 21.0, "eval_accuracy": 0.23264907135874877, "eval_loss": 6.9440107345581055, "eval_runtime": 3.9239, "eval_samples_per_second": 0.255, "eval_steps_per_second": 0.255, "step": 189 }, { "epoch": 22.0, "learning_rate": 8.9e-07, "loss": 7.1026, "step": 198 }, { "epoch": 22.0, "eval_accuracy": 0.23655913978494625, "eval_loss": 6.854238033294678, "eval_runtime": 3.4311, "eval_samples_per_second": 0.291, "eval_steps_per_second": 0.291, "step": 198 }, { "epoch": 23.0, "learning_rate": 8.85e-07, "loss": 6.9954, "step": 207 }, { "epoch": 23.0, "eval_accuracy": 0.2482893450635386, "eval_loss": 6.768421173095703, "eval_runtime": 4.5026, "eval_samples_per_second": 0.222, "eval_steps_per_second": 0.222, "step": 207 }, { "epoch": 24.0, "learning_rate": 8.799999999999999e-07, "loss": 6.9206, "step": 216 }, { "epoch": 24.0, "eval_accuracy": 0.2512218963831867, "eval_loss": 6.685108184814453, "eval_runtime": 9.9405, "eval_samples_per_second": 0.101, "eval_steps_per_second": 0.101, "step": 216 }, { "epoch": 25.0, "learning_rate": 8.75e-07, "loss": 6.8588, "step": 225 }, { "epoch": 25.0, "eval_accuracy": 0.2561094819159335, "eval_loss": 6.621737957000732, "eval_runtime": 3.7996, "eval_samples_per_second": 0.263, "eval_steps_per_second": 0.263, "step": 225 }, { "epoch": 26.0, "learning_rate": 8.699999999999999e-07, "loss": 6.7975, "step": 234 }, { "epoch": 26.0, "eval_accuracy": 0.2590420332355816, "eval_loss": 6.562064170837402, "eval_runtime": 4.7111, "eval_samples_per_second": 0.212, "eval_steps_per_second": 0.212, "step": 234 }, { "epoch": 27.0, "learning_rate": 8.65e-07, "loss": 6.7355, "step": 243 }, { "epoch": 27.0, "eval_accuracy": 0.26099706744868034, "eval_loss": 6.5111308097839355, "eval_runtime": 3.4887, "eval_samples_per_second": 0.287, "eval_steps_per_second": 0.287, "step": 243 }, { "epoch": 28.0, "learning_rate": 8.599999999999999e-07, "loss": 6.6928, "step": 252 }, { "epoch": 28.0, "eval_accuracy": 0.26392961876832843, "eval_loss": 6.462972640991211, "eval_runtime": 10.4927, "eval_samples_per_second": 0.095, "eval_steps_per_second": 0.095, "step": 252 }, { "epoch": 29.0, "learning_rate": 8.55e-07, "loss": 6.6483, "step": 261 }, { "epoch": 29.0, "eval_accuracy": 0.2697947214076246, "eval_loss": 6.413713455200195, "eval_runtime": 9.1985, "eval_samples_per_second": 0.109, "eval_steps_per_second": 0.109, "step": 261 }, { "epoch": 30.0, "learning_rate": 8.499999999999999e-07, "loss": 6.6169, "step": 270 }, { "epoch": 30.0, "eval_accuracy": 0.27174975562072334, "eval_loss": 6.367518424987793, "eval_runtime": 4.5985, "eval_samples_per_second": 0.217, "eval_steps_per_second": 0.217, "step": 270 }, { "epoch": 31.0, "learning_rate": 8.45e-07, "loss": 6.5498, "step": 279 }, { "epoch": 31.0, "eval_accuracy": 0.27468230694037143, "eval_loss": 6.329223155975342, "eval_runtime": 4.0947, "eval_samples_per_second": 0.244, "eval_steps_per_second": 0.244, "step": 279 }, { "epoch": 32.0, "learning_rate": 8.399999999999999e-07, "loss": 6.5385, "step": 288 }, { "epoch": 32.0, "eval_accuracy": 0.27663734115347016, "eval_loss": 6.296669960021973, "eval_runtime": 3.6593, "eval_samples_per_second": 0.273, "eval_steps_per_second": 0.273, "step": 288 }, { "epoch": 33.0, "learning_rate": 8.349999999999999e-07, "loss": 6.5111, "step": 297 }, { "epoch": 33.0, "eval_accuracy": 0.2785923753665689, "eval_loss": 6.2592363357543945, "eval_runtime": 4.3919, "eval_samples_per_second": 0.228, "eval_steps_per_second": 0.228, "step": 297 }, { "epoch": 34.0, "learning_rate": 8.299999999999999e-07, "loss": 6.4748, "step": 306 }, { "epoch": 34.0, "eval_accuracy": 0.28152492668621704, "eval_loss": 6.231565475463867, "eval_runtime": 4.3688, "eval_samples_per_second": 0.229, "eval_steps_per_second": 0.229, "step": 306 }, { "epoch": 35.0, "learning_rate": 8.249999999999999e-07, "loss": 6.4575, "step": 315 }, { "epoch": 35.0, "eval_accuracy": 0.28347996089931576, "eval_loss": 6.209580421447754, "eval_runtime": 3.5499, "eval_samples_per_second": 0.282, "eval_steps_per_second": 0.282, "step": 315 }, { "epoch": 36.0, "learning_rate": 8.199999999999999e-07, "loss": 6.4251, "step": 324 }, { "epoch": 36.0, "eval_accuracy": 0.2844574780058651, "eval_loss": 6.184682369232178, "eval_runtime": 3.6926, "eval_samples_per_second": 0.271, "eval_steps_per_second": 0.271, "step": 324 }, { "epoch": 37.0, "learning_rate": 8.149999999999999e-07, "loss": 6.4096, "step": 333 }, { "epoch": 37.0, "eval_accuracy": 0.2854349951124145, "eval_loss": 6.161267280578613, "eval_runtime": 3.9736, "eval_samples_per_second": 0.252, "eval_steps_per_second": 0.252, "step": 333 }, { "epoch": 38.0, "learning_rate": 8.1e-07, "loss": 6.3741, "step": 342 }, { "epoch": 38.0, "eval_accuracy": 0.2854349951124145, "eval_loss": 6.140196800231934, "eval_runtime": 3.9847, "eval_samples_per_second": 0.251, "eval_steps_per_second": 0.251, "step": 342 }, { "epoch": 39.0, "learning_rate": 8.05e-07, "loss": 6.3645, "step": 351 }, { "epoch": 39.0, "eval_accuracy": 0.2873900293255132, "eval_loss": 6.128504753112793, "eval_runtime": 3.6581, "eval_samples_per_second": 0.273, "eval_steps_per_second": 0.273, "step": 351 }, { "epoch": 40.0, "learning_rate": 8e-07, "loss": 6.3511, "step": 360 }, { "epoch": 40.0, "eval_accuracy": 0.2873900293255132, "eval_loss": 6.11335563659668, "eval_runtime": 3.9362, "eval_samples_per_second": 0.254, "eval_steps_per_second": 0.254, "step": 360 }, { "epoch": 41.0, "learning_rate": 7.95e-07, "loss": 6.3254, "step": 369 }, { "epoch": 41.0, "eval_accuracy": 0.2873900293255132, "eval_loss": 6.095658779144287, "eval_runtime": 3.8555, "eval_samples_per_second": 0.259, "eval_steps_per_second": 0.259, "step": 369 }, { "epoch": 42.0, "learning_rate": 7.9e-07, "loss": 6.3077, "step": 378 }, { "epoch": 42.0, "eval_accuracy": 0.2873900293255132, "eval_loss": 6.081845283508301, "eval_runtime": 3.4864, "eval_samples_per_second": 0.287, "eval_steps_per_second": 0.287, "step": 378 }, { "epoch": 43.0, "learning_rate": 7.85e-07, "loss": 6.301, "step": 387 }, { "epoch": 43.0, "eval_accuracy": 0.2873900293255132, "eval_loss": 6.0687761306762695, "eval_runtime": 4.7364, "eval_samples_per_second": 0.211, "eval_steps_per_second": 0.211, "step": 387 }, { "epoch": 44.0, "learning_rate": 7.799999999999999e-07, "loss": 6.2846, "step": 396 }, { "epoch": 44.0, "eval_accuracy": 0.2883675464320626, "eval_loss": 6.051051616668701, "eval_runtime": 3.4251, "eval_samples_per_second": 0.292, "eval_steps_per_second": 0.292, "step": 396 }, { "epoch": 45.0, "learning_rate": 7.75e-07, "loss": 6.2739, "step": 405 }, { "epoch": 45.0, "eval_accuracy": 0.28934506353861195, "eval_loss": 6.039752006530762, "eval_runtime": 4.3563, "eval_samples_per_second": 0.23, "eval_steps_per_second": 0.23, "step": 405 }, { "epoch": 46.0, "learning_rate": 7.699999999999999e-07, "loss": 6.2569, "step": 414 }, { "epoch": 46.0, "eval_accuracy": 0.2913000977517107, "eval_loss": 6.030134677886963, "eval_runtime": 3.8175, "eval_samples_per_second": 0.262, "eval_steps_per_second": 0.262, "step": 414 }, { "epoch": 47.0, "learning_rate": 7.65e-07, "loss": 6.258, "step": 423 }, { "epoch": 47.0, "eval_accuracy": 0.2913000977517107, "eval_loss": 6.01760721206665, "eval_runtime": 4.1737, "eval_samples_per_second": 0.24, "eval_steps_per_second": 0.24, "step": 423 }, { "epoch": 48.0, "learning_rate": 7.599999999999999e-07, "loss": 6.2273, "step": 432 }, { "epoch": 48.0, "eval_accuracy": 0.2932551319648094, "eval_loss": 6.003724575042725, "eval_runtime": 3.8811, "eval_samples_per_second": 0.258, "eval_steps_per_second": 0.258, "step": 432 }, { "epoch": 49.0, "learning_rate": 7.55e-07, "loss": 6.2256, "step": 441 }, { "epoch": 49.0, "eval_accuracy": 0.2932551319648094, "eval_loss": 5.9961748123168945, "eval_runtime": 4.331, "eval_samples_per_second": 0.231, "eval_steps_per_second": 0.231, "step": 441 }, { "epoch": 50.0, "learning_rate": 7.5e-07, "loss": 6.2065, "step": 450 }, { "epoch": 50.0, "eval_accuracy": 0.2932551319648094, "eval_loss": 5.986941337585449, "eval_runtime": 4.6185, "eval_samples_per_second": 0.217, "eval_steps_per_second": 0.217, "step": 450 }, { "epoch": 51.0, "learning_rate": 7.45e-07, "loss": 6.1991, "step": 459 }, { "epoch": 51.0, "eval_accuracy": 0.2932551319648094, "eval_loss": 5.975553512573242, "eval_runtime": 3.9164, "eval_samples_per_second": 0.255, "eval_steps_per_second": 0.255, "step": 459 }, { "epoch": 52.0, "learning_rate": 7.4e-07, "loss": 6.1895, "step": 468 }, { "epoch": 52.0, "eval_accuracy": 0.29423264907135877, "eval_loss": 5.968191623687744, "eval_runtime": 4.4356, "eval_samples_per_second": 0.225, "eval_steps_per_second": 0.225, "step": 468 }, { "epoch": 53.0, "learning_rate": 7.35e-07, "loss": 6.1763, "step": 477 }, { "epoch": 53.0, "eval_accuracy": 0.29423264907135877, "eval_loss": 5.961073398590088, "eval_runtime": 4.4638, "eval_samples_per_second": 0.224, "eval_steps_per_second": 0.224, "step": 477 }, { "epoch": 54.0, "learning_rate": 7.3e-07, "loss": 6.1734, "step": 486 }, { "epoch": 54.0, "eval_accuracy": 0.29423264907135877, "eval_loss": 5.953503608703613, "eval_runtime": 3.7888, "eval_samples_per_second": 0.264, "eval_steps_per_second": 0.264, "step": 486 }, { "epoch": 55.0, "learning_rate": 7.249999999999999e-07, "loss": 6.1702, "step": 495 }, { "epoch": 55.0, "eval_accuracy": 0.29423264907135877, "eval_loss": 5.94571590423584, "eval_runtime": 4.1672, "eval_samples_per_second": 0.24, "eval_steps_per_second": 0.24, "step": 495 }, { "epoch": 56.0, "learning_rate": 7.2e-07, "loss": 6.1556, "step": 504 }, { "epoch": 56.0, "eval_accuracy": 0.29521016617790813, "eval_loss": 5.937567710876465, "eval_runtime": 4.244, "eval_samples_per_second": 0.236, "eval_steps_per_second": 0.236, "step": 504 }, { "epoch": 57.0, "learning_rate": 7.149999999999999e-07, "loss": 6.1481, "step": 513 }, { "epoch": 57.0, "eval_accuracy": 0.29521016617790813, "eval_loss": 5.930552005767822, "eval_runtime": 4.4958, "eval_samples_per_second": 0.222, "eval_steps_per_second": 0.222, "step": 513 }, { "epoch": 58.0, "learning_rate": 7.1e-07, "loss": 6.1425, "step": 522 }, { "epoch": 58.0, "eval_accuracy": 0.29423264907135877, "eval_loss": 5.922237396240234, "eval_runtime": 4.1228, "eval_samples_per_second": 0.243, "eval_steps_per_second": 0.243, "step": 522 }, { "epoch": 59.0, "learning_rate": 7.049999999999999e-07, "loss": 6.1416, "step": 531 }, { "epoch": 59.0, "eval_accuracy": 0.29423264907135877, "eval_loss": 5.91697883605957, "eval_runtime": 4.4905, "eval_samples_per_second": 0.223, "eval_steps_per_second": 0.223, "step": 531 }, { "epoch": 60.0, "learning_rate": 7e-07, "loss": 6.1328, "step": 540 }, { "epoch": 60.0, "eval_accuracy": 0.29423264907135877, "eval_loss": 5.914181232452393, "eval_runtime": 4.5555, "eval_samples_per_second": 0.22, "eval_steps_per_second": 0.22, "step": 540 }, { "epoch": 61.0, "learning_rate": 6.949999999999999e-07, "loss": 6.1176, "step": 549 }, { "epoch": 61.0, "eval_accuracy": 0.29423264907135877, "eval_loss": 5.906389236450195, "eval_runtime": 4.1516, "eval_samples_per_second": 0.241, "eval_steps_per_second": 0.241, "step": 549 }, { "epoch": 62.0, "learning_rate": 6.9e-07, "loss": 6.1091, "step": 558 }, { "epoch": 62.0, "eval_accuracy": 0.29423264907135877, "eval_loss": 5.901010513305664, "eval_runtime": 4.2358, "eval_samples_per_second": 0.236, "eval_steps_per_second": 0.236, "step": 558 }, { "epoch": 63.0, "learning_rate": 6.85e-07, "loss": 6.104, "step": 567 }, { "epoch": 63.0, "eval_accuracy": 0.29423264907135877, "eval_loss": 5.896124362945557, "eval_runtime": 4.4857, "eval_samples_per_second": 0.223, "eval_steps_per_second": 0.223, "step": 567 }, { "epoch": 64.0, "learning_rate": 6.800000000000001e-07, "loss": 6.0986, "step": 576 }, { "epoch": 64.0, "eval_accuracy": 0.29423264907135877, "eval_loss": 5.891510486602783, "eval_runtime": 4.4644, "eval_samples_per_second": 0.224, "eval_steps_per_second": 0.224, "step": 576 }, { "epoch": 65.0, "learning_rate": 6.75e-07, "loss": 6.089, "step": 585 }, { "epoch": 65.0, "eval_accuracy": 0.29521016617790813, "eval_loss": 5.885429382324219, "eval_runtime": 3.9204, "eval_samples_per_second": 0.255, "eval_steps_per_second": 0.255, "step": 585 }, { "epoch": 66.0, "learning_rate": 6.7e-07, "loss": 6.0734, "step": 594 }, { "epoch": 66.0, "eval_accuracy": 0.29423264907135877, "eval_loss": 5.880984783172607, "eval_runtime": 4.2715, "eval_samples_per_second": 0.234, "eval_steps_per_second": 0.234, "step": 594 }, { "epoch": 67.0, "learning_rate": 6.65e-07, "loss": 6.0905, "step": 603 }, { "epoch": 67.0, "eval_accuracy": 0.29423264907135877, "eval_loss": 5.876211166381836, "eval_runtime": 4.4141, "eval_samples_per_second": 0.227, "eval_steps_per_second": 0.227, "step": 603 }, { "epoch": 68.0, "learning_rate": 6.6e-07, "loss": 6.0701, "step": 612 }, { "epoch": 68.0, "eval_accuracy": 0.2961876832844575, "eval_loss": 5.873714447021484, "eval_runtime": 4.3879, "eval_samples_per_second": 0.228, "eval_steps_per_second": 0.228, "step": 612 }, { "epoch": 69.0, "learning_rate": 6.55e-07, "loss": 6.0595, "step": 621 }, { "epoch": 69.0, "eval_accuracy": 0.2961876832844575, "eval_loss": 5.869368553161621, "eval_runtime": 4.2313, "eval_samples_per_second": 0.236, "eval_steps_per_second": 0.236, "step": 621 }, { "epoch": 70.0, "learning_rate": 6.5e-07, "loss": 6.0616, "step": 630 }, { "epoch": 70.0, "eval_accuracy": 0.2961876832844575, "eval_loss": 5.8661274909973145, "eval_runtime": 4.1245, "eval_samples_per_second": 0.242, "eval_steps_per_second": 0.242, "step": 630 }, { "epoch": 71.0, "learning_rate": 6.45e-07, "loss": 6.0512, "step": 639 }, { "epoch": 71.0, "eval_accuracy": 0.2961876832844575, "eval_loss": 5.863522052764893, "eval_runtime": 4.2876, "eval_samples_per_second": 0.233, "eval_steps_per_second": 0.233, "step": 639 }, { "epoch": 72.0, "learning_rate": 6.4e-07, "loss": 6.0415, "step": 648 }, { "epoch": 72.0, "eval_accuracy": 0.2961876832844575, "eval_loss": 5.861262321472168, "eval_runtime": 4.932, "eval_samples_per_second": 0.203, "eval_steps_per_second": 0.203, "step": 648 }, { "epoch": 73.0, "learning_rate": 6.35e-07, "loss": 6.0391, "step": 657 }, { "epoch": 73.0, "eval_accuracy": 0.2961876832844575, "eval_loss": 5.858279228210449, "eval_runtime": 4.4327, "eval_samples_per_second": 0.226, "eval_steps_per_second": 0.226, "step": 657 }, { "epoch": 74.0, "learning_rate": 6.3e-07, "loss": 6.032, "step": 666 }, { "epoch": 74.0, "eval_accuracy": 0.2961876832844575, "eval_loss": 5.854796886444092, "eval_runtime": 4.4253, "eval_samples_per_second": 0.226, "eval_steps_per_second": 0.226, "step": 666 }, { "epoch": 75.0, "learning_rate": 6.249999999999999e-07, "loss": 6.0317, "step": 675 }, { "epoch": 75.0, "eval_accuracy": 0.2961876832844575, "eval_loss": 5.851076126098633, "eval_runtime": 4.5175, "eval_samples_per_second": 0.221, "eval_steps_per_second": 0.221, "step": 675 }, { "epoch": 76.0, "learning_rate": 6.2e-07, "loss": 6.0343, "step": 684 }, { "epoch": 76.0, "eval_accuracy": 0.29716520039100686, "eval_loss": 5.847884178161621, "eval_runtime": 4.5177, "eval_samples_per_second": 0.221, "eval_steps_per_second": 0.221, "step": 684 }, { "epoch": 77.0, "learning_rate": 6.149999999999999e-07, "loss": 6.0156, "step": 693 }, { "epoch": 77.0, "eval_accuracy": 0.2981427174975562, "eval_loss": 5.843489646911621, "eval_runtime": 4.7261, "eval_samples_per_second": 0.212, "eval_steps_per_second": 0.212, "step": 693 }, { "epoch": 78.0, "learning_rate": 6.1e-07, "loss": 6.0167, "step": 702 }, { "epoch": 78.0, "eval_accuracy": 0.2981427174975562, "eval_loss": 5.8403215408325195, "eval_runtime": 4.3157, "eval_samples_per_second": 0.232, "eval_steps_per_second": 0.232, "step": 702 }, { "epoch": 79.0, "learning_rate": 6.049999999999999e-07, "loss": 6.0052, "step": 711 }, { "epoch": 79.0, "eval_accuracy": 0.2981427174975562, "eval_loss": 5.83635950088501, "eval_runtime": 4.3221, "eval_samples_per_second": 0.231, "eval_steps_per_second": 0.231, "step": 711 }, { "epoch": 80.0, "learning_rate": 6e-07, "loss": 6.0057, "step": 720 }, { "epoch": 80.0, "eval_accuracy": 0.2981427174975562, "eval_loss": 5.8336262702941895, "eval_runtime": 4.8317, "eval_samples_per_second": 0.207, "eval_steps_per_second": 0.207, "step": 720 }, { "epoch": 81.0, "learning_rate": 5.949999999999999e-07, "loss": 6.0001, "step": 729 }, { "epoch": 81.0, "eval_accuracy": 0.2981427174975562, "eval_loss": 5.8319411277771, "eval_runtime": 4.5299, "eval_samples_per_second": 0.221, "eval_steps_per_second": 0.221, "step": 729 }, { "epoch": 82.0, "learning_rate": 5.9e-07, "loss": 6.001, "step": 738 }, { "epoch": 82.0, "eval_accuracy": 0.2991202346041056, "eval_loss": 5.828815937042236, "eval_runtime": 4.4118, "eval_samples_per_second": 0.227, "eval_steps_per_second": 0.227, "step": 738 }, { "epoch": 83.0, "learning_rate": 5.849999999999999e-07, "loss": 5.9905, "step": 747 }, { "epoch": 83.0, "eval_accuracy": 0.3010752688172043, "eval_loss": 5.826550006866455, "eval_runtime": 4.3318, "eval_samples_per_second": 0.231, "eval_steps_per_second": 0.231, "step": 747 }, { "epoch": 84.0, "learning_rate": 5.8e-07, "loss": 5.9906, "step": 756 }, { "epoch": 84.0, "eval_accuracy": 0.3020527859237537, "eval_loss": 5.824170112609863, "eval_runtime": 4.4876, "eval_samples_per_second": 0.223, "eval_steps_per_second": 0.223, "step": 756 }, { "epoch": 85.0, "learning_rate": 5.749999999999999e-07, "loss": 5.9862, "step": 765 }, { "epoch": 85.0, "eval_accuracy": 0.3020527859237537, "eval_loss": 5.821638584136963, "eval_runtime": 4.7768, "eval_samples_per_second": 0.209, "eval_steps_per_second": 0.209, "step": 765 }, { "epoch": 86.0, "learning_rate": 5.699999999999999e-07, "loss": 5.9829, "step": 774 }, { "epoch": 86.0, "eval_accuracy": 0.30498533724340177, "eval_loss": 5.819102764129639, "eval_runtime": 4.2697, "eval_samples_per_second": 0.234, "eval_steps_per_second": 0.234, "step": 774 }, { "epoch": 87.0, "learning_rate": 5.649999999999999e-07, "loss": 5.9725, "step": 783 }, { "epoch": 87.0, "eval_accuracy": 0.3069403714565005, "eval_loss": 5.817332744598389, "eval_runtime": 4.3341, "eval_samples_per_second": 0.231, "eval_steps_per_second": 0.231, "step": 783 }, { "epoch": 88.0, "learning_rate": 5.6e-07, "loss": 5.9795, "step": 792 }, { "epoch": 88.0, "eval_accuracy": 0.3088954056695992, "eval_loss": 5.814181804656982, "eval_runtime": 4.759, "eval_samples_per_second": 0.21, "eval_steps_per_second": 0.21, "step": 792 }, { "epoch": 89.0, "learning_rate": 5.55e-07, "loss": 5.9695, "step": 801 }, { "epoch": 89.0, "eval_accuracy": 0.3069403714565005, "eval_loss": 5.811527729034424, "eval_runtime": 4.3738, "eval_samples_per_second": 0.229, "eval_steps_per_second": 0.229, "step": 801 }, { "epoch": 90.0, "learning_rate": 5.5e-07, "loss": 5.9607, "step": 810 }, { "epoch": 90.0, "eval_accuracy": 0.3088954056695992, "eval_loss": 5.808058261871338, "eval_runtime": 3.4007, "eval_samples_per_second": 0.294, "eval_steps_per_second": 0.294, "step": 810 }, { "epoch": 91.0, "learning_rate": 5.45e-07, "loss": 5.9605, "step": 819 }, { "epoch": 91.0, "eval_accuracy": 0.3088954056695992, "eval_loss": 5.805708885192871, "eval_runtime": 3.8195, "eval_samples_per_second": 0.262, "eval_steps_per_second": 0.262, "step": 819 }, { "epoch": 92.0, "learning_rate": 5.4e-07, "loss": 5.9591, "step": 828 }, { "epoch": 92.0, "eval_accuracy": 0.30791788856304986, "eval_loss": 5.80366849899292, "eval_runtime": 4.5002, "eval_samples_per_second": 0.222, "eval_steps_per_second": 0.222, "step": 828 }, { "epoch": 93.0, "learning_rate": 5.35e-07, "loss": 5.9481, "step": 837 }, { "epoch": 93.0, "eval_accuracy": 0.3069403714565005, "eval_loss": 5.801074981689453, "eval_runtime": 4.3082, "eval_samples_per_second": 0.232, "eval_steps_per_second": 0.232, "step": 837 }, { "epoch": 94.0, "learning_rate": 5.3e-07, "loss": 5.9501, "step": 846 }, { "epoch": 94.0, "eval_accuracy": 0.30791788856304986, "eval_loss": 5.798312664031982, "eval_runtime": 4.9824, "eval_samples_per_second": 0.201, "eval_steps_per_second": 0.201, "step": 846 }, { "epoch": 95.0, "learning_rate": 5.25e-07, "loss": 5.948, "step": 855 }, { "epoch": 95.0, "eval_accuracy": 0.3088954056695992, "eval_loss": 5.794301509857178, "eval_runtime": 3.717, "eval_samples_per_second": 0.269, "eval_steps_per_second": 0.269, "step": 855 }, { "epoch": 96.0, "learning_rate": 5.2e-07, "loss": 5.9488, "step": 864 }, { "epoch": 96.0, "eval_accuracy": 0.31085043988269795, "eval_loss": 5.790666580200195, "eval_runtime": 3.5092, "eval_samples_per_second": 0.285, "eval_steps_per_second": 0.285, "step": 864 }, { "epoch": 97.0, "learning_rate": 5.149999999999999e-07, "loss": 5.9449, "step": 873 }, { "epoch": 97.0, "eval_accuracy": 0.3118279569892473, "eval_loss": 5.788764476776123, "eval_runtime": 4.3467, "eval_samples_per_second": 0.23, "eval_steps_per_second": 0.23, "step": 873 }, { "epoch": 98.0, "learning_rate": 5.1e-07, "loss": 5.9357, "step": 882 }, { "epoch": 98.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.787247180938721, "eval_runtime": 4.2514, "eval_samples_per_second": 0.235, "eval_steps_per_second": 0.235, "step": 882 }, { "epoch": 99.0, "learning_rate": 5.049999999999999e-07, "loss": 5.9363, "step": 891 }, { "epoch": 99.0, "eval_accuracy": 0.30791788856304986, "eval_loss": 5.783356666564941, "eval_runtime": 3.6762, "eval_samples_per_second": 0.272, "eval_steps_per_second": 0.272, "step": 891 }, { "epoch": 100.0, "learning_rate": 5e-07, "loss": 5.9368, "step": 900 }, { "epoch": 100.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.781695365905762, "eval_runtime": 4.5979, "eval_samples_per_second": 0.217, "eval_steps_per_second": 0.217, "step": 900 }, { "epoch": 101.0, "learning_rate": 4.95e-07, "loss": 5.9215, "step": 909 }, { "epoch": 101.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.779068946838379, "eval_runtime": 4.3432, "eval_samples_per_second": 0.23, "eval_steps_per_second": 0.23, "step": 909 }, { "epoch": 102.0, "learning_rate": 4.9e-07, "loss": 5.9264, "step": 918 }, { "epoch": 102.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.77487325668335, "eval_runtime": 3.8835, "eval_samples_per_second": 0.258, "eval_steps_per_second": 0.258, "step": 918 }, { "epoch": 103.0, "learning_rate": 4.85e-07, "loss": 5.9126, "step": 927 }, { "epoch": 103.0, "eval_accuracy": 0.3088954056695992, "eval_loss": 5.772488594055176, "eval_runtime": 4.0354, "eval_samples_per_second": 0.248, "eval_steps_per_second": 0.248, "step": 927 }, { "epoch": 104.0, "learning_rate": 4.8e-07, "loss": 5.9163, "step": 936 }, { "epoch": 104.0, "eval_accuracy": 0.3069403714565005, "eval_loss": 5.769767761230469, "eval_runtime": 3.6621, "eval_samples_per_second": 0.273, "eval_steps_per_second": 0.273, "step": 936 }, { "epoch": 105.0, "learning_rate": 4.7499999999999995e-07, "loss": 5.9183, "step": 945 }, { "epoch": 105.0, "eval_accuracy": 0.3069403714565005, "eval_loss": 5.767302989959717, "eval_runtime": 4.3085, "eval_samples_per_second": 0.232, "eval_steps_per_second": 0.232, "step": 945 }, { "epoch": 106.0, "learning_rate": 4.6999999999999995e-07, "loss": 5.9154, "step": 954 }, { "epoch": 106.0, "eval_accuracy": 0.3088954056695992, "eval_loss": 5.763917446136475, "eval_runtime": 4.2316, "eval_samples_per_second": 0.236, "eval_steps_per_second": 0.236, "step": 954 }, { "epoch": 107.0, "learning_rate": 4.65e-07, "loss": 5.9149, "step": 963 }, { "epoch": 107.0, "eval_accuracy": 0.30791788856304986, "eval_loss": 5.76121187210083, "eval_runtime": 4.5023, "eval_samples_per_second": 0.222, "eval_steps_per_second": 0.222, "step": 963 }, { "epoch": 108.0, "learning_rate": 4.6e-07, "loss": 5.9082, "step": 972 }, { "epoch": 108.0, "eval_accuracy": 0.3088954056695992, "eval_loss": 5.757934093475342, "eval_runtime": 4.5081, "eval_samples_per_second": 0.222, "eval_steps_per_second": 0.222, "step": 972 }, { "epoch": 109.0, "learning_rate": 4.55e-07, "loss": 5.9051, "step": 981 }, { "epoch": 109.0, "eval_accuracy": 0.30791788856304986, "eval_loss": 5.755875110626221, "eval_runtime": 4.207, "eval_samples_per_second": 0.238, "eval_steps_per_second": 0.238, "step": 981 }, { "epoch": 110.0, "learning_rate": 4.5e-07, "loss": 5.908, "step": 990 }, { "epoch": 110.0, "eval_accuracy": 0.30791788856304986, "eval_loss": 5.753073692321777, "eval_runtime": 4.5603, "eval_samples_per_second": 0.219, "eval_steps_per_second": 0.219, "step": 990 }, { "epoch": 111.0, "learning_rate": 4.45e-07, "loss": 5.901, "step": 999 }, { "epoch": 111.0, "eval_accuracy": 0.30791788856304986, "eval_loss": 5.750200271606445, "eval_runtime": 4.3073, "eval_samples_per_second": 0.232, "eval_steps_per_second": 0.232, "step": 999 }, { "epoch": 112.0, "learning_rate": 4.3999999999999997e-07, "loss": 5.9064, "step": 1008 }, { "epoch": 112.0, "eval_accuracy": 0.3088954056695992, "eval_loss": 5.748438835144043, "eval_runtime": 4.0892, "eval_samples_per_second": 0.245, "eval_steps_per_second": 0.245, "step": 1008 }, { "epoch": 113.0, "learning_rate": 4.3499999999999996e-07, "loss": 5.895, "step": 1017 }, { "epoch": 113.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.745972156524658, "eval_runtime": 3.6273, "eval_samples_per_second": 0.276, "eval_steps_per_second": 0.276, "step": 1017 }, { "epoch": 114.0, "learning_rate": 4.2999999999999996e-07, "loss": 5.894, "step": 1026 }, { "epoch": 114.0, "eval_accuracy": 0.3088954056695992, "eval_loss": 5.743812561035156, "eval_runtime": 3.373, "eval_samples_per_second": 0.296, "eval_steps_per_second": 0.296, "step": 1026 }, { "epoch": 115.0, "learning_rate": 4.2499999999999995e-07, "loss": 5.8809, "step": 1035 }, { "epoch": 115.0, "eval_accuracy": 0.30791788856304986, "eval_loss": 5.741939067840576, "eval_runtime": 3.9496, "eval_samples_per_second": 0.253, "eval_steps_per_second": 0.253, "step": 1035 }, { "epoch": 116.0, "learning_rate": 4.1999999999999995e-07, "loss": 5.8893, "step": 1044 }, { "epoch": 116.0, "eval_accuracy": 0.30791788856304986, "eval_loss": 5.74192476272583, "eval_runtime": 4.7772, "eval_samples_per_second": 0.209, "eval_steps_per_second": 0.209, "step": 1044 }, { "epoch": 117.0, "learning_rate": 4.1499999999999994e-07, "loss": 5.8874, "step": 1053 }, { "epoch": 117.0, "eval_accuracy": 0.3069403714565005, "eval_loss": 5.739171504974365, "eval_runtime": 4.0083, "eval_samples_per_second": 0.249, "eval_steps_per_second": 0.249, "step": 1053 }, { "epoch": 118.0, "learning_rate": 4.0999999999999994e-07, "loss": 5.8798, "step": 1062 }, { "epoch": 118.0, "eval_accuracy": 0.3088954056695992, "eval_loss": 5.73491907119751, "eval_runtime": 4.1773, "eval_samples_per_second": 0.239, "eval_steps_per_second": 0.239, "step": 1062 }, { "epoch": 119.0, "learning_rate": 4.05e-07, "loss": 5.8826, "step": 1071 }, { "epoch": 119.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.732446193695068, "eval_runtime": 4.4141, "eval_samples_per_second": 0.227, "eval_steps_per_second": 0.227, "step": 1071 }, { "epoch": 120.0, "learning_rate": 4e-07, "loss": 5.8736, "step": 1080 }, { "epoch": 120.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.729852199554443, "eval_runtime": 4.4117, "eval_samples_per_second": 0.227, "eval_steps_per_second": 0.227, "step": 1080 }, { "epoch": 121.0, "learning_rate": 3.95e-07, "loss": 5.8751, "step": 1089 }, { "epoch": 121.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.726986408233643, "eval_runtime": 4.9388, "eval_samples_per_second": 0.202, "eval_steps_per_second": 0.202, "step": 1089 }, { "epoch": 122.0, "learning_rate": 3.8999999999999997e-07, "loss": 5.8699, "step": 1098 }, { "epoch": 122.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.72526216506958, "eval_runtime": 4.0541, "eval_samples_per_second": 0.247, "eval_steps_per_second": 0.247, "step": 1098 }, { "epoch": 123.0, "learning_rate": 3.8499999999999997e-07, "loss": 5.8802, "step": 1107 }, { "epoch": 123.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.723138809204102, "eval_runtime": 4.1908, "eval_samples_per_second": 0.239, "eval_steps_per_second": 0.239, "step": 1107 }, { "epoch": 124.0, "learning_rate": 3.7999999999999996e-07, "loss": 5.8707, "step": 1116 }, { "epoch": 124.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.7239861488342285, "eval_runtime": 4.4694, "eval_samples_per_second": 0.224, "eval_steps_per_second": 0.224, "step": 1116 }, { "epoch": 125.0, "learning_rate": 3.75e-07, "loss": 5.8653, "step": 1125 }, { "epoch": 125.0, "eval_accuracy": 0.31085043988269795, "eval_loss": 5.723176002502441, "eval_runtime": 4.2543, "eval_samples_per_second": 0.235, "eval_steps_per_second": 0.235, "step": 1125 }, { "epoch": 126.0, "learning_rate": 3.7e-07, "loss": 5.8693, "step": 1134 }, { "epoch": 126.0, "eval_accuracy": 0.31085043988269795, "eval_loss": 5.7180495262146, "eval_runtime": 4.3031, "eval_samples_per_second": 0.232, "eval_steps_per_second": 0.232, "step": 1134 }, { "epoch": 127.0, "learning_rate": 3.65e-07, "loss": 5.8662, "step": 1143 }, { "epoch": 127.0, "eval_accuracy": 0.31085043988269795, "eval_loss": 5.714703559875488, "eval_runtime": 4.1167, "eval_samples_per_second": 0.243, "eval_steps_per_second": 0.243, "step": 1143 }, { "epoch": 128.0, "learning_rate": 3.6e-07, "loss": 5.8539, "step": 1152 }, { "epoch": 128.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.713204383850098, "eval_runtime": 4.0648, "eval_samples_per_second": 0.246, "eval_steps_per_second": 0.246, "step": 1152 }, { "epoch": 129.0, "learning_rate": 3.55e-07, "loss": 5.8611, "step": 1161 }, { "epoch": 129.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.712745189666748, "eval_runtime": 4.0643, "eval_samples_per_second": 0.246, "eval_steps_per_second": 0.246, "step": 1161 }, { "epoch": 130.0, "learning_rate": 3.5e-07, "loss": 5.8495, "step": 1170 }, { "epoch": 130.0, "eval_accuracy": 0.30791788856304986, "eval_loss": 5.713523864746094, "eval_runtime": 4.0466, "eval_samples_per_second": 0.247, "eval_steps_per_second": 0.247, "step": 1170 }, { "epoch": 131.0, "learning_rate": 3.45e-07, "loss": 5.8602, "step": 1179 }, { "epoch": 131.0, "eval_accuracy": 0.3088954056695992, "eval_loss": 5.711104393005371, "eval_runtime": 3.9889, "eval_samples_per_second": 0.251, "eval_steps_per_second": 0.251, "step": 1179 }, { "epoch": 132.0, "learning_rate": 3.4000000000000003e-07, "loss": 5.8512, "step": 1188 }, { "epoch": 132.0, "eval_accuracy": 0.31085043988269795, "eval_loss": 5.7077484130859375, "eval_runtime": 3.6772, "eval_samples_per_second": 0.272, "eval_steps_per_second": 0.272, "step": 1188 }, { "epoch": 133.0, "learning_rate": 3.35e-07, "loss": 5.8493, "step": 1197 }, { "epoch": 133.0, "eval_accuracy": 0.31085043988269795, "eval_loss": 5.704965591430664, "eval_runtime": 4.3621, "eval_samples_per_second": 0.229, "eval_steps_per_second": 0.229, "step": 1197 }, { "epoch": 134.0, "learning_rate": 3.3e-07, "loss": 5.8477, "step": 1206 }, { "epoch": 134.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.704052925109863, "eval_runtime": 4.4972, "eval_samples_per_second": 0.222, "eval_steps_per_second": 0.222, "step": 1206 }, { "epoch": 135.0, "learning_rate": 3.25e-07, "loss": 5.8464, "step": 1215 }, { "epoch": 135.0, "eval_accuracy": 0.3088954056695992, "eval_loss": 5.704152584075928, "eval_runtime": 3.8106, "eval_samples_per_second": 0.262, "eval_steps_per_second": 0.262, "step": 1215 }, { "epoch": 136.0, "learning_rate": 3.2e-07, "loss": 5.8459, "step": 1224 }, { "epoch": 136.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.702281951904297, "eval_runtime": 4.7548, "eval_samples_per_second": 0.21, "eval_steps_per_second": 0.21, "step": 1224 }, { "epoch": 137.0, "learning_rate": 3.15e-07, "loss": 5.8475, "step": 1233 }, { "epoch": 137.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.7000274658203125, "eval_runtime": 4.1681, "eval_samples_per_second": 0.24, "eval_steps_per_second": 0.24, "step": 1233 }, { "epoch": 138.0, "learning_rate": 3.1e-07, "loss": 5.8384, "step": 1242 }, { "epoch": 138.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.698208332061768, "eval_runtime": 4.6023, "eval_samples_per_second": 0.217, "eval_steps_per_second": 0.217, "step": 1242 }, { "epoch": 139.0, "learning_rate": 3.05e-07, "loss": 5.8453, "step": 1251 }, { "epoch": 139.0, "eval_accuracy": 0.3088954056695992, "eval_loss": 5.697631359100342, "eval_runtime": 3.8429, "eval_samples_per_second": 0.26, "eval_steps_per_second": 0.26, "step": 1251 }, { "epoch": 140.0, "learning_rate": 3e-07, "loss": 5.8441, "step": 1260 }, { "epoch": 140.0, "eval_accuracy": 0.3088954056695992, "eval_loss": 5.697261810302734, "eval_runtime": 4.2789, "eval_samples_per_second": 0.234, "eval_steps_per_second": 0.234, "step": 1260 }, { "epoch": 141.0, "learning_rate": 2.95e-07, "loss": 5.838, "step": 1269 }, { "epoch": 141.0, "eval_accuracy": 0.30791788856304986, "eval_loss": 5.697081565856934, "eval_runtime": 4.6381, "eval_samples_per_second": 0.216, "eval_steps_per_second": 0.216, "step": 1269 }, { "epoch": 142.0, "learning_rate": 2.9e-07, "loss": 5.8463, "step": 1278 }, { "epoch": 142.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.695047855377197, "eval_runtime": 4.2254, "eval_samples_per_second": 0.237, "eval_steps_per_second": 0.237, "step": 1278 }, { "epoch": 143.0, "learning_rate": 2.8499999999999997e-07, "loss": 5.8385, "step": 1287 }, { "epoch": 143.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.692078113555908, "eval_runtime": 3.9314, "eval_samples_per_second": 0.254, "eval_steps_per_second": 0.254, "step": 1287 }, { "epoch": 144.0, "learning_rate": 2.8e-07, "loss": 5.8354, "step": 1296 }, { "epoch": 144.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.69093656539917, "eval_runtime": 4.4756, "eval_samples_per_second": 0.223, "eval_steps_per_second": 0.223, "step": 1296 }, { "epoch": 145.0, "learning_rate": 2.75e-07, "loss": 5.8283, "step": 1305 }, { "epoch": 145.0, "eval_accuracy": 0.30791788856304986, "eval_loss": 5.690831184387207, "eval_runtime": 4.2673, "eval_samples_per_second": 0.234, "eval_steps_per_second": 0.234, "step": 1305 }, { "epoch": 146.0, "learning_rate": 2.7e-07, "loss": 5.8363, "step": 1314 }, { "epoch": 146.0, "eval_accuracy": 0.30791788856304986, "eval_loss": 5.6901655197143555, "eval_runtime": 4.2991, "eval_samples_per_second": 0.233, "eval_steps_per_second": 0.233, "step": 1314 }, { "epoch": 147.0, "learning_rate": 2.65e-07, "loss": 5.8433, "step": 1323 }, { "epoch": 147.0, "eval_accuracy": 0.30791788856304986, "eval_loss": 5.689047813415527, "eval_runtime": 4.5899, "eval_samples_per_second": 0.218, "eval_steps_per_second": 0.218, "step": 1323 }, { "epoch": 148.0, "learning_rate": 2.6e-07, "loss": 5.8302, "step": 1332 }, { "epoch": 148.0, "eval_accuracy": 0.30791788856304986, "eval_loss": 5.689029216766357, "eval_runtime": 4.2306, "eval_samples_per_second": 0.236, "eval_steps_per_second": 0.236, "step": 1332 }, { "epoch": 149.0, "learning_rate": 2.55e-07, "loss": 5.8276, "step": 1341 }, { "epoch": 149.0, "eval_accuracy": 0.3088954056695992, "eval_loss": 5.688091278076172, "eval_runtime": 4.1426, "eval_samples_per_second": 0.241, "eval_steps_per_second": 0.241, "step": 1341 }, { "epoch": 150.0, "learning_rate": 2.5e-07, "loss": 5.8366, "step": 1350 }, { "epoch": 150.0, "eval_accuracy": 0.3088954056695992, "eval_loss": 5.686432838439941, "eval_runtime": 4.4478, "eval_samples_per_second": 0.225, "eval_steps_per_second": 0.225, "step": 1350 }, { "epoch": 151.0, "learning_rate": 2.45e-07, "loss": 5.826, "step": 1359 }, { "epoch": 151.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.685244083404541, "eval_runtime": 4.0347, "eval_samples_per_second": 0.248, "eval_steps_per_second": 0.248, "step": 1359 }, { "epoch": 152.0, "learning_rate": 2.4e-07, "loss": 5.8293, "step": 1368 }, { "epoch": 152.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.68437385559082, "eval_runtime": 3.7004, "eval_samples_per_second": 0.27, "eval_steps_per_second": 0.27, "step": 1368 }, { "epoch": 153.0, "learning_rate": 2.3499999999999997e-07, "loss": 5.8278, "step": 1377 }, { "epoch": 153.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.683380603790283, "eval_runtime": 4.0294, "eval_samples_per_second": 0.248, "eval_steps_per_second": 0.248, "step": 1377 }, { "epoch": 154.0, "learning_rate": 2.3e-07, "loss": 5.8239, "step": 1386 }, { "epoch": 154.0, "eval_accuracy": 0.3088954056695992, "eval_loss": 5.683019638061523, "eval_runtime": 4.1012, "eval_samples_per_second": 0.244, "eval_steps_per_second": 0.244, "step": 1386 }, { "epoch": 155.0, "learning_rate": 2.25e-07, "loss": 5.8262, "step": 1395 }, { "epoch": 155.0, "eval_accuracy": 0.3088954056695992, "eval_loss": 5.681789875030518, "eval_runtime": 4.3436, "eval_samples_per_second": 0.23, "eval_steps_per_second": 0.23, "step": 1395 }, { "epoch": 156.0, "learning_rate": 2.1999999999999998e-07, "loss": 5.8253, "step": 1404 }, { "epoch": 156.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.680798530578613, "eval_runtime": 3.5916, "eval_samples_per_second": 0.278, "eval_steps_per_second": 0.278, "step": 1404 }, { "epoch": 157.0, "learning_rate": 2.1499999999999998e-07, "loss": 5.8169, "step": 1413 }, { "epoch": 157.0, "eval_accuracy": 0.31085043988269795, "eval_loss": 5.679258346557617, "eval_runtime": 5.377, "eval_samples_per_second": 0.186, "eval_steps_per_second": 0.186, "step": 1413 }, { "epoch": 158.0, "learning_rate": 2.0999999999999997e-07, "loss": 5.8201, "step": 1422 }, { "epoch": 158.0, "eval_accuracy": 0.31085043988269795, "eval_loss": 5.679547309875488, "eval_runtime": 3.4728, "eval_samples_per_second": 0.288, "eval_steps_per_second": 0.288, "step": 1422 }, { "epoch": 159.0, "learning_rate": 2.0499999999999997e-07, "loss": 5.8077, "step": 1431 }, { "epoch": 159.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.6798624992370605, "eval_runtime": 4.4591, "eval_samples_per_second": 0.224, "eval_steps_per_second": 0.224, "step": 1431 }, { "epoch": 160.0, "learning_rate": 2e-07, "loss": 5.8222, "step": 1440 }, { "epoch": 160.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.678928852081299, "eval_runtime": 3.9956, "eval_samples_per_second": 0.25, "eval_steps_per_second": 0.25, "step": 1440 }, { "epoch": 161.0, "learning_rate": 1.9499999999999999e-07, "loss": 5.8191, "step": 1449 }, { "epoch": 161.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.677757263183594, "eval_runtime": 3.5865, "eval_samples_per_second": 0.279, "eval_steps_per_second": 0.279, "step": 1449 }, { "epoch": 162.0, "learning_rate": 1.8999999999999998e-07, "loss": 5.83, "step": 1458 }, { "epoch": 162.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.676819801330566, "eval_runtime": 4.7244, "eval_samples_per_second": 0.212, "eval_steps_per_second": 0.212, "step": 1458 }, { "epoch": 163.0, "learning_rate": 1.85e-07, "loss": 5.8183, "step": 1467 }, { "epoch": 163.0, "eval_accuracy": 0.31085043988269795, "eval_loss": 5.675673484802246, "eval_runtime": 4.423, "eval_samples_per_second": 0.226, "eval_steps_per_second": 0.226, "step": 1467 }, { "epoch": 164.0, "learning_rate": 1.8e-07, "loss": 5.8124, "step": 1476 }, { "epoch": 164.0, "eval_accuracy": 0.31085043988269795, "eval_loss": 5.674676418304443, "eval_runtime": 4.507, "eval_samples_per_second": 0.222, "eval_steps_per_second": 0.222, "step": 1476 }, { "epoch": 165.0, "learning_rate": 1.75e-07, "loss": 5.8119, "step": 1485 }, { "epoch": 165.0, "eval_accuracy": 0.31085043988269795, "eval_loss": 5.674499034881592, "eval_runtime": 4.5793, "eval_samples_per_second": 0.218, "eval_steps_per_second": 0.218, "step": 1485 }, { "epoch": 166.0, "learning_rate": 1.7000000000000001e-07, "loss": 5.821, "step": 1494 }, { "epoch": 166.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.674384117126465, "eval_runtime": 4.1895, "eval_samples_per_second": 0.239, "eval_steps_per_second": 0.239, "step": 1494 }, { "epoch": 167.0, "learning_rate": 1.65e-07, "loss": 5.807, "step": 1503 }, { "epoch": 167.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.6736602783203125, "eval_runtime": 3.6058, "eval_samples_per_second": 0.277, "eval_steps_per_second": 0.277, "step": 1503 }, { "epoch": 168.0, "learning_rate": 1.6e-07, "loss": 5.8177, "step": 1512 }, { "epoch": 168.0, "eval_accuracy": 0.31085043988269795, "eval_loss": 5.672515392303467, "eval_runtime": 4.5742, "eval_samples_per_second": 0.219, "eval_steps_per_second": 0.219, "step": 1512 }, { "epoch": 169.0, "learning_rate": 1.55e-07, "loss": 5.8046, "step": 1521 }, { "epoch": 169.0, "eval_accuracy": 0.31085043988269795, "eval_loss": 5.671017169952393, "eval_runtime": 3.599, "eval_samples_per_second": 0.278, "eval_steps_per_second": 0.278, "step": 1521 }, { "epoch": 170.0, "learning_rate": 1.5e-07, "loss": 5.8093, "step": 1530 }, { "epoch": 170.0, "eval_accuracy": 0.31085043988269795, "eval_loss": 5.670760154724121, "eval_runtime": 3.7495, "eval_samples_per_second": 0.267, "eval_steps_per_second": 0.267, "step": 1530 }, { "epoch": 171.0, "learning_rate": 1.45e-07, "loss": 5.8145, "step": 1539 }, { "epoch": 171.0, "eval_accuracy": 0.31085043988269795, "eval_loss": 5.6709699630737305, "eval_runtime": 4.4875, "eval_samples_per_second": 0.223, "eval_steps_per_second": 0.223, "step": 1539 }, { "epoch": 172.0, "learning_rate": 1.4e-07, "loss": 5.803, "step": 1548 }, { "epoch": 172.0, "eval_accuracy": 0.31085043988269795, "eval_loss": 5.67042350769043, "eval_runtime": 4.2512, "eval_samples_per_second": 0.235, "eval_steps_per_second": 0.235, "step": 1548 }, { "epoch": 173.0, "learning_rate": 1.35e-07, "loss": 5.8038, "step": 1557 }, { "epoch": 173.0, "eval_accuracy": 0.31085043988269795, "eval_loss": 5.6697306632995605, "eval_runtime": 3.7575, "eval_samples_per_second": 0.266, "eval_steps_per_second": 0.266, "step": 1557 }, { "epoch": 174.0, "learning_rate": 1.3e-07, "loss": 5.807, "step": 1566 }, { "epoch": 174.0, "eval_accuracy": 0.31085043988269795, "eval_loss": 5.668872356414795, "eval_runtime": 3.653, "eval_samples_per_second": 0.274, "eval_steps_per_second": 0.274, "step": 1566 }, { "epoch": 175.0, "learning_rate": 1.25e-07, "loss": 5.7974, "step": 1575 }, { "epoch": 175.0, "eval_accuracy": 0.31085043988269795, "eval_loss": 5.668323993682861, "eval_runtime": 3.3471, "eval_samples_per_second": 0.299, "eval_steps_per_second": 0.299, "step": 1575 }, { "epoch": 176.0, "learning_rate": 1.2e-07, "loss": 5.8089, "step": 1584 }, { "epoch": 176.0, "eval_accuracy": 0.31085043988269795, "eval_loss": 5.66883659362793, "eval_runtime": 3.7471, "eval_samples_per_second": 0.267, "eval_steps_per_second": 0.267, "step": 1584 }, { "epoch": 177.0, "learning_rate": 1.15e-07, "loss": 5.8067, "step": 1593 }, { "epoch": 177.0, "eval_accuracy": 0.31085043988269795, "eval_loss": 5.669277667999268, "eval_runtime": 3.325, "eval_samples_per_second": 0.301, "eval_steps_per_second": 0.301, "step": 1593 }, { "epoch": 178.0, "learning_rate": 1.0999999999999999e-07, "loss": 5.8092, "step": 1602 }, { "epoch": 178.0, "eval_accuracy": 0.31085043988269795, "eval_loss": 5.669507026672363, "eval_runtime": 3.3536, "eval_samples_per_second": 0.298, "eval_steps_per_second": 0.298, "step": 1602 }, { "epoch": 179.0, "learning_rate": 1.0499999999999999e-07, "loss": 5.8047, "step": 1611 }, { "epoch": 179.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.668713092803955, "eval_runtime": 4.0451, "eval_samples_per_second": 0.247, "eval_steps_per_second": 0.247, "step": 1611 }, { "epoch": 180.0, "learning_rate": 1e-07, "loss": 5.8007, "step": 1620 }, { "epoch": 180.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.667922019958496, "eval_runtime": 4.4531, "eval_samples_per_second": 0.225, "eval_steps_per_second": 0.225, "step": 1620 }, { "epoch": 181.0, "learning_rate": 9.499999999999999e-08, "loss": 5.8041, "step": 1629 }, { "epoch": 181.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.667240619659424, "eval_runtime": 4.9291, "eval_samples_per_second": 0.203, "eval_steps_per_second": 0.203, "step": 1629 }, { "epoch": 182.0, "learning_rate": 9e-08, "loss": 5.8072, "step": 1638 }, { "epoch": 182.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.666664123535156, "eval_runtime": 3.1517, "eval_samples_per_second": 0.317, "eval_steps_per_second": 0.317, "step": 1638 }, { "epoch": 183.0, "learning_rate": 8.500000000000001e-08, "loss": 5.8093, "step": 1647 }, { "epoch": 183.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.66619873046875, "eval_runtime": 3.528, "eval_samples_per_second": 0.283, "eval_steps_per_second": 0.283, "step": 1647 }, { "epoch": 184.0, "learning_rate": 8e-08, "loss": 5.7948, "step": 1656 }, { "epoch": 184.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.665794372558594, "eval_runtime": 3.2036, "eval_samples_per_second": 0.312, "eval_steps_per_second": 0.312, "step": 1656 }, { "epoch": 185.0, "learning_rate": 7.5e-08, "loss": 5.7968, "step": 1665 }, { "epoch": 185.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.665581703186035, "eval_runtime": 4.064, "eval_samples_per_second": 0.246, "eval_steps_per_second": 0.246, "step": 1665 }, { "epoch": 186.0, "learning_rate": 7e-08, "loss": 5.8033, "step": 1674 }, { "epoch": 186.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.665287494659424, "eval_runtime": 3.7071, "eval_samples_per_second": 0.27, "eval_steps_per_second": 0.27, "step": 1674 }, { "epoch": 187.0, "learning_rate": 6.5e-08, "loss": 5.8031, "step": 1683 }, { "epoch": 187.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.665118217468262, "eval_runtime": 3.8502, "eval_samples_per_second": 0.26, "eval_steps_per_second": 0.26, "step": 1683 }, { "epoch": 188.0, "learning_rate": 6e-08, "loss": 5.7953, "step": 1692 }, { "epoch": 188.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.664963722229004, "eval_runtime": 3.5099, "eval_samples_per_second": 0.285, "eval_steps_per_second": 0.285, "step": 1692 }, { "epoch": 189.0, "learning_rate": 5.4999999999999996e-08, "loss": 5.8085, "step": 1701 }, { "epoch": 189.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.6647257804870605, "eval_runtime": 3.1431, "eval_samples_per_second": 0.318, "eval_steps_per_second": 0.318, "step": 1701 }, { "epoch": 190.0, "learning_rate": 5e-08, "loss": 5.8021, "step": 1710 }, { "epoch": 190.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.664588928222656, "eval_runtime": 3.9583, "eval_samples_per_second": 0.253, "eval_steps_per_second": 0.253, "step": 1710 }, { "epoch": 191.0, "learning_rate": 4.5e-08, "loss": 5.7995, "step": 1719 }, { "epoch": 191.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.664261817932129, "eval_runtime": 3.9361, "eval_samples_per_second": 0.254, "eval_steps_per_second": 0.254, "step": 1719 }, { "epoch": 192.0, "learning_rate": 4e-08, "loss": 5.8057, "step": 1728 }, { "epoch": 192.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.664216041564941, "eval_runtime": 3.9765, "eval_samples_per_second": 0.251, "eval_steps_per_second": 0.251, "step": 1728 }, { "epoch": 193.0, "learning_rate": 3.5e-08, "loss": 5.7989, "step": 1737 }, { "epoch": 193.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.664193153381348, "eval_runtime": 3.0391, "eval_samples_per_second": 0.329, "eval_steps_per_second": 0.329, "step": 1737 }, { "epoch": 194.0, "learning_rate": 3e-08, "loss": 5.7977, "step": 1746 }, { "epoch": 194.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.664215564727783, "eval_runtime": 3.3818, "eval_samples_per_second": 0.296, "eval_steps_per_second": 0.296, "step": 1746 }, { "epoch": 195.0, "learning_rate": 2.5e-08, "loss": 5.8009, "step": 1755 }, { "epoch": 195.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.664393901824951, "eval_runtime": 3.3657, "eval_samples_per_second": 0.297, "eval_steps_per_second": 0.297, "step": 1755 }, { "epoch": 196.0, "learning_rate": 2e-08, "loss": 5.7988, "step": 1764 }, { "epoch": 196.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.6645026206970215, "eval_runtime": 3.9312, "eval_samples_per_second": 0.254, "eval_steps_per_second": 0.254, "step": 1764 }, { "epoch": 197.0, "learning_rate": 1.5e-08, "loss": 5.8016, "step": 1773 }, { "epoch": 197.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.664463043212891, "eval_runtime": 3.7271, "eval_samples_per_second": 0.268, "eval_steps_per_second": 0.268, "step": 1773 }, { "epoch": 198.0, "learning_rate": 1e-08, "loss": 5.7929, "step": 1782 }, { "epoch": 198.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.664458751678467, "eval_runtime": 3.5464, "eval_samples_per_second": 0.282, "eval_steps_per_second": 0.282, "step": 1782 }, { "epoch": 199.0, "learning_rate": 5e-09, "loss": 5.7973, "step": 1791 }, { "epoch": 199.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.664454936981201, "eval_runtime": 3.6778, "eval_samples_per_second": 0.272, "eval_steps_per_second": 0.272, "step": 1791 }, { "epoch": 200.0, "learning_rate": 0.0, "loss": 5.8022, "step": 1800 }, { "epoch": 200.0, "eval_accuracy": 0.3098729227761486, "eval_loss": 5.664452075958252, "eval_runtime": 3.5398, "eval_samples_per_second": 0.283, "eval_steps_per_second": 0.283, "step": 1800 }, { "epoch": 200.0, "step": 1800, "total_flos": 7834231111680000.0, "train_loss": 6.335995424058702, "train_runtime": 30730.8013, "train_samples_per_second": 0.059, "train_steps_per_second": 0.059 } ], "logging_steps": 500, "max_steps": 1800, "num_input_tokens_seen": 0, "num_train_epochs": 200, "save_steps": 500, "total_flos": 7834231111680000.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }