{ "best_metric": 1.2840522527694702, "best_model_checkpoint": "./results/checkpoint-1500", "epoch": 0.6, "eval_steps": 4, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 2.9951999999999997e-07, "loss": 2.6285, "step": 4 }, { "epoch": 0.0, "eval_loss": 2.4697508811950684, "eval_runtime": 0.485, "eval_samples_per_second": 8.248, "eval_steps_per_second": 2.062, "step": 4 }, { "epoch": 0.0, "learning_rate": 2.9904e-07, "loss": 2.6222, "step": 8 }, { "epoch": 0.0, "eval_loss": 2.465975284576416, "eval_runtime": 0.6323, "eval_samples_per_second": 6.326, "eval_steps_per_second": 1.582, "step": 8 }, { "epoch": 0.0, "learning_rate": 2.9856e-07, "loss": 2.6536, "step": 12 }, { "epoch": 0.0, "eval_loss": 2.460374116897583, "eval_runtime": 0.6478, "eval_samples_per_second": 6.175, "eval_steps_per_second": 1.544, "step": 12 }, { "epoch": 0.01, "learning_rate": 2.9808e-07, "loss": 2.6785, "step": 16 }, { "epoch": 0.01, "eval_loss": 2.4556970596313477, "eval_runtime": 0.6653, "eval_samples_per_second": 6.012, "eval_steps_per_second": 1.503, "step": 16 }, { "epoch": 0.01, "learning_rate": 2.9759999999999996e-07, "loss": 2.6085, "step": 20 }, { "epoch": 0.01, "eval_loss": 2.4514715671539307, "eval_runtime": 0.5241, "eval_samples_per_second": 7.632, "eval_steps_per_second": 1.908, "step": 20 }, { "epoch": 0.01, "learning_rate": 2.9711999999999995e-07, "loss": 2.5907, "step": 24 }, { "epoch": 0.01, "eval_loss": 2.4462974071502686, "eval_runtime": 0.4689, "eval_samples_per_second": 8.53, "eval_steps_per_second": 2.133, "step": 24 }, { "epoch": 0.01, "learning_rate": 2.9664e-07, "loss": 2.5942, "step": 28 }, { "epoch": 0.01, "eval_loss": 2.4415194988250732, "eval_runtime": 0.4829, "eval_samples_per_second": 8.284, "eval_steps_per_second": 2.071, "step": 28 }, { "epoch": 0.01, "learning_rate": 2.9615999999999997e-07, "loss": 2.6101, "step": 32 }, { "epoch": 0.01, "eval_loss": 2.437161922454834, "eval_runtime": 0.4715, "eval_samples_per_second": 8.483, "eval_steps_per_second": 2.121, "step": 32 }, { "epoch": 0.01, "learning_rate": 2.9568e-07, "loss": 2.5827, "step": 36 }, { "epoch": 0.01, "eval_loss": 2.432689666748047, "eval_runtime": 0.4938, "eval_samples_per_second": 8.1, "eval_steps_per_second": 2.025, "step": 36 }, { "epoch": 0.02, "learning_rate": 2.952e-07, "loss": 2.5729, "step": 40 }, { "epoch": 0.02, "eval_loss": 2.4281153678894043, "eval_runtime": 0.5021, "eval_samples_per_second": 7.966, "eval_steps_per_second": 1.991, "step": 40 }, { "epoch": 0.02, "learning_rate": 2.9472e-07, "loss": 2.5856, "step": 44 }, { "epoch": 0.02, "eval_loss": 2.423053741455078, "eval_runtime": 0.593, "eval_samples_per_second": 6.746, "eval_steps_per_second": 1.686, "step": 44 }, { "epoch": 0.02, "learning_rate": 2.9423999999999997e-07, "loss": 2.589, "step": 48 }, { "epoch": 0.02, "eval_loss": 2.418571949005127, "eval_runtime": 0.6933, "eval_samples_per_second": 5.77, "eval_steps_per_second": 1.442, "step": 48 }, { "epoch": 0.02, "learning_rate": 2.9375999999999995e-07, "loss": 2.6483, "step": 52 }, { "epoch": 0.02, "eval_loss": 2.414531946182251, "eval_runtime": 0.7167, "eval_samples_per_second": 5.581, "eval_steps_per_second": 1.395, "step": 52 }, { "epoch": 0.02, "learning_rate": 2.9328e-07, "loss": 2.517, "step": 56 }, { "epoch": 0.02, "eval_loss": 2.409538745880127, "eval_runtime": 0.4826, "eval_samples_per_second": 8.289, "eval_steps_per_second": 2.072, "step": 56 }, { "epoch": 0.02, "learning_rate": 2.928e-07, "loss": 2.5987, "step": 60 }, { "epoch": 0.02, "eval_loss": 2.4050426483154297, "eval_runtime": 0.4757, "eval_samples_per_second": 8.409, "eval_steps_per_second": 2.102, "step": 60 }, { "epoch": 0.03, "learning_rate": 2.9232e-07, "loss": 2.5489, "step": 64 }, { "epoch": 0.03, "eval_loss": 2.400360107421875, "eval_runtime": 0.4945, "eval_samples_per_second": 8.089, "eval_steps_per_second": 2.022, "step": 64 }, { "epoch": 0.03, "learning_rate": 2.9184e-07, "loss": 2.5063, "step": 68 }, { "epoch": 0.03, "eval_loss": 2.396500587463379, "eval_runtime": 0.5, "eval_samples_per_second": 8.001, "eval_steps_per_second": 2.0, "step": 68 }, { "epoch": 0.03, "learning_rate": 2.9136e-07, "loss": 2.5867, "step": 72 }, { "epoch": 0.03, "eval_loss": 2.3916146755218506, "eval_runtime": 0.4602, "eval_samples_per_second": 8.693, "eval_steps_per_second": 2.173, "step": 72 }, { "epoch": 0.03, "learning_rate": 2.9087999999999997e-07, "loss": 2.544, "step": 76 }, { "epoch": 0.03, "eval_loss": 2.3873047828674316, "eval_runtime": 0.4731, "eval_samples_per_second": 8.456, "eval_steps_per_second": 2.114, "step": 76 }, { "epoch": 0.03, "learning_rate": 2.9039999999999995e-07, "loss": 2.5596, "step": 80 }, { "epoch": 0.03, "eval_loss": 2.382803440093994, "eval_runtime": 0.6092, "eval_samples_per_second": 6.566, "eval_steps_per_second": 1.642, "step": 80 }, { "epoch": 0.03, "learning_rate": 2.8992e-07, "loss": 2.5744, "step": 84 }, { "epoch": 0.03, "eval_loss": 2.3786380290985107, "eval_runtime": 0.7212, "eval_samples_per_second": 5.546, "eval_steps_per_second": 1.387, "step": 84 }, { "epoch": 0.04, "learning_rate": 2.8944e-07, "loss": 2.5588, "step": 88 }, { "epoch": 0.04, "eval_loss": 2.374176502227783, "eval_runtime": 0.6826, "eval_samples_per_second": 5.86, "eval_steps_per_second": 1.465, "step": 88 }, { "epoch": 0.04, "learning_rate": 2.8895999999999996e-07, "loss": 2.5579, "step": 92 }, { "epoch": 0.04, "eval_loss": 2.3702104091644287, "eval_runtime": 0.4896, "eval_samples_per_second": 8.169, "eval_steps_per_second": 2.042, "step": 92 }, { "epoch": 0.04, "learning_rate": 2.8848e-07, "loss": 2.5245, "step": 96 }, { "epoch": 0.04, "eval_loss": 2.3660218715667725, "eval_runtime": 0.4764, "eval_samples_per_second": 8.397, "eval_steps_per_second": 2.099, "step": 96 }, { "epoch": 0.04, "learning_rate": 2.88e-07, "loss": 2.5132, "step": 100 }, { "epoch": 0.04, "eval_loss": 2.36110520362854, "eval_runtime": 0.4799, "eval_samples_per_second": 8.335, "eval_steps_per_second": 2.084, "step": 100 }, { "epoch": 0.04, "learning_rate": 2.8751999999999997e-07, "loss": 2.5037, "step": 104 }, { "epoch": 0.04, "eval_loss": 2.3570125102996826, "eval_runtime": 0.4722, "eval_samples_per_second": 8.47, "eval_steps_per_second": 2.118, "step": 104 }, { "epoch": 0.04, "learning_rate": 2.8704e-07, "loss": 2.4727, "step": 108 }, { "epoch": 0.04, "eval_loss": 2.3530666828155518, "eval_runtime": 0.467, "eval_samples_per_second": 8.565, "eval_steps_per_second": 2.141, "step": 108 }, { "epoch": 0.04, "learning_rate": 2.8656e-07, "loss": 2.4709, "step": 112 }, { "epoch": 0.04, "eval_loss": 2.348759412765503, "eval_runtime": 0.501, "eval_samples_per_second": 7.984, "eval_steps_per_second": 1.996, "step": 112 }, { "epoch": 0.05, "learning_rate": 2.8608e-07, "loss": 2.4711, "step": 116 }, { "epoch": 0.05, "eval_loss": 2.344454050064087, "eval_runtime": 0.6607, "eval_samples_per_second": 6.054, "eval_steps_per_second": 1.513, "step": 116 }, { "epoch": 0.05, "learning_rate": 2.8559999999999996e-07, "loss": 2.5445, "step": 120 }, { "epoch": 0.05, "eval_loss": 2.3402156829833984, "eval_runtime": 0.704, "eval_samples_per_second": 5.682, "eval_steps_per_second": 1.42, "step": 120 }, { "epoch": 0.05, "learning_rate": 2.8512e-07, "loss": 2.4994, "step": 124 }, { "epoch": 0.05, "eval_loss": 2.3362019062042236, "eval_runtime": 0.6849, "eval_samples_per_second": 5.84, "eval_steps_per_second": 1.46, "step": 124 }, { "epoch": 0.05, "learning_rate": 2.8464e-07, "loss": 2.5036, "step": 128 }, { "epoch": 0.05, "eval_loss": 2.3319339752197266, "eval_runtime": 0.4864, "eval_samples_per_second": 8.223, "eval_steps_per_second": 2.056, "step": 128 }, { "epoch": 0.05, "learning_rate": 2.8416e-07, "loss": 2.5525, "step": 132 }, { "epoch": 0.05, "eval_loss": 2.3276522159576416, "eval_runtime": 0.4783, "eval_samples_per_second": 8.364, "eval_steps_per_second": 2.091, "step": 132 }, { "epoch": 0.05, "learning_rate": 2.8368e-07, "loss": 2.5245, "step": 136 }, { "epoch": 0.05, "eval_loss": 2.3241090774536133, "eval_runtime": 0.4805, "eval_samples_per_second": 8.324, "eval_steps_per_second": 2.081, "step": 136 }, { "epoch": 0.06, "learning_rate": 2.832e-07, "loss": 2.4946, "step": 140 }, { "epoch": 0.06, "eval_loss": 2.3198165893554688, "eval_runtime": 0.473, "eval_samples_per_second": 8.457, "eval_steps_per_second": 2.114, "step": 140 }, { "epoch": 0.06, "learning_rate": 2.8272e-07, "loss": 2.5142, "step": 144 }, { "epoch": 0.06, "eval_loss": 2.3152613639831543, "eval_runtime": 0.4858, "eval_samples_per_second": 8.234, "eval_steps_per_second": 2.058, "step": 144 }, { "epoch": 0.06, "learning_rate": 2.8223999999999997e-07, "loss": 2.4639, "step": 148 }, { "epoch": 0.06, "eval_loss": 2.3112645149230957, "eval_runtime": 0.488, "eval_samples_per_second": 8.196, "eval_steps_per_second": 2.049, "step": 148 }, { "epoch": 0.06, "learning_rate": 2.8176e-07, "loss": 2.4796, "step": 152 }, { "epoch": 0.06, "eval_loss": 2.307020902633667, "eval_runtime": 0.6163, "eval_samples_per_second": 6.49, "eval_steps_per_second": 1.623, "step": 152 }, { "epoch": 0.06, "learning_rate": 2.8128e-07, "loss": 2.4529, "step": 156 }, { "epoch": 0.06, "eval_loss": 2.303062915802002, "eval_runtime": 0.6764, "eval_samples_per_second": 5.913, "eval_steps_per_second": 1.478, "step": 156 }, { "epoch": 0.06, "learning_rate": 2.808e-07, "loss": 2.4823, "step": 160 }, { "epoch": 0.06, "eval_loss": 2.2993311882019043, "eval_runtime": 0.6854, "eval_samples_per_second": 5.836, "eval_steps_per_second": 1.459, "step": 160 }, { "epoch": 0.07, "learning_rate": 2.8032e-07, "loss": 2.4439, "step": 164 }, { "epoch": 0.07, "eval_loss": 2.2947850227355957, "eval_runtime": 0.4745, "eval_samples_per_second": 8.429, "eval_steps_per_second": 2.107, "step": 164 }, { "epoch": 0.07, "learning_rate": 2.7984e-07, "loss": 2.4652, "step": 168 }, { "epoch": 0.07, "eval_loss": 2.2908992767333984, "eval_runtime": 0.4759, "eval_samples_per_second": 8.406, "eval_steps_per_second": 2.101, "step": 168 }, { "epoch": 0.07, "learning_rate": 2.7936e-07, "loss": 2.4574, "step": 172 }, { "epoch": 0.07, "eval_loss": 2.2867026329040527, "eval_runtime": 0.4973, "eval_samples_per_second": 8.043, "eval_steps_per_second": 2.011, "step": 172 }, { "epoch": 0.07, "learning_rate": 2.7887999999999997e-07, "loss": 2.4557, "step": 176 }, { "epoch": 0.07, "eval_loss": 2.283027172088623, "eval_runtime": 0.4719, "eval_samples_per_second": 8.477, "eval_steps_per_second": 2.119, "step": 176 }, { "epoch": 0.07, "learning_rate": 2.784e-07, "loss": 2.4462, "step": 180 }, { "epoch": 0.07, "eval_loss": 2.2787420749664307, "eval_runtime": 0.472, "eval_samples_per_second": 8.474, "eval_steps_per_second": 2.119, "step": 180 }, { "epoch": 0.07, "learning_rate": 2.7792e-07, "loss": 2.3962, "step": 184 }, { "epoch": 0.07, "eval_loss": 2.2745461463928223, "eval_runtime": 0.6328, "eval_samples_per_second": 6.322, "eval_steps_per_second": 1.58, "step": 184 }, { "epoch": 0.08, "learning_rate": 2.7744e-07, "loss": 2.3666, "step": 188 }, { "epoch": 0.08, "eval_loss": 2.2705912590026855, "eval_runtime": 0.6375, "eval_samples_per_second": 6.274, "eval_steps_per_second": 1.569, "step": 188 }, { "epoch": 0.08, "learning_rate": 2.7696e-07, "loss": 2.5024, "step": 192 }, { "epoch": 0.08, "eval_loss": 2.266995906829834, "eval_runtime": 0.6984, "eval_samples_per_second": 5.727, "eval_steps_per_second": 1.432, "step": 192 }, { "epoch": 0.08, "learning_rate": 2.7648e-07, "loss": 2.4419, "step": 196 }, { "epoch": 0.08, "eval_loss": 2.2626519203186035, "eval_runtime": 0.7334, "eval_samples_per_second": 5.454, "eval_steps_per_second": 1.363, "step": 196 }, { "epoch": 0.08, "learning_rate": 2.76e-07, "loss": 2.4246, "step": 200 }, { "epoch": 0.08, "eval_loss": 2.2583603858947754, "eval_runtime": 0.48, "eval_samples_per_second": 8.333, "eval_steps_per_second": 2.083, "step": 200 }, { "epoch": 0.08, "learning_rate": 2.7551999999999997e-07, "loss": 2.3853, "step": 204 }, { "epoch": 0.08, "eval_loss": 2.2551512718200684, "eval_runtime": 0.4939, "eval_samples_per_second": 8.098, "eval_steps_per_second": 2.025, "step": 204 }, { "epoch": 0.08, "learning_rate": 2.7503999999999995e-07, "loss": 2.4032, "step": 208 }, { "epoch": 0.08, "eval_loss": 2.251105785369873, "eval_runtime": 0.46, "eval_samples_per_second": 8.695, "eval_steps_per_second": 2.174, "step": 208 }, { "epoch": 0.08, "learning_rate": 2.7456e-07, "loss": 2.4444, "step": 212 }, { "epoch": 0.08, "eval_loss": 2.247025489807129, "eval_runtime": 0.4948, "eval_samples_per_second": 8.084, "eval_steps_per_second": 2.021, "step": 212 }, { "epoch": 0.09, "learning_rate": 2.7408e-07, "loss": 2.2932, "step": 216 }, { "epoch": 0.09, "eval_loss": 2.242764472961426, "eval_runtime": 0.4897, "eval_samples_per_second": 8.168, "eval_steps_per_second": 2.042, "step": 216 }, { "epoch": 0.09, "learning_rate": 2.736e-07, "loss": 2.3929, "step": 220 }, { "epoch": 0.09, "eval_loss": 2.2391483783721924, "eval_runtime": 0.6128, "eval_samples_per_second": 6.528, "eval_steps_per_second": 1.632, "step": 220 }, { "epoch": 0.09, "learning_rate": 2.7312e-07, "loss": 2.4112, "step": 224 }, { "epoch": 0.09, "eval_loss": 2.234977960586548, "eval_runtime": 0.648, "eval_samples_per_second": 6.172, "eval_steps_per_second": 1.543, "step": 224 }, { "epoch": 0.09, "learning_rate": 2.7264e-07, "loss": 2.4191, "step": 228 }, { "epoch": 0.09, "eval_loss": 2.231099843978882, "eval_runtime": 0.6862, "eval_samples_per_second": 5.829, "eval_steps_per_second": 1.457, "step": 228 }, { "epoch": 0.09, "learning_rate": 2.7215999999999997e-07, "loss": 2.4408, "step": 232 }, { "epoch": 0.09, "eval_loss": 2.2272462844848633, "eval_runtime": 0.7076, "eval_samples_per_second": 5.653, "eval_steps_per_second": 1.413, "step": 232 }, { "epoch": 0.09, "learning_rate": 2.7167999999999996e-07, "loss": 2.3884, "step": 236 }, { "epoch": 0.09, "eval_loss": 2.223376750946045, "eval_runtime": 0.5169, "eval_samples_per_second": 7.738, "eval_steps_per_second": 1.935, "step": 236 }, { "epoch": 0.1, "learning_rate": 2.712e-07, "loss": 2.3689, "step": 240 }, { "epoch": 0.1, "eval_loss": 2.2195653915405273, "eval_runtime": 0.4793, "eval_samples_per_second": 8.346, "eval_steps_per_second": 2.086, "step": 240 }, { "epoch": 0.1, "learning_rate": 2.7072e-07, "loss": 2.3689, "step": 244 }, { "epoch": 0.1, "eval_loss": 2.2153775691986084, "eval_runtime": 0.4771, "eval_samples_per_second": 8.384, "eval_steps_per_second": 2.096, "step": 244 }, { "epoch": 0.1, "learning_rate": 2.7024e-07, "loss": 2.3249, "step": 248 }, { "epoch": 0.1, "eval_loss": 2.211355209350586, "eval_runtime": 0.4778, "eval_samples_per_second": 8.372, "eval_steps_per_second": 2.093, "step": 248 }, { "epoch": 0.1, "learning_rate": 2.6976e-07, "loss": 2.4286, "step": 252 }, { "epoch": 0.1, "eval_loss": 2.207773208618164, "eval_runtime": 0.4873, "eval_samples_per_second": 8.209, "eval_steps_per_second": 2.052, "step": 252 }, { "epoch": 0.1, "learning_rate": 2.6928e-07, "loss": 2.3497, "step": 256 }, { "epoch": 0.1, "eval_loss": 2.203867197036743, "eval_runtime": 0.6281, "eval_samples_per_second": 6.368, "eval_steps_per_second": 1.592, "step": 256 }, { "epoch": 0.1, "learning_rate": 2.6879999999999997e-07, "loss": 2.284, "step": 260 }, { "epoch": 0.1, "eval_loss": 2.199937582015991, "eval_runtime": 0.6885, "eval_samples_per_second": 5.81, "eval_steps_per_second": 1.452, "step": 260 }, { "epoch": 0.11, "learning_rate": 2.6831999999999996e-07, "loss": 2.3333, "step": 264 }, { "epoch": 0.11, "eval_loss": 2.1958465576171875, "eval_runtime": 0.6799, "eval_samples_per_second": 5.883, "eval_steps_per_second": 1.471, "step": 264 }, { "epoch": 0.11, "learning_rate": 2.6784e-07, "loss": 2.3305, "step": 268 }, { "epoch": 0.11, "eval_loss": 2.192072868347168, "eval_runtime": 0.7165, "eval_samples_per_second": 5.583, "eval_steps_per_second": 1.396, "step": 268 }, { "epoch": 0.11, "learning_rate": 2.6736e-07, "loss": 2.3465, "step": 272 }, { "epoch": 0.11, "eval_loss": 2.1882476806640625, "eval_runtime": 0.485, "eval_samples_per_second": 8.247, "eval_steps_per_second": 2.062, "step": 272 }, { "epoch": 0.11, "learning_rate": 2.6687999999999997e-07, "loss": 2.3274, "step": 276 }, { "epoch": 0.11, "eval_loss": 2.1841320991516113, "eval_runtime": 0.4767, "eval_samples_per_second": 8.391, "eval_steps_per_second": 2.098, "step": 276 }, { "epoch": 0.11, "learning_rate": 2.664e-07, "loss": 2.3641, "step": 280 }, { "epoch": 0.11, "eval_loss": 2.1803271770477295, "eval_runtime": 0.5146, "eval_samples_per_second": 7.774, "eval_steps_per_second": 1.943, "step": 280 }, { "epoch": 0.11, "learning_rate": 2.6592e-07, "loss": 2.3089, "step": 284 }, { "epoch": 0.11, "eval_loss": 2.176274538040161, "eval_runtime": 0.488, "eval_samples_per_second": 8.196, "eval_steps_per_second": 2.049, "step": 284 }, { "epoch": 0.12, "learning_rate": 2.6543999999999997e-07, "loss": 2.2645, "step": 288 }, { "epoch": 0.12, "eval_loss": 2.1720588207244873, "eval_runtime": 0.4973, "eval_samples_per_second": 8.043, "eval_steps_per_second": 2.011, "step": 288 }, { "epoch": 0.12, "learning_rate": 2.6495999999999996e-07, "loss": 2.3439, "step": 292 }, { "epoch": 0.12, "eval_loss": 2.1687240600585938, "eval_runtime": 0.6283, "eval_samples_per_second": 6.366, "eval_steps_per_second": 1.592, "step": 292 }, { "epoch": 0.12, "learning_rate": 2.6448e-07, "loss": 2.3285, "step": 296 }, { "epoch": 0.12, "eval_loss": 2.1649253368377686, "eval_runtime": 0.6996, "eval_samples_per_second": 5.718, "eval_steps_per_second": 1.429, "step": 296 }, { "epoch": 0.12, "learning_rate": 2.64e-07, "loss": 2.3126, "step": 300 }, { "epoch": 0.12, "eval_loss": 2.160398483276367, "eval_runtime": 0.6904, "eval_samples_per_second": 5.794, "eval_steps_per_second": 1.448, "step": 300 }, { "epoch": 0.12, "learning_rate": 2.6351999999999997e-07, "loss": 2.3356, "step": 304 }, { "epoch": 0.12, "eval_loss": 2.1570284366607666, "eval_runtime": 0.4953, "eval_samples_per_second": 8.076, "eval_steps_per_second": 2.019, "step": 304 }, { "epoch": 0.12, "learning_rate": 2.6304e-07, "loss": 2.3396, "step": 308 }, { "epoch": 0.12, "eval_loss": 2.1527013778686523, "eval_runtime": 0.4977, "eval_samples_per_second": 8.037, "eval_steps_per_second": 2.009, "step": 308 }, { "epoch": 0.12, "learning_rate": 2.6256e-07, "loss": 2.2972, "step": 312 }, { "epoch": 0.12, "eval_loss": 2.148724317550659, "eval_runtime": 0.4939, "eval_samples_per_second": 8.099, "eval_steps_per_second": 2.025, "step": 312 }, { "epoch": 0.13, "learning_rate": 2.6208e-07, "loss": 2.3321, "step": 316 }, { "epoch": 0.13, "eval_loss": 2.1449663639068604, "eval_runtime": 0.4784, "eval_samples_per_second": 8.362, "eval_steps_per_second": 2.09, "step": 316 }, { "epoch": 0.13, "learning_rate": 2.616e-07, "loss": 2.3348, "step": 320 }, { "epoch": 0.13, "eval_loss": 2.1414906978607178, "eval_runtime": 0.4949, "eval_samples_per_second": 8.082, "eval_steps_per_second": 2.021, "step": 320 }, { "epoch": 0.13, "learning_rate": 2.6112e-07, "loss": 2.2728, "step": 324 }, { "epoch": 0.13, "eval_loss": 2.1374001502990723, "eval_runtime": 0.6321, "eval_samples_per_second": 6.328, "eval_steps_per_second": 1.582, "step": 324 }, { "epoch": 0.13, "learning_rate": 2.6064e-07, "loss": 2.287, "step": 328 }, { "epoch": 0.13, "eval_loss": 2.1333529949188232, "eval_runtime": 0.6547, "eval_samples_per_second": 6.109, "eval_steps_per_second": 1.527, "step": 328 }, { "epoch": 0.13, "learning_rate": 2.6015999999999997e-07, "loss": 2.2474, "step": 332 }, { "epoch": 0.13, "eval_loss": 2.1297547817230225, "eval_runtime": 0.7093, "eval_samples_per_second": 5.639, "eval_steps_per_second": 1.41, "step": 332 }, { "epoch": 0.13, "learning_rate": 2.5968e-07, "loss": 2.3214, "step": 336 }, { "epoch": 0.13, "eval_loss": 2.126392364501953, "eval_runtime": 0.6909, "eval_samples_per_second": 5.789, "eval_steps_per_second": 1.447, "step": 336 }, { "epoch": 0.14, "learning_rate": 2.592e-07, "loss": 2.2725, "step": 340 }, { "epoch": 0.14, "eval_loss": 2.122309923171997, "eval_runtime": 0.4823, "eval_samples_per_second": 8.293, "eval_steps_per_second": 2.073, "step": 340 }, { "epoch": 0.14, "learning_rate": 2.5872000000000003e-07, "loss": 2.3114, "step": 344 }, { "epoch": 0.14, "eval_loss": 2.118303060531616, "eval_runtime": 0.4954, "eval_samples_per_second": 8.075, "eval_steps_per_second": 2.019, "step": 344 }, { "epoch": 0.14, "learning_rate": 2.5824e-07, "loss": 2.2333, "step": 348 }, { "epoch": 0.14, "eval_loss": 2.114621162414551, "eval_runtime": 0.4856, "eval_samples_per_second": 8.238, "eval_steps_per_second": 2.059, "step": 348 }, { "epoch": 0.14, "learning_rate": 2.5776e-07, "loss": 2.2812, "step": 352 }, { "epoch": 0.14, "eval_loss": 2.11067795753479, "eval_runtime": 0.4778, "eval_samples_per_second": 8.372, "eval_steps_per_second": 2.093, "step": 352 }, { "epoch": 0.14, "learning_rate": 2.5728e-07, "loss": 2.2454, "step": 356 }, { "epoch": 0.14, "eval_loss": 2.106940746307373, "eval_runtime": 0.4945, "eval_samples_per_second": 8.089, "eval_steps_per_second": 2.022, "step": 356 }, { "epoch": 0.14, "learning_rate": 2.5679999999999997e-07, "loss": 2.2261, "step": 360 }, { "epoch": 0.14, "eval_loss": 2.1031668186187744, "eval_runtime": 0.6521, "eval_samples_per_second": 6.134, "eval_steps_per_second": 1.533, "step": 360 }, { "epoch": 0.15, "learning_rate": 2.5632e-07, "loss": 2.2841, "step": 364 }, { "epoch": 0.15, "eval_loss": 2.0989203453063965, "eval_runtime": 0.6249, "eval_samples_per_second": 6.401, "eval_steps_per_second": 1.6, "step": 364 }, { "epoch": 0.15, "learning_rate": 2.5584e-07, "loss": 2.2481, "step": 368 }, { "epoch": 0.15, "eval_loss": 2.095189332962036, "eval_runtime": 0.6855, "eval_samples_per_second": 5.835, "eval_steps_per_second": 1.459, "step": 368 }, { "epoch": 0.15, "learning_rate": 2.5536e-07, "loss": 2.278, "step": 372 }, { "epoch": 0.15, "eval_loss": 2.0912463665008545, "eval_runtime": 0.7393, "eval_samples_per_second": 5.411, "eval_steps_per_second": 1.353, "step": 372 }, { "epoch": 0.15, "learning_rate": 2.5488e-07, "loss": 2.2765, "step": 376 }, { "epoch": 0.15, "eval_loss": 2.087336301803589, "eval_runtime": 0.4793, "eval_samples_per_second": 8.345, "eval_steps_per_second": 2.086, "step": 376 }, { "epoch": 0.15, "learning_rate": 2.544e-07, "loss": 2.2232, "step": 380 }, { "epoch": 0.15, "eval_loss": 2.0833120346069336, "eval_runtime": 0.487, "eval_samples_per_second": 8.214, "eval_steps_per_second": 2.053, "step": 380 }, { "epoch": 0.15, "learning_rate": 2.5392e-07, "loss": 2.306, "step": 384 }, { "epoch": 0.15, "eval_loss": 2.079479932785034, "eval_runtime": 0.4722, "eval_samples_per_second": 8.471, "eval_steps_per_second": 2.118, "step": 384 }, { "epoch": 0.16, "learning_rate": 2.5343999999999997e-07, "loss": 2.2126, "step": 388 }, { "epoch": 0.16, "eval_loss": 2.0760295391082764, "eval_runtime": 0.4958, "eval_samples_per_second": 8.068, "eval_steps_per_second": 2.017, "step": 388 }, { "epoch": 0.16, "learning_rate": 2.5295999999999996e-07, "loss": 2.2557, "step": 392 }, { "epoch": 0.16, "eval_loss": 2.072136402130127, "eval_runtime": 0.469, "eval_samples_per_second": 8.529, "eval_steps_per_second": 2.132, "step": 392 }, { "epoch": 0.16, "learning_rate": 2.5248e-07, "loss": 2.1988, "step": 396 }, { "epoch": 0.16, "eval_loss": 2.0683670043945312, "eval_runtime": 0.6385, "eval_samples_per_second": 6.264, "eval_steps_per_second": 1.566, "step": 396 }, { "epoch": 0.16, "learning_rate": 2.52e-07, "loss": 2.1917, "step": 400 }, { "epoch": 0.16, "eval_loss": 2.0638906955718994, "eval_runtime": 0.6834, "eval_samples_per_second": 5.853, "eval_steps_per_second": 1.463, "step": 400 }, { "epoch": 0.16, "learning_rate": 2.5152e-07, "loss": 2.2479, "step": 404 }, { "epoch": 0.16, "eval_loss": 2.0599253177642822, "eval_runtime": 0.7261, "eval_samples_per_second": 5.509, "eval_steps_per_second": 1.377, "step": 404 }, { "epoch": 0.16, "learning_rate": 2.5104e-07, "loss": 2.1484, "step": 408 }, { "epoch": 0.16, "eval_loss": 2.055751085281372, "eval_runtime": 0.7367, "eval_samples_per_second": 5.429, "eval_steps_per_second": 1.357, "step": 408 }, { "epoch": 0.16, "learning_rate": 2.5056e-07, "loss": 2.1886, "step": 412 }, { "epoch": 0.16, "eval_loss": 2.052119016647339, "eval_runtime": 0.4808, "eval_samples_per_second": 8.319, "eval_steps_per_second": 2.08, "step": 412 }, { "epoch": 0.17, "learning_rate": 2.5007999999999997e-07, "loss": 2.2026, "step": 416 }, { "epoch": 0.17, "eval_loss": 2.0482354164123535, "eval_runtime": 0.4856, "eval_samples_per_second": 8.238, "eval_steps_per_second": 2.059, "step": 416 }, { "epoch": 0.17, "learning_rate": 2.4959999999999996e-07, "loss": 2.1572, "step": 420 }, { "epoch": 0.17, "eval_loss": 2.0441887378692627, "eval_runtime": 0.4779, "eval_samples_per_second": 8.37, "eval_steps_per_second": 2.093, "step": 420 }, { "epoch": 0.17, "learning_rate": 2.4912e-07, "loss": 2.1931, "step": 424 }, { "epoch": 0.17, "eval_loss": 2.0399935245513916, "eval_runtime": 0.4803, "eval_samples_per_second": 8.329, "eval_steps_per_second": 2.082, "step": 424 }, { "epoch": 0.17, "learning_rate": 2.4864e-07, "loss": 2.161, "step": 428 }, { "epoch": 0.17, "eval_loss": 2.03645920753479, "eval_runtime": 0.4924, "eval_samples_per_second": 8.123, "eval_steps_per_second": 2.031, "step": 428 }, { "epoch": 0.17, "learning_rate": 2.4816e-07, "loss": 2.1115, "step": 432 }, { "epoch": 0.17, "eval_loss": 2.032196044921875, "eval_runtime": 0.6345, "eval_samples_per_second": 6.304, "eval_steps_per_second": 1.576, "step": 432 }, { "epoch": 0.17, "learning_rate": 2.4768e-07, "loss": 2.173, "step": 436 }, { "epoch": 0.17, "eval_loss": 2.028397560119629, "eval_runtime": 0.6625, "eval_samples_per_second": 6.038, "eval_steps_per_second": 1.509, "step": 436 }, { "epoch": 0.18, "learning_rate": 2.472e-07, "loss": 2.1491, "step": 440 }, { "epoch": 0.18, "eval_loss": 2.0247464179992676, "eval_runtime": 0.6969, "eval_samples_per_second": 5.74, "eval_steps_per_second": 1.435, "step": 440 }, { "epoch": 0.18, "learning_rate": 2.4672e-07, "loss": 2.1716, "step": 444 }, { "epoch": 0.18, "eval_loss": 2.0203933715820312, "eval_runtime": 0.7311, "eval_samples_per_second": 5.471, "eval_steps_per_second": 1.368, "step": 444 }, { "epoch": 0.18, "learning_rate": 2.4623999999999996e-07, "loss": 2.2031, "step": 448 }, { "epoch": 0.18, "eval_loss": 2.016533374786377, "eval_runtime": 0.4875, "eval_samples_per_second": 8.206, "eval_steps_per_second": 2.051, "step": 448 }, { "epoch": 0.18, "learning_rate": 2.4576e-07, "loss": 2.1466, "step": 452 }, { "epoch": 0.18, "eval_loss": 2.012568473815918, "eval_runtime": 0.4897, "eval_samples_per_second": 8.168, "eval_steps_per_second": 2.042, "step": 452 }, { "epoch": 0.18, "learning_rate": 2.4528e-07, "loss": 2.1384, "step": 456 }, { "epoch": 0.18, "eval_loss": 2.0088417530059814, "eval_runtime": 0.4969, "eval_samples_per_second": 8.05, "eval_steps_per_second": 2.013, "step": 456 }, { "epoch": 0.18, "learning_rate": 2.4479999999999997e-07, "loss": 2.1824, "step": 460 }, { "epoch": 0.18, "eval_loss": 2.0047850608825684, "eval_runtime": 0.4897, "eval_samples_per_second": 8.168, "eval_steps_per_second": 2.042, "step": 460 }, { "epoch": 0.19, "learning_rate": 2.4432e-07, "loss": 2.1401, "step": 464 }, { "epoch": 0.19, "eval_loss": 2.0006463527679443, "eval_runtime": 0.4882, "eval_samples_per_second": 8.193, "eval_steps_per_second": 2.048, "step": 464 }, { "epoch": 0.19, "learning_rate": 2.4384e-07, "loss": 2.2086, "step": 468 }, { "epoch": 0.19, "eval_loss": 1.9969314336776733, "eval_runtime": 0.6612, "eval_samples_per_second": 6.049, "eval_steps_per_second": 1.512, "step": 468 }, { "epoch": 0.19, "learning_rate": 2.4336e-07, "loss": 2.1687, "step": 472 }, { "epoch": 0.19, "eval_loss": 1.9925954341888428, "eval_runtime": 0.6804, "eval_samples_per_second": 5.879, "eval_steps_per_second": 1.47, "step": 472 }, { "epoch": 0.19, "learning_rate": 2.4287999999999996e-07, "loss": 2.145, "step": 476 }, { "epoch": 0.19, "eval_loss": 1.9888066053390503, "eval_runtime": 0.6955, "eval_samples_per_second": 5.752, "eval_steps_per_second": 1.438, "step": 476 }, { "epoch": 0.19, "learning_rate": 2.424e-07, "loss": 2.2007, "step": 480 }, { "epoch": 0.19, "eval_loss": 1.9850127696990967, "eval_runtime": 0.7558, "eval_samples_per_second": 5.292, "eval_steps_per_second": 1.323, "step": 480 }, { "epoch": 0.19, "learning_rate": 2.4192e-07, "loss": 2.1367, "step": 484 }, { "epoch": 0.19, "eval_loss": 1.9808437824249268, "eval_runtime": 0.4706, "eval_samples_per_second": 8.499, "eval_steps_per_second": 2.125, "step": 484 }, { "epoch": 0.2, "learning_rate": 2.4143999999999997e-07, "loss": 2.1291, "step": 488 }, { "epoch": 0.2, "eval_loss": 1.9767786264419556, "eval_runtime": 0.4803, "eval_samples_per_second": 8.327, "eval_steps_per_second": 2.082, "step": 488 }, { "epoch": 0.2, "learning_rate": 2.4096e-07, "loss": 2.1124, "step": 492 }, { "epoch": 0.2, "eval_loss": 1.9728602170944214, "eval_runtime": 0.4802, "eval_samples_per_second": 8.33, "eval_steps_per_second": 2.082, "step": 492 }, { "epoch": 0.2, "learning_rate": 2.4048e-07, "loss": 2.0738, "step": 496 }, { "epoch": 0.2, "eval_loss": 1.968900203704834, "eval_runtime": 0.4884, "eval_samples_per_second": 8.189, "eval_steps_per_second": 2.047, "step": 496 }, { "epoch": 0.2, "learning_rate": 2.4e-07, "loss": 2.1048, "step": 500 }, { "epoch": 0.2, "eval_loss": 1.9646457433700562, "eval_runtime": 0.5026, "eval_samples_per_second": 7.959, "eval_steps_per_second": 1.99, "step": 500 }, { "epoch": 0.2, "learning_rate": 2.3951999999999996e-07, "loss": 2.0995, "step": 504 }, { "epoch": 0.2, "eval_loss": 1.9606600999832153, "eval_runtime": 0.7928, "eval_samples_per_second": 5.045, "eval_steps_per_second": 1.261, "step": 504 }, { "epoch": 0.2, "learning_rate": 2.3903999999999995e-07, "loss": 2.0816, "step": 508 }, { "epoch": 0.2, "eval_loss": 1.956822395324707, "eval_runtime": 0.5321, "eval_samples_per_second": 7.518, "eval_steps_per_second": 1.879, "step": 508 }, { "epoch": 0.2, "learning_rate": 2.3856e-07, "loss": 2.0969, "step": 512 }, { "epoch": 0.2, "eval_loss": 1.9526716470718384, "eval_runtime": 0.5174, "eval_samples_per_second": 7.732, "eval_steps_per_second": 1.933, "step": 512 }, { "epoch": 0.21, "learning_rate": 2.3807999999999997e-07, "loss": 2.1034, "step": 516 }, { "epoch": 0.21, "eval_loss": 1.948419451713562, "eval_runtime": 0.5393, "eval_samples_per_second": 7.418, "eval_steps_per_second": 1.854, "step": 516 }, { "epoch": 0.21, "learning_rate": 2.376e-07, "loss": 2.0654, "step": 520 }, { "epoch": 0.21, "eval_loss": 1.9442145824432373, "eval_runtime": 0.5372, "eval_samples_per_second": 7.446, "eval_steps_per_second": 1.861, "step": 520 }, { "epoch": 0.21, "learning_rate": 2.3712e-07, "loss": 2.1175, "step": 524 }, { "epoch": 0.21, "eval_loss": 1.9403698444366455, "eval_runtime": 0.5129, "eval_samples_per_second": 7.798, "eval_steps_per_second": 1.95, "step": 524 }, { "epoch": 0.21, "learning_rate": 2.3663999999999998e-07, "loss": 2.0829, "step": 528 }, { "epoch": 0.21, "eval_loss": 1.936263084411621, "eval_runtime": 0.7202, "eval_samples_per_second": 5.554, "eval_steps_per_second": 1.388, "step": 528 }, { "epoch": 0.21, "learning_rate": 2.3616e-07, "loss": 2.0973, "step": 532 }, { "epoch": 0.21, "eval_loss": 1.9322115182876587, "eval_runtime": 0.6884, "eval_samples_per_second": 5.81, "eval_steps_per_second": 1.453, "step": 532 }, { "epoch": 0.21, "learning_rate": 2.3567999999999998e-07, "loss": 2.0439, "step": 536 }, { "epoch": 0.21, "eval_loss": 1.927826166152954, "eval_runtime": 0.7779, "eval_samples_per_second": 5.142, "eval_steps_per_second": 1.286, "step": 536 }, { "epoch": 0.22, "learning_rate": 2.352e-07, "loss": 2.0791, "step": 540 }, { "epoch": 0.22, "eval_loss": 1.923945426940918, "eval_runtime": 0.7514, "eval_samples_per_second": 5.323, "eval_steps_per_second": 1.331, "step": 540 }, { "epoch": 0.22, "learning_rate": 2.3471999999999997e-07, "loss": 2.0988, "step": 544 }, { "epoch": 0.22, "eval_loss": 1.9202955961227417, "eval_runtime": 0.5194, "eval_samples_per_second": 7.701, "eval_steps_per_second": 1.925, "step": 544 }, { "epoch": 0.22, "learning_rate": 2.3424e-07, "loss": 2.0179, "step": 548 }, { "epoch": 0.22, "eval_loss": 1.916027307510376, "eval_runtime": 0.5072, "eval_samples_per_second": 7.887, "eval_steps_per_second": 1.972, "step": 548 }, { "epoch": 0.22, "learning_rate": 2.3376e-07, "loss": 2.0452, "step": 552 }, { "epoch": 0.22, "eval_loss": 1.911855697631836, "eval_runtime": 0.5112, "eval_samples_per_second": 7.825, "eval_steps_per_second": 1.956, "step": 552 }, { "epoch": 0.22, "learning_rate": 2.3327999999999998e-07, "loss": 1.9792, "step": 556 }, { "epoch": 0.22, "eval_loss": 1.907868504524231, "eval_runtime": 0.5368, "eval_samples_per_second": 7.452, "eval_steps_per_second": 1.863, "step": 556 }, { "epoch": 0.22, "learning_rate": 2.328e-07, "loss": 1.9862, "step": 560 }, { "epoch": 0.22, "eval_loss": 1.9032366275787354, "eval_runtime": 0.52, "eval_samples_per_second": 7.692, "eval_steps_per_second": 1.923, "step": 560 }, { "epoch": 0.23, "learning_rate": 2.3231999999999998e-07, "loss": 2.0176, "step": 564 }, { "epoch": 0.23, "eval_loss": 1.8994207382202148, "eval_runtime": 0.5141, "eval_samples_per_second": 7.78, "eval_steps_per_second": 1.945, "step": 564 }, { "epoch": 0.23, "learning_rate": 2.3184e-07, "loss": 2.0066, "step": 568 }, { "epoch": 0.23, "eval_loss": 1.8953509330749512, "eval_runtime": 0.7027, "eval_samples_per_second": 5.692, "eval_steps_per_second": 1.423, "step": 568 }, { "epoch": 0.23, "learning_rate": 2.3135999999999998e-07, "loss": 2.0333, "step": 572 }, { "epoch": 0.23, "eval_loss": 1.8914432525634766, "eval_runtime": 0.7279, "eval_samples_per_second": 5.495, "eval_steps_per_second": 1.374, "step": 572 }, { "epoch": 0.23, "learning_rate": 2.3088e-07, "loss": 2.0316, "step": 576 }, { "epoch": 0.23, "eval_loss": 1.8870800733566284, "eval_runtime": 0.7212, "eval_samples_per_second": 5.546, "eval_steps_per_second": 1.386, "step": 576 }, { "epoch": 0.23, "learning_rate": 2.304e-07, "loss": 2.0114, "step": 580 }, { "epoch": 0.23, "eval_loss": 1.8827916383743286, "eval_runtime": 0.6774, "eval_samples_per_second": 5.905, "eval_steps_per_second": 1.476, "step": 580 }, { "epoch": 0.23, "learning_rate": 2.2991999999999998e-07, "loss": 2.0093, "step": 584 }, { "epoch": 0.23, "eval_loss": 1.8788678646087646, "eval_runtime": 0.5185, "eval_samples_per_second": 7.715, "eval_steps_per_second": 1.929, "step": 584 }, { "epoch": 0.24, "learning_rate": 2.2944e-07, "loss": 1.9829, "step": 588 }, { "epoch": 0.24, "eval_loss": 1.8749186992645264, "eval_runtime": 0.5091, "eval_samples_per_second": 7.857, "eval_steps_per_second": 1.964, "step": 588 }, { "epoch": 0.24, "learning_rate": 2.2895999999999998e-07, "loss": 1.971, "step": 592 }, { "epoch": 0.24, "eval_loss": 1.8706499338150024, "eval_runtime": 0.5204, "eval_samples_per_second": 7.687, "eval_steps_per_second": 1.922, "step": 592 }, { "epoch": 0.24, "learning_rate": 2.2848000000000002e-07, "loss": 2.0188, "step": 596 }, { "epoch": 0.24, "eval_loss": 1.8667842149734497, "eval_runtime": 0.5224, "eval_samples_per_second": 7.657, "eval_steps_per_second": 1.914, "step": 596 }, { "epoch": 0.24, "learning_rate": 2.28e-07, "loss": 2.0081, "step": 600 }, { "epoch": 0.24, "eval_loss": 1.8627525568008423, "eval_runtime": 0.5196, "eval_samples_per_second": 7.699, "eval_steps_per_second": 1.925, "step": 600 }, { "epoch": 0.24, "learning_rate": 2.2752e-07, "loss": 2.0014, "step": 604 }, { "epoch": 0.24, "eval_loss": 1.8587167263031006, "eval_runtime": 0.7373, "eval_samples_per_second": 5.425, "eval_steps_per_second": 1.356, "step": 604 }, { "epoch": 0.24, "learning_rate": 2.2704e-07, "loss": 1.9741, "step": 608 }, { "epoch": 0.24, "eval_loss": 1.8543612957000732, "eval_runtime": 0.7492, "eval_samples_per_second": 5.339, "eval_steps_per_second": 1.335, "step": 608 }, { "epoch": 0.24, "learning_rate": 2.2655999999999999e-07, "loss": 1.9828, "step": 612 }, { "epoch": 0.24, "eval_loss": 1.8504937887191772, "eval_runtime": 0.7242, "eval_samples_per_second": 5.524, "eval_steps_per_second": 1.381, "step": 612 }, { "epoch": 0.25, "learning_rate": 2.2608e-07, "loss": 1.9481, "step": 616 }, { "epoch": 0.25, "eval_loss": 1.8463339805603027, "eval_runtime": 0.6997, "eval_samples_per_second": 5.716, "eval_steps_per_second": 1.429, "step": 616 }, { "epoch": 0.25, "learning_rate": 2.2559999999999998e-07, "loss": 1.9584, "step": 620 }, { "epoch": 0.25, "eval_loss": 1.8423882722854614, "eval_runtime": 0.5137, "eval_samples_per_second": 7.787, "eval_steps_per_second": 1.947, "step": 620 }, { "epoch": 0.25, "learning_rate": 2.2511999999999997e-07, "loss": 1.9449, "step": 624 }, { "epoch": 0.25, "eval_loss": 1.838066577911377, "eval_runtime": 0.5091, "eval_samples_per_second": 7.857, "eval_steps_per_second": 1.964, "step": 624 }, { "epoch": 0.25, "learning_rate": 2.2464e-07, "loss": 1.9753, "step": 628 }, { "epoch": 0.25, "eval_loss": 1.8342829942703247, "eval_runtime": 0.504, "eval_samples_per_second": 7.936, "eval_steps_per_second": 1.984, "step": 628 }, { "epoch": 0.25, "learning_rate": 2.2416e-07, "loss": 2.0055, "step": 632 }, { "epoch": 0.25, "eval_loss": 1.8300307989120483, "eval_runtime": 0.5201, "eval_samples_per_second": 7.691, "eval_steps_per_second": 1.923, "step": 632 }, { "epoch": 0.25, "learning_rate": 2.2368e-07, "loss": 1.98, "step": 636 }, { "epoch": 0.25, "eval_loss": 1.8260575532913208, "eval_runtime": 0.5267, "eval_samples_per_second": 7.594, "eval_steps_per_second": 1.898, "step": 636 }, { "epoch": 0.26, "learning_rate": 2.232e-07, "loss": 1.9757, "step": 640 }, { "epoch": 0.26, "eval_loss": 1.8222540616989136, "eval_runtime": 0.7574, "eval_samples_per_second": 5.281, "eval_steps_per_second": 1.32, "step": 640 }, { "epoch": 0.26, "learning_rate": 2.2271999999999997e-07, "loss": 1.9683, "step": 644 }, { "epoch": 0.26, "eval_loss": 1.818216323852539, "eval_runtime": 0.7304, "eval_samples_per_second": 5.476, "eval_steps_per_second": 1.369, "step": 644 }, { "epoch": 0.26, "learning_rate": 2.2223999999999998e-07, "loss": 1.926, "step": 648 }, { "epoch": 0.26, "eval_loss": 1.8140522241592407, "eval_runtime": 0.7453, "eval_samples_per_second": 5.367, "eval_steps_per_second": 1.342, "step": 648 }, { "epoch": 0.26, "learning_rate": 2.2175999999999997e-07, "loss": 1.9454, "step": 652 }, { "epoch": 0.26, "eval_loss": 1.8100805282592773, "eval_runtime": 0.6536, "eval_samples_per_second": 6.12, "eval_steps_per_second": 1.53, "step": 652 }, { "epoch": 0.26, "learning_rate": 2.2128e-07, "loss": 1.9352, "step": 656 }, { "epoch": 0.26, "eval_loss": 1.8059089183807373, "eval_runtime": 0.5193, "eval_samples_per_second": 7.702, "eval_steps_per_second": 1.926, "step": 656 }, { "epoch": 0.26, "learning_rate": 2.208e-07, "loss": 1.8816, "step": 660 }, { "epoch": 0.26, "eval_loss": 1.8020563125610352, "eval_runtime": 0.5265, "eval_samples_per_second": 7.597, "eval_steps_per_second": 1.899, "step": 660 }, { "epoch": 0.27, "learning_rate": 2.2032e-07, "loss": 1.9182, "step": 664 }, { "epoch": 0.27, "eval_loss": 1.7980492115020752, "eval_runtime": 0.5102, "eval_samples_per_second": 7.84, "eval_steps_per_second": 1.96, "step": 664 }, { "epoch": 0.27, "learning_rate": 2.1984e-07, "loss": 1.9659, "step": 668 }, { "epoch": 0.27, "eval_loss": 1.7941217422485352, "eval_runtime": 0.5988, "eval_samples_per_second": 6.681, "eval_steps_per_second": 1.67, "step": 668 }, { "epoch": 0.27, "learning_rate": 2.1935999999999997e-07, "loss": 1.8932, "step": 672 }, { "epoch": 0.27, "eval_loss": 1.7901490926742554, "eval_runtime": 0.5339, "eval_samples_per_second": 7.492, "eval_steps_per_second": 1.873, "step": 672 }, { "epoch": 0.27, "learning_rate": 2.1887999999999999e-07, "loss": 1.8608, "step": 676 }, { "epoch": 0.27, "eval_loss": 1.786109447479248, "eval_runtime": 0.7219, "eval_samples_per_second": 5.541, "eval_steps_per_second": 1.385, "step": 676 }, { "epoch": 0.27, "learning_rate": 2.184e-07, "loss": 1.941, "step": 680 }, { "epoch": 0.27, "eval_loss": 1.7824102640151978, "eval_runtime": 0.7619, "eval_samples_per_second": 5.25, "eval_steps_per_second": 1.313, "step": 680 }, { "epoch": 0.27, "learning_rate": 2.1792e-07, "loss": 1.8854, "step": 684 }, { "epoch": 0.27, "eval_loss": 1.77846097946167, "eval_runtime": 0.7601, "eval_samples_per_second": 5.262, "eval_steps_per_second": 1.316, "step": 684 }, { "epoch": 0.28, "learning_rate": 2.1744e-07, "loss": 1.8912, "step": 688 }, { "epoch": 0.28, "eval_loss": 1.7742952108383179, "eval_runtime": 0.59, "eval_samples_per_second": 6.78, "eval_steps_per_second": 1.695, "step": 688 }, { "epoch": 0.28, "learning_rate": 2.1695999999999998e-07, "loss": 1.8667, "step": 692 }, { "epoch": 0.28, "eval_loss": 1.770714521408081, "eval_runtime": 0.5262, "eval_samples_per_second": 7.601, "eval_steps_per_second": 1.9, "step": 692 }, { "epoch": 0.28, "learning_rate": 2.1648e-07, "loss": 1.912, "step": 696 }, { "epoch": 0.28, "eval_loss": 1.7666008472442627, "eval_runtime": 0.5272, "eval_samples_per_second": 7.587, "eval_steps_per_second": 1.897, "step": 696 }, { "epoch": 0.28, "learning_rate": 2.1599999999999998e-07, "loss": 1.9009, "step": 700 }, { "epoch": 0.28, "eval_loss": 1.7627824544906616, "eval_runtime": 0.5295, "eval_samples_per_second": 7.555, "eval_steps_per_second": 1.889, "step": 700 }, { "epoch": 0.28, "learning_rate": 2.1552000000000001e-07, "loss": 1.906, "step": 704 }, { "epoch": 0.28, "eval_loss": 1.75889253616333, "eval_runtime": 0.5589, "eval_samples_per_second": 7.157, "eval_steps_per_second": 1.789, "step": 704 }, { "epoch": 0.28, "learning_rate": 2.1504e-07, "loss": 1.8671, "step": 708 }, { "epoch": 0.28, "eval_loss": 1.7549973726272583, "eval_runtime": 0.687, "eval_samples_per_second": 5.822, "eval_steps_per_second": 1.456, "step": 708 }, { "epoch": 0.28, "learning_rate": 2.1455999999999998e-07, "loss": 1.8609, "step": 712 }, { "epoch": 0.28, "eval_loss": 1.7507662773132324, "eval_runtime": 0.7225, "eval_samples_per_second": 5.537, "eval_steps_per_second": 1.384, "step": 712 }, { "epoch": 0.29, "learning_rate": 2.1408e-07, "loss": 1.8485, "step": 716 }, { "epoch": 0.29, "eval_loss": 1.746917486190796, "eval_runtime": 0.7954, "eval_samples_per_second": 5.029, "eval_steps_per_second": 1.257, "step": 716 }, { "epoch": 0.29, "learning_rate": 2.1359999999999998e-07, "loss": 1.8334, "step": 720 }, { "epoch": 0.29, "eval_loss": 1.7430514097213745, "eval_runtime": 0.7433, "eval_samples_per_second": 5.381, "eval_steps_per_second": 1.345, "step": 720 }, { "epoch": 0.29, "learning_rate": 2.1312e-07, "loss": 1.8763, "step": 724 }, { "epoch": 0.29, "eval_loss": 1.7392196655273438, "eval_runtime": 0.5237, "eval_samples_per_second": 7.638, "eval_steps_per_second": 1.91, "step": 724 }, { "epoch": 0.29, "learning_rate": 2.1263999999999998e-07, "loss": 1.9005, "step": 728 }, { "epoch": 0.29, "eval_loss": 1.7355214357376099, "eval_runtime": 0.524, "eval_samples_per_second": 7.634, "eval_steps_per_second": 1.908, "step": 728 }, { "epoch": 0.29, "learning_rate": 2.1216000000000002e-07, "loss": 1.8669, "step": 732 }, { "epoch": 0.29, "eval_loss": 1.731513261795044, "eval_runtime": 0.5593, "eval_samples_per_second": 7.152, "eval_steps_per_second": 1.788, "step": 732 }, { "epoch": 0.29, "learning_rate": 2.1168e-07, "loss": 1.8984, "step": 736 }, { "epoch": 0.29, "eval_loss": 1.727636694908142, "eval_runtime": 0.5241, "eval_samples_per_second": 7.632, "eval_steps_per_second": 1.908, "step": 736 }, { "epoch": 0.3, "learning_rate": 2.1119999999999999e-07, "loss": 1.8074, "step": 740 }, { "epoch": 0.3, "eval_loss": 1.7240556478500366, "eval_runtime": 0.715, "eval_samples_per_second": 5.594, "eval_steps_per_second": 1.399, "step": 740 }, { "epoch": 0.3, "learning_rate": 2.1072e-07, "loss": 1.8614, "step": 744 }, { "epoch": 0.3, "eval_loss": 1.7201639413833618, "eval_runtime": 0.7611, "eval_samples_per_second": 5.256, "eval_steps_per_second": 1.314, "step": 744 }, { "epoch": 0.3, "learning_rate": 2.1023999999999998e-07, "loss": 1.8211, "step": 748 }, { "epoch": 0.3, "eval_loss": 1.7165008783340454, "eval_runtime": 0.7193, "eval_samples_per_second": 5.561, "eval_steps_per_second": 1.39, "step": 748 }, { "epoch": 0.3, "learning_rate": 2.0976e-07, "loss": 1.8553, "step": 752 }, { "epoch": 0.3, "eval_loss": 1.7123990058898926, "eval_runtime": 0.5463, "eval_samples_per_second": 7.323, "eval_steps_per_second": 1.831, "step": 752 }, { "epoch": 0.3, "learning_rate": 2.0927999999999998e-07, "loss": 1.7978, "step": 756 }, { "epoch": 0.3, "eval_loss": 1.7084720134735107, "eval_runtime": 0.574, "eval_samples_per_second": 6.968, "eval_steps_per_second": 1.742, "step": 756 }, { "epoch": 0.3, "learning_rate": 2.0879999999999996e-07, "loss": 1.8203, "step": 760 }, { "epoch": 0.3, "eval_loss": 1.7048146724700928, "eval_runtime": 0.5838, "eval_samples_per_second": 6.852, "eval_steps_per_second": 1.713, "step": 760 }, { "epoch": 0.31, "learning_rate": 2.0832e-07, "loss": 1.8192, "step": 764 }, { "epoch": 0.31, "eval_loss": 1.7010469436645508, "eval_runtime": 0.5225, "eval_samples_per_second": 7.656, "eval_steps_per_second": 1.914, "step": 764 }, { "epoch": 0.31, "learning_rate": 2.0784e-07, "loss": 1.8532, "step": 768 }, { "epoch": 0.31, "eval_loss": 1.6973625421524048, "eval_runtime": 0.525, "eval_samples_per_second": 7.619, "eval_steps_per_second": 1.905, "step": 768 }, { "epoch": 0.31, "learning_rate": 2.0736e-07, "loss": 1.8307, "step": 772 }, { "epoch": 0.31, "eval_loss": 1.6935136318206787, "eval_runtime": 0.7235, "eval_samples_per_second": 5.528, "eval_steps_per_second": 1.382, "step": 772 }, { "epoch": 0.31, "learning_rate": 2.0687999999999998e-07, "loss": 1.8207, "step": 776 }, { "epoch": 0.31, "eval_loss": 1.6895670890808105, "eval_runtime": 0.8289, "eval_samples_per_second": 4.826, "eval_steps_per_second": 1.206, "step": 776 }, { "epoch": 0.31, "learning_rate": 2.0639999999999997e-07, "loss": 1.7895, "step": 780 }, { "epoch": 0.31, "eval_loss": 1.6858075857162476, "eval_runtime": 0.7778, "eval_samples_per_second": 5.143, "eval_steps_per_second": 1.286, "step": 780 }, { "epoch": 0.31, "learning_rate": 2.0592e-07, "loss": 1.7976, "step": 784 }, { "epoch": 0.31, "eval_loss": 1.6820955276489258, "eval_runtime": 0.5265, "eval_samples_per_second": 7.597, "eval_steps_per_second": 1.899, "step": 784 }, { "epoch": 0.32, "learning_rate": 2.0544e-07, "loss": 1.814, "step": 788 }, { "epoch": 0.32, "eval_loss": 1.6785138845443726, "eval_runtime": 0.5179, "eval_samples_per_second": 7.724, "eval_steps_per_second": 1.931, "step": 788 }, { "epoch": 0.32, "learning_rate": 2.0496e-07, "loss": 1.7972, "step": 792 }, { "epoch": 0.32, "eval_loss": 1.674804449081421, "eval_runtime": 0.5304, "eval_samples_per_second": 7.541, "eval_steps_per_second": 1.885, "step": 792 }, { "epoch": 0.32, "learning_rate": 2.0448e-07, "loss": 1.8258, "step": 796 }, { "epoch": 0.32, "eval_loss": 1.6713837385177612, "eval_runtime": 0.5336, "eval_samples_per_second": 7.496, "eval_steps_per_second": 1.874, "step": 796 }, { "epoch": 0.32, "learning_rate": 2.04e-07, "loss": 1.79, "step": 800 }, { "epoch": 0.32, "eval_loss": 1.667376160621643, "eval_runtime": 0.7608, "eval_samples_per_second": 5.258, "eval_steps_per_second": 1.314, "step": 800 }, { "epoch": 0.32, "learning_rate": 2.0351999999999999e-07, "loss": 1.802, "step": 804 }, { "epoch": 0.32, "eval_loss": 1.6640408039093018, "eval_runtime": 0.7498, "eval_samples_per_second": 5.335, "eval_steps_per_second": 1.334, "step": 804 }, { "epoch": 0.32, "learning_rate": 2.0303999999999997e-07, "loss": 1.7784, "step": 808 }, { "epoch": 0.32, "eval_loss": 1.6603385210037231, "eval_runtime": 0.7501, "eval_samples_per_second": 5.333, "eval_steps_per_second": 1.333, "step": 808 }, { "epoch": 0.32, "learning_rate": 2.0256e-07, "loss": 1.7671, "step": 812 }, { "epoch": 0.32, "eval_loss": 1.6568516492843628, "eval_runtime": 0.5206, "eval_samples_per_second": 7.684, "eval_steps_per_second": 1.921, "step": 812 }, { "epoch": 0.33, "learning_rate": 2.0208e-07, "loss": 1.7618, "step": 816 }, { "epoch": 0.33, "eval_loss": 1.653469443321228, "eval_runtime": 0.5354, "eval_samples_per_second": 7.472, "eval_steps_per_second": 1.868, "step": 816 }, { "epoch": 0.33, "learning_rate": 2.016e-07, "loss": 1.8207, "step": 820 }, { "epoch": 0.33, "eval_loss": 1.6502578258514404, "eval_runtime": 0.523, "eval_samples_per_second": 7.648, "eval_steps_per_second": 1.912, "step": 820 }, { "epoch": 0.33, "learning_rate": 2.0112e-07, "loss": 1.7837, "step": 824 }, { "epoch": 0.33, "eval_loss": 1.6467454433441162, "eval_runtime": 0.5297, "eval_samples_per_second": 7.552, "eval_steps_per_second": 1.888, "step": 824 }, { "epoch": 0.33, "learning_rate": 2.0063999999999998e-07, "loss": 1.8066, "step": 828 }, { "epoch": 0.33, "eval_loss": 1.6439214944839478, "eval_runtime": 0.522, "eval_samples_per_second": 7.663, "eval_steps_per_second": 1.916, "step": 828 }, { "epoch": 0.33, "learning_rate": 2.0016e-07, "loss": 1.7814, "step": 832 }, { "epoch": 0.33, "eval_loss": 1.6407381296157837, "eval_runtime": 0.5382, "eval_samples_per_second": 7.432, "eval_steps_per_second": 1.858, "step": 832 }, { "epoch": 0.33, "learning_rate": 1.9967999999999997e-07, "loss": 1.7244, "step": 836 }, { "epoch": 0.33, "eval_loss": 1.6372514963150024, "eval_runtime": 0.7157, "eval_samples_per_second": 5.589, "eval_steps_per_second": 1.397, "step": 836 }, { "epoch": 0.34, "learning_rate": 1.992e-07, "loss": 1.7195, "step": 840 }, { "epoch": 0.34, "eval_loss": 1.634232997894287, "eval_runtime": 0.7254, "eval_samples_per_second": 5.514, "eval_steps_per_second": 1.379, "step": 840 }, { "epoch": 0.34, "learning_rate": 1.9872e-07, "loss": 1.7524, "step": 844 }, { "epoch": 0.34, "eval_loss": 1.6310441493988037, "eval_runtime": 0.7839, "eval_samples_per_second": 5.103, "eval_steps_per_second": 1.276, "step": 844 }, { "epoch": 0.34, "learning_rate": 1.9824e-07, "loss": 1.7644, "step": 848 }, { "epoch": 0.34, "eval_loss": 1.6279191970825195, "eval_runtime": 0.5253, "eval_samples_per_second": 7.615, "eval_steps_per_second": 1.904, "step": 848 }, { "epoch": 0.34, "learning_rate": 1.9776e-07, "loss": 1.7171, "step": 852 }, { "epoch": 0.34, "eval_loss": 1.6244579553604126, "eval_runtime": 0.5359, "eval_samples_per_second": 7.464, "eval_steps_per_second": 1.866, "step": 852 }, { "epoch": 0.34, "learning_rate": 1.9727999999999998e-07, "loss": 1.7418, "step": 856 }, { "epoch": 0.34, "eval_loss": 1.6212078332901, "eval_runtime": 0.5379, "eval_samples_per_second": 7.436, "eval_steps_per_second": 1.859, "step": 856 }, { "epoch": 0.34, "learning_rate": 1.968e-07, "loss": 1.7337, "step": 860 }, { "epoch": 0.34, "eval_loss": 1.6180227994918823, "eval_runtime": 0.5259, "eval_samples_per_second": 7.606, "eval_steps_per_second": 1.902, "step": 860 }, { "epoch": 0.35, "learning_rate": 1.9631999999999997e-07, "loss": 1.7441, "step": 864 }, { "epoch": 0.35, "eval_loss": 1.61477530002594, "eval_runtime": 0.5216, "eval_samples_per_second": 7.669, "eval_steps_per_second": 1.917, "step": 864 }, { "epoch": 0.35, "learning_rate": 1.9584e-07, "loss": 1.694, "step": 868 }, { "epoch": 0.35, "eval_loss": 1.611538052558899, "eval_runtime": 0.6803, "eval_samples_per_second": 5.88, "eval_steps_per_second": 1.47, "step": 868 }, { "epoch": 0.35, "learning_rate": 1.9536e-07, "loss": 1.7601, "step": 872 }, { "epoch": 0.35, "eval_loss": 1.6083098649978638, "eval_runtime": 0.716, "eval_samples_per_second": 5.586, "eval_steps_per_second": 1.397, "step": 872 }, { "epoch": 0.35, "learning_rate": 1.9487999999999998e-07, "loss": 1.7081, "step": 876 }, { "epoch": 0.35, "eval_loss": 1.6050214767456055, "eval_runtime": 0.7622, "eval_samples_per_second": 5.248, "eval_steps_per_second": 1.312, "step": 876 }, { "epoch": 0.35, "learning_rate": 1.944e-07, "loss": 1.7101, "step": 880 }, { "epoch": 0.35, "eval_loss": 1.6019953489303589, "eval_runtime": 0.7766, "eval_samples_per_second": 5.151, "eval_steps_per_second": 1.288, "step": 880 }, { "epoch": 0.35, "learning_rate": 1.9391999999999998e-07, "loss": 1.7271, "step": 884 }, { "epoch": 0.35, "eval_loss": 1.5990221500396729, "eval_runtime": 0.5153, "eval_samples_per_second": 7.763, "eval_steps_per_second": 1.941, "step": 884 }, { "epoch": 0.36, "learning_rate": 1.9344e-07, "loss": 1.7402, "step": 888 }, { "epoch": 0.36, "eval_loss": 1.5954092741012573, "eval_runtime": 0.5168, "eval_samples_per_second": 7.74, "eval_steps_per_second": 1.935, "step": 888 }, { "epoch": 0.36, "learning_rate": 1.9296e-07, "loss": 1.7125, "step": 892 }, { "epoch": 0.36, "eval_loss": 1.5921534299850464, "eval_runtime": 0.5424, "eval_samples_per_second": 7.375, "eval_steps_per_second": 1.844, "step": 892 }, { "epoch": 0.36, "learning_rate": 1.9248e-07, "loss": 1.6949, "step": 896 }, { "epoch": 0.36, "eval_loss": 1.5888370275497437, "eval_runtime": 0.5307, "eval_samples_per_second": 7.537, "eval_steps_per_second": 1.884, "step": 896 }, { "epoch": 0.36, "learning_rate": 1.92e-07, "loss": 1.7145, "step": 900 }, { "epoch": 0.36, "eval_loss": 1.5858186483383179, "eval_runtime": 0.511, "eval_samples_per_second": 7.828, "eval_steps_per_second": 1.957, "step": 900 }, { "epoch": 0.36, "learning_rate": 1.9151999999999998e-07, "loss": 1.6665, "step": 904 }, { "epoch": 0.36, "eval_loss": 1.5824443101882935, "eval_runtime": 0.6907, "eval_samples_per_second": 5.791, "eval_steps_per_second": 1.448, "step": 904 }, { "epoch": 0.36, "learning_rate": 1.9104e-07, "loss": 1.6929, "step": 908 }, { "epoch": 0.36, "eval_loss": 1.5796196460723877, "eval_runtime": 0.7487, "eval_samples_per_second": 5.342, "eval_steps_per_second": 1.336, "step": 908 }, { "epoch": 0.36, "learning_rate": 1.9055999999999998e-07, "loss": 1.7068, "step": 912 }, { "epoch": 0.36, "eval_loss": 1.5765777826309204, "eval_runtime": 0.7477, "eval_samples_per_second": 5.35, "eval_steps_per_second": 1.337, "step": 912 }, { "epoch": 0.37, "learning_rate": 1.9008000000000002e-07, "loss": 1.6877, "step": 916 }, { "epoch": 0.37, "eval_loss": 1.57340669631958, "eval_runtime": 0.753, "eval_samples_per_second": 5.312, "eval_steps_per_second": 1.328, "step": 916 }, { "epoch": 0.37, "learning_rate": 1.896e-07, "loss": 1.6718, "step": 920 }, { "epoch": 0.37, "eval_loss": 1.5706267356872559, "eval_runtime": 0.514, "eval_samples_per_second": 7.782, "eval_steps_per_second": 1.945, "step": 920 }, { "epoch": 0.37, "learning_rate": 1.8912e-07, "loss": 1.6886, "step": 924 }, { "epoch": 0.37, "eval_loss": 1.5676339864730835, "eval_runtime": 0.5222, "eval_samples_per_second": 7.66, "eval_steps_per_second": 1.915, "step": 924 }, { "epoch": 0.37, "learning_rate": 1.8864e-07, "loss": 1.7459, "step": 928 }, { "epoch": 0.37, "eval_loss": 1.5645827054977417, "eval_runtime": 0.5299, "eval_samples_per_second": 7.548, "eval_steps_per_second": 1.887, "step": 928 }, { "epoch": 0.37, "learning_rate": 1.8815999999999999e-07, "loss": 1.6596, "step": 932 }, { "epoch": 0.37, "eval_loss": 1.5616861581802368, "eval_runtime": 0.5303, "eval_samples_per_second": 7.543, "eval_steps_per_second": 1.886, "step": 932 }, { "epoch": 0.37, "learning_rate": 1.8768e-07, "loss": 1.6689, "step": 936 }, { "epoch": 0.37, "eval_loss": 1.5588451623916626, "eval_runtime": 0.5236, "eval_samples_per_second": 7.639, "eval_steps_per_second": 1.91, "step": 936 }, { "epoch": 0.38, "learning_rate": 1.8719999999999998e-07, "loss": 1.6744, "step": 940 }, { "epoch": 0.38, "eval_loss": 1.5560673475265503, "eval_runtime": 0.7233, "eval_samples_per_second": 5.53, "eval_steps_per_second": 1.383, "step": 940 }, { "epoch": 0.38, "learning_rate": 1.8671999999999997e-07, "loss": 1.7009, "step": 944 }, { "epoch": 0.38, "eval_loss": 1.5533243417739868, "eval_runtime": 0.6983, "eval_samples_per_second": 5.728, "eval_steps_per_second": 1.432, "step": 944 }, { "epoch": 0.38, "learning_rate": 1.8624e-07, "loss": 1.6651, "step": 948 }, { "epoch": 0.38, "eval_loss": 1.55048668384552, "eval_runtime": 0.7511, "eval_samples_per_second": 5.325, "eval_steps_per_second": 1.331, "step": 948 }, { "epoch": 0.38, "learning_rate": 1.8576e-07, "loss": 1.6821, "step": 952 }, { "epoch": 0.38, "eval_loss": 1.547943353652954, "eval_runtime": 0.532, "eval_samples_per_second": 7.519, "eval_steps_per_second": 1.88, "step": 952 }, { "epoch": 0.38, "learning_rate": 1.8528e-07, "loss": 1.6453, "step": 956 }, { "epoch": 0.38, "eval_loss": 1.5453405380249023, "eval_runtime": 0.5463, "eval_samples_per_second": 7.322, "eval_steps_per_second": 1.831, "step": 956 }, { "epoch": 0.38, "learning_rate": 1.848e-07, "loss": 1.6624, "step": 960 }, { "epoch": 0.38, "eval_loss": 1.542648196220398, "eval_runtime": 0.5288, "eval_samples_per_second": 7.564, "eval_steps_per_second": 1.891, "step": 960 }, { "epoch": 0.39, "learning_rate": 1.8431999999999997e-07, "loss": 1.6453, "step": 964 }, { "epoch": 0.39, "eval_loss": 1.5402462482452393, "eval_runtime": 0.5242, "eval_samples_per_second": 7.63, "eval_steps_per_second": 1.908, "step": 964 }, { "epoch": 0.39, "learning_rate": 1.8383999999999998e-07, "loss": 1.6451, "step": 968 }, { "epoch": 0.39, "eval_loss": 1.5377165079116821, "eval_runtime": 0.5169, "eval_samples_per_second": 7.738, "eval_steps_per_second": 1.935, "step": 968 }, { "epoch": 0.39, "learning_rate": 1.8335999999999997e-07, "loss": 1.6627, "step": 972 }, { "epoch": 0.39, "eval_loss": 1.5353412628173828, "eval_runtime": 0.6797, "eval_samples_per_second": 5.885, "eval_steps_per_second": 1.471, "step": 972 }, { "epoch": 0.39, "learning_rate": 1.8288e-07, "loss": 1.6423, "step": 976 }, { "epoch": 0.39, "eval_loss": 1.5325669050216675, "eval_runtime": 0.7175, "eval_samples_per_second": 5.575, "eval_steps_per_second": 1.394, "step": 976 }, { "epoch": 0.39, "learning_rate": 1.824e-07, "loss": 1.652, "step": 980 }, { "epoch": 0.39, "eval_loss": 1.530207872390747, "eval_runtime": 0.8099, "eval_samples_per_second": 4.939, "eval_steps_per_second": 1.235, "step": 980 }, { "epoch": 0.39, "learning_rate": 1.8192e-07, "loss": 1.6414, "step": 984 }, { "epoch": 0.39, "eval_loss": 1.5278236865997314, "eval_runtime": 0.7814, "eval_samples_per_second": 5.119, "eval_steps_per_second": 1.28, "step": 984 }, { "epoch": 0.4, "learning_rate": 1.8144e-07, "loss": 1.6107, "step": 988 }, { "epoch": 0.4, "eval_loss": 1.5253430604934692, "eval_runtime": 0.5386, "eval_samples_per_second": 7.427, "eval_steps_per_second": 1.857, "step": 988 }, { "epoch": 0.4, "learning_rate": 1.8095999999999997e-07, "loss": 1.6599, "step": 992 }, { "epoch": 0.4, "eval_loss": 1.5225120782852173, "eval_runtime": 0.5302, "eval_samples_per_second": 7.544, "eval_steps_per_second": 1.886, "step": 992 }, { "epoch": 0.4, "learning_rate": 1.8048e-07, "loss": 1.6326, "step": 996 }, { "epoch": 0.4, "eval_loss": 1.5201939344406128, "eval_runtime": 0.533, "eval_samples_per_second": 7.505, "eval_steps_per_second": 1.876, "step": 996 }, { "epoch": 0.4, "learning_rate": 1.8e-07, "loss": 1.6324, "step": 1000 }, { "epoch": 0.4, "eval_loss": 1.5175316333770752, "eval_runtime": 0.5316, "eval_samples_per_second": 7.525, "eval_steps_per_second": 1.881, "step": 1000 }, { "epoch": 0.4, "learning_rate": 1.7952e-07, "loss": 1.5907, "step": 1004 }, { "epoch": 0.4, "eval_loss": 1.5149424076080322, "eval_runtime": 0.7298, "eval_samples_per_second": 5.481, "eval_steps_per_second": 1.37, "step": 1004 }, { "epoch": 0.4, "learning_rate": 1.7904e-07, "loss": 1.6465, "step": 1008 }, { "epoch": 0.4, "eval_loss": 1.5124318599700928, "eval_runtime": 0.7308, "eval_samples_per_second": 5.473, "eval_steps_per_second": 1.368, "step": 1008 }, { "epoch": 0.4, "learning_rate": 1.7855999999999998e-07, "loss": 1.6148, "step": 1012 }, { "epoch": 0.4, "eval_loss": 1.510151743888855, "eval_runtime": 0.7345, "eval_samples_per_second": 5.446, "eval_steps_per_second": 1.361, "step": 1012 }, { "epoch": 0.41, "learning_rate": 1.7808e-07, "loss": 1.6064, "step": 1016 }, { "epoch": 0.41, "eval_loss": 1.5073630809783936, "eval_runtime": 0.5414, "eval_samples_per_second": 7.388, "eval_steps_per_second": 1.847, "step": 1016 }, { "epoch": 0.41, "learning_rate": 1.7759999999999998e-07, "loss": 1.6342, "step": 1020 }, { "epoch": 0.41, "eval_loss": 1.5052520036697388, "eval_runtime": 0.516, "eval_samples_per_second": 7.751, "eval_steps_per_second": 1.938, "step": 1020 }, { "epoch": 0.41, "learning_rate": 1.7712000000000001e-07, "loss": 1.605, "step": 1024 }, { "epoch": 0.41, "eval_loss": 1.5025243759155273, "eval_runtime": 0.5373, "eval_samples_per_second": 7.445, "eval_steps_per_second": 1.861, "step": 1024 }, { "epoch": 0.41, "learning_rate": 1.7664e-07, "loss": 1.6121, "step": 1028 }, { "epoch": 0.41, "eval_loss": 1.500252604484558, "eval_runtime": 0.5476, "eval_samples_per_second": 7.304, "eval_steps_per_second": 1.826, "step": 1028 }, { "epoch": 0.41, "learning_rate": 1.7616e-07, "loss": 1.617, "step": 1032 }, { "epoch": 0.41, "eval_loss": 1.4977892637252808, "eval_runtime": 0.5255, "eval_samples_per_second": 7.612, "eval_steps_per_second": 1.903, "step": 1032 }, { "epoch": 0.41, "learning_rate": 1.7568e-07, "loss": 1.5897, "step": 1036 }, { "epoch": 0.41, "eval_loss": 1.4954513311386108, "eval_runtime": 0.7255, "eval_samples_per_second": 5.513, "eval_steps_per_second": 1.378, "step": 1036 }, { "epoch": 0.42, "learning_rate": 1.7519999999999998e-07, "loss": 1.6022, "step": 1040 }, { "epoch": 0.42, "eval_loss": 1.4929691553115845, "eval_runtime": 0.6954, "eval_samples_per_second": 5.752, "eval_steps_per_second": 1.438, "step": 1040 }, { "epoch": 0.42, "learning_rate": 1.7472e-07, "loss": 1.5748, "step": 1044 }, { "epoch": 0.42, "eval_loss": 1.4902769327163696, "eval_runtime": 0.8026, "eval_samples_per_second": 4.984, "eval_steps_per_second": 1.246, "step": 1044 }, { "epoch": 0.42, "learning_rate": 1.7423999999999998e-07, "loss": 1.5974, "step": 1048 }, { "epoch": 0.42, "eval_loss": 1.4878779649734497, "eval_runtime": 0.7804, "eval_samples_per_second": 5.125, "eval_steps_per_second": 1.281, "step": 1048 }, { "epoch": 0.42, "learning_rate": 1.7376000000000002e-07, "loss": 1.6126, "step": 1052 }, { "epoch": 0.42, "eval_loss": 1.48554527759552, "eval_runtime": 0.5423, "eval_samples_per_second": 7.376, "eval_steps_per_second": 1.844, "step": 1052 }, { "epoch": 0.42, "learning_rate": 1.7328e-07, "loss": 1.6189, "step": 1056 }, { "epoch": 0.42, "eval_loss": 1.4827589988708496, "eval_runtime": 0.5326, "eval_samples_per_second": 7.511, "eval_steps_per_second": 1.878, "step": 1056 }, { "epoch": 0.42, "learning_rate": 1.7279999999999999e-07, "loss": 1.5916, "step": 1060 }, { "epoch": 0.42, "eval_loss": 1.4803836345672607, "eval_runtime": 0.5273, "eval_samples_per_second": 7.585, "eval_steps_per_second": 1.896, "step": 1060 }, { "epoch": 0.43, "learning_rate": 1.7232e-07, "loss": 1.5938, "step": 1064 }, { "epoch": 0.43, "eval_loss": 1.4778516292572021, "eval_runtime": 0.5436, "eval_samples_per_second": 7.358, "eval_steps_per_second": 1.839, "step": 1064 }, { "epoch": 0.43, "learning_rate": 1.7183999999999998e-07, "loss": 1.6026, "step": 1068 }, { "epoch": 0.43, "eval_loss": 1.475649118423462, "eval_runtime": 0.5298, "eval_samples_per_second": 7.549, "eval_steps_per_second": 1.887, "step": 1068 }, { "epoch": 0.43, "learning_rate": 1.7136e-07, "loss": 1.5687, "step": 1072 }, { "epoch": 0.43, "eval_loss": 1.473489761352539, "eval_runtime": 0.7191, "eval_samples_per_second": 5.562, "eval_steps_per_second": 1.391, "step": 1072 }, { "epoch": 0.43, "learning_rate": 1.7087999999999998e-07, "loss": 1.5413, "step": 1076 }, { "epoch": 0.43, "eval_loss": 1.4712145328521729, "eval_runtime": 0.7022, "eval_samples_per_second": 5.696, "eval_steps_per_second": 1.424, "step": 1076 }, { "epoch": 0.43, "learning_rate": 1.7039999999999996e-07, "loss": 1.5778, "step": 1080 }, { "epoch": 0.43, "eval_loss": 1.4688694477081299, "eval_runtime": 0.7707, "eval_samples_per_second": 5.19, "eval_steps_per_second": 1.298, "step": 1080 }, { "epoch": 0.43, "learning_rate": 1.6992e-07, "loss": 1.5731, "step": 1084 }, { "epoch": 0.43, "eval_loss": 1.4664225578308105, "eval_runtime": 0.5386, "eval_samples_per_second": 7.427, "eval_steps_per_second": 1.857, "step": 1084 }, { "epoch": 0.44, "learning_rate": 1.6944e-07, "loss": 1.5625, "step": 1088 }, { "epoch": 0.44, "eval_loss": 1.464247465133667, "eval_runtime": 0.5484, "eval_samples_per_second": 7.294, "eval_steps_per_second": 1.823, "step": 1088 }, { "epoch": 0.44, "learning_rate": 1.6896e-07, "loss": 1.55, "step": 1092 }, { "epoch": 0.44, "eval_loss": 1.4620987176895142, "eval_runtime": 0.5342, "eval_samples_per_second": 7.488, "eval_steps_per_second": 1.872, "step": 1092 }, { "epoch": 0.44, "learning_rate": 1.6847999999999998e-07, "loss": 1.5852, "step": 1096 }, { "epoch": 0.44, "eval_loss": 1.459930419921875, "eval_runtime": 0.5332, "eval_samples_per_second": 7.501, "eval_steps_per_second": 1.875, "step": 1096 }, { "epoch": 0.44, "learning_rate": 1.68e-07, "loss": 1.5614, "step": 1100 }, { "epoch": 0.44, "eval_loss": 1.4578797817230225, "eval_runtime": 0.5504, "eval_samples_per_second": 7.268, "eval_steps_per_second": 1.817, "step": 1100 }, { "epoch": 0.44, "learning_rate": 1.6752e-07, "loss": 1.5619, "step": 1104 }, { "epoch": 0.44, "eval_loss": 1.4559952020645142, "eval_runtime": 0.7448, "eval_samples_per_second": 5.37, "eval_steps_per_second": 1.343, "step": 1104 }, { "epoch": 0.44, "learning_rate": 1.6704e-07, "loss": 1.5658, "step": 1108 }, { "epoch": 0.44, "eval_loss": 1.454249382019043, "eval_runtime": 0.773, "eval_samples_per_second": 5.174, "eval_steps_per_second": 1.294, "step": 1108 }, { "epoch": 0.44, "learning_rate": 1.6656e-07, "loss": 1.5699, "step": 1112 }, { "epoch": 0.44, "eval_loss": 1.4521348476409912, "eval_runtime": 0.8287, "eval_samples_per_second": 4.827, "eval_steps_per_second": 1.207, "step": 1112 }, { "epoch": 0.45, "learning_rate": 1.6608e-07, "loss": 1.5738, "step": 1116 }, { "epoch": 0.45, "eval_loss": 1.450175404548645, "eval_runtime": 0.5398, "eval_samples_per_second": 7.41, "eval_steps_per_second": 1.852, "step": 1116 }, { "epoch": 0.45, "learning_rate": 1.656e-07, "loss": 1.5823, "step": 1120 }, { "epoch": 0.45, "eval_loss": 1.4481428861618042, "eval_runtime": 0.5592, "eval_samples_per_second": 7.153, "eval_steps_per_second": 1.788, "step": 1120 }, { "epoch": 0.45, "learning_rate": 1.6511999999999999e-07, "loss": 1.5425, "step": 1124 }, { "epoch": 0.45, "eval_loss": 1.4458932876586914, "eval_runtime": 0.5511, "eval_samples_per_second": 7.259, "eval_steps_per_second": 1.815, "step": 1124 }, { "epoch": 0.45, "learning_rate": 1.6463999999999997e-07, "loss": 1.5604, "step": 1128 }, { "epoch": 0.45, "eval_loss": 1.4438304901123047, "eval_runtime": 0.5355, "eval_samples_per_second": 7.47, "eval_steps_per_second": 1.867, "step": 1128 }, { "epoch": 0.45, "learning_rate": 1.6416e-07, "loss": 1.5562, "step": 1132 }, { "epoch": 0.45, "eval_loss": 1.442002773284912, "eval_runtime": 0.5332, "eval_samples_per_second": 7.502, "eval_steps_per_second": 1.876, "step": 1132 }, { "epoch": 0.45, "learning_rate": 1.6368e-07, "loss": 1.555, "step": 1136 }, { "epoch": 0.45, "eval_loss": 1.4399393796920776, "eval_runtime": 0.7524, "eval_samples_per_second": 5.316, "eval_steps_per_second": 1.329, "step": 1136 }, { "epoch": 0.46, "learning_rate": 1.632e-07, "loss": 1.5158, "step": 1140 }, { "epoch": 0.46, "eval_loss": 1.437983512878418, "eval_runtime": 0.7269, "eval_samples_per_second": 5.503, "eval_steps_per_second": 1.376, "step": 1140 }, { "epoch": 0.46, "learning_rate": 1.6272e-07, "loss": 1.5272, "step": 1144 }, { "epoch": 0.46, "eval_loss": 1.435863733291626, "eval_runtime": 0.7356, "eval_samples_per_second": 5.438, "eval_steps_per_second": 1.359, "step": 1144 }, { "epoch": 0.46, "learning_rate": 1.6223999999999998e-07, "loss": 1.5467, "step": 1148 }, { "epoch": 0.46, "eval_loss": 1.4338979721069336, "eval_runtime": 0.5695, "eval_samples_per_second": 7.023, "eval_steps_per_second": 1.756, "step": 1148 }, { "epoch": 0.46, "learning_rate": 1.6176e-07, "loss": 1.5399, "step": 1152 }, { "epoch": 0.46, "eval_loss": 1.4317151308059692, "eval_runtime": 0.5215, "eval_samples_per_second": 7.669, "eval_steps_per_second": 1.917, "step": 1152 }, { "epoch": 0.46, "learning_rate": 1.6127999999999997e-07, "loss": 1.5221, "step": 1156 }, { "epoch": 0.46, "eval_loss": 1.4296718835830688, "eval_runtime": 0.5471, "eval_samples_per_second": 7.311, "eval_steps_per_second": 1.828, "step": 1156 }, { "epoch": 0.46, "learning_rate": 1.608e-07, "loss": 1.5022, "step": 1160 }, { "epoch": 0.46, "eval_loss": 1.4277141094207764, "eval_runtime": 0.5395, "eval_samples_per_second": 7.414, "eval_steps_per_second": 1.853, "step": 1160 }, { "epoch": 0.47, "learning_rate": 1.6032e-07, "loss": 1.5385, "step": 1164 }, { "epoch": 0.47, "eval_loss": 1.4257354736328125, "eval_runtime": 0.5342, "eval_samples_per_second": 7.487, "eval_steps_per_second": 1.872, "step": 1164 }, { "epoch": 0.47, "learning_rate": 1.5984e-07, "loss": 1.5042, "step": 1168 }, { "epoch": 0.47, "eval_loss": 1.4236301183700562, "eval_runtime": 0.6434, "eval_samples_per_second": 6.217, "eval_steps_per_second": 1.554, "step": 1168 }, { "epoch": 0.47, "learning_rate": 1.5936e-07, "loss": 1.5007, "step": 1172 }, { "epoch": 0.47, "eval_loss": 1.421656608581543, "eval_runtime": 0.7224, "eval_samples_per_second": 5.537, "eval_steps_per_second": 1.384, "step": 1172 }, { "epoch": 0.47, "learning_rate": 1.5887999999999998e-07, "loss": 1.5323, "step": 1176 }, { "epoch": 0.47, "eval_loss": 1.4196075201034546, "eval_runtime": 0.7946, "eval_samples_per_second": 5.034, "eval_steps_per_second": 1.259, "step": 1176 }, { "epoch": 0.47, "learning_rate": 1.584e-07, "loss": 1.5269, "step": 1180 }, { "epoch": 0.47, "eval_loss": 1.4174154996871948, "eval_runtime": 0.82, "eval_samples_per_second": 4.878, "eval_steps_per_second": 1.22, "step": 1180 }, { "epoch": 0.47, "learning_rate": 1.5791999999999997e-07, "loss": 1.5379, "step": 1184 }, { "epoch": 0.47, "eval_loss": 1.4156051874160767, "eval_runtime": 0.5319, "eval_samples_per_second": 7.52, "eval_steps_per_second": 1.88, "step": 1184 }, { "epoch": 0.48, "learning_rate": 1.5744e-07, "loss": 1.522, "step": 1188 }, { "epoch": 0.48, "eval_loss": 1.4136687517166138, "eval_runtime": 0.5286, "eval_samples_per_second": 7.567, "eval_steps_per_second": 1.892, "step": 1188 }, { "epoch": 0.48, "learning_rate": 1.5696e-07, "loss": 1.506, "step": 1192 }, { "epoch": 0.48, "eval_loss": 1.4115678071975708, "eval_runtime": 0.553, "eval_samples_per_second": 7.233, "eval_steps_per_second": 1.808, "step": 1192 }, { "epoch": 0.48, "learning_rate": 1.5647999999999998e-07, "loss": 1.4986, "step": 1196 }, { "epoch": 0.48, "eval_loss": 1.409631371498108, "eval_runtime": 0.5273, "eval_samples_per_second": 7.585, "eval_steps_per_second": 1.896, "step": 1196 }, { "epoch": 0.48, "learning_rate": 1.56e-07, "loss": 1.4918, "step": 1200 }, { "epoch": 0.48, "eval_loss": 1.407455563545227, "eval_runtime": 0.5269, "eval_samples_per_second": 7.592, "eval_steps_per_second": 1.898, "step": 1200 }, { "epoch": 0.48, "learning_rate": 1.5551999999999998e-07, "loss": 1.5124, "step": 1204 }, { "epoch": 0.48, "eval_loss": 1.4056380987167358, "eval_runtime": 0.7536, "eval_samples_per_second": 5.308, "eval_steps_per_second": 1.327, "step": 1204 }, { "epoch": 0.48, "learning_rate": 1.5504000000000002e-07, "loss": 1.4926, "step": 1208 }, { "epoch": 0.48, "eval_loss": 1.403800368309021, "eval_runtime": 0.7248, "eval_samples_per_second": 5.519, "eval_steps_per_second": 1.38, "step": 1208 }, { "epoch": 0.48, "learning_rate": 1.5456e-07, "loss": 1.5053, "step": 1212 }, { "epoch": 0.48, "eval_loss": 1.40152907371521, "eval_runtime": 0.7447, "eval_samples_per_second": 5.371, "eval_steps_per_second": 1.343, "step": 1212 }, { "epoch": 0.49, "learning_rate": 1.5408e-07, "loss": 1.5043, "step": 1216 }, { "epoch": 0.49, "eval_loss": 1.3996310234069824, "eval_runtime": 0.738, "eval_samples_per_second": 5.42, "eval_steps_per_second": 1.355, "step": 1216 }, { "epoch": 0.49, "learning_rate": 1.536e-07, "loss": 1.5068, "step": 1220 }, { "epoch": 0.49, "eval_loss": 1.3975541591644287, "eval_runtime": 0.5275, "eval_samples_per_second": 7.583, "eval_steps_per_second": 1.896, "step": 1220 }, { "epoch": 0.49, "learning_rate": 1.5311999999999998e-07, "loss": 1.5039, "step": 1224 }, { "epoch": 0.49, "eval_loss": 1.3954721689224243, "eval_runtime": 0.5317, "eval_samples_per_second": 7.523, "eval_steps_per_second": 1.881, "step": 1224 }, { "epoch": 0.49, "learning_rate": 1.5264e-07, "loss": 1.4772, "step": 1228 }, { "epoch": 0.49, "eval_loss": 1.3933203220367432, "eval_runtime": 0.5283, "eval_samples_per_second": 7.571, "eval_steps_per_second": 1.893, "step": 1228 }, { "epoch": 0.49, "learning_rate": 1.5215999999999998e-07, "loss": 1.4873, "step": 1232 }, { "epoch": 0.49, "eval_loss": 1.3916254043579102, "eval_runtime": 0.5344, "eval_samples_per_second": 7.485, "eval_steps_per_second": 1.871, "step": 1232 }, { "epoch": 0.49, "learning_rate": 1.5168000000000002e-07, "loss": 1.4977, "step": 1236 }, { "epoch": 0.49, "eval_loss": 1.3896205425262451, "eval_runtime": 0.5249, "eval_samples_per_second": 7.62, "eval_steps_per_second": 1.905, "step": 1236 }, { "epoch": 0.5, "learning_rate": 1.512e-07, "loss": 1.5016, "step": 1240 }, { "epoch": 0.5, "eval_loss": 1.3873213529586792, "eval_runtime": 0.7136, "eval_samples_per_second": 5.605, "eval_steps_per_second": 1.401, "step": 1240 }, { "epoch": 0.5, "learning_rate": 1.5072e-07, "loss": 1.495, "step": 1244 }, { "epoch": 0.5, "eval_loss": 1.3854175806045532, "eval_runtime": 0.7372, "eval_samples_per_second": 5.426, "eval_steps_per_second": 1.357, "step": 1244 }, { "epoch": 0.5, "learning_rate": 1.5024e-07, "loss": 1.4803, "step": 1248 }, { "epoch": 0.5, "eval_loss": 1.3834645748138428, "eval_runtime": 0.7836, "eval_samples_per_second": 5.104, "eval_steps_per_second": 1.276, "step": 1248 }, { "epoch": 0.5, "learning_rate": 1.4975999999999999e-07, "loss": 1.4842, "step": 1252 }, { "epoch": 0.5, "eval_loss": 1.381633996963501, "eval_runtime": 0.5401, "eval_samples_per_second": 7.405, "eval_steps_per_second": 1.851, "step": 1252 }, { "epoch": 0.5, "learning_rate": 1.4928e-07, "loss": 1.4762, "step": 1256 }, { "epoch": 0.5, "eval_loss": 1.379853367805481, "eval_runtime": 0.5233, "eval_samples_per_second": 7.644, "eval_steps_per_second": 1.911, "step": 1256 }, { "epoch": 0.5, "learning_rate": 1.4879999999999998e-07, "loss": 1.4859, "step": 1260 }, { "epoch": 0.5, "eval_loss": 1.3780815601348877, "eval_runtime": 0.5276, "eval_samples_per_second": 7.582, "eval_steps_per_second": 1.895, "step": 1260 }, { "epoch": 0.51, "learning_rate": 1.4832e-07, "loss": 1.4948, "step": 1264 }, { "epoch": 0.51, "eval_loss": 1.3763624429702759, "eval_runtime": 0.5355, "eval_samples_per_second": 7.469, "eval_steps_per_second": 1.867, "step": 1264 }, { "epoch": 0.51, "learning_rate": 1.4784e-07, "loss": 1.4851, "step": 1268 }, { "epoch": 0.51, "eval_loss": 1.374289631843567, "eval_runtime": 0.5235, "eval_samples_per_second": 7.64, "eval_steps_per_second": 1.91, "step": 1268 }, { "epoch": 0.51, "learning_rate": 1.4736e-07, "loss": 1.4749, "step": 1272 }, { "epoch": 0.51, "eval_loss": 1.3724154233932495, "eval_runtime": 0.6384, "eval_samples_per_second": 6.266, "eval_steps_per_second": 1.566, "step": 1272 }, { "epoch": 0.51, "learning_rate": 1.4687999999999998e-07, "loss": 1.4594, "step": 1276 }, { "epoch": 0.51, "eval_loss": 1.3709261417388916, "eval_runtime": 0.7249, "eval_samples_per_second": 5.518, "eval_steps_per_second": 1.379, "step": 1276 }, { "epoch": 0.51, "learning_rate": 1.464e-07, "loss": 1.4517, "step": 1280 }, { "epoch": 0.51, "eval_loss": 1.3691537380218506, "eval_runtime": 0.7303, "eval_samples_per_second": 5.477, "eval_steps_per_second": 1.369, "step": 1280 }, { "epoch": 0.51, "learning_rate": 1.4592e-07, "loss": 1.4239, "step": 1284 }, { "epoch": 0.51, "eval_loss": 1.3673396110534668, "eval_runtime": 0.7929, "eval_samples_per_second": 5.044, "eval_steps_per_second": 1.261, "step": 1284 }, { "epoch": 0.52, "learning_rate": 1.4543999999999998e-07, "loss": 1.4775, "step": 1288 }, { "epoch": 0.52, "eval_loss": 1.3657190799713135, "eval_runtime": 0.5509, "eval_samples_per_second": 7.261, "eval_steps_per_second": 1.815, "step": 1288 }, { "epoch": 0.52, "learning_rate": 1.4496e-07, "loss": 1.4483, "step": 1292 }, { "epoch": 0.52, "eval_loss": 1.3642776012420654, "eval_runtime": 0.5236, "eval_samples_per_second": 7.639, "eval_steps_per_second": 1.91, "step": 1292 }, { "epoch": 0.52, "learning_rate": 1.4447999999999998e-07, "loss": 1.4688, "step": 1296 }, { "epoch": 0.52, "eval_loss": 1.3624374866485596, "eval_runtime": 0.5281, "eval_samples_per_second": 7.574, "eval_steps_per_second": 1.893, "step": 1296 }, { "epoch": 0.52, "learning_rate": 1.44e-07, "loss": 1.4566, "step": 1300 }, { "epoch": 0.52, "eval_loss": 1.3608499765396118, "eval_runtime": 0.5346, "eval_samples_per_second": 7.482, "eval_steps_per_second": 1.871, "step": 1300 }, { "epoch": 0.52, "learning_rate": 1.4352e-07, "loss": 1.4592, "step": 1304 }, { "epoch": 0.52, "eval_loss": 1.3591777086257935, "eval_runtime": 0.543, "eval_samples_per_second": 7.367, "eval_steps_per_second": 1.842, "step": 1304 }, { "epoch": 0.52, "learning_rate": 1.4304e-07, "loss": 1.4505, "step": 1308 }, { "epoch": 0.52, "eval_loss": 1.357291340827942, "eval_runtime": 0.7548, "eval_samples_per_second": 5.299, "eval_steps_per_second": 1.325, "step": 1308 }, { "epoch": 0.52, "learning_rate": 1.4256e-07, "loss": 1.4304, "step": 1312 }, { "epoch": 0.52, "eval_loss": 1.3557498455047607, "eval_runtime": 0.7262, "eval_samples_per_second": 5.508, "eval_steps_per_second": 1.377, "step": 1312 }, { "epoch": 0.53, "learning_rate": 1.4208e-07, "loss": 1.4691, "step": 1316 }, { "epoch": 0.53, "eval_loss": 1.3540558815002441, "eval_runtime": 0.7121, "eval_samples_per_second": 5.617, "eval_steps_per_second": 1.404, "step": 1316 }, { "epoch": 0.53, "learning_rate": 1.416e-07, "loss": 1.4423, "step": 1320 }, { "epoch": 0.53, "eval_loss": 1.3522251844406128, "eval_runtime": 0.7774, "eval_samples_per_second": 5.145, "eval_steps_per_second": 1.286, "step": 1320 }, { "epoch": 0.53, "learning_rate": 1.4111999999999998e-07, "loss": 1.4301, "step": 1324 }, { "epoch": 0.53, "eval_loss": 1.3508257865905762, "eval_runtime": 0.5433, "eval_samples_per_second": 7.362, "eval_steps_per_second": 1.841, "step": 1324 }, { "epoch": 0.53, "learning_rate": 1.4064e-07, "loss": 1.4422, "step": 1328 }, { "epoch": 0.53, "eval_loss": 1.3490896224975586, "eval_runtime": 0.5369, "eval_samples_per_second": 7.451, "eval_steps_per_second": 1.863, "step": 1328 }, { "epoch": 0.53, "learning_rate": 1.4016e-07, "loss": 1.4577, "step": 1332 }, { "epoch": 0.53, "eval_loss": 1.347461223602295, "eval_runtime": 0.5223, "eval_samples_per_second": 7.658, "eval_steps_per_second": 1.915, "step": 1332 }, { "epoch": 0.53, "learning_rate": 1.3968e-07, "loss": 1.4541, "step": 1336 }, { "epoch": 0.53, "eval_loss": 1.3457545042037964, "eval_runtime": 0.5399, "eval_samples_per_second": 7.409, "eval_steps_per_second": 1.852, "step": 1336 }, { "epoch": 0.54, "learning_rate": 1.392e-07, "loss": 1.4246, "step": 1340 }, { "epoch": 0.54, "eval_loss": 1.343980073928833, "eval_runtime": 0.5481, "eval_samples_per_second": 7.297, "eval_steps_per_second": 1.824, "step": 1340 }, { "epoch": 0.54, "learning_rate": 1.3872e-07, "loss": 1.4507, "step": 1344 }, { "epoch": 0.54, "eval_loss": 1.3423739671707153, "eval_runtime": 0.7414, "eval_samples_per_second": 5.395, "eval_steps_per_second": 1.349, "step": 1344 }, { "epoch": 0.54, "learning_rate": 1.3824e-07, "loss": 1.4312, "step": 1348 }, { "epoch": 0.54, "eval_loss": 1.3408253192901611, "eval_runtime": 0.7783, "eval_samples_per_second": 5.139, "eval_steps_per_second": 1.285, "step": 1348 }, { "epoch": 0.54, "learning_rate": 1.3775999999999998e-07, "loss": 1.4394, "step": 1352 }, { "epoch": 0.54, "eval_loss": 1.339220404624939, "eval_runtime": 0.771, "eval_samples_per_second": 5.188, "eval_steps_per_second": 1.297, "step": 1352 }, { "epoch": 0.54, "learning_rate": 1.3728e-07, "loss": 1.4271, "step": 1356 }, { "epoch": 0.54, "eval_loss": 1.3373547792434692, "eval_runtime": 0.5264, "eval_samples_per_second": 7.599, "eval_steps_per_second": 1.9, "step": 1356 }, { "epoch": 0.54, "learning_rate": 1.368e-07, "loss": 1.4081, "step": 1360 }, { "epoch": 0.54, "eval_loss": 1.3356679677963257, "eval_runtime": 0.5397, "eval_samples_per_second": 7.412, "eval_steps_per_second": 1.853, "step": 1360 }, { "epoch": 0.55, "learning_rate": 1.3632e-07, "loss": 1.4314, "step": 1364 }, { "epoch": 0.55, "eval_loss": 1.333927035331726, "eval_runtime": 0.5418, "eval_samples_per_second": 7.382, "eval_steps_per_second": 1.846, "step": 1364 }, { "epoch": 0.55, "learning_rate": 1.3583999999999998e-07, "loss": 1.4359, "step": 1368 }, { "epoch": 0.55, "eval_loss": 1.3325647115707397, "eval_runtime": 0.5464, "eval_samples_per_second": 7.321, "eval_steps_per_second": 1.83, "step": 1368 }, { "epoch": 0.55, "learning_rate": 1.3536e-07, "loss": 1.4381, "step": 1372 }, { "epoch": 0.55, "eval_loss": 1.3307887315750122, "eval_runtime": 0.5493, "eval_samples_per_second": 7.282, "eval_steps_per_second": 1.82, "step": 1372 }, { "epoch": 0.55, "learning_rate": 1.3488e-07, "loss": 1.4219, "step": 1376 }, { "epoch": 0.55, "eval_loss": 1.3293663263320923, "eval_runtime": 0.5921, "eval_samples_per_second": 6.755, "eval_steps_per_second": 1.689, "step": 1376 }, { "epoch": 0.55, "learning_rate": 1.3439999999999999e-07, "loss": 1.4669, "step": 1380 }, { "epoch": 0.55, "eval_loss": 1.3278565406799316, "eval_runtime": 0.7788, "eval_samples_per_second": 5.136, "eval_steps_per_second": 1.284, "step": 1380 }, { "epoch": 0.55, "learning_rate": 1.3392e-07, "loss": 1.4163, "step": 1384 }, { "epoch": 0.55, "eval_loss": 1.3260074853897095, "eval_runtime": 0.8128, "eval_samples_per_second": 4.921, "eval_steps_per_second": 1.23, "step": 1384 }, { "epoch": 0.56, "learning_rate": 1.3343999999999998e-07, "loss": 1.4153, "step": 1388 }, { "epoch": 0.56, "eval_loss": 1.3242360353469849, "eval_runtime": 0.8002, "eval_samples_per_second": 4.999, "eval_steps_per_second": 1.25, "step": 1388 }, { "epoch": 0.56, "learning_rate": 1.3296e-07, "loss": 1.4506, "step": 1392 }, { "epoch": 0.56, "eval_loss": 1.3229784965515137, "eval_runtime": 0.5395, "eval_samples_per_second": 7.414, "eval_steps_per_second": 1.854, "step": 1392 }, { "epoch": 0.56, "learning_rate": 1.3247999999999998e-07, "loss": 1.4229, "step": 1396 }, { "epoch": 0.56, "eval_loss": 1.3213036060333252, "eval_runtime": 0.5374, "eval_samples_per_second": 7.444, "eval_steps_per_second": 1.861, "step": 1396 }, { "epoch": 0.56, "learning_rate": 1.32e-07, "loss": 1.4218, "step": 1400 }, { "epoch": 0.56, "eval_loss": 1.3196250200271606, "eval_runtime": 0.5404, "eval_samples_per_second": 7.402, "eval_steps_per_second": 1.851, "step": 1400 }, { "epoch": 0.56, "learning_rate": 1.3152e-07, "loss": 1.4185, "step": 1404 }, { "epoch": 0.56, "eval_loss": 1.3180840015411377, "eval_runtime": 0.5573, "eval_samples_per_second": 7.177, "eval_steps_per_second": 1.794, "step": 1404 }, { "epoch": 0.56, "learning_rate": 1.3104e-07, "loss": 1.4283, "step": 1408 }, { "epoch": 0.56, "eval_loss": 1.316424012184143, "eval_runtime": 0.5204, "eval_samples_per_second": 7.686, "eval_steps_per_second": 1.922, "step": 1408 }, { "epoch": 0.56, "learning_rate": 1.3056e-07, "loss": 1.4202, "step": 1412 }, { "epoch": 0.56, "eval_loss": 1.3148062229156494, "eval_runtime": 0.7628, "eval_samples_per_second": 5.244, "eval_steps_per_second": 1.311, "step": 1412 }, { "epoch": 0.57, "learning_rate": 1.3007999999999998e-07, "loss": 1.3736, "step": 1416 }, { "epoch": 0.57, "eval_loss": 1.3131170272827148, "eval_runtime": 0.7763, "eval_samples_per_second": 5.153, "eval_steps_per_second": 1.288, "step": 1416 }, { "epoch": 0.57, "learning_rate": 1.296e-07, "loss": 1.4332, "step": 1420 }, { "epoch": 0.57, "eval_loss": 1.311560869216919, "eval_runtime": 0.7312, "eval_samples_per_second": 5.471, "eval_steps_per_second": 1.368, "step": 1420 }, { "epoch": 0.57, "learning_rate": 1.2912e-07, "loss": 1.4287, "step": 1424 }, { "epoch": 0.57, "eval_loss": 1.309916615486145, "eval_runtime": 0.6738, "eval_samples_per_second": 5.936, "eval_steps_per_second": 1.484, "step": 1424 }, { "epoch": 0.57, "learning_rate": 1.2864e-07, "loss": 1.4175, "step": 1428 }, { "epoch": 0.57, "eval_loss": 1.3080803155899048, "eval_runtime": 0.5396, "eval_samples_per_second": 7.412, "eval_steps_per_second": 1.853, "step": 1428 }, { "epoch": 0.57, "learning_rate": 1.2816e-07, "loss": 1.4152, "step": 1432 }, { "epoch": 0.57, "eval_loss": 1.3066335916519165, "eval_runtime": 0.5523, "eval_samples_per_second": 7.243, "eval_steps_per_second": 1.811, "step": 1432 }, { "epoch": 0.57, "learning_rate": 1.2768e-07, "loss": 1.4036, "step": 1436 }, { "epoch": 0.57, "eval_loss": 1.3054327964782715, "eval_runtime": 0.5404, "eval_samples_per_second": 7.402, "eval_steps_per_second": 1.851, "step": 1436 }, { "epoch": 0.58, "learning_rate": 1.272e-07, "loss": 1.4033, "step": 1440 }, { "epoch": 0.58, "eval_loss": 1.3037904500961304, "eval_runtime": 0.5534, "eval_samples_per_second": 7.228, "eval_steps_per_second": 1.807, "step": 1440 }, { "epoch": 0.58, "learning_rate": 1.2671999999999999e-07, "loss": 1.4095, "step": 1444 }, { "epoch": 0.58, "eval_loss": 1.302278757095337, "eval_runtime": 0.7546, "eval_samples_per_second": 5.301, "eval_steps_per_second": 1.325, "step": 1444 }, { "epoch": 0.58, "learning_rate": 1.2624e-07, "loss": 1.4129, "step": 1448 }, { "epoch": 0.58, "eval_loss": 1.3008112907409668, "eval_runtime": 0.7157, "eval_samples_per_second": 5.589, "eval_steps_per_second": 1.397, "step": 1448 }, { "epoch": 0.58, "learning_rate": 1.2576e-07, "loss": 1.3838, "step": 1452 }, { "epoch": 0.58, "eval_loss": 1.2994916439056396, "eval_runtime": 0.7773, "eval_samples_per_second": 5.146, "eval_steps_per_second": 1.286, "step": 1452 }, { "epoch": 0.58, "learning_rate": 1.2528e-07, "loss": 1.3939, "step": 1456 }, { "epoch": 0.58, "eval_loss": 1.2979990243911743, "eval_runtime": 0.8203, "eval_samples_per_second": 4.876, "eval_steps_per_second": 1.219, "step": 1456 }, { "epoch": 0.58, "learning_rate": 1.2479999999999998e-07, "loss": 1.4023, "step": 1460 }, { "epoch": 0.58, "eval_loss": 1.2964202165603638, "eval_runtime": 0.5392, "eval_samples_per_second": 7.419, "eval_steps_per_second": 1.855, "step": 1460 }, { "epoch": 0.59, "learning_rate": 1.2432e-07, "loss": 1.3751, "step": 1464 }, { "epoch": 0.59, "eval_loss": 1.2952665090560913, "eval_runtime": 0.533, "eval_samples_per_second": 7.505, "eval_steps_per_second": 1.876, "step": 1464 }, { "epoch": 0.59, "learning_rate": 1.2384e-07, "loss": 1.3657, "step": 1468 }, { "epoch": 0.59, "eval_loss": 1.2935295104980469, "eval_runtime": 0.5428, "eval_samples_per_second": 7.369, "eval_steps_per_second": 1.842, "step": 1468 }, { "epoch": 0.59, "learning_rate": 1.2336e-07, "loss": 1.375, "step": 1472 }, { "epoch": 0.59, "eval_loss": 1.292738914489746, "eval_runtime": 0.5365, "eval_samples_per_second": 7.456, "eval_steps_per_second": 1.864, "step": 1472 }, { "epoch": 0.59, "learning_rate": 1.2288e-07, "loss": 1.3846, "step": 1476 }, { "epoch": 0.59, "eval_loss": 1.291104793548584, "eval_runtime": 0.5462, "eval_samples_per_second": 7.323, "eval_steps_per_second": 1.831, "step": 1476 }, { "epoch": 0.59, "learning_rate": 1.2239999999999998e-07, "loss": 1.4192, "step": 1480 }, { "epoch": 0.59, "eval_loss": 1.2900675535202026, "eval_runtime": 0.7504, "eval_samples_per_second": 5.33, "eval_steps_per_second": 1.333, "step": 1480 }, { "epoch": 0.59, "learning_rate": 1.2192e-07, "loss": 1.3629, "step": 1484 }, { "epoch": 0.59, "eval_loss": 1.2886391878128052, "eval_runtime": 0.7924, "eval_samples_per_second": 5.048, "eval_steps_per_second": 1.262, "step": 1484 }, { "epoch": 0.6, "learning_rate": 1.2143999999999998e-07, "loss": 1.3947, "step": 1488 }, { "epoch": 0.6, "eval_loss": 1.287713646888733, "eval_runtime": 0.7522, "eval_samples_per_second": 5.318, "eval_steps_per_second": 1.329, "step": 1488 }, { "epoch": 0.6, "learning_rate": 1.2096e-07, "loss": 1.3485, "step": 1492 }, { "epoch": 0.6, "eval_loss": 1.2862787246704102, "eval_runtime": 0.5402, "eval_samples_per_second": 7.404, "eval_steps_per_second": 1.851, "step": 1492 }, { "epoch": 0.6, "learning_rate": 1.2048e-07, "loss": 1.405, "step": 1496 }, { "epoch": 0.6, "eval_loss": 1.2850462198257446, "eval_runtime": 0.5452, "eval_samples_per_second": 7.337, "eval_steps_per_second": 1.834, "step": 1496 }, { "epoch": 0.6, "learning_rate": 1.2e-07, "loss": 1.3758, "step": 1500 }, { "epoch": 0.6, "eval_loss": 1.2840522527694702, "eval_runtime": 0.541, "eval_samples_per_second": 7.394, "eval_steps_per_second": 1.849, "step": 1500 } ], "logging_steps": 4, "max_steps": 2500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 1.9068132261888e+17, "trial_name": null, "trial_params": null }