{ "best_metric": 1.9646457433700562, "best_model_checkpoint": "./results/checkpoint-500", "epoch": 0.2, "eval_steps": 4, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 2.9951999999999997e-07, "loss": 2.6285, "step": 4 }, { "epoch": 0.0, "eval_loss": 2.4697508811950684, "eval_runtime": 0.485, "eval_samples_per_second": 8.248, "eval_steps_per_second": 2.062, "step": 4 }, { "epoch": 0.0, "learning_rate": 2.9904e-07, "loss": 2.6222, "step": 8 }, { "epoch": 0.0, "eval_loss": 2.465975284576416, "eval_runtime": 0.6323, "eval_samples_per_second": 6.326, "eval_steps_per_second": 1.582, "step": 8 }, { "epoch": 0.0, "learning_rate": 2.9856e-07, "loss": 2.6536, "step": 12 }, { "epoch": 0.0, "eval_loss": 2.460374116897583, "eval_runtime": 0.6478, "eval_samples_per_second": 6.175, "eval_steps_per_second": 1.544, "step": 12 }, { "epoch": 0.01, "learning_rate": 2.9808e-07, "loss": 2.6785, "step": 16 }, { "epoch": 0.01, "eval_loss": 2.4556970596313477, "eval_runtime": 0.6653, "eval_samples_per_second": 6.012, "eval_steps_per_second": 1.503, "step": 16 }, { "epoch": 0.01, "learning_rate": 2.9759999999999996e-07, "loss": 2.6085, "step": 20 }, { "epoch": 0.01, "eval_loss": 2.4514715671539307, "eval_runtime": 0.5241, "eval_samples_per_second": 7.632, "eval_steps_per_second": 1.908, "step": 20 }, { "epoch": 0.01, "learning_rate": 2.9711999999999995e-07, "loss": 2.5907, "step": 24 }, { "epoch": 0.01, "eval_loss": 2.4462974071502686, "eval_runtime": 0.4689, "eval_samples_per_second": 8.53, "eval_steps_per_second": 2.133, "step": 24 }, { "epoch": 0.01, "learning_rate": 2.9664e-07, "loss": 2.5942, "step": 28 }, { "epoch": 0.01, "eval_loss": 2.4415194988250732, "eval_runtime": 0.4829, "eval_samples_per_second": 8.284, "eval_steps_per_second": 2.071, "step": 28 }, { "epoch": 0.01, "learning_rate": 2.9615999999999997e-07, "loss": 2.6101, "step": 32 }, { "epoch": 0.01, "eval_loss": 2.437161922454834, "eval_runtime": 0.4715, "eval_samples_per_second": 8.483, "eval_steps_per_second": 2.121, "step": 32 }, { "epoch": 0.01, "learning_rate": 2.9568e-07, "loss": 2.5827, "step": 36 }, { "epoch": 0.01, "eval_loss": 2.432689666748047, "eval_runtime": 0.4938, "eval_samples_per_second": 8.1, "eval_steps_per_second": 2.025, "step": 36 }, { "epoch": 0.02, "learning_rate": 2.952e-07, "loss": 2.5729, "step": 40 }, { "epoch": 0.02, "eval_loss": 2.4281153678894043, "eval_runtime": 0.5021, "eval_samples_per_second": 7.966, "eval_steps_per_second": 1.991, "step": 40 }, { "epoch": 0.02, "learning_rate": 2.9472e-07, "loss": 2.5856, "step": 44 }, { "epoch": 0.02, "eval_loss": 2.423053741455078, "eval_runtime": 0.593, "eval_samples_per_second": 6.746, "eval_steps_per_second": 1.686, "step": 44 }, { "epoch": 0.02, "learning_rate": 2.9423999999999997e-07, "loss": 2.589, "step": 48 }, { "epoch": 0.02, "eval_loss": 2.418571949005127, "eval_runtime": 0.6933, "eval_samples_per_second": 5.77, "eval_steps_per_second": 1.442, "step": 48 }, { "epoch": 0.02, "learning_rate": 2.9375999999999995e-07, "loss": 2.6483, "step": 52 }, { "epoch": 0.02, "eval_loss": 2.414531946182251, "eval_runtime": 0.7167, "eval_samples_per_second": 5.581, "eval_steps_per_second": 1.395, "step": 52 }, { "epoch": 0.02, "learning_rate": 2.9328e-07, "loss": 2.517, "step": 56 }, { "epoch": 0.02, "eval_loss": 2.409538745880127, "eval_runtime": 0.4826, "eval_samples_per_second": 8.289, "eval_steps_per_second": 2.072, "step": 56 }, { "epoch": 0.02, "learning_rate": 2.928e-07, "loss": 2.5987, "step": 60 }, { "epoch": 0.02, "eval_loss": 2.4050426483154297, "eval_runtime": 0.4757, "eval_samples_per_second": 8.409, "eval_steps_per_second": 2.102, "step": 60 }, { "epoch": 0.03, "learning_rate": 2.9232e-07, "loss": 2.5489, "step": 64 }, { "epoch": 0.03, "eval_loss": 2.400360107421875, "eval_runtime": 0.4945, "eval_samples_per_second": 8.089, "eval_steps_per_second": 2.022, "step": 64 }, { "epoch": 0.03, "learning_rate": 2.9184e-07, "loss": 2.5063, "step": 68 }, { "epoch": 0.03, "eval_loss": 2.396500587463379, "eval_runtime": 0.5, "eval_samples_per_second": 8.001, "eval_steps_per_second": 2.0, "step": 68 }, { "epoch": 0.03, "learning_rate": 2.9136e-07, "loss": 2.5867, "step": 72 }, { "epoch": 0.03, "eval_loss": 2.3916146755218506, "eval_runtime": 0.4602, "eval_samples_per_second": 8.693, "eval_steps_per_second": 2.173, "step": 72 }, { "epoch": 0.03, "learning_rate": 2.9087999999999997e-07, "loss": 2.544, "step": 76 }, { "epoch": 0.03, "eval_loss": 2.3873047828674316, "eval_runtime": 0.4731, "eval_samples_per_second": 8.456, "eval_steps_per_second": 2.114, "step": 76 }, { "epoch": 0.03, "learning_rate": 2.9039999999999995e-07, "loss": 2.5596, "step": 80 }, { "epoch": 0.03, "eval_loss": 2.382803440093994, "eval_runtime": 0.6092, "eval_samples_per_second": 6.566, "eval_steps_per_second": 1.642, "step": 80 }, { "epoch": 0.03, "learning_rate": 2.8992e-07, "loss": 2.5744, "step": 84 }, { "epoch": 0.03, "eval_loss": 2.3786380290985107, "eval_runtime": 0.7212, "eval_samples_per_second": 5.546, "eval_steps_per_second": 1.387, "step": 84 }, { "epoch": 0.04, "learning_rate": 2.8944e-07, "loss": 2.5588, "step": 88 }, { "epoch": 0.04, "eval_loss": 2.374176502227783, "eval_runtime": 0.6826, "eval_samples_per_second": 5.86, "eval_steps_per_second": 1.465, "step": 88 }, { "epoch": 0.04, "learning_rate": 2.8895999999999996e-07, "loss": 2.5579, "step": 92 }, { "epoch": 0.04, "eval_loss": 2.3702104091644287, "eval_runtime": 0.4896, "eval_samples_per_second": 8.169, "eval_steps_per_second": 2.042, "step": 92 }, { "epoch": 0.04, "learning_rate": 2.8848e-07, "loss": 2.5245, "step": 96 }, { "epoch": 0.04, "eval_loss": 2.3660218715667725, "eval_runtime": 0.4764, "eval_samples_per_second": 8.397, "eval_steps_per_second": 2.099, "step": 96 }, { "epoch": 0.04, "learning_rate": 2.88e-07, "loss": 2.5132, "step": 100 }, { "epoch": 0.04, "eval_loss": 2.36110520362854, "eval_runtime": 0.4799, "eval_samples_per_second": 8.335, "eval_steps_per_second": 2.084, "step": 100 }, { "epoch": 0.04, "learning_rate": 2.8751999999999997e-07, "loss": 2.5037, "step": 104 }, { "epoch": 0.04, "eval_loss": 2.3570125102996826, "eval_runtime": 0.4722, "eval_samples_per_second": 8.47, "eval_steps_per_second": 2.118, "step": 104 }, { "epoch": 0.04, "learning_rate": 2.8704e-07, "loss": 2.4727, "step": 108 }, { "epoch": 0.04, "eval_loss": 2.3530666828155518, "eval_runtime": 0.467, "eval_samples_per_second": 8.565, "eval_steps_per_second": 2.141, "step": 108 }, { "epoch": 0.04, "learning_rate": 2.8656e-07, "loss": 2.4709, "step": 112 }, { "epoch": 0.04, "eval_loss": 2.348759412765503, "eval_runtime": 0.501, "eval_samples_per_second": 7.984, "eval_steps_per_second": 1.996, "step": 112 }, { "epoch": 0.05, "learning_rate": 2.8608e-07, "loss": 2.4711, "step": 116 }, { "epoch": 0.05, "eval_loss": 2.344454050064087, "eval_runtime": 0.6607, "eval_samples_per_second": 6.054, "eval_steps_per_second": 1.513, "step": 116 }, { "epoch": 0.05, "learning_rate": 2.8559999999999996e-07, "loss": 2.5445, "step": 120 }, { "epoch": 0.05, "eval_loss": 2.3402156829833984, "eval_runtime": 0.704, "eval_samples_per_second": 5.682, "eval_steps_per_second": 1.42, "step": 120 }, { "epoch": 0.05, "learning_rate": 2.8512e-07, "loss": 2.4994, "step": 124 }, { "epoch": 0.05, "eval_loss": 2.3362019062042236, "eval_runtime": 0.6849, "eval_samples_per_second": 5.84, "eval_steps_per_second": 1.46, "step": 124 }, { "epoch": 0.05, "learning_rate": 2.8464e-07, "loss": 2.5036, "step": 128 }, { "epoch": 0.05, "eval_loss": 2.3319339752197266, "eval_runtime": 0.4864, "eval_samples_per_second": 8.223, "eval_steps_per_second": 2.056, "step": 128 }, { "epoch": 0.05, "learning_rate": 2.8416e-07, "loss": 2.5525, "step": 132 }, { "epoch": 0.05, "eval_loss": 2.3276522159576416, "eval_runtime": 0.4783, "eval_samples_per_second": 8.364, "eval_steps_per_second": 2.091, "step": 132 }, { "epoch": 0.05, "learning_rate": 2.8368e-07, "loss": 2.5245, "step": 136 }, { "epoch": 0.05, "eval_loss": 2.3241090774536133, "eval_runtime": 0.4805, "eval_samples_per_second": 8.324, "eval_steps_per_second": 2.081, "step": 136 }, { "epoch": 0.06, "learning_rate": 2.832e-07, "loss": 2.4946, "step": 140 }, { "epoch": 0.06, "eval_loss": 2.3198165893554688, "eval_runtime": 0.473, "eval_samples_per_second": 8.457, "eval_steps_per_second": 2.114, "step": 140 }, { "epoch": 0.06, "learning_rate": 2.8272e-07, "loss": 2.5142, "step": 144 }, { "epoch": 0.06, "eval_loss": 2.3152613639831543, "eval_runtime": 0.4858, "eval_samples_per_second": 8.234, "eval_steps_per_second": 2.058, "step": 144 }, { "epoch": 0.06, "learning_rate": 2.8223999999999997e-07, "loss": 2.4639, "step": 148 }, { "epoch": 0.06, "eval_loss": 2.3112645149230957, "eval_runtime": 0.488, "eval_samples_per_second": 8.196, "eval_steps_per_second": 2.049, "step": 148 }, { "epoch": 0.06, "learning_rate": 2.8176e-07, "loss": 2.4796, "step": 152 }, { "epoch": 0.06, "eval_loss": 2.307020902633667, "eval_runtime": 0.6163, "eval_samples_per_second": 6.49, "eval_steps_per_second": 1.623, "step": 152 }, { "epoch": 0.06, "learning_rate": 2.8128e-07, "loss": 2.4529, "step": 156 }, { "epoch": 0.06, "eval_loss": 2.303062915802002, "eval_runtime": 0.6764, "eval_samples_per_second": 5.913, "eval_steps_per_second": 1.478, "step": 156 }, { "epoch": 0.06, "learning_rate": 2.808e-07, "loss": 2.4823, "step": 160 }, { "epoch": 0.06, "eval_loss": 2.2993311882019043, "eval_runtime": 0.6854, "eval_samples_per_second": 5.836, "eval_steps_per_second": 1.459, "step": 160 }, { "epoch": 0.07, "learning_rate": 2.8032e-07, "loss": 2.4439, "step": 164 }, { "epoch": 0.07, "eval_loss": 2.2947850227355957, "eval_runtime": 0.4745, "eval_samples_per_second": 8.429, "eval_steps_per_second": 2.107, "step": 164 }, { "epoch": 0.07, "learning_rate": 2.7984e-07, "loss": 2.4652, "step": 168 }, { "epoch": 0.07, "eval_loss": 2.2908992767333984, "eval_runtime": 0.4759, "eval_samples_per_second": 8.406, "eval_steps_per_second": 2.101, "step": 168 }, { "epoch": 0.07, "learning_rate": 2.7936e-07, "loss": 2.4574, "step": 172 }, { "epoch": 0.07, "eval_loss": 2.2867026329040527, "eval_runtime": 0.4973, "eval_samples_per_second": 8.043, "eval_steps_per_second": 2.011, "step": 172 }, { "epoch": 0.07, "learning_rate": 2.7887999999999997e-07, "loss": 2.4557, "step": 176 }, { "epoch": 0.07, "eval_loss": 2.283027172088623, "eval_runtime": 0.4719, "eval_samples_per_second": 8.477, "eval_steps_per_second": 2.119, "step": 176 }, { "epoch": 0.07, "learning_rate": 2.784e-07, "loss": 2.4462, "step": 180 }, { "epoch": 0.07, "eval_loss": 2.2787420749664307, "eval_runtime": 0.472, "eval_samples_per_second": 8.474, "eval_steps_per_second": 2.119, "step": 180 }, { "epoch": 0.07, "learning_rate": 2.7792e-07, "loss": 2.3962, "step": 184 }, { "epoch": 0.07, "eval_loss": 2.2745461463928223, "eval_runtime": 0.6328, "eval_samples_per_second": 6.322, "eval_steps_per_second": 1.58, "step": 184 }, { "epoch": 0.08, "learning_rate": 2.7744e-07, "loss": 2.3666, "step": 188 }, { "epoch": 0.08, "eval_loss": 2.2705912590026855, "eval_runtime": 0.6375, "eval_samples_per_second": 6.274, "eval_steps_per_second": 1.569, "step": 188 }, { "epoch": 0.08, "learning_rate": 2.7696e-07, "loss": 2.5024, "step": 192 }, { "epoch": 0.08, "eval_loss": 2.266995906829834, "eval_runtime": 0.6984, "eval_samples_per_second": 5.727, "eval_steps_per_second": 1.432, "step": 192 }, { "epoch": 0.08, "learning_rate": 2.7648e-07, "loss": 2.4419, "step": 196 }, { "epoch": 0.08, "eval_loss": 2.2626519203186035, "eval_runtime": 0.7334, "eval_samples_per_second": 5.454, "eval_steps_per_second": 1.363, "step": 196 }, { "epoch": 0.08, "learning_rate": 2.76e-07, "loss": 2.4246, "step": 200 }, { "epoch": 0.08, "eval_loss": 2.2583603858947754, "eval_runtime": 0.48, "eval_samples_per_second": 8.333, "eval_steps_per_second": 2.083, "step": 200 }, { "epoch": 0.08, "learning_rate": 2.7551999999999997e-07, "loss": 2.3853, "step": 204 }, { "epoch": 0.08, "eval_loss": 2.2551512718200684, "eval_runtime": 0.4939, "eval_samples_per_second": 8.098, "eval_steps_per_second": 2.025, "step": 204 }, { "epoch": 0.08, "learning_rate": 2.7503999999999995e-07, "loss": 2.4032, "step": 208 }, { "epoch": 0.08, "eval_loss": 2.251105785369873, "eval_runtime": 0.46, "eval_samples_per_second": 8.695, "eval_steps_per_second": 2.174, "step": 208 }, { "epoch": 0.08, "learning_rate": 2.7456e-07, "loss": 2.4444, "step": 212 }, { "epoch": 0.08, "eval_loss": 2.247025489807129, "eval_runtime": 0.4948, "eval_samples_per_second": 8.084, "eval_steps_per_second": 2.021, "step": 212 }, { "epoch": 0.09, "learning_rate": 2.7408e-07, "loss": 2.2932, "step": 216 }, { "epoch": 0.09, "eval_loss": 2.242764472961426, "eval_runtime": 0.4897, "eval_samples_per_second": 8.168, "eval_steps_per_second": 2.042, "step": 216 }, { "epoch": 0.09, "learning_rate": 2.736e-07, "loss": 2.3929, "step": 220 }, { "epoch": 0.09, "eval_loss": 2.2391483783721924, "eval_runtime": 0.6128, "eval_samples_per_second": 6.528, "eval_steps_per_second": 1.632, "step": 220 }, { "epoch": 0.09, "learning_rate": 2.7312e-07, "loss": 2.4112, "step": 224 }, { "epoch": 0.09, "eval_loss": 2.234977960586548, "eval_runtime": 0.648, "eval_samples_per_second": 6.172, "eval_steps_per_second": 1.543, "step": 224 }, { "epoch": 0.09, "learning_rate": 2.7264e-07, "loss": 2.4191, "step": 228 }, { "epoch": 0.09, "eval_loss": 2.231099843978882, "eval_runtime": 0.6862, "eval_samples_per_second": 5.829, "eval_steps_per_second": 1.457, "step": 228 }, { "epoch": 0.09, "learning_rate": 2.7215999999999997e-07, "loss": 2.4408, "step": 232 }, { "epoch": 0.09, "eval_loss": 2.2272462844848633, "eval_runtime": 0.7076, "eval_samples_per_second": 5.653, "eval_steps_per_second": 1.413, "step": 232 }, { "epoch": 0.09, "learning_rate": 2.7167999999999996e-07, "loss": 2.3884, "step": 236 }, { "epoch": 0.09, "eval_loss": 2.223376750946045, "eval_runtime": 0.5169, "eval_samples_per_second": 7.738, "eval_steps_per_second": 1.935, "step": 236 }, { "epoch": 0.1, "learning_rate": 2.712e-07, "loss": 2.3689, "step": 240 }, { "epoch": 0.1, "eval_loss": 2.2195653915405273, "eval_runtime": 0.4793, "eval_samples_per_second": 8.346, "eval_steps_per_second": 2.086, "step": 240 }, { "epoch": 0.1, "learning_rate": 2.7072e-07, "loss": 2.3689, "step": 244 }, { "epoch": 0.1, "eval_loss": 2.2153775691986084, "eval_runtime": 0.4771, "eval_samples_per_second": 8.384, "eval_steps_per_second": 2.096, "step": 244 }, { "epoch": 0.1, "learning_rate": 2.7024e-07, "loss": 2.3249, "step": 248 }, { "epoch": 0.1, "eval_loss": 2.211355209350586, "eval_runtime": 0.4778, "eval_samples_per_second": 8.372, "eval_steps_per_second": 2.093, "step": 248 }, { "epoch": 0.1, "learning_rate": 2.6976e-07, "loss": 2.4286, "step": 252 }, { "epoch": 0.1, "eval_loss": 2.207773208618164, "eval_runtime": 0.4873, "eval_samples_per_second": 8.209, "eval_steps_per_second": 2.052, "step": 252 }, { "epoch": 0.1, "learning_rate": 2.6928e-07, "loss": 2.3497, "step": 256 }, { "epoch": 0.1, "eval_loss": 2.203867197036743, "eval_runtime": 0.6281, "eval_samples_per_second": 6.368, "eval_steps_per_second": 1.592, "step": 256 }, { "epoch": 0.1, "learning_rate": 2.6879999999999997e-07, "loss": 2.284, "step": 260 }, { "epoch": 0.1, "eval_loss": 2.199937582015991, "eval_runtime": 0.6885, "eval_samples_per_second": 5.81, "eval_steps_per_second": 1.452, "step": 260 }, { "epoch": 0.11, "learning_rate": 2.6831999999999996e-07, "loss": 2.3333, "step": 264 }, { "epoch": 0.11, "eval_loss": 2.1958465576171875, "eval_runtime": 0.6799, "eval_samples_per_second": 5.883, "eval_steps_per_second": 1.471, "step": 264 }, { "epoch": 0.11, "learning_rate": 2.6784e-07, "loss": 2.3305, "step": 268 }, { "epoch": 0.11, "eval_loss": 2.192072868347168, "eval_runtime": 0.7165, "eval_samples_per_second": 5.583, "eval_steps_per_second": 1.396, "step": 268 }, { "epoch": 0.11, "learning_rate": 2.6736e-07, "loss": 2.3465, "step": 272 }, { "epoch": 0.11, "eval_loss": 2.1882476806640625, "eval_runtime": 0.485, "eval_samples_per_second": 8.247, "eval_steps_per_second": 2.062, "step": 272 }, { "epoch": 0.11, "learning_rate": 2.6687999999999997e-07, "loss": 2.3274, "step": 276 }, { "epoch": 0.11, "eval_loss": 2.1841320991516113, "eval_runtime": 0.4767, "eval_samples_per_second": 8.391, "eval_steps_per_second": 2.098, "step": 276 }, { "epoch": 0.11, "learning_rate": 2.664e-07, "loss": 2.3641, "step": 280 }, { "epoch": 0.11, "eval_loss": 2.1803271770477295, "eval_runtime": 0.5146, "eval_samples_per_second": 7.774, "eval_steps_per_second": 1.943, "step": 280 }, { "epoch": 0.11, "learning_rate": 2.6592e-07, "loss": 2.3089, "step": 284 }, { "epoch": 0.11, "eval_loss": 2.176274538040161, "eval_runtime": 0.488, "eval_samples_per_second": 8.196, "eval_steps_per_second": 2.049, "step": 284 }, { "epoch": 0.12, "learning_rate": 2.6543999999999997e-07, "loss": 2.2645, "step": 288 }, { "epoch": 0.12, "eval_loss": 2.1720588207244873, "eval_runtime": 0.4973, "eval_samples_per_second": 8.043, "eval_steps_per_second": 2.011, "step": 288 }, { "epoch": 0.12, "learning_rate": 2.6495999999999996e-07, "loss": 2.3439, "step": 292 }, { "epoch": 0.12, "eval_loss": 2.1687240600585938, "eval_runtime": 0.6283, "eval_samples_per_second": 6.366, "eval_steps_per_second": 1.592, "step": 292 }, { "epoch": 0.12, "learning_rate": 2.6448e-07, "loss": 2.3285, "step": 296 }, { "epoch": 0.12, "eval_loss": 2.1649253368377686, "eval_runtime": 0.6996, "eval_samples_per_second": 5.718, "eval_steps_per_second": 1.429, "step": 296 }, { "epoch": 0.12, "learning_rate": 2.64e-07, "loss": 2.3126, "step": 300 }, { "epoch": 0.12, "eval_loss": 2.160398483276367, "eval_runtime": 0.6904, "eval_samples_per_second": 5.794, "eval_steps_per_second": 1.448, "step": 300 }, { "epoch": 0.12, "learning_rate": 2.6351999999999997e-07, "loss": 2.3356, "step": 304 }, { "epoch": 0.12, "eval_loss": 2.1570284366607666, "eval_runtime": 0.4953, "eval_samples_per_second": 8.076, "eval_steps_per_second": 2.019, "step": 304 }, { "epoch": 0.12, "learning_rate": 2.6304e-07, "loss": 2.3396, "step": 308 }, { "epoch": 0.12, "eval_loss": 2.1527013778686523, "eval_runtime": 0.4977, "eval_samples_per_second": 8.037, "eval_steps_per_second": 2.009, "step": 308 }, { "epoch": 0.12, "learning_rate": 2.6256e-07, "loss": 2.2972, "step": 312 }, { "epoch": 0.12, "eval_loss": 2.148724317550659, "eval_runtime": 0.4939, "eval_samples_per_second": 8.099, "eval_steps_per_second": 2.025, "step": 312 }, { "epoch": 0.13, "learning_rate": 2.6208e-07, "loss": 2.3321, "step": 316 }, { "epoch": 0.13, "eval_loss": 2.1449663639068604, "eval_runtime": 0.4784, "eval_samples_per_second": 8.362, "eval_steps_per_second": 2.09, "step": 316 }, { "epoch": 0.13, "learning_rate": 2.616e-07, "loss": 2.3348, "step": 320 }, { "epoch": 0.13, "eval_loss": 2.1414906978607178, "eval_runtime": 0.4949, "eval_samples_per_second": 8.082, "eval_steps_per_second": 2.021, "step": 320 }, { "epoch": 0.13, "learning_rate": 2.6112e-07, "loss": 2.2728, "step": 324 }, { "epoch": 0.13, "eval_loss": 2.1374001502990723, "eval_runtime": 0.6321, "eval_samples_per_second": 6.328, "eval_steps_per_second": 1.582, "step": 324 }, { "epoch": 0.13, "learning_rate": 2.6064e-07, "loss": 2.287, "step": 328 }, { "epoch": 0.13, "eval_loss": 2.1333529949188232, "eval_runtime": 0.6547, "eval_samples_per_second": 6.109, "eval_steps_per_second": 1.527, "step": 328 }, { "epoch": 0.13, "learning_rate": 2.6015999999999997e-07, "loss": 2.2474, "step": 332 }, { "epoch": 0.13, "eval_loss": 2.1297547817230225, "eval_runtime": 0.7093, "eval_samples_per_second": 5.639, "eval_steps_per_second": 1.41, "step": 332 }, { "epoch": 0.13, "learning_rate": 2.5968e-07, "loss": 2.3214, "step": 336 }, { "epoch": 0.13, "eval_loss": 2.126392364501953, "eval_runtime": 0.6909, "eval_samples_per_second": 5.789, "eval_steps_per_second": 1.447, "step": 336 }, { "epoch": 0.14, "learning_rate": 2.592e-07, "loss": 2.2725, "step": 340 }, { "epoch": 0.14, "eval_loss": 2.122309923171997, "eval_runtime": 0.4823, "eval_samples_per_second": 8.293, "eval_steps_per_second": 2.073, "step": 340 }, { "epoch": 0.14, "learning_rate": 2.5872000000000003e-07, "loss": 2.3114, "step": 344 }, { "epoch": 0.14, "eval_loss": 2.118303060531616, "eval_runtime": 0.4954, "eval_samples_per_second": 8.075, "eval_steps_per_second": 2.019, "step": 344 }, { "epoch": 0.14, "learning_rate": 2.5824e-07, "loss": 2.2333, "step": 348 }, { "epoch": 0.14, "eval_loss": 2.114621162414551, "eval_runtime": 0.4856, "eval_samples_per_second": 8.238, "eval_steps_per_second": 2.059, "step": 348 }, { "epoch": 0.14, "learning_rate": 2.5776e-07, "loss": 2.2812, "step": 352 }, { "epoch": 0.14, "eval_loss": 2.11067795753479, "eval_runtime": 0.4778, "eval_samples_per_second": 8.372, "eval_steps_per_second": 2.093, "step": 352 }, { "epoch": 0.14, "learning_rate": 2.5728e-07, "loss": 2.2454, "step": 356 }, { "epoch": 0.14, "eval_loss": 2.106940746307373, "eval_runtime": 0.4945, "eval_samples_per_second": 8.089, "eval_steps_per_second": 2.022, "step": 356 }, { "epoch": 0.14, "learning_rate": 2.5679999999999997e-07, "loss": 2.2261, "step": 360 }, { "epoch": 0.14, "eval_loss": 2.1031668186187744, "eval_runtime": 0.6521, "eval_samples_per_second": 6.134, "eval_steps_per_second": 1.533, "step": 360 }, { "epoch": 0.15, "learning_rate": 2.5632e-07, "loss": 2.2841, "step": 364 }, { "epoch": 0.15, "eval_loss": 2.0989203453063965, "eval_runtime": 0.6249, "eval_samples_per_second": 6.401, "eval_steps_per_second": 1.6, "step": 364 }, { "epoch": 0.15, "learning_rate": 2.5584e-07, "loss": 2.2481, "step": 368 }, { "epoch": 0.15, "eval_loss": 2.095189332962036, "eval_runtime": 0.6855, "eval_samples_per_second": 5.835, "eval_steps_per_second": 1.459, "step": 368 }, { "epoch": 0.15, "learning_rate": 2.5536e-07, "loss": 2.278, "step": 372 }, { "epoch": 0.15, "eval_loss": 2.0912463665008545, "eval_runtime": 0.7393, "eval_samples_per_second": 5.411, "eval_steps_per_second": 1.353, "step": 372 }, { "epoch": 0.15, "learning_rate": 2.5488e-07, "loss": 2.2765, "step": 376 }, { "epoch": 0.15, "eval_loss": 2.087336301803589, "eval_runtime": 0.4793, "eval_samples_per_second": 8.345, "eval_steps_per_second": 2.086, "step": 376 }, { "epoch": 0.15, "learning_rate": 2.544e-07, "loss": 2.2232, "step": 380 }, { "epoch": 0.15, "eval_loss": 2.0833120346069336, "eval_runtime": 0.487, "eval_samples_per_second": 8.214, "eval_steps_per_second": 2.053, "step": 380 }, { "epoch": 0.15, "learning_rate": 2.5392e-07, "loss": 2.306, "step": 384 }, { "epoch": 0.15, "eval_loss": 2.079479932785034, "eval_runtime": 0.4722, "eval_samples_per_second": 8.471, "eval_steps_per_second": 2.118, "step": 384 }, { "epoch": 0.16, "learning_rate": 2.5343999999999997e-07, "loss": 2.2126, "step": 388 }, { "epoch": 0.16, "eval_loss": 2.0760295391082764, "eval_runtime": 0.4958, "eval_samples_per_second": 8.068, "eval_steps_per_second": 2.017, "step": 388 }, { "epoch": 0.16, "learning_rate": 2.5295999999999996e-07, "loss": 2.2557, "step": 392 }, { "epoch": 0.16, "eval_loss": 2.072136402130127, "eval_runtime": 0.469, "eval_samples_per_second": 8.529, "eval_steps_per_second": 2.132, "step": 392 }, { "epoch": 0.16, "learning_rate": 2.5248e-07, "loss": 2.1988, "step": 396 }, { "epoch": 0.16, "eval_loss": 2.0683670043945312, "eval_runtime": 0.6385, "eval_samples_per_second": 6.264, "eval_steps_per_second": 1.566, "step": 396 }, { "epoch": 0.16, "learning_rate": 2.52e-07, "loss": 2.1917, "step": 400 }, { "epoch": 0.16, "eval_loss": 2.0638906955718994, "eval_runtime": 0.6834, "eval_samples_per_second": 5.853, "eval_steps_per_second": 1.463, "step": 400 }, { "epoch": 0.16, "learning_rate": 2.5152e-07, "loss": 2.2479, "step": 404 }, { "epoch": 0.16, "eval_loss": 2.0599253177642822, "eval_runtime": 0.7261, "eval_samples_per_second": 5.509, "eval_steps_per_second": 1.377, "step": 404 }, { "epoch": 0.16, "learning_rate": 2.5104e-07, "loss": 2.1484, "step": 408 }, { "epoch": 0.16, "eval_loss": 2.055751085281372, "eval_runtime": 0.7367, "eval_samples_per_second": 5.429, "eval_steps_per_second": 1.357, "step": 408 }, { "epoch": 0.16, "learning_rate": 2.5056e-07, "loss": 2.1886, "step": 412 }, { "epoch": 0.16, "eval_loss": 2.052119016647339, "eval_runtime": 0.4808, "eval_samples_per_second": 8.319, "eval_steps_per_second": 2.08, "step": 412 }, { "epoch": 0.17, "learning_rate": 2.5007999999999997e-07, "loss": 2.2026, "step": 416 }, { "epoch": 0.17, "eval_loss": 2.0482354164123535, "eval_runtime": 0.4856, "eval_samples_per_second": 8.238, "eval_steps_per_second": 2.059, "step": 416 }, { "epoch": 0.17, "learning_rate": 2.4959999999999996e-07, "loss": 2.1572, "step": 420 }, { "epoch": 0.17, "eval_loss": 2.0441887378692627, "eval_runtime": 0.4779, "eval_samples_per_second": 8.37, "eval_steps_per_second": 2.093, "step": 420 }, { "epoch": 0.17, "learning_rate": 2.4912e-07, "loss": 2.1931, "step": 424 }, { "epoch": 0.17, "eval_loss": 2.0399935245513916, "eval_runtime": 0.4803, "eval_samples_per_second": 8.329, "eval_steps_per_second": 2.082, "step": 424 }, { "epoch": 0.17, "learning_rate": 2.4864e-07, "loss": 2.161, "step": 428 }, { "epoch": 0.17, "eval_loss": 2.03645920753479, "eval_runtime": 0.4924, "eval_samples_per_second": 8.123, "eval_steps_per_second": 2.031, "step": 428 }, { "epoch": 0.17, "learning_rate": 2.4816e-07, "loss": 2.1115, "step": 432 }, { "epoch": 0.17, "eval_loss": 2.032196044921875, "eval_runtime": 0.6345, "eval_samples_per_second": 6.304, "eval_steps_per_second": 1.576, "step": 432 }, { "epoch": 0.17, "learning_rate": 2.4768e-07, "loss": 2.173, "step": 436 }, { "epoch": 0.17, "eval_loss": 2.028397560119629, "eval_runtime": 0.6625, "eval_samples_per_second": 6.038, "eval_steps_per_second": 1.509, "step": 436 }, { "epoch": 0.18, "learning_rate": 2.472e-07, "loss": 2.1491, "step": 440 }, { "epoch": 0.18, "eval_loss": 2.0247464179992676, "eval_runtime": 0.6969, "eval_samples_per_second": 5.74, "eval_steps_per_second": 1.435, "step": 440 }, { "epoch": 0.18, "learning_rate": 2.4672e-07, "loss": 2.1716, "step": 444 }, { "epoch": 0.18, "eval_loss": 2.0203933715820312, "eval_runtime": 0.7311, "eval_samples_per_second": 5.471, "eval_steps_per_second": 1.368, "step": 444 }, { "epoch": 0.18, "learning_rate": 2.4623999999999996e-07, "loss": 2.2031, "step": 448 }, { "epoch": 0.18, "eval_loss": 2.016533374786377, "eval_runtime": 0.4875, "eval_samples_per_second": 8.206, "eval_steps_per_second": 2.051, "step": 448 }, { "epoch": 0.18, "learning_rate": 2.4576e-07, "loss": 2.1466, "step": 452 }, { "epoch": 0.18, "eval_loss": 2.012568473815918, "eval_runtime": 0.4897, "eval_samples_per_second": 8.168, "eval_steps_per_second": 2.042, "step": 452 }, { "epoch": 0.18, "learning_rate": 2.4528e-07, "loss": 2.1384, "step": 456 }, { "epoch": 0.18, "eval_loss": 2.0088417530059814, "eval_runtime": 0.4969, "eval_samples_per_second": 8.05, "eval_steps_per_second": 2.013, "step": 456 }, { "epoch": 0.18, "learning_rate": 2.4479999999999997e-07, "loss": 2.1824, "step": 460 }, { "epoch": 0.18, "eval_loss": 2.0047850608825684, "eval_runtime": 0.4897, "eval_samples_per_second": 8.168, "eval_steps_per_second": 2.042, "step": 460 }, { "epoch": 0.19, "learning_rate": 2.4432e-07, "loss": 2.1401, "step": 464 }, { "epoch": 0.19, "eval_loss": 2.0006463527679443, "eval_runtime": 0.4882, "eval_samples_per_second": 8.193, "eval_steps_per_second": 2.048, "step": 464 }, { "epoch": 0.19, "learning_rate": 2.4384e-07, "loss": 2.2086, "step": 468 }, { "epoch": 0.19, "eval_loss": 1.9969314336776733, "eval_runtime": 0.6612, "eval_samples_per_second": 6.049, "eval_steps_per_second": 1.512, "step": 468 }, { "epoch": 0.19, "learning_rate": 2.4336e-07, "loss": 2.1687, "step": 472 }, { "epoch": 0.19, "eval_loss": 1.9925954341888428, "eval_runtime": 0.6804, "eval_samples_per_second": 5.879, "eval_steps_per_second": 1.47, "step": 472 }, { "epoch": 0.19, "learning_rate": 2.4287999999999996e-07, "loss": 2.145, "step": 476 }, { "epoch": 0.19, "eval_loss": 1.9888066053390503, "eval_runtime": 0.6955, "eval_samples_per_second": 5.752, "eval_steps_per_second": 1.438, "step": 476 }, { "epoch": 0.19, "learning_rate": 2.424e-07, "loss": 2.2007, "step": 480 }, { "epoch": 0.19, "eval_loss": 1.9850127696990967, "eval_runtime": 0.7558, "eval_samples_per_second": 5.292, "eval_steps_per_second": 1.323, "step": 480 }, { "epoch": 0.19, "learning_rate": 2.4192e-07, "loss": 2.1367, "step": 484 }, { "epoch": 0.19, "eval_loss": 1.9808437824249268, "eval_runtime": 0.4706, "eval_samples_per_second": 8.499, "eval_steps_per_second": 2.125, "step": 484 }, { "epoch": 0.2, "learning_rate": 2.4143999999999997e-07, "loss": 2.1291, "step": 488 }, { "epoch": 0.2, "eval_loss": 1.9767786264419556, "eval_runtime": 0.4803, "eval_samples_per_second": 8.327, "eval_steps_per_second": 2.082, "step": 488 }, { "epoch": 0.2, "learning_rate": 2.4096e-07, "loss": 2.1124, "step": 492 }, { "epoch": 0.2, "eval_loss": 1.9728602170944214, "eval_runtime": 0.4802, "eval_samples_per_second": 8.33, "eval_steps_per_second": 2.082, "step": 492 }, { "epoch": 0.2, "learning_rate": 2.4048e-07, "loss": 2.0738, "step": 496 }, { "epoch": 0.2, "eval_loss": 1.968900203704834, "eval_runtime": 0.4884, "eval_samples_per_second": 8.189, "eval_steps_per_second": 2.047, "step": 496 }, { "epoch": 0.2, "learning_rate": 2.4e-07, "loss": 2.1048, "step": 500 }, { "epoch": 0.2, "eval_loss": 1.9646457433700562, "eval_runtime": 0.5026, "eval_samples_per_second": 7.959, "eval_steps_per_second": 1.99, "step": 500 } ], "logging_steps": 4, "max_steps": 2500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 6.356044087296e+16, "trial_name": null, "trial_params": null }