diff --git "a/checkpoint-1500/trainer_state.json" "b/checkpoint-1500/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1500/trainer_state.json" @@ -0,0 +1,5270 @@ +{ + "best_metric": 1.2840522527694702, + "best_model_checkpoint": "./results/checkpoint-1500", + "epoch": 0.6, + "eval_steps": 4, + "global_step": 1500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 2.9951999999999997e-07, + "loss": 2.6285, + "step": 4 + }, + { + "epoch": 0.0, + "eval_loss": 2.4697508811950684, + "eval_runtime": 0.485, + "eval_samples_per_second": 8.248, + "eval_steps_per_second": 2.062, + "step": 4 + }, + { + "epoch": 0.0, + "learning_rate": 2.9904e-07, + "loss": 2.6222, + "step": 8 + }, + { + "epoch": 0.0, + "eval_loss": 2.465975284576416, + "eval_runtime": 0.6323, + "eval_samples_per_second": 6.326, + "eval_steps_per_second": 1.582, + "step": 8 + }, + { + "epoch": 0.0, + "learning_rate": 2.9856e-07, + "loss": 2.6536, + "step": 12 + }, + { + "epoch": 0.0, + "eval_loss": 2.460374116897583, + "eval_runtime": 0.6478, + "eval_samples_per_second": 6.175, + "eval_steps_per_second": 1.544, + "step": 12 + }, + { + "epoch": 0.01, + "learning_rate": 2.9808e-07, + "loss": 2.6785, + "step": 16 + }, + { + "epoch": 0.01, + "eval_loss": 2.4556970596313477, + "eval_runtime": 0.6653, + "eval_samples_per_second": 6.012, + "eval_steps_per_second": 1.503, + "step": 16 + }, + { + "epoch": 0.01, + "learning_rate": 2.9759999999999996e-07, + "loss": 2.6085, + "step": 20 + }, + { + "epoch": 0.01, + "eval_loss": 2.4514715671539307, + "eval_runtime": 0.5241, + "eval_samples_per_second": 7.632, + "eval_steps_per_second": 1.908, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 2.9711999999999995e-07, + "loss": 2.5907, + "step": 24 + }, + { + "epoch": 0.01, + "eval_loss": 2.4462974071502686, + "eval_runtime": 0.4689, + "eval_samples_per_second": 8.53, + "eval_steps_per_second": 2.133, + "step": 24 + }, + { + "epoch": 0.01, + "learning_rate": 2.9664e-07, + "loss": 2.5942, + "step": 28 + }, + { + "epoch": 0.01, + "eval_loss": 2.4415194988250732, + "eval_runtime": 0.4829, + "eval_samples_per_second": 8.284, + "eval_steps_per_second": 2.071, + "step": 28 + }, + { + "epoch": 0.01, + "learning_rate": 2.9615999999999997e-07, + "loss": 2.6101, + "step": 32 + }, + { + "epoch": 0.01, + "eval_loss": 2.437161922454834, + "eval_runtime": 0.4715, + "eval_samples_per_second": 8.483, + "eval_steps_per_second": 2.121, + "step": 32 + }, + { + "epoch": 0.01, + "learning_rate": 2.9568e-07, + "loss": 2.5827, + "step": 36 + }, + { + "epoch": 0.01, + "eval_loss": 2.432689666748047, + "eval_runtime": 0.4938, + "eval_samples_per_second": 8.1, + "eval_steps_per_second": 2.025, + "step": 36 + }, + { + "epoch": 0.02, + "learning_rate": 2.952e-07, + "loss": 2.5729, + "step": 40 + }, + { + "epoch": 0.02, + "eval_loss": 2.4281153678894043, + "eval_runtime": 0.5021, + "eval_samples_per_second": 7.966, + "eval_steps_per_second": 1.991, + "step": 40 + }, + { + "epoch": 0.02, + "learning_rate": 2.9472e-07, + "loss": 2.5856, + "step": 44 + }, + { + "epoch": 0.02, + "eval_loss": 2.423053741455078, + "eval_runtime": 0.593, + "eval_samples_per_second": 6.746, + "eval_steps_per_second": 1.686, + "step": 44 + }, + { + "epoch": 0.02, + "learning_rate": 2.9423999999999997e-07, + "loss": 2.589, + "step": 48 + }, + { + "epoch": 0.02, + "eval_loss": 2.418571949005127, + "eval_runtime": 0.6933, + "eval_samples_per_second": 5.77, + "eval_steps_per_second": 1.442, + "step": 48 + }, + { + "epoch": 0.02, + "learning_rate": 2.9375999999999995e-07, + "loss": 2.6483, + "step": 52 + }, + { + "epoch": 0.02, + "eval_loss": 2.414531946182251, + "eval_runtime": 0.7167, + "eval_samples_per_second": 5.581, + "eval_steps_per_second": 1.395, + "step": 52 + }, + { + "epoch": 0.02, + "learning_rate": 2.9328e-07, + "loss": 2.517, + "step": 56 + }, + { + "epoch": 0.02, + "eval_loss": 2.409538745880127, + "eval_runtime": 0.4826, + "eval_samples_per_second": 8.289, + "eval_steps_per_second": 2.072, + "step": 56 + }, + { + "epoch": 0.02, + "learning_rate": 2.928e-07, + "loss": 2.5987, + "step": 60 + }, + { + "epoch": 0.02, + "eval_loss": 2.4050426483154297, + "eval_runtime": 0.4757, + "eval_samples_per_second": 8.409, + "eval_steps_per_second": 2.102, + "step": 60 + }, + { + "epoch": 0.03, + "learning_rate": 2.9232e-07, + "loss": 2.5489, + "step": 64 + }, + { + "epoch": 0.03, + "eval_loss": 2.400360107421875, + "eval_runtime": 0.4945, + "eval_samples_per_second": 8.089, + "eval_steps_per_second": 2.022, + "step": 64 + }, + { + "epoch": 0.03, + "learning_rate": 2.9184e-07, + "loss": 2.5063, + "step": 68 + }, + { + "epoch": 0.03, + "eval_loss": 2.396500587463379, + "eval_runtime": 0.5, + "eval_samples_per_second": 8.001, + "eval_steps_per_second": 2.0, + "step": 68 + }, + { + "epoch": 0.03, + "learning_rate": 2.9136e-07, + "loss": 2.5867, + "step": 72 + }, + { + "epoch": 0.03, + "eval_loss": 2.3916146755218506, + "eval_runtime": 0.4602, + "eval_samples_per_second": 8.693, + "eval_steps_per_second": 2.173, + "step": 72 + }, + { + "epoch": 0.03, + "learning_rate": 2.9087999999999997e-07, + "loss": 2.544, + "step": 76 + }, + { + "epoch": 0.03, + "eval_loss": 2.3873047828674316, + "eval_runtime": 0.4731, + "eval_samples_per_second": 8.456, + "eval_steps_per_second": 2.114, + "step": 76 + }, + { + "epoch": 0.03, + "learning_rate": 2.9039999999999995e-07, + "loss": 2.5596, + "step": 80 + }, + { + "epoch": 0.03, + "eval_loss": 2.382803440093994, + "eval_runtime": 0.6092, + "eval_samples_per_second": 6.566, + "eval_steps_per_second": 1.642, + "step": 80 + }, + { + "epoch": 0.03, + "learning_rate": 2.8992e-07, + "loss": 2.5744, + "step": 84 + }, + { + "epoch": 0.03, + "eval_loss": 2.3786380290985107, + "eval_runtime": 0.7212, + "eval_samples_per_second": 5.546, + "eval_steps_per_second": 1.387, + "step": 84 + }, + { + "epoch": 0.04, + "learning_rate": 2.8944e-07, + "loss": 2.5588, + "step": 88 + }, + { + "epoch": 0.04, + "eval_loss": 2.374176502227783, + "eval_runtime": 0.6826, + "eval_samples_per_second": 5.86, + "eval_steps_per_second": 1.465, + "step": 88 + }, + { + "epoch": 0.04, + "learning_rate": 2.8895999999999996e-07, + "loss": 2.5579, + "step": 92 + }, + { + "epoch": 0.04, + "eval_loss": 2.3702104091644287, + "eval_runtime": 0.4896, + "eval_samples_per_second": 8.169, + "eval_steps_per_second": 2.042, + "step": 92 + }, + { + "epoch": 0.04, + "learning_rate": 2.8848e-07, + "loss": 2.5245, + "step": 96 + }, + { + "epoch": 0.04, + "eval_loss": 2.3660218715667725, + "eval_runtime": 0.4764, + "eval_samples_per_second": 8.397, + "eval_steps_per_second": 2.099, + "step": 96 + }, + { + "epoch": 0.04, + "learning_rate": 2.88e-07, + "loss": 2.5132, + "step": 100 + }, + { + "epoch": 0.04, + "eval_loss": 2.36110520362854, + "eval_runtime": 0.4799, + "eval_samples_per_second": 8.335, + "eval_steps_per_second": 2.084, + "step": 100 + }, + { + "epoch": 0.04, + "learning_rate": 2.8751999999999997e-07, + "loss": 2.5037, + "step": 104 + }, + { + "epoch": 0.04, + "eval_loss": 2.3570125102996826, + "eval_runtime": 0.4722, + "eval_samples_per_second": 8.47, + "eval_steps_per_second": 2.118, + "step": 104 + }, + { + "epoch": 0.04, + "learning_rate": 2.8704e-07, + "loss": 2.4727, + "step": 108 + }, + { + "epoch": 0.04, + "eval_loss": 2.3530666828155518, + "eval_runtime": 0.467, + "eval_samples_per_second": 8.565, + "eval_steps_per_second": 2.141, + "step": 108 + }, + { + "epoch": 0.04, + "learning_rate": 2.8656e-07, + "loss": 2.4709, + "step": 112 + }, + { + "epoch": 0.04, + "eval_loss": 2.348759412765503, + "eval_runtime": 0.501, + "eval_samples_per_second": 7.984, + "eval_steps_per_second": 1.996, + "step": 112 + }, + { + "epoch": 0.05, + "learning_rate": 2.8608e-07, + "loss": 2.4711, + "step": 116 + }, + { + "epoch": 0.05, + "eval_loss": 2.344454050064087, + "eval_runtime": 0.6607, + "eval_samples_per_second": 6.054, + "eval_steps_per_second": 1.513, + "step": 116 + }, + { + "epoch": 0.05, + "learning_rate": 2.8559999999999996e-07, + "loss": 2.5445, + "step": 120 + }, + { + "epoch": 0.05, + "eval_loss": 2.3402156829833984, + "eval_runtime": 0.704, + "eval_samples_per_second": 5.682, + "eval_steps_per_second": 1.42, + "step": 120 + }, + { + "epoch": 0.05, + "learning_rate": 2.8512e-07, + "loss": 2.4994, + "step": 124 + }, + { + "epoch": 0.05, + "eval_loss": 2.3362019062042236, + "eval_runtime": 0.6849, + "eval_samples_per_second": 5.84, + "eval_steps_per_second": 1.46, + "step": 124 + }, + { + "epoch": 0.05, + "learning_rate": 2.8464e-07, + "loss": 2.5036, + "step": 128 + }, + { + "epoch": 0.05, + "eval_loss": 2.3319339752197266, + "eval_runtime": 0.4864, + "eval_samples_per_second": 8.223, + "eval_steps_per_second": 2.056, + "step": 128 + }, + { + "epoch": 0.05, + "learning_rate": 2.8416e-07, + "loss": 2.5525, + "step": 132 + }, + { + "epoch": 0.05, + "eval_loss": 2.3276522159576416, + "eval_runtime": 0.4783, + "eval_samples_per_second": 8.364, + "eval_steps_per_second": 2.091, + "step": 132 + }, + { + "epoch": 0.05, + "learning_rate": 2.8368e-07, + "loss": 2.5245, + "step": 136 + }, + { + "epoch": 0.05, + "eval_loss": 2.3241090774536133, + "eval_runtime": 0.4805, + "eval_samples_per_second": 8.324, + "eval_steps_per_second": 2.081, + "step": 136 + }, + { + "epoch": 0.06, + "learning_rate": 2.832e-07, + "loss": 2.4946, + "step": 140 + }, + { + "epoch": 0.06, + "eval_loss": 2.3198165893554688, + "eval_runtime": 0.473, + "eval_samples_per_second": 8.457, + "eval_steps_per_second": 2.114, + "step": 140 + }, + { + "epoch": 0.06, + "learning_rate": 2.8272e-07, + "loss": 2.5142, + "step": 144 + }, + { + "epoch": 0.06, + "eval_loss": 2.3152613639831543, + "eval_runtime": 0.4858, + "eval_samples_per_second": 8.234, + "eval_steps_per_second": 2.058, + "step": 144 + }, + { + "epoch": 0.06, + "learning_rate": 2.8223999999999997e-07, + "loss": 2.4639, + "step": 148 + }, + { + "epoch": 0.06, + "eval_loss": 2.3112645149230957, + "eval_runtime": 0.488, + "eval_samples_per_second": 8.196, + "eval_steps_per_second": 2.049, + "step": 148 + }, + { + "epoch": 0.06, + "learning_rate": 2.8176e-07, + "loss": 2.4796, + "step": 152 + }, + { + "epoch": 0.06, + "eval_loss": 2.307020902633667, + "eval_runtime": 0.6163, + "eval_samples_per_second": 6.49, + "eval_steps_per_second": 1.623, + "step": 152 + }, + { + "epoch": 0.06, + "learning_rate": 2.8128e-07, + "loss": 2.4529, + "step": 156 + }, + { + "epoch": 0.06, + "eval_loss": 2.303062915802002, + "eval_runtime": 0.6764, + "eval_samples_per_second": 5.913, + "eval_steps_per_second": 1.478, + "step": 156 + }, + { + "epoch": 0.06, + "learning_rate": 2.808e-07, + "loss": 2.4823, + "step": 160 + }, + { + "epoch": 0.06, + "eval_loss": 2.2993311882019043, + "eval_runtime": 0.6854, + "eval_samples_per_second": 5.836, + "eval_steps_per_second": 1.459, + "step": 160 + }, + { + "epoch": 0.07, + "learning_rate": 2.8032e-07, + "loss": 2.4439, + "step": 164 + }, + { + "epoch": 0.07, + "eval_loss": 2.2947850227355957, + "eval_runtime": 0.4745, + "eval_samples_per_second": 8.429, + "eval_steps_per_second": 2.107, + "step": 164 + }, + { + "epoch": 0.07, + "learning_rate": 2.7984e-07, + "loss": 2.4652, + "step": 168 + }, + { + "epoch": 0.07, + "eval_loss": 2.2908992767333984, + "eval_runtime": 0.4759, + "eval_samples_per_second": 8.406, + "eval_steps_per_second": 2.101, + "step": 168 + }, + { + "epoch": 0.07, + "learning_rate": 2.7936e-07, + "loss": 2.4574, + "step": 172 + }, + { + "epoch": 0.07, + "eval_loss": 2.2867026329040527, + "eval_runtime": 0.4973, + "eval_samples_per_second": 8.043, + "eval_steps_per_second": 2.011, + "step": 172 + }, + { + "epoch": 0.07, + "learning_rate": 2.7887999999999997e-07, + "loss": 2.4557, + "step": 176 + }, + { + "epoch": 0.07, + "eval_loss": 2.283027172088623, + "eval_runtime": 0.4719, + "eval_samples_per_second": 8.477, + "eval_steps_per_second": 2.119, + "step": 176 + }, + { + "epoch": 0.07, + "learning_rate": 2.784e-07, + "loss": 2.4462, + "step": 180 + }, + { + "epoch": 0.07, + "eval_loss": 2.2787420749664307, + "eval_runtime": 0.472, + "eval_samples_per_second": 8.474, + "eval_steps_per_second": 2.119, + "step": 180 + }, + { + "epoch": 0.07, + "learning_rate": 2.7792e-07, + "loss": 2.3962, + "step": 184 + }, + { + "epoch": 0.07, + "eval_loss": 2.2745461463928223, + "eval_runtime": 0.6328, + "eval_samples_per_second": 6.322, + "eval_steps_per_second": 1.58, + "step": 184 + }, + { + "epoch": 0.08, + "learning_rate": 2.7744e-07, + "loss": 2.3666, + "step": 188 + }, + { + "epoch": 0.08, + "eval_loss": 2.2705912590026855, + "eval_runtime": 0.6375, + "eval_samples_per_second": 6.274, + "eval_steps_per_second": 1.569, + "step": 188 + }, + { + "epoch": 0.08, + "learning_rate": 2.7696e-07, + "loss": 2.5024, + "step": 192 + }, + { + "epoch": 0.08, + "eval_loss": 2.266995906829834, + "eval_runtime": 0.6984, + "eval_samples_per_second": 5.727, + "eval_steps_per_second": 1.432, + "step": 192 + }, + { + "epoch": 0.08, + "learning_rate": 2.7648e-07, + "loss": 2.4419, + "step": 196 + }, + { + "epoch": 0.08, + "eval_loss": 2.2626519203186035, + "eval_runtime": 0.7334, + "eval_samples_per_second": 5.454, + "eval_steps_per_second": 1.363, + "step": 196 + }, + { + "epoch": 0.08, + "learning_rate": 2.76e-07, + "loss": 2.4246, + "step": 200 + }, + { + "epoch": 0.08, + "eval_loss": 2.2583603858947754, + "eval_runtime": 0.48, + "eval_samples_per_second": 8.333, + "eval_steps_per_second": 2.083, + "step": 200 + }, + { + "epoch": 0.08, + "learning_rate": 2.7551999999999997e-07, + "loss": 2.3853, + "step": 204 + }, + { + "epoch": 0.08, + "eval_loss": 2.2551512718200684, + "eval_runtime": 0.4939, + "eval_samples_per_second": 8.098, + "eval_steps_per_second": 2.025, + "step": 204 + }, + { + "epoch": 0.08, + "learning_rate": 2.7503999999999995e-07, + "loss": 2.4032, + "step": 208 + }, + { + "epoch": 0.08, + "eval_loss": 2.251105785369873, + "eval_runtime": 0.46, + "eval_samples_per_second": 8.695, + "eval_steps_per_second": 2.174, + "step": 208 + }, + { + "epoch": 0.08, + "learning_rate": 2.7456e-07, + "loss": 2.4444, + "step": 212 + }, + { + "epoch": 0.08, + "eval_loss": 2.247025489807129, + "eval_runtime": 0.4948, + "eval_samples_per_second": 8.084, + "eval_steps_per_second": 2.021, + "step": 212 + }, + { + "epoch": 0.09, + "learning_rate": 2.7408e-07, + "loss": 2.2932, + "step": 216 + }, + { + "epoch": 0.09, + "eval_loss": 2.242764472961426, + "eval_runtime": 0.4897, + "eval_samples_per_second": 8.168, + "eval_steps_per_second": 2.042, + "step": 216 + }, + { + "epoch": 0.09, + "learning_rate": 2.736e-07, + "loss": 2.3929, + "step": 220 + }, + { + "epoch": 0.09, + "eval_loss": 2.2391483783721924, + "eval_runtime": 0.6128, + "eval_samples_per_second": 6.528, + "eval_steps_per_second": 1.632, + "step": 220 + }, + { + "epoch": 0.09, + "learning_rate": 2.7312e-07, + "loss": 2.4112, + "step": 224 + }, + { + "epoch": 0.09, + "eval_loss": 2.234977960586548, + "eval_runtime": 0.648, + "eval_samples_per_second": 6.172, + "eval_steps_per_second": 1.543, + "step": 224 + }, + { + "epoch": 0.09, + "learning_rate": 2.7264e-07, + "loss": 2.4191, + "step": 228 + }, + { + "epoch": 0.09, + "eval_loss": 2.231099843978882, + "eval_runtime": 0.6862, + "eval_samples_per_second": 5.829, + "eval_steps_per_second": 1.457, + "step": 228 + }, + { + "epoch": 0.09, + "learning_rate": 2.7215999999999997e-07, + "loss": 2.4408, + "step": 232 + }, + { + "epoch": 0.09, + "eval_loss": 2.2272462844848633, + "eval_runtime": 0.7076, + "eval_samples_per_second": 5.653, + "eval_steps_per_second": 1.413, + "step": 232 + }, + { + "epoch": 0.09, + "learning_rate": 2.7167999999999996e-07, + "loss": 2.3884, + "step": 236 + }, + { + "epoch": 0.09, + "eval_loss": 2.223376750946045, + "eval_runtime": 0.5169, + "eval_samples_per_second": 7.738, + "eval_steps_per_second": 1.935, + "step": 236 + }, + { + "epoch": 0.1, + "learning_rate": 2.712e-07, + "loss": 2.3689, + "step": 240 + }, + { + "epoch": 0.1, + "eval_loss": 2.2195653915405273, + "eval_runtime": 0.4793, + "eval_samples_per_second": 8.346, + "eval_steps_per_second": 2.086, + "step": 240 + }, + { + "epoch": 0.1, + "learning_rate": 2.7072e-07, + "loss": 2.3689, + "step": 244 + }, + { + "epoch": 0.1, + "eval_loss": 2.2153775691986084, + "eval_runtime": 0.4771, + "eval_samples_per_second": 8.384, + "eval_steps_per_second": 2.096, + "step": 244 + }, + { + "epoch": 0.1, + "learning_rate": 2.7024e-07, + "loss": 2.3249, + "step": 248 + }, + { + "epoch": 0.1, + "eval_loss": 2.211355209350586, + "eval_runtime": 0.4778, + "eval_samples_per_second": 8.372, + "eval_steps_per_second": 2.093, + "step": 248 + }, + { + "epoch": 0.1, + "learning_rate": 2.6976e-07, + "loss": 2.4286, + "step": 252 + }, + { + "epoch": 0.1, + "eval_loss": 2.207773208618164, + "eval_runtime": 0.4873, + "eval_samples_per_second": 8.209, + "eval_steps_per_second": 2.052, + "step": 252 + }, + { + "epoch": 0.1, + "learning_rate": 2.6928e-07, + "loss": 2.3497, + "step": 256 + }, + { + "epoch": 0.1, + "eval_loss": 2.203867197036743, + "eval_runtime": 0.6281, + "eval_samples_per_second": 6.368, + "eval_steps_per_second": 1.592, + "step": 256 + }, + { + "epoch": 0.1, + "learning_rate": 2.6879999999999997e-07, + "loss": 2.284, + "step": 260 + }, + { + "epoch": 0.1, + "eval_loss": 2.199937582015991, + "eval_runtime": 0.6885, + "eval_samples_per_second": 5.81, + "eval_steps_per_second": 1.452, + "step": 260 + }, + { + "epoch": 0.11, + "learning_rate": 2.6831999999999996e-07, + "loss": 2.3333, + "step": 264 + }, + { + "epoch": 0.11, + "eval_loss": 2.1958465576171875, + "eval_runtime": 0.6799, + "eval_samples_per_second": 5.883, + "eval_steps_per_second": 1.471, + "step": 264 + }, + { + "epoch": 0.11, + "learning_rate": 2.6784e-07, + "loss": 2.3305, + "step": 268 + }, + { + "epoch": 0.11, + "eval_loss": 2.192072868347168, + "eval_runtime": 0.7165, + "eval_samples_per_second": 5.583, + "eval_steps_per_second": 1.396, + "step": 268 + }, + { + "epoch": 0.11, + "learning_rate": 2.6736e-07, + "loss": 2.3465, + "step": 272 + }, + { + "epoch": 0.11, + "eval_loss": 2.1882476806640625, + "eval_runtime": 0.485, + "eval_samples_per_second": 8.247, + "eval_steps_per_second": 2.062, + "step": 272 + }, + { + "epoch": 0.11, + "learning_rate": 2.6687999999999997e-07, + "loss": 2.3274, + "step": 276 + }, + { + "epoch": 0.11, + "eval_loss": 2.1841320991516113, + "eval_runtime": 0.4767, + "eval_samples_per_second": 8.391, + "eval_steps_per_second": 2.098, + "step": 276 + }, + { + "epoch": 0.11, + "learning_rate": 2.664e-07, + "loss": 2.3641, + "step": 280 + }, + { + "epoch": 0.11, + "eval_loss": 2.1803271770477295, + "eval_runtime": 0.5146, + "eval_samples_per_second": 7.774, + "eval_steps_per_second": 1.943, + "step": 280 + }, + { + "epoch": 0.11, + "learning_rate": 2.6592e-07, + "loss": 2.3089, + "step": 284 + }, + { + "epoch": 0.11, + "eval_loss": 2.176274538040161, + "eval_runtime": 0.488, + "eval_samples_per_second": 8.196, + "eval_steps_per_second": 2.049, + "step": 284 + }, + { + "epoch": 0.12, + "learning_rate": 2.6543999999999997e-07, + "loss": 2.2645, + "step": 288 + }, + { + "epoch": 0.12, + "eval_loss": 2.1720588207244873, + "eval_runtime": 0.4973, + "eval_samples_per_second": 8.043, + "eval_steps_per_second": 2.011, + "step": 288 + }, + { + "epoch": 0.12, + "learning_rate": 2.6495999999999996e-07, + "loss": 2.3439, + "step": 292 + }, + { + "epoch": 0.12, + "eval_loss": 2.1687240600585938, + "eval_runtime": 0.6283, + "eval_samples_per_second": 6.366, + "eval_steps_per_second": 1.592, + "step": 292 + }, + { + "epoch": 0.12, + "learning_rate": 2.6448e-07, + "loss": 2.3285, + "step": 296 + }, + { + "epoch": 0.12, + "eval_loss": 2.1649253368377686, + "eval_runtime": 0.6996, + "eval_samples_per_second": 5.718, + "eval_steps_per_second": 1.429, + "step": 296 + }, + { + "epoch": 0.12, + "learning_rate": 2.64e-07, + "loss": 2.3126, + "step": 300 + }, + { + "epoch": 0.12, + "eval_loss": 2.160398483276367, + "eval_runtime": 0.6904, + "eval_samples_per_second": 5.794, + "eval_steps_per_second": 1.448, + "step": 300 + }, + { + "epoch": 0.12, + "learning_rate": 2.6351999999999997e-07, + "loss": 2.3356, + "step": 304 + }, + { + "epoch": 0.12, + "eval_loss": 2.1570284366607666, + "eval_runtime": 0.4953, + "eval_samples_per_second": 8.076, + "eval_steps_per_second": 2.019, + "step": 304 + }, + { + "epoch": 0.12, + "learning_rate": 2.6304e-07, + "loss": 2.3396, + "step": 308 + }, + { + "epoch": 0.12, + "eval_loss": 2.1527013778686523, + "eval_runtime": 0.4977, + "eval_samples_per_second": 8.037, + "eval_steps_per_second": 2.009, + "step": 308 + }, + { + "epoch": 0.12, + "learning_rate": 2.6256e-07, + "loss": 2.2972, + "step": 312 + }, + { + "epoch": 0.12, + "eval_loss": 2.148724317550659, + "eval_runtime": 0.4939, + "eval_samples_per_second": 8.099, + "eval_steps_per_second": 2.025, + "step": 312 + }, + { + "epoch": 0.13, + "learning_rate": 2.6208e-07, + "loss": 2.3321, + "step": 316 + }, + { + "epoch": 0.13, + "eval_loss": 2.1449663639068604, + "eval_runtime": 0.4784, + "eval_samples_per_second": 8.362, + "eval_steps_per_second": 2.09, + "step": 316 + }, + { + "epoch": 0.13, + "learning_rate": 2.616e-07, + "loss": 2.3348, + "step": 320 + }, + { + "epoch": 0.13, + "eval_loss": 2.1414906978607178, + "eval_runtime": 0.4949, + "eval_samples_per_second": 8.082, + "eval_steps_per_second": 2.021, + "step": 320 + }, + { + "epoch": 0.13, + "learning_rate": 2.6112e-07, + "loss": 2.2728, + "step": 324 + }, + { + "epoch": 0.13, + "eval_loss": 2.1374001502990723, + "eval_runtime": 0.6321, + "eval_samples_per_second": 6.328, + "eval_steps_per_second": 1.582, + "step": 324 + }, + { + "epoch": 0.13, + "learning_rate": 2.6064e-07, + "loss": 2.287, + "step": 328 + }, + { + "epoch": 0.13, + "eval_loss": 2.1333529949188232, + "eval_runtime": 0.6547, + "eval_samples_per_second": 6.109, + "eval_steps_per_second": 1.527, + "step": 328 + }, + { + "epoch": 0.13, + "learning_rate": 2.6015999999999997e-07, + "loss": 2.2474, + "step": 332 + }, + { + "epoch": 0.13, + "eval_loss": 2.1297547817230225, + "eval_runtime": 0.7093, + "eval_samples_per_second": 5.639, + "eval_steps_per_second": 1.41, + "step": 332 + }, + { + "epoch": 0.13, + "learning_rate": 2.5968e-07, + "loss": 2.3214, + "step": 336 + }, + { + "epoch": 0.13, + "eval_loss": 2.126392364501953, + "eval_runtime": 0.6909, + "eval_samples_per_second": 5.789, + "eval_steps_per_second": 1.447, + "step": 336 + }, + { + "epoch": 0.14, + "learning_rate": 2.592e-07, + "loss": 2.2725, + "step": 340 + }, + { + "epoch": 0.14, + "eval_loss": 2.122309923171997, + "eval_runtime": 0.4823, + "eval_samples_per_second": 8.293, + "eval_steps_per_second": 2.073, + "step": 340 + }, + { + "epoch": 0.14, + "learning_rate": 2.5872000000000003e-07, + "loss": 2.3114, + "step": 344 + }, + { + "epoch": 0.14, + "eval_loss": 2.118303060531616, + "eval_runtime": 0.4954, + "eval_samples_per_second": 8.075, + "eval_steps_per_second": 2.019, + "step": 344 + }, + { + "epoch": 0.14, + "learning_rate": 2.5824e-07, + "loss": 2.2333, + "step": 348 + }, + { + "epoch": 0.14, + "eval_loss": 2.114621162414551, + "eval_runtime": 0.4856, + "eval_samples_per_second": 8.238, + "eval_steps_per_second": 2.059, + "step": 348 + }, + { + "epoch": 0.14, + "learning_rate": 2.5776e-07, + "loss": 2.2812, + "step": 352 + }, + { + "epoch": 0.14, + "eval_loss": 2.11067795753479, + "eval_runtime": 0.4778, + "eval_samples_per_second": 8.372, + "eval_steps_per_second": 2.093, + "step": 352 + }, + { + "epoch": 0.14, + "learning_rate": 2.5728e-07, + "loss": 2.2454, + "step": 356 + }, + { + "epoch": 0.14, + "eval_loss": 2.106940746307373, + "eval_runtime": 0.4945, + "eval_samples_per_second": 8.089, + "eval_steps_per_second": 2.022, + "step": 356 + }, + { + "epoch": 0.14, + "learning_rate": 2.5679999999999997e-07, + "loss": 2.2261, + "step": 360 + }, + { + "epoch": 0.14, + "eval_loss": 2.1031668186187744, + "eval_runtime": 0.6521, + "eval_samples_per_second": 6.134, + "eval_steps_per_second": 1.533, + "step": 360 + }, + { + "epoch": 0.15, + "learning_rate": 2.5632e-07, + "loss": 2.2841, + "step": 364 + }, + { + "epoch": 0.15, + "eval_loss": 2.0989203453063965, + "eval_runtime": 0.6249, + "eval_samples_per_second": 6.401, + "eval_steps_per_second": 1.6, + "step": 364 + }, + { + "epoch": 0.15, + "learning_rate": 2.5584e-07, + "loss": 2.2481, + "step": 368 + }, + { + "epoch": 0.15, + "eval_loss": 2.095189332962036, + "eval_runtime": 0.6855, + "eval_samples_per_second": 5.835, + "eval_steps_per_second": 1.459, + "step": 368 + }, + { + "epoch": 0.15, + "learning_rate": 2.5536e-07, + "loss": 2.278, + "step": 372 + }, + { + "epoch": 0.15, + "eval_loss": 2.0912463665008545, + "eval_runtime": 0.7393, + "eval_samples_per_second": 5.411, + "eval_steps_per_second": 1.353, + "step": 372 + }, + { + "epoch": 0.15, + "learning_rate": 2.5488e-07, + "loss": 2.2765, + "step": 376 + }, + { + "epoch": 0.15, + "eval_loss": 2.087336301803589, + "eval_runtime": 0.4793, + "eval_samples_per_second": 8.345, + "eval_steps_per_second": 2.086, + "step": 376 + }, + { + "epoch": 0.15, + "learning_rate": 2.544e-07, + "loss": 2.2232, + "step": 380 + }, + { + "epoch": 0.15, + "eval_loss": 2.0833120346069336, + "eval_runtime": 0.487, + "eval_samples_per_second": 8.214, + "eval_steps_per_second": 2.053, + "step": 380 + }, + { + "epoch": 0.15, + "learning_rate": 2.5392e-07, + "loss": 2.306, + "step": 384 + }, + { + "epoch": 0.15, + "eval_loss": 2.079479932785034, + "eval_runtime": 0.4722, + "eval_samples_per_second": 8.471, + "eval_steps_per_second": 2.118, + "step": 384 + }, + { + "epoch": 0.16, + "learning_rate": 2.5343999999999997e-07, + "loss": 2.2126, + "step": 388 + }, + { + "epoch": 0.16, + "eval_loss": 2.0760295391082764, + "eval_runtime": 0.4958, + "eval_samples_per_second": 8.068, + "eval_steps_per_second": 2.017, + "step": 388 + }, + { + "epoch": 0.16, + "learning_rate": 2.5295999999999996e-07, + "loss": 2.2557, + "step": 392 + }, + { + "epoch": 0.16, + "eval_loss": 2.072136402130127, + "eval_runtime": 0.469, + "eval_samples_per_second": 8.529, + "eval_steps_per_second": 2.132, + "step": 392 + }, + { + "epoch": 0.16, + "learning_rate": 2.5248e-07, + "loss": 2.1988, + "step": 396 + }, + { + "epoch": 0.16, + "eval_loss": 2.0683670043945312, + "eval_runtime": 0.6385, + "eval_samples_per_second": 6.264, + "eval_steps_per_second": 1.566, + "step": 396 + }, + { + "epoch": 0.16, + "learning_rate": 2.52e-07, + "loss": 2.1917, + "step": 400 + }, + { + "epoch": 0.16, + "eval_loss": 2.0638906955718994, + "eval_runtime": 0.6834, + "eval_samples_per_second": 5.853, + "eval_steps_per_second": 1.463, + "step": 400 + }, + { + "epoch": 0.16, + "learning_rate": 2.5152e-07, + "loss": 2.2479, + "step": 404 + }, + { + "epoch": 0.16, + "eval_loss": 2.0599253177642822, + "eval_runtime": 0.7261, + "eval_samples_per_second": 5.509, + "eval_steps_per_second": 1.377, + "step": 404 + }, + { + "epoch": 0.16, + "learning_rate": 2.5104e-07, + "loss": 2.1484, + "step": 408 + }, + { + "epoch": 0.16, + "eval_loss": 2.055751085281372, + "eval_runtime": 0.7367, + "eval_samples_per_second": 5.429, + "eval_steps_per_second": 1.357, + "step": 408 + }, + { + "epoch": 0.16, + "learning_rate": 2.5056e-07, + "loss": 2.1886, + "step": 412 + }, + { + "epoch": 0.16, + "eval_loss": 2.052119016647339, + "eval_runtime": 0.4808, + "eval_samples_per_second": 8.319, + "eval_steps_per_second": 2.08, + "step": 412 + }, + { + "epoch": 0.17, + "learning_rate": 2.5007999999999997e-07, + "loss": 2.2026, + "step": 416 + }, + { + "epoch": 0.17, + "eval_loss": 2.0482354164123535, + "eval_runtime": 0.4856, + "eval_samples_per_second": 8.238, + "eval_steps_per_second": 2.059, + "step": 416 + }, + { + "epoch": 0.17, + "learning_rate": 2.4959999999999996e-07, + "loss": 2.1572, + "step": 420 + }, + { + "epoch": 0.17, + "eval_loss": 2.0441887378692627, + "eval_runtime": 0.4779, + "eval_samples_per_second": 8.37, + "eval_steps_per_second": 2.093, + "step": 420 + }, + { + "epoch": 0.17, + "learning_rate": 2.4912e-07, + "loss": 2.1931, + "step": 424 + }, + { + "epoch": 0.17, + "eval_loss": 2.0399935245513916, + "eval_runtime": 0.4803, + "eval_samples_per_second": 8.329, + "eval_steps_per_second": 2.082, + "step": 424 + }, + { + "epoch": 0.17, + "learning_rate": 2.4864e-07, + "loss": 2.161, + "step": 428 + }, + { + "epoch": 0.17, + "eval_loss": 2.03645920753479, + "eval_runtime": 0.4924, + "eval_samples_per_second": 8.123, + "eval_steps_per_second": 2.031, + "step": 428 + }, + { + "epoch": 0.17, + "learning_rate": 2.4816e-07, + "loss": 2.1115, + "step": 432 + }, + { + "epoch": 0.17, + "eval_loss": 2.032196044921875, + "eval_runtime": 0.6345, + "eval_samples_per_second": 6.304, + "eval_steps_per_second": 1.576, + "step": 432 + }, + { + "epoch": 0.17, + "learning_rate": 2.4768e-07, + "loss": 2.173, + "step": 436 + }, + { + "epoch": 0.17, + "eval_loss": 2.028397560119629, + "eval_runtime": 0.6625, + "eval_samples_per_second": 6.038, + "eval_steps_per_second": 1.509, + "step": 436 + }, + { + "epoch": 0.18, + "learning_rate": 2.472e-07, + "loss": 2.1491, + "step": 440 + }, + { + "epoch": 0.18, + "eval_loss": 2.0247464179992676, + "eval_runtime": 0.6969, + "eval_samples_per_second": 5.74, + "eval_steps_per_second": 1.435, + "step": 440 + }, + { + "epoch": 0.18, + "learning_rate": 2.4672e-07, + "loss": 2.1716, + "step": 444 + }, + { + "epoch": 0.18, + "eval_loss": 2.0203933715820312, + "eval_runtime": 0.7311, + "eval_samples_per_second": 5.471, + "eval_steps_per_second": 1.368, + "step": 444 + }, + { + "epoch": 0.18, + "learning_rate": 2.4623999999999996e-07, + "loss": 2.2031, + "step": 448 + }, + { + "epoch": 0.18, + "eval_loss": 2.016533374786377, + "eval_runtime": 0.4875, + "eval_samples_per_second": 8.206, + "eval_steps_per_second": 2.051, + "step": 448 + }, + { + "epoch": 0.18, + "learning_rate": 2.4576e-07, + "loss": 2.1466, + "step": 452 + }, + { + "epoch": 0.18, + "eval_loss": 2.012568473815918, + "eval_runtime": 0.4897, + "eval_samples_per_second": 8.168, + "eval_steps_per_second": 2.042, + "step": 452 + }, + { + "epoch": 0.18, + "learning_rate": 2.4528e-07, + "loss": 2.1384, + "step": 456 + }, + { + "epoch": 0.18, + "eval_loss": 2.0088417530059814, + "eval_runtime": 0.4969, + "eval_samples_per_second": 8.05, + "eval_steps_per_second": 2.013, + "step": 456 + }, + { + "epoch": 0.18, + "learning_rate": 2.4479999999999997e-07, + "loss": 2.1824, + "step": 460 + }, + { + "epoch": 0.18, + "eval_loss": 2.0047850608825684, + "eval_runtime": 0.4897, + "eval_samples_per_second": 8.168, + "eval_steps_per_second": 2.042, + "step": 460 + }, + { + "epoch": 0.19, + "learning_rate": 2.4432e-07, + "loss": 2.1401, + "step": 464 + }, + { + "epoch": 0.19, + "eval_loss": 2.0006463527679443, + "eval_runtime": 0.4882, + "eval_samples_per_second": 8.193, + "eval_steps_per_second": 2.048, + "step": 464 + }, + { + "epoch": 0.19, + "learning_rate": 2.4384e-07, + "loss": 2.2086, + "step": 468 + }, + { + "epoch": 0.19, + "eval_loss": 1.9969314336776733, + "eval_runtime": 0.6612, + "eval_samples_per_second": 6.049, + "eval_steps_per_second": 1.512, + "step": 468 + }, + { + "epoch": 0.19, + "learning_rate": 2.4336e-07, + "loss": 2.1687, + "step": 472 + }, + { + "epoch": 0.19, + "eval_loss": 1.9925954341888428, + "eval_runtime": 0.6804, + "eval_samples_per_second": 5.879, + "eval_steps_per_second": 1.47, + "step": 472 + }, + { + "epoch": 0.19, + "learning_rate": 2.4287999999999996e-07, + "loss": 2.145, + "step": 476 + }, + { + "epoch": 0.19, + "eval_loss": 1.9888066053390503, + "eval_runtime": 0.6955, + "eval_samples_per_second": 5.752, + "eval_steps_per_second": 1.438, + "step": 476 + }, + { + "epoch": 0.19, + "learning_rate": 2.424e-07, + "loss": 2.2007, + "step": 480 + }, + { + "epoch": 0.19, + "eval_loss": 1.9850127696990967, + "eval_runtime": 0.7558, + "eval_samples_per_second": 5.292, + "eval_steps_per_second": 1.323, + "step": 480 + }, + { + "epoch": 0.19, + "learning_rate": 2.4192e-07, + "loss": 2.1367, + "step": 484 + }, + { + "epoch": 0.19, + "eval_loss": 1.9808437824249268, + "eval_runtime": 0.4706, + "eval_samples_per_second": 8.499, + "eval_steps_per_second": 2.125, + "step": 484 + }, + { + "epoch": 0.2, + "learning_rate": 2.4143999999999997e-07, + "loss": 2.1291, + "step": 488 + }, + { + "epoch": 0.2, + "eval_loss": 1.9767786264419556, + "eval_runtime": 0.4803, + "eval_samples_per_second": 8.327, + "eval_steps_per_second": 2.082, + "step": 488 + }, + { + "epoch": 0.2, + "learning_rate": 2.4096e-07, + "loss": 2.1124, + "step": 492 + }, + { + "epoch": 0.2, + "eval_loss": 1.9728602170944214, + "eval_runtime": 0.4802, + "eval_samples_per_second": 8.33, + "eval_steps_per_second": 2.082, + "step": 492 + }, + { + "epoch": 0.2, + "learning_rate": 2.4048e-07, + "loss": 2.0738, + "step": 496 + }, + { + "epoch": 0.2, + "eval_loss": 1.968900203704834, + "eval_runtime": 0.4884, + "eval_samples_per_second": 8.189, + "eval_steps_per_second": 2.047, + "step": 496 + }, + { + "epoch": 0.2, + "learning_rate": 2.4e-07, + "loss": 2.1048, + "step": 500 + }, + { + "epoch": 0.2, + "eval_loss": 1.9646457433700562, + "eval_runtime": 0.5026, + "eval_samples_per_second": 7.959, + "eval_steps_per_second": 1.99, + "step": 500 + }, + { + "epoch": 0.2, + "learning_rate": 2.3951999999999996e-07, + "loss": 2.0995, + "step": 504 + }, + { + "epoch": 0.2, + "eval_loss": 1.9606600999832153, + "eval_runtime": 0.7928, + "eval_samples_per_second": 5.045, + "eval_steps_per_second": 1.261, + "step": 504 + }, + { + "epoch": 0.2, + "learning_rate": 2.3903999999999995e-07, + "loss": 2.0816, + "step": 508 + }, + { + "epoch": 0.2, + "eval_loss": 1.956822395324707, + "eval_runtime": 0.5321, + "eval_samples_per_second": 7.518, + "eval_steps_per_second": 1.879, + "step": 508 + }, + { + "epoch": 0.2, + "learning_rate": 2.3856e-07, + "loss": 2.0969, + "step": 512 + }, + { + "epoch": 0.2, + "eval_loss": 1.9526716470718384, + "eval_runtime": 0.5174, + "eval_samples_per_second": 7.732, + "eval_steps_per_second": 1.933, + "step": 512 + }, + { + "epoch": 0.21, + "learning_rate": 2.3807999999999997e-07, + "loss": 2.1034, + "step": 516 + }, + { + "epoch": 0.21, + "eval_loss": 1.948419451713562, + "eval_runtime": 0.5393, + "eval_samples_per_second": 7.418, + "eval_steps_per_second": 1.854, + "step": 516 + }, + { + "epoch": 0.21, + "learning_rate": 2.376e-07, + "loss": 2.0654, + "step": 520 + }, + { + "epoch": 0.21, + "eval_loss": 1.9442145824432373, + "eval_runtime": 0.5372, + "eval_samples_per_second": 7.446, + "eval_steps_per_second": 1.861, + "step": 520 + }, + { + "epoch": 0.21, + "learning_rate": 2.3712e-07, + "loss": 2.1175, + "step": 524 + }, + { + "epoch": 0.21, + "eval_loss": 1.9403698444366455, + "eval_runtime": 0.5129, + "eval_samples_per_second": 7.798, + "eval_steps_per_second": 1.95, + "step": 524 + }, + { + "epoch": 0.21, + "learning_rate": 2.3663999999999998e-07, + "loss": 2.0829, + "step": 528 + }, + { + "epoch": 0.21, + "eval_loss": 1.936263084411621, + "eval_runtime": 0.7202, + "eval_samples_per_second": 5.554, + "eval_steps_per_second": 1.388, + "step": 528 + }, + { + "epoch": 0.21, + "learning_rate": 2.3616e-07, + "loss": 2.0973, + "step": 532 + }, + { + "epoch": 0.21, + "eval_loss": 1.9322115182876587, + "eval_runtime": 0.6884, + "eval_samples_per_second": 5.81, + "eval_steps_per_second": 1.453, + "step": 532 + }, + { + "epoch": 0.21, + "learning_rate": 2.3567999999999998e-07, + "loss": 2.0439, + "step": 536 + }, + { + "epoch": 0.21, + "eval_loss": 1.927826166152954, + "eval_runtime": 0.7779, + "eval_samples_per_second": 5.142, + "eval_steps_per_second": 1.286, + "step": 536 + }, + { + "epoch": 0.22, + "learning_rate": 2.352e-07, + "loss": 2.0791, + "step": 540 + }, + { + "epoch": 0.22, + "eval_loss": 1.923945426940918, + "eval_runtime": 0.7514, + "eval_samples_per_second": 5.323, + "eval_steps_per_second": 1.331, + "step": 540 + }, + { + "epoch": 0.22, + "learning_rate": 2.3471999999999997e-07, + "loss": 2.0988, + "step": 544 + }, + { + "epoch": 0.22, + "eval_loss": 1.9202955961227417, + "eval_runtime": 0.5194, + "eval_samples_per_second": 7.701, + "eval_steps_per_second": 1.925, + "step": 544 + }, + { + "epoch": 0.22, + "learning_rate": 2.3424e-07, + "loss": 2.0179, + "step": 548 + }, + { + "epoch": 0.22, + "eval_loss": 1.916027307510376, + "eval_runtime": 0.5072, + "eval_samples_per_second": 7.887, + "eval_steps_per_second": 1.972, + "step": 548 + }, + { + "epoch": 0.22, + "learning_rate": 2.3376e-07, + "loss": 2.0452, + "step": 552 + }, + { + "epoch": 0.22, + "eval_loss": 1.911855697631836, + "eval_runtime": 0.5112, + "eval_samples_per_second": 7.825, + "eval_steps_per_second": 1.956, + "step": 552 + }, + { + "epoch": 0.22, + "learning_rate": 2.3327999999999998e-07, + "loss": 1.9792, + "step": 556 + }, + { + "epoch": 0.22, + "eval_loss": 1.907868504524231, + "eval_runtime": 0.5368, + "eval_samples_per_second": 7.452, + "eval_steps_per_second": 1.863, + "step": 556 + }, + { + "epoch": 0.22, + "learning_rate": 2.328e-07, + "loss": 1.9862, + "step": 560 + }, + { + "epoch": 0.22, + "eval_loss": 1.9032366275787354, + "eval_runtime": 0.52, + "eval_samples_per_second": 7.692, + "eval_steps_per_second": 1.923, + "step": 560 + }, + { + "epoch": 0.23, + "learning_rate": 2.3231999999999998e-07, + "loss": 2.0176, + "step": 564 + }, + { + "epoch": 0.23, + "eval_loss": 1.8994207382202148, + "eval_runtime": 0.5141, + "eval_samples_per_second": 7.78, + "eval_steps_per_second": 1.945, + "step": 564 + }, + { + "epoch": 0.23, + "learning_rate": 2.3184e-07, + "loss": 2.0066, + "step": 568 + }, + { + "epoch": 0.23, + "eval_loss": 1.8953509330749512, + "eval_runtime": 0.7027, + "eval_samples_per_second": 5.692, + "eval_steps_per_second": 1.423, + "step": 568 + }, + { + "epoch": 0.23, + "learning_rate": 2.3135999999999998e-07, + "loss": 2.0333, + "step": 572 + }, + { + "epoch": 0.23, + "eval_loss": 1.8914432525634766, + "eval_runtime": 0.7279, + "eval_samples_per_second": 5.495, + "eval_steps_per_second": 1.374, + "step": 572 + }, + { + "epoch": 0.23, + "learning_rate": 2.3088e-07, + "loss": 2.0316, + "step": 576 + }, + { + "epoch": 0.23, + "eval_loss": 1.8870800733566284, + "eval_runtime": 0.7212, + "eval_samples_per_second": 5.546, + "eval_steps_per_second": 1.386, + "step": 576 + }, + { + "epoch": 0.23, + "learning_rate": 2.304e-07, + "loss": 2.0114, + "step": 580 + }, + { + "epoch": 0.23, + "eval_loss": 1.8827916383743286, + "eval_runtime": 0.6774, + "eval_samples_per_second": 5.905, + "eval_steps_per_second": 1.476, + "step": 580 + }, + { + "epoch": 0.23, + "learning_rate": 2.2991999999999998e-07, + "loss": 2.0093, + "step": 584 + }, + { + "epoch": 0.23, + "eval_loss": 1.8788678646087646, + "eval_runtime": 0.5185, + "eval_samples_per_second": 7.715, + "eval_steps_per_second": 1.929, + "step": 584 + }, + { + "epoch": 0.24, + "learning_rate": 2.2944e-07, + "loss": 1.9829, + "step": 588 + }, + { + "epoch": 0.24, + "eval_loss": 1.8749186992645264, + "eval_runtime": 0.5091, + "eval_samples_per_second": 7.857, + "eval_steps_per_second": 1.964, + "step": 588 + }, + { + "epoch": 0.24, + "learning_rate": 2.2895999999999998e-07, + "loss": 1.971, + "step": 592 + }, + { + "epoch": 0.24, + "eval_loss": 1.8706499338150024, + "eval_runtime": 0.5204, + "eval_samples_per_second": 7.687, + "eval_steps_per_second": 1.922, + "step": 592 + }, + { + "epoch": 0.24, + "learning_rate": 2.2848000000000002e-07, + "loss": 2.0188, + "step": 596 + }, + { + "epoch": 0.24, + "eval_loss": 1.8667842149734497, + "eval_runtime": 0.5224, + "eval_samples_per_second": 7.657, + "eval_steps_per_second": 1.914, + "step": 596 + }, + { + "epoch": 0.24, + "learning_rate": 2.28e-07, + "loss": 2.0081, + "step": 600 + }, + { + "epoch": 0.24, + "eval_loss": 1.8627525568008423, + "eval_runtime": 0.5196, + "eval_samples_per_second": 7.699, + "eval_steps_per_second": 1.925, + "step": 600 + }, + { + "epoch": 0.24, + "learning_rate": 2.2752e-07, + "loss": 2.0014, + "step": 604 + }, + { + "epoch": 0.24, + "eval_loss": 1.8587167263031006, + "eval_runtime": 0.7373, + "eval_samples_per_second": 5.425, + "eval_steps_per_second": 1.356, + "step": 604 + }, + { + "epoch": 0.24, + "learning_rate": 2.2704e-07, + "loss": 1.9741, + "step": 608 + }, + { + "epoch": 0.24, + "eval_loss": 1.8543612957000732, + "eval_runtime": 0.7492, + "eval_samples_per_second": 5.339, + "eval_steps_per_second": 1.335, + "step": 608 + }, + { + "epoch": 0.24, + "learning_rate": 2.2655999999999999e-07, + "loss": 1.9828, + "step": 612 + }, + { + "epoch": 0.24, + "eval_loss": 1.8504937887191772, + "eval_runtime": 0.7242, + "eval_samples_per_second": 5.524, + "eval_steps_per_second": 1.381, + "step": 612 + }, + { + "epoch": 0.25, + "learning_rate": 2.2608e-07, + "loss": 1.9481, + "step": 616 + }, + { + "epoch": 0.25, + "eval_loss": 1.8463339805603027, + "eval_runtime": 0.6997, + "eval_samples_per_second": 5.716, + "eval_steps_per_second": 1.429, + "step": 616 + }, + { + "epoch": 0.25, + "learning_rate": 2.2559999999999998e-07, + "loss": 1.9584, + "step": 620 + }, + { + "epoch": 0.25, + "eval_loss": 1.8423882722854614, + "eval_runtime": 0.5137, + "eval_samples_per_second": 7.787, + "eval_steps_per_second": 1.947, + "step": 620 + }, + { + "epoch": 0.25, + "learning_rate": 2.2511999999999997e-07, + "loss": 1.9449, + "step": 624 + }, + { + "epoch": 0.25, + "eval_loss": 1.838066577911377, + "eval_runtime": 0.5091, + "eval_samples_per_second": 7.857, + "eval_steps_per_second": 1.964, + "step": 624 + }, + { + "epoch": 0.25, + "learning_rate": 2.2464e-07, + "loss": 1.9753, + "step": 628 + }, + { + "epoch": 0.25, + "eval_loss": 1.8342829942703247, + "eval_runtime": 0.504, + "eval_samples_per_second": 7.936, + "eval_steps_per_second": 1.984, + "step": 628 + }, + { + "epoch": 0.25, + "learning_rate": 2.2416e-07, + "loss": 2.0055, + "step": 632 + }, + { + "epoch": 0.25, + "eval_loss": 1.8300307989120483, + "eval_runtime": 0.5201, + "eval_samples_per_second": 7.691, + "eval_steps_per_second": 1.923, + "step": 632 + }, + { + "epoch": 0.25, + "learning_rate": 2.2368e-07, + "loss": 1.98, + "step": 636 + }, + { + "epoch": 0.25, + "eval_loss": 1.8260575532913208, + "eval_runtime": 0.5267, + "eval_samples_per_second": 7.594, + "eval_steps_per_second": 1.898, + "step": 636 + }, + { + "epoch": 0.26, + "learning_rate": 2.232e-07, + "loss": 1.9757, + "step": 640 + }, + { + "epoch": 0.26, + "eval_loss": 1.8222540616989136, + "eval_runtime": 0.7574, + "eval_samples_per_second": 5.281, + "eval_steps_per_second": 1.32, + "step": 640 + }, + { + "epoch": 0.26, + "learning_rate": 2.2271999999999997e-07, + "loss": 1.9683, + "step": 644 + }, + { + "epoch": 0.26, + "eval_loss": 1.818216323852539, + "eval_runtime": 0.7304, + "eval_samples_per_second": 5.476, + "eval_steps_per_second": 1.369, + "step": 644 + }, + { + "epoch": 0.26, + "learning_rate": 2.2223999999999998e-07, + "loss": 1.926, + "step": 648 + }, + { + "epoch": 0.26, + "eval_loss": 1.8140522241592407, + "eval_runtime": 0.7453, + "eval_samples_per_second": 5.367, + "eval_steps_per_second": 1.342, + "step": 648 + }, + { + "epoch": 0.26, + "learning_rate": 2.2175999999999997e-07, + "loss": 1.9454, + "step": 652 + }, + { + "epoch": 0.26, + "eval_loss": 1.8100805282592773, + "eval_runtime": 0.6536, + "eval_samples_per_second": 6.12, + "eval_steps_per_second": 1.53, + "step": 652 + }, + { + "epoch": 0.26, + "learning_rate": 2.2128e-07, + "loss": 1.9352, + "step": 656 + }, + { + "epoch": 0.26, + "eval_loss": 1.8059089183807373, + "eval_runtime": 0.5193, + "eval_samples_per_second": 7.702, + "eval_steps_per_second": 1.926, + "step": 656 + }, + { + "epoch": 0.26, + "learning_rate": 2.208e-07, + "loss": 1.8816, + "step": 660 + }, + { + "epoch": 0.26, + "eval_loss": 1.8020563125610352, + "eval_runtime": 0.5265, + "eval_samples_per_second": 7.597, + "eval_steps_per_second": 1.899, + "step": 660 + }, + { + "epoch": 0.27, + "learning_rate": 2.2032e-07, + "loss": 1.9182, + "step": 664 + }, + { + "epoch": 0.27, + "eval_loss": 1.7980492115020752, + "eval_runtime": 0.5102, + "eval_samples_per_second": 7.84, + "eval_steps_per_second": 1.96, + "step": 664 + }, + { + "epoch": 0.27, + "learning_rate": 2.1984e-07, + "loss": 1.9659, + "step": 668 + }, + { + "epoch": 0.27, + "eval_loss": 1.7941217422485352, + "eval_runtime": 0.5988, + "eval_samples_per_second": 6.681, + "eval_steps_per_second": 1.67, + "step": 668 + }, + { + "epoch": 0.27, + "learning_rate": 2.1935999999999997e-07, + "loss": 1.8932, + "step": 672 + }, + { + "epoch": 0.27, + "eval_loss": 1.7901490926742554, + "eval_runtime": 0.5339, + "eval_samples_per_second": 7.492, + "eval_steps_per_second": 1.873, + "step": 672 + }, + { + "epoch": 0.27, + "learning_rate": 2.1887999999999999e-07, + "loss": 1.8608, + "step": 676 + }, + { + "epoch": 0.27, + "eval_loss": 1.786109447479248, + "eval_runtime": 0.7219, + "eval_samples_per_second": 5.541, + "eval_steps_per_second": 1.385, + "step": 676 + }, + { + "epoch": 0.27, + "learning_rate": 2.184e-07, + "loss": 1.941, + "step": 680 + }, + { + "epoch": 0.27, + "eval_loss": 1.7824102640151978, + "eval_runtime": 0.7619, + "eval_samples_per_second": 5.25, + "eval_steps_per_second": 1.313, + "step": 680 + }, + { + "epoch": 0.27, + "learning_rate": 2.1792e-07, + "loss": 1.8854, + "step": 684 + }, + { + "epoch": 0.27, + "eval_loss": 1.77846097946167, + "eval_runtime": 0.7601, + "eval_samples_per_second": 5.262, + "eval_steps_per_second": 1.316, + "step": 684 + }, + { + "epoch": 0.28, + "learning_rate": 2.1744e-07, + "loss": 1.8912, + "step": 688 + }, + { + "epoch": 0.28, + "eval_loss": 1.7742952108383179, + "eval_runtime": 0.59, + "eval_samples_per_second": 6.78, + "eval_steps_per_second": 1.695, + "step": 688 + }, + { + "epoch": 0.28, + "learning_rate": 2.1695999999999998e-07, + "loss": 1.8667, + "step": 692 + }, + { + "epoch": 0.28, + "eval_loss": 1.770714521408081, + "eval_runtime": 0.5262, + "eval_samples_per_second": 7.601, + "eval_steps_per_second": 1.9, + "step": 692 + }, + { + "epoch": 0.28, + "learning_rate": 2.1648e-07, + "loss": 1.912, + "step": 696 + }, + { + "epoch": 0.28, + "eval_loss": 1.7666008472442627, + "eval_runtime": 0.5272, + "eval_samples_per_second": 7.587, + "eval_steps_per_second": 1.897, + "step": 696 + }, + { + "epoch": 0.28, + "learning_rate": 2.1599999999999998e-07, + "loss": 1.9009, + "step": 700 + }, + { + "epoch": 0.28, + "eval_loss": 1.7627824544906616, + "eval_runtime": 0.5295, + "eval_samples_per_second": 7.555, + "eval_steps_per_second": 1.889, + "step": 700 + }, + { + "epoch": 0.28, + "learning_rate": 2.1552000000000001e-07, + "loss": 1.906, + "step": 704 + }, + { + "epoch": 0.28, + "eval_loss": 1.75889253616333, + "eval_runtime": 0.5589, + "eval_samples_per_second": 7.157, + "eval_steps_per_second": 1.789, + "step": 704 + }, + { + "epoch": 0.28, + "learning_rate": 2.1504e-07, + "loss": 1.8671, + "step": 708 + }, + { + "epoch": 0.28, + "eval_loss": 1.7549973726272583, + "eval_runtime": 0.687, + "eval_samples_per_second": 5.822, + "eval_steps_per_second": 1.456, + "step": 708 + }, + { + "epoch": 0.28, + "learning_rate": 2.1455999999999998e-07, + "loss": 1.8609, + "step": 712 + }, + { + "epoch": 0.28, + "eval_loss": 1.7507662773132324, + "eval_runtime": 0.7225, + "eval_samples_per_second": 5.537, + "eval_steps_per_second": 1.384, + "step": 712 + }, + { + "epoch": 0.29, + "learning_rate": 2.1408e-07, + "loss": 1.8485, + "step": 716 + }, + { + "epoch": 0.29, + "eval_loss": 1.746917486190796, + "eval_runtime": 0.7954, + "eval_samples_per_second": 5.029, + "eval_steps_per_second": 1.257, + "step": 716 + }, + { + "epoch": 0.29, + "learning_rate": 2.1359999999999998e-07, + "loss": 1.8334, + "step": 720 + }, + { + "epoch": 0.29, + "eval_loss": 1.7430514097213745, + "eval_runtime": 0.7433, + "eval_samples_per_second": 5.381, + "eval_steps_per_second": 1.345, + "step": 720 + }, + { + "epoch": 0.29, + "learning_rate": 2.1312e-07, + "loss": 1.8763, + "step": 724 + }, + { + "epoch": 0.29, + "eval_loss": 1.7392196655273438, + "eval_runtime": 0.5237, + "eval_samples_per_second": 7.638, + "eval_steps_per_second": 1.91, + "step": 724 + }, + { + "epoch": 0.29, + "learning_rate": 2.1263999999999998e-07, + "loss": 1.9005, + "step": 728 + }, + { + "epoch": 0.29, + "eval_loss": 1.7355214357376099, + "eval_runtime": 0.524, + "eval_samples_per_second": 7.634, + "eval_steps_per_second": 1.908, + "step": 728 + }, + { + "epoch": 0.29, + "learning_rate": 2.1216000000000002e-07, + "loss": 1.8669, + "step": 732 + }, + { + "epoch": 0.29, + "eval_loss": 1.731513261795044, + "eval_runtime": 0.5593, + "eval_samples_per_second": 7.152, + "eval_steps_per_second": 1.788, + "step": 732 + }, + { + "epoch": 0.29, + "learning_rate": 2.1168e-07, + "loss": 1.8984, + "step": 736 + }, + { + "epoch": 0.29, + "eval_loss": 1.727636694908142, + "eval_runtime": 0.5241, + "eval_samples_per_second": 7.632, + "eval_steps_per_second": 1.908, + "step": 736 + }, + { + "epoch": 0.3, + "learning_rate": 2.1119999999999999e-07, + "loss": 1.8074, + "step": 740 + }, + { + "epoch": 0.3, + "eval_loss": 1.7240556478500366, + "eval_runtime": 0.715, + "eval_samples_per_second": 5.594, + "eval_steps_per_second": 1.399, + "step": 740 + }, + { + "epoch": 0.3, + "learning_rate": 2.1072e-07, + "loss": 1.8614, + "step": 744 + }, + { + "epoch": 0.3, + "eval_loss": 1.7201639413833618, + "eval_runtime": 0.7611, + "eval_samples_per_second": 5.256, + "eval_steps_per_second": 1.314, + "step": 744 + }, + { + "epoch": 0.3, + "learning_rate": 2.1023999999999998e-07, + "loss": 1.8211, + "step": 748 + }, + { + "epoch": 0.3, + "eval_loss": 1.7165008783340454, + "eval_runtime": 0.7193, + "eval_samples_per_second": 5.561, + "eval_steps_per_second": 1.39, + "step": 748 + }, + { + "epoch": 0.3, + "learning_rate": 2.0976e-07, + "loss": 1.8553, + "step": 752 + }, + { + "epoch": 0.3, + "eval_loss": 1.7123990058898926, + "eval_runtime": 0.5463, + "eval_samples_per_second": 7.323, + "eval_steps_per_second": 1.831, + "step": 752 + }, + { + "epoch": 0.3, + "learning_rate": 2.0927999999999998e-07, + "loss": 1.7978, + "step": 756 + }, + { + "epoch": 0.3, + "eval_loss": 1.7084720134735107, + "eval_runtime": 0.574, + "eval_samples_per_second": 6.968, + "eval_steps_per_second": 1.742, + "step": 756 + }, + { + "epoch": 0.3, + "learning_rate": 2.0879999999999996e-07, + "loss": 1.8203, + "step": 760 + }, + { + "epoch": 0.3, + "eval_loss": 1.7048146724700928, + "eval_runtime": 0.5838, + "eval_samples_per_second": 6.852, + "eval_steps_per_second": 1.713, + "step": 760 + }, + { + "epoch": 0.31, + "learning_rate": 2.0832e-07, + "loss": 1.8192, + "step": 764 + }, + { + "epoch": 0.31, + "eval_loss": 1.7010469436645508, + "eval_runtime": 0.5225, + "eval_samples_per_second": 7.656, + "eval_steps_per_second": 1.914, + "step": 764 + }, + { + "epoch": 0.31, + "learning_rate": 2.0784e-07, + "loss": 1.8532, + "step": 768 + }, + { + "epoch": 0.31, + "eval_loss": 1.6973625421524048, + "eval_runtime": 0.525, + "eval_samples_per_second": 7.619, + "eval_steps_per_second": 1.905, + "step": 768 + }, + { + "epoch": 0.31, + "learning_rate": 2.0736e-07, + "loss": 1.8307, + "step": 772 + }, + { + "epoch": 0.31, + "eval_loss": 1.6935136318206787, + "eval_runtime": 0.7235, + "eval_samples_per_second": 5.528, + "eval_steps_per_second": 1.382, + "step": 772 + }, + { + "epoch": 0.31, + "learning_rate": 2.0687999999999998e-07, + "loss": 1.8207, + "step": 776 + }, + { + "epoch": 0.31, + "eval_loss": 1.6895670890808105, + "eval_runtime": 0.8289, + "eval_samples_per_second": 4.826, + "eval_steps_per_second": 1.206, + "step": 776 + }, + { + "epoch": 0.31, + "learning_rate": 2.0639999999999997e-07, + "loss": 1.7895, + "step": 780 + }, + { + "epoch": 0.31, + "eval_loss": 1.6858075857162476, + "eval_runtime": 0.7778, + "eval_samples_per_second": 5.143, + "eval_steps_per_second": 1.286, + "step": 780 + }, + { + "epoch": 0.31, + "learning_rate": 2.0592e-07, + "loss": 1.7976, + "step": 784 + }, + { + "epoch": 0.31, + "eval_loss": 1.6820955276489258, + "eval_runtime": 0.5265, + "eval_samples_per_second": 7.597, + "eval_steps_per_second": 1.899, + "step": 784 + }, + { + "epoch": 0.32, + "learning_rate": 2.0544e-07, + "loss": 1.814, + "step": 788 + }, + { + "epoch": 0.32, + "eval_loss": 1.6785138845443726, + "eval_runtime": 0.5179, + "eval_samples_per_second": 7.724, + "eval_steps_per_second": 1.931, + "step": 788 + }, + { + "epoch": 0.32, + "learning_rate": 2.0496e-07, + "loss": 1.7972, + "step": 792 + }, + { + "epoch": 0.32, + "eval_loss": 1.674804449081421, + "eval_runtime": 0.5304, + "eval_samples_per_second": 7.541, + "eval_steps_per_second": 1.885, + "step": 792 + }, + { + "epoch": 0.32, + "learning_rate": 2.0448e-07, + "loss": 1.8258, + "step": 796 + }, + { + "epoch": 0.32, + "eval_loss": 1.6713837385177612, + "eval_runtime": 0.5336, + "eval_samples_per_second": 7.496, + "eval_steps_per_second": 1.874, + "step": 796 + }, + { + "epoch": 0.32, + "learning_rate": 2.04e-07, + "loss": 1.79, + "step": 800 + }, + { + "epoch": 0.32, + "eval_loss": 1.667376160621643, + "eval_runtime": 0.7608, + "eval_samples_per_second": 5.258, + "eval_steps_per_second": 1.314, + "step": 800 + }, + { + "epoch": 0.32, + "learning_rate": 2.0351999999999999e-07, + "loss": 1.802, + "step": 804 + }, + { + "epoch": 0.32, + "eval_loss": 1.6640408039093018, + "eval_runtime": 0.7498, + "eval_samples_per_second": 5.335, + "eval_steps_per_second": 1.334, + "step": 804 + }, + { + "epoch": 0.32, + "learning_rate": 2.0303999999999997e-07, + "loss": 1.7784, + "step": 808 + }, + { + "epoch": 0.32, + "eval_loss": 1.6603385210037231, + "eval_runtime": 0.7501, + "eval_samples_per_second": 5.333, + "eval_steps_per_second": 1.333, + "step": 808 + }, + { + "epoch": 0.32, + "learning_rate": 2.0256e-07, + "loss": 1.7671, + "step": 812 + }, + { + "epoch": 0.32, + "eval_loss": 1.6568516492843628, + "eval_runtime": 0.5206, + "eval_samples_per_second": 7.684, + "eval_steps_per_second": 1.921, + "step": 812 + }, + { + "epoch": 0.33, + "learning_rate": 2.0208e-07, + "loss": 1.7618, + "step": 816 + }, + { + "epoch": 0.33, + "eval_loss": 1.653469443321228, + "eval_runtime": 0.5354, + "eval_samples_per_second": 7.472, + "eval_steps_per_second": 1.868, + "step": 816 + }, + { + "epoch": 0.33, + "learning_rate": 2.016e-07, + "loss": 1.8207, + "step": 820 + }, + { + "epoch": 0.33, + "eval_loss": 1.6502578258514404, + "eval_runtime": 0.523, + "eval_samples_per_second": 7.648, + "eval_steps_per_second": 1.912, + "step": 820 + }, + { + "epoch": 0.33, + "learning_rate": 2.0112e-07, + "loss": 1.7837, + "step": 824 + }, + { + "epoch": 0.33, + "eval_loss": 1.6467454433441162, + "eval_runtime": 0.5297, + "eval_samples_per_second": 7.552, + "eval_steps_per_second": 1.888, + "step": 824 + }, + { + "epoch": 0.33, + "learning_rate": 2.0063999999999998e-07, + "loss": 1.8066, + "step": 828 + }, + { + "epoch": 0.33, + "eval_loss": 1.6439214944839478, + "eval_runtime": 0.522, + "eval_samples_per_second": 7.663, + "eval_steps_per_second": 1.916, + "step": 828 + }, + { + "epoch": 0.33, + "learning_rate": 2.0016e-07, + "loss": 1.7814, + "step": 832 + }, + { + "epoch": 0.33, + "eval_loss": 1.6407381296157837, + "eval_runtime": 0.5382, + "eval_samples_per_second": 7.432, + "eval_steps_per_second": 1.858, + "step": 832 + }, + { + "epoch": 0.33, + "learning_rate": 1.9967999999999997e-07, + "loss": 1.7244, + "step": 836 + }, + { + "epoch": 0.33, + "eval_loss": 1.6372514963150024, + "eval_runtime": 0.7157, + "eval_samples_per_second": 5.589, + "eval_steps_per_second": 1.397, + "step": 836 + }, + { + "epoch": 0.34, + "learning_rate": 1.992e-07, + "loss": 1.7195, + "step": 840 + }, + { + "epoch": 0.34, + "eval_loss": 1.634232997894287, + "eval_runtime": 0.7254, + "eval_samples_per_second": 5.514, + "eval_steps_per_second": 1.379, + "step": 840 + }, + { + "epoch": 0.34, + "learning_rate": 1.9872e-07, + "loss": 1.7524, + "step": 844 + }, + { + "epoch": 0.34, + "eval_loss": 1.6310441493988037, + "eval_runtime": 0.7839, + "eval_samples_per_second": 5.103, + "eval_steps_per_second": 1.276, + "step": 844 + }, + { + "epoch": 0.34, + "learning_rate": 1.9824e-07, + "loss": 1.7644, + "step": 848 + }, + { + "epoch": 0.34, + "eval_loss": 1.6279191970825195, + "eval_runtime": 0.5253, + "eval_samples_per_second": 7.615, + "eval_steps_per_second": 1.904, + "step": 848 + }, + { + "epoch": 0.34, + "learning_rate": 1.9776e-07, + "loss": 1.7171, + "step": 852 + }, + { + "epoch": 0.34, + "eval_loss": 1.6244579553604126, + "eval_runtime": 0.5359, + "eval_samples_per_second": 7.464, + "eval_steps_per_second": 1.866, + "step": 852 + }, + { + "epoch": 0.34, + "learning_rate": 1.9727999999999998e-07, + "loss": 1.7418, + "step": 856 + }, + { + "epoch": 0.34, + "eval_loss": 1.6212078332901, + "eval_runtime": 0.5379, + "eval_samples_per_second": 7.436, + "eval_steps_per_second": 1.859, + "step": 856 + }, + { + "epoch": 0.34, + "learning_rate": 1.968e-07, + "loss": 1.7337, + "step": 860 + }, + { + "epoch": 0.34, + "eval_loss": 1.6180227994918823, + "eval_runtime": 0.5259, + "eval_samples_per_second": 7.606, + "eval_steps_per_second": 1.902, + "step": 860 + }, + { + "epoch": 0.35, + "learning_rate": 1.9631999999999997e-07, + "loss": 1.7441, + "step": 864 + }, + { + "epoch": 0.35, + "eval_loss": 1.61477530002594, + "eval_runtime": 0.5216, + "eval_samples_per_second": 7.669, + "eval_steps_per_second": 1.917, + "step": 864 + }, + { + "epoch": 0.35, + "learning_rate": 1.9584e-07, + "loss": 1.694, + "step": 868 + }, + { + "epoch": 0.35, + "eval_loss": 1.611538052558899, + "eval_runtime": 0.6803, + "eval_samples_per_second": 5.88, + "eval_steps_per_second": 1.47, + "step": 868 + }, + { + "epoch": 0.35, + "learning_rate": 1.9536e-07, + "loss": 1.7601, + "step": 872 + }, + { + "epoch": 0.35, + "eval_loss": 1.6083098649978638, + "eval_runtime": 0.716, + "eval_samples_per_second": 5.586, + "eval_steps_per_second": 1.397, + "step": 872 + }, + { + "epoch": 0.35, + "learning_rate": 1.9487999999999998e-07, + "loss": 1.7081, + "step": 876 + }, + { + "epoch": 0.35, + "eval_loss": 1.6050214767456055, + "eval_runtime": 0.7622, + "eval_samples_per_second": 5.248, + "eval_steps_per_second": 1.312, + "step": 876 + }, + { + "epoch": 0.35, + "learning_rate": 1.944e-07, + "loss": 1.7101, + "step": 880 + }, + { + "epoch": 0.35, + "eval_loss": 1.6019953489303589, + "eval_runtime": 0.7766, + "eval_samples_per_second": 5.151, + "eval_steps_per_second": 1.288, + "step": 880 + }, + { + "epoch": 0.35, + "learning_rate": 1.9391999999999998e-07, + "loss": 1.7271, + "step": 884 + }, + { + "epoch": 0.35, + "eval_loss": 1.5990221500396729, + "eval_runtime": 0.5153, + "eval_samples_per_second": 7.763, + "eval_steps_per_second": 1.941, + "step": 884 + }, + { + "epoch": 0.36, + "learning_rate": 1.9344e-07, + "loss": 1.7402, + "step": 888 + }, + { + "epoch": 0.36, + "eval_loss": 1.5954092741012573, + "eval_runtime": 0.5168, + "eval_samples_per_second": 7.74, + "eval_steps_per_second": 1.935, + "step": 888 + }, + { + "epoch": 0.36, + "learning_rate": 1.9296e-07, + "loss": 1.7125, + "step": 892 + }, + { + "epoch": 0.36, + "eval_loss": 1.5921534299850464, + "eval_runtime": 0.5424, + "eval_samples_per_second": 7.375, + "eval_steps_per_second": 1.844, + "step": 892 + }, + { + "epoch": 0.36, + "learning_rate": 1.9248e-07, + "loss": 1.6949, + "step": 896 + }, + { + "epoch": 0.36, + "eval_loss": 1.5888370275497437, + "eval_runtime": 0.5307, + "eval_samples_per_second": 7.537, + "eval_steps_per_second": 1.884, + "step": 896 + }, + { + "epoch": 0.36, + "learning_rate": 1.92e-07, + "loss": 1.7145, + "step": 900 + }, + { + "epoch": 0.36, + "eval_loss": 1.5858186483383179, + "eval_runtime": 0.511, + "eval_samples_per_second": 7.828, + "eval_steps_per_second": 1.957, + "step": 900 + }, + { + "epoch": 0.36, + "learning_rate": 1.9151999999999998e-07, + "loss": 1.6665, + "step": 904 + }, + { + "epoch": 0.36, + "eval_loss": 1.5824443101882935, + "eval_runtime": 0.6907, + "eval_samples_per_second": 5.791, + "eval_steps_per_second": 1.448, + "step": 904 + }, + { + "epoch": 0.36, + "learning_rate": 1.9104e-07, + "loss": 1.6929, + "step": 908 + }, + { + "epoch": 0.36, + "eval_loss": 1.5796196460723877, + "eval_runtime": 0.7487, + "eval_samples_per_second": 5.342, + "eval_steps_per_second": 1.336, + "step": 908 + }, + { + "epoch": 0.36, + "learning_rate": 1.9055999999999998e-07, + "loss": 1.7068, + "step": 912 + }, + { + "epoch": 0.36, + "eval_loss": 1.5765777826309204, + "eval_runtime": 0.7477, + "eval_samples_per_second": 5.35, + "eval_steps_per_second": 1.337, + "step": 912 + }, + { + "epoch": 0.37, + "learning_rate": 1.9008000000000002e-07, + "loss": 1.6877, + "step": 916 + }, + { + "epoch": 0.37, + "eval_loss": 1.57340669631958, + "eval_runtime": 0.753, + "eval_samples_per_second": 5.312, + "eval_steps_per_second": 1.328, + "step": 916 + }, + { + "epoch": 0.37, + "learning_rate": 1.896e-07, + "loss": 1.6718, + "step": 920 + }, + { + "epoch": 0.37, + "eval_loss": 1.5706267356872559, + "eval_runtime": 0.514, + "eval_samples_per_second": 7.782, + "eval_steps_per_second": 1.945, + "step": 920 + }, + { + "epoch": 0.37, + "learning_rate": 1.8912e-07, + "loss": 1.6886, + "step": 924 + }, + { + "epoch": 0.37, + "eval_loss": 1.5676339864730835, + "eval_runtime": 0.5222, + "eval_samples_per_second": 7.66, + "eval_steps_per_second": 1.915, + "step": 924 + }, + { + "epoch": 0.37, + "learning_rate": 1.8864e-07, + "loss": 1.7459, + "step": 928 + }, + { + "epoch": 0.37, + "eval_loss": 1.5645827054977417, + "eval_runtime": 0.5299, + "eval_samples_per_second": 7.548, + "eval_steps_per_second": 1.887, + "step": 928 + }, + { + "epoch": 0.37, + "learning_rate": 1.8815999999999999e-07, + "loss": 1.6596, + "step": 932 + }, + { + "epoch": 0.37, + "eval_loss": 1.5616861581802368, + "eval_runtime": 0.5303, + "eval_samples_per_second": 7.543, + "eval_steps_per_second": 1.886, + "step": 932 + }, + { + "epoch": 0.37, + "learning_rate": 1.8768e-07, + "loss": 1.6689, + "step": 936 + }, + { + "epoch": 0.37, + "eval_loss": 1.5588451623916626, + "eval_runtime": 0.5236, + "eval_samples_per_second": 7.639, + "eval_steps_per_second": 1.91, + "step": 936 + }, + { + "epoch": 0.38, + "learning_rate": 1.8719999999999998e-07, + "loss": 1.6744, + "step": 940 + }, + { + "epoch": 0.38, + "eval_loss": 1.5560673475265503, + "eval_runtime": 0.7233, + "eval_samples_per_second": 5.53, + "eval_steps_per_second": 1.383, + "step": 940 + }, + { + "epoch": 0.38, + "learning_rate": 1.8671999999999997e-07, + "loss": 1.7009, + "step": 944 + }, + { + "epoch": 0.38, + "eval_loss": 1.5533243417739868, + "eval_runtime": 0.6983, + "eval_samples_per_second": 5.728, + "eval_steps_per_second": 1.432, + "step": 944 + }, + { + "epoch": 0.38, + "learning_rate": 1.8624e-07, + "loss": 1.6651, + "step": 948 + }, + { + "epoch": 0.38, + "eval_loss": 1.55048668384552, + "eval_runtime": 0.7511, + "eval_samples_per_second": 5.325, + "eval_steps_per_second": 1.331, + "step": 948 + }, + { + "epoch": 0.38, + "learning_rate": 1.8576e-07, + "loss": 1.6821, + "step": 952 + }, + { + "epoch": 0.38, + "eval_loss": 1.547943353652954, + "eval_runtime": 0.532, + "eval_samples_per_second": 7.519, + "eval_steps_per_second": 1.88, + "step": 952 + }, + { + "epoch": 0.38, + "learning_rate": 1.8528e-07, + "loss": 1.6453, + "step": 956 + }, + { + "epoch": 0.38, + "eval_loss": 1.5453405380249023, + "eval_runtime": 0.5463, + "eval_samples_per_second": 7.322, + "eval_steps_per_second": 1.831, + "step": 956 + }, + { + "epoch": 0.38, + "learning_rate": 1.848e-07, + "loss": 1.6624, + "step": 960 + }, + { + "epoch": 0.38, + "eval_loss": 1.542648196220398, + "eval_runtime": 0.5288, + "eval_samples_per_second": 7.564, + "eval_steps_per_second": 1.891, + "step": 960 + }, + { + "epoch": 0.39, + "learning_rate": 1.8431999999999997e-07, + "loss": 1.6453, + "step": 964 + }, + { + "epoch": 0.39, + "eval_loss": 1.5402462482452393, + "eval_runtime": 0.5242, + "eval_samples_per_second": 7.63, + "eval_steps_per_second": 1.908, + "step": 964 + }, + { + "epoch": 0.39, + "learning_rate": 1.8383999999999998e-07, + "loss": 1.6451, + "step": 968 + }, + { + "epoch": 0.39, + "eval_loss": 1.5377165079116821, + "eval_runtime": 0.5169, + "eval_samples_per_second": 7.738, + "eval_steps_per_second": 1.935, + "step": 968 + }, + { + "epoch": 0.39, + "learning_rate": 1.8335999999999997e-07, + "loss": 1.6627, + "step": 972 + }, + { + "epoch": 0.39, + "eval_loss": 1.5353412628173828, + "eval_runtime": 0.6797, + "eval_samples_per_second": 5.885, + "eval_steps_per_second": 1.471, + "step": 972 + }, + { + "epoch": 0.39, + "learning_rate": 1.8288e-07, + "loss": 1.6423, + "step": 976 + }, + { + "epoch": 0.39, + "eval_loss": 1.5325669050216675, + "eval_runtime": 0.7175, + "eval_samples_per_second": 5.575, + "eval_steps_per_second": 1.394, + "step": 976 + }, + { + "epoch": 0.39, + "learning_rate": 1.824e-07, + "loss": 1.652, + "step": 980 + }, + { + "epoch": 0.39, + "eval_loss": 1.530207872390747, + "eval_runtime": 0.8099, + "eval_samples_per_second": 4.939, + "eval_steps_per_second": 1.235, + "step": 980 + }, + { + "epoch": 0.39, + "learning_rate": 1.8192e-07, + "loss": 1.6414, + "step": 984 + }, + { + "epoch": 0.39, + "eval_loss": 1.5278236865997314, + "eval_runtime": 0.7814, + "eval_samples_per_second": 5.119, + "eval_steps_per_second": 1.28, + "step": 984 + }, + { + "epoch": 0.4, + "learning_rate": 1.8144e-07, + "loss": 1.6107, + "step": 988 + }, + { + "epoch": 0.4, + "eval_loss": 1.5253430604934692, + "eval_runtime": 0.5386, + "eval_samples_per_second": 7.427, + "eval_steps_per_second": 1.857, + "step": 988 + }, + { + "epoch": 0.4, + "learning_rate": 1.8095999999999997e-07, + "loss": 1.6599, + "step": 992 + }, + { + "epoch": 0.4, + "eval_loss": 1.5225120782852173, + "eval_runtime": 0.5302, + "eval_samples_per_second": 7.544, + "eval_steps_per_second": 1.886, + "step": 992 + }, + { + "epoch": 0.4, + "learning_rate": 1.8048e-07, + "loss": 1.6326, + "step": 996 + }, + { + "epoch": 0.4, + "eval_loss": 1.5201939344406128, + "eval_runtime": 0.533, + "eval_samples_per_second": 7.505, + "eval_steps_per_second": 1.876, + "step": 996 + }, + { + "epoch": 0.4, + "learning_rate": 1.8e-07, + "loss": 1.6324, + "step": 1000 + }, + { + "epoch": 0.4, + "eval_loss": 1.5175316333770752, + "eval_runtime": 0.5316, + "eval_samples_per_second": 7.525, + "eval_steps_per_second": 1.881, + "step": 1000 + }, + { + "epoch": 0.4, + "learning_rate": 1.7952e-07, + "loss": 1.5907, + "step": 1004 + }, + { + "epoch": 0.4, + "eval_loss": 1.5149424076080322, + "eval_runtime": 0.7298, + "eval_samples_per_second": 5.481, + "eval_steps_per_second": 1.37, + "step": 1004 + }, + { + "epoch": 0.4, + "learning_rate": 1.7904e-07, + "loss": 1.6465, + "step": 1008 + }, + { + "epoch": 0.4, + "eval_loss": 1.5124318599700928, + "eval_runtime": 0.7308, + "eval_samples_per_second": 5.473, + "eval_steps_per_second": 1.368, + "step": 1008 + }, + { + "epoch": 0.4, + "learning_rate": 1.7855999999999998e-07, + "loss": 1.6148, + "step": 1012 + }, + { + "epoch": 0.4, + "eval_loss": 1.510151743888855, + "eval_runtime": 0.7345, + "eval_samples_per_second": 5.446, + "eval_steps_per_second": 1.361, + "step": 1012 + }, + { + "epoch": 0.41, + "learning_rate": 1.7808e-07, + "loss": 1.6064, + "step": 1016 + }, + { + "epoch": 0.41, + "eval_loss": 1.5073630809783936, + "eval_runtime": 0.5414, + "eval_samples_per_second": 7.388, + "eval_steps_per_second": 1.847, + "step": 1016 + }, + { + "epoch": 0.41, + "learning_rate": 1.7759999999999998e-07, + "loss": 1.6342, + "step": 1020 + }, + { + "epoch": 0.41, + "eval_loss": 1.5052520036697388, + "eval_runtime": 0.516, + "eval_samples_per_second": 7.751, + "eval_steps_per_second": 1.938, + "step": 1020 + }, + { + "epoch": 0.41, + "learning_rate": 1.7712000000000001e-07, + "loss": 1.605, + "step": 1024 + }, + { + "epoch": 0.41, + "eval_loss": 1.5025243759155273, + "eval_runtime": 0.5373, + "eval_samples_per_second": 7.445, + "eval_steps_per_second": 1.861, + "step": 1024 + }, + { + "epoch": 0.41, + "learning_rate": 1.7664e-07, + "loss": 1.6121, + "step": 1028 + }, + { + "epoch": 0.41, + "eval_loss": 1.500252604484558, + "eval_runtime": 0.5476, + "eval_samples_per_second": 7.304, + "eval_steps_per_second": 1.826, + "step": 1028 + }, + { + "epoch": 0.41, + "learning_rate": 1.7616e-07, + "loss": 1.617, + "step": 1032 + }, + { + "epoch": 0.41, + "eval_loss": 1.4977892637252808, + "eval_runtime": 0.5255, + "eval_samples_per_second": 7.612, + "eval_steps_per_second": 1.903, + "step": 1032 + }, + { + "epoch": 0.41, + "learning_rate": 1.7568e-07, + "loss": 1.5897, + "step": 1036 + }, + { + "epoch": 0.41, + "eval_loss": 1.4954513311386108, + "eval_runtime": 0.7255, + "eval_samples_per_second": 5.513, + "eval_steps_per_second": 1.378, + "step": 1036 + }, + { + "epoch": 0.42, + "learning_rate": 1.7519999999999998e-07, + "loss": 1.6022, + "step": 1040 + }, + { + "epoch": 0.42, + "eval_loss": 1.4929691553115845, + "eval_runtime": 0.6954, + "eval_samples_per_second": 5.752, + "eval_steps_per_second": 1.438, + "step": 1040 + }, + { + "epoch": 0.42, + "learning_rate": 1.7472e-07, + "loss": 1.5748, + "step": 1044 + }, + { + "epoch": 0.42, + "eval_loss": 1.4902769327163696, + "eval_runtime": 0.8026, + "eval_samples_per_second": 4.984, + "eval_steps_per_second": 1.246, + "step": 1044 + }, + { + "epoch": 0.42, + "learning_rate": 1.7423999999999998e-07, + "loss": 1.5974, + "step": 1048 + }, + { + "epoch": 0.42, + "eval_loss": 1.4878779649734497, + "eval_runtime": 0.7804, + "eval_samples_per_second": 5.125, + "eval_steps_per_second": 1.281, + "step": 1048 + }, + { + "epoch": 0.42, + "learning_rate": 1.7376000000000002e-07, + "loss": 1.6126, + "step": 1052 + }, + { + "epoch": 0.42, + "eval_loss": 1.48554527759552, + "eval_runtime": 0.5423, + "eval_samples_per_second": 7.376, + "eval_steps_per_second": 1.844, + "step": 1052 + }, + { + "epoch": 0.42, + "learning_rate": 1.7328e-07, + "loss": 1.6189, + "step": 1056 + }, + { + "epoch": 0.42, + "eval_loss": 1.4827589988708496, + "eval_runtime": 0.5326, + "eval_samples_per_second": 7.511, + "eval_steps_per_second": 1.878, + "step": 1056 + }, + { + "epoch": 0.42, + "learning_rate": 1.7279999999999999e-07, + "loss": 1.5916, + "step": 1060 + }, + { + "epoch": 0.42, + "eval_loss": 1.4803836345672607, + "eval_runtime": 0.5273, + "eval_samples_per_second": 7.585, + "eval_steps_per_second": 1.896, + "step": 1060 + }, + { + "epoch": 0.43, + "learning_rate": 1.7232e-07, + "loss": 1.5938, + "step": 1064 + }, + { + "epoch": 0.43, + "eval_loss": 1.4778516292572021, + "eval_runtime": 0.5436, + "eval_samples_per_second": 7.358, + "eval_steps_per_second": 1.839, + "step": 1064 + }, + { + "epoch": 0.43, + "learning_rate": 1.7183999999999998e-07, + "loss": 1.6026, + "step": 1068 + }, + { + "epoch": 0.43, + "eval_loss": 1.475649118423462, + "eval_runtime": 0.5298, + "eval_samples_per_second": 7.549, + "eval_steps_per_second": 1.887, + "step": 1068 + }, + { + "epoch": 0.43, + "learning_rate": 1.7136e-07, + "loss": 1.5687, + "step": 1072 + }, + { + "epoch": 0.43, + "eval_loss": 1.473489761352539, + "eval_runtime": 0.7191, + "eval_samples_per_second": 5.562, + "eval_steps_per_second": 1.391, + "step": 1072 + }, + { + "epoch": 0.43, + "learning_rate": 1.7087999999999998e-07, + "loss": 1.5413, + "step": 1076 + }, + { + "epoch": 0.43, + "eval_loss": 1.4712145328521729, + "eval_runtime": 0.7022, + "eval_samples_per_second": 5.696, + "eval_steps_per_second": 1.424, + "step": 1076 + }, + { + "epoch": 0.43, + "learning_rate": 1.7039999999999996e-07, + "loss": 1.5778, + "step": 1080 + }, + { + "epoch": 0.43, + "eval_loss": 1.4688694477081299, + "eval_runtime": 0.7707, + "eval_samples_per_second": 5.19, + "eval_steps_per_second": 1.298, + "step": 1080 + }, + { + "epoch": 0.43, + "learning_rate": 1.6992e-07, + "loss": 1.5731, + "step": 1084 + }, + { + "epoch": 0.43, + "eval_loss": 1.4664225578308105, + "eval_runtime": 0.5386, + "eval_samples_per_second": 7.427, + "eval_steps_per_second": 1.857, + "step": 1084 + }, + { + "epoch": 0.44, + "learning_rate": 1.6944e-07, + "loss": 1.5625, + "step": 1088 + }, + { + "epoch": 0.44, + "eval_loss": 1.464247465133667, + "eval_runtime": 0.5484, + "eval_samples_per_second": 7.294, + "eval_steps_per_second": 1.823, + "step": 1088 + }, + { + "epoch": 0.44, + "learning_rate": 1.6896e-07, + "loss": 1.55, + "step": 1092 + }, + { + "epoch": 0.44, + "eval_loss": 1.4620987176895142, + "eval_runtime": 0.5342, + "eval_samples_per_second": 7.488, + "eval_steps_per_second": 1.872, + "step": 1092 + }, + { + "epoch": 0.44, + "learning_rate": 1.6847999999999998e-07, + "loss": 1.5852, + "step": 1096 + }, + { + "epoch": 0.44, + "eval_loss": 1.459930419921875, + "eval_runtime": 0.5332, + "eval_samples_per_second": 7.501, + "eval_steps_per_second": 1.875, + "step": 1096 + }, + { + "epoch": 0.44, + "learning_rate": 1.68e-07, + "loss": 1.5614, + "step": 1100 + }, + { + "epoch": 0.44, + "eval_loss": 1.4578797817230225, + "eval_runtime": 0.5504, + "eval_samples_per_second": 7.268, + "eval_steps_per_second": 1.817, + "step": 1100 + }, + { + "epoch": 0.44, + "learning_rate": 1.6752e-07, + "loss": 1.5619, + "step": 1104 + }, + { + "epoch": 0.44, + "eval_loss": 1.4559952020645142, + "eval_runtime": 0.7448, + "eval_samples_per_second": 5.37, + "eval_steps_per_second": 1.343, + "step": 1104 + }, + { + "epoch": 0.44, + "learning_rate": 1.6704e-07, + "loss": 1.5658, + "step": 1108 + }, + { + "epoch": 0.44, + "eval_loss": 1.454249382019043, + "eval_runtime": 0.773, + "eval_samples_per_second": 5.174, + "eval_steps_per_second": 1.294, + "step": 1108 + }, + { + "epoch": 0.44, + "learning_rate": 1.6656e-07, + "loss": 1.5699, + "step": 1112 + }, + { + "epoch": 0.44, + "eval_loss": 1.4521348476409912, + "eval_runtime": 0.8287, + "eval_samples_per_second": 4.827, + "eval_steps_per_second": 1.207, + "step": 1112 + }, + { + "epoch": 0.45, + "learning_rate": 1.6608e-07, + "loss": 1.5738, + "step": 1116 + }, + { + "epoch": 0.45, + "eval_loss": 1.450175404548645, + "eval_runtime": 0.5398, + "eval_samples_per_second": 7.41, + "eval_steps_per_second": 1.852, + "step": 1116 + }, + { + "epoch": 0.45, + "learning_rate": 1.656e-07, + "loss": 1.5823, + "step": 1120 + }, + { + "epoch": 0.45, + "eval_loss": 1.4481428861618042, + "eval_runtime": 0.5592, + "eval_samples_per_second": 7.153, + "eval_steps_per_second": 1.788, + "step": 1120 + }, + { + "epoch": 0.45, + "learning_rate": 1.6511999999999999e-07, + "loss": 1.5425, + "step": 1124 + }, + { + "epoch": 0.45, + "eval_loss": 1.4458932876586914, + "eval_runtime": 0.5511, + "eval_samples_per_second": 7.259, + "eval_steps_per_second": 1.815, + "step": 1124 + }, + { + "epoch": 0.45, + "learning_rate": 1.6463999999999997e-07, + "loss": 1.5604, + "step": 1128 + }, + { + "epoch": 0.45, + "eval_loss": 1.4438304901123047, + "eval_runtime": 0.5355, + "eval_samples_per_second": 7.47, + "eval_steps_per_second": 1.867, + "step": 1128 + }, + { + "epoch": 0.45, + "learning_rate": 1.6416e-07, + "loss": 1.5562, + "step": 1132 + }, + { + "epoch": 0.45, + "eval_loss": 1.442002773284912, + "eval_runtime": 0.5332, + "eval_samples_per_second": 7.502, + "eval_steps_per_second": 1.876, + "step": 1132 + }, + { + "epoch": 0.45, + "learning_rate": 1.6368e-07, + "loss": 1.555, + "step": 1136 + }, + { + "epoch": 0.45, + "eval_loss": 1.4399393796920776, + "eval_runtime": 0.7524, + "eval_samples_per_second": 5.316, + "eval_steps_per_second": 1.329, + "step": 1136 + }, + { + "epoch": 0.46, + "learning_rate": 1.632e-07, + "loss": 1.5158, + "step": 1140 + }, + { + "epoch": 0.46, + "eval_loss": 1.437983512878418, + "eval_runtime": 0.7269, + "eval_samples_per_second": 5.503, + "eval_steps_per_second": 1.376, + "step": 1140 + }, + { + "epoch": 0.46, + "learning_rate": 1.6272e-07, + "loss": 1.5272, + "step": 1144 + }, + { + "epoch": 0.46, + "eval_loss": 1.435863733291626, + "eval_runtime": 0.7356, + "eval_samples_per_second": 5.438, + "eval_steps_per_second": 1.359, + "step": 1144 + }, + { + "epoch": 0.46, + "learning_rate": 1.6223999999999998e-07, + "loss": 1.5467, + "step": 1148 + }, + { + "epoch": 0.46, + "eval_loss": 1.4338979721069336, + "eval_runtime": 0.5695, + "eval_samples_per_second": 7.023, + "eval_steps_per_second": 1.756, + "step": 1148 + }, + { + "epoch": 0.46, + "learning_rate": 1.6176e-07, + "loss": 1.5399, + "step": 1152 + }, + { + "epoch": 0.46, + "eval_loss": 1.4317151308059692, + "eval_runtime": 0.5215, + "eval_samples_per_second": 7.669, + "eval_steps_per_second": 1.917, + "step": 1152 + }, + { + "epoch": 0.46, + "learning_rate": 1.6127999999999997e-07, + "loss": 1.5221, + "step": 1156 + }, + { + "epoch": 0.46, + "eval_loss": 1.4296718835830688, + "eval_runtime": 0.5471, + "eval_samples_per_second": 7.311, + "eval_steps_per_second": 1.828, + "step": 1156 + }, + { + "epoch": 0.46, + "learning_rate": 1.608e-07, + "loss": 1.5022, + "step": 1160 + }, + { + "epoch": 0.46, + "eval_loss": 1.4277141094207764, + "eval_runtime": 0.5395, + "eval_samples_per_second": 7.414, + "eval_steps_per_second": 1.853, + "step": 1160 + }, + { + "epoch": 0.47, + "learning_rate": 1.6032e-07, + "loss": 1.5385, + "step": 1164 + }, + { + "epoch": 0.47, + "eval_loss": 1.4257354736328125, + "eval_runtime": 0.5342, + "eval_samples_per_second": 7.487, + "eval_steps_per_second": 1.872, + "step": 1164 + }, + { + "epoch": 0.47, + "learning_rate": 1.5984e-07, + "loss": 1.5042, + "step": 1168 + }, + { + "epoch": 0.47, + "eval_loss": 1.4236301183700562, + "eval_runtime": 0.6434, + "eval_samples_per_second": 6.217, + "eval_steps_per_second": 1.554, + "step": 1168 + }, + { + "epoch": 0.47, + "learning_rate": 1.5936e-07, + "loss": 1.5007, + "step": 1172 + }, + { + "epoch": 0.47, + "eval_loss": 1.421656608581543, + "eval_runtime": 0.7224, + "eval_samples_per_second": 5.537, + "eval_steps_per_second": 1.384, + "step": 1172 + }, + { + "epoch": 0.47, + "learning_rate": 1.5887999999999998e-07, + "loss": 1.5323, + "step": 1176 + }, + { + "epoch": 0.47, + "eval_loss": 1.4196075201034546, + "eval_runtime": 0.7946, + "eval_samples_per_second": 5.034, + "eval_steps_per_second": 1.259, + "step": 1176 + }, + { + "epoch": 0.47, + "learning_rate": 1.584e-07, + "loss": 1.5269, + "step": 1180 + }, + { + "epoch": 0.47, + "eval_loss": 1.4174154996871948, + "eval_runtime": 0.82, + "eval_samples_per_second": 4.878, + "eval_steps_per_second": 1.22, + "step": 1180 + }, + { + "epoch": 0.47, + "learning_rate": 1.5791999999999997e-07, + "loss": 1.5379, + "step": 1184 + }, + { + "epoch": 0.47, + "eval_loss": 1.4156051874160767, + "eval_runtime": 0.5319, + "eval_samples_per_second": 7.52, + "eval_steps_per_second": 1.88, + "step": 1184 + }, + { + "epoch": 0.48, + "learning_rate": 1.5744e-07, + "loss": 1.522, + "step": 1188 + }, + { + "epoch": 0.48, + "eval_loss": 1.4136687517166138, + "eval_runtime": 0.5286, + "eval_samples_per_second": 7.567, + "eval_steps_per_second": 1.892, + "step": 1188 + }, + { + "epoch": 0.48, + "learning_rate": 1.5696e-07, + "loss": 1.506, + "step": 1192 + }, + { + "epoch": 0.48, + "eval_loss": 1.4115678071975708, + "eval_runtime": 0.553, + "eval_samples_per_second": 7.233, + "eval_steps_per_second": 1.808, + "step": 1192 + }, + { + "epoch": 0.48, + "learning_rate": 1.5647999999999998e-07, + "loss": 1.4986, + "step": 1196 + }, + { + "epoch": 0.48, + "eval_loss": 1.409631371498108, + "eval_runtime": 0.5273, + "eval_samples_per_second": 7.585, + "eval_steps_per_second": 1.896, + "step": 1196 + }, + { + "epoch": 0.48, + "learning_rate": 1.56e-07, + "loss": 1.4918, + "step": 1200 + }, + { + "epoch": 0.48, + "eval_loss": 1.407455563545227, + "eval_runtime": 0.5269, + "eval_samples_per_second": 7.592, + "eval_steps_per_second": 1.898, + "step": 1200 + }, + { + "epoch": 0.48, + "learning_rate": 1.5551999999999998e-07, + "loss": 1.5124, + "step": 1204 + }, + { + "epoch": 0.48, + "eval_loss": 1.4056380987167358, + "eval_runtime": 0.7536, + "eval_samples_per_second": 5.308, + "eval_steps_per_second": 1.327, + "step": 1204 + }, + { + "epoch": 0.48, + "learning_rate": 1.5504000000000002e-07, + "loss": 1.4926, + "step": 1208 + }, + { + "epoch": 0.48, + "eval_loss": 1.403800368309021, + "eval_runtime": 0.7248, + "eval_samples_per_second": 5.519, + "eval_steps_per_second": 1.38, + "step": 1208 + }, + { + "epoch": 0.48, + "learning_rate": 1.5456e-07, + "loss": 1.5053, + "step": 1212 + }, + { + "epoch": 0.48, + "eval_loss": 1.40152907371521, + "eval_runtime": 0.7447, + "eval_samples_per_second": 5.371, + "eval_steps_per_second": 1.343, + "step": 1212 + }, + { + "epoch": 0.49, + "learning_rate": 1.5408e-07, + "loss": 1.5043, + "step": 1216 + }, + { + "epoch": 0.49, + "eval_loss": 1.3996310234069824, + "eval_runtime": 0.738, + "eval_samples_per_second": 5.42, + "eval_steps_per_second": 1.355, + "step": 1216 + }, + { + "epoch": 0.49, + "learning_rate": 1.536e-07, + "loss": 1.5068, + "step": 1220 + }, + { + "epoch": 0.49, + "eval_loss": 1.3975541591644287, + "eval_runtime": 0.5275, + "eval_samples_per_second": 7.583, + "eval_steps_per_second": 1.896, + "step": 1220 + }, + { + "epoch": 0.49, + "learning_rate": 1.5311999999999998e-07, + "loss": 1.5039, + "step": 1224 + }, + { + "epoch": 0.49, + "eval_loss": 1.3954721689224243, + "eval_runtime": 0.5317, + "eval_samples_per_second": 7.523, + "eval_steps_per_second": 1.881, + "step": 1224 + }, + { + "epoch": 0.49, + "learning_rate": 1.5264e-07, + "loss": 1.4772, + "step": 1228 + }, + { + "epoch": 0.49, + "eval_loss": 1.3933203220367432, + "eval_runtime": 0.5283, + "eval_samples_per_second": 7.571, + "eval_steps_per_second": 1.893, + "step": 1228 + }, + { + "epoch": 0.49, + "learning_rate": 1.5215999999999998e-07, + "loss": 1.4873, + "step": 1232 + }, + { + "epoch": 0.49, + "eval_loss": 1.3916254043579102, + "eval_runtime": 0.5344, + "eval_samples_per_second": 7.485, + "eval_steps_per_second": 1.871, + "step": 1232 + }, + { + "epoch": 0.49, + "learning_rate": 1.5168000000000002e-07, + "loss": 1.4977, + "step": 1236 + }, + { + "epoch": 0.49, + "eval_loss": 1.3896205425262451, + "eval_runtime": 0.5249, + "eval_samples_per_second": 7.62, + "eval_steps_per_second": 1.905, + "step": 1236 + }, + { + "epoch": 0.5, + "learning_rate": 1.512e-07, + "loss": 1.5016, + "step": 1240 + }, + { + "epoch": 0.5, + "eval_loss": 1.3873213529586792, + "eval_runtime": 0.7136, + "eval_samples_per_second": 5.605, + "eval_steps_per_second": 1.401, + "step": 1240 + }, + { + "epoch": 0.5, + "learning_rate": 1.5072e-07, + "loss": 1.495, + "step": 1244 + }, + { + "epoch": 0.5, + "eval_loss": 1.3854175806045532, + "eval_runtime": 0.7372, + "eval_samples_per_second": 5.426, + "eval_steps_per_second": 1.357, + "step": 1244 + }, + { + "epoch": 0.5, + "learning_rate": 1.5024e-07, + "loss": 1.4803, + "step": 1248 + }, + { + "epoch": 0.5, + "eval_loss": 1.3834645748138428, + "eval_runtime": 0.7836, + "eval_samples_per_second": 5.104, + "eval_steps_per_second": 1.276, + "step": 1248 + }, + { + "epoch": 0.5, + "learning_rate": 1.4975999999999999e-07, + "loss": 1.4842, + "step": 1252 + }, + { + "epoch": 0.5, + "eval_loss": 1.381633996963501, + "eval_runtime": 0.5401, + "eval_samples_per_second": 7.405, + "eval_steps_per_second": 1.851, + "step": 1252 + }, + { + "epoch": 0.5, + "learning_rate": 1.4928e-07, + "loss": 1.4762, + "step": 1256 + }, + { + "epoch": 0.5, + "eval_loss": 1.379853367805481, + "eval_runtime": 0.5233, + "eval_samples_per_second": 7.644, + "eval_steps_per_second": 1.911, + "step": 1256 + }, + { + "epoch": 0.5, + "learning_rate": 1.4879999999999998e-07, + "loss": 1.4859, + "step": 1260 + }, + { + "epoch": 0.5, + "eval_loss": 1.3780815601348877, + "eval_runtime": 0.5276, + "eval_samples_per_second": 7.582, + "eval_steps_per_second": 1.895, + "step": 1260 + }, + { + "epoch": 0.51, + "learning_rate": 1.4832e-07, + "loss": 1.4948, + "step": 1264 + }, + { + "epoch": 0.51, + "eval_loss": 1.3763624429702759, + "eval_runtime": 0.5355, + "eval_samples_per_second": 7.469, + "eval_steps_per_second": 1.867, + "step": 1264 + }, + { + "epoch": 0.51, + "learning_rate": 1.4784e-07, + "loss": 1.4851, + "step": 1268 + }, + { + "epoch": 0.51, + "eval_loss": 1.374289631843567, + "eval_runtime": 0.5235, + "eval_samples_per_second": 7.64, + "eval_steps_per_second": 1.91, + "step": 1268 + }, + { + "epoch": 0.51, + "learning_rate": 1.4736e-07, + "loss": 1.4749, + "step": 1272 + }, + { + "epoch": 0.51, + "eval_loss": 1.3724154233932495, + "eval_runtime": 0.6384, + "eval_samples_per_second": 6.266, + "eval_steps_per_second": 1.566, + "step": 1272 + }, + { + "epoch": 0.51, + "learning_rate": 1.4687999999999998e-07, + "loss": 1.4594, + "step": 1276 + }, + { + "epoch": 0.51, + "eval_loss": 1.3709261417388916, + "eval_runtime": 0.7249, + "eval_samples_per_second": 5.518, + "eval_steps_per_second": 1.379, + "step": 1276 + }, + { + "epoch": 0.51, + "learning_rate": 1.464e-07, + "loss": 1.4517, + "step": 1280 + }, + { + "epoch": 0.51, + "eval_loss": 1.3691537380218506, + "eval_runtime": 0.7303, + "eval_samples_per_second": 5.477, + "eval_steps_per_second": 1.369, + "step": 1280 + }, + { + "epoch": 0.51, + "learning_rate": 1.4592e-07, + "loss": 1.4239, + "step": 1284 + }, + { + "epoch": 0.51, + "eval_loss": 1.3673396110534668, + "eval_runtime": 0.7929, + "eval_samples_per_second": 5.044, + "eval_steps_per_second": 1.261, + "step": 1284 + }, + { + "epoch": 0.52, + "learning_rate": 1.4543999999999998e-07, + "loss": 1.4775, + "step": 1288 + }, + { + "epoch": 0.52, + "eval_loss": 1.3657190799713135, + "eval_runtime": 0.5509, + "eval_samples_per_second": 7.261, + "eval_steps_per_second": 1.815, + "step": 1288 + }, + { + "epoch": 0.52, + "learning_rate": 1.4496e-07, + "loss": 1.4483, + "step": 1292 + }, + { + "epoch": 0.52, + "eval_loss": 1.3642776012420654, + "eval_runtime": 0.5236, + "eval_samples_per_second": 7.639, + "eval_steps_per_second": 1.91, + "step": 1292 + }, + { + "epoch": 0.52, + "learning_rate": 1.4447999999999998e-07, + "loss": 1.4688, + "step": 1296 + }, + { + "epoch": 0.52, + "eval_loss": 1.3624374866485596, + "eval_runtime": 0.5281, + "eval_samples_per_second": 7.574, + "eval_steps_per_second": 1.893, + "step": 1296 + }, + { + "epoch": 0.52, + "learning_rate": 1.44e-07, + "loss": 1.4566, + "step": 1300 + }, + { + "epoch": 0.52, + "eval_loss": 1.3608499765396118, + "eval_runtime": 0.5346, + "eval_samples_per_second": 7.482, + "eval_steps_per_second": 1.871, + "step": 1300 + }, + { + "epoch": 0.52, + "learning_rate": 1.4352e-07, + "loss": 1.4592, + "step": 1304 + }, + { + "epoch": 0.52, + "eval_loss": 1.3591777086257935, + "eval_runtime": 0.543, + "eval_samples_per_second": 7.367, + "eval_steps_per_second": 1.842, + "step": 1304 + }, + { + "epoch": 0.52, + "learning_rate": 1.4304e-07, + "loss": 1.4505, + "step": 1308 + }, + { + "epoch": 0.52, + "eval_loss": 1.357291340827942, + "eval_runtime": 0.7548, + "eval_samples_per_second": 5.299, + "eval_steps_per_second": 1.325, + "step": 1308 + }, + { + "epoch": 0.52, + "learning_rate": 1.4256e-07, + "loss": 1.4304, + "step": 1312 + }, + { + "epoch": 0.52, + "eval_loss": 1.3557498455047607, + "eval_runtime": 0.7262, + "eval_samples_per_second": 5.508, + "eval_steps_per_second": 1.377, + "step": 1312 + }, + { + "epoch": 0.53, + "learning_rate": 1.4208e-07, + "loss": 1.4691, + "step": 1316 + }, + { + "epoch": 0.53, + "eval_loss": 1.3540558815002441, + "eval_runtime": 0.7121, + "eval_samples_per_second": 5.617, + "eval_steps_per_second": 1.404, + "step": 1316 + }, + { + "epoch": 0.53, + "learning_rate": 1.416e-07, + "loss": 1.4423, + "step": 1320 + }, + { + "epoch": 0.53, + "eval_loss": 1.3522251844406128, + "eval_runtime": 0.7774, + "eval_samples_per_second": 5.145, + "eval_steps_per_second": 1.286, + "step": 1320 + }, + { + "epoch": 0.53, + "learning_rate": 1.4111999999999998e-07, + "loss": 1.4301, + "step": 1324 + }, + { + "epoch": 0.53, + "eval_loss": 1.3508257865905762, + "eval_runtime": 0.5433, + "eval_samples_per_second": 7.362, + "eval_steps_per_second": 1.841, + "step": 1324 + }, + { + "epoch": 0.53, + "learning_rate": 1.4064e-07, + "loss": 1.4422, + "step": 1328 + }, + { + "epoch": 0.53, + "eval_loss": 1.3490896224975586, + "eval_runtime": 0.5369, + "eval_samples_per_second": 7.451, + "eval_steps_per_second": 1.863, + "step": 1328 + }, + { + "epoch": 0.53, + "learning_rate": 1.4016e-07, + "loss": 1.4577, + "step": 1332 + }, + { + "epoch": 0.53, + "eval_loss": 1.347461223602295, + "eval_runtime": 0.5223, + "eval_samples_per_second": 7.658, + "eval_steps_per_second": 1.915, + "step": 1332 + }, + { + "epoch": 0.53, + "learning_rate": 1.3968e-07, + "loss": 1.4541, + "step": 1336 + }, + { + "epoch": 0.53, + "eval_loss": 1.3457545042037964, + "eval_runtime": 0.5399, + "eval_samples_per_second": 7.409, + "eval_steps_per_second": 1.852, + "step": 1336 + }, + { + "epoch": 0.54, + "learning_rate": 1.392e-07, + "loss": 1.4246, + "step": 1340 + }, + { + "epoch": 0.54, + "eval_loss": 1.343980073928833, + "eval_runtime": 0.5481, + "eval_samples_per_second": 7.297, + "eval_steps_per_second": 1.824, + "step": 1340 + }, + { + "epoch": 0.54, + "learning_rate": 1.3872e-07, + "loss": 1.4507, + "step": 1344 + }, + { + "epoch": 0.54, + "eval_loss": 1.3423739671707153, + "eval_runtime": 0.7414, + "eval_samples_per_second": 5.395, + "eval_steps_per_second": 1.349, + "step": 1344 + }, + { + "epoch": 0.54, + "learning_rate": 1.3824e-07, + "loss": 1.4312, + "step": 1348 + }, + { + "epoch": 0.54, + "eval_loss": 1.3408253192901611, + "eval_runtime": 0.7783, + "eval_samples_per_second": 5.139, + "eval_steps_per_second": 1.285, + "step": 1348 + }, + { + "epoch": 0.54, + "learning_rate": 1.3775999999999998e-07, + "loss": 1.4394, + "step": 1352 + }, + { + "epoch": 0.54, + "eval_loss": 1.339220404624939, + "eval_runtime": 0.771, + "eval_samples_per_second": 5.188, + "eval_steps_per_second": 1.297, + "step": 1352 + }, + { + "epoch": 0.54, + "learning_rate": 1.3728e-07, + "loss": 1.4271, + "step": 1356 + }, + { + "epoch": 0.54, + "eval_loss": 1.3373547792434692, + "eval_runtime": 0.5264, + "eval_samples_per_second": 7.599, + "eval_steps_per_second": 1.9, + "step": 1356 + }, + { + "epoch": 0.54, + "learning_rate": 1.368e-07, + "loss": 1.4081, + "step": 1360 + }, + { + "epoch": 0.54, + "eval_loss": 1.3356679677963257, + "eval_runtime": 0.5397, + "eval_samples_per_second": 7.412, + "eval_steps_per_second": 1.853, + "step": 1360 + }, + { + "epoch": 0.55, + "learning_rate": 1.3632e-07, + "loss": 1.4314, + "step": 1364 + }, + { + "epoch": 0.55, + "eval_loss": 1.333927035331726, + "eval_runtime": 0.5418, + "eval_samples_per_second": 7.382, + "eval_steps_per_second": 1.846, + "step": 1364 + }, + { + "epoch": 0.55, + "learning_rate": 1.3583999999999998e-07, + "loss": 1.4359, + "step": 1368 + }, + { + "epoch": 0.55, + "eval_loss": 1.3325647115707397, + "eval_runtime": 0.5464, + "eval_samples_per_second": 7.321, + "eval_steps_per_second": 1.83, + "step": 1368 + }, + { + "epoch": 0.55, + "learning_rate": 1.3536e-07, + "loss": 1.4381, + "step": 1372 + }, + { + "epoch": 0.55, + "eval_loss": 1.3307887315750122, + "eval_runtime": 0.5493, + "eval_samples_per_second": 7.282, + "eval_steps_per_second": 1.82, + "step": 1372 + }, + { + "epoch": 0.55, + "learning_rate": 1.3488e-07, + "loss": 1.4219, + "step": 1376 + }, + { + "epoch": 0.55, + "eval_loss": 1.3293663263320923, + "eval_runtime": 0.5921, + "eval_samples_per_second": 6.755, + "eval_steps_per_second": 1.689, + "step": 1376 + }, + { + "epoch": 0.55, + "learning_rate": 1.3439999999999999e-07, + "loss": 1.4669, + "step": 1380 + }, + { + "epoch": 0.55, + "eval_loss": 1.3278565406799316, + "eval_runtime": 0.7788, + "eval_samples_per_second": 5.136, + "eval_steps_per_second": 1.284, + "step": 1380 + }, + { + "epoch": 0.55, + "learning_rate": 1.3392e-07, + "loss": 1.4163, + "step": 1384 + }, + { + "epoch": 0.55, + "eval_loss": 1.3260074853897095, + "eval_runtime": 0.8128, + "eval_samples_per_second": 4.921, + "eval_steps_per_second": 1.23, + "step": 1384 + }, + { + "epoch": 0.56, + "learning_rate": 1.3343999999999998e-07, + "loss": 1.4153, + "step": 1388 + }, + { + "epoch": 0.56, + "eval_loss": 1.3242360353469849, + "eval_runtime": 0.8002, + "eval_samples_per_second": 4.999, + "eval_steps_per_second": 1.25, + "step": 1388 + }, + { + "epoch": 0.56, + "learning_rate": 1.3296e-07, + "loss": 1.4506, + "step": 1392 + }, + { + "epoch": 0.56, + "eval_loss": 1.3229784965515137, + "eval_runtime": 0.5395, + "eval_samples_per_second": 7.414, + "eval_steps_per_second": 1.854, + "step": 1392 + }, + { + "epoch": 0.56, + "learning_rate": 1.3247999999999998e-07, + "loss": 1.4229, + "step": 1396 + }, + { + "epoch": 0.56, + "eval_loss": 1.3213036060333252, + "eval_runtime": 0.5374, + "eval_samples_per_second": 7.444, + "eval_steps_per_second": 1.861, + "step": 1396 + }, + { + "epoch": 0.56, + "learning_rate": 1.32e-07, + "loss": 1.4218, + "step": 1400 + }, + { + "epoch": 0.56, + "eval_loss": 1.3196250200271606, + "eval_runtime": 0.5404, + "eval_samples_per_second": 7.402, + "eval_steps_per_second": 1.851, + "step": 1400 + }, + { + "epoch": 0.56, + "learning_rate": 1.3152e-07, + "loss": 1.4185, + "step": 1404 + }, + { + "epoch": 0.56, + "eval_loss": 1.3180840015411377, + "eval_runtime": 0.5573, + "eval_samples_per_second": 7.177, + "eval_steps_per_second": 1.794, + "step": 1404 + }, + { + "epoch": 0.56, + "learning_rate": 1.3104e-07, + "loss": 1.4283, + "step": 1408 + }, + { + "epoch": 0.56, + "eval_loss": 1.316424012184143, + "eval_runtime": 0.5204, + "eval_samples_per_second": 7.686, + "eval_steps_per_second": 1.922, + "step": 1408 + }, + { + "epoch": 0.56, + "learning_rate": 1.3056e-07, + "loss": 1.4202, + "step": 1412 + }, + { + "epoch": 0.56, + "eval_loss": 1.3148062229156494, + "eval_runtime": 0.7628, + "eval_samples_per_second": 5.244, + "eval_steps_per_second": 1.311, + "step": 1412 + }, + { + "epoch": 0.57, + "learning_rate": 1.3007999999999998e-07, + "loss": 1.3736, + "step": 1416 + }, + { + "epoch": 0.57, + "eval_loss": 1.3131170272827148, + "eval_runtime": 0.7763, + "eval_samples_per_second": 5.153, + "eval_steps_per_second": 1.288, + "step": 1416 + }, + { + "epoch": 0.57, + "learning_rate": 1.296e-07, + "loss": 1.4332, + "step": 1420 + }, + { + "epoch": 0.57, + "eval_loss": 1.311560869216919, + "eval_runtime": 0.7312, + "eval_samples_per_second": 5.471, + "eval_steps_per_second": 1.368, + "step": 1420 + }, + { + "epoch": 0.57, + "learning_rate": 1.2912e-07, + "loss": 1.4287, + "step": 1424 + }, + { + "epoch": 0.57, + "eval_loss": 1.309916615486145, + "eval_runtime": 0.6738, + "eval_samples_per_second": 5.936, + "eval_steps_per_second": 1.484, + "step": 1424 + }, + { + "epoch": 0.57, + "learning_rate": 1.2864e-07, + "loss": 1.4175, + "step": 1428 + }, + { + "epoch": 0.57, + "eval_loss": 1.3080803155899048, + "eval_runtime": 0.5396, + "eval_samples_per_second": 7.412, + "eval_steps_per_second": 1.853, + "step": 1428 + }, + { + "epoch": 0.57, + "learning_rate": 1.2816e-07, + "loss": 1.4152, + "step": 1432 + }, + { + "epoch": 0.57, + "eval_loss": 1.3066335916519165, + "eval_runtime": 0.5523, + "eval_samples_per_second": 7.243, + "eval_steps_per_second": 1.811, + "step": 1432 + }, + { + "epoch": 0.57, + "learning_rate": 1.2768e-07, + "loss": 1.4036, + "step": 1436 + }, + { + "epoch": 0.57, + "eval_loss": 1.3054327964782715, + "eval_runtime": 0.5404, + "eval_samples_per_second": 7.402, + "eval_steps_per_second": 1.851, + "step": 1436 + }, + { + "epoch": 0.58, + "learning_rate": 1.272e-07, + "loss": 1.4033, + "step": 1440 + }, + { + "epoch": 0.58, + "eval_loss": 1.3037904500961304, + "eval_runtime": 0.5534, + "eval_samples_per_second": 7.228, + "eval_steps_per_second": 1.807, + "step": 1440 + }, + { + "epoch": 0.58, + "learning_rate": 1.2671999999999999e-07, + "loss": 1.4095, + "step": 1444 + }, + { + "epoch": 0.58, + "eval_loss": 1.302278757095337, + "eval_runtime": 0.7546, + "eval_samples_per_second": 5.301, + "eval_steps_per_second": 1.325, + "step": 1444 + }, + { + "epoch": 0.58, + "learning_rate": 1.2624e-07, + "loss": 1.4129, + "step": 1448 + }, + { + "epoch": 0.58, + "eval_loss": 1.3008112907409668, + "eval_runtime": 0.7157, + "eval_samples_per_second": 5.589, + "eval_steps_per_second": 1.397, + "step": 1448 + }, + { + "epoch": 0.58, + "learning_rate": 1.2576e-07, + "loss": 1.3838, + "step": 1452 + }, + { + "epoch": 0.58, + "eval_loss": 1.2994916439056396, + "eval_runtime": 0.7773, + "eval_samples_per_second": 5.146, + "eval_steps_per_second": 1.286, + "step": 1452 + }, + { + "epoch": 0.58, + "learning_rate": 1.2528e-07, + "loss": 1.3939, + "step": 1456 + }, + { + "epoch": 0.58, + "eval_loss": 1.2979990243911743, + "eval_runtime": 0.8203, + "eval_samples_per_second": 4.876, + "eval_steps_per_second": 1.219, + "step": 1456 + }, + { + "epoch": 0.58, + "learning_rate": 1.2479999999999998e-07, + "loss": 1.4023, + "step": 1460 + }, + { + "epoch": 0.58, + "eval_loss": 1.2964202165603638, + "eval_runtime": 0.5392, + "eval_samples_per_second": 7.419, + "eval_steps_per_second": 1.855, + "step": 1460 + }, + { + "epoch": 0.59, + "learning_rate": 1.2432e-07, + "loss": 1.3751, + "step": 1464 + }, + { + "epoch": 0.59, + "eval_loss": 1.2952665090560913, + "eval_runtime": 0.533, + "eval_samples_per_second": 7.505, + "eval_steps_per_second": 1.876, + "step": 1464 + }, + { + "epoch": 0.59, + "learning_rate": 1.2384e-07, + "loss": 1.3657, + "step": 1468 + }, + { + "epoch": 0.59, + "eval_loss": 1.2935295104980469, + "eval_runtime": 0.5428, + "eval_samples_per_second": 7.369, + "eval_steps_per_second": 1.842, + "step": 1468 + }, + { + "epoch": 0.59, + "learning_rate": 1.2336e-07, + "loss": 1.375, + "step": 1472 + }, + { + "epoch": 0.59, + "eval_loss": 1.292738914489746, + "eval_runtime": 0.5365, + "eval_samples_per_second": 7.456, + "eval_steps_per_second": 1.864, + "step": 1472 + }, + { + "epoch": 0.59, + "learning_rate": 1.2288e-07, + "loss": 1.3846, + "step": 1476 + }, + { + "epoch": 0.59, + "eval_loss": 1.291104793548584, + "eval_runtime": 0.5462, + "eval_samples_per_second": 7.323, + "eval_steps_per_second": 1.831, + "step": 1476 + }, + { + "epoch": 0.59, + "learning_rate": 1.2239999999999998e-07, + "loss": 1.4192, + "step": 1480 + }, + { + "epoch": 0.59, + "eval_loss": 1.2900675535202026, + "eval_runtime": 0.7504, + "eval_samples_per_second": 5.33, + "eval_steps_per_second": 1.333, + "step": 1480 + }, + { + "epoch": 0.59, + "learning_rate": 1.2192e-07, + "loss": 1.3629, + "step": 1484 + }, + { + "epoch": 0.59, + "eval_loss": 1.2886391878128052, + "eval_runtime": 0.7924, + "eval_samples_per_second": 5.048, + "eval_steps_per_second": 1.262, + "step": 1484 + }, + { + "epoch": 0.6, + "learning_rate": 1.2143999999999998e-07, + "loss": 1.3947, + "step": 1488 + }, + { + "epoch": 0.6, + "eval_loss": 1.287713646888733, + "eval_runtime": 0.7522, + "eval_samples_per_second": 5.318, + "eval_steps_per_second": 1.329, + "step": 1488 + }, + { + "epoch": 0.6, + "learning_rate": 1.2096e-07, + "loss": 1.3485, + "step": 1492 + }, + { + "epoch": 0.6, + "eval_loss": 1.2862787246704102, + "eval_runtime": 0.5402, + "eval_samples_per_second": 7.404, + "eval_steps_per_second": 1.851, + "step": 1492 + }, + { + "epoch": 0.6, + "learning_rate": 1.2048e-07, + "loss": 1.405, + "step": 1496 + }, + { + "epoch": 0.6, + "eval_loss": 1.2850462198257446, + "eval_runtime": 0.5452, + "eval_samples_per_second": 7.337, + "eval_steps_per_second": 1.834, + "step": 1496 + }, + { + "epoch": 0.6, + "learning_rate": 1.2e-07, + "loss": 1.3758, + "step": 1500 + }, + { + "epoch": 0.6, + "eval_loss": 1.2840522527694702, + "eval_runtime": 0.541, + "eval_samples_per_second": 7.394, + "eval_steps_per_second": 1.849, + "step": 1500 + } + ], + "logging_steps": 4, + "max_steps": 2500, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "total_flos": 1.9068132261888e+17, + "trial_name": null, + "trial_params": null +}