diff --git "a/checkpoint-2500/trainer_state.json" "b/checkpoint-2500/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-2500/trainer_state.json" @@ -0,0 +1,8770 @@ +{ + "best_metric": 1.1666896343231201, + "best_model_checkpoint": "./results/checkpoint-2500", + "epoch": 1.0, + "eval_steps": 4, + "global_step": 2500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 2.9951999999999997e-07, + "loss": 2.6285, + "step": 4 + }, + { + "epoch": 0.0, + "eval_loss": 2.4697508811950684, + "eval_runtime": 0.485, + "eval_samples_per_second": 8.248, + "eval_steps_per_second": 2.062, + "step": 4 + }, + { + "epoch": 0.0, + "learning_rate": 2.9904e-07, + "loss": 2.6222, + "step": 8 + }, + { + "epoch": 0.0, + "eval_loss": 2.465975284576416, + "eval_runtime": 0.6323, + "eval_samples_per_second": 6.326, + "eval_steps_per_second": 1.582, + "step": 8 + }, + { + "epoch": 0.0, + "learning_rate": 2.9856e-07, + "loss": 2.6536, + "step": 12 + }, + { + "epoch": 0.0, + "eval_loss": 2.460374116897583, + "eval_runtime": 0.6478, + "eval_samples_per_second": 6.175, + "eval_steps_per_second": 1.544, + "step": 12 + }, + { + "epoch": 0.01, + "learning_rate": 2.9808e-07, + "loss": 2.6785, + "step": 16 + }, + { + "epoch": 0.01, + "eval_loss": 2.4556970596313477, + "eval_runtime": 0.6653, + "eval_samples_per_second": 6.012, + "eval_steps_per_second": 1.503, + "step": 16 + }, + { + "epoch": 0.01, + "learning_rate": 2.9759999999999996e-07, + "loss": 2.6085, + "step": 20 + }, + { + "epoch": 0.01, + "eval_loss": 2.4514715671539307, + "eval_runtime": 0.5241, + "eval_samples_per_second": 7.632, + "eval_steps_per_second": 1.908, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 2.9711999999999995e-07, + "loss": 2.5907, + "step": 24 + }, + { + "epoch": 0.01, + "eval_loss": 2.4462974071502686, + "eval_runtime": 0.4689, + "eval_samples_per_second": 8.53, + "eval_steps_per_second": 2.133, + "step": 24 + }, + { + "epoch": 0.01, + "learning_rate": 2.9664e-07, + "loss": 2.5942, + "step": 28 + }, + { + "epoch": 0.01, + "eval_loss": 2.4415194988250732, + "eval_runtime": 0.4829, + "eval_samples_per_second": 8.284, + "eval_steps_per_second": 2.071, + "step": 28 + }, + { + "epoch": 0.01, + "learning_rate": 2.9615999999999997e-07, + "loss": 2.6101, + "step": 32 + }, + { + "epoch": 0.01, + "eval_loss": 2.437161922454834, + "eval_runtime": 0.4715, + "eval_samples_per_second": 8.483, + "eval_steps_per_second": 2.121, + "step": 32 + }, + { + "epoch": 0.01, + "learning_rate": 2.9568e-07, + "loss": 2.5827, + "step": 36 + }, + { + "epoch": 0.01, + "eval_loss": 2.432689666748047, + "eval_runtime": 0.4938, + "eval_samples_per_second": 8.1, + "eval_steps_per_second": 2.025, + "step": 36 + }, + { + "epoch": 0.02, + "learning_rate": 2.952e-07, + "loss": 2.5729, + "step": 40 + }, + { + "epoch": 0.02, + "eval_loss": 2.4281153678894043, + "eval_runtime": 0.5021, + "eval_samples_per_second": 7.966, + "eval_steps_per_second": 1.991, + "step": 40 + }, + { + "epoch": 0.02, + "learning_rate": 2.9472e-07, + "loss": 2.5856, + "step": 44 + }, + { + "epoch": 0.02, + "eval_loss": 2.423053741455078, + "eval_runtime": 0.593, + "eval_samples_per_second": 6.746, + "eval_steps_per_second": 1.686, + "step": 44 + }, + { + "epoch": 0.02, + "learning_rate": 2.9423999999999997e-07, + "loss": 2.589, + "step": 48 + }, + { + "epoch": 0.02, + "eval_loss": 2.418571949005127, + "eval_runtime": 0.6933, + "eval_samples_per_second": 5.77, + "eval_steps_per_second": 1.442, + "step": 48 + }, + { + "epoch": 0.02, + "learning_rate": 2.9375999999999995e-07, + "loss": 2.6483, + "step": 52 + }, + { + "epoch": 0.02, + "eval_loss": 2.414531946182251, + "eval_runtime": 0.7167, + "eval_samples_per_second": 5.581, + "eval_steps_per_second": 1.395, + "step": 52 + }, + { + "epoch": 0.02, + "learning_rate": 2.9328e-07, + "loss": 2.517, + "step": 56 + }, + { + "epoch": 0.02, + "eval_loss": 2.409538745880127, + "eval_runtime": 0.4826, + "eval_samples_per_second": 8.289, + "eval_steps_per_second": 2.072, + "step": 56 + }, + { + "epoch": 0.02, + "learning_rate": 2.928e-07, + "loss": 2.5987, + "step": 60 + }, + { + "epoch": 0.02, + "eval_loss": 2.4050426483154297, + "eval_runtime": 0.4757, + "eval_samples_per_second": 8.409, + "eval_steps_per_second": 2.102, + "step": 60 + }, + { + "epoch": 0.03, + "learning_rate": 2.9232e-07, + "loss": 2.5489, + "step": 64 + }, + { + "epoch": 0.03, + "eval_loss": 2.400360107421875, + "eval_runtime": 0.4945, + "eval_samples_per_second": 8.089, + "eval_steps_per_second": 2.022, + "step": 64 + }, + { + "epoch": 0.03, + "learning_rate": 2.9184e-07, + "loss": 2.5063, + "step": 68 + }, + { + "epoch": 0.03, + "eval_loss": 2.396500587463379, + "eval_runtime": 0.5, + "eval_samples_per_second": 8.001, + "eval_steps_per_second": 2.0, + "step": 68 + }, + { + "epoch": 0.03, + "learning_rate": 2.9136e-07, + "loss": 2.5867, + "step": 72 + }, + { + "epoch": 0.03, + "eval_loss": 2.3916146755218506, + "eval_runtime": 0.4602, + "eval_samples_per_second": 8.693, + "eval_steps_per_second": 2.173, + "step": 72 + }, + { + "epoch": 0.03, + "learning_rate": 2.9087999999999997e-07, + "loss": 2.544, + "step": 76 + }, + { + "epoch": 0.03, + "eval_loss": 2.3873047828674316, + "eval_runtime": 0.4731, + "eval_samples_per_second": 8.456, + "eval_steps_per_second": 2.114, + "step": 76 + }, + { + "epoch": 0.03, + "learning_rate": 2.9039999999999995e-07, + "loss": 2.5596, + "step": 80 + }, + { + "epoch": 0.03, + "eval_loss": 2.382803440093994, + "eval_runtime": 0.6092, + "eval_samples_per_second": 6.566, + "eval_steps_per_second": 1.642, + "step": 80 + }, + { + "epoch": 0.03, + "learning_rate": 2.8992e-07, + "loss": 2.5744, + "step": 84 + }, + { + "epoch": 0.03, + "eval_loss": 2.3786380290985107, + "eval_runtime": 0.7212, + "eval_samples_per_second": 5.546, + "eval_steps_per_second": 1.387, + "step": 84 + }, + { + "epoch": 0.04, + "learning_rate": 2.8944e-07, + "loss": 2.5588, + "step": 88 + }, + { + "epoch": 0.04, + "eval_loss": 2.374176502227783, + "eval_runtime": 0.6826, + "eval_samples_per_second": 5.86, + "eval_steps_per_second": 1.465, + "step": 88 + }, + { + "epoch": 0.04, + "learning_rate": 2.8895999999999996e-07, + "loss": 2.5579, + "step": 92 + }, + { + "epoch": 0.04, + "eval_loss": 2.3702104091644287, + "eval_runtime": 0.4896, + "eval_samples_per_second": 8.169, + "eval_steps_per_second": 2.042, + "step": 92 + }, + { + "epoch": 0.04, + "learning_rate": 2.8848e-07, + "loss": 2.5245, + "step": 96 + }, + { + "epoch": 0.04, + "eval_loss": 2.3660218715667725, + "eval_runtime": 0.4764, + "eval_samples_per_second": 8.397, + "eval_steps_per_second": 2.099, + "step": 96 + }, + { + "epoch": 0.04, + "learning_rate": 2.88e-07, + "loss": 2.5132, + "step": 100 + }, + { + "epoch": 0.04, + "eval_loss": 2.36110520362854, + "eval_runtime": 0.4799, + "eval_samples_per_second": 8.335, + "eval_steps_per_second": 2.084, + "step": 100 + }, + { + "epoch": 0.04, + "learning_rate": 2.8751999999999997e-07, + "loss": 2.5037, + "step": 104 + }, + { + "epoch": 0.04, + "eval_loss": 2.3570125102996826, + "eval_runtime": 0.4722, + "eval_samples_per_second": 8.47, + "eval_steps_per_second": 2.118, + "step": 104 + }, + { + "epoch": 0.04, + "learning_rate": 2.8704e-07, + "loss": 2.4727, + "step": 108 + }, + { + "epoch": 0.04, + "eval_loss": 2.3530666828155518, + "eval_runtime": 0.467, + "eval_samples_per_second": 8.565, + "eval_steps_per_second": 2.141, + "step": 108 + }, + { + "epoch": 0.04, + "learning_rate": 2.8656e-07, + "loss": 2.4709, + "step": 112 + }, + { + "epoch": 0.04, + "eval_loss": 2.348759412765503, + "eval_runtime": 0.501, + "eval_samples_per_second": 7.984, + "eval_steps_per_second": 1.996, + "step": 112 + }, + { + "epoch": 0.05, + "learning_rate": 2.8608e-07, + "loss": 2.4711, + "step": 116 + }, + { + "epoch": 0.05, + "eval_loss": 2.344454050064087, + "eval_runtime": 0.6607, + "eval_samples_per_second": 6.054, + "eval_steps_per_second": 1.513, + "step": 116 + }, + { + "epoch": 0.05, + "learning_rate": 2.8559999999999996e-07, + "loss": 2.5445, + "step": 120 + }, + { + "epoch": 0.05, + "eval_loss": 2.3402156829833984, + "eval_runtime": 0.704, + "eval_samples_per_second": 5.682, + "eval_steps_per_second": 1.42, + "step": 120 + }, + { + "epoch": 0.05, + "learning_rate": 2.8512e-07, + "loss": 2.4994, + "step": 124 + }, + { + "epoch": 0.05, + "eval_loss": 2.3362019062042236, + "eval_runtime": 0.6849, + "eval_samples_per_second": 5.84, + "eval_steps_per_second": 1.46, + "step": 124 + }, + { + "epoch": 0.05, + "learning_rate": 2.8464e-07, + "loss": 2.5036, + "step": 128 + }, + { + "epoch": 0.05, + "eval_loss": 2.3319339752197266, + "eval_runtime": 0.4864, + "eval_samples_per_second": 8.223, + "eval_steps_per_second": 2.056, + "step": 128 + }, + { + "epoch": 0.05, + "learning_rate": 2.8416e-07, + "loss": 2.5525, + "step": 132 + }, + { + "epoch": 0.05, + "eval_loss": 2.3276522159576416, + "eval_runtime": 0.4783, + "eval_samples_per_second": 8.364, + "eval_steps_per_second": 2.091, + "step": 132 + }, + { + "epoch": 0.05, + "learning_rate": 2.8368e-07, + "loss": 2.5245, + "step": 136 + }, + { + "epoch": 0.05, + "eval_loss": 2.3241090774536133, + "eval_runtime": 0.4805, + "eval_samples_per_second": 8.324, + "eval_steps_per_second": 2.081, + "step": 136 + }, + { + "epoch": 0.06, + "learning_rate": 2.832e-07, + "loss": 2.4946, + "step": 140 + }, + { + "epoch": 0.06, + "eval_loss": 2.3198165893554688, + "eval_runtime": 0.473, + "eval_samples_per_second": 8.457, + "eval_steps_per_second": 2.114, + "step": 140 + }, + { + "epoch": 0.06, + "learning_rate": 2.8272e-07, + "loss": 2.5142, + "step": 144 + }, + { + "epoch": 0.06, + "eval_loss": 2.3152613639831543, + "eval_runtime": 0.4858, + "eval_samples_per_second": 8.234, + "eval_steps_per_second": 2.058, + "step": 144 + }, + { + "epoch": 0.06, + "learning_rate": 2.8223999999999997e-07, + "loss": 2.4639, + "step": 148 + }, + { + "epoch": 0.06, + "eval_loss": 2.3112645149230957, + "eval_runtime": 0.488, + "eval_samples_per_second": 8.196, + "eval_steps_per_second": 2.049, + "step": 148 + }, + { + "epoch": 0.06, + "learning_rate": 2.8176e-07, + "loss": 2.4796, + "step": 152 + }, + { + "epoch": 0.06, + "eval_loss": 2.307020902633667, + "eval_runtime": 0.6163, + "eval_samples_per_second": 6.49, + "eval_steps_per_second": 1.623, + "step": 152 + }, + { + "epoch": 0.06, + "learning_rate": 2.8128e-07, + "loss": 2.4529, + "step": 156 + }, + { + "epoch": 0.06, + "eval_loss": 2.303062915802002, + "eval_runtime": 0.6764, + "eval_samples_per_second": 5.913, + "eval_steps_per_second": 1.478, + "step": 156 + }, + { + "epoch": 0.06, + "learning_rate": 2.808e-07, + "loss": 2.4823, + "step": 160 + }, + { + "epoch": 0.06, + "eval_loss": 2.2993311882019043, + "eval_runtime": 0.6854, + "eval_samples_per_second": 5.836, + "eval_steps_per_second": 1.459, + "step": 160 + }, + { + "epoch": 0.07, + "learning_rate": 2.8032e-07, + "loss": 2.4439, + "step": 164 + }, + { + "epoch": 0.07, + "eval_loss": 2.2947850227355957, + "eval_runtime": 0.4745, + "eval_samples_per_second": 8.429, + "eval_steps_per_second": 2.107, + "step": 164 + }, + { + "epoch": 0.07, + "learning_rate": 2.7984e-07, + "loss": 2.4652, + "step": 168 + }, + { + "epoch": 0.07, + "eval_loss": 2.2908992767333984, + "eval_runtime": 0.4759, + "eval_samples_per_second": 8.406, + "eval_steps_per_second": 2.101, + "step": 168 + }, + { + "epoch": 0.07, + "learning_rate": 2.7936e-07, + "loss": 2.4574, + "step": 172 + }, + { + "epoch": 0.07, + "eval_loss": 2.2867026329040527, + "eval_runtime": 0.4973, + "eval_samples_per_second": 8.043, + "eval_steps_per_second": 2.011, + "step": 172 + }, + { + "epoch": 0.07, + "learning_rate": 2.7887999999999997e-07, + "loss": 2.4557, + "step": 176 + }, + { + "epoch": 0.07, + "eval_loss": 2.283027172088623, + "eval_runtime": 0.4719, + "eval_samples_per_second": 8.477, + "eval_steps_per_second": 2.119, + "step": 176 + }, + { + "epoch": 0.07, + "learning_rate": 2.784e-07, + "loss": 2.4462, + "step": 180 + }, + { + "epoch": 0.07, + "eval_loss": 2.2787420749664307, + "eval_runtime": 0.472, + "eval_samples_per_second": 8.474, + "eval_steps_per_second": 2.119, + "step": 180 + }, + { + "epoch": 0.07, + "learning_rate": 2.7792e-07, + "loss": 2.3962, + "step": 184 + }, + { + "epoch": 0.07, + "eval_loss": 2.2745461463928223, + "eval_runtime": 0.6328, + "eval_samples_per_second": 6.322, + "eval_steps_per_second": 1.58, + "step": 184 + }, + { + "epoch": 0.08, + "learning_rate": 2.7744e-07, + "loss": 2.3666, + "step": 188 + }, + { + "epoch": 0.08, + "eval_loss": 2.2705912590026855, + "eval_runtime": 0.6375, + "eval_samples_per_second": 6.274, + "eval_steps_per_second": 1.569, + "step": 188 + }, + { + "epoch": 0.08, + "learning_rate": 2.7696e-07, + "loss": 2.5024, + "step": 192 + }, + { + "epoch": 0.08, + "eval_loss": 2.266995906829834, + "eval_runtime": 0.6984, + "eval_samples_per_second": 5.727, + "eval_steps_per_second": 1.432, + "step": 192 + }, + { + "epoch": 0.08, + "learning_rate": 2.7648e-07, + "loss": 2.4419, + "step": 196 + }, + { + "epoch": 0.08, + "eval_loss": 2.2626519203186035, + "eval_runtime": 0.7334, + "eval_samples_per_second": 5.454, + "eval_steps_per_second": 1.363, + "step": 196 + }, + { + "epoch": 0.08, + "learning_rate": 2.76e-07, + "loss": 2.4246, + "step": 200 + }, + { + "epoch": 0.08, + "eval_loss": 2.2583603858947754, + "eval_runtime": 0.48, + "eval_samples_per_second": 8.333, + "eval_steps_per_second": 2.083, + "step": 200 + }, + { + "epoch": 0.08, + "learning_rate": 2.7551999999999997e-07, + "loss": 2.3853, + "step": 204 + }, + { + "epoch": 0.08, + "eval_loss": 2.2551512718200684, + "eval_runtime": 0.4939, + "eval_samples_per_second": 8.098, + "eval_steps_per_second": 2.025, + "step": 204 + }, + { + "epoch": 0.08, + "learning_rate": 2.7503999999999995e-07, + "loss": 2.4032, + "step": 208 + }, + { + "epoch": 0.08, + "eval_loss": 2.251105785369873, + "eval_runtime": 0.46, + "eval_samples_per_second": 8.695, + "eval_steps_per_second": 2.174, + "step": 208 + }, + { + "epoch": 0.08, + "learning_rate": 2.7456e-07, + "loss": 2.4444, + "step": 212 + }, + { + "epoch": 0.08, + "eval_loss": 2.247025489807129, + "eval_runtime": 0.4948, + "eval_samples_per_second": 8.084, + "eval_steps_per_second": 2.021, + "step": 212 + }, + { + "epoch": 0.09, + "learning_rate": 2.7408e-07, + "loss": 2.2932, + "step": 216 + }, + { + "epoch": 0.09, + "eval_loss": 2.242764472961426, + "eval_runtime": 0.4897, + "eval_samples_per_second": 8.168, + "eval_steps_per_second": 2.042, + "step": 216 + }, + { + "epoch": 0.09, + "learning_rate": 2.736e-07, + "loss": 2.3929, + "step": 220 + }, + { + "epoch": 0.09, + "eval_loss": 2.2391483783721924, + "eval_runtime": 0.6128, + "eval_samples_per_second": 6.528, + "eval_steps_per_second": 1.632, + "step": 220 + }, + { + "epoch": 0.09, + "learning_rate": 2.7312e-07, + "loss": 2.4112, + "step": 224 + }, + { + "epoch": 0.09, + "eval_loss": 2.234977960586548, + "eval_runtime": 0.648, + "eval_samples_per_second": 6.172, + "eval_steps_per_second": 1.543, + "step": 224 + }, + { + "epoch": 0.09, + "learning_rate": 2.7264e-07, + "loss": 2.4191, + "step": 228 + }, + { + "epoch": 0.09, + "eval_loss": 2.231099843978882, + "eval_runtime": 0.6862, + "eval_samples_per_second": 5.829, + "eval_steps_per_second": 1.457, + "step": 228 + }, + { + "epoch": 0.09, + "learning_rate": 2.7215999999999997e-07, + "loss": 2.4408, + "step": 232 + }, + { + "epoch": 0.09, + "eval_loss": 2.2272462844848633, + "eval_runtime": 0.7076, + "eval_samples_per_second": 5.653, + "eval_steps_per_second": 1.413, + "step": 232 + }, + { + "epoch": 0.09, + "learning_rate": 2.7167999999999996e-07, + "loss": 2.3884, + "step": 236 + }, + { + "epoch": 0.09, + "eval_loss": 2.223376750946045, + "eval_runtime": 0.5169, + "eval_samples_per_second": 7.738, + "eval_steps_per_second": 1.935, + "step": 236 + }, + { + "epoch": 0.1, + "learning_rate": 2.712e-07, + "loss": 2.3689, + "step": 240 + }, + { + "epoch": 0.1, + "eval_loss": 2.2195653915405273, + "eval_runtime": 0.4793, + "eval_samples_per_second": 8.346, + "eval_steps_per_second": 2.086, + "step": 240 + }, + { + "epoch": 0.1, + "learning_rate": 2.7072e-07, + "loss": 2.3689, + "step": 244 + }, + { + "epoch": 0.1, + "eval_loss": 2.2153775691986084, + "eval_runtime": 0.4771, + "eval_samples_per_second": 8.384, + "eval_steps_per_second": 2.096, + "step": 244 + }, + { + "epoch": 0.1, + "learning_rate": 2.7024e-07, + "loss": 2.3249, + "step": 248 + }, + { + "epoch": 0.1, + "eval_loss": 2.211355209350586, + "eval_runtime": 0.4778, + "eval_samples_per_second": 8.372, + "eval_steps_per_second": 2.093, + "step": 248 + }, + { + "epoch": 0.1, + "learning_rate": 2.6976e-07, + "loss": 2.4286, + "step": 252 + }, + { + "epoch": 0.1, + "eval_loss": 2.207773208618164, + "eval_runtime": 0.4873, + "eval_samples_per_second": 8.209, + "eval_steps_per_second": 2.052, + "step": 252 + }, + { + "epoch": 0.1, + "learning_rate": 2.6928e-07, + "loss": 2.3497, + "step": 256 + }, + { + "epoch": 0.1, + "eval_loss": 2.203867197036743, + "eval_runtime": 0.6281, + "eval_samples_per_second": 6.368, + "eval_steps_per_second": 1.592, + "step": 256 + }, + { + "epoch": 0.1, + "learning_rate": 2.6879999999999997e-07, + "loss": 2.284, + "step": 260 + }, + { + "epoch": 0.1, + "eval_loss": 2.199937582015991, + "eval_runtime": 0.6885, + "eval_samples_per_second": 5.81, + "eval_steps_per_second": 1.452, + "step": 260 + }, + { + "epoch": 0.11, + "learning_rate": 2.6831999999999996e-07, + "loss": 2.3333, + "step": 264 + }, + { + "epoch": 0.11, + "eval_loss": 2.1958465576171875, + "eval_runtime": 0.6799, + "eval_samples_per_second": 5.883, + "eval_steps_per_second": 1.471, + "step": 264 + }, + { + "epoch": 0.11, + "learning_rate": 2.6784e-07, + "loss": 2.3305, + "step": 268 + }, + { + "epoch": 0.11, + "eval_loss": 2.192072868347168, + "eval_runtime": 0.7165, + "eval_samples_per_second": 5.583, + "eval_steps_per_second": 1.396, + "step": 268 + }, + { + "epoch": 0.11, + "learning_rate": 2.6736e-07, + "loss": 2.3465, + "step": 272 + }, + { + "epoch": 0.11, + "eval_loss": 2.1882476806640625, + "eval_runtime": 0.485, + "eval_samples_per_second": 8.247, + "eval_steps_per_second": 2.062, + "step": 272 + }, + { + "epoch": 0.11, + "learning_rate": 2.6687999999999997e-07, + "loss": 2.3274, + "step": 276 + }, + { + "epoch": 0.11, + "eval_loss": 2.1841320991516113, + "eval_runtime": 0.4767, + "eval_samples_per_second": 8.391, + "eval_steps_per_second": 2.098, + "step": 276 + }, + { + "epoch": 0.11, + "learning_rate": 2.664e-07, + "loss": 2.3641, + "step": 280 + }, + { + "epoch": 0.11, + "eval_loss": 2.1803271770477295, + "eval_runtime": 0.5146, + "eval_samples_per_second": 7.774, + "eval_steps_per_second": 1.943, + "step": 280 + }, + { + "epoch": 0.11, + "learning_rate": 2.6592e-07, + "loss": 2.3089, + "step": 284 + }, + { + "epoch": 0.11, + "eval_loss": 2.176274538040161, + "eval_runtime": 0.488, + "eval_samples_per_second": 8.196, + "eval_steps_per_second": 2.049, + "step": 284 + }, + { + "epoch": 0.12, + "learning_rate": 2.6543999999999997e-07, + "loss": 2.2645, + "step": 288 + }, + { + "epoch": 0.12, + "eval_loss": 2.1720588207244873, + "eval_runtime": 0.4973, + "eval_samples_per_second": 8.043, + "eval_steps_per_second": 2.011, + "step": 288 + }, + { + "epoch": 0.12, + "learning_rate": 2.6495999999999996e-07, + "loss": 2.3439, + "step": 292 + }, + { + "epoch": 0.12, + "eval_loss": 2.1687240600585938, + "eval_runtime": 0.6283, + "eval_samples_per_second": 6.366, + "eval_steps_per_second": 1.592, + "step": 292 + }, + { + "epoch": 0.12, + "learning_rate": 2.6448e-07, + "loss": 2.3285, + "step": 296 + }, + { + "epoch": 0.12, + "eval_loss": 2.1649253368377686, + "eval_runtime": 0.6996, + "eval_samples_per_second": 5.718, + "eval_steps_per_second": 1.429, + "step": 296 + }, + { + "epoch": 0.12, + "learning_rate": 2.64e-07, + "loss": 2.3126, + "step": 300 + }, + { + "epoch": 0.12, + "eval_loss": 2.160398483276367, + "eval_runtime": 0.6904, + "eval_samples_per_second": 5.794, + "eval_steps_per_second": 1.448, + "step": 300 + }, + { + "epoch": 0.12, + "learning_rate": 2.6351999999999997e-07, + "loss": 2.3356, + "step": 304 + }, + { + "epoch": 0.12, + "eval_loss": 2.1570284366607666, + "eval_runtime": 0.4953, + "eval_samples_per_second": 8.076, + "eval_steps_per_second": 2.019, + "step": 304 + }, + { + "epoch": 0.12, + "learning_rate": 2.6304e-07, + "loss": 2.3396, + "step": 308 + }, + { + "epoch": 0.12, + "eval_loss": 2.1527013778686523, + "eval_runtime": 0.4977, + "eval_samples_per_second": 8.037, + "eval_steps_per_second": 2.009, + "step": 308 + }, + { + "epoch": 0.12, + "learning_rate": 2.6256e-07, + "loss": 2.2972, + "step": 312 + }, + { + "epoch": 0.12, + "eval_loss": 2.148724317550659, + "eval_runtime": 0.4939, + "eval_samples_per_second": 8.099, + "eval_steps_per_second": 2.025, + "step": 312 + }, + { + "epoch": 0.13, + "learning_rate": 2.6208e-07, + "loss": 2.3321, + "step": 316 + }, + { + "epoch": 0.13, + "eval_loss": 2.1449663639068604, + "eval_runtime": 0.4784, + "eval_samples_per_second": 8.362, + "eval_steps_per_second": 2.09, + "step": 316 + }, + { + "epoch": 0.13, + "learning_rate": 2.616e-07, + "loss": 2.3348, + "step": 320 + }, + { + "epoch": 0.13, + "eval_loss": 2.1414906978607178, + "eval_runtime": 0.4949, + "eval_samples_per_second": 8.082, + "eval_steps_per_second": 2.021, + "step": 320 + }, + { + "epoch": 0.13, + "learning_rate": 2.6112e-07, + "loss": 2.2728, + "step": 324 + }, + { + "epoch": 0.13, + "eval_loss": 2.1374001502990723, + "eval_runtime": 0.6321, + "eval_samples_per_second": 6.328, + "eval_steps_per_second": 1.582, + "step": 324 + }, + { + "epoch": 0.13, + "learning_rate": 2.6064e-07, + "loss": 2.287, + "step": 328 + }, + { + "epoch": 0.13, + "eval_loss": 2.1333529949188232, + "eval_runtime": 0.6547, + "eval_samples_per_second": 6.109, + "eval_steps_per_second": 1.527, + "step": 328 + }, + { + "epoch": 0.13, + "learning_rate": 2.6015999999999997e-07, + "loss": 2.2474, + "step": 332 + }, + { + "epoch": 0.13, + "eval_loss": 2.1297547817230225, + "eval_runtime": 0.7093, + "eval_samples_per_second": 5.639, + "eval_steps_per_second": 1.41, + "step": 332 + }, + { + "epoch": 0.13, + "learning_rate": 2.5968e-07, + "loss": 2.3214, + "step": 336 + }, + { + "epoch": 0.13, + "eval_loss": 2.126392364501953, + "eval_runtime": 0.6909, + "eval_samples_per_second": 5.789, + "eval_steps_per_second": 1.447, + "step": 336 + }, + { + "epoch": 0.14, + "learning_rate": 2.592e-07, + "loss": 2.2725, + "step": 340 + }, + { + "epoch": 0.14, + "eval_loss": 2.122309923171997, + "eval_runtime": 0.4823, + "eval_samples_per_second": 8.293, + "eval_steps_per_second": 2.073, + "step": 340 + }, + { + "epoch": 0.14, + "learning_rate": 2.5872000000000003e-07, + "loss": 2.3114, + "step": 344 + }, + { + "epoch": 0.14, + "eval_loss": 2.118303060531616, + "eval_runtime": 0.4954, + "eval_samples_per_second": 8.075, + "eval_steps_per_second": 2.019, + "step": 344 + }, + { + "epoch": 0.14, + "learning_rate": 2.5824e-07, + "loss": 2.2333, + "step": 348 + }, + { + "epoch": 0.14, + "eval_loss": 2.114621162414551, + "eval_runtime": 0.4856, + "eval_samples_per_second": 8.238, + "eval_steps_per_second": 2.059, + "step": 348 + }, + { + "epoch": 0.14, + "learning_rate": 2.5776e-07, + "loss": 2.2812, + "step": 352 + }, + { + "epoch": 0.14, + "eval_loss": 2.11067795753479, + "eval_runtime": 0.4778, + "eval_samples_per_second": 8.372, + "eval_steps_per_second": 2.093, + "step": 352 + }, + { + "epoch": 0.14, + "learning_rate": 2.5728e-07, + "loss": 2.2454, + "step": 356 + }, + { + "epoch": 0.14, + "eval_loss": 2.106940746307373, + "eval_runtime": 0.4945, + "eval_samples_per_second": 8.089, + "eval_steps_per_second": 2.022, + "step": 356 + }, + { + "epoch": 0.14, + "learning_rate": 2.5679999999999997e-07, + "loss": 2.2261, + "step": 360 + }, + { + "epoch": 0.14, + "eval_loss": 2.1031668186187744, + "eval_runtime": 0.6521, + "eval_samples_per_second": 6.134, + "eval_steps_per_second": 1.533, + "step": 360 + }, + { + "epoch": 0.15, + "learning_rate": 2.5632e-07, + "loss": 2.2841, + "step": 364 + }, + { + "epoch": 0.15, + "eval_loss": 2.0989203453063965, + "eval_runtime": 0.6249, + "eval_samples_per_second": 6.401, + "eval_steps_per_second": 1.6, + "step": 364 + }, + { + "epoch": 0.15, + "learning_rate": 2.5584e-07, + "loss": 2.2481, + "step": 368 + }, + { + "epoch": 0.15, + "eval_loss": 2.095189332962036, + "eval_runtime": 0.6855, + "eval_samples_per_second": 5.835, + "eval_steps_per_second": 1.459, + "step": 368 + }, + { + "epoch": 0.15, + "learning_rate": 2.5536e-07, + "loss": 2.278, + "step": 372 + }, + { + "epoch": 0.15, + "eval_loss": 2.0912463665008545, + "eval_runtime": 0.7393, + "eval_samples_per_second": 5.411, + "eval_steps_per_second": 1.353, + "step": 372 + }, + { + "epoch": 0.15, + "learning_rate": 2.5488e-07, + "loss": 2.2765, + "step": 376 + }, + { + "epoch": 0.15, + "eval_loss": 2.087336301803589, + "eval_runtime": 0.4793, + "eval_samples_per_second": 8.345, + "eval_steps_per_second": 2.086, + "step": 376 + }, + { + "epoch": 0.15, + "learning_rate": 2.544e-07, + "loss": 2.2232, + "step": 380 + }, + { + "epoch": 0.15, + "eval_loss": 2.0833120346069336, + "eval_runtime": 0.487, + "eval_samples_per_second": 8.214, + "eval_steps_per_second": 2.053, + "step": 380 + }, + { + "epoch": 0.15, + "learning_rate": 2.5392e-07, + "loss": 2.306, + "step": 384 + }, + { + "epoch": 0.15, + "eval_loss": 2.079479932785034, + "eval_runtime": 0.4722, + "eval_samples_per_second": 8.471, + "eval_steps_per_second": 2.118, + "step": 384 + }, + { + "epoch": 0.16, + "learning_rate": 2.5343999999999997e-07, + "loss": 2.2126, + "step": 388 + }, + { + "epoch": 0.16, + "eval_loss": 2.0760295391082764, + "eval_runtime": 0.4958, + "eval_samples_per_second": 8.068, + "eval_steps_per_second": 2.017, + "step": 388 + }, + { + "epoch": 0.16, + "learning_rate": 2.5295999999999996e-07, + "loss": 2.2557, + "step": 392 + }, + { + "epoch": 0.16, + "eval_loss": 2.072136402130127, + "eval_runtime": 0.469, + "eval_samples_per_second": 8.529, + "eval_steps_per_second": 2.132, + "step": 392 + }, + { + "epoch": 0.16, + "learning_rate": 2.5248e-07, + "loss": 2.1988, + "step": 396 + }, + { + "epoch": 0.16, + "eval_loss": 2.0683670043945312, + "eval_runtime": 0.6385, + "eval_samples_per_second": 6.264, + "eval_steps_per_second": 1.566, + "step": 396 + }, + { + "epoch": 0.16, + "learning_rate": 2.52e-07, + "loss": 2.1917, + "step": 400 + }, + { + "epoch": 0.16, + "eval_loss": 2.0638906955718994, + "eval_runtime": 0.6834, + "eval_samples_per_second": 5.853, + "eval_steps_per_second": 1.463, + "step": 400 + }, + { + "epoch": 0.16, + "learning_rate": 2.5152e-07, + "loss": 2.2479, + "step": 404 + }, + { + "epoch": 0.16, + "eval_loss": 2.0599253177642822, + "eval_runtime": 0.7261, + "eval_samples_per_second": 5.509, + "eval_steps_per_second": 1.377, + "step": 404 + }, + { + "epoch": 0.16, + "learning_rate": 2.5104e-07, + "loss": 2.1484, + "step": 408 + }, + { + "epoch": 0.16, + "eval_loss": 2.055751085281372, + "eval_runtime": 0.7367, + "eval_samples_per_second": 5.429, + "eval_steps_per_second": 1.357, + "step": 408 + }, + { + "epoch": 0.16, + "learning_rate": 2.5056e-07, + "loss": 2.1886, + "step": 412 + }, + { + "epoch": 0.16, + "eval_loss": 2.052119016647339, + "eval_runtime": 0.4808, + "eval_samples_per_second": 8.319, + "eval_steps_per_second": 2.08, + "step": 412 + }, + { + "epoch": 0.17, + "learning_rate": 2.5007999999999997e-07, + "loss": 2.2026, + "step": 416 + }, + { + "epoch": 0.17, + "eval_loss": 2.0482354164123535, + "eval_runtime": 0.4856, + "eval_samples_per_second": 8.238, + "eval_steps_per_second": 2.059, + "step": 416 + }, + { + "epoch": 0.17, + "learning_rate": 2.4959999999999996e-07, + "loss": 2.1572, + "step": 420 + }, + { + "epoch": 0.17, + "eval_loss": 2.0441887378692627, + "eval_runtime": 0.4779, + "eval_samples_per_second": 8.37, + "eval_steps_per_second": 2.093, + "step": 420 + }, + { + "epoch": 0.17, + "learning_rate": 2.4912e-07, + "loss": 2.1931, + "step": 424 + }, + { + "epoch": 0.17, + "eval_loss": 2.0399935245513916, + "eval_runtime": 0.4803, + "eval_samples_per_second": 8.329, + "eval_steps_per_second": 2.082, + "step": 424 + }, + { + "epoch": 0.17, + "learning_rate": 2.4864e-07, + "loss": 2.161, + "step": 428 + }, + { + "epoch": 0.17, + "eval_loss": 2.03645920753479, + "eval_runtime": 0.4924, + "eval_samples_per_second": 8.123, + "eval_steps_per_second": 2.031, + "step": 428 + }, + { + "epoch": 0.17, + "learning_rate": 2.4816e-07, + "loss": 2.1115, + "step": 432 + }, + { + "epoch": 0.17, + "eval_loss": 2.032196044921875, + "eval_runtime": 0.6345, + "eval_samples_per_second": 6.304, + "eval_steps_per_second": 1.576, + "step": 432 + }, + { + "epoch": 0.17, + "learning_rate": 2.4768e-07, + "loss": 2.173, + "step": 436 + }, + { + "epoch": 0.17, + "eval_loss": 2.028397560119629, + "eval_runtime": 0.6625, + "eval_samples_per_second": 6.038, + "eval_steps_per_second": 1.509, + "step": 436 + }, + { + "epoch": 0.18, + "learning_rate": 2.472e-07, + "loss": 2.1491, + "step": 440 + }, + { + "epoch": 0.18, + "eval_loss": 2.0247464179992676, + "eval_runtime": 0.6969, + "eval_samples_per_second": 5.74, + "eval_steps_per_second": 1.435, + "step": 440 + }, + { + "epoch": 0.18, + "learning_rate": 2.4672e-07, + "loss": 2.1716, + "step": 444 + }, + { + "epoch": 0.18, + "eval_loss": 2.0203933715820312, + "eval_runtime": 0.7311, + "eval_samples_per_second": 5.471, + "eval_steps_per_second": 1.368, + "step": 444 + }, + { + "epoch": 0.18, + "learning_rate": 2.4623999999999996e-07, + "loss": 2.2031, + "step": 448 + }, + { + "epoch": 0.18, + "eval_loss": 2.016533374786377, + "eval_runtime": 0.4875, + "eval_samples_per_second": 8.206, + "eval_steps_per_second": 2.051, + "step": 448 + }, + { + "epoch": 0.18, + "learning_rate": 2.4576e-07, + "loss": 2.1466, + "step": 452 + }, + { + "epoch": 0.18, + "eval_loss": 2.012568473815918, + "eval_runtime": 0.4897, + "eval_samples_per_second": 8.168, + "eval_steps_per_second": 2.042, + "step": 452 + }, + { + "epoch": 0.18, + "learning_rate": 2.4528e-07, + "loss": 2.1384, + "step": 456 + }, + { + "epoch": 0.18, + "eval_loss": 2.0088417530059814, + "eval_runtime": 0.4969, + "eval_samples_per_second": 8.05, + "eval_steps_per_second": 2.013, + "step": 456 + }, + { + "epoch": 0.18, + "learning_rate": 2.4479999999999997e-07, + "loss": 2.1824, + "step": 460 + }, + { + "epoch": 0.18, + "eval_loss": 2.0047850608825684, + "eval_runtime": 0.4897, + "eval_samples_per_second": 8.168, + "eval_steps_per_second": 2.042, + "step": 460 + }, + { + "epoch": 0.19, + "learning_rate": 2.4432e-07, + "loss": 2.1401, + "step": 464 + }, + { + "epoch": 0.19, + "eval_loss": 2.0006463527679443, + "eval_runtime": 0.4882, + "eval_samples_per_second": 8.193, + "eval_steps_per_second": 2.048, + "step": 464 + }, + { + "epoch": 0.19, + "learning_rate": 2.4384e-07, + "loss": 2.2086, + "step": 468 + }, + { + "epoch": 0.19, + "eval_loss": 1.9969314336776733, + "eval_runtime": 0.6612, + "eval_samples_per_second": 6.049, + "eval_steps_per_second": 1.512, + "step": 468 + }, + { + "epoch": 0.19, + "learning_rate": 2.4336e-07, + "loss": 2.1687, + "step": 472 + }, + { + "epoch": 0.19, + "eval_loss": 1.9925954341888428, + "eval_runtime": 0.6804, + "eval_samples_per_second": 5.879, + "eval_steps_per_second": 1.47, + "step": 472 + }, + { + "epoch": 0.19, + "learning_rate": 2.4287999999999996e-07, + "loss": 2.145, + "step": 476 + }, + { + "epoch": 0.19, + "eval_loss": 1.9888066053390503, + "eval_runtime": 0.6955, + "eval_samples_per_second": 5.752, + "eval_steps_per_second": 1.438, + "step": 476 + }, + { + "epoch": 0.19, + "learning_rate": 2.424e-07, + "loss": 2.2007, + "step": 480 + }, + { + "epoch": 0.19, + "eval_loss": 1.9850127696990967, + "eval_runtime": 0.7558, + "eval_samples_per_second": 5.292, + "eval_steps_per_second": 1.323, + "step": 480 + }, + { + "epoch": 0.19, + "learning_rate": 2.4192e-07, + "loss": 2.1367, + "step": 484 + }, + { + "epoch": 0.19, + "eval_loss": 1.9808437824249268, + "eval_runtime": 0.4706, + "eval_samples_per_second": 8.499, + "eval_steps_per_second": 2.125, + "step": 484 + }, + { + "epoch": 0.2, + "learning_rate": 2.4143999999999997e-07, + "loss": 2.1291, + "step": 488 + }, + { + "epoch": 0.2, + "eval_loss": 1.9767786264419556, + "eval_runtime": 0.4803, + "eval_samples_per_second": 8.327, + "eval_steps_per_second": 2.082, + "step": 488 + }, + { + "epoch": 0.2, + "learning_rate": 2.4096e-07, + "loss": 2.1124, + "step": 492 + }, + { + "epoch": 0.2, + "eval_loss": 1.9728602170944214, + "eval_runtime": 0.4802, + "eval_samples_per_second": 8.33, + "eval_steps_per_second": 2.082, + "step": 492 + }, + { + "epoch": 0.2, + "learning_rate": 2.4048e-07, + "loss": 2.0738, + "step": 496 + }, + { + "epoch": 0.2, + "eval_loss": 1.968900203704834, + "eval_runtime": 0.4884, + "eval_samples_per_second": 8.189, + "eval_steps_per_second": 2.047, + "step": 496 + }, + { + "epoch": 0.2, + "learning_rate": 2.4e-07, + "loss": 2.1048, + "step": 500 + }, + { + "epoch": 0.2, + "eval_loss": 1.9646457433700562, + "eval_runtime": 0.5026, + "eval_samples_per_second": 7.959, + "eval_steps_per_second": 1.99, + "step": 500 + }, + { + "epoch": 0.2, + "learning_rate": 2.3951999999999996e-07, + "loss": 2.0995, + "step": 504 + }, + { + "epoch": 0.2, + "eval_loss": 1.9606600999832153, + "eval_runtime": 0.7928, + "eval_samples_per_second": 5.045, + "eval_steps_per_second": 1.261, + "step": 504 + }, + { + "epoch": 0.2, + "learning_rate": 2.3903999999999995e-07, + "loss": 2.0816, + "step": 508 + }, + { + "epoch": 0.2, + "eval_loss": 1.956822395324707, + "eval_runtime": 0.5321, + "eval_samples_per_second": 7.518, + "eval_steps_per_second": 1.879, + "step": 508 + }, + { + "epoch": 0.2, + "learning_rate": 2.3856e-07, + "loss": 2.0969, + "step": 512 + }, + { + "epoch": 0.2, + "eval_loss": 1.9526716470718384, + "eval_runtime": 0.5174, + "eval_samples_per_second": 7.732, + "eval_steps_per_second": 1.933, + "step": 512 + }, + { + "epoch": 0.21, + "learning_rate": 2.3807999999999997e-07, + "loss": 2.1034, + "step": 516 + }, + { + "epoch": 0.21, + "eval_loss": 1.948419451713562, + "eval_runtime": 0.5393, + "eval_samples_per_second": 7.418, + "eval_steps_per_second": 1.854, + "step": 516 + }, + { + "epoch": 0.21, + "learning_rate": 2.376e-07, + "loss": 2.0654, + "step": 520 + }, + { + "epoch": 0.21, + "eval_loss": 1.9442145824432373, + "eval_runtime": 0.5372, + "eval_samples_per_second": 7.446, + "eval_steps_per_second": 1.861, + "step": 520 + }, + { + "epoch": 0.21, + "learning_rate": 2.3712e-07, + "loss": 2.1175, + "step": 524 + }, + { + "epoch": 0.21, + "eval_loss": 1.9403698444366455, + "eval_runtime": 0.5129, + "eval_samples_per_second": 7.798, + "eval_steps_per_second": 1.95, + "step": 524 + }, + { + "epoch": 0.21, + "learning_rate": 2.3663999999999998e-07, + "loss": 2.0829, + "step": 528 + }, + { + "epoch": 0.21, + "eval_loss": 1.936263084411621, + "eval_runtime": 0.7202, + "eval_samples_per_second": 5.554, + "eval_steps_per_second": 1.388, + "step": 528 + }, + { + "epoch": 0.21, + "learning_rate": 2.3616e-07, + "loss": 2.0973, + "step": 532 + }, + { + "epoch": 0.21, + "eval_loss": 1.9322115182876587, + "eval_runtime": 0.6884, + "eval_samples_per_second": 5.81, + "eval_steps_per_second": 1.453, + "step": 532 + }, + { + "epoch": 0.21, + "learning_rate": 2.3567999999999998e-07, + "loss": 2.0439, + "step": 536 + }, + { + "epoch": 0.21, + "eval_loss": 1.927826166152954, + "eval_runtime": 0.7779, + "eval_samples_per_second": 5.142, + "eval_steps_per_second": 1.286, + "step": 536 + }, + { + "epoch": 0.22, + "learning_rate": 2.352e-07, + "loss": 2.0791, + "step": 540 + }, + { + "epoch": 0.22, + "eval_loss": 1.923945426940918, + "eval_runtime": 0.7514, + "eval_samples_per_second": 5.323, + "eval_steps_per_second": 1.331, + "step": 540 + }, + { + "epoch": 0.22, + "learning_rate": 2.3471999999999997e-07, + "loss": 2.0988, + "step": 544 + }, + { + "epoch": 0.22, + "eval_loss": 1.9202955961227417, + "eval_runtime": 0.5194, + "eval_samples_per_second": 7.701, + "eval_steps_per_second": 1.925, + "step": 544 + }, + { + "epoch": 0.22, + "learning_rate": 2.3424e-07, + "loss": 2.0179, + "step": 548 + }, + { + "epoch": 0.22, + "eval_loss": 1.916027307510376, + "eval_runtime": 0.5072, + "eval_samples_per_second": 7.887, + "eval_steps_per_second": 1.972, + "step": 548 + }, + { + "epoch": 0.22, + "learning_rate": 2.3376e-07, + "loss": 2.0452, + "step": 552 + }, + { + "epoch": 0.22, + "eval_loss": 1.911855697631836, + "eval_runtime": 0.5112, + "eval_samples_per_second": 7.825, + "eval_steps_per_second": 1.956, + "step": 552 + }, + { + "epoch": 0.22, + "learning_rate": 2.3327999999999998e-07, + "loss": 1.9792, + "step": 556 + }, + { + "epoch": 0.22, + "eval_loss": 1.907868504524231, + "eval_runtime": 0.5368, + "eval_samples_per_second": 7.452, + "eval_steps_per_second": 1.863, + "step": 556 + }, + { + "epoch": 0.22, + "learning_rate": 2.328e-07, + "loss": 1.9862, + "step": 560 + }, + { + "epoch": 0.22, + "eval_loss": 1.9032366275787354, + "eval_runtime": 0.52, + "eval_samples_per_second": 7.692, + "eval_steps_per_second": 1.923, + "step": 560 + }, + { + "epoch": 0.23, + "learning_rate": 2.3231999999999998e-07, + "loss": 2.0176, + "step": 564 + }, + { + "epoch": 0.23, + "eval_loss": 1.8994207382202148, + "eval_runtime": 0.5141, + "eval_samples_per_second": 7.78, + "eval_steps_per_second": 1.945, + "step": 564 + }, + { + "epoch": 0.23, + "learning_rate": 2.3184e-07, + "loss": 2.0066, + "step": 568 + }, + { + "epoch": 0.23, + "eval_loss": 1.8953509330749512, + "eval_runtime": 0.7027, + "eval_samples_per_second": 5.692, + "eval_steps_per_second": 1.423, + "step": 568 + }, + { + "epoch": 0.23, + "learning_rate": 2.3135999999999998e-07, + "loss": 2.0333, + "step": 572 + }, + { + "epoch": 0.23, + "eval_loss": 1.8914432525634766, + "eval_runtime": 0.7279, + "eval_samples_per_second": 5.495, + "eval_steps_per_second": 1.374, + "step": 572 + }, + { + "epoch": 0.23, + "learning_rate": 2.3088e-07, + "loss": 2.0316, + "step": 576 + }, + { + "epoch": 0.23, + "eval_loss": 1.8870800733566284, + "eval_runtime": 0.7212, + "eval_samples_per_second": 5.546, + "eval_steps_per_second": 1.386, + "step": 576 + }, + { + "epoch": 0.23, + "learning_rate": 2.304e-07, + "loss": 2.0114, + "step": 580 + }, + { + "epoch": 0.23, + "eval_loss": 1.8827916383743286, + "eval_runtime": 0.6774, + "eval_samples_per_second": 5.905, + "eval_steps_per_second": 1.476, + "step": 580 + }, + { + "epoch": 0.23, + "learning_rate": 2.2991999999999998e-07, + "loss": 2.0093, + "step": 584 + }, + { + "epoch": 0.23, + "eval_loss": 1.8788678646087646, + "eval_runtime": 0.5185, + "eval_samples_per_second": 7.715, + "eval_steps_per_second": 1.929, + "step": 584 + }, + { + "epoch": 0.24, + "learning_rate": 2.2944e-07, + "loss": 1.9829, + "step": 588 + }, + { + "epoch": 0.24, + "eval_loss": 1.8749186992645264, + "eval_runtime": 0.5091, + "eval_samples_per_second": 7.857, + "eval_steps_per_second": 1.964, + "step": 588 + }, + { + "epoch": 0.24, + "learning_rate": 2.2895999999999998e-07, + "loss": 1.971, + "step": 592 + }, + { + "epoch": 0.24, + "eval_loss": 1.8706499338150024, + "eval_runtime": 0.5204, + "eval_samples_per_second": 7.687, + "eval_steps_per_second": 1.922, + "step": 592 + }, + { + "epoch": 0.24, + "learning_rate": 2.2848000000000002e-07, + "loss": 2.0188, + "step": 596 + }, + { + "epoch": 0.24, + "eval_loss": 1.8667842149734497, + "eval_runtime": 0.5224, + "eval_samples_per_second": 7.657, + "eval_steps_per_second": 1.914, + "step": 596 + }, + { + "epoch": 0.24, + "learning_rate": 2.28e-07, + "loss": 2.0081, + "step": 600 + }, + { + "epoch": 0.24, + "eval_loss": 1.8627525568008423, + "eval_runtime": 0.5196, + "eval_samples_per_second": 7.699, + "eval_steps_per_second": 1.925, + "step": 600 + }, + { + "epoch": 0.24, + "learning_rate": 2.2752e-07, + "loss": 2.0014, + "step": 604 + }, + { + "epoch": 0.24, + "eval_loss": 1.8587167263031006, + "eval_runtime": 0.7373, + "eval_samples_per_second": 5.425, + "eval_steps_per_second": 1.356, + "step": 604 + }, + { + "epoch": 0.24, + "learning_rate": 2.2704e-07, + "loss": 1.9741, + "step": 608 + }, + { + "epoch": 0.24, + "eval_loss": 1.8543612957000732, + "eval_runtime": 0.7492, + "eval_samples_per_second": 5.339, + "eval_steps_per_second": 1.335, + "step": 608 + }, + { + "epoch": 0.24, + "learning_rate": 2.2655999999999999e-07, + "loss": 1.9828, + "step": 612 + }, + { + "epoch": 0.24, + "eval_loss": 1.8504937887191772, + "eval_runtime": 0.7242, + "eval_samples_per_second": 5.524, + "eval_steps_per_second": 1.381, + "step": 612 + }, + { + "epoch": 0.25, + "learning_rate": 2.2608e-07, + "loss": 1.9481, + "step": 616 + }, + { + "epoch": 0.25, + "eval_loss": 1.8463339805603027, + "eval_runtime": 0.6997, + "eval_samples_per_second": 5.716, + "eval_steps_per_second": 1.429, + "step": 616 + }, + { + "epoch": 0.25, + "learning_rate": 2.2559999999999998e-07, + "loss": 1.9584, + "step": 620 + }, + { + "epoch": 0.25, + "eval_loss": 1.8423882722854614, + "eval_runtime": 0.5137, + "eval_samples_per_second": 7.787, + "eval_steps_per_second": 1.947, + "step": 620 + }, + { + "epoch": 0.25, + "learning_rate": 2.2511999999999997e-07, + "loss": 1.9449, + "step": 624 + }, + { + "epoch": 0.25, + "eval_loss": 1.838066577911377, + "eval_runtime": 0.5091, + "eval_samples_per_second": 7.857, + "eval_steps_per_second": 1.964, + "step": 624 + }, + { + "epoch": 0.25, + "learning_rate": 2.2464e-07, + "loss": 1.9753, + "step": 628 + }, + { + "epoch": 0.25, + "eval_loss": 1.8342829942703247, + "eval_runtime": 0.504, + "eval_samples_per_second": 7.936, + "eval_steps_per_second": 1.984, + "step": 628 + }, + { + "epoch": 0.25, + "learning_rate": 2.2416e-07, + "loss": 2.0055, + "step": 632 + }, + { + "epoch": 0.25, + "eval_loss": 1.8300307989120483, + "eval_runtime": 0.5201, + "eval_samples_per_second": 7.691, + "eval_steps_per_second": 1.923, + "step": 632 + }, + { + "epoch": 0.25, + "learning_rate": 2.2368e-07, + "loss": 1.98, + "step": 636 + }, + { + "epoch": 0.25, + "eval_loss": 1.8260575532913208, + "eval_runtime": 0.5267, + "eval_samples_per_second": 7.594, + "eval_steps_per_second": 1.898, + "step": 636 + }, + { + "epoch": 0.26, + "learning_rate": 2.232e-07, + "loss": 1.9757, + "step": 640 + }, + { + "epoch": 0.26, + "eval_loss": 1.8222540616989136, + "eval_runtime": 0.7574, + "eval_samples_per_second": 5.281, + "eval_steps_per_second": 1.32, + "step": 640 + }, + { + "epoch": 0.26, + "learning_rate": 2.2271999999999997e-07, + "loss": 1.9683, + "step": 644 + }, + { + "epoch": 0.26, + "eval_loss": 1.818216323852539, + "eval_runtime": 0.7304, + "eval_samples_per_second": 5.476, + "eval_steps_per_second": 1.369, + "step": 644 + }, + { + "epoch": 0.26, + "learning_rate": 2.2223999999999998e-07, + "loss": 1.926, + "step": 648 + }, + { + "epoch": 0.26, + "eval_loss": 1.8140522241592407, + "eval_runtime": 0.7453, + "eval_samples_per_second": 5.367, + "eval_steps_per_second": 1.342, + "step": 648 + }, + { + "epoch": 0.26, + "learning_rate": 2.2175999999999997e-07, + "loss": 1.9454, + "step": 652 + }, + { + "epoch": 0.26, + "eval_loss": 1.8100805282592773, + "eval_runtime": 0.6536, + "eval_samples_per_second": 6.12, + "eval_steps_per_second": 1.53, + "step": 652 + }, + { + "epoch": 0.26, + "learning_rate": 2.2128e-07, + "loss": 1.9352, + "step": 656 + }, + { + "epoch": 0.26, + "eval_loss": 1.8059089183807373, + "eval_runtime": 0.5193, + "eval_samples_per_second": 7.702, + "eval_steps_per_second": 1.926, + "step": 656 + }, + { + "epoch": 0.26, + "learning_rate": 2.208e-07, + "loss": 1.8816, + "step": 660 + }, + { + "epoch": 0.26, + "eval_loss": 1.8020563125610352, + "eval_runtime": 0.5265, + "eval_samples_per_second": 7.597, + "eval_steps_per_second": 1.899, + "step": 660 + }, + { + "epoch": 0.27, + "learning_rate": 2.2032e-07, + "loss": 1.9182, + "step": 664 + }, + { + "epoch": 0.27, + "eval_loss": 1.7980492115020752, + "eval_runtime": 0.5102, + "eval_samples_per_second": 7.84, + "eval_steps_per_second": 1.96, + "step": 664 + }, + { + "epoch": 0.27, + "learning_rate": 2.1984e-07, + "loss": 1.9659, + "step": 668 + }, + { + "epoch": 0.27, + "eval_loss": 1.7941217422485352, + "eval_runtime": 0.5988, + "eval_samples_per_second": 6.681, + "eval_steps_per_second": 1.67, + "step": 668 + }, + { + "epoch": 0.27, + "learning_rate": 2.1935999999999997e-07, + "loss": 1.8932, + "step": 672 + }, + { + "epoch": 0.27, + "eval_loss": 1.7901490926742554, + "eval_runtime": 0.5339, + "eval_samples_per_second": 7.492, + "eval_steps_per_second": 1.873, + "step": 672 + }, + { + "epoch": 0.27, + "learning_rate": 2.1887999999999999e-07, + "loss": 1.8608, + "step": 676 + }, + { + "epoch": 0.27, + "eval_loss": 1.786109447479248, + "eval_runtime": 0.7219, + "eval_samples_per_second": 5.541, + "eval_steps_per_second": 1.385, + "step": 676 + }, + { + "epoch": 0.27, + "learning_rate": 2.184e-07, + "loss": 1.941, + "step": 680 + }, + { + "epoch": 0.27, + "eval_loss": 1.7824102640151978, + "eval_runtime": 0.7619, + "eval_samples_per_second": 5.25, + "eval_steps_per_second": 1.313, + "step": 680 + }, + { + "epoch": 0.27, + "learning_rate": 2.1792e-07, + "loss": 1.8854, + "step": 684 + }, + { + "epoch": 0.27, + "eval_loss": 1.77846097946167, + "eval_runtime": 0.7601, + "eval_samples_per_second": 5.262, + "eval_steps_per_second": 1.316, + "step": 684 + }, + { + "epoch": 0.28, + "learning_rate": 2.1744e-07, + "loss": 1.8912, + "step": 688 + }, + { + "epoch": 0.28, + "eval_loss": 1.7742952108383179, + "eval_runtime": 0.59, + "eval_samples_per_second": 6.78, + "eval_steps_per_second": 1.695, + "step": 688 + }, + { + "epoch": 0.28, + "learning_rate": 2.1695999999999998e-07, + "loss": 1.8667, + "step": 692 + }, + { + "epoch": 0.28, + "eval_loss": 1.770714521408081, + "eval_runtime": 0.5262, + "eval_samples_per_second": 7.601, + "eval_steps_per_second": 1.9, + "step": 692 + }, + { + "epoch": 0.28, + "learning_rate": 2.1648e-07, + "loss": 1.912, + "step": 696 + }, + { + "epoch": 0.28, + "eval_loss": 1.7666008472442627, + "eval_runtime": 0.5272, + "eval_samples_per_second": 7.587, + "eval_steps_per_second": 1.897, + "step": 696 + }, + { + "epoch": 0.28, + "learning_rate": 2.1599999999999998e-07, + "loss": 1.9009, + "step": 700 + }, + { + "epoch": 0.28, + "eval_loss": 1.7627824544906616, + "eval_runtime": 0.5295, + "eval_samples_per_second": 7.555, + "eval_steps_per_second": 1.889, + "step": 700 + }, + { + "epoch": 0.28, + "learning_rate": 2.1552000000000001e-07, + "loss": 1.906, + "step": 704 + }, + { + "epoch": 0.28, + "eval_loss": 1.75889253616333, + "eval_runtime": 0.5589, + "eval_samples_per_second": 7.157, + "eval_steps_per_second": 1.789, + "step": 704 + }, + { + "epoch": 0.28, + "learning_rate": 2.1504e-07, + "loss": 1.8671, + "step": 708 + }, + { + "epoch": 0.28, + "eval_loss": 1.7549973726272583, + "eval_runtime": 0.687, + "eval_samples_per_second": 5.822, + "eval_steps_per_second": 1.456, + "step": 708 + }, + { + "epoch": 0.28, + "learning_rate": 2.1455999999999998e-07, + "loss": 1.8609, + "step": 712 + }, + { + "epoch": 0.28, + "eval_loss": 1.7507662773132324, + "eval_runtime": 0.7225, + "eval_samples_per_second": 5.537, + "eval_steps_per_second": 1.384, + "step": 712 + }, + { + "epoch": 0.29, + "learning_rate": 2.1408e-07, + "loss": 1.8485, + "step": 716 + }, + { + "epoch": 0.29, + "eval_loss": 1.746917486190796, + "eval_runtime": 0.7954, + "eval_samples_per_second": 5.029, + "eval_steps_per_second": 1.257, + "step": 716 + }, + { + "epoch": 0.29, + "learning_rate": 2.1359999999999998e-07, + "loss": 1.8334, + "step": 720 + }, + { + "epoch": 0.29, + "eval_loss": 1.7430514097213745, + "eval_runtime": 0.7433, + "eval_samples_per_second": 5.381, + "eval_steps_per_second": 1.345, + "step": 720 + }, + { + "epoch": 0.29, + "learning_rate": 2.1312e-07, + "loss": 1.8763, + "step": 724 + }, + { + "epoch": 0.29, + "eval_loss": 1.7392196655273438, + "eval_runtime": 0.5237, + "eval_samples_per_second": 7.638, + "eval_steps_per_second": 1.91, + "step": 724 + }, + { + "epoch": 0.29, + "learning_rate": 2.1263999999999998e-07, + "loss": 1.9005, + "step": 728 + }, + { + "epoch": 0.29, + "eval_loss": 1.7355214357376099, + "eval_runtime": 0.524, + "eval_samples_per_second": 7.634, + "eval_steps_per_second": 1.908, + "step": 728 + }, + { + "epoch": 0.29, + "learning_rate": 2.1216000000000002e-07, + "loss": 1.8669, + "step": 732 + }, + { + "epoch": 0.29, + "eval_loss": 1.731513261795044, + "eval_runtime": 0.5593, + "eval_samples_per_second": 7.152, + "eval_steps_per_second": 1.788, + "step": 732 + }, + { + "epoch": 0.29, + "learning_rate": 2.1168e-07, + "loss": 1.8984, + "step": 736 + }, + { + "epoch": 0.29, + "eval_loss": 1.727636694908142, + "eval_runtime": 0.5241, + "eval_samples_per_second": 7.632, + "eval_steps_per_second": 1.908, + "step": 736 + }, + { + "epoch": 0.3, + "learning_rate": 2.1119999999999999e-07, + "loss": 1.8074, + "step": 740 + }, + { + "epoch": 0.3, + "eval_loss": 1.7240556478500366, + "eval_runtime": 0.715, + "eval_samples_per_second": 5.594, + "eval_steps_per_second": 1.399, + "step": 740 + }, + { + "epoch": 0.3, + "learning_rate": 2.1072e-07, + "loss": 1.8614, + "step": 744 + }, + { + "epoch": 0.3, + "eval_loss": 1.7201639413833618, + "eval_runtime": 0.7611, + "eval_samples_per_second": 5.256, + "eval_steps_per_second": 1.314, + "step": 744 + }, + { + "epoch": 0.3, + "learning_rate": 2.1023999999999998e-07, + "loss": 1.8211, + "step": 748 + }, + { + "epoch": 0.3, + "eval_loss": 1.7165008783340454, + "eval_runtime": 0.7193, + "eval_samples_per_second": 5.561, + "eval_steps_per_second": 1.39, + "step": 748 + }, + { + "epoch": 0.3, + "learning_rate": 2.0976e-07, + "loss": 1.8553, + "step": 752 + }, + { + "epoch": 0.3, + "eval_loss": 1.7123990058898926, + "eval_runtime": 0.5463, + "eval_samples_per_second": 7.323, + "eval_steps_per_second": 1.831, + "step": 752 + }, + { + "epoch": 0.3, + "learning_rate": 2.0927999999999998e-07, + "loss": 1.7978, + "step": 756 + }, + { + "epoch": 0.3, + "eval_loss": 1.7084720134735107, + "eval_runtime": 0.574, + "eval_samples_per_second": 6.968, + "eval_steps_per_second": 1.742, + "step": 756 + }, + { + "epoch": 0.3, + "learning_rate": 2.0879999999999996e-07, + "loss": 1.8203, + "step": 760 + }, + { + "epoch": 0.3, + "eval_loss": 1.7048146724700928, + "eval_runtime": 0.5838, + "eval_samples_per_second": 6.852, + "eval_steps_per_second": 1.713, + "step": 760 + }, + { + "epoch": 0.31, + "learning_rate": 2.0832e-07, + "loss": 1.8192, + "step": 764 + }, + { + "epoch": 0.31, + "eval_loss": 1.7010469436645508, + "eval_runtime": 0.5225, + "eval_samples_per_second": 7.656, + "eval_steps_per_second": 1.914, + "step": 764 + }, + { + "epoch": 0.31, + "learning_rate": 2.0784e-07, + "loss": 1.8532, + "step": 768 + }, + { + "epoch": 0.31, + "eval_loss": 1.6973625421524048, + "eval_runtime": 0.525, + "eval_samples_per_second": 7.619, + "eval_steps_per_second": 1.905, + "step": 768 + }, + { + "epoch": 0.31, + "learning_rate": 2.0736e-07, + "loss": 1.8307, + "step": 772 + }, + { + "epoch": 0.31, + "eval_loss": 1.6935136318206787, + "eval_runtime": 0.7235, + "eval_samples_per_second": 5.528, + "eval_steps_per_second": 1.382, + "step": 772 + }, + { + "epoch": 0.31, + "learning_rate": 2.0687999999999998e-07, + "loss": 1.8207, + "step": 776 + }, + { + "epoch": 0.31, + "eval_loss": 1.6895670890808105, + "eval_runtime": 0.8289, + "eval_samples_per_second": 4.826, + "eval_steps_per_second": 1.206, + "step": 776 + }, + { + "epoch": 0.31, + "learning_rate": 2.0639999999999997e-07, + "loss": 1.7895, + "step": 780 + }, + { + "epoch": 0.31, + "eval_loss": 1.6858075857162476, + "eval_runtime": 0.7778, + "eval_samples_per_second": 5.143, + "eval_steps_per_second": 1.286, + "step": 780 + }, + { + "epoch": 0.31, + "learning_rate": 2.0592e-07, + "loss": 1.7976, + "step": 784 + }, + { + "epoch": 0.31, + "eval_loss": 1.6820955276489258, + "eval_runtime": 0.5265, + "eval_samples_per_second": 7.597, + "eval_steps_per_second": 1.899, + "step": 784 + }, + { + "epoch": 0.32, + "learning_rate": 2.0544e-07, + "loss": 1.814, + "step": 788 + }, + { + "epoch": 0.32, + "eval_loss": 1.6785138845443726, + "eval_runtime": 0.5179, + "eval_samples_per_second": 7.724, + "eval_steps_per_second": 1.931, + "step": 788 + }, + { + "epoch": 0.32, + "learning_rate": 2.0496e-07, + "loss": 1.7972, + "step": 792 + }, + { + "epoch": 0.32, + "eval_loss": 1.674804449081421, + "eval_runtime": 0.5304, + "eval_samples_per_second": 7.541, + "eval_steps_per_second": 1.885, + "step": 792 + }, + { + "epoch": 0.32, + "learning_rate": 2.0448e-07, + "loss": 1.8258, + "step": 796 + }, + { + "epoch": 0.32, + "eval_loss": 1.6713837385177612, + "eval_runtime": 0.5336, + "eval_samples_per_second": 7.496, + "eval_steps_per_second": 1.874, + "step": 796 + }, + { + "epoch": 0.32, + "learning_rate": 2.04e-07, + "loss": 1.79, + "step": 800 + }, + { + "epoch": 0.32, + "eval_loss": 1.667376160621643, + "eval_runtime": 0.7608, + "eval_samples_per_second": 5.258, + "eval_steps_per_second": 1.314, + "step": 800 + }, + { + "epoch": 0.32, + "learning_rate": 2.0351999999999999e-07, + "loss": 1.802, + "step": 804 + }, + { + "epoch": 0.32, + "eval_loss": 1.6640408039093018, + "eval_runtime": 0.7498, + "eval_samples_per_second": 5.335, + "eval_steps_per_second": 1.334, + "step": 804 + }, + { + "epoch": 0.32, + "learning_rate": 2.0303999999999997e-07, + "loss": 1.7784, + "step": 808 + }, + { + "epoch": 0.32, + "eval_loss": 1.6603385210037231, + "eval_runtime": 0.7501, + "eval_samples_per_second": 5.333, + "eval_steps_per_second": 1.333, + "step": 808 + }, + { + "epoch": 0.32, + "learning_rate": 2.0256e-07, + "loss": 1.7671, + "step": 812 + }, + { + "epoch": 0.32, + "eval_loss": 1.6568516492843628, + "eval_runtime": 0.5206, + "eval_samples_per_second": 7.684, + "eval_steps_per_second": 1.921, + "step": 812 + }, + { + "epoch": 0.33, + "learning_rate": 2.0208e-07, + "loss": 1.7618, + "step": 816 + }, + { + "epoch": 0.33, + "eval_loss": 1.653469443321228, + "eval_runtime": 0.5354, + "eval_samples_per_second": 7.472, + "eval_steps_per_second": 1.868, + "step": 816 + }, + { + "epoch": 0.33, + "learning_rate": 2.016e-07, + "loss": 1.8207, + "step": 820 + }, + { + "epoch": 0.33, + "eval_loss": 1.6502578258514404, + "eval_runtime": 0.523, + "eval_samples_per_second": 7.648, + "eval_steps_per_second": 1.912, + "step": 820 + }, + { + "epoch": 0.33, + "learning_rate": 2.0112e-07, + "loss": 1.7837, + "step": 824 + }, + { + "epoch": 0.33, + "eval_loss": 1.6467454433441162, + "eval_runtime": 0.5297, + "eval_samples_per_second": 7.552, + "eval_steps_per_second": 1.888, + "step": 824 + }, + { + "epoch": 0.33, + "learning_rate": 2.0063999999999998e-07, + "loss": 1.8066, + "step": 828 + }, + { + "epoch": 0.33, + "eval_loss": 1.6439214944839478, + "eval_runtime": 0.522, + "eval_samples_per_second": 7.663, + "eval_steps_per_second": 1.916, + "step": 828 + }, + { + "epoch": 0.33, + "learning_rate": 2.0016e-07, + "loss": 1.7814, + "step": 832 + }, + { + "epoch": 0.33, + "eval_loss": 1.6407381296157837, + "eval_runtime": 0.5382, + "eval_samples_per_second": 7.432, + "eval_steps_per_second": 1.858, + "step": 832 + }, + { + "epoch": 0.33, + "learning_rate": 1.9967999999999997e-07, + "loss": 1.7244, + "step": 836 + }, + { + "epoch": 0.33, + "eval_loss": 1.6372514963150024, + "eval_runtime": 0.7157, + "eval_samples_per_second": 5.589, + "eval_steps_per_second": 1.397, + "step": 836 + }, + { + "epoch": 0.34, + "learning_rate": 1.992e-07, + "loss": 1.7195, + "step": 840 + }, + { + "epoch": 0.34, + "eval_loss": 1.634232997894287, + "eval_runtime": 0.7254, + "eval_samples_per_second": 5.514, + "eval_steps_per_second": 1.379, + "step": 840 + }, + { + "epoch": 0.34, + "learning_rate": 1.9872e-07, + "loss": 1.7524, + "step": 844 + }, + { + "epoch": 0.34, + "eval_loss": 1.6310441493988037, + "eval_runtime": 0.7839, + "eval_samples_per_second": 5.103, + "eval_steps_per_second": 1.276, + "step": 844 + }, + { + "epoch": 0.34, + "learning_rate": 1.9824e-07, + "loss": 1.7644, + "step": 848 + }, + { + "epoch": 0.34, + "eval_loss": 1.6279191970825195, + "eval_runtime": 0.5253, + "eval_samples_per_second": 7.615, + "eval_steps_per_second": 1.904, + "step": 848 + }, + { + "epoch": 0.34, + "learning_rate": 1.9776e-07, + "loss": 1.7171, + "step": 852 + }, + { + "epoch": 0.34, + "eval_loss": 1.6244579553604126, + "eval_runtime": 0.5359, + "eval_samples_per_second": 7.464, + "eval_steps_per_second": 1.866, + "step": 852 + }, + { + "epoch": 0.34, + "learning_rate": 1.9727999999999998e-07, + "loss": 1.7418, + "step": 856 + }, + { + "epoch": 0.34, + "eval_loss": 1.6212078332901, + "eval_runtime": 0.5379, + "eval_samples_per_second": 7.436, + "eval_steps_per_second": 1.859, + "step": 856 + }, + { + "epoch": 0.34, + "learning_rate": 1.968e-07, + "loss": 1.7337, + "step": 860 + }, + { + "epoch": 0.34, + "eval_loss": 1.6180227994918823, + "eval_runtime": 0.5259, + "eval_samples_per_second": 7.606, + "eval_steps_per_second": 1.902, + "step": 860 + }, + { + "epoch": 0.35, + "learning_rate": 1.9631999999999997e-07, + "loss": 1.7441, + "step": 864 + }, + { + "epoch": 0.35, + "eval_loss": 1.61477530002594, + "eval_runtime": 0.5216, + "eval_samples_per_second": 7.669, + "eval_steps_per_second": 1.917, + "step": 864 + }, + { + "epoch": 0.35, + "learning_rate": 1.9584e-07, + "loss": 1.694, + "step": 868 + }, + { + "epoch": 0.35, + "eval_loss": 1.611538052558899, + "eval_runtime": 0.6803, + "eval_samples_per_second": 5.88, + "eval_steps_per_second": 1.47, + "step": 868 + }, + { + "epoch": 0.35, + "learning_rate": 1.9536e-07, + "loss": 1.7601, + "step": 872 + }, + { + "epoch": 0.35, + "eval_loss": 1.6083098649978638, + "eval_runtime": 0.716, + "eval_samples_per_second": 5.586, + "eval_steps_per_second": 1.397, + "step": 872 + }, + { + "epoch": 0.35, + "learning_rate": 1.9487999999999998e-07, + "loss": 1.7081, + "step": 876 + }, + { + "epoch": 0.35, + "eval_loss": 1.6050214767456055, + "eval_runtime": 0.7622, + "eval_samples_per_second": 5.248, + "eval_steps_per_second": 1.312, + "step": 876 + }, + { + "epoch": 0.35, + "learning_rate": 1.944e-07, + "loss": 1.7101, + "step": 880 + }, + { + "epoch": 0.35, + "eval_loss": 1.6019953489303589, + "eval_runtime": 0.7766, + "eval_samples_per_second": 5.151, + "eval_steps_per_second": 1.288, + "step": 880 + }, + { + "epoch": 0.35, + "learning_rate": 1.9391999999999998e-07, + "loss": 1.7271, + "step": 884 + }, + { + "epoch": 0.35, + "eval_loss": 1.5990221500396729, + "eval_runtime": 0.5153, + "eval_samples_per_second": 7.763, + "eval_steps_per_second": 1.941, + "step": 884 + }, + { + "epoch": 0.36, + "learning_rate": 1.9344e-07, + "loss": 1.7402, + "step": 888 + }, + { + "epoch": 0.36, + "eval_loss": 1.5954092741012573, + "eval_runtime": 0.5168, + "eval_samples_per_second": 7.74, + "eval_steps_per_second": 1.935, + "step": 888 + }, + { + "epoch": 0.36, + "learning_rate": 1.9296e-07, + "loss": 1.7125, + "step": 892 + }, + { + "epoch": 0.36, + "eval_loss": 1.5921534299850464, + "eval_runtime": 0.5424, + "eval_samples_per_second": 7.375, + "eval_steps_per_second": 1.844, + "step": 892 + }, + { + "epoch": 0.36, + "learning_rate": 1.9248e-07, + "loss": 1.6949, + "step": 896 + }, + { + "epoch": 0.36, + "eval_loss": 1.5888370275497437, + "eval_runtime": 0.5307, + "eval_samples_per_second": 7.537, + "eval_steps_per_second": 1.884, + "step": 896 + }, + { + "epoch": 0.36, + "learning_rate": 1.92e-07, + "loss": 1.7145, + "step": 900 + }, + { + "epoch": 0.36, + "eval_loss": 1.5858186483383179, + "eval_runtime": 0.511, + "eval_samples_per_second": 7.828, + "eval_steps_per_second": 1.957, + "step": 900 + }, + { + "epoch": 0.36, + "learning_rate": 1.9151999999999998e-07, + "loss": 1.6665, + "step": 904 + }, + { + "epoch": 0.36, + "eval_loss": 1.5824443101882935, + "eval_runtime": 0.6907, + "eval_samples_per_second": 5.791, + "eval_steps_per_second": 1.448, + "step": 904 + }, + { + "epoch": 0.36, + "learning_rate": 1.9104e-07, + "loss": 1.6929, + "step": 908 + }, + { + "epoch": 0.36, + "eval_loss": 1.5796196460723877, + "eval_runtime": 0.7487, + "eval_samples_per_second": 5.342, + "eval_steps_per_second": 1.336, + "step": 908 + }, + { + "epoch": 0.36, + "learning_rate": 1.9055999999999998e-07, + "loss": 1.7068, + "step": 912 + }, + { + "epoch": 0.36, + "eval_loss": 1.5765777826309204, + "eval_runtime": 0.7477, + "eval_samples_per_second": 5.35, + "eval_steps_per_second": 1.337, + "step": 912 + }, + { + "epoch": 0.37, + "learning_rate": 1.9008000000000002e-07, + "loss": 1.6877, + "step": 916 + }, + { + "epoch": 0.37, + "eval_loss": 1.57340669631958, + "eval_runtime": 0.753, + "eval_samples_per_second": 5.312, + "eval_steps_per_second": 1.328, + "step": 916 + }, + { + "epoch": 0.37, + "learning_rate": 1.896e-07, + "loss": 1.6718, + "step": 920 + }, + { + "epoch": 0.37, + "eval_loss": 1.5706267356872559, + "eval_runtime": 0.514, + "eval_samples_per_second": 7.782, + "eval_steps_per_second": 1.945, + "step": 920 + }, + { + "epoch": 0.37, + "learning_rate": 1.8912e-07, + "loss": 1.6886, + "step": 924 + }, + { + "epoch": 0.37, + "eval_loss": 1.5676339864730835, + "eval_runtime": 0.5222, + "eval_samples_per_second": 7.66, + "eval_steps_per_second": 1.915, + "step": 924 + }, + { + "epoch": 0.37, + "learning_rate": 1.8864e-07, + "loss": 1.7459, + "step": 928 + }, + { + "epoch": 0.37, + "eval_loss": 1.5645827054977417, + "eval_runtime": 0.5299, + "eval_samples_per_second": 7.548, + "eval_steps_per_second": 1.887, + "step": 928 + }, + { + "epoch": 0.37, + "learning_rate": 1.8815999999999999e-07, + "loss": 1.6596, + "step": 932 + }, + { + "epoch": 0.37, + "eval_loss": 1.5616861581802368, + "eval_runtime": 0.5303, + "eval_samples_per_second": 7.543, + "eval_steps_per_second": 1.886, + "step": 932 + }, + { + "epoch": 0.37, + "learning_rate": 1.8768e-07, + "loss": 1.6689, + "step": 936 + }, + { + "epoch": 0.37, + "eval_loss": 1.5588451623916626, + "eval_runtime": 0.5236, + "eval_samples_per_second": 7.639, + "eval_steps_per_second": 1.91, + "step": 936 + }, + { + "epoch": 0.38, + "learning_rate": 1.8719999999999998e-07, + "loss": 1.6744, + "step": 940 + }, + { + "epoch": 0.38, + "eval_loss": 1.5560673475265503, + "eval_runtime": 0.7233, + "eval_samples_per_second": 5.53, + "eval_steps_per_second": 1.383, + "step": 940 + }, + { + "epoch": 0.38, + "learning_rate": 1.8671999999999997e-07, + "loss": 1.7009, + "step": 944 + }, + { + "epoch": 0.38, + "eval_loss": 1.5533243417739868, + "eval_runtime": 0.6983, + "eval_samples_per_second": 5.728, + "eval_steps_per_second": 1.432, + "step": 944 + }, + { + "epoch": 0.38, + "learning_rate": 1.8624e-07, + "loss": 1.6651, + "step": 948 + }, + { + "epoch": 0.38, + "eval_loss": 1.55048668384552, + "eval_runtime": 0.7511, + "eval_samples_per_second": 5.325, + "eval_steps_per_second": 1.331, + "step": 948 + }, + { + "epoch": 0.38, + "learning_rate": 1.8576e-07, + "loss": 1.6821, + "step": 952 + }, + { + "epoch": 0.38, + "eval_loss": 1.547943353652954, + "eval_runtime": 0.532, + "eval_samples_per_second": 7.519, + "eval_steps_per_second": 1.88, + "step": 952 + }, + { + "epoch": 0.38, + "learning_rate": 1.8528e-07, + "loss": 1.6453, + "step": 956 + }, + { + "epoch": 0.38, + "eval_loss": 1.5453405380249023, + "eval_runtime": 0.5463, + "eval_samples_per_second": 7.322, + "eval_steps_per_second": 1.831, + "step": 956 + }, + { + "epoch": 0.38, + "learning_rate": 1.848e-07, + "loss": 1.6624, + "step": 960 + }, + { + "epoch": 0.38, + "eval_loss": 1.542648196220398, + "eval_runtime": 0.5288, + "eval_samples_per_second": 7.564, + "eval_steps_per_second": 1.891, + "step": 960 + }, + { + "epoch": 0.39, + "learning_rate": 1.8431999999999997e-07, + "loss": 1.6453, + "step": 964 + }, + { + "epoch": 0.39, + "eval_loss": 1.5402462482452393, + "eval_runtime": 0.5242, + "eval_samples_per_second": 7.63, + "eval_steps_per_second": 1.908, + "step": 964 + }, + { + "epoch": 0.39, + "learning_rate": 1.8383999999999998e-07, + "loss": 1.6451, + "step": 968 + }, + { + "epoch": 0.39, + "eval_loss": 1.5377165079116821, + "eval_runtime": 0.5169, + "eval_samples_per_second": 7.738, + "eval_steps_per_second": 1.935, + "step": 968 + }, + { + "epoch": 0.39, + "learning_rate": 1.8335999999999997e-07, + "loss": 1.6627, + "step": 972 + }, + { + "epoch": 0.39, + "eval_loss": 1.5353412628173828, + "eval_runtime": 0.6797, + "eval_samples_per_second": 5.885, + "eval_steps_per_second": 1.471, + "step": 972 + }, + { + "epoch": 0.39, + "learning_rate": 1.8288e-07, + "loss": 1.6423, + "step": 976 + }, + { + "epoch": 0.39, + "eval_loss": 1.5325669050216675, + "eval_runtime": 0.7175, + "eval_samples_per_second": 5.575, + "eval_steps_per_second": 1.394, + "step": 976 + }, + { + "epoch": 0.39, + "learning_rate": 1.824e-07, + "loss": 1.652, + "step": 980 + }, + { + "epoch": 0.39, + "eval_loss": 1.530207872390747, + "eval_runtime": 0.8099, + "eval_samples_per_second": 4.939, + "eval_steps_per_second": 1.235, + "step": 980 + }, + { + "epoch": 0.39, + "learning_rate": 1.8192e-07, + "loss": 1.6414, + "step": 984 + }, + { + "epoch": 0.39, + "eval_loss": 1.5278236865997314, + "eval_runtime": 0.7814, + "eval_samples_per_second": 5.119, + "eval_steps_per_second": 1.28, + "step": 984 + }, + { + "epoch": 0.4, + "learning_rate": 1.8144e-07, + "loss": 1.6107, + "step": 988 + }, + { + "epoch": 0.4, + "eval_loss": 1.5253430604934692, + "eval_runtime": 0.5386, + "eval_samples_per_second": 7.427, + "eval_steps_per_second": 1.857, + "step": 988 + }, + { + "epoch": 0.4, + "learning_rate": 1.8095999999999997e-07, + "loss": 1.6599, + "step": 992 + }, + { + "epoch": 0.4, + "eval_loss": 1.5225120782852173, + "eval_runtime": 0.5302, + "eval_samples_per_second": 7.544, + "eval_steps_per_second": 1.886, + "step": 992 + }, + { + "epoch": 0.4, + "learning_rate": 1.8048e-07, + "loss": 1.6326, + "step": 996 + }, + { + "epoch": 0.4, + "eval_loss": 1.5201939344406128, + "eval_runtime": 0.533, + "eval_samples_per_second": 7.505, + "eval_steps_per_second": 1.876, + "step": 996 + }, + { + "epoch": 0.4, + "learning_rate": 1.8e-07, + "loss": 1.6324, + "step": 1000 + }, + { + "epoch": 0.4, + "eval_loss": 1.5175316333770752, + "eval_runtime": 0.5316, + "eval_samples_per_second": 7.525, + "eval_steps_per_second": 1.881, + "step": 1000 + }, + { + "epoch": 0.4, + "learning_rate": 1.7952e-07, + "loss": 1.5907, + "step": 1004 + }, + { + "epoch": 0.4, + "eval_loss": 1.5149424076080322, + "eval_runtime": 0.7298, + "eval_samples_per_second": 5.481, + "eval_steps_per_second": 1.37, + "step": 1004 + }, + { + "epoch": 0.4, + "learning_rate": 1.7904e-07, + "loss": 1.6465, + "step": 1008 + }, + { + "epoch": 0.4, + "eval_loss": 1.5124318599700928, + "eval_runtime": 0.7308, + "eval_samples_per_second": 5.473, + "eval_steps_per_second": 1.368, + "step": 1008 + }, + { + "epoch": 0.4, + "learning_rate": 1.7855999999999998e-07, + "loss": 1.6148, + "step": 1012 + }, + { + "epoch": 0.4, + "eval_loss": 1.510151743888855, + "eval_runtime": 0.7345, + "eval_samples_per_second": 5.446, + "eval_steps_per_second": 1.361, + "step": 1012 + }, + { + "epoch": 0.41, + "learning_rate": 1.7808e-07, + "loss": 1.6064, + "step": 1016 + }, + { + "epoch": 0.41, + "eval_loss": 1.5073630809783936, + "eval_runtime": 0.5414, + "eval_samples_per_second": 7.388, + "eval_steps_per_second": 1.847, + "step": 1016 + }, + { + "epoch": 0.41, + "learning_rate": 1.7759999999999998e-07, + "loss": 1.6342, + "step": 1020 + }, + { + "epoch": 0.41, + "eval_loss": 1.5052520036697388, + "eval_runtime": 0.516, + "eval_samples_per_second": 7.751, + "eval_steps_per_second": 1.938, + "step": 1020 + }, + { + "epoch": 0.41, + "learning_rate": 1.7712000000000001e-07, + "loss": 1.605, + "step": 1024 + }, + { + "epoch": 0.41, + "eval_loss": 1.5025243759155273, + "eval_runtime": 0.5373, + "eval_samples_per_second": 7.445, + "eval_steps_per_second": 1.861, + "step": 1024 + }, + { + "epoch": 0.41, + "learning_rate": 1.7664e-07, + "loss": 1.6121, + "step": 1028 + }, + { + "epoch": 0.41, + "eval_loss": 1.500252604484558, + "eval_runtime": 0.5476, + "eval_samples_per_second": 7.304, + "eval_steps_per_second": 1.826, + "step": 1028 + }, + { + "epoch": 0.41, + "learning_rate": 1.7616e-07, + "loss": 1.617, + "step": 1032 + }, + { + "epoch": 0.41, + "eval_loss": 1.4977892637252808, + "eval_runtime": 0.5255, + "eval_samples_per_second": 7.612, + "eval_steps_per_second": 1.903, + "step": 1032 + }, + { + "epoch": 0.41, + "learning_rate": 1.7568e-07, + "loss": 1.5897, + "step": 1036 + }, + { + "epoch": 0.41, + "eval_loss": 1.4954513311386108, + "eval_runtime": 0.7255, + "eval_samples_per_second": 5.513, + "eval_steps_per_second": 1.378, + "step": 1036 + }, + { + "epoch": 0.42, + "learning_rate": 1.7519999999999998e-07, + "loss": 1.6022, + "step": 1040 + }, + { + "epoch": 0.42, + "eval_loss": 1.4929691553115845, + "eval_runtime": 0.6954, + "eval_samples_per_second": 5.752, + "eval_steps_per_second": 1.438, + "step": 1040 + }, + { + "epoch": 0.42, + "learning_rate": 1.7472e-07, + "loss": 1.5748, + "step": 1044 + }, + { + "epoch": 0.42, + "eval_loss": 1.4902769327163696, + "eval_runtime": 0.8026, + "eval_samples_per_second": 4.984, + "eval_steps_per_second": 1.246, + "step": 1044 + }, + { + "epoch": 0.42, + "learning_rate": 1.7423999999999998e-07, + "loss": 1.5974, + "step": 1048 + }, + { + "epoch": 0.42, + "eval_loss": 1.4878779649734497, + "eval_runtime": 0.7804, + "eval_samples_per_second": 5.125, + "eval_steps_per_second": 1.281, + "step": 1048 + }, + { + "epoch": 0.42, + "learning_rate": 1.7376000000000002e-07, + "loss": 1.6126, + "step": 1052 + }, + { + "epoch": 0.42, + "eval_loss": 1.48554527759552, + "eval_runtime": 0.5423, + "eval_samples_per_second": 7.376, + "eval_steps_per_second": 1.844, + "step": 1052 + }, + { + "epoch": 0.42, + "learning_rate": 1.7328e-07, + "loss": 1.6189, + "step": 1056 + }, + { + "epoch": 0.42, + "eval_loss": 1.4827589988708496, + "eval_runtime": 0.5326, + "eval_samples_per_second": 7.511, + "eval_steps_per_second": 1.878, + "step": 1056 + }, + { + "epoch": 0.42, + "learning_rate": 1.7279999999999999e-07, + "loss": 1.5916, + "step": 1060 + }, + { + "epoch": 0.42, + "eval_loss": 1.4803836345672607, + "eval_runtime": 0.5273, + "eval_samples_per_second": 7.585, + "eval_steps_per_second": 1.896, + "step": 1060 + }, + { + "epoch": 0.43, + "learning_rate": 1.7232e-07, + "loss": 1.5938, + "step": 1064 + }, + { + "epoch": 0.43, + "eval_loss": 1.4778516292572021, + "eval_runtime": 0.5436, + "eval_samples_per_second": 7.358, + "eval_steps_per_second": 1.839, + "step": 1064 + }, + { + "epoch": 0.43, + "learning_rate": 1.7183999999999998e-07, + "loss": 1.6026, + "step": 1068 + }, + { + "epoch": 0.43, + "eval_loss": 1.475649118423462, + "eval_runtime": 0.5298, + "eval_samples_per_second": 7.549, + "eval_steps_per_second": 1.887, + "step": 1068 + }, + { + "epoch": 0.43, + "learning_rate": 1.7136e-07, + "loss": 1.5687, + "step": 1072 + }, + { + "epoch": 0.43, + "eval_loss": 1.473489761352539, + "eval_runtime": 0.7191, + "eval_samples_per_second": 5.562, + "eval_steps_per_second": 1.391, + "step": 1072 + }, + { + "epoch": 0.43, + "learning_rate": 1.7087999999999998e-07, + "loss": 1.5413, + "step": 1076 + }, + { + "epoch": 0.43, + "eval_loss": 1.4712145328521729, + "eval_runtime": 0.7022, + "eval_samples_per_second": 5.696, + "eval_steps_per_second": 1.424, + "step": 1076 + }, + { + "epoch": 0.43, + "learning_rate": 1.7039999999999996e-07, + "loss": 1.5778, + "step": 1080 + }, + { + "epoch": 0.43, + "eval_loss": 1.4688694477081299, + "eval_runtime": 0.7707, + "eval_samples_per_second": 5.19, + "eval_steps_per_second": 1.298, + "step": 1080 + }, + { + "epoch": 0.43, + "learning_rate": 1.6992e-07, + "loss": 1.5731, + "step": 1084 + }, + { + "epoch": 0.43, + "eval_loss": 1.4664225578308105, + "eval_runtime": 0.5386, + "eval_samples_per_second": 7.427, + "eval_steps_per_second": 1.857, + "step": 1084 + }, + { + "epoch": 0.44, + "learning_rate": 1.6944e-07, + "loss": 1.5625, + "step": 1088 + }, + { + "epoch": 0.44, + "eval_loss": 1.464247465133667, + "eval_runtime": 0.5484, + "eval_samples_per_second": 7.294, + "eval_steps_per_second": 1.823, + "step": 1088 + }, + { + "epoch": 0.44, + "learning_rate": 1.6896e-07, + "loss": 1.55, + "step": 1092 + }, + { + "epoch": 0.44, + "eval_loss": 1.4620987176895142, + "eval_runtime": 0.5342, + "eval_samples_per_second": 7.488, + "eval_steps_per_second": 1.872, + "step": 1092 + }, + { + "epoch": 0.44, + "learning_rate": 1.6847999999999998e-07, + "loss": 1.5852, + "step": 1096 + }, + { + "epoch": 0.44, + "eval_loss": 1.459930419921875, + "eval_runtime": 0.5332, + "eval_samples_per_second": 7.501, + "eval_steps_per_second": 1.875, + "step": 1096 + }, + { + "epoch": 0.44, + "learning_rate": 1.68e-07, + "loss": 1.5614, + "step": 1100 + }, + { + "epoch": 0.44, + "eval_loss": 1.4578797817230225, + "eval_runtime": 0.5504, + "eval_samples_per_second": 7.268, + "eval_steps_per_second": 1.817, + "step": 1100 + }, + { + "epoch": 0.44, + "learning_rate": 1.6752e-07, + "loss": 1.5619, + "step": 1104 + }, + { + "epoch": 0.44, + "eval_loss": 1.4559952020645142, + "eval_runtime": 0.7448, + "eval_samples_per_second": 5.37, + "eval_steps_per_second": 1.343, + "step": 1104 + }, + { + "epoch": 0.44, + "learning_rate": 1.6704e-07, + "loss": 1.5658, + "step": 1108 + }, + { + "epoch": 0.44, + "eval_loss": 1.454249382019043, + "eval_runtime": 0.773, + "eval_samples_per_second": 5.174, + "eval_steps_per_second": 1.294, + "step": 1108 + }, + { + "epoch": 0.44, + "learning_rate": 1.6656e-07, + "loss": 1.5699, + "step": 1112 + }, + { + "epoch": 0.44, + "eval_loss": 1.4521348476409912, + "eval_runtime": 0.8287, + "eval_samples_per_second": 4.827, + "eval_steps_per_second": 1.207, + "step": 1112 + }, + { + "epoch": 0.45, + "learning_rate": 1.6608e-07, + "loss": 1.5738, + "step": 1116 + }, + { + "epoch": 0.45, + "eval_loss": 1.450175404548645, + "eval_runtime": 0.5398, + "eval_samples_per_second": 7.41, + "eval_steps_per_second": 1.852, + "step": 1116 + }, + { + "epoch": 0.45, + "learning_rate": 1.656e-07, + "loss": 1.5823, + "step": 1120 + }, + { + "epoch": 0.45, + "eval_loss": 1.4481428861618042, + "eval_runtime": 0.5592, + "eval_samples_per_second": 7.153, + "eval_steps_per_second": 1.788, + "step": 1120 + }, + { + "epoch": 0.45, + "learning_rate": 1.6511999999999999e-07, + "loss": 1.5425, + "step": 1124 + }, + { + "epoch": 0.45, + "eval_loss": 1.4458932876586914, + "eval_runtime": 0.5511, + "eval_samples_per_second": 7.259, + "eval_steps_per_second": 1.815, + "step": 1124 + }, + { + "epoch": 0.45, + "learning_rate": 1.6463999999999997e-07, + "loss": 1.5604, + "step": 1128 + }, + { + "epoch": 0.45, + "eval_loss": 1.4438304901123047, + "eval_runtime": 0.5355, + "eval_samples_per_second": 7.47, + "eval_steps_per_second": 1.867, + "step": 1128 + }, + { + "epoch": 0.45, + "learning_rate": 1.6416e-07, + "loss": 1.5562, + "step": 1132 + }, + { + "epoch": 0.45, + "eval_loss": 1.442002773284912, + "eval_runtime": 0.5332, + "eval_samples_per_second": 7.502, + "eval_steps_per_second": 1.876, + "step": 1132 + }, + { + "epoch": 0.45, + "learning_rate": 1.6368e-07, + "loss": 1.555, + "step": 1136 + }, + { + "epoch": 0.45, + "eval_loss": 1.4399393796920776, + "eval_runtime": 0.7524, + "eval_samples_per_second": 5.316, + "eval_steps_per_second": 1.329, + "step": 1136 + }, + { + "epoch": 0.46, + "learning_rate": 1.632e-07, + "loss": 1.5158, + "step": 1140 + }, + { + "epoch": 0.46, + "eval_loss": 1.437983512878418, + "eval_runtime": 0.7269, + "eval_samples_per_second": 5.503, + "eval_steps_per_second": 1.376, + "step": 1140 + }, + { + "epoch": 0.46, + "learning_rate": 1.6272e-07, + "loss": 1.5272, + "step": 1144 + }, + { + "epoch": 0.46, + "eval_loss": 1.435863733291626, + "eval_runtime": 0.7356, + "eval_samples_per_second": 5.438, + "eval_steps_per_second": 1.359, + "step": 1144 + }, + { + "epoch": 0.46, + "learning_rate": 1.6223999999999998e-07, + "loss": 1.5467, + "step": 1148 + }, + { + "epoch": 0.46, + "eval_loss": 1.4338979721069336, + "eval_runtime": 0.5695, + "eval_samples_per_second": 7.023, + "eval_steps_per_second": 1.756, + "step": 1148 + }, + { + "epoch": 0.46, + "learning_rate": 1.6176e-07, + "loss": 1.5399, + "step": 1152 + }, + { + "epoch": 0.46, + "eval_loss": 1.4317151308059692, + "eval_runtime": 0.5215, + "eval_samples_per_second": 7.669, + "eval_steps_per_second": 1.917, + "step": 1152 + }, + { + "epoch": 0.46, + "learning_rate": 1.6127999999999997e-07, + "loss": 1.5221, + "step": 1156 + }, + { + "epoch": 0.46, + "eval_loss": 1.4296718835830688, + "eval_runtime": 0.5471, + "eval_samples_per_second": 7.311, + "eval_steps_per_second": 1.828, + "step": 1156 + }, + { + "epoch": 0.46, + "learning_rate": 1.608e-07, + "loss": 1.5022, + "step": 1160 + }, + { + "epoch": 0.46, + "eval_loss": 1.4277141094207764, + "eval_runtime": 0.5395, + "eval_samples_per_second": 7.414, + "eval_steps_per_second": 1.853, + "step": 1160 + }, + { + "epoch": 0.47, + "learning_rate": 1.6032e-07, + "loss": 1.5385, + "step": 1164 + }, + { + "epoch": 0.47, + "eval_loss": 1.4257354736328125, + "eval_runtime": 0.5342, + "eval_samples_per_second": 7.487, + "eval_steps_per_second": 1.872, + "step": 1164 + }, + { + "epoch": 0.47, + "learning_rate": 1.5984e-07, + "loss": 1.5042, + "step": 1168 + }, + { + "epoch": 0.47, + "eval_loss": 1.4236301183700562, + "eval_runtime": 0.6434, + "eval_samples_per_second": 6.217, + "eval_steps_per_second": 1.554, + "step": 1168 + }, + { + "epoch": 0.47, + "learning_rate": 1.5936e-07, + "loss": 1.5007, + "step": 1172 + }, + { + "epoch": 0.47, + "eval_loss": 1.421656608581543, + "eval_runtime": 0.7224, + "eval_samples_per_second": 5.537, + "eval_steps_per_second": 1.384, + "step": 1172 + }, + { + "epoch": 0.47, + "learning_rate": 1.5887999999999998e-07, + "loss": 1.5323, + "step": 1176 + }, + { + "epoch": 0.47, + "eval_loss": 1.4196075201034546, + "eval_runtime": 0.7946, + "eval_samples_per_second": 5.034, + "eval_steps_per_second": 1.259, + "step": 1176 + }, + { + "epoch": 0.47, + "learning_rate": 1.584e-07, + "loss": 1.5269, + "step": 1180 + }, + { + "epoch": 0.47, + "eval_loss": 1.4174154996871948, + "eval_runtime": 0.82, + "eval_samples_per_second": 4.878, + "eval_steps_per_second": 1.22, + "step": 1180 + }, + { + "epoch": 0.47, + "learning_rate": 1.5791999999999997e-07, + "loss": 1.5379, + "step": 1184 + }, + { + "epoch": 0.47, + "eval_loss": 1.4156051874160767, + "eval_runtime": 0.5319, + "eval_samples_per_second": 7.52, + "eval_steps_per_second": 1.88, + "step": 1184 + }, + { + "epoch": 0.48, + "learning_rate": 1.5744e-07, + "loss": 1.522, + "step": 1188 + }, + { + "epoch": 0.48, + "eval_loss": 1.4136687517166138, + "eval_runtime": 0.5286, + "eval_samples_per_second": 7.567, + "eval_steps_per_second": 1.892, + "step": 1188 + }, + { + "epoch": 0.48, + "learning_rate": 1.5696e-07, + "loss": 1.506, + "step": 1192 + }, + { + "epoch": 0.48, + "eval_loss": 1.4115678071975708, + "eval_runtime": 0.553, + "eval_samples_per_second": 7.233, + "eval_steps_per_second": 1.808, + "step": 1192 + }, + { + "epoch": 0.48, + "learning_rate": 1.5647999999999998e-07, + "loss": 1.4986, + "step": 1196 + }, + { + "epoch": 0.48, + "eval_loss": 1.409631371498108, + "eval_runtime": 0.5273, + "eval_samples_per_second": 7.585, + "eval_steps_per_second": 1.896, + "step": 1196 + }, + { + "epoch": 0.48, + "learning_rate": 1.56e-07, + "loss": 1.4918, + "step": 1200 + }, + { + "epoch": 0.48, + "eval_loss": 1.407455563545227, + "eval_runtime": 0.5269, + "eval_samples_per_second": 7.592, + "eval_steps_per_second": 1.898, + "step": 1200 + }, + { + "epoch": 0.48, + "learning_rate": 1.5551999999999998e-07, + "loss": 1.5124, + "step": 1204 + }, + { + "epoch": 0.48, + "eval_loss": 1.4056380987167358, + "eval_runtime": 0.7536, + "eval_samples_per_second": 5.308, + "eval_steps_per_second": 1.327, + "step": 1204 + }, + { + "epoch": 0.48, + "learning_rate": 1.5504000000000002e-07, + "loss": 1.4926, + "step": 1208 + }, + { + "epoch": 0.48, + "eval_loss": 1.403800368309021, + "eval_runtime": 0.7248, + "eval_samples_per_second": 5.519, + "eval_steps_per_second": 1.38, + "step": 1208 + }, + { + "epoch": 0.48, + "learning_rate": 1.5456e-07, + "loss": 1.5053, + "step": 1212 + }, + { + "epoch": 0.48, + "eval_loss": 1.40152907371521, + "eval_runtime": 0.7447, + "eval_samples_per_second": 5.371, + "eval_steps_per_second": 1.343, + "step": 1212 + }, + { + "epoch": 0.49, + "learning_rate": 1.5408e-07, + "loss": 1.5043, + "step": 1216 + }, + { + "epoch": 0.49, + "eval_loss": 1.3996310234069824, + "eval_runtime": 0.738, + "eval_samples_per_second": 5.42, + "eval_steps_per_second": 1.355, + "step": 1216 + }, + { + "epoch": 0.49, + "learning_rate": 1.536e-07, + "loss": 1.5068, + "step": 1220 + }, + { + "epoch": 0.49, + "eval_loss": 1.3975541591644287, + "eval_runtime": 0.5275, + "eval_samples_per_second": 7.583, + "eval_steps_per_second": 1.896, + "step": 1220 + }, + { + "epoch": 0.49, + "learning_rate": 1.5311999999999998e-07, + "loss": 1.5039, + "step": 1224 + }, + { + "epoch": 0.49, + "eval_loss": 1.3954721689224243, + "eval_runtime": 0.5317, + "eval_samples_per_second": 7.523, + "eval_steps_per_second": 1.881, + "step": 1224 + }, + { + "epoch": 0.49, + "learning_rate": 1.5264e-07, + "loss": 1.4772, + "step": 1228 + }, + { + "epoch": 0.49, + "eval_loss": 1.3933203220367432, + "eval_runtime": 0.5283, + "eval_samples_per_second": 7.571, + "eval_steps_per_second": 1.893, + "step": 1228 + }, + { + "epoch": 0.49, + "learning_rate": 1.5215999999999998e-07, + "loss": 1.4873, + "step": 1232 + }, + { + "epoch": 0.49, + "eval_loss": 1.3916254043579102, + "eval_runtime": 0.5344, + "eval_samples_per_second": 7.485, + "eval_steps_per_second": 1.871, + "step": 1232 + }, + { + "epoch": 0.49, + "learning_rate": 1.5168000000000002e-07, + "loss": 1.4977, + "step": 1236 + }, + { + "epoch": 0.49, + "eval_loss": 1.3896205425262451, + "eval_runtime": 0.5249, + "eval_samples_per_second": 7.62, + "eval_steps_per_second": 1.905, + "step": 1236 + }, + { + "epoch": 0.5, + "learning_rate": 1.512e-07, + "loss": 1.5016, + "step": 1240 + }, + { + "epoch": 0.5, + "eval_loss": 1.3873213529586792, + "eval_runtime": 0.7136, + "eval_samples_per_second": 5.605, + "eval_steps_per_second": 1.401, + "step": 1240 + }, + { + "epoch": 0.5, + "learning_rate": 1.5072e-07, + "loss": 1.495, + "step": 1244 + }, + { + "epoch": 0.5, + "eval_loss": 1.3854175806045532, + "eval_runtime": 0.7372, + "eval_samples_per_second": 5.426, + "eval_steps_per_second": 1.357, + "step": 1244 + }, + { + "epoch": 0.5, + "learning_rate": 1.5024e-07, + "loss": 1.4803, + "step": 1248 + }, + { + "epoch": 0.5, + "eval_loss": 1.3834645748138428, + "eval_runtime": 0.7836, + "eval_samples_per_second": 5.104, + "eval_steps_per_second": 1.276, + "step": 1248 + }, + { + "epoch": 0.5, + "learning_rate": 1.4975999999999999e-07, + "loss": 1.4842, + "step": 1252 + }, + { + "epoch": 0.5, + "eval_loss": 1.381633996963501, + "eval_runtime": 0.5401, + "eval_samples_per_second": 7.405, + "eval_steps_per_second": 1.851, + "step": 1252 + }, + { + "epoch": 0.5, + "learning_rate": 1.4928e-07, + "loss": 1.4762, + "step": 1256 + }, + { + "epoch": 0.5, + "eval_loss": 1.379853367805481, + "eval_runtime": 0.5233, + "eval_samples_per_second": 7.644, + "eval_steps_per_second": 1.911, + "step": 1256 + }, + { + "epoch": 0.5, + "learning_rate": 1.4879999999999998e-07, + "loss": 1.4859, + "step": 1260 + }, + { + "epoch": 0.5, + "eval_loss": 1.3780815601348877, + "eval_runtime": 0.5276, + "eval_samples_per_second": 7.582, + "eval_steps_per_second": 1.895, + "step": 1260 + }, + { + "epoch": 0.51, + "learning_rate": 1.4832e-07, + "loss": 1.4948, + "step": 1264 + }, + { + "epoch": 0.51, + "eval_loss": 1.3763624429702759, + "eval_runtime": 0.5355, + "eval_samples_per_second": 7.469, + "eval_steps_per_second": 1.867, + "step": 1264 + }, + { + "epoch": 0.51, + "learning_rate": 1.4784e-07, + "loss": 1.4851, + "step": 1268 + }, + { + "epoch": 0.51, + "eval_loss": 1.374289631843567, + "eval_runtime": 0.5235, + "eval_samples_per_second": 7.64, + "eval_steps_per_second": 1.91, + "step": 1268 + }, + { + "epoch": 0.51, + "learning_rate": 1.4736e-07, + "loss": 1.4749, + "step": 1272 + }, + { + "epoch": 0.51, + "eval_loss": 1.3724154233932495, + "eval_runtime": 0.6384, + "eval_samples_per_second": 6.266, + "eval_steps_per_second": 1.566, + "step": 1272 + }, + { + "epoch": 0.51, + "learning_rate": 1.4687999999999998e-07, + "loss": 1.4594, + "step": 1276 + }, + { + "epoch": 0.51, + "eval_loss": 1.3709261417388916, + "eval_runtime": 0.7249, + "eval_samples_per_second": 5.518, + "eval_steps_per_second": 1.379, + "step": 1276 + }, + { + "epoch": 0.51, + "learning_rate": 1.464e-07, + "loss": 1.4517, + "step": 1280 + }, + { + "epoch": 0.51, + "eval_loss": 1.3691537380218506, + "eval_runtime": 0.7303, + "eval_samples_per_second": 5.477, + "eval_steps_per_second": 1.369, + "step": 1280 + }, + { + "epoch": 0.51, + "learning_rate": 1.4592e-07, + "loss": 1.4239, + "step": 1284 + }, + { + "epoch": 0.51, + "eval_loss": 1.3673396110534668, + "eval_runtime": 0.7929, + "eval_samples_per_second": 5.044, + "eval_steps_per_second": 1.261, + "step": 1284 + }, + { + "epoch": 0.52, + "learning_rate": 1.4543999999999998e-07, + "loss": 1.4775, + "step": 1288 + }, + { + "epoch": 0.52, + "eval_loss": 1.3657190799713135, + "eval_runtime": 0.5509, + "eval_samples_per_second": 7.261, + "eval_steps_per_second": 1.815, + "step": 1288 + }, + { + "epoch": 0.52, + "learning_rate": 1.4496e-07, + "loss": 1.4483, + "step": 1292 + }, + { + "epoch": 0.52, + "eval_loss": 1.3642776012420654, + "eval_runtime": 0.5236, + "eval_samples_per_second": 7.639, + "eval_steps_per_second": 1.91, + "step": 1292 + }, + { + "epoch": 0.52, + "learning_rate": 1.4447999999999998e-07, + "loss": 1.4688, + "step": 1296 + }, + { + "epoch": 0.52, + "eval_loss": 1.3624374866485596, + "eval_runtime": 0.5281, + "eval_samples_per_second": 7.574, + "eval_steps_per_second": 1.893, + "step": 1296 + }, + { + "epoch": 0.52, + "learning_rate": 1.44e-07, + "loss": 1.4566, + "step": 1300 + }, + { + "epoch": 0.52, + "eval_loss": 1.3608499765396118, + "eval_runtime": 0.5346, + "eval_samples_per_second": 7.482, + "eval_steps_per_second": 1.871, + "step": 1300 + }, + { + "epoch": 0.52, + "learning_rate": 1.4352e-07, + "loss": 1.4592, + "step": 1304 + }, + { + "epoch": 0.52, + "eval_loss": 1.3591777086257935, + "eval_runtime": 0.543, + "eval_samples_per_second": 7.367, + "eval_steps_per_second": 1.842, + "step": 1304 + }, + { + "epoch": 0.52, + "learning_rate": 1.4304e-07, + "loss": 1.4505, + "step": 1308 + }, + { + "epoch": 0.52, + "eval_loss": 1.357291340827942, + "eval_runtime": 0.7548, + "eval_samples_per_second": 5.299, + "eval_steps_per_second": 1.325, + "step": 1308 + }, + { + "epoch": 0.52, + "learning_rate": 1.4256e-07, + "loss": 1.4304, + "step": 1312 + }, + { + "epoch": 0.52, + "eval_loss": 1.3557498455047607, + "eval_runtime": 0.7262, + "eval_samples_per_second": 5.508, + "eval_steps_per_second": 1.377, + "step": 1312 + }, + { + "epoch": 0.53, + "learning_rate": 1.4208e-07, + "loss": 1.4691, + "step": 1316 + }, + { + "epoch": 0.53, + "eval_loss": 1.3540558815002441, + "eval_runtime": 0.7121, + "eval_samples_per_second": 5.617, + "eval_steps_per_second": 1.404, + "step": 1316 + }, + { + "epoch": 0.53, + "learning_rate": 1.416e-07, + "loss": 1.4423, + "step": 1320 + }, + { + "epoch": 0.53, + "eval_loss": 1.3522251844406128, + "eval_runtime": 0.7774, + "eval_samples_per_second": 5.145, + "eval_steps_per_second": 1.286, + "step": 1320 + }, + { + "epoch": 0.53, + "learning_rate": 1.4111999999999998e-07, + "loss": 1.4301, + "step": 1324 + }, + { + "epoch": 0.53, + "eval_loss": 1.3508257865905762, + "eval_runtime": 0.5433, + "eval_samples_per_second": 7.362, + "eval_steps_per_second": 1.841, + "step": 1324 + }, + { + "epoch": 0.53, + "learning_rate": 1.4064e-07, + "loss": 1.4422, + "step": 1328 + }, + { + "epoch": 0.53, + "eval_loss": 1.3490896224975586, + "eval_runtime": 0.5369, + "eval_samples_per_second": 7.451, + "eval_steps_per_second": 1.863, + "step": 1328 + }, + { + "epoch": 0.53, + "learning_rate": 1.4016e-07, + "loss": 1.4577, + "step": 1332 + }, + { + "epoch": 0.53, + "eval_loss": 1.347461223602295, + "eval_runtime": 0.5223, + "eval_samples_per_second": 7.658, + "eval_steps_per_second": 1.915, + "step": 1332 + }, + { + "epoch": 0.53, + "learning_rate": 1.3968e-07, + "loss": 1.4541, + "step": 1336 + }, + { + "epoch": 0.53, + "eval_loss": 1.3457545042037964, + "eval_runtime": 0.5399, + "eval_samples_per_second": 7.409, + "eval_steps_per_second": 1.852, + "step": 1336 + }, + { + "epoch": 0.54, + "learning_rate": 1.392e-07, + "loss": 1.4246, + "step": 1340 + }, + { + "epoch": 0.54, + "eval_loss": 1.343980073928833, + "eval_runtime": 0.5481, + "eval_samples_per_second": 7.297, + "eval_steps_per_second": 1.824, + "step": 1340 + }, + { + "epoch": 0.54, + "learning_rate": 1.3872e-07, + "loss": 1.4507, + "step": 1344 + }, + { + "epoch": 0.54, + "eval_loss": 1.3423739671707153, + "eval_runtime": 0.7414, + "eval_samples_per_second": 5.395, + "eval_steps_per_second": 1.349, + "step": 1344 + }, + { + "epoch": 0.54, + "learning_rate": 1.3824e-07, + "loss": 1.4312, + "step": 1348 + }, + { + "epoch": 0.54, + "eval_loss": 1.3408253192901611, + "eval_runtime": 0.7783, + "eval_samples_per_second": 5.139, + "eval_steps_per_second": 1.285, + "step": 1348 + }, + { + "epoch": 0.54, + "learning_rate": 1.3775999999999998e-07, + "loss": 1.4394, + "step": 1352 + }, + { + "epoch": 0.54, + "eval_loss": 1.339220404624939, + "eval_runtime": 0.771, + "eval_samples_per_second": 5.188, + "eval_steps_per_second": 1.297, + "step": 1352 + }, + { + "epoch": 0.54, + "learning_rate": 1.3728e-07, + "loss": 1.4271, + "step": 1356 + }, + { + "epoch": 0.54, + "eval_loss": 1.3373547792434692, + "eval_runtime": 0.5264, + "eval_samples_per_second": 7.599, + "eval_steps_per_second": 1.9, + "step": 1356 + }, + { + "epoch": 0.54, + "learning_rate": 1.368e-07, + "loss": 1.4081, + "step": 1360 + }, + { + "epoch": 0.54, + "eval_loss": 1.3356679677963257, + "eval_runtime": 0.5397, + "eval_samples_per_second": 7.412, + "eval_steps_per_second": 1.853, + "step": 1360 + }, + { + "epoch": 0.55, + "learning_rate": 1.3632e-07, + "loss": 1.4314, + "step": 1364 + }, + { + "epoch": 0.55, + "eval_loss": 1.333927035331726, + "eval_runtime": 0.5418, + "eval_samples_per_second": 7.382, + "eval_steps_per_second": 1.846, + "step": 1364 + }, + { + "epoch": 0.55, + "learning_rate": 1.3583999999999998e-07, + "loss": 1.4359, + "step": 1368 + }, + { + "epoch": 0.55, + "eval_loss": 1.3325647115707397, + "eval_runtime": 0.5464, + "eval_samples_per_second": 7.321, + "eval_steps_per_second": 1.83, + "step": 1368 + }, + { + "epoch": 0.55, + "learning_rate": 1.3536e-07, + "loss": 1.4381, + "step": 1372 + }, + { + "epoch": 0.55, + "eval_loss": 1.3307887315750122, + "eval_runtime": 0.5493, + "eval_samples_per_second": 7.282, + "eval_steps_per_second": 1.82, + "step": 1372 + }, + { + "epoch": 0.55, + "learning_rate": 1.3488e-07, + "loss": 1.4219, + "step": 1376 + }, + { + "epoch": 0.55, + "eval_loss": 1.3293663263320923, + "eval_runtime": 0.5921, + "eval_samples_per_second": 6.755, + "eval_steps_per_second": 1.689, + "step": 1376 + }, + { + "epoch": 0.55, + "learning_rate": 1.3439999999999999e-07, + "loss": 1.4669, + "step": 1380 + }, + { + "epoch": 0.55, + "eval_loss": 1.3278565406799316, + "eval_runtime": 0.7788, + "eval_samples_per_second": 5.136, + "eval_steps_per_second": 1.284, + "step": 1380 + }, + { + "epoch": 0.55, + "learning_rate": 1.3392e-07, + "loss": 1.4163, + "step": 1384 + }, + { + "epoch": 0.55, + "eval_loss": 1.3260074853897095, + "eval_runtime": 0.8128, + "eval_samples_per_second": 4.921, + "eval_steps_per_second": 1.23, + "step": 1384 + }, + { + "epoch": 0.56, + "learning_rate": 1.3343999999999998e-07, + "loss": 1.4153, + "step": 1388 + }, + { + "epoch": 0.56, + "eval_loss": 1.3242360353469849, + "eval_runtime": 0.8002, + "eval_samples_per_second": 4.999, + "eval_steps_per_second": 1.25, + "step": 1388 + }, + { + "epoch": 0.56, + "learning_rate": 1.3296e-07, + "loss": 1.4506, + "step": 1392 + }, + { + "epoch": 0.56, + "eval_loss": 1.3229784965515137, + "eval_runtime": 0.5395, + "eval_samples_per_second": 7.414, + "eval_steps_per_second": 1.854, + "step": 1392 + }, + { + "epoch": 0.56, + "learning_rate": 1.3247999999999998e-07, + "loss": 1.4229, + "step": 1396 + }, + { + "epoch": 0.56, + "eval_loss": 1.3213036060333252, + "eval_runtime": 0.5374, + "eval_samples_per_second": 7.444, + "eval_steps_per_second": 1.861, + "step": 1396 + }, + { + "epoch": 0.56, + "learning_rate": 1.32e-07, + "loss": 1.4218, + "step": 1400 + }, + { + "epoch": 0.56, + "eval_loss": 1.3196250200271606, + "eval_runtime": 0.5404, + "eval_samples_per_second": 7.402, + "eval_steps_per_second": 1.851, + "step": 1400 + }, + { + "epoch": 0.56, + "learning_rate": 1.3152e-07, + "loss": 1.4185, + "step": 1404 + }, + { + "epoch": 0.56, + "eval_loss": 1.3180840015411377, + "eval_runtime": 0.5573, + "eval_samples_per_second": 7.177, + "eval_steps_per_second": 1.794, + "step": 1404 + }, + { + "epoch": 0.56, + "learning_rate": 1.3104e-07, + "loss": 1.4283, + "step": 1408 + }, + { + "epoch": 0.56, + "eval_loss": 1.316424012184143, + "eval_runtime": 0.5204, + "eval_samples_per_second": 7.686, + "eval_steps_per_second": 1.922, + "step": 1408 + }, + { + "epoch": 0.56, + "learning_rate": 1.3056e-07, + "loss": 1.4202, + "step": 1412 + }, + { + "epoch": 0.56, + "eval_loss": 1.3148062229156494, + "eval_runtime": 0.7628, + "eval_samples_per_second": 5.244, + "eval_steps_per_second": 1.311, + "step": 1412 + }, + { + "epoch": 0.57, + "learning_rate": 1.3007999999999998e-07, + "loss": 1.3736, + "step": 1416 + }, + { + "epoch": 0.57, + "eval_loss": 1.3131170272827148, + "eval_runtime": 0.7763, + "eval_samples_per_second": 5.153, + "eval_steps_per_second": 1.288, + "step": 1416 + }, + { + "epoch": 0.57, + "learning_rate": 1.296e-07, + "loss": 1.4332, + "step": 1420 + }, + { + "epoch": 0.57, + "eval_loss": 1.311560869216919, + "eval_runtime": 0.7312, + "eval_samples_per_second": 5.471, + "eval_steps_per_second": 1.368, + "step": 1420 + }, + { + "epoch": 0.57, + "learning_rate": 1.2912e-07, + "loss": 1.4287, + "step": 1424 + }, + { + "epoch": 0.57, + "eval_loss": 1.309916615486145, + "eval_runtime": 0.6738, + "eval_samples_per_second": 5.936, + "eval_steps_per_second": 1.484, + "step": 1424 + }, + { + "epoch": 0.57, + "learning_rate": 1.2864e-07, + "loss": 1.4175, + "step": 1428 + }, + { + "epoch": 0.57, + "eval_loss": 1.3080803155899048, + "eval_runtime": 0.5396, + "eval_samples_per_second": 7.412, + "eval_steps_per_second": 1.853, + "step": 1428 + }, + { + "epoch": 0.57, + "learning_rate": 1.2816e-07, + "loss": 1.4152, + "step": 1432 + }, + { + "epoch": 0.57, + "eval_loss": 1.3066335916519165, + "eval_runtime": 0.5523, + "eval_samples_per_second": 7.243, + "eval_steps_per_second": 1.811, + "step": 1432 + }, + { + "epoch": 0.57, + "learning_rate": 1.2768e-07, + "loss": 1.4036, + "step": 1436 + }, + { + "epoch": 0.57, + "eval_loss": 1.3054327964782715, + "eval_runtime": 0.5404, + "eval_samples_per_second": 7.402, + "eval_steps_per_second": 1.851, + "step": 1436 + }, + { + "epoch": 0.58, + "learning_rate": 1.272e-07, + "loss": 1.4033, + "step": 1440 + }, + { + "epoch": 0.58, + "eval_loss": 1.3037904500961304, + "eval_runtime": 0.5534, + "eval_samples_per_second": 7.228, + "eval_steps_per_second": 1.807, + "step": 1440 + }, + { + "epoch": 0.58, + "learning_rate": 1.2671999999999999e-07, + "loss": 1.4095, + "step": 1444 + }, + { + "epoch": 0.58, + "eval_loss": 1.302278757095337, + "eval_runtime": 0.7546, + "eval_samples_per_second": 5.301, + "eval_steps_per_second": 1.325, + "step": 1444 + }, + { + "epoch": 0.58, + "learning_rate": 1.2624e-07, + "loss": 1.4129, + "step": 1448 + }, + { + "epoch": 0.58, + "eval_loss": 1.3008112907409668, + "eval_runtime": 0.7157, + "eval_samples_per_second": 5.589, + "eval_steps_per_second": 1.397, + "step": 1448 + }, + { + "epoch": 0.58, + "learning_rate": 1.2576e-07, + "loss": 1.3838, + "step": 1452 + }, + { + "epoch": 0.58, + "eval_loss": 1.2994916439056396, + "eval_runtime": 0.7773, + "eval_samples_per_second": 5.146, + "eval_steps_per_second": 1.286, + "step": 1452 + }, + { + "epoch": 0.58, + "learning_rate": 1.2528e-07, + "loss": 1.3939, + "step": 1456 + }, + { + "epoch": 0.58, + "eval_loss": 1.2979990243911743, + "eval_runtime": 0.8203, + "eval_samples_per_second": 4.876, + "eval_steps_per_second": 1.219, + "step": 1456 + }, + { + "epoch": 0.58, + "learning_rate": 1.2479999999999998e-07, + "loss": 1.4023, + "step": 1460 + }, + { + "epoch": 0.58, + "eval_loss": 1.2964202165603638, + "eval_runtime": 0.5392, + "eval_samples_per_second": 7.419, + "eval_steps_per_second": 1.855, + "step": 1460 + }, + { + "epoch": 0.59, + "learning_rate": 1.2432e-07, + "loss": 1.3751, + "step": 1464 + }, + { + "epoch": 0.59, + "eval_loss": 1.2952665090560913, + "eval_runtime": 0.533, + "eval_samples_per_second": 7.505, + "eval_steps_per_second": 1.876, + "step": 1464 + }, + { + "epoch": 0.59, + "learning_rate": 1.2384e-07, + "loss": 1.3657, + "step": 1468 + }, + { + "epoch": 0.59, + "eval_loss": 1.2935295104980469, + "eval_runtime": 0.5428, + "eval_samples_per_second": 7.369, + "eval_steps_per_second": 1.842, + "step": 1468 + }, + { + "epoch": 0.59, + "learning_rate": 1.2336e-07, + "loss": 1.375, + "step": 1472 + }, + { + "epoch": 0.59, + "eval_loss": 1.292738914489746, + "eval_runtime": 0.5365, + "eval_samples_per_second": 7.456, + "eval_steps_per_second": 1.864, + "step": 1472 + }, + { + "epoch": 0.59, + "learning_rate": 1.2288e-07, + "loss": 1.3846, + "step": 1476 + }, + { + "epoch": 0.59, + "eval_loss": 1.291104793548584, + "eval_runtime": 0.5462, + "eval_samples_per_second": 7.323, + "eval_steps_per_second": 1.831, + "step": 1476 + }, + { + "epoch": 0.59, + "learning_rate": 1.2239999999999998e-07, + "loss": 1.4192, + "step": 1480 + }, + { + "epoch": 0.59, + "eval_loss": 1.2900675535202026, + "eval_runtime": 0.7504, + "eval_samples_per_second": 5.33, + "eval_steps_per_second": 1.333, + "step": 1480 + }, + { + "epoch": 0.59, + "learning_rate": 1.2192e-07, + "loss": 1.3629, + "step": 1484 + }, + { + "epoch": 0.59, + "eval_loss": 1.2886391878128052, + "eval_runtime": 0.7924, + "eval_samples_per_second": 5.048, + "eval_steps_per_second": 1.262, + "step": 1484 + }, + { + "epoch": 0.6, + "learning_rate": 1.2143999999999998e-07, + "loss": 1.3947, + "step": 1488 + }, + { + "epoch": 0.6, + "eval_loss": 1.287713646888733, + "eval_runtime": 0.7522, + "eval_samples_per_second": 5.318, + "eval_steps_per_second": 1.329, + "step": 1488 + }, + { + "epoch": 0.6, + "learning_rate": 1.2096e-07, + "loss": 1.3485, + "step": 1492 + }, + { + "epoch": 0.6, + "eval_loss": 1.2862787246704102, + "eval_runtime": 0.5402, + "eval_samples_per_second": 7.404, + "eval_steps_per_second": 1.851, + "step": 1492 + }, + { + "epoch": 0.6, + "learning_rate": 1.2048e-07, + "loss": 1.405, + "step": 1496 + }, + { + "epoch": 0.6, + "eval_loss": 1.2850462198257446, + "eval_runtime": 0.5452, + "eval_samples_per_second": 7.337, + "eval_steps_per_second": 1.834, + "step": 1496 + }, + { + "epoch": 0.6, + "learning_rate": 1.2e-07, + "loss": 1.3758, + "step": 1500 + }, + { + "epoch": 0.6, + "eval_loss": 1.2840522527694702, + "eval_runtime": 0.541, + "eval_samples_per_second": 7.394, + "eval_steps_per_second": 1.849, + "step": 1500 + }, + { + "epoch": 0.6, + "learning_rate": 1.1951999999999997e-07, + "loss": 1.3832, + "step": 1504 + }, + { + "epoch": 0.6, + "eval_loss": 1.282808542251587, + "eval_runtime": 0.5449, + "eval_samples_per_second": 7.34, + "eval_steps_per_second": 1.835, + "step": 1504 + }, + { + "epoch": 0.6, + "learning_rate": 1.1903999999999999e-07, + "loss": 1.3314, + "step": 1508 + }, + { + "epoch": 0.6, + "eval_loss": 1.2814455032348633, + "eval_runtime": 0.5367, + "eval_samples_per_second": 7.453, + "eval_steps_per_second": 1.863, + "step": 1508 + }, + { + "epoch": 0.6, + "learning_rate": 1.1856e-07, + "loss": 1.3458, + "step": 1512 + }, + { + "epoch": 0.6, + "eval_loss": 1.2800332307815552, + "eval_runtime": 0.5306, + "eval_samples_per_second": 7.538, + "eval_steps_per_second": 1.884, + "step": 1512 + }, + { + "epoch": 0.61, + "learning_rate": 1.1808e-07, + "loss": 1.357, + "step": 1516 + }, + { + "epoch": 0.61, + "eval_loss": 1.2792009115219116, + "eval_runtime": 0.545, + "eval_samples_per_second": 7.34, + "eval_steps_per_second": 1.835, + "step": 1516 + }, + { + "epoch": 0.61, + "learning_rate": 1.176e-07, + "loss": 1.3808, + "step": 1520 + }, + { + "epoch": 0.61, + "eval_loss": 1.2777525186538696, + "eval_runtime": 0.5523, + "eval_samples_per_second": 7.243, + "eval_steps_per_second": 1.811, + "step": 1520 + }, + { + "epoch": 0.61, + "learning_rate": 1.1712e-07, + "loss": 1.3692, + "step": 1524 + }, + { + "epoch": 0.61, + "eval_loss": 1.2765049934387207, + "eval_runtime": 0.752, + "eval_samples_per_second": 5.319, + "eval_steps_per_second": 1.33, + "step": 1524 + }, + { + "epoch": 0.61, + "learning_rate": 1.1663999999999999e-07, + "loss": 1.3763, + "step": 1528 + }, + { + "epoch": 0.61, + "eval_loss": 1.2754418849945068, + "eval_runtime": 0.7731, + "eval_samples_per_second": 5.174, + "eval_steps_per_second": 1.293, + "step": 1528 + }, + { + "epoch": 0.61, + "learning_rate": 1.1615999999999999e-07, + "loss": 1.3505, + "step": 1532 + }, + { + "epoch": 0.61, + "eval_loss": 1.2741388082504272, + "eval_runtime": 0.798, + "eval_samples_per_second": 5.012, + "eval_steps_per_second": 1.253, + "step": 1532 + }, + { + "epoch": 0.61, + "learning_rate": 1.1567999999999999e-07, + "loss": 1.3579, + "step": 1536 + }, + { + "epoch": 0.61, + "eval_loss": 1.273074746131897, + "eval_runtime": 0.5381, + "eval_samples_per_second": 7.433, + "eval_steps_per_second": 1.858, + "step": 1536 + }, + { + "epoch": 0.62, + "learning_rate": 1.152e-07, + "loss": 1.3567, + "step": 1540 + }, + { + "epoch": 0.62, + "eval_loss": 1.2716789245605469, + "eval_runtime": 0.5423, + "eval_samples_per_second": 7.376, + "eval_steps_per_second": 1.844, + "step": 1540 + }, + { + "epoch": 0.62, + "learning_rate": 1.1472e-07, + "loss": 1.3608, + "step": 1544 + }, + { + "epoch": 0.62, + "eval_loss": 1.270503044128418, + "eval_runtime": 0.538, + "eval_samples_per_second": 7.435, + "eval_steps_per_second": 1.859, + "step": 1544 + }, + { + "epoch": 0.62, + "learning_rate": 1.1424000000000001e-07, + "loss": 1.3895, + "step": 1548 + }, + { + "epoch": 0.62, + "eval_loss": 1.269209384918213, + "eval_runtime": 0.5287, + "eval_samples_per_second": 7.566, + "eval_steps_per_second": 1.892, + "step": 1548 + }, + { + "epoch": 0.62, + "learning_rate": 1.1376e-07, + "loss": 1.3614, + "step": 1552 + }, + { + "epoch": 0.62, + "eval_loss": 1.2682329416275024, + "eval_runtime": 0.5516, + "eval_samples_per_second": 7.252, + "eval_steps_per_second": 1.813, + "step": 1552 + }, + { + "epoch": 0.62, + "learning_rate": 1.1327999999999999e-07, + "loss": 1.3643, + "step": 1556 + }, + { + "epoch": 0.62, + "eval_loss": 1.2671722173690796, + "eval_runtime": 0.7443, + "eval_samples_per_second": 5.374, + "eval_steps_per_second": 1.344, + "step": 1556 + }, + { + "epoch": 0.62, + "learning_rate": 1.1279999999999999e-07, + "loss": 1.3537, + "step": 1560 + }, + { + "epoch": 0.62, + "eval_loss": 1.2658395767211914, + "eval_runtime": 0.7724, + "eval_samples_per_second": 5.179, + "eval_steps_per_second": 1.295, + "step": 1560 + }, + { + "epoch": 0.63, + "learning_rate": 1.1232e-07, + "loss": 1.3564, + "step": 1564 + }, + { + "epoch": 0.63, + "eval_loss": 1.2647554874420166, + "eval_runtime": 0.7818, + "eval_samples_per_second": 5.116, + "eval_steps_per_second": 1.279, + "step": 1564 + }, + { + "epoch": 0.63, + "learning_rate": 1.1184e-07, + "loss": 1.3433, + "step": 1568 + }, + { + "epoch": 0.63, + "eval_loss": 1.263366937637329, + "eval_runtime": 0.7501, + "eval_samples_per_second": 5.333, + "eval_steps_per_second": 1.333, + "step": 1568 + }, + { + "epoch": 0.63, + "learning_rate": 1.1135999999999999e-07, + "loss": 1.3805, + "step": 1572 + }, + { + "epoch": 0.63, + "eval_loss": 1.2624878883361816, + "eval_runtime": 0.5467, + "eval_samples_per_second": 7.316, + "eval_steps_per_second": 1.829, + "step": 1572 + }, + { + "epoch": 0.63, + "learning_rate": 1.1087999999999998e-07, + "loss": 1.3309, + "step": 1576 + }, + { + "epoch": 0.63, + "eval_loss": 1.2612154483795166, + "eval_runtime": 0.5298, + "eval_samples_per_second": 7.55, + "eval_steps_per_second": 1.888, + "step": 1576 + }, + { + "epoch": 0.63, + "learning_rate": 1.104e-07, + "loss": 1.3408, + "step": 1580 + }, + { + "epoch": 0.63, + "eval_loss": 1.260028600692749, + "eval_runtime": 0.5364, + "eval_samples_per_second": 7.457, + "eval_steps_per_second": 1.864, + "step": 1580 + }, + { + "epoch": 0.63, + "learning_rate": 1.0992e-07, + "loss": 1.3505, + "step": 1584 + }, + { + "epoch": 0.63, + "eval_loss": 1.2590596675872803, + "eval_runtime": 0.5419, + "eval_samples_per_second": 7.381, + "eval_steps_per_second": 1.845, + "step": 1584 + }, + { + "epoch": 0.64, + "learning_rate": 1.0943999999999999e-07, + "loss": 1.355, + "step": 1588 + }, + { + "epoch": 0.64, + "eval_loss": 1.258219838142395, + "eval_runtime": 0.5356, + "eval_samples_per_second": 7.469, + "eval_steps_per_second": 1.867, + "step": 1588 + }, + { + "epoch": 0.64, + "learning_rate": 1.0896e-07, + "loss": 1.3426, + "step": 1592 + }, + { + "epoch": 0.64, + "eval_loss": 1.2570216655731201, + "eval_runtime": 0.7481, + "eval_samples_per_second": 5.347, + "eval_steps_per_second": 1.337, + "step": 1592 + }, + { + "epoch": 0.64, + "learning_rate": 1.0847999999999999e-07, + "loss": 1.3476, + "step": 1596 + }, + { + "epoch": 0.64, + "eval_loss": 1.2555859088897705, + "eval_runtime": 0.7344, + "eval_samples_per_second": 5.447, + "eval_steps_per_second": 1.362, + "step": 1596 + }, + { + "epoch": 0.64, + "learning_rate": 1.0799999999999999e-07, + "loss": 1.3448, + "step": 1600 + }, + { + "epoch": 0.64, + "eval_loss": 1.2546257972717285, + "eval_runtime": 0.8107, + "eval_samples_per_second": 4.934, + "eval_steps_per_second": 1.233, + "step": 1600 + }, + { + "epoch": 0.64, + "learning_rate": 1.0752e-07, + "loss": 1.3702, + "step": 1604 + }, + { + "epoch": 0.64, + "eval_loss": 1.2532060146331787, + "eval_runtime": 0.5269, + "eval_samples_per_second": 7.592, + "eval_steps_per_second": 1.898, + "step": 1604 + }, + { + "epoch": 0.64, + "learning_rate": 1.0704e-07, + "loss": 1.3198, + "step": 1608 + }, + { + "epoch": 0.64, + "eval_loss": 1.2521923780441284, + "eval_runtime": 0.5439, + "eval_samples_per_second": 7.354, + "eval_steps_per_second": 1.839, + "step": 1608 + }, + { + "epoch": 0.64, + "learning_rate": 1.0656e-07, + "loss": 1.3565, + "step": 1612 + }, + { + "epoch": 0.64, + "eval_loss": 1.2510970830917358, + "eval_runtime": 0.548, + "eval_samples_per_second": 7.3, + "eval_steps_per_second": 1.825, + "step": 1612 + }, + { + "epoch": 0.65, + "learning_rate": 1.0608000000000001e-07, + "loss": 1.3496, + "step": 1616 + }, + { + "epoch": 0.65, + "eval_loss": 1.2498608827590942, + "eval_runtime": 0.5393, + "eval_samples_per_second": 7.417, + "eval_steps_per_second": 1.854, + "step": 1616 + }, + { + "epoch": 0.65, + "learning_rate": 1.0559999999999999e-07, + "loss": 1.3346, + "step": 1620 + }, + { + "epoch": 0.65, + "eval_loss": 1.2489620447158813, + "eval_runtime": 0.5325, + "eval_samples_per_second": 7.512, + "eval_steps_per_second": 1.878, + "step": 1620 + }, + { + "epoch": 0.65, + "learning_rate": 1.0511999999999999e-07, + "loss": 1.3097, + "step": 1624 + }, + { + "epoch": 0.65, + "eval_loss": 1.2476825714111328, + "eval_runtime": 0.7695, + "eval_samples_per_second": 5.198, + "eval_steps_per_second": 1.3, + "step": 1624 + }, + { + "epoch": 0.65, + "learning_rate": 1.0463999999999999e-07, + "loss": 1.3224, + "step": 1628 + }, + { + "epoch": 0.65, + "eval_loss": 1.2467668056488037, + "eval_runtime": 0.7586, + "eval_samples_per_second": 5.273, + "eval_steps_per_second": 1.318, + "step": 1628 + }, + { + "epoch": 0.65, + "learning_rate": 1.0416e-07, + "loss": 1.321, + "step": 1632 + }, + { + "epoch": 0.65, + "eval_loss": 1.2455267906188965, + "eval_runtime": 0.7554, + "eval_samples_per_second": 5.295, + "eval_steps_per_second": 1.324, + "step": 1632 + }, + { + "epoch": 0.65, + "learning_rate": 1.0368e-07, + "loss": 1.3069, + "step": 1636 + }, + { + "epoch": 0.65, + "eval_loss": 1.2445768117904663, + "eval_runtime": 0.7823, + "eval_samples_per_second": 5.113, + "eval_steps_per_second": 1.278, + "step": 1636 + }, + { + "epoch": 0.66, + "learning_rate": 1.0319999999999998e-07, + "loss": 1.3358, + "step": 1640 + }, + { + "epoch": 0.66, + "eval_loss": 1.243558406829834, + "eval_runtime": 0.5542, + "eval_samples_per_second": 7.218, + "eval_steps_per_second": 1.804, + "step": 1640 + }, + { + "epoch": 0.66, + "learning_rate": 1.0272e-07, + "loss": 1.3413, + "step": 1644 + }, + { + "epoch": 0.66, + "eval_loss": 1.242706060409546, + "eval_runtime": 0.5488, + "eval_samples_per_second": 7.289, + "eval_steps_per_second": 1.822, + "step": 1644 + }, + { + "epoch": 0.66, + "learning_rate": 1.0224e-07, + "loss": 1.3328, + "step": 1648 + }, + { + "epoch": 0.66, + "eval_loss": 1.2416179180145264, + "eval_runtime": 0.5351, + "eval_samples_per_second": 7.475, + "eval_steps_per_second": 1.869, + "step": 1648 + }, + { + "epoch": 0.66, + "learning_rate": 1.0175999999999999e-07, + "loss": 1.341, + "step": 1652 + }, + { + "epoch": 0.66, + "eval_loss": 1.2406481504440308, + "eval_runtime": 0.5426, + "eval_samples_per_second": 7.372, + "eval_steps_per_second": 1.843, + "step": 1652 + }, + { + "epoch": 0.66, + "learning_rate": 1.0128e-07, + "loss": 1.3022, + "step": 1656 + }, + { + "epoch": 0.66, + "eval_loss": 1.2396165132522583, + "eval_runtime": 0.5368, + "eval_samples_per_second": 7.451, + "eval_steps_per_second": 1.863, + "step": 1656 + }, + { + "epoch": 0.66, + "learning_rate": 1.008e-07, + "loss": 1.3309, + "step": 1660 + }, + { + "epoch": 0.66, + "eval_loss": 1.2385637760162354, + "eval_runtime": 0.731, + "eval_samples_per_second": 5.472, + "eval_steps_per_second": 1.368, + "step": 1660 + }, + { + "epoch": 0.67, + "learning_rate": 1.0031999999999999e-07, + "loss": 1.3099, + "step": 1664 + }, + { + "epoch": 0.67, + "eval_loss": 1.237720012664795, + "eval_runtime": 0.7217, + "eval_samples_per_second": 5.543, + "eval_steps_per_second": 1.386, + "step": 1664 + }, + { + "epoch": 0.67, + "learning_rate": 9.983999999999999e-08, + "loss": 1.2979, + "step": 1668 + }, + { + "epoch": 0.67, + "eval_loss": 1.2368155717849731, + "eval_runtime": 0.7961, + "eval_samples_per_second": 5.024, + "eval_steps_per_second": 1.256, + "step": 1668 + }, + { + "epoch": 0.67, + "learning_rate": 9.936e-08, + "loss": 1.3219, + "step": 1672 + }, + { + "epoch": 0.67, + "eval_loss": 1.2358148097991943, + "eval_runtime": 0.6349, + "eval_samples_per_second": 6.3, + "eval_steps_per_second": 1.575, + "step": 1672 + }, + { + "epoch": 0.67, + "learning_rate": 9.888e-08, + "loss": 1.328, + "step": 1676 + }, + { + "epoch": 0.67, + "eval_loss": 1.2349071502685547, + "eval_runtime": 0.5583, + "eval_samples_per_second": 7.165, + "eval_steps_per_second": 1.791, + "step": 1676 + }, + { + "epoch": 0.67, + "learning_rate": 9.84e-08, + "loss": 1.3161, + "step": 1680 + }, + { + "epoch": 0.67, + "eval_loss": 1.233930230140686, + "eval_runtime": 0.5369, + "eval_samples_per_second": 7.45, + "eval_steps_per_second": 1.862, + "step": 1680 + }, + { + "epoch": 0.67, + "learning_rate": 9.792e-08, + "loss": 1.3435, + "step": 1684 + }, + { + "epoch": 0.67, + "eval_loss": 1.2328994274139404, + "eval_runtime": 0.5546, + "eval_samples_per_second": 7.213, + "eval_steps_per_second": 1.803, + "step": 1684 + }, + { + "epoch": 0.68, + "learning_rate": 9.743999999999999e-08, + "loss": 1.3191, + "step": 1688 + }, + { + "epoch": 0.68, + "eval_loss": 1.2323206663131714, + "eval_runtime": 0.5626, + "eval_samples_per_second": 7.11, + "eval_steps_per_second": 1.777, + "step": 1688 + }, + { + "epoch": 0.68, + "learning_rate": 9.695999999999999e-08, + "loss": 1.3427, + "step": 1692 + }, + { + "epoch": 0.68, + "eval_loss": 1.2313920259475708, + "eval_runtime": 0.7579, + "eval_samples_per_second": 5.278, + "eval_steps_per_second": 1.319, + "step": 1692 + }, + { + "epoch": 0.68, + "learning_rate": 9.648e-08, + "loss": 1.3157, + "step": 1696 + }, + { + "epoch": 0.68, + "eval_loss": 1.2306554317474365, + "eval_runtime": 0.7565, + "eval_samples_per_second": 5.287, + "eval_steps_per_second": 1.322, + "step": 1696 + }, + { + "epoch": 0.68, + "learning_rate": 9.6e-08, + "loss": 1.31, + "step": 1700 + }, + { + "epoch": 0.68, + "eval_loss": 1.229779839515686, + "eval_runtime": 0.8359, + "eval_samples_per_second": 4.785, + "eval_steps_per_second": 1.196, + "step": 1700 + }, + { + "epoch": 0.68, + "learning_rate": 9.552e-08, + "loss": 1.3418, + "step": 1704 + }, + { + "epoch": 0.68, + "eval_loss": 1.2287366390228271, + "eval_runtime": 0.8, + "eval_samples_per_second": 5.0, + "eval_steps_per_second": 1.25, + "step": 1704 + }, + { + "epoch": 0.68, + "learning_rate": 9.504000000000001e-08, + "loss": 1.3051, + "step": 1708 + }, + { + "epoch": 0.68, + "eval_loss": 1.2279709577560425, + "eval_runtime": 0.5523, + "eval_samples_per_second": 7.243, + "eval_steps_per_second": 1.811, + "step": 1708 + }, + { + "epoch": 0.68, + "learning_rate": 9.456e-08, + "loss": 1.3453, + "step": 1712 + }, + { + "epoch": 0.68, + "eval_loss": 1.227084755897522, + "eval_runtime": 0.5304, + "eval_samples_per_second": 7.542, + "eval_steps_per_second": 1.885, + "step": 1712 + }, + { + "epoch": 0.69, + "learning_rate": 9.407999999999999e-08, + "loss": 1.3146, + "step": 1716 + }, + { + "epoch": 0.69, + "eval_loss": 1.226120114326477, + "eval_runtime": 0.536, + "eval_samples_per_second": 7.462, + "eval_steps_per_second": 1.866, + "step": 1716 + }, + { + "epoch": 0.69, + "learning_rate": 9.359999999999999e-08, + "loss": 1.2961, + "step": 1720 + }, + { + "epoch": 0.69, + "eval_loss": 1.2254786491394043, + "eval_runtime": 0.5464, + "eval_samples_per_second": 7.32, + "eval_steps_per_second": 1.83, + "step": 1720 + }, + { + "epoch": 0.69, + "learning_rate": 9.312e-08, + "loss": 1.2989, + "step": 1724 + }, + { + "epoch": 0.69, + "eval_loss": 1.2248467206954956, + "eval_runtime": 0.5328, + "eval_samples_per_second": 7.507, + "eval_steps_per_second": 1.877, + "step": 1724 + }, + { + "epoch": 0.69, + "learning_rate": 9.264e-08, + "loss": 1.314, + "step": 1728 + }, + { + "epoch": 0.69, + "eval_loss": 1.2238779067993164, + "eval_runtime": 0.7698, + "eval_samples_per_second": 5.196, + "eval_steps_per_second": 1.299, + "step": 1728 + }, + { + "epoch": 0.69, + "learning_rate": 9.215999999999999e-08, + "loss": 1.3137, + "step": 1732 + }, + { + "epoch": 0.69, + "eval_loss": 1.2231634855270386, + "eval_runtime": 0.7516, + "eval_samples_per_second": 5.322, + "eval_steps_per_second": 1.33, + "step": 1732 + }, + { + "epoch": 0.69, + "learning_rate": 9.167999999999998e-08, + "loss": 1.323, + "step": 1736 + }, + { + "epoch": 0.69, + "eval_loss": 1.222124457359314, + "eval_runtime": 0.8004, + "eval_samples_per_second": 4.997, + "eval_steps_per_second": 1.249, + "step": 1736 + }, + { + "epoch": 0.7, + "learning_rate": 9.12e-08, + "loss": 1.3194, + "step": 1740 + }, + { + "epoch": 0.7, + "eval_loss": 1.2216134071350098, + "eval_runtime": 0.5409, + "eval_samples_per_second": 7.394, + "eval_steps_per_second": 1.849, + "step": 1740 + }, + { + "epoch": 0.7, + "learning_rate": 9.072e-08, + "loss": 1.2857, + "step": 1744 + }, + { + "epoch": 0.7, + "eval_loss": 1.220569372177124, + "eval_runtime": 0.5422, + "eval_samples_per_second": 7.377, + "eval_steps_per_second": 1.844, + "step": 1744 + }, + { + "epoch": 0.7, + "learning_rate": 9.024e-08, + "loss": 1.3101, + "step": 1748 + }, + { + "epoch": 0.7, + "eval_loss": 1.2198562622070312, + "eval_runtime": 0.5528, + "eval_samples_per_second": 7.236, + "eval_steps_per_second": 1.809, + "step": 1748 + }, + { + "epoch": 0.7, + "learning_rate": 8.976e-08, + "loss": 1.2962, + "step": 1752 + }, + { + "epoch": 0.7, + "eval_loss": 1.2193635702133179, + "eval_runtime": 0.5512, + "eval_samples_per_second": 7.257, + "eval_steps_per_second": 1.814, + "step": 1752 + }, + { + "epoch": 0.7, + "learning_rate": 8.927999999999999e-08, + "loss": 1.2927, + "step": 1756 + }, + { + "epoch": 0.7, + "eval_loss": 1.218583345413208, + "eval_runtime": 0.5559, + "eval_samples_per_second": 7.195, + "eval_steps_per_second": 1.799, + "step": 1756 + }, + { + "epoch": 0.7, + "learning_rate": 8.879999999999999e-08, + "loss": 1.2728, + "step": 1760 + }, + { + "epoch": 0.7, + "eval_loss": 1.2178785800933838, + "eval_runtime": 0.7605, + "eval_samples_per_second": 5.26, + "eval_steps_per_second": 1.315, + "step": 1760 + }, + { + "epoch": 0.71, + "learning_rate": 8.832e-08, + "loss": 1.2903, + "step": 1764 + }, + { + "epoch": 0.71, + "eval_loss": 1.2169758081436157, + "eval_runtime": 0.7602, + "eval_samples_per_second": 5.262, + "eval_steps_per_second": 1.315, + "step": 1764 + }, + { + "epoch": 0.71, + "learning_rate": 8.784e-08, + "loss": 1.3108, + "step": 1768 + }, + { + "epoch": 0.71, + "eval_loss": 1.2163543701171875, + "eval_runtime": 0.7924, + "eval_samples_per_second": 5.048, + "eval_steps_per_second": 1.262, + "step": 1768 + }, + { + "epoch": 0.71, + "learning_rate": 8.736e-08, + "loss": 1.2899, + "step": 1772 + }, + { + "epoch": 0.71, + "eval_loss": 1.2156285047531128, + "eval_runtime": 0.5511, + "eval_samples_per_second": 7.259, + "eval_steps_per_second": 1.815, + "step": 1772 + }, + { + "epoch": 0.71, + "learning_rate": 8.688000000000001e-08, + "loss": 1.2996, + "step": 1776 + }, + { + "epoch": 0.71, + "eval_loss": 1.2148629426956177, + "eval_runtime": 0.5465, + "eval_samples_per_second": 7.32, + "eval_steps_per_second": 1.83, + "step": 1776 + }, + { + "epoch": 0.71, + "learning_rate": 8.639999999999999e-08, + "loss": 1.2865, + "step": 1780 + }, + { + "epoch": 0.71, + "eval_loss": 1.2141329050064087, + "eval_runtime": 0.5496, + "eval_samples_per_second": 7.278, + "eval_steps_per_second": 1.819, + "step": 1780 + }, + { + "epoch": 0.71, + "learning_rate": 8.591999999999999e-08, + "loss": 1.3004, + "step": 1784 + }, + { + "epoch": 0.71, + "eval_loss": 1.2135276794433594, + "eval_runtime": 0.8113, + "eval_samples_per_second": 4.93, + "eval_steps_per_second": 1.233, + "step": 1784 + }, + { + "epoch": 0.72, + "learning_rate": 8.543999999999999e-08, + "loss": 1.2916, + "step": 1788 + }, + { + "epoch": 0.72, + "eval_loss": 1.2127509117126465, + "eval_runtime": 1.0008, + "eval_samples_per_second": 3.997, + "eval_steps_per_second": 0.999, + "step": 1788 + }, + { + "epoch": 0.72, + "learning_rate": 8.496e-08, + "loss": 1.2975, + "step": 1792 + }, + { + "epoch": 0.72, + "eval_loss": 1.2119174003601074, + "eval_runtime": 0.9106, + "eval_samples_per_second": 4.393, + "eval_steps_per_second": 1.098, + "step": 1792 + }, + { + "epoch": 0.72, + "learning_rate": 8.448e-08, + "loss": 1.3071, + "step": 1796 + }, + { + "epoch": 0.72, + "eval_loss": 1.2113782167434692, + "eval_runtime": 0.8267, + "eval_samples_per_second": 4.838, + "eval_steps_per_second": 1.21, + "step": 1796 + }, + { + "epoch": 0.72, + "learning_rate": 8.4e-08, + "loss": 1.2793, + "step": 1800 + }, + { + "epoch": 0.72, + "eval_loss": 1.2105748653411865, + "eval_runtime": 0.9729, + "eval_samples_per_second": 4.111, + "eval_steps_per_second": 1.028, + "step": 1800 + }, + { + "epoch": 0.72, + "learning_rate": 8.352e-08, + "loss": 1.2755, + "step": 1804 + }, + { + "epoch": 0.72, + "eval_loss": 1.2097927331924438, + "eval_runtime": 1.0248, + "eval_samples_per_second": 3.903, + "eval_steps_per_second": 0.976, + "step": 1804 + }, + { + "epoch": 0.72, + "learning_rate": 8.304e-08, + "loss": 1.2968, + "step": 1808 + }, + { + "epoch": 0.72, + "eval_loss": 1.2092970609664917, + "eval_runtime": 0.8732, + "eval_samples_per_second": 4.581, + "eval_steps_per_second": 1.145, + "step": 1808 + }, + { + "epoch": 0.72, + "learning_rate": 8.255999999999999e-08, + "loss": 1.3226, + "step": 1812 + }, + { + "epoch": 0.72, + "eval_loss": 1.2085187435150146, + "eval_runtime": 0.994, + "eval_samples_per_second": 4.024, + "eval_steps_per_second": 1.006, + "step": 1812 + }, + { + "epoch": 0.73, + "learning_rate": 8.208e-08, + "loss": 1.3117, + "step": 1816 + }, + { + "epoch": 0.73, + "eval_loss": 1.2078773975372314, + "eval_runtime": 0.9321, + "eval_samples_per_second": 4.291, + "eval_steps_per_second": 1.073, + "step": 1816 + }, + { + "epoch": 0.73, + "learning_rate": 8.16e-08, + "loss": 1.2957, + "step": 1820 + }, + { + "epoch": 0.73, + "eval_loss": 1.207309603691101, + "eval_runtime": 0.8433, + "eval_samples_per_second": 4.743, + "eval_steps_per_second": 1.186, + "step": 1820 + }, + { + "epoch": 0.73, + "learning_rate": 8.111999999999999e-08, + "loss": 1.2885, + "step": 1824 + }, + { + "epoch": 0.73, + "eval_loss": 1.206638216972351, + "eval_runtime": 1.0078, + "eval_samples_per_second": 3.969, + "eval_steps_per_second": 0.992, + "step": 1824 + }, + { + "epoch": 0.73, + "learning_rate": 8.063999999999999e-08, + "loss": 1.2731, + "step": 1828 + }, + { + "epoch": 0.73, + "eval_loss": 1.2057451009750366, + "eval_runtime": 0.9696, + "eval_samples_per_second": 4.125, + "eval_steps_per_second": 1.031, + "step": 1828 + }, + { + "epoch": 0.73, + "learning_rate": 8.016e-08, + "loss": 1.2821, + "step": 1832 + }, + { + "epoch": 0.73, + "eval_loss": 1.2051252126693726, + "eval_runtime": 0.8398, + "eval_samples_per_second": 4.763, + "eval_steps_per_second": 1.191, + "step": 1832 + }, + { + "epoch": 0.73, + "learning_rate": 7.968e-08, + "loss": 1.2944, + "step": 1836 + }, + { + "epoch": 0.73, + "eval_loss": 1.2045457363128662, + "eval_runtime": 0.8549, + "eval_samples_per_second": 4.679, + "eval_steps_per_second": 1.17, + "step": 1836 + }, + { + "epoch": 0.74, + "learning_rate": 7.92e-08, + "loss": 1.2768, + "step": 1840 + }, + { + "epoch": 0.74, + "eval_loss": 1.2040053606033325, + "eval_runtime": 0.9035, + "eval_samples_per_second": 4.427, + "eval_steps_per_second": 1.107, + "step": 1840 + }, + { + "epoch": 0.74, + "learning_rate": 7.872e-08, + "loss": 1.2917, + "step": 1844 + }, + { + "epoch": 0.74, + "eval_loss": 1.2034111022949219, + "eval_runtime": 0.8657, + "eval_samples_per_second": 4.621, + "eval_steps_per_second": 1.155, + "step": 1844 + }, + { + "epoch": 0.74, + "learning_rate": 7.823999999999999e-08, + "loss": 1.2935, + "step": 1848 + }, + { + "epoch": 0.74, + "eval_loss": 1.2027428150177002, + "eval_runtime": 0.8157, + "eval_samples_per_second": 4.903, + "eval_steps_per_second": 1.226, + "step": 1848 + }, + { + "epoch": 0.74, + "learning_rate": 7.775999999999999e-08, + "loss": 1.268, + "step": 1852 + }, + { + "epoch": 0.74, + "eval_loss": 1.2022608518600464, + "eval_runtime": 0.8551, + "eval_samples_per_second": 4.678, + "eval_steps_per_second": 1.169, + "step": 1852 + }, + { + "epoch": 0.74, + "learning_rate": 7.728e-08, + "loss": 1.2761, + "step": 1856 + }, + { + "epoch": 0.74, + "eval_loss": 1.2016183137893677, + "eval_runtime": 0.8348, + "eval_samples_per_second": 4.791, + "eval_steps_per_second": 1.198, + "step": 1856 + }, + { + "epoch": 0.74, + "learning_rate": 7.68e-08, + "loss": 1.3107, + "step": 1860 + }, + { + "epoch": 0.74, + "eval_loss": 1.201023817062378, + "eval_runtime": 0.8161, + "eval_samples_per_second": 4.901, + "eval_steps_per_second": 1.225, + "step": 1860 + }, + { + "epoch": 0.75, + "learning_rate": 7.632e-08, + "loss": 1.2752, + "step": 1864 + }, + { + "epoch": 0.75, + "eval_loss": 1.2003718614578247, + "eval_runtime": 1.0033, + "eval_samples_per_second": 3.987, + "eval_steps_per_second": 0.997, + "step": 1864 + }, + { + "epoch": 0.75, + "learning_rate": 7.584000000000001e-08, + "loss": 1.2661, + "step": 1868 + }, + { + "epoch": 0.75, + "eval_loss": 1.1996574401855469, + "eval_runtime": 0.8381, + "eval_samples_per_second": 4.773, + "eval_steps_per_second": 1.193, + "step": 1868 + }, + { + "epoch": 0.75, + "learning_rate": 7.536e-08, + "loss": 1.2985, + "step": 1872 + }, + { + "epoch": 0.75, + "eval_loss": 1.199149250984192, + "eval_runtime": 0.8342, + "eval_samples_per_second": 4.795, + "eval_steps_per_second": 1.199, + "step": 1872 + }, + { + "epoch": 0.75, + "learning_rate": 7.487999999999999e-08, + "loss": 1.2801, + "step": 1876 + }, + { + "epoch": 0.75, + "eval_loss": 1.1985127925872803, + "eval_runtime": 0.9225, + "eval_samples_per_second": 4.336, + "eval_steps_per_second": 1.084, + "step": 1876 + }, + { + "epoch": 0.75, + "learning_rate": 7.439999999999999e-08, + "loss": 1.2775, + "step": 1880 + }, + { + "epoch": 0.75, + "eval_loss": 1.1979784965515137, + "eval_runtime": 0.92, + "eval_samples_per_second": 4.348, + "eval_steps_per_second": 1.087, + "step": 1880 + }, + { + "epoch": 0.75, + "learning_rate": 7.392e-08, + "loss": 1.2741, + "step": 1884 + }, + { + "epoch": 0.75, + "eval_loss": 1.1976195573806763, + "eval_runtime": 0.8775, + "eval_samples_per_second": 4.558, + "eval_steps_per_second": 1.14, + "step": 1884 + }, + { + "epoch": 0.76, + "learning_rate": 7.343999999999999e-08, + "loss": 1.2747, + "step": 1888 + }, + { + "epoch": 0.76, + "eval_loss": 1.197205662727356, + "eval_runtime": 0.8885, + "eval_samples_per_second": 4.502, + "eval_steps_per_second": 1.125, + "step": 1888 + }, + { + "epoch": 0.76, + "learning_rate": 7.296e-08, + "loss": 1.2772, + "step": 1892 + }, + { + "epoch": 0.76, + "eval_loss": 1.1963942050933838, + "eval_runtime": 0.9218, + "eval_samples_per_second": 4.339, + "eval_steps_per_second": 1.085, + "step": 1892 + }, + { + "epoch": 0.76, + "learning_rate": 7.248e-08, + "loss": 1.2953, + "step": 1896 + }, + { + "epoch": 0.76, + "eval_loss": 1.1961135864257812, + "eval_runtime": 0.8111, + "eval_samples_per_second": 4.932, + "eval_steps_per_second": 1.233, + "step": 1896 + }, + { + "epoch": 0.76, + "learning_rate": 7.2e-08, + "loss": 1.3052, + "step": 1900 + }, + { + "epoch": 0.76, + "eval_loss": 1.1957508325576782, + "eval_runtime": 0.9897, + "eval_samples_per_second": 4.042, + "eval_steps_per_second": 1.01, + "step": 1900 + }, + { + "epoch": 0.76, + "learning_rate": 7.152e-08, + "loss": 1.2505, + "step": 1904 + }, + { + "epoch": 0.76, + "eval_loss": 1.1951295137405396, + "eval_runtime": 0.9069, + "eval_samples_per_second": 4.41, + "eval_steps_per_second": 1.103, + "step": 1904 + }, + { + "epoch": 0.76, + "learning_rate": 7.104e-08, + "loss": 1.3088, + "step": 1908 + }, + { + "epoch": 0.76, + "eval_loss": 1.1944581270217896, + "eval_runtime": 0.7895, + "eval_samples_per_second": 5.066, + "eval_steps_per_second": 1.267, + "step": 1908 + }, + { + "epoch": 0.76, + "learning_rate": 7.055999999999999e-08, + "loss": 1.2705, + "step": 1912 + }, + { + "epoch": 0.76, + "eval_loss": 1.1939201354980469, + "eval_runtime": 0.912, + "eval_samples_per_second": 4.386, + "eval_steps_per_second": 1.097, + "step": 1912 + }, + { + "epoch": 0.77, + "learning_rate": 7.008e-08, + "loss": 1.2606, + "step": 1916 + }, + { + "epoch": 0.77, + "eval_loss": 1.1934112310409546, + "eval_runtime": 0.948, + "eval_samples_per_second": 4.219, + "eval_steps_per_second": 1.055, + "step": 1916 + }, + { + "epoch": 0.77, + "learning_rate": 6.96e-08, + "loss": 1.2729, + "step": 1920 + }, + { + "epoch": 0.77, + "eval_loss": 1.1931556463241577, + "eval_runtime": 0.8456, + "eval_samples_per_second": 4.73, + "eval_steps_per_second": 1.183, + "step": 1920 + }, + { + "epoch": 0.77, + "learning_rate": 6.912e-08, + "loss": 1.2642, + "step": 1924 + }, + { + "epoch": 0.77, + "eval_loss": 1.1926673650741577, + "eval_runtime": 0.8112, + "eval_samples_per_second": 4.931, + "eval_steps_per_second": 1.233, + "step": 1924 + }, + { + "epoch": 0.77, + "learning_rate": 6.864e-08, + "loss": 1.2903, + "step": 1928 + }, + { + "epoch": 0.77, + "eval_loss": 1.1919676065444946, + "eval_runtime": 0.865, + "eval_samples_per_second": 4.624, + "eval_steps_per_second": 1.156, + "step": 1928 + }, + { + "epoch": 0.77, + "learning_rate": 6.816e-08, + "loss": 1.2688, + "step": 1932 + }, + { + "epoch": 0.77, + "eval_loss": 1.191756010055542, + "eval_runtime": 0.8924, + "eval_samples_per_second": 4.482, + "eval_steps_per_second": 1.121, + "step": 1932 + }, + { + "epoch": 0.77, + "learning_rate": 6.768e-08, + "loss": 1.2677, + "step": 1936 + }, + { + "epoch": 0.77, + "eval_loss": 1.1909488439559937, + "eval_runtime": 0.8423, + "eval_samples_per_second": 4.749, + "eval_steps_per_second": 1.187, + "step": 1936 + }, + { + "epoch": 0.78, + "learning_rate": 6.719999999999999e-08, + "loss": 1.2747, + "step": 1940 + }, + { + "epoch": 0.78, + "eval_loss": 1.1905359029769897, + "eval_runtime": 0.8982, + "eval_samples_per_second": 4.453, + "eval_steps_per_second": 1.113, + "step": 1940 + }, + { + "epoch": 0.78, + "learning_rate": 6.671999999999999e-08, + "loss": 1.2512, + "step": 1944 + }, + { + "epoch": 0.78, + "eval_loss": 1.19022798538208, + "eval_runtime": 0.8246, + "eval_samples_per_second": 4.851, + "eval_steps_per_second": 1.213, + "step": 1944 + }, + { + "epoch": 0.78, + "learning_rate": 6.623999999999999e-08, + "loss": 1.2651, + "step": 1948 + }, + { + "epoch": 0.78, + "eval_loss": 1.1898229122161865, + "eval_runtime": 0.8116, + "eval_samples_per_second": 4.928, + "eval_steps_per_second": 1.232, + "step": 1948 + }, + { + "epoch": 0.78, + "learning_rate": 6.576e-08, + "loss": 1.2655, + "step": 1952 + }, + { + "epoch": 0.78, + "eval_loss": 1.189262866973877, + "eval_runtime": 1.0243, + "eval_samples_per_second": 3.905, + "eval_steps_per_second": 0.976, + "step": 1952 + }, + { + "epoch": 0.78, + "learning_rate": 6.528e-08, + "loss": 1.2617, + "step": 1956 + }, + { + "epoch": 0.78, + "eval_loss": 1.1888002157211304, + "eval_runtime": 0.8812, + "eval_samples_per_second": 4.539, + "eval_steps_per_second": 1.135, + "step": 1956 + }, + { + "epoch": 0.78, + "learning_rate": 6.48e-08, + "loss": 1.2764, + "step": 1960 + }, + { + "epoch": 0.78, + "eval_loss": 1.1885006427764893, + "eval_runtime": 0.8185, + "eval_samples_per_second": 4.887, + "eval_steps_per_second": 1.222, + "step": 1960 + }, + { + "epoch": 0.79, + "learning_rate": 6.432e-08, + "loss": 1.2531, + "step": 1964 + }, + { + "epoch": 0.79, + "eval_loss": 1.188267469406128, + "eval_runtime": 0.9375, + "eval_samples_per_second": 4.267, + "eval_steps_per_second": 1.067, + "step": 1964 + }, + { + "epoch": 0.79, + "learning_rate": 6.384e-08, + "loss": 1.2911, + "step": 1968 + }, + { + "epoch": 0.79, + "eval_loss": 1.1874159574508667, + "eval_runtime": 0.9562, + "eval_samples_per_second": 4.183, + "eval_steps_per_second": 1.046, + "step": 1968 + }, + { + "epoch": 0.79, + "learning_rate": 6.335999999999999e-08, + "loss": 1.2616, + "step": 1972 + }, + { + "epoch": 0.79, + "eval_loss": 1.1871256828308105, + "eval_runtime": 0.8052, + "eval_samples_per_second": 4.968, + "eval_steps_per_second": 1.242, + "step": 1972 + }, + { + "epoch": 0.79, + "learning_rate": 6.288e-08, + "loss": 1.2537, + "step": 1976 + }, + { + "epoch": 0.79, + "eval_loss": 1.1868540048599243, + "eval_runtime": 1.0118, + "eval_samples_per_second": 3.953, + "eval_steps_per_second": 0.988, + "step": 1976 + }, + { + "epoch": 0.79, + "learning_rate": 6.239999999999999e-08, + "loss": 1.2548, + "step": 1980 + }, + { + "epoch": 0.79, + "eval_loss": 1.1864162683486938, + "eval_runtime": 1.0123, + "eval_samples_per_second": 3.951, + "eval_steps_per_second": 0.988, + "step": 1980 + }, + { + "epoch": 0.79, + "learning_rate": 6.192e-08, + "loss": 1.2722, + "step": 1984 + }, + { + "epoch": 0.79, + "eval_loss": 1.1861025094985962, + "eval_runtime": 0.8118, + "eval_samples_per_second": 4.927, + "eval_steps_per_second": 1.232, + "step": 1984 + }, + { + "epoch": 0.8, + "learning_rate": 6.144e-08, + "loss": 1.2717, + "step": 1988 + }, + { + "epoch": 0.8, + "eval_loss": 1.1857701539993286, + "eval_runtime": 0.9863, + "eval_samples_per_second": 4.056, + "eval_steps_per_second": 1.014, + "step": 1988 + }, + { + "epoch": 0.8, + "learning_rate": 6.096e-08, + "loss": 1.281, + "step": 1992 + }, + { + "epoch": 0.8, + "eval_loss": 1.1854195594787598, + "eval_runtime": 0.8058, + "eval_samples_per_second": 4.964, + "eval_steps_per_second": 1.241, + "step": 1992 + }, + { + "epoch": 0.8, + "learning_rate": 6.048e-08, + "loss": 1.2766, + "step": 1996 + }, + { + "epoch": 0.8, + "eval_loss": 1.1849565505981445, + "eval_runtime": 0.6952, + "eval_samples_per_second": 5.754, + "eval_steps_per_second": 1.438, + "step": 1996 + }, + { + "epoch": 0.8, + "learning_rate": 6e-08, + "loss": 1.2962, + "step": 2000 + }, + { + "epoch": 0.8, + "eval_loss": 1.1846204996109009, + "eval_runtime": 0.5591, + "eval_samples_per_second": 7.154, + "eval_steps_per_second": 1.789, + "step": 2000 + }, + { + "epoch": 0.8, + "learning_rate": 5.951999999999999e-08, + "loss": 1.2732, + "step": 2004 + }, + { + "epoch": 0.8, + "eval_loss": 1.1842423677444458, + "eval_runtime": 0.6989, + "eval_samples_per_second": 5.724, + "eval_steps_per_second": 1.431, + "step": 2004 + }, + { + "epoch": 0.8, + "learning_rate": 5.904e-08, + "loss": 1.2588, + "step": 2008 + }, + { + "epoch": 0.8, + "eval_loss": 1.1838123798370361, + "eval_runtime": 0.533, + "eval_samples_per_second": 7.505, + "eval_steps_per_second": 1.876, + "step": 2008 + }, + { + "epoch": 0.8, + "learning_rate": 5.856e-08, + "loss": 1.2651, + "step": 2012 + }, + { + "epoch": 0.8, + "eval_loss": 1.1836127042770386, + "eval_runtime": 0.5522, + "eval_samples_per_second": 7.243, + "eval_steps_per_second": 1.811, + "step": 2012 + }, + { + "epoch": 0.81, + "learning_rate": 5.8079999999999995e-08, + "loss": 1.2755, + "step": 2016 + }, + { + "epoch": 0.81, + "eval_loss": 1.1833029985427856, + "eval_runtime": 0.5378, + "eval_samples_per_second": 7.438, + "eval_steps_per_second": 1.859, + "step": 2016 + }, + { + "epoch": 0.81, + "learning_rate": 5.76e-08, + "loss": 1.2513, + "step": 2020 + }, + { + "epoch": 0.81, + "eval_loss": 1.1827694177627563, + "eval_runtime": 0.5697, + "eval_samples_per_second": 7.021, + "eval_steps_per_second": 1.755, + "step": 2020 + }, + { + "epoch": 0.81, + "learning_rate": 5.7120000000000005e-08, + "loss": 1.2645, + "step": 2024 + }, + { + "epoch": 0.81, + "eval_loss": 1.1824350357055664, + "eval_runtime": 0.7918, + "eval_samples_per_second": 5.052, + "eval_steps_per_second": 1.263, + "step": 2024 + }, + { + "epoch": 0.81, + "learning_rate": 5.6639999999999996e-08, + "loss": 1.2593, + "step": 2028 + }, + { + "epoch": 0.81, + "eval_loss": 1.181903600692749, + "eval_runtime": 0.7394, + "eval_samples_per_second": 5.41, + "eval_steps_per_second": 1.353, + "step": 2028 + }, + { + "epoch": 0.81, + "learning_rate": 5.616e-08, + "loss": 1.2947, + "step": 2032 + }, + { + "epoch": 0.81, + "eval_loss": 1.1818093061447144, + "eval_runtime": 0.8202, + "eval_samples_per_second": 4.877, + "eval_steps_per_second": 1.219, + "step": 2032 + }, + { + "epoch": 0.81, + "learning_rate": 5.567999999999999e-08, + "loss": 1.2599, + "step": 2036 + }, + { + "epoch": 0.81, + "eval_loss": 1.1813161373138428, + "eval_runtime": 0.8494, + "eval_samples_per_second": 4.709, + "eval_steps_per_second": 1.177, + "step": 2036 + }, + { + "epoch": 0.82, + "learning_rate": 5.52e-08, + "loss": 1.2094, + "step": 2040 + }, + { + "epoch": 0.82, + "eval_loss": 1.1808911561965942, + "eval_runtime": 0.5448, + "eval_samples_per_second": 7.342, + "eval_steps_per_second": 1.835, + "step": 2040 + }, + { + "epoch": 0.82, + "learning_rate": 5.4719999999999996e-08, + "loss": 1.2707, + "step": 2044 + }, + { + "epoch": 0.82, + "eval_loss": 1.180704116821289, + "eval_runtime": 0.5486, + "eval_samples_per_second": 7.291, + "eval_steps_per_second": 1.823, + "step": 2044 + }, + { + "epoch": 0.82, + "learning_rate": 5.4239999999999995e-08, + "loss": 1.2653, + "step": 2048 + }, + { + "epoch": 0.82, + "eval_loss": 1.1802550554275513, + "eval_runtime": 0.5729, + "eval_samples_per_second": 6.982, + "eval_steps_per_second": 1.745, + "step": 2048 + }, + { + "epoch": 0.82, + "learning_rate": 5.376e-08, + "loss": 1.2637, + "step": 2052 + }, + { + "epoch": 0.82, + "eval_loss": 1.1799267530441284, + "eval_runtime": 0.55, + "eval_samples_per_second": 7.273, + "eval_steps_per_second": 1.818, + "step": 2052 + }, + { + "epoch": 0.82, + "learning_rate": 5.328e-08, + "loss": 1.2708, + "step": 2056 + }, + { + "epoch": 0.82, + "eval_loss": 1.1799120903015137, + "eval_runtime": 0.5421, + "eval_samples_per_second": 7.379, + "eval_steps_per_second": 1.845, + "step": 2056 + }, + { + "epoch": 0.82, + "learning_rate": 5.2799999999999996e-08, + "loss": 1.283, + "step": 2060 + }, + { + "epoch": 0.82, + "eval_loss": 1.1794401407241821, + "eval_runtime": 0.7544, + "eval_samples_per_second": 5.302, + "eval_steps_per_second": 1.325, + "step": 2060 + }, + { + "epoch": 0.83, + "learning_rate": 5.2319999999999995e-08, + "loss": 1.2853, + "step": 2064 + }, + { + "epoch": 0.83, + "eval_loss": 1.1792134046554565, + "eval_runtime": 0.7805, + "eval_samples_per_second": 5.125, + "eval_steps_per_second": 1.281, + "step": 2064 + }, + { + "epoch": 0.83, + "learning_rate": 5.184e-08, + "loss": 1.2617, + "step": 2068 + }, + { + "epoch": 0.83, + "eval_loss": 1.1792347431182861, + "eval_runtime": 0.7985, + "eval_samples_per_second": 5.01, + "eval_steps_per_second": 1.252, + "step": 2068 + }, + { + "epoch": 0.83, + "learning_rate": 5.136e-08, + "loss": 1.2476, + "step": 2072 + }, + { + "epoch": 0.83, + "eval_loss": 1.178797721862793, + "eval_runtime": 0.8348, + "eval_samples_per_second": 4.792, + "eval_steps_per_second": 1.198, + "step": 2072 + }, + { + "epoch": 0.83, + "learning_rate": 5.0879999999999996e-08, + "loss": 1.2355, + "step": 2076 + }, + { + "epoch": 0.83, + "eval_loss": 1.1785314083099365, + "eval_runtime": 0.5603, + "eval_samples_per_second": 7.139, + "eval_steps_per_second": 1.785, + "step": 2076 + }, + { + "epoch": 0.83, + "learning_rate": 5.04e-08, + "loss": 1.2348, + "step": 2080 + }, + { + "epoch": 0.83, + "eval_loss": 1.1781315803527832, + "eval_runtime": 0.5683, + "eval_samples_per_second": 7.038, + "eval_steps_per_second": 1.76, + "step": 2080 + }, + { + "epoch": 0.83, + "learning_rate": 4.991999999999999e-08, + "loss": 1.2468, + "step": 2084 + }, + { + "epoch": 0.83, + "eval_loss": 1.1779569387435913, + "eval_runtime": 0.5524, + "eval_samples_per_second": 7.241, + "eval_steps_per_second": 1.81, + "step": 2084 + }, + { + "epoch": 0.84, + "learning_rate": 4.944e-08, + "loss": 1.2715, + "step": 2088 + }, + { + "epoch": 0.84, + "eval_loss": 1.177626609802246, + "eval_runtime": 0.5397, + "eval_samples_per_second": 7.412, + "eval_steps_per_second": 1.853, + "step": 2088 + }, + { + "epoch": 0.84, + "learning_rate": 4.896e-08, + "loss": 1.2502, + "step": 2092 + }, + { + "epoch": 0.84, + "eval_loss": 1.1772172451019287, + "eval_runtime": 0.5501, + "eval_samples_per_second": 7.271, + "eval_steps_per_second": 1.818, + "step": 2092 + }, + { + "epoch": 0.84, + "learning_rate": 4.8479999999999995e-08, + "loss": 1.284, + "step": 2096 + }, + { + "epoch": 0.84, + "eval_loss": 1.177082896232605, + "eval_runtime": 0.7726, + "eval_samples_per_second": 5.177, + "eval_steps_per_second": 1.294, + "step": 2096 + }, + { + "epoch": 0.84, + "learning_rate": 4.8e-08, + "loss": 1.2417, + "step": 2100 + }, + { + "epoch": 0.84, + "eval_loss": 1.1768074035644531, + "eval_runtime": 0.8241, + "eval_samples_per_second": 4.854, + "eval_steps_per_second": 1.213, + "step": 2100 + }, + { + "epoch": 0.84, + "learning_rate": 4.7520000000000005e-08, + "loss": 1.2516, + "step": 2104 + }, + { + "epoch": 0.84, + "eval_loss": 1.1765819787979126, + "eval_runtime": 0.7686, + "eval_samples_per_second": 5.204, + "eval_steps_per_second": 1.301, + "step": 2104 + }, + { + "epoch": 0.84, + "learning_rate": 4.7039999999999996e-08, + "loss": 1.2748, + "step": 2108 + }, + { + "epoch": 0.84, + "eval_loss": 1.1763153076171875, + "eval_runtime": 0.5397, + "eval_samples_per_second": 7.412, + "eval_steps_per_second": 1.853, + "step": 2108 + }, + { + "epoch": 0.84, + "learning_rate": 4.656e-08, + "loss": 1.2744, + "step": 2112 + }, + { + "epoch": 0.84, + "eval_loss": 1.1761606931686401, + "eval_runtime": 0.559, + "eval_samples_per_second": 7.156, + "eval_steps_per_second": 1.789, + "step": 2112 + }, + { + "epoch": 0.85, + "learning_rate": 4.607999999999999e-08, + "loss": 1.2551, + "step": 2116 + }, + { + "epoch": 0.85, + "eval_loss": 1.1757404804229736, + "eval_runtime": 0.5541, + "eval_samples_per_second": 7.219, + "eval_steps_per_second": 1.805, + "step": 2116 + }, + { + "epoch": 0.85, + "learning_rate": 4.56e-08, + "loss": 1.2687, + "step": 2120 + }, + { + "epoch": 0.85, + "eval_loss": 1.175508737564087, + "eval_runtime": 0.5519, + "eval_samples_per_second": 7.247, + "eval_steps_per_second": 1.812, + "step": 2120 + }, + { + "epoch": 0.85, + "learning_rate": 4.512e-08, + "loss": 1.2575, + "step": 2124 + }, + { + "epoch": 0.85, + "eval_loss": 1.1754050254821777, + "eval_runtime": 0.5573, + "eval_samples_per_second": 7.178, + "eval_steps_per_second": 1.794, + "step": 2124 + }, + { + "epoch": 0.85, + "learning_rate": 4.4639999999999995e-08, + "loss": 1.2501, + "step": 2128 + }, + { + "epoch": 0.85, + "eval_loss": 1.1749186515808105, + "eval_runtime": 0.8148, + "eval_samples_per_second": 4.909, + "eval_steps_per_second": 1.227, + "step": 2128 + }, + { + "epoch": 0.85, + "learning_rate": 4.416e-08, + "loss": 1.2728, + "step": 2132 + }, + { + "epoch": 0.85, + "eval_loss": 1.1750520467758179, + "eval_runtime": 0.8081, + "eval_samples_per_second": 4.95, + "eval_steps_per_second": 1.237, + "step": 2132 + }, + { + "epoch": 0.85, + "learning_rate": 4.368e-08, + "loss": 1.2411, + "step": 2136 + }, + { + "epoch": 0.85, + "eval_loss": 1.174950122833252, + "eval_runtime": 0.7994, + "eval_samples_per_second": 5.004, + "eval_steps_per_second": 1.251, + "step": 2136 + }, + { + "epoch": 0.86, + "learning_rate": 4.3199999999999996e-08, + "loss": 1.2505, + "step": 2140 + }, + { + "epoch": 0.86, + "eval_loss": 1.1743714809417725, + "eval_runtime": 0.8793, + "eval_samples_per_second": 4.549, + "eval_steps_per_second": 1.137, + "step": 2140 + }, + { + "epoch": 0.86, + "learning_rate": 4.2719999999999995e-08, + "loss": 1.2363, + "step": 2144 + }, + { + "epoch": 0.86, + "eval_loss": 1.1743040084838867, + "eval_runtime": 0.5548, + "eval_samples_per_second": 7.209, + "eval_steps_per_second": 1.802, + "step": 2144 + }, + { + "epoch": 0.86, + "learning_rate": 4.224e-08, + "loss": 1.2408, + "step": 2148 + }, + { + "epoch": 0.86, + "eval_loss": 1.1740877628326416, + "eval_runtime": 0.5539, + "eval_samples_per_second": 7.221, + "eval_steps_per_second": 1.805, + "step": 2148 + }, + { + "epoch": 0.86, + "learning_rate": 4.176e-08, + "loss": 1.25, + "step": 2152 + }, + { + "epoch": 0.86, + "eval_loss": 1.1735849380493164, + "eval_runtime": 0.5552, + "eval_samples_per_second": 7.204, + "eval_steps_per_second": 1.801, + "step": 2152 + }, + { + "epoch": 0.86, + "learning_rate": 4.1279999999999996e-08, + "loss": 1.2729, + "step": 2156 + }, + { + "epoch": 0.86, + "eval_loss": 1.1735070943832397, + "eval_runtime": 0.5396, + "eval_samples_per_second": 7.413, + "eval_steps_per_second": 1.853, + "step": 2156 + }, + { + "epoch": 0.86, + "learning_rate": 4.08e-08, + "loss": 1.2467, + "step": 2160 + }, + { + "epoch": 0.86, + "eval_loss": 1.1735484600067139, + "eval_runtime": 0.5593, + "eval_samples_per_second": 7.152, + "eval_steps_per_second": 1.788, + "step": 2160 + }, + { + "epoch": 0.87, + "learning_rate": 4.031999999999999e-08, + "loss": 1.2377, + "step": 2164 + }, + { + "epoch": 0.87, + "eval_loss": 1.1734050512313843, + "eval_runtime": 0.7724, + "eval_samples_per_second": 5.179, + "eval_steps_per_second": 1.295, + "step": 2164 + }, + { + "epoch": 0.87, + "learning_rate": 3.984e-08, + "loss": 1.2876, + "step": 2168 + }, + { + "epoch": 0.87, + "eval_loss": 1.172935962677002, + "eval_runtime": 0.7737, + "eval_samples_per_second": 5.17, + "eval_steps_per_second": 1.292, + "step": 2168 + }, + { + "epoch": 0.87, + "learning_rate": 3.936e-08, + "loss": 1.255, + "step": 2172 + }, + { + "epoch": 0.87, + "eval_loss": 1.172789216041565, + "eval_runtime": 0.8022, + "eval_samples_per_second": 4.986, + "eval_steps_per_second": 1.247, + "step": 2172 + }, + { + "epoch": 0.87, + "learning_rate": 3.8879999999999995e-08, + "loss": 1.2471, + "step": 2176 + }, + { + "epoch": 0.87, + "eval_loss": 1.1724050045013428, + "eval_runtime": 0.5443, + "eval_samples_per_second": 7.349, + "eval_steps_per_second": 1.837, + "step": 2176 + }, + { + "epoch": 0.87, + "learning_rate": 3.84e-08, + "loss": 1.2641, + "step": 2180 + }, + { + "epoch": 0.87, + "eval_loss": 1.1726024150848389, + "eval_runtime": 0.5537, + "eval_samples_per_second": 7.224, + "eval_steps_per_second": 1.806, + "step": 2180 + }, + { + "epoch": 0.87, + "learning_rate": 3.7920000000000005e-08, + "loss": 1.2594, + "step": 2184 + }, + { + "epoch": 0.87, + "eval_loss": 1.1722073554992676, + "eval_runtime": 0.5672, + "eval_samples_per_second": 7.053, + "eval_steps_per_second": 1.763, + "step": 2184 + }, + { + "epoch": 0.88, + "learning_rate": 3.7439999999999996e-08, + "loss": 1.2803, + "step": 2188 + }, + { + "epoch": 0.88, + "eval_loss": 1.1718605756759644, + "eval_runtime": 0.5653, + "eval_samples_per_second": 7.076, + "eval_steps_per_second": 1.769, + "step": 2188 + }, + { + "epoch": 0.88, + "learning_rate": 3.696e-08, + "loss": 1.2404, + "step": 2192 + }, + { + "epoch": 0.88, + "eval_loss": 1.1717666387557983, + "eval_runtime": 0.5424, + "eval_samples_per_second": 7.375, + "eval_steps_per_second": 1.844, + "step": 2192 + }, + { + "epoch": 0.88, + "learning_rate": 3.648e-08, + "loss": 1.2538, + "step": 2196 + }, + { + "epoch": 0.88, + "eval_loss": 1.1718065738677979, + "eval_runtime": 0.8245, + "eval_samples_per_second": 4.852, + "eval_steps_per_second": 1.213, + "step": 2196 + }, + { + "epoch": 0.88, + "learning_rate": 3.6e-08, + "loss": 1.2423, + "step": 2200 + }, + { + "epoch": 0.88, + "eval_loss": 1.1715894937515259, + "eval_runtime": 0.7792, + "eval_samples_per_second": 5.134, + "eval_steps_per_second": 1.283, + "step": 2200 + }, + { + "epoch": 0.88, + "learning_rate": 3.552e-08, + "loss": 1.2344, + "step": 2204 + }, + { + "epoch": 0.88, + "eval_loss": 1.1713180541992188, + "eval_runtime": 0.8087, + "eval_samples_per_second": 4.946, + "eval_steps_per_second": 1.237, + "step": 2204 + }, + { + "epoch": 0.88, + "learning_rate": 3.504e-08, + "loss": 1.2646, + "step": 2208 + }, + { + "epoch": 0.88, + "eval_loss": 1.1710913181304932, + "eval_runtime": 0.7178, + "eval_samples_per_second": 5.572, + "eval_steps_per_second": 1.393, + "step": 2208 + }, + { + "epoch": 0.88, + "learning_rate": 3.456e-08, + "loss": 1.2501, + "step": 2212 + }, + { + "epoch": 0.88, + "eval_loss": 1.1711719036102295, + "eval_runtime": 0.5726, + "eval_samples_per_second": 6.985, + "eval_steps_per_second": 1.746, + "step": 2212 + }, + { + "epoch": 0.89, + "learning_rate": 3.408e-08, + "loss": 1.2455, + "step": 2216 + }, + { + "epoch": 0.89, + "eval_loss": 1.1708979606628418, + "eval_runtime": 0.5581, + "eval_samples_per_second": 7.168, + "eval_steps_per_second": 1.792, + "step": 2216 + }, + { + "epoch": 0.89, + "learning_rate": 3.3599999999999996e-08, + "loss": 1.2588, + "step": 2220 + }, + { + "epoch": 0.89, + "eval_loss": 1.170513391494751, + "eval_runtime": 0.5523, + "eval_samples_per_second": 7.242, + "eval_steps_per_second": 1.811, + "step": 2220 + }, + { + "epoch": 0.89, + "learning_rate": 3.3119999999999995e-08, + "loss": 1.2629, + "step": 2224 + }, + { + "epoch": 0.89, + "eval_loss": 1.1705873012542725, + "eval_runtime": 0.5461, + "eval_samples_per_second": 7.325, + "eval_steps_per_second": 1.831, + "step": 2224 + }, + { + "epoch": 0.89, + "learning_rate": 3.264e-08, + "loss": 1.2572, + "step": 2228 + }, + { + "epoch": 0.89, + "eval_loss": 1.1705511808395386, + "eval_runtime": 0.6444, + "eval_samples_per_second": 6.208, + "eval_steps_per_second": 1.552, + "step": 2228 + }, + { + "epoch": 0.89, + "learning_rate": 3.216e-08, + "loss": 1.2625, + "step": 2232 + }, + { + "epoch": 0.89, + "eval_loss": 1.1704365015029907, + "eval_runtime": 0.7974, + "eval_samples_per_second": 5.016, + "eval_steps_per_second": 1.254, + "step": 2232 + }, + { + "epoch": 0.89, + "learning_rate": 3.1679999999999996e-08, + "loss": 1.2479, + "step": 2236 + }, + { + "epoch": 0.89, + "eval_loss": 1.16998291015625, + "eval_runtime": 0.783, + "eval_samples_per_second": 5.108, + "eval_steps_per_second": 1.277, + "step": 2236 + }, + { + "epoch": 0.9, + "learning_rate": 3.1199999999999995e-08, + "loss": 1.2698, + "step": 2240 + }, + { + "epoch": 0.9, + "eval_loss": 1.1701174974441528, + "eval_runtime": 0.778, + "eval_samples_per_second": 5.141, + "eval_steps_per_second": 1.285, + "step": 2240 + }, + { + "epoch": 0.9, + "learning_rate": 3.072e-08, + "loss": 1.2619, + "step": 2244 + }, + { + "epoch": 0.9, + "eval_loss": 1.1698575019836426, + "eval_runtime": 0.5528, + "eval_samples_per_second": 7.235, + "eval_steps_per_second": 1.809, + "step": 2244 + }, + { + "epoch": 0.9, + "learning_rate": 3.024e-08, + "loss": 1.2455, + "step": 2248 + }, + { + "epoch": 0.9, + "eval_loss": 1.1696205139160156, + "eval_runtime": 0.5491, + "eval_samples_per_second": 7.285, + "eval_steps_per_second": 1.821, + "step": 2248 + }, + { + "epoch": 0.9, + "learning_rate": 2.9759999999999996e-08, + "loss": 1.2523, + "step": 2252 + }, + { + "epoch": 0.9, + "eval_loss": 1.1695911884307861, + "eval_runtime": 0.5404, + "eval_samples_per_second": 7.402, + "eval_steps_per_second": 1.85, + "step": 2252 + }, + { + "epoch": 0.9, + "learning_rate": 2.928e-08, + "loss": 1.2695, + "step": 2256 + }, + { + "epoch": 0.9, + "eval_loss": 1.1692249774932861, + "eval_runtime": 0.55, + "eval_samples_per_second": 7.273, + "eval_steps_per_second": 1.818, + "step": 2256 + }, + { + "epoch": 0.9, + "learning_rate": 2.88e-08, + "loss": 1.258, + "step": 2260 + }, + { + "epoch": 0.9, + "eval_loss": 1.1691182851791382, + "eval_runtime": 0.5535, + "eval_samples_per_second": 7.227, + "eval_steps_per_second": 1.807, + "step": 2260 + }, + { + "epoch": 0.91, + "learning_rate": 2.8319999999999998e-08, + "loss": 1.2393, + "step": 2264 + }, + { + "epoch": 0.91, + "eval_loss": 1.169084906578064, + "eval_runtime": 0.7449, + "eval_samples_per_second": 5.37, + "eval_steps_per_second": 1.342, + "step": 2264 + }, + { + "epoch": 0.91, + "learning_rate": 2.7839999999999997e-08, + "loss": 1.2567, + "step": 2268 + }, + { + "epoch": 0.91, + "eval_loss": 1.1689860820770264, + "eval_runtime": 0.7442, + "eval_samples_per_second": 5.375, + "eval_steps_per_second": 1.344, + "step": 2268 + }, + { + "epoch": 0.91, + "learning_rate": 2.7359999999999998e-08, + "loss": 1.2373, + "step": 2272 + }, + { + "epoch": 0.91, + "eval_loss": 1.1687620878219604, + "eval_runtime": 0.7861, + "eval_samples_per_second": 5.089, + "eval_steps_per_second": 1.272, + "step": 2272 + }, + { + "epoch": 0.91, + "learning_rate": 2.688e-08, + "loss": 1.2797, + "step": 2276 + }, + { + "epoch": 0.91, + "eval_loss": 1.1686581373214722, + "eval_runtime": 0.8296, + "eval_samples_per_second": 4.822, + "eval_steps_per_second": 1.205, + "step": 2276 + }, + { + "epoch": 0.91, + "learning_rate": 2.6399999999999998e-08, + "loss": 1.2646, + "step": 2280 + }, + { + "epoch": 0.91, + "eval_loss": 1.1686054468154907, + "eval_runtime": 0.5581, + "eval_samples_per_second": 7.167, + "eval_steps_per_second": 1.792, + "step": 2280 + }, + { + "epoch": 0.91, + "learning_rate": 2.592e-08, + "loss": 1.2502, + "step": 2284 + }, + { + "epoch": 0.91, + "eval_loss": 1.16860032081604, + "eval_runtime": 0.5609, + "eval_samples_per_second": 7.131, + "eval_steps_per_second": 1.783, + "step": 2284 + }, + { + "epoch": 0.92, + "learning_rate": 2.5439999999999998e-08, + "loss": 1.2317, + "step": 2288 + }, + { + "epoch": 0.92, + "eval_loss": 1.1684176921844482, + "eval_runtime": 0.5432, + "eval_samples_per_second": 7.363, + "eval_steps_per_second": 1.841, + "step": 2288 + }, + { + "epoch": 0.92, + "learning_rate": 2.4959999999999997e-08, + "loss": 1.2443, + "step": 2292 + }, + { + "epoch": 0.92, + "eval_loss": 1.168290138244629, + "eval_runtime": 0.5516, + "eval_samples_per_second": 7.251, + "eval_steps_per_second": 1.813, + "step": 2292 + }, + { + "epoch": 0.92, + "learning_rate": 2.448e-08, + "loss": 1.2563, + "step": 2296 + }, + { + "epoch": 0.92, + "eval_loss": 1.1683218479156494, + "eval_runtime": 0.5823, + "eval_samples_per_second": 6.87, + "eval_steps_per_second": 1.717, + "step": 2296 + }, + { + "epoch": 0.92, + "learning_rate": 2.4e-08, + "loss": 1.2485, + "step": 2300 + }, + { + "epoch": 0.92, + "eval_loss": 1.168270230293274, + "eval_runtime": 0.8027, + "eval_samples_per_second": 4.983, + "eval_steps_per_second": 1.246, + "step": 2300 + }, + { + "epoch": 0.92, + "learning_rate": 2.3519999999999998e-08, + "loss": 1.2542, + "step": 2304 + }, + { + "epoch": 0.92, + "eval_loss": 1.1681807041168213, + "eval_runtime": 0.8267, + "eval_samples_per_second": 4.839, + "eval_steps_per_second": 1.21, + "step": 2304 + }, + { + "epoch": 0.92, + "learning_rate": 2.3039999999999997e-08, + "loss": 1.2381, + "step": 2308 + }, + { + "epoch": 0.92, + "eval_loss": 1.1679259538650513, + "eval_runtime": 0.8423, + "eval_samples_per_second": 4.749, + "eval_steps_per_second": 1.187, + "step": 2308 + }, + { + "epoch": 0.92, + "learning_rate": 2.256e-08, + "loss": 1.2244, + "step": 2312 + }, + { + "epoch": 0.92, + "eval_loss": 1.167891502380371, + "eval_runtime": 0.5627, + "eval_samples_per_second": 7.109, + "eval_steps_per_second": 1.777, + "step": 2312 + }, + { + "epoch": 0.93, + "learning_rate": 2.208e-08, + "loss": 1.2725, + "step": 2316 + }, + { + "epoch": 0.93, + "eval_loss": 1.1678441762924194, + "eval_runtime": 0.5993, + "eval_samples_per_second": 6.675, + "eval_steps_per_second": 1.669, + "step": 2316 + }, + { + "epoch": 0.93, + "learning_rate": 2.1599999999999998e-08, + "loss": 1.2441, + "step": 2320 + }, + { + "epoch": 0.93, + "eval_loss": 1.1677234172821045, + "eval_runtime": 0.671, + "eval_samples_per_second": 5.961, + "eval_steps_per_second": 1.49, + "step": 2320 + }, + { + "epoch": 0.93, + "learning_rate": 2.112e-08, + "loss": 1.2441, + "step": 2324 + }, + { + "epoch": 0.93, + "eval_loss": 1.1677547693252563, + "eval_runtime": 0.5853, + "eval_samples_per_second": 6.834, + "eval_steps_per_second": 1.708, + "step": 2324 + }, + { + "epoch": 0.93, + "learning_rate": 2.0639999999999998e-08, + "loss": 1.2546, + "step": 2328 + }, + { + "epoch": 0.93, + "eval_loss": 1.1675907373428345, + "eval_runtime": 1.0373, + "eval_samples_per_second": 3.856, + "eval_steps_per_second": 0.964, + "step": 2328 + }, + { + "epoch": 0.93, + "learning_rate": 2.0159999999999997e-08, + "loss": 1.2279, + "step": 2332 + }, + { + "epoch": 0.93, + "eval_loss": 1.1674381494522095, + "eval_runtime": 1.2312, + "eval_samples_per_second": 3.249, + "eval_steps_per_second": 0.812, + "step": 2332 + }, + { + "epoch": 0.93, + "learning_rate": 1.968e-08, + "loss": 1.2635, + "step": 2336 + }, + { + "epoch": 0.93, + "eval_loss": 1.1675413846969604, + "eval_runtime": 1.8946, + "eval_samples_per_second": 2.111, + "eval_steps_per_second": 0.528, + "step": 2336 + }, + { + "epoch": 0.94, + "learning_rate": 1.92e-08, + "loss": 1.2572, + "step": 2340 + }, + { + "epoch": 0.94, + "eval_loss": 1.1673345565795898, + "eval_runtime": 1.8538, + "eval_samples_per_second": 2.158, + "eval_steps_per_second": 0.539, + "step": 2340 + }, + { + "epoch": 0.94, + "learning_rate": 1.8719999999999998e-08, + "loss": 1.2421, + "step": 2344 + }, + { + "epoch": 0.94, + "eval_loss": 1.1672680377960205, + "eval_runtime": 0.5759, + "eval_samples_per_second": 6.945, + "eval_steps_per_second": 1.736, + "step": 2344 + }, + { + "epoch": 0.94, + "learning_rate": 1.824e-08, + "loss": 1.2022, + "step": 2348 + }, + { + "epoch": 0.94, + "eval_loss": 1.1671053171157837, + "eval_runtime": 0.8746, + "eval_samples_per_second": 4.574, + "eval_steps_per_second": 1.143, + "step": 2348 + }, + { + "epoch": 0.94, + "learning_rate": 1.776e-08, + "loss": 1.2307, + "step": 2352 + }, + { + "epoch": 0.94, + "eval_loss": 1.167036771774292, + "eval_runtime": 0.5771, + "eval_samples_per_second": 6.931, + "eval_steps_per_second": 1.733, + "step": 2352 + }, + { + "epoch": 0.94, + "learning_rate": 1.728e-08, + "loss": 1.2525, + "step": 2356 + }, + { + "epoch": 0.94, + "eval_loss": 1.1670310497283936, + "eval_runtime": 0.5992, + "eval_samples_per_second": 6.675, + "eval_steps_per_second": 1.669, + "step": 2356 + }, + { + "epoch": 0.94, + "learning_rate": 1.6799999999999998e-08, + "loss": 1.2353, + "step": 2360 + }, + { + "epoch": 0.94, + "eval_loss": 1.1670902967453003, + "eval_runtime": 1.3469, + "eval_samples_per_second": 2.97, + "eval_steps_per_second": 0.742, + "step": 2360 + }, + { + "epoch": 0.95, + "learning_rate": 1.632e-08, + "loss": 1.25, + "step": 2364 + }, + { + "epoch": 0.95, + "eval_loss": 1.1672464609146118, + "eval_runtime": 0.8027, + "eval_samples_per_second": 4.983, + "eval_steps_per_second": 1.246, + "step": 2364 + }, + { + "epoch": 0.95, + "learning_rate": 1.5839999999999998e-08, + "loss": 1.2493, + "step": 2368 + }, + { + "epoch": 0.95, + "eval_loss": 1.1669551134109497, + "eval_runtime": 0.7946, + "eval_samples_per_second": 5.034, + "eval_steps_per_second": 1.258, + "step": 2368 + }, + { + "epoch": 0.95, + "learning_rate": 1.536e-08, + "loss": 1.2453, + "step": 2372 + }, + { + "epoch": 0.95, + "eval_loss": 1.167033076286316, + "eval_runtime": 0.7944, + "eval_samples_per_second": 5.035, + "eval_steps_per_second": 1.259, + "step": 2372 + }, + { + "epoch": 0.95, + "learning_rate": 1.4879999999999998e-08, + "loss": 1.2714, + "step": 2376 + }, + { + "epoch": 0.95, + "eval_loss": 1.1669402122497559, + "eval_runtime": 0.6166, + "eval_samples_per_second": 6.487, + "eval_steps_per_second": 1.622, + "step": 2376 + }, + { + "epoch": 0.95, + "learning_rate": 1.44e-08, + "loss": 1.2435, + "step": 2380 + }, + { + "epoch": 0.95, + "eval_loss": 1.166806936264038, + "eval_runtime": 0.5575, + "eval_samples_per_second": 7.175, + "eval_steps_per_second": 1.794, + "step": 2380 + }, + { + "epoch": 0.95, + "learning_rate": 1.3919999999999998e-08, + "loss": 1.2518, + "step": 2384 + }, + { + "epoch": 0.95, + "eval_loss": 1.1668052673339844, + "eval_runtime": 0.562, + "eval_samples_per_second": 7.117, + "eval_steps_per_second": 1.779, + "step": 2384 + }, + { + "epoch": 0.96, + "learning_rate": 1.344e-08, + "loss": 1.2594, + "step": 2388 + }, + { + "epoch": 0.96, + "eval_loss": 1.166872262954712, + "eval_runtime": 0.5759, + "eval_samples_per_second": 6.946, + "eval_steps_per_second": 1.736, + "step": 2388 + }, + { + "epoch": 0.96, + "learning_rate": 1.296e-08, + "loss": 1.2149, + "step": 2392 + }, + { + "epoch": 0.96, + "eval_loss": 1.1669771671295166, + "eval_runtime": 0.777, + "eval_samples_per_second": 5.148, + "eval_steps_per_second": 1.287, + "step": 2392 + }, + { + "epoch": 0.96, + "learning_rate": 1.2479999999999998e-08, + "loss": 1.2676, + "step": 2396 + }, + { + "epoch": 0.96, + "eval_loss": 1.1666895151138306, + "eval_runtime": 0.8143, + "eval_samples_per_second": 4.912, + "eval_steps_per_second": 1.228, + "step": 2396 + }, + { + "epoch": 0.96, + "learning_rate": 1.2e-08, + "loss": 1.2337, + "step": 2400 + }, + { + "epoch": 0.96, + "eval_loss": 1.1668546199798584, + "eval_runtime": 0.8514, + "eval_samples_per_second": 4.698, + "eval_steps_per_second": 1.175, + "step": 2400 + }, + { + "epoch": 0.96, + "learning_rate": 1.1519999999999998e-08, + "loss": 1.2329, + "step": 2404 + }, + { + "epoch": 0.96, + "eval_loss": 1.1665568351745605, + "eval_runtime": 0.5814, + "eval_samples_per_second": 6.88, + "eval_steps_per_second": 1.72, + "step": 2404 + }, + { + "epoch": 0.96, + "learning_rate": 1.104e-08, + "loss": 1.269, + "step": 2408 + }, + { + "epoch": 0.96, + "eval_loss": 1.1666715145111084, + "eval_runtime": 0.5762, + "eval_samples_per_second": 6.942, + "eval_steps_per_second": 1.735, + "step": 2408 + }, + { + "epoch": 0.96, + "learning_rate": 1.056e-08, + "loss": 1.2298, + "step": 2412 + }, + { + "epoch": 0.96, + "eval_loss": 1.1665081977844238, + "eval_runtime": 0.6241, + "eval_samples_per_second": 6.409, + "eval_steps_per_second": 1.602, + "step": 2412 + }, + { + "epoch": 0.97, + "learning_rate": 1.0079999999999998e-08, + "loss": 1.2481, + "step": 2416 + }, + { + "epoch": 0.97, + "eval_loss": 1.1666505336761475, + "eval_runtime": 0.5768, + "eval_samples_per_second": 6.934, + "eval_steps_per_second": 1.734, + "step": 2416 + }, + { + "epoch": 0.97, + "learning_rate": 9.6e-09, + "loss": 1.2674, + "step": 2420 + }, + { + "epoch": 0.97, + "eval_loss": 1.1667877435684204, + "eval_runtime": 0.555, + "eval_samples_per_second": 7.207, + "eval_steps_per_second": 1.802, + "step": 2420 + }, + { + "epoch": 0.97, + "learning_rate": 9.12e-09, + "loss": 1.2482, + "step": 2424 + }, + { + "epoch": 0.97, + "eval_loss": 1.1666263341903687, + "eval_runtime": 0.7907, + "eval_samples_per_second": 5.058, + "eval_steps_per_second": 1.265, + "step": 2424 + }, + { + "epoch": 0.97, + "learning_rate": 8.64e-09, + "loss": 1.2604, + "step": 2428 + }, + { + "epoch": 0.97, + "eval_loss": 1.1666624546051025, + "eval_runtime": 0.9212, + "eval_samples_per_second": 4.342, + "eval_steps_per_second": 1.086, + "step": 2428 + }, + { + "epoch": 0.97, + "learning_rate": 8.16e-09, + "loss": 1.2471, + "step": 2432 + }, + { + "epoch": 0.97, + "eval_loss": 1.1666383743286133, + "eval_runtime": 0.9451, + "eval_samples_per_second": 4.232, + "eval_steps_per_second": 1.058, + "step": 2432 + }, + { + "epoch": 0.97, + "learning_rate": 7.68e-09, + "loss": 1.2069, + "step": 2436 + }, + { + "epoch": 0.97, + "eval_loss": 1.1664601564407349, + "eval_runtime": 0.566, + "eval_samples_per_second": 7.067, + "eval_steps_per_second": 1.767, + "step": 2436 + }, + { + "epoch": 0.98, + "learning_rate": 7.2e-09, + "loss": 1.2734, + "step": 2440 + }, + { + "epoch": 0.98, + "eval_loss": 1.1667420864105225, + "eval_runtime": 0.5729, + "eval_samples_per_second": 6.982, + "eval_steps_per_second": 1.746, + "step": 2440 + }, + { + "epoch": 0.98, + "learning_rate": 6.72e-09, + "loss": 1.239, + "step": 2444 + }, + { + "epoch": 0.98, + "eval_loss": 1.1666040420532227, + "eval_runtime": 0.5792, + "eval_samples_per_second": 6.906, + "eval_steps_per_second": 1.727, + "step": 2444 + }, + { + "epoch": 0.98, + "learning_rate": 6.239999999999999e-09, + "loss": 1.2245, + "step": 2448 + }, + { + "epoch": 0.98, + "eval_loss": 1.1663960218429565, + "eval_runtime": 0.6104, + "eval_samples_per_second": 6.553, + "eval_steps_per_second": 1.638, + "step": 2448 + }, + { + "epoch": 0.98, + "learning_rate": 5.759999999999999e-09, + "loss": 1.244, + "step": 2452 + }, + { + "epoch": 0.98, + "eval_loss": 1.1664263010025024, + "eval_runtime": 0.5997, + "eval_samples_per_second": 6.669, + "eval_steps_per_second": 1.667, + "step": 2452 + }, + { + "epoch": 0.98, + "learning_rate": 5.28e-09, + "loss": 1.2458, + "step": 2456 + }, + { + "epoch": 0.98, + "eval_loss": 1.1665791273117065, + "eval_runtime": 0.8211, + "eval_samples_per_second": 4.872, + "eval_steps_per_second": 1.218, + "step": 2456 + }, + { + "epoch": 0.98, + "learning_rate": 4.8e-09, + "loss": 1.2566, + "step": 2460 + }, + { + "epoch": 0.98, + "eval_loss": 1.1664774417877197, + "eval_runtime": 0.8342, + "eval_samples_per_second": 4.795, + "eval_steps_per_second": 1.199, + "step": 2460 + }, + { + "epoch": 0.99, + "learning_rate": 4.32e-09, + "loss": 1.2582, + "step": 2464 + }, + { + "epoch": 0.99, + "eval_loss": 1.1667250394821167, + "eval_runtime": 0.8696, + "eval_samples_per_second": 4.6, + "eval_steps_per_second": 1.15, + "step": 2464 + }, + { + "epoch": 0.99, + "learning_rate": 3.84e-09, + "loss": 1.2312, + "step": 2468 + }, + { + "epoch": 0.99, + "eval_loss": 1.1667333841323853, + "eval_runtime": 0.5961, + "eval_samples_per_second": 6.71, + "eval_steps_per_second": 1.678, + "step": 2468 + }, + { + "epoch": 0.99, + "learning_rate": 3.36e-09, + "loss": 1.229, + "step": 2472 + }, + { + "epoch": 0.99, + "eval_loss": 1.1666063070297241, + "eval_runtime": 0.5936, + "eval_samples_per_second": 6.739, + "eval_steps_per_second": 1.685, + "step": 2472 + }, + { + "epoch": 0.99, + "learning_rate": 2.8799999999999996e-09, + "loss": 1.2186, + "step": 2476 + }, + { + "epoch": 0.99, + "eval_loss": 1.1666913032531738, + "eval_runtime": 0.5904, + "eval_samples_per_second": 6.775, + "eval_steps_per_second": 1.694, + "step": 2476 + }, + { + "epoch": 0.99, + "learning_rate": 2.4e-09, + "loss": 1.2423, + "step": 2480 + }, + { + "epoch": 0.99, + "eval_loss": 1.1665089130401611, + "eval_runtime": 0.5708, + "eval_samples_per_second": 7.007, + "eval_steps_per_second": 1.752, + "step": 2480 + }, + { + "epoch": 0.99, + "learning_rate": 1.92e-09, + "loss": 1.2465, + "step": 2484 + }, + { + "epoch": 0.99, + "eval_loss": 1.166528344154358, + "eval_runtime": 0.6394, + "eval_samples_per_second": 6.256, + "eval_steps_per_second": 1.564, + "step": 2484 + }, + { + "epoch": 1.0, + "learning_rate": 1.4399999999999998e-09, + "loss": 1.2332, + "step": 2488 + }, + { + "epoch": 1.0, + "eval_loss": 1.1663110256195068, + "eval_runtime": 0.8509, + "eval_samples_per_second": 4.701, + "eval_steps_per_second": 1.175, + "step": 2488 + }, + { + "epoch": 1.0, + "learning_rate": 9.6e-10, + "loss": 1.2466, + "step": 2492 + }, + { + "epoch": 1.0, + "eval_loss": 1.1665472984313965, + "eval_runtime": 0.8456, + "eval_samples_per_second": 4.73, + "eval_steps_per_second": 1.183, + "step": 2492 + }, + { + "epoch": 1.0, + "learning_rate": 4.8e-10, + "loss": 1.2308, + "step": 2496 + }, + { + "epoch": 1.0, + "eval_loss": 1.1665329933166504, + "eval_runtime": 0.8802, + "eval_samples_per_second": 4.544, + "eval_steps_per_second": 1.136, + "step": 2496 + }, + { + "epoch": 1.0, + "learning_rate": 0.0, + "loss": 1.2435, + "step": 2500 + }, + { + "epoch": 1.0, + "eval_loss": 1.1666896343231201, + "eval_runtime": 0.5837, + "eval_samples_per_second": 6.853, + "eval_steps_per_second": 1.713, + "step": 2500 + } + ], + "logging_steps": 4, + "max_steps": 2500, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "total_flos": 3.178022043648e+17, + "trial_name": null, + "trial_params": null +}