diff --git "a/checkpoints/Qwen2.5-7B/babylm_shuffle_control_10M_seed0/runs/checkpoint-1290/trainer_state.json" "b/checkpoints/Qwen2.5-7B/babylm_shuffle_control_10M_seed0/runs/checkpoint-1290/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/checkpoints/Qwen2.5-7B/babylm_shuffle_control_10M_seed0/runs/checkpoint-1290/trainer_state.json"
@@ -0,0 +1,10095 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.0,
+  "eval_steps": 10,
+  "global_step": 1290,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0015503875968992248,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.6553,
+      "step": 1
+    },
+    {
+      "epoch": 0.0031007751937984496,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.6785,
+      "step": 2
+    },
+    {
+      "epoch": 0.004651162790697674,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.6452,
+      "step": 3
+    },
+    {
+      "epoch": 0.006201550387596899,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.6734,
+      "step": 4
+    },
+    {
+      "epoch": 0.007751937984496124,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.6731,
+      "step": 5
+    },
+    {
+      "epoch": 0.009302325581395349,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.671,
+      "step": 6
+    },
+    {
+      "epoch": 0.010852713178294573,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.7004,
+      "step": 7
+    },
+    {
+      "epoch": 0.012403100775193798,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.6789,
+      "step": 8
+    },
+    {
+      "epoch": 0.013953488372093023,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.6501,
+      "step": 9
+    },
+    {
+      "epoch": 0.015503875968992248,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.6589,
+      "step": 10
+    },
+    {
+      "epoch": 0.015503875968992248,
+      "eval_loss": 1.6840144395828247,
+      "eval_runtime": 44.2414,
+      "eval_samples_per_second": 22.603,
+      "eval_steps_per_second": 1.424,
+      "step": 10
+    },
+    {
+      "epoch": 0.017054263565891473,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.6732,
+      "step": 11
+    },
+    {
+      "epoch": 0.018604651162790697,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.7091,
+      "step": 12
+    },
+    {
+      "epoch": 0.020155038759689922,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.6712,
+      "step": 13
+    },
+    {
+      "epoch": 0.021705426356589147,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.6595,
+      "step": 14
+    },
+    {
+      "epoch": 0.023255813953488372,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.6723,
+      "step": 15
+    },
+    {
+      "epoch": 0.024806201550387597,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.6729,
+      "step": 16
+    },
+    {
+      "epoch": 0.02635658914728682,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.6801,
+      "step": 17
+    },
+    {
+      "epoch": 0.027906976744186046,
+      "grad_norm": 4.608963489532471,
+      "learning_rate": 3.875968992248062e-08,
+      "loss": 1.6527,
+      "step": 18
+    },
+    {
+      "epoch": 0.02945736434108527,
+      "grad_norm": 4.420685291290283,
+      "learning_rate": 7.751937984496124e-08,
+      "loss": 1.6801,
+      "step": 19
+    },
+    {
+      "epoch": 0.031007751937984496,
+      "grad_norm": 3.385024309158325,
+      "learning_rate": 1.1627906976744187e-07,
+      "loss": 1.6978,
+      "step": 20
+    },
+    {
+      "epoch": 0.031007751937984496,
+      "eval_loss": 1.6839702129364014,
+      "eval_runtime": 44.132,
+      "eval_samples_per_second": 22.659,
+      "eval_steps_per_second": 1.428,
+      "step": 20
+    },
+    {
+      "epoch": 0.03255813953488372,
+      "grad_norm": 4.05656099319458,
+      "learning_rate": 1.5503875968992249e-07,
+      "loss": 1.6369,
+      "step": 21
+    },
+    {
+      "epoch": 0.034108527131782945,
+      "grad_norm": 4.343766689300537,
+      "learning_rate": 1.9379844961240311e-07,
+      "loss": 1.6709,
+      "step": 22
+    },
+    {
+      "epoch": 0.03565891472868217,
+      "grad_norm": 4.232193946838379,
+      "learning_rate": 2.3255813953488374e-07,
+      "loss": 1.6855,
+      "step": 23
+    },
+    {
+      "epoch": 0.037209302325581395,
+      "grad_norm": 3.5586187839508057,
+      "learning_rate": 2.7131782945736437e-07,
+      "loss": 1.6812,
+      "step": 24
+    },
+    {
+      "epoch": 0.03875968992248062,
+      "grad_norm": 3.9483323097229004,
+      "learning_rate": 3.1007751937984497e-07,
+      "loss": 1.6795,
+      "step": 25
+    },
+    {
+      "epoch": 0.040310077519379844,
+      "grad_norm": 2.9243829250335693,
+      "learning_rate": 3.488372093023256e-07,
+      "loss": 1.6395,
+      "step": 26
+    },
+    {
+      "epoch": 0.04186046511627907,
+      "grad_norm": 3.643986701965332,
+      "learning_rate": 3.8759689922480623e-07,
+      "loss": 1.6803,
+      "step": 27
+    },
+    {
+      "epoch": 0.043410852713178294,
+      "grad_norm": 3.2554931640625,
+      "learning_rate": 4.2635658914728683e-07,
+      "loss": 1.6848,
+      "step": 28
+    },
+    {
+      "epoch": 0.04496124031007752,
+      "grad_norm": 3.6439590454101562,
+      "learning_rate": 4.651162790697675e-07,
+      "loss": 1.6376,
+      "step": 29
+    },
+    {
+      "epoch": 0.046511627906976744,
+      "grad_norm": 4.260690689086914,
+      "learning_rate": 5.038759689922481e-07,
+      "loss": 1.6478,
+      "step": 30
+    },
+    {
+      "epoch": 0.046511627906976744,
+      "eval_loss": 1.6623069047927856,
+      "eval_runtime": 45.3876,
+      "eval_samples_per_second": 22.032,
+      "eval_steps_per_second": 1.388,
+      "step": 30
+    },
+    {
+      "epoch": 0.04806201550387597,
+      "grad_norm": 3.7455801963806152,
+      "learning_rate": 5.426356589147287e-07,
+      "loss": 1.6443,
+      "step": 31
+    },
+    {
+      "epoch": 0.04961240310077519,
+      "grad_norm": 5.081115245819092,
+      "learning_rate": 5.813953488372094e-07,
+      "loss": 1.6436,
+      "step": 32
+    },
+    {
+      "epoch": 0.05116279069767442,
+      "grad_norm": 1.788765788078308,
+      "learning_rate": 6.201550387596899e-07,
+      "loss": 1.645,
+      "step": 33
+    },
+    {
+      "epoch": 0.05271317829457364,
+      "grad_norm": 1.6885591745376587,
+      "learning_rate": 6.589147286821707e-07,
+      "loss": 1.6139,
+      "step": 34
+    },
+    {
+      "epoch": 0.05426356589147287,
+      "grad_norm": 2.4668631553649902,
+      "learning_rate": 6.976744186046513e-07,
+      "loss": 1.6299,
+      "step": 35
+    },
+    {
+      "epoch": 0.05581395348837209,
+      "grad_norm": 2.387413740158081,
+      "learning_rate": 7.364341085271319e-07,
+      "loss": 1.64,
+      "step": 36
+    },
+    {
+      "epoch": 0.05736434108527132,
+      "grad_norm": 2.5546491146087646,
+      "learning_rate": 7.751937984496125e-07,
+      "loss": 1.6272,
+      "step": 37
+    },
+    {
+      "epoch": 0.05891472868217054,
+      "grad_norm": 1.8689916133880615,
+      "learning_rate": 8.139534883720931e-07,
+      "loss": 1.5905,
+      "step": 38
+    },
+    {
+      "epoch": 0.06046511627906977,
+      "grad_norm": 2.584873914718628,
+      "learning_rate": 8.527131782945737e-07,
+      "loss": 1.607,
+      "step": 39
+    },
+    {
+      "epoch": 0.06201550387596899,
+      "grad_norm": 2.200226306915283,
+      "learning_rate": 8.914728682170544e-07,
+      "loss": 1.6105,
+      "step": 40
+    },
+    {
+      "epoch": 0.06201550387596899,
+      "eval_loss": 1.616817593574524,
+      "eval_runtime": 46.2754,
+      "eval_samples_per_second": 21.61,
+      "eval_steps_per_second": 1.361,
+      "step": 40
+    },
+    {
+      "epoch": 0.06356589147286822,
+      "grad_norm": 1.4535961151123047,
+      "learning_rate": 9.30232558139535e-07,
+      "loss": 1.6013,
+      "step": 41
+    },
+    {
+      "epoch": 0.06511627906976744,
+      "grad_norm": 3.4520232677459717,
+      "learning_rate": 9.689922480620157e-07,
+      "loss": 1.5835,
+      "step": 42
+    },
+    {
+      "epoch": 0.06666666666666667,
+      "grad_norm": 2.159172534942627,
+      "learning_rate": 1.0077519379844962e-06,
+      "loss": 1.5887,
+      "step": 43
+    },
+    {
+      "epoch": 0.06821705426356589,
+      "grad_norm": 1.8451075553894043,
+      "learning_rate": 1.0465116279069768e-06,
+      "loss": 1.5939,
+      "step": 44
+    },
+    {
+      "epoch": 0.06976744186046512,
+      "grad_norm": 1.8367409706115723,
+      "learning_rate": 1.0852713178294575e-06,
+      "loss": 1.5711,
+      "step": 45
+    },
+    {
+      "epoch": 0.07131782945736434,
+      "grad_norm": 2.565183401107788,
+      "learning_rate": 1.1240310077519381e-06,
+      "loss": 1.585,
+      "step": 46
+    },
+    {
+      "epoch": 0.07286821705426356,
+      "grad_norm": 3.1191582679748535,
+      "learning_rate": 1.1627906976744188e-06,
+      "loss": 1.6068,
+      "step": 47
+    },
+    {
+      "epoch": 0.07441860465116279,
+      "grad_norm": 2.8720552921295166,
+      "learning_rate": 1.2015503875968994e-06,
+      "loss": 1.5557,
+      "step": 48
+    },
+    {
+      "epoch": 0.07596899224806201,
+      "grad_norm": 2.1380953788757324,
+      "learning_rate": 1.2403100775193799e-06,
+      "loss": 1.566,
+      "step": 49
+    },
+    {
+      "epoch": 0.07751937984496124,
+      "grad_norm": 2.3946728706359863,
+      "learning_rate": 1.2790697674418605e-06,
+      "loss": 1.5299,
+      "step": 50
+    },
+    {
+      "epoch": 0.07751937984496124,
+      "eval_loss": 1.5494492053985596,
+      "eval_runtime": 46.4926,
+      "eval_samples_per_second": 21.509,
+      "eval_steps_per_second": 1.355,
+      "step": 50
+    },
+    {
+      "epoch": 0.07906976744186046,
+      "grad_norm": 2.220069646835327,
+      "learning_rate": 1.3178294573643414e-06,
+      "loss": 1.5401,
+      "step": 51
+    },
+    {
+      "epoch": 0.08062015503875969,
+      "grad_norm": 1.8800451755523682,
+      "learning_rate": 1.3565891472868216e-06,
+      "loss": 1.5078,
+      "step": 52
+    },
+    {
+      "epoch": 0.08217054263565891,
+      "grad_norm": 1.897739052772522,
+      "learning_rate": 1.3953488372093025e-06,
+      "loss": 1.4914,
+      "step": 53
+    },
+    {
+      "epoch": 0.08372093023255814,
+      "grad_norm": 2.7918992042541504,
+      "learning_rate": 1.4341085271317832e-06,
+      "loss": 1.4992,
+      "step": 54
+    },
+    {
+      "epoch": 0.08527131782945736,
+      "grad_norm": 2.1271169185638428,
+      "learning_rate": 1.4728682170542638e-06,
+      "loss": 1.4418,
+      "step": 55
+    },
+    {
+      "epoch": 0.08682170542635659,
+      "grad_norm": 2.3548240661621094,
+      "learning_rate": 1.5116279069767443e-06,
+      "loss": 1.4575,
+      "step": 56
+    },
+    {
+      "epoch": 0.08837209302325581,
+      "grad_norm": 2.4243152141571045,
+      "learning_rate": 1.550387596899225e-06,
+      "loss": 1.4292,
+      "step": 57
+    },
+    {
+      "epoch": 0.08992248062015504,
+      "grad_norm": 1.9162991046905518,
+      "learning_rate": 1.5891472868217056e-06,
+      "loss": 1.4401,
+      "step": 58
+    },
+    {
+      "epoch": 0.09147286821705426,
+      "grad_norm": 2.9491307735443115,
+      "learning_rate": 1.6279069767441862e-06,
+      "loss": 1.4409,
+      "step": 59
+    },
+    {
+      "epoch": 0.09302325581395349,
+      "grad_norm": 2.5566985607147217,
+      "learning_rate": 1.6666666666666667e-06,
+      "loss": 1.3898,
+      "step": 60
+    },
+    {
+      "epoch": 0.09302325581395349,
+      "eval_loss": 1.4331709146499634,
+      "eval_runtime": 46.6366,
+      "eval_samples_per_second": 21.442,
+      "eval_steps_per_second": 1.351,
+      "step": 60
+    },
+    {
+      "epoch": 0.09457364341085271,
+      "grad_norm": 1.8447020053863525,
+      "learning_rate": 1.7054263565891473e-06,
+      "loss": 1.4007,
+      "step": 61
+    },
+    {
+      "epoch": 0.09612403100775194,
+      "grad_norm": 3.000507354736328,
+      "learning_rate": 1.7441860465116282e-06,
+      "loss": 1.3862,
+      "step": 62
+    },
+    {
+      "epoch": 0.09767441860465116,
+      "grad_norm": 2.1395022869110107,
+      "learning_rate": 1.7829457364341088e-06,
+      "loss": 1.4188,
+      "step": 63
+    },
+    {
+      "epoch": 0.09922480620155039,
+      "grad_norm": 2.0232176780700684,
+      "learning_rate": 1.8217054263565893e-06,
+      "loss": 1.369,
+      "step": 64
+    },
+    {
+      "epoch": 0.10077519379844961,
+      "grad_norm": 3.6658763885498047,
+      "learning_rate": 1.86046511627907e-06,
+      "loss": 1.3842,
+      "step": 65
+    },
+    {
+      "epoch": 0.10232558139534884,
+      "grad_norm": 1.7414005994796753,
+      "learning_rate": 1.8992248062015506e-06,
+      "loss": 1.3289,
+      "step": 66
+    },
+    {
+      "epoch": 0.10387596899224806,
+      "grad_norm": 1.593327522277832,
+      "learning_rate": 1.9379844961240315e-06,
+      "loss": 1.3289,
+      "step": 67
+    },
+    {
+      "epoch": 0.10542635658914729,
+      "grad_norm": 1.774498462677002,
+      "learning_rate": 1.976744186046512e-06,
+      "loss": 1.3283,
+      "step": 68
+    },
+    {
+      "epoch": 0.10697674418604651,
+      "grad_norm": 1.7857030630111694,
+      "learning_rate": 2.0155038759689923e-06,
+      "loss": 1.3127,
+      "step": 69
+    },
+    {
+      "epoch": 0.10852713178294573,
+      "grad_norm": 1.4791711568832397,
+      "learning_rate": 2.054263565891473e-06,
+      "loss": 1.3534,
+      "step": 70
+    },
+    {
+      "epoch": 0.10852713178294573,
+      "eval_loss": 1.3698604106903076,
+      "eval_runtime": 46.6094,
+      "eval_samples_per_second": 21.455,
+      "eval_steps_per_second": 1.352,
+      "step": 70
+    },
+    {
+      "epoch": 0.11007751937984496,
+      "grad_norm": 1.0254175662994385,
+      "learning_rate": 2.0930232558139536e-06,
+      "loss": 1.3053,
+      "step": 71
+    },
+    {
+      "epoch": 0.11162790697674418,
+      "grad_norm": 1.492088794708252,
+      "learning_rate": 2.131782945736434e-06,
+      "loss": 1.3374,
+      "step": 72
+    },
+    {
+      "epoch": 0.11317829457364341,
+      "grad_norm": 1.8035080432891846,
+      "learning_rate": 2.170542635658915e-06,
+      "loss": 1.3438,
+      "step": 73
+    },
+    {
+      "epoch": 0.11472868217054263,
+      "grad_norm": 2.005145311355591,
+      "learning_rate": 2.2093023255813954e-06,
+      "loss": 1.277,
+      "step": 74
+    },
+    {
+      "epoch": 0.11627906976744186,
+      "grad_norm": 2.1278674602508545,
+      "learning_rate": 2.2480620155038763e-06,
+      "loss": 1.3063,
+      "step": 75
+    },
+    {
+      "epoch": 0.11782945736434108,
+      "grad_norm": 2.4920172691345215,
+      "learning_rate": 2.2868217054263567e-06,
+      "loss": 1.2928,
+      "step": 76
+    },
+    {
+      "epoch": 0.11937984496124031,
+      "grad_norm": 1.801375389099121,
+      "learning_rate": 2.3255813953488376e-06,
+      "loss": 1.2927,
+      "step": 77
+    },
+    {
+      "epoch": 0.12093023255813953,
+      "grad_norm": 1.4796046018600464,
+      "learning_rate": 2.364341085271318e-06,
+      "loss": 1.2427,
+      "step": 78
+    },
+    {
+      "epoch": 0.12248062015503876,
+      "grad_norm": 1.9576448202133179,
+      "learning_rate": 2.403100775193799e-06,
+      "loss": 1.244,
+      "step": 79
+    },
+    {
+      "epoch": 0.12403100775193798,
+      "grad_norm": 2.2607786655426025,
+      "learning_rate": 2.4418604651162793e-06,
+      "loss": 1.2798,
+      "step": 80
+    },
+    {
+      "epoch": 0.12403100775193798,
+      "eval_loss": 1.3285717964172363,
+      "eval_runtime": 46.5854,
+      "eval_samples_per_second": 21.466,
+      "eval_steps_per_second": 1.352,
+      "step": 80
+    },
+    {
+      "epoch": 0.12558139534883722,
+      "grad_norm": 1.7419644594192505,
+      "learning_rate": 2.4806201550387598e-06,
+      "loss": 1.2767,
+      "step": 81
+    },
+    {
+      "epoch": 0.12713178294573643,
+      "grad_norm": 1.6084612607955933,
+      "learning_rate": 2.5193798449612406e-06,
+      "loss": 1.2692,
+      "step": 82
+    },
+    {
+      "epoch": 0.12868217054263567,
+      "grad_norm": 1.8201138973236084,
+      "learning_rate": 2.558139534883721e-06,
+      "loss": 1.259,
+      "step": 83
+    },
+    {
+      "epoch": 0.13023255813953488,
+      "grad_norm": 1.7329964637756348,
+      "learning_rate": 2.596899224806202e-06,
+      "loss": 1.2547,
+      "step": 84
+    },
+    {
+      "epoch": 0.13178294573643412,
+      "grad_norm": 1.6506623029708862,
+      "learning_rate": 2.635658914728683e-06,
+      "loss": 1.2642,
+      "step": 85
+    },
+    {
+      "epoch": 0.13333333333333333,
+      "grad_norm": 2.2526042461395264,
+      "learning_rate": 2.674418604651163e-06,
+      "loss": 1.2583,
+      "step": 86
+    },
+    {
+      "epoch": 0.13488372093023257,
+      "grad_norm": 2.1409525871276855,
+      "learning_rate": 2.7131782945736433e-06,
+      "loss": 1.2749,
+      "step": 87
+    },
+    {
+      "epoch": 0.13643410852713178,
+      "grad_norm": 2.350247621536255,
+      "learning_rate": 2.751937984496124e-06,
+      "loss": 1.2073,
+      "step": 88
+    },
+    {
+      "epoch": 0.13798449612403102,
+      "grad_norm": 2.1530516147613525,
+      "learning_rate": 2.790697674418605e-06,
+      "loss": 1.2357,
+      "step": 89
+    },
+    {
+      "epoch": 0.13953488372093023,
+      "grad_norm": 2.1129376888275146,
+      "learning_rate": 2.8294573643410855e-06,
+      "loss": 1.2374,
+      "step": 90
+    },
+    {
+      "epoch": 0.13953488372093023,
+      "eval_loss": 1.296906590461731,
+      "eval_runtime": 46.5431,
+      "eval_samples_per_second": 21.485,
+      "eval_steps_per_second": 1.354,
+      "step": 90
+    },
+    {
+      "epoch": 0.14108527131782947,
+      "grad_norm": 1.8323308229446411,
+      "learning_rate": 2.8682170542635663e-06,
+      "loss": 1.2455,
+      "step": 91
+    },
+    {
+      "epoch": 0.14263565891472868,
+      "grad_norm": 2.5223309993743896,
+      "learning_rate": 2.9069767441860468e-06,
+      "loss": 1.241,
+      "step": 92
+    },
+    {
+      "epoch": 0.14418604651162792,
+      "grad_norm": 1.8278683423995972,
+      "learning_rate": 2.9457364341085276e-06,
+      "loss": 1.1999,
+      "step": 93
+    },
+    {
+      "epoch": 0.14573643410852713,
+      "grad_norm": 1.683199167251587,
+      "learning_rate": 2.9844961240310076e-06,
+      "loss": 1.2185,
+      "step": 94
+    },
+    {
+      "epoch": 0.14728682170542637,
+      "grad_norm": 1.664718747138977,
+      "learning_rate": 3.0232558139534885e-06,
+      "loss": 1.2202,
+      "step": 95
+    },
+    {
+      "epoch": 0.14883720930232558,
+      "grad_norm": 1.8569575548171997,
+      "learning_rate": 3.062015503875969e-06,
+      "loss": 1.2114,
+      "step": 96
+    },
+    {
+      "epoch": 0.15038759689922482,
+      "grad_norm": 1.8585015535354614,
+      "learning_rate": 3.10077519379845e-06,
+      "loss": 1.1885,
+      "step": 97
+    },
+    {
+      "epoch": 0.15193798449612403,
+      "grad_norm": 1.98724365234375,
+      "learning_rate": 3.1395348837209307e-06,
+      "loss": 1.2183,
+      "step": 98
+    },
+    {
+      "epoch": 0.15348837209302327,
+      "grad_norm": 1.7224115133285522,
+      "learning_rate": 3.178294573643411e-06,
+      "loss": 1.2109,
+      "step": 99
+    },
+    {
+      "epoch": 0.15503875968992248,
+      "grad_norm": 1.634524941444397,
+      "learning_rate": 3.217054263565892e-06,
+      "loss": 1.2071,
+      "step": 100
+    },
+    {
+      "epoch": 0.15503875968992248,
+      "eval_loss": 1.2693296670913696,
+      "eval_runtime": 46.5124,
+      "eval_samples_per_second": 21.5,
+      "eval_steps_per_second": 1.354,
+      "step": 100
+    },
+    {
+      "epoch": 0.15658914728682172,
+      "grad_norm": 1.557700514793396,
+      "learning_rate": 3.2558139534883724e-06,
+      "loss": 1.2432,
+      "step": 101
+    },
+    {
+      "epoch": 0.15813953488372093,
+      "grad_norm": 1.700620174407959,
+      "learning_rate": 3.294573643410853e-06,
+      "loss": 1.2114,
+      "step": 102
+    },
+    {
+      "epoch": 0.15968992248062017,
+      "grad_norm": 1.8116543292999268,
+      "learning_rate": 3.3333333333333333e-06,
+      "loss": 1.2158,
+      "step": 103
+    },
+    {
+      "epoch": 0.16124031007751938,
+      "grad_norm": 1.6625306606292725,
+      "learning_rate": 3.372093023255814e-06,
+      "loss": 1.2171,
+      "step": 104
+    },
+    {
+      "epoch": 0.16279069767441862,
+      "grad_norm": 2.052786111831665,
+      "learning_rate": 3.4108527131782946e-06,
+      "loss": 1.2303,
+      "step": 105
+    },
+    {
+      "epoch": 0.16434108527131783,
+      "grad_norm": 2.397324800491333,
+      "learning_rate": 3.4496124031007755e-06,
+      "loss": 1.2279,
+      "step": 106
+    },
+    {
+      "epoch": 0.16589147286821707,
+      "grad_norm": 2.131593942642212,
+      "learning_rate": 3.4883720930232564e-06,
+      "loss": 1.1918,
+      "step": 107
+    },
+    {
+      "epoch": 0.16744186046511628,
+      "grad_norm": 1.8036249876022339,
+      "learning_rate": 3.527131782945737e-06,
+      "loss": 1.1777,
+      "step": 108
+    },
+    {
+      "epoch": 0.16899224806201552,
+      "grad_norm": 1.5316169261932373,
+      "learning_rate": 3.5658914728682177e-06,
+      "loss": 1.2162,
+      "step": 109
+    },
+    {
+      "epoch": 0.17054263565891473,
+      "grad_norm": 2.1200644969940186,
+      "learning_rate": 3.6046511627906977e-06,
+      "loss": 1.1771,
+      "step": 110
+    },
+    {
+      "epoch": 0.17054263565891473,
+      "eval_loss": 1.2527676820755005,
+      "eval_runtime": 46.5002,
+      "eval_samples_per_second": 21.505,
+      "eval_steps_per_second": 1.355,
+      "step": 110
+    },
+    {
+      "epoch": 0.17209302325581396,
+      "grad_norm": 2.45514178276062,
+      "learning_rate": 3.6434108527131786e-06,
+      "loss": 1.1977,
+      "step": 111
+    },
+    {
+      "epoch": 0.17364341085271318,
+      "grad_norm": 1.8457828760147095,
+      "learning_rate": 3.682170542635659e-06,
+      "loss": 1.1834,
+      "step": 112
+    },
+    {
+      "epoch": 0.17519379844961241,
+      "grad_norm": 2.7428853511810303,
+      "learning_rate": 3.72093023255814e-06,
+      "loss": 1.1401,
+      "step": 113
+    },
+    {
+      "epoch": 0.17674418604651163,
+      "grad_norm": 2.583146095275879,
+      "learning_rate": 3.7596899224806203e-06,
+      "loss": 1.1642,
+      "step": 114
+    },
+    {
+      "epoch": 0.17829457364341086,
+      "grad_norm": 2.205864667892456,
+      "learning_rate": 3.798449612403101e-06,
+      "loss": 1.1595,
+      "step": 115
+    },
+    {
+      "epoch": 0.17984496124031008,
+      "grad_norm": 1.9899259805679321,
+      "learning_rate": 3.837209302325582e-06,
+      "loss": 1.1645,
+      "step": 116
+    },
+    {
+      "epoch": 0.1813953488372093,
+      "grad_norm": 1.7081761360168457,
+      "learning_rate": 3.875968992248063e-06,
+      "loss": 1.1369,
+      "step": 117
+    },
+    {
+      "epoch": 0.18294573643410852,
+      "grad_norm": 2.1556506156921387,
+      "learning_rate": 3.914728682170543e-06,
+      "loss": 1.148,
+      "step": 118
+    },
+    {
+      "epoch": 0.18449612403100776,
+      "grad_norm": 1.763687014579773,
+      "learning_rate": 3.953488372093024e-06,
+      "loss": 1.1631,
+      "step": 119
+    },
+    {
+      "epoch": 0.18604651162790697,
+      "grad_norm": 1.9201045036315918,
+      "learning_rate": 3.992248062015504e-06,
+      "loss": 1.1472,
+      "step": 120
+    },
+    {
+      "epoch": 0.18604651162790697,
+      "eval_loss": 1.2374286651611328,
+      "eval_runtime": 46.5809,
+      "eval_samples_per_second": 21.468,
+      "eval_steps_per_second": 1.352,
+      "step": 120
+    },
+    {
+      "epoch": 0.1875968992248062,
+      "grad_norm": 2.8066470623016357,
+      "learning_rate": 4.031007751937985e-06,
+      "loss": 1.1611,
+      "step": 121
+    },
+    {
+      "epoch": 0.18914728682170542,
+      "grad_norm": 1.8357489109039307,
+      "learning_rate": 4.0697674418604655e-06,
+      "loss": 1.1304,
+      "step": 122
+    },
+    {
+      "epoch": 0.19069767441860466,
+      "grad_norm": 2.2851312160491943,
+      "learning_rate": 4.108527131782946e-06,
+      "loss": 1.161,
+      "step": 123
+    },
+    {
+      "epoch": 0.19224806201550387,
+      "grad_norm": 1.841111660003662,
+      "learning_rate": 4.1472868217054264e-06,
+      "loss": 1.1538,
+      "step": 124
+    },
+    {
+      "epoch": 0.1937984496124031,
+      "grad_norm": 2.4325246810913086,
+      "learning_rate": 4.186046511627907e-06,
+      "loss": 1.1469,
+      "step": 125
+    },
+    {
+      "epoch": 0.19534883720930232,
+      "grad_norm": 1.4917737245559692,
+      "learning_rate": 4.224806201550387e-06,
+      "loss": 1.1266,
+      "step": 126
+    },
+    {
+      "epoch": 0.19689922480620156,
+      "grad_norm": 2.5839462280273438,
+      "learning_rate": 4.263565891472868e-06,
+      "loss": 1.1439,
+      "step": 127
+    },
+    {
+      "epoch": 0.19844961240310077,
+      "grad_norm": 2.911651611328125,
+      "learning_rate": 4.302325581395349e-06,
+      "loss": 1.1537,
+      "step": 128
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 2.2323522567749023,
+      "learning_rate": 4.34108527131783e-06,
+      "loss": 1.1498,
+      "step": 129
+    },
+    {
+      "epoch": 0.20155038759689922,
+      "grad_norm": 1.976426124572754,
+      "learning_rate": 4.379844961240311e-06,
+      "loss": 1.1103,
+      "step": 130
+    },
+    {
+      "epoch": 0.20155038759689922,
+      "eval_loss": 1.2214908599853516,
+      "eval_runtime": 46.6151,
+      "eval_samples_per_second": 21.452,
+      "eval_steps_per_second": 1.351,
+      "step": 130
+    },
+    {
+      "epoch": 0.20310077519379846,
+      "grad_norm": 1.4584643840789795,
+      "learning_rate": 4.418604651162791e-06,
+      "loss": 1.1413,
+      "step": 131
+    },
+    {
+      "epoch": 0.20465116279069767,
+      "grad_norm": 1.827351689338684,
+      "learning_rate": 4.457364341085272e-06,
+      "loss": 1.142,
+      "step": 132
+    },
+    {
+      "epoch": 0.2062015503875969,
+      "grad_norm": 2.4622914791107178,
+      "learning_rate": 4.4961240310077525e-06,
+      "loss": 1.1604,
+      "step": 133
+    },
+    {
+      "epoch": 0.20775193798449612,
+      "grad_norm": 1.939195156097412,
+      "learning_rate": 4.5348837209302326e-06,
+      "loss": 1.0974,
+      "step": 134
+    },
+    {
+      "epoch": 0.20930232558139536,
+      "grad_norm": 1.4621402025222778,
+      "learning_rate": 4.573643410852713e-06,
+      "loss": 1.1114,
+      "step": 135
+    },
+    {
+      "epoch": 0.21085271317829457,
+      "grad_norm": 2.018838882446289,
+      "learning_rate": 4.612403100775194e-06,
+      "loss": 1.1252,
+      "step": 136
+    },
+    {
+      "epoch": 0.2124031007751938,
+      "grad_norm": 2.0258100032806396,
+      "learning_rate": 4.651162790697675e-06,
+      "loss": 1.1045,
+      "step": 137
+    },
+    {
+      "epoch": 0.21395348837209302,
+      "grad_norm": 1.714446783065796,
+      "learning_rate": 4.689922480620155e-06,
+      "loss": 1.1026,
+      "step": 138
+    },
+    {
+      "epoch": 0.21550387596899226,
+      "grad_norm": 1.330955147743225,
+      "learning_rate": 4.728682170542636e-06,
+      "loss": 1.096,
+      "step": 139
+    },
+    {
+      "epoch": 0.21705426356589147,
+      "grad_norm": 2.121669054031372,
+      "learning_rate": 4.767441860465117e-06,
+      "loss": 1.0984,
+      "step": 140
+    },
+    {
+      "epoch": 0.21705426356589147,
+      "eval_loss": 1.2069424390792847,
+      "eval_runtime": 46.2613,
+      "eval_samples_per_second": 21.616,
+      "eval_steps_per_second": 1.362,
+      "step": 140
+    },
+    {
+      "epoch": 0.2186046511627907,
+      "grad_norm": 1.4863253831863403,
+      "learning_rate": 4.806201550387598e-06,
+      "loss": 1.1096,
+      "step": 141
+    },
+    {
+      "epoch": 0.22015503875968992,
+      "grad_norm": 2.0480966567993164,
+      "learning_rate": 4.844961240310078e-06,
+      "loss": 1.1017,
+      "step": 142
+    },
+    {
+      "epoch": 0.22170542635658916,
+      "grad_norm": 1.563471794128418,
+      "learning_rate": 4.883720930232559e-06,
+      "loss": 1.1047,
+      "step": 143
+    },
+    {
+      "epoch": 0.22325581395348837,
+      "grad_norm": 1.5344929695129395,
+      "learning_rate": 4.922480620155039e-06,
+      "loss": 1.118,
+      "step": 144
+    },
+    {
+      "epoch": 0.2248062015503876,
+      "grad_norm": 1.8281855583190918,
+      "learning_rate": 4.9612403100775195e-06,
+      "loss": 1.0955,
+      "step": 145
+    },
+    {
+      "epoch": 0.22635658914728682,
+      "grad_norm": 1.6736650466918945,
+      "learning_rate": 5e-06,
+      "loss": 1.1399,
+      "step": 146
+    },
+    {
+      "epoch": 0.22790697674418606,
+      "grad_norm": 2.0063889026641846,
+      "learning_rate": 4.995693367786392e-06,
+      "loss": 1.0867,
+      "step": 147
+    },
+    {
+      "epoch": 0.22945736434108527,
+      "grad_norm": 1.842398762702942,
+      "learning_rate": 4.991386735572782e-06,
+      "loss": 1.0799,
+      "step": 148
+    },
+    {
+      "epoch": 0.2310077519379845,
+      "grad_norm": 1.9986329078674316,
+      "learning_rate": 4.987080103359174e-06,
+      "loss": 1.1371,
+      "step": 149
+    },
+    {
+      "epoch": 0.23255813953488372,
+      "grad_norm": 1.5376241207122803,
+      "learning_rate": 4.982773471145564e-06,
+      "loss": 1.0728,
+      "step": 150
+    },
+    {
+      "epoch": 0.23255813953488372,
+      "eval_loss": 1.1861159801483154,
+      "eval_runtime": 46.3174,
+      "eval_samples_per_second": 21.59,
+      "eval_steps_per_second": 1.36,
+      "step": 150
+    },
+    {
+      "epoch": 0.23410852713178296,
+      "grad_norm": 2.3728039264678955,
+      "learning_rate": 4.978466838931956e-06,
+      "loss": 1.128,
+      "step": 151
+    },
+    {
+      "epoch": 0.23565891472868217,
+      "grad_norm": 2.1690421104431152,
+      "learning_rate": 4.974160206718346e-06,
+      "loss": 1.1249,
+      "step": 152
+    },
+    {
+      "epoch": 0.2372093023255814,
+      "grad_norm": 1.9691181182861328,
+      "learning_rate": 4.969853574504738e-06,
+      "loss": 1.0955,
+      "step": 153
+    },
+    {
+      "epoch": 0.23875968992248062,
+      "grad_norm": 2.235119581222534,
+      "learning_rate": 4.965546942291129e-06,
+      "loss": 1.0875,
+      "step": 154
+    },
+    {
+      "epoch": 0.24031007751937986,
+      "grad_norm": 1.7278294563293457,
+      "learning_rate": 4.9612403100775195e-06,
+      "loss": 1.072,
+      "step": 155
+    },
+    {
+      "epoch": 0.24186046511627907,
+      "grad_norm": 1.7765733003616333,
+      "learning_rate": 4.956933677863911e-06,
+      "loss": 1.0662,
+      "step": 156
+    },
+    {
+      "epoch": 0.2434108527131783,
+      "grad_norm": 2.1046664714813232,
+      "learning_rate": 4.9526270456503015e-06,
+      "loss": 1.0748,
+      "step": 157
+    },
+    {
+      "epoch": 0.24496124031007752,
+      "grad_norm": 1.7691307067871094,
+      "learning_rate": 4.948320413436693e-06,
+      "loss": 1.0809,
+      "step": 158
+    },
+    {
+      "epoch": 0.24651162790697675,
+      "grad_norm": 1.506629228591919,
+      "learning_rate": 4.944013781223083e-06,
+      "loss": 1.0807,
+      "step": 159
+    },
+    {
+      "epoch": 0.24806201550387597,
+      "grad_norm": 1.8298925161361694,
+      "learning_rate": 4.939707149009475e-06,
+      "loss": 1.0976,
+      "step": 160
+    },
+    {
+      "epoch": 0.24806201550387597,
+      "eval_loss": 1.1819802522659302,
+      "eval_runtime": 46.1853,
+      "eval_samples_per_second": 21.652,
+      "eval_steps_per_second": 1.364,
+      "step": 160
+    },
+    {
+      "epoch": 0.2496124031007752,
+      "grad_norm": 1.712808609008789,
+      "learning_rate": 4.935400516795866e-06,
+      "loss": 1.0967,
+      "step": 161
+    },
+    {
+      "epoch": 0.25116279069767444,
+      "grad_norm": 2.387460947036743,
+      "learning_rate": 4.931093884582257e-06,
+      "loss": 1.1141,
+      "step": 162
+    },
+    {
+      "epoch": 0.2527131782945736,
+      "grad_norm": 1.8970969915390015,
+      "learning_rate": 4.926787252368648e-06,
+      "loss": 1.0605,
+      "step": 163
+    },
+    {
+      "epoch": 0.25426356589147286,
+      "grad_norm": 1.6015546321868896,
+      "learning_rate": 4.922480620155039e-06,
+      "loss": 1.0739,
+      "step": 164
+    },
+    {
+      "epoch": 0.2558139534883721,
+      "grad_norm": 1.9534488916397095,
+      "learning_rate": 4.91817398794143e-06,
+      "loss": 1.0496,
+      "step": 165
+    },
+    {
+      "epoch": 0.25736434108527134,
+      "grad_norm": 2.064868927001953,
+      "learning_rate": 4.913867355727821e-06,
+      "loss": 1.0796,
+      "step": 166
+    },
+    {
+      "epoch": 0.2589147286821705,
+      "grad_norm": 1.4253895282745361,
+      "learning_rate": 4.909560723514212e-06,
+      "loss": 1.0444,
+      "step": 167
+    },
+    {
+      "epoch": 0.26046511627906976,
+      "grad_norm": 2.0125179290771484,
+      "learning_rate": 4.905254091300603e-06,
+      "loss": 1.0418,
+      "step": 168
+    },
+    {
+      "epoch": 0.262015503875969,
+      "grad_norm": 1.7060673236846924,
+      "learning_rate": 4.900947459086994e-06,
+      "loss": 1.1122,
+      "step": 169
+    },
+    {
+      "epoch": 0.26356589147286824,
+      "grad_norm": 1.7167747020721436,
+      "learning_rate": 4.896640826873385e-06,
+      "loss": 1.092,
+      "step": 170
+    },
+    {
+      "epoch": 0.26356589147286824,
+      "eval_loss": 1.1651350259780884,
+      "eval_runtime": 46.3335,
+      "eval_samples_per_second": 21.583,
+      "eval_steps_per_second": 1.36,
+      "step": 170
+    },
+    {
+      "epoch": 0.2651162790697674,
+      "grad_norm": 1.3494670391082764,
+      "learning_rate": 4.892334194659777e-06,
+      "loss": 1.069,
+      "step": 171
+    },
+    {
+      "epoch": 0.26666666666666666,
+      "grad_norm": 2.1221230030059814,
+      "learning_rate": 4.888027562446167e-06,
+      "loss": 1.0827,
+      "step": 172
+    },
+    {
+      "epoch": 0.2682170542635659,
+      "grad_norm": 1.819272756576538,
+      "learning_rate": 4.883720930232559e-06,
+      "loss": 1.0702,
+      "step": 173
+    },
+    {
+      "epoch": 0.26976744186046514,
+      "grad_norm": 1.7097699642181396,
+      "learning_rate": 4.879414298018949e-06,
+      "loss": 1.0715,
+      "step": 174
+    },
+    {
+      "epoch": 0.2713178294573643,
+      "grad_norm": 2.005596399307251,
+      "learning_rate": 4.875107665805341e-06,
+      "loss": 1.0736,
+      "step": 175
+    },
+    {
+      "epoch": 0.27286821705426356,
+      "grad_norm": 2.6549158096313477,
+      "learning_rate": 4.870801033591732e-06,
+      "loss": 1.0681,
+      "step": 176
+    },
+    {
+      "epoch": 0.2744186046511628,
+      "grad_norm": 1.527632713317871,
+      "learning_rate": 4.8664944013781225e-06,
+      "loss": 1.0675,
+      "step": 177
+    },
+    {
+      "epoch": 0.27596899224806204,
+      "grad_norm": 3.4207332134246826,
+      "learning_rate": 4.862187769164514e-06,
+      "loss": 1.0977,
+      "step": 178
+    },
+    {
+      "epoch": 0.2775193798449612,
+      "grad_norm": 2.768385648727417,
+      "learning_rate": 4.857881136950905e-06,
+      "loss": 1.0411,
+      "step": 179
+    },
+    {
+      "epoch": 0.27906976744186046,
+      "grad_norm": 2.1901512145996094,
+      "learning_rate": 4.853574504737296e-06,
+      "loss": 1.0946,
+      "step": 180
+    },
+    {
+      "epoch": 0.27906976744186046,
+      "eval_loss": 1.1633468866348267,
+      "eval_runtime": 46.353,
+      "eval_samples_per_second": 21.574,
+      "eval_steps_per_second": 1.359,
+      "step": 180
+    },
+    {
+      "epoch": 0.2806201550387597,
+      "grad_norm": 2.5135326385498047,
+      "learning_rate": 4.849267872523687e-06,
+      "loss": 1.0576,
+      "step": 181
+    },
+    {
+      "epoch": 0.28217054263565894,
+      "grad_norm": 1.8446581363677979,
+      "learning_rate": 4.844961240310078e-06,
+      "loss": 1.0539,
+      "step": 182
+    },
+    {
+      "epoch": 0.2837209302325581,
+      "grad_norm": 1.7093740701675415,
+      "learning_rate": 4.840654608096469e-06,
+      "loss": 1.0753,
+      "step": 183
+    },
+    {
+      "epoch": 0.28527131782945736,
+      "grad_norm": 1.4978317022323608,
+      "learning_rate": 4.8363479758828606e-06,
+      "loss": 1.0597,
+      "step": 184
+    },
+    {
+      "epoch": 0.2868217054263566,
+      "grad_norm": 1.7498767375946045,
+      "learning_rate": 4.832041343669251e-06,
+      "loss": 1.0675,
+      "step": 185
+    },
+    {
+      "epoch": 0.28837209302325584,
+      "grad_norm": 1.6801395416259766,
+      "learning_rate": 4.8277347114556425e-06,
+      "loss": 1.0532,
+      "step": 186
+    },
+    {
+      "epoch": 0.289922480620155,
+      "grad_norm": 2.029369354248047,
+      "learning_rate": 4.823428079242033e-06,
+      "loss": 1.075,
+      "step": 187
+    },
+    {
+      "epoch": 0.29147286821705426,
+      "grad_norm": 1.808666467666626,
+      "learning_rate": 4.8191214470284244e-06,
+      "loss": 1.0713,
+      "step": 188
+    },
+    {
+      "epoch": 0.2930232558139535,
+      "grad_norm": 1.4356886148452759,
+      "learning_rate": 4.814814814814815e-06,
+      "loss": 1.0571,
+      "step": 189
+    },
+    {
+      "epoch": 0.29457364341085274,
+      "grad_norm": 1.743287205696106,
+      "learning_rate": 4.810508182601206e-06,
+      "loss": 1.047,
+      "step": 190
+    },
+    {
+      "epoch": 0.29457364341085274,
+      "eval_loss": 1.1456130743026733,
+      "eval_runtime": 46.2826,
+      "eval_samples_per_second": 21.606,
+      "eval_steps_per_second": 1.361,
+      "step": 190
+    },
+    {
+      "epoch": 0.2961240310077519,
+      "grad_norm": 1.9248270988464355,
+      "learning_rate": 4.806201550387598e-06,
+      "loss": 1.061,
+      "step": 191
+    },
+    {
+      "epoch": 0.29767441860465116,
+      "grad_norm": 1.4090445041656494,
+      "learning_rate": 4.801894918173988e-06,
+      "loss": 1.085,
+      "step": 192
+    },
+    {
+      "epoch": 0.2992248062015504,
+      "grad_norm": 1.858083724975586,
+      "learning_rate": 4.79758828596038e-06,
+      "loss": 1.0412,
+      "step": 193
+    },
+    {
+      "epoch": 0.30077519379844964,
+      "grad_norm": 1.6754525899887085,
+      "learning_rate": 4.79328165374677e-06,
+      "loss": 1.0611,
+      "step": 194
+    },
+    {
+      "epoch": 0.3023255813953488,
+      "grad_norm": 1.5942410230636597,
+      "learning_rate": 4.788975021533162e-06,
+      "loss": 1.0373,
+      "step": 195
+    },
+    {
+      "epoch": 0.30387596899224806,
+      "grad_norm": 1.6549283266067505,
+      "learning_rate": 4.784668389319552e-06,
+      "loss": 1.0343,
+      "step": 196
+    },
+    {
+      "epoch": 0.3054263565891473,
+      "grad_norm": 2.2277469635009766,
+      "learning_rate": 4.780361757105944e-06,
+      "loss": 1.0881,
+      "step": 197
+    },
+    {
+      "epoch": 0.30697674418604654,
+      "grad_norm": 1.6255255937576294,
+      "learning_rate": 4.776055124892335e-06,
+      "loss": 0.9892,
+      "step": 198
+    },
+    {
+      "epoch": 0.3085271317829457,
+      "grad_norm": 1.761346697807312,
+      "learning_rate": 4.7717484926787255e-06,
+      "loss": 1.0547,
+      "step": 199
+    },
+    {
+      "epoch": 0.31007751937984496,
+      "grad_norm": 1.673264503479004,
+      "learning_rate": 4.767441860465117e-06,
+      "loss": 1.0516,
+      "step": 200
+    },
+    {
+      "epoch": 0.31007751937984496,
+      "eval_loss": 1.141044020652771,
+      "eval_runtime": 46.3982,
+      "eval_samples_per_second": 21.553,
+      "eval_steps_per_second": 1.358,
+      "step": 200
+    },
+    {
+      "epoch": 0.3116279069767442,
+      "grad_norm": 2.031104803085327,
+      "learning_rate": 4.7631352282515074e-06,
+      "loss": 1.0646,
+      "step": 201
+    },
+    {
+      "epoch": 0.31317829457364343,
+      "grad_norm": 2.710296630859375,
+      "learning_rate": 4.758828596037899e-06,
+      "loss": 1.0444,
+      "step": 202
+    },
+    {
+      "epoch": 0.3147286821705426,
+      "grad_norm": 1.8177646398544312,
+      "learning_rate": 4.754521963824289e-06,
+      "loss": 1.05,
+      "step": 203
+    },
+    {
+      "epoch": 0.31627906976744186,
+      "grad_norm": 1.5122544765472412,
+      "learning_rate": 4.750215331610681e-06,
+      "loss": 1.0373,
+      "step": 204
+    },
+    {
+      "epoch": 0.3178294573643411,
+      "grad_norm": 2.730760335922241,
+      "learning_rate": 4.745908699397072e-06,
+      "loss": 1.0868,
+      "step": 205
+    },
+    {
+      "epoch": 0.31937984496124033,
+      "grad_norm": 2.1081650257110596,
+      "learning_rate": 4.741602067183463e-06,
+      "loss": 1.0587,
+      "step": 206
+    },
+    {
+      "epoch": 0.3209302325581395,
+      "grad_norm": 2.4439713954925537,
+      "learning_rate": 4.737295434969854e-06,
+      "loss": 1.0528,
+      "step": 207
+    },
+    {
+      "epoch": 0.32248062015503876,
+      "grad_norm": 1.6500515937805176,
+      "learning_rate": 4.732988802756245e-06,
+      "loss": 1.0446,
+      "step": 208
+    },
+    {
+      "epoch": 0.324031007751938,
+      "grad_norm": 1.764997959136963,
+      "learning_rate": 4.728682170542636e-06,
+      "loss": 1.0973,
+      "step": 209
+    },
+    {
+      "epoch": 0.32558139534883723,
+      "grad_norm": 1.9934346675872803,
+      "learning_rate": 4.724375538329027e-06,
+      "loss": 1.0559,
+      "step": 210
+    },
+    {
+      "epoch": 0.32558139534883723,
+      "eval_loss": 1.139148473739624,
+      "eval_runtime": 46.3053,
+      "eval_samples_per_second": 21.596,
+      "eval_steps_per_second": 1.361,
+      "step": 210
+    },
+    {
+      "epoch": 0.3271317829457364,
+      "grad_norm": 2.008603572845459,
+      "learning_rate": 4.720068906115418e-06,
+      "loss": 1.0477,
+      "step": 211
+    },
+    {
+      "epoch": 0.32868217054263565,
+      "grad_norm": 1.631761074066162,
+      "learning_rate": 4.715762273901809e-06,
+      "loss": 1.0588,
+      "step": 212
+    },
+    {
+      "epoch": 0.3302325581395349,
+      "grad_norm": 1.682797908782959,
+      "learning_rate": 4.7114556416882e-06,
+      "loss": 1.045,
+      "step": 213
+    },
+    {
+      "epoch": 0.33178294573643413,
+      "grad_norm": 1.7425205707550049,
+      "learning_rate": 4.707149009474591e-06,
+      "loss": 1.0116,
+      "step": 214
+    },
+    {
+      "epoch": 0.3333333333333333,
+      "grad_norm": 1.903198480606079,
+      "learning_rate": 4.702842377260982e-06,
+      "loss": 1.0358,
+      "step": 215
+    },
+    {
+      "epoch": 0.33488372093023255,
+      "grad_norm": 1.5025352239608765,
+      "learning_rate": 4.698535745047373e-06,
+      "loss": 1.0306,
+      "step": 216
+    },
+    {
+      "epoch": 0.3364341085271318,
+      "grad_norm": 1.7889755964279175,
+      "learning_rate": 4.694229112833764e-06,
+      "loss": 1.0451,
+      "step": 217
+    },
+    {
+      "epoch": 0.33798449612403103,
+      "grad_norm": 1.4736870527267456,
+      "learning_rate": 4.689922480620155e-06,
+      "loss": 1.0245,
+      "step": 218
+    },
+    {
+      "epoch": 0.3395348837209302,
+      "grad_norm": 1.4667205810546875,
+      "learning_rate": 4.6856158484065466e-06,
+      "loss": 0.9702,
+      "step": 219
+    },
+    {
+      "epoch": 0.34108527131782945,
+      "grad_norm": 1.825951337814331,
+      "learning_rate": 4.681309216192937e-06,
+      "loss": 1.0135,
+      "step": 220
+    },
+    {
+      "epoch": 0.34108527131782945,
+      "eval_loss": 1.125650405883789,
+      "eval_runtime": 46.3561,
+      "eval_samples_per_second": 21.572,
+      "eval_steps_per_second": 1.359,
+      "step": 220
+    },
+    {
+      "epoch": 0.3426356589147287,
+      "grad_norm": 1.6442840099334717,
+      "learning_rate": 4.6770025839793285e-06,
+      "loss": 1.0692,
+      "step": 221
+    },
+    {
+      "epoch": 0.34418604651162793,
+      "grad_norm": 1.994643211364746,
+      "learning_rate": 4.672695951765719e-06,
+      "loss": 1.0225,
+      "step": 222
+    },
+    {
+      "epoch": 0.3457364341085271,
+      "grad_norm": 1.6924593448638916,
+      "learning_rate": 4.6683893195521104e-06,
+      "loss": 1.0258,
+      "step": 223
+    },
+    {
+      "epoch": 0.34728682170542635,
+      "grad_norm": 1.9766777753829956,
+      "learning_rate": 4.664082687338502e-06,
+      "loss": 1.0297,
+      "step": 224
+    },
+    {
+      "epoch": 0.3488372093023256,
+      "grad_norm": 1.4598184823989868,
+      "learning_rate": 4.659776055124892e-06,
+      "loss": 1.0646,
+      "step": 225
+    },
+    {
+      "epoch": 0.35038759689922483,
+      "grad_norm": 1.876902461051941,
+      "learning_rate": 4.655469422911284e-06,
+      "loss": 1.0572,
+      "step": 226
+    },
+    {
+      "epoch": 0.351937984496124,
+      "grad_norm": 1.948635220527649,
+      "learning_rate": 4.651162790697675e-06,
+      "loss": 1.0218,
+      "step": 227
+    },
+    {
+      "epoch": 0.35348837209302325,
+      "grad_norm": 1.4742966890335083,
+      "learning_rate": 4.646856158484066e-06,
+      "loss": 1.0234,
+      "step": 228
+    },
+    {
+      "epoch": 0.3550387596899225,
+      "grad_norm": 1.8568025827407837,
+      "learning_rate": 4.642549526270457e-06,
+      "loss": 1.0538,
+      "step": 229
+    },
+    {
+      "epoch": 0.35658914728682173,
+      "grad_norm": 1.664680004119873,
+      "learning_rate": 4.638242894056848e-06,
+      "loss": 1.0414,
+      "step": 230
+    },
+    {
+      "epoch": 0.35658914728682173,
+      "eval_loss": 1.1241024732589722,
+      "eval_runtime": 46.3527,
+      "eval_samples_per_second": 21.574,
+      "eval_steps_per_second": 1.359,
+      "step": 230
+    },
+    {
+      "epoch": 0.3581395348837209,
+      "grad_norm": 1.617343544960022,
+      "learning_rate": 4.633936261843239e-06,
+      "loss": 1.0035,
+      "step": 231
+    },
+    {
+      "epoch": 0.35968992248062015,
+      "grad_norm": 1.8938863277435303,
+      "learning_rate": 4.62962962962963e-06,
+      "loss": 1.0488,
+      "step": 232
+    },
+    {
+      "epoch": 0.3612403100775194,
+      "grad_norm": 1.4788931608200073,
+      "learning_rate": 4.625322997416021e-06,
+      "loss": 1.0032,
+      "step": 233
+    },
+    {
+      "epoch": 0.3627906976744186,
+      "grad_norm": 1.4180240631103516,
+      "learning_rate": 4.621016365202412e-06,
+      "loss": 1.0016,
+      "step": 234
+    },
+    {
+      "epoch": 0.3643410852713178,
+      "grad_norm": 2.0693907737731934,
+      "learning_rate": 4.616709732988803e-06,
+      "loss": 1.0599,
+      "step": 235
+    },
+    {
+      "epoch": 0.36589147286821705,
+      "grad_norm": 1.6417962312698364,
+      "learning_rate": 4.612403100775194e-06,
+      "loss": 1.0194,
+      "step": 236
+    },
+    {
+      "epoch": 0.3674418604651163,
+      "grad_norm": 2.0166518688201904,
+      "learning_rate": 4.608096468561586e-06,
+      "loss": 1.0507,
+      "step": 237
+    },
+    {
+      "epoch": 0.3689922480620155,
+      "grad_norm": 1.8325533866882324,
+      "learning_rate": 4.603789836347976e-06,
+      "loss": 1.0052,
+      "step": 238
+    },
+    {
+      "epoch": 0.3705426356589147,
+      "grad_norm": 1.4332795143127441,
+      "learning_rate": 4.599483204134368e-06,
+      "loss": 1.0225,
+      "step": 239
+    },
+    {
+      "epoch": 0.37209302325581395,
+      "grad_norm": 2.0295088291168213,
+      "learning_rate": 4.595176571920759e-06,
+      "loss": 1.0317,
+      "step": 240
+    },
+    {
+      "epoch": 0.37209302325581395,
+      "eval_loss": 1.1203081607818604,
+      "eval_runtime": 46.3802,
+      "eval_samples_per_second": 21.561,
+      "eval_steps_per_second": 1.358,
+      "step": 240
+    },
+    {
+      "epoch": 0.3736434108527132,
+      "grad_norm": 2.3276357650756836,
+      "learning_rate": 4.5908699397071495e-06,
+      "loss": 1.0321,
+      "step": 241
+    },
+    {
+      "epoch": 0.3751937984496124,
+      "grad_norm": 1.6102005243301392,
+      "learning_rate": 4.586563307493541e-06,
+      "loss": 1.0207,
+      "step": 242
+    },
+    {
+      "epoch": 0.3767441860465116,
+      "grad_norm": 2.5230729579925537,
+      "learning_rate": 4.5822566752799315e-06,
+      "loss": 1.0534,
+      "step": 243
+    },
+    {
+      "epoch": 0.37829457364341085,
+      "grad_norm": 2.3337669372558594,
+      "learning_rate": 4.577950043066323e-06,
+      "loss": 1.0261,
+      "step": 244
+    },
+    {
+      "epoch": 0.3798449612403101,
+      "grad_norm": 2.6739656925201416,
+      "learning_rate": 4.573643410852713e-06,
+      "loss": 1.0279,
+      "step": 245
+    },
+    {
+      "epoch": 0.3813953488372093,
+      "grad_norm": 2.375446081161499,
+      "learning_rate": 4.569336778639105e-06,
+      "loss": 1.0272,
+      "step": 246
+    },
+    {
+      "epoch": 0.3829457364341085,
+      "grad_norm": 1.7358253002166748,
+      "learning_rate": 4.565030146425496e-06,
+      "loss": 1.0303,
+      "step": 247
+    },
+    {
+      "epoch": 0.38449612403100775,
+      "grad_norm": 2.24027156829834,
+      "learning_rate": 4.560723514211887e-06,
+      "loss": 1.021,
+      "step": 248
+    },
+    {
+      "epoch": 0.386046511627907,
+      "grad_norm": 2.228729724884033,
+      "learning_rate": 4.556416881998278e-06,
+      "loss": 1.0436,
+      "step": 249
+    },
+    {
+      "epoch": 0.3875968992248062,
+      "grad_norm": 1.5235542058944702,
+      "learning_rate": 4.552110249784669e-06,
+      "loss": 1.0248,
+      "step": 250
+    },
+    {
+      "epoch": 0.3875968992248062,
+      "eval_loss": 1.1150026321411133,
+      "eval_runtime": 46.3762,
+      "eval_samples_per_second": 21.563,
+      "eval_steps_per_second": 1.358,
+      "step": 250
+    },
+    {
+      "epoch": 0.3891472868217054,
+      "grad_norm": 1.5835521221160889,
+      "learning_rate": 4.54780361757106e-06,
+      "loss": 1.0066,
+      "step": 251
+    },
+    {
+      "epoch": 0.39069767441860465,
+      "grad_norm": 1.6656657457351685,
+      "learning_rate": 4.543496985357451e-06,
+      "loss": 1.0438,
+      "step": 252
+    },
+    {
+      "epoch": 0.3922480620155039,
+      "grad_norm": 2.0052080154418945,
+      "learning_rate": 4.539190353143842e-06,
+      "loss": 0.9879,
+      "step": 253
+    },
+    {
+      "epoch": 0.3937984496124031,
+      "grad_norm": 1.6761670112609863,
+      "learning_rate": 4.5348837209302326e-06,
+      "loss": 0.9964,
+      "step": 254
+    },
+    {
+      "epoch": 0.3953488372093023,
+      "grad_norm": 2.0463616847991943,
+      "learning_rate": 4.530577088716624e-06,
+      "loss": 1.0137,
+      "step": 255
+    },
+    {
+      "epoch": 0.39689922480620154,
+      "grad_norm": 1.684457540512085,
+      "learning_rate": 4.526270456503015e-06,
+      "loss": 0.9994,
+      "step": 256
+    },
+    {
+      "epoch": 0.3984496124031008,
+      "grad_norm": 1.9504594802856445,
+      "learning_rate": 4.521963824289406e-06,
+      "loss": 1.008,
+      "step": 257
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 2.0540153980255127,
+      "learning_rate": 4.517657192075797e-06,
+      "loss": 1.0145,
+      "step": 258
+    },
+    {
+      "epoch": 0.4015503875968992,
+      "grad_norm": 2.077800989151001,
+      "learning_rate": 4.513350559862188e-06,
+      "loss": 1.0078,
+      "step": 259
+    },
+    {
+      "epoch": 0.40310077519379844,
+      "grad_norm": 2.054661273956299,
+      "learning_rate": 4.509043927648579e-06,
+      "loss": 1.0202,
+      "step": 260
+    },
+    {
+      "epoch": 0.40310077519379844,
+      "eval_loss": 1.112622857093811,
+      "eval_runtime": 46.4135,
+      "eval_samples_per_second": 21.545,
+      "eval_steps_per_second": 1.357,
+      "step": 260
+    },
+    {
+      "epoch": 0.4046511627906977,
+      "grad_norm": 1.7232954502105713,
+      "learning_rate": 4.50473729543497e-06,
+      "loss": 0.9975,
+      "step": 261
+    },
+    {
+      "epoch": 0.4062015503875969,
+      "grad_norm": 1.672469973564148,
+      "learning_rate": 4.500430663221361e-06,
+      "loss": 1.0262,
+      "step": 262
+    },
+    {
+      "epoch": 0.4077519379844961,
+      "grad_norm": 1.5814067125320435,
+      "learning_rate": 4.4961240310077525e-06,
+      "loss": 0.9977,
+      "step": 263
+    },
+    {
+      "epoch": 0.40930232558139534,
+      "grad_norm": 2.1820123195648193,
+      "learning_rate": 4.491817398794143e-06,
+      "loss": 1.0404,
+      "step": 264
+    },
+    {
+      "epoch": 0.4108527131782946,
+      "grad_norm": 1.5097086429595947,
+      "learning_rate": 4.4875107665805345e-06,
+      "loss": 1.0023,
+      "step": 265
+    },
+    {
+      "epoch": 0.4124031007751938,
+      "grad_norm": 1.551700472831726,
+      "learning_rate": 4.483204134366925e-06,
+      "loss": 0.9823,
+      "step": 266
+    },
+    {
+      "epoch": 0.413953488372093,
+      "grad_norm": 1.595078945159912,
+      "learning_rate": 4.478897502153316e-06,
+      "loss": 1.0101,
+      "step": 267
+    },
+    {
+      "epoch": 0.41550387596899224,
+      "grad_norm": 1.7391844987869263,
+      "learning_rate": 4.474590869939707e-06,
+      "loss": 1.0208,
+      "step": 268
+    },
+    {
+      "epoch": 0.4170542635658915,
+      "grad_norm": 1.820359706878662,
+      "learning_rate": 4.470284237726098e-06,
+      "loss": 0.982,
+      "step": 269
+    },
+    {
+      "epoch": 0.4186046511627907,
+      "grad_norm": 1.852407693862915,
+      "learning_rate": 4.46597760551249e-06,
+      "loss": 0.9752,
+      "step": 270
+    },
+    {
+      "epoch": 0.4186046511627907,
+      "eval_loss": 1.109637975692749,
+      "eval_runtime": 46.4539,
+      "eval_samples_per_second": 21.527,
+      "eval_steps_per_second": 1.356,
+      "step": 270
+    },
+    {
+      "epoch": 0.4201550387596899,
+      "grad_norm": 2.0624260902404785,
+      "learning_rate": 4.46167097329888e-06,
+      "loss": 0.98,
+      "step": 271
+    },
+    {
+      "epoch": 0.42170542635658914,
+      "grad_norm": 1.4282116889953613,
+      "learning_rate": 4.457364341085272e-06,
+      "loss": 0.9782,
+      "step": 272
+    },
+    {
+      "epoch": 0.4232558139534884,
+      "grad_norm": 2.189063549041748,
+      "learning_rate": 4.453057708871662e-06,
+      "loss": 1.0447,
+      "step": 273
+    },
+    {
+      "epoch": 0.4248062015503876,
+      "grad_norm": 1.8906320333480835,
+      "learning_rate": 4.448751076658054e-06,
+      "loss": 1.0013,
+      "step": 274
+    },
+    {
+      "epoch": 0.4263565891472868,
+      "grad_norm": 1.7382593154907227,
+      "learning_rate": 4.444444444444444e-06,
+      "loss": 0.9955,
+      "step": 275
+    },
+    {
+      "epoch": 0.42790697674418604,
+      "grad_norm": 1.7929259538650513,
+      "learning_rate": 4.4401378122308355e-06,
+      "loss": 1.0087,
+      "step": 276
+    },
+    {
+      "epoch": 0.4294573643410853,
+      "grad_norm": 1.4211113452911377,
+      "learning_rate": 4.435831180017227e-06,
+      "loss": 1.0109,
+      "step": 277
+    },
+    {
+      "epoch": 0.4310077519379845,
+      "grad_norm": 2.430835008621216,
+      "learning_rate": 4.4315245478036175e-06,
+      "loss": 1.0401,
+      "step": 278
+    },
+    {
+      "epoch": 0.4325581395348837,
+      "grad_norm": 2.3752498626708984,
+      "learning_rate": 4.427217915590009e-06,
+      "loss": 1.0202,
+      "step": 279
+    },
+    {
+      "epoch": 0.43410852713178294,
+      "grad_norm": 1.6907297372817993,
+      "learning_rate": 4.4229112833764e-06,
+      "loss": 1.001,
+      "step": 280
+    },
+    {
+      "epoch": 0.43410852713178294,
+      "eval_loss": 1.098842978477478,
+      "eval_runtime": 46.5116,
+      "eval_samples_per_second": 21.5,
+      "eval_steps_per_second": 1.355,
+      "step": 280
+    },
+    {
+      "epoch": 0.4356589147286822,
+      "grad_norm": 1.5076638460159302,
+      "learning_rate": 4.418604651162791e-06,
+      "loss": 1.0402,
+      "step": 281
+    },
+    {
+      "epoch": 0.4372093023255814,
+      "grad_norm": 1.7280958890914917,
+      "learning_rate": 4.414298018949182e-06,
+      "loss": 1.0115,
+      "step": 282
+    },
+    {
+      "epoch": 0.4387596899224806,
+      "grad_norm": 1.711249589920044,
+      "learning_rate": 4.409991386735573e-06,
+      "loss": 0.9946,
+      "step": 283
+    },
+    {
+      "epoch": 0.44031007751937984,
+      "grad_norm": 1.9347134828567505,
+      "learning_rate": 4.405684754521964e-06,
+      "loss": 1.0077,
+      "step": 284
+    },
+    {
+      "epoch": 0.4418604651162791,
+      "grad_norm": 1.5475000143051147,
+      "learning_rate": 4.4013781223083555e-06,
+      "loss": 1.0163,
+      "step": 285
+    },
+    {
+      "epoch": 0.4434108527131783,
+      "grad_norm": 1.4094364643096924,
+      "learning_rate": 4.397071490094746e-06,
+      "loss": 0.9963,
+      "step": 286
+    },
+    {
+      "epoch": 0.4449612403100775,
+      "grad_norm": 1.7702579498291016,
+      "learning_rate": 4.3927648578811375e-06,
+      "loss": 0.9867,
+      "step": 287
+    },
+    {
+      "epoch": 0.44651162790697674,
+      "grad_norm": 2.4476609230041504,
+      "learning_rate": 4.388458225667529e-06,
+      "loss": 1.0049,
+      "step": 288
+    },
+    {
+      "epoch": 0.448062015503876,
+      "grad_norm": 1.6922342777252197,
+      "learning_rate": 4.384151593453919e-06,
+      "loss": 1.0192,
+      "step": 289
+    },
+    {
+      "epoch": 0.4496124031007752,
+      "grad_norm": 1.8502883911132812,
+      "learning_rate": 4.379844961240311e-06,
+      "loss": 1.0031,
+      "step": 290
+    },
+    {
+      "epoch": 0.4496124031007752,
+      "eval_loss": 1.095705270767212,
+      "eval_runtime": 46.5015,
+      "eval_samples_per_second": 21.505,
+      "eval_steps_per_second": 1.355,
+      "step": 290
+    },
+    {
+      "epoch": 0.4511627906976744,
+      "grad_norm": 1.6261955499649048,
+      "learning_rate": 4.375538329026701e-06,
+      "loss": 0.9904,
+      "step": 291
+    },
+    {
+      "epoch": 0.45271317829457364,
+      "grad_norm": 1.3905696868896484,
+      "learning_rate": 4.371231696813093e-06,
+      "loss": 0.9788,
+      "step": 292
+    },
+    {
+      "epoch": 0.4542635658914729,
+      "grad_norm": 1.8039098978042603,
+      "learning_rate": 4.366925064599484e-06,
+      "loss": 0.9855,
+      "step": 293
+    },
+    {
+      "epoch": 0.4558139534883721,
+      "grad_norm": 1.7904952764511108,
+      "learning_rate": 4.362618432385875e-06,
+      "loss": 1.0097,
+      "step": 294
+    },
+    {
+      "epoch": 0.4573643410852713,
+      "grad_norm": 1.918278455734253,
+      "learning_rate": 4.358311800172266e-06,
+      "loss": 0.9839,
+      "step": 295
+    },
+    {
+      "epoch": 0.45891472868217054,
+      "grad_norm": 1.6572740077972412,
+      "learning_rate": 4.354005167958657e-06,
+      "loss": 1.0167,
+      "step": 296
+    },
+    {
+      "epoch": 0.4604651162790698,
+      "grad_norm": 1.6543735265731812,
+      "learning_rate": 4.349698535745048e-06,
+      "loss": 1.0181,
+      "step": 297
+    },
+    {
+      "epoch": 0.462015503875969,
+      "grad_norm": 1.5788277387619019,
+      "learning_rate": 4.345391903531439e-06,
+      "loss": 1.013,
+      "step": 298
+    },
+    {
+      "epoch": 0.4635658914728682,
+      "grad_norm": 1.4497977495193481,
+      "learning_rate": 4.34108527131783e-06,
+      "loss": 0.9837,
+      "step": 299
+    },
+    {
+      "epoch": 0.46511627906976744,
+      "grad_norm": 1.2367212772369385,
+      "learning_rate": 4.336778639104221e-06,
+      "loss": 0.9611,
+      "step": 300
+    },
+    {
+      "epoch": 0.46511627906976744,
+      "eval_loss": 1.0975078344345093,
+      "eval_runtime": 46.5651,
+      "eval_samples_per_second": 21.475,
+      "eval_steps_per_second": 1.353,
+      "step": 300
+    },
+    {
+      "epoch": 0.4666666666666667,
+      "grad_norm": 1.7050374746322632,
+      "learning_rate": 4.332472006890612e-06,
+      "loss": 1.0096,
+      "step": 301
+    },
+    {
+      "epoch": 0.4682170542635659,
+      "grad_norm": 1.4360929727554321,
+      "learning_rate": 4.328165374677003e-06,
+      "loss": 0.9569,
+      "step": 302
+    },
+    {
+      "epoch": 0.4697674418604651,
+      "grad_norm": 1.49595308303833,
+      "learning_rate": 4.323858742463394e-06,
+      "loss": 1.0097,
+      "step": 303
+    },
+    {
+      "epoch": 0.47131782945736433,
+      "grad_norm": 1.9372719526290894,
+      "learning_rate": 4.319552110249785e-06,
+      "loss": 0.9716,
+      "step": 304
+    },
+    {
+      "epoch": 0.4728682170542636,
+      "grad_norm": 1.4308205842971802,
+      "learning_rate": 4.3152454780361766e-06,
+      "loss": 0.9773,
+      "step": 305
+    },
+    {
+      "epoch": 0.4744186046511628,
+      "grad_norm": 1.724716067314148,
+      "learning_rate": 4.310938845822567e-06,
+      "loss": 0.972,
+      "step": 306
+    },
+    {
+      "epoch": 0.475968992248062,
+      "grad_norm": 1.7052260637283325,
+      "learning_rate": 4.3066322136089585e-06,
+      "loss": 0.9538,
+      "step": 307
+    },
+    {
+      "epoch": 0.47751937984496123,
+      "grad_norm": 1.9146831035614014,
+      "learning_rate": 4.302325581395349e-06,
+      "loss": 0.995,
+      "step": 308
+    },
+    {
+      "epoch": 0.4790697674418605,
+      "grad_norm": 1.581652283668518,
+      "learning_rate": 4.2980189491817404e-06,
+      "loss": 0.9966,
+      "step": 309
+    },
+    {
+      "epoch": 0.4806201550387597,
+      "grad_norm": 1.852246642112732,
+      "learning_rate": 4.293712316968131e-06,
+      "loss": 1.0029,
+      "step": 310
+    },
+    {
+      "epoch": 0.4806201550387597,
+      "eval_loss": 1.0914039611816406,
+      "eval_runtime": 46.5191,
+      "eval_samples_per_second": 21.497,
+      "eval_steps_per_second": 1.354,
+      "step": 310
+    },
+    {
+      "epoch": 0.4821705426356589,
+      "grad_norm": 1.5872552394866943,
+      "learning_rate": 4.289405684754522e-06,
+      "loss": 1.052,
+      "step": 311
+    },
+    {
+      "epoch": 0.48372093023255813,
+      "grad_norm": 2.0670878887176514,
+      "learning_rate": 4.285099052540914e-06,
+      "loss": 0.99,
+      "step": 312
+    },
+    {
+      "epoch": 0.48527131782945737,
+      "grad_norm": 1.722896695137024,
+      "learning_rate": 4.280792420327304e-06,
+      "loss": 1.0338,
+      "step": 313
+    },
+    {
+      "epoch": 0.4868217054263566,
+      "grad_norm": 1.4839717149734497,
+      "learning_rate": 4.276485788113696e-06,
+      "loss": 0.9857,
+      "step": 314
+    },
+    {
+      "epoch": 0.4883720930232558,
+      "grad_norm": 1.6284306049346924,
+      "learning_rate": 4.272179155900086e-06,
+      "loss": 1.003,
+      "step": 315
+    },
+    {
+      "epoch": 0.48992248062015503,
+      "grad_norm": 1.7428269386291504,
+      "learning_rate": 4.267872523686478e-06,
+      "loss": 0.9737,
+      "step": 316
+    },
+    {
+      "epoch": 0.49147286821705427,
+      "grad_norm": 1.281343698501587,
+      "learning_rate": 4.263565891472868e-06,
+      "loss": 0.9812,
+      "step": 317
+    },
+    {
+      "epoch": 0.4930232558139535,
+      "grad_norm": 1.4639809131622314,
+      "learning_rate": 4.2592592592592596e-06,
+      "loss": 0.9783,
+      "step": 318
+    },
+    {
+      "epoch": 0.4945736434108527,
+      "grad_norm": 1.636660099029541,
+      "learning_rate": 4.25495262704565e-06,
+      "loss": 0.9923,
+      "step": 319
+    },
+    {
+      "epoch": 0.49612403100775193,
+      "grad_norm": 1.8043773174285889,
+      "learning_rate": 4.2506459948320415e-06,
+      "loss": 0.9543,
+      "step": 320
+    },
+    {
+      "epoch": 0.49612403100775193,
+      "eval_loss": 1.0884485244750977,
+      "eval_runtime": 46.5738,
+      "eval_samples_per_second": 21.471,
+      "eval_steps_per_second": 1.353,
+      "step": 320
+    },
+    {
+      "epoch": 0.49767441860465117,
+      "grad_norm": 1.836240530014038,
+      "learning_rate": 4.246339362618433e-06,
+      "loss": 1.0107,
+      "step": 321
+    },
+    {
+      "epoch": 0.4992248062015504,
+      "grad_norm": 1.5559934377670288,
+      "learning_rate": 4.2420327304048234e-06,
+      "loss": 0.9607,
+      "step": 322
+    },
+    {
+      "epoch": 0.5007751937984496,
+      "grad_norm": 1.523479700088501,
+      "learning_rate": 4.237726098191215e-06,
+      "loss": 0.9307,
+      "step": 323
+    },
+    {
+      "epoch": 0.5023255813953489,
+      "grad_norm": 1.6808844804763794,
+      "learning_rate": 4.233419465977605e-06,
+      "loss": 1.0017,
+      "step": 324
+    },
+    {
+      "epoch": 0.5038759689922481,
+      "grad_norm": 1.4419201612472534,
+      "learning_rate": 4.229112833763997e-06,
+      "loss": 0.9715,
+      "step": 325
+    },
+    {
+      "epoch": 0.5054263565891473,
+      "grad_norm": 1.8381807804107666,
+      "learning_rate": 4.224806201550387e-06,
+      "loss": 1.0087,
+      "step": 326
+    },
+    {
+      "epoch": 0.5069767441860465,
+      "grad_norm": 1.410925030708313,
+      "learning_rate": 4.220499569336779e-06,
+      "loss": 0.996,
+      "step": 327
+    },
+    {
+      "epoch": 0.5085271317829457,
+      "grad_norm": 1.5510520935058594,
+      "learning_rate": 4.21619293712317e-06,
+      "loss": 1.0014,
+      "step": 328
+    },
+    {
+      "epoch": 0.5100775193798449,
+      "grad_norm": 1.7308863401412964,
+      "learning_rate": 4.211886304909561e-06,
+      "loss": 1.0047,
+      "step": 329
+    },
+    {
+      "epoch": 0.5116279069767442,
+      "grad_norm": 1.627509355545044,
+      "learning_rate": 4.207579672695952e-06,
+      "loss": 0.9879,
+      "step": 330
+    },
+    {
+      "epoch": 0.5116279069767442,
+      "eval_loss": 1.0827207565307617,
+      "eval_runtime": 46.6671,
+      "eval_samples_per_second": 21.428,
+      "eval_steps_per_second": 1.35,
+      "step": 330
+    },
+    {
+      "epoch": 0.5131782945736434,
+      "grad_norm": 1.7517614364624023,
+      "learning_rate": 4.203273040482343e-06,
+      "loss": 0.9946,
+      "step": 331
+    },
+    {
+      "epoch": 0.5147286821705427,
+      "grad_norm": 1.7542222738265991,
+      "learning_rate": 4.198966408268734e-06,
+      "loss": 0.977,
+      "step": 332
+    },
+    {
+      "epoch": 0.5162790697674419,
+      "grad_norm": 1.5376570224761963,
+      "learning_rate": 4.194659776055125e-06,
+      "loss": 0.9587,
+      "step": 333
+    },
+    {
+      "epoch": 0.517829457364341,
+      "grad_norm": 1.613654613494873,
+      "learning_rate": 4.190353143841516e-06,
+      "loss": 0.9844,
+      "step": 334
+    },
+    {
+      "epoch": 0.5193798449612403,
+      "grad_norm": 1.5097827911376953,
+      "learning_rate": 4.186046511627907e-06,
+      "loss": 0.9881,
+      "step": 335
+    },
+    {
+      "epoch": 0.5209302325581395,
+      "grad_norm": 1.6402108669281006,
+      "learning_rate": 4.181739879414298e-06,
+      "loss": 0.9919,
+      "step": 336
+    },
+    {
+      "epoch": 0.5224806201550387,
+      "grad_norm": 1.4518394470214844,
+      "learning_rate": 4.177433247200689e-06,
+      "loss": 0.9837,
+      "step": 337
+    },
+    {
+      "epoch": 0.524031007751938,
+      "grad_norm": 1.3409701585769653,
+      "learning_rate": 4.173126614987081e-06,
+      "loss": 0.9892,
+      "step": 338
+    },
+    {
+      "epoch": 0.5255813953488372,
+      "grad_norm": 1.3539661169052124,
+      "learning_rate": 4.168819982773471e-06,
+      "loss": 0.9845,
+      "step": 339
+    },
+    {
+      "epoch": 0.5271317829457365,
+      "grad_norm": 1.314976692199707,
+      "learning_rate": 4.1645133505598626e-06,
+      "loss": 1.0105,
+      "step": 340
+    },
+    {
+      "epoch": 0.5271317829457365,
+      "eval_loss": 1.0878815650939941,
+      "eval_runtime": 46.7206,
+      "eval_samples_per_second": 21.404,
+      "eval_steps_per_second": 1.348,
+      "step": 340
+    },
+    {
+      "epoch": 0.5286821705426357,
+      "grad_norm": 1.6854596138000488,
+      "learning_rate": 4.160206718346254e-06,
+      "loss": 0.9637,
+      "step": 341
+    },
+    {
+      "epoch": 0.5302325581395348,
+      "grad_norm": 1.7850494384765625,
+      "learning_rate": 4.1559000861326445e-06,
+      "loss": 0.9895,
+      "step": 342
+    },
+    {
+      "epoch": 0.5317829457364341,
+      "grad_norm": 1.2481727600097656,
+      "learning_rate": 4.151593453919036e-06,
+      "loss": 0.9999,
+      "step": 343
+    },
+    {
+      "epoch": 0.5333333333333333,
+      "grad_norm": 1.3739581108093262,
+      "learning_rate": 4.1472868217054264e-06,
+      "loss": 0.9676,
+      "step": 344
+    },
+    {
+      "epoch": 0.5348837209302325,
+      "grad_norm": 2.001826763153076,
+      "learning_rate": 4.142980189491818e-06,
+      "loss": 1.0295,
+      "step": 345
+    },
+    {
+      "epoch": 0.5364341085271318,
+      "grad_norm": 1.5063499212265015,
+      "learning_rate": 4.138673557278209e-06,
+      "loss": 0.9818,
+      "step": 346
+    },
+    {
+      "epoch": 0.537984496124031,
+      "grad_norm": 1.5834412574768066,
+      "learning_rate": 4.1343669250646e-06,
+      "loss": 0.9757,
+      "step": 347
+    },
+    {
+      "epoch": 0.5395348837209303,
+      "grad_norm": 1.3693667650222778,
+      "learning_rate": 4.130060292850991e-06,
+      "loss": 0.9734,
+      "step": 348
+    },
+    {
+      "epoch": 0.5410852713178295,
+      "grad_norm": 1.5714665651321411,
+      "learning_rate": 4.1257536606373825e-06,
+      "loss": 0.9877,
+      "step": 349
+    },
+    {
+      "epoch": 0.5426356589147286,
+      "grad_norm": 1.579288125038147,
+      "learning_rate": 4.121447028423773e-06,
+      "loss": 0.9917,
+      "step": 350
+    },
+    {
+      "epoch": 0.5426356589147286,
+      "eval_loss": 1.0909483432769775,
+      "eval_runtime": 46.7187,
+      "eval_samples_per_second": 21.405,
+      "eval_steps_per_second": 1.348,
+      "step": 350
+    },
+    {
+      "epoch": 0.5441860465116279,
+      "grad_norm": 1.3985519409179688,
+      "learning_rate": 4.1171403962101645e-06,
+      "loss": 0.9619,
+      "step": 351
+    },
+    {
+      "epoch": 0.5457364341085271,
+      "grad_norm": 1.4393198490142822,
+      "learning_rate": 4.112833763996555e-06,
+      "loss": 0.9612,
+      "step": 352
+    },
+    {
+      "epoch": 0.5472868217054263,
+      "grad_norm": 1.7534122467041016,
+      "learning_rate": 4.108527131782946e-06,
+      "loss": 0.9831,
+      "step": 353
+    },
+    {
+      "epoch": 0.5488372093023256,
+      "grad_norm": 1.56703782081604,
+      "learning_rate": 4.104220499569337e-06,
+      "loss": 1.015,
+      "step": 354
+    },
+    {
+      "epoch": 0.5503875968992248,
+      "grad_norm": 1.3634308576583862,
+      "learning_rate": 4.099913867355728e-06,
+      "loss": 0.9443,
+      "step": 355
+    },
+    {
+      "epoch": 0.5519379844961241,
+      "grad_norm": 1.6318840980529785,
+      "learning_rate": 4.09560723514212e-06,
+      "loss": 0.9574,
+      "step": 356
+    },
+    {
+      "epoch": 0.5534883720930233,
+      "grad_norm": 1.642081618309021,
+      "learning_rate": 4.09130060292851e-06,
+      "loss": 0.9887,
+      "step": 357
+    },
+    {
+      "epoch": 0.5550387596899224,
+      "grad_norm": 1.4372543096542358,
+      "learning_rate": 4.086993970714902e-06,
+      "loss": 0.991,
+      "step": 358
+    },
+    {
+      "epoch": 0.5565891472868217,
+      "grad_norm": 1.6540495157241821,
+      "learning_rate": 4.082687338501292e-06,
+      "loss": 0.9589,
+      "step": 359
+    },
+    {
+      "epoch": 0.5581395348837209,
+      "grad_norm": 1.7728432416915894,
+      "learning_rate": 4.078380706287684e-06,
+      "loss": 0.9945,
+      "step": 360
+    },
+    {
+      "epoch": 0.5581395348837209,
+      "eval_loss": 1.084891676902771,
+      "eval_runtime": 46.7029,
+      "eval_samples_per_second": 21.412,
+      "eval_steps_per_second": 1.349,
+      "step": 360
+    },
+    {
+      "epoch": 0.5596899224806201,
+      "grad_norm": 1.5775972604751587,
+      "learning_rate": 4.074074074074074e-06,
+      "loss": 0.9392,
+      "step": 361
+    },
+    {
+      "epoch": 0.5612403100775194,
+      "grad_norm": 1.696594476699829,
+      "learning_rate": 4.0697674418604655e-06,
+      "loss": 1.0223,
+      "step": 362
+    },
+    {
+      "epoch": 0.5627906976744186,
+      "grad_norm": 1.5510598421096802,
+      "learning_rate": 4.065460809646857e-06,
+      "loss": 0.9487,
+      "step": 363
+    },
+    {
+      "epoch": 0.5643410852713179,
+      "grad_norm": 1.3794918060302734,
+      "learning_rate": 4.0611541774332475e-06,
+      "loss": 0.964,
+      "step": 364
+    },
+    {
+      "epoch": 0.5658914728682171,
+      "grad_norm": 1.5407869815826416,
+      "learning_rate": 4.056847545219639e-06,
+      "loss": 0.9757,
+      "step": 365
+    },
+    {
+      "epoch": 0.5674418604651162,
+      "grad_norm": 1.3653312921524048,
+      "learning_rate": 4.052540913006029e-06,
+      "loss": 0.9175,
+      "step": 366
+    },
+    {
+      "epoch": 0.5689922480620155,
+      "grad_norm": 1.6502870321273804,
+      "learning_rate": 4.048234280792421e-06,
+      "loss": 0.9658,
+      "step": 367
+    },
+    {
+      "epoch": 0.5705426356589147,
+      "grad_norm": 1.4073630571365356,
+      "learning_rate": 4.043927648578811e-06,
+      "loss": 0.9482,
+      "step": 368
+    },
+    {
+      "epoch": 0.5720930232558139,
+      "grad_norm": 1.6393091678619385,
+      "learning_rate": 4.039621016365203e-06,
+      "loss": 0.9714,
+      "step": 369
+    },
+    {
+      "epoch": 0.5736434108527132,
+      "grad_norm": 1.6095997095108032,
+      "learning_rate": 4.035314384151594e-06,
+      "loss": 0.9513,
+      "step": 370
+    },
+    {
+      "epoch": 0.5736434108527132,
+      "eval_loss": 1.0774425268173218,
+      "eval_runtime": 46.7096,
+      "eval_samples_per_second": 21.409,
+      "eval_steps_per_second": 1.349,
+      "step": 370
+    },
+    {
+      "epoch": 0.5751937984496124,
+      "grad_norm": 1.783840298652649,
+      "learning_rate": 4.031007751937985e-06,
+      "loss": 0.9564,
+      "step": 371
+    },
+    {
+      "epoch": 0.5767441860465117,
+      "grad_norm": 1.4101550579071045,
+      "learning_rate": 4.026701119724376e-06,
+      "loss": 0.9904,
+      "step": 372
+    },
+    {
+      "epoch": 0.5782945736434109,
+      "grad_norm": 1.8022546768188477,
+      "learning_rate": 4.022394487510767e-06,
+      "loss": 0.9826,
+      "step": 373
+    },
+    {
+      "epoch": 0.57984496124031,
+      "grad_norm": 2.191577672958374,
+      "learning_rate": 4.018087855297158e-06,
+      "loss": 0.973,
+      "step": 374
+    },
+    {
+      "epoch": 0.5813953488372093,
+      "grad_norm": 1.8315773010253906,
+      "learning_rate": 4.0137812230835486e-06,
+      "loss": 0.9809,
+      "step": 375
+    },
+    {
+      "epoch": 0.5829457364341085,
+      "grad_norm": 1.692712664604187,
+      "learning_rate": 4.00947459086994e-06,
+      "loss": 0.9805,
+      "step": 376
+    },
+    {
+      "epoch": 0.5844961240310077,
+      "grad_norm": 1.888137698173523,
+      "learning_rate": 4.005167958656331e-06,
+      "loss": 1.0037,
+      "step": 377
+    },
+    {
+      "epoch": 0.586046511627907,
+      "grad_norm": 1.4804847240447998,
+      "learning_rate": 4.000861326442722e-06,
+      "loss": 0.9519,
+      "step": 378
+    },
+    {
+      "epoch": 0.5875968992248062,
+      "grad_norm": 1.3487441539764404,
+      "learning_rate": 3.996554694229113e-06,
+      "loss": 0.9736,
+      "step": 379
+    },
+    {
+      "epoch": 0.5891472868217055,
+      "grad_norm": 1.4220670461654663,
+      "learning_rate": 3.992248062015504e-06,
+      "loss": 0.9745,
+      "step": 380
+    },
+    {
+      "epoch": 0.5891472868217055,
+      "eval_loss": 1.078254222869873,
+      "eval_runtime": 46.7579,
+      "eval_samples_per_second": 21.387,
+      "eval_steps_per_second": 1.347,
+      "step": 380
+    },
+    {
+      "epoch": 0.5906976744186047,
+      "grad_norm": 1.7034496068954468,
+      "learning_rate": 3.987941429801895e-06,
+      "loss": 0.9566,
+      "step": 381
+    },
+    {
+      "epoch": 0.5922480620155038,
+      "grad_norm": 1.5833885669708252,
+      "learning_rate": 3.983634797588286e-06,
+      "loss": 0.9712,
+      "step": 382
+    },
+    {
+      "epoch": 0.5937984496124031,
+      "grad_norm": 1.4043247699737549,
+      "learning_rate": 3.979328165374677e-06,
+      "loss": 0.9919,
+      "step": 383
+    },
+    {
+      "epoch": 0.5953488372093023,
+      "grad_norm": 1.7453659772872925,
+      "learning_rate": 3.9750215331610685e-06,
+      "loss": 1.0067,
+      "step": 384
+    },
+    {
+      "epoch": 0.5968992248062015,
+      "grad_norm": 1.2731187343597412,
+      "learning_rate": 3.970714900947459e-06,
+      "loss": 0.9521,
+      "step": 385
+    },
+    {
+      "epoch": 0.5984496124031008,
+      "grad_norm": 1.5030323266983032,
+      "learning_rate": 3.9664082687338505e-06,
+      "loss": 0.9613,
+      "step": 386
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.53596031665802,
+      "learning_rate": 3.962101636520241e-06,
+      "loss": 0.9654,
+      "step": 387
+    },
+    {
+      "epoch": 0.6015503875968993,
+      "grad_norm": 1.6132758855819702,
+      "learning_rate": 3.957795004306632e-06,
+      "loss": 0.9967,
+      "step": 388
+    },
+    {
+      "epoch": 0.6031007751937985,
+      "grad_norm": 1.3358467817306519,
+      "learning_rate": 3.953488372093024e-06,
+      "loss": 0.9542,
+      "step": 389
+    },
+    {
+      "epoch": 0.6046511627906976,
+      "grad_norm": 1.7046970129013062,
+      "learning_rate": 3.949181739879414e-06,
+      "loss": 0.9607,
+      "step": 390
+    },
+    {
+      "epoch": 0.6046511627906976,
+      "eval_loss": 1.076040267944336,
+      "eval_runtime": 46.6238,
+      "eval_samples_per_second": 21.448,
+      "eval_steps_per_second": 1.351,
+      "step": 390
+    },
+    {
+      "epoch": 0.6062015503875969,
+      "grad_norm": 1.3822777271270752,
+      "learning_rate": 3.944875107665806e-06,
+      "loss": 0.9484,
+      "step": 391
+    },
+    {
+      "epoch": 0.6077519379844961,
+      "grad_norm": 2.166247844696045,
+      "learning_rate": 3.940568475452196e-06,
+      "loss": 0.9447,
+      "step": 392
+    },
+    {
+      "epoch": 0.6093023255813953,
+      "grad_norm": 1.4651894569396973,
+      "learning_rate": 3.936261843238588e-06,
+      "loss": 0.9858,
+      "step": 393
+    },
+    {
+      "epoch": 0.6108527131782946,
+      "grad_norm": 1.5112606287002563,
+      "learning_rate": 3.931955211024979e-06,
+      "loss": 0.9934,
+      "step": 394
+    },
+    {
+      "epoch": 0.6124031007751938,
+      "grad_norm": 1.657082200050354,
+      "learning_rate": 3.92764857881137e-06,
+      "loss": 0.9742,
+      "step": 395
+    },
+    {
+      "epoch": 0.6139534883720931,
+      "grad_norm": 1.5030896663665771,
+      "learning_rate": 3.923341946597761e-06,
+      "loss": 0.9566,
+      "step": 396
+    },
+    {
+      "epoch": 0.6155038759689923,
+      "grad_norm": 1.536012887954712,
+      "learning_rate": 3.919035314384152e-06,
+      "loss": 0.986,
+      "step": 397
+    },
+    {
+      "epoch": 0.6170542635658914,
+      "grad_norm": 1.7443193197250366,
+      "learning_rate": 3.914728682170543e-06,
+      "loss": 0.9722,
+      "step": 398
+    },
+    {
+      "epoch": 0.6186046511627907,
+      "grad_norm": 1.4055274724960327,
+      "learning_rate": 3.910422049956934e-06,
+      "loss": 0.9561,
+      "step": 399
+    },
+    {
+      "epoch": 0.6201550387596899,
+      "grad_norm": 1.6770274639129639,
+      "learning_rate": 3.906115417743325e-06,
+      "loss": 0.9506,
+      "step": 400
+    },
+    {
+      "epoch": 0.6201550387596899,
+      "eval_loss": 1.0693814754486084,
+      "eval_runtime": 46.7609,
+      "eval_samples_per_second": 21.385,
+      "eval_steps_per_second": 1.347,
+      "step": 400
+    },
+    {
+      "epoch": 0.6217054263565891,
+      "grad_norm": 1.9705766439437866,
+      "learning_rate": 3.901808785529716e-06,
+      "loss": 0.9313,
+      "step": 401
+    },
+    {
+      "epoch": 0.6232558139534884,
+      "grad_norm": 1.6670687198638916,
+      "learning_rate": 3.897502153316108e-06,
+      "loss": 0.9685,
+      "step": 402
+    },
+    {
+      "epoch": 0.6248062015503876,
+      "grad_norm": 1.290456771850586,
+      "learning_rate": 3.893195521102498e-06,
+      "loss": 0.9516,
+      "step": 403
+    },
+    {
+      "epoch": 0.6263565891472869,
+      "grad_norm": 1.2610697746276855,
+      "learning_rate": 3.88888888888889e-06,
+      "loss": 0.958,
+      "step": 404
+    },
+    {
+      "epoch": 0.627906976744186,
+      "grad_norm": 1.427687406539917,
+      "learning_rate": 3.88458225667528e-06,
+      "loss": 0.9522,
+      "step": 405
+    },
+    {
+      "epoch": 0.6294573643410852,
+      "grad_norm": 1.5878995656967163,
+      "learning_rate": 3.8802756244616715e-06,
+      "loss": 0.9856,
+      "step": 406
+    },
+    {
+      "epoch": 0.6310077519379845,
+      "grad_norm": 1.3636142015457153,
+      "learning_rate": 3.875968992248063e-06,
+      "loss": 0.9166,
+      "step": 407
+    },
+    {
+      "epoch": 0.6325581395348837,
+      "grad_norm": 1.675515055656433,
+      "learning_rate": 3.8716623600344535e-06,
+      "loss": 0.9417,
+      "step": 408
+    },
+    {
+      "epoch": 0.6341085271317829,
+      "grad_norm": 1.4354209899902344,
+      "learning_rate": 3.867355727820845e-06,
+      "loss": 0.9434,
+      "step": 409
+    },
+    {
+      "epoch": 0.6356589147286822,
+      "grad_norm": 1.6979807615280151,
+      "learning_rate": 3.863049095607235e-06,
+      "loss": 0.9692,
+      "step": 410
+    },
+    {
+      "epoch": 0.6356589147286822,
+      "eval_loss": 1.0683481693267822,
+      "eval_runtime": 46.7785,
+      "eval_samples_per_second": 21.377,
+      "eval_steps_per_second": 1.347,
+      "step": 410
+    },
+    {
+      "epoch": 0.6372093023255814,
+      "grad_norm": 1.300763726234436,
+      "learning_rate": 3.858742463393627e-06,
+      "loss": 0.9572,
+      "step": 411
+    },
+    {
+      "epoch": 0.6387596899224807,
+      "grad_norm": 1.6949375867843628,
+      "learning_rate": 3.854435831180017e-06,
+      "loss": 0.9986,
+      "step": 412
+    },
+    {
+      "epoch": 0.6403100775193798,
+      "grad_norm": 1.6742664575576782,
+      "learning_rate": 3.850129198966409e-06,
+      "loss": 1.0145,
+      "step": 413
+    },
+    {
+      "epoch": 0.641860465116279,
+      "grad_norm": 1.3767681121826172,
+      "learning_rate": 3.8458225667528e-06,
+      "loss": 0.9573,
+      "step": 414
+    },
+    {
+      "epoch": 0.6434108527131783,
+      "grad_norm": 1.656080722808838,
+      "learning_rate": 3.841515934539191e-06,
+      "loss": 0.9311,
+      "step": 415
+    },
+    {
+      "epoch": 0.6449612403100775,
+      "grad_norm": 1.4878498315811157,
+      "learning_rate": 3.837209302325582e-06,
+      "loss": 0.9467,
+      "step": 416
+    },
+    {
+      "epoch": 0.6465116279069767,
+      "grad_norm": 2.324364185333252,
+      "learning_rate": 3.832902670111973e-06,
+      "loss": 0.9442,
+      "step": 417
+    },
+    {
+      "epoch": 0.648062015503876,
+      "grad_norm": 1.2944444417953491,
+      "learning_rate": 3.828596037898364e-06,
+      "loss": 0.9664,
+      "step": 418
+    },
+    {
+      "epoch": 0.6496124031007752,
+      "grad_norm": 1.3722344636917114,
+      "learning_rate": 3.8242894056847545e-06,
+      "loss": 0.9458,
+      "step": 419
+    },
+    {
+      "epoch": 0.6511627906976745,
+      "grad_norm": 2.1579277515411377,
+      "learning_rate": 3.819982773471146e-06,
+      "loss": 0.9934,
+      "step": 420
+    },
+    {
+      "epoch": 0.6511627906976745,
+      "eval_loss": 1.0679476261138916,
+      "eval_runtime": 46.7186,
+      "eval_samples_per_second": 21.405,
+      "eval_steps_per_second": 1.348,
+      "step": 420
+    },
+    {
+      "epoch": 0.6527131782945736,
+      "grad_norm": 2.1869912147521973,
+      "learning_rate": 3.815676141257537e-06,
+      "loss": 0.922,
+      "step": 421
+    },
+    {
+      "epoch": 0.6542635658914728,
+      "grad_norm": 1.5521175861358643,
+      "learning_rate": 3.811369509043928e-06,
+      "loss": 0.9917,
+      "step": 422
+    },
+    {
+      "epoch": 0.6558139534883721,
+      "grad_norm": 1.3776301145553589,
+      "learning_rate": 3.8070628768303192e-06,
+      "loss": 0.9766,
+      "step": 423
+    },
+    {
+      "epoch": 0.6573643410852713,
+      "grad_norm": 2.0059685707092285,
+      "learning_rate": 3.8027562446167098e-06,
+      "loss": 0.9768,
+      "step": 424
+    },
+    {
+      "epoch": 0.6589147286821705,
+      "grad_norm": 1.6401809453964233,
+      "learning_rate": 3.798449612403101e-06,
+      "loss": 0.9773,
+      "step": 425
+    },
+    {
+      "epoch": 0.6604651162790698,
+      "grad_norm": 1.5309780836105347,
+      "learning_rate": 3.7941429801894917e-06,
+      "loss": 0.9605,
+      "step": 426
+    },
+    {
+      "epoch": 0.662015503875969,
+      "grad_norm": 1.6547430753707886,
+      "learning_rate": 3.789836347975883e-06,
+      "loss": 0.9674,
+      "step": 427
+    },
+    {
+      "epoch": 0.6635658914728683,
+      "grad_norm": 1.8590940237045288,
+      "learning_rate": 3.7855297157622745e-06,
+      "loss": 0.994,
+      "step": 428
+    },
+    {
+      "epoch": 0.6651162790697674,
+      "grad_norm": 1.347604751586914,
+      "learning_rate": 3.781223083548665e-06,
+      "loss": 0.9635,
+      "step": 429
+    },
+    {
+      "epoch": 0.6666666666666666,
+      "grad_norm": 1.4679491519927979,
+      "learning_rate": 3.7769164513350564e-06,
+      "loss": 0.9318,
+      "step": 430
+    },
+    {
+      "epoch": 0.6666666666666666,
+      "eval_loss": 1.0620640516281128,
+      "eval_runtime": 46.7063,
+      "eval_samples_per_second": 21.41,
+      "eval_steps_per_second": 1.349,
+      "step": 430
+    },
+    {
+      "epoch": 0.6682170542635659,
+      "grad_norm": 1.8079434633255005,
+      "learning_rate": 3.772609819121447e-06,
+      "loss": 0.9651,
+      "step": 431
+    },
+    {
+      "epoch": 0.6697674418604651,
+      "grad_norm": 1.7883085012435913,
+      "learning_rate": 3.7683031869078384e-06,
+      "loss": 0.9598,
+      "step": 432
+    },
+    {
+      "epoch": 0.6713178294573643,
+      "grad_norm": 1.6297731399536133,
+      "learning_rate": 3.7639965546942293e-06,
+      "loss": 1.0005,
+      "step": 433
+    },
+    {
+      "epoch": 0.6728682170542636,
+      "grad_norm": 1.4521540403366089,
+      "learning_rate": 3.7596899224806203e-06,
+      "loss": 0.9638,
+      "step": 434
+    },
+    {
+      "epoch": 0.6744186046511628,
+      "grad_norm": 1.7151950597763062,
+      "learning_rate": 3.7553832902670117e-06,
+      "loss": 0.9498,
+      "step": 435
+    },
+    {
+      "epoch": 0.6759689922480621,
+      "grad_norm": 1.7151180505752563,
+      "learning_rate": 3.7510766580534027e-06,
+      "loss": 0.9506,
+      "step": 436
+    },
+    {
+      "epoch": 0.6775193798449612,
+      "grad_norm": 1.6881719827651978,
+      "learning_rate": 3.7467700258397936e-06,
+      "loss": 0.9792,
+      "step": 437
+    },
+    {
+      "epoch": 0.6790697674418604,
+      "grad_norm": 1.6680631637573242,
+      "learning_rate": 3.7424633936261846e-06,
+      "loss": 0.9621,
+      "step": 438
+    },
+    {
+      "epoch": 0.6806201550387597,
+      "grad_norm": 1.5197463035583496,
+      "learning_rate": 3.7381567614125756e-06,
+      "loss": 0.9458,
+      "step": 439
+    },
+    {
+      "epoch": 0.6821705426356589,
+      "grad_norm": 1.3536181449890137,
+      "learning_rate": 3.7338501291989665e-06,
+      "loss": 0.9585,
+      "step": 440
+    },
+    {
+      "epoch": 0.6821705426356589,
+      "eval_loss": 1.0652538537979126,
+      "eval_runtime": 46.6998,
+      "eval_samples_per_second": 21.413,
+      "eval_steps_per_second": 1.349,
+      "step": 440
+    },
+    {
+      "epoch": 0.6837209302325581,
+      "grad_norm": 1.3938905000686646,
+      "learning_rate": 3.729543496985358e-06,
+      "loss": 0.9654,
+      "step": 441
+    },
+    {
+      "epoch": 0.6852713178294574,
+      "grad_norm": 1.677477478981018,
+      "learning_rate": 3.725236864771749e-06,
+      "loss": 0.9453,
+      "step": 442
+    },
+    {
+      "epoch": 0.6868217054263566,
+      "grad_norm": 1.654832363128662,
+      "learning_rate": 3.72093023255814e-06,
+      "loss": 0.9384,
+      "step": 443
+    },
+    {
+      "epoch": 0.6883720930232559,
+      "grad_norm": 1.330737829208374,
+      "learning_rate": 3.7166236003445313e-06,
+      "loss": 0.9682,
+      "step": 444
+    },
+    {
+      "epoch": 0.689922480620155,
+      "grad_norm": 1.4447482824325562,
+      "learning_rate": 3.712316968130922e-06,
+      "loss": 0.9465,
+      "step": 445
+    },
+    {
+      "epoch": 0.6914728682170542,
+      "grad_norm": 1.5042719841003418,
+      "learning_rate": 3.708010335917313e-06,
+      "loss": 0.9519,
+      "step": 446
+    },
+    {
+      "epoch": 0.6930232558139535,
+      "grad_norm": 1.5131895542144775,
+      "learning_rate": 3.7037037037037037e-06,
+      "loss": 0.9435,
+      "step": 447
+    },
+    {
+      "epoch": 0.6945736434108527,
+      "grad_norm": 1.4704760313034058,
+      "learning_rate": 3.699397071490095e-06,
+      "loss": 0.9643,
+      "step": 448
+    },
+    {
+      "epoch": 0.6961240310077519,
+      "grad_norm": 1.6629536151885986,
+      "learning_rate": 3.6950904392764865e-06,
+      "loss": 0.9813,
+      "step": 449
+    },
+    {
+      "epoch": 0.6976744186046512,
+      "grad_norm": 1.8209941387176514,
+      "learning_rate": 3.690783807062877e-06,
+      "loss": 0.9872,
+      "step": 450
+    },
+    {
+      "epoch": 0.6976744186046512,
+      "eval_loss": 1.0628501176834106,
+      "eval_runtime": 46.7022,
+      "eval_samples_per_second": 21.412,
+      "eval_steps_per_second": 1.349,
+      "step": 450
+    },
+    {
+      "epoch": 0.6992248062015504,
+      "grad_norm": 1.903017282485962,
+      "learning_rate": 3.6864771748492685e-06,
+      "loss": 0.9505,
+      "step": 451
+    },
+    {
+      "epoch": 0.7007751937984497,
+      "grad_norm": 1.5136739015579224,
+      "learning_rate": 3.682170542635659e-06,
+      "loss": 0.9826,
+      "step": 452
+    },
+    {
+      "epoch": 0.7023255813953488,
+      "grad_norm": 1.440664291381836,
+      "learning_rate": 3.6778639104220504e-06,
+      "loss": 0.9321,
+      "step": 453
+    },
+    {
+      "epoch": 0.703875968992248,
+      "grad_norm": 1.4127347469329834,
+      "learning_rate": 3.673557278208441e-06,
+      "loss": 0.9422,
+      "step": 454
+    },
+    {
+      "epoch": 0.7054263565891473,
+      "grad_norm": 1.6021867990493774,
+      "learning_rate": 3.6692506459948323e-06,
+      "loss": 0.9645,
+      "step": 455
+    },
+    {
+      "epoch": 0.7069767441860465,
+      "grad_norm": 1.1911089420318604,
+      "learning_rate": 3.6649440137812233e-06,
+      "loss": 0.9296,
+      "step": 456
+    },
+    {
+      "epoch": 0.7085271317829457,
+      "grad_norm": 1.9586585760116577,
+      "learning_rate": 3.6606373815676143e-06,
+      "loss": 0.9404,
+      "step": 457
+    },
+    {
+      "epoch": 0.710077519379845,
+      "grad_norm": 1.6152875423431396,
+      "learning_rate": 3.6563307493540057e-06,
+      "loss": 0.9489,
+      "step": 458
+    },
+    {
+      "epoch": 0.7116279069767442,
+      "grad_norm": 1.417580246925354,
+      "learning_rate": 3.652024117140396e-06,
+      "loss": 0.9628,
+      "step": 459
+    },
+    {
+      "epoch": 0.7131782945736435,
+      "grad_norm": 1.3339707851409912,
+      "learning_rate": 3.6477174849267876e-06,
+      "loss": 0.9415,
+      "step": 460
+    },
+    {
+      "epoch": 0.7131782945736435,
+      "eval_loss": 1.061155080795288,
+      "eval_runtime": 46.6993,
+      "eval_samples_per_second": 21.414,
+      "eval_steps_per_second": 1.349,
+      "step": 460
+    },
+    {
+      "epoch": 0.7147286821705426,
+      "grad_norm": 1.5678375959396362,
+      "learning_rate": 3.6434108527131786e-06,
+      "loss": 0.9344,
+      "step": 461
+    },
+    {
+      "epoch": 0.7162790697674418,
+      "grad_norm": 1.6713725328445435,
+      "learning_rate": 3.6391042204995695e-06,
+      "loss": 0.9387,
+      "step": 462
+    },
+    {
+      "epoch": 0.7178294573643411,
+      "grad_norm": 1.3815056085586548,
+      "learning_rate": 3.6347975882859605e-06,
+      "loss": 0.9666,
+      "step": 463
+    },
+    {
+      "epoch": 0.7193798449612403,
+      "grad_norm": 1.466451644897461,
+      "learning_rate": 3.630490956072352e-06,
+      "loss": 0.9766,
+      "step": 464
+    },
+    {
+      "epoch": 0.7209302325581395,
+      "grad_norm": 1.3906329870224,
+      "learning_rate": 3.626184323858743e-06,
+      "loss": 0.9517,
+      "step": 465
+    },
+    {
+      "epoch": 0.7224806201550388,
+      "grad_norm": 1.4931296110153198,
+      "learning_rate": 3.621877691645134e-06,
+      "loss": 0.9618,
+      "step": 466
+    },
+    {
+      "epoch": 0.724031007751938,
+      "grad_norm": 1.3734408617019653,
+      "learning_rate": 3.617571059431525e-06,
+      "loss": 0.9456,
+      "step": 467
+    },
+    {
+      "epoch": 0.7255813953488373,
+      "grad_norm": 1.6479203701019287,
+      "learning_rate": 3.6132644272179158e-06,
+      "loss": 0.9547,
+      "step": 468
+    },
+    {
+      "epoch": 0.7271317829457364,
+      "grad_norm": 1.3253997564315796,
+      "learning_rate": 3.608957795004307e-06,
+      "loss": 0.9598,
+      "step": 469
+    },
+    {
+      "epoch": 0.7286821705426356,
+      "grad_norm": 1.624961018562317,
+      "learning_rate": 3.6046511627906977e-06,
+      "loss": 0.9769,
+      "step": 470
+    },
+    {
+      "epoch": 0.7286821705426356,
+      "eval_loss": 1.0569216012954712,
+      "eval_runtime": 46.6486,
+      "eval_samples_per_second": 21.437,
+      "eval_steps_per_second": 1.351,
+      "step": 470
+    },
+    {
+      "epoch": 0.7302325581395349,
+      "grad_norm": 1.2979047298431396,
+      "learning_rate": 3.600344530577089e-06,
+      "loss": 0.9171,
+      "step": 471
+    },
+    {
+      "epoch": 0.7317829457364341,
+      "grad_norm": 1.3191477060317993,
+      "learning_rate": 3.5960378983634805e-06,
+      "loss": 0.943,
+      "step": 472
+    },
+    {
+      "epoch": 0.7333333333333333,
+      "grad_norm": 1.6449686288833618,
+      "learning_rate": 3.591731266149871e-06,
+      "loss": 0.9488,
+      "step": 473
+    },
+    {
+      "epoch": 0.7348837209302326,
+      "grad_norm": 1.4530247449874878,
+      "learning_rate": 3.5874246339362624e-06,
+      "loss": 0.9479,
+      "step": 474
+    },
+    {
+      "epoch": 0.7364341085271318,
+      "grad_norm": 1.400500774383545,
+      "learning_rate": 3.583118001722653e-06,
+      "loss": 0.9182,
+      "step": 475
+    },
+    {
+      "epoch": 0.737984496124031,
+      "grad_norm": 1.5714648962020874,
+      "learning_rate": 3.5788113695090443e-06,
+      "loss": 0.9756,
+      "step": 476
+    },
+    {
+      "epoch": 0.7395348837209302,
+      "grad_norm": 1.4749916791915894,
+      "learning_rate": 3.574504737295435e-06,
+      "loss": 0.9322,
+      "step": 477
+    },
+    {
+      "epoch": 0.7410852713178294,
+      "grad_norm": 1.3581645488739014,
+      "learning_rate": 3.5701981050818263e-06,
+      "loss": 0.9456,
+      "step": 478
+    },
+    {
+      "epoch": 0.7426356589147287,
+      "grad_norm": 1.58745276927948,
+      "learning_rate": 3.5658914728682177e-06,
+      "loss": 0.9197,
+      "step": 479
+    },
+    {
+      "epoch": 0.7441860465116279,
+      "grad_norm": 1.3937815427780151,
+      "learning_rate": 3.5615848406546082e-06,
+      "loss": 0.9271,
+      "step": 480
+    },
+    {
+      "epoch": 0.7441860465116279,
+      "eval_loss": 1.0562697649002075,
+      "eval_runtime": 46.7957,
+      "eval_samples_per_second": 21.369,
+      "eval_steps_per_second": 1.346,
+      "step": 480
+    },
+    {
+      "epoch": 0.7457364341085271,
+      "grad_norm": 1.3200627565383911,
+      "learning_rate": 3.5572782084409996e-06,
+      "loss": 0.9454,
+      "step": 481
+    },
+    {
+      "epoch": 0.7472868217054264,
+      "grad_norm": 1.427779197692871,
+      "learning_rate": 3.55297157622739e-06,
+      "loss": 0.9604,
+      "step": 482
+    },
+    {
+      "epoch": 0.7488372093023256,
+      "grad_norm": 1.8657872676849365,
+      "learning_rate": 3.5486649440137815e-06,
+      "loss": 0.9359,
+      "step": 483
+    },
+    {
+      "epoch": 0.7503875968992249,
+      "grad_norm": 2.0786523818969727,
+      "learning_rate": 3.5443583118001725e-06,
+      "loss": 0.9995,
+      "step": 484
+    },
+    {
+      "epoch": 0.751937984496124,
+      "grad_norm": 1.6857866048812866,
+      "learning_rate": 3.5400516795865635e-06,
+      "loss": 0.9657,
+      "step": 485
+    },
+    {
+      "epoch": 0.7534883720930232,
+      "grad_norm": 1.380631923675537,
+      "learning_rate": 3.535745047372955e-06,
+      "loss": 0.9625,
+      "step": 486
+    },
+    {
+      "epoch": 0.7550387596899225,
+      "grad_norm": 1.4253772497177124,
+      "learning_rate": 3.5314384151593454e-06,
+      "loss": 0.9499,
+      "step": 487
+    },
+    {
+      "epoch": 0.7565891472868217,
+      "grad_norm": 1.6862024068832397,
+      "learning_rate": 3.527131782945737e-06,
+      "loss": 0.9358,
+      "step": 488
+    },
+    {
+      "epoch": 0.7581395348837209,
+      "grad_norm": 1.6502622365951538,
+      "learning_rate": 3.5228251507321278e-06,
+      "loss": 0.926,
+      "step": 489
+    },
+    {
+      "epoch": 0.7596899224806202,
+      "grad_norm": 1.6885582208633423,
+      "learning_rate": 3.5185185185185187e-06,
+      "loss": 0.9426,
+      "step": 490
+    },
+    {
+      "epoch": 0.7596899224806202,
+      "eval_loss": 1.0555542707443237,
+      "eval_runtime": 46.8684,
+      "eval_samples_per_second": 21.336,
+      "eval_steps_per_second": 1.344,
+      "step": 490
+    },
+    {
+      "epoch": 0.7612403100775194,
+      "grad_norm": 1.6672579050064087,
+      "learning_rate": 3.5142118863049097e-06,
+      "loss": 0.9195,
+      "step": 491
+    },
+    {
+      "epoch": 0.7627906976744186,
+      "grad_norm": 1.3683370351791382,
+      "learning_rate": 3.509905254091301e-06,
+      "loss": 0.9301,
+      "step": 492
+    },
+    {
+      "epoch": 0.7643410852713178,
+      "grad_norm": 1.5962402820587158,
+      "learning_rate": 3.505598621877692e-06,
+      "loss": 0.9496,
+      "step": 493
+    },
+    {
+      "epoch": 0.765891472868217,
+      "grad_norm": 1.45559561252594,
+      "learning_rate": 3.501291989664083e-06,
+      "loss": 0.9263,
+      "step": 494
+    },
+    {
+      "epoch": 0.7674418604651163,
+      "grad_norm": 1.8540050983428955,
+      "learning_rate": 3.496985357450474e-06,
+      "loss": 0.9307,
+      "step": 495
+    },
+    {
+      "epoch": 0.7689922480620155,
+      "grad_norm": 2.0742030143737793,
+      "learning_rate": 3.492678725236865e-06,
+      "loss": 0.9206,
+      "step": 496
+    },
+    {
+      "epoch": 0.7705426356589147,
+      "grad_norm": 1.6275883913040161,
+      "learning_rate": 3.4883720930232564e-06,
+      "loss": 0.9237,
+      "step": 497
+    },
+    {
+      "epoch": 0.772093023255814,
+      "grad_norm": 1.5503264665603638,
+      "learning_rate": 3.484065460809647e-06,
+      "loss": 0.9468,
+      "step": 498
+    },
+    {
+      "epoch": 0.7736434108527132,
+      "grad_norm": 2.0982398986816406,
+      "learning_rate": 3.4797588285960383e-06,
+      "loss": 0.9697,
+      "step": 499
+    },
+    {
+      "epoch": 0.7751937984496124,
+      "grad_norm": 1.5442216396331787,
+      "learning_rate": 3.4754521963824293e-06,
+      "loss": 0.988,
+      "step": 500
+    },
+    {
+      "epoch": 0.7751937984496124,
+      "eval_loss": 1.0530461072921753,
+      "eval_runtime": 46.7519,
+      "eval_samples_per_second": 21.389,
+      "eval_steps_per_second": 1.348,
+      "step": 500
+    },
+    {
+      "epoch": 0.7767441860465116,
+      "grad_norm": 1.2036796808242798,
+      "learning_rate": 3.4711455641688202e-06,
+      "loss": 0.9044,
+      "step": 501
+    },
+    {
+      "epoch": 0.7782945736434108,
+      "grad_norm": 1.345171332359314,
+      "learning_rate": 3.4668389319552116e-06,
+      "loss": 0.9189,
+      "step": 502
+    },
+    {
+      "epoch": 0.7798449612403101,
+      "grad_norm": 1.998370885848999,
+      "learning_rate": 3.462532299741602e-06,
+      "loss": 0.94,
+      "step": 503
+    },
+    {
+      "epoch": 0.7813953488372093,
+      "grad_norm": 1.5674575567245483,
+      "learning_rate": 3.4582256675279936e-06,
+      "loss": 0.9254,
+      "step": 504
+    },
+    {
+      "epoch": 0.7829457364341085,
+      "grad_norm": 1.315887689590454,
+      "learning_rate": 3.453919035314384e-06,
+      "loss": 0.9419,
+      "step": 505
+    },
+    {
+      "epoch": 0.7844961240310078,
+      "grad_norm": 1.5615918636322021,
+      "learning_rate": 3.4496124031007755e-06,
+      "loss": 0.9518,
+      "step": 506
+    },
+    {
+      "epoch": 0.786046511627907,
+      "grad_norm": 1.6197636127471924,
+      "learning_rate": 3.445305770887167e-06,
+      "loss": 0.9401,
+      "step": 507
+    },
+    {
+      "epoch": 0.7875968992248062,
+      "grad_norm": 1.2421844005584717,
+      "learning_rate": 3.4409991386735574e-06,
+      "loss": 0.934,
+      "step": 508
+    },
+    {
+      "epoch": 0.7891472868217054,
+      "grad_norm": 1.2320061922073364,
+      "learning_rate": 3.436692506459949e-06,
+      "loss": 0.9228,
+      "step": 509
+    },
+    {
+      "epoch": 0.7906976744186046,
+      "grad_norm": 1.4907335042953491,
+      "learning_rate": 3.4323858742463394e-06,
+      "loss": 0.9741,
+      "step": 510
+    },
+    {
+      "epoch": 0.7906976744186046,
+      "eval_loss": 1.0517873764038086,
+      "eval_runtime": 46.9265,
+      "eval_samples_per_second": 21.31,
+      "eval_steps_per_second": 1.343,
+      "step": 510
+    },
+    {
+      "epoch": 0.7922480620155039,
+      "grad_norm": 1.1737737655639648,
+      "learning_rate": 3.4280792420327308e-06,
+      "loss": 0.8744,
+      "step": 511
+    },
+    {
+      "epoch": 0.7937984496124031,
+      "grad_norm": 1.3589445352554321,
+      "learning_rate": 3.4237726098191217e-06,
+      "loss": 0.9366,
+      "step": 512
+    },
+    {
+      "epoch": 0.7953488372093023,
+      "grad_norm": 1.3620688915252686,
+      "learning_rate": 3.4194659776055127e-06,
+      "loss": 0.9422,
+      "step": 513
+    },
+    {
+      "epoch": 0.7968992248062016,
+      "grad_norm": 1.8479820489883423,
+      "learning_rate": 3.415159345391904e-06,
+      "loss": 0.9338,
+      "step": 514
+    },
+    {
+      "epoch": 0.7984496124031008,
+      "grad_norm": 1.673426866531372,
+      "learning_rate": 3.4108527131782946e-06,
+      "loss": 0.9237,
+      "step": 515
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.6327366828918457,
+      "learning_rate": 3.406546080964686e-06,
+      "loss": 0.937,
+      "step": 516
+    },
+    {
+      "epoch": 0.8015503875968992,
+      "grad_norm": 1.3908108472824097,
+      "learning_rate": 3.402239448751077e-06,
+      "loss": 0.9524,
+      "step": 517
+    },
+    {
+      "epoch": 0.8031007751937984,
+      "grad_norm": 1.5284862518310547,
+      "learning_rate": 3.397932816537468e-06,
+      "loss": 0.9818,
+      "step": 518
+    },
+    {
+      "epoch": 0.8046511627906977,
+      "grad_norm": 1.657374620437622,
+      "learning_rate": 3.393626184323859e-06,
+      "loss": 0.8977,
+      "step": 519
+    },
+    {
+      "epoch": 0.8062015503875969,
+      "grad_norm": 1.6133347749710083,
+      "learning_rate": 3.38931955211025e-06,
+      "loss": 0.9226,
+      "step": 520
+    },
+    {
+      "epoch": 0.8062015503875969,
+      "eval_loss": 1.0512946844100952,
+      "eval_runtime": 46.8983,
+      "eval_samples_per_second": 21.323,
+      "eval_steps_per_second": 1.343,
+      "step": 520
+    },
+    {
+      "epoch": 0.8077519379844961,
+      "grad_norm": 1.406281590461731,
+      "learning_rate": 3.3850129198966413e-06,
+      "loss": 0.9272,
+      "step": 521
+    },
+    {
+      "epoch": 0.8093023255813954,
+      "grad_norm": 1.2945131063461304,
+      "learning_rate": 3.3807062876830323e-06,
+      "loss": 0.9195,
+      "step": 522
+    },
+    {
+      "epoch": 0.8108527131782945,
+      "grad_norm": 1.5606355667114258,
+      "learning_rate": 3.3763996554694232e-06,
+      "loss": 0.919,
+      "step": 523
+    },
+    {
+      "epoch": 0.8124031007751938,
+      "grad_norm": 1.3578014373779297,
+      "learning_rate": 3.372093023255814e-06,
+      "loss": 0.9443,
+      "step": 524
+    },
+    {
+      "epoch": 0.813953488372093,
+      "grad_norm": 1.62376868724823,
+      "learning_rate": 3.3677863910422056e-06,
+      "loss": 0.9629,
+      "step": 525
+    },
+    {
+      "epoch": 0.8155038759689922,
+      "grad_norm": 1.5871691703796387,
+      "learning_rate": 3.363479758828596e-06,
+      "loss": 0.9555,
+      "step": 526
+    },
+    {
+      "epoch": 0.8170542635658915,
+      "grad_norm": 1.765234112739563,
+      "learning_rate": 3.3591731266149875e-06,
+      "loss": 0.9506,
+      "step": 527
+    },
+    {
+      "epoch": 0.8186046511627907,
+      "grad_norm": 1.445127248764038,
+      "learning_rate": 3.354866494401378e-06,
+      "loss": 0.9583,
+      "step": 528
+    },
+    {
+      "epoch": 0.8201550387596899,
+      "grad_norm": 1.3839683532714844,
+      "learning_rate": 3.3505598621877695e-06,
+      "loss": 0.9735,
+      "step": 529
+    },
+    {
+      "epoch": 0.8217054263565892,
+      "grad_norm": 1.2828640937805176,
+      "learning_rate": 3.346253229974161e-06,
+      "loss": 0.917,
+      "step": 530
+    },
+    {
+      "epoch": 0.8217054263565892,
+      "eval_loss": 1.0487127304077148,
+      "eval_runtime": 46.8274,
+      "eval_samples_per_second": 21.355,
+      "eval_steps_per_second": 1.345,
+      "step": 530
+    },
+    {
+      "epoch": 0.8232558139534883,
+      "grad_norm": 1.6673675775527954,
+      "learning_rate": 3.3419465977605514e-06,
+      "loss": 0.9419,
+      "step": 531
+    },
+    {
+      "epoch": 0.8248062015503876,
+      "grad_norm": 1.508750319480896,
+      "learning_rate": 3.3376399655469428e-06,
+      "loss": 0.9363,
+      "step": 532
+    },
+    {
+      "epoch": 0.8263565891472868,
+      "grad_norm": 1.4697679281234741,
+      "learning_rate": 3.3333333333333333e-06,
+      "loss": 0.9531,
+      "step": 533
+    },
+    {
+      "epoch": 0.827906976744186,
+      "grad_norm": 1.2485036849975586,
+      "learning_rate": 3.3290267011197247e-06,
+      "loss": 0.938,
+      "step": 534
+    },
+    {
+      "epoch": 0.8294573643410853,
+      "grad_norm": 1.6659992933273315,
+      "learning_rate": 3.3247200689061153e-06,
+      "loss": 0.9419,
+      "step": 535
+    },
+    {
+      "epoch": 0.8310077519379845,
+      "grad_norm": 1.207901120185852,
+      "learning_rate": 3.3204134366925067e-06,
+      "loss": 0.9582,
+      "step": 536
+    },
+    {
+      "epoch": 0.8325581395348837,
+      "grad_norm": 1.456590175628662,
+      "learning_rate": 3.316106804478898e-06,
+      "loss": 0.9355,
+      "step": 537
+    },
+    {
+      "epoch": 0.834108527131783,
+      "grad_norm": 1.2961267232894897,
+      "learning_rate": 3.3118001722652886e-06,
+      "loss": 0.9317,
+      "step": 538
+    },
+    {
+      "epoch": 0.8356589147286821,
+      "grad_norm": 1.304347276687622,
+      "learning_rate": 3.30749354005168e-06,
+      "loss": 0.9133,
+      "step": 539
+    },
+    {
+      "epoch": 0.8372093023255814,
+      "grad_norm": 1.2878105640411377,
+      "learning_rate": 3.3031869078380705e-06,
+      "loss": 0.908,
+      "step": 540
+    },
+    {
+      "epoch": 0.8372093023255814,
+      "eval_loss": 1.047052264213562,
+      "eval_runtime": 46.8479,
+      "eval_samples_per_second": 21.346,
+      "eval_steps_per_second": 1.345,
+      "step": 540
+    },
+    {
+      "epoch": 0.8387596899224806,
+      "grad_norm": 1.663776159286499,
+      "learning_rate": 3.298880275624462e-06,
+      "loss": 0.8975,
+      "step": 541
+    },
+    {
+      "epoch": 0.8403100775193798,
+      "grad_norm": 1.3324629068374634,
+      "learning_rate": 3.294573643410853e-06,
+      "loss": 0.9524,
+      "step": 542
+    },
+    {
+      "epoch": 0.8418604651162791,
+      "grad_norm": 1.5450448989868164,
+      "learning_rate": 3.290267011197244e-06,
+      "loss": 0.9771,
+      "step": 543
+    },
+    {
+      "epoch": 0.8434108527131783,
+      "grad_norm": 1.4546128511428833,
+      "learning_rate": 3.2859603789836352e-06,
+      "loss": 0.9168,
+      "step": 544
+    },
+    {
+      "epoch": 0.8449612403100775,
+      "grad_norm": 1.623872995376587,
+      "learning_rate": 3.281653746770026e-06,
+      "loss": 0.9637,
+      "step": 545
+    },
+    {
+      "epoch": 0.8465116279069768,
+      "grad_norm": 1.4465365409851074,
+      "learning_rate": 3.277347114556417e-06,
+      "loss": 0.9399,
+      "step": 546
+    },
+    {
+      "epoch": 0.8480620155038759,
+      "grad_norm": 1.4867907762527466,
+      "learning_rate": 3.273040482342808e-06,
+      "loss": 0.9274,
+      "step": 547
+    },
+    {
+      "epoch": 0.8496124031007752,
+      "grad_norm": 1.429142713546753,
+      "learning_rate": 3.268733850129199e-06,
+      "loss": 0.9566,
+      "step": 548
+    },
+    {
+      "epoch": 0.8511627906976744,
+      "grad_norm": 1.2755072116851807,
+      "learning_rate": 3.26442721791559e-06,
+      "loss": 0.9417,
+      "step": 549
+    },
+    {
+      "epoch": 0.8527131782945736,
+      "grad_norm": 1.5570671558380127,
+      "learning_rate": 3.2601205857019815e-06,
+      "loss": 0.9316,
+      "step": 550
+    },
+    {
+      "epoch": 0.8527131782945736,
+      "eval_loss": 1.0439643859863281,
+      "eval_runtime": 46.7656,
+      "eval_samples_per_second": 21.383,
+      "eval_steps_per_second": 1.347,
+      "step": 550
+    },
+    {
+      "epoch": 0.8542635658914729,
+      "grad_norm": 1.3648735284805298,
+      "learning_rate": 3.2558139534883724e-06,
+      "loss": 0.9174,
+      "step": 551
+    },
+    {
+      "epoch": 0.8558139534883721,
+      "grad_norm": 1.1797237396240234,
+      "learning_rate": 3.2515073212747634e-06,
+      "loss": 0.9101,
+      "step": 552
+    },
+    {
+      "epoch": 0.8573643410852713,
+      "grad_norm": 1.3258795738220215,
+      "learning_rate": 3.247200689061155e-06,
+      "loss": 0.9288,
+      "step": 553
+    },
+    {
+      "epoch": 0.8589147286821706,
+      "grad_norm": 1.6303619146347046,
+      "learning_rate": 3.2428940568475453e-06,
+      "loss": 0.9293,
+      "step": 554
+    },
+    {
+      "epoch": 0.8604651162790697,
+      "grad_norm": 1.412011981010437,
+      "learning_rate": 3.2385874246339367e-06,
+      "loss": 0.9597,
+      "step": 555
+    },
+    {
+      "epoch": 0.862015503875969,
+      "grad_norm": 1.3949530124664307,
+      "learning_rate": 3.2342807924203273e-06,
+      "loss": 0.8872,
+      "step": 556
+    },
+    {
+      "epoch": 0.8635658914728682,
+      "grad_norm": 1.3831912279129028,
+      "learning_rate": 3.2299741602067187e-06,
+      "loss": 0.9067,
+      "step": 557
+    },
+    {
+      "epoch": 0.8651162790697674,
+      "grad_norm": 1.4866950511932373,
+      "learning_rate": 3.22566752799311e-06,
+      "loss": 0.9392,
+      "step": 558
+    },
+    {
+      "epoch": 0.8666666666666667,
+      "grad_norm": 1.6705503463745117,
+      "learning_rate": 3.2213608957795006e-06,
+      "loss": 0.9563,
+      "step": 559
+    },
+    {
+      "epoch": 0.8682170542635659,
+      "grad_norm": 1.3490782976150513,
+      "learning_rate": 3.217054263565892e-06,
+      "loss": 0.9131,
+      "step": 560
+    },
+    {
+      "epoch": 0.8682170542635659,
+      "eval_loss": 1.0431301593780518,
+      "eval_runtime": 46.8914,
+      "eval_samples_per_second": 21.326,
+      "eval_steps_per_second": 1.344,
+      "step": 560
+    },
+    {
+      "epoch": 0.8697674418604651,
+      "grad_norm": 1.2459760904312134,
+      "learning_rate": 3.2127476313522825e-06,
+      "loss": 0.9982,
+      "step": 561
+    },
+    {
+      "epoch": 0.8713178294573644,
+      "grad_norm": 1.5161173343658447,
+      "learning_rate": 3.208440999138674e-06,
+      "loss": 0.9763,
+      "step": 562
+    },
+    {
+      "epoch": 0.8728682170542635,
+      "grad_norm": 1.6294832229614258,
+      "learning_rate": 3.2041343669250645e-06,
+      "loss": 0.9324,
+      "step": 563
+    },
+    {
+      "epoch": 0.8744186046511628,
+      "grad_norm": 1.414649248123169,
+      "learning_rate": 3.199827734711456e-06,
+      "loss": 0.9336,
+      "step": 564
+    },
+    {
+      "epoch": 0.875968992248062,
+      "grad_norm": 1.5806972980499268,
+      "learning_rate": 3.1955211024978473e-06,
+      "loss": 0.9293,
+      "step": 565
+    },
+    {
+      "epoch": 0.8775193798449612,
+      "grad_norm": 1.524552345275879,
+      "learning_rate": 3.191214470284238e-06,
+      "loss": 0.9531,
+      "step": 566
+    },
+    {
+      "epoch": 0.8790697674418605,
+      "grad_norm": 1.7706551551818848,
+      "learning_rate": 3.186907838070629e-06,
+      "loss": 0.9478,
+      "step": 567
+    },
+    {
+      "epoch": 0.8806201550387597,
+      "grad_norm": 1.6926461458206177,
+      "learning_rate": 3.1826012058570197e-06,
+      "loss": 0.9311,
+      "step": 568
+    },
+    {
+      "epoch": 0.8821705426356589,
+      "grad_norm": 1.43769109249115,
+      "learning_rate": 3.178294573643411e-06,
+      "loss": 0.9509,
+      "step": 569
+    },
+    {
+      "epoch": 0.8837209302325582,
+      "grad_norm": 1.8943246603012085,
+      "learning_rate": 3.173987941429802e-06,
+      "loss": 0.9301,
+      "step": 570
+    },
+    {
+      "epoch": 0.8837209302325582,
+      "eval_loss": 1.0409941673278809,
+      "eval_runtime": 46.854,
+      "eval_samples_per_second": 21.343,
+      "eval_steps_per_second": 1.345,
+      "step": 570
+    },
+    {
+      "epoch": 0.8852713178294573,
+      "grad_norm": 1.3987243175506592,
+      "learning_rate": 3.169681309216193e-06,
+      "loss": 0.9351,
+      "step": 571
+    },
+    {
+      "epoch": 0.8868217054263566,
+      "grad_norm": 1.3229787349700928,
+      "learning_rate": 3.1653746770025845e-06,
+      "loss": 0.9429,
+      "step": 572
+    },
+    {
+      "epoch": 0.8883720930232558,
+      "grad_norm": 1.3845443725585938,
+      "learning_rate": 3.1610680447889754e-06,
+      "loss": 0.931,
+      "step": 573
+    },
+    {
+      "epoch": 0.889922480620155,
+      "grad_norm": 1.3660778999328613,
+      "learning_rate": 3.1567614125753664e-06,
+      "loss": 0.9146,
+      "step": 574
+    },
+    {
+      "epoch": 0.8914728682170543,
+      "grad_norm": 1.6824007034301758,
+      "learning_rate": 3.1524547803617574e-06,
+      "loss": 0.9355,
+      "step": 575
+    },
+    {
+      "epoch": 0.8930232558139535,
+      "grad_norm": 1.283627986907959,
+      "learning_rate": 3.1481481481481483e-06,
+      "loss": 0.9401,
+      "step": 576
+    },
+    {
+      "epoch": 0.8945736434108527,
+      "grad_norm": 1.3251144886016846,
+      "learning_rate": 3.1438415159345393e-06,
+      "loss": 0.9103,
+      "step": 577
+    },
+    {
+      "epoch": 0.896124031007752,
+      "grad_norm": 1.3561856746673584,
+      "learning_rate": 3.1395348837209307e-06,
+      "loss": 0.9096,
+      "step": 578
+    },
+    {
+      "epoch": 0.8976744186046511,
+      "grad_norm": 1.5303910970687866,
+      "learning_rate": 3.1352282515073217e-06,
+      "loss": 0.9187,
+      "step": 579
+    },
+    {
+      "epoch": 0.8992248062015504,
+      "grad_norm": 1.2881441116333008,
+      "learning_rate": 3.1309216192937126e-06,
+      "loss": 0.9671,
+      "step": 580
+    },
+    {
+      "epoch": 0.8992248062015504,
+      "eval_loss": 1.0374441146850586,
+      "eval_runtime": 46.9153,
+      "eval_samples_per_second": 21.315,
+      "eval_steps_per_second": 1.343,
+      "step": 580
+    },
+    {
+      "epoch": 0.9007751937984496,
+      "grad_norm": 1.260320782661438,
+      "learning_rate": 3.126614987080104e-06,
+      "loss": 0.9375,
+      "step": 581
+    },
+    {
+      "epoch": 0.9023255813953488,
+      "grad_norm": 1.4502344131469727,
+      "learning_rate": 3.1223083548664946e-06,
+      "loss": 0.9049,
+      "step": 582
+    },
+    {
+      "epoch": 0.9038759689922481,
+      "grad_norm": 1.310264229774475,
+      "learning_rate": 3.118001722652886e-06,
+      "loss": 0.926,
+      "step": 583
+    },
+    {
+      "epoch": 0.9054263565891473,
+      "grad_norm": 1.3460558652877808,
+      "learning_rate": 3.1136950904392765e-06,
+      "loss": 0.9237,
+      "step": 584
+    },
+    {
+      "epoch": 0.9069767441860465,
+      "grad_norm": 1.5001479387283325,
+      "learning_rate": 3.109388458225668e-06,
+      "loss": 0.9161,
+      "step": 585
+    },
+    {
+      "epoch": 0.9085271317829458,
+      "grad_norm": 1.394601583480835,
+      "learning_rate": 3.1050818260120593e-06,
+      "loss": 0.9684,
+      "step": 586
+    },
+    {
+      "epoch": 0.9100775193798449,
+      "grad_norm": 1.2460273504257202,
+      "learning_rate": 3.10077519379845e-06,
+      "loss": 0.9369,
+      "step": 587
+    },
+    {
+      "epoch": 0.9116279069767442,
+      "grad_norm": 1.4341247081756592,
+      "learning_rate": 3.096468561584841e-06,
+      "loss": 0.9552,
+      "step": 588
+    },
+    {
+      "epoch": 0.9131782945736434,
+      "grad_norm": 1.6534271240234375,
+      "learning_rate": 3.0921619293712318e-06,
+      "loss": 0.8792,
+      "step": 589
+    },
+    {
+      "epoch": 0.9147286821705426,
+      "grad_norm": 2.0037407875061035,
+      "learning_rate": 3.087855297157623e-06,
+      "loss": 0.9157,
+      "step": 590
+    },
+    {
+      "epoch": 0.9147286821705426,
+      "eval_loss": 1.0375549793243408,
+      "eval_runtime": 46.8146,
+      "eval_samples_per_second": 21.361,
+      "eval_steps_per_second": 1.346,
+      "step": 590
+    },
+    {
+      "epoch": 0.9162790697674419,
+      "grad_norm": 1.5423766374588013,
+      "learning_rate": 3.0835486649440137e-06,
+      "loss": 0.95,
+      "step": 591
+    },
+    {
+      "epoch": 0.9178294573643411,
+      "grad_norm": 1.272499918937683,
+      "learning_rate": 3.079242032730405e-06,
+      "loss": 0.9288,
+      "step": 592
+    },
+    {
+      "epoch": 0.9193798449612403,
+      "grad_norm": 1.304702639579773,
+      "learning_rate": 3.074935400516796e-06,
+      "loss": 0.9131,
+      "step": 593
+    },
+    {
+      "epoch": 0.9209302325581395,
+      "grad_norm": 1.9163035154342651,
+      "learning_rate": 3.070628768303187e-06,
+      "loss": 0.8756,
+      "step": 594
+    },
+    {
+      "epoch": 0.9224806201550387,
+      "grad_norm": 1.6911531686782837,
+      "learning_rate": 3.0663221360895784e-06,
+      "loss": 0.9391,
+      "step": 595
+    },
+    {
+      "epoch": 0.924031007751938,
+      "grad_norm": 1.4596829414367676,
+      "learning_rate": 3.062015503875969e-06,
+      "loss": 0.9413,
+      "step": 596
+    },
+    {
+      "epoch": 0.9255813953488372,
+      "grad_norm": 1.4826016426086426,
+      "learning_rate": 3.0577088716623603e-06,
+      "loss": 0.9136,
+      "step": 597
+    },
+    {
+      "epoch": 0.9271317829457364,
+      "grad_norm": 1.2671222686767578,
+      "learning_rate": 3.0534022394487513e-06,
+      "loss": 0.8991,
+      "step": 598
+    },
+    {
+      "epoch": 0.9286821705426357,
+      "grad_norm": 1.6153379678726196,
+      "learning_rate": 3.0490956072351423e-06,
+      "loss": 0.925,
+      "step": 599
+    },
+    {
+      "epoch": 0.9302325581395349,
+      "grad_norm": 1.5550309419631958,
+      "learning_rate": 3.0447889750215332e-06,
+      "loss": 0.9235,
+      "step": 600
+    },
+    {
+      "epoch": 0.9302325581395349,
+      "eval_loss": 1.0385956764221191,
+      "eval_runtime": 46.8361,
+      "eval_samples_per_second": 21.351,
+      "eval_steps_per_second": 1.345,
+      "step": 600
+    },
+    {
+      "epoch": 0.931782945736434,
+      "grad_norm": 1.2835649251937866,
+      "learning_rate": 3.0404823428079242e-06,
+      "loss": 0.9586,
+      "step": 601
+    },
+    {
+      "epoch": 0.9333333333333333,
+      "grad_norm": 1.3722798824310303,
+      "learning_rate": 3.0361757105943156e-06,
+      "loss": 0.9018,
+      "step": 602
+    },
+    {
+      "epoch": 0.9348837209302325,
+      "grad_norm": 1.4283156394958496,
+      "learning_rate": 3.0318690783807066e-06,
+      "loss": 0.939,
+      "step": 603
+    },
+    {
+      "epoch": 0.9364341085271318,
+      "grad_norm": 1.3454813957214355,
+      "learning_rate": 3.0275624461670975e-06,
+      "loss": 0.9296,
+      "step": 604
+    },
+    {
+      "epoch": 0.937984496124031,
+      "grad_norm": 1.2258172035217285,
+      "learning_rate": 3.0232558139534885e-06,
+      "loss": 0.9141,
+      "step": 605
+    },
+    {
+      "epoch": 0.9395348837209302,
+      "grad_norm": 1.4662123918533325,
+      "learning_rate": 3.01894918173988e-06,
+      "loss": 0.9382,
+      "step": 606
+    },
+    {
+      "epoch": 0.9410852713178295,
+      "grad_norm": 1.8197379112243652,
+      "learning_rate": 3.0146425495262704e-06,
+      "loss": 0.9361,
+      "step": 607
+    },
+    {
+      "epoch": 0.9426356589147287,
+      "grad_norm": 1.3216370344161987,
+      "learning_rate": 3.010335917312662e-06,
+      "loss": 0.9353,
+      "step": 608
+    },
+    {
+      "epoch": 0.9441860465116279,
+      "grad_norm": 1.6560941934585571,
+      "learning_rate": 3.006029285099053e-06,
+      "loss": 0.9102,
+      "step": 609
+    },
+    {
+      "epoch": 0.9457364341085271,
+      "grad_norm": 1.6229398250579834,
+      "learning_rate": 3.0017226528854438e-06,
+      "loss": 0.9177,
+      "step": 610
+    },
+    {
+      "epoch": 0.9457364341085271,
+      "eval_loss": 1.0402629375457764,
+      "eval_runtime": 46.9417,
+      "eval_samples_per_second": 21.303,
+      "eval_steps_per_second": 1.342,
+      "step": 610
+    },
+    {
+      "epoch": 0.9472868217054263,
+      "grad_norm": 1.4512161016464233,
+      "learning_rate": 2.997416020671835e-06,
+      "loss": 0.9386,
+      "step": 611
+    },
+    {
+      "epoch": 0.9488372093023256,
+      "grad_norm": 1.578696370124817,
+      "learning_rate": 2.9931093884582257e-06,
+      "loss": 0.919,
+      "step": 612
+    },
+    {
+      "epoch": 0.9503875968992248,
+      "grad_norm": 1.3147693872451782,
+      "learning_rate": 2.988802756244617e-06,
+      "loss": 0.9037,
+      "step": 613
+    },
+    {
+      "epoch": 0.951937984496124,
+      "grad_norm": 1.4229583740234375,
+      "learning_rate": 2.9844961240310076e-06,
+      "loss": 0.924,
+      "step": 614
+    },
+    {
+      "epoch": 0.9534883720930233,
+      "grad_norm": 1.5262490510940552,
+      "learning_rate": 2.980189491817399e-06,
+      "loss": 0.9287,
+      "step": 615
+    },
+    {
+      "epoch": 0.9550387596899225,
+      "grad_norm": 1.342050552368164,
+      "learning_rate": 2.9758828596037904e-06,
+      "loss": 0.9327,
+      "step": 616
+    },
+    {
+      "epoch": 0.9565891472868217,
+      "grad_norm": 1.3471406698226929,
+      "learning_rate": 2.971576227390181e-06,
+      "loss": 0.9113,
+      "step": 617
+    },
+    {
+      "epoch": 0.958139534883721,
+      "grad_norm": 1.7015011310577393,
+      "learning_rate": 2.9672695951765724e-06,
+      "loss": 0.9062,
+      "step": 618
+    },
+    {
+      "epoch": 0.9596899224806201,
+      "grad_norm": 1.6812702417373657,
+      "learning_rate": 2.962962962962963e-06,
+      "loss": 0.9307,
+      "step": 619
+    },
+    {
+      "epoch": 0.9612403100775194,
+      "grad_norm": 1.5313730239868164,
+      "learning_rate": 2.9586563307493543e-06,
+      "loss": 0.9149,
+      "step": 620
+    },
+    {
+      "epoch": 0.9612403100775194,
+      "eval_loss": 1.0406851768493652,
+      "eval_runtime": 46.8759,
+      "eval_samples_per_second": 21.333,
+      "eval_steps_per_second": 1.344,
+      "step": 620
+    },
+    {
+      "epoch": 0.9627906976744186,
+      "grad_norm": 1.806265950202942,
+      "learning_rate": 2.954349698535745e-06,
+      "loss": 0.9579,
+      "step": 621
+    },
+    {
+      "epoch": 0.9643410852713178,
+      "grad_norm": 1.4483739137649536,
+      "learning_rate": 2.9500430663221362e-06,
+      "loss": 0.9086,
+      "step": 622
+    },
+    {
+      "epoch": 0.9658914728682171,
+      "grad_norm": 1.275719165802002,
+      "learning_rate": 2.9457364341085276e-06,
+      "loss": 0.8883,
+      "step": 623
+    },
+    {
+      "epoch": 0.9674418604651163,
+      "grad_norm": 1.3039225339889526,
+      "learning_rate": 2.941429801894918e-06,
+      "loss": 0.9115,
+      "step": 624
+    },
+    {
+      "epoch": 0.9689922480620154,
+      "grad_norm": 1.658772587776184,
+      "learning_rate": 2.9371231696813096e-06,
+      "loss": 0.8889,
+      "step": 625
+    },
+    {
+      "epoch": 0.9705426356589147,
+      "grad_norm": 1.480093240737915,
+      "learning_rate": 2.9328165374677005e-06,
+      "loss": 0.9302,
+      "step": 626
+    },
+    {
+      "epoch": 0.9720930232558139,
+      "grad_norm": 1.272149682044983,
+      "learning_rate": 2.9285099052540915e-06,
+      "loss": 0.9357,
+      "step": 627
+    },
+    {
+      "epoch": 0.9736434108527132,
+      "grad_norm": 1.2398171424865723,
+      "learning_rate": 2.9242032730404825e-06,
+      "loss": 0.9148,
+      "step": 628
+    },
+    {
+      "epoch": 0.9751937984496124,
+      "grad_norm": 1.6542110443115234,
+      "learning_rate": 2.9198966408268734e-06,
+      "loss": 0.9419,
+      "step": 629
+    },
+    {
+      "epoch": 0.9767441860465116,
+      "grad_norm": 1.2841962575912476,
+      "learning_rate": 2.915590008613265e-06,
+      "loss": 0.9098,
+      "step": 630
+    },
+    {
+      "epoch": 0.9767441860465116,
+      "eval_loss": 1.039470911026001,
+      "eval_runtime": 46.948,
+      "eval_samples_per_second": 21.3,
+      "eval_steps_per_second": 1.342,
+      "step": 630
+    },
+    {
+      "epoch": 0.9782945736434109,
+      "grad_norm": 1.546913743019104,
+      "learning_rate": 2.911283376399656e-06,
+      "loss": 0.8966,
+      "step": 631
+    },
+    {
+      "epoch": 0.9798449612403101,
+      "grad_norm": 1.6431787014007568,
+      "learning_rate": 2.9069767441860468e-06,
+      "loss": 0.9415,
+      "step": 632
+    },
+    {
+      "epoch": 0.9813953488372092,
+      "grad_norm": 1.4569215774536133,
+      "learning_rate": 2.9026701119724377e-06,
+      "loss": 0.9119,
+      "step": 633
+    },
+    {
+      "epoch": 0.9829457364341085,
+      "grad_norm": 1.4622957706451416,
+      "learning_rate": 2.898363479758829e-06,
+      "loss": 0.9187,
+      "step": 634
+    },
+    {
+      "epoch": 0.9844961240310077,
+      "grad_norm": 1.2801949977874756,
+      "learning_rate": 2.8940568475452197e-06,
+      "loss": 0.9243,
+      "step": 635
+    },
+    {
+      "epoch": 0.986046511627907,
+      "grad_norm": 1.4466568231582642,
+      "learning_rate": 2.889750215331611e-06,
+      "loss": 0.9346,
+      "step": 636
+    },
+    {
+      "epoch": 0.9875968992248062,
+      "grad_norm": 1.5166065692901611,
+      "learning_rate": 2.885443583118002e-06,
+      "loss": 0.9137,
+      "step": 637
+    },
+    {
+      "epoch": 0.9891472868217054,
+      "grad_norm": 1.5546586513519287,
+      "learning_rate": 2.881136950904393e-06,
+      "loss": 0.9281,
+      "step": 638
+    },
+    {
+      "epoch": 0.9906976744186047,
+      "grad_norm": 1.2714097499847412,
+      "learning_rate": 2.8768303186907844e-06,
+      "loss": 0.9187,
+      "step": 639
+    },
+    {
+      "epoch": 0.9922480620155039,
+      "grad_norm": 1.4462742805480957,
+      "learning_rate": 2.872523686477175e-06,
+      "loss": 0.9156,
+      "step": 640
+    },
+    {
+      "epoch": 0.9922480620155039,
+      "eval_loss": 1.0363060235977173,
+      "eval_runtime": 46.8096,
+      "eval_samples_per_second": 21.363,
+      "eval_steps_per_second": 1.346,
+      "step": 640
+    },
+    {
+      "epoch": 0.993798449612403,
+      "grad_norm": 1.4602237939834595,
+      "learning_rate": 2.8682170542635663e-06,
+      "loss": 0.9291,
+      "step": 641
+    },
+    {
+      "epoch": 0.9953488372093023,
+      "grad_norm": 1.36737859249115,
+      "learning_rate": 2.863910422049957e-06,
+      "loss": 0.8979,
+      "step": 642
+    },
+    {
+      "epoch": 0.9968992248062015,
+      "grad_norm": 1.4275497198104858,
+      "learning_rate": 2.8596037898363483e-06,
+      "loss": 0.8784,
+      "step": 643
+    },
+    {
+      "epoch": 0.9984496124031008,
+      "grad_norm": 1.1453689336776733,
+      "learning_rate": 2.8552971576227396e-06,
+      "loss": 0.904,
+      "step": 644
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.5238977670669556,
+      "learning_rate": 2.85099052540913e-06,
+      "loss": 0.8926,
+      "step": 645
+    },
+    {
+      "epoch": 1.0015503875968992,
+      "grad_norm": 1.5500783920288086,
+      "learning_rate": 2.8466838931955216e-06,
+      "loss": 0.9153,
+      "step": 646
+    },
+    {
+      "epoch": 1.0031007751937984,
+      "grad_norm": 1.5735803842544556,
+      "learning_rate": 2.842377260981912e-06,
+      "loss": 0.9061,
+      "step": 647
+    },
+    {
+      "epoch": 1.0046511627906978,
+      "grad_norm": 1.289521336555481,
+      "learning_rate": 2.8380706287683035e-06,
+      "loss": 0.9081,
+      "step": 648
+    },
+    {
+      "epoch": 1.006201550387597,
+      "grad_norm": 1.4659291505813599,
+      "learning_rate": 2.833763996554694e-06,
+      "loss": 0.8713,
+      "step": 649
+    },
+    {
+      "epoch": 1.0077519379844961,
+      "grad_norm": 1.3533378839492798,
+      "learning_rate": 2.8294573643410855e-06,
+      "loss": 0.9052,
+      "step": 650
+    },
+    {
+      "epoch": 1.0077519379844961,
+      "eval_loss": 1.0426983833312988,
+      "eval_runtime": 46.7738,
+      "eval_samples_per_second": 21.379,
+      "eval_steps_per_second": 1.347,
+      "step": 650
+    },
+    {
+      "epoch": 1.0093023255813953,
+      "grad_norm": 1.3357157707214355,
+      "learning_rate": 2.825150732127477e-06,
+      "loss": 0.8823,
+      "step": 651
+    },
+    {
+      "epoch": 1.0108527131782945,
+      "grad_norm": 1.4741071462631226,
+      "learning_rate": 2.8208440999138674e-06,
+      "loss": 0.8922,
+      "step": 652
+    },
+    {
+      "epoch": 1.012403100775194,
+      "grad_norm": 1.5935308933258057,
+      "learning_rate": 2.8165374677002588e-06,
+      "loss": 0.8688,
+      "step": 653
+    },
+    {
+      "epoch": 1.013953488372093,
+      "grad_norm": 1.4742423295974731,
+      "learning_rate": 2.8122308354866497e-06,
+      "loss": 0.8912,
+      "step": 654
+    },
+    {
+      "epoch": 1.0155038759689923,
+      "grad_norm": 1.3270450830459595,
+      "learning_rate": 2.8079242032730407e-06,
+      "loss": 0.8977,
+      "step": 655
+    },
+    {
+      "epoch": 1.0170542635658915,
+      "grad_norm": 1.7434755563735962,
+      "learning_rate": 2.8036175710594317e-06,
+      "loss": 0.912,
+      "step": 656
+    },
+    {
+      "epoch": 1.0186046511627906,
+      "grad_norm": 1.4647204875946045,
+      "learning_rate": 2.7993109388458226e-06,
+      "loss": 0.8749,
+      "step": 657
+    },
+    {
+      "epoch": 1.0201550387596898,
+      "grad_norm": 1.4478235244750977,
+      "learning_rate": 2.795004306632214e-06,
+      "loss": 0.9043,
+      "step": 658
+    },
+    {
+      "epoch": 1.0217054263565892,
+      "grad_norm": 1.345246434211731,
+      "learning_rate": 2.790697674418605e-06,
+      "loss": 0.9033,
+      "step": 659
+    },
+    {
+      "epoch": 1.0232558139534884,
+      "grad_norm": 1.8008910417556763,
+      "learning_rate": 2.786391042204996e-06,
+      "loss": 0.9009,
+      "step": 660
+    },
+    {
+      "epoch": 1.0232558139534884,
+      "eval_loss": 1.0404847860336304,
+      "eval_runtime": 46.7813,
+      "eval_samples_per_second": 21.376,
+      "eval_steps_per_second": 1.347,
+      "step": 660
+    },
+    {
+      "epoch": 1.0248062015503876,
+      "grad_norm": 1.5372025966644287,
+      "learning_rate": 2.782084409991387e-06,
+      "loss": 0.8794,
+      "step": 661
+    },
+    {
+      "epoch": 1.0263565891472868,
+      "grad_norm": 1.5077435970306396,
+      "learning_rate": 2.7777777777777783e-06,
+      "loss": 0.8922,
+      "step": 662
+    },
+    {
+      "epoch": 1.027906976744186,
+      "grad_norm": 1.6719175577163696,
+      "learning_rate": 2.773471145564169e-06,
+      "loss": 0.9135,
+      "step": 663
+    },
+    {
+      "epoch": 1.0294573643410854,
+      "grad_norm": 1.5047252178192139,
+      "learning_rate": 2.7691645133505603e-06,
+      "loss": 0.8653,
+      "step": 664
+    },
+    {
+      "epoch": 1.0310077519379846,
+      "grad_norm": 1.3442022800445557,
+      "learning_rate": 2.764857881136951e-06,
+      "loss": 0.9235,
+      "step": 665
+    },
+    {
+      "epoch": 1.0325581395348837,
+      "grad_norm": 1.690727949142456,
+      "learning_rate": 2.760551248923342e-06,
+      "loss": 0.8707,
+      "step": 666
+    },
+    {
+      "epoch": 1.034108527131783,
+      "grad_norm": 1.3899257183074951,
+      "learning_rate": 2.7562446167097336e-06,
+      "loss": 0.8895,
+      "step": 667
+    },
+    {
+      "epoch": 1.035658914728682,
+      "grad_norm": 1.6989372968673706,
+      "learning_rate": 2.751937984496124e-06,
+      "loss": 0.8967,
+      "step": 668
+    },
+    {
+      "epoch": 1.0372093023255813,
+      "grad_norm": 1.4262644052505493,
+      "learning_rate": 2.7476313522825155e-06,
+      "loss": 0.8856,
+      "step": 669
+    },
+    {
+      "epoch": 1.0387596899224807,
+      "grad_norm": 1.456101894378662,
+      "learning_rate": 2.743324720068906e-06,
+      "loss": 0.8853,
+      "step": 670
+    },
+    {
+      "epoch": 1.0387596899224807,
+      "eval_loss": 1.0409016609191895,
+      "eval_runtime": 46.7499,
+      "eval_samples_per_second": 21.39,
+      "eval_steps_per_second": 1.348,
+      "step": 670
+    },
+    {
+      "epoch": 1.0403100775193799,
+      "grad_norm": 1.3542068004608154,
+      "learning_rate": 2.7390180878552975e-06,
+      "loss": 0.8617,
+      "step": 671
+    },
+    {
+      "epoch": 1.041860465116279,
+      "grad_norm": 1.4485602378845215,
+      "learning_rate": 2.734711455641688e-06,
+      "loss": 0.8755,
+      "step": 672
+    },
+    {
+      "epoch": 1.0434108527131782,
+      "grad_norm": 1.6059707403182983,
+      "learning_rate": 2.7304048234280794e-06,
+      "loss": 0.8995,
+      "step": 673
+    },
+    {
+      "epoch": 1.0449612403100774,
+      "grad_norm": 1.3006627559661865,
+      "learning_rate": 2.726098191214471e-06,
+      "loss": 0.8453,
+      "step": 674
+    },
+    {
+      "epoch": 1.0465116279069768,
+      "grad_norm": 1.3277314901351929,
+      "learning_rate": 2.7217915590008613e-06,
+      "loss": 0.894,
+      "step": 675
+    },
+    {
+      "epoch": 1.048062015503876,
+      "grad_norm": 1.4030394554138184,
+      "learning_rate": 2.7174849267872527e-06,
+      "loss": 0.8726,
+      "step": 676
+    },
+    {
+      "epoch": 1.0496124031007752,
+      "grad_norm": 1.308321237564087,
+      "learning_rate": 2.7131782945736433e-06,
+      "loss": 0.9121,
+      "step": 677
+    },
+    {
+      "epoch": 1.0511627906976744,
+      "grad_norm": 1.4298022985458374,
+      "learning_rate": 2.7088716623600347e-06,
+      "loss": 0.8867,
+      "step": 678
+    },
+    {
+      "epoch": 1.0527131782945736,
+      "grad_norm": 1.473902702331543,
+      "learning_rate": 2.7045650301464256e-06,
+      "loss": 0.8777,
+      "step": 679
+    },
+    {
+      "epoch": 1.054263565891473,
+      "grad_norm": 1.275970697402954,
+      "learning_rate": 2.7002583979328166e-06,
+      "loss": 0.884,
+      "step": 680
+    },
+    {
+      "epoch": 1.054263565891473,
+      "eval_loss": 1.0358476638793945,
+      "eval_runtime": 46.7815,
+      "eval_samples_per_second": 21.376,
+      "eval_steps_per_second": 1.347,
+      "step": 680
+    },
+    {
+      "epoch": 1.0558139534883721,
+      "grad_norm": 1.3834108114242554,
+      "learning_rate": 2.695951765719208e-06,
+      "loss": 0.902,
+      "step": 681
+    },
+    {
+      "epoch": 1.0573643410852713,
+      "grad_norm": 1.5840026140213013,
+      "learning_rate": 2.691645133505599e-06,
+      "loss": 0.8921,
+      "step": 682
+    },
+    {
+      "epoch": 1.0589147286821705,
+      "grad_norm": 1.3392385244369507,
+      "learning_rate": 2.68733850129199e-06,
+      "loss": 0.8747,
+      "step": 683
+    },
+    {
+      "epoch": 1.0604651162790697,
+      "grad_norm": 1.3050808906555176,
+      "learning_rate": 2.683031869078381e-06,
+      "loss": 0.9219,
+      "step": 684
+    },
+    {
+      "epoch": 1.062015503875969,
+      "grad_norm": 1.5907018184661865,
+      "learning_rate": 2.678725236864772e-06,
+      "loss": 0.9024,
+      "step": 685
+    },
+    {
+      "epoch": 1.0635658914728683,
+      "grad_norm": 1.4619678258895874,
+      "learning_rate": 2.674418604651163e-06,
+      "loss": 0.8858,
+      "step": 686
+    },
+    {
+      "epoch": 1.0651162790697675,
+      "grad_norm": 1.3546361923217773,
+      "learning_rate": 2.6701119724375542e-06,
+      "loss": 0.8739,
+      "step": 687
+    },
+    {
+      "epoch": 1.0666666666666667,
+      "grad_norm": 1.4255887269973755,
+      "learning_rate": 2.665805340223945e-06,
+      "loss": 0.8941,
+      "step": 688
+    },
+    {
+      "epoch": 1.0682170542635658,
+      "grad_norm": 1.3893107175827026,
+      "learning_rate": 2.661498708010336e-06,
+      "loss": 0.8999,
+      "step": 689
+    },
+    {
+      "epoch": 1.069767441860465,
+      "grad_norm": 1.3497520685195923,
+      "learning_rate": 2.657192075796727e-06,
+      "loss": 0.8921,
+      "step": 690
+    },
+    {
+      "epoch": 1.069767441860465,
+      "eval_loss": 1.0340094566345215,
+      "eval_runtime": 46.7821,
+      "eval_samples_per_second": 21.376,
+      "eval_steps_per_second": 1.347,
+      "step": 690
+    },
+    {
+      "epoch": 1.0713178294573644,
+      "grad_norm": 1.3812423944473267,
+      "learning_rate": 2.652885443583118e-06,
+      "loss": 0.8989,
+      "step": 691
+    },
+    {
+      "epoch": 1.0728682170542636,
+      "grad_norm": 1.6061986684799194,
+      "learning_rate": 2.6485788113695095e-06,
+      "loss": 0.8508,
+      "step": 692
+    },
+    {
+      "epoch": 1.0744186046511628,
+      "grad_norm": 1.4533159732818604,
+      "learning_rate": 2.6442721791559e-06,
+      "loss": 0.8722,
+      "step": 693
+    },
+    {
+      "epoch": 1.075968992248062,
+      "grad_norm": 1.398797869682312,
+      "learning_rate": 2.6399655469422914e-06,
+      "loss": 0.9048,
+      "step": 694
+    },
+    {
+      "epoch": 1.0775193798449612,
+      "grad_norm": 1.404852271080017,
+      "learning_rate": 2.635658914728683e-06,
+      "loss": 0.8665,
+      "step": 695
+    },
+    {
+      "epoch": 1.0790697674418606,
+      "grad_norm": 1.5127512216567993,
+      "learning_rate": 2.6313522825150734e-06,
+      "loss": 0.8915,
+      "step": 696
+    },
+    {
+      "epoch": 1.0806201550387597,
+      "grad_norm": 1.6629828214645386,
+      "learning_rate": 2.6270456503014647e-06,
+      "loss": 0.8929,
+      "step": 697
+    },
+    {
+      "epoch": 1.082170542635659,
+      "grad_norm": 1.7313421964645386,
+      "learning_rate": 2.6227390180878553e-06,
+      "loss": 0.8704,
+      "step": 698
+    },
+    {
+      "epoch": 1.083720930232558,
+      "grad_norm": 1.555357813835144,
+      "learning_rate": 2.6184323858742467e-06,
+      "loss": 0.9208,
+      "step": 699
+    },
+    {
+      "epoch": 1.0852713178294573,
+      "grad_norm": 1.4509795904159546,
+      "learning_rate": 2.6141257536606372e-06,
+      "loss": 0.9248,
+      "step": 700
+    },
+    {
+      "epoch": 1.0852713178294573,
+      "eval_loss": 1.0356652736663818,
+      "eval_runtime": 46.7938,
+      "eval_samples_per_second": 21.37,
+      "eval_steps_per_second": 1.346,
+      "step": 700
+    },
+    {
+      "epoch": 1.0868217054263565,
+      "grad_norm": 1.5227144956588745,
+      "learning_rate": 2.6098191214470286e-06,
+      "loss": 0.8902,
+      "step": 701
+    },
+    {
+      "epoch": 1.0883720930232559,
+      "grad_norm": 1.8152211904525757,
+      "learning_rate": 2.60551248923342e-06,
+      "loss": 0.9044,
+      "step": 702
+    },
+    {
+      "epoch": 1.089922480620155,
+      "grad_norm": 1.3922104835510254,
+      "learning_rate": 2.6012058570198106e-06,
+      "loss": 0.9128,
+      "step": 703
+    },
+    {
+      "epoch": 1.0914728682170542,
+      "grad_norm": 1.2599694728851318,
+      "learning_rate": 2.596899224806202e-06,
+      "loss": 0.9084,
+      "step": 704
+    },
+    {
+      "epoch": 1.0930232558139534,
+      "grad_norm": 1.395390510559082,
+      "learning_rate": 2.5925925925925925e-06,
+      "loss": 0.923,
+      "step": 705
+    },
+    {
+      "epoch": 1.0945736434108526,
+      "grad_norm": 1.374895691871643,
+      "learning_rate": 2.588285960378984e-06,
+      "loss": 0.8996,
+      "step": 706
+    },
+    {
+      "epoch": 1.096124031007752,
+      "grad_norm": 1.4008183479309082,
+      "learning_rate": 2.583979328165375e-06,
+      "loss": 0.9291,
+      "step": 707
+    },
+    {
+      "epoch": 1.0976744186046512,
+      "grad_norm": 1.3724210262298584,
+      "learning_rate": 2.579672695951766e-06,
+      "loss": 0.8805,
+      "step": 708
+    },
+    {
+      "epoch": 1.0992248062015504,
+      "grad_norm": 1.417680025100708,
+      "learning_rate": 2.575366063738157e-06,
+      "loss": 0.8795,
+      "step": 709
+    },
+    {
+      "epoch": 1.1007751937984496,
+      "grad_norm": 1.5281082391738892,
+      "learning_rate": 2.5710594315245478e-06,
+      "loss": 0.9098,
+      "step": 710
+    },
+    {
+      "epoch": 1.1007751937984496,
+      "eval_loss": 1.036144733428955,
+      "eval_runtime": 46.7339,
+      "eval_samples_per_second": 21.398,
+      "eval_steps_per_second": 1.348,
+      "step": 710
+    },
+    {
+      "epoch": 1.1023255813953488,
+      "grad_norm": 1.631238341331482,
+      "learning_rate": 2.566752799310939e-06,
+      "loss": 0.9207,
+      "step": 711
+    },
+    {
+      "epoch": 1.1038759689922482,
+      "grad_norm": 1.405727744102478,
+      "learning_rate": 2.56244616709733e-06,
+      "loss": 0.8988,
+      "step": 712
+    },
+    {
+      "epoch": 1.1054263565891473,
+      "grad_norm": 1.3681135177612305,
+      "learning_rate": 2.558139534883721e-06,
+      "loss": 0.8691,
+      "step": 713
+    },
+    {
+      "epoch": 1.1069767441860465,
+      "grad_norm": 1.588548183441162,
+      "learning_rate": 2.553832902670112e-06,
+      "loss": 0.9006,
+      "step": 714
+    },
+    {
+      "epoch": 1.1085271317829457,
+      "grad_norm": 1.4774502515792847,
+      "learning_rate": 2.5495262704565034e-06,
+      "loss": 0.8987,
+      "step": 715
+    },
+    {
+      "epoch": 1.110077519379845,
+      "grad_norm": 1.4388251304626465,
+      "learning_rate": 2.5452196382428944e-06,
+      "loss": 0.8796,
+      "step": 716
+    },
+    {
+      "epoch": 1.1116279069767443,
+      "grad_norm": 1.5053632259368896,
+      "learning_rate": 2.5409130060292854e-06,
+      "loss": 0.9051,
+      "step": 717
+    },
+    {
+      "epoch": 1.1131782945736435,
+      "grad_norm": 1.7834973335266113,
+      "learning_rate": 2.5366063738156763e-06,
+      "loss": 0.9509,
+      "step": 718
+    },
+    {
+      "epoch": 1.1147286821705427,
+      "grad_norm": 1.351125717163086,
+      "learning_rate": 2.5322997416020673e-06,
+      "loss": 0.9065,
+      "step": 719
+    },
+    {
+      "epoch": 1.1162790697674418,
+      "grad_norm": 1.4976617097854614,
+      "learning_rate": 2.5279931093884587e-06,
+      "loss": 0.8987,
+      "step": 720
+    },
+    {
+      "epoch": 1.1162790697674418,
+      "eval_loss": 1.0367296934127808,
+      "eval_runtime": 46.7237,
+      "eval_samples_per_second": 21.402,
+      "eval_steps_per_second": 1.348,
+      "step": 720
+    },
+    {
+      "epoch": 1.117829457364341,
+      "grad_norm": 1.302541732788086,
+      "learning_rate": 2.5236864771748492e-06,
+      "loss": 0.8542,
+      "step": 721
+    },
+    {
+      "epoch": 1.1193798449612402,
+      "grad_norm": 1.36167311668396,
+      "learning_rate": 2.5193798449612406e-06,
+      "loss": 0.886,
+      "step": 722
+    },
+    {
+      "epoch": 1.1209302325581396,
+      "grad_norm": 1.4094293117523193,
+      "learning_rate": 2.515073212747632e-06,
+      "loss": 0.912,
+      "step": 723
+    },
+    {
+      "epoch": 1.1224806201550388,
+      "grad_norm": 1.5545191764831543,
+      "learning_rate": 2.5107665805340226e-06,
+      "loss": 0.9054,
+      "step": 724
+    },
+    {
+      "epoch": 1.124031007751938,
+      "grad_norm": 1.5490257740020752,
+      "learning_rate": 2.506459948320414e-06,
+      "loss": 0.9226,
+      "step": 725
+    },
+    {
+      "epoch": 1.1255813953488372,
+      "grad_norm": 1.3758277893066406,
+      "learning_rate": 2.5021533161068045e-06,
+      "loss": 0.8985,
+      "step": 726
+    },
+    {
+      "epoch": 1.1271317829457363,
+      "grad_norm": 1.403664231300354,
+      "learning_rate": 2.497846683893196e-06,
+      "loss": 0.8851,
+      "step": 727
+    },
+    {
+      "epoch": 1.1286821705426358,
+      "grad_norm": 1.4780570268630981,
+      "learning_rate": 2.493540051679587e-06,
+      "loss": 0.932,
+      "step": 728
+    },
+    {
+      "epoch": 1.130232558139535,
+      "grad_norm": 1.527536153793335,
+      "learning_rate": 2.489233419465978e-06,
+      "loss": 0.8963,
+      "step": 729
+    },
+    {
+      "epoch": 1.1317829457364341,
+      "grad_norm": 1.3349652290344238,
+      "learning_rate": 2.484926787252369e-06,
+      "loss": 0.8905,
+      "step": 730
+    },
+    {
+      "epoch": 1.1317829457364341,
+      "eval_loss": 1.031362533569336,
+      "eval_runtime": 46.6489,
+      "eval_samples_per_second": 21.437,
+      "eval_steps_per_second": 1.351,
+      "step": 730
+    },
+    {
+      "epoch": 1.1333333333333333,
+      "grad_norm": 1.4726243019104004,
+      "learning_rate": 2.4806201550387598e-06,
+      "loss": 0.9037,
+      "step": 731
+    },
+    {
+      "epoch": 1.1348837209302325,
+      "grad_norm": 1.2975237369537354,
+      "learning_rate": 2.4763135228251507e-06,
+      "loss": 0.9131,
+      "step": 732
+    },
+    {
+      "epoch": 1.1364341085271317,
+      "grad_norm": 1.412916660308838,
+      "learning_rate": 2.4720068906115417e-06,
+      "loss": 0.9124,
+      "step": 733
+    },
+    {
+      "epoch": 1.137984496124031,
+      "grad_norm": 1.5443305969238281,
+      "learning_rate": 2.467700258397933e-06,
+      "loss": 0.9019,
+      "step": 734
+    },
+    {
+      "epoch": 1.1395348837209303,
+      "grad_norm": 1.422319769859314,
+      "learning_rate": 2.463393626184324e-06,
+      "loss": 0.8924,
+      "step": 735
+    },
+    {
+      "epoch": 1.1410852713178294,
+      "grad_norm": 1.3379539251327515,
+      "learning_rate": 2.459086993970715e-06,
+      "loss": 0.8707,
+      "step": 736
+    },
+    {
+      "epoch": 1.1426356589147286,
+      "grad_norm": 1.4538874626159668,
+      "learning_rate": 2.454780361757106e-06,
+      "loss": 0.8878,
+      "step": 737
+    },
+    {
+      "epoch": 1.1441860465116278,
+      "grad_norm": 1.3966622352600098,
+      "learning_rate": 2.450473729543497e-06,
+      "loss": 0.8923,
+      "step": 738
+    },
+    {
+      "epoch": 1.1457364341085272,
+      "grad_norm": 1.3471720218658447,
+      "learning_rate": 2.4461670973298884e-06,
+      "loss": 0.8864,
+      "step": 739
+    },
+    {
+      "epoch": 1.1472868217054264,
+      "grad_norm": 1.378870964050293,
+      "learning_rate": 2.4418604651162793e-06,
+      "loss": 0.8591,
+      "step": 740
+    },
+    {
+      "epoch": 1.1472868217054264,
+      "eval_loss": 1.0319344997406006,
+      "eval_runtime": 46.729,
+      "eval_samples_per_second": 21.4,
+      "eval_steps_per_second": 1.348,
+      "step": 740
+    },
+    {
+      "epoch": 1.1488372093023256,
+      "grad_norm": 1.4001867771148682,
+      "learning_rate": 2.4375538329026703e-06,
+      "loss": 0.8649,
+      "step": 741
+    },
+    {
+      "epoch": 1.1503875968992248,
+      "grad_norm": 1.3273779153823853,
+      "learning_rate": 2.4332472006890613e-06,
+      "loss": 0.8754,
+      "step": 742
+    },
+    {
+      "epoch": 1.151937984496124,
+      "grad_norm": 1.8371070623397827,
+      "learning_rate": 2.4289405684754527e-06,
+      "loss": 0.9154,
+      "step": 743
+    },
+    {
+      "epoch": 1.1534883720930234,
+      "grad_norm": 1.433976411819458,
+      "learning_rate": 2.4246339362618436e-06,
+      "loss": 0.883,
+      "step": 744
+    },
+    {
+      "epoch": 1.1550387596899225,
+      "grad_norm": 1.651013731956482,
+      "learning_rate": 2.4203273040482346e-06,
+      "loss": 0.8659,
+      "step": 745
+    },
+    {
+      "epoch": 1.1565891472868217,
+      "grad_norm": 1.5998989343643188,
+      "learning_rate": 2.4160206718346256e-06,
+      "loss": 0.8754,
+      "step": 746
+    },
+    {
+      "epoch": 1.158139534883721,
+      "grad_norm": 1.4122543334960938,
+      "learning_rate": 2.4117140396210165e-06,
+      "loss": 0.8901,
+      "step": 747
+    },
+    {
+      "epoch": 1.15968992248062,
+      "grad_norm": 1.4213175773620605,
+      "learning_rate": 2.4074074074074075e-06,
+      "loss": 0.8925,
+      "step": 748
+    },
+    {
+      "epoch": 1.1612403100775195,
+      "grad_norm": 1.3378446102142334,
+      "learning_rate": 2.403100775193799e-06,
+      "loss": 0.9257,
+      "step": 749
+    },
+    {
+      "epoch": 1.1627906976744187,
+      "grad_norm": 1.4897722005844116,
+      "learning_rate": 2.39879414298019e-06,
+      "loss": 0.8783,
+      "step": 750
+    },
+    {
+      "epoch": 1.1627906976744187,
+      "eval_loss": 1.0316088199615479,
+      "eval_runtime": 46.8059,
+      "eval_samples_per_second": 21.365,
+      "eval_steps_per_second": 1.346,
+      "step": 750
+    },
+    {
+      "epoch": 1.1643410852713179,
+      "grad_norm": 1.5811142921447754,
+      "learning_rate": 2.394487510766581e-06,
+      "loss": 0.8976,
+      "step": 751
+    },
+    {
+      "epoch": 1.165891472868217,
+      "grad_norm": 1.561867117881775,
+      "learning_rate": 2.390180878552972e-06,
+      "loss": 0.8731,
+      "step": 752
+    },
+    {
+      "epoch": 1.1674418604651162,
+      "grad_norm": 1.3906325101852417,
+      "learning_rate": 2.3858742463393628e-06,
+      "loss": 0.9042,
+      "step": 753
+    },
+    {
+      "epoch": 1.1689922480620156,
+      "grad_norm": 1.5785614252090454,
+      "learning_rate": 2.3815676141257537e-06,
+      "loss": 0.8894,
+      "step": 754
+    },
+    {
+      "epoch": 1.1705426356589148,
+      "grad_norm": 1.5823315382003784,
+      "learning_rate": 2.3772609819121447e-06,
+      "loss": 0.8894,
+      "step": 755
+    },
+    {
+      "epoch": 1.172093023255814,
+      "grad_norm": 1.4650275707244873,
+      "learning_rate": 2.372954349698536e-06,
+      "loss": 0.9021,
+      "step": 756
+    },
+    {
+      "epoch": 1.1736434108527132,
+      "grad_norm": 1.4748083353042603,
+      "learning_rate": 2.368647717484927e-06,
+      "loss": 0.9031,
+      "step": 757
+    },
+    {
+      "epoch": 1.1751937984496124,
+      "grad_norm": 1.4308465719223022,
+      "learning_rate": 2.364341085271318e-06,
+      "loss": 0.8562,
+      "step": 758
+    },
+    {
+      "epoch": 1.1767441860465115,
+      "grad_norm": 1.4933222532272339,
+      "learning_rate": 2.360034453057709e-06,
+      "loss": 0.8546,
+      "step": 759
+    },
+    {
+      "epoch": 1.178294573643411,
+      "grad_norm": 1.376835823059082,
+      "learning_rate": 2.3557278208441e-06,
+      "loss": 0.8787,
+      "step": 760
+    },
+    {
+      "epoch": 1.178294573643411,
+      "eval_loss": 1.0326775312423706,
+      "eval_runtime": 46.6601,
+      "eval_samples_per_second": 21.432,
+      "eval_steps_per_second": 1.35,
+      "step": 760
+    },
+    {
+      "epoch": 1.1798449612403101,
+      "grad_norm": 1.7554876804351807,
+      "learning_rate": 2.351421188630491e-06,
+      "loss": 0.9169,
+      "step": 761
+    },
+    {
+      "epoch": 1.1813953488372093,
+      "grad_norm": 1.345408320426941,
+      "learning_rate": 2.347114556416882e-06,
+      "loss": 0.8787,
+      "step": 762
+    },
+    {
+      "epoch": 1.1829457364341085,
+      "grad_norm": 1.4579490423202515,
+      "learning_rate": 2.3428079242032733e-06,
+      "loss": 0.8528,
+      "step": 763
+    },
+    {
+      "epoch": 1.1844961240310077,
+      "grad_norm": 1.4105459451675415,
+      "learning_rate": 2.3385012919896642e-06,
+      "loss": 0.8845,
+      "step": 764
+    },
+    {
+      "epoch": 1.1860465116279069,
+      "grad_norm": 1.4601131677627563,
+      "learning_rate": 2.3341946597760552e-06,
+      "loss": 0.8702,
+      "step": 765
+    },
+    {
+      "epoch": 1.1875968992248063,
+      "grad_norm": 1.5710184574127197,
+      "learning_rate": 2.329888027562446e-06,
+      "loss": 0.8971,
+      "step": 766
+    },
+    {
+      "epoch": 1.1891472868217055,
+      "grad_norm": 1.5850234031677246,
+      "learning_rate": 2.3255813953488376e-06,
+      "loss": 0.8814,
+      "step": 767
+    },
+    {
+      "epoch": 1.1906976744186046,
+      "grad_norm": 1.4523247480392456,
+      "learning_rate": 2.3212747631352285e-06,
+      "loss": 0.9126,
+      "step": 768
+    },
+    {
+      "epoch": 1.1922480620155038,
+      "grad_norm": 1.5494409799575806,
+      "learning_rate": 2.3169681309216195e-06,
+      "loss": 0.9006,
+      "step": 769
+    },
+    {
+      "epoch": 1.193798449612403,
+      "grad_norm": 1.81002676486969,
+      "learning_rate": 2.3126614987080105e-06,
+      "loss": 0.9018,
+      "step": 770
+    },
+    {
+      "epoch": 1.193798449612403,
+      "eval_loss": 1.0328283309936523,
+      "eval_runtime": 46.7572,
+      "eval_samples_per_second": 21.387,
+      "eval_steps_per_second": 1.347,
+      "step": 770
+    },
+    {
+      "epoch": 1.1953488372093024,
+      "grad_norm": 1.527626633644104,
+      "learning_rate": 2.3083548664944014e-06,
+      "loss": 0.866,
+      "step": 771
+    },
+    {
+      "epoch": 1.1968992248062016,
+      "grad_norm": 1.469875693321228,
+      "learning_rate": 2.304048234280793e-06,
+      "loss": 0.8965,
+      "step": 772
+    },
+    {
+      "epoch": 1.1984496124031008,
+      "grad_norm": 1.4807052612304688,
+      "learning_rate": 2.299741602067184e-06,
+      "loss": 0.8492,
+      "step": 773
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 1.494244933128357,
+      "learning_rate": 2.2954349698535748e-06,
+      "loss": 0.906,
+      "step": 774
+    },
+    {
+      "epoch": 1.2015503875968991,
+      "grad_norm": 1.5816459655761719,
+      "learning_rate": 2.2911283376399657e-06,
+      "loss": 0.8956,
+      "step": 775
+    },
+    {
+      "epoch": 1.2031007751937985,
+      "grad_norm": 1.479781150817871,
+      "learning_rate": 2.2868217054263567e-06,
+      "loss": 0.8492,
+      "step": 776
+    },
+    {
+      "epoch": 1.2046511627906977,
+      "grad_norm": 1.469319462776184,
+      "learning_rate": 2.282515073212748e-06,
+      "loss": 0.8865,
+      "step": 777
+    },
+    {
+      "epoch": 1.206201550387597,
+      "grad_norm": 1.88156259059906,
+      "learning_rate": 2.278208440999139e-06,
+      "loss": 0.9198,
+      "step": 778
+    },
+    {
+      "epoch": 1.207751937984496,
+      "grad_norm": 1.3706072568893433,
+      "learning_rate": 2.27390180878553e-06,
+      "loss": 0.88,
+      "step": 779
+    },
+    {
+      "epoch": 1.2093023255813953,
+      "grad_norm": 1.4646852016448975,
+      "learning_rate": 2.269595176571921e-06,
+      "loss": 0.8898,
+      "step": 780
+    },
+    {
+      "epoch": 1.2093023255813953,
+      "eval_loss": 1.029195785522461,
+      "eval_runtime": 46.8464,
+      "eval_samples_per_second": 21.346,
+      "eval_steps_per_second": 1.345,
+      "step": 780
+    },
+    {
+      "epoch": 1.2108527131782947,
+      "grad_norm": 1.5722054243087769,
+      "learning_rate": 2.265288544358312e-06,
+      "loss": 0.8598,
+      "step": 781
+    },
+    {
+      "epoch": 1.2124031007751939,
+      "grad_norm": 1.3713362216949463,
+      "learning_rate": 2.260981912144703e-06,
+      "loss": 0.9053,
+      "step": 782
+    },
+    {
+      "epoch": 1.213953488372093,
+      "grad_norm": 1.4536453485488892,
+      "learning_rate": 2.256675279931094e-06,
+      "loss": 0.8515,
+      "step": 783
+    },
+    {
+      "epoch": 1.2155038759689922,
+      "grad_norm": 1.6506465673446655,
+      "learning_rate": 2.252368647717485e-06,
+      "loss": 0.9042,
+      "step": 784
+    },
+    {
+      "epoch": 1.2170542635658914,
+      "grad_norm": 1.5541293621063232,
+      "learning_rate": 2.2480620155038763e-06,
+      "loss": 0.8998,
+      "step": 785
+    },
+    {
+      "epoch": 1.2186046511627908,
+      "grad_norm": 1.3651714324951172,
+      "learning_rate": 2.2437553832902672e-06,
+      "loss": 0.9014,
+      "step": 786
+    },
+    {
+      "epoch": 1.22015503875969,
+      "grad_norm": 1.7241014242172241,
+      "learning_rate": 2.239448751076658e-06,
+      "loss": 0.9098,
+      "step": 787
+    },
+    {
+      "epoch": 1.2217054263565892,
+      "grad_norm": 1.600130558013916,
+      "learning_rate": 2.235142118863049e-06,
+      "loss": 0.8904,
+      "step": 788
+    },
+    {
+      "epoch": 1.2232558139534884,
+      "grad_norm": 1.5315771102905273,
+      "learning_rate": 2.23083548664944e-06,
+      "loss": 0.9212,
+      "step": 789
+    },
+    {
+      "epoch": 1.2248062015503876,
+      "grad_norm": 1.463247537612915,
+      "learning_rate": 2.226528854435831e-06,
+      "loss": 0.8708,
+      "step": 790
+    },
+    {
+      "epoch": 1.2248062015503876,
+      "eval_loss": 1.0294724702835083,
+      "eval_runtime": 46.9288,
+      "eval_samples_per_second": 21.309,
+      "eval_steps_per_second": 1.342,
+      "step": 790
+    },
+    {
+      "epoch": 1.2263565891472867,
+      "grad_norm": 1.5448787212371826,
+      "learning_rate": 2.222222222222222e-06,
+      "loss": 0.8834,
+      "step": 791
+    },
+    {
+      "epoch": 1.2279069767441861,
+      "grad_norm": 1.6028432846069336,
+      "learning_rate": 2.2179155900086135e-06,
+      "loss": 0.8902,
+      "step": 792
+    },
+    {
+      "epoch": 1.2294573643410853,
+      "grad_norm": 1.4141221046447754,
+      "learning_rate": 2.2136089577950044e-06,
+      "loss": 0.8757,
+      "step": 793
+    },
+    {
+      "epoch": 1.2310077519379845,
+      "grad_norm": 1.753729224205017,
+      "learning_rate": 2.2093023255813954e-06,
+      "loss": 0.9163,
+      "step": 794
+    },
+    {
+      "epoch": 1.2325581395348837,
+      "grad_norm": 1.4352757930755615,
+      "learning_rate": 2.2049956933677864e-06,
+      "loss": 0.8727,
+      "step": 795
+    },
+    {
+      "epoch": 1.2341085271317829,
+      "grad_norm": 1.4990347623825073,
+      "learning_rate": 2.2006890611541778e-06,
+      "loss": 0.8983,
+      "step": 796
+    },
+    {
+      "epoch": 1.235658914728682,
+      "grad_norm": 1.5358068943023682,
+      "learning_rate": 2.1963824289405687e-06,
+      "loss": 0.904,
+      "step": 797
+    },
+    {
+      "epoch": 1.2372093023255815,
+      "grad_norm": 1.5169130563735962,
+      "learning_rate": 2.1920757967269597e-06,
+      "loss": 0.9223,
+      "step": 798
+    },
+    {
+      "epoch": 1.2387596899224806,
+      "grad_norm": 1.5093134641647339,
+      "learning_rate": 2.1877691645133507e-06,
+      "loss": 0.8884,
+      "step": 799
+    },
+    {
+      "epoch": 1.2403100775193798,
+      "grad_norm": 1.4294668436050415,
+      "learning_rate": 2.183462532299742e-06,
+      "loss": 0.8961,
+      "step": 800
+    },
+    {
+      "epoch": 1.2403100775193798,
+      "eval_loss": 1.0295883417129517,
+      "eval_runtime": 46.8817,
+      "eval_samples_per_second": 21.33,
+      "eval_steps_per_second": 1.344,
+      "step": 800
+    },
+    {
+      "epoch": 1.241860465116279,
+      "grad_norm": 1.3810713291168213,
+      "learning_rate": 2.179155900086133e-06,
+      "loss": 0.8758,
+      "step": 801
+    },
+    {
+      "epoch": 1.2434108527131782,
+      "grad_norm": 1.5186481475830078,
+      "learning_rate": 2.174849267872524e-06,
+      "loss": 0.8674,
+      "step": 802
+    },
+    {
+      "epoch": 1.2449612403100776,
+      "grad_norm": 1.4637038707733154,
+      "learning_rate": 2.170542635658915e-06,
+      "loss": 0.8852,
+      "step": 803
+    },
+    {
+      "epoch": 1.2465116279069768,
+      "grad_norm": 1.553092122077942,
+      "learning_rate": 2.166236003445306e-06,
+      "loss": 0.886,
+      "step": 804
+    },
+    {
+      "epoch": 1.248062015503876,
+      "grad_norm": 1.5428677797317505,
+      "learning_rate": 2.161929371231697e-06,
+      "loss": 0.8984,
+      "step": 805
+    },
+    {
+      "epoch": 1.2496124031007751,
+      "grad_norm": 1.6426329612731934,
+      "learning_rate": 2.1576227390180883e-06,
+      "loss": 0.935,
+      "step": 806
+    },
+    {
+      "epoch": 1.2511627906976743,
+      "grad_norm": 1.3592965602874756,
+      "learning_rate": 2.1533161068044793e-06,
+      "loss": 0.9014,
+      "step": 807
+    },
+    {
+      "epoch": 1.2527131782945737,
+      "grad_norm": 1.5501923561096191,
+      "learning_rate": 2.1490094745908702e-06,
+      "loss": 0.8659,
+      "step": 808
+    },
+    {
+      "epoch": 1.254263565891473,
+      "grad_norm": 1.3319815397262573,
+      "learning_rate": 2.144702842377261e-06,
+      "loss": 0.8818,
+      "step": 809
+    },
+    {
+      "epoch": 1.255813953488372,
+      "grad_norm": 1.3006103038787842,
+      "learning_rate": 2.140396210163652e-06,
+      "loss": 0.8787,
+      "step": 810
+    },
+    {
+      "epoch": 1.255813953488372,
+      "eval_loss": 1.028241753578186,
+      "eval_runtime": 46.6041,
+      "eval_samples_per_second": 21.457,
+      "eval_steps_per_second": 1.352,
+      "step": 810
+    },
+    {
+      "epoch": 1.2573643410852713,
+      "grad_norm": 1.4956690073013306,
+      "learning_rate": 2.136089577950043e-06,
+      "loss": 0.8905,
+      "step": 811
+    },
+    {
+      "epoch": 1.2589147286821705,
+      "grad_norm": 1.419439673423767,
+      "learning_rate": 2.131782945736434e-06,
+      "loss": 0.8329,
+      "step": 812
+    },
+    {
+      "epoch": 1.2604651162790699,
+      "grad_norm": 1.302194356918335,
+      "learning_rate": 2.127476313522825e-06,
+      "loss": 0.8731,
+      "step": 813
+    },
+    {
+      "epoch": 1.262015503875969,
+      "grad_norm": 1.4846899509429932,
+      "learning_rate": 2.1231696813092165e-06,
+      "loss": 0.8761,
+      "step": 814
+    },
+    {
+      "epoch": 1.2635658914728682,
+      "grad_norm": 1.4464991092681885,
+      "learning_rate": 2.1188630490956074e-06,
+      "loss": 0.8933,
+      "step": 815
+    },
+    {
+      "epoch": 1.2651162790697674,
+      "grad_norm": 1.411321759223938,
+      "learning_rate": 2.1145564168819984e-06,
+      "loss": 0.9096,
+      "step": 816
+    },
+    {
+      "epoch": 1.2666666666666666,
+      "grad_norm": 1.3309530019760132,
+      "learning_rate": 2.1102497846683894e-06,
+      "loss": 0.8554,
+      "step": 817
+    },
+    {
+      "epoch": 1.268217054263566,
+      "grad_norm": 1.4233335256576538,
+      "learning_rate": 2.1059431524547803e-06,
+      "loss": 0.9031,
+      "step": 818
+    },
+    {
+      "epoch": 1.2697674418604652,
+      "grad_norm": 1.4139257669448853,
+      "learning_rate": 2.1016365202411713e-06,
+      "loss": 0.8552,
+      "step": 819
+    },
+    {
+      "epoch": 1.2713178294573644,
+      "grad_norm": 1.3686296939849854,
+      "learning_rate": 2.0973298880275627e-06,
+      "loss": 0.8897,
+      "step": 820
+    },
+    {
+      "epoch": 1.2713178294573644,
+      "eval_loss": 1.0273642539978027,
+      "eval_runtime": 46.6682,
+      "eval_samples_per_second": 21.428,
+      "eval_steps_per_second": 1.35,
+      "step": 820
+    },
+    {
+      "epoch": 1.2728682170542636,
+      "grad_norm": 1.530033826828003,
+      "learning_rate": 2.0930232558139536e-06,
+      "loss": 0.8856,
+      "step": 821
+    },
+    {
+      "epoch": 1.2744186046511627,
+      "grad_norm": 1.335328221321106,
+      "learning_rate": 2.0887166236003446e-06,
+      "loss": 0.8437,
+      "step": 822
+    },
+    {
+      "epoch": 1.2759689922480622,
+      "grad_norm": 1.6352286338806152,
+      "learning_rate": 2.0844099913867356e-06,
+      "loss": 0.9048,
+      "step": 823
+    },
+    {
+      "epoch": 1.2775193798449611,
+      "grad_norm": 1.414017677307129,
+      "learning_rate": 2.080103359173127e-06,
+      "loss": 0.8967,
+      "step": 824
+    },
+    {
+      "epoch": 1.2790697674418605,
+      "grad_norm": 1.5296484231948853,
+      "learning_rate": 2.075796726959518e-06,
+      "loss": 0.8628,
+      "step": 825
+    },
+    {
+      "epoch": 1.2806201550387597,
+      "grad_norm": 1.4565972089767456,
+      "learning_rate": 2.071490094745909e-06,
+      "loss": 0.8846,
+      "step": 826
+    },
+    {
+      "epoch": 1.2821705426356589,
+      "grad_norm": 1.4244471788406372,
+      "learning_rate": 2.0671834625323e-06,
+      "loss": 0.8899,
+      "step": 827
+    },
+    {
+      "epoch": 1.283720930232558,
+      "grad_norm": 1.3682016134262085,
+      "learning_rate": 2.0628768303186913e-06,
+      "loss": 0.8437,
+      "step": 828
+    },
+    {
+      "epoch": 1.2852713178294572,
+      "grad_norm": 1.3579398393630981,
+      "learning_rate": 2.0585701981050822e-06,
+      "loss": 0.8981,
+      "step": 829
+    },
+    {
+      "epoch": 1.2868217054263567,
+      "grad_norm": 1.6583482027053833,
+      "learning_rate": 2.054263565891473e-06,
+      "loss": 0.8743,
+      "step": 830
+    },
+    {
+      "epoch": 1.2868217054263567,
+      "eval_loss": 1.0267916917800903,
+      "eval_runtime": 46.7644,
+      "eval_samples_per_second": 21.384,
+      "eval_steps_per_second": 1.347,
+      "step": 830
+    },
+    {
+      "epoch": 1.2883720930232558,
+      "grad_norm": 1.6551933288574219,
+      "learning_rate": 2.049956933677864e-06,
+      "loss": 0.9314,
+      "step": 831
+    },
+    {
+      "epoch": 1.289922480620155,
+      "grad_norm": 1.3006973266601562,
+      "learning_rate": 2.045650301464255e-06,
+      "loss": 0.8844,
+      "step": 832
+    },
+    {
+      "epoch": 1.2914728682170542,
+      "grad_norm": 1.4112263917922974,
+      "learning_rate": 2.041343669250646e-06,
+      "loss": 0.9061,
+      "step": 833
+    },
+    {
+      "epoch": 1.2930232558139534,
+      "grad_norm": 1.4028356075286865,
+      "learning_rate": 2.037037037037037e-06,
+      "loss": 0.8872,
+      "step": 834
+    },
+    {
+      "epoch": 1.2945736434108528,
+      "grad_norm": 1.5988967418670654,
+      "learning_rate": 2.0327304048234285e-06,
+      "loss": 0.8994,
+      "step": 835
+    },
+    {
+      "epoch": 1.296124031007752,
+      "grad_norm": 1.464126706123352,
+      "learning_rate": 2.0284237726098194e-06,
+      "loss": 0.8918,
+      "step": 836
+    },
+    {
+      "epoch": 1.2976744186046512,
+      "grad_norm": 1.4168930053710938,
+      "learning_rate": 2.0241171403962104e-06,
+      "loss": 0.9086,
+      "step": 837
+    },
+    {
+      "epoch": 1.2992248062015503,
+      "grad_norm": 1.6398478746414185,
+      "learning_rate": 2.0198105081826014e-06,
+      "loss": 0.88,
+      "step": 838
+    },
+    {
+      "epoch": 1.3007751937984495,
+      "grad_norm": 1.3792091608047485,
+      "learning_rate": 2.0155038759689923e-06,
+      "loss": 0.8951,
+      "step": 839
+    },
+    {
+      "epoch": 1.302325581395349,
+      "grad_norm": 1.5528695583343506,
+      "learning_rate": 2.0111972437553833e-06,
+      "loss": 0.8858,
+      "step": 840
+    },
+    {
+      "epoch": 1.302325581395349,
+      "eval_loss": 1.026399850845337,
+      "eval_runtime": 46.7644,
+      "eval_samples_per_second": 21.384,
+      "eval_steps_per_second": 1.347,
+      "step": 840
+    },
+    {
+      "epoch": 1.3038759689922481,
+      "grad_norm": 1.271855115890503,
+      "learning_rate": 2.0068906115417743e-06,
+      "loss": 0.8877,
+      "step": 841
+    },
+    {
+      "epoch": 1.3054263565891473,
+      "grad_norm": 1.2496144771575928,
+      "learning_rate": 2.0025839793281657e-06,
+      "loss": 0.8959,
+      "step": 842
+    },
+    {
+      "epoch": 1.3069767441860465,
+      "grad_norm": 1.5338107347488403,
+      "learning_rate": 1.9982773471145566e-06,
+      "loss": 0.865,
+      "step": 843
+    },
+    {
+      "epoch": 1.3085271317829457,
+      "grad_norm": 1.368303894996643,
+      "learning_rate": 1.9939707149009476e-06,
+      "loss": 0.8666,
+      "step": 844
+    },
+    {
+      "epoch": 1.310077519379845,
+      "grad_norm": 1.3765580654144287,
+      "learning_rate": 1.9896640826873386e-06,
+      "loss": 0.8953,
+      "step": 845
+    },
+    {
+      "epoch": 1.3116279069767443,
+      "grad_norm": 1.5001338720321655,
+      "learning_rate": 1.9853574504737295e-06,
+      "loss": 0.8634,
+      "step": 846
+    },
+    {
+      "epoch": 1.3131782945736434,
+      "grad_norm": 1.650890588760376,
+      "learning_rate": 1.9810508182601205e-06,
+      "loss": 0.8917,
+      "step": 847
+    },
+    {
+      "epoch": 1.3147286821705426,
+      "grad_norm": 1.4128278493881226,
+      "learning_rate": 1.976744186046512e-06,
+      "loss": 0.9183,
+      "step": 848
+    },
+    {
+      "epoch": 1.3162790697674418,
+      "grad_norm": 1.561835527420044,
+      "learning_rate": 1.972437553832903e-06,
+      "loss": 0.8975,
+      "step": 849
+    },
+    {
+      "epoch": 1.3178294573643412,
+      "grad_norm": 1.3987250328063965,
+      "learning_rate": 1.968130921619294e-06,
+      "loss": 0.9322,
+      "step": 850
+    },
+    {
+      "epoch": 1.3178294573643412,
+      "eval_loss": 1.0248056650161743,
+      "eval_runtime": 46.7823,
+      "eval_samples_per_second": 21.376,
+      "eval_steps_per_second": 1.347,
+      "step": 850
+    },
+    {
+      "epoch": 1.3193798449612404,
+      "grad_norm": 1.349995493888855,
+      "learning_rate": 1.963824289405685e-06,
+      "loss": 0.8965,
+      "step": 851
+    },
+    {
+      "epoch": 1.3209302325581396,
+      "grad_norm": 1.7483686208724976,
+      "learning_rate": 1.959517657192076e-06,
+      "loss": 0.88,
+      "step": 852
+    },
+    {
+      "epoch": 1.3224806201550388,
+      "grad_norm": 1.3280638456344604,
+      "learning_rate": 1.955211024978467e-06,
+      "loss": 0.9086,
+      "step": 853
+    },
+    {
+      "epoch": 1.324031007751938,
+      "grad_norm": 1.4023438692092896,
+      "learning_rate": 1.950904392764858e-06,
+      "loss": 0.8778,
+      "step": 854
+    },
+    {
+      "epoch": 1.3255813953488373,
+      "grad_norm": 1.488553762435913,
+      "learning_rate": 1.946597760551249e-06,
+      "loss": 0.9065,
+      "step": 855
+    },
+    {
+      "epoch": 1.3271317829457363,
+      "grad_norm": 1.4213240146636963,
+      "learning_rate": 1.94229112833764e-06,
+      "loss": 0.8722,
+      "step": 856
+    },
+    {
+      "epoch": 1.3286821705426357,
+      "grad_norm": 1.4626022577285767,
+      "learning_rate": 1.9379844961240315e-06,
+      "loss": 0.8851,
+      "step": 857
+    },
+    {
+      "epoch": 1.330232558139535,
+      "grad_norm": 1.3919224739074707,
+      "learning_rate": 1.9336778639104224e-06,
+      "loss": 0.8531,
+      "step": 858
+    },
+    {
+      "epoch": 1.331782945736434,
+      "grad_norm": 1.4865458011627197,
+      "learning_rate": 1.9293712316968134e-06,
+      "loss": 0.897,
+      "step": 859
+    },
+    {
+      "epoch": 1.3333333333333333,
+      "grad_norm": 1.4093918800354004,
+      "learning_rate": 1.9250645994832044e-06,
+      "loss": 0.8791,
+      "step": 860
+    },
+    {
+      "epoch": 1.3333333333333333,
+      "eval_loss": 1.0247600078582764,
+      "eval_runtime": 46.7524,
+      "eval_samples_per_second": 21.389,
+      "eval_steps_per_second": 1.348,
+      "step": 860
+    },
+    {
+      "epoch": 1.3348837209302324,
+      "grad_norm": 1.3935306072235107,
+      "learning_rate": 1.9207579672695953e-06,
+      "loss": 0.8745,
+      "step": 861
+    },
+    {
+      "epoch": 1.3364341085271318,
+      "grad_norm": 1.6292296648025513,
+      "learning_rate": 1.9164513350559863e-06,
+      "loss": 0.8661,
+      "step": 862
+    },
+    {
+      "epoch": 1.337984496124031,
+      "grad_norm": 1.5986114740371704,
+      "learning_rate": 1.9121447028423773e-06,
+      "loss": 0.8763,
+      "step": 863
+    },
+    {
+      "epoch": 1.3395348837209302,
+      "grad_norm": 1.5964652299880981,
+      "learning_rate": 1.9078380706287687e-06,
+      "loss": 0.8869,
+      "step": 864
+    },
+    {
+      "epoch": 1.3410852713178294,
+      "grad_norm": 1.3685358762741089,
+      "learning_rate": 1.9035314384151596e-06,
+      "loss": 0.8891,
+      "step": 865
+    },
+    {
+      "epoch": 1.3426356589147286,
+      "grad_norm": 1.6272172927856445,
+      "learning_rate": 1.8992248062015506e-06,
+      "loss": 0.8855,
+      "step": 866
+    },
+    {
+      "epoch": 1.344186046511628,
+      "grad_norm": 1.669884204864502,
+      "learning_rate": 1.8949181739879416e-06,
+      "loss": 0.879,
+      "step": 867
+    },
+    {
+      "epoch": 1.3457364341085272,
+      "grad_norm": 1.4029791355133057,
+      "learning_rate": 1.8906115417743325e-06,
+      "loss": 0.8685,
+      "step": 868
+    },
+    {
+      "epoch": 1.3472868217054264,
+      "grad_norm": 1.5204144716262817,
+      "learning_rate": 1.8863049095607235e-06,
+      "loss": 0.9151,
+      "step": 869
+    },
+    {
+      "epoch": 1.3488372093023255,
+      "grad_norm": 1.6033802032470703,
+      "learning_rate": 1.8819982773471147e-06,
+      "loss": 0.8891,
+      "step": 870
+    },
+    {
+      "epoch": 1.3488372093023255,
+      "eval_loss": 1.0215697288513184,
+      "eval_runtime": 46.7118,
+      "eval_samples_per_second": 21.408,
+      "eval_steps_per_second": 1.349,
+      "step": 870
+    },
+    {
+      "epoch": 1.3503875968992247,
+      "grad_norm": 1.290747046470642,
+      "learning_rate": 1.8776916451335059e-06,
+      "loss": 0.8761,
+      "step": 871
+    },
+    {
+      "epoch": 1.3519379844961241,
+      "grad_norm": 1.5597420930862427,
+      "learning_rate": 1.8733850129198968e-06,
+      "loss": 0.8585,
+      "step": 872
+    },
+    {
+      "epoch": 1.3534883720930233,
+      "grad_norm": 1.3587156534194946,
+      "learning_rate": 1.8690783807062878e-06,
+      "loss": 0.8602,
+      "step": 873
+    },
+    {
+      "epoch": 1.3550387596899225,
+      "grad_norm": 1.3756153583526611,
+      "learning_rate": 1.864771748492679e-06,
+      "loss": 0.9024,
+      "step": 874
+    },
+    {
+      "epoch": 1.3565891472868217,
+      "grad_norm": 1.3279125690460205,
+      "learning_rate": 1.86046511627907e-06,
+      "loss": 0.8898,
+      "step": 875
+    },
+    {
+      "epoch": 1.3581395348837209,
+      "grad_norm": 1.416416883468628,
+      "learning_rate": 1.856158484065461e-06,
+      "loss": 0.8923,
+      "step": 876
+    },
+    {
+      "epoch": 1.3596899224806203,
+      "grad_norm": 1.5571491718292236,
+      "learning_rate": 1.8518518518518519e-06,
+      "loss": 0.9037,
+      "step": 877
+    },
+    {
+      "epoch": 1.3612403100775194,
+      "grad_norm": 1.5523213148117065,
+      "learning_rate": 1.8475452196382433e-06,
+      "loss": 0.8944,
+      "step": 878
+    },
+    {
+      "epoch": 1.3627906976744186,
+      "grad_norm": 1.4088724851608276,
+      "learning_rate": 1.8432385874246342e-06,
+      "loss": 0.9118,
+      "step": 879
+    },
+    {
+      "epoch": 1.3643410852713178,
+      "grad_norm": 1.3843094110488892,
+      "learning_rate": 1.8389319552110252e-06,
+      "loss": 0.8646,
+      "step": 880
+    },
+    {
+      "epoch": 1.3643410852713178,
+      "eval_loss": 1.0226423740386963,
+      "eval_runtime": 46.7174,
+      "eval_samples_per_second": 21.405,
+      "eval_steps_per_second": 1.349,
+      "step": 880
+    },
+    {
+      "epoch": 1.365891472868217,
+      "grad_norm": 1.386268138885498,
+      "learning_rate": 1.8346253229974162e-06,
+      "loss": 0.8754,
+      "step": 881
+    },
+    {
+      "epoch": 1.3674418604651164,
+      "grad_norm": 1.3752778768539429,
+      "learning_rate": 1.8303186907838071e-06,
+      "loss": 0.8663,
+      "step": 882
+    },
+    {
+      "epoch": 1.3689922480620156,
+      "grad_norm": 1.56470787525177,
+      "learning_rate": 1.826012058570198e-06,
+      "loss": 0.8255,
+      "step": 883
+    },
+    {
+      "epoch": 1.3705426356589148,
+      "grad_norm": 1.2622406482696533,
+      "learning_rate": 1.8217054263565893e-06,
+      "loss": 0.8959,
+      "step": 884
+    },
+    {
+      "epoch": 1.372093023255814,
+      "grad_norm": 1.473328948020935,
+      "learning_rate": 1.8173987941429802e-06,
+      "loss": 0.8822,
+      "step": 885
+    },
+    {
+      "epoch": 1.3736434108527131,
+      "grad_norm": 1.4207509756088257,
+      "learning_rate": 1.8130921619293714e-06,
+      "loss": 0.8792,
+      "step": 886
+    },
+    {
+      "epoch": 1.3751937984496125,
+      "grad_norm": 1.6432552337646484,
+      "learning_rate": 1.8087855297157624e-06,
+      "loss": 0.8789,
+      "step": 887
+    },
+    {
+      "epoch": 1.3767441860465115,
+      "grad_norm": 1.360485315322876,
+      "learning_rate": 1.8044788975021536e-06,
+      "loss": 0.8578,
+      "step": 888
+    },
+    {
+      "epoch": 1.378294573643411,
+      "grad_norm": 1.2980175018310547,
+      "learning_rate": 1.8001722652885445e-06,
+      "loss": 0.9028,
+      "step": 889
+    },
+    {
+      "epoch": 1.37984496124031,
+      "grad_norm": 1.4118475914001465,
+      "learning_rate": 1.7958656330749355e-06,
+      "loss": 0.8923,
+      "step": 890
+    },
+    {
+      "epoch": 1.37984496124031,
+      "eval_loss": 1.0218552350997925,
+      "eval_runtime": 46.7621,
+      "eval_samples_per_second": 21.385,
+      "eval_steps_per_second": 1.347,
+      "step": 890
+    },
+    {
+      "epoch": 1.3813953488372093,
+      "grad_norm": 1.4917428493499756,
+      "learning_rate": 1.7915590008613265e-06,
+      "loss": 0.865,
+      "step": 891
+    },
+    {
+      "epoch": 1.3829457364341085,
+      "grad_norm": 1.2792720794677734,
+      "learning_rate": 1.7872523686477174e-06,
+      "loss": 0.8735,
+      "step": 892
+    },
+    {
+      "epoch": 1.3844961240310076,
+      "grad_norm": 1.3006173372268677,
+      "learning_rate": 1.7829457364341088e-06,
+      "loss": 0.8578,
+      "step": 893
+    },
+    {
+      "epoch": 1.386046511627907,
+      "grad_norm": 1.5386368036270142,
+      "learning_rate": 1.7786391042204998e-06,
+      "loss": 0.8904,
+      "step": 894
+    },
+    {
+      "epoch": 1.3875968992248062,
+      "grad_norm": 1.5131858587265015,
+      "learning_rate": 1.7743324720068908e-06,
+      "loss": 0.8607,
+      "step": 895
+    },
+    {
+      "epoch": 1.3891472868217054,
+      "grad_norm": 1.478264570236206,
+      "learning_rate": 1.7700258397932817e-06,
+      "loss": 0.8655,
+      "step": 896
+    },
+    {
+      "epoch": 1.3906976744186046,
+      "grad_norm": 1.4144504070281982,
+      "learning_rate": 1.7657192075796727e-06,
+      "loss": 0.8745,
+      "step": 897
+    },
+    {
+      "epoch": 1.3922480620155038,
+      "grad_norm": 1.3581087589263916,
+      "learning_rate": 1.7614125753660639e-06,
+      "loss": 0.8595,
+      "step": 898
+    },
+    {
+      "epoch": 1.3937984496124032,
+      "grad_norm": 1.3146580457687378,
+      "learning_rate": 1.7571059431524549e-06,
+      "loss": 0.8602,
+      "step": 899
+    },
+    {
+      "epoch": 1.3953488372093024,
+      "grad_norm": 1.5949207544326782,
+      "learning_rate": 1.752799310938846e-06,
+      "loss": 0.9105,
+      "step": 900
+    },
+    {
+      "epoch": 1.3953488372093024,
+      "eval_loss": 1.0218842029571533,
+      "eval_runtime": 46.6933,
+      "eval_samples_per_second": 21.416,
+      "eval_steps_per_second": 1.349,
+      "step": 900
+    },
+    {
+      "epoch": 1.3968992248062015,
+      "grad_norm": 1.3286627531051636,
+      "learning_rate": 1.748492678725237e-06,
+      "loss": 0.8767,
+      "step": 901
+    },
+    {
+      "epoch": 1.3984496124031007,
+      "grad_norm": 1.3980764150619507,
+      "learning_rate": 1.7441860465116282e-06,
+      "loss": 0.8573,
+      "step": 902
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 1.4759293794631958,
+      "learning_rate": 1.7398794142980192e-06,
+      "loss": 0.8628,
+      "step": 903
+    },
+    {
+      "epoch": 1.4015503875968993,
+      "grad_norm": 1.4072186946868896,
+      "learning_rate": 1.7355727820844101e-06,
+      "loss": 0.8829,
+      "step": 904
+    },
+    {
+      "epoch": 1.4031007751937985,
+      "grad_norm": 1.3058583736419678,
+      "learning_rate": 1.731266149870801e-06,
+      "loss": 0.8833,
+      "step": 905
+    },
+    {
+      "epoch": 1.4046511627906977,
+      "grad_norm": 1.5181350708007812,
+      "learning_rate": 1.726959517657192e-06,
+      "loss": 0.8492,
+      "step": 906
+    },
+    {
+      "epoch": 1.4062015503875969,
+      "grad_norm": 1.4261865615844727,
+      "learning_rate": 1.7226528854435834e-06,
+      "loss": 0.8881,
+      "step": 907
+    },
+    {
+      "epoch": 1.407751937984496,
+      "grad_norm": 1.3683359622955322,
+      "learning_rate": 1.7183462532299744e-06,
+      "loss": 0.8904,
+      "step": 908
+    },
+    {
+      "epoch": 1.4093023255813955,
+      "grad_norm": 1.519061803817749,
+      "learning_rate": 1.7140396210163654e-06,
+      "loss": 0.8609,
+      "step": 909
+    },
+    {
+      "epoch": 1.4108527131782946,
+      "grad_norm": 1.5184195041656494,
+      "learning_rate": 1.7097329888027563e-06,
+      "loss": 0.8768,
+      "step": 910
+    },
+    {
+      "epoch": 1.4108527131782946,
+      "eval_loss": 1.0236916542053223,
+      "eval_runtime": 46.6499,
+      "eval_samples_per_second": 21.436,
+      "eval_steps_per_second": 1.35,
+      "step": 910
+    },
+    {
+      "epoch": 1.4124031007751938,
+      "grad_norm": 1.2919946908950806,
+      "learning_rate": 1.7054263565891473e-06,
+      "loss": 0.8799,
+      "step": 911
+    },
+    {
+      "epoch": 1.413953488372093,
+      "grad_norm": 1.4635928869247437,
+      "learning_rate": 1.7011197243755385e-06,
+      "loss": 0.9136,
+      "step": 912
+    },
+    {
+      "epoch": 1.4155038759689922,
+      "grad_norm": 1.3878158330917358,
+      "learning_rate": 1.6968130921619295e-06,
+      "loss": 0.8462,
+      "step": 913
+    },
+    {
+      "epoch": 1.4170542635658916,
+      "grad_norm": 1.375191569328308,
+      "learning_rate": 1.6925064599483206e-06,
+      "loss": 0.8843,
+      "step": 914
+    },
+    {
+      "epoch": 1.4186046511627908,
+      "grad_norm": 1.5198922157287598,
+      "learning_rate": 1.6881998277347116e-06,
+      "loss": 0.842,
+      "step": 915
+    },
+    {
+      "epoch": 1.42015503875969,
+      "grad_norm": 1.5138405561447144,
+      "learning_rate": 1.6838931955211028e-06,
+      "loss": 0.8676,
+      "step": 916
+    },
+    {
+      "epoch": 1.4217054263565891,
+      "grad_norm": 1.3270615339279175,
+      "learning_rate": 1.6795865633074938e-06,
+      "loss": 0.8844,
+      "step": 917
+    },
+    {
+      "epoch": 1.4232558139534883,
+      "grad_norm": 1.3569897413253784,
+      "learning_rate": 1.6752799310938847e-06,
+      "loss": 0.8724,
+      "step": 918
+    },
+    {
+      "epoch": 1.4248062015503877,
+      "grad_norm": 1.3835643529891968,
+      "learning_rate": 1.6709732988802757e-06,
+      "loss": 0.921,
+      "step": 919
+    },
+    {
+      "epoch": 1.4263565891472867,
+      "grad_norm": 1.406451940536499,
+      "learning_rate": 1.6666666666666667e-06,
+      "loss": 0.8555,
+      "step": 920
+    },
+    {
+      "epoch": 1.4263565891472867,
+      "eval_loss": 1.0234382152557373,
+      "eval_runtime": 46.7532,
+      "eval_samples_per_second": 21.389,
+      "eval_steps_per_second": 1.348,
+      "step": 920
+    },
+    {
+      "epoch": 1.427906976744186,
+      "grad_norm": 1.3230928182601929,
+      "learning_rate": 1.6623600344530576e-06,
+      "loss": 0.8871,
+      "step": 921
+    },
+    {
+      "epoch": 1.4294573643410853,
+      "grad_norm": 1.4418071508407593,
+      "learning_rate": 1.658053402239449e-06,
+      "loss": 0.8918,
+      "step": 922
+    },
+    {
+      "epoch": 1.4310077519379845,
+      "grad_norm": 1.3506301641464233,
+      "learning_rate": 1.65374677002584e-06,
+      "loss": 0.8637,
+      "step": 923
+    },
+    {
+      "epoch": 1.4325581395348836,
+      "grad_norm": 1.537008285522461,
+      "learning_rate": 1.649440137812231e-06,
+      "loss": 0.8866,
+      "step": 924
+    },
+    {
+      "epoch": 1.4341085271317828,
+      "grad_norm": 1.418975830078125,
+      "learning_rate": 1.645133505598622e-06,
+      "loss": 0.9255,
+      "step": 925
+    },
+    {
+      "epoch": 1.4356589147286822,
+      "grad_norm": 1.5376110076904297,
+      "learning_rate": 1.640826873385013e-06,
+      "loss": 0.8596,
+      "step": 926
+    },
+    {
+      "epoch": 1.4372093023255814,
+      "grad_norm": 1.4103554487228394,
+      "learning_rate": 1.636520241171404e-06,
+      "loss": 0.869,
+      "step": 927
+    },
+    {
+      "epoch": 1.4387596899224806,
+      "grad_norm": 1.6398717164993286,
+      "learning_rate": 1.632213608957795e-06,
+      "loss": 0.9124,
+      "step": 928
+    },
+    {
+      "epoch": 1.4403100775193798,
+      "grad_norm": 1.4364229440689087,
+      "learning_rate": 1.6279069767441862e-06,
+      "loss": 0.9052,
+      "step": 929
+    },
+    {
+      "epoch": 1.441860465116279,
+      "grad_norm": 1.9486956596374512,
+      "learning_rate": 1.6236003445305774e-06,
+      "loss": 0.876,
+      "step": 930
+    },
+    {
+      "epoch": 1.441860465116279,
+      "eval_loss": 1.0196127891540527,
+      "eval_runtime": 46.6689,
+      "eval_samples_per_second": 21.428,
+      "eval_steps_per_second": 1.35,
+      "step": 930
+    },
+    {
+      "epoch": 1.4434108527131784,
+      "grad_norm": 1.3562145233154297,
+      "learning_rate": 1.6192937123169684e-06,
+      "loss": 0.9019,
+      "step": 931
+    },
+    {
+      "epoch": 1.4449612403100776,
+      "grad_norm": 1.6002224683761597,
+      "learning_rate": 1.6149870801033593e-06,
+      "loss": 0.8805,
+      "step": 932
+    },
+    {
+      "epoch": 1.4465116279069767,
+      "grad_norm": 1.3504170179367065,
+      "learning_rate": 1.6106804478897503e-06,
+      "loss": 0.9131,
+      "step": 933
+    },
+    {
+      "epoch": 1.448062015503876,
+      "grad_norm": 1.3638203144073486,
+      "learning_rate": 1.6063738156761413e-06,
+      "loss": 0.8759,
+      "step": 934
+    },
+    {
+      "epoch": 1.449612403100775,
+      "grad_norm": 1.2600183486938477,
+      "learning_rate": 1.6020671834625322e-06,
+      "loss": 0.8757,
+      "step": 935
+    },
+    {
+      "epoch": 1.4511627906976745,
+      "grad_norm": 1.4572343826293945,
+      "learning_rate": 1.5977605512489236e-06,
+      "loss": 0.8628,
+      "step": 936
+    },
+    {
+      "epoch": 1.4527131782945737,
+      "grad_norm": 1.6208621263504028,
+      "learning_rate": 1.5934539190353146e-06,
+      "loss": 0.899,
+      "step": 937
+    },
+    {
+      "epoch": 1.4542635658914729,
+      "grad_norm": 1.4335441589355469,
+      "learning_rate": 1.5891472868217056e-06,
+      "loss": 0.8645,
+      "step": 938
+    },
+    {
+      "epoch": 1.455813953488372,
+      "grad_norm": 1.396388292312622,
+      "learning_rate": 1.5848406546080965e-06,
+      "loss": 0.9317,
+      "step": 939
+    },
+    {
+      "epoch": 1.4573643410852712,
+      "grad_norm": 1.5560067892074585,
+      "learning_rate": 1.5805340223944877e-06,
+      "loss": 0.8483,
+      "step": 940
+    },
+    {
+      "epoch": 1.4573643410852712,
+      "eval_loss": 1.0217546224594116,
+      "eval_runtime": 46.6616,
+      "eval_samples_per_second": 21.431,
+      "eval_steps_per_second": 1.35,
+      "step": 940
+    },
+    {
+      "epoch": 1.4589147286821706,
+      "grad_norm": 1.4226781129837036,
+      "learning_rate": 1.5762273901808787e-06,
+      "loss": 0.896,
+      "step": 941
+    },
+    {
+      "epoch": 1.4604651162790698,
+      "grad_norm": 1.2623789310455322,
+      "learning_rate": 1.5719207579672696e-06,
+      "loss": 0.875,
+      "step": 942
+    },
+    {
+      "epoch": 1.462015503875969,
+      "grad_norm": 1.440442442893982,
+      "learning_rate": 1.5676141257536608e-06,
+      "loss": 0.8916,
+      "step": 943
+    },
+    {
+      "epoch": 1.4635658914728682,
+      "grad_norm": 1.4717267751693726,
+      "learning_rate": 1.563307493540052e-06,
+      "loss": 0.8684,
+      "step": 944
+    },
+    {
+      "epoch": 1.4651162790697674,
+      "grad_norm": 1.418413758277893,
+      "learning_rate": 1.559000861326443e-06,
+      "loss": 0.8677,
+      "step": 945
+    },
+    {
+      "epoch": 1.4666666666666668,
+      "grad_norm": 1.4560842514038086,
+      "learning_rate": 1.554694229112834e-06,
+      "loss": 0.8496,
+      "step": 946
+    },
+    {
+      "epoch": 1.468217054263566,
+      "grad_norm": 1.7570191621780396,
+      "learning_rate": 1.550387596899225e-06,
+      "loss": 0.9065,
+      "step": 947
+    },
+    {
+      "epoch": 1.4697674418604652,
+      "grad_norm": 1.2934919595718384,
+      "learning_rate": 1.5460809646856159e-06,
+      "loss": 0.8618,
+      "step": 948
+    },
+    {
+      "epoch": 1.4713178294573643,
+      "grad_norm": 1.404919147491455,
+      "learning_rate": 1.5417743324720068e-06,
+      "loss": 0.8713,
+      "step": 949
+    },
+    {
+      "epoch": 1.4728682170542635,
+      "grad_norm": 1.5484105348587036,
+      "learning_rate": 1.537467700258398e-06,
+      "loss": 0.8946,
+      "step": 950
+    },
+    {
+      "epoch": 1.4728682170542635,
+      "eval_loss": 1.0181382894515991,
+      "eval_runtime": 46.6806,
+      "eval_samples_per_second": 21.422,
+      "eval_steps_per_second": 1.35,
+      "step": 950
+    },
+    {
+      "epoch": 1.474418604651163,
+      "grad_norm": 1.3862688541412354,
+      "learning_rate": 1.5331610680447892e-06,
+      "loss": 0.8691,
+      "step": 951
+    },
+    {
+      "epoch": 1.4759689922480619,
+      "grad_norm": 1.417663335800171,
+      "learning_rate": 1.5288544358311802e-06,
+      "loss": 0.8743,
+      "step": 952
+    },
+    {
+      "epoch": 1.4775193798449613,
+      "grad_norm": 1.374737024307251,
+      "learning_rate": 1.5245478036175711e-06,
+      "loss": 0.8856,
+      "step": 953
+    },
+    {
+      "epoch": 1.4790697674418605,
+      "grad_norm": 1.3962092399597168,
+      "learning_rate": 1.5202411714039621e-06,
+      "loss": 0.9071,
+      "step": 954
+    },
+    {
+      "epoch": 1.4806201550387597,
+      "grad_norm": 1.3754186630249023,
+      "learning_rate": 1.5159345391903533e-06,
+      "loss": 0.879,
+      "step": 955
+    },
+    {
+      "epoch": 1.4821705426356588,
+      "grad_norm": 1.3639583587646484,
+      "learning_rate": 1.5116279069767443e-06,
+      "loss": 0.853,
+      "step": 956
+    },
+    {
+      "epoch": 1.483720930232558,
+      "grad_norm": 1.312952995300293,
+      "learning_rate": 1.5073212747631352e-06,
+      "loss": 0.8534,
+      "step": 957
+    },
+    {
+      "epoch": 1.4852713178294574,
+      "grad_norm": 1.432968258857727,
+      "learning_rate": 1.5030146425495264e-06,
+      "loss": 0.8553,
+      "step": 958
+    },
+    {
+      "epoch": 1.4868217054263566,
+      "grad_norm": 1.5956652164459229,
+      "learning_rate": 1.4987080103359176e-06,
+      "loss": 0.8849,
+      "step": 959
+    },
+    {
+      "epoch": 1.4883720930232558,
+      "grad_norm": 1.5594089031219482,
+      "learning_rate": 1.4944013781223086e-06,
+      "loss": 0.8654,
+      "step": 960
+    },
+    {
+      "epoch": 1.4883720930232558,
+      "eval_loss": 1.017091989517212,
+      "eval_runtime": 46.7467,
+      "eval_samples_per_second": 21.392,
+      "eval_steps_per_second": 1.348,
+      "step": 960
+    },
+    {
+      "epoch": 1.489922480620155,
+      "grad_norm": 1.3392845392227173,
+      "learning_rate": 1.4900947459086995e-06,
+      "loss": 0.882,
+      "step": 961
+    },
+    {
+      "epoch": 1.4914728682170542,
+      "grad_norm": 1.3368501663208008,
+      "learning_rate": 1.4857881136950905e-06,
+      "loss": 0.8967,
+      "step": 962
+    },
+    {
+      "epoch": 1.4930232558139536,
+      "grad_norm": 1.3974400758743286,
+      "learning_rate": 1.4814814814814815e-06,
+      "loss": 0.8687,
+      "step": 963
+    },
+    {
+      "epoch": 1.4945736434108527,
+      "grad_norm": 1.4317644834518433,
+      "learning_rate": 1.4771748492678724e-06,
+      "loss": 0.8867,
+      "step": 964
+    },
+    {
+      "epoch": 1.496124031007752,
+      "grad_norm": 1.5514066219329834,
+      "learning_rate": 1.4728682170542638e-06,
+      "loss": 0.8486,
+      "step": 965
+    },
+    {
+      "epoch": 1.4976744186046511,
+      "grad_norm": 1.5198034048080444,
+      "learning_rate": 1.4685615848406548e-06,
+      "loss": 0.8577,
+      "step": 966
+    },
+    {
+      "epoch": 1.4992248062015503,
+      "grad_norm": 1.3727871179580688,
+      "learning_rate": 1.4642549526270457e-06,
+      "loss": 0.8795,
+      "step": 967
+    },
+    {
+      "epoch": 1.5007751937984497,
+      "grad_norm": 1.3373262882232666,
+      "learning_rate": 1.4599483204134367e-06,
+      "loss": 0.8795,
+      "step": 968
+    },
+    {
+      "epoch": 1.5023255813953489,
+      "grad_norm": 1.358616590499878,
+      "learning_rate": 1.455641688199828e-06,
+      "loss": 0.8924,
+      "step": 969
+    },
+    {
+      "epoch": 1.503875968992248,
+      "grad_norm": 1.3942570686340332,
+      "learning_rate": 1.4513350559862189e-06,
+      "loss": 0.8977,
+      "step": 970
+    },
+    {
+      "epoch": 1.503875968992248,
+      "eval_loss": 1.018630027770996,
+      "eval_runtime": 46.7829,
+      "eval_samples_per_second": 21.375,
+      "eval_steps_per_second": 1.347,
+      "step": 970
+    },
+    {
+      "epoch": 1.5054263565891473,
+      "grad_norm": 1.560254454612732,
+      "learning_rate": 1.4470284237726098e-06,
+      "loss": 0.8894,
+      "step": 971
+    },
+    {
+      "epoch": 1.5069767441860464,
+      "grad_norm": 1.3579188585281372,
+      "learning_rate": 1.442721791559001e-06,
+      "loss": 0.8544,
+      "step": 972
+    },
+    {
+      "epoch": 1.5085271317829458,
+      "grad_norm": 1.4697273969650269,
+      "learning_rate": 1.4384151593453922e-06,
+      "loss": 0.896,
+      "step": 973
+    },
+    {
+      "epoch": 1.5100775193798448,
+      "grad_norm": 1.4097820520401,
+      "learning_rate": 1.4341085271317832e-06,
+      "loss": 0.8713,
+      "step": 974
+    },
+    {
+      "epoch": 1.5116279069767442,
+      "grad_norm": 1.326789379119873,
+      "learning_rate": 1.4298018949181741e-06,
+      "loss": 0.8309,
+      "step": 975
+    },
+    {
+      "epoch": 1.5131782945736434,
+      "grad_norm": 1.573042631149292,
+      "learning_rate": 1.425495262704565e-06,
+      "loss": 0.8647,
+      "step": 976
+    },
+    {
+      "epoch": 1.5147286821705426,
+      "grad_norm": 1.374802589416504,
+      "learning_rate": 1.421188630490956e-06,
+      "loss": 0.9035,
+      "step": 977
+    },
+    {
+      "epoch": 1.516279069767442,
+      "grad_norm": 1.363222360610962,
+      "learning_rate": 1.416881998277347e-06,
+      "loss": 0.9028,
+      "step": 978
+    },
+    {
+      "epoch": 1.517829457364341,
+      "grad_norm": 1.6008557081222534,
+      "learning_rate": 1.4125753660637384e-06,
+      "loss": 0.9165,
+      "step": 979
+    },
+    {
+      "epoch": 1.5193798449612403,
+      "grad_norm": 1.4018700122833252,
+      "learning_rate": 1.4082687338501294e-06,
+      "loss": 0.879,
+      "step": 980
+    },
+    {
+      "epoch": 1.5193798449612403,
+      "eval_loss": 1.0206389427185059,
+      "eval_runtime": 47.0863,
+      "eval_samples_per_second": 21.238,
+      "eval_steps_per_second": 1.338,
+      "step": 980
+    },
+    {
+      "epoch": 1.5209302325581395,
+      "grad_norm": 1.3979579210281372,
+      "learning_rate": 1.4039621016365204e-06,
+      "loss": 0.8617,
+      "step": 981
+    },
+    {
+      "epoch": 1.5224806201550387,
+      "grad_norm": 1.4407587051391602,
+      "learning_rate": 1.3996554694229113e-06,
+      "loss": 0.8488,
+      "step": 982
+    },
+    {
+      "epoch": 1.5240310077519381,
+      "grad_norm": 1.4984155893325806,
+      "learning_rate": 1.3953488372093025e-06,
+      "loss": 0.8823,
+      "step": 983
+    },
+    {
+      "epoch": 1.525581395348837,
+      "grad_norm": 1.3611596822738647,
+      "learning_rate": 1.3910422049956935e-06,
+      "loss": 0.8595,
+      "step": 984
+    },
+    {
+      "epoch": 1.5271317829457365,
+      "grad_norm": 1.49971604347229,
+      "learning_rate": 1.3867355727820844e-06,
+      "loss": 0.9004,
+      "step": 985
+    },
+    {
+      "epoch": 1.5286821705426357,
+      "grad_norm": 1.4561238288879395,
+      "learning_rate": 1.3824289405684754e-06,
+      "loss": 0.8699,
+      "step": 986
+    },
+    {
+      "epoch": 1.5302325581395348,
+      "grad_norm": 1.4849098920822144,
+      "learning_rate": 1.3781223083548668e-06,
+      "loss": 0.8434,
+      "step": 987
+    },
+    {
+      "epoch": 1.5317829457364343,
+      "grad_norm": 1.4471638202667236,
+      "learning_rate": 1.3738156761412578e-06,
+      "loss": 0.8643,
+      "step": 988
+    },
+    {
+      "epoch": 1.5333333333333332,
+      "grad_norm": 1.2716907262802124,
+      "learning_rate": 1.3695090439276487e-06,
+      "loss": 0.8657,
+      "step": 989
+    },
+    {
+      "epoch": 1.5348837209302326,
+      "grad_norm": 1.3805012702941895,
+      "learning_rate": 1.3652024117140397e-06,
+      "loss": 0.8757,
+      "step": 990
+    },
+    {
+      "epoch": 1.5348837209302326,
+      "eval_loss": 1.0188567638397217,
+      "eval_runtime": 46.9241,
+      "eval_samples_per_second": 21.311,
+      "eval_steps_per_second": 1.343,
+      "step": 990
+    },
+    {
+      "epoch": 1.5364341085271318,
+      "grad_norm": 1.333486557006836,
+      "learning_rate": 1.3608957795004307e-06,
+      "loss": 0.8503,
+      "step": 991
+    },
+    {
+      "epoch": 1.537984496124031,
+      "grad_norm": 1.3029228448867798,
+      "learning_rate": 1.3565891472868216e-06,
+      "loss": 0.8412,
+      "step": 992
+    },
+    {
+      "epoch": 1.5395348837209304,
+      "grad_norm": 1.4335917234420776,
+      "learning_rate": 1.3522825150732128e-06,
+      "loss": 0.8768,
+      "step": 993
+    },
+    {
+      "epoch": 1.5410852713178294,
+      "grad_norm": 1.6070386171340942,
+      "learning_rate": 1.347975882859604e-06,
+      "loss": 0.8478,
+      "step": 994
+    },
+    {
+      "epoch": 1.5426356589147288,
+      "grad_norm": 1.4501957893371582,
+      "learning_rate": 1.343669250645995e-06,
+      "loss": 0.8681,
+      "step": 995
+    },
+    {
+      "epoch": 1.544186046511628,
+      "grad_norm": 1.4788450002670288,
+      "learning_rate": 1.339362618432386e-06,
+      "loss": 0.8757,
+      "step": 996
+    },
+    {
+      "epoch": 1.5457364341085271,
+      "grad_norm": 1.4308056831359863,
+      "learning_rate": 1.3350559862187771e-06,
+      "loss": 0.8965,
+      "step": 997
+    },
+    {
+      "epoch": 1.5472868217054263,
+      "grad_norm": 1.5292010307312012,
+      "learning_rate": 1.330749354005168e-06,
+      "loss": 0.8452,
+      "step": 998
+    },
+    {
+      "epoch": 1.5488372093023255,
+      "grad_norm": 1.4352688789367676,
+      "learning_rate": 1.326442721791559e-06,
+      "loss": 0.8677,
+      "step": 999
+    },
+    {
+      "epoch": 1.550387596899225,
+      "grad_norm": 1.3322887420654297,
+      "learning_rate": 1.32213608957795e-06,
+      "loss": 0.8788,
+      "step": 1000
+    },
+    {
+      "epoch": 1.550387596899225,
+      "eval_loss": 1.014811635017395,
+      "eval_runtime": 46.8871,
+      "eval_samples_per_second": 21.328,
+      "eval_steps_per_second": 1.344,
+      "step": 1000
+    },
+    {
+      "epoch": 1.551937984496124,
+      "grad_norm": 1.272476077079773,
+      "learning_rate": 1.3178294573643414e-06,
+      "loss": 0.8553,
+      "step": 1001
+    },
+    {
+      "epoch": 1.5534883720930233,
+      "grad_norm": 1.3475803136825562,
+      "learning_rate": 1.3135228251507324e-06,
+      "loss": 0.9133,
+      "step": 1002
+    },
+    {
+      "epoch": 1.5550387596899224,
+      "grad_norm": 1.677804708480835,
+      "learning_rate": 1.3092161929371233e-06,
+      "loss": 0.8689,
+      "step": 1003
+    },
+    {
+      "epoch": 1.5565891472868216,
+      "grad_norm": 1.4577577114105225,
+      "learning_rate": 1.3049095607235143e-06,
+      "loss": 0.8971,
+      "step": 1004
+    },
+    {
+      "epoch": 1.558139534883721,
+      "grad_norm": 1.408029317855835,
+      "learning_rate": 1.3006029285099053e-06,
+      "loss": 0.8724,
+      "step": 1005
+    },
+    {
+      "epoch": 1.55968992248062,
+      "grad_norm": 1.579338788986206,
+      "learning_rate": 1.2962962962962962e-06,
+      "loss": 0.889,
+      "step": 1006
+    },
+    {
+      "epoch": 1.5612403100775194,
+      "grad_norm": 1.338676929473877,
+      "learning_rate": 1.2919896640826874e-06,
+      "loss": 0.8721,
+      "step": 1007
+    },
+    {
+      "epoch": 1.5627906976744186,
+      "grad_norm": 1.50413978099823,
+      "learning_rate": 1.2876830318690786e-06,
+      "loss": 0.8771,
+      "step": 1008
+    },
+    {
+      "epoch": 1.5643410852713178,
+      "grad_norm": 1.3786473274230957,
+      "learning_rate": 1.2833763996554696e-06,
+      "loss": 0.859,
+      "step": 1009
+    },
+    {
+      "epoch": 1.5658914728682172,
+      "grad_norm": 1.5475035905838013,
+      "learning_rate": 1.2790697674418605e-06,
+      "loss": 0.885,
+      "step": 1010
+    },
+    {
+      "epoch": 1.5658914728682172,
+      "eval_loss": 1.0137555599212646,
+      "eval_runtime": 46.6808,
+      "eval_samples_per_second": 21.422,
+      "eval_steps_per_second": 1.35,
+      "step": 1010
+    },
+    {
+      "epoch": 1.5674418604651161,
+      "grad_norm": 1.5142121315002441,
+      "learning_rate": 1.2747631352282517e-06,
+      "loss": 0.8622,
+      "step": 1011
+    },
+    {
+      "epoch": 1.5689922480620155,
+      "grad_norm": 1.3826631307601929,
+      "learning_rate": 1.2704565030146427e-06,
+      "loss": 0.8431,
+      "step": 1012
+    },
+    {
+      "epoch": 1.5705426356589147,
+      "grad_norm": 1.3889191150665283,
+      "learning_rate": 1.2661498708010337e-06,
+      "loss": 0.8674,
+      "step": 1013
+    },
+    {
+      "epoch": 1.572093023255814,
+      "grad_norm": 1.5307679176330566,
+      "learning_rate": 1.2618432385874246e-06,
+      "loss": 0.8782,
+      "step": 1014
+    },
+    {
+      "epoch": 1.5736434108527133,
+      "grad_norm": 1.6250945329666138,
+      "learning_rate": 1.257536606373816e-06,
+      "loss": 0.8839,
+      "step": 1015
+    },
+    {
+      "epoch": 1.5751937984496123,
+      "grad_norm": 1.3860634565353394,
+      "learning_rate": 1.253229974160207e-06,
+      "loss": 0.8722,
+      "step": 1016
+    },
+    {
+      "epoch": 1.5767441860465117,
+      "grad_norm": 1.4336800575256348,
+      "learning_rate": 1.248923341946598e-06,
+      "loss": 0.8492,
+      "step": 1017
+    },
+    {
+      "epoch": 1.5782945736434109,
+      "grad_norm": 1.4651581048965454,
+      "learning_rate": 1.244616709732989e-06,
+      "loss": 0.8478,
+      "step": 1018
+    },
+    {
+      "epoch": 1.57984496124031,
+      "grad_norm": 1.3516453504562378,
+      "learning_rate": 1.2403100775193799e-06,
+      "loss": 0.887,
+      "step": 1019
+    },
+    {
+      "epoch": 1.5813953488372094,
+      "grad_norm": 1.4061412811279297,
+      "learning_rate": 1.2360034453057709e-06,
+      "loss": 0.8648,
+      "step": 1020
+    },
+    {
+      "epoch": 1.5813953488372094,
+      "eval_loss": 1.0146329402923584,
+      "eval_runtime": 46.4999,
+      "eval_samples_per_second": 21.505,
+      "eval_steps_per_second": 1.355,
+      "step": 1020
+    },
+    {
+      "epoch": 1.5829457364341084,
+      "grad_norm": 1.4939271211624146,
+      "learning_rate": 1.231696813092162e-06,
+      "loss": 0.8797,
+      "step": 1021
+    },
+    {
+      "epoch": 1.5844961240310078,
+      "grad_norm": 1.8291614055633545,
+      "learning_rate": 1.227390180878553e-06,
+      "loss": 0.8553,
+      "step": 1022
+    },
+    {
+      "epoch": 1.586046511627907,
+      "grad_norm": 1.4999667406082153,
+      "learning_rate": 1.2230835486649442e-06,
+      "loss": 0.821,
+      "step": 1023
+    },
+    {
+      "epoch": 1.5875968992248062,
+      "grad_norm": 1.3863446712493896,
+      "learning_rate": 1.2187769164513351e-06,
+      "loss": 0.863,
+      "step": 1024
+    },
+    {
+      "epoch": 1.5891472868217056,
+      "grad_norm": 1.6713688373565674,
+      "learning_rate": 1.2144702842377263e-06,
+      "loss": 0.9091,
+      "step": 1025
+    },
+    {
+      "epoch": 1.5906976744186045,
+      "grad_norm": 1.4036214351654053,
+      "learning_rate": 1.2101636520241173e-06,
+      "loss": 0.8798,
+      "step": 1026
+    },
+    {
+      "epoch": 1.592248062015504,
+      "grad_norm": 1.470150113105774,
+      "learning_rate": 1.2058570198105083e-06,
+      "loss": 0.8649,
+      "step": 1027
+    },
+    {
+      "epoch": 1.5937984496124031,
+      "grad_norm": 1.312363862991333,
+      "learning_rate": 1.2015503875968994e-06,
+      "loss": 0.8768,
+      "step": 1028
+    },
+    {
+      "epoch": 1.5953488372093023,
+      "grad_norm": 1.326096534729004,
+      "learning_rate": 1.1972437553832904e-06,
+      "loss": 0.8694,
+      "step": 1029
+    },
+    {
+      "epoch": 1.5968992248062015,
+      "grad_norm": 1.5062940120697021,
+      "learning_rate": 1.1929371231696814e-06,
+      "loss": 0.8576,
+      "step": 1030
+    },
+    {
+      "epoch": 1.5968992248062015,
+      "eval_loss": 1.0147876739501953,
+      "eval_runtime": 46.4171,
+      "eval_samples_per_second": 21.544,
+      "eval_steps_per_second": 1.357,
+      "step": 1030
+    },
+    {
+      "epoch": 1.5984496124031007,
+      "grad_norm": 1.51321280002594,
+      "learning_rate": 1.1886304909560723e-06,
+      "loss": 0.8853,
+      "step": 1031
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 1.4676376581192017,
+      "learning_rate": 1.1843238587424635e-06,
+      "loss": 0.8764,
+      "step": 1032
+    },
+    {
+      "epoch": 1.6015503875968993,
+      "grad_norm": 1.5837122201919556,
+      "learning_rate": 1.1800172265288545e-06,
+      "loss": 0.8573,
+      "step": 1033
+    },
+    {
+      "epoch": 1.6031007751937985,
+      "grad_norm": 1.3964475393295288,
+      "learning_rate": 1.1757105943152455e-06,
+      "loss": 0.8533,
+      "step": 1034
+    },
+    {
+      "epoch": 1.6046511627906976,
+      "grad_norm": 1.3663526773452759,
+      "learning_rate": 1.1714039621016366e-06,
+      "loss": 0.87,
+      "step": 1035
+    },
+    {
+      "epoch": 1.6062015503875968,
+      "grad_norm": 1.3615703582763672,
+      "learning_rate": 1.1670973298880276e-06,
+      "loss": 0.8873,
+      "step": 1036
+    },
+    {
+      "epoch": 1.6077519379844962,
+      "grad_norm": 1.720268726348877,
+      "learning_rate": 1.1627906976744188e-06,
+      "loss": 0.8814,
+      "step": 1037
+    },
+    {
+      "epoch": 1.6093023255813952,
+      "grad_norm": 1.7937016487121582,
+      "learning_rate": 1.1584840654608098e-06,
+      "loss": 0.8723,
+      "step": 1038
+    },
+    {
+      "epoch": 1.6108527131782946,
+      "grad_norm": 1.2060179710388184,
+      "learning_rate": 1.1541774332472007e-06,
+      "loss": 0.8176,
+      "step": 1039
+    },
+    {
+      "epoch": 1.6124031007751938,
+      "grad_norm": 1.556498408317566,
+      "learning_rate": 1.149870801033592e-06,
+      "loss": 0.8536,
+      "step": 1040
+    },
+    {
+      "epoch": 1.6124031007751938,
+      "eval_loss": 1.0148112773895264,
+      "eval_runtime": 46.3077,
+      "eval_samples_per_second": 21.595,
+      "eval_steps_per_second": 1.36,
+      "step": 1040
+    },
+    {
+      "epoch": 1.613953488372093,
+      "grad_norm": 1.3427506685256958,
+      "learning_rate": 1.1455641688199829e-06,
+      "loss": 0.8651,
+      "step": 1041
+    },
+    {
+      "epoch": 1.6155038759689924,
+      "grad_norm": 1.2598917484283447,
+      "learning_rate": 1.141257536606374e-06,
+      "loss": 0.8207,
+      "step": 1042
+    },
+    {
+      "epoch": 1.6170542635658913,
+      "grad_norm": 1.4230711460113525,
+      "learning_rate": 1.136950904392765e-06,
+      "loss": 0.8898,
+      "step": 1043
+    },
+    {
+      "epoch": 1.6186046511627907,
+      "grad_norm": 1.5285265445709229,
+      "learning_rate": 1.132644272179156e-06,
+      "loss": 0.8667,
+      "step": 1044
+    },
+    {
+      "epoch": 1.62015503875969,
+      "grad_norm": 1.5158048868179321,
+      "learning_rate": 1.128337639965547e-06,
+      "loss": 0.8694,
+      "step": 1045
+    },
+    {
+      "epoch": 1.621705426356589,
+      "grad_norm": 1.2861136198043823,
+      "learning_rate": 1.1240310077519381e-06,
+      "loss": 0.8414,
+      "step": 1046
+    },
+    {
+      "epoch": 1.6232558139534885,
+      "grad_norm": 1.4673242568969727,
+      "learning_rate": 1.119724375538329e-06,
+      "loss": 0.8825,
+      "step": 1047
+    },
+    {
+      "epoch": 1.6248062015503875,
+      "grad_norm": 1.460770845413208,
+      "learning_rate": 1.11541774332472e-06,
+      "loss": 0.8739,
+      "step": 1048
+    },
+    {
+      "epoch": 1.6263565891472869,
+      "grad_norm": 1.6731219291687012,
+      "learning_rate": 1.111111111111111e-06,
+      "loss": 0.8759,
+      "step": 1049
+    },
+    {
+      "epoch": 1.627906976744186,
+      "grad_norm": 1.4495750665664673,
+      "learning_rate": 1.1068044788975022e-06,
+      "loss": 0.8499,
+      "step": 1050
+    },
+    {
+      "epoch": 1.627906976744186,
+      "eval_loss": 1.014420986175537,
+      "eval_runtime": 46.4748,
+      "eval_samples_per_second": 21.517,
+      "eval_steps_per_second": 1.356,
+      "step": 1050
+    },
+    {
+      "epoch": 1.6294573643410852,
+      "grad_norm": 1.457659125328064,
+      "learning_rate": 1.1024978466838932e-06,
+      "loss": 0.8712,
+      "step": 1051
+    },
+    {
+      "epoch": 1.6310077519379846,
+      "grad_norm": 1.3786025047302246,
+      "learning_rate": 1.0981912144702844e-06,
+      "loss": 0.872,
+      "step": 1052
+    },
+    {
+      "epoch": 1.6325581395348836,
+      "grad_norm": 1.3944154977798462,
+      "learning_rate": 1.0938845822566753e-06,
+      "loss": 0.9201,
+      "step": 1053
+    },
+    {
+      "epoch": 1.634108527131783,
+      "grad_norm": 1.571994662284851,
+      "learning_rate": 1.0895779500430665e-06,
+      "loss": 0.8853,
+      "step": 1054
+    },
+    {
+      "epoch": 1.6356589147286822,
+      "grad_norm": 1.4556063413619995,
+      "learning_rate": 1.0852713178294575e-06,
+      "loss": 0.8827,
+      "step": 1055
+    },
+    {
+      "epoch": 1.6372093023255814,
+      "grad_norm": 1.5137085914611816,
+      "learning_rate": 1.0809646856158484e-06,
+      "loss": 0.8885,
+      "step": 1056
+    },
+    {
+      "epoch": 1.6387596899224808,
+      "grad_norm": 1.3905104398727417,
+      "learning_rate": 1.0766580534022396e-06,
+      "loss": 0.8917,
+      "step": 1057
+    },
+    {
+      "epoch": 1.6403100775193797,
+      "grad_norm": 1.4482052326202393,
+      "learning_rate": 1.0723514211886306e-06,
+      "loss": 0.8912,
+      "step": 1058
+    },
+    {
+      "epoch": 1.6418604651162791,
+      "grad_norm": 1.3725974559783936,
+      "learning_rate": 1.0680447889750216e-06,
+      "loss": 0.8918,
+      "step": 1059
+    },
+    {
+      "epoch": 1.6434108527131783,
+      "grad_norm": 1.3311225175857544,
+      "learning_rate": 1.0637381567614125e-06,
+      "loss": 0.8609,
+      "step": 1060
+    },
+    {
+      "epoch": 1.6434108527131783,
+      "eval_loss": 1.0134074687957764,
+      "eval_runtime": 46.4133,
+      "eval_samples_per_second": 21.546,
+      "eval_steps_per_second": 1.357,
+      "step": 1060
+    },
+    {
+      "epoch": 1.6449612403100775,
+      "grad_norm": 1.3237463235855103,
+      "learning_rate": 1.0594315245478037e-06,
+      "loss": 0.8416,
+      "step": 1061
+    },
+    {
+      "epoch": 1.6465116279069767,
+      "grad_norm": 1.3081848621368408,
+      "learning_rate": 1.0551248923341947e-06,
+      "loss": 0.8836,
+      "step": 1062
+    },
+    {
+      "epoch": 1.6480620155038759,
+      "grad_norm": 1.6118297576904297,
+      "learning_rate": 1.0508182601205856e-06,
+      "loss": 0.8822,
+      "step": 1063
+    },
+    {
+      "epoch": 1.6496124031007753,
+      "grad_norm": 1.3589472770690918,
+      "learning_rate": 1.0465116279069768e-06,
+      "loss": 0.8715,
+      "step": 1064
+    },
+    {
+      "epoch": 1.6511627906976745,
+      "grad_norm": 1.4369632005691528,
+      "learning_rate": 1.0422049956933678e-06,
+      "loss": 0.8416,
+      "step": 1065
+    },
+    {
+      "epoch": 1.6527131782945736,
+      "grad_norm": 1.4928234815597534,
+      "learning_rate": 1.037898363479759e-06,
+      "loss": 0.8723,
+      "step": 1066
+    },
+    {
+      "epoch": 1.6542635658914728,
+      "grad_norm": 1.333701729774475,
+      "learning_rate": 1.03359173126615e-06,
+      "loss": 0.8692,
+      "step": 1067
+    },
+    {
+      "epoch": 1.655813953488372,
+      "grad_norm": 1.324978232383728,
+      "learning_rate": 1.0292850990525411e-06,
+      "loss": 0.8549,
+      "step": 1068
+    },
+    {
+      "epoch": 1.6573643410852714,
+      "grad_norm": 1.4882620573043823,
+      "learning_rate": 1.024978466838932e-06,
+      "loss": 0.8732,
+      "step": 1069
+    },
+    {
+      "epoch": 1.6589147286821704,
+      "grad_norm": 1.3324483633041382,
+      "learning_rate": 1.020671834625323e-06,
+      "loss": 0.8902,
+      "step": 1070
+    },
+    {
+      "epoch": 1.6589147286821704,
+      "eval_loss": 1.0144659280776978,
+      "eval_runtime": 46.3728,
+      "eval_samples_per_second": 21.564,
+      "eval_steps_per_second": 1.359,
+      "step": 1070
+    },
+    {
+      "epoch": 1.6604651162790698,
+      "grad_norm": 1.2734097242355347,
+      "learning_rate": 1.0163652024117142e-06,
+      "loss": 0.8381,
+      "step": 1071
+    },
+    {
+      "epoch": 1.662015503875969,
+      "grad_norm": 1.3319727182388306,
+      "learning_rate": 1.0120585701981052e-06,
+      "loss": 0.8616,
+      "step": 1072
+    },
+    {
+      "epoch": 1.6635658914728682,
+      "grad_norm": 1.4507578611373901,
+      "learning_rate": 1.0077519379844962e-06,
+      "loss": 0.8615,
+      "step": 1073
+    },
+    {
+      "epoch": 1.6651162790697676,
+      "grad_norm": 1.4849218130111694,
+      "learning_rate": 1.0034453057708871e-06,
+      "loss": 0.8825,
+      "step": 1074
+    },
+    {
+      "epoch": 1.6666666666666665,
+      "grad_norm": 1.3075767755508423,
+      "learning_rate": 9.991386735572783e-07,
+      "loss": 0.8813,
+      "step": 1075
+    },
+    {
+      "epoch": 1.668217054263566,
+      "grad_norm": 1.359571933746338,
+      "learning_rate": 9.948320413436693e-07,
+      "loss": 0.8874,
+      "step": 1076
+    },
+    {
+      "epoch": 1.669767441860465,
+      "grad_norm": 1.582699179649353,
+      "learning_rate": 9.905254091300603e-07,
+      "loss": 0.8527,
+      "step": 1077
+    },
+    {
+      "epoch": 1.6713178294573643,
+      "grad_norm": 1.3974062204360962,
+      "learning_rate": 9.862187769164514e-07,
+      "loss": 0.8754,
+      "step": 1078
+    },
+    {
+      "epoch": 1.6728682170542637,
+      "grad_norm": 1.4189597368240356,
+      "learning_rate": 9.819121447028424e-07,
+      "loss": 0.84,
+      "step": 1079
+    },
+    {
+      "epoch": 1.6744186046511627,
+      "grad_norm": 1.387370228767395,
+      "learning_rate": 9.776055124892336e-07,
+      "loss": 0.8757,
+      "step": 1080
+    },
+    {
+      "epoch": 1.6744186046511627,
+      "eval_loss": 1.0135537385940552,
+      "eval_runtime": 46.4402,
+      "eval_samples_per_second": 21.533,
+      "eval_steps_per_second": 1.357,
+      "step": 1080
+    },
+    {
+      "epoch": 1.675968992248062,
+      "grad_norm": 1.3377106189727783,
+      "learning_rate": 9.732988802756245e-07,
+      "loss": 0.8955,
+      "step": 1081
+    },
+    {
+      "epoch": 1.6775193798449612,
+      "grad_norm": 1.3911293745040894,
+      "learning_rate": 9.689922480620157e-07,
+      "loss": 0.8785,
+      "step": 1082
+    },
+    {
+      "epoch": 1.6790697674418604,
+      "grad_norm": 1.2773281335830688,
+      "learning_rate": 9.646856158484067e-07,
+      "loss": 0.8504,
+      "step": 1083
+    },
+    {
+      "epoch": 1.6806201550387598,
+      "grad_norm": 1.370222806930542,
+      "learning_rate": 9.603789836347977e-07,
+      "loss": 0.8879,
+      "step": 1084
+    },
+    {
+      "epoch": 1.6821705426356588,
+      "grad_norm": 1.3102188110351562,
+      "learning_rate": 9.560723514211886e-07,
+      "loss": 0.8193,
+      "step": 1085
+    },
+    {
+      "epoch": 1.6837209302325582,
+      "grad_norm": 1.5439156293869019,
+      "learning_rate": 9.517657192075798e-07,
+      "loss": 0.8803,
+      "step": 1086
+    },
+    {
+      "epoch": 1.6852713178294574,
+      "grad_norm": 1.530087947845459,
+      "learning_rate": 9.474590869939708e-07,
+      "loss": 0.8543,
+      "step": 1087
+    },
+    {
+      "epoch": 1.6868217054263566,
+      "grad_norm": 1.5727529525756836,
+      "learning_rate": 9.431524547803617e-07,
+      "loss": 0.9076,
+      "step": 1088
+    },
+    {
+      "epoch": 1.688372093023256,
+      "grad_norm": 1.666233777999878,
+      "learning_rate": 9.388458225667529e-07,
+      "loss": 0.8781,
+      "step": 1089
+    },
+    {
+      "epoch": 1.689922480620155,
+      "grad_norm": 1.4620155096054077,
+      "learning_rate": 9.345391903531439e-07,
+      "loss": 0.851,
+      "step": 1090
+    },
+    {
+      "epoch": 1.689922480620155,
+      "eval_loss": 1.0132611989974976,
+      "eval_runtime": 46.4938,
+      "eval_samples_per_second": 21.508,
+      "eval_steps_per_second": 1.355,
+      "step": 1090
+    },
+    {
+      "epoch": 1.6914728682170543,
+      "grad_norm": 1.3910199403762817,
+      "learning_rate": 9.30232558139535e-07,
+      "loss": 0.8891,
+      "step": 1091
+    },
+    {
+      "epoch": 1.6930232558139535,
+      "grad_norm": 1.347779631614685,
+      "learning_rate": 9.259259259259259e-07,
+      "loss": 0.8509,
+      "step": 1092
+    },
+    {
+      "epoch": 1.6945736434108527,
+      "grad_norm": 1.4723174571990967,
+      "learning_rate": 9.216192937123171e-07,
+      "loss": 0.8737,
+      "step": 1093
+    },
+    {
+      "epoch": 1.6961240310077519,
+      "grad_norm": 1.3850101232528687,
+      "learning_rate": 9.173126614987081e-07,
+      "loss": 0.8863,
+      "step": 1094
+    },
+    {
+      "epoch": 1.697674418604651,
+      "grad_norm": 1.3053226470947266,
+      "learning_rate": 9.13006029285099e-07,
+      "loss": 0.8678,
+      "step": 1095
+    },
+    {
+      "epoch": 1.6992248062015505,
+      "grad_norm": 1.3607691526412964,
+      "learning_rate": 9.086993970714901e-07,
+      "loss": 0.8616,
+      "step": 1096
+    },
+    {
+      "epoch": 1.7007751937984497,
+      "grad_norm": 1.5045592784881592,
+      "learning_rate": 9.043927648578812e-07,
+      "loss": 0.8576,
+      "step": 1097
+    },
+    {
+      "epoch": 1.7023255813953488,
+      "grad_norm": 1.4135942459106445,
+      "learning_rate": 9.000861326442723e-07,
+      "loss": 0.8837,
+      "step": 1098
+    },
+    {
+      "epoch": 1.703875968992248,
+      "grad_norm": 1.3827612400054932,
+      "learning_rate": 8.957795004306632e-07,
+      "loss": 0.8609,
+      "step": 1099
+    },
+    {
+      "epoch": 1.7054263565891472,
+      "grad_norm": 1.3499889373779297,
+      "learning_rate": 8.914728682170544e-07,
+      "loss": 0.8283,
+      "step": 1100
+    },
+    {
+      "epoch": 1.7054263565891472,
+      "eval_loss": 1.0121917724609375,
+      "eval_runtime": 46.4487,
+      "eval_samples_per_second": 21.529,
+      "eval_steps_per_second": 1.356,
+      "step": 1100
+    },
+    {
+      "epoch": 1.7069767441860466,
+      "grad_norm": 1.363226294517517,
+      "learning_rate": 8.871662360034454e-07,
+      "loss": 0.8827,
+      "step": 1101
+    },
+    {
+      "epoch": 1.7085271317829456,
+      "grad_norm": 1.5012890100479126,
+      "learning_rate": 8.828596037898364e-07,
+      "loss": 0.8636,
+      "step": 1102
+    },
+    {
+      "epoch": 1.710077519379845,
+      "grad_norm": 1.3506569862365723,
+      "learning_rate": 8.785529715762274e-07,
+      "loss": 0.8507,
+      "step": 1103
+    },
+    {
+      "epoch": 1.7116279069767442,
+      "grad_norm": 1.5143163204193115,
+      "learning_rate": 8.742463393626185e-07,
+      "loss": 0.8715,
+      "step": 1104
+    },
+    {
+      "epoch": 1.7131782945736433,
+      "grad_norm": 1.5398318767547607,
+      "learning_rate": 8.699397071490096e-07,
+      "loss": 0.823,
+      "step": 1105
+    },
+    {
+      "epoch": 1.7147286821705428,
+      "grad_norm": 1.3466542959213257,
+      "learning_rate": 8.656330749354005e-07,
+      "loss": 0.8737,
+      "step": 1106
+    },
+    {
+      "epoch": 1.7162790697674417,
+      "grad_norm": 1.4723925590515137,
+      "learning_rate": 8.613264427217917e-07,
+      "loss": 0.8641,
+      "step": 1107
+    },
+    {
+      "epoch": 1.7178294573643411,
+      "grad_norm": 1.493037462234497,
+      "learning_rate": 8.570198105081827e-07,
+      "loss": 0.8641,
+      "step": 1108
+    },
+    {
+      "epoch": 1.7193798449612403,
+      "grad_norm": 1.3498090505599976,
+      "learning_rate": 8.527131782945737e-07,
+      "loss": 0.8424,
+      "step": 1109
+    },
+    {
+      "epoch": 1.7209302325581395,
+      "grad_norm": 1.5638823509216309,
+      "learning_rate": 8.484065460809647e-07,
+      "loss": 0.8661,
+      "step": 1110
+    },
+    {
+      "epoch": 1.7209302325581395,
+      "eval_loss": 1.0125303268432617,
+      "eval_runtime": 46.5089,
+      "eval_samples_per_second": 21.501,
+      "eval_steps_per_second": 1.355,
+      "step": 1110
+    },
+    {
+      "epoch": 1.7224806201550389,
+      "grad_norm": 1.3510512113571167,
+      "learning_rate": 8.440999138673558e-07,
+      "loss": 0.8589,
+      "step": 1111
+    },
+    {
+      "epoch": 1.7240310077519378,
+      "grad_norm": 1.4735004901885986,
+      "learning_rate": 8.397932816537469e-07,
+      "loss": 0.8345,
+      "step": 1112
+    },
+    {
+      "epoch": 1.7255813953488373,
+      "grad_norm": 1.4031803607940674,
+      "learning_rate": 8.354866494401378e-07,
+      "loss": 0.8861,
+      "step": 1113
+    },
+    {
+      "epoch": 1.7271317829457364,
+      "grad_norm": 1.3703727722167969,
+      "learning_rate": 8.311800172265288e-07,
+      "loss": 0.8727,
+      "step": 1114
+    },
+    {
+      "epoch": 1.7286821705426356,
+      "grad_norm": 1.4153964519500732,
+      "learning_rate": 8.2687338501292e-07,
+      "loss": 0.8676,
+      "step": 1115
+    },
+    {
+      "epoch": 1.730232558139535,
+      "grad_norm": 1.292462944984436,
+      "learning_rate": 8.22566752799311e-07,
+      "loss": 0.8722,
+      "step": 1116
+    },
+    {
+      "epoch": 1.731782945736434,
+      "grad_norm": 1.3069566488265991,
+      "learning_rate": 8.18260120585702e-07,
+      "loss": 0.8322,
+      "step": 1117
+    },
+    {
+      "epoch": 1.7333333333333334,
+      "grad_norm": 1.288217544555664,
+      "learning_rate": 8.139534883720931e-07,
+      "loss": 0.8583,
+      "step": 1118
+    },
+    {
+      "epoch": 1.7348837209302326,
+      "grad_norm": 1.555336594581604,
+      "learning_rate": 8.096468561584842e-07,
+      "loss": 0.8624,
+      "step": 1119
+    },
+    {
+      "epoch": 1.7364341085271318,
+      "grad_norm": 1.4711161851882935,
+      "learning_rate": 8.053402239448752e-07,
+      "loss": 0.851,
+      "step": 1120
+    },
+    {
+      "epoch": 1.7364341085271318,
+      "eval_loss": 1.0113039016723633,
+      "eval_runtime": 46.4504,
+      "eval_samples_per_second": 21.528,
+      "eval_steps_per_second": 1.356,
+      "step": 1120
+    },
+    {
+      "epoch": 1.7379844961240312,
+      "grad_norm": 1.5582923889160156,
+      "learning_rate": 8.010335917312661e-07,
+      "loss": 0.872,
+      "step": 1121
+    },
+    {
+      "epoch": 1.7395348837209301,
+      "grad_norm": 1.3969173431396484,
+      "learning_rate": 7.967269595176573e-07,
+      "loss": 0.8279,
+      "step": 1122
+    },
+    {
+      "epoch": 1.7410852713178295,
+      "grad_norm": 1.5065845251083374,
+      "learning_rate": 7.924203273040483e-07,
+      "loss": 0.9051,
+      "step": 1123
+    },
+    {
+      "epoch": 1.7426356589147287,
+      "grad_norm": 1.520509123802185,
+      "learning_rate": 7.881136950904393e-07,
+      "loss": 0.8762,
+      "step": 1124
+    },
+    {
+      "epoch": 1.744186046511628,
+      "grad_norm": 1.4614084959030151,
+      "learning_rate": 7.838070628768304e-07,
+      "loss": 0.8638,
+      "step": 1125
+    },
+    {
+      "epoch": 1.745736434108527,
+      "grad_norm": 1.3072479963302612,
+      "learning_rate": 7.795004306632215e-07,
+      "loss": 0.8482,
+      "step": 1126
+    },
+    {
+      "epoch": 1.7472868217054263,
+      "grad_norm": 1.5456178188323975,
+      "learning_rate": 7.751937984496125e-07,
+      "loss": 0.8306,
+      "step": 1127
+    },
+    {
+      "epoch": 1.7488372093023257,
+      "grad_norm": 1.354901671409607,
+      "learning_rate": 7.708871662360034e-07,
+      "loss": 0.8581,
+      "step": 1128
+    },
+    {
+      "epoch": 1.7503875968992249,
+      "grad_norm": 1.367995262145996,
+      "learning_rate": 7.665805340223946e-07,
+      "loss": 0.8409,
+      "step": 1129
+    },
+    {
+      "epoch": 1.751937984496124,
+      "grad_norm": 1.4493037462234497,
+      "learning_rate": 7.622739018087856e-07,
+      "loss": 0.8783,
+      "step": 1130
+    },
+    {
+      "epoch": 1.751937984496124,
+      "eval_loss": 1.0115089416503906,
+      "eval_runtime": 46.5666,
+      "eval_samples_per_second": 21.475,
+      "eval_steps_per_second": 1.353,
+      "step": 1130
+    },
+    {
+      "epoch": 1.7534883720930232,
+      "grad_norm": 1.3681944608688354,
+      "learning_rate": 7.579672695951766e-07,
+      "loss": 0.8619,
+      "step": 1131
+    },
+    {
+      "epoch": 1.7550387596899224,
+      "grad_norm": 1.399470329284668,
+      "learning_rate": 7.536606373815676e-07,
+      "loss": 0.8456,
+      "step": 1132
+    },
+    {
+      "epoch": 1.7565891472868218,
+      "grad_norm": 1.2924598455429077,
+      "learning_rate": 7.493540051679588e-07,
+      "loss": 0.8667,
+      "step": 1133
+    },
+    {
+      "epoch": 1.7581395348837208,
+      "grad_norm": 1.578048825263977,
+      "learning_rate": 7.450473729543498e-07,
+      "loss": 0.8768,
+      "step": 1134
+    },
+    {
+      "epoch": 1.7596899224806202,
+      "grad_norm": 1.6144347190856934,
+      "learning_rate": 7.407407407407407e-07,
+      "loss": 0.8443,
+      "step": 1135
+    },
+    {
+      "epoch": 1.7612403100775194,
+      "grad_norm": 1.5569820404052734,
+      "learning_rate": 7.364341085271319e-07,
+      "loss": 0.8826,
+      "step": 1136
+    },
+    {
+      "epoch": 1.7627906976744185,
+      "grad_norm": 1.437939167022705,
+      "learning_rate": 7.321274763135229e-07,
+      "loss": 0.832,
+      "step": 1137
+    },
+    {
+      "epoch": 1.764341085271318,
+      "grad_norm": 1.333731770515442,
+      "learning_rate": 7.27820844099914e-07,
+      "loss": 0.8508,
+      "step": 1138
+    },
+    {
+      "epoch": 1.765891472868217,
+      "grad_norm": 1.3062009811401367,
+      "learning_rate": 7.235142118863049e-07,
+      "loss": 0.8724,
+      "step": 1139
+    },
+    {
+      "epoch": 1.7674418604651163,
+      "grad_norm": 1.471814513206482,
+      "learning_rate": 7.192075796726961e-07,
+      "loss": 0.8718,
+      "step": 1140
+    },
+    {
+      "epoch": 1.7674418604651163,
+      "eval_loss": 1.0100195407867432,
+      "eval_runtime": 46.5331,
+      "eval_samples_per_second": 21.49,
+      "eval_steps_per_second": 1.354,
+      "step": 1140
+    },
+    {
+      "epoch": 1.7689922480620155,
+      "grad_norm": 1.6204849481582642,
+      "learning_rate": 7.149009474590871e-07,
+      "loss": 0.8556,
+      "step": 1141
+    },
+    {
+      "epoch": 1.7705426356589147,
+      "grad_norm": 1.3823068141937256,
+      "learning_rate": 7.10594315245478e-07,
+      "loss": 0.8851,
+      "step": 1142
+    },
+    {
+      "epoch": 1.772093023255814,
+      "grad_norm": 1.2534395456314087,
+      "learning_rate": 7.062876830318692e-07,
+      "loss": 0.8517,
+      "step": 1143
+    },
+    {
+      "epoch": 1.773643410852713,
+      "grad_norm": 1.6374818086624146,
+      "learning_rate": 7.019810508182602e-07,
+      "loss": 0.8953,
+      "step": 1144
+    },
+    {
+      "epoch": 1.7751937984496124,
+      "grad_norm": 1.5260299444198608,
+      "learning_rate": 6.976744186046513e-07,
+      "loss": 0.8555,
+      "step": 1145
+    },
+    {
+      "epoch": 1.7767441860465116,
+      "grad_norm": 1.4138376712799072,
+      "learning_rate": 6.933677863910422e-07,
+      "loss": 0.8479,
+      "step": 1146
+    },
+    {
+      "epoch": 1.7782945736434108,
+      "grad_norm": 1.225590467453003,
+      "learning_rate": 6.890611541774334e-07,
+      "loss": 0.845,
+      "step": 1147
+    },
+    {
+      "epoch": 1.7798449612403102,
+      "grad_norm": 1.3658701181411743,
+      "learning_rate": 6.847545219638244e-07,
+      "loss": 0.8561,
+      "step": 1148
+    },
+    {
+      "epoch": 1.7813953488372092,
+      "grad_norm": 1.4470748901367188,
+      "learning_rate": 6.804478897502153e-07,
+      "loss": 0.8799,
+      "step": 1149
+    },
+    {
+      "epoch": 1.7829457364341086,
+      "grad_norm": 1.3451247215270996,
+      "learning_rate": 6.761412575366064e-07,
+      "loss": 0.8523,
+      "step": 1150
+    },
+    {
+      "epoch": 1.7829457364341086,
+      "eval_loss": 1.0102328062057495,
+      "eval_runtime": 46.5475,
+      "eval_samples_per_second": 21.483,
+      "eval_steps_per_second": 1.353,
+      "step": 1150
+    },
+    {
+      "epoch": 1.7844961240310078,
+      "grad_norm": 1.4519866704940796,
+      "learning_rate": 6.718346253229975e-07,
+      "loss": 0.8441,
+      "step": 1151
+    },
+    {
+      "epoch": 1.786046511627907,
+      "grad_norm": 1.3575239181518555,
+      "learning_rate": 6.675279931093886e-07,
+      "loss": 0.8829,
+      "step": 1152
+    },
+    {
+      "epoch": 1.7875968992248064,
+      "grad_norm": 1.427385926246643,
+      "learning_rate": 6.632213608957795e-07,
+      "loss": 0.8407,
+      "step": 1153
+    },
+    {
+      "epoch": 1.7891472868217053,
+      "grad_norm": 1.4608075618743896,
+      "learning_rate": 6.589147286821707e-07,
+      "loss": 0.864,
+      "step": 1154
+    },
+    {
+      "epoch": 1.7906976744186047,
+      "grad_norm": 1.3200838565826416,
+      "learning_rate": 6.546080964685617e-07,
+      "loss": 0.8551,
+      "step": 1155
+    },
+    {
+      "epoch": 1.792248062015504,
+      "grad_norm": 1.4095115661621094,
+      "learning_rate": 6.503014642549526e-07,
+      "loss": 0.8753,
+      "step": 1156
+    },
+    {
+      "epoch": 1.793798449612403,
+      "grad_norm": 1.334485411643982,
+      "learning_rate": 6.459948320413437e-07,
+      "loss": 0.8601,
+      "step": 1157
+    },
+    {
+      "epoch": 1.7953488372093023,
+      "grad_norm": 1.524133563041687,
+      "learning_rate": 6.416881998277348e-07,
+      "loss": 0.8746,
+      "step": 1158
+    },
+    {
+      "epoch": 1.7968992248062015,
+      "grad_norm": 1.4331588745117188,
+      "learning_rate": 6.373815676141259e-07,
+      "loss": 0.8826,
+      "step": 1159
+    },
+    {
+      "epoch": 1.7984496124031009,
+      "grad_norm": 1.455048680305481,
+      "learning_rate": 6.330749354005168e-07,
+      "loss": 0.8734,
+      "step": 1160
+    },
+    {
+      "epoch": 1.7984496124031009,
+      "eval_loss": 1.009870171546936,
+      "eval_runtime": 46.6815,
+      "eval_samples_per_second": 21.422,
+      "eval_steps_per_second": 1.35,
+      "step": 1160
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 1.3782318830490112,
+      "learning_rate": 6.28768303186908e-07,
+      "loss": 0.8566,
+      "step": 1161
+    },
+    {
+      "epoch": 1.8015503875968992,
+      "grad_norm": 1.4073529243469238,
+      "learning_rate": 6.24461670973299e-07,
+      "loss": 0.8946,
+      "step": 1162
+    },
+    {
+      "epoch": 1.8031007751937984,
+      "grad_norm": 1.3667042255401611,
+      "learning_rate": 6.201550387596899e-07,
+      "loss": 0.8499,
+      "step": 1163
+    },
+    {
+      "epoch": 1.8046511627906976,
+      "grad_norm": 1.4060592651367188,
+      "learning_rate": 6.15848406546081e-07,
+      "loss": 0.8715,
+      "step": 1164
+    },
+    {
+      "epoch": 1.806201550387597,
+      "grad_norm": 1.335383415222168,
+      "learning_rate": 6.115417743324721e-07,
+      "loss": 0.8357,
+      "step": 1165
+    },
+    {
+      "epoch": 1.807751937984496,
+      "grad_norm": 1.4511020183563232,
+      "learning_rate": 6.072351421188632e-07,
+      "loss": 0.8505,
+      "step": 1166
+    },
+    {
+      "epoch": 1.8093023255813954,
+      "grad_norm": 1.4266198873519897,
+      "learning_rate": 6.029285099052541e-07,
+      "loss": 0.868,
+      "step": 1167
+    },
+    {
+      "epoch": 1.8108527131782945,
+      "grad_norm": 1.3813896179199219,
+      "learning_rate": 5.986218776916452e-07,
+      "loss": 0.8709,
+      "step": 1168
+    },
+    {
+      "epoch": 1.8124031007751937,
+      "grad_norm": 1.5237208604812622,
+      "learning_rate": 5.943152454780362e-07,
+      "loss": 0.8397,
+      "step": 1169
+    },
+    {
+      "epoch": 1.8139534883720931,
+      "grad_norm": 1.3090314865112305,
+      "learning_rate": 5.900086132644272e-07,
+      "loss": 0.8507,
+      "step": 1170
+    },
+    {
+      "epoch": 1.8139534883720931,
+      "eval_loss": 1.0094170570373535,
+      "eval_runtime": 46.5958,
+      "eval_samples_per_second": 21.461,
+      "eval_steps_per_second": 1.352,
+      "step": 1170
+    },
+    {
+      "epoch": 1.815503875968992,
+      "grad_norm": 1.3480397462844849,
+      "learning_rate": 5.857019810508183e-07,
+      "loss": 0.8762,
+      "step": 1171
+    },
+    {
+      "epoch": 1.8170542635658915,
+      "grad_norm": 1.4064971208572388,
+      "learning_rate": 5.813953488372094e-07,
+      "loss": 0.8769,
+      "step": 1172
+    },
+    {
+      "epoch": 1.8186046511627907,
+      "grad_norm": 1.3863223791122437,
+      "learning_rate": 5.770887166236004e-07,
+      "loss": 0.864,
+      "step": 1173
+    },
+    {
+      "epoch": 1.8201550387596899,
+      "grad_norm": 1.4081705808639526,
+      "learning_rate": 5.727820844099914e-07,
+      "loss": 0.8843,
+      "step": 1174
+    },
+    {
+      "epoch": 1.8217054263565893,
+      "grad_norm": 1.417191505432129,
+      "learning_rate": 5.684754521963825e-07,
+      "loss": 0.8526,
+      "step": 1175
+    },
+    {
+      "epoch": 1.8232558139534882,
+      "grad_norm": 1.395699143409729,
+      "learning_rate": 5.641688199827735e-07,
+      "loss": 0.8801,
+      "step": 1176
+    },
+    {
+      "epoch": 1.8248062015503876,
+      "grad_norm": 1.3280203342437744,
+      "learning_rate": 5.598621877691646e-07,
+      "loss": 0.8444,
+      "step": 1177
+    },
+    {
+      "epoch": 1.8263565891472868,
+      "grad_norm": 1.5310155153274536,
+      "learning_rate": 5.555555555555555e-07,
+      "loss": 0.8828,
+      "step": 1178
+    },
+    {
+      "epoch": 1.827906976744186,
+      "grad_norm": 1.2928513288497925,
+      "learning_rate": 5.512489233419466e-07,
+      "loss": 0.8882,
+      "step": 1179
+    },
+    {
+      "epoch": 1.8294573643410854,
+      "grad_norm": 1.4305659532546997,
+      "learning_rate": 5.469422911283377e-07,
+      "loss": 0.9073,
+      "step": 1180
+    },
+    {
+      "epoch": 1.8294573643410854,
+      "eval_loss": 1.0100611448287964,
+      "eval_runtime": 46.6091,
+      "eval_samples_per_second": 21.455,
+      "eval_steps_per_second": 1.352,
+      "step": 1180
+    },
+    {
+      "epoch": 1.8310077519379844,
+      "grad_norm": 1.4535592794418335,
+      "learning_rate": 5.426356589147287e-07,
+      "loss": 0.8876,
+      "step": 1181
+    },
+    {
+      "epoch": 1.8325581395348838,
+      "grad_norm": 1.358090877532959,
+      "learning_rate": 5.383290267011198e-07,
+      "loss": 0.8592,
+      "step": 1182
+    },
+    {
+      "epoch": 1.834108527131783,
+      "grad_norm": 1.328998327255249,
+      "learning_rate": 5.340223944875108e-07,
+      "loss": 0.8607,
+      "step": 1183
+    },
+    {
+      "epoch": 1.8356589147286821,
+      "grad_norm": 1.3628747463226318,
+      "learning_rate": 5.297157622739019e-07,
+      "loss": 0.8563,
+      "step": 1184
+    },
+    {
+      "epoch": 1.8372093023255816,
+      "grad_norm": 1.3805159330368042,
+      "learning_rate": 5.254091300602928e-07,
+      "loss": 0.8252,
+      "step": 1185
+    },
+    {
+      "epoch": 1.8387596899224805,
+      "grad_norm": 1.4368276596069336,
+      "learning_rate": 5.211024978466839e-07,
+      "loss": 0.8873,
+      "step": 1186
+    },
+    {
+      "epoch": 1.84031007751938,
+      "grad_norm": 1.3603229522705078,
+      "learning_rate": 5.16795865633075e-07,
+      "loss": 0.835,
+      "step": 1187
+    },
+    {
+      "epoch": 1.841860465116279,
+      "grad_norm": 1.41812002658844,
+      "learning_rate": 5.12489233419466e-07,
+      "loss": 0.8423,
+      "step": 1188
+    },
+    {
+      "epoch": 1.8434108527131783,
+      "grad_norm": 1.3339006900787354,
+      "learning_rate": 5.081826012058571e-07,
+      "loss": 0.8612,
+      "step": 1189
+    },
+    {
+      "epoch": 1.8449612403100775,
+      "grad_norm": 1.4747363328933716,
+      "learning_rate": 5.038759689922481e-07,
+      "loss": 0.8648,
+      "step": 1190
+    },
+    {
+      "epoch": 1.8449612403100775,
+      "eval_loss": 1.0100913047790527,
+      "eval_runtime": 46.6654,
+      "eval_samples_per_second": 21.429,
+      "eval_steps_per_second": 1.35,
+      "step": 1190
+    },
+    {
+      "epoch": 1.8465116279069766,
+      "grad_norm": 1.3146756887435913,
+      "learning_rate": 4.995693367786392e-07,
+      "loss": 0.8726,
+      "step": 1191
+    },
+    {
+      "epoch": 1.848062015503876,
+      "grad_norm": 1.419121265411377,
+      "learning_rate": 4.952627045650301e-07,
+      "loss": 0.8612,
+      "step": 1192
+    },
+    {
+      "epoch": 1.8496124031007752,
+      "grad_norm": 1.4017002582550049,
+      "learning_rate": 4.909560723514212e-07,
+      "loss": 0.861,
+      "step": 1193
+    },
+    {
+      "epoch": 1.8511627906976744,
+      "grad_norm": 1.3471308946609497,
+      "learning_rate": 4.866494401378123e-07,
+      "loss": 0.8722,
+      "step": 1194
+    },
+    {
+      "epoch": 1.8527131782945736,
+      "grad_norm": 1.2866394519805908,
+      "learning_rate": 4.823428079242033e-07,
+      "loss": 0.8506,
+      "step": 1195
+    },
+    {
+      "epoch": 1.8542635658914728,
+      "grad_norm": 1.3998876810073853,
+      "learning_rate": 4.780361757105943e-07,
+      "loss": 0.8552,
+      "step": 1196
+    },
+    {
+      "epoch": 1.8558139534883722,
+      "grad_norm": 1.5448180437088013,
+      "learning_rate": 4.737295434969854e-07,
+      "loss": 0.8772,
+      "step": 1197
+    },
+    {
+      "epoch": 1.8573643410852712,
+      "grad_norm": 1.33297860622406,
+      "learning_rate": 4.6942291128337646e-07,
+      "loss": 0.836,
+      "step": 1198
+    },
+    {
+      "epoch": 1.8589147286821706,
+      "grad_norm": 1.3891242742538452,
+      "learning_rate": 4.651162790697675e-07,
+      "loss": 0.8571,
+      "step": 1199
+    },
+    {
+      "epoch": 1.8604651162790697,
+      "grad_norm": 1.4276305437088013,
+      "learning_rate": 4.6080964685615856e-07,
+      "loss": 0.8513,
+      "step": 1200
+    },
+    {
+      "epoch": 1.8604651162790697,
+      "eval_loss": 1.0112069845199585,
+      "eval_runtime": 46.6809,
+      "eval_samples_per_second": 21.422,
+      "eval_steps_per_second": 1.35,
+      "step": 1200
+    },
+    {
+      "epoch": 1.862015503875969,
+      "grad_norm": 1.5008084774017334,
+      "learning_rate": 4.565030146425495e-07,
+      "loss": 0.8589,
+      "step": 1201
+    },
+    {
+      "epoch": 1.8635658914728683,
+      "grad_norm": 1.44911789894104,
+      "learning_rate": 4.521963824289406e-07,
+      "loss": 0.8717,
+      "step": 1202
+    },
+    {
+      "epoch": 1.8651162790697673,
+      "grad_norm": 1.4527829885482788,
+      "learning_rate": 4.478897502153316e-07,
+      "loss": 0.8712,
+      "step": 1203
+    },
+    {
+      "epoch": 1.8666666666666667,
+      "grad_norm": 1.5256608724594116,
+      "learning_rate": 4.435831180017227e-07,
+      "loss": 0.8731,
+      "step": 1204
+    },
+    {
+      "epoch": 1.8682170542635659,
+      "grad_norm": 1.342320203781128,
+      "learning_rate": 4.392764857881137e-07,
+      "loss": 0.8313,
+      "step": 1205
+    },
+    {
+      "epoch": 1.869767441860465,
+      "grad_norm": 1.4780925512313843,
+      "learning_rate": 4.349698535745048e-07,
+      "loss": 0.87,
+      "step": 1206
+    },
+    {
+      "epoch": 1.8713178294573645,
+      "grad_norm": 1.495001196861267,
+      "learning_rate": 4.3066322136089586e-07,
+      "loss": 0.872,
+      "step": 1207
+    },
+    {
+      "epoch": 1.8728682170542634,
+      "grad_norm": 1.3074004650115967,
+      "learning_rate": 4.2635658914728683e-07,
+      "loss": 0.8781,
+      "step": 1208
+    },
+    {
+      "epoch": 1.8744186046511628,
+      "grad_norm": 1.480075478553772,
+      "learning_rate": 4.220499569336779e-07,
+      "loss": 0.8689,
+      "step": 1209
+    },
+    {
+      "epoch": 1.875968992248062,
+      "grad_norm": 1.5911939144134521,
+      "learning_rate": 4.177433247200689e-07,
+      "loss": 0.8646,
+      "step": 1210
+    },
+    {
+      "epoch": 1.875968992248062,
+      "eval_loss": 1.0095446109771729,
+      "eval_runtime": 46.6513,
+      "eval_samples_per_second": 21.436,
+      "eval_steps_per_second": 1.35,
+      "step": 1210
+    },
+    {
+      "epoch": 1.8775193798449612,
+      "grad_norm": 1.456148386001587,
+      "learning_rate": 4.1343669250646e-07,
+      "loss": 0.8474,
+      "step": 1211
+    },
+    {
+      "epoch": 1.8790697674418606,
+      "grad_norm": 1.439658284187317,
+      "learning_rate": 4.09130060292851e-07,
+      "loss": 0.8873,
+      "step": 1212
+    },
+    {
+      "epoch": 1.8806201550387596,
+      "grad_norm": 1.3261785507202148,
+      "learning_rate": 4.048234280792421e-07,
+      "loss": 0.8622,
+      "step": 1213
+    },
+    {
+      "epoch": 1.882170542635659,
+      "grad_norm": 1.4313101768493652,
+      "learning_rate": 4.0051679586563306e-07,
+      "loss": 0.8569,
+      "step": 1214
+    },
+    {
+      "epoch": 1.8837209302325582,
+      "grad_norm": 1.4603917598724365,
+      "learning_rate": 3.9621016365202413e-07,
+      "loss": 0.8678,
+      "step": 1215
+    },
+    {
+      "epoch": 1.8852713178294573,
+      "grad_norm": 1.5106719732284546,
+      "learning_rate": 3.919035314384152e-07,
+      "loss": 0.8787,
+      "step": 1216
+    },
+    {
+      "epoch": 1.8868217054263567,
+      "grad_norm": 1.6415821313858032,
+      "learning_rate": 3.8759689922480623e-07,
+      "loss": 0.8721,
+      "step": 1217
+    },
+    {
+      "epoch": 1.8883720930232557,
+      "grad_norm": 1.4079430103302002,
+      "learning_rate": 3.832902670111973e-07,
+      "loss": 0.8608,
+      "step": 1218
+    },
+    {
+      "epoch": 1.889922480620155,
+      "grad_norm": 1.3679237365722656,
+      "learning_rate": 3.789836347975883e-07,
+      "loss": 0.8637,
+      "step": 1219
+    },
+    {
+      "epoch": 1.8914728682170543,
+      "grad_norm": 1.6318389177322388,
+      "learning_rate": 3.746770025839794e-07,
+      "loss": 0.8705,
+      "step": 1220
+    },
+    {
+      "epoch": 1.8914728682170543,
+      "eval_loss": 1.009251594543457,
+      "eval_runtime": 46.6094,
+      "eval_samples_per_second": 21.455,
+      "eval_steps_per_second": 1.352,
+      "step": 1220
+    },
+    {
+      "epoch": 1.8930232558139535,
+      "grad_norm": 1.377698540687561,
+      "learning_rate": 3.7037037037037036e-07,
+      "loss": 0.8041,
+      "step": 1221
+    },
+    {
+      "epoch": 1.8945736434108527,
+      "grad_norm": 1.3580753803253174,
+      "learning_rate": 3.6606373815676144e-07,
+      "loss": 0.8252,
+      "step": 1222
+    },
+    {
+      "epoch": 1.8961240310077518,
+      "grad_norm": 1.3340038061141968,
+      "learning_rate": 3.6175710594315246e-07,
+      "loss": 0.8446,
+      "step": 1223
+    },
+    {
+      "epoch": 1.8976744186046512,
+      "grad_norm": 1.389021635055542,
+      "learning_rate": 3.5745047372954353e-07,
+      "loss": 0.8788,
+      "step": 1224
+    },
+    {
+      "epoch": 1.8992248062015504,
+      "grad_norm": 1.3582135438919067,
+      "learning_rate": 3.531438415159346e-07,
+      "loss": 0.8398,
+      "step": 1225
+    },
+    {
+      "epoch": 1.9007751937984496,
+      "grad_norm": 1.3302254676818848,
+      "learning_rate": 3.488372093023256e-07,
+      "loss": 0.8652,
+      "step": 1226
+    },
+    {
+      "epoch": 1.9023255813953488,
+      "grad_norm": 1.466200590133667,
+      "learning_rate": 3.445305770887167e-07,
+      "loss": 0.8607,
+      "step": 1227
+    },
+    {
+      "epoch": 1.903875968992248,
+      "grad_norm": 1.7074103355407715,
+      "learning_rate": 3.4022394487510767e-07,
+      "loss": 0.8779,
+      "step": 1228
+    },
+    {
+      "epoch": 1.9054263565891474,
+      "grad_norm": 1.379107117652893,
+      "learning_rate": 3.3591731266149874e-07,
+      "loss": 0.8496,
+      "step": 1229
+    },
+    {
+      "epoch": 1.9069767441860463,
+      "grad_norm": 1.5536657571792603,
+      "learning_rate": 3.3161068044788976e-07,
+      "loss": 0.8684,
+      "step": 1230
+    },
+    {
+      "epoch": 1.9069767441860463,
+      "eval_loss": 1.0089800357818604,
+      "eval_runtime": 46.8452,
+      "eval_samples_per_second": 21.347,
+      "eval_steps_per_second": 1.345,
+      "step": 1230
+    },
+    {
+      "epoch": 1.9085271317829458,
+      "grad_norm": 1.3638468980789185,
+      "learning_rate": 3.2730404823428084e-07,
+      "loss": 0.8937,
+      "step": 1231
+    },
+    {
+      "epoch": 1.910077519379845,
+      "grad_norm": 1.3530744314193726,
+      "learning_rate": 3.2299741602067186e-07,
+      "loss": 0.8746,
+      "step": 1232
+    },
+    {
+      "epoch": 1.9116279069767441,
+      "grad_norm": 1.3501743078231812,
+      "learning_rate": 3.1869078380706293e-07,
+      "loss": 0.8455,
+      "step": 1233
+    },
+    {
+      "epoch": 1.9131782945736435,
+      "grad_norm": 1.3847023248672485,
+      "learning_rate": 3.14384151593454e-07,
+      "loss": 0.8811,
+      "step": 1234
+    },
+    {
+      "epoch": 1.9147286821705425,
+      "grad_norm": 1.319321870803833,
+      "learning_rate": 3.1007751937984497e-07,
+      "loss": 0.8572,
+      "step": 1235
+    },
+    {
+      "epoch": 1.916279069767442,
+      "grad_norm": 1.5270951986312866,
+      "learning_rate": 3.0577088716623605e-07,
+      "loss": 0.8696,
+      "step": 1236
+    },
+    {
+      "epoch": 1.917829457364341,
+      "grad_norm": 1.324975848197937,
+      "learning_rate": 3.0146425495262707e-07,
+      "loss": 0.8656,
+      "step": 1237
+    },
+    {
+      "epoch": 1.9193798449612403,
+      "grad_norm": 1.3629916906356812,
+      "learning_rate": 2.971576227390181e-07,
+      "loss": 0.8662,
+      "step": 1238
+    },
+    {
+      "epoch": 1.9209302325581397,
+      "grad_norm": 1.3780391216278076,
+      "learning_rate": 2.9285099052540916e-07,
+      "loss": 0.8328,
+      "step": 1239
+    },
+    {
+      "epoch": 1.9224806201550386,
+      "grad_norm": 1.4938024282455444,
+      "learning_rate": 2.885443583118002e-07,
+      "loss": 0.8509,
+      "step": 1240
+    },
+    {
+      "epoch": 1.9224806201550386,
+      "eval_loss": 1.0085103511810303,
+      "eval_runtime": 46.6396,
+      "eval_samples_per_second": 21.441,
+      "eval_steps_per_second": 1.351,
+      "step": 1240
+    },
+    {
+      "epoch": 1.924031007751938,
+      "grad_norm": 1.4414805173873901,
+      "learning_rate": 2.8423772609819125e-07,
+      "loss": 0.8642,
+      "step": 1241
+    },
+    {
+      "epoch": 1.9255813953488372,
+      "grad_norm": 1.319283366203308,
+      "learning_rate": 2.799310938845823e-07,
+      "loss": 0.8324,
+      "step": 1242
+    },
+    {
+      "epoch": 1.9271317829457364,
+      "grad_norm": 1.5541296005249023,
+      "learning_rate": 2.756244616709733e-07,
+      "loss": 0.8245,
+      "step": 1243
+    },
+    {
+      "epoch": 1.9286821705426358,
+      "grad_norm": 1.3837289810180664,
+      "learning_rate": 2.7131782945736437e-07,
+      "loss": 0.8664,
+      "step": 1244
+    },
+    {
+      "epoch": 1.9302325581395348,
+      "grad_norm": 1.316536784172058,
+      "learning_rate": 2.670111972437554e-07,
+      "loss": 0.847,
+      "step": 1245
+    },
+    {
+      "epoch": 1.9317829457364342,
+      "grad_norm": 1.3872352838516235,
+      "learning_rate": 2.627045650301464e-07,
+      "loss": 0.8927,
+      "step": 1246
+    },
+    {
+      "epoch": 1.9333333333333333,
+      "grad_norm": 1.3149522542953491,
+      "learning_rate": 2.583979328165375e-07,
+      "loss": 0.8375,
+      "step": 1247
+    },
+    {
+      "epoch": 1.9348837209302325,
+      "grad_norm": 1.3242137432098389,
+      "learning_rate": 2.5409130060292856e-07,
+      "loss": 0.8603,
+      "step": 1248
+    },
+    {
+      "epoch": 1.936434108527132,
+      "grad_norm": 1.4688844680786133,
+      "learning_rate": 2.497846683893196e-07,
+      "loss": 0.866,
+      "step": 1249
+    },
+    {
+      "epoch": 1.937984496124031,
+      "grad_norm": 1.4166333675384521,
+      "learning_rate": 2.454780361757106e-07,
+      "loss": 0.8736,
+      "step": 1250
+    },
+    {
+      "epoch": 1.937984496124031,
+      "eval_loss": 1.0081682205200195,
+      "eval_runtime": 46.7379,
+      "eval_samples_per_second": 21.396,
+      "eval_steps_per_second": 1.348,
+      "step": 1250
+    },
+    {
+      "epoch": 1.9395348837209303,
+      "grad_norm": 1.35499107837677,
+      "learning_rate": 2.411714039621017e-07,
+      "loss": 0.8823,
+      "step": 1251
+    },
+    {
+      "epoch": 1.9410852713178295,
+      "grad_norm": 1.3941737413406372,
+      "learning_rate": 2.368647717484927e-07,
+      "loss": 0.8673,
+      "step": 1252
+    },
+    {
+      "epoch": 1.9426356589147287,
+      "grad_norm": 1.3867199420928955,
+      "learning_rate": 2.3255813953488374e-07,
+      "loss": 0.8793,
+      "step": 1253
+    },
+    {
+      "epoch": 1.9441860465116279,
+      "grad_norm": 1.4277222156524658,
+      "learning_rate": 2.2825150732127476e-07,
+      "loss": 0.8699,
+      "step": 1254
+    },
+    {
+      "epoch": 1.945736434108527,
+      "grad_norm": 1.3886282444000244,
+      "learning_rate": 2.239448751076658e-07,
+      "loss": 0.8716,
+      "step": 1255
+    },
+    {
+      "epoch": 1.9472868217054264,
+      "grad_norm": 1.4584248065948486,
+      "learning_rate": 2.1963824289405686e-07,
+      "loss": 0.8419,
+      "step": 1256
+    },
+    {
+      "epoch": 1.9488372093023256,
+      "grad_norm": 1.3909106254577637,
+      "learning_rate": 2.1533161068044793e-07,
+      "loss": 0.8941,
+      "step": 1257
+    },
+    {
+      "epoch": 1.9503875968992248,
+      "grad_norm": 1.4313043355941772,
+      "learning_rate": 2.1102497846683895e-07,
+      "loss": 0.8523,
+      "step": 1258
+    },
+    {
+      "epoch": 1.951937984496124,
+      "grad_norm": 1.36488938331604,
+      "learning_rate": 2.0671834625323e-07,
+      "loss": 0.8589,
+      "step": 1259
+    },
+    {
+      "epoch": 1.9534883720930232,
+      "grad_norm": 1.3644864559173584,
+      "learning_rate": 2.0241171403962105e-07,
+      "loss": 0.8304,
+      "step": 1260
+    },
+    {
+      "epoch": 1.9534883720930232,
+      "eval_loss": 1.0075480937957764,
+      "eval_runtime": 46.7386,
+      "eval_samples_per_second": 21.396,
+      "eval_steps_per_second": 1.348,
+      "step": 1260
+    },
+    {
+      "epoch": 1.9550387596899226,
+      "grad_norm": 1.4223475456237793,
+      "learning_rate": 1.9810508182601207e-07,
+      "loss": 0.8433,
+      "step": 1261
+    },
+    {
+      "epoch": 1.9565891472868215,
+      "grad_norm": 1.571225881576538,
+      "learning_rate": 1.9379844961240311e-07,
+      "loss": 0.8897,
+      "step": 1262
+    },
+    {
+      "epoch": 1.958139534883721,
+      "grad_norm": 1.5039650201797485,
+      "learning_rate": 1.8949181739879416e-07,
+      "loss": 0.865,
+      "step": 1263
+    },
+    {
+      "epoch": 1.9596899224806201,
+      "grad_norm": 1.4037103652954102,
+      "learning_rate": 1.8518518518518518e-07,
+      "loss": 0.8295,
+      "step": 1264
+    },
+    {
+      "epoch": 1.9612403100775193,
+      "grad_norm": 1.435829520225525,
+      "learning_rate": 1.8087855297157623e-07,
+      "loss": 0.8811,
+      "step": 1265
+    },
+    {
+      "epoch": 1.9627906976744187,
+      "grad_norm": 1.5063098669052124,
+      "learning_rate": 1.765719207579673e-07,
+      "loss": 0.8701,
+      "step": 1266
+    },
+    {
+      "epoch": 1.9643410852713177,
+      "grad_norm": 1.4760342836380005,
+      "learning_rate": 1.7226528854435835e-07,
+      "loss": 0.8545,
+      "step": 1267
+    },
+    {
+      "epoch": 1.965891472868217,
+      "grad_norm": 1.3338667154312134,
+      "learning_rate": 1.6795865633074937e-07,
+      "loss": 0.884,
+      "step": 1268
+    },
+    {
+      "epoch": 1.9674418604651163,
+      "grad_norm": 1.4161882400512695,
+      "learning_rate": 1.6365202411714042e-07,
+      "loss": 0.8676,
+      "step": 1269
+    },
+    {
+      "epoch": 1.9689922480620154,
+      "grad_norm": 1.3632901906967163,
+      "learning_rate": 1.5934539190353146e-07,
+      "loss": 0.8498,
+      "step": 1270
+    },
+    {
+      "epoch": 1.9689922480620154,
+      "eval_loss": 1.0080918073654175,
+      "eval_runtime": 46.7217,
+      "eval_samples_per_second": 21.403,
+      "eval_steps_per_second": 1.348,
+      "step": 1270
+    },
+    {
+      "epoch": 1.9705426356589149,
+      "grad_norm": 1.2917699813842773,
+      "learning_rate": 1.5503875968992249e-07,
+      "loss": 0.8382,
+      "step": 1271
+    },
+    {
+      "epoch": 1.9720930232558138,
+      "grad_norm": 1.4430114030838013,
+      "learning_rate": 1.5073212747631353e-07,
+      "loss": 0.8861,
+      "step": 1272
+    },
+    {
+      "epoch": 1.9736434108527132,
+      "grad_norm": 1.3696763515472412,
+      "learning_rate": 1.4642549526270458e-07,
+      "loss": 0.8619,
+      "step": 1273
+    },
+    {
+      "epoch": 1.9751937984496124,
+      "grad_norm": 1.266954779624939,
+      "learning_rate": 1.4211886304909563e-07,
+      "loss": 0.8384,
+      "step": 1274
+    },
+    {
+      "epoch": 1.9767441860465116,
+      "grad_norm": 1.4904766082763672,
+      "learning_rate": 1.3781223083548665e-07,
+      "loss": 0.8798,
+      "step": 1275
+    },
+    {
+      "epoch": 1.978294573643411,
+      "grad_norm": 1.3043338060379028,
+      "learning_rate": 1.335055986218777e-07,
+      "loss": 0.8552,
+      "step": 1276
+    },
+    {
+      "epoch": 1.97984496124031,
+      "grad_norm": 1.3325281143188477,
+      "learning_rate": 1.2919896640826874e-07,
+      "loss": 0.8717,
+      "step": 1277
+    },
+    {
+      "epoch": 1.9813953488372094,
+      "grad_norm": 1.4170563220977783,
+      "learning_rate": 1.248923341946598e-07,
+      "loss": 0.865,
+      "step": 1278
+    },
+    {
+      "epoch": 1.9829457364341085,
+      "grad_norm": 1.3400081396102905,
+      "learning_rate": 1.2058570198105084e-07,
+      "loss": 0.8657,
+      "step": 1279
+    },
+    {
+      "epoch": 1.9844961240310077,
+      "grad_norm": 1.3908562660217285,
+      "learning_rate": 1.1627906976744187e-07,
+      "loss": 0.8422,
+      "step": 1280
+    },
+    {
+      "epoch": 1.9844961240310077,
+      "eval_loss": 1.008200764656067,
+      "eval_runtime": 46.6818,
+      "eval_samples_per_second": 21.422,
+      "eval_steps_per_second": 1.35,
+      "step": 1280
+    },
+    {
+      "epoch": 1.9860465116279071,
+      "grad_norm": 1.3601107597351074,
+      "learning_rate": 1.119724375538329e-07,
+      "loss": 0.8516,
+      "step": 1281
+    },
+    {
+      "epoch": 1.987596899224806,
+      "grad_norm": 1.4389938116073608,
+      "learning_rate": 1.0766580534022397e-07,
+      "loss": 0.8759,
+      "step": 1282
+    },
+    {
+      "epoch": 1.9891472868217055,
+      "grad_norm": 1.3003112077713013,
+      "learning_rate": 1.03359173126615e-07,
+      "loss": 0.8829,
+      "step": 1283
+    },
+    {
+      "epoch": 1.9906976744186047,
+      "grad_norm": 1.5875047445297241,
+      "learning_rate": 9.905254091300603e-08,
+      "loss": 0.8766,
+      "step": 1284
+    },
+    {
+      "epoch": 1.9922480620155039,
+      "grad_norm": 1.4247767925262451,
+      "learning_rate": 9.474590869939708e-08,
+      "loss": 0.8557,
+      "step": 1285
+    },
+    {
+      "epoch": 1.993798449612403,
+      "grad_norm": 1.4395599365234375,
+      "learning_rate": 9.043927648578811e-08,
+      "loss": 0.8775,
+      "step": 1286
+    },
+    {
+      "epoch": 1.9953488372093022,
+      "grad_norm": 1.3539119958877563,
+      "learning_rate": 8.613264427217917e-08,
+      "loss": 0.8762,
+      "step": 1287
+    },
+    {
+      "epoch": 1.9968992248062016,
+      "grad_norm": 1.2914642095565796,
+      "learning_rate": 8.182601205857021e-08,
+      "loss": 0.8612,
+      "step": 1288
+    },
+    {
+      "epoch": 1.9984496124031008,
+      "grad_norm": 1.3129777908325195,
+      "learning_rate": 7.751937984496124e-08,
+      "loss": 0.8446,
+      "step": 1289
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 1.5141966342926025,
+      "learning_rate": 7.321274763135229e-08,
+      "loss": 0.8649,
+      "step": 1290
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 1.0077102184295654,
+      "eval_runtime": 46.7171,
+      "eval_samples_per_second": 21.405,
+      "eval_steps_per_second": 1.349,
+      "step": 1290
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 1290,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 50,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.3449606952963277e+18,
+  "train_batch_size": 6,
+  "trial_name": null,
+  "trial_params": null
+}