diff --git "a/checkpoints/Qwen2.5-3B/babylm_hop_control_10M_seed0/runs/checkpoint-1122/trainer_state.json" "b/checkpoints/Qwen2.5-3B/babylm_hop_control_10M_seed0/runs/checkpoint-1122/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/checkpoints/Qwen2.5-3B/babylm_hop_control_10M_seed0/runs/checkpoint-1122/trainer_state.json"
@@ -0,0 +1,8783 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.0,
+  "eval_steps": 10,
+  "global_step": 1122,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0017825311942959,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.6857,
+      "step": 1
+    },
+    {
+      "epoch": 0.0035650623885918,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.6495,
+      "step": 2
+    },
+    {
+      "epoch": 0.0053475935828877,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.6251,
+      "step": 3
+    },
+    {
+      "epoch": 0.0071301247771836,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.6453,
+      "step": 4
+    },
+    {
+      "epoch": 0.008912655971479501,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.6462,
+      "step": 5
+    },
+    {
+      "epoch": 0.0106951871657754,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.6457,
+      "step": 6
+    },
+    {
+      "epoch": 0.012477718360071301,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.677,
+      "step": 7
+    },
+    {
+      "epoch": 0.0142602495543672,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.6447,
+      "step": 8
+    },
+    {
+      "epoch": 0.016042780748663103,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.6675,
+      "step": 9
+    },
+    {
+      "epoch": 0.017825311942959002,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.6563,
+      "step": 10
+    },
+    {
+      "epoch": 0.017825311942959002,
+      "eval_loss": 1.6535868644714355,
+      "eval_runtime": 24.5668,
+      "eval_samples_per_second": 40.705,
+      "eval_steps_per_second": 2.564,
+      "step": 10
+    },
+    {
+      "epoch": 0.0196078431372549,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.6458,
+      "step": 11
+    },
+    {
+      "epoch": 0.0213903743315508,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.6598,
+      "step": 12
+    },
+    {
+      "epoch": 0.023172905525846704,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.6628,
+      "step": 13
+    },
+    {
+      "epoch": 0.024955436720142603,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0,
+      "loss": 1.6816,
+      "step": 14
+    },
+    {
+      "epoch": 0.026737967914438502,
+      "grad_norm": 1.980492115020752,
+      "learning_rate": 4.424778761061947e-08,
+      "loss": 1.6526,
+      "step": 15
+    },
+    {
+      "epoch": 0.0285204991087344,
+      "grad_norm": 1.9155012369155884,
+      "learning_rate": 8.849557522123894e-08,
+      "loss": 1.6552,
+      "step": 16
+    },
+    {
+      "epoch": 0.030303030303030304,
+      "grad_norm": 1.8875389099121094,
+      "learning_rate": 1.327433628318584e-07,
+      "loss": 1.6684,
+      "step": 17
+    },
+    {
+      "epoch": 0.03208556149732621,
+      "grad_norm": 1.8125866651535034,
+      "learning_rate": 1.7699115044247788e-07,
+      "loss": 1.6549,
+      "step": 18
+    },
+    {
+      "epoch": 0.0338680926916221,
+      "grad_norm": 1.8792786598205566,
+      "learning_rate": 2.2123893805309737e-07,
+      "loss": 1.6487,
+      "step": 19
+    },
+    {
+      "epoch": 0.035650623885918005,
+      "grad_norm": 1.973883032798767,
+      "learning_rate": 2.654867256637168e-07,
+      "loss": 1.6671,
+      "step": 20
+    },
+    {
+      "epoch": 0.035650623885918005,
+      "eval_loss": 1.6527129411697388,
+      "eval_runtime": 24.8773,
+      "eval_samples_per_second": 40.197,
+      "eval_steps_per_second": 2.532,
+      "step": 20
+    },
+    {
+      "epoch": 0.0374331550802139,
+      "grad_norm": 1.84323251247406,
+      "learning_rate": 3.097345132743363e-07,
+      "loss": 1.6599,
+      "step": 21
+    },
+    {
+      "epoch": 0.0392156862745098,
+      "grad_norm": 1.8644572496414185,
+      "learning_rate": 3.5398230088495575e-07,
+      "loss": 1.6475,
+      "step": 22
+    },
+    {
+      "epoch": 0.040998217468805706,
+      "grad_norm": 1.860310435295105,
+      "learning_rate": 3.9823008849557525e-07,
+      "loss": 1.6656,
+      "step": 23
+    },
+    {
+      "epoch": 0.0427807486631016,
+      "grad_norm": 1.7291864156723022,
+      "learning_rate": 4.4247787610619474e-07,
+      "loss": 1.6542,
+      "step": 24
+    },
+    {
+      "epoch": 0.044563279857397504,
+      "grad_norm": 1.6295312643051147,
+      "learning_rate": 4.867256637168142e-07,
+      "loss": 1.661,
+      "step": 25
+    },
+    {
+      "epoch": 0.04634581105169341,
+      "grad_norm": 1.5132776498794556,
+      "learning_rate": 5.309734513274336e-07,
+      "loss": 1.6399,
+      "step": 26
+    },
+    {
+      "epoch": 0.0481283422459893,
+      "grad_norm": 1.5132776498794556,
+      "learning_rate": 5.309734513274336e-07,
+      "loss": 1.6567,
+      "step": 27
+    },
+    {
+      "epoch": 0.049910873440285206,
+      "grad_norm": 1.483805775642395,
+      "learning_rate": 5.752212389380532e-07,
+      "loss": 1.6131,
+      "step": 28
+    },
+    {
+      "epoch": 0.05169340463458111,
+      "grad_norm": 1.5470143556594849,
+      "learning_rate": 6.194690265486726e-07,
+      "loss": 1.6696,
+      "step": 29
+    },
+    {
+      "epoch": 0.053475935828877004,
+      "grad_norm": 1.3218852281570435,
+      "learning_rate": 6.637168141592922e-07,
+      "loss": 1.6567,
+      "step": 30
+    },
+    {
+      "epoch": 0.053475935828877004,
+      "eval_loss": 1.628255844116211,
+      "eval_runtime": 25.3856,
+      "eval_samples_per_second": 39.392,
+      "eval_steps_per_second": 2.482,
+      "step": 30
+    },
+    {
+      "epoch": 0.05525846702317291,
+      "grad_norm": 1.229565978050232,
+      "learning_rate": 7.079646017699115e-07,
+      "loss": 1.6412,
+      "step": 31
+    },
+    {
+      "epoch": 0.0570409982174688,
+      "grad_norm": 1.091683268547058,
+      "learning_rate": 7.522123893805311e-07,
+      "loss": 1.6232,
+      "step": 32
+    },
+    {
+      "epoch": 0.058823529411764705,
+      "grad_norm": 1.1046212911605835,
+      "learning_rate": 7.964601769911505e-07,
+      "loss": 1.6024,
+      "step": 33
+    },
+    {
+      "epoch": 0.06060606060606061,
+      "grad_norm": 1.0457117557525635,
+      "learning_rate": 8.4070796460177e-07,
+      "loss": 1.6285,
+      "step": 34
+    },
+    {
+      "epoch": 0.062388591800356503,
+      "grad_norm": 1.0139962434768677,
+      "learning_rate": 8.849557522123895e-07,
+      "loss": 1.6086,
+      "step": 35
+    },
+    {
+      "epoch": 0.06417112299465241,
+      "grad_norm": 0.9111472964286804,
+      "learning_rate": 9.292035398230089e-07,
+      "loss": 1.6246,
+      "step": 36
+    },
+    {
+      "epoch": 0.0659536541889483,
+      "grad_norm": 0.9822351336479187,
+      "learning_rate": 9.734513274336284e-07,
+      "loss": 1.5948,
+      "step": 37
+    },
+    {
+      "epoch": 0.0677361853832442,
+      "grad_norm": 0.9101211428642273,
+      "learning_rate": 1.017699115044248e-06,
+      "loss": 1.6035,
+      "step": 38
+    },
+    {
+      "epoch": 0.06951871657754011,
+      "grad_norm": 0.8859177827835083,
+      "learning_rate": 1.0619469026548673e-06,
+      "loss": 1.594,
+      "step": 39
+    },
+    {
+      "epoch": 0.07130124777183601,
+      "grad_norm": 0.9900261163711548,
+      "learning_rate": 1.106194690265487e-06,
+      "loss": 1.6154,
+      "step": 40
+    },
+    {
+      "epoch": 0.07130124777183601,
+      "eval_loss": 1.5900208950042725,
+      "eval_runtime": 25.6788,
+      "eval_samples_per_second": 38.943,
+      "eval_steps_per_second": 2.453,
+      "step": 40
+    },
+    {
+      "epoch": 0.07308377896613191,
+      "grad_norm": 0.8495362401008606,
+      "learning_rate": 1.1504424778761064e-06,
+      "loss": 1.606,
+      "step": 41
+    },
+    {
+      "epoch": 0.0748663101604278,
+      "grad_norm": 0.8944734334945679,
+      "learning_rate": 1.1946902654867258e-06,
+      "loss": 1.6102,
+      "step": 42
+    },
+    {
+      "epoch": 0.0766488413547237,
+      "grad_norm": 0.9594064354896545,
+      "learning_rate": 1.2389380530973452e-06,
+      "loss": 1.5851,
+      "step": 43
+    },
+    {
+      "epoch": 0.0784313725490196,
+      "grad_norm": 0.8346364498138428,
+      "learning_rate": 1.2831858407079647e-06,
+      "loss": 1.5855,
+      "step": 44
+    },
+    {
+      "epoch": 0.08021390374331551,
+      "grad_norm": 0.8732189536094666,
+      "learning_rate": 1.3274336283185843e-06,
+      "loss": 1.5751,
+      "step": 45
+    },
+    {
+      "epoch": 0.08199643493761141,
+      "grad_norm": 1.0120676755905151,
+      "learning_rate": 1.3716814159292036e-06,
+      "loss": 1.5736,
+      "step": 46
+    },
+    {
+      "epoch": 0.08377896613190731,
+      "grad_norm": 0.9650525450706482,
+      "learning_rate": 1.415929203539823e-06,
+      "loss": 1.5396,
+      "step": 47
+    },
+    {
+      "epoch": 0.0855614973262032,
+      "grad_norm": 0.9934496879577637,
+      "learning_rate": 1.4601769911504427e-06,
+      "loss": 1.5417,
+      "step": 48
+    },
+    {
+      "epoch": 0.0873440285204991,
+      "grad_norm": 1.378520131111145,
+      "learning_rate": 1.5044247787610621e-06,
+      "loss": 1.5554,
+      "step": 49
+    },
+    {
+      "epoch": 0.08912655971479501,
+      "grad_norm": 1.302832841873169,
+      "learning_rate": 1.5486725663716816e-06,
+      "loss": 1.5449,
+      "step": 50
+    },
+    {
+      "epoch": 0.08912655971479501,
+      "eval_loss": 1.5264862775802612,
+      "eval_runtime": 25.8466,
+      "eval_samples_per_second": 38.69,
+      "eval_steps_per_second": 2.437,
+      "step": 50
+    },
+    {
+      "epoch": 0.09090909090909091,
+      "grad_norm": 1.2967264652252197,
+      "learning_rate": 1.592920353982301e-06,
+      "loss": 1.4994,
+      "step": 51
+    },
+    {
+      "epoch": 0.09269162210338681,
+      "grad_norm": 1.2239652872085571,
+      "learning_rate": 1.6371681415929204e-06,
+      "loss": 1.5091,
+      "step": 52
+    },
+    {
+      "epoch": 0.0944741532976827,
+      "grad_norm": 1.2619361877441406,
+      "learning_rate": 1.68141592920354e-06,
+      "loss": 1.4824,
+      "step": 53
+    },
+    {
+      "epoch": 0.0962566844919786,
+      "grad_norm": 1.3398172855377197,
+      "learning_rate": 1.7256637168141593e-06,
+      "loss": 1.4996,
+      "step": 54
+    },
+    {
+      "epoch": 0.09803921568627451,
+      "grad_norm": 1.4670956134796143,
+      "learning_rate": 1.769911504424779e-06,
+      "loss": 1.4814,
+      "step": 55
+    },
+    {
+      "epoch": 0.09982174688057041,
+      "grad_norm": 1.2660086154937744,
+      "learning_rate": 1.8141592920353984e-06,
+      "loss": 1.4613,
+      "step": 56
+    },
+    {
+      "epoch": 0.10160427807486631,
+      "grad_norm": 1.2482752799987793,
+      "learning_rate": 1.8584070796460179e-06,
+      "loss": 1.4812,
+      "step": 57
+    },
+    {
+      "epoch": 0.10338680926916222,
+      "grad_norm": 1.3673324584960938,
+      "learning_rate": 1.9026548672566373e-06,
+      "loss": 1.471,
+      "step": 58
+    },
+    {
+      "epoch": 0.1051693404634581,
+      "grad_norm": 1.3449939489364624,
+      "learning_rate": 1.9469026548672567e-06,
+      "loss": 1.4412,
+      "step": 59
+    },
+    {
+      "epoch": 0.10695187165775401,
+      "grad_norm": 1.447826862335205,
+      "learning_rate": 1.991150442477876e-06,
+      "loss": 1.4377,
+      "step": 60
+    },
+    {
+      "epoch": 0.10695187165775401,
+      "eval_loss": 1.429922103881836,
+      "eval_runtime": 26.0399,
+      "eval_samples_per_second": 38.403,
+      "eval_steps_per_second": 2.419,
+      "step": 60
+    },
+    {
+      "epoch": 0.10873440285204991,
+      "grad_norm": 1.2212026119232178,
+      "learning_rate": 2.035398230088496e-06,
+      "loss": 1.3949,
+      "step": 61
+    },
+    {
+      "epoch": 0.11051693404634581,
+      "grad_norm": 1.1740317344665527,
+      "learning_rate": 2.079646017699115e-06,
+      "loss": 1.4268,
+      "step": 62
+    },
+    {
+      "epoch": 0.11229946524064172,
+      "grad_norm": 1.2304081916809082,
+      "learning_rate": 2.1238938053097345e-06,
+      "loss": 1.4055,
+      "step": 63
+    },
+    {
+      "epoch": 0.1140819964349376,
+      "grad_norm": 1.1806448698043823,
+      "learning_rate": 2.1681415929203544e-06,
+      "loss": 1.4126,
+      "step": 64
+    },
+    {
+      "epoch": 0.11586452762923351,
+      "grad_norm": 1.227970004081726,
+      "learning_rate": 2.212389380530974e-06,
+      "loss": 1.4067,
+      "step": 65
+    },
+    {
+      "epoch": 0.11764705882352941,
+      "grad_norm": 1.094361424446106,
+      "learning_rate": 2.256637168141593e-06,
+      "loss": 1.3593,
+      "step": 66
+    },
+    {
+      "epoch": 0.11942959001782531,
+      "grad_norm": 1.2891759872436523,
+      "learning_rate": 2.3008849557522127e-06,
+      "loss": 1.3564,
+      "step": 67
+    },
+    {
+      "epoch": 0.12121212121212122,
+      "grad_norm": 1.0433226823806763,
+      "learning_rate": 2.345132743362832e-06,
+      "loss": 1.3763,
+      "step": 68
+    },
+    {
+      "epoch": 0.12299465240641712,
+      "grad_norm": 1.4282106161117554,
+      "learning_rate": 2.3893805309734516e-06,
+      "loss": 1.3554,
+      "step": 69
+    },
+    {
+      "epoch": 0.12477718360071301,
+      "grad_norm": 1.091366171836853,
+      "learning_rate": 2.433628318584071e-06,
+      "loss": 1.3529,
+      "step": 70
+    },
+    {
+      "epoch": 0.12477718360071301,
+      "eval_loss": 1.3576408624649048,
+      "eval_runtime": 26.0076,
+      "eval_samples_per_second": 38.45,
+      "eval_steps_per_second": 2.422,
+      "step": 70
+    },
+    {
+      "epoch": 0.1265597147950089,
+      "grad_norm": 1.1191158294677734,
+      "learning_rate": 2.4778761061946905e-06,
+      "loss": 1.3368,
+      "step": 71
+    },
+    {
+      "epoch": 0.12834224598930483,
+      "grad_norm": 1.3687219619750977,
+      "learning_rate": 2.52212389380531e-06,
+      "loss": 1.3596,
+      "step": 72
+    },
+    {
+      "epoch": 0.13012477718360071,
+      "grad_norm": 1.1358015537261963,
+      "learning_rate": 2.5663716814159294e-06,
+      "loss": 1.3482,
+      "step": 73
+    },
+    {
+      "epoch": 0.1319073083778966,
+      "grad_norm": 1.4303299188613892,
+      "learning_rate": 2.6106194690265492e-06,
+      "loss": 1.3242,
+      "step": 74
+    },
+    {
+      "epoch": 0.13368983957219252,
+      "grad_norm": 1.0334805250167847,
+      "learning_rate": 2.6548672566371687e-06,
+      "loss": 1.3609,
+      "step": 75
+    },
+    {
+      "epoch": 0.1354723707664884,
+      "grad_norm": 1.222535490989685,
+      "learning_rate": 2.6991150442477877e-06,
+      "loss": 1.3375,
+      "step": 76
+    },
+    {
+      "epoch": 0.13725490196078433,
+      "grad_norm": 1.3004604578018188,
+      "learning_rate": 2.743362831858407e-06,
+      "loss": 1.2845,
+      "step": 77
+    },
+    {
+      "epoch": 0.13903743315508021,
+      "grad_norm": 1.045078158378601,
+      "learning_rate": 2.7876106194690266e-06,
+      "loss": 1.306,
+      "step": 78
+    },
+    {
+      "epoch": 0.1408199643493761,
+      "grad_norm": 1.3703179359436035,
+      "learning_rate": 2.831858407079646e-06,
+      "loss": 1.3284,
+      "step": 79
+    },
+    {
+      "epoch": 0.14260249554367202,
+      "grad_norm": 1.216047763824463,
+      "learning_rate": 2.876106194690266e-06,
+      "loss": 1.2894,
+      "step": 80
+    },
+    {
+      "epoch": 0.14260249554367202,
+      "eval_loss": 1.3045498132705688,
+      "eval_runtime": 25.8145,
+      "eval_samples_per_second": 38.738,
+      "eval_steps_per_second": 2.44,
+      "step": 80
+    },
+    {
+      "epoch": 0.1443850267379679,
+      "grad_norm": 1.1438696384429932,
+      "learning_rate": 2.9203539823008853e-06,
+      "loss": 1.2767,
+      "step": 81
+    },
+    {
+      "epoch": 0.14616755793226383,
+      "grad_norm": 1.375326156616211,
+      "learning_rate": 2.9646017699115048e-06,
+      "loss": 1.3037,
+      "step": 82
+    },
+    {
+      "epoch": 0.14795008912655971,
+      "grad_norm": 1.17705500125885,
+      "learning_rate": 3.0088495575221242e-06,
+      "loss": 1.2736,
+      "step": 83
+    },
+    {
+      "epoch": 0.1497326203208556,
+      "grad_norm": 1.1131092309951782,
+      "learning_rate": 3.0530973451327432e-06,
+      "loss": 1.2743,
+      "step": 84
+    },
+    {
+      "epoch": 0.15151515151515152,
+      "grad_norm": 1.2165392637252808,
+      "learning_rate": 3.097345132743363e-06,
+      "loss": 1.2922,
+      "step": 85
+    },
+    {
+      "epoch": 0.1532976827094474,
+      "grad_norm": 1.2263474464416504,
+      "learning_rate": 3.1415929203539825e-06,
+      "loss": 1.2208,
+      "step": 86
+    },
+    {
+      "epoch": 0.15508021390374332,
+      "grad_norm": 1.1002565622329712,
+      "learning_rate": 3.185840707964602e-06,
+      "loss": 1.2437,
+      "step": 87
+    },
+    {
+      "epoch": 0.1568627450980392,
+      "grad_norm": 1.4416580200195312,
+      "learning_rate": 3.2300884955752214e-06,
+      "loss": 1.2713,
+      "step": 88
+    },
+    {
+      "epoch": 0.1586452762923351,
+      "grad_norm": 1.0780956745147705,
+      "learning_rate": 3.274336283185841e-06,
+      "loss": 1.2943,
+      "step": 89
+    },
+    {
+      "epoch": 0.16042780748663102,
+      "grad_norm": 1.2529618740081787,
+      "learning_rate": 3.3185840707964607e-06,
+      "loss": 1.2913,
+      "step": 90
+    },
+    {
+      "epoch": 0.16042780748663102,
+      "eval_loss": 1.2740551233291626,
+      "eval_runtime": 25.9997,
+      "eval_samples_per_second": 38.462,
+      "eval_steps_per_second": 2.423,
+      "step": 90
+    },
+    {
+      "epoch": 0.1622103386809269,
+      "grad_norm": 1.1674306392669678,
+      "learning_rate": 3.36283185840708e-06,
+      "loss": 1.2704,
+      "step": 91
+    },
+    {
+      "epoch": 0.16399286987522282,
+      "grad_norm": 1.5128580331802368,
+      "learning_rate": 3.407079646017699e-06,
+      "loss": 1.2821,
+      "step": 92
+    },
+    {
+      "epoch": 0.1657754010695187,
+      "grad_norm": 1.216848373413086,
+      "learning_rate": 3.4513274336283186e-06,
+      "loss": 1.2629,
+      "step": 93
+    },
+    {
+      "epoch": 0.16755793226381463,
+      "grad_norm": 1.2294068336486816,
+      "learning_rate": 3.495575221238938e-06,
+      "loss": 1.2856,
+      "step": 94
+    },
+    {
+      "epoch": 0.16934046345811052,
+      "grad_norm": 1.4784265756607056,
+      "learning_rate": 3.539823008849558e-06,
+      "loss": 1.273,
+      "step": 95
+    },
+    {
+      "epoch": 0.1711229946524064,
+      "grad_norm": 1.200931191444397,
+      "learning_rate": 3.5840707964601774e-06,
+      "loss": 1.2312,
+      "step": 96
+    },
+    {
+      "epoch": 0.17290552584670232,
+      "grad_norm": 2.0421743392944336,
+      "learning_rate": 3.628318584070797e-06,
+      "loss": 1.2427,
+      "step": 97
+    },
+    {
+      "epoch": 0.1746880570409982,
+      "grad_norm": 1.5405610799789429,
+      "learning_rate": 3.6725663716814163e-06,
+      "loss": 1.2691,
+      "step": 98
+    },
+    {
+      "epoch": 0.17647058823529413,
+      "grad_norm": 2.1598148345947266,
+      "learning_rate": 3.7168141592920357e-06,
+      "loss": 1.2134,
+      "step": 99
+    },
+    {
+      "epoch": 0.17825311942959002,
+      "grad_norm": 1.649290919303894,
+      "learning_rate": 3.7610619469026547e-06,
+      "loss": 1.2532,
+      "step": 100
+    },
+    {
+      "epoch": 0.17825311942959002,
+      "eval_loss": 1.2498174905776978,
+      "eval_runtime": 25.8255,
+      "eval_samples_per_second": 38.721,
+      "eval_steps_per_second": 2.439,
+      "step": 100
+    },
+    {
+      "epoch": 0.1800356506238859,
+      "grad_norm": 2.003908157348633,
+      "learning_rate": 3.8053097345132746e-06,
+      "loss": 1.2372,
+      "step": 101
+    },
+    {
+      "epoch": 0.18181818181818182,
+      "grad_norm": 1.2919707298278809,
+      "learning_rate": 3.849557522123894e-06,
+      "loss": 1.2197,
+      "step": 102
+    },
+    {
+      "epoch": 0.1836007130124777,
+      "grad_norm": 1.8482425212860107,
+      "learning_rate": 3.8938053097345135e-06,
+      "loss": 1.1946,
+      "step": 103
+    },
+    {
+      "epoch": 0.18538324420677363,
+      "grad_norm": 1.3239375352859497,
+      "learning_rate": 3.938053097345133e-06,
+      "loss": 1.2045,
+      "step": 104
+    },
+    {
+      "epoch": 0.18716577540106952,
+      "grad_norm": 1.5264768600463867,
+      "learning_rate": 3.982300884955752e-06,
+      "loss": 1.2379,
+      "step": 105
+    },
+    {
+      "epoch": 0.1889483065953654,
+      "grad_norm": 1.1609495878219604,
+      "learning_rate": 4.026548672566372e-06,
+      "loss": 1.2109,
+      "step": 106
+    },
+    {
+      "epoch": 0.19073083778966132,
+      "grad_norm": 1.3971660137176514,
+      "learning_rate": 4.070796460176992e-06,
+      "loss": 1.2049,
+      "step": 107
+    },
+    {
+      "epoch": 0.1925133689839572,
+      "grad_norm": 1.2334355115890503,
+      "learning_rate": 4.115044247787611e-06,
+      "loss": 1.2317,
+      "step": 108
+    },
+    {
+      "epoch": 0.19429590017825313,
+      "grad_norm": 1.36234450340271,
+      "learning_rate": 4.15929203539823e-06,
+      "loss": 1.186,
+      "step": 109
+    },
+    {
+      "epoch": 0.19607843137254902,
+      "grad_norm": 1.2824509143829346,
+      "learning_rate": 4.20353982300885e-06,
+      "loss": 1.1683,
+      "step": 110
+    },
+    {
+      "epoch": 0.19607843137254902,
+      "eval_loss": 1.2226125001907349,
+      "eval_runtime": 26.0396,
+      "eval_samples_per_second": 38.403,
+      "eval_steps_per_second": 2.419,
+      "step": 110
+    },
+    {
+      "epoch": 0.19786096256684493,
+      "grad_norm": 1.3470525741577148,
+      "learning_rate": 4.247787610619469e-06,
+      "loss": 1.1557,
+      "step": 111
+    },
+    {
+      "epoch": 0.19964349376114082,
+      "grad_norm": 1.3658194541931152,
+      "learning_rate": 4.2920353982300885e-06,
+      "loss": 1.2355,
+      "step": 112
+    },
+    {
+      "epoch": 0.2014260249554367,
+      "grad_norm": 1.2262756824493408,
+      "learning_rate": 4.336283185840709e-06,
+      "loss": 1.214,
+      "step": 113
+    },
+    {
+      "epoch": 0.20320855614973263,
+      "grad_norm": 1.4536570310592651,
+      "learning_rate": 4.380530973451328e-06,
+      "loss": 1.2013,
+      "step": 114
+    },
+    {
+      "epoch": 0.20499108734402852,
+      "grad_norm": 1.4537997245788574,
+      "learning_rate": 4.424778761061948e-06,
+      "loss": 1.1879,
+      "step": 115
+    },
+    {
+      "epoch": 0.20677361853832443,
+      "grad_norm": 1.2539417743682861,
+      "learning_rate": 4.469026548672566e-06,
+      "loss": 1.2015,
+      "step": 116
+    },
+    {
+      "epoch": 0.20855614973262032,
+      "grad_norm": 1.296627163887024,
+      "learning_rate": 4.513274336283186e-06,
+      "loss": 1.1552,
+      "step": 117
+    },
+    {
+      "epoch": 0.2103386809269162,
+      "grad_norm": 1.5704238414764404,
+      "learning_rate": 4.557522123893805e-06,
+      "loss": 1.1468,
+      "step": 118
+    },
+    {
+      "epoch": 0.21212121212121213,
+      "grad_norm": 1.4134966135025024,
+      "learning_rate": 4.6017699115044254e-06,
+      "loss": 1.1689,
+      "step": 119
+    },
+    {
+      "epoch": 0.21390374331550802,
+      "grad_norm": 1.182852029800415,
+      "learning_rate": 4.646017699115045e-06,
+      "loss": 1.1733,
+      "step": 120
+    },
+    {
+      "epoch": 0.21390374331550802,
+      "eval_loss": 1.206061840057373,
+      "eval_runtime": 25.9192,
+      "eval_samples_per_second": 38.581,
+      "eval_steps_per_second": 2.431,
+      "step": 120
+    },
+    {
+      "epoch": 0.21568627450980393,
+      "grad_norm": 1.5393521785736084,
+      "learning_rate": 4.690265486725664e-06,
+      "loss": 1.1637,
+      "step": 121
+    },
+    {
+      "epoch": 0.21746880570409982,
+      "grad_norm": 1.4800235033035278,
+      "learning_rate": 4.734513274336284e-06,
+      "loss": 1.1724,
+      "step": 122
+    },
+    {
+      "epoch": 0.2192513368983957,
+      "grad_norm": 1.1709996461868286,
+      "learning_rate": 4.778761061946903e-06,
+      "loss": 1.1309,
+      "step": 123
+    },
+    {
+      "epoch": 0.22103386809269163,
+      "grad_norm": 1.5820499658584595,
+      "learning_rate": 4.823008849557523e-06,
+      "loss": 1.165,
+      "step": 124
+    },
+    {
+      "epoch": 0.22281639928698752,
+      "grad_norm": 1.4861419200897217,
+      "learning_rate": 4.867256637168142e-06,
+      "loss": 1.1958,
+      "step": 125
+    },
+    {
+      "epoch": 0.22459893048128343,
+      "grad_norm": 1.5460313558578491,
+      "learning_rate": 4.9115044247787615e-06,
+      "loss": 1.1264,
+      "step": 126
+    },
+    {
+      "epoch": 0.22638146167557932,
+      "grad_norm": 1.377894401550293,
+      "learning_rate": 4.955752212389381e-06,
+      "loss": 1.1516,
+      "step": 127
+    },
+    {
+      "epoch": 0.2281639928698752,
+      "grad_norm": 1.5216853618621826,
+      "learning_rate": 5e-06,
+      "loss": 1.1729,
+      "step": 128
+    },
+    {
+      "epoch": 0.22994652406417113,
+      "grad_norm": 1.1627072095870972,
+      "learning_rate": 4.995044598612488e-06,
+      "loss": 1.143,
+      "step": 129
+    },
+    {
+      "epoch": 0.23172905525846701,
+      "grad_norm": 1.6515153646469116,
+      "learning_rate": 4.990089197224976e-06,
+      "loss": 1.1501,
+      "step": 130
+    },
+    {
+      "epoch": 0.23172905525846701,
+      "eval_loss": 1.1917165517807007,
+      "eval_runtime": 25.9636,
+      "eval_samples_per_second": 38.515,
+      "eval_steps_per_second": 2.426,
+      "step": 130
+    },
+    {
+      "epoch": 0.23351158645276293,
+      "grad_norm": 1.3303382396697998,
+      "learning_rate": 4.985133795837464e-06,
+      "loss": 1.1089,
+      "step": 131
+    },
+    {
+      "epoch": 0.23529411764705882,
+      "grad_norm": 1.8258463144302368,
+      "learning_rate": 4.980178394449951e-06,
+      "loss": 1.1301,
+      "step": 132
+    },
+    {
+      "epoch": 0.23707664884135474,
+      "grad_norm": 1.3351376056671143,
+      "learning_rate": 4.975222993062438e-06,
+      "loss": 1.1711,
+      "step": 133
+    },
+    {
+      "epoch": 0.23885918003565063,
+      "grad_norm": 1.6486777067184448,
+      "learning_rate": 4.970267591674926e-06,
+      "loss": 1.167,
+      "step": 134
+    },
+    {
+      "epoch": 0.24064171122994651,
+      "grad_norm": 1.3900527954101562,
+      "learning_rate": 4.965312190287414e-06,
+      "loss": 1.168,
+      "step": 135
+    },
+    {
+      "epoch": 0.24242424242424243,
+      "grad_norm": 1.983830213546753,
+      "learning_rate": 4.960356788899901e-06,
+      "loss": 1.146,
+      "step": 136
+    },
+    {
+      "epoch": 0.24420677361853832,
+      "grad_norm": 1.6066932678222656,
+      "learning_rate": 4.955401387512389e-06,
+      "loss": 1.1859,
+      "step": 137
+    },
+    {
+      "epoch": 0.24598930481283424,
+      "grad_norm": 1.9494677782058716,
+      "learning_rate": 4.950445986124876e-06,
+      "loss": 1.1442,
+      "step": 138
+    },
+    {
+      "epoch": 0.24777183600713013,
+      "grad_norm": 1.7467000484466553,
+      "learning_rate": 4.945490584737364e-06,
+      "loss": 1.1342,
+      "step": 139
+    },
+    {
+      "epoch": 0.24955436720142601,
+      "grad_norm": 2.071423292160034,
+      "learning_rate": 4.9405351833498515e-06,
+      "loss": 1.1526,
+      "step": 140
+    },
+    {
+      "epoch": 0.24955436720142601,
+      "eval_loss": 1.1779279708862305,
+      "eval_runtime": 25.9977,
+      "eval_samples_per_second": 38.465,
+      "eval_steps_per_second": 2.423,
+      "step": 140
+    },
+    {
+      "epoch": 0.25133689839572193,
+      "grad_norm": 1.915077805519104,
+      "learning_rate": 4.935579781962339e-06,
+      "loss": 1.1236,
+      "step": 141
+    },
+    {
+      "epoch": 0.2531194295900178,
+      "grad_norm": 1.472425103187561,
+      "learning_rate": 4.930624380574827e-06,
+      "loss": 1.1368,
+      "step": 142
+    },
+    {
+      "epoch": 0.2549019607843137,
+      "grad_norm": 1.830802083015442,
+      "learning_rate": 4.925668979187315e-06,
+      "loss": 1.1103,
+      "step": 143
+    },
+    {
+      "epoch": 0.25668449197860965,
+      "grad_norm": 1.4757088422775269,
+      "learning_rate": 4.920713577799802e-06,
+      "loss": 1.1561,
+      "step": 144
+    },
+    {
+      "epoch": 0.25846702317290554,
+      "grad_norm": 1.6575653553009033,
+      "learning_rate": 4.915758176412289e-06,
+      "loss": 1.1341,
+      "step": 145
+    },
+    {
+      "epoch": 0.26024955436720143,
+      "grad_norm": 1.3438557386398315,
+      "learning_rate": 4.9108027750247775e-06,
+      "loss": 1.1037,
+      "step": 146
+    },
+    {
+      "epoch": 0.2620320855614973,
+      "grad_norm": 1.680197834968567,
+      "learning_rate": 4.9058473736372656e-06,
+      "loss": 1.1173,
+      "step": 147
+    },
+    {
+      "epoch": 0.2638146167557932,
+      "grad_norm": 1.567205548286438,
+      "learning_rate": 4.900891972249753e-06,
+      "loss": 1.1036,
+      "step": 148
+    },
+    {
+      "epoch": 0.26559714795008915,
+      "grad_norm": 1.6614781618118286,
+      "learning_rate": 4.89593657086224e-06,
+      "loss": 1.1118,
+      "step": 149
+    },
+    {
+      "epoch": 0.26737967914438504,
+      "grad_norm": 1.5362111330032349,
+      "learning_rate": 4.890981169474728e-06,
+      "loss": 1.1162,
+      "step": 150
+    },
+    {
+      "epoch": 0.26737967914438504,
+      "eval_loss": 1.1625056266784668,
+      "eval_runtime": 26.0288,
+      "eval_samples_per_second": 38.419,
+      "eval_steps_per_second": 2.42,
+      "step": 150
+    },
+    {
+      "epoch": 0.26916221033868093,
+      "grad_norm": 1.5577179193496704,
+      "learning_rate": 4.886025768087215e-06,
+      "loss": 1.1056,
+      "step": 151
+    },
+    {
+      "epoch": 0.2709447415329768,
+      "grad_norm": 1.5841162204742432,
+      "learning_rate": 4.881070366699703e-06,
+      "loss": 1.105,
+      "step": 152
+    },
+    {
+      "epoch": 0.2727272727272727,
+      "grad_norm": 1.4951591491699219,
+      "learning_rate": 4.876114965312191e-06,
+      "loss": 1.1074,
+      "step": 153
+    },
+    {
+      "epoch": 0.27450980392156865,
+      "grad_norm": 1.5688201189041138,
+      "learning_rate": 4.871159563924679e-06,
+      "loss": 1.0981,
+      "step": 154
+    },
+    {
+      "epoch": 0.27629233511586454,
+      "grad_norm": 1.4888694286346436,
+      "learning_rate": 4.866204162537166e-06,
+      "loss": 1.1283,
+      "step": 155
+    },
+    {
+      "epoch": 0.27807486631016043,
+      "grad_norm": 1.724908471107483,
+      "learning_rate": 4.861248761149653e-06,
+      "loss": 1.1136,
+      "step": 156
+    },
+    {
+      "epoch": 0.2798573975044563,
+      "grad_norm": 1.3840643167495728,
+      "learning_rate": 4.8562933597621405e-06,
+      "loss": 1.112,
+      "step": 157
+    },
+    {
+      "epoch": 0.2816399286987522,
+      "grad_norm": 1.334119200706482,
+      "learning_rate": 4.8513379583746286e-06,
+      "loss": 1.1134,
+      "step": 158
+    },
+    {
+      "epoch": 0.28342245989304815,
+      "grad_norm": 1.3652615547180176,
+      "learning_rate": 4.846382556987117e-06,
+      "loss": 1.1386,
+      "step": 159
+    },
+    {
+      "epoch": 0.28520499108734404,
+      "grad_norm": 1.440026879310608,
+      "learning_rate": 4.841427155599604e-06,
+      "loss": 1.109,
+      "step": 160
+    },
+    {
+      "epoch": 0.28520499108734404,
+      "eval_loss": 1.1494263410568237,
+      "eval_runtime": 26.1128,
+      "eval_samples_per_second": 38.295,
+      "eval_steps_per_second": 2.413,
+      "step": 160
+    },
+    {
+      "epoch": 0.28698752228163993,
+      "grad_norm": 1.5376060009002686,
+      "learning_rate": 4.836471754212091e-06,
+      "loss": 1.0814,
+      "step": 161
+    },
+    {
+      "epoch": 0.2887700534759358,
+      "grad_norm": 1.6225935220718384,
+      "learning_rate": 4.831516352824579e-06,
+      "loss": 1.1231,
+      "step": 162
+    },
+    {
+      "epoch": 0.2905525846702317,
+      "grad_norm": 1.4330049753189087,
+      "learning_rate": 4.826560951437067e-06,
+      "loss": 1.1283,
+      "step": 163
+    },
+    {
+      "epoch": 0.29233511586452765,
+      "grad_norm": 1.3375277519226074,
+      "learning_rate": 4.8216055500495545e-06,
+      "loss": 1.1047,
+      "step": 164
+    },
+    {
+      "epoch": 0.29411764705882354,
+      "grad_norm": 1.572200059890747,
+      "learning_rate": 4.816650148662042e-06,
+      "loss": 1.0868,
+      "step": 165
+    },
+    {
+      "epoch": 0.29590017825311943,
+      "grad_norm": 1.292799472808838,
+      "learning_rate": 4.81169474727453e-06,
+      "loss": 1.0593,
+      "step": 166
+    },
+    {
+      "epoch": 0.2976827094474153,
+      "grad_norm": 1.3536690473556519,
+      "learning_rate": 4.806739345887017e-06,
+      "loss": 1.1251,
+      "step": 167
+    },
+    {
+      "epoch": 0.2994652406417112,
+      "grad_norm": 1.395337462425232,
+      "learning_rate": 4.801783944499504e-06,
+      "loss": 1.0859,
+      "step": 168
+    },
+    {
+      "epoch": 0.30124777183600715,
+      "grad_norm": 1.4418623447418213,
+      "learning_rate": 4.7968285431119924e-06,
+      "loss": 1.0895,
+      "step": 169
+    },
+    {
+      "epoch": 0.30303030303030304,
+      "grad_norm": 1.2470104694366455,
+      "learning_rate": 4.7918731417244805e-06,
+      "loss": 1.0945,
+      "step": 170
+    },
+    {
+      "epoch": 0.30303030303030304,
+      "eval_loss": 1.1404340267181396,
+      "eval_runtime": 25.7303,
+      "eval_samples_per_second": 38.865,
+      "eval_steps_per_second": 2.448,
+      "step": 170
+    },
+    {
+      "epoch": 0.3048128342245989,
+      "grad_norm": 1.446307897567749,
+      "learning_rate": 4.786917740336968e-06,
+      "loss": 1.11,
+      "step": 171
+    },
+    {
+      "epoch": 0.3065953654188948,
+      "grad_norm": 1.2429949045181274,
+      "learning_rate": 4.781962338949455e-06,
+      "loss": 1.0945,
+      "step": 172
+    },
+    {
+      "epoch": 0.3083778966131907,
+      "grad_norm": 1.4229985475540161,
+      "learning_rate": 4.777006937561943e-06,
+      "loss": 1.0857,
+      "step": 173
+    },
+    {
+      "epoch": 0.31016042780748665,
+      "grad_norm": 1.2821723222732544,
+      "learning_rate": 4.77205153617443e-06,
+      "loss": 1.1064,
+      "step": 174
+    },
+    {
+      "epoch": 0.31194295900178254,
+      "grad_norm": 1.298244595527649,
+      "learning_rate": 4.767096134786918e-06,
+      "loss": 1.0695,
+      "step": 175
+    },
+    {
+      "epoch": 0.3137254901960784,
+      "grad_norm": 1.521951675415039,
+      "learning_rate": 4.762140733399406e-06,
+      "loss": 1.1152,
+      "step": 176
+    },
+    {
+      "epoch": 0.3155080213903743,
+      "grad_norm": 1.2742416858673096,
+      "learning_rate": 4.757185332011893e-06,
+      "loss": 1.0942,
+      "step": 177
+    },
+    {
+      "epoch": 0.3172905525846702,
+      "grad_norm": 1.5329335927963257,
+      "learning_rate": 4.752229930624381e-06,
+      "loss": 1.0916,
+      "step": 178
+    },
+    {
+      "epoch": 0.31907308377896615,
+      "grad_norm": 1.3044357299804688,
+      "learning_rate": 4.747274529236869e-06,
+      "loss": 1.0597,
+      "step": 179
+    },
+    {
+      "epoch": 0.32085561497326204,
+      "grad_norm": 1.341720700263977,
+      "learning_rate": 4.742319127849356e-06,
+      "loss": 1.11,
+      "step": 180
+    },
+    {
+      "epoch": 0.32085561497326204,
+      "eval_loss": 1.1308393478393555,
+      "eval_runtime": 25.8438,
+      "eval_samples_per_second": 38.694,
+      "eval_steps_per_second": 2.438,
+      "step": 180
+    },
+    {
+      "epoch": 0.3226381461675579,
+      "grad_norm": 1.2242933511734009,
+      "learning_rate": 4.7373637264618435e-06,
+      "loss": 1.0626,
+      "step": 181
+    },
+    {
+      "epoch": 0.3244206773618538,
+      "grad_norm": 1.4313316345214844,
+      "learning_rate": 4.732408325074332e-06,
+      "loss": 1.0998,
+      "step": 182
+    },
+    {
+      "epoch": 0.32620320855614976,
+      "grad_norm": 1.3618254661560059,
+      "learning_rate": 4.727452923686819e-06,
+      "loss": 1.1033,
+      "step": 183
+    },
+    {
+      "epoch": 0.32798573975044565,
+      "grad_norm": 1.5201796293258667,
+      "learning_rate": 4.722497522299306e-06,
+      "loss": 1.1185,
+      "step": 184
+    },
+    {
+      "epoch": 0.32976827094474154,
+      "grad_norm": 1.5577552318572998,
+      "learning_rate": 4.717542120911794e-06,
+      "loss": 1.0953,
+      "step": 185
+    },
+    {
+      "epoch": 0.3315508021390374,
+      "grad_norm": 1.3261568546295166,
+      "learning_rate": 4.712586719524282e-06,
+      "loss": 1.1042,
+      "step": 186
+    },
+    {
+      "epoch": 0.3333333333333333,
+      "grad_norm": 1.4353320598602295,
+      "learning_rate": 4.7076313181367695e-06,
+      "loss": 1.0968,
+      "step": 187
+    },
+    {
+      "epoch": 0.33511586452762926,
+      "grad_norm": 1.4878215789794922,
+      "learning_rate": 4.702675916749257e-06,
+      "loss": 1.1125,
+      "step": 188
+    },
+    {
+      "epoch": 0.33689839572192515,
+      "grad_norm": 1.2873917818069458,
+      "learning_rate": 4.697720515361745e-06,
+      "loss": 1.0628,
+      "step": 189
+    },
+    {
+      "epoch": 0.33868092691622104,
+      "grad_norm": 1.4027594327926636,
+      "learning_rate": 4.692765113974233e-06,
+      "loss": 1.1129,
+      "step": 190
+    },
+    {
+      "epoch": 0.33868092691622104,
+      "eval_loss": 1.1256904602050781,
+      "eval_runtime": 25.7332,
+      "eval_samples_per_second": 38.86,
+      "eval_steps_per_second": 2.448,
+      "step": 190
+    },
+    {
+      "epoch": 0.3404634581105169,
+      "grad_norm": 1.5468748807907104,
+      "learning_rate": 4.68780971258672e-06,
+      "loss": 1.0634,
+      "step": 191
+    },
+    {
+      "epoch": 0.3422459893048128,
+      "grad_norm": 1.4828784465789795,
+      "learning_rate": 4.682854311199207e-06,
+      "loss": 1.0971,
+      "step": 192
+    },
+    {
+      "epoch": 0.34402852049910876,
+      "grad_norm": 1.4721124172210693,
+      "learning_rate": 4.677898909811695e-06,
+      "loss": 1.0492,
+      "step": 193
+    },
+    {
+      "epoch": 0.34581105169340465,
+      "grad_norm": 1.4724591970443726,
+      "learning_rate": 4.672943508424183e-06,
+      "loss": 1.0611,
+      "step": 194
+    },
+    {
+      "epoch": 0.34759358288770054,
+      "grad_norm": 1.3990216255187988,
+      "learning_rate": 4.667988107036671e-06,
+      "loss": 1.0506,
+      "step": 195
+    },
+    {
+      "epoch": 0.3493761140819964,
+      "grad_norm": 1.7127360105514526,
+      "learning_rate": 4.663032705649158e-06,
+      "loss": 1.0718,
+      "step": 196
+    },
+    {
+      "epoch": 0.3511586452762923,
+      "grad_norm": 1.5870146751403809,
+      "learning_rate": 4.658077304261645e-06,
+      "loss": 1.0626,
+      "step": 197
+    },
+    {
+      "epoch": 0.35294117647058826,
+      "grad_norm": 1.4044660329818726,
+      "learning_rate": 4.653121902874133e-06,
+      "loss": 1.0746,
+      "step": 198
+    },
+    {
+      "epoch": 0.35472370766488415,
+      "grad_norm": 1.7337034940719604,
+      "learning_rate": 4.648166501486621e-06,
+      "loss": 1.0813,
+      "step": 199
+    },
+    {
+      "epoch": 0.35650623885918004,
+      "grad_norm": 1.44723379611969,
+      "learning_rate": 4.643211100099108e-06,
+      "loss": 1.0551,
+      "step": 200
+    },
+    {
+      "epoch": 0.35650623885918004,
+      "eval_loss": 1.1240613460540771,
+      "eval_runtime": 25.8921,
+      "eval_samples_per_second": 38.622,
+      "eval_steps_per_second": 2.433,
+      "step": 200
+    },
+    {
+      "epoch": 0.3582887700534759,
+      "grad_norm": 2.0049657821655273,
+      "learning_rate": 4.638255698711596e-06,
+      "loss": 1.091,
+      "step": 201
+    },
+    {
+      "epoch": 0.3600713012477718,
+      "grad_norm": 1.4606605768203735,
+      "learning_rate": 4.633300297324084e-06,
+      "loss": 1.096,
+      "step": 202
+    },
+    {
+      "epoch": 0.36185383244206776,
+      "grad_norm": 1.8128238916397095,
+      "learning_rate": 4.628344895936571e-06,
+      "loss": 1.0471,
+      "step": 203
+    },
+    {
+      "epoch": 0.36363636363636365,
+      "grad_norm": 1.587044358253479,
+      "learning_rate": 4.6233894945490585e-06,
+      "loss": 1.065,
+      "step": 204
+    },
+    {
+      "epoch": 0.36541889483065954,
+      "grad_norm": 1.5733635425567627,
+      "learning_rate": 4.6184340931615466e-06,
+      "loss": 1.056,
+      "step": 205
+    },
+    {
+      "epoch": 0.3672014260249554,
+      "grad_norm": 1.5961697101593018,
+      "learning_rate": 4.613478691774035e-06,
+      "loss": 1.0592,
+      "step": 206
+    },
+    {
+      "epoch": 0.3689839572192513,
+      "grad_norm": 1.2846550941467285,
+      "learning_rate": 4.608523290386522e-06,
+      "loss": 1.0498,
+      "step": 207
+    },
+    {
+      "epoch": 0.37076648841354726,
+      "grad_norm": 1.4773404598236084,
+      "learning_rate": 4.603567888999009e-06,
+      "loss": 1.0675,
+      "step": 208
+    },
+    {
+      "epoch": 0.37254901960784315,
+      "grad_norm": 1.4184978008270264,
+      "learning_rate": 4.598612487611497e-06,
+      "loss": 1.0922,
+      "step": 209
+    },
+    {
+      "epoch": 0.37433155080213903,
+      "grad_norm": 1.4357984066009521,
+      "learning_rate": 4.5936570862239844e-06,
+      "loss": 1.0521,
+      "step": 210
+    },
+    {
+      "epoch": 0.37433155080213903,
+      "eval_loss": 1.1120160818099976,
+      "eval_runtime": 25.758,
+      "eval_samples_per_second": 38.823,
+      "eval_steps_per_second": 2.446,
+      "step": 210
+    },
+    {
+      "epoch": 0.3761140819964349,
+      "grad_norm": 1.5778534412384033,
+      "learning_rate": 4.5887016848364725e-06,
+      "loss": 1.0603,
+      "step": 211
+    },
+    {
+      "epoch": 0.3778966131907308,
+      "grad_norm": 1.4360918998718262,
+      "learning_rate": 4.58374628344896e-06,
+      "loss": 1.0796,
+      "step": 212
+    },
+    {
+      "epoch": 0.37967914438502676,
+      "grad_norm": 1.4689946174621582,
+      "learning_rate": 4.578790882061447e-06,
+      "loss": 1.0271,
+      "step": 213
+    },
+    {
+      "epoch": 0.38146167557932265,
+      "grad_norm": 1.4524282217025757,
+      "learning_rate": 4.573835480673935e-06,
+      "loss": 1.0676,
+      "step": 214
+    },
+    {
+      "epoch": 0.38324420677361853,
+      "grad_norm": 1.6127585172653198,
+      "learning_rate": 4.568880079286422e-06,
+      "loss": 1.0708,
+      "step": 215
+    },
+    {
+      "epoch": 0.3850267379679144,
+      "grad_norm": 1.4047455787658691,
+      "learning_rate": 4.5639246778989096e-06,
+      "loss": 1.0813,
+      "step": 216
+    },
+    {
+      "epoch": 0.3868092691622103,
+      "grad_norm": 1.584825873374939,
+      "learning_rate": 4.558969276511398e-06,
+      "loss": 1.0287,
+      "step": 217
+    },
+    {
+      "epoch": 0.38859180035650626,
+      "grad_norm": 1.5703797340393066,
+      "learning_rate": 4.554013875123886e-06,
+      "loss": 1.0687,
+      "step": 218
+    },
+    {
+      "epoch": 0.39037433155080214,
+      "grad_norm": 1.5290143489837646,
+      "learning_rate": 4.549058473736373e-06,
+      "loss": 1.0528,
+      "step": 219
+    },
+    {
+      "epoch": 0.39215686274509803,
+      "grad_norm": 1.6070911884307861,
+      "learning_rate": 4.54410307234886e-06,
+      "loss": 1.0436,
+      "step": 220
+    },
+    {
+      "epoch": 0.39215686274509803,
+      "eval_loss": 1.1070743799209595,
+      "eval_runtime": 25.9372,
+      "eval_samples_per_second": 38.555,
+      "eval_steps_per_second": 2.429,
+      "step": 220
+    },
+    {
+      "epoch": 0.3939393939393939,
+      "grad_norm": 1.534654140472412,
+      "learning_rate": 4.539147670961348e-06,
+      "loss": 1.0744,
+      "step": 221
+    },
+    {
+      "epoch": 0.39572192513368987,
+      "grad_norm": 1.484359622001648,
+      "learning_rate": 4.534192269573836e-06,
+      "loss": 1.0548,
+      "step": 222
+    },
+    {
+      "epoch": 0.39750445632798576,
+      "grad_norm": 1.5490672588348389,
+      "learning_rate": 4.529236868186324e-06,
+      "loss": 1.0438,
+      "step": 223
+    },
+    {
+      "epoch": 0.39928698752228164,
+      "grad_norm": 1.7006093263626099,
+      "learning_rate": 4.524281466798811e-06,
+      "loss": 1.0321,
+      "step": 224
+    },
+    {
+      "epoch": 0.40106951871657753,
+      "grad_norm": 1.3219202756881714,
+      "learning_rate": 4.519326065411299e-06,
+      "loss": 1.0247,
+      "step": 225
+    },
+    {
+      "epoch": 0.4028520499108734,
+      "grad_norm": 1.5470346212387085,
+      "learning_rate": 4.514370664023786e-06,
+      "loss": 1.0203,
+      "step": 226
+    },
+    {
+      "epoch": 0.40463458110516937,
+      "grad_norm": 1.4391487836837769,
+      "learning_rate": 4.509415262636274e-06,
+      "loss": 1.0277,
+      "step": 227
+    },
+    {
+      "epoch": 0.40641711229946526,
+      "grad_norm": 1.3691346645355225,
+      "learning_rate": 4.5044598612487615e-06,
+      "loss": 1.0307,
+      "step": 228
+    },
+    {
+      "epoch": 0.40819964349376114,
+      "grad_norm": 1.5090476274490356,
+      "learning_rate": 4.499504459861249e-06,
+      "loss": 1.06,
+      "step": 229
+    },
+    {
+      "epoch": 0.40998217468805703,
+      "grad_norm": 1.660611629486084,
+      "learning_rate": 4.494549058473737e-06,
+      "loss": 1.0729,
+      "step": 230
+    },
+    {
+      "epoch": 0.40998217468805703,
+      "eval_loss": 1.100784182548523,
+      "eval_runtime": 26.0384,
+      "eval_samples_per_second": 38.405,
+      "eval_steps_per_second": 2.42,
+      "step": 230
+    },
+    {
+      "epoch": 0.4117647058823529,
+      "grad_norm": 1.3682957887649536,
+      "learning_rate": 4.489593657086224e-06,
+      "loss": 1.0211,
+      "step": 231
+    },
+    {
+      "epoch": 0.41354723707664887,
+      "grad_norm": 1.5248552560806274,
+      "learning_rate": 4.484638255698711e-06,
+      "loss": 1.0649,
+      "step": 232
+    },
+    {
+      "epoch": 0.41532976827094475,
+      "grad_norm": 1.4119149446487427,
+      "learning_rate": 4.479682854311199e-06,
+      "loss": 1.0396,
+      "step": 233
+    },
+    {
+      "epoch": 0.41711229946524064,
+      "grad_norm": 1.4445548057556152,
+      "learning_rate": 4.4747274529236875e-06,
+      "loss": 1.0238,
+      "step": 234
+    },
+    {
+      "epoch": 0.41889483065953653,
+      "grad_norm": 1.6393500566482544,
+      "learning_rate": 4.469772051536175e-06,
+      "loss": 1.0405,
+      "step": 235
+    },
+    {
+      "epoch": 0.4206773618538324,
+      "grad_norm": 1.4636632204055786,
+      "learning_rate": 4.464816650148662e-06,
+      "loss": 1.0533,
+      "step": 236
+    },
+    {
+      "epoch": 0.42245989304812837,
+      "grad_norm": 1.408185601234436,
+      "learning_rate": 4.45986124876115e-06,
+      "loss": 1.0441,
+      "step": 237
+    },
+    {
+      "epoch": 0.42424242424242425,
+      "grad_norm": 1.5546650886535645,
+      "learning_rate": 4.454905847373638e-06,
+      "loss": 1.0259,
+      "step": 238
+    },
+    {
+      "epoch": 0.42602495543672014,
+      "grad_norm": 1.3773846626281738,
+      "learning_rate": 4.449950445986125e-06,
+      "loss": 1.0317,
+      "step": 239
+    },
+    {
+      "epoch": 0.42780748663101603,
+      "grad_norm": 1.6242173910140991,
+      "learning_rate": 4.444995044598613e-06,
+      "loss": 1.0572,
+      "step": 240
+    },
+    {
+      "epoch": 0.42780748663101603,
+      "eval_loss": 1.0950753688812256,
+      "eval_runtime": 25.98,
+      "eval_samples_per_second": 38.491,
+      "eval_steps_per_second": 2.425,
+      "step": 240
+    },
+    {
+      "epoch": 0.4295900178253119,
+      "grad_norm": 1.510955572128296,
+      "learning_rate": 4.440039643211101e-06,
+      "loss": 1.0487,
+      "step": 241
+    },
+    {
+      "epoch": 0.43137254901960786,
+      "grad_norm": 1.5870273113250732,
+      "learning_rate": 4.435084241823588e-06,
+      "loss": 1.0126,
+      "step": 242
+    },
+    {
+      "epoch": 0.43315508021390375,
+      "grad_norm": 1.3809922933578491,
+      "learning_rate": 4.430128840436076e-06,
+      "loss": 1.0525,
+      "step": 243
+    },
+    {
+      "epoch": 0.43493761140819964,
+      "grad_norm": 1.558537244796753,
+      "learning_rate": 4.425173439048563e-06,
+      "loss": 1.0421,
+      "step": 244
+    },
+    {
+      "epoch": 0.43672014260249553,
+      "grad_norm": 1.4023057222366333,
+      "learning_rate": 4.420218037661051e-06,
+      "loss": 1.0897,
+      "step": 245
+    },
+    {
+      "epoch": 0.4385026737967914,
+      "grad_norm": 1.4013497829437256,
+      "learning_rate": 4.415262636273539e-06,
+      "loss": 1.0418,
+      "step": 246
+    },
+    {
+      "epoch": 0.44028520499108736,
+      "grad_norm": 1.672249674797058,
+      "learning_rate": 4.410307234886026e-06,
+      "loss": 1.045,
+      "step": 247
+    },
+    {
+      "epoch": 0.44206773618538325,
+      "grad_norm": 1.451650857925415,
+      "learning_rate": 4.405351833498513e-06,
+      "loss": 1.0193,
+      "step": 248
+    },
+    {
+      "epoch": 0.44385026737967914,
+      "grad_norm": 1.5913277864456177,
+      "learning_rate": 4.400396432111001e-06,
+      "loss": 1.0835,
+      "step": 249
+    },
+    {
+      "epoch": 0.44563279857397503,
+      "grad_norm": 1.5107896327972412,
+      "learning_rate": 4.395441030723489e-06,
+      "loss": 1.0468,
+      "step": 250
+    },
+    {
+      "epoch": 0.44563279857397503,
+      "eval_loss": 1.0920634269714355,
+      "eval_runtime": 25.9492,
+      "eval_samples_per_second": 38.537,
+      "eval_steps_per_second": 2.428,
+      "step": 250
+    },
+    {
+      "epoch": 0.4474153297682709,
+      "grad_norm": 1.3713324069976807,
+      "learning_rate": 4.3904856293359765e-06,
+      "loss": 1.011,
+      "step": 251
+    },
+    {
+      "epoch": 0.44919786096256686,
+      "grad_norm": 1.429887294769287,
+      "learning_rate": 4.385530227948464e-06,
+      "loss": 1.0337,
+      "step": 252
+    },
+    {
+      "epoch": 0.45098039215686275,
+      "grad_norm": 1.4633594751358032,
+      "learning_rate": 4.380574826560952e-06,
+      "loss": 1.0564,
+      "step": 253
+    },
+    {
+      "epoch": 0.45276292335115864,
+      "grad_norm": 1.4649094343185425,
+      "learning_rate": 4.37561942517344e-06,
+      "loss": 1.0152,
+      "step": 254
+    },
+    {
+      "epoch": 0.45454545454545453,
+      "grad_norm": 1.3923169374465942,
+      "learning_rate": 4.370664023785927e-06,
+      "loss": 1.049,
+      "step": 255
+    },
+    {
+      "epoch": 0.4563279857397504,
+      "grad_norm": 1.3575490713119507,
+      "learning_rate": 4.365708622398414e-06,
+      "loss": 1.0206,
+      "step": 256
+    },
+    {
+      "epoch": 0.45811051693404636,
+      "grad_norm": 1.5358511209487915,
+      "learning_rate": 4.3607532210109024e-06,
+      "loss": 1.0049,
+      "step": 257
+    },
+    {
+      "epoch": 0.45989304812834225,
+      "grad_norm": 1.4974987506866455,
+      "learning_rate": 4.35579781962339e-06,
+      "loss": 1.0512,
+      "step": 258
+    },
+    {
+      "epoch": 0.46167557932263814,
+      "grad_norm": 1.3226289749145508,
+      "learning_rate": 4.350842418235878e-06,
+      "loss": 1.0149,
+      "step": 259
+    },
+    {
+      "epoch": 0.46345811051693403,
+      "grad_norm": 1.4186517000198364,
+      "learning_rate": 4.345887016848365e-06,
+      "loss": 1.0451,
+      "step": 260
+    },
+    {
+      "epoch": 0.46345811051693403,
+      "eval_loss": 1.0856417417526245,
+      "eval_runtime": 25.8374,
+      "eval_samples_per_second": 38.704,
+      "eval_steps_per_second": 2.438,
+      "step": 260
+    },
+    {
+      "epoch": 0.46524064171123,
+      "grad_norm": 1.5120983123779297,
+      "learning_rate": 4.340931615460853e-06,
+      "loss": 1.0525,
+      "step": 261
+    },
+    {
+      "epoch": 0.46702317290552586,
+      "grad_norm": 1.4796828031539917,
+      "learning_rate": 4.33597621407334e-06,
+      "loss": 1.0412,
+      "step": 262
+    },
+    {
+      "epoch": 0.46880570409982175,
+      "grad_norm": 1.4043370485305786,
+      "learning_rate": 4.3310208126858276e-06,
+      "loss": 1.0352,
+      "step": 263
+    },
+    {
+      "epoch": 0.47058823529411764,
+      "grad_norm": 1.454528570175171,
+      "learning_rate": 4.326065411298316e-06,
+      "loss": 1.0432,
+      "step": 264
+    },
+    {
+      "epoch": 0.47237076648841353,
+      "grad_norm": 1.3608475923538208,
+      "learning_rate": 4.321110009910804e-06,
+      "loss": 1.0538,
+      "step": 265
+    },
+    {
+      "epoch": 0.4741532976827095,
+      "grad_norm": 1.4900518655776978,
+      "learning_rate": 4.316154608523291e-06,
+      "loss": 1.0163,
+      "step": 266
+    },
+    {
+      "epoch": 0.47593582887700536,
+      "grad_norm": 1.5197362899780273,
+      "learning_rate": 4.311199207135778e-06,
+      "loss": 1.0483,
+      "step": 267
+    },
+    {
+      "epoch": 0.47771836007130125,
+      "grad_norm": 1.4927443265914917,
+      "learning_rate": 4.3062438057482654e-06,
+      "loss": 1.0171,
+      "step": 268
+    },
+    {
+      "epoch": 0.47950089126559714,
+      "grad_norm": 1.779068946838379,
+      "learning_rate": 4.3012884043607535e-06,
+      "loss": 1.0583,
+      "step": 269
+    },
+    {
+      "epoch": 0.48128342245989303,
+      "grad_norm": 1.5883512496948242,
+      "learning_rate": 4.296333002973242e-06,
+      "loss": 1.052,
+      "step": 270
+    },
+    {
+      "epoch": 0.48128342245989303,
+      "eval_loss": 1.0838474035263062,
+      "eval_runtime": 26.0374,
+      "eval_samples_per_second": 38.406,
+      "eval_steps_per_second": 2.42,
+      "step": 270
+    },
+    {
+      "epoch": 0.483065953654189,
+      "grad_norm": 1.470141887664795,
+      "learning_rate": 4.291377601585729e-06,
+      "loss": 0.9976,
+      "step": 271
+    },
+    {
+      "epoch": 0.48484848484848486,
+      "grad_norm": 1.4511467218399048,
+      "learning_rate": 4.286422200198216e-06,
+      "loss": 1.0318,
+      "step": 272
+    },
+    {
+      "epoch": 0.48663101604278075,
+      "grad_norm": 1.396666169166565,
+      "learning_rate": 4.281466798810704e-06,
+      "loss": 1.0177,
+      "step": 273
+    },
+    {
+      "epoch": 0.48841354723707664,
+      "grad_norm": 1.4303821325302124,
+      "learning_rate": 4.276511397423191e-06,
+      "loss": 1.0385,
+      "step": 274
+    },
+    {
+      "epoch": 0.49019607843137253,
+      "grad_norm": 1.5581824779510498,
+      "learning_rate": 4.2715559960356795e-06,
+      "loss": 1.0388,
+      "step": 275
+    },
+    {
+      "epoch": 0.4919786096256685,
+      "grad_norm": 1.495390772819519,
+      "learning_rate": 4.266600594648167e-06,
+      "loss": 1.0154,
+      "step": 276
+    },
+    {
+      "epoch": 0.49376114081996436,
+      "grad_norm": 1.3809154033660889,
+      "learning_rate": 4.261645193260655e-06,
+      "loss": 1.0157,
+      "step": 277
+    },
+    {
+      "epoch": 0.49554367201426025,
+      "grad_norm": 1.3862433433532715,
+      "learning_rate": 4.256689791873142e-06,
+      "loss": 1.0113,
+      "step": 278
+    },
+    {
+      "epoch": 0.49732620320855614,
+      "grad_norm": 1.477658748626709,
+      "learning_rate": 4.251734390485629e-06,
+      "loss": 1.001,
+      "step": 279
+    },
+    {
+      "epoch": 0.49910873440285203,
+      "grad_norm": 1.4587225914001465,
+      "learning_rate": 4.246778989098117e-06,
+      "loss": 1.0357,
+      "step": 280
+    },
+    {
+      "epoch": 0.49910873440285203,
+      "eval_loss": 1.0841108560562134,
+      "eval_runtime": 25.711,
+      "eval_samples_per_second": 38.894,
+      "eval_steps_per_second": 2.45,
+      "step": 280
+    },
+    {
+      "epoch": 0.5008912655971479,
+      "grad_norm": 1.6233290433883667,
+      "learning_rate": 4.2418235877106055e-06,
+      "loss": 1.0157,
+      "step": 281
+    },
+    {
+      "epoch": 0.5026737967914439,
+      "grad_norm": 1.6783347129821777,
+      "learning_rate": 4.236868186323093e-06,
+      "loss": 0.9985,
+      "step": 282
+    },
+    {
+      "epoch": 0.5044563279857398,
+      "grad_norm": 1.5126416683197021,
+      "learning_rate": 4.23191278493558e-06,
+      "loss": 1.0632,
+      "step": 283
+    },
+    {
+      "epoch": 0.5062388591800356,
+      "grad_norm": 1.4921975135803223,
+      "learning_rate": 4.226957383548068e-06,
+      "loss": 1.0335,
+      "step": 284
+    },
+    {
+      "epoch": 0.5080213903743316,
+      "grad_norm": 1.457830548286438,
+      "learning_rate": 4.222001982160555e-06,
+      "loss": 1.049,
+      "step": 285
+    },
+    {
+      "epoch": 0.5098039215686274,
+      "grad_norm": 1.5393613576889038,
+      "learning_rate": 4.217046580773043e-06,
+      "loss": 1.0711,
+      "step": 286
+    },
+    {
+      "epoch": 0.5115864527629234,
+      "grad_norm": 1.4901129007339478,
+      "learning_rate": 4.212091179385531e-06,
+      "loss": 0.9559,
+      "step": 287
+    },
+    {
+      "epoch": 0.5133689839572193,
+      "grad_norm": 1.6378134489059448,
+      "learning_rate": 4.207135777998018e-06,
+      "loss": 1.0415,
+      "step": 288
+    },
+    {
+      "epoch": 0.5151515151515151,
+      "grad_norm": 1.7938212156295776,
+      "learning_rate": 4.202180376610506e-06,
+      "loss": 1.0231,
+      "step": 289
+    },
+    {
+      "epoch": 0.5169340463458111,
+      "grad_norm": 1.424949288368225,
+      "learning_rate": 4.197224975222993e-06,
+      "loss": 1.0148,
+      "step": 290
+    },
+    {
+      "epoch": 0.5169340463458111,
+      "eval_loss": 1.078644037246704,
+      "eval_runtime": 25.9288,
+      "eval_samples_per_second": 38.567,
+      "eval_steps_per_second": 2.43,
+      "step": 290
+    },
+    {
+      "epoch": 0.5187165775401069,
+      "grad_norm": 1.4257686138153076,
+      "learning_rate": 4.192269573835481e-06,
+      "loss": 1.015,
+      "step": 291
+    },
+    {
+      "epoch": 0.5204991087344029,
+      "grad_norm": 1.4987152814865112,
+      "learning_rate": 4.1873141724479685e-06,
+      "loss": 1.026,
+      "step": 292
+    },
+    {
+      "epoch": 0.5222816399286988,
+      "grad_norm": 1.380038857460022,
+      "learning_rate": 4.1823587710604566e-06,
+      "loss": 1.032,
+      "step": 293
+    },
+    {
+      "epoch": 0.5240641711229946,
+      "grad_norm": 1.421718955039978,
+      "learning_rate": 4.177403369672944e-06,
+      "loss": 0.9999,
+      "step": 294
+    },
+    {
+      "epoch": 0.5258467023172906,
+      "grad_norm": 1.524937391281128,
+      "learning_rate": 4.172447968285431e-06,
+      "loss": 1.0144,
+      "step": 295
+    },
+    {
+      "epoch": 0.5276292335115864,
+      "grad_norm": 1.4932767152786255,
+      "learning_rate": 4.167492566897919e-06,
+      "loss": 1.011,
+      "step": 296
+    },
+    {
+      "epoch": 0.5294117647058824,
+      "grad_norm": 1.4470241069793701,
+      "learning_rate": 4.162537165510407e-06,
+      "loss": 1.0085,
+      "step": 297
+    },
+    {
+      "epoch": 0.5311942959001783,
+      "grad_norm": 1.337119460105896,
+      "learning_rate": 4.1575817641228945e-06,
+      "loss": 1.0414,
+      "step": 298
+    },
+    {
+      "epoch": 0.5329768270944741,
+      "grad_norm": 1.4251195192337036,
+      "learning_rate": 4.152626362735382e-06,
+      "loss": 1.0166,
+      "step": 299
+    },
+    {
+      "epoch": 0.5347593582887701,
+      "grad_norm": 1.4127418994903564,
+      "learning_rate": 4.14767096134787e-06,
+      "loss": 1.0327,
+      "step": 300
+    },
+    {
+      "epoch": 0.5347593582887701,
+      "eval_loss": 1.0737465620040894,
+      "eval_runtime": 25.8837,
+      "eval_samples_per_second": 38.634,
+      "eval_steps_per_second": 2.434,
+      "step": 300
+    },
+    {
+      "epoch": 0.5365418894830659,
+      "grad_norm": 1.32978355884552,
+      "learning_rate": 4.142715559960357e-06,
+      "loss": 1.0267,
+      "step": 301
+    },
+    {
+      "epoch": 0.5383244206773619,
+      "grad_norm": 1.5129624605178833,
+      "learning_rate": 4.137760158572845e-06,
+      "loss": 1.052,
+      "step": 302
+    },
+    {
+      "epoch": 0.5401069518716578,
+      "grad_norm": 1.4814997911453247,
+      "learning_rate": 4.132804757185332e-06,
+      "loss": 1.0316,
+      "step": 303
+    },
+    {
+      "epoch": 0.5418894830659536,
+      "grad_norm": 1.494928240776062,
+      "learning_rate": 4.12784935579782e-06,
+      "loss": 1.0099,
+      "step": 304
+    },
+    {
+      "epoch": 0.5436720142602496,
+      "grad_norm": 1.471745491027832,
+      "learning_rate": 4.122893954410308e-06,
+      "loss": 1.0483,
+      "step": 305
+    },
+    {
+      "epoch": 0.5454545454545454,
+      "grad_norm": 1.4592864513397217,
+      "learning_rate": 4.117938553022795e-06,
+      "loss": 0.9876,
+      "step": 306
+    },
+    {
+      "epoch": 0.5472370766488414,
+      "grad_norm": 1.4757778644561768,
+      "learning_rate": 4.112983151635283e-06,
+      "loss": 1.0074,
+      "step": 307
+    },
+    {
+      "epoch": 0.5490196078431373,
+      "grad_norm": 1.50422203540802,
+      "learning_rate": 4.10802775024777e-06,
+      "loss": 1.0246,
+      "step": 308
+    },
+    {
+      "epoch": 0.5508021390374331,
+      "grad_norm": 1.3828375339508057,
+      "learning_rate": 4.103072348860258e-06,
+      "loss": 1.0216,
+      "step": 309
+    },
+    {
+      "epoch": 0.5525846702317291,
+      "grad_norm": 1.385804295539856,
+      "learning_rate": 4.0981169474727456e-06,
+      "loss": 1.0072,
+      "step": 310
+    },
+    {
+      "epoch": 0.5525846702317291,
+      "eval_loss": 1.0722776651382446,
+      "eval_runtime": 25.7006,
+      "eval_samples_per_second": 38.91,
+      "eval_steps_per_second": 2.451,
+      "step": 310
+    },
+    {
+      "epoch": 0.5543672014260249,
+      "grad_norm": 1.4588433504104614,
+      "learning_rate": 4.093161546085233e-06,
+      "loss": 1.0031,
+      "step": 311
+    },
+    {
+      "epoch": 0.5561497326203209,
+      "grad_norm": 1.4075288772583008,
+      "learning_rate": 4.088206144697721e-06,
+      "loss": 1.013,
+      "step": 312
+    },
+    {
+      "epoch": 0.5579322638146168,
+      "grad_norm": 1.495343804359436,
+      "learning_rate": 4.083250743310209e-06,
+      "loss": 1.005,
+      "step": 313
+    },
+    {
+      "epoch": 0.5597147950089126,
+      "grad_norm": 1.4959484338760376,
+      "learning_rate": 4.078295341922696e-06,
+      "loss": 0.989,
+      "step": 314
+    },
+    {
+      "epoch": 0.5614973262032086,
+      "grad_norm": 1.5026849508285522,
+      "learning_rate": 4.0733399405351834e-06,
+      "loss": 0.9741,
+      "step": 315
+    },
+    {
+      "epoch": 0.5632798573975044,
+      "grad_norm": 1.5046217441558838,
+      "learning_rate": 4.0683845391476715e-06,
+      "loss": 1.0406,
+      "step": 316
+    },
+    {
+      "epoch": 0.5650623885918004,
+      "grad_norm": 1.3469237089157104,
+      "learning_rate": 4.063429137760159e-06,
+      "loss": 1.0313,
+      "step": 317
+    },
+    {
+      "epoch": 0.5668449197860963,
+      "grad_norm": 1.5199836492538452,
+      "learning_rate": 4.058473736372647e-06,
+      "loss": 1.0091,
+      "step": 318
+    },
+    {
+      "epoch": 0.5686274509803921,
+      "grad_norm": 1.3804373741149902,
+      "learning_rate": 4.053518334985134e-06,
+      "loss": 1.0049,
+      "step": 319
+    },
+    {
+      "epoch": 0.5704099821746881,
+      "grad_norm": 1.4723666906356812,
+      "learning_rate": 4.048562933597622e-06,
+      "loss": 1.0306,
+      "step": 320
+    },
+    {
+      "epoch": 0.5704099821746881,
+      "eval_loss": 1.0686110258102417,
+      "eval_runtime": 25.8098,
+      "eval_samples_per_second": 38.745,
+      "eval_steps_per_second": 2.441,
+      "step": 320
+    },
+    {
+      "epoch": 0.5721925133689839,
+      "grad_norm": 1.493898630142212,
+      "learning_rate": 4.043607532210109e-06,
+      "loss": 0.9967,
+      "step": 321
+    },
+    {
+      "epoch": 0.5739750445632799,
+      "grad_norm": 1.5348026752471924,
+      "learning_rate": 4.038652130822597e-06,
+      "loss": 0.9936,
+      "step": 322
+    },
+    {
+      "epoch": 0.5757575757575758,
+      "grad_norm": 1.5064849853515625,
+      "learning_rate": 4.033696729435085e-06,
+      "loss": 0.9789,
+      "step": 323
+    },
+    {
+      "epoch": 0.5775401069518716,
+      "grad_norm": 1.5011988878250122,
+      "learning_rate": 4.028741328047572e-06,
+      "loss": 1.0053,
+      "step": 324
+    },
+    {
+      "epoch": 0.5793226381461676,
+      "grad_norm": 1.689141035079956,
+      "learning_rate": 4.02378592666006e-06,
+      "loss": 0.9868,
+      "step": 325
+    },
+    {
+      "epoch": 0.5811051693404634,
+      "grad_norm": 1.4876413345336914,
+      "learning_rate": 4.018830525272547e-06,
+      "loss": 1.0328,
+      "step": 326
+    },
+    {
+      "epoch": 0.5828877005347594,
+      "grad_norm": 1.6670453548431396,
+      "learning_rate": 4.0138751238850345e-06,
+      "loss": 1.0149,
+      "step": 327
+    },
+    {
+      "epoch": 0.5846702317290553,
+      "grad_norm": 1.6840277910232544,
+      "learning_rate": 4.008919722497523e-06,
+      "loss": 1.0407,
+      "step": 328
+    },
+    {
+      "epoch": 0.5864527629233511,
+      "grad_norm": 1.5689325332641602,
+      "learning_rate": 4.003964321110011e-06,
+      "loss": 0.9809,
+      "step": 329
+    },
+    {
+      "epoch": 0.5882352941176471,
+      "grad_norm": 1.5473581552505493,
+      "learning_rate": 3.999008919722498e-06,
+      "loss": 0.9578,
+      "step": 330
+    },
+    {
+      "epoch": 0.5882352941176471,
+      "eval_loss": 1.0670527219772339,
+      "eval_runtime": 25.7616,
+      "eval_samples_per_second": 38.818,
+      "eval_steps_per_second": 2.446,
+      "step": 330
+    },
+    {
+      "epoch": 0.5900178253119429,
+      "grad_norm": 1.6675747632980347,
+      "learning_rate": 3.994053518334985e-06,
+      "loss": 1.0227,
+      "step": 331
+    },
+    {
+      "epoch": 0.5918003565062389,
+      "grad_norm": 1.5776814222335815,
+      "learning_rate": 3.989098116947473e-06,
+      "loss": 1.0151,
+      "step": 332
+    },
+    {
+      "epoch": 0.5935828877005348,
+      "grad_norm": 1.6494024991989136,
+      "learning_rate": 3.9841427155599605e-06,
+      "loss": 0.9756,
+      "step": 333
+    },
+    {
+      "epoch": 0.5953654188948306,
+      "grad_norm": 1.7824499607086182,
+      "learning_rate": 3.979187314172449e-06,
+      "loss": 1.0147,
+      "step": 334
+    },
+    {
+      "epoch": 0.5971479500891266,
+      "grad_norm": 1.4496989250183105,
+      "learning_rate": 3.974231912784936e-06,
+      "loss": 1.0363,
+      "step": 335
+    },
+    {
+      "epoch": 0.5989304812834224,
+      "grad_norm": 1.5654771327972412,
+      "learning_rate": 3.969276511397424e-06,
+      "loss": 1.0148,
+      "step": 336
+    },
+    {
+      "epoch": 0.6007130124777184,
+      "grad_norm": 1.6020286083221436,
+      "learning_rate": 3.964321110009911e-06,
+      "loss": 0.9781,
+      "step": 337
+    },
+    {
+      "epoch": 0.6024955436720143,
+      "grad_norm": 1.539452314376831,
+      "learning_rate": 3.959365708622398e-06,
+      "loss": 0.9959,
+      "step": 338
+    },
+    {
+      "epoch": 0.6042780748663101,
+      "grad_norm": 1.436840534210205,
+      "learning_rate": 3.9544103072348865e-06,
+      "loss": 1.0092,
+      "step": 339
+    },
+    {
+      "epoch": 0.6060606060606061,
+      "grad_norm": 1.5760927200317383,
+      "learning_rate": 3.949454905847374e-06,
+      "loss": 1.0148,
+      "step": 340
+    },
+    {
+      "epoch": 0.6060606060606061,
+      "eval_loss": 1.064415693283081,
+      "eval_runtime": 25.7611,
+      "eval_samples_per_second": 38.818,
+      "eval_steps_per_second": 2.446,
+      "step": 340
+    },
+    {
+      "epoch": 0.6078431372549019,
+      "grad_norm": 1.603229284286499,
+      "learning_rate": 3.944499504459862e-06,
+      "loss": 1.0011,
+      "step": 341
+    },
+    {
+      "epoch": 0.6096256684491979,
+      "grad_norm": 1.5881043672561646,
+      "learning_rate": 3.939544103072349e-06,
+      "loss": 1.0572,
+      "step": 342
+    },
+    {
+      "epoch": 0.6114081996434938,
+      "grad_norm": 1.5322010517120361,
+      "learning_rate": 3.934588701684836e-06,
+      "loss": 1.0401,
+      "step": 343
+    },
+    {
+      "epoch": 0.6131907308377896,
+      "grad_norm": 1.4483110904693604,
+      "learning_rate": 3.929633300297324e-06,
+      "loss": 1.0,
+      "step": 344
+    },
+    {
+      "epoch": 0.6149732620320856,
+      "grad_norm": 1.4452067613601685,
+      "learning_rate": 3.9246778989098124e-06,
+      "loss": 0.9962,
+      "step": 345
+    },
+    {
+      "epoch": 0.6167557932263814,
+      "grad_norm": 1.4547947645187378,
+      "learning_rate": 3.9197224975223e-06,
+      "loss": 0.9923,
+      "step": 346
+    },
+    {
+      "epoch": 0.6185383244206774,
+      "grad_norm": 1.5050512552261353,
+      "learning_rate": 3.914767096134787e-06,
+      "loss": 0.9867,
+      "step": 347
+    },
+    {
+      "epoch": 0.6203208556149733,
+      "grad_norm": 1.4761282205581665,
+      "learning_rate": 3.909811694747275e-06,
+      "loss": 1.023,
+      "step": 348
+    },
+    {
+      "epoch": 0.6221033868092691,
+      "grad_norm": 1.5492281913757324,
+      "learning_rate": 3.904856293359762e-06,
+      "loss": 1.003,
+      "step": 349
+    },
+    {
+      "epoch": 0.6238859180035651,
+      "grad_norm": 1.4964330196380615,
+      "learning_rate": 3.89990089197225e-06,
+      "loss": 0.981,
+      "step": 350
+    },
+    {
+      "epoch": 0.6238859180035651,
+      "eval_loss": 1.0592336654663086,
+      "eval_runtime": 25.7609,
+      "eval_samples_per_second": 38.819,
+      "eval_steps_per_second": 2.446,
+      "step": 350
+    },
+    {
+      "epoch": 0.6256684491978609,
+      "grad_norm": 1.416474461555481,
+      "learning_rate": 3.8949454905847376e-06,
+      "loss": 0.9831,
+      "step": 351
+    },
+    {
+      "epoch": 0.6274509803921569,
+      "grad_norm": 1.5389848947525024,
+      "learning_rate": 3.889990089197226e-06,
+      "loss": 1.0028,
+      "step": 352
+    },
+    {
+      "epoch": 0.6292335115864528,
+      "grad_norm": 1.4282304048538208,
+      "learning_rate": 3.885034687809713e-06,
+      "loss": 0.9946,
+      "step": 353
+    },
+    {
+      "epoch": 0.6310160427807486,
+      "grad_norm": 1.5408443212509155,
+      "learning_rate": 3.8800792864222e-06,
+      "loss": 0.9994,
+      "step": 354
+    },
+    {
+      "epoch": 0.6327985739750446,
+      "grad_norm": 1.5303212404251099,
+      "learning_rate": 3.875123885034688e-06,
+      "loss": 0.993,
+      "step": 355
+    },
+    {
+      "epoch": 0.6345811051693404,
+      "grad_norm": 1.50641930103302,
+      "learning_rate": 3.870168483647176e-06,
+      "loss": 0.9956,
+      "step": 356
+    },
+    {
+      "epoch": 0.6363636363636364,
+      "grad_norm": 1.5365883111953735,
+      "learning_rate": 3.8652130822596635e-06,
+      "loss": 0.9948,
+      "step": 357
+    },
+    {
+      "epoch": 0.6381461675579323,
+      "grad_norm": 1.6365007162094116,
+      "learning_rate": 3.860257680872151e-06,
+      "loss": 0.9991,
+      "step": 358
+    },
+    {
+      "epoch": 0.6399286987522281,
+      "grad_norm": 1.6538429260253906,
+      "learning_rate": 3.855302279484638e-06,
+      "loss": 1.0,
+      "step": 359
+    },
+    {
+      "epoch": 0.6417112299465241,
+      "grad_norm": 1.5091899633407593,
+      "learning_rate": 3.850346878097126e-06,
+      "loss": 1.0018,
+      "step": 360
+    },
+    {
+      "epoch": 0.6417112299465241,
+      "eval_loss": 1.057335376739502,
+      "eval_runtime": 25.5877,
+      "eval_samples_per_second": 39.081,
+      "eval_steps_per_second": 2.462,
+      "step": 360
+    },
+    {
+      "epoch": 0.64349376114082,
+      "grad_norm": 1.4903072118759155,
+      "learning_rate": 3.845391476709614e-06,
+      "loss": 1.0115,
+      "step": 361
+    },
+    {
+      "epoch": 0.6452762923351159,
+      "grad_norm": 1.663288950920105,
+      "learning_rate": 3.8404360753221014e-06,
+      "loss": 0.9848,
+      "step": 362
+    },
+    {
+      "epoch": 0.6470588235294118,
+      "grad_norm": 1.6192997694015503,
+      "learning_rate": 3.835480673934589e-06,
+      "loss": 1.0114,
+      "step": 363
+    },
+    {
+      "epoch": 0.6488413547237076,
+      "grad_norm": 1.487221360206604,
+      "learning_rate": 3.830525272547077e-06,
+      "loss": 0.9734,
+      "step": 364
+    },
+    {
+      "epoch": 0.6506238859180036,
+      "grad_norm": 1.6606189012527466,
+      "learning_rate": 3.825569871159564e-06,
+      "loss": 1.012,
+      "step": 365
+    },
+    {
+      "epoch": 0.6524064171122995,
+      "grad_norm": 1.5607527494430542,
+      "learning_rate": 3.820614469772052e-06,
+      "loss": 0.9784,
+      "step": 366
+    },
+    {
+      "epoch": 0.6541889483065954,
+      "grad_norm": 1.5367454290390015,
+      "learning_rate": 3.815659068384539e-06,
+      "loss": 0.9965,
+      "step": 367
+    },
+    {
+      "epoch": 0.6559714795008913,
+      "grad_norm": 1.595745325088501,
+      "learning_rate": 3.810703666997027e-06,
+      "loss": 1.0008,
+      "step": 368
+    },
+    {
+      "epoch": 0.6577540106951871,
+      "grad_norm": 1.6348439455032349,
+      "learning_rate": 3.8057482656095146e-06,
+      "loss": 1.0033,
+      "step": 369
+    },
+    {
+      "epoch": 0.6595365418894831,
+      "grad_norm": 1.5682718753814697,
+      "learning_rate": 3.8007928642220023e-06,
+      "loss": 0.9946,
+      "step": 370
+    },
+    {
+      "epoch": 0.6595365418894831,
+      "eval_loss": 1.0564712285995483,
+      "eval_runtime": 25.6509,
+      "eval_samples_per_second": 38.985,
+      "eval_steps_per_second": 2.456,
+      "step": 370
+    },
+    {
+      "epoch": 0.661319073083779,
+      "grad_norm": 1.650360107421875,
+      "learning_rate": 3.79583746283449e-06,
+      "loss": 1.0033,
+      "step": 371
+    },
+    {
+      "epoch": 0.6631016042780749,
+      "grad_norm": 1.5877972841262817,
+      "learning_rate": 3.7908820614469776e-06,
+      "loss": 1.0059,
+      "step": 372
+    },
+    {
+      "epoch": 0.6648841354723708,
+      "grad_norm": 1.4824687242507935,
+      "learning_rate": 3.7859266600594653e-06,
+      "loss": 0.9776,
+      "step": 373
+    },
+    {
+      "epoch": 0.6666666666666666,
+      "grad_norm": 1.5661587715148926,
+      "learning_rate": 3.7809712586719525e-06,
+      "loss": 0.9608,
+      "step": 374
+    },
+    {
+      "epoch": 0.6684491978609626,
+      "grad_norm": 1.5237559080123901,
+      "learning_rate": 3.77601585728444e-06,
+      "loss": 0.9967,
+      "step": 375
+    },
+    {
+      "epoch": 0.6702317290552585,
+      "grad_norm": 1.42563796043396,
+      "learning_rate": 3.771060455896928e-06,
+      "loss": 1.0168,
+      "step": 376
+    },
+    {
+      "epoch": 0.6720142602495544,
+      "grad_norm": 1.4831016063690186,
+      "learning_rate": 3.766105054509416e-06,
+      "loss": 1.0032,
+      "step": 377
+    },
+    {
+      "epoch": 0.6737967914438503,
+      "grad_norm": 1.4618051052093506,
+      "learning_rate": 3.761149653121903e-06,
+      "loss": 0.9715,
+      "step": 378
+    },
+    {
+      "epoch": 0.6755793226381461,
+      "grad_norm": 1.5617003440856934,
+      "learning_rate": 3.756194251734391e-06,
+      "loss": 0.9921,
+      "step": 379
+    },
+    {
+      "epoch": 0.6773618538324421,
+      "grad_norm": 1.4664802551269531,
+      "learning_rate": 3.7512388503468785e-06,
+      "loss": 0.9886,
+      "step": 380
+    },
+    {
+      "epoch": 0.6773618538324421,
+      "eval_loss": 1.0536266565322876,
+      "eval_runtime": 25.8266,
+      "eval_samples_per_second": 38.72,
+      "eval_steps_per_second": 2.439,
+      "step": 380
+    },
+    {
+      "epoch": 0.679144385026738,
+      "grad_norm": 1.4738689661026,
+      "learning_rate": 3.7462834489593657e-06,
+      "loss": 0.9894,
+      "step": 381
+    },
+    {
+      "epoch": 0.6809269162210339,
+      "grad_norm": 1.4868427515029907,
+      "learning_rate": 3.741328047571854e-06,
+      "loss": 0.9855,
+      "step": 382
+    },
+    {
+      "epoch": 0.6827094474153298,
+      "grad_norm": 1.4021263122558594,
+      "learning_rate": 3.7363726461843415e-06,
+      "loss": 1.0022,
+      "step": 383
+    },
+    {
+      "epoch": 0.6844919786096256,
+      "grad_norm": 1.432822585105896,
+      "learning_rate": 3.7314172447968287e-06,
+      "loss": 0.923,
+      "step": 384
+    },
+    {
+      "epoch": 0.6862745098039216,
+      "grad_norm": 1.452248215675354,
+      "learning_rate": 3.7264618434093164e-06,
+      "loss": 0.9738,
+      "step": 385
+    },
+    {
+      "epoch": 0.6880570409982175,
+      "grad_norm": 1.4211044311523438,
+      "learning_rate": 3.721506442021804e-06,
+      "loss": 1.0073,
+      "step": 386
+    },
+    {
+      "epoch": 0.6898395721925134,
+      "grad_norm": 1.4883133172988892,
+      "learning_rate": 3.716551040634292e-06,
+      "loss": 0.9823,
+      "step": 387
+    },
+    {
+      "epoch": 0.6916221033868093,
+      "grad_norm": 1.6089807748794556,
+      "learning_rate": 3.7115956392467794e-06,
+      "loss": 1.0085,
+      "step": 388
+    },
+    {
+      "epoch": 0.6934046345811051,
+      "grad_norm": 1.4423185586929321,
+      "learning_rate": 3.706640237859267e-06,
+      "loss": 0.9764,
+      "step": 389
+    },
+    {
+      "epoch": 0.6951871657754011,
+      "grad_norm": 1.5661147832870483,
+      "learning_rate": 3.7016848364717543e-06,
+      "loss": 0.9908,
+      "step": 390
+    },
+    {
+      "epoch": 0.6951871657754011,
+      "eval_loss": 1.0515977144241333,
+      "eval_runtime": 25.7048,
+      "eval_samples_per_second": 38.903,
+      "eval_steps_per_second": 2.451,
+      "step": 390
+    },
+    {
+      "epoch": 0.696969696969697,
+      "grad_norm": 1.5067543983459473,
+      "learning_rate": 3.696729435084242e-06,
+      "loss": 0.993,
+      "step": 391
+    },
+    {
+      "epoch": 0.6987522281639929,
+      "grad_norm": 1.484405279159546,
+      "learning_rate": 3.6917740336967296e-06,
+      "loss": 0.9635,
+      "step": 392
+    },
+    {
+      "epoch": 0.7005347593582888,
+      "grad_norm": 1.4555736780166626,
+      "learning_rate": 3.6868186323092177e-06,
+      "loss": 0.9775,
+      "step": 393
+    },
+    {
+      "epoch": 0.7023172905525846,
+      "grad_norm": 1.4570845365524292,
+      "learning_rate": 3.681863230921705e-06,
+      "loss": 1.0116,
+      "step": 394
+    },
+    {
+      "epoch": 0.7040998217468806,
+      "grad_norm": 1.5175104141235352,
+      "learning_rate": 3.6769078295341926e-06,
+      "loss": 0.9856,
+      "step": 395
+    },
+    {
+      "epoch": 0.7058823529411765,
+      "grad_norm": 1.4439493417739868,
+      "learning_rate": 3.6719524281466802e-06,
+      "loss": 0.9591,
+      "step": 396
+    },
+    {
+      "epoch": 0.7076648841354723,
+      "grad_norm": 1.4264500141143799,
+      "learning_rate": 3.6669970267591675e-06,
+      "loss": 0.9894,
+      "step": 397
+    },
+    {
+      "epoch": 0.7094474153297683,
+      "grad_norm": 1.4578537940979004,
+      "learning_rate": 3.6620416253716556e-06,
+      "loss": 0.9867,
+      "step": 398
+    },
+    {
+      "epoch": 0.7112299465240641,
+      "grad_norm": 1.436010479927063,
+      "learning_rate": 3.6570862239841432e-06,
+      "loss": 1.0047,
+      "step": 399
+    },
+    {
+      "epoch": 0.7130124777183601,
+      "grad_norm": 1.446648359298706,
+      "learning_rate": 3.6521308225966305e-06,
+      "loss": 1.0164,
+      "step": 400
+    },
+    {
+      "epoch": 0.7130124777183601,
+      "eval_loss": 1.050747275352478,
+      "eval_runtime": 25.6718,
+      "eval_samples_per_second": 38.953,
+      "eval_steps_per_second": 2.454,
+      "step": 400
+    },
+    {
+      "epoch": 0.714795008912656,
+      "grad_norm": 1.4584800004959106,
+      "learning_rate": 3.647175421209118e-06,
+      "loss": 0.9755,
+      "step": 401
+    },
+    {
+      "epoch": 0.7165775401069518,
+      "grad_norm": 1.444089412689209,
+      "learning_rate": 3.6422200198216058e-06,
+      "loss": 0.9861,
+      "step": 402
+    },
+    {
+      "epoch": 0.7183600713012478,
+      "grad_norm": 1.522937297821045,
+      "learning_rate": 3.637264618434094e-06,
+      "loss": 0.9904,
+      "step": 403
+    },
+    {
+      "epoch": 0.7201426024955436,
+      "grad_norm": 1.4142314195632935,
+      "learning_rate": 3.632309217046581e-06,
+      "loss": 0.9715,
+      "step": 404
+    },
+    {
+      "epoch": 0.7219251336898396,
+      "grad_norm": 1.4843522310256958,
+      "learning_rate": 3.6273538156590688e-06,
+      "loss": 0.9878,
+      "step": 405
+    },
+    {
+      "epoch": 0.7237076648841355,
+      "grad_norm": 1.4929953813552856,
+      "learning_rate": 3.6223984142715564e-06,
+      "loss": 1.0019,
+      "step": 406
+    },
+    {
+      "epoch": 0.7254901960784313,
+      "grad_norm": 1.4887545108795166,
+      "learning_rate": 3.6174430128840437e-06,
+      "loss": 1.0063,
+      "step": 407
+    },
+    {
+      "epoch": 0.7272727272727273,
+      "grad_norm": 1.4991189241409302,
+      "learning_rate": 3.6124876114965313e-06,
+      "loss": 0.9987,
+      "step": 408
+    },
+    {
+      "epoch": 0.7290552584670231,
+      "grad_norm": 1.4522265195846558,
+      "learning_rate": 3.6075322101090194e-06,
+      "loss": 0.961,
+      "step": 409
+    },
+    {
+      "epoch": 0.7308377896613191,
+      "grad_norm": 1.473073959350586,
+      "learning_rate": 3.6025768087215067e-06,
+      "loss": 0.9625,
+      "step": 410
+    },
+    {
+      "epoch": 0.7308377896613191,
+      "eval_loss": 1.0513505935668945,
+      "eval_runtime": 25.7751,
+      "eval_samples_per_second": 38.797,
+      "eval_steps_per_second": 2.444,
+      "step": 410
+    },
+    {
+      "epoch": 0.732620320855615,
+      "grad_norm": 1.7087706327438354,
+      "learning_rate": 3.5976214073339943e-06,
+      "loss": 1.0139,
+      "step": 411
+    },
+    {
+      "epoch": 0.7344028520499108,
+      "grad_norm": 1.4385508298873901,
+      "learning_rate": 3.592666005946482e-06,
+      "loss": 0.9584,
+      "step": 412
+    },
+    {
+      "epoch": 0.7361853832442068,
+      "grad_norm": 1.525636911392212,
+      "learning_rate": 3.5877106045589692e-06,
+      "loss": 1.0031,
+      "step": 413
+    },
+    {
+      "epoch": 0.7379679144385026,
+      "grad_norm": 1.487068772315979,
+      "learning_rate": 3.5827552031714573e-06,
+      "loss": 1.0173,
+      "step": 414
+    },
+    {
+      "epoch": 0.7397504456327986,
+      "grad_norm": 1.4463266134262085,
+      "learning_rate": 3.577799801783945e-06,
+      "loss": 0.9762,
+      "step": 415
+    },
+    {
+      "epoch": 0.7415329768270945,
+      "grad_norm": 1.5234814882278442,
+      "learning_rate": 3.5728444003964326e-06,
+      "loss": 0.9776,
+      "step": 416
+    },
+    {
+      "epoch": 0.7433155080213903,
+      "grad_norm": 1.4824904203414917,
+      "learning_rate": 3.56788899900892e-06,
+      "loss": 0.9981,
+      "step": 417
+    },
+    {
+      "epoch": 0.7450980392156863,
+      "grad_norm": 1.51459538936615,
+      "learning_rate": 3.5629335976214075e-06,
+      "loss": 0.9542,
+      "step": 418
+    },
+    {
+      "epoch": 0.7468805704099821,
+      "grad_norm": 1.5176249742507935,
+      "learning_rate": 3.5579781962338956e-06,
+      "loss": 0.9655,
+      "step": 419
+    },
+    {
+      "epoch": 0.7486631016042781,
+      "grad_norm": 1.840696930885315,
+      "learning_rate": 3.553022794846383e-06,
+      "loss": 0.9948,
+      "step": 420
+    },
+    {
+      "epoch": 0.7486631016042781,
+      "eval_loss": 1.0480048656463623,
+      "eval_runtime": 26.0675,
+      "eval_samples_per_second": 38.362,
+      "eval_steps_per_second": 2.417,
+      "step": 420
+    },
+    {
+      "epoch": 0.750445632798574,
+      "grad_norm": 1.5904754400253296,
+      "learning_rate": 3.5480673934588705e-06,
+      "loss": 0.9577,
+      "step": 421
+    },
+    {
+      "epoch": 0.7522281639928698,
+      "grad_norm": 1.5453381538391113,
+      "learning_rate": 3.543111992071358e-06,
+      "loss": 0.9968,
+      "step": 422
+    },
+    {
+      "epoch": 0.7540106951871658,
+      "grad_norm": 1.5720494985580444,
+      "learning_rate": 3.5381565906838454e-06,
+      "loss": 0.963,
+      "step": 423
+    },
+    {
+      "epoch": 0.7557932263814616,
+      "grad_norm": 1.4631563425064087,
+      "learning_rate": 3.533201189296333e-06,
+      "loss": 0.9987,
+      "step": 424
+    },
+    {
+      "epoch": 0.7575757575757576,
+      "grad_norm": 1.5536762475967407,
+      "learning_rate": 3.528245787908821e-06,
+      "loss": 0.9579,
+      "step": 425
+    },
+    {
+      "epoch": 0.7593582887700535,
+      "grad_norm": 1.602889895439148,
+      "learning_rate": 3.523290386521309e-06,
+      "loss": 1.0036,
+      "step": 426
+    },
+    {
+      "epoch": 0.7611408199643493,
+      "grad_norm": 1.6541874408721924,
+      "learning_rate": 3.518334985133796e-06,
+      "loss": 0.9689,
+      "step": 427
+    },
+    {
+      "epoch": 0.7629233511586453,
+      "grad_norm": 1.529970407485962,
+      "learning_rate": 3.5133795837462837e-06,
+      "loss": 0.9577,
+      "step": 428
+    },
+    {
+      "epoch": 0.7647058823529411,
+      "grad_norm": 1.5926837921142578,
+      "learning_rate": 3.508424182358771e-06,
+      "loss": 0.9908,
+      "step": 429
+    },
+    {
+      "epoch": 0.7664884135472371,
+      "grad_norm": 1.5412956476211548,
+      "learning_rate": 3.503468780971259e-06,
+      "loss": 0.9699,
+      "step": 430
+    },
+    {
+      "epoch": 0.7664884135472371,
+      "eval_loss": 1.0452969074249268,
+      "eval_runtime": 25.7018,
+      "eval_samples_per_second": 38.908,
+      "eval_steps_per_second": 2.451,
+      "step": 430
+    },
+    {
+      "epoch": 0.768270944741533,
+      "grad_norm": 1.5507601499557495,
+      "learning_rate": 3.4985133795837467e-06,
+      "loss": 1.0164,
+      "step": 431
+    },
+    {
+      "epoch": 0.7700534759358288,
+      "grad_norm": 1.5753085613250732,
+      "learning_rate": 3.4935579781962344e-06,
+      "loss": 0.9694,
+      "step": 432
+    },
+    {
+      "epoch": 0.7718360071301248,
+      "grad_norm": 1.532804250717163,
+      "learning_rate": 3.4886025768087216e-06,
+      "loss": 0.9837,
+      "step": 433
+    },
+    {
+      "epoch": 0.7736185383244206,
+      "grad_norm": 1.7083972692489624,
+      "learning_rate": 3.4836471754212093e-06,
+      "loss": 1.0071,
+      "step": 434
+    },
+    {
+      "epoch": 0.7754010695187166,
+      "grad_norm": 1.5414382219314575,
+      "learning_rate": 3.4786917740336974e-06,
+      "loss": 0.9755,
+      "step": 435
+    },
+    {
+      "epoch": 0.7771836007130125,
+      "grad_norm": 1.4537928104400635,
+      "learning_rate": 3.4737363726461846e-06,
+      "loss": 0.9659,
+      "step": 436
+    },
+    {
+      "epoch": 0.7789661319073083,
+      "grad_norm": 1.486669898033142,
+      "learning_rate": 3.4687809712586723e-06,
+      "loss": 0.9741,
+      "step": 437
+    },
+    {
+      "epoch": 0.7807486631016043,
+      "grad_norm": 1.5523810386657715,
+      "learning_rate": 3.46382556987116e-06,
+      "loss": 0.9857,
+      "step": 438
+    },
+    {
+      "epoch": 0.7825311942959001,
+      "grad_norm": 1.5614720582962036,
+      "learning_rate": 3.458870168483647e-06,
+      "loss": 1.0088,
+      "step": 439
+    },
+    {
+      "epoch": 0.7843137254901961,
+      "grad_norm": 1.621520757675171,
+      "learning_rate": 3.453914767096135e-06,
+      "loss": 1.0051,
+      "step": 440
+    },
+    {
+      "epoch": 0.7843137254901961,
+      "eval_loss": 1.04201078414917,
+      "eval_runtime": 25.8799,
+      "eval_samples_per_second": 38.64,
+      "eval_steps_per_second": 2.434,
+      "step": 440
+    },
+    {
+      "epoch": 0.786096256684492,
+      "grad_norm": 1.5269745588302612,
+      "learning_rate": 3.448959365708623e-06,
+      "loss": 0.9874,
+      "step": 441
+    },
+    {
+      "epoch": 0.7878787878787878,
+      "grad_norm": 1.5022350549697876,
+      "learning_rate": 3.4440039643211106e-06,
+      "loss": 0.9661,
+      "step": 442
+    },
+    {
+      "epoch": 0.7896613190730838,
+      "grad_norm": 1.476470947265625,
+      "learning_rate": 3.439048562933598e-06,
+      "loss": 0.9626,
+      "step": 443
+    },
+    {
+      "epoch": 0.7914438502673797,
+      "grad_norm": 1.6092950105667114,
+      "learning_rate": 3.4340931615460855e-06,
+      "loss": 0.9828,
+      "step": 444
+    },
+    {
+      "epoch": 0.7932263814616756,
+      "grad_norm": 1.5567653179168701,
+      "learning_rate": 3.4291377601585727e-06,
+      "loss": 0.994,
+      "step": 445
+    },
+    {
+      "epoch": 0.7950089126559715,
+      "grad_norm": 1.4823105335235596,
+      "learning_rate": 3.424182358771061e-06,
+      "loss": 0.9619,
+      "step": 446
+    },
+    {
+      "epoch": 0.7967914438502673,
+      "grad_norm": 1.7685904502868652,
+      "learning_rate": 3.4192269573835485e-06,
+      "loss": 0.9475,
+      "step": 447
+    },
+    {
+      "epoch": 0.7985739750445633,
+      "grad_norm": 1.5113608837127686,
+      "learning_rate": 3.414271555996036e-06,
+      "loss": 0.9868,
+      "step": 448
+    },
+    {
+      "epoch": 0.8003565062388592,
+      "grad_norm": 1.6278724670410156,
+      "learning_rate": 3.4093161546085234e-06,
+      "loss": 0.9729,
+      "step": 449
+    },
+    {
+      "epoch": 0.8021390374331551,
+      "grad_norm": 1.922420859336853,
+      "learning_rate": 3.404360753221011e-06,
+      "loss": 1.0143,
+      "step": 450
+    },
+    {
+      "epoch": 0.8021390374331551,
+      "eval_loss": 1.0419155359268188,
+      "eval_runtime": 25.9263,
+      "eval_samples_per_second": 38.571,
+      "eval_steps_per_second": 2.43,
+      "step": 450
+    },
+    {
+      "epoch": 0.803921568627451,
+      "grad_norm": 1.523510456085205,
+      "learning_rate": 3.399405351833499e-06,
+      "loss": 0.9925,
+      "step": 451
+    },
+    {
+      "epoch": 0.8057040998217468,
+      "grad_norm": 1.5074394941329956,
+      "learning_rate": 3.3944499504459868e-06,
+      "loss": 0.9788,
+      "step": 452
+    },
+    {
+      "epoch": 0.8074866310160428,
+      "grad_norm": 1.5418741703033447,
+      "learning_rate": 3.389494549058474e-06,
+      "loss": 0.9669,
+      "step": 453
+    },
+    {
+      "epoch": 0.8092691622103387,
+      "grad_norm": 1.5402334928512573,
+      "learning_rate": 3.3845391476709617e-06,
+      "loss": 0.995,
+      "step": 454
+    },
+    {
+      "epoch": 0.8110516934046346,
+      "grad_norm": 1.4736061096191406,
+      "learning_rate": 3.379583746283449e-06,
+      "loss": 0.968,
+      "step": 455
+    },
+    {
+      "epoch": 0.8128342245989305,
+      "grad_norm": 1.7145874500274658,
+      "learning_rate": 3.3746283448959366e-06,
+      "loss": 0.9594,
+      "step": 456
+    },
+    {
+      "epoch": 0.8146167557932263,
+      "grad_norm": 1.7427315711975098,
+      "learning_rate": 3.3696729435084247e-06,
+      "loss": 0.9777,
+      "step": 457
+    },
+    {
+      "epoch": 0.8163992869875223,
+      "grad_norm": 1.5081549882888794,
+      "learning_rate": 3.3647175421209123e-06,
+      "loss": 0.9572,
+      "step": 458
+    },
+    {
+      "epoch": 0.8181818181818182,
+      "grad_norm": 1.608188271522522,
+      "learning_rate": 3.3597621407333996e-06,
+      "loss": 0.9906,
+      "step": 459
+    },
+    {
+      "epoch": 0.8199643493761141,
+      "grad_norm": 1.5751535892486572,
+      "learning_rate": 3.3548067393458872e-06,
+      "loss": 0.966,
+      "step": 460
+    },
+    {
+      "epoch": 0.8199643493761141,
+      "eval_loss": 1.0394279956817627,
+      "eval_runtime": 25.7927,
+      "eval_samples_per_second": 38.771,
+      "eval_steps_per_second": 2.443,
+      "step": 460
+    },
+    {
+      "epoch": 0.82174688057041,
+      "grad_norm": 1.4779298305511475,
+      "learning_rate": 3.349851337958375e-06,
+      "loss": 0.9953,
+      "step": 461
+    },
+    {
+      "epoch": 0.8235294117647058,
+      "grad_norm": 1.4367811679840088,
+      "learning_rate": 3.344895936570863e-06,
+      "loss": 0.9793,
+      "step": 462
+    },
+    {
+      "epoch": 0.8253119429590018,
+      "grad_norm": 1.6063673496246338,
+      "learning_rate": 3.33994053518335e-06,
+      "loss": 0.9619,
+      "step": 463
+    },
+    {
+      "epoch": 0.8270944741532977,
+      "grad_norm": 1.5322190523147583,
+      "learning_rate": 3.334985133795838e-06,
+      "loss": 0.9636,
+      "step": 464
+    },
+    {
+      "epoch": 0.8288770053475936,
+      "grad_norm": 1.6693592071533203,
+      "learning_rate": 3.330029732408325e-06,
+      "loss": 1.014,
+      "step": 465
+    },
+    {
+      "epoch": 0.8306595365418895,
+      "grad_norm": 1.6570430994033813,
+      "learning_rate": 3.3250743310208128e-06,
+      "loss": 0.9913,
+      "step": 466
+    },
+    {
+      "epoch": 0.8324420677361853,
+      "grad_norm": 1.5995551347732544,
+      "learning_rate": 3.3201189296333004e-06,
+      "loss": 0.9514,
+      "step": 467
+    },
+    {
+      "epoch": 0.8342245989304813,
+      "grad_norm": 1.528596043586731,
+      "learning_rate": 3.3151635282457885e-06,
+      "loss": 0.9817,
+      "step": 468
+    },
+    {
+      "epoch": 0.8360071301247772,
+      "grad_norm": 1.475396752357483,
+      "learning_rate": 3.3102081268582757e-06,
+      "loss": 0.9604,
+      "step": 469
+    },
+    {
+      "epoch": 0.8377896613190731,
+      "grad_norm": 1.5124799013137817,
+      "learning_rate": 3.3052527254707634e-06,
+      "loss": 0.9596,
+      "step": 470
+    },
+    {
+      "epoch": 0.8377896613190731,
+      "eval_loss": 1.0366942882537842,
+      "eval_runtime": 25.742,
+      "eval_samples_per_second": 38.847,
+      "eval_steps_per_second": 2.447,
+      "step": 470
+    },
+    {
+      "epoch": 0.839572192513369,
+      "grad_norm": 1.5007457733154297,
+      "learning_rate": 3.300297324083251e-06,
+      "loss": 0.9647,
+      "step": 471
+    },
+    {
+      "epoch": 0.8413547237076648,
+      "grad_norm": 1.4299620389938354,
+      "learning_rate": 3.2953419226957383e-06,
+      "loss": 0.9591,
+      "step": 472
+    },
+    {
+      "epoch": 0.8431372549019608,
+      "grad_norm": 1.4947385787963867,
+      "learning_rate": 3.2903865213082264e-06,
+      "loss": 0.971,
+      "step": 473
+    },
+    {
+      "epoch": 0.8449197860962567,
+      "grad_norm": 1.5945279598236084,
+      "learning_rate": 3.285431119920714e-06,
+      "loss": 1.0029,
+      "step": 474
+    },
+    {
+      "epoch": 0.8467023172905526,
+      "grad_norm": 1.6020610332489014,
+      "learning_rate": 3.2804757185332013e-06,
+      "loss": 0.9834,
+      "step": 475
+    },
+    {
+      "epoch": 0.8484848484848485,
+      "grad_norm": 1.6295793056488037,
+      "learning_rate": 3.275520317145689e-06,
+      "loss": 0.9778,
+      "step": 476
+    },
+    {
+      "epoch": 0.8502673796791443,
+      "grad_norm": 1.4652864933013916,
+      "learning_rate": 3.2705649157581766e-06,
+      "loss": 1.0236,
+      "step": 477
+    },
+    {
+      "epoch": 0.8520499108734403,
+      "grad_norm": 1.417965054512024,
+      "learning_rate": 3.2656095143706647e-06,
+      "loss": 0.9724,
+      "step": 478
+    },
+    {
+      "epoch": 0.8538324420677362,
+      "grad_norm": 1.449277400970459,
+      "learning_rate": 3.260654112983152e-06,
+      "loss": 0.9896,
+      "step": 479
+    },
+    {
+      "epoch": 0.8556149732620321,
+      "grad_norm": 1.4030529260635376,
+      "learning_rate": 3.2556987115956396e-06,
+      "loss": 0.9882,
+      "step": 480
+    },
+    {
+      "epoch": 0.8556149732620321,
+      "eval_loss": 1.0339490175247192,
+      "eval_runtime": 25.8189,
+      "eval_samples_per_second": 38.731,
+      "eval_steps_per_second": 2.44,
+      "step": 480
+    },
+    {
+      "epoch": 0.857397504456328,
+      "grad_norm": 1.5231982469558716,
+      "learning_rate": 3.2507433102081273e-06,
+      "loss": 0.964,
+      "step": 481
+    },
+    {
+      "epoch": 0.8591800356506238,
+      "grad_norm": 1.5029219388961792,
+      "learning_rate": 3.2457879088206145e-06,
+      "loss": 0.9774,
+      "step": 482
+    },
+    {
+      "epoch": 0.8609625668449198,
+      "grad_norm": 1.5686631202697754,
+      "learning_rate": 3.240832507433102e-06,
+      "loss": 0.959,
+      "step": 483
+    },
+    {
+      "epoch": 0.8627450980392157,
+      "grad_norm": 1.5515562295913696,
+      "learning_rate": 3.2358771060455903e-06,
+      "loss": 0.9367,
+      "step": 484
+    },
+    {
+      "epoch": 0.8645276292335116,
+      "grad_norm": 1.4003010988235474,
+      "learning_rate": 3.2309217046580775e-06,
+      "loss": 0.9754,
+      "step": 485
+    },
+    {
+      "epoch": 0.8663101604278075,
+      "grad_norm": 1.4356753826141357,
+      "learning_rate": 3.225966303270565e-06,
+      "loss": 0.9673,
+      "step": 486
+    },
+    {
+      "epoch": 0.8680926916221033,
+      "grad_norm": 1.5588560104370117,
+      "learning_rate": 3.221010901883053e-06,
+      "loss": 0.9821,
+      "step": 487
+    },
+    {
+      "epoch": 0.8698752228163993,
+      "grad_norm": 1.494897723197937,
+      "learning_rate": 3.21605550049554e-06,
+      "loss": 0.9595,
+      "step": 488
+    },
+    {
+      "epoch": 0.8716577540106952,
+      "grad_norm": 1.580748200416565,
+      "learning_rate": 3.211100099108028e-06,
+      "loss": 0.9258,
+      "step": 489
+    },
+    {
+      "epoch": 0.8734402852049911,
+      "grad_norm": 1.4712975025177002,
+      "learning_rate": 3.206144697720516e-06,
+      "loss": 0.9657,
+      "step": 490
+    },
+    {
+      "epoch": 0.8734402852049911,
+      "eval_loss": 1.0344802141189575,
+      "eval_runtime": 25.8547,
+      "eval_samples_per_second": 38.678,
+      "eval_steps_per_second": 2.437,
+      "step": 490
+    },
+    {
+      "epoch": 0.875222816399287,
+      "grad_norm": 1.4707386493682861,
+      "learning_rate": 3.201189296333003e-06,
+      "loss": 0.9597,
+      "step": 491
+    },
+    {
+      "epoch": 0.8770053475935828,
+      "grad_norm": 1.5678167343139648,
+      "learning_rate": 3.1962338949454907e-06,
+      "loss": 0.9745,
+      "step": 492
+    },
+    {
+      "epoch": 0.8787878787878788,
+      "grad_norm": 1.577392578125,
+      "learning_rate": 3.1912784935579784e-06,
+      "loss": 0.9644,
+      "step": 493
+    },
+    {
+      "epoch": 0.8805704099821747,
+      "grad_norm": 1.4401471614837646,
+      "learning_rate": 3.1863230921704664e-06,
+      "loss": 0.9563,
+      "step": 494
+    },
+    {
+      "epoch": 0.8823529411764706,
+      "grad_norm": 1.5576186180114746,
+      "learning_rate": 3.1813676907829537e-06,
+      "loss": 0.9665,
+      "step": 495
+    },
+    {
+      "epoch": 0.8841354723707665,
+      "grad_norm": 1.5939122438430786,
+      "learning_rate": 3.1764122893954413e-06,
+      "loss": 0.9746,
+      "step": 496
+    },
+    {
+      "epoch": 0.8859180035650623,
+      "grad_norm": 1.476230263710022,
+      "learning_rate": 3.171456888007929e-06,
+      "loss": 0.9479,
+      "step": 497
+    },
+    {
+      "epoch": 0.8877005347593583,
+      "grad_norm": 1.5572577714920044,
+      "learning_rate": 3.1665014866204162e-06,
+      "loss": 0.9643,
+      "step": 498
+    },
+    {
+      "epoch": 0.8894830659536542,
+      "grad_norm": 1.5189151763916016,
+      "learning_rate": 3.161546085232904e-06,
+      "loss": 0.9257,
+      "step": 499
+    },
+    {
+      "epoch": 0.8912655971479501,
+      "grad_norm": 1.5143529176712036,
+      "learning_rate": 3.156590683845392e-06,
+      "loss": 0.9706,
+      "step": 500
+    },
+    {
+      "epoch": 0.8912655971479501,
+      "eval_loss": 1.0346344709396362,
+      "eval_runtime": 26.07,
+      "eval_samples_per_second": 38.358,
+      "eval_steps_per_second": 2.417,
+      "step": 500
+    },
+    {
+      "epoch": 0.893048128342246,
+      "grad_norm": 1.5261365175247192,
+      "learning_rate": 3.1516352824578792e-06,
+      "loss": 0.9785,
+      "step": 501
+    },
+    {
+      "epoch": 0.8948306595365418,
+      "grad_norm": 1.5646567344665527,
+      "learning_rate": 3.146679881070367e-06,
+      "loss": 0.992,
+      "step": 502
+    },
+    {
+      "epoch": 0.8966131907308378,
+      "grad_norm": 1.5119503736495972,
+      "learning_rate": 3.1417244796828546e-06,
+      "loss": 0.9488,
+      "step": 503
+    },
+    {
+      "epoch": 0.8983957219251337,
+      "grad_norm": 1.5541421175003052,
+      "learning_rate": 3.136769078295342e-06,
+      "loss": 0.9617,
+      "step": 504
+    },
+    {
+      "epoch": 0.9001782531194296,
+      "grad_norm": 1.491257667541504,
+      "learning_rate": 3.13181367690783e-06,
+      "loss": 0.981,
+      "step": 505
+    },
+    {
+      "epoch": 0.9019607843137255,
+      "grad_norm": 1.6135386228561401,
+      "learning_rate": 3.1268582755203175e-06,
+      "loss": 0.9874,
+      "step": 506
+    },
+    {
+      "epoch": 0.9037433155080213,
+      "grad_norm": 1.531295895576477,
+      "learning_rate": 3.121902874132805e-06,
+      "loss": 0.9816,
+      "step": 507
+    },
+    {
+      "epoch": 0.9055258467023173,
+      "grad_norm": 1.5324310064315796,
+      "learning_rate": 3.1169474727452924e-06,
+      "loss": 0.9189,
+      "step": 508
+    },
+    {
+      "epoch": 0.9073083778966132,
+      "grad_norm": 1.8411617279052734,
+      "learning_rate": 3.11199207135778e-06,
+      "loss": 0.9744,
+      "step": 509
+    },
+    {
+      "epoch": 0.9090909090909091,
+      "grad_norm": 1.6758506298065186,
+      "learning_rate": 3.107036669970268e-06,
+      "loss": 0.9774,
+      "step": 510
+    },
+    {
+      "epoch": 0.9090909090909091,
+      "eval_loss": 1.033508539199829,
+      "eval_runtime": 25.9173,
+      "eval_samples_per_second": 38.584,
+      "eval_steps_per_second": 2.431,
+      "step": 510
+    },
+    {
+      "epoch": 0.910873440285205,
+      "grad_norm": 1.6758506298065186,
+      "learning_rate": 3.107036669970268e-06,
+      "loss": 0.9609,
+      "step": 511
+    },
+    {
+      "epoch": 0.9126559714795008,
+      "grad_norm": 1.6164848804473877,
+      "learning_rate": 3.1020812685827554e-06,
+      "loss": 0.9842,
+      "step": 512
+    },
+    {
+      "epoch": 0.9144385026737968,
+      "grad_norm": 1.5550708770751953,
+      "learning_rate": 3.097125867195243e-06,
+      "loss": 0.9692,
+      "step": 513
+    },
+    {
+      "epoch": 0.9162210338680927,
+      "grad_norm": 1.5243220329284668,
+      "learning_rate": 3.0921704658077308e-06,
+      "loss": 0.9618,
+      "step": 514
+    },
+    {
+      "epoch": 0.9180035650623886,
+      "grad_norm": 1.660752773284912,
+      "learning_rate": 3.087215064420218e-06,
+      "loss": 0.9727,
+      "step": 515
+    },
+    {
+      "epoch": 0.9197860962566845,
+      "grad_norm": 1.559834361076355,
+      "learning_rate": 3.0822596630327057e-06,
+      "loss": 0.9733,
+      "step": 516
+    },
+    {
+      "epoch": 0.9215686274509803,
+      "grad_norm": 1.4715766906738281,
+      "learning_rate": 3.0773042616451937e-06,
+      "loss": 0.9338,
+      "step": 517
+    },
+    {
+      "epoch": 0.9233511586452763,
+      "grad_norm": 1.4563944339752197,
+      "learning_rate": 3.0723488602576814e-06,
+      "loss": 0.9204,
+      "step": 518
+    },
+    {
+      "epoch": 0.9251336898395722,
+      "grad_norm": 1.613408088684082,
+      "learning_rate": 3.0673934588701686e-06,
+      "loss": 0.975,
+      "step": 519
+    },
+    {
+      "epoch": 0.9269162210338681,
+      "grad_norm": 1.436843752861023,
+      "learning_rate": 3.0624380574826563e-06,
+      "loss": 0.9464,
+      "step": 520
+    },
+    {
+      "epoch": 0.9269162210338681,
+      "eval_loss": 1.031677484512329,
+      "eval_runtime": 25.8601,
+      "eval_samples_per_second": 38.67,
+      "eval_steps_per_second": 2.436,
+      "step": 520
+    },
+    {
+      "epoch": 0.928698752228164,
+      "grad_norm": 1.4962098598480225,
+      "learning_rate": 3.0574826560951435e-06,
+      "loss": 0.9756,
+      "step": 521
+    },
+    {
+      "epoch": 0.93048128342246,
+      "grad_norm": 1.5167757272720337,
+      "learning_rate": 3.0525272547076316e-06,
+      "loss": 0.9565,
+      "step": 522
+    },
+    {
+      "epoch": 0.9322638146167558,
+      "grad_norm": 1.5992977619171143,
+      "learning_rate": 3.0475718533201193e-06,
+      "loss": 0.9752,
+      "step": 523
+    },
+    {
+      "epoch": 0.9340463458110517,
+      "grad_norm": 1.521874189376831,
+      "learning_rate": 3.042616451932607e-06,
+      "loss": 0.9424,
+      "step": 524
+    },
+    {
+      "epoch": 0.9358288770053476,
+      "grad_norm": 1.5859359502792358,
+      "learning_rate": 3.037661050545094e-06,
+      "loss": 0.966,
+      "step": 525
+    },
+    {
+      "epoch": 0.9376114081996435,
+      "grad_norm": 1.4846385717391968,
+      "learning_rate": 3.032705649157582e-06,
+      "loss": 0.97,
+      "step": 526
+    },
+    {
+      "epoch": 0.9393939393939394,
+      "grad_norm": 1.4894582033157349,
+      "learning_rate": 3.02775024777007e-06,
+      "loss": 0.9651,
+      "step": 527
+    },
+    {
+      "epoch": 0.9411764705882353,
+      "grad_norm": 1.538907766342163,
+      "learning_rate": 3.022794846382557e-06,
+      "loss": 0.9439,
+      "step": 528
+    },
+    {
+      "epoch": 0.9429590017825312,
+      "grad_norm": 1.5071743726730347,
+      "learning_rate": 3.017839444995045e-06,
+      "loss": 0.9667,
+      "step": 529
+    },
+    {
+      "epoch": 0.9447415329768271,
+      "grad_norm": 1.4718732833862305,
+      "learning_rate": 3.0128840436075325e-06,
+      "loss": 0.9553,
+      "step": 530
+    },
+    {
+      "epoch": 0.9447415329768271,
+      "eval_loss": 1.027829885482788,
+      "eval_runtime": 25.8768,
+      "eval_samples_per_second": 38.645,
+      "eval_steps_per_second": 2.435,
+      "step": 530
+    },
+    {
+      "epoch": 0.946524064171123,
+      "grad_norm": 1.5042873620986938,
+      "learning_rate": 3.0079286422200197e-06,
+      "loss": 0.9772,
+      "step": 531
+    },
+    {
+      "epoch": 0.948306595365419,
+      "grad_norm": 1.4991027116775513,
+      "learning_rate": 3.0029732408325074e-06,
+      "loss": 0.9549,
+      "step": 532
+    },
+    {
+      "epoch": 0.9500891265597148,
+      "grad_norm": 1.4758248329162598,
+      "learning_rate": 2.9980178394449955e-06,
+      "loss": 0.9659,
+      "step": 533
+    },
+    {
+      "epoch": 0.9518716577540107,
+      "grad_norm": 1.4332822561264038,
+      "learning_rate": 2.993062438057483e-06,
+      "loss": 0.9405,
+      "step": 534
+    },
+    {
+      "epoch": 0.9536541889483066,
+      "grad_norm": 1.4461305141448975,
+      "learning_rate": 2.9881070366699704e-06,
+      "loss": 0.9822,
+      "step": 535
+    },
+    {
+      "epoch": 0.9554367201426025,
+      "grad_norm": 1.5441949367523193,
+      "learning_rate": 2.983151635282458e-06,
+      "loss": 0.9979,
+      "step": 536
+    },
+    {
+      "epoch": 0.9572192513368984,
+      "grad_norm": 1.5105853080749512,
+      "learning_rate": 2.9781962338949457e-06,
+      "loss": 0.9608,
+      "step": 537
+    },
+    {
+      "epoch": 0.9590017825311943,
+      "grad_norm": 1.5477588176727295,
+      "learning_rate": 2.9732408325074334e-06,
+      "loss": 0.9343,
+      "step": 538
+    },
+    {
+      "epoch": 0.9607843137254902,
+      "grad_norm": 1.4892299175262451,
+      "learning_rate": 2.968285431119921e-06,
+      "loss": 0.95,
+      "step": 539
+    },
+    {
+      "epoch": 0.9625668449197861,
+      "grad_norm": 1.5129679441452026,
+      "learning_rate": 2.9633300297324087e-06,
+      "loss": 0.9194,
+      "step": 540
+    },
+    {
+      "epoch": 0.9625668449197861,
+      "eval_loss": 1.0259133577346802,
+      "eval_runtime": 26.1016,
+      "eval_samples_per_second": 38.312,
+      "eval_steps_per_second": 2.414,
+      "step": 540
+    },
+    {
+      "epoch": 0.964349376114082,
+      "grad_norm": 1.589005947113037,
+      "learning_rate": 2.958374628344896e-06,
+      "loss": 0.992,
+      "step": 541
+    },
+    {
+      "epoch": 0.966131907308378,
+      "grad_norm": 1.4980206489562988,
+      "learning_rate": 2.9534192269573836e-06,
+      "loss": 0.938,
+      "step": 542
+    },
+    {
+      "epoch": 0.9679144385026738,
+      "grad_norm": 1.4804502725601196,
+      "learning_rate": 2.9484638255698717e-06,
+      "loss": 0.9629,
+      "step": 543
+    },
+    {
+      "epoch": 0.9696969696969697,
+      "grad_norm": 1.4890419244766235,
+      "learning_rate": 2.9435084241823593e-06,
+      "loss": 0.9633,
+      "step": 544
+    },
+    {
+      "epoch": 0.9714795008912656,
+      "grad_norm": 1.4954791069030762,
+      "learning_rate": 2.9385530227948466e-06,
+      "loss": 0.9718,
+      "step": 545
+    },
+    {
+      "epoch": 0.9732620320855615,
+      "grad_norm": 1.5530617237091064,
+      "learning_rate": 2.9335976214073342e-06,
+      "loss": 0.972,
+      "step": 546
+    },
+    {
+      "epoch": 0.9750445632798574,
+      "grad_norm": 1.5793863534927368,
+      "learning_rate": 2.9286422200198215e-06,
+      "loss": 0.9656,
+      "step": 547
+    },
+    {
+      "epoch": 0.9768270944741533,
+      "grad_norm": 1.5591521263122559,
+      "learning_rate": 2.923686818632309e-06,
+      "loss": 0.9445,
+      "step": 548
+    },
+    {
+      "epoch": 0.9786096256684492,
+      "grad_norm": 1.7068123817443848,
+      "learning_rate": 2.9187314172447972e-06,
+      "loss": 0.9979,
+      "step": 549
+    },
+    {
+      "epoch": 0.9803921568627451,
+      "grad_norm": 1.5867973566055298,
+      "learning_rate": 2.913776015857285e-06,
+      "loss": 0.9755,
+      "step": 550
+    },
+    {
+      "epoch": 0.9803921568627451,
+      "eval_loss": 1.0254909992218018,
+      "eval_runtime": 25.9627,
+      "eval_samples_per_second": 38.517,
+      "eval_steps_per_second": 2.427,
+      "step": 550
+    },
+    {
+      "epoch": 0.982174688057041,
+      "grad_norm": 1.5121583938598633,
+      "learning_rate": 2.908820614469772e-06,
+      "loss": 0.9659,
+      "step": 551
+    },
+    {
+      "epoch": 0.983957219251337,
+      "grad_norm": 1.4990359544754028,
+      "learning_rate": 2.9038652130822598e-06,
+      "loss": 0.9898,
+      "step": 552
+    },
+    {
+      "epoch": 0.9857397504456328,
+      "grad_norm": 1.6708561182022095,
+      "learning_rate": 2.8989098116947474e-06,
+      "loss": 0.9674,
+      "step": 553
+    },
+    {
+      "epoch": 0.9875222816399287,
+      "grad_norm": 1.5765035152435303,
+      "learning_rate": 2.8939544103072355e-06,
+      "loss": 0.9363,
+      "step": 554
+    },
+    {
+      "epoch": 0.9893048128342246,
+      "grad_norm": 1.8125452995300293,
+      "learning_rate": 2.8889990089197228e-06,
+      "loss": 0.9785,
+      "step": 555
+    },
+    {
+      "epoch": 0.9910873440285205,
+      "grad_norm": 1.666881799697876,
+      "learning_rate": 2.8840436075322104e-06,
+      "loss": 0.9514,
+      "step": 556
+    },
+    {
+      "epoch": 0.9928698752228164,
+      "grad_norm": 1.5965781211853027,
+      "learning_rate": 2.8790882061446977e-06,
+      "loss": 0.9818,
+      "step": 557
+    },
+    {
+      "epoch": 0.9946524064171123,
+      "grad_norm": 1.6427617073059082,
+      "learning_rate": 2.8741328047571853e-06,
+      "loss": 0.9308,
+      "step": 558
+    },
+    {
+      "epoch": 0.9964349376114082,
+      "grad_norm": 1.5469664335250854,
+      "learning_rate": 2.8691774033696734e-06,
+      "loss": 1.0039,
+      "step": 559
+    },
+    {
+      "epoch": 0.9982174688057041,
+      "grad_norm": 1.4527232646942139,
+      "learning_rate": 2.864222001982161e-06,
+      "loss": 0.9536,
+      "step": 560
+    },
+    {
+      "epoch": 0.9982174688057041,
+      "eval_loss": 1.0249741077423096,
+      "eval_runtime": 25.9621,
+      "eval_samples_per_second": 38.518,
+      "eval_steps_per_second": 2.427,
+      "step": 560
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.638670802116394,
+      "learning_rate": 2.8592666005946483e-06,
+      "loss": 0.945,
+      "step": 561
+    },
+    {
+      "epoch": 1.0017825311942958,
+      "grad_norm": 1.7704370021820068,
+      "learning_rate": 2.854311199207136e-06,
+      "loss": 0.9291,
+      "step": 562
+    },
+    {
+      "epoch": 1.0035650623885919,
+      "grad_norm": 1.534660816192627,
+      "learning_rate": 2.8493557978196236e-06,
+      "loss": 0.9013,
+      "step": 563
+    },
+    {
+      "epoch": 1.0053475935828877,
+      "grad_norm": 1.504355549812317,
+      "learning_rate": 2.844400396432111e-06,
+      "loss": 0.924,
+      "step": 564
+    },
+    {
+      "epoch": 1.0071301247771836,
+      "grad_norm": 1.7646287679672241,
+      "learning_rate": 2.839444995044599e-06,
+      "loss": 0.954,
+      "step": 565
+    },
+    {
+      "epoch": 1.0089126559714796,
+      "grad_norm": 1.835209846496582,
+      "learning_rate": 2.8344895936570866e-06,
+      "loss": 0.9199,
+      "step": 566
+    },
+    {
+      "epoch": 1.0106951871657754,
+      "grad_norm": 1.6713684797286987,
+      "learning_rate": 2.829534192269574e-06,
+      "loss": 0.9314,
+      "step": 567
+    },
+    {
+      "epoch": 1.0124777183600713,
+      "grad_norm": 1.545698642730713,
+      "learning_rate": 2.8245787908820615e-06,
+      "loss": 0.9741,
+      "step": 568
+    },
+    {
+      "epoch": 1.014260249554367,
+      "grad_norm": 1.5221408605575562,
+      "learning_rate": 2.819623389494549e-06,
+      "loss": 0.9325,
+      "step": 569
+    },
+    {
+      "epoch": 1.0160427807486632,
+      "grad_norm": 1.5692074298858643,
+      "learning_rate": 2.8146679881070373e-06,
+      "loss": 0.9593,
+      "step": 570
+    },
+    {
+      "epoch": 1.0160427807486632,
+      "eval_loss": 1.0230886936187744,
+      "eval_runtime": 25.9561,
+      "eval_samples_per_second": 38.527,
+      "eval_steps_per_second": 2.427,
+      "step": 570
+    },
+    {
+      "epoch": 1.017825311942959,
+      "grad_norm": 1.6565548181533813,
+      "learning_rate": 2.8097125867195245e-06,
+      "loss": 0.9367,
+      "step": 571
+    },
+    {
+      "epoch": 1.0196078431372548,
+      "grad_norm": 1.5202622413635254,
+      "learning_rate": 2.804757185332012e-06,
+      "loss": 0.9421,
+      "step": 572
+    },
+    {
+      "epoch": 1.0213903743315509,
+      "grad_norm": 1.6204488277435303,
+      "learning_rate": 2.7998017839445e-06,
+      "loss": 0.9327,
+      "step": 573
+    },
+    {
+      "epoch": 1.0231729055258467,
+      "grad_norm": 1.7548097372055054,
+      "learning_rate": 2.794846382556987e-06,
+      "loss": 0.9696,
+      "step": 574
+    },
+    {
+      "epoch": 1.0249554367201426,
+      "grad_norm": 1.6551501750946045,
+      "learning_rate": 2.789890981169475e-06,
+      "loss": 0.9255,
+      "step": 575
+    },
+    {
+      "epoch": 1.0267379679144386,
+      "grad_norm": 1.6978447437286377,
+      "learning_rate": 2.784935579781963e-06,
+      "loss": 0.9724,
+      "step": 576
+    },
+    {
+      "epoch": 1.0285204991087344,
+      "grad_norm": 1.6319726705551147,
+      "learning_rate": 2.77998017839445e-06,
+      "loss": 0.9499,
+      "step": 577
+    },
+    {
+      "epoch": 1.0303030303030303,
+      "grad_norm": 1.803403377532959,
+      "learning_rate": 2.7750247770069377e-06,
+      "loss": 0.876,
+      "step": 578
+    },
+    {
+      "epoch": 1.032085561497326,
+      "grad_norm": 1.7570103406906128,
+      "learning_rate": 2.7700693756194254e-06,
+      "loss": 0.9374,
+      "step": 579
+    },
+    {
+      "epoch": 1.0338680926916222,
+      "grad_norm": 1.6463748216629028,
+      "learning_rate": 2.7651139742319126e-06,
+      "loss": 0.9224,
+      "step": 580
+    },
+    {
+      "epoch": 1.0338680926916222,
+      "eval_loss": 1.026388168334961,
+      "eval_runtime": 25.8018,
+      "eval_samples_per_second": 38.757,
+      "eval_steps_per_second": 2.442,
+      "step": 580
+    },
+    {
+      "epoch": 1.035650623885918,
+      "grad_norm": 1.7040306329727173,
+      "learning_rate": 2.7601585728444007e-06,
+      "loss": 0.9659,
+      "step": 581
+    },
+    {
+      "epoch": 1.0374331550802138,
+      "grad_norm": 1.6346691846847534,
+      "learning_rate": 2.7552031714568884e-06,
+      "loss": 0.9305,
+      "step": 582
+    },
+    {
+      "epoch": 1.0392156862745099,
+      "grad_norm": 1.604477882385254,
+      "learning_rate": 2.750247770069376e-06,
+      "loss": 0.9696,
+      "step": 583
+    },
+    {
+      "epoch": 1.0409982174688057,
+      "grad_norm": 1.744341492652893,
+      "learning_rate": 2.7452923686818633e-06,
+      "loss": 0.9211,
+      "step": 584
+    },
+    {
+      "epoch": 1.0427807486631016,
+      "grad_norm": 1.7094818353652954,
+      "learning_rate": 2.740336967294351e-06,
+      "loss": 0.9603,
+      "step": 585
+    },
+    {
+      "epoch": 1.0445632798573976,
+      "grad_norm": 1.6708636283874512,
+      "learning_rate": 2.735381565906839e-06,
+      "loss": 0.9433,
+      "step": 586
+    },
+    {
+      "epoch": 1.0463458110516934,
+      "grad_norm": 1.702352523803711,
+      "learning_rate": 2.7304261645193263e-06,
+      "loss": 0.9388,
+      "step": 587
+    },
+    {
+      "epoch": 1.0481283422459893,
+      "grad_norm": 1.6145100593566895,
+      "learning_rate": 2.725470763131814e-06,
+      "loss": 0.9369,
+      "step": 588
+    },
+    {
+      "epoch": 1.049910873440285,
+      "grad_norm": 1.6174883842468262,
+      "learning_rate": 2.7205153617443016e-06,
+      "loss": 0.8993,
+      "step": 589
+    },
+    {
+      "epoch": 1.0516934046345812,
+      "grad_norm": 1.6352730989456177,
+      "learning_rate": 2.715559960356789e-06,
+      "loss": 0.9432,
+      "step": 590
+    },
+    {
+      "epoch": 1.0516934046345812,
+      "eval_loss": 1.0225163698196411,
+      "eval_runtime": 26.0491,
+      "eval_samples_per_second": 38.389,
+      "eval_steps_per_second": 2.419,
+      "step": 590
+    },
+    {
+      "epoch": 1.053475935828877,
+      "grad_norm": 1.5941722393035889,
+      "learning_rate": 2.710604558969277e-06,
+      "loss": 0.9628,
+      "step": 591
+    },
+    {
+      "epoch": 1.0552584670231728,
+      "grad_norm": 1.645479440689087,
+      "learning_rate": 2.7056491575817646e-06,
+      "loss": 0.9459,
+      "step": 592
+    },
+    {
+      "epoch": 1.0570409982174689,
+      "grad_norm": 1.7374376058578491,
+      "learning_rate": 2.700693756194252e-06,
+      "loss": 0.9432,
+      "step": 593
+    },
+    {
+      "epoch": 1.0588235294117647,
+      "grad_norm": 1.7686152458190918,
+      "learning_rate": 2.6957383548067395e-06,
+      "loss": 0.9301,
+      "step": 594
+    },
+    {
+      "epoch": 1.0606060606060606,
+      "grad_norm": 1.615399718284607,
+      "learning_rate": 2.690782953419227e-06,
+      "loss": 0.9238,
+      "step": 595
+    },
+    {
+      "epoch": 1.0623885918003566,
+      "grad_norm": 1.615858554840088,
+      "learning_rate": 2.6858275520317144e-06,
+      "loss": 0.904,
+      "step": 596
+    },
+    {
+      "epoch": 1.0641711229946524,
+      "grad_norm": 1.6182299852371216,
+      "learning_rate": 2.6808721506442025e-06,
+      "loss": 0.8771,
+      "step": 597
+    },
+    {
+      "epoch": 1.0659536541889483,
+      "grad_norm": 1.731257438659668,
+      "learning_rate": 2.67591674925669e-06,
+      "loss": 0.9317,
+      "step": 598
+    },
+    {
+      "epoch": 1.067736185383244,
+      "grad_norm": 1.6173752546310425,
+      "learning_rate": 2.6709613478691778e-06,
+      "loss": 0.9584,
+      "step": 599
+    },
+    {
+      "epoch": 1.0695187165775402,
+      "grad_norm": 1.714568853378296,
+      "learning_rate": 2.666005946481665e-06,
+      "loss": 0.9318,
+      "step": 600
+    },
+    {
+      "epoch": 1.0695187165775402,
+      "eval_loss": 1.0215412378311157,
+      "eval_runtime": 25.9358,
+      "eval_samples_per_second": 38.557,
+      "eval_steps_per_second": 2.429,
+      "step": 600
+    },
+    {
+      "epoch": 1.071301247771836,
+      "grad_norm": 1.6474616527557373,
+      "learning_rate": 2.6610505450941527e-06,
+      "loss": 0.9121,
+      "step": 601
+    },
+    {
+      "epoch": 1.0730837789661318,
+      "grad_norm": 1.569778323173523,
+      "learning_rate": 2.6560951437066408e-06,
+      "loss": 0.9517,
+      "step": 602
+    },
+    {
+      "epoch": 1.0748663101604279,
+      "grad_norm": 1.6712771654129028,
+      "learning_rate": 2.651139742319128e-06,
+      "loss": 0.9473,
+      "step": 603
+    },
+    {
+      "epoch": 1.0766488413547237,
+      "grad_norm": 1.657092571258545,
+      "learning_rate": 2.6461843409316157e-06,
+      "loss": 0.9295,
+      "step": 604
+    },
+    {
+      "epoch": 1.0784313725490196,
+      "grad_norm": 1.5856530666351318,
+      "learning_rate": 2.6412289395441033e-06,
+      "loss": 0.9323,
+      "step": 605
+    },
+    {
+      "epoch": 1.0802139037433156,
+      "grad_norm": 1.6494325399398804,
+      "learning_rate": 2.6362735381565906e-06,
+      "loss": 0.9229,
+      "step": 606
+    },
+    {
+      "epoch": 1.0819964349376114,
+      "grad_norm": 1.6499768495559692,
+      "learning_rate": 2.6313181367690786e-06,
+      "loss": 0.9304,
+      "step": 607
+    },
+    {
+      "epoch": 1.0837789661319073,
+      "grad_norm": 1.6964021921157837,
+      "learning_rate": 2.6263627353815663e-06,
+      "loss": 0.9338,
+      "step": 608
+    },
+    {
+      "epoch": 1.085561497326203,
+      "grad_norm": 1.6379368305206299,
+      "learning_rate": 2.621407333994054e-06,
+      "loss": 0.9413,
+      "step": 609
+    },
+    {
+      "epoch": 1.0873440285204992,
+      "grad_norm": 1.690470576286316,
+      "learning_rate": 2.616451932606541e-06,
+      "loss": 0.9529,
+      "step": 610
+    },
+    {
+      "epoch": 1.0873440285204992,
+      "eval_loss": 1.0203608274459839,
+      "eval_runtime": 25.7541,
+      "eval_samples_per_second": 38.829,
+      "eval_steps_per_second": 2.446,
+      "step": 610
+    },
+    {
+      "epoch": 1.089126559714795,
+      "grad_norm": 1.669008731842041,
+      "learning_rate": 2.611496531219029e-06,
+      "loss": 0.915,
+      "step": 611
+    },
+    {
+      "epoch": 1.0909090909090908,
+      "grad_norm": 1.6885886192321777,
+      "learning_rate": 2.606541129831516e-06,
+      "loss": 0.9694,
+      "step": 612
+    },
+    {
+      "epoch": 1.0926916221033869,
+      "grad_norm": 1.6819870471954346,
+      "learning_rate": 2.601585728444004e-06,
+      "loss": 0.894,
+      "step": 613
+    },
+    {
+      "epoch": 1.0944741532976827,
+      "grad_norm": 1.686949372291565,
+      "learning_rate": 2.596630327056492e-06,
+      "loss": 0.946,
+      "step": 614
+    },
+    {
+      "epoch": 1.0962566844919786,
+      "grad_norm": 1.7434266805648804,
+      "learning_rate": 2.5916749256689795e-06,
+      "loss": 0.9535,
+      "step": 615
+    },
+    {
+      "epoch": 1.0980392156862746,
+      "grad_norm": 1.6421117782592773,
+      "learning_rate": 2.5867195242814668e-06,
+      "loss": 0.9247,
+      "step": 616
+    },
+    {
+      "epoch": 1.0998217468805704,
+      "grad_norm": 1.7280988693237305,
+      "learning_rate": 2.5817641228939544e-06,
+      "loss": 0.9098,
+      "step": 617
+    },
+    {
+      "epoch": 1.1016042780748663,
+      "grad_norm": 1.6625720262527466,
+      "learning_rate": 2.5768087215064425e-06,
+      "loss": 0.916,
+      "step": 618
+    },
+    {
+      "epoch": 1.1033868092691623,
+      "grad_norm": 1.6401927471160889,
+      "learning_rate": 2.57185332011893e-06,
+      "loss": 0.9642,
+      "step": 619
+    },
+    {
+      "epoch": 1.1051693404634582,
+      "grad_norm": 1.6819369792938232,
+      "learning_rate": 2.5668979187314174e-06,
+      "loss": 0.9489,
+      "step": 620
+    },
+    {
+      "epoch": 1.1051693404634582,
+      "eval_loss": 1.0213559865951538,
+      "eval_runtime": 25.855,
+      "eval_samples_per_second": 38.677,
+      "eval_steps_per_second": 2.437,
+      "step": 620
+    },
+    {
+      "epoch": 1.106951871657754,
+      "grad_norm": 1.6465580463409424,
+      "learning_rate": 2.561942517343905e-06,
+      "loss": 0.936,
+      "step": 621
+    },
+    {
+      "epoch": 1.1087344028520498,
+      "grad_norm": 1.751806616783142,
+      "learning_rate": 2.5569871159563923e-06,
+      "loss": 0.9177,
+      "step": 622
+    },
+    {
+      "epoch": 1.1105169340463459,
+      "grad_norm": 1.6855638027191162,
+      "learning_rate": 2.5520317145688804e-06,
+      "loss": 0.973,
+      "step": 623
+    },
+    {
+      "epoch": 1.1122994652406417,
+      "grad_norm": 1.575648546218872,
+      "learning_rate": 2.547076313181368e-06,
+      "loss": 0.9475,
+      "step": 624
+    },
+    {
+      "epoch": 1.1140819964349375,
+      "grad_norm": 1.7097973823547363,
+      "learning_rate": 2.5421209117938557e-06,
+      "loss": 0.941,
+      "step": 625
+    },
+    {
+      "epoch": 1.1158645276292336,
+      "grad_norm": 1.6338120698928833,
+      "learning_rate": 2.537165510406343e-06,
+      "loss": 1.003,
+      "step": 626
+    },
+    {
+      "epoch": 1.1176470588235294,
+      "grad_norm": 1.6847730875015259,
+      "learning_rate": 2.5322101090188306e-06,
+      "loss": 0.9165,
+      "step": 627
+    },
+    {
+      "epoch": 1.1194295900178253,
+      "grad_norm": 1.5890871286392212,
+      "learning_rate": 2.5272547076313183e-06,
+      "loss": 0.9363,
+      "step": 628
+    },
+    {
+      "epoch": 1.121212121212121,
+      "grad_norm": 1.5626202821731567,
+      "learning_rate": 2.522299306243806e-06,
+      "loss": 0.9601,
+      "step": 629
+    },
+    {
+      "epoch": 1.1229946524064172,
+      "grad_norm": 1.5947809219360352,
+      "learning_rate": 2.5173439048562936e-06,
+      "loss": 0.9332,
+      "step": 630
+    },
+    {
+      "epoch": 1.1229946524064172,
+      "eval_loss": 1.0196335315704346,
+      "eval_runtime": 25.97,
+      "eval_samples_per_second": 38.506,
+      "eval_steps_per_second": 2.426,
+      "step": 630
+    },
+    {
+      "epoch": 1.124777183600713,
+      "grad_norm": 1.5689764022827148,
+      "learning_rate": 2.5123885034687813e-06,
+      "loss": 0.9045,
+      "step": 631
+    },
+    {
+      "epoch": 1.1265597147950088,
+      "grad_norm": 1.680080771446228,
+      "learning_rate": 2.5074331020812685e-06,
+      "loss": 0.9581,
+      "step": 632
+    },
+    {
+      "epoch": 1.1283422459893049,
+      "grad_norm": 1.6683106422424316,
+      "learning_rate": 2.502477700693756e-06,
+      "loss": 0.9295,
+      "step": 633
+    },
+    {
+      "epoch": 1.1301247771836007,
+      "grad_norm": 1.6343003511428833,
+      "learning_rate": 2.497522299306244e-06,
+      "loss": 0.9312,
+      "step": 634
+    },
+    {
+      "epoch": 1.1319073083778965,
+      "grad_norm": 1.6656992435455322,
+      "learning_rate": 2.492566897918732e-06,
+      "loss": 0.9265,
+      "step": 635
+    },
+    {
+      "epoch": 1.1336898395721926,
+      "grad_norm": 1.7076340913772583,
+      "learning_rate": 2.487611496531219e-06,
+      "loss": 0.9205,
+      "step": 636
+    },
+    {
+      "epoch": 1.1354723707664884,
+      "grad_norm": 1.6243927478790283,
+      "learning_rate": 2.482656095143707e-06,
+      "loss": 0.9429,
+      "step": 637
+    },
+    {
+      "epoch": 1.1372549019607843,
+      "grad_norm": 1.6368482112884521,
+      "learning_rate": 2.4777006937561945e-06,
+      "loss": 0.9155,
+      "step": 638
+    },
+    {
+      "epoch": 1.1390374331550803,
+      "grad_norm": 1.5730035305023193,
+      "learning_rate": 2.472745292368682e-06,
+      "loss": 0.9415,
+      "step": 639
+    },
+    {
+      "epoch": 1.1408199643493762,
+      "grad_norm": 1.5617438554763794,
+      "learning_rate": 2.4677898909811694e-06,
+      "loss": 0.9295,
+      "step": 640
+    },
+    {
+      "epoch": 1.1408199643493762,
+      "eval_loss": 1.0185329914093018,
+      "eval_runtime": 25.9777,
+      "eval_samples_per_second": 38.495,
+      "eval_steps_per_second": 2.425,
+      "step": 640
+    },
+    {
+      "epoch": 1.142602495543672,
+      "grad_norm": 1.5681148767471313,
+      "learning_rate": 2.4628344895936575e-06,
+      "loss": 0.9041,
+      "step": 641
+    },
+    {
+      "epoch": 1.1443850267379678,
+      "grad_norm": 1.6187138557434082,
+      "learning_rate": 2.4578790882061447e-06,
+      "loss": 0.9238,
+      "step": 642
+    },
+    {
+      "epoch": 1.1461675579322639,
+      "grad_norm": 1.5519737005233765,
+      "learning_rate": 2.4529236868186328e-06,
+      "loss": 0.922,
+      "step": 643
+    },
+    {
+      "epoch": 1.1479500891265597,
+      "grad_norm": 1.7167925834655762,
+      "learning_rate": 2.44796828543112e-06,
+      "loss": 0.9255,
+      "step": 644
+    },
+    {
+      "epoch": 1.1497326203208555,
+      "grad_norm": 1.6633977890014648,
+      "learning_rate": 2.4430128840436077e-06,
+      "loss": 0.9498,
+      "step": 645
+    },
+    {
+      "epoch": 1.1515151515151516,
+      "grad_norm": 1.593684196472168,
+      "learning_rate": 2.4380574826560953e-06,
+      "loss": 0.903,
+      "step": 646
+    },
+    {
+      "epoch": 1.1532976827094474,
+      "grad_norm": 1.733375906944275,
+      "learning_rate": 2.433102081268583e-06,
+      "loss": 0.9358,
+      "step": 647
+    },
+    {
+      "epoch": 1.1550802139037433,
+      "grad_norm": 1.6569396257400513,
+      "learning_rate": 2.4281466798810702e-06,
+      "loss": 0.9104,
+      "step": 648
+    },
+    {
+      "epoch": 1.156862745098039,
+      "grad_norm": 1.7154539823532104,
+      "learning_rate": 2.4231912784935583e-06,
+      "loss": 0.922,
+      "step": 649
+    },
+    {
+      "epoch": 1.1586452762923352,
+      "grad_norm": 1.6517035961151123,
+      "learning_rate": 2.4182358771060456e-06,
+      "loss": 0.969,
+      "step": 650
+    },
+    {
+      "epoch": 1.1586452762923352,
+      "eval_loss": 1.020390272140503,
+      "eval_runtime": 26.0598,
+      "eval_samples_per_second": 38.373,
+      "eval_steps_per_second": 2.418,
+      "step": 650
+    },
+    {
+      "epoch": 1.160427807486631,
+      "grad_norm": 1.5234638452529907,
+      "learning_rate": 2.4132804757185337e-06,
+      "loss": 0.8913,
+      "step": 651
+    },
+    {
+      "epoch": 1.1622103386809268,
+      "grad_norm": 1.697643756866455,
+      "learning_rate": 2.408325074331021e-06,
+      "loss": 0.9231,
+      "step": 652
+    },
+    {
+      "epoch": 1.1639928698752229,
+      "grad_norm": 1.6353060007095337,
+      "learning_rate": 2.4033696729435086e-06,
+      "loss": 0.9438,
+      "step": 653
+    },
+    {
+      "epoch": 1.1657754010695187,
+      "grad_norm": 1.615633487701416,
+      "learning_rate": 2.3984142715559962e-06,
+      "loss": 0.9352,
+      "step": 654
+    },
+    {
+      "epoch": 1.1675579322638145,
+      "grad_norm": 1.6155204772949219,
+      "learning_rate": 2.393458870168484e-06,
+      "loss": 0.9843,
+      "step": 655
+    },
+    {
+      "epoch": 1.1693404634581106,
+      "grad_norm": 1.696890115737915,
+      "learning_rate": 2.3885034687809715e-06,
+      "loss": 0.888,
+      "step": 656
+    },
+    {
+      "epoch": 1.1711229946524064,
+      "grad_norm": 1.7331446409225464,
+      "learning_rate": 2.383548067393459e-06,
+      "loss": 0.9122,
+      "step": 657
+    },
+    {
+      "epoch": 1.1729055258467023,
+      "grad_norm": 1.7322742938995361,
+      "learning_rate": 2.3785926660059464e-06,
+      "loss": 0.9233,
+      "step": 658
+    },
+    {
+      "epoch": 1.1746880570409983,
+      "grad_norm": 1.7611942291259766,
+      "learning_rate": 2.3736372646184345e-06,
+      "loss": 0.9364,
+      "step": 659
+    },
+    {
+      "epoch": 1.1764705882352942,
+      "grad_norm": 1.6479074954986572,
+      "learning_rate": 2.3686818632309218e-06,
+      "loss": 0.9462,
+      "step": 660
+    },
+    {
+      "epoch": 1.1764705882352942,
+      "eval_loss": 1.0187642574310303,
+      "eval_runtime": 25.8143,
+      "eval_samples_per_second": 38.738,
+      "eval_steps_per_second": 2.441,
+      "step": 660
+    },
+    {
+      "epoch": 1.17825311942959,
+      "grad_norm": 1.7428786754608154,
+      "learning_rate": 2.3637264618434094e-06,
+      "loss": 0.9287,
+      "step": 661
+    },
+    {
+      "epoch": 1.1800356506238858,
+      "grad_norm": 1.680544376373291,
+      "learning_rate": 2.358771060455897e-06,
+      "loss": 0.916,
+      "step": 662
+    },
+    {
+      "epoch": 1.1818181818181819,
+      "grad_norm": 1.5923148393630981,
+      "learning_rate": 2.3538156590683847e-06,
+      "loss": 0.945,
+      "step": 663
+    },
+    {
+      "epoch": 1.1836007130124777,
+      "grad_norm": 1.616050124168396,
+      "learning_rate": 2.3488602576808724e-06,
+      "loss": 0.9443,
+      "step": 664
+    },
+    {
+      "epoch": 1.1853832442067735,
+      "grad_norm": 1.5984090566635132,
+      "learning_rate": 2.34390485629336e-06,
+      "loss": 0.9196,
+      "step": 665
+    },
+    {
+      "epoch": 1.1871657754010696,
+      "grad_norm": 1.611007571220398,
+      "learning_rate": 2.3389494549058473e-06,
+      "loss": 0.9511,
+      "step": 666
+    },
+    {
+      "epoch": 1.1889483065953654,
+      "grad_norm": 1.755518913269043,
+      "learning_rate": 2.3339940535183354e-06,
+      "loss": 0.9542,
+      "step": 667
+    },
+    {
+      "epoch": 1.1907308377896613,
+      "grad_norm": 1.6217259168624878,
+      "learning_rate": 2.3290386521308226e-06,
+      "loss": 0.9308,
+      "step": 668
+    },
+    {
+      "epoch": 1.192513368983957,
+      "grad_norm": 1.6171029806137085,
+      "learning_rate": 2.3240832507433103e-06,
+      "loss": 0.9602,
+      "step": 669
+    },
+    {
+      "epoch": 1.1942959001782532,
+      "grad_norm": 1.581703543663025,
+      "learning_rate": 2.319127849355798e-06,
+      "loss": 0.9284,
+      "step": 670
+    },
+    {
+      "epoch": 1.1942959001782532,
+      "eval_loss": 1.0156581401824951,
+      "eval_runtime": 25.9436,
+      "eval_samples_per_second": 38.545,
+      "eval_steps_per_second": 2.428,
+      "step": 670
+    },
+    {
+      "epoch": 1.196078431372549,
+      "grad_norm": 1.608201026916504,
+      "learning_rate": 2.3141724479682856e-06,
+      "loss": 0.9226,
+      "step": 671
+    },
+    {
+      "epoch": 1.1978609625668448,
+      "grad_norm": 1.6719226837158203,
+      "learning_rate": 2.3092170465807733e-06,
+      "loss": 0.9547,
+      "step": 672
+    },
+    {
+      "epoch": 1.1996434937611409,
+      "grad_norm": 1.570652723312378,
+      "learning_rate": 2.304261645193261e-06,
+      "loss": 0.9306,
+      "step": 673
+    },
+    {
+      "epoch": 1.2014260249554367,
+      "grad_norm": 1.6573023796081543,
+      "learning_rate": 2.2993062438057486e-06,
+      "loss": 0.9672,
+      "step": 674
+    },
+    {
+      "epoch": 1.2032085561497325,
+      "grad_norm": 1.7345811128616333,
+      "learning_rate": 2.2943508424182363e-06,
+      "loss": 0.9166,
+      "step": 675
+    },
+    {
+      "epoch": 1.2049910873440286,
+      "grad_norm": 1.6902786493301392,
+      "learning_rate": 2.2893954410307235e-06,
+      "loss": 0.9546,
+      "step": 676
+    },
+    {
+      "epoch": 1.2067736185383244,
+      "grad_norm": 1.7392154932022095,
+      "learning_rate": 2.284440039643211e-06,
+      "loss": 0.9053,
+      "step": 677
+    },
+    {
+      "epoch": 1.2085561497326203,
+      "grad_norm": 1.5849647521972656,
+      "learning_rate": 2.279484638255699e-06,
+      "loss": 0.9533,
+      "step": 678
+    },
+    {
+      "epoch": 1.2103386809269163,
+      "grad_norm": 1.638115406036377,
+      "learning_rate": 2.2745292368681865e-06,
+      "loss": 0.9489,
+      "step": 679
+    },
+    {
+      "epoch": 1.2121212121212122,
+      "grad_norm": 1.6124337911605835,
+      "learning_rate": 2.269573835480674e-06,
+      "loss": 0.9775,
+      "step": 680
+    },
+    {
+      "epoch": 1.2121212121212122,
+      "eval_loss": 1.0148694515228271,
+      "eval_runtime": 25.8646,
+      "eval_samples_per_second": 38.663,
+      "eval_steps_per_second": 2.436,
+      "step": 680
+    },
+    {
+      "epoch": 1.213903743315508,
+      "grad_norm": 1.5980111360549927,
+      "learning_rate": 2.264618434093162e-06,
+      "loss": 0.9381,
+      "step": 681
+    },
+    {
+      "epoch": 1.215686274509804,
+      "grad_norm": 1.6980215311050415,
+      "learning_rate": 2.2596630327056495e-06,
+      "loss": 0.9105,
+      "step": 682
+    },
+    {
+      "epoch": 1.2174688057040999,
+      "grad_norm": 1.595822811126709,
+      "learning_rate": 2.254707631318137e-06,
+      "loss": 0.9326,
+      "step": 683
+    },
+    {
+      "epoch": 1.2192513368983957,
+      "grad_norm": 1.6184011697769165,
+      "learning_rate": 2.2497522299306244e-06,
+      "loss": 0.9102,
+      "step": 684
+    },
+    {
+      "epoch": 1.2210338680926915,
+      "grad_norm": 1.6323550939559937,
+      "learning_rate": 2.244796828543112e-06,
+      "loss": 0.937,
+      "step": 685
+    },
+    {
+      "epoch": 1.2228163992869876,
+      "grad_norm": 1.7325725555419922,
+      "learning_rate": 2.2398414271555997e-06,
+      "loss": 0.914,
+      "step": 686
+    },
+    {
+      "epoch": 1.2245989304812834,
+      "grad_norm": 1.7137166261672974,
+      "learning_rate": 2.2348860257680874e-06,
+      "loss": 0.9083,
+      "step": 687
+    },
+    {
+      "epoch": 1.2263814616755793,
+      "grad_norm": 1.6467852592468262,
+      "learning_rate": 2.229930624380575e-06,
+      "loss": 0.9091,
+      "step": 688
+    },
+    {
+      "epoch": 1.228163992869875,
+      "grad_norm": 1.683382272720337,
+      "learning_rate": 2.2249752229930627e-06,
+      "loss": 0.9592,
+      "step": 689
+    },
+    {
+      "epoch": 1.2299465240641712,
+      "grad_norm": 1.6149805784225464,
+      "learning_rate": 2.2200198216055503e-06,
+      "loss": 0.9362,
+      "step": 690
+    },
+    {
+      "epoch": 1.2299465240641712,
+      "eval_loss": 1.0140960216522217,
+      "eval_runtime": 25.8555,
+      "eval_samples_per_second": 38.676,
+      "eval_steps_per_second": 2.437,
+      "step": 690
+    },
+    {
+      "epoch": 1.231729055258467,
+      "grad_norm": 1.8765431642532349,
+      "learning_rate": 2.215064420218038e-06,
+      "loss": 0.9462,
+      "step": 691
+    },
+    {
+      "epoch": 1.2335115864527628,
+      "grad_norm": 1.7470018863677979,
+      "learning_rate": 2.2101090188305257e-06,
+      "loss": 0.9459,
+      "step": 692
+    },
+    {
+      "epoch": 1.2352941176470589,
+      "grad_norm": 1.7016042470932007,
+      "learning_rate": 2.205153617443013e-06,
+      "loss": 0.9107,
+      "step": 693
+    },
+    {
+      "epoch": 1.2370766488413547,
+      "grad_norm": 1.6355701684951782,
+      "learning_rate": 2.2001982160555006e-06,
+      "loss": 0.9052,
+      "step": 694
+    },
+    {
+      "epoch": 1.2388591800356505,
+      "grad_norm": 1.7940343618392944,
+      "learning_rate": 2.1952428146679882e-06,
+      "loss": 0.9227,
+      "step": 695
+    },
+    {
+      "epoch": 1.2406417112299466,
+      "grad_norm": 1.992951512336731,
+      "learning_rate": 2.190287413280476e-06,
+      "loss": 0.9405,
+      "step": 696
+    },
+    {
+      "epoch": 1.2424242424242424,
+      "grad_norm": 1.83491849899292,
+      "learning_rate": 2.1853320118929636e-06,
+      "loss": 0.9517,
+      "step": 697
+    },
+    {
+      "epoch": 1.2442067736185383,
+      "grad_norm": 1.682455062866211,
+      "learning_rate": 2.1803766105054512e-06,
+      "loss": 0.903,
+      "step": 698
+    },
+    {
+      "epoch": 1.2459893048128343,
+      "grad_norm": 1.6958837509155273,
+      "learning_rate": 2.175421209117939e-06,
+      "loss": 0.9456,
+      "step": 699
+    },
+    {
+      "epoch": 1.2477718360071302,
+      "grad_norm": 1.7611721754074097,
+      "learning_rate": 2.1704658077304265e-06,
+      "loss": 0.9759,
+      "step": 700
+    },
+    {
+      "epoch": 1.2477718360071302,
+      "eval_loss": 1.0106265544891357,
+      "eval_runtime": 26.0725,
+      "eval_samples_per_second": 38.355,
+      "eval_steps_per_second": 2.416,
+      "step": 700
+    },
+    {
+      "epoch": 1.249554367201426,
+      "grad_norm": 1.5676270723342896,
+      "learning_rate": 2.1655104063429138e-06,
+      "loss": 0.9711,
+      "step": 701
+    },
+    {
+      "epoch": 1.251336898395722,
+      "grad_norm": 1.7543623447418213,
+      "learning_rate": 2.160555004955402e-06,
+      "loss": 0.919,
+      "step": 702
+    },
+    {
+      "epoch": 1.2531194295900179,
+      "grad_norm": 1.6871825456619263,
+      "learning_rate": 2.155599603567889e-06,
+      "loss": 0.9047,
+      "step": 703
+    },
+    {
+      "epoch": 1.2549019607843137,
+      "grad_norm": 1.7749131917953491,
+      "learning_rate": 2.1506442021803768e-06,
+      "loss": 0.9395,
+      "step": 704
+    },
+    {
+      "epoch": 1.2566844919786098,
+      "grad_norm": 1.681713342666626,
+      "learning_rate": 2.1456888007928644e-06,
+      "loss": 0.9127,
+      "step": 705
+    },
+    {
+      "epoch": 1.2584670231729056,
+      "grad_norm": 1.762078881263733,
+      "learning_rate": 2.140733399405352e-06,
+      "loss": 0.9399,
+      "step": 706
+    },
+    {
+      "epoch": 1.2602495543672014,
+      "grad_norm": 1.7800824642181396,
+      "learning_rate": 2.1357779980178398e-06,
+      "loss": 0.93,
+      "step": 707
+    },
+    {
+      "epoch": 1.2620320855614973,
+      "grad_norm": 1.7915018796920776,
+      "learning_rate": 2.1308225966303274e-06,
+      "loss": 0.968,
+      "step": 708
+    },
+    {
+      "epoch": 1.263814616755793,
+      "grad_norm": 1.7534856796264648,
+      "learning_rate": 2.1258671952428147e-06,
+      "loss": 0.9223,
+      "step": 709
+    },
+    {
+      "epoch": 1.2655971479500892,
+      "grad_norm": 1.744184136390686,
+      "learning_rate": 2.1209117938553027e-06,
+      "loss": 0.9232,
+      "step": 710
+    },
+    {
+      "epoch": 1.2655971479500892,
+      "eval_loss": 1.0113749504089355,
+      "eval_runtime": 25.7648,
+      "eval_samples_per_second": 38.813,
+      "eval_steps_per_second": 2.445,
+      "step": 710
+    },
+    {
+      "epoch": 1.267379679144385,
+      "grad_norm": 1.5799130201339722,
+      "learning_rate": 2.11595639246779e-06,
+      "loss": 0.9227,
+      "step": 711
+    },
+    {
+      "epoch": 1.2691622103386808,
+      "grad_norm": 1.7383415699005127,
+      "learning_rate": 2.1110009910802776e-06,
+      "loss": 0.9304,
+      "step": 712
+    },
+    {
+      "epoch": 1.2709447415329769,
+      "grad_norm": 1.7710599899291992,
+      "learning_rate": 2.1060455896927653e-06,
+      "loss": 0.9381,
+      "step": 713
+    },
+    {
+      "epoch": 1.2727272727272727,
+      "grad_norm": 1.5871422290802002,
+      "learning_rate": 2.101090188305253e-06,
+      "loss": 0.9352,
+      "step": 714
+    },
+    {
+      "epoch": 1.2745098039215685,
+      "grad_norm": 1.674737572669983,
+      "learning_rate": 2.0961347869177406e-06,
+      "loss": 0.9646,
+      "step": 715
+    },
+    {
+      "epoch": 1.2762923351158646,
+      "grad_norm": 1.628126859664917,
+      "learning_rate": 2.0911793855302283e-06,
+      "loss": 0.9404,
+      "step": 716
+    },
+    {
+      "epoch": 1.2780748663101604,
+      "grad_norm": 1.7360297441482544,
+      "learning_rate": 2.0862239841427155e-06,
+      "loss": 0.9211,
+      "step": 717
+    },
+    {
+      "epoch": 1.2798573975044563,
+      "grad_norm": 1.6000936031341553,
+      "learning_rate": 2.0812685827552036e-06,
+      "loss": 0.9307,
+      "step": 718
+    },
+    {
+      "epoch": 1.2816399286987523,
+      "grad_norm": 1.631537675857544,
+      "learning_rate": 2.076313181367691e-06,
+      "loss": 0.9143,
+      "step": 719
+    },
+    {
+      "epoch": 1.2834224598930482,
+      "grad_norm": 1.8007129430770874,
+      "learning_rate": 2.0713577799801785e-06,
+      "loss": 0.9377,
+      "step": 720
+    },
+    {
+      "epoch": 1.2834224598930482,
+      "eval_loss": 1.0131229162216187,
+      "eval_runtime": 25.8685,
+      "eval_samples_per_second": 38.657,
+      "eval_steps_per_second": 2.435,
+      "step": 720
+    },
+    {
+      "epoch": 1.285204991087344,
+      "grad_norm": 1.8879586458206177,
+      "learning_rate": 2.066402378592666e-06,
+      "loss": 0.9399,
+      "step": 721
+    },
+    {
+      "epoch": 1.28698752228164,
+      "grad_norm": 1.6320232152938843,
+      "learning_rate": 2.061446977205154e-06,
+      "loss": 0.9223,
+      "step": 722
+    },
+    {
+      "epoch": 1.2887700534759359,
+      "grad_norm": 1.6035704612731934,
+      "learning_rate": 2.0564915758176415e-06,
+      "loss": 0.9243,
+      "step": 723
+    },
+    {
+      "epoch": 1.2905525846702317,
+      "grad_norm": 1.6737124919891357,
+      "learning_rate": 2.051536174430129e-06,
+      "loss": 0.8922,
+      "step": 724
+    },
+    {
+      "epoch": 1.2923351158645278,
+      "grad_norm": 1.7025960683822632,
+      "learning_rate": 2.0465807730426164e-06,
+      "loss": 0.9059,
+      "step": 725
+    },
+    {
+      "epoch": 1.2941176470588236,
+      "grad_norm": 1.6543670892715454,
+      "learning_rate": 2.0416253716551045e-06,
+      "loss": 0.9196,
+      "step": 726
+    },
+    {
+      "epoch": 1.2959001782531194,
+      "grad_norm": 1.604996919631958,
+      "learning_rate": 2.0366699702675917e-06,
+      "loss": 0.91,
+      "step": 727
+    },
+    {
+      "epoch": 1.2976827094474153,
+      "grad_norm": 1.603367567062378,
+      "learning_rate": 2.0317145688800794e-06,
+      "loss": 0.9169,
+      "step": 728
+    },
+    {
+      "epoch": 1.299465240641711,
+      "grad_norm": 1.6768299341201782,
+      "learning_rate": 2.026759167492567e-06,
+      "loss": 0.9746,
+      "step": 729
+    },
+    {
+      "epoch": 1.3012477718360071,
+      "grad_norm": 1.7131927013397217,
+      "learning_rate": 2.0218037661050547e-06,
+      "loss": 0.9352,
+      "step": 730
+    },
+    {
+      "epoch": 1.3012477718360071,
+      "eval_loss": 1.0110057592391968,
+      "eval_runtime": 25.8117,
+      "eval_samples_per_second": 38.742,
+      "eval_steps_per_second": 2.441,
+      "step": 730
+    },
+    {
+      "epoch": 1.303030303030303,
+      "grad_norm": 1.7616232633590698,
+      "learning_rate": 2.0168483647175424e-06,
+      "loss": 0.9446,
+      "step": 731
+    },
+    {
+      "epoch": 1.3048128342245988,
+      "grad_norm": 1.6162147521972656,
+      "learning_rate": 2.01189296333003e-06,
+      "loss": 0.9045,
+      "step": 732
+    },
+    {
+      "epoch": 1.3065953654188949,
+      "grad_norm": 1.6433504819869995,
+      "learning_rate": 2.0069375619425173e-06,
+      "loss": 0.9129,
+      "step": 733
+    },
+    {
+      "epoch": 1.3083778966131907,
+      "grad_norm": 1.5753357410430908,
+      "learning_rate": 2.0019821605550054e-06,
+      "loss": 0.9225,
+      "step": 734
+    },
+    {
+      "epoch": 1.3101604278074865,
+      "grad_norm": 1.5571519136428833,
+      "learning_rate": 1.9970267591674926e-06,
+      "loss": 0.911,
+      "step": 735
+    },
+    {
+      "epoch": 1.3119429590017826,
+      "grad_norm": 1.5970759391784668,
+      "learning_rate": 1.9920713577799803e-06,
+      "loss": 0.9415,
+      "step": 736
+    },
+    {
+      "epoch": 1.3137254901960784,
+      "grad_norm": 1.6597561836242676,
+      "learning_rate": 1.987115956392468e-06,
+      "loss": 0.9174,
+      "step": 737
+    },
+    {
+      "epoch": 1.3155080213903743,
+      "grad_norm": 1.6476688385009766,
+      "learning_rate": 1.9821605550049556e-06,
+      "loss": 0.9412,
+      "step": 738
+    },
+    {
+      "epoch": 1.3172905525846703,
+      "grad_norm": 1.6584250926971436,
+      "learning_rate": 1.9772051536174432e-06,
+      "loss": 0.8864,
+      "step": 739
+    },
+    {
+      "epoch": 1.3190730837789661,
+      "grad_norm": 1.5933711528778076,
+      "learning_rate": 1.972249752229931e-06,
+      "loss": 0.9246,
+      "step": 740
+    },
+    {
+      "epoch": 1.3190730837789661,
+      "eval_loss": 1.0107917785644531,
+      "eval_runtime": 25.8955,
+      "eval_samples_per_second": 38.617,
+      "eval_steps_per_second": 2.433,
+      "step": 740
+    },
+    {
+      "epoch": 1.320855614973262,
+      "grad_norm": 1.6883563995361328,
+      "learning_rate": 1.967294350842418e-06,
+      "loss": 0.936,
+      "step": 741
+    },
+    {
+      "epoch": 1.322638146167558,
+      "grad_norm": 1.5187584161758423,
+      "learning_rate": 1.9623389494549062e-06,
+      "loss": 0.878,
+      "step": 742
+    },
+    {
+      "epoch": 1.3244206773618539,
+      "grad_norm": 1.6593390703201294,
+      "learning_rate": 1.9573835480673935e-06,
+      "loss": 0.8939,
+      "step": 743
+    },
+    {
+      "epoch": 1.3262032085561497,
+      "grad_norm": 1.630261778831482,
+      "learning_rate": 1.952428146679881e-06,
+      "loss": 0.9111,
+      "step": 744
+    },
+    {
+      "epoch": 1.3279857397504458,
+      "grad_norm": 1.6974488496780396,
+      "learning_rate": 1.9474727452923688e-06,
+      "loss": 0.941,
+      "step": 745
+    },
+    {
+      "epoch": 1.3297682709447416,
+      "grad_norm": 1.6816980838775635,
+      "learning_rate": 1.9425173439048564e-06,
+      "loss": 0.934,
+      "step": 746
+    },
+    {
+      "epoch": 1.3315508021390374,
+      "grad_norm": 1.6839442253112793,
+      "learning_rate": 1.937561942517344e-06,
+      "loss": 0.9156,
+      "step": 747
+    },
+    {
+      "epoch": 1.3333333333333333,
+      "grad_norm": 1.6905910968780518,
+      "learning_rate": 1.9326065411298318e-06,
+      "loss": 0.8929,
+      "step": 748
+    },
+    {
+      "epoch": 1.3351158645276293,
+      "grad_norm": 1.6082037687301636,
+      "learning_rate": 1.927651139742319e-06,
+      "loss": 0.8903,
+      "step": 749
+    },
+    {
+      "epoch": 1.3368983957219251,
+      "grad_norm": 1.6515074968338013,
+      "learning_rate": 1.922695738354807e-06,
+      "loss": 0.9105,
+      "step": 750
+    },
+    {
+      "epoch": 1.3368983957219251,
+      "eval_loss": 1.0095351934432983,
+      "eval_runtime": 25.9946,
+      "eval_samples_per_second": 38.47,
+      "eval_steps_per_second": 2.424,
+      "step": 750
+    },
+    {
+      "epoch": 1.338680926916221,
+      "grad_norm": 1.6954472064971924,
+      "learning_rate": 1.9177403369672943e-06,
+      "loss": 0.9011,
+      "step": 751
+    },
+    {
+      "epoch": 1.3404634581105168,
+      "grad_norm": 1.7742551565170288,
+      "learning_rate": 1.912784935579782e-06,
+      "loss": 0.931,
+      "step": 752
+    },
+    {
+      "epoch": 1.3422459893048129,
+      "grad_norm": 1.6957601308822632,
+      "learning_rate": 1.9078295341922697e-06,
+      "loss": 0.8868,
+      "step": 753
+    },
+    {
+      "epoch": 1.3440285204991087,
+      "grad_norm": 1.6613423824310303,
+      "learning_rate": 1.9028741328047573e-06,
+      "loss": 0.9428,
+      "step": 754
+    },
+    {
+      "epoch": 1.3458110516934045,
+      "grad_norm": 1.7479135990142822,
+      "learning_rate": 1.897918731417245e-06,
+      "loss": 0.9423,
+      "step": 755
+    },
+    {
+      "epoch": 1.3475935828877006,
+      "grad_norm": 1.7856110334396362,
+      "learning_rate": 1.8929633300297326e-06,
+      "loss": 0.9013,
+      "step": 756
+    },
+    {
+      "epoch": 1.3493761140819964,
+      "grad_norm": 1.7142354249954224,
+      "learning_rate": 1.88800792864222e-06,
+      "loss": 0.9013,
+      "step": 757
+    },
+    {
+      "epoch": 1.3511586452762923,
+      "grad_norm": 1.671119213104248,
+      "learning_rate": 1.883052527254708e-06,
+      "loss": 0.9154,
+      "step": 758
+    },
+    {
+      "epoch": 1.3529411764705883,
+      "grad_norm": 1.576013207435608,
+      "learning_rate": 1.8780971258671954e-06,
+      "loss": 0.9175,
+      "step": 759
+    },
+    {
+      "epoch": 1.3547237076648841,
+      "grad_norm": 1.6630672216415405,
+      "learning_rate": 1.8731417244796829e-06,
+      "loss": 0.9157,
+      "step": 760
+    },
+    {
+      "epoch": 1.3547237076648841,
+      "eval_loss": 1.009709358215332,
+      "eval_runtime": 25.8704,
+      "eval_samples_per_second": 38.654,
+      "eval_steps_per_second": 2.435,
+      "step": 760
+    },
+    {
+      "epoch": 1.35650623885918,
+      "grad_norm": 1.6932988166809082,
+      "learning_rate": 1.8681863230921707e-06,
+      "loss": 0.914,
+      "step": 761
+    },
+    {
+      "epoch": 1.358288770053476,
+      "grad_norm": 1.6010832786560059,
+      "learning_rate": 1.8632309217046582e-06,
+      "loss": 0.9275,
+      "step": 762
+    },
+    {
+      "epoch": 1.3600713012477719,
+      "grad_norm": 1.722975730895996,
+      "learning_rate": 1.858275520317146e-06,
+      "loss": 0.9304,
+      "step": 763
+    },
+    {
+      "epoch": 1.3618538324420677,
+      "grad_norm": 1.6667062044143677,
+      "learning_rate": 1.8533201189296335e-06,
+      "loss": 0.9618,
+      "step": 764
+    },
+    {
+      "epoch": 1.3636363636363638,
+      "grad_norm": 1.6722793579101562,
+      "learning_rate": 1.848364717542121e-06,
+      "loss": 0.9219,
+      "step": 765
+    },
+    {
+      "epoch": 1.3654188948306596,
+      "grad_norm": 1.6280299425125122,
+      "learning_rate": 1.8434093161546088e-06,
+      "loss": 0.8931,
+      "step": 766
+    },
+    {
+      "epoch": 1.3672014260249554,
+      "grad_norm": 1.6584783792495728,
+      "learning_rate": 1.8384539147670963e-06,
+      "loss": 0.9327,
+      "step": 767
+    },
+    {
+      "epoch": 1.3689839572192513,
+      "grad_norm": 1.8174241781234741,
+      "learning_rate": 1.8334985133795837e-06,
+      "loss": 0.9375,
+      "step": 768
+    },
+    {
+      "epoch": 1.3707664884135473,
+      "grad_norm": 1.8208683729171753,
+      "learning_rate": 1.8285431119920716e-06,
+      "loss": 0.9005,
+      "step": 769
+    },
+    {
+      "epoch": 1.3725490196078431,
+      "grad_norm": 1.6982202529907227,
+      "learning_rate": 1.823587710604559e-06,
+      "loss": 0.9188,
+      "step": 770
+    },
+    {
+      "epoch": 1.3725490196078431,
+      "eval_loss": 1.0106472969055176,
+      "eval_runtime": 25.86,
+      "eval_samples_per_second": 38.67,
+      "eval_steps_per_second": 2.436,
+      "step": 770
+    },
+    {
+      "epoch": 1.374331550802139,
+      "grad_norm": 1.5650068521499634,
+      "learning_rate": 1.818632309217047e-06,
+      "loss": 0.9318,
+      "step": 771
+    },
+    {
+      "epoch": 1.3761140819964348,
+      "grad_norm": 1.6363683938980103,
+      "learning_rate": 1.8136769078295344e-06,
+      "loss": 0.9317,
+      "step": 772
+    },
+    {
+      "epoch": 1.3778966131907309,
+      "grad_norm": 1.6135826110839844,
+      "learning_rate": 1.8087215064420218e-06,
+      "loss": 0.896,
+      "step": 773
+    },
+    {
+      "epoch": 1.3796791443850267,
+      "grad_norm": 1.6176116466522217,
+      "learning_rate": 1.8037661050545097e-06,
+      "loss": 0.8821,
+      "step": 774
+    },
+    {
+      "epoch": 1.3814616755793225,
+      "grad_norm": 1.7548078298568726,
+      "learning_rate": 1.7988107036669972e-06,
+      "loss": 0.9173,
+      "step": 775
+    },
+    {
+      "epoch": 1.3832442067736186,
+      "grad_norm": 1.79991614818573,
+      "learning_rate": 1.7938553022794846e-06,
+      "loss": 0.9288,
+      "step": 776
+    },
+    {
+      "epoch": 1.3850267379679144,
+      "grad_norm": 1.747171401977539,
+      "learning_rate": 1.7888999008919725e-06,
+      "loss": 0.9438,
+      "step": 777
+    },
+    {
+      "epoch": 1.3868092691622103,
+      "grad_norm": 1.6585170030593872,
+      "learning_rate": 1.78394449950446e-06,
+      "loss": 0.9284,
+      "step": 778
+    },
+    {
+      "epoch": 1.3885918003565063,
+      "grad_norm": 1.8032984733581543,
+      "learning_rate": 1.7789890981169478e-06,
+      "loss": 0.9052,
+      "step": 779
+    },
+    {
+      "epoch": 1.3903743315508021,
+      "grad_norm": 1.7318456172943115,
+      "learning_rate": 1.7740336967294353e-06,
+      "loss": 0.9191,
+      "step": 780
+    },
+    {
+      "epoch": 1.3903743315508021,
+      "eval_loss": 1.0097719430923462,
+      "eval_runtime": 25.8995,
+      "eval_samples_per_second": 38.611,
+      "eval_steps_per_second": 2.432,
+      "step": 780
+    },
+    {
+      "epoch": 1.392156862745098,
+      "grad_norm": 1.6782145500183105,
+      "learning_rate": 1.7690782953419227e-06,
+      "loss": 0.9304,
+      "step": 781
+    },
+    {
+      "epoch": 1.393939393939394,
+      "grad_norm": 1.6794602870941162,
+      "learning_rate": 1.7641228939544106e-06,
+      "loss": 0.8886,
+      "step": 782
+    },
+    {
+      "epoch": 1.3957219251336899,
+      "grad_norm": 1.6758161783218384,
+      "learning_rate": 1.759167492566898e-06,
+      "loss": 0.937,
+      "step": 783
+    },
+    {
+      "epoch": 1.3975044563279857,
+      "grad_norm": 1.7178926467895508,
+      "learning_rate": 1.7542120911793855e-06,
+      "loss": 0.9135,
+      "step": 784
+    },
+    {
+      "epoch": 1.3992869875222818,
+      "grad_norm": 1.7379509210586548,
+      "learning_rate": 1.7492566897918734e-06,
+      "loss": 0.9108,
+      "step": 785
+    },
+    {
+      "epoch": 1.4010695187165776,
+      "grad_norm": 1.9025062322616577,
+      "learning_rate": 1.7443012884043608e-06,
+      "loss": 0.9718,
+      "step": 786
+    },
+    {
+      "epoch": 1.4028520499108734,
+      "grad_norm": 1.7181079387664795,
+      "learning_rate": 1.7393458870168487e-06,
+      "loss": 0.9131,
+      "step": 787
+    },
+    {
+      "epoch": 1.4046345811051695,
+      "grad_norm": 1.8086258172988892,
+      "learning_rate": 1.7343904856293361e-06,
+      "loss": 0.9251,
+      "step": 788
+    },
+    {
+      "epoch": 1.4064171122994653,
+      "grad_norm": 1.721421718597412,
+      "learning_rate": 1.7294350842418236e-06,
+      "loss": 0.9466,
+      "step": 789
+    },
+    {
+      "epoch": 1.4081996434937611,
+      "grad_norm": 1.7858272790908813,
+      "learning_rate": 1.7244796828543115e-06,
+      "loss": 0.9413,
+      "step": 790
+    },
+    {
+      "epoch": 1.4081996434937611,
+      "eval_loss": 1.0092817544937134,
+      "eval_runtime": 26.014,
+      "eval_samples_per_second": 38.441,
+      "eval_steps_per_second": 2.422,
+      "step": 790
+    },
+    {
+      "epoch": 1.409982174688057,
+      "grad_norm": 1.6175751686096191,
+      "learning_rate": 1.719524281466799e-06,
+      "loss": 0.9282,
+      "step": 791
+    },
+    {
+      "epoch": 1.4117647058823528,
+      "grad_norm": 1.6843942403793335,
+      "learning_rate": 1.7145688800792864e-06,
+      "loss": 0.8947,
+      "step": 792
+    },
+    {
+      "epoch": 1.4135472370766489,
+      "grad_norm": 1.639022946357727,
+      "learning_rate": 1.7096134786917742e-06,
+      "loss": 0.9174,
+      "step": 793
+    },
+    {
+      "epoch": 1.4153297682709447,
+      "grad_norm": 1.6527246236801147,
+      "learning_rate": 1.7046580773042617e-06,
+      "loss": 0.9325,
+      "step": 794
+    },
+    {
+      "epoch": 1.4171122994652405,
+      "grad_norm": 1.6931911706924438,
+      "learning_rate": 1.6997026759167496e-06,
+      "loss": 0.9105,
+      "step": 795
+    },
+    {
+      "epoch": 1.4188948306595366,
+      "grad_norm": 1.626141905784607,
+      "learning_rate": 1.694747274529237e-06,
+      "loss": 0.8941,
+      "step": 796
+    },
+    {
+      "epoch": 1.4206773618538324,
+      "grad_norm": 1.7113869190216064,
+      "learning_rate": 1.6897918731417245e-06,
+      "loss": 0.9012,
+      "step": 797
+    },
+    {
+      "epoch": 1.4224598930481283,
+      "grad_norm": 1.6680278778076172,
+      "learning_rate": 1.6848364717542123e-06,
+      "loss": 0.9116,
+      "step": 798
+    },
+    {
+      "epoch": 1.4242424242424243,
+      "grad_norm": 1.7071088552474976,
+      "learning_rate": 1.6798810703666998e-06,
+      "loss": 0.9217,
+      "step": 799
+    },
+    {
+      "epoch": 1.4260249554367201,
+      "grad_norm": 1.6858220100402832,
+      "learning_rate": 1.6749256689791874e-06,
+      "loss": 0.9277,
+      "step": 800
+    },
+    {
+      "epoch": 1.4260249554367201,
+      "eval_loss": 1.0078998804092407,
+      "eval_runtime": 25.8631,
+      "eval_samples_per_second": 38.665,
+      "eval_steps_per_second": 2.436,
+      "step": 800
+    },
+    {
+      "epoch": 1.427807486631016,
+      "grad_norm": 1.6459059715270996,
+      "learning_rate": 1.669970267591675e-06,
+      "loss": 0.951,
+      "step": 801
+    },
+    {
+      "epoch": 1.429590017825312,
+      "grad_norm": 1.6655635833740234,
+      "learning_rate": 1.6650148662041625e-06,
+      "loss": 0.921,
+      "step": 802
+    },
+    {
+      "epoch": 1.4313725490196079,
+      "grad_norm": 1.7144039869308472,
+      "learning_rate": 1.6600594648166502e-06,
+      "loss": 0.9186,
+      "step": 803
+    },
+    {
+      "epoch": 1.4331550802139037,
+      "grad_norm": 1.6617674827575684,
+      "learning_rate": 1.6551040634291379e-06,
+      "loss": 0.9002,
+      "step": 804
+    },
+    {
+      "epoch": 1.4349376114081998,
+      "grad_norm": 1.7272840738296509,
+      "learning_rate": 1.6501486620416255e-06,
+      "loss": 0.9253,
+      "step": 805
+    },
+    {
+      "epoch": 1.4367201426024956,
+      "grad_norm": 1.717220664024353,
+      "learning_rate": 1.6451932606541132e-06,
+      "loss": 0.9674,
+      "step": 806
+    },
+    {
+      "epoch": 1.4385026737967914,
+      "grad_norm": 1.6640911102294922,
+      "learning_rate": 1.6402378592666006e-06,
+      "loss": 0.9324,
+      "step": 807
+    },
+    {
+      "epoch": 1.4402852049910875,
+      "grad_norm": 1.6072384119033813,
+      "learning_rate": 1.6352824578790883e-06,
+      "loss": 0.9289,
+      "step": 808
+    },
+    {
+      "epoch": 1.4420677361853833,
+      "grad_norm": 1.7120065689086914,
+      "learning_rate": 1.630327056491576e-06,
+      "loss": 0.8992,
+      "step": 809
+    },
+    {
+      "epoch": 1.4438502673796791,
+      "grad_norm": 1.6117724180221558,
+      "learning_rate": 1.6253716551040636e-06,
+      "loss": 0.8798,
+      "step": 810
+    },
+    {
+      "epoch": 1.4438502673796791,
+      "eval_loss": 1.0046720504760742,
+      "eval_runtime": 26.0154,
+      "eval_samples_per_second": 38.439,
+      "eval_steps_per_second": 2.422,
+      "step": 810
+    },
+    {
+      "epoch": 1.445632798573975,
+      "grad_norm": 1.689333200454712,
+      "learning_rate": 1.620416253716551e-06,
+      "loss": 0.9071,
+      "step": 811
+    },
+    {
+      "epoch": 1.4474153297682708,
+      "grad_norm": 1.5870457887649536,
+      "learning_rate": 1.6154608523290387e-06,
+      "loss": 0.9308,
+      "step": 812
+    },
+    {
+      "epoch": 1.4491978609625669,
+      "grad_norm": 1.7409943342208862,
+      "learning_rate": 1.6105054509415264e-06,
+      "loss": 0.9348,
+      "step": 813
+    },
+    {
+      "epoch": 1.4509803921568627,
+      "grad_norm": 1.6735934019088745,
+      "learning_rate": 1.605550049554014e-06,
+      "loss": 0.9443,
+      "step": 814
+    },
+    {
+      "epoch": 1.4527629233511585,
+      "grad_norm": 1.6621036529541016,
+      "learning_rate": 1.6005946481665015e-06,
+      "loss": 0.9159,
+      "step": 815
+    },
+    {
+      "epoch": 1.4545454545454546,
+      "grad_norm": 1.613136887550354,
+      "learning_rate": 1.5956392467789892e-06,
+      "loss": 0.9043,
+      "step": 816
+    },
+    {
+      "epoch": 1.4563279857397504,
+      "grad_norm": 1.560819149017334,
+      "learning_rate": 1.5906838453914768e-06,
+      "loss": 0.9034,
+      "step": 817
+    },
+    {
+      "epoch": 1.4581105169340463,
+      "grad_norm": 1.5957945585250854,
+      "learning_rate": 1.5857284440039645e-06,
+      "loss": 0.8977,
+      "step": 818
+    },
+    {
+      "epoch": 1.4598930481283423,
+      "grad_norm": 1.608081340789795,
+      "learning_rate": 1.580773042616452e-06,
+      "loss": 0.888,
+      "step": 819
+    },
+    {
+      "epoch": 1.4616755793226381,
+      "grad_norm": 1.6894760131835938,
+      "learning_rate": 1.5758176412289396e-06,
+      "loss": 0.9139,
+      "step": 820
+    },
+    {
+      "epoch": 1.4616755793226381,
+      "eval_loss": 1.0035488605499268,
+      "eval_runtime": 25.9248,
+      "eval_samples_per_second": 38.573,
+      "eval_steps_per_second": 2.43,
+      "step": 820
+    },
+    {
+      "epoch": 1.463458110516934,
+      "grad_norm": 1.691773533821106,
+      "learning_rate": 1.5708622398414273e-06,
+      "loss": 0.9202,
+      "step": 821
+    },
+    {
+      "epoch": 1.46524064171123,
+      "grad_norm": 1.7405924797058105,
+      "learning_rate": 1.565906838453915e-06,
+      "loss": 0.9049,
+      "step": 822
+    },
+    {
+      "epoch": 1.4670231729055259,
+      "grad_norm": 1.6937116384506226,
+      "learning_rate": 1.5609514370664026e-06,
+      "loss": 0.9305,
+      "step": 823
+    },
+    {
+      "epoch": 1.4688057040998217,
+      "grad_norm": 1.7311570644378662,
+      "learning_rate": 1.55599603567889e-06,
+      "loss": 0.9446,
+      "step": 824
+    },
+    {
+      "epoch": 1.4705882352941178,
+      "grad_norm": 1.6122374534606934,
+      "learning_rate": 1.5510406342913777e-06,
+      "loss": 0.9097,
+      "step": 825
+    },
+    {
+      "epoch": 1.4723707664884136,
+      "grad_norm": 1.6009140014648438,
+      "learning_rate": 1.5460852329038654e-06,
+      "loss": 0.9213,
+      "step": 826
+    },
+    {
+      "epoch": 1.4741532976827094,
+      "grad_norm": 1.6482139825820923,
+      "learning_rate": 1.5411298315163528e-06,
+      "loss": 0.9446,
+      "step": 827
+    },
+    {
+      "epoch": 1.4759358288770055,
+      "grad_norm": 1.6383440494537354,
+      "learning_rate": 1.5361744301288407e-06,
+      "loss": 0.9326,
+      "step": 828
+    },
+    {
+      "epoch": 1.4777183600713013,
+      "grad_norm": 1.675575852394104,
+      "learning_rate": 1.5312190287413281e-06,
+      "loss": 0.9304,
+      "step": 829
+    },
+    {
+      "epoch": 1.4795008912655971,
+      "grad_norm": 1.6283584833145142,
+      "learning_rate": 1.5262636273538158e-06,
+      "loss": 0.8586,
+      "step": 830
+    },
+    {
+      "epoch": 1.4795008912655971,
+      "eval_loss": 1.002890944480896,
+      "eval_runtime": 25.9393,
+      "eval_samples_per_second": 38.552,
+      "eval_steps_per_second": 2.429,
+      "step": 830
+    },
+    {
+      "epoch": 1.481283422459893,
+      "grad_norm": 1.7265905141830444,
+      "learning_rate": 1.5213082259663035e-06,
+      "loss": 0.9409,
+      "step": 831
+    },
+    {
+      "epoch": 1.483065953654189,
+      "grad_norm": 1.659981369972229,
+      "learning_rate": 1.516352824578791e-06,
+      "loss": 0.9376,
+      "step": 832
+    },
+    {
+      "epoch": 1.4848484848484849,
+      "grad_norm": 1.710387945175171,
+      "learning_rate": 1.5113974231912786e-06,
+      "loss": 0.9041,
+      "step": 833
+    },
+    {
+      "epoch": 1.4866310160427807,
+      "grad_norm": 1.6636488437652588,
+      "learning_rate": 1.5064420218037662e-06,
+      "loss": 0.9171,
+      "step": 834
+    },
+    {
+      "epoch": 1.4884135472370765,
+      "grad_norm": 1.7282999753952026,
+      "learning_rate": 1.5014866204162537e-06,
+      "loss": 0.9034,
+      "step": 835
+    },
+    {
+      "epoch": 1.4901960784313726,
+      "grad_norm": 1.7450913190841675,
+      "learning_rate": 1.4965312190287416e-06,
+      "loss": 0.9341,
+      "step": 836
+    },
+    {
+      "epoch": 1.4919786096256684,
+      "grad_norm": 1.667269229888916,
+      "learning_rate": 1.491575817641229e-06,
+      "loss": 0.9033,
+      "step": 837
+    },
+    {
+      "epoch": 1.4937611408199643,
+      "grad_norm": 1.6336359977722168,
+      "learning_rate": 1.4866204162537167e-06,
+      "loss": 0.916,
+      "step": 838
+    },
+    {
+      "epoch": 1.4955436720142603,
+      "grad_norm": 1.5937727689743042,
+      "learning_rate": 1.4816650148662043e-06,
+      "loss": 0.9012,
+      "step": 839
+    },
+    {
+      "epoch": 1.4973262032085561,
+      "grad_norm": 1.717529535293579,
+      "learning_rate": 1.4767096134786918e-06,
+      "loss": 0.9337,
+      "step": 840
+    },
+    {
+      "epoch": 1.4973262032085561,
+      "eval_loss": 1.0023201704025269,
+      "eval_runtime": 25.9528,
+      "eval_samples_per_second": 38.532,
+      "eval_steps_per_second": 2.427,
+      "step": 840
+    },
+    {
+      "epoch": 1.499108734402852,
+      "grad_norm": 1.6657592058181763,
+      "learning_rate": 1.4717542120911797e-06,
+      "loss": 0.939,
+      "step": 841
+    },
+    {
+      "epoch": 1.500891265597148,
+      "grad_norm": 1.7361196279525757,
+      "learning_rate": 1.4667988107036671e-06,
+      "loss": 0.9348,
+      "step": 842
+    },
+    {
+      "epoch": 1.5026737967914439,
+      "grad_norm": 1.6162766218185425,
+      "learning_rate": 1.4618434093161546e-06,
+      "loss": 0.9382,
+      "step": 843
+    },
+    {
+      "epoch": 1.5044563279857397,
+      "grad_norm": 1.639491319656372,
+      "learning_rate": 1.4568880079286424e-06,
+      "loss": 0.9133,
+      "step": 844
+    },
+    {
+      "epoch": 1.5062388591800357,
+      "grad_norm": 1.5712809562683105,
+      "learning_rate": 1.4519326065411299e-06,
+      "loss": 0.9152,
+      "step": 845
+    },
+    {
+      "epoch": 1.5080213903743316,
+      "grad_norm": 1.639657974243164,
+      "learning_rate": 1.4469772051536178e-06,
+      "loss": 0.9398,
+      "step": 846
+    },
+    {
+      "epoch": 1.5098039215686274,
+      "grad_norm": 1.664557695388794,
+      "learning_rate": 1.4420218037661052e-06,
+      "loss": 0.8776,
+      "step": 847
+    },
+    {
+      "epoch": 1.5115864527629235,
+      "grad_norm": 1.6303818225860596,
+      "learning_rate": 1.4370664023785927e-06,
+      "loss": 0.9392,
+      "step": 848
+    },
+    {
+      "epoch": 1.5133689839572193,
+      "grad_norm": 1.6585919857025146,
+      "learning_rate": 1.4321110009910805e-06,
+      "loss": 0.9181,
+      "step": 849
+    },
+    {
+      "epoch": 1.5151515151515151,
+      "grad_norm": 1.7727773189544678,
+      "learning_rate": 1.427155599603568e-06,
+      "loss": 0.9333,
+      "step": 850
+    },
+    {
+      "epoch": 1.5151515151515151,
+      "eval_loss": 1.0036695003509521,
+      "eval_runtime": 25.8611,
+      "eval_samples_per_second": 38.668,
+      "eval_steps_per_second": 2.436,
+      "step": 850
+    },
+    {
+      "epoch": 1.5169340463458112,
+      "grad_norm": 1.6858317852020264,
+      "learning_rate": 1.4222001982160554e-06,
+      "loss": 0.9017,
+      "step": 851
+    },
+    {
+      "epoch": 1.5187165775401068,
+      "grad_norm": 1.6429566144943237,
+      "learning_rate": 1.4172447968285433e-06,
+      "loss": 0.9047,
+      "step": 852
+    },
+    {
+      "epoch": 1.5204991087344029,
+      "grad_norm": 1.877684235572815,
+      "learning_rate": 1.4122893954410308e-06,
+      "loss": 0.9314,
+      "step": 853
+    },
+    {
+      "epoch": 1.522281639928699,
+      "grad_norm": 1.6848722696304321,
+      "learning_rate": 1.4073339940535186e-06,
+      "loss": 0.914,
+      "step": 854
+    },
+    {
+      "epoch": 1.5240641711229945,
+      "grad_norm": 1.7669544219970703,
+      "learning_rate": 1.402378592666006e-06,
+      "loss": 0.9158,
+      "step": 855
+    },
+    {
+      "epoch": 1.5258467023172906,
+      "grad_norm": 1.6447798013687134,
+      "learning_rate": 1.3974231912784935e-06,
+      "loss": 0.9603,
+      "step": 856
+    },
+    {
+      "epoch": 1.5276292335115864,
+      "grad_norm": 1.6374293565750122,
+      "learning_rate": 1.3924677898909814e-06,
+      "loss": 0.9168,
+      "step": 857
+    },
+    {
+      "epoch": 1.5294117647058822,
+      "grad_norm": 1.614821434020996,
+      "learning_rate": 1.3875123885034689e-06,
+      "loss": 0.9262,
+      "step": 858
+    },
+    {
+      "epoch": 1.5311942959001783,
+      "grad_norm": 1.6780483722686768,
+      "learning_rate": 1.3825569871159563e-06,
+      "loss": 0.9218,
+      "step": 859
+    },
+    {
+      "epoch": 1.5329768270944741,
+      "grad_norm": 1.727554202079773,
+      "learning_rate": 1.3776015857284442e-06,
+      "loss": 0.9085,
+      "step": 860
+    },
+    {
+      "epoch": 1.5329768270944741,
+      "eval_loss": 1.0021950006484985,
+      "eval_runtime": 25.7933,
+      "eval_samples_per_second": 38.77,
+      "eval_steps_per_second": 2.442,
+      "step": 860
+    },
+    {
+      "epoch": 1.53475935828877,
+      "grad_norm": 1.6244946718215942,
+      "learning_rate": 1.3726461843409316e-06,
+      "loss": 0.9083,
+      "step": 861
+    },
+    {
+      "epoch": 1.536541889483066,
+      "grad_norm": 1.7828315496444702,
+      "learning_rate": 1.3676907829534195e-06,
+      "loss": 0.9258,
+      "step": 862
+    },
+    {
+      "epoch": 1.5383244206773619,
+      "grad_norm": 1.6405810117721558,
+      "learning_rate": 1.362735381565907e-06,
+      "loss": 0.9407,
+      "step": 863
+    },
+    {
+      "epoch": 1.5401069518716577,
+      "grad_norm": 1.7271147966384888,
+      "learning_rate": 1.3577799801783944e-06,
+      "loss": 0.9375,
+      "step": 864
+    },
+    {
+      "epoch": 1.5418894830659537,
+      "grad_norm": 1.789184808731079,
+      "learning_rate": 1.3528245787908823e-06,
+      "loss": 0.9062,
+      "step": 865
+    },
+    {
+      "epoch": 1.5436720142602496,
+      "grad_norm": 1.7284061908721924,
+      "learning_rate": 1.3478691774033697e-06,
+      "loss": 0.901,
+      "step": 866
+    },
+    {
+      "epoch": 1.5454545454545454,
+      "grad_norm": 1.7288529872894287,
+      "learning_rate": 1.3429137760158572e-06,
+      "loss": 0.8885,
+      "step": 867
+    },
+    {
+      "epoch": 1.5472370766488415,
+      "grad_norm": 1.71056067943573,
+      "learning_rate": 1.337958374628345e-06,
+      "loss": 0.8895,
+      "step": 868
+    },
+    {
+      "epoch": 1.5490196078431373,
+      "grad_norm": 1.8451443910598755,
+      "learning_rate": 1.3330029732408325e-06,
+      "loss": 0.9351,
+      "step": 869
+    },
+    {
+      "epoch": 1.5508021390374331,
+      "grad_norm": 1.7918177843093872,
+      "learning_rate": 1.3280475718533204e-06,
+      "loss": 0.9022,
+      "step": 870
+    },
+    {
+      "epoch": 1.5508021390374331,
+      "eval_loss": 1.0015515089035034,
+      "eval_runtime": 25.6272,
+      "eval_samples_per_second": 39.021,
+      "eval_steps_per_second": 2.458,
+      "step": 870
+    },
+    {
+      "epoch": 1.5525846702317292,
+      "grad_norm": 1.7702810764312744,
+      "learning_rate": 1.3230921704658078e-06,
+      "loss": 0.9402,
+      "step": 871
+    },
+    {
+      "epoch": 1.5543672014260248,
+      "grad_norm": 1.6390447616577148,
+      "learning_rate": 1.3181367690782953e-06,
+      "loss": 0.8972,
+      "step": 872
+    },
+    {
+      "epoch": 1.5561497326203209,
+      "grad_norm": 1.6672519445419312,
+      "learning_rate": 1.3131813676907832e-06,
+      "loss": 0.9124,
+      "step": 873
+    },
+    {
+      "epoch": 1.557932263814617,
+      "grad_norm": 1.7509058713912964,
+      "learning_rate": 1.3082259663032706e-06,
+      "loss": 0.8998,
+      "step": 874
+    },
+    {
+      "epoch": 1.5597147950089125,
+      "grad_norm": 1.7143068313598633,
+      "learning_rate": 1.303270564915758e-06,
+      "loss": 0.9138,
+      "step": 875
+    },
+    {
+      "epoch": 1.5614973262032086,
+      "grad_norm": 1.778315544128418,
+      "learning_rate": 1.298315163528246e-06,
+      "loss": 0.9245,
+      "step": 876
+    },
+    {
+      "epoch": 1.5632798573975044,
+      "grad_norm": 1.7348226308822632,
+      "learning_rate": 1.2933597621407334e-06,
+      "loss": 0.9192,
+      "step": 877
+    },
+    {
+      "epoch": 1.5650623885918002,
+      "grad_norm": 1.6273071765899658,
+      "learning_rate": 1.2884043607532213e-06,
+      "loss": 0.9332,
+      "step": 878
+    },
+    {
+      "epoch": 1.5668449197860963,
+      "grad_norm": 1.9141535758972168,
+      "learning_rate": 1.2834489593657087e-06,
+      "loss": 0.932,
+      "step": 879
+    },
+    {
+      "epoch": 1.5686274509803921,
+      "grad_norm": 1.7379990816116333,
+      "learning_rate": 1.2784935579781962e-06,
+      "loss": 0.9357,
+      "step": 880
+    },
+    {
+      "epoch": 1.5686274509803921,
+      "eval_loss": 1.0007495880126953,
+      "eval_runtime": 25.606,
+      "eval_samples_per_second": 39.053,
+      "eval_steps_per_second": 2.46,
+      "step": 880
+    },
+    {
+      "epoch": 1.570409982174688,
+      "grad_norm": 1.6585184335708618,
+      "learning_rate": 1.273538156590684e-06,
+      "loss": 0.8919,
+      "step": 881
+    },
+    {
+      "epoch": 1.572192513368984,
+      "grad_norm": 1.6785247325897217,
+      "learning_rate": 1.2685827552031715e-06,
+      "loss": 0.914,
+      "step": 882
+    },
+    {
+      "epoch": 1.5739750445632799,
+      "grad_norm": 1.7640776634216309,
+      "learning_rate": 1.2636273538156591e-06,
+      "loss": 0.9166,
+      "step": 883
+    },
+    {
+      "epoch": 1.5757575757575757,
+      "grad_norm": 1.8093239068984985,
+      "learning_rate": 1.2586719524281468e-06,
+      "loss": 0.9098,
+      "step": 884
+    },
+    {
+      "epoch": 1.5775401069518717,
+      "grad_norm": 1.623347520828247,
+      "learning_rate": 1.2537165510406342e-06,
+      "loss": 0.8963,
+      "step": 885
+    },
+    {
+      "epoch": 1.5793226381461676,
+      "grad_norm": 1.7071688175201416,
+      "learning_rate": 1.248761149653122e-06,
+      "loss": 0.9077,
+      "step": 886
+    },
+    {
+      "epoch": 1.5811051693404634,
+      "grad_norm": 1.6982461214065552,
+      "learning_rate": 1.2438057482656096e-06,
+      "loss": 0.897,
+      "step": 887
+    },
+    {
+      "epoch": 1.5828877005347595,
+      "grad_norm": 1.7919094562530518,
+      "learning_rate": 1.2388503468780972e-06,
+      "loss": 0.904,
+      "step": 888
+    },
+    {
+      "epoch": 1.5846702317290553,
+      "grad_norm": 1.7249748706817627,
+      "learning_rate": 1.2338949454905847e-06,
+      "loss": 0.9144,
+      "step": 889
+    },
+    {
+      "epoch": 1.5864527629233511,
+      "grad_norm": 1.6791118383407593,
+      "learning_rate": 1.2289395441030723e-06,
+      "loss": 0.9086,
+      "step": 890
+    },
+    {
+      "epoch": 1.5864527629233511,
+      "eval_loss": 0.9997517466545105,
+      "eval_runtime": 25.5898,
+      "eval_samples_per_second": 39.078,
+      "eval_steps_per_second": 2.462,
+      "step": 890
+    },
+    {
+      "epoch": 1.5882352941176472,
+      "grad_norm": 1.78848135471344,
+      "learning_rate": 1.22398414271556e-06,
+      "loss": 0.9264,
+      "step": 891
+    },
+    {
+      "epoch": 1.5900178253119428,
+      "grad_norm": 1.6467989683151245,
+      "learning_rate": 1.2190287413280477e-06,
+      "loss": 0.9457,
+      "step": 892
+    },
+    {
+      "epoch": 1.5918003565062389,
+      "grad_norm": 1.7058672904968262,
+      "learning_rate": 1.2140733399405351e-06,
+      "loss": 0.8898,
+      "step": 893
+    },
+    {
+      "epoch": 1.593582887700535,
+      "grad_norm": 1.7215371131896973,
+      "learning_rate": 1.2091179385530228e-06,
+      "loss": 0.954,
+      "step": 894
+    },
+    {
+      "epoch": 1.5953654188948305,
+      "grad_norm": 1.7038347721099854,
+      "learning_rate": 1.2041625371655104e-06,
+      "loss": 0.9021,
+      "step": 895
+    },
+    {
+      "epoch": 1.5971479500891266,
+      "grad_norm": 1.6531912088394165,
+      "learning_rate": 1.1992071357779981e-06,
+      "loss": 0.8992,
+      "step": 896
+    },
+    {
+      "epoch": 1.5989304812834224,
+      "grad_norm": 1.6344718933105469,
+      "learning_rate": 1.1942517343904858e-06,
+      "loss": 0.8755,
+      "step": 897
+    },
+    {
+      "epoch": 1.6007130124777182,
+      "grad_norm": 1.667981743812561,
+      "learning_rate": 1.1892963330029732e-06,
+      "loss": 0.9112,
+      "step": 898
+    },
+    {
+      "epoch": 1.6024955436720143,
+      "grad_norm": 1.7452551126480103,
+      "learning_rate": 1.1843409316154609e-06,
+      "loss": 0.9181,
+      "step": 899
+    },
+    {
+      "epoch": 1.6042780748663101,
+      "grad_norm": 1.6953660249710083,
+      "learning_rate": 1.1793855302279485e-06,
+      "loss": 0.9109,
+      "step": 900
+    },
+    {
+      "epoch": 1.6042780748663101,
+      "eval_loss": 1.0002549886703491,
+      "eval_runtime": 25.6596,
+      "eval_samples_per_second": 38.972,
+      "eval_steps_per_second": 2.455,
+      "step": 900
+    },
+    {
+      "epoch": 1.606060606060606,
+      "grad_norm": 1.733874797821045,
+      "learning_rate": 1.1744301288404362e-06,
+      "loss": 0.9067,
+      "step": 901
+    },
+    {
+      "epoch": 1.607843137254902,
+      "grad_norm": 1.6959171295166016,
+      "learning_rate": 1.1694747274529237e-06,
+      "loss": 0.8932,
+      "step": 902
+    },
+    {
+      "epoch": 1.6096256684491979,
+      "grad_norm": 1.7267096042633057,
+      "learning_rate": 1.1645193260654113e-06,
+      "loss": 0.9315,
+      "step": 903
+    },
+    {
+      "epoch": 1.6114081996434937,
+      "grad_norm": 1.755772352218628,
+      "learning_rate": 1.159563924677899e-06,
+      "loss": 0.9187,
+      "step": 904
+    },
+    {
+      "epoch": 1.6131907308377897,
+      "grad_norm": 1.570746898651123,
+      "learning_rate": 1.1546085232903866e-06,
+      "loss": 0.8931,
+      "step": 905
+    },
+    {
+      "epoch": 1.6149732620320856,
+      "grad_norm": 1.7020586729049683,
+      "learning_rate": 1.1496531219028743e-06,
+      "loss": 0.9247,
+      "step": 906
+    },
+    {
+      "epoch": 1.6167557932263814,
+      "grad_norm": 1.6069488525390625,
+      "learning_rate": 1.1446977205153618e-06,
+      "loss": 0.8965,
+      "step": 907
+    },
+    {
+      "epoch": 1.6185383244206775,
+      "grad_norm": 1.6286472082138062,
+      "learning_rate": 1.1397423191278494e-06,
+      "loss": 0.9191,
+      "step": 908
+    },
+    {
+      "epoch": 1.6203208556149733,
+      "grad_norm": 1.5472110509872437,
+      "learning_rate": 1.134786917740337e-06,
+      "loss": 0.8944,
+      "step": 909
+    },
+    {
+      "epoch": 1.6221033868092691,
+      "grad_norm": 1.8870779275894165,
+      "learning_rate": 1.1298315163528247e-06,
+      "loss": 0.94,
+      "step": 910
+    },
+    {
+      "epoch": 1.6221033868092691,
+      "eval_loss": 0.9995793700218201,
+      "eval_runtime": 25.6625,
+      "eval_samples_per_second": 38.967,
+      "eval_steps_per_second": 2.455,
+      "step": 910
+    },
+    {
+      "epoch": 1.6238859180035652,
+      "grad_norm": 1.745804786682129,
+      "learning_rate": 1.1248761149653122e-06,
+      "loss": 0.9077,
+      "step": 911
+    },
+    {
+      "epoch": 1.6256684491978608,
+      "grad_norm": 1.6671708822250366,
+      "learning_rate": 1.1199207135777999e-06,
+      "loss": 0.909,
+      "step": 912
+    },
+    {
+      "epoch": 1.6274509803921569,
+      "grad_norm": 1.612488865852356,
+      "learning_rate": 1.1149653121902875e-06,
+      "loss": 0.9034,
+      "step": 913
+    },
+    {
+      "epoch": 1.629233511586453,
+      "grad_norm": 1.620811104774475,
+      "learning_rate": 1.1100099108027752e-06,
+      "loss": 0.9386,
+      "step": 914
+    },
+    {
+      "epoch": 1.6310160427807485,
+      "grad_norm": 1.6019529104232788,
+      "learning_rate": 1.1050545094152628e-06,
+      "loss": 0.8877,
+      "step": 915
+    },
+    {
+      "epoch": 1.6327985739750446,
+      "grad_norm": 1.613588809967041,
+      "learning_rate": 1.1000991080277503e-06,
+      "loss": 0.9113,
+      "step": 916
+    },
+    {
+      "epoch": 1.6345811051693404,
+      "grad_norm": 1.5803701877593994,
+      "learning_rate": 1.095143706640238e-06,
+      "loss": 0.896,
+      "step": 917
+    },
+    {
+      "epoch": 1.6363636363636362,
+      "grad_norm": 1.7000629901885986,
+      "learning_rate": 1.0901883052527256e-06,
+      "loss": 0.9184,
+      "step": 918
+    },
+    {
+      "epoch": 1.6381461675579323,
+      "grad_norm": 1.5939161777496338,
+      "learning_rate": 1.0852329038652133e-06,
+      "loss": 0.9119,
+      "step": 919
+    },
+    {
+      "epoch": 1.6399286987522281,
+      "grad_norm": 1.7102794647216797,
+      "learning_rate": 1.080277502477701e-06,
+      "loss": 0.9483,
+      "step": 920
+    },
+    {
+      "epoch": 1.6399286987522281,
+      "eval_loss": 0.9989098906517029,
+      "eval_runtime": 25.7806,
+      "eval_samples_per_second": 38.789,
+      "eval_steps_per_second": 2.444,
+      "step": 920
+    },
+    {
+      "epoch": 1.641711229946524,
+      "grad_norm": 1.713068962097168,
+      "learning_rate": 1.0753221010901884e-06,
+      "loss": 0.9219,
+      "step": 921
+    },
+    {
+      "epoch": 1.64349376114082,
+      "grad_norm": 1.6815000772476196,
+      "learning_rate": 1.070366699702676e-06,
+      "loss": 0.9216,
+      "step": 922
+    },
+    {
+      "epoch": 1.6452762923351159,
+      "grad_norm": 1.6452995538711548,
+      "learning_rate": 1.0654112983151637e-06,
+      "loss": 0.8934,
+      "step": 923
+    },
+    {
+      "epoch": 1.6470588235294117,
+      "grad_norm": 1.6780235767364502,
+      "learning_rate": 1.0604558969276514e-06,
+      "loss": 0.9355,
+      "step": 924
+    },
+    {
+      "epoch": 1.6488413547237077,
+      "grad_norm": 1.781059980392456,
+      "learning_rate": 1.0555004955401388e-06,
+      "loss": 0.9272,
+      "step": 925
+    },
+    {
+      "epoch": 1.6506238859180036,
+      "grad_norm": 1.7290118932724,
+      "learning_rate": 1.0505450941526265e-06,
+      "loss": 0.89,
+      "step": 926
+    },
+    {
+      "epoch": 1.6524064171122994,
+      "grad_norm": 1.6904546022415161,
+      "learning_rate": 1.0455896927651141e-06,
+      "loss": 0.8923,
+      "step": 927
+    },
+    {
+      "epoch": 1.6541889483065955,
+      "grad_norm": 1.7392773628234863,
+      "learning_rate": 1.0406342913776018e-06,
+      "loss": 0.92,
+      "step": 928
+    },
+    {
+      "epoch": 1.6559714795008913,
+      "grad_norm": 1.676623821258545,
+      "learning_rate": 1.0356788899900893e-06,
+      "loss": 0.9204,
+      "step": 929
+    },
+    {
+      "epoch": 1.6577540106951871,
+      "grad_norm": 1.813863754272461,
+      "learning_rate": 1.030723488602577e-06,
+      "loss": 0.9442,
+      "step": 930
+    },
+    {
+      "epoch": 1.6577540106951871,
+      "eval_loss": 0.9988206624984741,
+      "eval_runtime": 25.7006,
+      "eval_samples_per_second": 38.91,
+      "eval_steps_per_second": 2.451,
+      "step": 930
+    },
+    {
+      "epoch": 1.6595365418894832,
+      "grad_norm": 1.7283135652542114,
+      "learning_rate": 1.0257680872150646e-06,
+      "loss": 0.9189,
+      "step": 931
+    },
+    {
+      "epoch": 1.661319073083779,
+      "grad_norm": 1.6723238229751587,
+      "learning_rate": 1.0208126858275522e-06,
+      "loss": 0.9173,
+      "step": 932
+    },
+    {
+      "epoch": 1.6631016042780749,
+      "grad_norm": 1.6681349277496338,
+      "learning_rate": 1.0158572844400397e-06,
+      "loss": 0.8979,
+      "step": 933
+    },
+    {
+      "epoch": 1.664884135472371,
+      "grad_norm": 1.6180405616760254,
+      "learning_rate": 1.0109018830525274e-06,
+      "loss": 0.9034,
+      "step": 934
+    },
+    {
+      "epoch": 1.6666666666666665,
+      "grad_norm": 1.7722523212432861,
+      "learning_rate": 1.005946481665015e-06,
+      "loss": 0.9223,
+      "step": 935
+    },
+    {
+      "epoch": 1.6684491978609626,
+      "grad_norm": 1.613845705986023,
+      "learning_rate": 1.0009910802775027e-06,
+      "loss": 0.9204,
+      "step": 936
+    },
+    {
+      "epoch": 1.6702317290552586,
+      "grad_norm": 1.751266360282898,
+      "learning_rate": 9.960356788899901e-07,
+      "loss": 0.9552,
+      "step": 937
+    },
+    {
+      "epoch": 1.6720142602495542,
+      "grad_norm": 1.6438246965408325,
+      "learning_rate": 9.910802775024778e-07,
+      "loss": 0.9023,
+      "step": 938
+    },
+    {
+      "epoch": 1.6737967914438503,
+      "grad_norm": 1.7301279306411743,
+      "learning_rate": 9.861248761149655e-07,
+      "loss": 0.9408,
+      "step": 939
+    },
+    {
+      "epoch": 1.6755793226381461,
+      "grad_norm": 1.5920662879943848,
+      "learning_rate": 9.811694747274531e-07,
+      "loss": 0.8898,
+      "step": 940
+    },
+    {
+      "epoch": 1.6755793226381461,
+      "eval_loss": 0.9970239400863647,
+      "eval_runtime": 25.6234,
+      "eval_samples_per_second": 39.027,
+      "eval_steps_per_second": 2.459,
+      "step": 940
+    },
+    {
+      "epoch": 1.677361853832442,
+      "grad_norm": 1.7048591375350952,
+      "learning_rate": 9.762140733399406e-07,
+      "loss": 0.9092,
+      "step": 941
+    },
+    {
+      "epoch": 1.679144385026738,
+      "grad_norm": 1.7565436363220215,
+      "learning_rate": 9.712586719524282e-07,
+      "loss": 0.9229,
+      "step": 942
+    },
+    {
+      "epoch": 1.6809269162210339,
+      "grad_norm": 1.7713767290115356,
+      "learning_rate": 9.663032705649159e-07,
+      "loss": 0.9253,
+      "step": 943
+    },
+    {
+      "epoch": 1.6827094474153297,
+      "grad_norm": 1.864357352256775,
+      "learning_rate": 9.613478691774035e-07,
+      "loss": 0.9322,
+      "step": 944
+    },
+    {
+      "epoch": 1.6844919786096257,
+      "grad_norm": 1.6992820501327515,
+      "learning_rate": 9.56392467789891e-07,
+      "loss": 0.8836,
+      "step": 945
+    },
+    {
+      "epoch": 1.6862745098039216,
+      "grad_norm": 1.6879699230194092,
+      "learning_rate": 9.514370664023787e-07,
+      "loss": 0.9119,
+      "step": 946
+    },
+    {
+      "epoch": 1.6880570409982174,
+      "grad_norm": 1.6944903135299683,
+      "learning_rate": 9.464816650148663e-07,
+      "loss": 0.9365,
+      "step": 947
+    },
+    {
+      "epoch": 1.6898395721925135,
+      "grad_norm": 1.694254994392395,
+      "learning_rate": 9.41526263627354e-07,
+      "loss": 0.9524,
+      "step": 948
+    },
+    {
+      "epoch": 1.6916221033868093,
+      "grad_norm": 1.6590172052383423,
+      "learning_rate": 9.365708622398414e-07,
+      "loss": 0.9276,
+      "step": 949
+    },
+    {
+      "epoch": 1.6934046345811051,
+      "grad_norm": 1.7332650423049927,
+      "learning_rate": 9.316154608523291e-07,
+      "loss": 0.9267,
+      "step": 950
+    },
+    {
+      "epoch": 1.6934046345811051,
+      "eval_loss": 0.9965516328811646,
+      "eval_runtime": 25.7077,
+      "eval_samples_per_second": 38.899,
+      "eval_steps_per_second": 2.451,
+      "step": 950
+    },
+    {
+      "epoch": 1.6951871657754012,
+      "grad_norm": 1.617246389389038,
+      "learning_rate": 9.266600594648168e-07,
+      "loss": 0.9161,
+      "step": 951
+    },
+    {
+      "epoch": 1.696969696969697,
+      "grad_norm": 1.6363615989685059,
+      "learning_rate": 9.217046580773044e-07,
+      "loss": 0.9123,
+      "step": 952
+    },
+    {
+      "epoch": 1.6987522281639929,
+      "grad_norm": 1.6289929151535034,
+      "learning_rate": 9.167492566897919e-07,
+      "loss": 0.9148,
+      "step": 953
+    },
+    {
+      "epoch": 1.700534759358289,
+      "grad_norm": 1.7203320264816284,
+      "learning_rate": 9.117938553022795e-07,
+      "loss": 0.9035,
+      "step": 954
+    },
+    {
+      "epoch": 1.7023172905525845,
+      "grad_norm": 1.6997250318527222,
+      "learning_rate": 9.068384539147672e-07,
+      "loss": 0.9123,
+      "step": 955
+    },
+    {
+      "epoch": 1.7040998217468806,
+      "grad_norm": 1.7788165807724,
+      "learning_rate": 9.018830525272549e-07,
+      "loss": 0.9198,
+      "step": 956
+    },
+    {
+      "epoch": 1.7058823529411766,
+      "grad_norm": 1.708892822265625,
+      "learning_rate": 8.969276511397423e-07,
+      "loss": 0.8984,
+      "step": 957
+    },
+    {
+      "epoch": 1.7076648841354722,
+      "grad_norm": 1.6467701196670532,
+      "learning_rate": 8.9197224975223e-07,
+      "loss": 0.9046,
+      "step": 958
+    },
+    {
+      "epoch": 1.7094474153297683,
+      "grad_norm": 1.7599132061004639,
+      "learning_rate": 8.870168483647176e-07,
+      "loss": 0.9361,
+      "step": 959
+    },
+    {
+      "epoch": 1.7112299465240641,
+      "grad_norm": 1.6088155508041382,
+      "learning_rate": 8.820614469772053e-07,
+      "loss": 0.8796,
+      "step": 960
+    },
+    {
+      "epoch": 1.7112299465240641,
+      "eval_loss": 0.9957713484764099,
+      "eval_runtime": 25.9512,
+      "eval_samples_per_second": 38.534,
+      "eval_steps_per_second": 2.428,
+      "step": 960
+    },
+    {
+      "epoch": 1.71301247771836,
+      "grad_norm": 1.698625087738037,
+      "learning_rate": 8.771060455896927e-07,
+      "loss": 0.8925,
+      "step": 961
+    },
+    {
+      "epoch": 1.714795008912656,
+      "grad_norm": 1.6810694932937622,
+      "learning_rate": 8.721506442021804e-07,
+      "loss": 0.908,
+      "step": 962
+    },
+    {
+      "epoch": 1.7165775401069518,
+      "grad_norm": 1.6146069765090942,
+      "learning_rate": 8.671952428146681e-07,
+      "loss": 0.9064,
+      "step": 963
+    },
+    {
+      "epoch": 1.7183600713012477,
+      "grad_norm": 1.6394168138504028,
+      "learning_rate": 8.622398414271557e-07,
+      "loss": 0.9309,
+      "step": 964
+    },
+    {
+      "epoch": 1.7201426024955437,
+      "grad_norm": 1.6711269617080688,
+      "learning_rate": 8.572844400396432e-07,
+      "loss": 0.9436,
+      "step": 965
+    },
+    {
+      "epoch": 1.7219251336898396,
+      "grad_norm": 1.6253998279571533,
+      "learning_rate": 8.523290386521308e-07,
+      "loss": 0.93,
+      "step": 966
+    },
+    {
+      "epoch": 1.7237076648841354,
+      "grad_norm": 1.7100143432617188,
+      "learning_rate": 8.473736372646185e-07,
+      "loss": 0.9047,
+      "step": 967
+    },
+    {
+      "epoch": 1.7254901960784315,
+      "grad_norm": 1.6870434284210205,
+      "learning_rate": 8.424182358771062e-07,
+      "loss": 0.8878,
+      "step": 968
+    },
+    {
+      "epoch": 1.7272727272727273,
+      "grad_norm": 1.6940484046936035,
+      "learning_rate": 8.374628344895937e-07,
+      "loss": 0.9092,
+      "step": 969
+    },
+    {
+      "epoch": 1.7290552584670231,
+      "grad_norm": 1.7144818305969238,
+      "learning_rate": 8.325074331020813e-07,
+      "loss": 0.8831,
+      "step": 970
+    },
+    {
+      "epoch": 1.7290552584670231,
+      "eval_loss": 0.996261715888977,
+      "eval_runtime": 25.6913,
+      "eval_samples_per_second": 38.924,
+      "eval_steps_per_second": 2.452,
+      "step": 970
+    },
+    {
+      "epoch": 1.7308377896613192,
+      "grad_norm": 1.7108317613601685,
+      "learning_rate": 8.275520317145689e-07,
+      "loss": 0.8759,
+      "step": 971
+    },
+    {
+      "epoch": 1.732620320855615,
+      "grad_norm": 1.8309522867202759,
+      "learning_rate": 8.225966303270566e-07,
+      "loss": 0.8997,
+      "step": 972
+    },
+    {
+      "epoch": 1.7344028520499108,
+      "grad_norm": 1.6917412281036377,
+      "learning_rate": 8.176412289395442e-07,
+      "loss": 0.9109,
+      "step": 973
+    },
+    {
+      "epoch": 1.736185383244207,
+      "grad_norm": 1.776153802871704,
+      "learning_rate": 8.126858275520318e-07,
+      "loss": 0.9283,
+      "step": 974
+    },
+    {
+      "epoch": 1.7379679144385025,
+      "grad_norm": 1.7653323411941528,
+      "learning_rate": 8.077304261645194e-07,
+      "loss": 0.8964,
+      "step": 975
+    },
+    {
+      "epoch": 1.7397504456327986,
+      "grad_norm": 1.7760730981826782,
+      "learning_rate": 8.02775024777007e-07,
+      "loss": 0.9283,
+      "step": 976
+    },
+    {
+      "epoch": 1.7415329768270946,
+      "grad_norm": 1.7228769063949585,
+      "learning_rate": 7.978196233894946e-07,
+      "loss": 0.905,
+      "step": 977
+    },
+    {
+      "epoch": 1.7433155080213902,
+      "grad_norm": 1.7163366079330444,
+      "learning_rate": 7.928642220019823e-07,
+      "loss": 0.9439,
+      "step": 978
+    },
+    {
+      "epoch": 1.7450980392156863,
+      "grad_norm": 1.750165581703186,
+      "learning_rate": 7.879088206144698e-07,
+      "loss": 0.8971,
+      "step": 979
+    },
+    {
+      "epoch": 1.7468805704099821,
+      "grad_norm": 1.7466638088226318,
+      "learning_rate": 7.829534192269575e-07,
+      "loss": 0.9069,
+      "step": 980
+    },
+    {
+      "epoch": 1.7468805704099821,
+      "eval_loss": 0.9950440526008606,
+      "eval_runtime": 25.7702,
+      "eval_samples_per_second": 38.805,
+      "eval_steps_per_second": 2.445,
+      "step": 980
+    },
+    {
+      "epoch": 1.748663101604278,
+      "grad_norm": 1.672875165939331,
+      "learning_rate": 7.77998017839445e-07,
+      "loss": 0.9218,
+      "step": 981
+    },
+    {
+      "epoch": 1.750445632798574,
+      "grad_norm": 1.7255640029907227,
+      "learning_rate": 7.730426164519327e-07,
+      "loss": 0.9122,
+      "step": 982
+    },
+    {
+      "epoch": 1.7522281639928698,
+      "grad_norm": 1.6890486478805542,
+      "learning_rate": 7.680872150644203e-07,
+      "loss": 0.9098,
+      "step": 983
+    },
+    {
+      "epoch": 1.7540106951871657,
+      "grad_norm": 1.6408272981643677,
+      "learning_rate": 7.631318136769079e-07,
+      "loss": 0.921,
+      "step": 984
+    },
+    {
+      "epoch": 1.7557932263814617,
+      "grad_norm": 1.6204485893249512,
+      "learning_rate": 7.581764122893955e-07,
+      "loss": 0.9109,
+      "step": 985
+    },
+    {
+      "epoch": 1.7575757575757576,
+      "grad_norm": 1.645685076713562,
+      "learning_rate": 7.532210109018831e-07,
+      "loss": 0.9066,
+      "step": 986
+    },
+    {
+      "epoch": 1.7593582887700534,
+      "grad_norm": 1.7292003631591797,
+      "learning_rate": 7.482656095143708e-07,
+      "loss": 0.8945,
+      "step": 987
+    },
+    {
+      "epoch": 1.7611408199643495,
+      "grad_norm": 1.6503880023956299,
+      "learning_rate": 7.433102081268583e-07,
+      "loss": 0.8997,
+      "step": 988
+    },
+    {
+      "epoch": 1.7629233511586453,
+      "grad_norm": 1.7059553861618042,
+      "learning_rate": 7.383548067393459e-07,
+      "loss": 0.9076,
+      "step": 989
+    },
+    {
+      "epoch": 1.7647058823529411,
+      "grad_norm": 1.6760096549987793,
+      "learning_rate": 7.333994053518336e-07,
+      "loss": 0.9466,
+      "step": 990
+    },
+    {
+      "epoch": 1.7647058823529411,
+      "eval_loss": 0.995206892490387,
+      "eval_runtime": 25.6123,
+      "eval_samples_per_second": 39.044,
+      "eval_steps_per_second": 2.46,
+      "step": 990
+    },
+    {
+      "epoch": 1.7664884135472372,
+      "grad_norm": 1.683337926864624,
+      "learning_rate": 7.284440039643212e-07,
+      "loss": 0.8894,
+      "step": 991
+    },
+    {
+      "epoch": 1.768270944741533,
+      "grad_norm": 1.7856427431106567,
+      "learning_rate": 7.234886025768089e-07,
+      "loss": 0.891,
+      "step": 992
+    },
+    {
+      "epoch": 1.7700534759358288,
+      "grad_norm": 1.7322144508361816,
+      "learning_rate": 7.185332011892963e-07,
+      "loss": 0.9189,
+      "step": 993
+    },
+    {
+      "epoch": 1.771836007130125,
+      "grad_norm": 1.6664427518844604,
+      "learning_rate": 7.13577799801784e-07,
+      "loss": 0.8929,
+      "step": 994
+    },
+    {
+      "epoch": 1.7736185383244205,
+      "grad_norm": 1.6912422180175781,
+      "learning_rate": 7.086223984142717e-07,
+      "loss": 0.8925,
+      "step": 995
+    },
+    {
+      "epoch": 1.7754010695187166,
+      "grad_norm": 1.8387762308120728,
+      "learning_rate": 7.036669970267593e-07,
+      "loss": 0.9081,
+      "step": 996
+    },
+    {
+      "epoch": 1.7771836007130126,
+      "grad_norm": 1.683955430984497,
+      "learning_rate": 6.987115956392468e-07,
+      "loss": 0.8802,
+      "step": 997
+    },
+    {
+      "epoch": 1.7789661319073082,
+      "grad_norm": 1.760162115097046,
+      "learning_rate": 6.937561942517344e-07,
+      "loss": 0.9419,
+      "step": 998
+    },
+    {
+      "epoch": 1.7807486631016043,
+      "grad_norm": 1.680673599243164,
+      "learning_rate": 6.888007928642221e-07,
+      "loss": 0.9024,
+      "step": 999
+    },
+    {
+      "epoch": 1.7825311942959001,
+      "grad_norm": 1.6909873485565186,
+      "learning_rate": 6.838453914767098e-07,
+      "loss": 0.9069,
+      "step": 1000
+    },
+    {
+      "epoch": 1.7825311942959001,
+      "eval_loss": 0.9940561652183533,
+      "eval_runtime": 25.7043,
+      "eval_samples_per_second": 38.904,
+      "eval_steps_per_second": 2.451,
+      "step": 1000
+    },
+    {
+      "epoch": 1.784313725490196,
+      "grad_norm": 1.7442917823791504,
+      "learning_rate": 6.788899900891972e-07,
+      "loss": 0.9188,
+      "step": 1001
+    },
+    {
+      "epoch": 1.786096256684492,
+      "grad_norm": 1.658192753791809,
+      "learning_rate": 6.739345887016849e-07,
+      "loss": 0.9085,
+      "step": 1002
+    },
+    {
+      "epoch": 1.7878787878787878,
+      "grad_norm": 1.6885861158370972,
+      "learning_rate": 6.689791873141725e-07,
+      "loss": 0.8958,
+      "step": 1003
+    },
+    {
+      "epoch": 1.7896613190730837,
+      "grad_norm": 1.7260538339614868,
+      "learning_rate": 6.640237859266602e-07,
+      "loss": 0.9086,
+      "step": 1004
+    },
+    {
+      "epoch": 1.7914438502673797,
+      "grad_norm": 1.752336859703064,
+      "learning_rate": 6.590683845391476e-07,
+      "loss": 0.8825,
+      "step": 1005
+    },
+    {
+      "epoch": 1.7932263814616756,
+      "grad_norm": 1.6466569900512695,
+      "learning_rate": 6.541129831516353e-07,
+      "loss": 0.8537,
+      "step": 1006
+    },
+    {
+      "epoch": 1.7950089126559714,
+      "grad_norm": 1.669892430305481,
+      "learning_rate": 6.49157581764123e-07,
+      "loss": 0.9325,
+      "step": 1007
+    },
+    {
+      "epoch": 1.7967914438502675,
+      "grad_norm": 1.8119384050369263,
+      "learning_rate": 6.442021803766106e-07,
+      "loss": 0.9359,
+      "step": 1008
+    },
+    {
+      "epoch": 1.7985739750445633,
+      "grad_norm": 1.6587778329849243,
+      "learning_rate": 6.392467789890981e-07,
+      "loss": 0.9183,
+      "step": 1009
+    },
+    {
+      "epoch": 1.8003565062388591,
+      "grad_norm": 1.6684181690216064,
+      "learning_rate": 6.342913776015857e-07,
+      "loss": 0.8833,
+      "step": 1010
+    },
+    {
+      "epoch": 1.8003565062388591,
+      "eval_loss": 0.9942149519920349,
+      "eval_runtime": 25.6683,
+      "eval_samples_per_second": 38.959,
+      "eval_steps_per_second": 2.454,
+      "step": 1010
+    },
+    {
+      "epoch": 1.8021390374331552,
+      "grad_norm": 1.7912577390670776,
+      "learning_rate": 6.293359762140734e-07,
+      "loss": 0.937,
+      "step": 1011
+    },
+    {
+      "epoch": 1.803921568627451,
+      "grad_norm": 1.7254101037979126,
+      "learning_rate": 6.24380574826561e-07,
+      "loss": 0.9412,
+      "step": 1012
+    },
+    {
+      "epoch": 1.8057040998217468,
+      "grad_norm": 1.7295520305633545,
+      "learning_rate": 6.194251734390486e-07,
+      "loss": 0.9308,
+      "step": 1013
+    },
+    {
+      "epoch": 1.807486631016043,
+      "grad_norm": 1.736169695854187,
+      "learning_rate": 6.144697720515362e-07,
+      "loss": 0.8654,
+      "step": 1014
+    },
+    {
+      "epoch": 1.8092691622103387,
+      "grad_norm": 1.6945925951004028,
+      "learning_rate": 6.095143706640238e-07,
+      "loss": 0.9093,
+      "step": 1015
+    },
+    {
+      "epoch": 1.8110516934046346,
+      "grad_norm": 1.674950361251831,
+      "learning_rate": 6.045589692765114e-07,
+      "loss": 0.8958,
+      "step": 1016
+    },
+    {
+      "epoch": 1.8128342245989306,
+      "grad_norm": 1.73616361618042,
+      "learning_rate": 5.996035678889991e-07,
+      "loss": 0.9306,
+      "step": 1017
+    },
+    {
+      "epoch": 1.8146167557932262,
+      "grad_norm": 1.694348931312561,
+      "learning_rate": 5.946481665014866e-07,
+      "loss": 0.9172,
+      "step": 1018
+    },
+    {
+      "epoch": 1.8163992869875223,
+      "grad_norm": 1.7344218492507935,
+      "learning_rate": 5.896927651139743e-07,
+      "loss": 0.8831,
+      "step": 1019
+    },
+    {
+      "epoch": 1.8181818181818183,
+      "grad_norm": 1.6549162864685059,
+      "learning_rate": 5.847373637264618e-07,
+      "loss": 0.8691,
+      "step": 1020
+    },
+    {
+      "epoch": 1.8181818181818183,
+      "eval_loss": 0.9938483238220215,
+      "eval_runtime": 25.6316,
+      "eval_samples_per_second": 39.014,
+      "eval_steps_per_second": 2.458,
+      "step": 1020
+    },
+    {
+      "epoch": 1.819964349376114,
+      "grad_norm": 1.638862133026123,
+      "learning_rate": 5.797819623389495e-07,
+      "loss": 0.8828,
+      "step": 1021
+    },
+    {
+      "epoch": 1.82174688057041,
+      "grad_norm": 1.7828065156936646,
+      "learning_rate": 5.748265609514372e-07,
+      "loss": 0.9372,
+      "step": 1022
+    },
+    {
+      "epoch": 1.8235294117647058,
+      "grad_norm": 1.6707323789596558,
+      "learning_rate": 5.698711595639247e-07,
+      "loss": 0.8912,
+      "step": 1023
+    },
+    {
+      "epoch": 1.8253119429590017,
+      "grad_norm": 1.707122564315796,
+      "learning_rate": 5.649157581764124e-07,
+      "loss": 0.9071,
+      "step": 1024
+    },
+    {
+      "epoch": 1.8270944741532977,
+      "grad_norm": 1.7146416902542114,
+      "learning_rate": 5.599603567888999e-07,
+      "loss": 0.914,
+      "step": 1025
+    },
+    {
+      "epoch": 1.8288770053475936,
+      "grad_norm": 1.7814098596572876,
+      "learning_rate": 5.550049554013876e-07,
+      "loss": 0.9267,
+      "step": 1026
+    },
+    {
+      "epoch": 1.8306595365418894,
+      "grad_norm": 1.7249929904937744,
+      "learning_rate": 5.500495540138751e-07,
+      "loss": 0.9132,
+      "step": 1027
+    },
+    {
+      "epoch": 1.8324420677361855,
+      "grad_norm": 1.764886498451233,
+      "learning_rate": 5.450941526263628e-07,
+      "loss": 0.908,
+      "step": 1028
+    },
+    {
+      "epoch": 1.8342245989304813,
+      "grad_norm": 1.780199408531189,
+      "learning_rate": 5.401387512388505e-07,
+      "loss": 0.8821,
+      "step": 1029
+    },
+    {
+      "epoch": 1.8360071301247771,
+      "grad_norm": 1.6997710466384888,
+      "learning_rate": 5.35183349851338e-07,
+      "loss": 0.8834,
+      "step": 1030
+    },
+    {
+      "epoch": 1.8360071301247771,
+      "eval_loss": 0.9937179088592529,
+      "eval_runtime": 25.8881,
+      "eval_samples_per_second": 38.628,
+      "eval_steps_per_second": 2.434,
+      "step": 1030
+    },
+    {
+      "epoch": 1.8377896613190732,
+      "grad_norm": 1.8370707035064697,
+      "learning_rate": 5.302279484638257e-07,
+      "loss": 0.923,
+      "step": 1031
+    },
+    {
+      "epoch": 1.839572192513369,
+      "grad_norm": 1.6870957612991333,
+      "learning_rate": 5.252725470763132e-07,
+      "loss": 0.9032,
+      "step": 1032
+    },
+    {
+      "epoch": 1.8413547237076648,
+      "grad_norm": 1.792143702507019,
+      "learning_rate": 5.203171456888009e-07,
+      "loss": 0.893,
+      "step": 1033
+    },
+    {
+      "epoch": 1.843137254901961,
+      "grad_norm": 1.7480674982070923,
+      "learning_rate": 5.153617443012885e-07,
+      "loss": 0.9187,
+      "step": 1034
+    },
+    {
+      "epoch": 1.8449197860962567,
+      "grad_norm": 1.7090226411819458,
+      "learning_rate": 5.104063429137761e-07,
+      "loss": 0.893,
+      "step": 1035
+    },
+    {
+      "epoch": 1.8467023172905526,
+      "grad_norm": 1.6919946670532227,
+      "learning_rate": 5.054509415262637e-07,
+      "loss": 0.9254,
+      "step": 1036
+    },
+    {
+      "epoch": 1.8484848484848486,
+      "grad_norm": 1.7311183214187622,
+      "learning_rate": 5.004955401387513e-07,
+      "loss": 0.8938,
+      "step": 1037
+    },
+    {
+      "epoch": 1.8502673796791442,
+      "grad_norm": 1.6775836944580078,
+      "learning_rate": 4.955401387512389e-07,
+      "loss": 0.935,
+      "step": 1038
+    },
+    {
+      "epoch": 1.8520499108734403,
+      "grad_norm": 1.7318252325057983,
+      "learning_rate": 4.905847373637266e-07,
+      "loss": 0.8997,
+      "step": 1039
+    },
+    {
+      "epoch": 1.8538324420677363,
+      "grad_norm": 1.7152644395828247,
+      "learning_rate": 4.856293359762141e-07,
+      "loss": 0.9001,
+      "step": 1040
+    },
+    {
+      "epoch": 1.8538324420677363,
+      "eval_loss": 0.992533802986145,
+      "eval_runtime": 25.9005,
+      "eval_samples_per_second": 38.609,
+      "eval_steps_per_second": 2.432,
+      "step": 1040
+    },
+    {
+      "epoch": 1.855614973262032,
+      "grad_norm": 1.613964557647705,
+      "learning_rate": 4.806739345887018e-07,
+      "loss": 0.8808,
+      "step": 1041
+    },
+    {
+      "epoch": 1.857397504456328,
+      "grad_norm": 1.6681315898895264,
+      "learning_rate": 4.7571853320118933e-07,
+      "loss": 0.8872,
+      "step": 1042
+    },
+    {
+      "epoch": 1.8591800356506238,
+      "grad_norm": 1.867087483406067,
+      "learning_rate": 4.70763131813677e-07,
+      "loss": 0.9167,
+      "step": 1043
+    },
+    {
+      "epoch": 1.8609625668449197,
+      "grad_norm": 1.7233860492706299,
+      "learning_rate": 4.6580773042616455e-07,
+      "loss": 0.9293,
+      "step": 1044
+    },
+    {
+      "epoch": 1.8627450980392157,
+      "grad_norm": 1.6797863245010376,
+      "learning_rate": 4.608523290386522e-07,
+      "loss": 0.8912,
+      "step": 1045
+    },
+    {
+      "epoch": 1.8645276292335116,
+      "grad_norm": 1.761204719543457,
+      "learning_rate": 4.5589692765113977e-07,
+      "loss": 0.8773,
+      "step": 1046
+    },
+    {
+      "epoch": 1.8663101604278074,
+      "grad_norm": 1.6859080791473389,
+      "learning_rate": 4.5094152626362743e-07,
+      "loss": 0.9091,
+      "step": 1047
+    },
+    {
+      "epoch": 1.8680926916221035,
+      "grad_norm": 1.7600855827331543,
+      "learning_rate": 4.45986124876115e-07,
+      "loss": 0.9268,
+      "step": 1048
+    },
+    {
+      "epoch": 1.8698752228163993,
+      "grad_norm": 1.656213641166687,
+      "learning_rate": 4.4103072348860265e-07,
+      "loss": 0.9073,
+      "step": 1049
+    },
+    {
+      "epoch": 1.8716577540106951,
+      "grad_norm": 1.7546846866607666,
+      "learning_rate": 4.360753221010902e-07,
+      "loss": 0.9244,
+      "step": 1050
+    },
+    {
+      "epoch": 1.8716577540106951,
+      "eval_loss": 0.99254310131073,
+      "eval_runtime": 25.8491,
+      "eval_samples_per_second": 38.686,
+      "eval_steps_per_second": 2.437,
+      "step": 1050
+    },
+    {
+      "epoch": 1.8734402852049912,
+      "grad_norm": 1.7467200756072998,
+      "learning_rate": 4.3111992071357786e-07,
+      "loss": 0.9226,
+      "step": 1051
+    },
+    {
+      "epoch": 1.875222816399287,
+      "grad_norm": 1.6825129985809326,
+      "learning_rate": 4.261645193260654e-07,
+      "loss": 0.9166,
+      "step": 1052
+    },
+    {
+      "epoch": 1.8770053475935828,
+      "grad_norm": 1.7524358034133911,
+      "learning_rate": 4.212091179385531e-07,
+      "loss": 0.8991,
+      "step": 1053
+    },
+    {
+      "epoch": 1.878787878787879,
+      "grad_norm": 1.770285964012146,
+      "learning_rate": 4.1625371655104064e-07,
+      "loss": 0.9491,
+      "step": 1054
+    },
+    {
+      "epoch": 1.8805704099821747,
+      "grad_norm": 1.684815526008606,
+      "learning_rate": 4.112983151635283e-07,
+      "loss": 0.911,
+      "step": 1055
+    },
+    {
+      "epoch": 1.8823529411764706,
+      "grad_norm": 1.7561120986938477,
+      "learning_rate": 4.063429137760159e-07,
+      "loss": 0.9183,
+      "step": 1056
+    },
+    {
+      "epoch": 1.8841354723707666,
+      "grad_norm": 1.6486444473266602,
+      "learning_rate": 4.013875123885035e-07,
+      "loss": 0.877,
+      "step": 1057
+    },
+    {
+      "epoch": 1.8859180035650622,
+      "grad_norm": 1.6989293098449707,
+      "learning_rate": 3.964321110009911e-07,
+      "loss": 0.9141,
+      "step": 1058
+    },
+    {
+      "epoch": 1.8877005347593583,
+      "grad_norm": 1.6929652690887451,
+      "learning_rate": 3.9147670961347873e-07,
+      "loss": 0.8975,
+      "step": 1059
+    },
+    {
+      "epoch": 1.8894830659536543,
+      "grad_norm": 1.671325445175171,
+      "learning_rate": 3.8652130822596634e-07,
+      "loss": 0.9029,
+      "step": 1060
+    },
+    {
+      "epoch": 1.8894830659536543,
+      "eval_loss": 0.9915664792060852,
+      "eval_runtime": 25.7783,
+      "eval_samples_per_second": 38.792,
+      "eval_steps_per_second": 2.444,
+      "step": 1060
+    },
+    {
+      "epoch": 1.89126559714795,
+      "grad_norm": 1.6696364879608154,
+      "learning_rate": 3.8156590683845395e-07,
+      "loss": 0.9056,
+      "step": 1061
+    },
+    {
+      "epoch": 1.893048128342246,
+      "grad_norm": 1.676077961921692,
+      "learning_rate": 3.7661050545094156e-07,
+      "loss": 0.9,
+      "step": 1062
+    },
+    {
+      "epoch": 1.8948306595365418,
+      "grad_norm": 1.6590983867645264,
+      "learning_rate": 3.7165510406342917e-07,
+      "loss": 0.9123,
+      "step": 1063
+    },
+    {
+      "epoch": 1.8966131907308377,
+      "grad_norm": 1.6489439010620117,
+      "learning_rate": 3.666997026759168e-07,
+      "loss": 0.8964,
+      "step": 1064
+    },
+    {
+      "epoch": 1.8983957219251337,
+      "grad_norm": 1.6285649538040161,
+      "learning_rate": 3.6174430128840444e-07,
+      "loss": 0.8824,
+      "step": 1065
+    },
+    {
+      "epoch": 1.9001782531194296,
+      "grad_norm": 1.6999493837356567,
+      "learning_rate": 3.56788899900892e-07,
+      "loss": 0.9206,
+      "step": 1066
+    },
+    {
+      "epoch": 1.9019607843137254,
+      "grad_norm": 1.7763421535491943,
+      "learning_rate": 3.5183349851337966e-07,
+      "loss": 0.9161,
+      "step": 1067
+    },
+    {
+      "epoch": 1.9037433155080214,
+      "grad_norm": 1.6540220975875854,
+      "learning_rate": 3.468780971258672e-07,
+      "loss": 0.9353,
+      "step": 1068
+    },
+    {
+      "epoch": 1.9055258467023173,
+      "grad_norm": 1.6708558797836304,
+      "learning_rate": 3.419226957383549e-07,
+      "loss": 0.88,
+      "step": 1069
+    },
+    {
+      "epoch": 1.9073083778966131,
+      "grad_norm": 1.6973531246185303,
+      "learning_rate": 3.3696729435084243e-07,
+      "loss": 0.9,
+      "step": 1070
+    },
+    {
+      "epoch": 1.9073083778966131,
+      "eval_loss": 0.9920867085456848,
+      "eval_runtime": 25.9739,
+      "eval_samples_per_second": 38.5,
+      "eval_steps_per_second": 2.426,
+      "step": 1070
+    },
+    {
+      "epoch": 1.9090909090909092,
+      "grad_norm": 1.6297328472137451,
+      "learning_rate": 3.320118929633301e-07,
+      "loss": 0.9112,
+      "step": 1071
+    },
+    {
+      "epoch": 1.910873440285205,
+      "grad_norm": 1.69383704662323,
+      "learning_rate": 3.2705649157581765e-07,
+      "loss": 0.9088,
+      "step": 1072
+    },
+    {
+      "epoch": 1.9126559714795008,
+      "grad_norm": 1.6893271207809448,
+      "learning_rate": 3.221010901883053e-07,
+      "loss": 0.861,
+      "step": 1073
+    },
+    {
+      "epoch": 1.914438502673797,
+      "grad_norm": 1.786592721939087,
+      "learning_rate": 3.1714568880079287e-07,
+      "loss": 0.9026,
+      "step": 1074
+    },
+    {
+      "epoch": 1.9162210338680927,
+      "grad_norm": 1.8169891834259033,
+      "learning_rate": 3.121902874132805e-07,
+      "loss": 0.9249,
+      "step": 1075
+    },
+    {
+      "epoch": 1.9180035650623886,
+      "grad_norm": 1.6998229026794434,
+      "learning_rate": 3.072348860257681e-07,
+      "loss": 0.9056,
+      "step": 1076
+    },
+    {
+      "epoch": 1.9197860962566846,
+      "grad_norm": 1.7679296731948853,
+      "learning_rate": 3.022794846382557e-07,
+      "loss": 0.9003,
+      "step": 1077
+    },
+    {
+      "epoch": 1.9215686274509802,
+      "grad_norm": 1.663244605064392,
+      "learning_rate": 2.973240832507433e-07,
+      "loss": 0.9093,
+      "step": 1078
+    },
+    {
+      "epoch": 1.9233511586452763,
+      "grad_norm": 1.7235232591629028,
+      "learning_rate": 2.923686818632309e-07,
+      "loss": 0.8856,
+      "step": 1079
+    },
+    {
+      "epoch": 1.9251336898395723,
+      "grad_norm": 1.6739834547042847,
+      "learning_rate": 2.874132804757186e-07,
+      "loss": 0.9137,
+      "step": 1080
+    },
+    {
+      "epoch": 1.9251336898395723,
+      "eval_loss": 0.9913634061813354,
+      "eval_runtime": 25.9488,
+      "eval_samples_per_second": 38.537,
+      "eval_steps_per_second": 2.428,
+      "step": 1080
+    },
+    {
+      "epoch": 1.926916221033868,
+      "grad_norm": 1.6595032215118408,
+      "learning_rate": 2.824578790882062e-07,
+      "loss": 0.8928,
+      "step": 1081
+    },
+    {
+      "epoch": 1.928698752228164,
+      "grad_norm": 1.7184573411941528,
+      "learning_rate": 2.775024777006938e-07,
+      "loss": 0.9226,
+      "step": 1082
+    },
+    {
+      "epoch": 1.93048128342246,
+      "grad_norm": 1.7487850189208984,
+      "learning_rate": 2.725470763131814e-07,
+      "loss": 0.9335,
+      "step": 1083
+    },
+    {
+      "epoch": 1.9322638146167557,
+      "grad_norm": 1.6264859437942505,
+      "learning_rate": 2.67591674925669e-07,
+      "loss": 0.9092,
+      "step": 1084
+    },
+    {
+      "epoch": 1.9340463458110517,
+      "grad_norm": 1.784679889678955,
+      "learning_rate": 2.626362735381566e-07,
+      "loss": 0.9258,
+      "step": 1085
+    },
+    {
+      "epoch": 1.9358288770053476,
+      "grad_norm": 1.645410180091858,
+      "learning_rate": 2.5768087215064423e-07,
+      "loss": 0.918,
+      "step": 1086
+    },
+    {
+      "epoch": 1.9376114081996434,
+      "grad_norm": 1.7359968423843384,
+      "learning_rate": 2.5272547076313184e-07,
+      "loss": 0.9283,
+      "step": 1087
+    },
+    {
+      "epoch": 1.9393939393939394,
+      "grad_norm": 1.7158929109573364,
+      "learning_rate": 2.4777006937561945e-07,
+      "loss": 0.8898,
+      "step": 1088
+    },
+    {
+      "epoch": 1.9411764705882353,
+      "grad_norm": 1.7459858655929565,
+      "learning_rate": 2.4281466798810706e-07,
+      "loss": 0.9302,
+      "step": 1089
+    },
+    {
+      "epoch": 1.9429590017825311,
+      "grad_norm": 1.7283644676208496,
+      "learning_rate": 2.3785926660059467e-07,
+      "loss": 0.9021,
+      "step": 1090
+    },
+    {
+      "epoch": 1.9429590017825311,
+      "eval_loss": 0.9908037781715393,
+      "eval_runtime": 25.9896,
+      "eval_samples_per_second": 38.477,
+      "eval_steps_per_second": 2.424,
+      "step": 1090
+    },
+    {
+      "epoch": 1.9447415329768272,
+      "grad_norm": 1.673015832901001,
+      "learning_rate": 2.3290386521308227e-07,
+      "loss": 0.8822,
+      "step": 1091
+    },
+    {
+      "epoch": 1.946524064171123,
+      "grad_norm": 1.6898096799850464,
+      "learning_rate": 2.2794846382556988e-07,
+      "loss": 0.8827,
+      "step": 1092
+    },
+    {
+      "epoch": 1.9483065953654188,
+      "grad_norm": 1.6672441959381104,
+      "learning_rate": 2.229930624380575e-07,
+      "loss": 0.9085,
+      "step": 1093
+    },
+    {
+      "epoch": 1.950089126559715,
+      "grad_norm": 1.6861313581466675,
+      "learning_rate": 2.180376610505451e-07,
+      "loss": 0.9116,
+      "step": 1094
+    },
+    {
+      "epoch": 1.9518716577540107,
+      "grad_norm": 1.7200956344604492,
+      "learning_rate": 2.130822596630327e-07,
+      "loss": 0.9095,
+      "step": 1095
+    },
+    {
+      "epoch": 1.9536541889483066,
+      "grad_norm": 1.6788264513015747,
+      "learning_rate": 2.0812685827552032e-07,
+      "loss": 0.9123,
+      "step": 1096
+    },
+    {
+      "epoch": 1.9554367201426026,
+      "grad_norm": 1.596208095550537,
+      "learning_rate": 2.0317145688800795e-07,
+      "loss": 0.8672,
+      "step": 1097
+    },
+    {
+      "epoch": 1.9572192513368984,
+      "grad_norm": 1.7107036113739014,
+      "learning_rate": 1.9821605550049556e-07,
+      "loss": 0.8914,
+      "step": 1098
+    },
+    {
+      "epoch": 1.9590017825311943,
+      "grad_norm": 1.7238177061080933,
+      "learning_rate": 1.9326065411298317e-07,
+      "loss": 0.9258,
+      "step": 1099
+    },
+    {
+      "epoch": 1.9607843137254903,
+      "grad_norm": 1.8259961605072021,
+      "learning_rate": 1.8830525272547078e-07,
+      "loss": 0.9138,
+      "step": 1100
+    },
+    {
+      "epoch": 1.9607843137254903,
+      "eval_loss": 0.99046790599823,
+      "eval_runtime": 25.9327,
+      "eval_samples_per_second": 38.561,
+      "eval_steps_per_second": 2.429,
+      "step": 1100
+    },
+    {
+      "epoch": 1.962566844919786,
+      "grad_norm": 1.7623475790023804,
+      "learning_rate": 1.833498513379584e-07,
+      "loss": 0.9137,
+      "step": 1101
+    },
+    {
+      "epoch": 1.964349376114082,
+      "grad_norm": 1.6571979522705078,
+      "learning_rate": 1.78394449950446e-07,
+      "loss": 0.8816,
+      "step": 1102
+    },
+    {
+      "epoch": 1.966131907308378,
+      "grad_norm": 1.8103545904159546,
+      "learning_rate": 1.734390485629336e-07,
+      "loss": 0.9206,
+      "step": 1103
+    },
+    {
+      "epoch": 1.9679144385026737,
+      "grad_norm": 1.7835015058517456,
+      "learning_rate": 1.6848364717542122e-07,
+      "loss": 0.9153,
+      "step": 1104
+    },
+    {
+      "epoch": 1.9696969696969697,
+      "grad_norm": 1.7936205863952637,
+      "learning_rate": 1.6352824578790883e-07,
+      "loss": 0.9228,
+      "step": 1105
+    },
+    {
+      "epoch": 1.9714795008912656,
+      "grad_norm": 1.6824809312820435,
+      "learning_rate": 1.5857284440039643e-07,
+      "loss": 0.943,
+      "step": 1106
+    },
+    {
+      "epoch": 1.9732620320855614,
+      "grad_norm": 1.6871892213821411,
+      "learning_rate": 1.5361744301288404e-07,
+      "loss": 0.8884,
+      "step": 1107
+    },
+    {
+      "epoch": 1.9750445632798574,
+      "grad_norm": 1.6781103610992432,
+      "learning_rate": 1.4866204162537165e-07,
+      "loss": 0.9194,
+      "step": 1108
+    },
+    {
+      "epoch": 1.9768270944741533,
+      "grad_norm": 1.7025740146636963,
+      "learning_rate": 1.437066402378593e-07,
+      "loss": 0.8969,
+      "step": 1109
+    },
+    {
+      "epoch": 1.9786096256684491,
+      "grad_norm": 1.637919306755066,
+      "learning_rate": 1.387512388503469e-07,
+      "loss": 0.8786,
+      "step": 1110
+    },
+    {
+      "epoch": 1.9786096256684491,
+      "eval_loss": 0.9895098209381104,
+      "eval_runtime": 25.8128,
+      "eval_samples_per_second": 38.741,
+      "eval_steps_per_second": 2.441,
+      "step": 1110
+    },
+    {
+      "epoch": 1.9803921568627452,
+      "grad_norm": 1.6665587425231934,
+      "learning_rate": 1.337958374628345e-07,
+      "loss": 0.8595,
+      "step": 1111
+    },
+    {
+      "epoch": 1.982174688057041,
+      "grad_norm": 1.6462030410766602,
+      "learning_rate": 1.2884043607532211e-07,
+      "loss": 0.9294,
+      "step": 1112
+    },
+    {
+      "epoch": 1.9839572192513368,
+      "grad_norm": 1.6541804075241089,
+      "learning_rate": 1.2388503468780972e-07,
+      "loss": 0.9129,
+      "step": 1113
+    },
+    {
+      "epoch": 1.985739750445633,
+      "grad_norm": 1.692040205001831,
+      "learning_rate": 1.1892963330029733e-07,
+      "loss": 0.935,
+      "step": 1114
+    },
+    {
+      "epoch": 1.9875222816399287,
+      "grad_norm": 1.64594566822052,
+      "learning_rate": 1.1397423191278494e-07,
+      "loss": 0.8774,
+      "step": 1115
+    },
+    {
+      "epoch": 1.9893048128342246,
+      "grad_norm": 1.645636796951294,
+      "learning_rate": 1.0901883052527255e-07,
+      "loss": 0.9163,
+      "step": 1116
+    },
+    {
+      "epoch": 1.9910873440285206,
+      "grad_norm": 1.6432050466537476,
+      "learning_rate": 1.0406342913776016e-07,
+      "loss": 0.8675,
+      "step": 1117
+    },
+    {
+      "epoch": 1.9928698752228164,
+      "grad_norm": 1.7587097883224487,
+      "learning_rate": 9.910802775024778e-08,
+      "loss": 0.9541,
+      "step": 1118
+    },
+    {
+      "epoch": 1.9946524064171123,
+      "grad_norm": 1.705899715423584,
+      "learning_rate": 9.415262636273539e-08,
+      "loss": 0.9088,
+      "step": 1119
+    },
+    {
+      "epoch": 1.9964349376114083,
+      "grad_norm": 1.652106761932373,
+      "learning_rate": 8.9197224975223e-08,
+      "loss": 0.8887,
+      "step": 1120
+    },
+    {
+      "epoch": 1.9964349376114083,
+      "eval_loss": 0.9894319772720337,
+      "eval_runtime": 25.8963,
+      "eval_samples_per_second": 38.615,
+      "eval_steps_per_second": 2.433,
+      "step": 1120
+    },
+    {
+      "epoch": 1.998217468805704,
+      "grad_norm": 1.6611700057983398,
+      "learning_rate": 8.424182358771061e-08,
+      "loss": 0.9036,
+      "step": 1121
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 1.6227155923843384,
+      "learning_rate": 7.928642220019822e-08,
+      "loss": 0.8938,
+      "step": 1122
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 1122,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 50,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.590741983802163e+17,
+  "train_batch_size": 6,
+  "trial_name": null,
+  "trial_params": null
+}