diff --git "a/checkpoints/Qwen2.5-7B/babylm_hop_words4_10M_seed0/runs/checkpoint-1122/trainer_state.json" "b/checkpoints/Qwen2.5-7B/babylm_hop_words4_10M_seed0/runs/checkpoint-1122/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoints/Qwen2.5-7B/babylm_hop_words4_10M_seed0/runs/checkpoint-1122/trainer_state.json" @@ -0,0 +1,8783 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 10, + "global_step": 1122, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0017825311942959, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.683, + "step": 1 + }, + { + "epoch": 0.0035650623885918, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6428, + "step": 2 + }, + { + "epoch": 0.0053475935828877, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6195, + "step": 3 + }, + { + "epoch": 0.0071301247771836, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6425, + "step": 4 + }, + { + "epoch": 0.008912655971479501, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6463, + "step": 5 + }, + { + "epoch": 0.0106951871657754, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6414, + "step": 6 + }, + { + "epoch": 0.012477718360071301, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6741, + "step": 7 + }, + { + "epoch": 0.0142602495543672, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6424, + "step": 8 + }, + { + "epoch": 0.016042780748663103, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6623, + "step": 9 + }, + { + "epoch": 0.017825311942959002, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6533, + "step": 10 + }, + { + "epoch": 0.017825311942959002, + "eval_loss": 1.6557202339172363, + "eval_runtime": 43.8817, + "eval_samples_per_second": 22.789, + "eval_steps_per_second": 1.436, + "step": 10 + }, + { + "epoch": 0.0196078431372549, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6458, + "step": 11 + }, + { + "epoch": 0.0213903743315508, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6579, + "step": 12 + }, + { + "epoch": 0.023172905525846704, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6622, + "step": 13 + }, + { + "epoch": 0.024955436720142603, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6817, + "step": 14 + }, + { + "epoch": 0.026737967914438502, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6487, + "step": 15 + }, + { + "epoch": 0.0285204991087344, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6536, + "step": 16 + }, + { + "epoch": 0.030303030303030304, + "grad_norm": 3.260859966278076, + "learning_rate": 4.424778761061947e-08, + "loss": 1.6625, + "step": 17 + }, + { + "epoch": 0.03208556149732621, + "grad_norm": 3.260859966278076, + "learning_rate": 4.424778761061947e-08, + "loss": 1.6537, + "step": 18 + }, + { + "epoch": 0.0338680926916221, + "grad_norm": 2.555511713027954, + "learning_rate": 8.849557522123894e-08, + "loss": 1.6453, + "step": 19 + }, + { + "epoch": 0.035650623885918005, + "grad_norm": 4.205122947692871, + "learning_rate": 1.327433628318584e-07, + "loss": 1.6643, + "step": 20 + }, + { + "epoch": 0.035650623885918005, + "eval_loss": 1.6556264162063599, + "eval_runtime": 43.7614, + "eval_samples_per_second": 22.851, + "eval_steps_per_second": 1.44, + "step": 20 + }, + { + "epoch": 0.0374331550802139, + "grad_norm": 3.736807346343994, + "learning_rate": 1.7699115044247788e-07, + "loss": 1.6559, + "step": 21 + }, + { 
+ "epoch": 0.0392156862745098, + "grad_norm": 3.6770436763763428, + "learning_rate": 2.2123893805309737e-07, + "loss": 1.6467, + "step": 22 + }, + { + "epoch": 0.040998217468805706, + "grad_norm": 3.84171986579895, + "learning_rate": 2.654867256637168e-07, + "loss": 1.6601, + "step": 23 + }, + { + "epoch": 0.0427807486631016, + "grad_norm": 3.9061005115509033, + "learning_rate": 3.097345132743363e-07, + "loss": 1.6531, + "step": 24 + }, + { + "epoch": 0.044563279857397504, + "grad_norm": 3.594496726989746, + "learning_rate": 3.5398230088495575e-07, + "loss": 1.6616, + "step": 25 + }, + { + "epoch": 0.04634581105169341, + "grad_norm": 3.2176733016967773, + "learning_rate": 3.9823008849557525e-07, + "loss": 1.6419, + "step": 26 + }, + { + "epoch": 0.0481283422459893, + "grad_norm": 3.9928395748138428, + "learning_rate": 4.4247787610619474e-07, + "loss": 1.6579, + "step": 27 + }, + { + "epoch": 0.049910873440285206, + "grad_norm": 3.372333288192749, + "learning_rate": 4.867256637168142e-07, + "loss": 1.609, + "step": 28 + }, + { + "epoch": 0.05169340463458111, + "grad_norm": 4.2034831047058105, + "learning_rate": 5.309734513274336e-07, + "loss": 1.6619, + "step": 29 + }, + { + "epoch": 0.053475935828877004, + "grad_norm": 4.264431953430176, + "learning_rate": 5.752212389380532e-07, + "loss": 1.6542, + "step": 30 + }, + { + "epoch": 0.053475935828877004, + "eval_loss": 1.6338881254196167, + "eval_runtime": 45.0967, + "eval_samples_per_second": 22.175, + "eval_steps_per_second": 1.397, + "step": 30 + }, + { + "epoch": 0.05525846702317291, + "grad_norm": 3.44136381149292, + "learning_rate": 6.194690265486726e-07, + "loss": 1.6431, + "step": 31 + }, + { + "epoch": 0.0570409982174688, + "grad_norm": 1.7967804670333862, + "learning_rate": 6.637168141592922e-07, + "loss": 1.6195, + "step": 32 + }, + { + "epoch": 0.058823529411764705, + "grad_norm": 2.048090696334839, + "learning_rate": 7.079646017699115e-07, + "loss": 1.5952, + "step": 33 + }, + { + "epoch": 0.06060606060606061, + "grad_norm": 2.1011312007904053, + "learning_rate": 7.522123893805311e-07, + "loss": 1.6177, + "step": 34 + }, + { + "epoch": 0.062388591800356503, + "grad_norm": 2.0587246417999268, + "learning_rate": 7.964601769911505e-07, + "loss": 1.5957, + "step": 35 + }, + { + "epoch": 0.06417112299465241, + "grad_norm": 2.3372628688812256, + "learning_rate": 8.4070796460177e-07, + "loss": 1.6202, + "step": 36 + }, + { + "epoch": 0.0659536541889483, + "grad_norm": 2.5044116973876953, + "learning_rate": 8.849557522123895e-07, + "loss": 1.5929, + "step": 37 + }, + { + "epoch": 0.0677361853832442, + "grad_norm": 2.064378261566162, + "learning_rate": 9.292035398230089e-07, + "loss": 1.5967, + "step": 38 + }, + { + "epoch": 0.06951871657754011, + "grad_norm": 2.2238266468048096, + "learning_rate": 9.734513274336284e-07, + "loss": 1.5837, + "step": 39 + }, + { + "epoch": 0.07130124777183601, + "grad_norm": 2.6334006786346436, + "learning_rate": 1.017699115044248e-06, + "loss": 1.6019, + "step": 40 + }, + { + "epoch": 0.07130124777183601, + "eval_loss": 1.5835978984832764, + "eval_runtime": 46.0453, + "eval_samples_per_second": 21.718, + "eval_steps_per_second": 1.368, + "step": 40 + }, + { + "epoch": 0.07308377896613191, + "grad_norm": 2.2472009658813477, + "learning_rate": 1.0619469026548673e-06, + "loss": 1.5946, + "step": 41 + }, + { + "epoch": 0.0748663101604278, + "grad_norm": 1.96908700466156, + "learning_rate": 1.106194690265487e-06, + "loss": 1.5945, + "step": 42 + }, + { + "epoch": 0.0766488413547237, + "grad_norm": 
1.6488579511642456, + "learning_rate": 1.1504424778761064e-06, + "loss": 1.5721, + "step": 43 + }, + { + "epoch": 0.0784313725490196, + "grad_norm": 1.7816352844238281, + "learning_rate": 1.1946902654867258e-06, + "loss": 1.5722, + "step": 44 + }, + { + "epoch": 0.08021390374331551, + "grad_norm": 2.124027967453003, + "learning_rate": 1.2389380530973452e-06, + "loss": 1.5593, + "step": 45 + }, + { + "epoch": 0.08199643493761141, + "grad_norm": 1.7037169933319092, + "learning_rate": 1.2831858407079647e-06, + "loss": 1.5581, + "step": 46 + }, + { + "epoch": 0.08377896613190731, + "grad_norm": 1.7509331703186035, + "learning_rate": 1.3274336283185843e-06, + "loss": 1.5216, + "step": 47 + }, + { + "epoch": 0.0855614973262032, + "grad_norm": 1.6668604612350464, + "learning_rate": 1.3716814159292036e-06, + "loss": 1.5189, + "step": 48 + }, + { + "epoch": 0.0873440285204991, + "grad_norm": 2.4238839149475098, + "learning_rate": 1.415929203539823e-06, + "loss": 1.5294, + "step": 49 + }, + { + "epoch": 0.08912655971479501, + "grad_norm": 2.1640570163726807, + "learning_rate": 1.4601769911504427e-06, + "loss": 1.5123, + "step": 50 + }, + { + "epoch": 0.08912655971479501, + "eval_loss": 1.4958887100219727, + "eval_runtime": 46.3715, + "eval_samples_per_second": 21.565, + "eval_steps_per_second": 1.359, + "step": 50 + }, + { + "epoch": 0.09090909090909091, + "grad_norm": 2.5261659622192383, + "learning_rate": 1.5044247787610621e-06, + "loss": 1.4612, + "step": 51 + }, + { + "epoch": 0.09269162210338681, + "grad_norm": 2.2561535835266113, + "learning_rate": 1.5486725663716816e-06, + "loss": 1.4658, + "step": 52 + }, + { + "epoch": 0.0944741532976827, + "grad_norm": 1.9507025480270386, + "learning_rate": 1.592920353982301e-06, + "loss": 1.4318, + "step": 53 + }, + { + "epoch": 0.0962566844919786, + "grad_norm": 1.9108023643493652, + "learning_rate": 1.6371681415929204e-06, + "loss": 1.4402, + "step": 54 + }, + { + "epoch": 0.09803921568627451, + "grad_norm": 1.9863903522491455, + "learning_rate": 1.68141592920354e-06, + "loss": 1.4153, + "step": 55 + }, + { + "epoch": 0.09982174688057041, + "grad_norm": 1.8348286151885986, + "learning_rate": 1.7256637168141593e-06, + "loss": 1.3972, + "step": 56 + }, + { + "epoch": 0.10160427807486631, + "grad_norm": 1.7218098640441895, + "learning_rate": 1.769911504424779e-06, + "loss": 1.4074, + "step": 57 + }, + { + "epoch": 0.10338680926916222, + "grad_norm": 1.701070785522461, + "learning_rate": 1.8141592920353984e-06, + "loss": 1.3963, + "step": 58 + }, + { + "epoch": 0.1051693404634581, + "grad_norm": 1.723047137260437, + "learning_rate": 1.8584070796460179e-06, + "loss": 1.3701, + "step": 59 + }, + { + "epoch": 0.10695187165775401, + "grad_norm": 1.7656834125518799, + "learning_rate": 1.9026548672566373e-06, + "loss": 1.3684, + "step": 60 + }, + { + "epoch": 0.10695187165775401, + "eval_loss": 1.372704267501831, + "eval_runtime": 46.4402, + "eval_samples_per_second": 21.533, + "eval_steps_per_second": 1.357, + "step": 60 + }, + { + "epoch": 0.10873440285204991, + "grad_norm": 1.3581100702285767, + "learning_rate": 1.9469026548672567e-06, + "loss": 1.335, + "step": 61 + }, + { + "epoch": 0.11051693404634581, + "grad_norm": 1.767719030380249, + "learning_rate": 1.991150442477876e-06, + "loss": 1.3582, + "step": 62 + }, + { + "epoch": 0.11229946524064172, + "grad_norm": 2.281069278717041, + "learning_rate": 2.035398230088496e-06, + "loss": 1.3439, + "step": 63 + }, + { + "epoch": 0.1140819964349376, + "grad_norm": 1.6699604988098145, + "learning_rate": 
2.079646017699115e-06, + "loss": 1.3473, + "step": 64 + }, + { + "epoch": 0.11586452762923351, + "grad_norm": 2.3921921253204346, + "learning_rate": 2.1238938053097345e-06, + "loss": 1.3396, + "step": 65 + }, + { + "epoch": 0.11764705882352941, + "grad_norm": 2.202098846435547, + "learning_rate": 2.1681415929203544e-06, + "loss": 1.301, + "step": 66 + }, + { + "epoch": 0.11942959001782531, + "grad_norm": 1.4074530601501465, + "learning_rate": 2.212389380530974e-06, + "loss": 1.2906, + "step": 67 + }, + { + "epoch": 0.12121212121212122, + "grad_norm": 2.1387805938720703, + "learning_rate": 2.256637168141593e-06, + "loss": 1.3198, + "step": 68 + }, + { + "epoch": 0.12299465240641712, + "grad_norm": 1.847856044769287, + "learning_rate": 2.3008849557522127e-06, + "loss": 1.2936, + "step": 69 + }, + { + "epoch": 0.12477718360071301, + "grad_norm": 1.4546856880187988, + "learning_rate": 2.345132743362832e-06, + "loss": 1.2913, + "step": 70 + }, + { + "epoch": 0.12477718360071301, + "eval_loss": 1.3064121007919312, + "eval_runtime": 46.4122, + "eval_samples_per_second": 21.546, + "eval_steps_per_second": 1.357, + "step": 70 + }, + { + "epoch": 0.1265597147950089, + "grad_norm": 2.0278193950653076, + "learning_rate": 2.3893805309734516e-06, + "loss": 1.2712, + "step": 71 + }, + { + "epoch": 0.12834224598930483, + "grad_norm": 1.7863024473190308, + "learning_rate": 2.433628318584071e-06, + "loss": 1.2981, + "step": 72 + }, + { + "epoch": 0.13012477718360071, + "grad_norm": 1.429937720298767, + "learning_rate": 2.4778761061946905e-06, + "loss": 1.2857, + "step": 73 + }, + { + "epoch": 0.1319073083778966, + "grad_norm": 1.7546721696853638, + "learning_rate": 2.52212389380531e-06, + "loss": 1.2608, + "step": 74 + }, + { + "epoch": 0.13368983957219252, + "grad_norm": 2.067112445831299, + "learning_rate": 2.5663716814159294e-06, + "loss": 1.3058, + "step": 75 + }, + { + "epoch": 0.1354723707664884, + "grad_norm": 1.893581509590149, + "learning_rate": 2.6106194690265492e-06, + "loss": 1.2815, + "step": 76 + }, + { + "epoch": 0.13725490196078433, + "grad_norm": 2.0614898204803467, + "learning_rate": 2.6548672566371687e-06, + "loss": 1.2204, + "step": 77 + }, + { + "epoch": 0.13903743315508021, + "grad_norm": 1.8954098224639893, + "learning_rate": 2.6991150442477877e-06, + "loss": 1.2517, + "step": 78 + }, + { + "epoch": 0.1408199643493761, + "grad_norm": 1.7369916439056396, + "learning_rate": 2.743362831858407e-06, + "loss": 1.2706, + "step": 79 + }, + { + "epoch": 0.14260249554367202, + "grad_norm": 2.026573896408081, + "learning_rate": 2.7876106194690266e-06, + "loss": 1.2368, + "step": 80 + }, + { + "epoch": 0.14260249554367202, + "eval_loss": 1.2641066312789917, + "eval_runtime": 46.4069, + "eval_samples_per_second": 21.549, + "eval_steps_per_second": 1.358, + "step": 80 + }, + { + "epoch": 0.1443850267379679, + "grad_norm": 1.4173997640609741, + "learning_rate": 2.831858407079646e-06, + "loss": 1.2187, + "step": 81 + }, + { + "epoch": 0.14616755793226383, + "grad_norm": 1.9600279331207275, + "learning_rate": 2.876106194690266e-06, + "loss": 1.25, + "step": 82 + }, + { + "epoch": 0.14795008912655971, + "grad_norm": 2.2337169647216797, + "learning_rate": 2.9203539823008853e-06, + "loss": 1.2143, + "step": 83 + }, + { + "epoch": 0.1497326203208556, + "grad_norm": 1.4930493831634521, + "learning_rate": 2.9646017699115048e-06, + "loss": 1.2178, + "step": 84 + }, + { + "epoch": 0.15151515151515152, + "grad_norm": 1.950060248374939, + "learning_rate": 3.0088495575221242e-06, + "loss": 1.237, + "step": 85 + 
}, + { + "epoch": 0.1532976827094474, + "grad_norm": 1.7046960592269897, + "learning_rate": 3.0530973451327432e-06, + "loss": 1.1585, + "step": 86 + }, + { + "epoch": 0.15508021390374332, + "grad_norm": 1.6781989336013794, + "learning_rate": 3.097345132743363e-06, + "loss": 1.1892, + "step": 87 + }, + { + "epoch": 0.1568627450980392, + "grad_norm": 2.416377544403076, + "learning_rate": 3.1415929203539825e-06, + "loss": 1.2191, + "step": 88 + }, + { + "epoch": 0.1586452762923351, + "grad_norm": 1.7030267715454102, + "learning_rate": 3.185840707964602e-06, + "loss": 1.2403, + "step": 89 + }, + { + "epoch": 0.16042780748663102, + "grad_norm": 2.404744863510132, + "learning_rate": 3.2300884955752214e-06, + "loss": 1.2364, + "step": 90 + }, + { + "epoch": 0.16042780748663102, + "eval_loss": 1.2330397367477417, + "eval_runtime": 46.5752, + "eval_samples_per_second": 21.471, + "eval_steps_per_second": 1.353, + "step": 90 + }, + { + "epoch": 0.1622103386809269, + "grad_norm": 1.3477970361709595, + "learning_rate": 3.274336283185841e-06, + "loss": 1.2123, + "step": 91 + }, + { + "epoch": 0.16399286987522282, + "grad_norm": 2.5285091400146484, + "learning_rate": 3.3185840707964607e-06, + "loss": 1.2276, + "step": 92 + }, + { + "epoch": 0.1657754010695187, + "grad_norm": 1.5784391164779663, + "learning_rate": 3.36283185840708e-06, + "loss": 1.2056, + "step": 93 + }, + { + "epoch": 0.16755793226381463, + "grad_norm": 1.4000625610351562, + "learning_rate": 3.407079646017699e-06, + "loss": 1.2324, + "step": 94 + }, + { + "epoch": 0.16934046345811052, + "grad_norm": 2.468836784362793, + "learning_rate": 3.4513274336283186e-06, + "loss": 1.2209, + "step": 95 + }, + { + "epoch": 0.1711229946524064, + "grad_norm": 1.5707480907440186, + "learning_rate": 3.495575221238938e-06, + "loss": 1.1717, + "step": 96 + }, + { + "epoch": 0.17290552584670232, + "grad_norm": 2.063491106033325, + "learning_rate": 3.539823008849558e-06, + "loss": 1.1874, + "step": 97 + }, + { + "epoch": 0.1746880570409982, + "grad_norm": 1.7270640134811401, + "learning_rate": 3.5840707964601774e-06, + "loss": 1.2179, + "step": 98 + }, + { + "epoch": 0.17647058823529413, + "grad_norm": 1.9771426916122437, + "learning_rate": 3.628318584070797e-06, + "loss": 1.1508, + "step": 99 + }, + { + "epoch": 0.17825311942959002, + "grad_norm": 2.1519346237182617, + "learning_rate": 3.6725663716814163e-06, + "loss": 1.1956, + "step": 100 + }, + { + "epoch": 0.17825311942959002, + "eval_loss": 1.2065033912658691, + "eval_runtime": 46.5272, + "eval_samples_per_second": 21.493, + "eval_steps_per_second": 1.354, + "step": 100 + }, + { + "epoch": 0.1800356506238859, + "grad_norm": 1.335666298866272, + "learning_rate": 3.7168141592920357e-06, + "loss": 1.18, + "step": 101 + }, + { + "epoch": 0.18181818181818182, + "grad_norm": 2.2174997329711914, + "learning_rate": 3.7610619469026547e-06, + "loss": 1.164, + "step": 102 + }, + { + "epoch": 0.1836007130124777, + "grad_norm": 1.5584592819213867, + "learning_rate": 3.8053097345132746e-06, + "loss": 1.133, + "step": 103 + }, + { + "epoch": 0.18538324420677363, + "grad_norm": 2.222188949584961, + "learning_rate": 3.849557522123894e-06, + "loss": 1.1489, + "step": 104 + }, + { + "epoch": 0.18716577540106952, + "grad_norm": 1.8726121187210083, + "learning_rate": 3.8938053097345135e-06, + "loss": 1.1791, + "step": 105 + }, + { + "epoch": 0.1889483065953654, + "grad_norm": 1.8814388513565063, + "learning_rate": 3.938053097345133e-06, + "loss": 1.1545, + "step": 106 + }, + { + "epoch": 0.19073083778966132, + "grad_norm": 
2.3826377391815186, + "learning_rate": 3.982300884955752e-06, + "loss": 1.1467, + "step": 107 + }, + { + "epoch": 0.1925133689839572, + "grad_norm": 1.398267388343811, + "learning_rate": 4.026548672566372e-06, + "loss": 1.1761, + "step": 108 + }, + { + "epoch": 0.19429590017825313, + "grad_norm": 1.3779840469360352, + "learning_rate": 4.070796460176992e-06, + "loss": 1.1323, + "step": 109 + }, + { + "epoch": 0.19607843137254902, + "grad_norm": 1.9406187534332275, + "learning_rate": 4.115044247787611e-06, + "loss": 1.1125, + "step": 110 + }, + { + "epoch": 0.19607843137254902, + "eval_loss": 1.1835510730743408, + "eval_runtime": 46.5879, + "eval_samples_per_second": 21.465, + "eval_steps_per_second": 1.352, + "step": 110 + }, + { + "epoch": 0.19786096256684493, + "grad_norm": 1.5905953645706177, + "learning_rate": 4.15929203539823e-06, + "loss": 1.1075, + "step": 111 + }, + { + "epoch": 0.19964349376114082, + "grad_norm": 1.9858345985412598, + "learning_rate": 4.20353982300885e-06, + "loss": 1.1772, + "step": 112 + }, + { + "epoch": 0.2014260249554367, + "grad_norm": 1.8801683187484741, + "learning_rate": 4.247787610619469e-06, + "loss": 1.1625, + "step": 113 + }, + { + "epoch": 0.20320855614973263, + "grad_norm": 1.507340431213379, + "learning_rate": 4.2920353982300885e-06, + "loss": 1.1491, + "step": 114 + }, + { + "epoch": 0.20499108734402852, + "grad_norm": 1.7214304208755493, + "learning_rate": 4.336283185840709e-06, + "loss": 1.1368, + "step": 115 + }, + { + "epoch": 0.20677361853832443, + "grad_norm": 1.971928358078003, + "learning_rate": 4.380530973451328e-06, + "loss": 1.1497, + "step": 116 + }, + { + "epoch": 0.20855614973262032, + "grad_norm": 1.5467543601989746, + "learning_rate": 4.424778761061948e-06, + "loss": 1.099, + "step": 117 + }, + { + "epoch": 0.2103386809269162, + "grad_norm": 1.7273290157318115, + "learning_rate": 4.469026548672566e-06, + "loss": 1.0918, + "step": 118 + }, + { + "epoch": 0.21212121212121213, + "grad_norm": 1.7540112733840942, + "learning_rate": 4.513274336283186e-06, + "loss": 1.1105, + "step": 119 + }, + { + "epoch": 0.21390374331550802, + "grad_norm": 1.4684147834777832, + "learning_rate": 4.557522123893805e-06, + "loss": 1.12, + "step": 120 + }, + { + "epoch": 0.21390374331550802, + "eval_loss": 1.165776014328003, + "eval_runtime": 46.5571, + "eval_samples_per_second": 21.479, + "eval_steps_per_second": 1.353, + "step": 120 + }, + { + "epoch": 0.21568627450980393, + "grad_norm": 1.7996920347213745, + "learning_rate": 4.6017699115044254e-06, + "loss": 1.1068, + "step": 121 + }, + { + "epoch": 0.21746880570409982, + "grad_norm": 2.2431159019470215, + "learning_rate": 4.646017699115045e-06, + "loss": 1.1161, + "step": 122 + }, + { + "epoch": 0.2192513368983957, + "grad_norm": 1.894515037536621, + "learning_rate": 4.690265486725664e-06, + "loss": 1.0878, + "step": 123 + }, + { + "epoch": 0.22103386809269163, + "grad_norm": 1.8979915380477905, + "learning_rate": 4.734513274336284e-06, + "loss": 1.1148, + "step": 124 + }, + { + "epoch": 0.22281639928698752, + "grad_norm": 2.508988618850708, + "learning_rate": 4.778761061946903e-06, + "loss": 1.1406, + "step": 125 + }, + { + "epoch": 0.22459893048128343, + "grad_norm": 1.8342119455337524, + "learning_rate": 4.823008849557523e-06, + "loss": 1.0711, + "step": 126 + }, + { + "epoch": 0.22638146167557932, + "grad_norm": 1.8743572235107422, + "learning_rate": 4.867256637168142e-06, + "loss": 1.1042, + "step": 127 + }, + { + "epoch": 0.2281639928698752, + "grad_norm": 2.5183823108673096, + "learning_rate": 
4.9115044247787615e-06, + "loss": 1.1277, + "step": 128 + }, + { + "epoch": 0.22994652406417113, + "grad_norm": 1.3967655897140503, + "learning_rate": 4.955752212389381e-06, + "loss": 1.0907, + "step": 129 + }, + { + "epoch": 0.23172905525846701, + "grad_norm": 2.2105870246887207, + "learning_rate": 5e-06, + "loss": 1.1028, + "step": 130 + }, + { + "epoch": 0.23172905525846701, + "eval_loss": 1.1516406536102295, + "eval_runtime": 46.5761, + "eval_samples_per_second": 21.47, + "eval_steps_per_second": 1.353, + "step": 130 + }, + { + "epoch": 0.23351158645276293, + "grad_norm": 2.0359160900115967, + "learning_rate": 4.995044598612488e-06, + "loss": 1.0557, + "step": 131 + }, + { + "epoch": 0.23529411764705882, + "grad_norm": 2.0358471870422363, + "learning_rate": 4.990089197224976e-06, + "loss": 1.0805, + "step": 132 + }, + { + "epoch": 0.23707664884135474, + "grad_norm": 2.4560561180114746, + "learning_rate": 4.985133795837464e-06, + "loss": 1.1219, + "step": 133 + }, + { + "epoch": 0.23885918003565063, + "grad_norm": 1.5883917808532715, + "learning_rate": 4.980178394449951e-06, + "loss": 1.1179, + "step": 134 + }, + { + "epoch": 0.24064171122994651, + "grad_norm": 2.193816661834717, + "learning_rate": 4.975222993062438e-06, + "loss": 1.1235, + "step": 135 + }, + { + "epoch": 0.24242424242424243, + "grad_norm": 1.2323447465896606, + "learning_rate": 4.970267591674926e-06, + "loss": 1.095, + "step": 136 + }, + { + "epoch": 0.24420677361853832, + "grad_norm": 2.667142391204834, + "learning_rate": 4.965312190287414e-06, + "loss": 1.1391, + "step": 137 + }, + { + "epoch": 0.24598930481283424, + "grad_norm": 1.4862529039382935, + "learning_rate": 4.960356788899901e-06, + "loss": 1.0967, + "step": 138 + }, + { + "epoch": 0.24777183600713013, + "grad_norm": 2.315704584121704, + "learning_rate": 4.955401387512389e-06, + "loss": 1.0874, + "step": 139 + }, + { + "epoch": 0.24955436720142601, + "grad_norm": 1.8777433633804321, + "learning_rate": 4.950445986124876e-06, + "loss": 1.0983, + "step": 140 + }, + { + "epoch": 0.24955436720142601, + "eval_loss": 1.1395500898361206, + "eval_runtime": 46.6881, + "eval_samples_per_second": 21.419, + "eval_steps_per_second": 1.349, + "step": 140 + }, + { + "epoch": 0.25133689839572193, + "grad_norm": 1.809224009513855, + "learning_rate": 4.945490584737364e-06, + "loss": 1.0752, + "step": 141 + }, + { + "epoch": 0.2531194295900178, + "grad_norm": 1.6294512748718262, + "learning_rate": 4.9405351833498515e-06, + "loss": 1.09, + "step": 142 + }, + { + "epoch": 0.2549019607843137, + "grad_norm": 2.189276933670044, + "learning_rate": 4.935579781962339e-06, + "loss": 1.0583, + "step": 143 + }, + { + "epoch": 0.25668449197860965, + "grad_norm": 1.9446332454681396, + "learning_rate": 4.930624380574827e-06, + "loss": 1.1093, + "step": 144 + }, + { + "epoch": 0.25846702317290554, + "grad_norm": 1.3689433336257935, + "learning_rate": 4.925668979187315e-06, + "loss": 1.0821, + "step": 145 + }, + { + "epoch": 0.26024955436720143, + "grad_norm": 1.5317623615264893, + "learning_rate": 4.920713577799802e-06, + "loss": 1.0563, + "step": 146 + }, + { + "epoch": 0.2620320855614973, + "grad_norm": 1.8125908374786377, + "learning_rate": 4.915758176412289e-06, + "loss": 1.0721, + "step": 147 + }, + { + "epoch": 0.2638146167557932, + "grad_norm": 1.3997691869735718, + "learning_rate": 4.9108027750247775e-06, + "loss": 1.0583, + "step": 148 + }, + { + "epoch": 0.26559714795008915, + "grad_norm": 2.459528684616089, + "learning_rate": 4.9058473736372656e-06, + "loss": 1.0657, + "step": 149 
+ }, + { + "epoch": 0.26737967914438504, + "grad_norm": 1.4545484781265259, + "learning_rate": 4.900891972249753e-06, + "loss": 1.0698, + "step": 150 + }, + { + "epoch": 0.26737967914438504, + "eval_loss": 1.1283138990402222, + "eval_runtime": 46.5528, + "eval_samples_per_second": 21.481, + "eval_steps_per_second": 1.353, + "step": 150 + }, + { + "epoch": 0.26916221033868093, + "grad_norm": 1.7834577560424805, + "learning_rate": 4.89593657086224e-06, + "loss": 1.051, + "step": 151 + }, + { + "epoch": 0.2709447415329768, + "grad_norm": 2.1426002979278564, + "learning_rate": 4.890981169474728e-06, + "loss": 1.0561, + "step": 152 + }, + { + "epoch": 0.2727272727272727, + "grad_norm": 1.8575809001922607, + "learning_rate": 4.886025768087215e-06, + "loss": 1.0573, + "step": 153 + }, + { + "epoch": 0.27450980392156865, + "grad_norm": 1.7787772417068481, + "learning_rate": 4.881070366699703e-06, + "loss": 1.0508, + "step": 154 + }, + { + "epoch": 0.27629233511586454, + "grad_norm": 1.6413190364837646, + "learning_rate": 4.876114965312191e-06, + "loss": 1.0829, + "step": 155 + }, + { + "epoch": 0.27807486631016043, + "grad_norm": 2.1804025173187256, + "learning_rate": 4.871159563924679e-06, + "loss": 1.0663, + "step": 156 + }, + { + "epoch": 0.2798573975044563, + "grad_norm": 2.081756353378296, + "learning_rate": 4.866204162537166e-06, + "loss": 1.0621, + "step": 157 + }, + { + "epoch": 0.2816399286987522, + "grad_norm": 1.5092004537582397, + "learning_rate": 4.861248761149653e-06, + "loss": 1.071, + "step": 158 + }, + { + "epoch": 0.28342245989304815, + "grad_norm": 2.1826558113098145, + "learning_rate": 4.8562933597621405e-06, + "loss": 1.0904, + "step": 159 + }, + { + "epoch": 0.28520499108734404, + "grad_norm": 1.9138243198394775, + "learning_rate": 4.8513379583746286e-06, + "loss": 1.0662, + "step": 160 + }, + { + "epoch": 0.28520499108734404, + "eval_loss": 1.1119412183761597, + "eval_runtime": 46.6301, + "eval_samples_per_second": 21.445, + "eval_steps_per_second": 1.351, + "step": 160 + }, + { + "epoch": 0.28698752228163993, + "grad_norm": 1.5497403144836426, + "learning_rate": 4.846382556987117e-06, + "loss": 1.0312, + "step": 161 + }, + { + "epoch": 0.2887700534759358, + "grad_norm": 1.6409493684768677, + "learning_rate": 4.841427155599604e-06, + "loss": 1.0779, + "step": 162 + }, + { + "epoch": 0.2905525846702317, + "grad_norm": 1.6504517793655396, + "learning_rate": 4.836471754212091e-06, + "loss": 1.0901, + "step": 163 + }, + { + "epoch": 0.29233511586452765, + "grad_norm": 1.433727741241455, + "learning_rate": 4.831516352824579e-06, + "loss": 1.0555, + "step": 164 + }, + { + "epoch": 0.29411764705882354, + "grad_norm": 1.4695453643798828, + "learning_rate": 4.826560951437067e-06, + "loss": 1.0421, + "step": 165 + }, + { + "epoch": 0.29590017825311943, + "grad_norm": 1.7415134906768799, + "learning_rate": 4.8216055500495545e-06, + "loss": 1.0147, + "step": 166 + }, + { + "epoch": 0.2976827094474153, + "grad_norm": 1.3279739618301392, + "learning_rate": 4.816650148662042e-06, + "loss": 1.0764, + "step": 167 + }, + { + "epoch": 0.2994652406417112, + "grad_norm": 1.5744524002075195, + "learning_rate": 4.81169474727453e-06, + "loss": 1.0412, + "step": 168 + }, + { + "epoch": 0.30124777183600715, + "grad_norm": 1.8190844058990479, + "learning_rate": 4.806739345887017e-06, + "loss": 1.0448, + "step": 169 + }, + { + "epoch": 0.30303030303030304, + "grad_norm": 1.7416859865188599, + "learning_rate": 4.801783944499504e-06, + "loss": 1.052, + "step": 170 + }, + { + "epoch": 0.30303030303030304, 
+ "eval_loss": 1.1019716262817383, + "eval_runtime": 46.6122, + "eval_samples_per_second": 21.454, + "eval_steps_per_second": 1.352, + "step": 170 + }, + { + "epoch": 0.3048128342245989, + "grad_norm": 1.7649554014205933, + "learning_rate": 4.7968285431119924e-06, + "loss": 1.0654, + "step": 171 + }, + { + "epoch": 0.3065953654188948, + "grad_norm": 2.2370729446411133, + "learning_rate": 4.7918731417244805e-06, + "loss": 1.0519, + "step": 172 + }, + { + "epoch": 0.3083778966131907, + "grad_norm": 1.7433515787124634, + "learning_rate": 4.786917740336968e-06, + "loss": 1.0431, + "step": 173 + }, + { + "epoch": 0.31016042780748665, + "grad_norm": 1.4815385341644287, + "learning_rate": 4.781962338949455e-06, + "loss": 1.0614, + "step": 174 + }, + { + "epoch": 0.31194295900178254, + "grad_norm": 1.8037469387054443, + "learning_rate": 4.777006937561943e-06, + "loss": 1.0199, + "step": 175 + }, + { + "epoch": 0.3137254901960784, + "grad_norm": 1.5110318660736084, + "learning_rate": 4.77205153617443e-06, + "loss": 1.0676, + "step": 176 + }, + { + "epoch": 0.3155080213903743, + "grad_norm": 1.390753149986267, + "learning_rate": 4.767096134786918e-06, + "loss": 1.0451, + "step": 177 + }, + { + "epoch": 0.3172905525846702, + "grad_norm": 1.6916826963424683, + "learning_rate": 4.762140733399406e-06, + "loss": 1.041, + "step": 178 + }, + { + "epoch": 0.31907308377896615, + "grad_norm": 1.5903483629226685, + "learning_rate": 4.757185332011893e-06, + "loss": 1.0095, + "step": 179 + }, + { + "epoch": 0.32085561497326204, + "grad_norm": 1.4499986171722412, + "learning_rate": 4.752229930624381e-06, + "loss": 1.0617, + "step": 180 + }, + { + "epoch": 0.32085561497326204, + "eval_loss": 1.0959594249725342, + "eval_runtime": 46.7286, + "eval_samples_per_second": 21.4, + "eval_steps_per_second": 1.348, + "step": 180 + }, + { + "epoch": 0.3226381461675579, + "grad_norm": 1.4177703857421875, + "learning_rate": 4.747274529236869e-06, + "loss": 1.0189, + "step": 181 + }, + { + "epoch": 0.3244206773618538, + "grad_norm": 1.5359338521957397, + "learning_rate": 4.742319127849356e-06, + "loss": 1.0539, + "step": 182 + }, + { + "epoch": 0.32620320855614976, + "grad_norm": 1.726008415222168, + "learning_rate": 4.7373637264618435e-06, + "loss": 1.053, + "step": 183 + }, + { + "epoch": 0.32798573975044565, + "grad_norm": 2.085338830947876, + "learning_rate": 4.732408325074332e-06, + "loss": 1.0649, + "step": 184 + }, + { + "epoch": 0.32976827094474154, + "grad_norm": 2.3661019802093506, + "learning_rate": 4.727452923686819e-06, + "loss": 1.0542, + "step": 185 + }, + { + "epoch": 0.3315508021390374, + "grad_norm": 1.524652123451233, + "learning_rate": 4.722497522299306e-06, + "loss": 1.0562, + "step": 186 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 1.3639343976974487, + "learning_rate": 4.717542120911794e-06, + "loss": 1.0518, + "step": 187 + }, + { + "epoch": 0.33511586452762926, + "grad_norm": 1.6668117046356201, + "learning_rate": 4.712586719524282e-06, + "loss": 1.0644, + "step": 188 + }, + { + "epoch": 0.33689839572192515, + "grad_norm": 1.5209795236587524, + "learning_rate": 4.7076313181367695e-06, + "loss": 1.0203, + "step": 189 + }, + { + "epoch": 0.33868092691622104, + "grad_norm": 1.3939913511276245, + "learning_rate": 4.702675916749257e-06, + "loss": 1.0712, + "step": 190 + }, + { + "epoch": 0.33868092691622104, + "eval_loss": 1.0875341892242432, + "eval_runtime": 46.5406, + "eval_samples_per_second": 21.487, + "eval_steps_per_second": 1.354, + "step": 190 + }, + { + "epoch": 0.3404634581105169, + 
"grad_norm": 1.3856011629104614, + "learning_rate": 4.697720515361745e-06, + "loss": 1.0199, + "step": 191 + }, + { + "epoch": 0.3422459893048128, + "grad_norm": 1.608984112739563, + "learning_rate": 4.692765113974233e-06, + "loss": 1.0507, + "step": 192 + }, + { + "epoch": 0.34402852049910876, + "grad_norm": 1.7646530866622925, + "learning_rate": 4.68780971258672e-06, + "loss": 1.0043, + "step": 193 + }, + { + "epoch": 0.34581105169340465, + "grad_norm": 1.4916458129882812, + "learning_rate": 4.682854311199207e-06, + "loss": 1.0132, + "step": 194 + }, + { + "epoch": 0.34759358288770054, + "grad_norm": 1.4389764070510864, + "learning_rate": 4.677898909811695e-06, + "loss": 1.0088, + "step": 195 + }, + { + "epoch": 0.3493761140819964, + "grad_norm": 1.8099793195724487, + "learning_rate": 4.672943508424183e-06, + "loss": 1.0278, + "step": 196 + }, + { + "epoch": 0.3511586452762923, + "grad_norm": 1.6475704908370972, + "learning_rate": 4.667988107036671e-06, + "loss": 1.0169, + "step": 197 + }, + { + "epoch": 0.35294117647058826, + "grad_norm": 1.4079694747924805, + "learning_rate": 4.663032705649158e-06, + "loss": 1.0327, + "step": 198 + }, + { + "epoch": 0.35472370766488415, + "grad_norm": 1.91256844997406, + "learning_rate": 4.658077304261645e-06, + "loss": 1.0322, + "step": 199 + }, + { + "epoch": 0.35650623885918004, + "grad_norm": 1.4115188121795654, + "learning_rate": 4.653121902874133e-06, + "loss": 1.0105, + "step": 200 + }, + { + "epoch": 0.35650623885918004, + "eval_loss": 1.083202600479126, + "eval_runtime": 46.6309, + "eval_samples_per_second": 21.445, + "eval_steps_per_second": 1.351, + "step": 200 + }, + { + "epoch": 0.3582887700534759, + "grad_norm": 1.5581523180007935, + "learning_rate": 4.648166501486621e-06, + "loss": 1.0371, + "step": 201 + }, + { + "epoch": 0.3600713012477718, + "grad_norm": 2.1223926544189453, + "learning_rate": 4.643211100099108e-06, + "loss": 1.0471, + "step": 202 + }, + { + "epoch": 0.36185383244206776, + "grad_norm": 1.6544064283370972, + "learning_rate": 4.638255698711596e-06, + "loss": 1.0059, + "step": 203 + }, + { + "epoch": 0.36363636363636365, + "grad_norm": 1.9283210039138794, + "learning_rate": 4.633300297324084e-06, + "loss": 1.0192, + "step": 204 + }, + { + "epoch": 0.36541889483065954, + "grad_norm": 1.6795734167099, + "learning_rate": 4.628344895936571e-06, + "loss": 1.0084, + "step": 205 + }, + { + "epoch": 0.3672014260249554, + "grad_norm": 1.3590185642242432, + "learning_rate": 4.6233894945490585e-06, + "loss": 1.0137, + "step": 206 + }, + { + "epoch": 0.3689839572192513, + "grad_norm": 1.6634080410003662, + "learning_rate": 4.6184340931615466e-06, + "loss": 1.0059, + "step": 207 + }, + { + "epoch": 0.37076648841354726, + "grad_norm": 1.5877068042755127, + "learning_rate": 4.613478691774035e-06, + "loss": 1.0255, + "step": 208 + }, + { + "epoch": 0.37254901960784315, + "grad_norm": 1.8005928993225098, + "learning_rate": 4.608523290386522e-06, + "loss": 1.0474, + "step": 209 + }, + { + "epoch": 0.37433155080213903, + "grad_norm": 1.8539314270019531, + "learning_rate": 4.603567888999009e-06, + "loss": 1.011, + "step": 210 + }, + { + "epoch": 0.37433155080213903, + "eval_loss": 1.07635498046875, + "eval_runtime": 46.6192, + "eval_samples_per_second": 21.45, + "eval_steps_per_second": 1.351, + "step": 210 + }, + { + "epoch": 0.3761140819964349, + "grad_norm": 1.860028624534607, + "learning_rate": 4.598612487611497e-06, + "loss": 1.0141, + "step": 211 + }, + { + "epoch": 0.3778966131907308, + "grad_norm": 1.5086894035339355, + 
"learning_rate": 4.5936570862239844e-06, + "loss": 1.0376, + "step": 212 + }, + { + "epoch": 0.37967914438502676, + "grad_norm": 1.4755423069000244, + "learning_rate": 4.5887016848364725e-06, + "loss": 0.9872, + "step": 213 + }, + { + "epoch": 0.38146167557932265, + "grad_norm": 1.6511203050613403, + "learning_rate": 4.58374628344896e-06, + "loss": 1.0179, + "step": 214 + }, + { + "epoch": 0.38324420677361853, + "grad_norm": 1.6704275608062744, + "learning_rate": 4.578790882061447e-06, + "loss": 1.0257, + "step": 215 + }, + { + "epoch": 0.3850267379679144, + "grad_norm": 2.220994234085083, + "learning_rate": 4.573835480673935e-06, + "loss": 1.0393, + "step": 216 + }, + { + "epoch": 0.3868092691622103, + "grad_norm": 1.5368359088897705, + "learning_rate": 4.568880079286422e-06, + "loss": 0.9882, + "step": 217 + }, + { + "epoch": 0.38859180035650626, + "grad_norm": 2.0973968505859375, + "learning_rate": 4.5639246778989096e-06, + "loss": 1.0245, + "step": 218 + }, + { + "epoch": 0.39037433155080214, + "grad_norm": 1.4747536182403564, + "learning_rate": 4.558969276511398e-06, + "loss": 1.0095, + "step": 219 + }, + { + "epoch": 0.39215686274509803, + "grad_norm": 2.0183606147766113, + "learning_rate": 4.554013875123886e-06, + "loss": 0.9998, + "step": 220 + }, + { + "epoch": 0.39215686274509803, + "eval_loss": 1.0749539136886597, + "eval_runtime": 46.6635, + "eval_samples_per_second": 21.43, + "eval_steps_per_second": 1.35, + "step": 220 + }, + { + "epoch": 0.3939393939393939, + "grad_norm": 1.8413187265396118, + "learning_rate": 4.549058473736373e-06, + "loss": 1.0348, + "step": 221 + }, + { + "epoch": 0.39572192513368987, + "grad_norm": 1.7214562892913818, + "learning_rate": 4.54410307234886e-06, + "loss": 1.0154, + "step": 222 + }, + { + "epoch": 0.39750445632798576, + "grad_norm": 1.8179748058319092, + "learning_rate": 4.539147670961348e-06, + "loss": 0.9957, + "step": 223 + }, + { + "epoch": 0.39928698752228164, + "grad_norm": 1.905828833580017, + "learning_rate": 4.534192269573836e-06, + "loss": 0.9873, + "step": 224 + }, + { + "epoch": 0.40106951871657753, + "grad_norm": 1.3392513990402222, + "learning_rate": 4.529236868186324e-06, + "loss": 0.9834, + "step": 225 + }, + { + "epoch": 0.4028520499108734, + "grad_norm": 1.6817848682403564, + "learning_rate": 4.524281466798811e-06, + "loss": 0.9789, + "step": 226 + }, + { + "epoch": 0.40463458110516937, + "grad_norm": 1.7255293130874634, + "learning_rate": 4.519326065411299e-06, + "loss": 0.9868, + "step": 227 + }, + { + "epoch": 0.40641711229946526, + "grad_norm": 1.6086952686309814, + "learning_rate": 4.514370664023786e-06, + "loss": 0.9916, + "step": 228 + }, + { + "epoch": 0.40819964349376114, + "grad_norm": 2.0437726974487305, + "learning_rate": 4.509415262636274e-06, + "loss": 1.0168, + "step": 229 + }, + { + "epoch": 0.40998217468805703, + "grad_norm": 1.719529390335083, + "learning_rate": 4.5044598612487615e-06, + "loss": 1.0293, + "step": 230 + }, + { + "epoch": 0.40998217468805703, + "eval_loss": 1.0670229196548462, + "eval_runtime": 46.6553, + "eval_samples_per_second": 21.434, + "eval_steps_per_second": 1.35, + "step": 230 + }, + { + "epoch": 0.4117647058823529, + "grad_norm": 1.3140292167663574, + "learning_rate": 4.499504459861249e-06, + "loss": 0.98, + "step": 231 + }, + { + "epoch": 0.41354723707664887, + "grad_norm": 1.678130030632019, + "learning_rate": 4.494549058473737e-06, + "loss": 1.0202, + "step": 232 + }, + { + "epoch": 0.41532976827094475, + "grad_norm": 1.4645118713378906, + "learning_rate": 4.489593657086224e-06, 
+ "loss": 0.9959, + "step": 233 + }, + { + "epoch": 0.41711229946524064, + "grad_norm": 1.5366790294647217, + "learning_rate": 4.484638255698711e-06, + "loss": 0.9832, + "step": 234 + }, + { + "epoch": 0.41889483065953653, + "grad_norm": 2.286126136779785, + "learning_rate": 4.479682854311199e-06, + "loss": 0.9998, + "step": 235 + }, + { + "epoch": 0.4206773618538324, + "grad_norm": 2.260972261428833, + "learning_rate": 4.4747274529236875e-06, + "loss": 1.0139, + "step": 236 + }, + { + "epoch": 0.42245989304812837, + "grad_norm": 1.9152151346206665, + "learning_rate": 4.469772051536175e-06, + "loss": 1.0002, + "step": 237 + }, + { + "epoch": 0.42424242424242425, + "grad_norm": 1.7313346862792969, + "learning_rate": 4.464816650148662e-06, + "loss": 0.9828, + "step": 238 + }, + { + "epoch": 0.42602495543672014, + "grad_norm": 1.9363431930541992, + "learning_rate": 4.45986124876115e-06, + "loss": 0.9889, + "step": 239 + }, + { + "epoch": 0.42780748663101603, + "grad_norm": 1.4276996850967407, + "learning_rate": 4.454905847373638e-06, + "loss": 1.0121, + "step": 240 + }, + { + "epoch": 0.42780748663101603, + "eval_loss": 1.0605522394180298, + "eval_runtime": 46.6086, + "eval_samples_per_second": 21.455, + "eval_steps_per_second": 1.352, + "step": 240 + }, + { + "epoch": 0.4295900178253119, + "grad_norm": 1.8374282121658325, + "learning_rate": 4.449950445986125e-06, + "loss": 1.006, + "step": 241 + }, + { + "epoch": 0.43137254901960786, + "grad_norm": 1.6907085180282593, + "learning_rate": 4.444995044598613e-06, + "loss": 0.9724, + "step": 242 + }, + { + "epoch": 0.43315508021390375, + "grad_norm": 1.5171328783035278, + "learning_rate": 4.440039643211101e-06, + "loss": 1.0077, + "step": 243 + }, + { + "epoch": 0.43493761140819964, + "grad_norm": 1.7695019245147705, + "learning_rate": 4.435084241823588e-06, + "loss": 0.9975, + "step": 244 + }, + { + "epoch": 0.43672014260249553, + "grad_norm": 1.7725054025650024, + "learning_rate": 4.430128840436076e-06, + "loss": 1.05, + "step": 245 + }, + { + "epoch": 0.4385026737967914, + "grad_norm": 1.385197639465332, + "learning_rate": 4.425173439048563e-06, + "loss": 0.9982, + "step": 246 + }, + { + "epoch": 0.44028520499108736, + "grad_norm": 1.5117247104644775, + "learning_rate": 4.420218037661051e-06, + "loss": 1.0047, + "step": 247 + }, + { + "epoch": 0.44206773618538325, + "grad_norm": 1.545850157737732, + "learning_rate": 4.415262636273539e-06, + "loss": 0.9703, + "step": 248 + }, + { + "epoch": 0.44385026737967914, + "grad_norm": 1.550068974494934, + "learning_rate": 4.410307234886026e-06, + "loss": 1.0437, + "step": 249 + }, + { + "epoch": 0.44563279857397503, + "grad_norm": 1.711711049079895, + "learning_rate": 4.405351833498513e-06, + "loss": 0.9987, + "step": 250 + }, + { + "epoch": 0.44563279857397503, + "eval_loss": 1.0570459365844727, + "eval_runtime": 46.6929, + "eval_samples_per_second": 21.417, + "eval_steps_per_second": 1.349, + "step": 250 + }, + { + "epoch": 0.4474153297682709, + "grad_norm": 1.3180267810821533, + "learning_rate": 4.400396432111001e-06, + "loss": 0.9692, + "step": 251 + }, + { + "epoch": 0.44919786096256686, + "grad_norm": 1.583044409751892, + "learning_rate": 4.395441030723489e-06, + "loss": 0.9882, + "step": 252 + }, + { + "epoch": 0.45098039215686275, + "grad_norm": 1.7625699043273926, + "learning_rate": 4.3904856293359765e-06, + "loss": 1.019, + "step": 253 + }, + { + "epoch": 0.45276292335115864, + "grad_norm": 1.416609287261963, + "learning_rate": 4.385530227948464e-06, + "loss": 0.9722, + "step": 254 + }, + { + 
"epoch": 0.45454545454545453, + "grad_norm": 1.2862454652786255, + "learning_rate": 4.380574826560952e-06, + "loss": 1.0051, + "step": 255 + }, + { + "epoch": 0.4563279857397504, + "grad_norm": 1.6484653949737549, + "learning_rate": 4.37561942517344e-06, + "loss": 0.9845, + "step": 256 + }, + { + "epoch": 0.45811051693404636, + "grad_norm": 1.7906770706176758, + "learning_rate": 4.370664023785927e-06, + "loss": 0.9639, + "step": 257 + }, + { + "epoch": 0.45989304812834225, + "grad_norm": 1.6653960943222046, + "learning_rate": 4.365708622398414e-06, + "loss": 1.0079, + "step": 258 + }, + { + "epoch": 0.46167557932263814, + "grad_norm": 1.40675687789917, + "learning_rate": 4.3607532210109024e-06, + "loss": 0.9742, + "step": 259 + }, + { + "epoch": 0.46345811051693403, + "grad_norm": 1.9555974006652832, + "learning_rate": 4.35579781962339e-06, + "loss": 1.0031, + "step": 260 + }, + { + "epoch": 0.46345811051693403, + "eval_loss": 1.05105459690094, + "eval_runtime": 46.5955, + "eval_samples_per_second": 21.461, + "eval_steps_per_second": 1.352, + "step": 260 + }, + { + "epoch": 0.46524064171123, + "grad_norm": 1.5779472589492798, + "learning_rate": 4.350842418235878e-06, + "loss": 1.0131, + "step": 261 + }, + { + "epoch": 0.46702317290552586, + "grad_norm": 2.0448193550109863, + "learning_rate": 4.345887016848365e-06, + "loss": 0.9952, + "step": 262 + }, + { + "epoch": 0.46880570409982175, + "grad_norm": 1.8460253477096558, + "learning_rate": 4.340931615460853e-06, + "loss": 0.9894, + "step": 263 + }, + { + "epoch": 0.47058823529411764, + "grad_norm": 1.5205235481262207, + "learning_rate": 4.33597621407334e-06, + "loss": 0.998, + "step": 264 + }, + { + "epoch": 0.47237076648841353, + "grad_norm": 1.6225255727767944, + "learning_rate": 4.3310208126858276e-06, + "loss": 1.0076, + "step": 265 + }, + { + "epoch": 0.4741532976827095, + "grad_norm": 1.5571482181549072, + "learning_rate": 4.326065411298316e-06, + "loss": 0.9738, + "step": 266 + }, + { + "epoch": 0.47593582887700536, + "grad_norm": 1.5609713792800903, + "learning_rate": 4.321110009910804e-06, + "loss": 0.9998, + "step": 267 + }, + { + "epoch": 0.47771836007130125, + "grad_norm": 1.6771962642669678, + "learning_rate": 4.316154608523291e-06, + "loss": 0.9775, + "step": 268 + }, + { + "epoch": 0.47950089126559714, + "grad_norm": 1.8408950567245483, + "learning_rate": 4.311199207135778e-06, + "loss": 1.0069, + "step": 269 + }, + { + "epoch": 0.48128342245989303, + "grad_norm": 1.3811196088790894, + "learning_rate": 4.3062438057482654e-06, + "loss": 1.0066, + "step": 270 + }, + { + "epoch": 0.48128342245989303, + "eval_loss": 1.0506361722946167, + "eval_runtime": 46.56, + "eval_samples_per_second": 21.478, + "eval_steps_per_second": 1.353, + "step": 270 + }, + { + "epoch": 0.483065953654189, + "grad_norm": 1.4970637559890747, + "learning_rate": 4.3012884043607535e-06, + "loss": 0.9513, + "step": 271 + }, + { + "epoch": 0.48484848484848486, + "grad_norm": 1.7834646701812744, + "learning_rate": 4.296333002973242e-06, + "loss": 0.9865, + "step": 272 + }, + { + "epoch": 0.48663101604278075, + "grad_norm": 1.3917527198791504, + "learning_rate": 4.291377601585729e-06, + "loss": 0.977, + "step": 273 + }, + { + "epoch": 0.48841354723707664, + "grad_norm": 1.5269951820373535, + "learning_rate": 4.286422200198216e-06, + "loss": 0.9977, + "step": 274 + }, + { + "epoch": 0.49019607843137253, + "grad_norm": 1.4431025981903076, + "learning_rate": 4.281466798810704e-06, + "loss": 0.9944, + "step": 275 + }, + { + "epoch": 0.4919786096256685, + 
"grad_norm": 1.6358513832092285, + "learning_rate": 4.276511397423191e-06, + "loss": 0.9737, + "step": 276 + }, + { + "epoch": 0.49376114081996436, + "grad_norm": 1.3993628025054932, + "learning_rate": 4.2715559960356795e-06, + "loss": 0.9756, + "step": 277 + }, + { + "epoch": 0.49554367201426025, + "grad_norm": 1.3654887676239014, + "learning_rate": 4.266600594648167e-06, + "loss": 0.9682, + "step": 278 + }, + { + "epoch": 0.49732620320855614, + "grad_norm": 1.3362301588058472, + "learning_rate": 4.261645193260655e-06, + "loss": 0.9647, + "step": 279 + }, + { + "epoch": 0.49910873440285203, + "grad_norm": 1.2186959981918335, + "learning_rate": 4.256689791873142e-06, + "loss": 0.9926, + "step": 280 + }, + { + "epoch": 0.49910873440285203, + "eval_loss": 1.0509613752365112, + "eval_runtime": 46.6119, + "eval_samples_per_second": 21.454, + "eval_steps_per_second": 1.352, + "step": 280 + }, + { + "epoch": 0.5008912655971479, + "grad_norm": 1.639351725578308, + "learning_rate": 4.251734390485629e-06, + "loss": 0.9713, + "step": 281 + }, + { + "epoch": 0.5026737967914439, + "grad_norm": 1.4995989799499512, + "learning_rate": 4.246778989098117e-06, + "loss": 0.9563, + "step": 282 + }, + { + "epoch": 0.5044563279857398, + "grad_norm": 1.5948710441589355, + "learning_rate": 4.2418235877106055e-06, + "loss": 1.0202, + "step": 283 + }, + { + "epoch": 0.5062388591800356, + "grad_norm": 1.8484435081481934, + "learning_rate": 4.236868186323093e-06, + "loss": 0.9919, + "step": 284 + }, + { + "epoch": 0.5080213903743316, + "grad_norm": 1.5352140665054321, + "learning_rate": 4.23191278493558e-06, + "loss": 1.0022, + "step": 285 + }, + { + "epoch": 0.5098039215686274, + "grad_norm": 1.3767133951187134, + "learning_rate": 4.226957383548068e-06, + "loss": 1.0292, + "step": 286 + }, + { + "epoch": 0.5115864527629234, + "grad_norm": 1.5115472078323364, + "learning_rate": 4.222001982160555e-06, + "loss": 0.9171, + "step": 287 + }, + { + "epoch": 0.5133689839572193, + "grad_norm": 1.3484890460968018, + "learning_rate": 4.217046580773043e-06, + "loss": 1.0022, + "step": 288 + }, + { + "epoch": 0.5151515151515151, + "grad_norm": 1.391195297241211, + "learning_rate": 4.212091179385531e-06, + "loss": 0.9813, + "step": 289 + }, + { + "epoch": 0.5169340463458111, + "grad_norm": 1.5086086988449097, + "learning_rate": 4.207135777998018e-06, + "loss": 0.9763, + "step": 290 + }, + { + "epoch": 0.5169340463458111, + "eval_loss": 1.0481914281845093, + "eval_runtime": 46.5639, + "eval_samples_per_second": 21.476, + "eval_steps_per_second": 1.353, + "step": 290 + }, + { + "epoch": 0.5187165775401069, + "grad_norm": 1.4498447179794312, + "learning_rate": 4.202180376610506e-06, + "loss": 0.9716, + "step": 291 + }, + { + "epoch": 0.5204991087344029, + "grad_norm": 1.4278929233551025, + "learning_rate": 4.197224975222993e-06, + "loss": 0.9837, + "step": 292 + }, + { + "epoch": 0.5222816399286988, + "grad_norm": 1.180837869644165, + "learning_rate": 4.192269573835481e-06, + "loss": 0.9888, + "step": 293 + }, + { + "epoch": 0.5240641711229946, + "grad_norm": 1.7972043752670288, + "learning_rate": 4.1873141724479685e-06, + "loss": 0.9611, + "step": 294 + }, + { + "epoch": 0.5258467023172906, + "grad_norm": 1.7201738357543945, + "learning_rate": 4.1823587710604566e-06, + "loss": 0.9771, + "step": 295 + }, + { + "epoch": 0.5276292335115864, + "grad_norm": 1.5042906999588013, + "learning_rate": 4.177403369672944e-06, + "loss": 0.9703, + "step": 296 + }, + { + "epoch": 0.5294117647058824, + "grad_norm": 1.6776472330093384, + 
"learning_rate": 4.172447968285431e-06, + "loss": 0.9677, + "step": 297 + }, + { + "epoch": 0.5311942959001783, + "grad_norm": 1.5473730564117432, + "learning_rate": 4.167492566897919e-06, + "loss": 0.9977, + "step": 298 + }, + { + "epoch": 0.5329768270944741, + "grad_norm": 1.3396865129470825, + "learning_rate": 4.162537165510407e-06, + "loss": 0.9766, + "step": 299 + }, + { + "epoch": 0.5347593582887701, + "grad_norm": 1.6403453350067139, + "learning_rate": 4.1575817641228945e-06, + "loss": 0.9906, + "step": 300 + }, + { + "epoch": 0.5347593582887701, + "eval_loss": 1.0455559492111206, + "eval_runtime": 46.5647, + "eval_samples_per_second": 21.476, + "eval_steps_per_second": 1.353, + "step": 300 + }, + { + "epoch": 0.5365418894830659, + "grad_norm": 1.3962197303771973, + "learning_rate": 4.152626362735382e-06, + "loss": 0.9825, + "step": 301 + }, + { + "epoch": 0.5383244206773619, + "grad_norm": 1.410086750984192, + "learning_rate": 4.14767096134787e-06, + "loss": 1.0086, + "step": 302 + }, + { + "epoch": 0.5401069518716578, + "grad_norm": 1.3633315563201904, + "learning_rate": 4.142715559960357e-06, + "loss": 0.9872, + "step": 303 + }, + { + "epoch": 0.5418894830659536, + "grad_norm": 1.3306059837341309, + "learning_rate": 4.137760158572845e-06, + "loss": 0.9645, + "step": 304 + }, + { + "epoch": 0.5436720142602496, + "grad_norm": 1.4005334377288818, + "learning_rate": 4.132804757185332e-06, + "loss": 1.0071, + "step": 305 + }, + { + "epoch": 0.5454545454545454, + "grad_norm": 1.3115915060043335, + "learning_rate": 4.12784935579782e-06, + "loss": 0.9449, + "step": 306 + }, + { + "epoch": 0.5472370766488414, + "grad_norm": 1.3861132860183716, + "learning_rate": 4.122893954410308e-06, + "loss": 0.9653, + "step": 307 + }, + { + "epoch": 0.5490196078431373, + "grad_norm": 1.7560895681381226, + "learning_rate": 4.117938553022795e-06, + "loss": 0.9823, + "step": 308 + }, + { + "epoch": 0.5508021390374331, + "grad_norm": 1.2997792959213257, + "learning_rate": 4.112983151635283e-06, + "loss": 0.9842, + "step": 309 + }, + { + "epoch": 0.5525846702317291, + "grad_norm": 1.1829795837402344, + "learning_rate": 4.10802775024777e-06, + "loss": 0.9696, + "step": 310 + }, + { + "epoch": 0.5525846702317291, + "eval_loss": 1.0410667657852173, + "eval_runtime": 46.5919, + "eval_samples_per_second": 21.463, + "eval_steps_per_second": 1.352, + "step": 310 + }, + { + "epoch": 0.5543672014260249, + "grad_norm": 1.5539531707763672, + "learning_rate": 4.103072348860258e-06, + "loss": 0.9614, + "step": 311 + }, + { + "epoch": 0.5561497326203209, + "grad_norm": 1.2344005107879639, + "learning_rate": 4.0981169474727456e-06, + "loss": 0.9712, + "step": 312 + }, + { + "epoch": 0.5579322638146168, + "grad_norm": 1.511521816253662, + "learning_rate": 4.093161546085233e-06, + "loss": 0.9629, + "step": 313 + }, + { + "epoch": 0.5597147950089126, + "grad_norm": 1.450751781463623, + "learning_rate": 4.088206144697721e-06, + "loss": 0.9496, + "step": 314 + }, + { + "epoch": 0.5614973262032086, + "grad_norm": 1.3266924619674683, + "learning_rate": 4.083250743310209e-06, + "loss": 0.9296, + "step": 315 + }, + { + "epoch": 0.5632798573975044, + "grad_norm": 1.4744081497192383, + "learning_rate": 4.078295341922696e-06, + "loss": 0.9991, + "step": 316 + }, + { + "epoch": 0.5650623885918004, + "grad_norm": 1.5017259120941162, + "learning_rate": 4.0733399405351834e-06, + "loss": 0.9906, + "step": 317 + }, + { + "epoch": 0.5668449197860963, + "grad_norm": 1.6157349348068237, + "learning_rate": 4.0683845391476715e-06, + "loss": 
0.9707, + "step": 318 + }, + { + "epoch": 0.5686274509803921, + "grad_norm": 1.161723017692566, + "learning_rate": 4.063429137760159e-06, + "loss": 0.9634, + "step": 319 + }, + { + "epoch": 0.5704099821746881, + "grad_norm": 1.2189308404922485, + "learning_rate": 4.058473736372647e-06, + "loss": 0.9898, + "step": 320 + }, + { + "epoch": 0.5704099821746881, + "eval_loss": 1.0385627746582031, + "eval_runtime": 46.6868, + "eval_samples_per_second": 21.419, + "eval_steps_per_second": 1.349, + "step": 320 + }, + { + "epoch": 0.5721925133689839, + "grad_norm": 1.570832371711731, + "learning_rate": 4.053518334985134e-06, + "loss": 0.9551, + "step": 321 + }, + { + "epoch": 0.5739750445632799, + "grad_norm": 1.483991026878357, + "learning_rate": 4.048562933597622e-06, + "loss": 0.954, + "step": 322 + }, + { + "epoch": 0.5757575757575758, + "grad_norm": 1.3836183547973633, + "learning_rate": 4.043607532210109e-06, + "loss": 0.9457, + "step": 323 + }, + { + "epoch": 0.5775401069518716, + "grad_norm": 1.4889212846755981, + "learning_rate": 4.038652130822597e-06, + "loss": 0.9664, + "step": 324 + }, + { + "epoch": 0.5793226381461676, + "grad_norm": 1.424689769744873, + "learning_rate": 4.033696729435085e-06, + "loss": 0.9386, + "step": 325 + }, + { + "epoch": 0.5811051693404634, + "grad_norm": 1.5658745765686035, + "learning_rate": 4.028741328047572e-06, + "loss": 0.9984, + "step": 326 + }, + { + "epoch": 0.5828877005347594, + "grad_norm": 1.7062257528305054, + "learning_rate": 4.02378592666006e-06, + "loss": 0.9732, + "step": 327 + }, + { + "epoch": 0.5846702317290553, + "grad_norm": 1.586201786994934, + "learning_rate": 4.018830525272547e-06, + "loss": 0.9974, + "step": 328 + }, + { + "epoch": 0.5864527629233511, + "grad_norm": 1.6218819618225098, + "learning_rate": 4.0138751238850345e-06, + "loss": 0.9468, + "step": 329 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 1.2196019887924194, + "learning_rate": 4.008919722497523e-06, + "loss": 0.9194, + "step": 330 + }, + { + "epoch": 0.5882352941176471, + "eval_loss": 1.0374821424484253, + "eval_runtime": 46.6459, + "eval_samples_per_second": 21.438, + "eval_steps_per_second": 1.351, + "step": 330 + }, + { + "epoch": 0.5900178253119429, + "grad_norm": 1.5755513906478882, + "learning_rate": 4.003964321110011e-06, + "loss": 0.9806, + "step": 331 + }, + { + "epoch": 0.5918003565062389, + "grad_norm": 1.482846975326538, + "learning_rate": 3.999008919722498e-06, + "loss": 0.9713, + "step": 332 + }, + { + "epoch": 0.5935828877005348, + "grad_norm": 1.4488551616668701, + "learning_rate": 3.994053518334985e-06, + "loss": 0.932, + "step": 333 + }, + { + "epoch": 0.5953654188948306, + "grad_norm": 1.8037104606628418, + "learning_rate": 3.989098116947473e-06, + "loss": 0.973, + "step": 334 + }, + { + "epoch": 0.5971479500891266, + "grad_norm": 1.507331371307373, + "learning_rate": 3.9841427155599605e-06, + "loss": 0.989, + "step": 335 + }, + { + "epoch": 0.5989304812834224, + "grad_norm": 1.49600350856781, + "learning_rate": 3.979187314172449e-06, + "loss": 0.9764, + "step": 336 + }, + { + "epoch": 0.6007130124777184, + "grad_norm": 1.3340990543365479, + "learning_rate": 3.974231912784936e-06, + "loss": 0.9359, + "step": 337 + }, + { + "epoch": 0.6024955436720143, + "grad_norm": 1.6840732097625732, + "learning_rate": 3.969276511397424e-06, + "loss": 0.949, + "step": 338 + }, + { + "epoch": 0.6042780748663101, + "grad_norm": 1.650481104850769, + "learning_rate": 3.964321110009911e-06, + "loss": 0.9643, + "step": 339 + }, + { + "epoch": 0.6060606060606061, + 
"grad_norm": 1.87238609790802, + "learning_rate": 3.959365708622398e-06, + "loss": 0.9673, + "step": 340 + }, + { + "epoch": 0.6060606060606061, + "eval_loss": 1.034188985824585, + "eval_runtime": 46.5782, + "eval_samples_per_second": 21.469, + "eval_steps_per_second": 1.353, + "step": 340 + }, + { + "epoch": 0.6078431372549019, + "grad_norm": 1.3788107633590698, + "learning_rate": 3.9544103072348865e-06, + "loss": 0.9555, + "step": 341 + }, + { + "epoch": 0.6096256684491979, + "grad_norm": 1.4971482753753662, + "learning_rate": 3.949454905847374e-06, + "loss": 1.0169, + "step": 342 + }, + { + "epoch": 0.6114081996434938, + "grad_norm": 1.7131767272949219, + "learning_rate": 3.944499504459862e-06, + "loss": 0.9959, + "step": 343 + }, + { + "epoch": 0.6131907308377896, + "grad_norm": 1.3918700218200684, + "learning_rate": 3.939544103072349e-06, + "loss": 0.9582, + "step": 344 + }, + { + "epoch": 0.6149732620320856, + "grad_norm": 1.3024098873138428, + "learning_rate": 3.934588701684836e-06, + "loss": 0.9577, + "step": 345 + }, + { + "epoch": 0.6167557932263814, + "grad_norm": 1.5201256275177002, + "learning_rate": 3.929633300297324e-06, + "loss": 0.9554, + "step": 346 + }, + { + "epoch": 0.6185383244206774, + "grad_norm": 1.3050342798233032, + "learning_rate": 3.9246778989098124e-06, + "loss": 0.94, + "step": 347 + }, + { + "epoch": 0.6203208556149733, + "grad_norm": 1.3635145425796509, + "learning_rate": 3.9197224975223e-06, + "loss": 0.9789, + "step": 348 + }, + { + "epoch": 0.6221033868092691, + "grad_norm": 1.7963106632232666, + "learning_rate": 3.914767096134787e-06, + "loss": 0.9634, + "step": 349 + }, + { + "epoch": 0.6238859180035651, + "grad_norm": 1.6082175970077515, + "learning_rate": 3.909811694747275e-06, + "loss": 0.941, + "step": 350 + }, + { + "epoch": 0.6238859180035651, + "eval_loss": 1.034374713897705, + "eval_runtime": 46.5986, + "eval_samples_per_second": 21.46, + "eval_steps_per_second": 1.352, + "step": 350 + }, + { + "epoch": 0.6256684491978609, + "grad_norm": 1.575903296470642, + "learning_rate": 3.904856293359762e-06, + "loss": 0.9449, + "step": 351 + }, + { + "epoch": 0.6274509803921569, + "grad_norm": 1.6089097261428833, + "learning_rate": 3.89990089197225e-06, + "loss": 0.9581, + "step": 352 + }, + { + "epoch": 0.6292335115864528, + "grad_norm": 1.5840903520584106, + "learning_rate": 3.8949454905847376e-06, + "loss": 0.9521, + "step": 353 + }, + { + "epoch": 0.6310160427807486, + "grad_norm": 1.9419504404067993, + "learning_rate": 3.889990089197226e-06, + "loss": 0.9552, + "step": 354 + }, + { + "epoch": 0.6327985739750446, + "grad_norm": 1.3905870914459229, + "learning_rate": 3.885034687809713e-06, + "loss": 0.9477, + "step": 355 + }, + { + "epoch": 0.6345811051693404, + "grad_norm": 1.5863651037216187, + "learning_rate": 3.8800792864222e-06, + "loss": 0.9533, + "step": 356 + }, + { + "epoch": 0.6363636363636364, + "grad_norm": 1.7959046363830566, + "learning_rate": 3.875123885034688e-06, + "loss": 0.9568, + "step": 357 + }, + { + "epoch": 0.6381461675579323, + "grad_norm": 1.6786869764328003, + "learning_rate": 3.870168483647176e-06, + "loss": 0.9639, + "step": 358 + }, + { + "epoch": 0.6399286987522281, + "grad_norm": 1.3908863067626953, + "learning_rate": 3.8652130822596635e-06, + "loss": 0.9601, + "step": 359 + }, + { + "epoch": 0.6417112299465241, + "grad_norm": 1.9571239948272705, + "learning_rate": 3.860257680872151e-06, + "loss": 0.9578, + "step": 360 + }, + { + "epoch": 0.6417112299465241, + "eval_loss": 1.0281429290771484, + "eval_runtime": 46.6801, + 
"eval_samples_per_second": 21.422, + "eval_steps_per_second": 1.35, + "step": 360 + }, + { + "epoch": 0.64349376114082, + "grad_norm": 1.445169448852539, + "learning_rate": 3.855302279484638e-06, + "loss": 0.9706, + "step": 361 + }, + { + "epoch": 0.6452762923351159, + "grad_norm": 1.475993275642395, + "learning_rate": 3.850346878097126e-06, + "loss": 0.9396, + "step": 362 + }, + { + "epoch": 0.6470588235294118, + "grad_norm": 1.7329176664352417, + "learning_rate": 3.845391476709614e-06, + "loss": 0.965, + "step": 363 + }, + { + "epoch": 0.6488413547237076, + "grad_norm": 1.6262738704681396, + "learning_rate": 3.8404360753221014e-06, + "loss": 0.9343, + "step": 364 + }, + { + "epoch": 0.6506238859180036, + "grad_norm": 1.3373092412948608, + "learning_rate": 3.835480673934589e-06, + "loss": 0.9652, + "step": 365 + }, + { + "epoch": 0.6524064171122995, + "grad_norm": 1.3127856254577637, + "learning_rate": 3.830525272547077e-06, + "loss": 0.935, + "step": 366 + }, + { + "epoch": 0.6541889483065954, + "grad_norm": 1.8555575609207153, + "learning_rate": 3.825569871159564e-06, + "loss": 0.9491, + "step": 367 + }, + { + "epoch": 0.6559714795008913, + "grad_norm": 1.5025370121002197, + "learning_rate": 3.820614469772052e-06, + "loss": 0.9626, + "step": 368 + }, + { + "epoch": 0.6577540106951871, + "grad_norm": 1.4000506401062012, + "learning_rate": 3.815659068384539e-06, + "loss": 0.9603, + "step": 369 + }, + { + "epoch": 0.6595365418894831, + "grad_norm": 1.4358906745910645, + "learning_rate": 3.810703666997027e-06, + "loss": 0.956, + "step": 370 + }, + { + "epoch": 0.6595365418894831, + "eval_loss": 1.02617347240448, + "eval_runtime": 46.651, + "eval_samples_per_second": 21.436, + "eval_steps_per_second": 1.35, + "step": 370 + }, + { + "epoch": 0.661319073083779, + "grad_norm": 1.8912854194641113, + "learning_rate": 3.8057482656095146e-06, + "loss": 0.9657, + "step": 371 + }, + { + "epoch": 0.6631016042780749, + "grad_norm": 1.5366394519805908, + "learning_rate": 3.8007928642220023e-06, + "loss": 0.9673, + "step": 372 + }, + { + "epoch": 0.6648841354723708, + "grad_norm": 1.7613537311553955, + "learning_rate": 3.79583746283449e-06, + "loss": 0.9436, + "step": 373 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 1.1846625804901123, + "learning_rate": 3.7908820614469776e-06, + "loss": 0.9182, + "step": 374 + }, + { + "epoch": 0.6684491978609626, + "grad_norm": 1.6215946674346924, + "learning_rate": 3.7859266600594653e-06, + "loss": 0.9589, + "step": 375 + }, + { + "epoch": 0.6702317290552585, + "grad_norm": 1.6887030601501465, + "learning_rate": 3.7809712586719525e-06, + "loss": 0.9756, + "step": 376 + }, + { + "epoch": 0.6720142602495544, + "grad_norm": 1.6964572668075562, + "learning_rate": 3.77601585728444e-06, + "loss": 0.961, + "step": 377 + }, + { + "epoch": 0.6737967914438503, + "grad_norm": 1.435586929321289, + "learning_rate": 3.771060455896928e-06, + "loss": 0.9267, + "step": 378 + }, + { + "epoch": 0.6755793226381461, + "grad_norm": 1.7841671705245972, + "learning_rate": 3.766105054509416e-06, + "loss": 0.9518, + "step": 379 + }, + { + "epoch": 0.6773618538324421, + "grad_norm": 1.2004536390304565, + "learning_rate": 3.761149653121903e-06, + "loss": 0.9519, + "step": 380 + }, + { + "epoch": 0.6773618538324421, + "eval_loss": 1.0265334844589233, + "eval_runtime": 46.6028, + "eval_samples_per_second": 21.458, + "eval_steps_per_second": 1.352, + "step": 380 + }, + { + "epoch": 0.679144385026738, + "grad_norm": 1.3047566413879395, + "learning_rate": 3.756194251734391e-06, + "loss": 
0.9484, + "step": 381 + }, + { + "epoch": 0.6809269162210339, + "grad_norm": 1.5224716663360596, + "learning_rate": 3.7512388503468785e-06, + "loss": 0.9477, + "step": 382 + }, + { + "epoch": 0.6827094474153298, + "grad_norm": 1.3004342317581177, + "learning_rate": 3.7462834489593657e-06, + "loss": 0.9604, + "step": 383 + }, + { + "epoch": 0.6844919786096256, + "grad_norm": 1.3921082019805908, + "learning_rate": 3.741328047571854e-06, + "loss": 0.8758, + "step": 384 + }, + { + "epoch": 0.6862745098039216, + "grad_norm": 1.5825459957122803, + "learning_rate": 3.7363726461843415e-06, + "loss": 0.931, + "step": 385 + }, + { + "epoch": 0.6880570409982175, + "grad_norm": 1.5469566583633423, + "learning_rate": 3.7314172447968287e-06, + "loss": 0.9627, + "step": 386 + }, + { + "epoch": 0.6898395721925134, + "grad_norm": 1.2618225812911987, + "learning_rate": 3.7264618434093164e-06, + "loss": 0.9368, + "step": 387 + }, + { + "epoch": 0.6916221033868093, + "grad_norm": 1.4424333572387695, + "learning_rate": 3.721506442021804e-06, + "loss": 0.9692, + "step": 388 + }, + { + "epoch": 0.6934046345811051, + "grad_norm": 1.6238356828689575, + "learning_rate": 3.716551040634292e-06, + "loss": 0.9367, + "step": 389 + }, + { + "epoch": 0.6951871657754011, + "grad_norm": 1.4406427145004272, + "learning_rate": 3.7115956392467794e-06, + "loss": 0.954, + "step": 390 + }, + { + "epoch": 0.6951871657754011, + "eval_loss": 1.0200062990188599, + "eval_runtime": 46.7033, + "eval_samples_per_second": 21.412, + "eval_steps_per_second": 1.349, + "step": 390 + }, + { + "epoch": 0.696969696969697, + "grad_norm": 1.437024712562561, + "learning_rate": 3.706640237859267e-06, + "loss": 0.9506, + "step": 391 + }, + { + "epoch": 0.6987522281639929, + "grad_norm": 1.4848400354385376, + "learning_rate": 3.7016848364717543e-06, + "loss": 0.9205, + "step": 392 + }, + { + "epoch": 0.7005347593582888, + "grad_norm": 1.484315276145935, + "learning_rate": 3.696729435084242e-06, + "loss": 0.9379, + "step": 393 + }, + { + "epoch": 0.7023172905525846, + "grad_norm": 1.4815152883529663, + "learning_rate": 3.6917740336967296e-06, + "loss": 0.9742, + "step": 394 + }, + { + "epoch": 0.7040998217468806, + "grad_norm": 1.586028814315796, + "learning_rate": 3.6868186323092177e-06, + "loss": 0.9397, + "step": 395 + }, + { + "epoch": 0.7058823529411765, + "grad_norm": 1.713163137435913, + "learning_rate": 3.681863230921705e-06, + "loss": 0.9133, + "step": 396 + }, + { + "epoch": 0.7076648841354723, + "grad_norm": 1.3876514434814453, + "learning_rate": 3.6769078295341926e-06, + "loss": 0.9483, + "step": 397 + }, + { + "epoch": 0.7094474153297683, + "grad_norm": 1.3811695575714111, + "learning_rate": 3.6719524281466802e-06, + "loss": 0.9432, + "step": 398 + }, + { + "epoch": 0.7112299465240641, + "grad_norm": 1.4277275800704956, + "learning_rate": 3.6669970267591675e-06, + "loss": 0.9606, + "step": 399 + }, + { + "epoch": 0.7130124777183601, + "grad_norm": 1.2409071922302246, + "learning_rate": 3.6620416253716556e-06, + "loss": 0.9754, + "step": 400 + }, + { + "epoch": 0.7130124777183601, + "eval_loss": 1.0203064680099487, + "eval_runtime": 46.6978, + "eval_samples_per_second": 21.414, + "eval_steps_per_second": 1.349, + "step": 400 + }, + { + "epoch": 0.714795008912656, + "grad_norm": 1.7478358745574951, + "learning_rate": 3.6570862239841432e-06, + "loss": 0.9343, + "step": 401 + }, + { + "epoch": 0.7165775401069518, + "grad_norm": 1.3118643760681152, + "learning_rate": 3.6521308225966305e-06, + "loss": 0.9459, + "step": 402 + }, + { + "epoch": 
0.7183600713012478, + "grad_norm": 1.6754299402236938, + "learning_rate": 3.647175421209118e-06, + "loss": 0.9477, + "step": 403 + }, + { + "epoch": 0.7201426024955436, + "grad_norm": 1.4706963300704956, + "learning_rate": 3.6422200198216058e-06, + "loss": 0.9237, + "step": 404 + }, + { + "epoch": 0.7219251336898396, + "grad_norm": 1.7521008253097534, + "learning_rate": 3.637264618434094e-06, + "loss": 0.9473, + "step": 405 + }, + { + "epoch": 0.7237076648841355, + "grad_norm": 1.5755548477172852, + "learning_rate": 3.632309217046581e-06, + "loss": 0.9632, + "step": 406 + }, + { + "epoch": 0.7254901960784313, + "grad_norm": 1.6598784923553467, + "learning_rate": 3.6273538156590688e-06, + "loss": 0.96, + "step": 407 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 1.4760100841522217, + "learning_rate": 3.6223984142715564e-06, + "loss": 0.9562, + "step": 408 + }, + { + "epoch": 0.7290552584670231, + "grad_norm": 1.5755516290664673, + "learning_rate": 3.6174430128840437e-06, + "loss": 0.9244, + "step": 409 + }, + { + "epoch": 0.7308377896613191, + "grad_norm": 1.3510775566101074, + "learning_rate": 3.6124876114965313e-06, + "loss": 0.9179, + "step": 410 + }, + { + "epoch": 0.7308377896613191, + "eval_loss": 1.0200093984603882, + "eval_runtime": 46.669, + "eval_samples_per_second": 21.427, + "eval_steps_per_second": 1.35, + "step": 410 + }, + { + "epoch": 0.732620320855615, + "grad_norm": 1.5007761716842651, + "learning_rate": 3.6075322101090194e-06, + "loss": 0.9746, + "step": 411 + }, + { + "epoch": 0.7344028520499108, + "grad_norm": 1.417822241783142, + "learning_rate": 3.6025768087215067e-06, + "loss": 0.9199, + "step": 412 + }, + { + "epoch": 0.7361853832442068, + "grad_norm": 1.534996509552002, + "learning_rate": 3.5976214073339943e-06, + "loss": 0.9603, + "step": 413 + }, + { + "epoch": 0.7379679144385026, + "grad_norm": 1.7224639654159546, + "learning_rate": 3.592666005946482e-06, + "loss": 0.9735, + "step": 414 + }, + { + "epoch": 0.7397504456327986, + "grad_norm": 1.594838261604309, + "learning_rate": 3.5877106045589692e-06, + "loss": 0.9362, + "step": 415 + }, + { + "epoch": 0.7415329768270945, + "grad_norm": 1.3750888109207153, + "learning_rate": 3.5827552031714573e-06, + "loss": 0.9365, + "step": 416 + }, + { + "epoch": 0.7433155080213903, + "grad_norm": 1.5538432598114014, + "learning_rate": 3.577799801783945e-06, + "loss": 0.9562, + "step": 417 + }, + { + "epoch": 0.7450980392156863, + "grad_norm": 1.4937995672225952, + "learning_rate": 3.5728444003964326e-06, + "loss": 0.9139, + "step": 418 + }, + { + "epoch": 0.7468805704099821, + "grad_norm": 1.2770204544067383, + "learning_rate": 3.56788899900892e-06, + "loss": 0.9246, + "step": 419 + }, + { + "epoch": 0.7486631016042781, + "grad_norm": 1.5773707628250122, + "learning_rate": 3.5629335976214075e-06, + "loss": 0.9558, + "step": 420 + }, + { + "epoch": 0.7486631016042781, + "eval_loss": 1.018223524093628, + "eval_runtime": 46.7216, + "eval_samples_per_second": 21.403, + "eval_steps_per_second": 1.348, + "step": 420 + }, + { + "epoch": 0.750445632798574, + "grad_norm": 1.7162814140319824, + "learning_rate": 3.5579781962338956e-06, + "loss": 0.9187, + "step": 421 + }, + { + "epoch": 0.7522281639928698, + "grad_norm": 1.4196544885635376, + "learning_rate": 3.553022794846383e-06, + "loss": 0.9518, + "step": 422 + }, + { + "epoch": 0.7540106951871658, + "grad_norm": 1.3683091402053833, + "learning_rate": 3.5480673934588705e-06, + "loss": 0.9248, + "step": 423 + }, + { + "epoch": 0.7557932263814616, + "grad_norm": 
1.3377984762191772, + "learning_rate": 3.543111992071358e-06, + "loss": 0.9598, + "step": 424 + }, + { + "epoch": 0.7575757575757576, + "grad_norm": 1.5043303966522217, + "learning_rate": 3.5381565906838454e-06, + "loss": 0.9175, + "step": 425 + }, + { + "epoch": 0.7593582887700535, + "grad_norm": 1.3370412588119507, + "learning_rate": 3.533201189296333e-06, + "loss": 0.9645, + "step": 426 + }, + { + "epoch": 0.7611408199643493, + "grad_norm": 1.914324402809143, + "learning_rate": 3.528245787908821e-06, + "loss": 0.9281, + "step": 427 + }, + { + "epoch": 0.7629233511586453, + "grad_norm": 1.2636290788650513, + "learning_rate": 3.523290386521309e-06, + "loss": 0.9162, + "step": 428 + }, + { + "epoch": 0.7647058823529411, + "grad_norm": 1.3550281524658203, + "learning_rate": 3.518334985133796e-06, + "loss": 0.948, + "step": 429 + }, + { + "epoch": 0.7664884135472371, + "grad_norm": 1.3391727209091187, + "learning_rate": 3.5133795837462837e-06, + "loss": 0.9303, + "step": 430 + }, + { + "epoch": 0.7664884135472371, + "eval_loss": 1.0136384963989258, + "eval_runtime": 46.6342, + "eval_samples_per_second": 21.443, + "eval_steps_per_second": 1.351, + "step": 430 + }, + { + "epoch": 0.768270944741533, + "grad_norm": 1.5297709703445435, + "learning_rate": 3.508424182358771e-06, + "loss": 0.98, + "step": 431 + }, + { + "epoch": 0.7700534759358288, + "grad_norm": 1.2201571464538574, + "learning_rate": 3.503468780971259e-06, + "loss": 0.9274, + "step": 432 + }, + { + "epoch": 0.7718360071301248, + "grad_norm": 1.2383842468261719, + "learning_rate": 3.4985133795837467e-06, + "loss": 0.9424, + "step": 433 + }, + { + "epoch": 0.7736185383244206, + "grad_norm": 1.5169589519500732, + "learning_rate": 3.4935579781962344e-06, + "loss": 0.9635, + "step": 434 + }, + { + "epoch": 0.7754010695187166, + "grad_norm": 1.2745269536972046, + "learning_rate": 3.4886025768087216e-06, + "loss": 0.9307, + "step": 435 + }, + { + "epoch": 0.7771836007130125, + "grad_norm": 1.3778202533721924, + "learning_rate": 3.4836471754212093e-06, + "loss": 0.9197, + "step": 436 + }, + { + "epoch": 0.7789661319073083, + "grad_norm": 1.466562271118164, + "learning_rate": 3.4786917740336974e-06, + "loss": 0.9332, + "step": 437 + }, + { + "epoch": 0.7807486631016043, + "grad_norm": 1.4385347366333008, + "learning_rate": 3.4737363726461846e-06, + "loss": 0.9458, + "step": 438 + }, + { + "epoch": 0.7825311942959001, + "grad_norm": 1.6402126550674438, + "learning_rate": 3.4687809712586723e-06, + "loss": 0.9622, + "step": 439 + }, + { + "epoch": 0.7843137254901961, + "grad_norm": 1.5755741596221924, + "learning_rate": 3.46382556987116e-06, + "loss": 0.9608, + "step": 440 + }, + { + "epoch": 0.7843137254901961, + "eval_loss": 1.010831356048584, + "eval_runtime": 46.6146, + "eval_samples_per_second": 21.452, + "eval_steps_per_second": 1.352, + "step": 440 + }, + { + "epoch": 0.786096256684492, + "grad_norm": 1.723796010017395, + "learning_rate": 3.458870168483647e-06, + "loss": 0.945, + "step": 441 + }, + { + "epoch": 0.7878787878787878, + "grad_norm": 1.3989365100860596, + "learning_rate": 3.453914767096135e-06, + "loss": 0.931, + "step": 442 + }, + { + "epoch": 0.7896613190730838, + "grad_norm": 1.5464838743209839, + "learning_rate": 3.448959365708623e-06, + "loss": 0.9168, + "step": 443 + }, + { + "epoch": 0.7914438502673797, + "grad_norm": 1.7541731595993042, + "learning_rate": 3.4440039643211106e-06, + "loss": 0.9397, + "step": 444 + }, + { + "epoch": 0.7932263814616756, + "grad_norm": 1.467142939567566, + "learning_rate": 
3.439048562933598e-06, + "loss": 0.9484, + "step": 445 + }, + { + "epoch": 0.7950089126559715, + "grad_norm": 1.7130905389785767, + "learning_rate": 3.4340931615460855e-06, + "loss": 0.9211, + "step": 446 + }, + { + "epoch": 0.7967914438502673, + "grad_norm": 1.4574313163757324, + "learning_rate": 3.4291377601585727e-06, + "loss": 0.9027, + "step": 447 + }, + { + "epoch": 0.7985739750445633, + "grad_norm": 1.7143244743347168, + "learning_rate": 3.424182358771061e-06, + "loss": 0.9465, + "step": 448 + }, + { + "epoch": 0.8003565062388592, + "grad_norm": 1.5416609048843384, + "learning_rate": 3.4192269573835485e-06, + "loss": 0.9332, + "step": 449 + }, + { + "epoch": 0.8021390374331551, + "grad_norm": 1.456154704093933, + "learning_rate": 3.414271555996036e-06, + "loss": 0.9729, + "step": 450 + }, + { + "epoch": 0.8021390374331551, + "eval_loss": 1.0104098320007324, + "eval_runtime": 46.6938, + "eval_samples_per_second": 21.416, + "eval_steps_per_second": 1.349, + "step": 450 + }, + { + "epoch": 0.803921568627451, + "grad_norm": 1.4171167612075806, + "learning_rate": 3.4093161546085234e-06, + "loss": 0.9444, + "step": 451 + }, + { + "epoch": 0.8057040998217468, + "grad_norm": 1.294493317604065, + "learning_rate": 3.404360753221011e-06, + "loss": 0.9351, + "step": 452 + }, + { + "epoch": 0.8074866310160428, + "grad_norm": 1.312842607498169, + "learning_rate": 3.399405351833499e-06, + "loss": 0.9246, + "step": 453 + }, + { + "epoch": 0.8092691622103387, + "grad_norm": 1.524051308631897, + "learning_rate": 3.3944499504459868e-06, + "loss": 0.9531, + "step": 454 + }, + { + "epoch": 0.8110516934046346, + "grad_norm": 1.317704439163208, + "learning_rate": 3.389494549058474e-06, + "loss": 0.9273, + "step": 455 + }, + { + "epoch": 0.8128342245989305, + "grad_norm": 1.7461873292922974, + "learning_rate": 3.3845391476709617e-06, + "loss": 0.9193, + "step": 456 + }, + { + "epoch": 0.8146167557932263, + "grad_norm": 1.4218385219573975, + "learning_rate": 3.379583746283449e-06, + "loss": 0.9355, + "step": 457 + }, + { + "epoch": 0.8163992869875223, + "grad_norm": 1.3833569288253784, + "learning_rate": 3.3746283448959366e-06, + "loss": 0.9197, + "step": 458 + }, + { + "epoch": 0.8181818181818182, + "grad_norm": 1.394129991531372, + "learning_rate": 3.3696729435084247e-06, + "loss": 0.9521, + "step": 459 + }, + { + "epoch": 0.8199643493761141, + "grad_norm": 1.4939794540405273, + "learning_rate": 3.3647175421209123e-06, + "loss": 0.923, + "step": 460 + }, + { + "epoch": 0.8199643493761141, + "eval_loss": 1.0103003978729248, + "eval_runtime": 46.6448, + "eval_samples_per_second": 21.439, + "eval_steps_per_second": 1.351, + "step": 460 + }, + { + "epoch": 0.82174688057041, + "grad_norm": 1.37795090675354, + "learning_rate": 3.3597621407333996e-06, + "loss": 0.9514, + "step": 461 + }, + { + "epoch": 0.8235294117647058, + "grad_norm": 1.3167624473571777, + "learning_rate": 3.3548067393458872e-06, + "loss": 0.9349, + "step": 462 + }, + { + "epoch": 0.8253119429590018, + "grad_norm": 1.4826208353042603, + "learning_rate": 3.349851337958375e-06, + "loss": 0.9225, + "step": 463 + }, + { + "epoch": 0.8270944741532977, + "grad_norm": 1.6088447570800781, + "learning_rate": 3.344895936570863e-06, + "loss": 0.9237, + "step": 464 + }, + { + "epoch": 0.8288770053475936, + "grad_norm": 1.490053653717041, + "learning_rate": 3.33994053518335e-06, + "loss": 0.9679, + "step": 465 + }, + { + "epoch": 0.8306595365418895, + "grad_norm": 1.485168695449829, + "learning_rate": 3.334985133795838e-06, + "loss": 0.9425, + "step": 466 
+ }, + { + "epoch": 0.8324420677361853, + "grad_norm": 1.6942230463027954, + "learning_rate": 3.330029732408325e-06, + "loss": 0.9106, + "step": 467 + }, + { + "epoch": 0.8342245989304813, + "grad_norm": 1.4257986545562744, + "learning_rate": 3.3250743310208128e-06, + "loss": 0.9405, + "step": 468 + }, + { + "epoch": 0.8360071301247772, + "grad_norm": 1.4217643737792969, + "learning_rate": 3.3201189296333004e-06, + "loss": 0.9197, + "step": 469 + }, + { + "epoch": 0.8377896613190731, + "grad_norm": 1.3932244777679443, + "learning_rate": 3.3151635282457885e-06, + "loss": 0.9177, + "step": 470 + }, + { + "epoch": 0.8377896613190731, + "eval_loss": 1.0090515613555908, + "eval_runtime": 46.8153, + "eval_samples_per_second": 21.361, + "eval_steps_per_second": 1.346, + "step": 470 + }, + { + "epoch": 0.839572192513369, + "grad_norm": 1.2177809476852417, + "learning_rate": 3.3102081268582757e-06, + "loss": 0.92, + "step": 471 + }, + { + "epoch": 0.8413547237076648, + "grad_norm": 1.2709245681762695, + "learning_rate": 3.3052527254707634e-06, + "loss": 0.9185, + "step": 472 + }, + { + "epoch": 0.8431372549019608, + "grad_norm": 1.4141621589660645, + "learning_rate": 3.300297324083251e-06, + "loss": 0.9332, + "step": 473 + }, + { + "epoch": 0.8449197860962567, + "grad_norm": 1.4409421682357788, + "learning_rate": 3.2953419226957383e-06, + "loss": 0.9604, + "step": 474 + }, + { + "epoch": 0.8467023172905526, + "grad_norm": 1.2500907182693481, + "learning_rate": 3.2903865213082264e-06, + "loss": 0.9369, + "step": 475 + }, + { + "epoch": 0.8484848484848485, + "grad_norm": 1.3185211420059204, + "learning_rate": 3.285431119920714e-06, + "loss": 0.9409, + "step": 476 + }, + { + "epoch": 0.8502673796791443, + "grad_norm": 1.2824376821517944, + "learning_rate": 3.2804757185332013e-06, + "loss": 0.9829, + "step": 477 + }, + { + "epoch": 0.8520499108734403, + "grad_norm": 1.4796322584152222, + "learning_rate": 3.275520317145689e-06, + "loss": 0.9282, + "step": 478 + }, + { + "epoch": 0.8538324420677362, + "grad_norm": 1.5836542844772339, + "learning_rate": 3.2705649157581766e-06, + "loss": 0.9382, + "step": 479 + }, + { + "epoch": 0.8556149732620321, + "grad_norm": 1.553688645362854, + "learning_rate": 3.2656095143706647e-06, + "loss": 0.9423, + "step": 480 + }, + { + "epoch": 0.8556149732620321, + "eval_loss": 1.0048539638519287, + "eval_runtime": 46.7514, + "eval_samples_per_second": 21.39, + "eval_steps_per_second": 1.348, + "step": 480 + }, + { + "epoch": 0.857397504456328, + "grad_norm": 1.568942666053772, + "learning_rate": 3.260654112983152e-06, + "loss": 0.9206, + "step": 481 + }, + { + "epoch": 0.8591800356506238, + "grad_norm": 1.928328037261963, + "learning_rate": 3.2556987115956396e-06, + "loss": 0.9357, + "step": 482 + }, + { + "epoch": 0.8609625668449198, + "grad_norm": 1.5885226726531982, + "learning_rate": 3.2507433102081273e-06, + "loss": 0.9142, + "step": 483 + }, + { + "epoch": 0.8627450980392157, + "grad_norm": 1.5181794166564941, + "learning_rate": 3.2457879088206145e-06, + "loss": 0.8949, + "step": 484 + }, + { + "epoch": 0.8645276292335116, + "grad_norm": 1.2631601095199585, + "learning_rate": 3.240832507433102e-06, + "loss": 0.9369, + "step": 485 + }, + { + "epoch": 0.8663101604278075, + "grad_norm": 1.3040809631347656, + "learning_rate": 3.2358771060455903e-06, + "loss": 0.9243, + "step": 486 + }, + { + "epoch": 0.8680926916221033, + "grad_norm": 1.8351012468338013, + "learning_rate": 3.2309217046580775e-06, + "loss": 0.9463, + "step": 487 + }, + { + "epoch": 0.8698752228163993, + 
"grad_norm": 1.5455973148345947, + "learning_rate": 3.225966303270565e-06, + "loss": 0.9203, + "step": 488 + }, + { + "epoch": 0.8716577540106952, + "grad_norm": 1.3624253273010254, + "learning_rate": 3.221010901883053e-06, + "loss": 0.8842, + "step": 489 + }, + { + "epoch": 0.8734402852049911, + "grad_norm": 1.2611980438232422, + "learning_rate": 3.21605550049554e-06, + "loss": 0.9197, + "step": 490 + }, + { + "epoch": 0.8734402852049911, + "eval_loss": 1.0056157112121582, + "eval_runtime": 46.6828, + "eval_samples_per_second": 21.421, + "eval_steps_per_second": 1.35, + "step": 490 + }, + { + "epoch": 0.875222816399287, + "grad_norm": 1.5297733545303345, + "learning_rate": 3.211100099108028e-06, + "loss": 0.9202, + "step": 491 + }, + { + "epoch": 0.8770053475935828, + "grad_norm": 1.541585922241211, + "learning_rate": 3.206144697720516e-06, + "loss": 0.9288, + "step": 492 + }, + { + "epoch": 0.8787878787878788, + "grad_norm": 1.3235573768615723, + "learning_rate": 3.201189296333003e-06, + "loss": 0.9242, + "step": 493 + }, + { + "epoch": 0.8805704099821747, + "grad_norm": 1.4747593402862549, + "learning_rate": 3.1962338949454907e-06, + "loss": 0.918, + "step": 494 + }, + { + "epoch": 0.8823529411764706, + "grad_norm": 1.530286192893982, + "learning_rate": 3.1912784935579784e-06, + "loss": 0.9223, + "step": 495 + }, + { + "epoch": 0.8841354723707665, + "grad_norm": 1.5007365942001343, + "learning_rate": 3.1863230921704664e-06, + "loss": 0.9273, + "step": 496 + }, + { + "epoch": 0.8859180035650623, + "grad_norm": 1.6092145442962646, + "learning_rate": 3.1813676907829537e-06, + "loss": 0.9085, + "step": 497 + }, + { + "epoch": 0.8877005347593583, + "grad_norm": 1.4303230047225952, + "learning_rate": 3.1764122893954413e-06, + "loss": 0.9255, + "step": 498 + }, + { + "epoch": 0.8894830659536542, + "grad_norm": 1.3100906610488892, + "learning_rate": 3.171456888007929e-06, + "loss": 0.8832, + "step": 499 + }, + { + "epoch": 0.8912655971479501, + "grad_norm": 1.608756184577942, + "learning_rate": 3.1665014866204162e-06, + "loss": 0.9347, + "step": 500 + }, + { + "epoch": 0.8912655971479501, + "eval_loss": 1.004418969154358, + "eval_runtime": 46.7969, + "eval_samples_per_second": 21.369, + "eval_steps_per_second": 1.346, + "step": 500 + }, + { + "epoch": 0.893048128342246, + "grad_norm": 1.752517819404602, + "learning_rate": 3.161546085232904e-06, + "loss": 0.9387, + "step": 501 + }, + { + "epoch": 0.8948306595365418, + "grad_norm": 1.4832419157028198, + "learning_rate": 3.156590683845392e-06, + "loss": 0.953, + "step": 502 + }, + { + "epoch": 0.8966131907308378, + "grad_norm": 1.3396515846252441, + "learning_rate": 3.1516352824578792e-06, + "loss": 0.9047, + "step": 503 + }, + { + "epoch": 0.8983957219251337, + "grad_norm": 1.3921819925308228, + "learning_rate": 3.146679881070367e-06, + "loss": 0.9185, + "step": 504 + }, + { + "epoch": 0.9001782531194296, + "grad_norm": 1.469477891921997, + "learning_rate": 3.1417244796828546e-06, + "loss": 0.939, + "step": 505 + }, + { + "epoch": 0.9019607843137255, + "grad_norm": 1.3249036073684692, + "learning_rate": 3.136769078295342e-06, + "loss": 0.9455, + "step": 506 + }, + { + "epoch": 0.9037433155080213, + "grad_norm": 1.5113000869750977, + "learning_rate": 3.13181367690783e-06, + "loss": 0.9438, + "step": 507 + }, + { + "epoch": 0.9055258467023173, + "grad_norm": 1.2723312377929688, + "learning_rate": 3.1268582755203175e-06, + "loss": 0.882, + "step": 508 + }, + { + "epoch": 0.9073083778966132, + "grad_norm": 1.5971113443374634, + "learning_rate": 
3.121902874132805e-06, + "loss": 0.9296, + "step": 509 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 1.5077036619186401, + "learning_rate": 3.1169474727452924e-06, + "loss": 0.936, + "step": 510 + }, + { + "epoch": 0.9090909090909091, + "eval_loss": 1.0027379989624023, + "eval_runtime": 46.6635, + "eval_samples_per_second": 21.43, + "eval_steps_per_second": 1.35, + "step": 510 + }, + { + "epoch": 0.910873440285205, + "grad_norm": 1.405448317527771, + "learning_rate": 3.11199207135778e-06, + "loss": 0.9213, + "step": 511 + }, + { + "epoch": 0.9126559714795008, + "grad_norm": 1.4942853450775146, + "learning_rate": 3.107036669970268e-06, + "loss": 0.9436, + "step": 512 + }, + { + "epoch": 0.9144385026737968, + "grad_norm": 1.7423145771026611, + "learning_rate": 3.1020812685827554e-06, + "loss": 0.9292, + "step": 513 + }, + { + "epoch": 0.9162210338680927, + "grad_norm": 1.383570671081543, + "learning_rate": 3.097125867195243e-06, + "loss": 0.9229, + "step": 514 + }, + { + "epoch": 0.9180035650623886, + "grad_norm": 1.337597131729126, + "learning_rate": 3.0921704658077308e-06, + "loss": 0.9312, + "step": 515 + }, + { + "epoch": 0.9197860962566845, + "grad_norm": 1.3440173864364624, + "learning_rate": 3.087215064420218e-06, + "loss": 0.9327, + "step": 516 + }, + { + "epoch": 0.9215686274509803, + "grad_norm": 1.2668343782424927, + "learning_rate": 3.0822596630327057e-06, + "loss": 0.8952, + "step": 517 + }, + { + "epoch": 0.9233511586452763, + "grad_norm": 1.346970796585083, + "learning_rate": 3.0773042616451937e-06, + "loss": 0.8791, + "step": 518 + }, + { + "epoch": 0.9251336898395722, + "grad_norm": 1.4276890754699707, + "learning_rate": 3.0723488602576814e-06, + "loss": 0.9303, + "step": 519 + }, + { + "epoch": 0.9269162210338681, + "grad_norm": 1.7119300365447998, + "learning_rate": 3.0673934588701686e-06, + "loss": 0.9023, + "step": 520 + }, + { + "epoch": 0.9269162210338681, + "eval_loss": 1.0006085634231567, + "eval_runtime": 46.6636, + "eval_samples_per_second": 21.43, + "eval_steps_per_second": 1.35, + "step": 520 + }, + { + "epoch": 0.928698752228164, + "grad_norm": 1.5053153038024902, + "learning_rate": 3.0624380574826563e-06, + "loss": 0.9321, + "step": 521 + }, + { + "epoch": 0.93048128342246, + "grad_norm": 1.4875434637069702, + "learning_rate": 3.0574826560951435e-06, + "loss": 0.9112, + "step": 522 + }, + { + "epoch": 0.9322638146167558, + "grad_norm": 1.5411895513534546, + "learning_rate": 3.0525272547076316e-06, + "loss": 0.9279, + "step": 523 + }, + { + "epoch": 0.9340463458110517, + "grad_norm": 1.7409446239471436, + "learning_rate": 3.0475718533201193e-06, + "loss": 0.8981, + "step": 524 + }, + { + "epoch": 0.9358288770053476, + "grad_norm": 1.537593126296997, + "learning_rate": 3.042616451932607e-06, + "loss": 0.9232, + "step": 525 + }, + { + "epoch": 0.9376114081996435, + "grad_norm": 1.4763727188110352, + "learning_rate": 3.037661050545094e-06, + "loss": 0.9251, + "step": 526 + }, + { + "epoch": 0.9393939393939394, + "grad_norm": 1.4310897588729858, + "learning_rate": 3.032705649157582e-06, + "loss": 0.9222, + "step": 527 + }, + { + "epoch": 0.9411764705882353, + "grad_norm": 1.3526382446289062, + "learning_rate": 3.02775024777007e-06, + "loss": 0.9037, + "step": 528 + }, + { + "epoch": 0.9429590017825312, + "grad_norm": 1.479190707206726, + "learning_rate": 3.022794846382557e-06, + "loss": 0.9236, + "step": 529 + }, + { + "epoch": 0.9447415329768271, + "grad_norm": 1.3150599002838135, + "learning_rate": 3.017839444995045e-06, + "loss": 0.915, + "step": 530 + 
}, + { + "epoch": 0.9447415329768271, + "eval_loss": 1.0004721879959106, + "eval_runtime": 46.7447, + "eval_samples_per_second": 21.393, + "eval_steps_per_second": 1.348, + "step": 530 + }, + { + "epoch": 0.946524064171123, + "grad_norm": 1.5540575981140137, + "learning_rate": 3.0128840436075325e-06, + "loss": 0.9368, + "step": 531 + }, + { + "epoch": 0.948306595365419, + "grad_norm": 1.236002802848816, + "learning_rate": 3.0079286422200197e-06, + "loss": 0.9127, + "step": 532 + }, + { + "epoch": 0.9500891265597148, + "grad_norm": 1.1216799020767212, + "learning_rate": 3.0029732408325074e-06, + "loss": 0.9257, + "step": 533 + }, + { + "epoch": 0.9518716577540107, + "grad_norm": 1.2347314357757568, + "learning_rate": 2.9980178394449955e-06, + "loss": 0.9031, + "step": 534 + }, + { + "epoch": 0.9536541889483066, + "grad_norm": 1.4934622049331665, + "learning_rate": 2.993062438057483e-06, + "loss": 0.9392, + "step": 535 + }, + { + "epoch": 0.9554367201426025, + "grad_norm": 1.4701998233795166, + "learning_rate": 2.9881070366699704e-06, + "loss": 0.9553, + "step": 536 + }, + { + "epoch": 0.9572192513368984, + "grad_norm": 1.3633320331573486, + "learning_rate": 2.983151635282458e-06, + "loss": 0.9187, + "step": 537 + }, + { + "epoch": 0.9590017825311943, + "grad_norm": 1.3644108772277832, + "learning_rate": 2.9781962338949457e-06, + "loss": 0.8887, + "step": 538 + }, + { + "epoch": 0.9607843137254902, + "grad_norm": 1.3315613269805908, + "learning_rate": 2.9732408325074334e-06, + "loss": 0.9067, + "step": 539 + }, + { + "epoch": 0.9625668449197861, + "grad_norm": 1.1904367208480835, + "learning_rate": 2.968285431119921e-06, + "loss": 0.8795, + "step": 540 + }, + { + "epoch": 0.9625668449197861, + "eval_loss": 0.9979129433631897, + "eval_runtime": 46.8388, + "eval_samples_per_second": 21.35, + "eval_steps_per_second": 1.345, + "step": 540 + }, + { + "epoch": 0.964349376114082, + "grad_norm": 1.2745060920715332, + "learning_rate": 2.9633300297324087e-06, + "loss": 0.9501, + "step": 541 + }, + { + "epoch": 0.966131907308378, + "grad_norm": 1.2493999004364014, + "learning_rate": 2.958374628344896e-06, + "loss": 0.8889, + "step": 542 + }, + { + "epoch": 0.9679144385026738, + "grad_norm": 1.567963719367981, + "learning_rate": 2.9534192269573836e-06, + "loss": 0.9227, + "step": 543 + }, + { + "epoch": 0.9696969696969697, + "grad_norm": 1.4358129501342773, + "learning_rate": 2.9484638255698717e-06, + "loss": 0.9275, + "step": 544 + }, + { + "epoch": 0.9714795008912656, + "grad_norm": 1.5509508848190308, + "learning_rate": 2.9435084241823593e-06, + "loss": 0.9326, + "step": 545 + }, + { + "epoch": 0.9732620320855615, + "grad_norm": 1.381090760231018, + "learning_rate": 2.9385530227948466e-06, + "loss": 0.9265, + "step": 546 + }, + { + "epoch": 0.9750445632798574, + "grad_norm": 1.2564153671264648, + "learning_rate": 2.9335976214073342e-06, + "loss": 0.9274, + "step": 547 + }, + { + "epoch": 0.9768270944741533, + "grad_norm": 1.2519657611846924, + "learning_rate": 2.9286422200198215e-06, + "loss": 0.9072, + "step": 548 + }, + { + "epoch": 0.9786096256684492, + "grad_norm": 1.3204387426376343, + "learning_rate": 2.923686818632309e-06, + "loss": 0.9551, + "step": 549 + }, + { + "epoch": 0.9803921568627451, + "grad_norm": 1.4377044439315796, + "learning_rate": 2.9187314172447972e-06, + "loss": 0.9298, + "step": 550 + }, + { + "epoch": 0.9803921568627451, + "eval_loss": 0.9987896680831909, + "eval_runtime": 46.9021, + "eval_samples_per_second": 21.321, + "eval_steps_per_second": 1.343, + "step": 550 + }, + { 
+ "epoch": 0.982174688057041, + "grad_norm": 1.2541097402572632, + "learning_rate": 2.913776015857285e-06, + "loss": 0.9214, + "step": 551 + }, + { + "epoch": 0.983957219251337, + "grad_norm": 1.2618757486343384, + "learning_rate": 2.908820614469772e-06, + "loss": 0.9441, + "step": 552 + }, + { + "epoch": 0.9857397504456328, + "grad_norm": 1.393532156944275, + "learning_rate": 2.9038652130822598e-06, + "loss": 0.9201, + "step": 553 + }, + { + "epoch": 0.9875222816399287, + "grad_norm": 1.4977319240570068, + "learning_rate": 2.8989098116947474e-06, + "loss": 0.8966, + "step": 554 + }, + { + "epoch": 0.9893048128342246, + "grad_norm": 1.5135530233383179, + "learning_rate": 2.8939544103072355e-06, + "loss": 0.9311, + "step": 555 + }, + { + "epoch": 0.9910873440285205, + "grad_norm": 1.2576534748077393, + "learning_rate": 2.8889990089197228e-06, + "loss": 0.9059, + "step": 556 + }, + { + "epoch": 0.9928698752228164, + "grad_norm": 1.3360989093780518, + "learning_rate": 2.8840436075322104e-06, + "loss": 0.9377, + "step": 557 + }, + { + "epoch": 0.9946524064171123, + "grad_norm": 1.4871883392333984, + "learning_rate": 2.8790882061446977e-06, + "loss": 0.8884, + "step": 558 + }, + { + "epoch": 0.9964349376114082, + "grad_norm": 1.3083568811416626, + "learning_rate": 2.8741328047571853e-06, + "loss": 0.9614, + "step": 559 + }, + { + "epoch": 0.9982174688057041, + "grad_norm": 1.3526344299316406, + "learning_rate": 2.8691774033696734e-06, + "loss": 0.9117, + "step": 560 + }, + { + "epoch": 0.9982174688057041, + "eval_loss": 0.9987173080444336, + "eval_runtime": 46.7749, + "eval_samples_per_second": 21.379, + "eval_steps_per_second": 1.347, + "step": 560 + }, + { + "epoch": 1.0, + "grad_norm": 1.3440344333648682, + "learning_rate": 2.864222001982161e-06, + "loss": 0.9054, + "step": 561 + }, + { + "epoch": 1.0017825311942958, + "grad_norm": 1.4617632627487183, + "learning_rate": 2.8592666005946483e-06, + "loss": 0.8767, + "step": 562 + }, + { + "epoch": 1.0035650623885919, + "grad_norm": 1.4687554836273193, + "learning_rate": 2.854311199207136e-06, + "loss": 0.8534, + "step": 563 + }, + { + "epoch": 1.0053475935828877, + "grad_norm": 1.4026336669921875, + "learning_rate": 2.8493557978196236e-06, + "loss": 0.8736, + "step": 564 + }, + { + "epoch": 1.0071301247771836, + "grad_norm": 1.4234068393707275, + "learning_rate": 2.844400396432111e-06, + "loss": 0.9045, + "step": 565 + }, + { + "epoch": 1.0089126559714796, + "grad_norm": 1.2911219596862793, + "learning_rate": 2.839444995044599e-06, + "loss": 0.8587, + "step": 566 + }, + { + "epoch": 1.0106951871657754, + "grad_norm": 1.4888267517089844, + "learning_rate": 2.8344895936570866e-06, + "loss": 0.8876, + "step": 567 + }, + { + "epoch": 1.0124777183600713, + "grad_norm": 1.3002138137817383, + "learning_rate": 2.829534192269574e-06, + "loss": 0.9234, + "step": 568 + }, + { + "epoch": 1.014260249554367, + "grad_norm": 1.3918858766555786, + "learning_rate": 2.8245787908820615e-06, + "loss": 0.8831, + "step": 569 + }, + { + "epoch": 1.0160427807486632, + "grad_norm": 1.284764051437378, + "learning_rate": 2.819623389494549e-06, + "loss": 0.9112, + "step": 570 + }, + { + "epoch": 1.0160427807486632, + "eval_loss": 0.9963147044181824, + "eval_runtime": 46.8613, + "eval_samples_per_second": 21.34, + "eval_steps_per_second": 1.344, + "step": 570 + }, + { + "epoch": 1.017825311942959, + "grad_norm": 1.4841303825378418, + "learning_rate": 2.8146679881070373e-06, + "loss": 0.8811, + "step": 571 + }, + { + "epoch": 1.0196078431372548, + "grad_norm": 
1.4724706411361694, + "learning_rate": 2.8097125867195245e-06, + "loss": 0.8884, + "step": 572 + }, + { + "epoch": 1.0213903743315509, + "grad_norm": 1.5961391925811768, + "learning_rate": 2.804757185332012e-06, + "loss": 0.8845, + "step": 573 + }, + { + "epoch": 1.0231729055258467, + "grad_norm": 2.456153154373169, + "learning_rate": 2.7998017839445e-06, + "loss": 0.9175, + "step": 574 + }, + { + "epoch": 1.0249554367201426, + "grad_norm": 1.4625294208526611, + "learning_rate": 2.794846382556987e-06, + "loss": 0.883, + "step": 575 + }, + { + "epoch": 1.0267379679144386, + "grad_norm": 1.4096879959106445, + "learning_rate": 2.789890981169475e-06, + "loss": 0.9139, + "step": 576 + }, + { + "epoch": 1.0285204991087344, + "grad_norm": 1.606987714767456, + "learning_rate": 2.784935579781963e-06, + "loss": 0.9002, + "step": 577 + }, + { + "epoch": 1.0303030303030303, + "grad_norm": 2.5259013175964355, + "learning_rate": 2.77998017839445e-06, + "loss": 0.8221, + "step": 578 + }, + { + "epoch": 1.032085561497326, + "grad_norm": 1.870883822441101, + "learning_rate": 2.7750247770069377e-06, + "loss": 0.885, + "step": 579 + }, + { + "epoch": 1.0338680926916222, + "grad_norm": 1.4817109107971191, + "learning_rate": 2.7700693756194254e-06, + "loss": 0.872, + "step": 580 + }, + { + "epoch": 1.0338680926916222, + "eval_loss": 0.9969404935836792, + "eval_runtime": 46.8454, + "eval_samples_per_second": 21.347, + "eval_steps_per_second": 1.345, + "step": 580 + }, + { + "epoch": 1.035650623885918, + "grad_norm": 1.7585666179656982, + "learning_rate": 2.7651139742319126e-06, + "loss": 0.9138, + "step": 581 + }, + { + "epoch": 1.0374331550802138, + "grad_norm": 1.8026617765426636, + "learning_rate": 2.7601585728444007e-06, + "loss": 0.8799, + "step": 582 + }, + { + "epoch": 1.0392156862745099, + "grad_norm": 1.6723577976226807, + "learning_rate": 2.7552031714568884e-06, + "loss": 0.9084, + "step": 583 + }, + { + "epoch": 1.0409982174688057, + "grad_norm": 1.8082846403121948, + "learning_rate": 2.750247770069376e-06, + "loss": 0.8649, + "step": 584 + }, + { + "epoch": 1.0427807486631016, + "grad_norm": 1.685368299484253, + "learning_rate": 2.7452923686818633e-06, + "loss": 0.9086, + "step": 585 + }, + { + "epoch": 1.0445632798573976, + "grad_norm": 1.6961652040481567, + "learning_rate": 2.740336967294351e-06, + "loss": 0.8912, + "step": 586 + }, + { + "epoch": 1.0463458110516934, + "grad_norm": 1.55865478515625, + "learning_rate": 2.735381565906839e-06, + "loss": 0.8864, + "step": 587 + }, + { + "epoch": 1.0481283422459893, + "grad_norm": 1.8303790092468262, + "learning_rate": 2.7304261645193263e-06, + "loss": 0.8801, + "step": 588 + }, + { + "epoch": 1.049910873440285, + "grad_norm": 1.6978795528411865, + "learning_rate": 2.725470763131814e-06, + "loss": 0.8463, + "step": 589 + }, + { + "epoch": 1.0516934046345812, + "grad_norm": 1.6878262758255005, + "learning_rate": 2.7205153617443016e-06, + "loss": 0.8927, + "step": 590 + }, + { + "epoch": 1.0516934046345812, + "eval_loss": 0.9930296540260315, + "eval_runtime": 46.8922, + "eval_samples_per_second": 21.326, + "eval_steps_per_second": 1.344, + "step": 590 + }, + { + "epoch": 1.053475935828877, + "grad_norm": 1.3838971853256226, + "learning_rate": 2.715559960356789e-06, + "loss": 0.9149, + "step": 591 + }, + { + "epoch": 1.0552584670231728, + "grad_norm": 1.689635992050171, + "learning_rate": 2.710604558969277e-06, + "loss": 0.893, + "step": 592 + }, + { + "epoch": 1.0570409982174689, + "grad_norm": 1.940187692642212, + "learning_rate": 
2.7056491575817646e-06, + "loss": 0.8938, + "step": 593 + }, + { + "epoch": 1.0588235294117647, + "grad_norm": 1.5956275463104248, + "learning_rate": 2.700693756194252e-06, + "loss": 0.8764, + "step": 594 + }, + { + "epoch": 1.0606060606060606, + "grad_norm": 1.4798537492752075, + "learning_rate": 2.6957383548067395e-06, + "loss": 0.8709, + "step": 595 + }, + { + "epoch": 1.0623885918003566, + "grad_norm": 1.2672936916351318, + "learning_rate": 2.690782953419227e-06, + "loss": 0.8538, + "step": 596 + }, + { + "epoch": 1.0641711229946524, + "grad_norm": 1.2904959917068481, + "learning_rate": 2.6858275520317144e-06, + "loss": 0.8187, + "step": 597 + }, + { + "epoch": 1.0659536541889483, + "grad_norm": 1.4401811361312866, + "learning_rate": 2.6808721506442025e-06, + "loss": 0.8775, + "step": 598 + }, + { + "epoch": 1.067736185383244, + "grad_norm": 1.3092564344406128, + "learning_rate": 2.67591674925669e-06, + "loss": 0.9057, + "step": 599 + }, + { + "epoch": 1.0695187165775402, + "grad_norm": 1.6517740488052368, + "learning_rate": 2.6709613478691778e-06, + "loss": 0.883, + "step": 600 + }, + { + "epoch": 1.0695187165775402, + "eval_loss": 0.9936122298240662, + "eval_runtime": 46.824, + "eval_samples_per_second": 21.357, + "eval_steps_per_second": 1.345, + "step": 600 + }, + { + "epoch": 1.071301247771836, + "grad_norm": 1.5126500129699707, + "learning_rate": 2.666005946481665e-06, + "loss": 0.8645, + "step": 601 + }, + { + "epoch": 1.0730837789661318, + "grad_norm": 1.408370852470398, + "learning_rate": 2.6610505450941527e-06, + "loss": 0.8944, + "step": 602 + }, + { + "epoch": 1.0748663101604279, + "grad_norm": 1.4243714809417725, + "learning_rate": 2.6560951437066408e-06, + "loss": 0.8935, + "step": 603 + }, + { + "epoch": 1.0766488413547237, + "grad_norm": 1.4721286296844482, + "learning_rate": 2.651139742319128e-06, + "loss": 0.8808, + "step": 604 + }, + { + "epoch": 1.0784313725490196, + "grad_norm": 1.4304018020629883, + "learning_rate": 2.6461843409316157e-06, + "loss": 0.8792, + "step": 605 + }, + { + "epoch": 1.0802139037433156, + "grad_norm": 1.4544298648834229, + "learning_rate": 2.6412289395441033e-06, + "loss": 0.877, + "step": 606 + }, + { + "epoch": 1.0819964349376114, + "grad_norm": 1.409246802330017, + "learning_rate": 2.6362735381565906e-06, + "loss": 0.8757, + "step": 607 + }, + { + "epoch": 1.0837789661319073, + "grad_norm": 1.617339015007019, + "learning_rate": 2.6313181367690786e-06, + "loss": 0.8767, + "step": 608 + }, + { + "epoch": 1.085561497326203, + "grad_norm": 1.5547486543655396, + "learning_rate": 2.6263627353815663e-06, + "loss": 0.8893, + "step": 609 + }, + { + "epoch": 1.0873440285204992, + "grad_norm": 1.579556941986084, + "learning_rate": 2.621407333994054e-06, + "loss": 0.9028, + "step": 610 + }, + { + "epoch": 1.0873440285204992, + "eval_loss": 0.9923062920570374, + "eval_runtime": 46.7842, + "eval_samples_per_second": 21.375, + "eval_steps_per_second": 1.347, + "step": 610 + }, + { + "epoch": 1.089126559714795, + "grad_norm": 1.847340703010559, + "learning_rate": 2.616451932606541e-06, + "loss": 0.8685, + "step": 611 + }, + { + "epoch": 1.0909090909090908, + "grad_norm": 1.5406112670898438, + "learning_rate": 2.611496531219029e-06, + "loss": 0.9117, + "step": 612 + }, + { + "epoch": 1.0926916221033869, + "grad_norm": 1.3770501613616943, + "learning_rate": 2.606541129831516e-06, + "loss": 0.8371, + "step": 613 + }, + { + "epoch": 1.0944741532976827, + "grad_norm": 1.3167601823806763, + "learning_rate": 2.601585728444004e-06, + "loss": 0.8927, + "step": 
614 + }, + { + "epoch": 1.0962566844919786, + "grad_norm": 1.4795807600021362, + "learning_rate": 2.596630327056492e-06, + "loss": 0.8994, + "step": 615 + }, + { + "epoch": 1.0980392156862746, + "grad_norm": 1.7171162366867065, + "learning_rate": 2.5916749256689795e-06, + "loss": 0.877, + "step": 616 + }, + { + "epoch": 1.0998217468805704, + "grad_norm": 1.4922057390213013, + "learning_rate": 2.5867195242814668e-06, + "loss": 0.8555, + "step": 617 + }, + { + "epoch": 1.1016042780748663, + "grad_norm": 1.5104916095733643, + "learning_rate": 2.5817641228939544e-06, + "loss": 0.8668, + "step": 618 + }, + { + "epoch": 1.1033868092691623, + "grad_norm": 1.3570868968963623, + "learning_rate": 2.5768087215064425e-06, + "loss": 0.9149, + "step": 619 + }, + { + "epoch": 1.1051693404634582, + "grad_norm": 1.4930638074874878, + "learning_rate": 2.57185332011893e-06, + "loss": 0.8967, + "step": 620 + }, + { + "epoch": 1.1051693404634582, + "eval_loss": 0.9928249716758728, + "eval_runtime": 46.7413, + "eval_samples_per_second": 21.394, + "eval_steps_per_second": 1.348, + "step": 620 + }, + { + "epoch": 1.106951871657754, + "grad_norm": 1.3319581747055054, + "learning_rate": 2.5668979187314174e-06, + "loss": 0.8868, + "step": 621 + }, + { + "epoch": 1.1087344028520498, + "grad_norm": 1.4563696384429932, + "learning_rate": 2.561942517343905e-06, + "loss": 0.8676, + "step": 622 + }, + { + "epoch": 1.1105169340463459, + "grad_norm": 1.5489044189453125, + "learning_rate": 2.5569871159563923e-06, + "loss": 0.9186, + "step": 623 + }, + { + "epoch": 1.1122994652406417, + "grad_norm": 1.4728941917419434, + "learning_rate": 2.5520317145688804e-06, + "loss": 0.8981, + "step": 624 + }, + { + "epoch": 1.1140819964349375, + "grad_norm": 1.4518868923187256, + "learning_rate": 2.547076313181368e-06, + "loss": 0.8867, + "step": 625 + }, + { + "epoch": 1.1158645276292336, + "grad_norm": 1.464890480041504, + "learning_rate": 2.5421209117938557e-06, + "loss": 0.9466, + "step": 626 + }, + { + "epoch": 1.1176470588235294, + "grad_norm": 1.4894181489944458, + "learning_rate": 2.537165510406343e-06, + "loss": 0.8671, + "step": 627 + }, + { + "epoch": 1.1194295900178253, + "grad_norm": 1.6478488445281982, + "learning_rate": 2.5322101090188306e-06, + "loss": 0.8847, + "step": 628 + }, + { + "epoch": 1.121212121212121, + "grad_norm": 1.465164065361023, + "learning_rate": 2.5272547076313183e-06, + "loss": 0.9166, + "step": 629 + }, + { + "epoch": 1.1229946524064172, + "grad_norm": 1.3165429830551147, + "learning_rate": 2.522299306243806e-06, + "loss": 0.8853, + "step": 630 + }, + { + "epoch": 1.1229946524064172, + "eval_loss": 0.99163818359375, + "eval_runtime": 46.7506, + "eval_samples_per_second": 21.39, + "eval_steps_per_second": 1.348, + "step": 630 + }, + { + "epoch": 1.124777183600713, + "grad_norm": 1.6656804084777832, + "learning_rate": 2.5173439048562936e-06, + "loss": 0.8526, + "step": 631 + }, + { + "epoch": 1.1265597147950088, + "grad_norm": 1.3749207258224487, + "learning_rate": 2.5123885034687813e-06, + "loss": 0.9135, + "step": 632 + }, + { + "epoch": 1.1283422459893049, + "grad_norm": 1.4695217609405518, + "learning_rate": 2.5074331020812685e-06, + "loss": 0.8788, + "step": 633 + }, + { + "epoch": 1.1301247771836007, + "grad_norm": 1.4157956838607788, + "learning_rate": 2.502477700693756e-06, + "loss": 0.8811, + "step": 634 + }, + { + "epoch": 1.1319073083778965, + "grad_norm": 1.3357295989990234, + "learning_rate": 2.497522299306244e-06, + "loss": 0.8767, + "step": 635 + }, + { + "epoch": 1.1336898395721926, + 
"grad_norm": 1.7020845413208008, + "learning_rate": 2.492566897918732e-06, + "loss": 0.8714, + "step": 636 + }, + { + "epoch": 1.1354723707664884, + "grad_norm": 1.4703397750854492, + "learning_rate": 2.487611496531219e-06, + "loss": 0.8921, + "step": 637 + }, + { + "epoch": 1.1372549019607843, + "grad_norm": 2.017592430114746, + "learning_rate": 2.482656095143707e-06, + "loss": 0.865, + "step": 638 + }, + { + "epoch": 1.1390374331550803, + "grad_norm": 1.4744309186935425, + "learning_rate": 2.4777006937561945e-06, + "loss": 0.8894, + "step": 639 + }, + { + "epoch": 1.1408199643493762, + "grad_norm": 1.4797626733779907, + "learning_rate": 2.472745292368682e-06, + "loss": 0.8839, + "step": 640 + }, + { + "epoch": 1.1408199643493762, + "eval_loss": 0.9916447401046753, + "eval_runtime": 46.6435, + "eval_samples_per_second": 21.439, + "eval_steps_per_second": 1.351, + "step": 640 + }, + { + "epoch": 1.142602495543672, + "grad_norm": 1.2910147905349731, + "learning_rate": 2.4677898909811694e-06, + "loss": 0.8608, + "step": 641 + }, + { + "epoch": 1.1443850267379678, + "grad_norm": 1.276078701019287, + "learning_rate": 2.4628344895936575e-06, + "loss": 0.8723, + "step": 642 + }, + { + "epoch": 1.1461675579322639, + "grad_norm": 1.7418657541275024, + "learning_rate": 2.4578790882061447e-06, + "loss": 0.8726, + "step": 643 + }, + { + "epoch": 1.1479500891265597, + "grad_norm": 1.5978609323501587, + "learning_rate": 2.4529236868186328e-06, + "loss": 0.867, + "step": 644 + }, + { + "epoch": 1.1497326203208555, + "grad_norm": 1.3553117513656616, + "learning_rate": 2.44796828543112e-06, + "loss": 0.8993, + "step": 645 + }, + { + "epoch": 1.1515151515151516, + "grad_norm": 1.5411593914031982, + "learning_rate": 2.4430128840436077e-06, + "loss": 0.8556, + "step": 646 + }, + { + "epoch": 1.1532976827094474, + "grad_norm": 1.4576947689056396, + "learning_rate": 2.4380574826560953e-06, + "loss": 0.8889, + "step": 647 + }, + { + "epoch": 1.1550802139037433, + "grad_norm": 1.4621020555496216, + "learning_rate": 2.433102081268583e-06, + "loss": 0.8586, + "step": 648 + }, + { + "epoch": 1.156862745098039, + "grad_norm": 1.409480094909668, + "learning_rate": 2.4281466798810702e-06, + "loss": 0.8656, + "step": 649 + }, + { + "epoch": 1.1586452762923352, + "grad_norm": 1.461784839630127, + "learning_rate": 2.4231912784935583e-06, + "loss": 0.9129, + "step": 650 + }, + { + "epoch": 1.1586452762923352, + "eval_loss": 0.9930744767189026, + "eval_runtime": 46.6771, + "eval_samples_per_second": 21.424, + "eval_steps_per_second": 1.35, + "step": 650 + }, + { + "epoch": 1.160427807486631, + "grad_norm": 1.2690680027008057, + "learning_rate": 2.4182358771060456e-06, + "loss": 0.8445, + "step": 651 + }, + { + "epoch": 1.1622103386809268, + "grad_norm": 1.6523184776306152, + "learning_rate": 2.4132804757185337e-06, + "loss": 0.8714, + "step": 652 + }, + { + "epoch": 1.1639928698752229, + "grad_norm": 1.4627327919006348, + "learning_rate": 2.408325074331021e-06, + "loss": 0.8907, + "step": 653 + }, + { + "epoch": 1.1657754010695187, + "grad_norm": 1.3082153797149658, + "learning_rate": 2.4033696729435086e-06, + "loss": 0.8873, + "step": 654 + }, + { + "epoch": 1.1675579322638145, + "grad_norm": 1.4305182695388794, + "learning_rate": 2.3984142715559962e-06, + "loss": 0.9341, + "step": 655 + }, + { + "epoch": 1.1693404634581106, + "grad_norm": 1.3877581357955933, + "learning_rate": 2.393458870168484e-06, + "loss": 0.834, + "step": 656 + }, + { + "epoch": 1.1711229946524064, + "grad_norm": 1.5337421894073486, + 
"learning_rate": 2.3885034687809715e-06, + "loss": 0.8588, + "step": 657 + }, + { + "epoch": 1.1729055258467023, + "grad_norm": 1.6537212133407593, + "learning_rate": 2.383548067393459e-06, + "loss": 0.8761, + "step": 658 + }, + { + "epoch": 1.1746880570409983, + "grad_norm": 1.5609780550003052, + "learning_rate": 2.3785926660059464e-06, + "loss": 0.88, + "step": 659 + }, + { + "epoch": 1.1764705882352942, + "grad_norm": 1.4873936176300049, + "learning_rate": 2.3736372646184345e-06, + "loss": 0.8904, + "step": 660 + }, + { + "epoch": 1.1764705882352942, + "eval_loss": 0.9909861087799072, + "eval_runtime": 46.6989, + "eval_samples_per_second": 21.414, + "eval_steps_per_second": 1.349, + "step": 660 + }, + { + "epoch": 1.17825311942959, + "grad_norm": 1.471859097480774, + "learning_rate": 2.3686818632309218e-06, + "loss": 0.8774, + "step": 661 + }, + { + "epoch": 1.1800356506238858, + "grad_norm": 1.4559658765792847, + "learning_rate": 2.3637264618434094e-06, + "loss": 0.8649, + "step": 662 + }, + { + "epoch": 1.1818181818181819, + "grad_norm": 1.5359556674957275, + "learning_rate": 2.358771060455897e-06, + "loss": 0.8953, + "step": 663 + }, + { + "epoch": 1.1836007130124777, + "grad_norm": 1.2817641496658325, + "learning_rate": 2.3538156590683847e-06, + "loss": 0.8987, + "step": 664 + }, + { + "epoch": 1.1853832442067735, + "grad_norm": 1.2599672079086304, + "learning_rate": 2.3488602576808724e-06, + "loss": 0.8735, + "step": 665 + }, + { + "epoch": 1.1871657754010696, + "grad_norm": 1.341021180152893, + "learning_rate": 2.34390485629336e-06, + "loss": 0.8981, + "step": 666 + }, + { + "epoch": 1.1889483065953654, + "grad_norm": 1.4567780494689941, + "learning_rate": 2.3389494549058473e-06, + "loss": 0.9018, + "step": 667 + }, + { + "epoch": 1.1907308377896613, + "grad_norm": 1.7645756006240845, + "learning_rate": 2.3339940535183354e-06, + "loss": 0.8856, + "step": 668 + }, + { + "epoch": 1.192513368983957, + "grad_norm": 1.4657479524612427, + "learning_rate": 2.3290386521308226e-06, + "loss": 0.9083, + "step": 669 + }, + { + "epoch": 1.1942959001782532, + "grad_norm": 1.269451379776001, + "learning_rate": 2.3240832507433103e-06, + "loss": 0.887, + "step": 670 + }, + { + "epoch": 1.1942959001782532, + "eval_loss": 0.9889793992042542, + "eval_runtime": 46.8047, + "eval_samples_per_second": 21.365, + "eval_steps_per_second": 1.346, + "step": 670 + }, + { + "epoch": 1.196078431372549, + "grad_norm": 1.3132225275039673, + "learning_rate": 2.319127849355798e-06, + "loss": 0.8676, + "step": 671 + }, + { + "epoch": 1.1978609625668448, + "grad_norm": 1.3520387411117554, + "learning_rate": 2.3141724479682856e-06, + "loss": 0.9, + "step": 672 + }, + { + "epoch": 1.1996434937611409, + "grad_norm": 1.392001986503601, + "learning_rate": 2.3092170465807733e-06, + "loss": 0.8749, + "step": 673 + }, + { + "epoch": 1.2014260249554367, + "grad_norm": 1.4109132289886475, + "learning_rate": 2.304261645193261e-06, + "loss": 0.9133, + "step": 674 + }, + { + "epoch": 1.2032085561497325, + "grad_norm": 1.6250007152557373, + "learning_rate": 2.2993062438057486e-06, + "loss": 0.8689, + "step": 675 + }, + { + "epoch": 1.2049910873440286, + "grad_norm": 1.4920040369033813, + "learning_rate": 2.2943508424182363e-06, + "loss": 0.9031, + "step": 676 + }, + { + "epoch": 1.2067736185383244, + "grad_norm": 1.2313125133514404, + "learning_rate": 2.2893954410307235e-06, + "loss": 0.8538, + "step": 677 + }, + { + "epoch": 1.2085561497326203, + "grad_norm": 1.5707231760025024, + "learning_rate": 2.284440039643211e-06, + "loss": 
0.9009, + "step": 678 + }, + { + "epoch": 1.2103386809269163, + "grad_norm": 1.4293330907821655, + "learning_rate": 2.279484638255699e-06, + "loss": 0.8994, + "step": 679 + }, + { + "epoch": 1.2121212121212122, + "grad_norm": 1.404422640800476, + "learning_rate": 2.2745292368681865e-06, + "loss": 0.9259, + "step": 680 + }, + { + "epoch": 1.2121212121212122, + "eval_loss": 0.9861985445022583, + "eval_runtime": 46.7287, + "eval_samples_per_second": 21.4, + "eval_steps_per_second": 1.348, + "step": 680 + }, + { + "epoch": 1.213903743315508, + "grad_norm": 1.2691755294799805, + "learning_rate": 2.269573835480674e-06, + "loss": 0.8818, + "step": 681 + }, + { + "epoch": 1.215686274509804, + "grad_norm": 1.478243350982666, + "learning_rate": 2.264618434093162e-06, + "loss": 0.86, + "step": 682 + }, + { + "epoch": 1.2174688057040999, + "grad_norm": 1.398539662361145, + "learning_rate": 2.2596630327056495e-06, + "loss": 0.8802, + "step": 683 + }, + { + "epoch": 1.2192513368983957, + "grad_norm": 1.4929214715957642, + "learning_rate": 2.254707631318137e-06, + "loss": 0.8576, + "step": 684 + }, + { + "epoch": 1.2210338680926915, + "grad_norm": 1.4856244325637817, + "learning_rate": 2.2497522299306244e-06, + "loss": 0.8893, + "step": 685 + }, + { + "epoch": 1.2228163992869876, + "grad_norm": 1.3881354331970215, + "learning_rate": 2.244796828543112e-06, + "loss": 0.866, + "step": 686 + }, + { + "epoch": 1.2245989304812834, + "grad_norm": 1.413635492324829, + "learning_rate": 2.2398414271555997e-06, + "loss": 0.8584, + "step": 687 + }, + { + "epoch": 1.2263814616755793, + "grad_norm": 1.5331799983978271, + "learning_rate": 2.2348860257680874e-06, + "loss": 0.8614, + "step": 688 + }, + { + "epoch": 1.228163992869875, + "grad_norm": 1.396995186805725, + "learning_rate": 2.229930624380575e-06, + "loss": 0.9054, + "step": 689 + }, + { + "epoch": 1.2299465240641712, + "grad_norm": 1.6812461614608765, + "learning_rate": 2.2249752229930627e-06, + "loss": 0.8831, + "step": 690 + }, + { + "epoch": 1.2299465240641712, + "eval_loss": 0.9851806163787842, + "eval_runtime": 46.6927, + "eval_samples_per_second": 21.417, + "eval_steps_per_second": 1.349, + "step": 690 + }, + { + "epoch": 1.231729055258467, + "grad_norm": 1.4893524646759033, + "learning_rate": 2.2200198216055503e-06, + "loss": 0.8903, + "step": 691 + }, + { + "epoch": 1.2335115864527628, + "grad_norm": 1.5997936725616455, + "learning_rate": 2.215064420218038e-06, + "loss": 0.8972, + "step": 692 + }, + { + "epoch": 1.2352941176470589, + "grad_norm": 1.4399421215057373, + "learning_rate": 2.2101090188305257e-06, + "loss": 0.8639, + "step": 693 + }, + { + "epoch": 1.2370766488413547, + "grad_norm": 1.5507882833480835, + "learning_rate": 2.205153617443013e-06, + "loss": 0.8558, + "step": 694 + }, + { + "epoch": 1.2388591800356505, + "grad_norm": 1.3248441219329834, + "learning_rate": 2.2001982160555006e-06, + "loss": 0.8699, + "step": 695 + }, + { + "epoch": 1.2406417112299466, + "grad_norm": 1.4937480688095093, + "learning_rate": 2.1952428146679882e-06, + "loss": 0.8935, + "step": 696 + }, + { + "epoch": 1.2424242424242424, + "grad_norm": 1.5931757688522339, + "learning_rate": 2.190287413280476e-06, + "loss": 0.9007, + "step": 697 + }, + { + "epoch": 1.2442067736185383, + "grad_norm": 1.7570197582244873, + "learning_rate": 2.1853320118929636e-06, + "loss": 0.8546, + "step": 698 + }, + { + "epoch": 1.2459893048128343, + "grad_norm": 1.621984601020813, + "learning_rate": 2.1803766105054512e-06, + "loss": 0.901, + "step": 699 + }, + { + "epoch": 
1.2477718360071302, + "grad_norm": 1.3966022729873657, + "learning_rate": 2.175421209117939e-06, + "loss": 0.9307, + "step": 700 + }, + { + "epoch": 1.2477718360071302, + "eval_loss": 0.9874935150146484, + "eval_runtime": 46.6795, + "eval_samples_per_second": 21.423, + "eval_steps_per_second": 1.35, + "step": 700 + }, + { + "epoch": 1.249554367201426, + "grad_norm": 1.3309043645858765, + "learning_rate": 2.1704658077304265e-06, + "loss": 0.9206, + "step": 701 + }, + { + "epoch": 1.251336898395722, + "grad_norm": 1.3344178199768066, + "learning_rate": 2.1655104063429138e-06, + "loss": 0.868, + "step": 702 + }, + { + "epoch": 1.2531194295900179, + "grad_norm": 1.3813674449920654, + "learning_rate": 2.160555004955402e-06, + "loss": 0.8573, + "step": 703 + }, + { + "epoch": 1.2549019607843137, + "grad_norm": 1.44670832157135, + "learning_rate": 2.155599603567889e-06, + "loss": 0.8964, + "step": 704 + }, + { + "epoch": 1.2566844919786098, + "grad_norm": 1.732297420501709, + "learning_rate": 2.1506442021803768e-06, + "loss": 0.8635, + "step": 705 + }, + { + "epoch": 1.2584670231729056, + "grad_norm": 1.3183728456497192, + "learning_rate": 2.1456888007928644e-06, + "loss": 0.8917, + "step": 706 + }, + { + "epoch": 1.2602495543672014, + "grad_norm": 1.4485280513763428, + "learning_rate": 2.140733399405352e-06, + "loss": 0.8731, + "step": 707 + }, + { + "epoch": 1.2620320855614973, + "grad_norm": 1.4543678760528564, + "learning_rate": 2.1357779980178398e-06, + "loss": 0.9099, + "step": 708 + }, + { + "epoch": 1.263814616755793, + "grad_norm": 1.3342758417129517, + "learning_rate": 2.1308225966303274e-06, + "loss": 0.8704, + "step": 709 + }, + { + "epoch": 1.2655971479500892, + "grad_norm": 1.6935384273529053, + "learning_rate": 2.1258671952428147e-06, + "loss": 0.8775, + "step": 710 + }, + { + "epoch": 1.2655971479500892, + "eval_loss": 0.989654541015625, + "eval_runtime": 46.5841, + "eval_samples_per_second": 21.467, + "eval_steps_per_second": 1.352, + "step": 710 + }, + { + "epoch": 1.267379679144385, + "grad_norm": 1.6754915714263916, + "learning_rate": 2.1209117938553027e-06, + "loss": 0.8715, + "step": 711 + }, + { + "epoch": 1.2691622103386808, + "grad_norm": 1.4536867141723633, + "learning_rate": 2.11595639246779e-06, + "loss": 0.8805, + "step": 712 + }, + { + "epoch": 1.2709447415329769, + "grad_norm": 1.46432626247406, + "learning_rate": 2.1110009910802776e-06, + "loss": 0.898, + "step": 713 + }, + { + "epoch": 1.2727272727272727, + "grad_norm": 1.3130420446395874, + "learning_rate": 2.1060455896927653e-06, + "loss": 0.8851, + "step": 714 + }, + { + "epoch": 1.2745098039215685, + "grad_norm": 1.454445242881775, + "learning_rate": 2.101090188305253e-06, + "loss": 0.9093, + "step": 715 + }, + { + "epoch": 1.2762923351158646, + "grad_norm": 1.560789942741394, + "learning_rate": 2.0961347869177406e-06, + "loss": 0.8874, + "step": 716 + }, + { + "epoch": 1.2780748663101604, + "grad_norm": 1.632129430770874, + "learning_rate": 2.0911793855302283e-06, + "loss": 0.8691, + "step": 717 + }, + { + "epoch": 1.2798573975044563, + "grad_norm": 1.4793459177017212, + "learning_rate": 2.0862239841427155e-06, + "loss": 0.8766, + "step": 718 + }, + { + "epoch": 1.2816399286987523, + "grad_norm": 1.4158271551132202, + "learning_rate": 2.0812685827552036e-06, + "loss": 0.8646, + "step": 719 + }, + { + "epoch": 1.2834224598930482, + "grad_norm": 1.508212924003601, + "learning_rate": 2.076313181367691e-06, + "loss": 0.8856, + "step": 720 + }, + { + "epoch": 1.2834224598930482, + "eval_loss": 0.9860122799873352, 
+ "eval_runtime": 46.8096, + "eval_samples_per_second": 21.363, + "eval_steps_per_second": 1.346, + "step": 720 + }, + { + "epoch": 1.285204991087344, + "grad_norm": 1.4124470949172974, + "learning_rate": 2.0713577799801785e-06, + "loss": 0.8816, + "step": 721 + }, + { + "epoch": 1.28698752228164, + "grad_norm": 1.4536359310150146, + "learning_rate": 2.066402378592666e-06, + "loss": 0.8729, + "step": 722 + }, + { + "epoch": 1.2887700534759359, + "grad_norm": 1.6018034219741821, + "learning_rate": 2.061446977205154e-06, + "loss": 0.8749, + "step": 723 + }, + { + "epoch": 1.2905525846702317, + "grad_norm": 1.3637772798538208, + "learning_rate": 2.0564915758176415e-06, + "loss": 0.8483, + "step": 724 + }, + { + "epoch": 1.2923351158645278, + "grad_norm": 1.6507648229599, + "learning_rate": 2.051536174430129e-06, + "loss": 0.8598, + "step": 725 + }, + { + "epoch": 1.2941176470588236, + "grad_norm": 1.37489652633667, + "learning_rate": 2.0465807730426164e-06, + "loss": 0.8736, + "step": 726 + }, + { + "epoch": 1.2959001782531194, + "grad_norm": 1.2748007774353027, + "learning_rate": 2.0416253716551045e-06, + "loss": 0.863, + "step": 727 + }, + { + "epoch": 1.2976827094474153, + "grad_norm": 1.4101521968841553, + "learning_rate": 2.0366699702675917e-06, + "loss": 0.864, + "step": 728 + }, + { + "epoch": 1.299465240641711, + "grad_norm": 1.4258012771606445, + "learning_rate": 2.0317145688800794e-06, + "loss": 0.9237, + "step": 729 + }, + { + "epoch": 1.3012477718360071, + "grad_norm": 1.3425439596176147, + "learning_rate": 2.026759167492567e-06, + "loss": 0.8798, + "step": 730 + }, + { + "epoch": 1.3012477718360071, + "eval_loss": 0.9809737801551819, + "eval_runtime": 47.0146, + "eval_samples_per_second": 21.27, + "eval_steps_per_second": 1.34, + "step": 730 + }, + { + "epoch": 1.303030303030303, + "grad_norm": 1.4874835014343262, + "learning_rate": 2.0218037661050547e-06, + "loss": 0.8953, + "step": 731 + }, + { + "epoch": 1.3048128342245988, + "grad_norm": 1.3893389701843262, + "learning_rate": 2.0168483647175424e-06, + "loss": 0.8579, + "step": 732 + }, + { + "epoch": 1.3065953654188949, + "grad_norm": 1.4454811811447144, + "learning_rate": 2.01189296333003e-06, + "loss": 0.865, + "step": 733 + }, + { + "epoch": 1.3083778966131907, + "grad_norm": 1.3343634605407715, + "learning_rate": 2.0069375619425173e-06, + "loss": 0.8737, + "step": 734 + }, + { + "epoch": 1.3101604278074865, + "grad_norm": 1.5064631700515747, + "learning_rate": 2.0019821605550054e-06, + "loss": 0.8625, + "step": 735 + }, + { + "epoch": 1.3119429590017826, + "grad_norm": 1.5989820957183838, + "learning_rate": 1.9970267591674926e-06, + "loss": 0.8926, + "step": 736 + }, + { + "epoch": 1.3137254901960784, + "grad_norm": 1.5042405128479004, + "learning_rate": 1.9920713577799803e-06, + "loss": 0.8646, + "step": 737 + }, + { + "epoch": 1.3155080213903743, + "grad_norm": 1.463318109512329, + "learning_rate": 1.987115956392468e-06, + "loss": 0.8918, + "step": 738 + }, + { + "epoch": 1.3172905525846703, + "grad_norm": 1.477638602256775, + "learning_rate": 1.9821605550049556e-06, + "loss": 0.8414, + "step": 739 + }, + { + "epoch": 1.3190730837789661, + "grad_norm": 1.4441951513290405, + "learning_rate": 1.9772051536174432e-06, + "loss": 0.8728, + "step": 740 + }, + { + "epoch": 1.3190730837789661, + "eval_loss": 0.9812989830970764, + "eval_runtime": 46.879, + "eval_samples_per_second": 21.332, + "eval_steps_per_second": 1.344, + "step": 740 + }, + { + "epoch": 1.320855614973262, + "grad_norm": 1.275374174118042, + "learning_rate": 
1.972249752229931e-06, + "loss": 0.8911, + "step": 741 + }, + { + "epoch": 1.322638146167558, + "grad_norm": 1.400856614112854, + "learning_rate": 1.967294350842418e-06, + "loss": 0.8334, + "step": 742 + }, + { + "epoch": 1.3244206773618539, + "grad_norm": 1.4594979286193848, + "learning_rate": 1.9623389494549062e-06, + "loss": 0.8453, + "step": 743 + }, + { + "epoch": 1.3262032085561497, + "grad_norm": 1.2328113317489624, + "learning_rate": 1.9573835480673935e-06, + "loss": 0.8652, + "step": 744 + }, + { + "epoch": 1.3279857397504458, + "grad_norm": 1.5369375944137573, + "learning_rate": 1.952428146679881e-06, + "loss": 0.8884, + "step": 745 + }, + { + "epoch": 1.3297682709447416, + "grad_norm": 1.4707444906234741, + "learning_rate": 1.9474727452923688e-06, + "loss": 0.8811, + "step": 746 + }, + { + "epoch": 1.3315508021390374, + "grad_norm": 1.4895271062850952, + "learning_rate": 1.9425173439048564e-06, + "loss": 0.8704, + "step": 747 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 1.413124918937683, + "learning_rate": 1.937561942517344e-06, + "loss": 0.8407, + "step": 748 + }, + { + "epoch": 1.3351158645276293, + "grad_norm": 1.322543740272522, + "learning_rate": 1.9326065411298318e-06, + "loss": 0.8429, + "step": 749 + }, + { + "epoch": 1.3368983957219251, + "grad_norm": 1.3666902780532837, + "learning_rate": 1.927651139742319e-06, + "loss": 0.8587, + "step": 750 + }, + { + "epoch": 1.3368983957219251, + "eval_loss": 0.9815234541893005, + "eval_runtime": 46.813, + "eval_samples_per_second": 21.362, + "eval_steps_per_second": 1.346, + "step": 750 + }, + { + "epoch": 1.338680926916221, + "grad_norm": 1.336299180984497, + "learning_rate": 1.922695738354807e-06, + "loss": 0.8528, + "step": 751 + }, + { + "epoch": 1.3404634581105168, + "grad_norm": 1.4908182621002197, + "learning_rate": 1.9177403369672943e-06, + "loss": 0.8841, + "step": 752 + }, + { + "epoch": 1.3422459893048129, + "grad_norm": 1.2383705377578735, + "learning_rate": 1.912784935579782e-06, + "loss": 0.8387, + "step": 753 + }, + { + "epoch": 1.3440285204991087, + "grad_norm": 1.4624556303024292, + "learning_rate": 1.9078295341922697e-06, + "loss": 0.891, + "step": 754 + }, + { + "epoch": 1.3458110516934045, + "grad_norm": 1.4606925249099731, + "learning_rate": 1.9028741328047573e-06, + "loss": 0.8913, + "step": 755 + }, + { + "epoch": 1.3475935828877006, + "grad_norm": 1.403260350227356, + "learning_rate": 1.897918731417245e-06, + "loss": 0.8486, + "step": 756 + }, + { + "epoch": 1.3493761140819964, + "grad_norm": 1.407162070274353, + "learning_rate": 1.8929633300297326e-06, + "loss": 0.8588, + "step": 757 + }, + { + "epoch": 1.3511586452762923, + "grad_norm": 1.468679666519165, + "learning_rate": 1.88800792864222e-06, + "loss": 0.8619, + "step": 758 + }, + { + "epoch": 1.3529411764705883, + "grad_norm": 1.3879525661468506, + "learning_rate": 1.883052527254708e-06, + "loss": 0.8706, + "step": 759 + }, + { + "epoch": 1.3547237076648841, + "grad_norm": 1.4178467988967896, + "learning_rate": 1.8780971258671954e-06, + "loss": 0.867, + "step": 760 + }, + { + "epoch": 1.3547237076648841, + "eval_loss": 0.9822861552238464, + "eval_runtime": 46.8957, + "eval_samples_per_second": 21.324, + "eval_steps_per_second": 1.343, + "step": 760 + }, + { + "epoch": 1.35650623885918, + "grad_norm": 1.4034419059753418, + "learning_rate": 1.8731417244796829e-06, + "loss": 0.8609, + "step": 761 + }, + { + "epoch": 1.358288770053476, + "grad_norm": 1.3722320795059204, + "learning_rate": 1.8681863230921707e-06, + "loss": 0.8748, + "step": 762 
+ }, + { + "epoch": 1.3600713012477719, + "grad_norm": 1.5463820695877075, + "learning_rate": 1.8632309217046582e-06, + "loss": 0.8766, + "step": 763 + }, + { + "epoch": 1.3618538324420677, + "grad_norm": 1.416810154914856, + "learning_rate": 1.858275520317146e-06, + "loss": 0.9066, + "step": 764 + }, + { + "epoch": 1.3636363636363638, + "grad_norm": 1.387640118598938, + "learning_rate": 1.8533201189296335e-06, + "loss": 0.8714, + "step": 765 + }, + { + "epoch": 1.3654188948306596, + "grad_norm": 1.4583455324172974, + "learning_rate": 1.848364717542121e-06, + "loss": 0.8479, + "step": 766 + }, + { + "epoch": 1.3672014260249554, + "grad_norm": 1.587016224861145, + "learning_rate": 1.8434093161546088e-06, + "loss": 0.8807, + "step": 767 + }, + { + "epoch": 1.3689839572192513, + "grad_norm": 1.321548342704773, + "learning_rate": 1.8384539147670963e-06, + "loss": 0.8939, + "step": 768 + }, + { + "epoch": 1.3707664884135473, + "grad_norm": 1.3802001476287842, + "learning_rate": 1.8334985133795837e-06, + "loss": 0.8547, + "step": 769 + }, + { + "epoch": 1.3725490196078431, + "grad_norm": 1.4780387878417969, + "learning_rate": 1.8285431119920716e-06, + "loss": 0.8684, + "step": 770 + }, + { + "epoch": 1.3725490196078431, + "eval_loss": 0.9845251441001892, + "eval_runtime": 47.0226, + "eval_samples_per_second": 21.266, + "eval_steps_per_second": 1.34, + "step": 770 + }, + { + "epoch": 1.374331550802139, + "grad_norm": 1.3145530223846436, + "learning_rate": 1.823587710604559e-06, + "loss": 0.8808, + "step": 771 + }, + { + "epoch": 1.3761140819964348, + "grad_norm": 1.316864252090454, + "learning_rate": 1.818632309217047e-06, + "loss": 0.8819, + "step": 772 + }, + { + "epoch": 1.3778966131907309, + "grad_norm": 1.4071670770645142, + "learning_rate": 1.8136769078295344e-06, + "loss": 0.8472, + "step": 773 + }, + { + "epoch": 1.3796791443850267, + "grad_norm": 1.388598084449768, + "learning_rate": 1.8087215064420218e-06, + "loss": 0.8377, + "step": 774 + }, + { + "epoch": 1.3814616755793225, + "grad_norm": 1.3947046995162964, + "learning_rate": 1.8037661050545097e-06, + "loss": 0.8687, + "step": 775 + }, + { + "epoch": 1.3832442067736186, + "grad_norm": 1.363625407218933, + "learning_rate": 1.7988107036669972e-06, + "loss": 0.8834, + "step": 776 + }, + { + "epoch": 1.3850267379679144, + "grad_norm": 1.3984752893447876, + "learning_rate": 1.7938553022794846e-06, + "loss": 0.889, + "step": 777 + }, + { + "epoch": 1.3868092691622103, + "grad_norm": 1.3725652694702148, + "learning_rate": 1.7888999008919725e-06, + "loss": 0.876, + "step": 778 + }, + { + "epoch": 1.3885918003565063, + "grad_norm": 1.3153650760650635, + "learning_rate": 1.78394449950446e-06, + "loss": 0.8499, + "step": 779 + }, + { + "epoch": 1.3903743315508021, + "grad_norm": 1.5021432638168335, + "learning_rate": 1.7789890981169478e-06, + "loss": 0.8692, + "step": 780 + }, + { + "epoch": 1.3903743315508021, + "eval_loss": 0.9833415746688843, + "eval_runtime": 47.0835, + "eval_samples_per_second": 21.239, + "eval_steps_per_second": 1.338, + "step": 780 + }, + { + "epoch": 1.392156862745098, + "grad_norm": 1.4319101572036743, + "learning_rate": 1.7740336967294353e-06, + "loss": 0.8789, + "step": 781 + }, + { + "epoch": 1.393939393939394, + "grad_norm": 1.339253306388855, + "learning_rate": 1.7690782953419227e-06, + "loss": 0.8383, + "step": 782 + }, + { + "epoch": 1.3957219251336899, + "grad_norm": 1.3650999069213867, + "learning_rate": 1.7641228939544106e-06, + "loss": 0.8887, + "step": 783 + }, + { + "epoch": 1.3975044563279857, + 
"grad_norm": 1.4936254024505615, + "learning_rate": 1.759167492566898e-06, + "loss": 0.8612, + "step": 784 + }, + { + "epoch": 1.3992869875222818, + "grad_norm": 1.5137629508972168, + "learning_rate": 1.7542120911793855e-06, + "loss": 0.8589, + "step": 785 + }, + { + "epoch": 1.4010695187165776, + "grad_norm": 1.533828854560852, + "learning_rate": 1.7492566897918734e-06, + "loss": 0.9156, + "step": 786 + }, + { + "epoch": 1.4028520499108734, + "grad_norm": 1.531721830368042, + "learning_rate": 1.7443012884043608e-06, + "loss": 0.8586, + "step": 787 + }, + { + "epoch": 1.4046345811051695, + "grad_norm": 1.3086016178131104, + "learning_rate": 1.7393458870168487e-06, + "loss": 0.8745, + "step": 788 + }, + { + "epoch": 1.4064171122994653, + "grad_norm": 1.552501916885376, + "learning_rate": 1.7343904856293361e-06, + "loss": 0.8973, + "step": 789 + }, + { + "epoch": 1.4081996434937611, + "grad_norm": 1.5181113481521606, + "learning_rate": 1.7294350842418236e-06, + "loss": 0.8966, + "step": 790 + }, + { + "epoch": 1.4081996434937611, + "eval_loss": 0.9810673594474792, + "eval_runtime": 46.8684, + "eval_samples_per_second": 21.336, + "eval_steps_per_second": 1.344, + "step": 790 + }, + { + "epoch": 1.409982174688057, + "grad_norm": 1.4277552366256714, + "learning_rate": 1.7244796828543115e-06, + "loss": 0.8757, + "step": 791 + }, + { + "epoch": 1.4117647058823528, + "grad_norm": 1.5821561813354492, + "learning_rate": 1.719524281466799e-06, + "loss": 0.8413, + "step": 792 + }, + { + "epoch": 1.4135472370766489, + "grad_norm": 1.62177312374115, + "learning_rate": 1.7145688800792864e-06, + "loss": 0.8655, + "step": 793 + }, + { + "epoch": 1.4153297682709447, + "grad_norm": 1.4088196754455566, + "learning_rate": 1.7096134786917742e-06, + "loss": 0.8847, + "step": 794 + }, + { + "epoch": 1.4171122994652405, + "grad_norm": 1.3866719007492065, + "learning_rate": 1.7046580773042617e-06, + "loss": 0.8587, + "step": 795 + }, + { + "epoch": 1.4188948306595366, + "grad_norm": 1.3425517082214355, + "learning_rate": 1.6997026759167496e-06, + "loss": 0.8458, + "step": 796 + }, + { + "epoch": 1.4206773618538324, + "grad_norm": 1.6923450231552124, + "learning_rate": 1.694747274529237e-06, + "loss": 0.8479, + "step": 797 + }, + { + "epoch": 1.4224598930481283, + "grad_norm": 1.6441086530685425, + "learning_rate": 1.6897918731417245e-06, + "loss": 0.8601, + "step": 798 + }, + { + "epoch": 1.4242424242424243, + "grad_norm": 1.314751148223877, + "learning_rate": 1.6848364717542123e-06, + "loss": 0.8763, + "step": 799 + }, + { + "epoch": 1.4260249554367201, + "grad_norm": 1.3205382823944092, + "learning_rate": 1.6798810703666998e-06, + "loss": 0.8743, + "step": 800 + }, + { + "epoch": 1.4260249554367201, + "eval_loss": 0.9791409969329834, + "eval_runtime": 46.9219, + "eval_samples_per_second": 21.312, + "eval_steps_per_second": 1.343, + "step": 800 + }, + { + "epoch": 1.427807486631016, + "grad_norm": 1.3294618129730225, + "learning_rate": 1.6749256689791874e-06, + "loss": 0.9012, + "step": 801 + }, + { + "epoch": 1.429590017825312, + "grad_norm": 1.5094256401062012, + "learning_rate": 1.669970267591675e-06, + "loss": 0.8675, + "step": 802 + }, + { + "epoch": 1.4313725490196079, + "grad_norm": 1.41444993019104, + "learning_rate": 1.6650148662041625e-06, + "loss": 0.8706, + "step": 803 + }, + { + "epoch": 1.4331550802139037, + "grad_norm": 1.65440034866333, + "learning_rate": 1.6600594648166502e-06, + "loss": 0.8435, + "step": 804 + }, + { + "epoch": 1.4349376114081998, + "grad_norm": 1.6403052806854248, + 
"learning_rate": 1.6551040634291379e-06, + "loss": 0.874, + "step": 805 + }, + { + "epoch": 1.4367201426024956, + "grad_norm": 1.4437483549118042, + "learning_rate": 1.6501486620416255e-06, + "loss": 0.9185, + "step": 806 + }, + { + "epoch": 1.4385026737967914, + "grad_norm": 1.4905844926834106, + "learning_rate": 1.6451932606541132e-06, + "loss": 0.8883, + "step": 807 + }, + { + "epoch": 1.4402852049910875, + "grad_norm": 1.3557101488113403, + "learning_rate": 1.6402378592666006e-06, + "loss": 0.877, + "step": 808 + }, + { + "epoch": 1.4420677361853833, + "grad_norm": 1.3058723211288452, + "learning_rate": 1.6352824578790883e-06, + "loss": 0.8434, + "step": 809 + }, + { + "epoch": 1.4438502673796791, + "grad_norm": 1.2745517492294312, + "learning_rate": 1.630327056491576e-06, + "loss": 0.8342, + "step": 810 + }, + { + "epoch": 1.4438502673796791, + "eval_loss": 0.976764976978302, + "eval_runtime": 47.017, + "eval_samples_per_second": 21.269, + "eval_steps_per_second": 1.34, + "step": 810 + }, + { + "epoch": 1.445632798573975, + "grad_norm": 1.3594675064086914, + "learning_rate": 1.6253716551040636e-06, + "loss": 0.8595, + "step": 811 + }, + { + "epoch": 1.4474153297682708, + "grad_norm": 1.369518756866455, + "learning_rate": 1.620416253716551e-06, + "loss": 0.8829, + "step": 812 + }, + { + "epoch": 1.4491978609625669, + "grad_norm": 1.3764517307281494, + "learning_rate": 1.6154608523290387e-06, + "loss": 0.8823, + "step": 813 + }, + { + "epoch": 1.4509803921568627, + "grad_norm": 1.540014624595642, + "learning_rate": 1.6105054509415264e-06, + "loss": 0.8946, + "step": 814 + }, + { + "epoch": 1.4527629233511585, + "grad_norm": 1.6402170658111572, + "learning_rate": 1.605550049554014e-06, + "loss": 0.8679, + "step": 815 + }, + { + "epoch": 1.4545454545454546, + "grad_norm": 1.4860880374908447, + "learning_rate": 1.6005946481665015e-06, + "loss": 0.8545, + "step": 816 + }, + { + "epoch": 1.4563279857397504, + "grad_norm": 1.3628255128860474, + "learning_rate": 1.5956392467789892e-06, + "loss": 0.8528, + "step": 817 + }, + { + "epoch": 1.4581105169340463, + "grad_norm": 1.293563723564148, + "learning_rate": 1.5906838453914768e-06, + "loss": 0.8464, + "step": 818 + }, + { + "epoch": 1.4598930481283423, + "grad_norm": 1.3827394247055054, + "learning_rate": 1.5857284440039645e-06, + "loss": 0.8442, + "step": 819 + }, + { + "epoch": 1.4616755793226381, + "grad_norm": 1.3776166439056396, + "learning_rate": 1.580773042616452e-06, + "loss": 0.864, + "step": 820 + }, + { + "epoch": 1.4616755793226381, + "eval_loss": 0.9785693883895874, + "eval_runtime": 46.9106, + "eval_samples_per_second": 21.317, + "eval_steps_per_second": 1.343, + "step": 820 + }, + { + "epoch": 1.463458110516934, + "grad_norm": 1.3489983081817627, + "learning_rate": 1.5758176412289396e-06, + "loss": 0.8603, + "step": 821 + }, + { + "epoch": 1.46524064171123, + "grad_norm": 1.3305773735046387, + "learning_rate": 1.5708622398414273e-06, + "loss": 0.8598, + "step": 822 + }, + { + "epoch": 1.4670231729055259, + "grad_norm": 1.4561748504638672, + "learning_rate": 1.565906838453915e-06, + "loss": 0.884, + "step": 823 + }, + { + "epoch": 1.4688057040998217, + "grad_norm": 1.3868783712387085, + "learning_rate": 1.5609514370664026e-06, + "loss": 0.894, + "step": 824 + }, + { + "epoch": 1.4705882352941178, + "grad_norm": 1.3360710144042969, + "learning_rate": 1.55599603567889e-06, + "loss": 0.8581, + "step": 825 + }, + { + "epoch": 1.4723707664884136, + "grad_norm": 1.3062288761138916, + "learning_rate": 1.5510406342913777e-06, + "loss": 
0.8703, + "step": 826 + }, + { + "epoch": 1.4741532976827094, + "grad_norm": 1.3395740985870361, + "learning_rate": 1.5460852329038654e-06, + "loss": 0.8906, + "step": 827 + }, + { + "epoch": 1.4759358288770055, + "grad_norm": 1.5396114587783813, + "learning_rate": 1.5411298315163528e-06, + "loss": 0.8845, + "step": 828 + }, + { + "epoch": 1.4777183600713013, + "grad_norm": 1.420955777168274, + "learning_rate": 1.5361744301288407e-06, + "loss": 0.8841, + "step": 829 + }, + { + "epoch": 1.4795008912655971, + "grad_norm": 1.4807546138763428, + "learning_rate": 1.5312190287413281e-06, + "loss": 0.8091, + "step": 830 + }, + { + "epoch": 1.4795008912655971, + "eval_loss": 0.9777600169181824, + "eval_runtime": 46.9306, + "eval_samples_per_second": 21.308, + "eval_steps_per_second": 1.342, + "step": 830 + }, + { + "epoch": 1.481283422459893, + "grad_norm": 1.5020588636398315, + "learning_rate": 1.5262636273538158e-06, + "loss": 0.8964, + "step": 831 + }, + { + "epoch": 1.483065953654189, + "grad_norm": 1.3068898916244507, + "learning_rate": 1.5213082259663035e-06, + "loss": 0.8868, + "step": 832 + }, + { + "epoch": 1.4848484848484849, + "grad_norm": 1.448590636253357, + "learning_rate": 1.516352824578791e-06, + "loss": 0.8554, + "step": 833 + }, + { + "epoch": 1.4866310160427807, + "grad_norm": 1.4764806032180786, + "learning_rate": 1.5113974231912786e-06, + "loss": 0.8683, + "step": 834 + }, + { + "epoch": 1.4884135472370765, + "grad_norm": 1.5730153322219849, + "learning_rate": 1.5064420218037662e-06, + "loss": 0.8511, + "step": 835 + }, + { + "epoch": 1.4901960784313726, + "grad_norm": 1.454858422279358, + "learning_rate": 1.5014866204162537e-06, + "loss": 0.8801, + "step": 836 + }, + { + "epoch": 1.4919786096256684, + "grad_norm": 1.2339946031570435, + "learning_rate": 1.4965312190287416e-06, + "loss": 0.8551, + "step": 837 + }, + { + "epoch": 1.4937611408199643, + "grad_norm": 1.4510794878005981, + "learning_rate": 1.491575817641229e-06, + "loss": 0.8639, + "step": 838 + }, + { + "epoch": 1.4955436720142603, + "grad_norm": 1.4182809591293335, + "learning_rate": 1.4866204162537167e-06, + "loss": 0.8486, + "step": 839 + }, + { + "epoch": 1.4973262032085561, + "grad_norm": 1.485177993774414, + "learning_rate": 1.4816650148662043e-06, + "loss": 0.8787, + "step": 840 + }, + { + "epoch": 1.4973262032085561, + "eval_loss": 0.9784544706344604, + "eval_runtime": 47.0602, + "eval_samples_per_second": 21.249, + "eval_steps_per_second": 1.339, + "step": 840 + }, + { + "epoch": 1.499108734402852, + "grad_norm": 1.579306721687317, + "learning_rate": 1.4767096134786918e-06, + "loss": 0.891, + "step": 841 + }, + { + "epoch": 1.500891265597148, + "grad_norm": 1.5453161001205444, + "learning_rate": 1.4717542120911797e-06, + "loss": 0.8892, + "step": 842 + }, + { + "epoch": 1.5026737967914439, + "grad_norm": 1.4892089366912842, + "learning_rate": 1.4667988107036671e-06, + "loss": 0.8884, + "step": 843 + }, + { + "epoch": 1.5044563279857397, + "grad_norm": 1.328150987625122, + "learning_rate": 1.4618434093161546e-06, + "loss": 0.8631, + "step": 844 + }, + { + "epoch": 1.5062388591800357, + "grad_norm": 1.3877817392349243, + "learning_rate": 1.4568880079286424e-06, + "loss": 0.8622, + "step": 845 + }, + { + "epoch": 1.5080213903743316, + "grad_norm": 1.4985857009887695, + "learning_rate": 1.4519326065411299e-06, + "loss": 0.8857, + "step": 846 + }, + { + "epoch": 1.5098039215686274, + "grad_norm": 1.3666197061538696, + "learning_rate": 1.4469772051536178e-06, + "loss": 0.823, + "step": 847 + }, + { + "epoch": 
1.5115864527629235, + "grad_norm": 1.5058780908584595, + "learning_rate": 1.4420218037661052e-06, + "loss": 0.893, + "step": 848 + }, + { + "epoch": 1.5133689839572193, + "grad_norm": 1.4480105638504028, + "learning_rate": 1.4370664023785927e-06, + "loss": 0.869, + "step": 849 + }, + { + "epoch": 1.5151515151515151, + "grad_norm": 1.503057599067688, + "learning_rate": 1.4321110009910805e-06, + "loss": 0.8804, + "step": 850 + }, + { + "epoch": 1.5151515151515151, + "eval_loss": 0.9779770374298096, + "eval_runtime": 47.0961, + "eval_samples_per_second": 21.233, + "eval_steps_per_second": 1.338, + "step": 850 + }, + { + "epoch": 1.5169340463458112, + "grad_norm": 1.4159053564071655, + "learning_rate": 1.427155599603568e-06, + "loss": 0.8564, + "step": 851 + }, + { + "epoch": 1.5187165775401068, + "grad_norm": 1.3249149322509766, + "learning_rate": 1.4222001982160554e-06, + "loss": 0.8598, + "step": 852 + }, + { + "epoch": 1.5204991087344029, + "grad_norm": 1.7462847232818604, + "learning_rate": 1.4172447968285433e-06, + "loss": 0.8788, + "step": 853 + }, + { + "epoch": 1.522281639928699, + "grad_norm": 1.4859366416931152, + "learning_rate": 1.4122893954410308e-06, + "loss": 0.8624, + "step": 854 + }, + { + "epoch": 1.5240641711229945, + "grad_norm": 1.4355590343475342, + "learning_rate": 1.4073339940535186e-06, + "loss": 0.8639, + "step": 855 + }, + { + "epoch": 1.5258467023172906, + "grad_norm": 1.3695921897888184, + "learning_rate": 1.402378592666006e-06, + "loss": 0.9011, + "step": 856 + }, + { + "epoch": 1.5276292335115864, + "grad_norm": 1.4948954582214355, + "learning_rate": 1.3974231912784935e-06, + "loss": 0.8689, + "step": 857 + }, + { + "epoch": 1.5294117647058822, + "grad_norm": 1.4612298011779785, + "learning_rate": 1.3924677898909814e-06, + "loss": 0.8753, + "step": 858 + }, + { + "epoch": 1.5311942959001783, + "grad_norm": 1.3478108644485474, + "learning_rate": 1.3875123885034689e-06, + "loss": 0.8684, + "step": 859 + }, + { + "epoch": 1.5329768270944741, + "grad_norm": 1.349802017211914, + "learning_rate": 1.3825569871159563e-06, + "loss": 0.8532, + "step": 860 + }, + { + "epoch": 1.5329768270944741, + "eval_loss": 0.9780614972114563, + "eval_runtime": 46.8154, + "eval_samples_per_second": 21.36, + "eval_steps_per_second": 1.346, + "step": 860 + }, + { + "epoch": 1.53475935828877, + "grad_norm": 1.3854540586471558, + "learning_rate": 1.3776015857284442e-06, + "loss": 0.8555, + "step": 861 + }, + { + "epoch": 1.536541889483066, + "grad_norm": 1.5170377492904663, + "learning_rate": 1.3726461843409316e-06, + "loss": 0.868, + "step": 862 + }, + { + "epoch": 1.5383244206773619, + "grad_norm": 1.2990375757217407, + "learning_rate": 1.3676907829534195e-06, + "loss": 0.8884, + "step": 863 + }, + { + "epoch": 1.5401069518716577, + "grad_norm": 1.7202647924423218, + "learning_rate": 1.362735381565907e-06, + "loss": 0.8877, + "step": 864 + }, + { + "epoch": 1.5418894830659537, + "grad_norm": 1.5239756107330322, + "learning_rate": 1.3577799801783944e-06, + "loss": 0.8492, + "step": 865 + }, + { + "epoch": 1.5436720142602496, + "grad_norm": 1.6998838186264038, + "learning_rate": 1.3528245787908823e-06, + "loss": 0.8541, + "step": 866 + }, + { + "epoch": 1.5454545454545454, + "grad_norm": 1.5735970735549927, + "learning_rate": 1.3478691774033697e-06, + "loss": 0.839, + "step": 867 + }, + { + "epoch": 1.5472370766488415, + "grad_norm": 1.411190152168274, + "learning_rate": 1.3429137760158572e-06, + "loss": 0.8485, + "step": 868 + }, + { + "epoch": 1.5490196078431373, + "grad_norm": 
1.404719591140747, + "learning_rate": 1.337958374628345e-06, + "loss": 0.8878, + "step": 869 + }, + { + "epoch": 1.5508021390374331, + "grad_norm": 1.3522731065750122, + "learning_rate": 1.3330029732408325e-06, + "loss": 0.8515, + "step": 870 + }, + { + "epoch": 1.5508021390374331, + "eval_loss": 0.9770654439926147, + "eval_runtime": 46.8266, + "eval_samples_per_second": 21.355, + "eval_steps_per_second": 1.345, + "step": 870 + }, + { + "epoch": 1.5525846702317292, + "grad_norm": 1.5110260248184204, + "learning_rate": 1.3280475718533204e-06, + "loss": 0.8872, + "step": 871 + }, + { + "epoch": 1.5543672014260248, + "grad_norm": 1.3744357824325562, + "learning_rate": 1.3230921704658078e-06, + "loss": 0.846, + "step": 872 + }, + { + "epoch": 1.5561497326203209, + "grad_norm": 1.3850080966949463, + "learning_rate": 1.3181367690782953e-06, + "loss": 0.8583, + "step": 873 + }, + { + "epoch": 1.557932263814617, + "grad_norm": 1.3675907850265503, + "learning_rate": 1.3131813676907832e-06, + "loss": 0.8449, + "step": 874 + }, + { + "epoch": 1.5597147950089125, + "grad_norm": 1.5432913303375244, + "learning_rate": 1.3082259663032706e-06, + "loss": 0.8614, + "step": 875 + }, + { + "epoch": 1.5614973262032086, + "grad_norm": 1.450357437133789, + "learning_rate": 1.303270564915758e-06, + "loss": 0.879, + "step": 876 + }, + { + "epoch": 1.5632798573975044, + "grad_norm": 1.320804238319397, + "learning_rate": 1.298315163528246e-06, + "loss": 0.8628, + "step": 877 + }, + { + "epoch": 1.5650623885918002, + "grad_norm": 1.6349151134490967, + "learning_rate": 1.2933597621407334e-06, + "loss": 0.8911, + "step": 878 + }, + { + "epoch": 1.5668449197860963, + "grad_norm": 1.7157044410705566, + "learning_rate": 1.2884043607532213e-06, + "loss": 0.8828, + "step": 879 + }, + { + "epoch": 1.5686274509803921, + "grad_norm": 1.4008780717849731, + "learning_rate": 1.2834489593657087e-06, + "loss": 0.8749, + "step": 880 + }, + { + "epoch": 1.5686274509803921, + "eval_loss": 0.9742602705955505, + "eval_runtime": 46.9284, + "eval_samples_per_second": 21.309, + "eval_steps_per_second": 1.342, + "step": 880 + }, + { + "epoch": 1.570409982174688, + "grad_norm": 1.318920373916626, + "learning_rate": 1.2784935579781962e-06, + "loss": 0.8446, + "step": 881 + }, + { + "epoch": 1.572192513368984, + "grad_norm": 1.393837332725525, + "learning_rate": 1.273538156590684e-06, + "loss": 0.8629, + "step": 882 + }, + { + "epoch": 1.5739750445632799, + "grad_norm": 1.3566768169403076, + "learning_rate": 1.2685827552031715e-06, + "loss": 0.8621, + "step": 883 + }, + { + "epoch": 1.5757575757575757, + "grad_norm": 1.699874758720398, + "learning_rate": 1.2636273538156591e-06, + "loss": 0.8613, + "step": 884 + }, + { + "epoch": 1.5775401069518717, + "grad_norm": 1.401503086090088, + "learning_rate": 1.2586719524281468e-06, + "loss": 0.8455, + "step": 885 + }, + { + "epoch": 1.5793226381461676, + "grad_norm": 1.5046734809875488, + "learning_rate": 1.2537165510406342e-06, + "loss": 0.86, + "step": 886 + }, + { + "epoch": 1.5811051693404634, + "grad_norm": 1.4957525730133057, + "learning_rate": 1.248761149653122e-06, + "loss": 0.8512, + "step": 887 + }, + { + "epoch": 1.5828877005347595, + "grad_norm": 1.4307347536087036, + "learning_rate": 1.2438057482656096e-06, + "loss": 0.8533, + "step": 888 + }, + { + "epoch": 1.5846702317290553, + "grad_norm": 1.4483654499053955, + "learning_rate": 1.2388503468780972e-06, + "loss": 0.8619, + "step": 889 + }, + { + "epoch": 1.5864527629233511, + "grad_norm": 1.3696205615997314, + "learning_rate": 
1.2338949454905847e-06, + "loss": 0.8602, + "step": 890 + }, + { + "epoch": 1.5864527629233511, + "eval_loss": 0.9740327596664429, + "eval_runtime": 46.7849, + "eval_samples_per_second": 21.374, + "eval_steps_per_second": 1.347, + "step": 890 + }, + { + "epoch": 1.5882352941176472, + "grad_norm": 1.479094386100769, + "learning_rate": 1.2289395441030723e-06, + "loss": 0.8781, + "step": 891 + }, + { + "epoch": 1.5900178253119428, + "grad_norm": 1.5047471523284912, + "learning_rate": 1.22398414271556e-06, + "loss": 0.8956, + "step": 892 + }, + { + "epoch": 1.5918003565062389, + "grad_norm": 1.5792458057403564, + "learning_rate": 1.2190287413280477e-06, + "loss": 0.843, + "step": 893 + }, + { + "epoch": 1.593582887700535, + "grad_norm": 1.5192897319793701, + "learning_rate": 1.2140733399405351e-06, + "loss": 0.9011, + "step": 894 + }, + { + "epoch": 1.5953654188948305, + "grad_norm": 1.5448248386383057, + "learning_rate": 1.2091179385530228e-06, + "loss": 0.8518, + "step": 895 + }, + { + "epoch": 1.5971479500891266, + "grad_norm": 1.4314959049224854, + "learning_rate": 1.2041625371655104e-06, + "loss": 0.8539, + "step": 896 + }, + { + "epoch": 1.5989304812834224, + "grad_norm": 1.321249008178711, + "learning_rate": 1.1992071357779981e-06, + "loss": 0.8342, + "step": 897 + }, + { + "epoch": 1.6007130124777182, + "grad_norm": 1.3444586992263794, + "learning_rate": 1.1942517343904858e-06, + "loss": 0.8595, + "step": 898 + }, + { + "epoch": 1.6024955436720143, + "grad_norm": 1.4096806049346924, + "learning_rate": 1.1892963330029732e-06, + "loss": 0.8736, + "step": 899 + }, + { + "epoch": 1.6042780748663101, + "grad_norm": 1.3600504398345947, + "learning_rate": 1.1843409316154609e-06, + "loss": 0.865, + "step": 900 + }, + { + "epoch": 1.6042780748663101, + "eval_loss": 0.9747118949890137, + "eval_runtime": 46.7507, + "eval_samples_per_second": 21.39, + "eval_steps_per_second": 1.348, + "step": 900 + }, + { + "epoch": 1.606060606060606, + "grad_norm": 1.4022847414016724, + "learning_rate": 1.1793855302279485e-06, + "loss": 0.8613, + "step": 901 + }, + { + "epoch": 1.607843137254902, + "grad_norm": 1.4285733699798584, + "learning_rate": 1.1744301288404362e-06, + "loss": 0.8422, + "step": 902 + }, + { + "epoch": 1.6096256684491979, + "grad_norm": 1.4409127235412598, + "learning_rate": 1.1694747274529237e-06, + "loss": 0.8812, + "step": 903 + }, + { + "epoch": 1.6114081996434937, + "grad_norm": 1.4568157196044922, + "learning_rate": 1.1645193260654113e-06, + "loss": 0.8614, + "step": 904 + }, + { + "epoch": 1.6131907308377897, + "grad_norm": 1.3659733533859253, + "learning_rate": 1.159563924677899e-06, + "loss": 0.8476, + "step": 905 + }, + { + "epoch": 1.6149732620320856, + "grad_norm": 1.2912815809249878, + "learning_rate": 1.1546085232903866e-06, + "loss": 0.8716, + "step": 906 + }, + { + "epoch": 1.6167557932263814, + "grad_norm": 1.267293095588684, + "learning_rate": 1.1496531219028743e-06, + "loss": 0.844, + "step": 907 + }, + { + "epoch": 1.6185383244206775, + "grad_norm": 1.3510090112686157, + "learning_rate": 1.1446977205153618e-06, + "loss": 0.8698, + "step": 908 + }, + { + "epoch": 1.6203208556149733, + "grad_norm": 1.3630146980285645, + "learning_rate": 1.1397423191278494e-06, + "loss": 0.8503, + "step": 909 + }, + { + "epoch": 1.6221033868092691, + "grad_norm": 1.4114625453948975, + "learning_rate": 1.134786917740337e-06, + "loss": 0.8918, + "step": 910 + }, + { + "epoch": 1.6221033868092691, + "eval_loss": 0.9753348231315613, + "eval_runtime": 46.5853, + "eval_samples_per_second": 21.466, 
+ "eval_steps_per_second": 1.352, + "step": 910 + }, + { + "epoch": 1.6238859180035652, + "grad_norm": 1.4507378339767456, + "learning_rate": 1.1298315163528247e-06, + "loss": 0.8544, + "step": 911 + }, + { + "epoch": 1.6256684491978608, + "grad_norm": 1.4346193075180054, + "learning_rate": 1.1248761149653122e-06, + "loss": 0.8598, + "step": 912 + }, + { + "epoch": 1.6274509803921569, + "grad_norm": 1.3302711248397827, + "learning_rate": 1.1199207135777999e-06, + "loss": 0.8577, + "step": 913 + }, + { + "epoch": 1.629233511586453, + "grad_norm": 1.3755695819854736, + "learning_rate": 1.1149653121902875e-06, + "loss": 0.8837, + "step": 914 + }, + { + "epoch": 1.6310160427807485, + "grad_norm": 1.3624552488327026, + "learning_rate": 1.1100099108027752e-06, + "loss": 0.842, + "step": 915 + }, + { + "epoch": 1.6327985739750446, + "grad_norm": 1.3074328899383545, + "learning_rate": 1.1050545094152628e-06, + "loss": 0.8627, + "step": 916 + }, + { + "epoch": 1.6345811051693404, + "grad_norm": 1.367962121963501, + "learning_rate": 1.1000991080277503e-06, + "loss": 0.8497, + "step": 917 + }, + { + "epoch": 1.6363636363636362, + "grad_norm": 1.4831198453903198, + "learning_rate": 1.095143706640238e-06, + "loss": 0.8663, + "step": 918 + }, + { + "epoch": 1.6381461675579323, + "grad_norm": 1.3029274940490723, + "learning_rate": 1.0901883052527256e-06, + "loss": 0.8619, + "step": 919 + }, + { + "epoch": 1.6399286987522281, + "grad_norm": 1.30776047706604, + "learning_rate": 1.0852329038652133e-06, + "loss": 0.8964, + "step": 920 + }, + { + "epoch": 1.6399286987522281, + "eval_loss": 0.9733108878135681, + "eval_runtime": 46.6339, + "eval_samples_per_second": 21.444, + "eval_steps_per_second": 1.351, + "step": 920 + }, + { + "epoch": 1.641711229946524, + "grad_norm": 1.4224649667739868, + "learning_rate": 1.080277502477701e-06, + "loss": 0.8656, + "step": 921 + }, + { + "epoch": 1.64349376114082, + "grad_norm": 1.3711802959442139, + "learning_rate": 1.0753221010901884e-06, + "loss": 0.8649, + "step": 922 + }, + { + "epoch": 1.6452762923351159, + "grad_norm": 1.336634635925293, + "learning_rate": 1.070366699702676e-06, + "loss": 0.846, + "step": 923 + }, + { + "epoch": 1.6470588235294117, + "grad_norm": 1.5247148275375366, + "learning_rate": 1.0654112983151637e-06, + "loss": 0.885, + "step": 924 + }, + { + "epoch": 1.6488413547237077, + "grad_norm": 1.6466078758239746, + "learning_rate": 1.0604558969276514e-06, + "loss": 0.8769, + "step": 925 + }, + { + "epoch": 1.6506238859180036, + "grad_norm": 1.4711053371429443, + "learning_rate": 1.0555004955401388e-06, + "loss": 0.8402, + "step": 926 + }, + { + "epoch": 1.6524064171122994, + "grad_norm": 1.3819425106048584, + "learning_rate": 1.0505450941526265e-06, + "loss": 0.8406, + "step": 927 + }, + { + "epoch": 1.6541889483065955, + "grad_norm": 1.3762542009353638, + "learning_rate": 1.0455896927651141e-06, + "loss": 0.8703, + "step": 928 + }, + { + "epoch": 1.6559714795008913, + "grad_norm": 1.4304357767105103, + "learning_rate": 1.0406342913776018e-06, + "loss": 0.8701, + "step": 929 + }, + { + "epoch": 1.6577540106951871, + "grad_norm": 1.425401210784912, + "learning_rate": 1.0356788899900893e-06, + "loss": 0.8959, + "step": 930 + }, + { + "epoch": 1.6577540106951871, + "eval_loss": 0.9722786545753479, + "eval_runtime": 46.6071, + "eval_samples_per_second": 21.456, + "eval_steps_per_second": 1.352, + "step": 930 + }, + { + "epoch": 1.6595365418894832, + "grad_norm": 1.4085595607757568, + "learning_rate": 1.030723488602577e-06, + "loss": 0.8659, + "step": 
931 + }, + { + "epoch": 1.661319073083779, + "grad_norm": 1.3777707815170288, + "learning_rate": 1.0257680872150646e-06, + "loss": 0.8653, + "step": 932 + }, + { + "epoch": 1.6631016042780749, + "grad_norm": 1.3614346981048584, + "learning_rate": 1.0208126858275522e-06, + "loss": 0.8484, + "step": 933 + }, + { + "epoch": 1.664884135472371, + "grad_norm": 1.5707229375839233, + "learning_rate": 1.0158572844400397e-06, + "loss": 0.8472, + "step": 934 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 1.4438365697860718, + "learning_rate": 1.0109018830525274e-06, + "loss": 0.8725, + "step": 935 + }, + { + "epoch": 1.6684491978609626, + "grad_norm": 1.4993953704833984, + "learning_rate": 1.005946481665015e-06, + "loss": 0.8674, + "step": 936 + }, + { + "epoch": 1.6702317290552586, + "grad_norm": 1.4290693998336792, + "learning_rate": 1.0009910802775027e-06, + "loss": 0.9074, + "step": 937 + }, + { + "epoch": 1.6720142602495542, + "grad_norm": 1.3130066394805908, + "learning_rate": 9.960356788899901e-07, + "loss": 0.8474, + "step": 938 + }, + { + "epoch": 1.6737967914438503, + "grad_norm": 1.410224199295044, + "learning_rate": 9.910802775024778e-07, + "loss": 0.8933, + "step": 939 + }, + { + "epoch": 1.6755793226381461, + "grad_norm": 1.414833426475525, + "learning_rate": 9.861248761149655e-07, + "loss": 0.8384, + "step": 940 + }, + { + "epoch": 1.6755793226381461, + "eval_loss": 0.972200870513916, + "eval_runtime": 46.6515, + "eval_samples_per_second": 21.436, + "eval_steps_per_second": 1.35, + "step": 940 + }, + { + "epoch": 1.677361853832442, + "grad_norm": 1.4031621217727661, + "learning_rate": 9.811694747274531e-07, + "loss": 0.8616, + "step": 941 + }, + { + "epoch": 1.679144385026738, + "grad_norm": 1.359629511833191, + "learning_rate": 9.762140733399406e-07, + "loss": 0.8736, + "step": 942 + }, + { + "epoch": 1.6809269162210339, + "grad_norm": 1.4111278057098389, + "learning_rate": 9.712586719524282e-07, + "loss": 0.8786, + "step": 943 + }, + { + "epoch": 1.6827094474153297, + "grad_norm": 1.2958585023880005, + "learning_rate": 9.663032705649159e-07, + "loss": 0.8778, + "step": 944 + }, + { + "epoch": 1.6844919786096257, + "grad_norm": 1.2840744256973267, + "learning_rate": 9.613478691774035e-07, + "loss": 0.8403, + "step": 945 + }, + { + "epoch": 1.6862745098039216, + "grad_norm": 1.3533388376235962, + "learning_rate": 9.56392467789891e-07, + "loss": 0.8607, + "step": 946 + }, + { + "epoch": 1.6880570409982174, + "grad_norm": 1.4401700496673584, + "learning_rate": 9.514370664023787e-07, + "loss": 0.8829, + "step": 947 + }, + { + "epoch": 1.6898395721925135, + "grad_norm": 1.36722731590271, + "learning_rate": 9.464816650148663e-07, + "loss": 0.8987, + "step": 948 + }, + { + "epoch": 1.6916221033868093, + "grad_norm": 1.4618425369262695, + "learning_rate": 9.41526263627354e-07, + "loss": 0.8751, + "step": 949 + }, + { + "epoch": 1.6934046345811051, + "grad_norm": 1.316740870475769, + "learning_rate": 9.365708622398414e-07, + "loss": 0.8744, + "step": 950 + }, + { + "epoch": 1.6934046345811051, + "eval_loss": 0.9702951312065125, + "eval_runtime": 46.6377, + "eval_samples_per_second": 21.442, + "eval_steps_per_second": 1.351, + "step": 950 + }, + { + "epoch": 1.6951871657754012, + "grad_norm": 1.2979589700698853, + "learning_rate": 9.316154608523291e-07, + "loss": 0.8648, + "step": 951 + }, + { + "epoch": 1.696969696969697, + "grad_norm": 1.327683448791504, + "learning_rate": 9.266600594648168e-07, + "loss": 0.8673, + "step": 952 + }, + { + "epoch": 1.6987522281639929, + "grad_norm": 
1.37446928024292, + "learning_rate": 9.217046580773044e-07, + "loss": 0.8694, + "step": 953 + }, + { + "epoch": 1.700534759358289, + "grad_norm": 1.4593381881713867, + "learning_rate": 9.167492566897919e-07, + "loss": 0.8567, + "step": 954 + }, + { + "epoch": 1.7023172905525845, + "grad_norm": 1.3869240283966064, + "learning_rate": 9.117938553022795e-07, + "loss": 0.8561, + "step": 955 + }, + { + "epoch": 1.7040998217468806, + "grad_norm": 1.4869533777236938, + "learning_rate": 9.068384539147672e-07, + "loss": 0.8749, + "step": 956 + }, + { + "epoch": 1.7058823529411766, + "grad_norm": 1.421717882156372, + "learning_rate": 9.018830525272549e-07, + "loss": 0.8468, + "step": 957 + }, + { + "epoch": 1.7076648841354722, + "grad_norm": 1.276408314704895, + "learning_rate": 8.969276511397423e-07, + "loss": 0.8563, + "step": 958 + }, + { + "epoch": 1.7094474153297683, + "grad_norm": 1.4118410348892212, + "learning_rate": 8.9197224975223e-07, + "loss": 0.8841, + "step": 959 + }, + { + "epoch": 1.7112299465240641, + "grad_norm": 1.3833427429199219, + "learning_rate": 8.870168483647176e-07, + "loss": 0.8331, + "step": 960 + }, + { + "epoch": 1.7112299465240641, + "eval_loss": 0.9688822627067566, + "eval_runtime": 46.5438, + "eval_samples_per_second": 21.485, + "eval_steps_per_second": 1.354, + "step": 960 + }, + { + "epoch": 1.71301247771836, + "grad_norm": 1.2577204704284668, + "learning_rate": 8.820614469772053e-07, + "loss": 0.843, + "step": 961 + }, + { + "epoch": 1.714795008912656, + "grad_norm": 1.4835284948349, + "learning_rate": 8.771060455896927e-07, + "loss": 0.8575, + "step": 962 + }, + { + "epoch": 1.7165775401069518, + "grad_norm": 1.2658771276474, + "learning_rate": 8.721506442021804e-07, + "loss": 0.8559, + "step": 963 + }, + { + "epoch": 1.7183600713012477, + "grad_norm": 1.3600130081176758, + "learning_rate": 8.671952428146681e-07, + "loss": 0.8782, + "step": 964 + }, + { + "epoch": 1.7201426024955437, + "grad_norm": 1.4301601648330688, + "learning_rate": 8.622398414271557e-07, + "loss": 0.8856, + "step": 965 + }, + { + "epoch": 1.7219251336898396, + "grad_norm": 1.3931207656860352, + "learning_rate": 8.572844400396432e-07, + "loss": 0.8804, + "step": 966 + }, + { + "epoch": 1.7237076648841354, + "grad_norm": 1.3883016109466553, + "learning_rate": 8.523290386521308e-07, + "loss": 0.855, + "step": 967 + }, + { + "epoch": 1.7254901960784315, + "grad_norm": 1.6030256748199463, + "learning_rate": 8.473736372646185e-07, + "loss": 0.8384, + "step": 968 + }, + { + "epoch": 1.7272727272727273, + "grad_norm": 1.3320417404174805, + "learning_rate": 8.424182358771062e-07, + "loss": 0.8611, + "step": 969 + }, + { + "epoch": 1.7290552584670231, + "grad_norm": 1.3974928855895996, + "learning_rate": 8.374628344895937e-07, + "loss": 0.8352, + "step": 970 + }, + { + "epoch": 1.7290552584670231, + "eval_loss": 0.9675564169883728, + "eval_runtime": 46.6655, + "eval_samples_per_second": 21.429, + "eval_steps_per_second": 1.35, + "step": 970 + }, + { + "epoch": 1.7308377896613192, + "grad_norm": 1.4703048467636108, + "learning_rate": 8.325074331020813e-07, + "loss": 0.8202, + "step": 971 + }, + { + "epoch": 1.732620320855615, + "grad_norm": 1.364524245262146, + "learning_rate": 8.275520317145689e-07, + "loss": 0.8476, + "step": 972 + }, + { + "epoch": 1.7344028520499108, + "grad_norm": 1.377961277961731, + "learning_rate": 8.225966303270566e-07, + "loss": 0.8582, + "step": 973 + }, + { + "epoch": 1.736185383244207, + "grad_norm": 1.3737229108810425, + "learning_rate": 8.176412289395442e-07, + "loss": 
0.8649, + "step": 974 + }, + { + "epoch": 1.7379679144385025, + "grad_norm": 1.5605295896530151, + "learning_rate": 8.126858275520318e-07, + "loss": 0.8465, + "step": 975 + }, + { + "epoch": 1.7397504456327986, + "grad_norm": 1.3562301397323608, + "learning_rate": 8.077304261645194e-07, + "loss": 0.8662, + "step": 976 + }, + { + "epoch": 1.7415329768270946, + "grad_norm": 1.3104592561721802, + "learning_rate": 8.02775024777007e-07, + "loss": 0.8463, + "step": 977 + }, + { + "epoch": 1.7433155080213902, + "grad_norm": 1.5199209451675415, + "learning_rate": 7.978196233894946e-07, + "loss": 0.8942, + "step": 978 + }, + { + "epoch": 1.7450980392156863, + "grad_norm": 1.464303970336914, + "learning_rate": 7.928642220019823e-07, + "loss": 0.8493, + "step": 979 + }, + { + "epoch": 1.7468805704099821, + "grad_norm": 1.5610185861587524, + "learning_rate": 7.879088206144698e-07, + "loss": 0.8577, + "step": 980 + }, + { + "epoch": 1.7468805704099821, + "eval_loss": 0.9670752882957458, + "eval_runtime": 46.6969, + "eval_samples_per_second": 21.415, + "eval_steps_per_second": 1.349, + "step": 980 + }, + { + "epoch": 1.748663101604278, + "grad_norm": 1.378450870513916, + "learning_rate": 7.829534192269575e-07, + "loss": 0.8659, + "step": 981 + }, + { + "epoch": 1.750445632798574, + "grad_norm": 1.45065176486969, + "learning_rate": 7.77998017839445e-07, + "loss": 0.8607, + "step": 982 + }, + { + "epoch": 1.7522281639928698, + "grad_norm": 1.4157710075378418, + "learning_rate": 7.730426164519327e-07, + "loss": 0.8625, + "step": 983 + }, + { + "epoch": 1.7540106951871657, + "grad_norm": 1.481164813041687, + "learning_rate": 7.680872150644203e-07, + "loss": 0.864, + "step": 984 + }, + { + "epoch": 1.7557932263814617, + "grad_norm": 1.3908096551895142, + "learning_rate": 7.631318136769079e-07, + "loss": 0.8572, + "step": 985 + }, + { + "epoch": 1.7575757575757576, + "grad_norm": 1.392127275466919, + "learning_rate": 7.581764122893955e-07, + "loss": 0.8524, + "step": 986 + }, + { + "epoch": 1.7593582887700534, + "grad_norm": 1.3347569704055786, + "learning_rate": 7.532210109018831e-07, + "loss": 0.8442, + "step": 987 + }, + { + "epoch": 1.7611408199643495, + "grad_norm": 1.4272398948669434, + "learning_rate": 7.482656095143708e-07, + "loss": 0.8515, + "step": 988 + }, + { + "epoch": 1.7629233511586453, + "grad_norm": 1.5819486379623413, + "learning_rate": 7.433102081268583e-07, + "loss": 0.8543, + "step": 989 + }, + { + "epoch": 1.7647058823529411, + "grad_norm": 1.479175090789795, + "learning_rate": 7.383548067393459e-07, + "loss": 0.8975, + "step": 990 + }, + { + "epoch": 1.7647058823529411, + "eval_loss": 0.967066764831543, + "eval_runtime": 46.4369, + "eval_samples_per_second": 21.535, + "eval_steps_per_second": 1.357, + "step": 990 + }, + { + "epoch": 1.7664884135472372, + "grad_norm": 1.5135704278945923, + "learning_rate": 7.333994053518336e-07, + "loss": 0.8402, + "step": 991 + }, + { + "epoch": 1.768270944741533, + "grad_norm": 1.5999902486801147, + "learning_rate": 7.284440039643212e-07, + "loss": 0.8429, + "step": 992 + }, + { + "epoch": 1.7700534759358288, + "grad_norm": 1.5118600130081177, + "learning_rate": 7.234886025768089e-07, + "loss": 0.8635, + "step": 993 + }, + { + "epoch": 1.771836007130125, + "grad_norm": 1.4912101030349731, + "learning_rate": 7.185332011892963e-07, + "loss": 0.8448, + "step": 994 + }, + { + "epoch": 1.7736185383244205, + "grad_norm": 1.3264952898025513, + "learning_rate": 7.13577799801784e-07, + "loss": 0.839, + "step": 995 + }, + { + "epoch": 1.7754010695187166, + 
"grad_norm": 1.4948172569274902, + "learning_rate": 7.086223984142717e-07, + "loss": 0.856, + "step": 996 + }, + { + "epoch": 1.7771836007130126, + "grad_norm": 1.3413145542144775, + "learning_rate": 7.036669970267593e-07, + "loss": 0.8328, + "step": 997 + }, + { + "epoch": 1.7789661319073082, + "grad_norm": 1.6730403900146484, + "learning_rate": 6.987115956392468e-07, + "loss": 0.8902, + "step": 998 + }, + { + "epoch": 1.7807486631016043, + "grad_norm": 1.2651108503341675, + "learning_rate": 6.937561942517344e-07, + "loss": 0.8523, + "step": 999 + }, + { + "epoch": 1.7825311942959001, + "grad_norm": 1.4899260997772217, + "learning_rate": 6.888007928642221e-07, + "loss": 0.8535, + "step": 1000 + }, + { + "epoch": 1.7825311942959001, + "eval_loss": 0.9660161137580872, + "eval_runtime": 46.3938, + "eval_samples_per_second": 21.555, + "eval_steps_per_second": 1.358, + "step": 1000 + }, + { + "epoch": 1.784313725490196, + "grad_norm": 1.3794513940811157, + "learning_rate": 6.838453914767098e-07, + "loss": 0.8702, + "step": 1001 + }, + { + "epoch": 1.786096256684492, + "grad_norm": 1.3967653512954712, + "learning_rate": 6.788899900891972e-07, + "loss": 0.861, + "step": 1002 + }, + { + "epoch": 1.7878787878787878, + "grad_norm": 1.330071210861206, + "learning_rate": 6.739345887016849e-07, + "loss": 0.851, + "step": 1003 + }, + { + "epoch": 1.7896613190730837, + "grad_norm": 1.358818769454956, + "learning_rate": 6.689791873141725e-07, + "loss": 0.8549, + "step": 1004 + }, + { + "epoch": 1.7914438502673797, + "grad_norm": 1.3878623247146606, + "learning_rate": 6.640237859266602e-07, + "loss": 0.8318, + "step": 1005 + }, + { + "epoch": 1.7932263814616756, + "grad_norm": 1.4242222309112549, + "learning_rate": 6.590683845391476e-07, + "loss": 0.8061, + "step": 1006 + }, + { + "epoch": 1.7950089126559714, + "grad_norm": 1.4395697116851807, + "learning_rate": 6.541129831516353e-07, + "loss": 0.8766, + "step": 1007 + }, + { + "epoch": 1.7967914438502675, + "grad_norm": 1.402297854423523, + "learning_rate": 6.49157581764123e-07, + "loss": 0.8798, + "step": 1008 + }, + { + "epoch": 1.7985739750445633, + "grad_norm": 1.275490641593933, + "learning_rate": 6.442021803766106e-07, + "loss": 0.8721, + "step": 1009 + }, + { + "epoch": 1.8003565062388591, + "grad_norm": 1.4236700534820557, + "learning_rate": 6.392467789890981e-07, + "loss": 0.8363, + "step": 1010 + }, + { + "epoch": 1.8003565062388591, + "eval_loss": 0.9658026695251465, + "eval_runtime": 46.4079, + "eval_samples_per_second": 21.548, + "eval_steps_per_second": 1.358, + "step": 1010 + }, + { + "epoch": 1.8021390374331552, + "grad_norm": 1.3970041275024414, + "learning_rate": 6.342913776015857e-07, + "loss": 0.8874, + "step": 1011 + }, + { + "epoch": 1.803921568627451, + "grad_norm": 1.4485187530517578, + "learning_rate": 6.293359762140734e-07, + "loss": 0.8852, + "step": 1012 + }, + { + "epoch": 1.8057040998217468, + "grad_norm": 1.4213767051696777, + "learning_rate": 6.24380574826561e-07, + "loss": 0.8839, + "step": 1013 + }, + { + "epoch": 1.807486631016043, + "grad_norm": 1.3294216394424438, + "learning_rate": 6.194251734390486e-07, + "loss": 0.8104, + "step": 1014 + }, + { + "epoch": 1.8092691622103387, + "grad_norm": 1.532103180885315, + "learning_rate": 6.144697720515362e-07, + "loss": 0.8569, + "step": 1015 + }, + { + "epoch": 1.8110516934046346, + "grad_norm": 1.565508484840393, + "learning_rate": 6.095143706640238e-07, + "loss": 0.8487, + "step": 1016 + }, + { + "epoch": 1.8128342245989306, + "grad_norm": 1.3395127058029175, + 
"learning_rate": 6.045589692765114e-07, + "loss": 0.8787, + "step": 1017 + }, + { + "epoch": 1.8146167557932262, + "grad_norm": 1.433997631072998, + "learning_rate": 5.996035678889991e-07, + "loss": 0.8642, + "step": 1018 + }, + { + "epoch": 1.8163992869875223, + "grad_norm": 1.4260884523391724, + "learning_rate": 5.946481665014866e-07, + "loss": 0.8388, + "step": 1019 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 1.3741999864578247, + "learning_rate": 5.896927651139743e-07, + "loss": 0.8299, + "step": 1020 + }, + { + "epoch": 1.8181818181818183, + "eval_loss": 0.9656883478164673, + "eval_runtime": 46.3715, + "eval_samples_per_second": 21.565, + "eval_steps_per_second": 1.359, + "step": 1020 + }, + { + "epoch": 1.819964349376114, + "grad_norm": 1.28693687915802, + "learning_rate": 5.847373637264618e-07, + "loss": 0.8359, + "step": 1021 + }, + { + "epoch": 1.82174688057041, + "grad_norm": 1.3816742897033691, + "learning_rate": 5.797819623389495e-07, + "loss": 0.8802, + "step": 1022 + }, + { + "epoch": 1.8235294117647058, + "grad_norm": 1.3577059507369995, + "learning_rate": 5.748265609514372e-07, + "loss": 0.8391, + "step": 1023 + }, + { + "epoch": 1.8253119429590017, + "grad_norm": 1.3769915103912354, + "learning_rate": 5.698711595639247e-07, + "loss": 0.8568, + "step": 1024 + }, + { + "epoch": 1.8270944741532977, + "grad_norm": 1.3751606941223145, + "learning_rate": 5.649157581764124e-07, + "loss": 0.865, + "step": 1025 + }, + { + "epoch": 1.8288770053475936, + "grad_norm": 1.3568124771118164, + "learning_rate": 5.599603567888999e-07, + "loss": 0.8728, + "step": 1026 + }, + { + "epoch": 1.8306595365418894, + "grad_norm": 1.5252009630203247, + "learning_rate": 5.550049554013876e-07, + "loss": 0.8675, + "step": 1027 + }, + { + "epoch": 1.8324420677361855, + "grad_norm": 1.46947181224823, + "learning_rate": 5.500495540138751e-07, + "loss": 0.8545, + "step": 1028 + }, + { + "epoch": 1.8342245989304813, + "grad_norm": 1.430268406867981, + "learning_rate": 5.450941526263628e-07, + "loss": 0.8329, + "step": 1029 + }, + { + "epoch": 1.8360071301247771, + "grad_norm": 1.3437092304229736, + "learning_rate": 5.401387512388505e-07, + "loss": 0.8291, + "step": 1030 + }, + { + "epoch": 1.8360071301247771, + "eval_loss": 0.9675397872924805, + "eval_runtime": 46.4016, + "eval_samples_per_second": 21.551, + "eval_steps_per_second": 1.358, + "step": 1030 + }, + { + "epoch": 1.8377896613190732, + "grad_norm": 1.40646231174469, + "learning_rate": 5.35183349851338e-07, + "loss": 0.8745, + "step": 1031 + }, + { + "epoch": 1.839572192513369, + "grad_norm": 1.4662535190582275, + "learning_rate": 5.302279484638257e-07, + "loss": 0.8494, + "step": 1032 + }, + { + "epoch": 1.8413547237076648, + "grad_norm": 1.535057783126831, + "learning_rate": 5.252725470763132e-07, + "loss": 0.8445, + "step": 1033 + }, + { + "epoch": 1.843137254901961, + "grad_norm": 1.4149518013000488, + "learning_rate": 5.203171456888009e-07, + "loss": 0.8753, + "step": 1034 + }, + { + "epoch": 1.8449197860962567, + "grad_norm": 1.4421366453170776, + "learning_rate": 5.153617443012885e-07, + "loss": 0.8435, + "step": 1035 + }, + { + "epoch": 1.8467023172905526, + "grad_norm": 1.384452223777771, + "learning_rate": 5.104063429137761e-07, + "loss": 0.8768, + "step": 1036 + }, + { + "epoch": 1.8484848484848486, + "grad_norm": 1.4903335571289062, + "learning_rate": 5.054509415262637e-07, + "loss": 0.8501, + "step": 1037 + }, + { + "epoch": 1.8502673796791442, + "grad_norm": 1.3531508445739746, + "learning_rate": 5.004955401387513e-07, + 
"loss": 0.8837, + "step": 1038 + }, + { + "epoch": 1.8520499108734403, + "grad_norm": 1.486277461051941, + "learning_rate": 4.955401387512389e-07, + "loss": 0.853, + "step": 1039 + }, + { + "epoch": 1.8538324420677363, + "grad_norm": 1.4410507678985596, + "learning_rate": 4.905847373637266e-07, + "loss": 0.8464, + "step": 1040 + }, + { + "epoch": 1.8538324420677363, + "eval_loss": 0.9651731252670288, + "eval_runtime": 46.4455, + "eval_samples_per_second": 21.531, + "eval_steps_per_second": 1.356, + "step": 1040 + }, + { + "epoch": 1.855614973262032, + "grad_norm": 1.268168568611145, + "learning_rate": 4.856293359762141e-07, + "loss": 0.8265, + "step": 1041 + }, + { + "epoch": 1.857397504456328, + "grad_norm": 1.409671664237976, + "learning_rate": 4.806739345887018e-07, + "loss": 0.8374, + "step": 1042 + }, + { + "epoch": 1.8591800356506238, + "grad_norm": 1.6630103588104248, + "learning_rate": 4.7571853320118933e-07, + "loss": 0.8632, + "step": 1043 + }, + { + "epoch": 1.8609625668449197, + "grad_norm": 1.4358583688735962, + "learning_rate": 4.70763131813677e-07, + "loss": 0.8782, + "step": 1044 + }, + { + "epoch": 1.8627450980392157, + "grad_norm": 1.3741272687911987, + "learning_rate": 4.6580773042616455e-07, + "loss": 0.8455, + "step": 1045 + }, + { + "epoch": 1.8645276292335116, + "grad_norm": 1.2778054475784302, + "learning_rate": 4.608523290386522e-07, + "loss": 0.8218, + "step": 1046 + }, + { + "epoch": 1.8663101604278074, + "grad_norm": 1.3522720336914062, + "learning_rate": 4.5589692765113977e-07, + "loss": 0.8633, + "step": 1047 + }, + { + "epoch": 1.8680926916221035, + "grad_norm": 1.49508798122406, + "learning_rate": 4.5094152626362743e-07, + "loss": 0.8746, + "step": 1048 + }, + { + "epoch": 1.8698752228163993, + "grad_norm": 1.3672457933425903, + "learning_rate": 4.45986124876115e-07, + "loss": 0.8559, + "step": 1049 + }, + { + "epoch": 1.8716577540106951, + "grad_norm": 1.4118934869766235, + "learning_rate": 4.4103072348860265e-07, + "loss": 0.8701, + "step": 1050 + }, + { + "epoch": 1.8716577540106951, + "eval_loss": 0.9658851027488708, + "eval_runtime": 46.4763, + "eval_samples_per_second": 21.516, + "eval_steps_per_second": 1.356, + "step": 1050 + }, + { + "epoch": 1.8734402852049912, + "grad_norm": 1.3773859739303589, + "learning_rate": 4.360753221010902e-07, + "loss": 0.871, + "step": 1051 + }, + { + "epoch": 1.875222816399287, + "grad_norm": 1.3667681217193604, + "learning_rate": 4.3111992071357786e-07, + "loss": 0.8678, + "step": 1052 + }, + { + "epoch": 1.8770053475935828, + "grad_norm": 1.4455238580703735, + "learning_rate": 4.261645193260654e-07, + "loss": 0.8475, + "step": 1053 + }, + { + "epoch": 1.878787878787879, + "grad_norm": 1.5423219203948975, + "learning_rate": 4.212091179385531e-07, + "loss": 0.895, + "step": 1054 + }, + { + "epoch": 1.8805704099821747, + "grad_norm": 1.3430986404418945, + "learning_rate": 4.1625371655104064e-07, + "loss": 0.8616, + "step": 1055 + }, + { + "epoch": 1.8823529411764706, + "grad_norm": 1.4237287044525146, + "learning_rate": 4.112983151635283e-07, + "loss": 0.8642, + "step": 1056 + }, + { + "epoch": 1.8841354723707666, + "grad_norm": 1.3155893087387085, + "learning_rate": 4.063429137760159e-07, + "loss": 0.8269, + "step": 1057 + }, + { + "epoch": 1.8859180035650622, + "grad_norm": 1.390587329864502, + "learning_rate": 4.013875123885035e-07, + "loss": 0.8609, + "step": 1058 + }, + { + "epoch": 1.8877005347593583, + "grad_norm": 1.3156342506408691, + "learning_rate": 3.964321110009911e-07, + "loss": 0.8487, + "step": 1059 + }, 
+ { + "epoch": 1.8894830659536543, + "grad_norm": 1.3792142868041992, + "learning_rate": 3.9147670961347873e-07, + "loss": 0.8581, + "step": 1060 + }, + { + "epoch": 1.8894830659536543, + "eval_loss": 0.9655643701553345, + "eval_runtime": 46.4516, + "eval_samples_per_second": 21.528, + "eval_steps_per_second": 1.356, + "step": 1060 + }, + { + "epoch": 1.89126559714795, + "grad_norm": 1.407735824584961, + "learning_rate": 3.8652130822596634e-07, + "loss": 0.8542, + "step": 1061 + }, + { + "epoch": 1.893048128342246, + "grad_norm": 1.2963098287582397, + "learning_rate": 3.8156590683845395e-07, + "loss": 0.8487, + "step": 1062 + }, + { + "epoch": 1.8948306595365418, + "grad_norm": 1.3685920238494873, + "learning_rate": 3.7661050545094156e-07, + "loss": 0.8637, + "step": 1063 + }, + { + "epoch": 1.8966131907308377, + "grad_norm": 1.3968452215194702, + "learning_rate": 3.7165510406342917e-07, + "loss": 0.8487, + "step": 1064 + }, + { + "epoch": 1.8983957219251337, + "grad_norm": 1.358083724975586, + "learning_rate": 3.666997026759168e-07, + "loss": 0.8383, + "step": 1065 + }, + { + "epoch": 1.9001782531194296, + "grad_norm": 1.3680518865585327, + "learning_rate": 3.6174430128840444e-07, + "loss": 0.8727, + "step": 1066 + }, + { + "epoch": 1.9019607843137254, + "grad_norm": 1.481414794921875, + "learning_rate": 3.56788899900892e-07, + "loss": 0.8667, + "step": 1067 + }, + { + "epoch": 1.9037433155080214, + "grad_norm": 1.2826118469238281, + "learning_rate": 3.5183349851337966e-07, + "loss": 0.8814, + "step": 1068 + }, + { + "epoch": 1.9055258467023173, + "grad_norm": 1.2083613872528076, + "learning_rate": 3.468780971258672e-07, + "loss": 0.8351, + "step": 1069 + }, + { + "epoch": 1.9073083778966131, + "grad_norm": 1.3396915197372437, + "learning_rate": 3.419226957383549e-07, + "loss": 0.8462, + "step": 1070 + }, + { + "epoch": 1.9073083778966131, + "eval_loss": 0.9655903577804565, + "eval_runtime": 46.3914, + "eval_samples_per_second": 21.556, + "eval_steps_per_second": 1.358, + "step": 1070 + }, + { + "epoch": 1.9090909090909092, + "grad_norm": 1.2929346561431885, + "learning_rate": 3.3696729435084243e-07, + "loss": 0.8624, + "step": 1071 + }, + { + "epoch": 1.910873440285205, + "grad_norm": 1.4455654621124268, + "learning_rate": 3.320118929633301e-07, + "loss": 0.8553, + "step": 1072 + }, + { + "epoch": 1.9126559714795008, + "grad_norm": 1.4255995750427246, + "learning_rate": 3.2705649157581765e-07, + "loss": 0.8071, + "step": 1073 + }, + { + "epoch": 1.914438502673797, + "grad_norm": 1.3290305137634277, + "learning_rate": 3.221010901883053e-07, + "loss": 0.8566, + "step": 1074 + }, + { + "epoch": 1.9162210338680927, + "grad_norm": 1.3911030292510986, + "learning_rate": 3.1714568880079287e-07, + "loss": 0.8747, + "step": 1075 + }, + { + "epoch": 1.9180035650623886, + "grad_norm": 1.3865312337875366, + "learning_rate": 3.121902874132805e-07, + "loss": 0.854, + "step": 1076 + }, + { + "epoch": 1.9197860962566846, + "grad_norm": 1.332739233970642, + "learning_rate": 3.072348860257681e-07, + "loss": 0.8461, + "step": 1077 + }, + { + "epoch": 1.9215686274509802, + "grad_norm": 1.3937904834747314, + "learning_rate": 3.022794846382557e-07, + "loss": 0.8532, + "step": 1078 + }, + { + "epoch": 1.9233511586452763, + "grad_norm": 1.299120545387268, + "learning_rate": 2.973240832507433e-07, + "loss": 0.8327, + "step": 1079 + }, + { + "epoch": 1.9251336898395723, + "grad_norm": 1.3246283531188965, + "learning_rate": 2.923686818632309e-07, + "loss": 0.8591, + "step": 1080 + }, + { + "epoch": 
1.9251336898395723, + "eval_loss": 0.9655869603157043, + "eval_runtime": 46.5255, + "eval_samples_per_second": 21.494, + "eval_steps_per_second": 1.354, + "step": 1080 + }, + { + "epoch": 1.926916221033868, + "grad_norm": 1.4631216526031494, + "learning_rate": 2.874132804757186e-07, + "loss": 0.8395, + "step": 1081 + }, + { + "epoch": 1.928698752228164, + "grad_norm": 1.3864542245864868, + "learning_rate": 2.824578790882062e-07, + "loss": 0.8658, + "step": 1082 + }, + { + "epoch": 1.93048128342246, + "grad_norm": 1.5314630270004272, + "learning_rate": 2.775024777006938e-07, + "loss": 0.8729, + "step": 1083 + }, + { + "epoch": 1.9322638146167557, + "grad_norm": 1.379754662513733, + "learning_rate": 2.725470763131814e-07, + "loss": 0.8533, + "step": 1084 + }, + { + "epoch": 1.9340463458110517, + "grad_norm": 1.5379613637924194, + "learning_rate": 2.67591674925669e-07, + "loss": 0.8733, + "step": 1085 + }, + { + "epoch": 1.9358288770053476, + "grad_norm": 1.3273158073425293, + "learning_rate": 2.626362735381566e-07, + "loss": 0.8661, + "step": 1086 + }, + { + "epoch": 1.9376114081996434, + "grad_norm": 1.4625751972198486, + "learning_rate": 2.5768087215064423e-07, + "loss": 0.8725, + "step": 1087 + }, + { + "epoch": 1.9393939393939394, + "grad_norm": 1.4103567600250244, + "learning_rate": 2.5272547076313184e-07, + "loss": 0.8406, + "step": 1088 + }, + { + "epoch": 1.9411764705882353, + "grad_norm": 1.4515103101730347, + "learning_rate": 2.4777006937561945e-07, + "loss": 0.8759, + "step": 1089 + }, + { + "epoch": 1.9429590017825311, + "grad_norm": 1.4476619958877563, + "learning_rate": 2.4281466798810706e-07, + "loss": 0.8511, + "step": 1090 + }, + { + "epoch": 1.9429590017825311, + "eval_loss": 0.9651136994361877, + "eval_runtime": 46.6353, + "eval_samples_per_second": 21.443, + "eval_steps_per_second": 1.351, + "step": 1090 + }, + { + "epoch": 1.9447415329768272, + "grad_norm": 1.3023127317428589, + "learning_rate": 2.3785926660059467e-07, + "loss": 0.8293, + "step": 1091 + }, + { + "epoch": 1.946524064171123, + "grad_norm": 1.3094813823699951, + "learning_rate": 2.3290386521308227e-07, + "loss": 0.8309, + "step": 1092 + }, + { + "epoch": 1.9483065953654188, + "grad_norm": 1.3473302125930786, + "learning_rate": 2.2794846382556988e-07, + "loss": 0.8577, + "step": 1093 + }, + { + "epoch": 1.950089126559715, + "grad_norm": 1.3242158889770508, + "learning_rate": 2.229930624380575e-07, + "loss": 0.8603, + "step": 1094 + }, + { + "epoch": 1.9518716577540107, + "grad_norm": 1.3417590856552124, + "learning_rate": 2.180376610505451e-07, + "loss": 0.8566, + "step": 1095 + }, + { + "epoch": 1.9536541889483066, + "grad_norm": 1.3229726552963257, + "learning_rate": 2.130822596630327e-07, + "loss": 0.8592, + "step": 1096 + }, + { + "epoch": 1.9554367201426026, + "grad_norm": 1.2532265186309814, + "learning_rate": 2.0812685827552032e-07, + "loss": 0.8184, + "step": 1097 + }, + { + "epoch": 1.9572192513368984, + "grad_norm": 1.352526307106018, + "learning_rate": 2.0317145688800795e-07, + "loss": 0.8401, + "step": 1098 + }, + { + "epoch": 1.9590017825311943, + "grad_norm": 1.3978582620620728, + "learning_rate": 1.9821605550049556e-07, + "loss": 0.8697, + "step": 1099 + }, + { + "epoch": 1.9607843137254903, + "grad_norm": 1.4667596817016602, + "learning_rate": 1.9326065411298317e-07, + "loss": 0.8625, + "step": 1100 + }, + { + "epoch": 1.9607843137254903, + "eval_loss": 0.9646411538124084, + "eval_runtime": 46.485, + "eval_samples_per_second": 21.512, + "eval_steps_per_second": 1.355, + "step": 1100 + }, + { 
+ "epoch": 1.962566844919786, + "grad_norm": 1.5264577865600586, + "learning_rate": 1.8830525272547078e-07, + "loss": 0.8593, + "step": 1101 + }, + { + "epoch": 1.964349376114082, + "grad_norm": 1.3619505167007446, + "learning_rate": 1.833498513379584e-07, + "loss": 0.8339, + "step": 1102 + }, + { + "epoch": 1.966131907308378, + "grad_norm": 1.4145138263702393, + "learning_rate": 1.78394449950446e-07, + "loss": 0.8666, + "step": 1103 + }, + { + "epoch": 1.9679144385026737, + "grad_norm": 1.5150694847106934, + "learning_rate": 1.734390485629336e-07, + "loss": 0.8658, + "step": 1104 + }, + { + "epoch": 1.9696969696969697, + "grad_norm": 1.400248646736145, + "learning_rate": 1.6848364717542122e-07, + "loss": 0.8715, + "step": 1105 + }, + { + "epoch": 1.9714795008912656, + "grad_norm": 1.298861026763916, + "learning_rate": 1.6352824578790883e-07, + "loss": 0.8883, + "step": 1106 + }, + { + "epoch": 1.9732620320855614, + "grad_norm": 1.3148902654647827, + "learning_rate": 1.5857284440039643e-07, + "loss": 0.8369, + "step": 1107 + }, + { + "epoch": 1.9750445632798574, + "grad_norm": 1.4141972064971924, + "learning_rate": 1.5361744301288404e-07, + "loss": 0.8589, + "step": 1108 + }, + { + "epoch": 1.9768270944741533, + "grad_norm": 1.3640660047531128, + "learning_rate": 1.4866204162537165e-07, + "loss": 0.8505, + "step": 1109 + }, + { + "epoch": 1.9786096256684491, + "grad_norm": 1.3286265134811401, + "learning_rate": 1.437066402378593e-07, + "loss": 0.8301, + "step": 1110 + }, + { + "epoch": 1.9786096256684491, + "eval_loss": 0.9635146260261536, + "eval_runtime": 46.687, + "eval_samples_per_second": 21.419, + "eval_steps_per_second": 1.349, + "step": 1110 + }, + { + "epoch": 1.9803921568627452, + "grad_norm": 1.3741545677185059, + "learning_rate": 1.387512388503469e-07, + "loss": 0.8125, + "step": 1111 + }, + { + "epoch": 1.982174688057041, + "grad_norm": 1.324223518371582, + "learning_rate": 1.337958374628345e-07, + "loss": 0.8809, + "step": 1112 + }, + { + "epoch": 1.9839572192513368, + "grad_norm": 1.3497804403305054, + "learning_rate": 1.2884043607532211e-07, + "loss": 0.8602, + "step": 1113 + }, + { + "epoch": 1.985739750445633, + "grad_norm": 1.4052799940109253, + "learning_rate": 1.2388503468780972e-07, + "loss": 0.8791, + "step": 1114 + }, + { + "epoch": 1.9875222816399287, + "grad_norm": 1.344778060913086, + "learning_rate": 1.1892963330029733e-07, + "loss": 0.8234, + "step": 1115 + }, + { + "epoch": 1.9893048128342246, + "grad_norm": 1.3553035259246826, + "learning_rate": 1.1397423191278494e-07, + "loss": 0.8653, + "step": 1116 + }, + { + "epoch": 1.9910873440285206, + "grad_norm": 1.3913415670394897, + "learning_rate": 1.0901883052527255e-07, + "loss": 0.8166, + "step": 1117 + }, + { + "epoch": 1.9928698752228164, + "grad_norm": 1.4442999362945557, + "learning_rate": 1.0406342913776016e-07, + "loss": 0.8955, + "step": 1118 + }, + { + "epoch": 1.9946524064171123, + "grad_norm": 1.413307547569275, + "learning_rate": 9.910802775024778e-08, + "loss": 0.8523, + "step": 1119 + }, + { + "epoch": 1.9964349376114083, + "grad_norm": 1.5048857927322388, + "learning_rate": 9.415262636273539e-08, + "loss": 0.837, + "step": 1120 + }, + { + "epoch": 1.9964349376114083, + "eval_loss": 0.9632958173751831, + "eval_runtime": 46.6855, + "eval_samples_per_second": 21.42, + "eval_steps_per_second": 1.349, + "step": 1120 + }, + { + "epoch": 1.998217468805704, + "grad_norm": 1.2615009546279907, + "learning_rate": 8.9197224975223e-08, + "loss": 0.8477, + "step": 1121 + }, + { + "epoch": 2.0, + "grad_norm": 
1.3137235641479492, + "learning_rate": 8.424182358771061e-08, + "loss": 0.8359, + "step": 1122 + } + ], + "logging_steps": 1, + "max_steps": 1122, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.1698030233507594e+18, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +}
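
Note (not part of the checkpoint file itself): the `log_history` array above interleaves training entries (keyed by "loss") with evaluation entries (keyed by "eval_loss") every `eval_steps`. Below is a minimal, illustrative Python sketch for separating and plotting the two curves from a file like this one; the relative path and the use of matplotlib are assumptions for illustration, not anything prescribed by the checkpoint.

# Minimal sketch: read a trainer_state.json and plot train vs. eval loss.
# Assumes the file is available locally as "trainer_state.json".
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Training entries carry "loss"; evaluation entries carry "eval_loss".
train = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
evals = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

plt.plot(*zip(*train), label="train loss")
plt.plot(*zip(*evals), label="eval loss", marker="o")
plt.xlabel("step")
plt.ylabel("loss")
plt.legend()
plt.show()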