diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,4038 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.9994447529150472,
+  "eval_steps": 100,
+  "global_step": 2700,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0037016472330186935,
+      "grad_norm": 6.938235277319067,
+      "learning_rate": 1.8518518518518518e-07,
+      "loss": 1.5302,
+      "step": 5
+    },
+    {
+      "epoch": 0.007403294466037387,
+      "grad_norm": 7.268031041449709,
+      "learning_rate": 3.7037037037037036e-07,
+      "loss": 1.5099,
+      "step": 10
+    },
+    {
+      "epoch": 0.01110494169905608,
+      "grad_norm": 6.707361833146337,
+      "learning_rate": 5.555555555555555e-07,
+      "loss": 1.5182,
+      "step": 15
+    },
+    {
+      "epoch": 0.014806588932074774,
+      "grad_norm": 5.451093836504048,
+      "learning_rate": 7.407407407407407e-07,
+      "loss": 1.4956,
+      "step": 20
+    },
+    {
+      "epoch": 0.018508236165093468,
+      "grad_norm": 4.673928540068199,
+      "learning_rate": 9.259259259259259e-07,
+      "loss": 1.4582,
+      "step": 25
+    },
+    {
+      "epoch": 0.02220988339811216,
+      "grad_norm": 4.159325167616714,
+      "learning_rate": 1.111111111111111e-06,
+      "loss": 1.4219,
+      "step": 30
+    },
+    {
+      "epoch": 0.025911530631130855,
+      "grad_norm": 3.3344977569830063,
+      "learning_rate": 1.2962962962962962e-06,
+      "loss": 1.4247,
+      "step": 35
+    },
+    {
+      "epoch": 0.029613177864149548,
+      "grad_norm": 2.855470171195436,
+      "learning_rate": 1.4814814814814815e-06,
+      "loss": 1.421,
+      "step": 40
+    },
+    {
+      "epoch": 0.03331482509716824,
+      "grad_norm": 2.6416618989533287,
+      "learning_rate": 1.6666666666666667e-06,
+      "loss": 1.33,
+      "step": 45
+    },
+    {
+      "epoch": 0.037016472330186935,
+      "grad_norm": 2.263261354668403,
+      "learning_rate": 1.8518518518518519e-06,
+      "loss": 1.3312,
+      "step": 50
+    },
+    {
+      "epoch": 0.040718119563205625,
+      "grad_norm": 2.1013541579712753,
+      "learning_rate": 2.037037037037037e-06,
+      "loss": 1.3097,
+      "step": 55
+    },
+    {
+      "epoch": 0.04441976679622432,
+      "grad_norm": 1.9120786676503165,
+      "learning_rate": 2.222222222222222e-06,
+      "loss": 1.3118,
+      "step": 60
+    },
+    {
+      "epoch": 0.04812141402924301,
+      "grad_norm": 2.050461699793645,
+      "learning_rate": 2.4074074074074075e-06,
+      "loss": 1.2272,
+      "step": 65
+    },
+    {
+      "epoch": 0.05182306126226171,
+      "grad_norm": 1.8323934446281782,
+      "learning_rate": 2.5925925925925925e-06,
+      "loss": 1.218,
+      "step": 70
+    },
+    {
+      "epoch": 0.0555247084952804,
+      "grad_norm": 1.7538859492518115,
+      "learning_rate": 2.7777777777777783e-06,
+      "loss": 1.2018,
+      "step": 75
+    },
+    {
+      "epoch": 0.059226355728299096,
+      "grad_norm": 1.6618831514082206,
+      "learning_rate": 2.962962962962963e-06,
+      "loss": 1.1764,
+      "step": 80
+    },
+    {
+      "epoch": 0.06292800296131779,
+      "grad_norm": 1.574905727360327,
+      "learning_rate": 3.1481481481481483e-06,
+      "loss": 1.215,
+      "step": 85
+    },
+    {
+      "epoch": 0.06662965019433648,
+      "grad_norm": 1.5752273016779739,
+      "learning_rate": 3.3333333333333333e-06,
+      "loss": 1.182,
+      "step": 90
+    },
+    {
+      "epoch": 0.07033129742735518,
+      "grad_norm": 1.5042772619553242,
+      "learning_rate": 3.5185185185185187e-06,
+      "loss": 1.1873,
+      "step": 95
+    },
+    {
+      "epoch": 0.07403294466037387,
+      "grad_norm": 1.5685723198311041,
+      "learning_rate": 3.7037037037037037e-06,
+      "loss": 1.1626,
+      "step": 100
+    },
+    {
+      "epoch": 0.07403294466037387,
+      "eval_loss": 1.1988961696624756,
+      "eval_runtime": 8.7616,
+      "eval_samples_per_second": 58.437,
+      "eval_steps_per_second": 14.609,
+      "step": 100
+    },
+    {
+      "epoch": 0.07773459189339256,
+      "grad_norm": 1.6895282160559515,
+      "learning_rate": 3.88888888888889e-06,
+      "loss": 1.1575,
+      "step": 105
+    },
+    {
+      "epoch": 0.08143623912641125,
+      "grad_norm": 1.7296810758427557,
+      "learning_rate": 4.074074074074074e-06,
+      "loss": 1.1554,
+      "step": 110
+    },
+    {
+      "epoch": 0.08513788635942994,
+      "grad_norm": 1.6334197275112947,
+      "learning_rate": 4.2592592592592596e-06,
+      "loss": 1.1336,
+      "step": 115
+    },
+    {
+      "epoch": 0.08883953359244864,
+      "grad_norm": 1.6157911900138535,
+      "learning_rate": 4.444444444444444e-06,
+      "loss": 1.1568,
+      "step": 120
+    },
+    {
+      "epoch": 0.09254118082546733,
+      "grad_norm": 1.819386269418747,
+      "learning_rate": 4.62962962962963e-06,
+      "loss": 1.1508,
+      "step": 125
+    },
+    {
+      "epoch": 0.09624282805848602,
+      "grad_norm": 1.5300312016091477,
+      "learning_rate": 4.814814814814815e-06,
+      "loss": 1.1636,
+      "step": 130
+    },
+    {
+      "epoch": 0.09994447529150471,
+      "grad_norm": 1.6687523332204426,
+      "learning_rate": 5e-06,
+      "loss": 1.1528,
+      "step": 135
+    },
+    {
+      "epoch": 0.10364612252452342,
+      "grad_norm": 1.6037586197720166,
+      "learning_rate": 5.185185185185185e-06,
+      "loss": 1.1502,
+      "step": 140
+    },
+    {
+      "epoch": 0.10734776975754211,
+      "grad_norm": 1.6775275762359088,
+      "learning_rate": 5.370370370370371e-06,
+      "loss": 1.1469,
+      "step": 145
+    },
+    {
+      "epoch": 0.1110494169905608,
+      "grad_norm": 1.682996179151718,
+      "learning_rate": 5.555555555555557e-06,
+      "loss": 1.1636,
+      "step": 150
+    },
+    {
+      "epoch": 0.11475106422357949,
+      "grad_norm": 1.6277070415853683,
+      "learning_rate": 5.740740740740741e-06,
+      "loss": 1.1258,
+      "step": 155
+    },
+    {
+      "epoch": 0.11845271145659819,
+      "grad_norm": 1.553623964811703,
+      "learning_rate": 5.925925925925926e-06,
+      "loss": 1.1304,
+      "step": 160
+    },
+    {
+      "epoch": 0.12215435868961688,
+      "grad_norm": 1.588580332839279,
+      "learning_rate": 6.111111111111112e-06,
+      "loss": 1.1439,
+      "step": 165
+    },
+    {
+      "epoch": 0.12585600592263557,
+      "grad_norm": 1.6616684567469457,
+      "learning_rate": 6.296296296296297e-06,
+      "loss": 1.1079,
+      "step": 170
+    },
+    {
+      "epoch": 0.12955765315565426,
+      "grad_norm": 1.6762897748350518,
+      "learning_rate": 6.481481481481482e-06,
+      "loss": 1.0921,
+      "step": 175
+    },
+    {
+      "epoch": 0.13325930038867295,
+      "grad_norm": 1.6596422210586161,
+      "learning_rate": 6.666666666666667e-06,
+      "loss": 1.1092,
+      "step": 180
+    },
+    {
+      "epoch": 0.13696094762169164,
+      "grad_norm": 1.7722037854995745,
+      "learning_rate": 6.851851851851853e-06,
+      "loss": 1.1006,
+      "step": 185
+    },
+    {
+      "epoch": 0.14066259485471036,
+      "grad_norm": 1.7298048533999484,
+      "learning_rate": 7.0370370370370375e-06,
+      "loss": 1.1233,
+      "step": 190
+    },
+    {
+      "epoch": 0.14436424208772905,
+      "grad_norm": 1.98573216584534,
+      "learning_rate": 7.222222222222223e-06,
+      "loss": 1.088,
+      "step": 195
+    },
+    {
+      "epoch": 0.14806588932074774,
+      "grad_norm": 1.8587833627332044,
+      "learning_rate": 7.4074074074074075e-06,
+      "loss": 1.1218,
+      "step": 200
+    },
+    {
+      "epoch": 0.14806588932074774,
+      "eval_loss": 1.1345889568328857,
+      "eval_runtime": 11.8208,
+      "eval_samples_per_second": 43.313,
+      "eval_steps_per_second": 10.828,
+      "step": 200
+    },
+    {
+      "epoch": 0.15176753655376643,
+      "grad_norm": 1.7130597917326755,
+      "learning_rate": 7.592592592592594e-06,
+      "loss": 1.101,
+      "step": 205
+    },
+    {
+      "epoch": 0.15546918378678512,
+      "grad_norm": 1.663391530176506,
+      "learning_rate": 7.77777777777778e-06,
+      "loss": 1.1329,
+      "step": 210
+    },
+    {
+      "epoch": 0.1591708310198038,
+      "grad_norm": 1.5996505363015205,
+      "learning_rate": 7.962962962962963e-06,
+      "loss": 1.1075,
+      "step": 215
+    },
+    {
+      "epoch": 0.1628724782528225,
+      "grad_norm": 1.7038076686493917,
+      "learning_rate": 8.148148148148148e-06,
+      "loss": 1.1177,
+      "step": 220
+    },
+    {
+      "epoch": 0.1665741254858412,
+      "grad_norm": 1.6542551343222383,
+      "learning_rate": 8.333333333333334e-06,
+      "loss": 1.1093,
+      "step": 225
+    },
+    {
+      "epoch": 0.17027577271885988,
+      "grad_norm": 1.651873246968653,
+      "learning_rate": 8.518518518518519e-06,
+      "loss": 1.0843,
+      "step": 230
+    },
+    {
+      "epoch": 0.1739774199518786,
+      "grad_norm": 1.7198765328480365,
+      "learning_rate": 8.703703703703705e-06,
+      "loss": 1.0937,
+      "step": 235
+    },
+    {
+      "epoch": 0.1776790671848973,
+      "grad_norm": 1.6510428501911072,
+      "learning_rate": 8.888888888888888e-06,
+      "loss": 1.1111,
+      "step": 240
+    },
+    {
+      "epoch": 0.18138071441791598,
+      "grad_norm": 1.6213365368916086,
+      "learning_rate": 9.074074074074075e-06,
+      "loss": 1.0756,
+      "step": 245
+    },
+    {
+      "epoch": 0.18508236165093467,
+      "grad_norm": 1.7093203475906262,
+      "learning_rate": 9.25925925925926e-06,
+      "loss": 1.0694,
+      "step": 250
+    },
+    {
+      "epoch": 0.18878400888395336,
+      "grad_norm": 1.5228912650464108,
+      "learning_rate": 9.444444444444445e-06,
+      "loss": 1.1033,
+      "step": 255
+    },
+    {
+      "epoch": 0.19248565611697205,
+      "grad_norm": 1.673445899569891,
+      "learning_rate": 9.62962962962963e-06,
+      "loss": 1.0678,
+      "step": 260
+    },
+    {
+      "epoch": 0.19618730334999074,
+      "grad_norm": 1.6304796670467894,
+      "learning_rate": 9.814814814814815e-06,
+      "loss": 1.1158,
+      "step": 265
+    },
+    {
+      "epoch": 0.19988895058300943,
+      "grad_norm": 1.8244867074957254,
+      "learning_rate": 1e-05,
+      "loss": 1.1145,
+      "step": 270
+    },
+    {
+      "epoch": 0.20359059781602815,
+      "grad_norm": 1.6240030719245686,
+      "learning_rate": 9.999895536228031e-06,
+      "loss": 1.0802,
+      "step": 275
+    },
+    {
+      "epoch": 0.20729224504904684,
+      "grad_norm": 1.6243177291419117,
+      "learning_rate": 9.999582149277188e-06,
+      "loss": 1.0542,
+      "step": 280
+    },
+    {
+      "epoch": 0.21099389228206553,
+      "grad_norm": 1.629944565116536,
+      "learning_rate": 9.999059852242508e-06,
+      "loss": 1.0698,
+      "step": 285
+    },
+    {
+      "epoch": 0.21469553951508422,
+      "grad_norm": 1.5777459565221383,
+      "learning_rate": 9.998328666948437e-06,
+      "loss": 1.0681,
+      "step": 290
+    },
+    {
+      "epoch": 0.2183971867481029,
+      "grad_norm": 1.7250795726626251,
+      "learning_rate": 9.997388623947927e-06,
+      "loss": 1.0762,
+      "step": 295
+    },
+    {
+      "epoch": 0.2220988339811216,
+      "grad_norm": 1.9272571692059781,
+      "learning_rate": 9.996239762521152e-06,
+      "loss": 1.066,
+      "step": 300
+    },
+    {
+      "epoch": 0.2220988339811216,
+      "eval_loss": 1.1053733825683594,
+      "eval_runtime": 10.8836,
+      "eval_samples_per_second": 47.043,
+      "eval_steps_per_second": 11.761,
+      "step": 300
+    },
+    {
+      "epoch": 0.2258004812141403,
+      "grad_norm": 1.5120677355850114,
+      "learning_rate": 9.994882130673869e-06,
+      "loss": 1.0493,
+      "step": 305
+    },
+    {
+      "epoch": 0.22950212844715898,
+      "grad_norm": 1.6029522673182492,
+      "learning_rate": 9.993315785135417e-06,
+      "loss": 1.0682,
+      "step": 310
+    },
+    {
+      "epoch": 0.23320377568017767,
+      "grad_norm": 1.7123312053771258,
+      "learning_rate": 9.991540791356342e-06,
+      "loss": 1.0763,
+      "step": 315
+    },
+    {
+      "epoch": 0.23690542291319638,
+      "grad_norm": 1.5724741182176134,
+      "learning_rate": 9.989557223505661e-06,
+      "loss": 1.0401,
+      "step": 320
+    },
+    {
+      "epoch": 0.24060707014621507,
+      "grad_norm": 1.6045932069501792,
+      "learning_rate": 9.987365164467767e-06,
+      "loss": 1.0529,
+      "step": 325
+    },
+    {
+      "epoch": 0.24430871737923376,
+      "grad_norm": 1.8264353953198955,
+      "learning_rate": 9.98496470583896e-06,
+      "loss": 1.1018,
+      "step": 330
+    },
+    {
+      "epoch": 0.24801036461225245,
+      "grad_norm": 1.6022738272462664,
+      "learning_rate": 9.98235594792363e-06,
+      "loss": 1.0571,
+      "step": 335
+    },
+    {
+      "epoch": 0.25171201184527114,
+      "grad_norm": 1.590245570446662,
+      "learning_rate": 9.979538999730047e-06,
+      "loss": 1.0606,
+      "step": 340
+    },
+    {
+      "epoch": 0.25541365907828983,
+      "grad_norm": 1.6383785467577274,
+      "learning_rate": 9.976513978965829e-06,
+      "loss": 1.0662,
+      "step": 345
+    },
+    {
+      "epoch": 0.2591153063113085,
+      "grad_norm": 1.6184390160064126,
+      "learning_rate": 9.973281012033009e-06,
+      "loss": 1.0597,
+      "step": 350
+    },
+    {
+      "epoch": 0.2628169535443272,
+      "grad_norm": 1.6462129640549557,
+      "learning_rate": 9.96984023402275e-06,
+      "loss": 1.0455,
+      "step": 355
+    },
+    {
+      "epoch": 0.2665186007773459,
+      "grad_norm": 1.5771952400129876,
+      "learning_rate": 9.966191788709716e-06,
+      "loss": 1.0928,
+      "step": 360
+    },
+    {
+      "epoch": 0.2702202480103646,
+      "grad_norm": 1.6788673967555205,
+      "learning_rate": 9.962335828546049e-06,
+      "loss": 1.0652,
+      "step": 365
+    },
+    {
+      "epoch": 0.2739218952433833,
+      "grad_norm": 1.7130930928872348,
+      "learning_rate": 9.958272514655006e-06,
+      "loss": 1.0429,
+      "step": 370
+    },
+    {
+      "epoch": 0.277623542476402,
+      "grad_norm": 1.6689391337155577,
+      "learning_rate": 9.954002016824226e-06,
+      "loss": 1.0877,
+      "step": 375
+    },
+    {
+      "epoch": 0.2813251897094207,
+      "grad_norm": 1.690319301390716,
+      "learning_rate": 9.949524513498636e-06,
+      "loss": 1.0795,
+      "step": 380
+    },
+    {
+      "epoch": 0.2850268369424394,
+      "grad_norm": 1.572172606275347,
+      "learning_rate": 9.944840191772987e-06,
+      "loss": 1.0663,
+      "step": 385
+    },
+    {
+      "epoch": 0.2887284841754581,
+      "grad_norm": 1.5263034631804209,
+      "learning_rate": 9.939949247384046e-06,
+      "loss": 1.0851,
+      "step": 390
+    },
+    {
+      "epoch": 0.2924301314084768,
+      "grad_norm": 1.7211710040759145,
+      "learning_rate": 9.934851884702415e-06,
+      "loss": 1.0759,
+      "step": 395
+    },
+    {
+      "epoch": 0.2961317786414955,
+      "grad_norm": 1.6740011312182885,
+      "learning_rate": 9.929548316723983e-06,
+      "loss": 1.0549,
+      "step": 400
+    },
+    {
+      "epoch": 0.2961317786414955,
+      "eval_loss": 1.0882835388183594,
+      "eval_runtime": 8.7432,
+      "eval_samples_per_second": 58.56,
+      "eval_steps_per_second": 14.64,
+      "step": 400
+    },
+    {
+      "epoch": 0.29983342587451417,
+      "grad_norm": 1.6831994602911653,
+      "learning_rate": 9.924038765061042e-06,
+      "loss": 1.0567,
+      "step": 405
+    },
+    {
+      "epoch": 0.30353507310753286,
+      "grad_norm": 1.8233912351646488,
+      "learning_rate": 9.918323459933006e-06,
+      "loss": 1.043,
+      "step": 410
+    },
+    {
+      "epoch": 0.30723672034055155,
+      "grad_norm": 1.5346197193130573,
+      "learning_rate": 9.912402640156812e-06,
+      "loss": 1.067,
+      "step": 415
+    },
+    {
+      "epoch": 0.31093836757357024,
+      "grad_norm": 1.6970688002458603,
+      "learning_rate": 9.906276553136924e-06,
+      "loss": 1.0554,
+      "step": 420
+    },
+    {
+      "epoch": 0.31464001480658893,
+      "grad_norm": 1.7448280302843093,
+      "learning_rate": 9.899945454855007e-06,
+      "loss": 1.0667,
+      "step": 425
+    },
+    {
+      "epoch": 0.3183416620396076,
+      "grad_norm": 1.7073354168183785,
+      "learning_rate": 9.893409609859221e-06,
+      "loss": 1.0479,
+      "step": 430
+    },
+    {
+      "epoch": 0.3220433092726263,
+      "grad_norm": 1.5924364453552773,
+      "learning_rate": 9.886669291253178e-06,
+      "loss": 1.0556,
+      "step": 435
+    },
+    {
+      "epoch": 0.325744956505645,
+      "grad_norm": 1.6684326207150462,
+      "learning_rate": 9.879724780684518e-06,
+      "loss": 1.0508,
+      "step": 440
+    },
+    {
+      "epoch": 0.3294466037386637,
+      "grad_norm": 1.6967295571568946,
+      "learning_rate": 9.872576368333152e-06,
+      "loss": 1.0651,
+      "step": 445
+    },
+    {
+      "epoch": 0.3331482509716824,
+      "grad_norm": 1.562036282290576,
+      "learning_rate": 9.86522435289912e-06,
+      "loss": 1.0743,
+      "step": 450
+    },
+    {
+      "epoch": 0.33684989820470107,
+      "grad_norm": 1.6107399617591995,
+      "learning_rate": 9.857669041590135e-06,
+      "loss": 1.0358,
+      "step": 455
+    },
+    {
+      "epoch": 0.34055154543771976,
+      "grad_norm": 1.6404525122379863,
+      "learning_rate": 9.849910750108718e-06,
+      "loss": 1.0306,
+      "step": 460
+    },
+    {
+      "epoch": 0.3442531926707385,
+      "grad_norm": 1.5534884160132494,
+      "learning_rate": 9.841949802639031e-06,
+      "loss": 1.0229,
+      "step": 465
+    },
+    {
+      "epoch": 0.3479548399037572,
+      "grad_norm": 1.6683504420781063,
+      "learning_rate": 9.833786531833311e-06,
+      "loss": 1.0499,
+      "step": 470
+    },
+    {
+      "epoch": 0.3516564871367759,
+      "grad_norm": 1.6125669309723765,
+      "learning_rate": 9.825421278797984e-06,
+      "loss": 1.055,
+      "step": 475
+    },
+    {
+      "epoch": 0.3553581343697946,
+      "grad_norm": 1.5428815014911474,
+      "learning_rate": 9.816854393079402e-06,
+      "loss": 1.0221,
+      "step": 480
+    },
+    {
+      "epoch": 0.35905978160281327,
+      "grad_norm": 1.617284949544144,
+      "learning_rate": 9.808086232649246e-06,
+      "loss": 1.033,
+      "step": 485
+    },
+    {
+      "epoch": 0.36276142883583196,
+      "grad_norm": 1.5951767676523292,
+      "learning_rate": 9.79911716388956e-06,
+      "loss": 1.0782,
+      "step": 490
+    },
+    {
+      "epoch": 0.36646307606885065,
+      "grad_norm": 1.6581136429430168,
+      "learning_rate": 9.789947561577445e-06,
+      "loss": 1.0635,
+      "step": 495
+    },
+    {
+      "epoch": 0.37016472330186934,
+      "grad_norm": 1.6065188309294918,
+      "learning_rate": 9.7805778088694e-06,
+      "loss": 1.0669,
+      "step": 500
+    },
+    {
+      "epoch": 0.37016472330186934,
+      "eval_loss": 1.0748300552368164,
+      "eval_runtime": 8.6678,
+      "eval_samples_per_second": 59.07,
+      "eval_steps_per_second": 14.767,
+      "step": 500
+    },
+    {
+      "epoch": 0.373866370534888,
+      "grad_norm": 1.6616285045031425,
+      "learning_rate": 9.771008297285307e-06,
+      "loss": 1.0263,
+      "step": 505
+    },
+    {
+      "epoch": 0.3775680177679067,
+      "grad_norm": 1.6890553974206772,
+      "learning_rate": 9.761239426692077e-06,
+      "loss": 1.0379,
+      "step": 510
+    },
+    {
+      "epoch": 0.3812696650009254,
+      "grad_norm": 1.642371294327726,
+      "learning_rate": 9.75127160528694e-06,
+      "loss": 1.0363,
+      "step": 515
+    },
+    {
+      "epoch": 0.3849713122339441,
+      "grad_norm": 1.609971736170351,
+      "learning_rate": 9.741105249580383e-06,
+      "loss": 1.0148,
+      "step": 520
+    },
+    {
+      "epoch": 0.3886729594669628,
+      "grad_norm": 1.5388625837247312,
+      "learning_rate": 9.730740784378755e-06,
+      "loss": 1.045,
+      "step": 525
+    },
+    {
+      "epoch": 0.3923746066999815,
+      "grad_norm": 1.5888188137718782,
+      "learning_rate": 9.7201786427665e-06,
+      "loss": 1.0341,
+      "step": 530
+    },
+    {
+      "epoch": 0.39607625393300017,
+      "grad_norm": 1.4974736547619503,
+      "learning_rate": 9.709419266088086e-06,
+      "loss": 1.0573,
+      "step": 535
+    },
+    {
+      "epoch": 0.39977790116601886,
+      "grad_norm": 1.6247908776077464,
+      "learning_rate": 9.698463103929542e-06,
+      "loss": 1.0545,
+      "step": 540
+    },
+    {
+      "epoch": 0.40347954839903755,
+      "grad_norm": 1.6041126674957051,
+      "learning_rate": 9.687310614099676e-06,
+      "loss": 1.0513,
+      "step": 545
+    },
+    {
+      "epoch": 0.4071811956320563,
+      "grad_norm": 1.6658902497560741,
+      "learning_rate": 9.67596226261095e-06,
+      "loss": 1.031,
+      "step": 550
+    },
+    {
+      "epoch": 0.410882842865075,
+      "grad_norm": 1.6137694638926725,
+      "learning_rate": 9.664418523660004e-06,
+      "loss": 1.0359,
+      "step": 555
+    },
+    {
+      "epoch": 0.41458449009809367,
+      "grad_norm": 1.6095305212035607,
+      "learning_rate": 9.652679879607843e-06,
+      "loss": 1.0798,
+      "step": 560
+    },
+    {
+      "epoch": 0.41828613733111236,
+      "grad_norm": 3.8514579839708425,
+      "learning_rate": 9.640746820959684e-06,
+      "loss": 1.0199,
+      "step": 565
+    },
+    {
+      "epoch": 0.42198778456413105,
+      "grad_norm": 1.5890073416005945,
+      "learning_rate": 9.628619846344453e-06,
+      "loss": 1.0462,
+      "step": 570
+    },
+    {
+      "epoch": 0.42568943179714974,
+      "grad_norm": 1.510801346821129,
+      "learning_rate": 9.616299462493952e-06,
+      "loss": 1.0209,
+      "step": 575
+    },
+    {
+      "epoch": 0.42939107903016843,
+      "grad_norm": 1.7546849226440295,
+      "learning_rate": 9.603786184221693e-06,
+      "loss": 1.0521,
+      "step": 580
+    },
+    {
+      "epoch": 0.4330927262631871,
+      "grad_norm": 1.6762272283133195,
+      "learning_rate": 9.591080534401371e-06,
+      "loss": 1.0218,
+      "step": 585
+    },
+    {
+      "epoch": 0.4367943734962058,
+      "grad_norm": 1.6662530229421226,
+      "learning_rate": 9.578183043945031e-06,
+      "loss": 1.0593,
+      "step": 590
+    },
+    {
+      "epoch": 0.4404960207292245,
+      "grad_norm": 1.6167637404172526,
+      "learning_rate": 9.565094251780872e-06,
+      "loss": 1.069,
+      "step": 595
+    },
+    {
+      "epoch": 0.4441976679622432,
+      "grad_norm": 1.5708072893669605,
+      "learning_rate": 9.551814704830734e-06,
+      "loss": 1.0228,
+      "step": 600
+    },
+    {
+      "epoch": 0.4441976679622432,
+      "eval_loss": 1.064416766166687,
+      "eval_runtime": 8.755,
+      "eval_samples_per_second": 58.481,
+      "eval_steps_per_second": 14.62,
+      "step": 600
+    },
+    {
+      "epoch": 0.4478993151952619,
+      "grad_norm": 1.7242405674867898,
+      "learning_rate": 9.538344957987245e-06,
+      "loss": 1.0317,
+      "step": 605
+    },
+    {
+      "epoch": 0.4516009624282806,
+      "grad_norm": 1.6824094274691348,
+      "learning_rate": 9.524685574090627e-06,
+      "loss": 1.0295,
+      "step": 610
+    },
+    {
+      "epoch": 0.45530260966129926,
+      "grad_norm": 1.839445106245174,
+      "learning_rate": 9.51083712390519e-06,
+      "loss": 1.0313,
+      "step": 615
+    },
+    {
+      "epoch": 0.45900425689431795,
+      "grad_norm": 1.5181275348104526,
+      "learning_rate": 9.496800186095466e-06,
+      "loss": 1.0494,
+      "step": 620
+    },
+    {
+      "epoch": 0.46270590412733664,
+      "grad_norm": 1.6758656485484829,
+      "learning_rate": 9.482575347202047e-06,
+      "loss": 1.0054,
+      "step": 625
+    },
+    {
+      "epoch": 0.46640755136035533,
+      "grad_norm": 1.522464483917177,
+      "learning_rate": 9.468163201617063e-06,
+      "loss": 1.0111,
+      "step": 630
+    },
+    {
+      "epoch": 0.4701091985933741,
+      "grad_norm": 1.5939009948400995,
+      "learning_rate": 9.453564351559348e-06,
+      "loss": 1.0108,
+      "step": 635
+    },
+    {
+      "epoch": 0.47381084582639277,
+      "grad_norm": 1.7148818415747054,
+      "learning_rate": 9.438779407049282e-06,
+      "loss": 1.0196,
+      "step": 640
+    },
+    {
+      "epoch": 0.47751249305941146,
+      "grad_norm": 1.6494094050372314,
+      "learning_rate": 9.423808985883289e-06,
+      "loss": 1.0287,
+      "step": 645
+    },
+    {
+      "epoch": 0.48121414029243015,
+      "grad_norm": 1.6800798770987615,
+      "learning_rate": 9.40865371360804e-06,
+      "loss": 1.0222,
+      "step": 650
+    },
+    {
+      "epoch": 0.48491578752544884,
+      "grad_norm": 1.6487616779870438,
+      "learning_rate": 9.393314223494297e-06,
+      "loss": 1.0468,
+      "step": 655
+    },
+    {
+      "epoch": 0.48861743475846753,
+      "grad_norm": 1.5971477827325937,
+      "learning_rate": 9.377791156510456e-06,
+      "loss": 1.0257,
+      "step": 660
+    },
+    {
+      "epoch": 0.4923190819914862,
+      "grad_norm": 1.6349785933485341,
+      "learning_rate": 9.362085161295768e-06,
+      "loss": 1.0143,
+      "step": 665
+    },
+    {
+      "epoch": 0.4960207292245049,
+      "grad_norm": 1.7836047329450633,
+      "learning_rate": 9.346196894133239e-06,
+      "loss": 1.0279,
+      "step": 670
+    },
+    {
+      "epoch": 0.4997223764575236,
+      "grad_norm": 1.5554656111349356,
+      "learning_rate": 9.330127018922195e-06,
+      "loss": 1.025,
+      "step": 675
+    },
+    {
+      "epoch": 0.5034240236905423,
+      "grad_norm": 1.5873567496879941,
+      "learning_rate": 9.313876207150544e-06,
+      "loss": 1.0369,
+      "step": 680
+    },
+    {
+      "epoch": 0.507125670923561,
+      "grad_norm": 1.4901436339732956,
+      "learning_rate": 9.297445137866726e-06,
+      "loss": 1.0138,
+      "step": 685
+    },
+    {
+      "epoch": 0.5108273181565797,
+      "grad_norm": 1.5964192406854771,
+      "learning_rate": 9.280834497651334e-06,
+      "loss": 1.0254,
+      "step": 690
+    },
+    {
+      "epoch": 0.5145289653895984,
+      "grad_norm": 1.6347585768126325,
+      "learning_rate": 9.264044980588415e-06,
+      "loss": 1.0491,
+      "step": 695
+    },
+    {
+      "epoch": 0.518230612622617,
+      "grad_norm": 1.5001440241949608,
+      "learning_rate": 9.247077288236488e-06,
+      "loss": 1.0423,
+      "step": 700
+    },
+    {
+      "epoch": 0.518230612622617,
+      "eval_loss": 1.0569534301757812,
+      "eval_runtime": 8.6934,
+      "eval_samples_per_second": 58.895,
+      "eval_steps_per_second": 14.724,
+      "step": 700
+    },
+    {
+      "epoch": 0.5219322598556357,
+      "grad_norm": 1.6942056219401487,
+      "learning_rate": 9.229932129599206e-06,
+      "loss": 1.0227,
+      "step": 705
+    },
+    {
+      "epoch": 0.5256339070886544,
+      "grad_norm": 1.575360988478343,
+      "learning_rate": 9.212610221095748e-06,
+      "loss": 1.0232,
+      "step": 710
+    },
+    {
+      "epoch": 0.5293355543216731,
+      "grad_norm": 1.5434758990559243,
+      "learning_rate": 9.195112286530874e-06,
+      "loss": 0.9868,
+      "step": 715
+    },
+    {
+      "epoch": 0.5330372015546918,
+      "grad_norm": 1.5834059602030823,
+      "learning_rate": 9.177439057064684e-06,
+      "loss": 1.0066,
+      "step": 720
+    },
+    {
+      "epoch": 0.5367388487877105,
+      "grad_norm": 1.5583588394086199,
+      "learning_rate": 9.159591271182058e-06,
+      "loss": 1.0504,
+      "step": 725
+    },
+    {
+      "epoch": 0.5404404960207292,
+      "grad_norm": 1.5241057918795764,
+      "learning_rate": 9.141569674661816e-06,
+      "loss": 1.0459,
+      "step": 730
+    },
+    {
+      "epoch": 0.5441421432537479,
+      "grad_norm": 1.6179959479360446,
+      "learning_rate": 9.123375020545534e-06,
+      "loss": 1.0304,
+      "step": 735
+    },
+    {
+      "epoch": 0.5478437904867666,
+      "grad_norm": 1.7974028021647042,
+      "learning_rate": 9.105008069106093e-06,
+      "loss": 1.0442,
+      "step": 740
+    },
+    {
+      "epoch": 0.5515454377197853,
+      "grad_norm": 1.649050893416802,
+      "learning_rate": 9.086469587815904e-06,
+      "loss": 1.0086,
+      "step": 745
+    },
+    {
+      "epoch": 0.555247084952804,
+      "grad_norm": 1.614894669068509,
+      "learning_rate": 9.067760351314838e-06,
+      "loss": 1.0469,
+      "step": 750
+    },
+    {
+      "epoch": 0.5589487321858226,
+      "grad_norm": 1.520568884970898,
+      "learning_rate": 9.048881141377863e-06,
+      "loss": 1.0086,
+      "step": 755
+    },
+    {
+      "epoch": 0.5626503794188414,
+      "grad_norm": 1.5551225273423361,
+      "learning_rate": 9.029832746882372e-06,
+      "loss": 1.0065,
+      "step": 760
+    },
+    {
+      "epoch": 0.5663520266518601,
+      "grad_norm": 1.4730472068014824,
+      "learning_rate": 9.01061596377522e-06,
+      "loss": 1.0281,
+      "step": 765
+    },
+    {
+      "epoch": 0.5700536738848788,
+      "grad_norm": 1.6014116272335734,
+      "learning_rate": 8.991231595039464e-06,
+      "loss": 1.0151,
+      "step": 770
+    },
+    {
+      "epoch": 0.5737553211178975,
+      "grad_norm": 1.5967294946956374,
+      "learning_rate": 8.97168045066082e-06,
+      "loss": 1.0046,
+      "step": 775
+    },
+    {
+      "epoch": 0.5774569683509162,
+      "grad_norm": 1.506719942875412,
+      "learning_rate": 8.951963347593797e-06,
+      "loss": 0.9995,
+      "step": 780
+    },
+    {
+      "epoch": 0.5811586155839349,
+      "grad_norm": 1.6981243082186395,
+      "learning_rate": 8.932081109727582e-06,
+      "loss": 1.0185,
+      "step": 785
+    },
+    {
+      "epoch": 0.5848602628169536,
+      "grad_norm": 1.5548965922799378,
+      "learning_rate": 8.9120345678516e-06,
+      "loss": 1.035,
+      "step": 790
+    },
+    {
+      "epoch": 0.5885619100499723,
+      "grad_norm": 1.4696827174157245,
+      "learning_rate": 8.891824559620801e-06,
+      "loss": 0.9988,
+      "step": 795
+    },
+    {
+      "epoch": 0.592263557282991,
+      "grad_norm": 1.6048014454677177,
+      "learning_rate": 8.871451929520662e-06,
+      "loss": 1.0263,
+      "step": 800
+    },
+    {
+      "epoch": 0.592263557282991,
+      "eval_loss": 1.0503827333450317,
+      "eval_runtime": 8.6982,
+      "eval_samples_per_second": 58.863,
+      "eval_steps_per_second": 14.716,
+      "step": 800
+    },
+    {
+      "epoch": 0.5959652045160097,
+      "grad_norm": 1.6988170070957813,
+      "learning_rate": 8.8509175288319e-06,
+      "loss": 1.0232,
+      "step": 805
+    },
+    {
+      "epoch": 0.5996668517490283,
+      "grad_norm": 1.5659115399712737,
+      "learning_rate": 8.83022221559489e-06,
+      "loss": 1.0354,
+      "step": 810
+    },
+    {
+      "epoch": 0.603368498982047,
+      "grad_norm": 1.5221814047695017,
+      "learning_rate": 8.80936685457383e-06,
+      "loss": 1.0333,
+      "step": 815
+    },
+    {
+      "epoch": 0.6070701462150657,
+      "grad_norm": 1.562353756566854,
+      "learning_rate": 8.78835231722059e-06,
+      "loss": 1.0382,
+      "step": 820
+    },
+    {
+      "epoch": 0.6107717934480844,
+      "grad_norm": 1.490360737763961,
+      "learning_rate": 8.767179481638303e-06,
+      "loss": 1.0184,
+      "step": 825
+    },
+    {
+      "epoch": 0.6144734406811031,
+      "grad_norm": 1.5813235550962066,
+      "learning_rate": 8.74584923254468e-06,
+      "loss": 1.0046,
+      "step": 830
+    },
+    {
+      "epoch": 0.6181750879141218,
+      "grad_norm": 1.618903249194125,
+      "learning_rate": 8.72436246123503e-06,
+      "loss": 0.9847,
+      "step": 835
+    },
+    {
+      "epoch": 0.6218767351471405,
+      "grad_norm": 1.6051868684169093,
+      "learning_rate": 8.702720065545024e-06,
+      "loss": 1.0241,
+      "step": 840
+    },
+    {
+      "epoch": 0.6255783823801592,
+      "grad_norm": 1.6080381216679098,
+      "learning_rate": 8.680922949813177e-06,
+      "loss": 1.0455,
+      "step": 845
+    },
+    {
+      "epoch": 0.6292800296131779,
+      "grad_norm": 1.5398170796291812,
+      "learning_rate": 8.658972024843063e-06,
+      "loss": 1.0123,
+      "step": 850
+    },
+    {
+      "epoch": 0.6329816768461966,
+      "grad_norm": 1.5079959968532555,
+      "learning_rate": 8.636868207865244e-06,
+      "loss": 1.0012,
+      "step": 855
+    },
+    {
+      "epoch": 0.6366833240792152,
+      "grad_norm": 1.5108165451129463,
+      "learning_rate": 8.614612422498965e-06,
+      "loss": 0.987,
+      "step": 860
+    },
+    {
+      "epoch": 0.6403849713122339,
+      "grad_norm": 1.5072453921526188,
+      "learning_rate": 8.592205598713539e-06,
+      "loss": 1.0063,
+      "step": 865
+    },
+    {
+      "epoch": 0.6440866185452526,
+      "grad_norm": 1.5236818405854384,
+      "learning_rate": 8.569648672789496e-06,
+      "loss": 1.0531,
+      "step": 870
+    },
+    {
+      "epoch": 0.6477882657782713,
+      "grad_norm": 1.6365347434254418,
+      "learning_rate": 8.546942587279465e-06,
+      "loss": 1.0125,
+      "step": 875
+    },
+    {
+      "epoch": 0.65148991301129,
+      "grad_norm": 1.4635136559877178,
+      "learning_rate": 8.524088290968781e-06,
+      "loss": 1.0235,
+      "step": 880
+    },
+    {
+      "epoch": 0.6551915602443087,
+      "grad_norm": 1.6403726541339527,
+      "learning_rate": 8.501086738835843e-06,
+      "loss": 1.017,
+      "step": 885
+    },
+    {
+      "epoch": 0.6588932074773274,
+      "grad_norm": 1.593396456570902,
+      "learning_rate": 8.477938892012209e-06,
+      "loss": 1.0169,
+      "step": 890
+    },
+    {
+      "epoch": 0.6625948547103461,
+      "grad_norm": 1.5617836951270059,
+      "learning_rate": 8.45464571774244e-06,
+      "loss": 1.0362,
+      "step": 895
+    },
+    {
+      "epoch": 0.6662965019433648,
+      "grad_norm": 1.5971242667528898,
+      "learning_rate": 8.43120818934367e-06,
+      "loss": 1.0085,
+      "step": 900
+    },
+    {
+      "epoch": 0.6662965019433648,
+      "eval_loss": 1.0438976287841797,
+      "eval_runtime": 8.7306,
+      "eval_samples_per_second": 58.644,
+      "eval_steps_per_second": 14.661,
+      "step": 900
+    },
+    {
+      "epoch": 0.6699981491763835,
+      "grad_norm": 1.515185532674787,
+      "learning_rate": 8.407627286164948e-06,
+      "loss": 1.0029,
+      "step": 905
+    },
+    {
+      "epoch": 0.6736997964094021,
+      "grad_norm": 1.5243850755391344,
+      "learning_rate": 8.38390399354631e-06,
+      "loss": 1.0266,
+      "step": 910
+    },
+    {
+      "epoch": 0.6774014436424208,
+      "grad_norm": 1.580588659304687,
+      "learning_rate": 8.360039302777614e-06,
+      "loss": 1.028,
+      "step": 915
+    },
+    {
+      "epoch": 0.6811030908754395,
+      "grad_norm": 1.5511561732012566,
+      "learning_rate": 8.336034211057098e-06,
+      "loss": 1.0203,
+      "step": 920
+    },
+    {
+      "epoch": 0.6848047381084582,
+      "grad_norm": 1.5317152606248665,
+      "learning_rate": 8.31188972144974e-06,
+      "loss": 1.0222,
+      "step": 925
+    },
+    {
+      "epoch": 0.688506385341477,
+      "grad_norm": 1.640160330004564,
+      "learning_rate": 8.28760684284532e-06,
+      "loss": 0.9964,
+      "step": 930
+    },
+    {
+      "epoch": 0.6922080325744957,
+      "grad_norm": 1.5621396147571478,
+      "learning_rate": 8.263186589916273e-06,
+      "loss": 1.0085,
+      "step": 935
+    },
+    {
+      "epoch": 0.6959096798075144,
+      "grad_norm": 1.4901980514863815,
+      "learning_rate": 8.238629983075296e-06,
+      "loss": 1.0224,
+      "step": 940
+    },
+    {
+      "epoch": 0.6996113270405331,
+      "grad_norm": 1.5553443210091746,
+      "learning_rate": 8.213938048432697e-06,
+      "loss": 1.0073,
+      "step": 945
+    },
+    {
+      "epoch": 0.7033129742735518,
+      "grad_norm": 1.5439376225728318,
+      "learning_rate": 8.18911181775353e-06,
+      "loss": 1.0071,
+      "step": 950
+    },
+    {
+      "epoch": 0.7070146215065705,
+      "grad_norm": 1.5106567899403467,
+      "learning_rate": 8.164152328414476e-06,
+      "loss": 1.0113,
+      "step": 955
+    },
+    {
+      "epoch": 0.7107162687395892,
+      "grad_norm": 1.6943973028544896,
+      "learning_rate": 8.139060623360494e-06,
+      "loss": 0.9877,
+      "step": 960
+    },
+    {
+      "epoch": 0.7144179159726078,
+      "grad_norm": 1.6055362835498677,
+      "learning_rate": 8.113837751061246e-06,
+      "loss": 1.003,
+      "step": 965
+    },
+    {
+      "epoch": 0.7181195632056265,
+      "grad_norm": 1.6279162631191268,
+      "learning_rate": 8.088484765467286e-06,
+      "loss": 1.0049,
+      "step": 970
+    },
+    {
+      "epoch": 0.7218212104386452,
+      "grad_norm": 1.6604430854839634,
+      "learning_rate": 8.063002725966014e-06,
+      "loss": 1.0049,
+      "step": 975
+    },
+    {
+      "epoch": 0.7255228576716639,
+      "grad_norm": 1.5017388079752865,
+      "learning_rate": 8.037392697337418e-06,
+      "loss": 1.0115,
+      "step": 980
+    },
+    {
+      "epoch": 0.7292245049046826,
+      "grad_norm": 1.6030200207735454,
+      "learning_rate": 8.011655749709575e-06,
+      "loss": 1.0044,
+      "step": 985
+    },
+    {
+      "epoch": 0.7329261521377013,
+      "grad_norm": 1.6579368837511645,
+      "learning_rate": 7.985792958513932e-06,
+      "loss": 1.0342,
+      "step": 990
+    },
+    {
+      "epoch": 0.73662779937072,
+      "grad_norm": 1.5292653573625021,
+      "learning_rate": 7.95980540444038e-06,
+      "loss": 1.011,
+      "step": 995
+    },
+    {
+      "epoch": 0.7403294466037387,
+      "grad_norm": 1.681691332550864,
+      "learning_rate": 7.93369417339209e-06,
+      "loss": 0.9926,
+      "step": 1000
+    },
+    {
+      "epoch": 0.7403294466037387,
+      "eval_loss": 1.0382704734802246,
+      "eval_runtime": 8.6988,
+      "eval_samples_per_second": 58.859,
+      "eval_steps_per_second": 14.715,
+      "step": 1000
+    },
+    {
+      "epoch": 0.7440310938367574,
+      "grad_norm": 1.6752177251862725,
+      "learning_rate": 7.907460356440133e-06,
+      "loss": 0.982,
+      "step": 1005
+    },
+    {
+      "epoch": 0.747732741069776,
+      "grad_norm": 1.6158791648303268,
+      "learning_rate": 7.881105049777902e-06,
+      "loss": 1.0065,
+      "step": 1010
+    },
+    {
+      "epoch": 0.7514343883027947,
+      "grad_norm": 1.5008165210546065,
+      "learning_rate": 7.854629354675292e-06,
+      "loss": 0.9998,
+      "step": 1015
+    },
+    {
+      "epoch": 0.7551360355358134,
+      "grad_norm": 1.5488208344349068,
+      "learning_rate": 7.828034377432694e-06,
+      "loss": 1.0172,
+      "step": 1020
+    },
+    {
+      "epoch": 0.7588376827688321,
+      "grad_norm": 1.469701423604682,
+      "learning_rate": 7.801321229334764e-06,
+      "loss": 1.0002,
+      "step": 1025
+    },
+    {
+      "epoch": 0.7625393300018508,
+      "grad_norm": 1.4531574430918555,
+      "learning_rate": 7.774491026603985e-06,
+      "loss": 1.0055,
+      "step": 1030
+    },
+    {
+      "epoch": 0.7662409772348695,
+      "grad_norm": 1.6900598064531231,
+      "learning_rate": 7.747544890354031e-06,
+      "loss": 1.0192,
+      "step": 1035
+    },
+    {
+      "epoch": 0.7699426244678882,
+      "grad_norm": 1.558014346947998,
+      "learning_rate": 7.720483946542913e-06,
+      "loss": 1.0013,
+      "step": 1040
+    },
+    {
+      "epoch": 0.7736442717009069,
+      "grad_norm": 1.5500371825585382,
+      "learning_rate": 7.69330932592594e-06,
+      "loss": 0.9878,
+      "step": 1045
+    },
+    {
+      "epoch": 0.7773459189339256,
+      "grad_norm": 1.5367469436528767,
+      "learning_rate": 7.666022164008458e-06,
+      "loss": 0.9858,
+      "step": 1050
+    },
+    {
+      "epoch": 0.7810475661669443,
+      "grad_norm": 1.4601423824350541,
+      "learning_rate": 7.638623600998409e-06,
+      "loss": 1.0007,
+      "step": 1055
+    },
+    {
+      "epoch": 0.784749213399963,
+      "grad_norm": 1.648616874802218,
+      "learning_rate": 7.6111147817586925e-06,
+      "loss": 0.9931,
+      "step": 1060
+    },
+    {
+      "epoch": 0.7884508606329816,
+      "grad_norm": 1.5849178244300517,
+      "learning_rate": 7.5834968557593155e-06,
+      "loss": 1.0205,
+      "step": 1065
+    },
+    {
+      "epoch": 0.7921525078660003,
+      "grad_norm": 1.5867119629960735,
+      "learning_rate": 7.5557709770293664e-06,
+      "loss": 1.0306,
+      "step": 1070
+    },
+    {
+      "epoch": 0.795854155099019,
+      "grad_norm": 1.5641079232184893,
+      "learning_rate": 7.527938304108795e-06,
+      "loss": 1.0229,
+      "step": 1075
+    },
+    {
+      "epoch": 0.7995558023320377,
+      "grad_norm": 1.5590165376402654,
+      "learning_rate": 7.500000000000001e-06,
+      "loss": 0.9696,
+      "step": 1080
+    },
+    {
+      "epoch": 0.8032574495650564,
+      "grad_norm": 1.4736677882280342,
+      "learning_rate": 7.471957232119235e-06,
+      "loss": 0.9958,
+      "step": 1085
+    },
+    {
+      "epoch": 0.8069590967980751,
+      "grad_norm": 1.495242221963721,
+      "learning_rate": 7.443811172247822e-06,
+      "loss": 0.9797,
+      "step": 1090
+    },
+    {
+      "epoch": 0.8106607440310938,
+      "grad_norm": 1.5646379206160475,
+      "learning_rate": 7.415562996483193e-06,
+      "loss": 1.0331,
+      "step": 1095
+    },
+    {
+      "epoch": 0.8143623912641126,
+      "grad_norm": 1.4867013854186002,
+      "learning_rate": 7.387213885189746e-06,
+      "loss": 0.9972,
+      "step": 1100
+    },
+    {
+      "epoch": 0.8143623912641126,
+      "eval_loss": 1.0345816612243652,
+      "eval_runtime": 8.7672,
+      "eval_samples_per_second": 58.399,
+      "eval_steps_per_second": 14.6,
+      "step": 1100
+    },
+    {
+      "epoch": 0.8180640384971313,
+      "grad_norm": 1.6986654279574274,
+      "learning_rate": 7.358765022949519e-06,
+      "loss": 1.0015,
+      "step": 1105
+    },
+    {
+      "epoch": 0.82176568573015,
+      "grad_norm": 1.4979227472262977,
+      "learning_rate": 7.330217598512696e-06,
+      "loss": 0.9918,
+      "step": 1110
+    },
+    {
+      "epoch": 0.8254673329631687,
+      "grad_norm": 1.4409780655786124,
+      "learning_rate": 7.30157280474793e-06,
+      "loss": 0.9759,
+      "step": 1115
+    },
+    {
+      "epoch": 0.8291689801961873,
+      "grad_norm": 1.4674103925105133,
+      "learning_rate": 7.2728318385925035e-06,
+      "loss": 0.9865,
+      "step": 1120
+    },
+    {
+      "epoch": 0.832870627429206,
+      "grad_norm": 1.605252575982056,
+      "learning_rate": 7.243995901002312e-06,
+      "loss": 0.9832,
+      "step": 1125
+    },
+    {
+      "epoch": 0.8365722746622247,
+      "grad_norm": 1.68151860441419,
+      "learning_rate": 7.215066196901676e-06,
+      "loss": 1.0105,
+      "step": 1130
+    },
+    {
+      "epoch": 0.8402739218952434,
+      "grad_norm": 1.5248287910415732,
+      "learning_rate": 7.186043935133005e-06,
+      "loss": 1.0174,
+      "step": 1135
+    },
+    {
+      "epoch": 0.8439755691282621,
+      "grad_norm": 1.5354066430867073,
+      "learning_rate": 7.156930328406268e-06,
+      "loss": 0.9881,
+      "step": 1140
+    },
+    {
+      "epoch": 0.8476772163612808,
+      "grad_norm": 1.5525671237362146,
+      "learning_rate": 7.127726593248337e-06,
+      "loss": 0.9895,
+      "step": 1145
+    },
+    {
+      "epoch": 0.8513788635942995,
+      "grad_norm": 1.5174388784248503,
+      "learning_rate": 7.098433949952146e-06,
+      "loss": 0.9887,
+      "step": 1150
+    },
+    {
+      "epoch": 0.8550805108273182,
+      "grad_norm": 1.5315674808308293,
+      "learning_rate": 7.069053622525697e-06,
+      "loss": 0.9502,
+      "step": 1155
+    },
+    {
+      "epoch": 0.8587821580603369,
+      "grad_norm": 1.4149587100662266,
+      "learning_rate": 7.039586838640918e-06,
+      "loss": 1.0154,
+      "step": 1160
+    },
+    {
+      "epoch": 0.8624838052933556,
+      "grad_norm": 1.5648914937353318,
+      "learning_rate": 7.0100348295823706e-06,
+      "loss": 0.988,
+      "step": 1165
+    },
+    {
+      "epoch": 0.8661854525263742,
+      "grad_norm": 1.5539675815083542,
+      "learning_rate": 6.980398830195785e-06,
+      "loss": 1.0112,
+      "step": 1170
+    },
+    {
+      "epoch": 0.8698870997593929,
+      "grad_norm": 1.421648283396261,
+      "learning_rate": 6.950680078836475e-06,
+      "loss": 1.0135,
+      "step": 1175
+    },
+    {
+      "epoch": 0.8735887469924116,
+      "grad_norm": 1.4464001727685705,
+      "learning_rate": 6.920879817317588e-06,
+      "loss": 0.9909,
+      "step": 1180
+    },
+    {
+      "epoch": 0.8772903942254303,
+      "grad_norm": 1.4937571669427057,
+      "learning_rate": 6.890999290858213e-06,
+      "loss": 0.9702,
+      "step": 1185
+    },
+    {
+      "epoch": 0.880992041458449,
+      "grad_norm": 1.4509930616773992,
+      "learning_rate": 6.861039748031351e-06,
+      "loss": 1.0,
+      "step": 1190
+    },
+    {
+      "epoch": 0.8846936886914677,
+      "grad_norm": 1.4945840852689367,
+      "learning_rate": 6.8310024407117405e-06,
+      "loss": 1.0049,
+      "step": 1195
+    },
+    {
+      "epoch": 0.8883953359244864,
+      "grad_norm": 1.4989120949638919,
+      "learning_rate": 6.800888624023552e-06,
+      "loss": 0.9729,
+      "step": 1200
+    },
+    {
+      "epoch": 0.8883953359244864,
+      "eval_loss": 1.0306613445281982,
+      "eval_runtime": 8.685,
+      "eval_samples_per_second": 58.952,
+      "eval_steps_per_second": 14.738,
+      "step": 1200
+    },
+    {
+      "epoch": 0.8920969831575051,
+      "grad_norm": 1.4664191520171954,
+      "learning_rate": 6.770699556287939e-06,
+      "loss": 1.0008,
+      "step": 1205
+    },
+    {
+      "epoch": 0.8957986303905238,
+      "grad_norm": 1.5754399133388788,
+      "learning_rate": 6.740436498970453e-06,
+      "loss": 1.0126,
+      "step": 1210
+    },
+    {
+      "epoch": 0.8995002776235425,
+      "grad_norm": 1.60094861067482,
+      "learning_rate": 6.710100716628345e-06,
+      "loss": 1.0,
+      "step": 1215
+    },
+    {
+      "epoch": 0.9032019248565611,
+      "grad_norm": 1.4595385666093144,
+      "learning_rate": 6.679693476857712e-06,
+      "loss": 0.9861,
+      "step": 1220
+    },
+    {
+      "epoch": 0.9069035720895798,
+      "grad_norm": 1.5022484134921414,
+      "learning_rate": 6.649216050240539e-06,
+      "loss": 1.0129,
+      "step": 1225
+    },
+    {
+      "epoch": 0.9106052193225985,
+      "grad_norm": 1.4991474920735144,
+      "learning_rate": 6.618669710291607e-06,
+      "loss": 0.9906,
+      "step": 1230
+    },
+    {
+      "epoch": 0.9143068665556172,
+      "grad_norm": 1.9098143096411326,
+      "learning_rate": 6.588055733405266e-06,
+      "loss": 0.996,
+      "step": 1235
+    },
+    {
+      "epoch": 0.9180085137886359,
+      "grad_norm": 1.582034170801525,
+      "learning_rate": 6.557375398802124e-06,
+      "loss": 1.01,
+      "step": 1240
+    },
+    {
+      "epoch": 0.9217101610216546,
+      "grad_norm": 1.541103799700962,
+      "learning_rate": 6.526629988475567e-06,
+      "loss": 0.9986,
+      "step": 1245
+    },
+    {
+      "epoch": 0.9254118082546733,
+      "grad_norm": 1.7275592833521085,
+      "learning_rate": 6.495820787138209e-06,
+      "loss": 1.0112,
+      "step": 1250
+    },
+    {
+      "epoch": 0.929113455487692,
+      "grad_norm": 1.6073614158652252,
+      "learning_rate": 6.4649490821682035e-06,
+      "loss": 1.0329,
+      "step": 1255
+    },
+    {
+      "epoch": 0.9328151027207107,
+      "grad_norm": 1.520357101228401,
+      "learning_rate": 6.434016163555452e-06,
+      "loss": 0.9895,
+      "step": 1260
+    },
+    {
+      "epoch": 0.9365167499537294,
+      "grad_norm": 1.5144821625571732,
+      "learning_rate": 6.403023323847695e-06,
+      "loss": 0.9797,
+      "step": 1265
+    },
+    {
+      "epoch": 0.9402183971867482,
+      "grad_norm": 1.6223270323749646,
+      "learning_rate": 6.371971858096509e-06,
+      "loss": 1.0114,
+      "step": 1270
+    },
+    {
+      "epoch": 0.9439200444197668,
+      "grad_norm": 1.4650778858538542,
+      "learning_rate": 6.340863063803187e-06,
+      "loss": 1.0004,
+      "step": 1275
+    },
+    {
+      "epoch": 0.9476216916527855,
+      "grad_norm": 1.7363358348857403,
+      "learning_rate": 6.30969824086453e-06,
+      "loss": 0.9973,
+      "step": 1280
+    },
+    {
+      "epoch": 0.9513233388858042,
+      "grad_norm": 1.5673091008974074,
+      "learning_rate": 6.278478691518519e-06,
+      "loss": 0.9903,
+      "step": 1285
+    },
+    {
+      "epoch": 0.9550249861188229,
+      "grad_norm": 1.5737677986125558,
+      "learning_rate": 6.247205720289907e-06,
+      "loss": 1.0283,
+      "step": 1290
+    },
+    {
+      "epoch": 0.9587266333518416,
+      "grad_norm": 1.490823774845133,
+      "learning_rate": 6.215880633935709e-06,
+      "loss": 1.005,
+      "step": 1295
+    },
+    {
+      "epoch": 0.9624282805848603,
+      "grad_norm": 1.5221265311599004,
+      "learning_rate": 6.184504741390596e-06,
+      "loss": 1.016,
+      "step": 1300
+    },
+    {
+      "epoch": 0.9624282805848603,
+      "eval_loss": 1.0261740684509277,
+      "eval_runtime": 8.6903,
+      "eval_samples_per_second": 58.916,
+      "eval_steps_per_second": 14.729,
+      "step": 1300
+    },
+    {
+      "epoch": 0.966129927817879,
+      "grad_norm": 1.5497782760360854,
+      "learning_rate": 6.153079353712201e-06,
+      "loss": 0.9538,
+      "step": 1305
+    },
+    {
+      "epoch": 0.9698315750508977,
+      "grad_norm": 1.4891969599026476,
+      "learning_rate": 6.121605784026339e-06,
+      "loss": 1.0251,
+      "step": 1310
+    },
+    {
+      "epoch": 0.9735332222839164,
+      "grad_norm": 1.5148913492479763,
+      "learning_rate": 6.09008534747213e-06,
+      "loss": 0.9565,
+      "step": 1315
+    },
+    {
+      "epoch": 0.9772348695169351,
+      "grad_norm": 1.4933554149502895,
+      "learning_rate": 6.058519361147055e-06,
+      "loss": 0.9968,
+      "step": 1320
+    },
+    {
+      "epoch": 0.9809365167499537,
+      "grad_norm": 1.4313288190909095,
+      "learning_rate": 6.02690914405191e-06,
+      "loss": 0.9423,
+      "step": 1325
+    },
+    {
+      "epoch": 0.9846381639829724,
+      "grad_norm": 1.5414508677830214,
+      "learning_rate": 5.995256017035703e-06,
+      "loss": 0.9976,
+      "step": 1330
+    },
+    {
+      "epoch": 0.9883398112159911,
+      "grad_norm": 1.5902685807729529,
+      "learning_rate": 5.9635613027404495e-06,
+      "loss": 1.0087,
+      "step": 1335
+    },
+    {
+      "epoch": 0.9920414584490098,
+      "grad_norm": 1.4804073390294674,
+      "learning_rate": 5.931826325545912e-06,
+      "loss": 0.9951,
+      "step": 1340
+    },
+    {
+      "epoch": 0.9957431056820285,
+      "grad_norm": 1.5203977933330328,
+      "learning_rate": 5.900052411514257e-06,
+      "loss": 0.9927,
+      "step": 1345
+    },
+    {
+      "epoch": 0.9994447529150472,
+      "grad_norm": 1.4592290182162446,
+      "learning_rate": 5.8682408883346535e-06,
+      "loss": 0.9849,
+      "step": 1350
+    },
+    {
+      "epoch": 1.0037016472330187,
+      "grad_norm": 1.6078346951337656,
+      "learning_rate": 5.836393085267777e-06,
+      "loss": 1.1427,
+      "step": 1355
+    },
+    {
+      "epoch": 1.0074032944660374,
+      "grad_norm": 1.6427326520740766,
+      "learning_rate": 5.804510333090287e-06,
+      "loss": 0.952,
+      "step": 1360
+    },
+    {
+      "epoch": 1.011104941699056,
+      "grad_norm": 1.5456535118472365,
+      "learning_rate": 5.772593964039203e-06,
+      "loss": 0.9688,
+      "step": 1365
+    },
+    {
+      "epoch": 1.0148065889320748,
+      "grad_norm": 1.5530197411372806,
+      "learning_rate": 5.740645311756246e-06,
+      "loss": 0.964,
+      "step": 1370
+    },
+    {
+      "epoch": 1.0185082361650935,
+      "grad_norm": 1.4947763198029176,
+      "learning_rate": 5.708665711232103e-06,
+      "loss": 0.9444,
+      "step": 1375
+    },
+    {
+      "epoch": 1.0222098833981121,
+      "grad_norm": 1.494845190296128,
+      "learning_rate": 5.6766564987506564e-06,
+      "loss": 0.9677,
+      "step": 1380
+    },
+    {
+      "epoch": 1.0259115306311308,
+      "grad_norm": 1.4684775854511547,
+      "learning_rate": 5.644619011833134e-06,
+      "loss": 0.9446,
+      "step": 1385
+    },
+    {
+      "epoch": 1.0296131778641495,
+      "grad_norm": 1.5375658212489818,
+      "learning_rate": 5.612554589182228e-06,
+      "loss": 0.9389,
+      "step": 1390
+    },
+    {
+      "epoch": 1.0333148250971682,
+      "grad_norm": 1.5529509671731727,
+      "learning_rate": 5.5804645706261515e-06,
+      "loss": 0.9572,
+      "step": 1395
+    },
+    {
+      "epoch": 1.037016472330187,
+      "grad_norm": 1.5523403019399546,
+      "learning_rate": 5.548350297062659e-06,
+      "loss": 0.9554,
+      "step": 1400
+    },
+    {
+      "epoch": 1.037016472330187,
+      "eval_loss": 1.0255022048950195,
+      "eval_runtime": 8.6881,
+      "eval_samples_per_second": 58.931,
+      "eval_steps_per_second": 14.733,
+      "step": 1400
+    },
+    {
+      "epoch": 1.0407181195632056,
+      "grad_norm": 1.5104158625012352,
+      "learning_rate": 5.516213110403009e-06,
+      "loss": 0.9575,
+      "step": 1405
+    },
+    {
+      "epoch": 1.0444197667962243,
+      "grad_norm": 1.4890138153731032,
+      "learning_rate": 5.484054353515896e-06,
+      "loss": 0.9427,
+      "step": 1410
+    },
+    {
+      "epoch": 1.048121414029243,
+      "grad_norm": 1.5548599476236042,
+      "learning_rate": 5.451875370171341e-06,
+      "loss": 0.9524,
+      "step": 1415
+    },
+    {
+      "epoch": 1.0518230612622617,
+      "grad_norm": 1.4627948267025965,
+      "learning_rate": 5.419677504984534e-06,
+      "loss": 0.9587,
+      "step": 1420
+    },
+    {
+      "epoch": 1.0555247084952804,
+      "grad_norm": 1.5347877324134909,
+      "learning_rate": 5.387462103359655e-06,
+      "loss": 0.9512,
+      "step": 1425
+    },
+    {
+      "epoch": 1.059226355728299,
+      "grad_norm": 1.5055307314944746,
+      "learning_rate": 5.3552305114336515e-06,
+      "loss": 0.9562,
+      "step": 1430
+    },
+    {
+      "epoch": 1.0629280029613177,
+      "grad_norm": 1.5673533023313868,
+      "learning_rate": 5.32298407601999e-06,
+      "loss": 0.9456,
+      "step": 1435
+    },
+    {
+      "epoch": 1.0666296501943364,
+      "grad_norm": 1.564595024828045,
+      "learning_rate": 5.290724144552379e-06,
+      "loss": 0.947,
+      "step": 1440
+    },
+    {
+      "epoch": 1.070331297427355,
+      "grad_norm": 1.5287981221284181,
+      "learning_rate": 5.258452065028473e-06,
+      "loss": 0.9536,
+      "step": 1445
+    },
+    {
+      "epoch": 1.0740329446603738,
+      "grad_norm": 1.4465299230775315,
+      "learning_rate": 5.2261691859535325e-06,
+      "loss": 0.9186,
+      "step": 1450
+    },
+    {
+      "epoch": 1.0777345918933925,
+      "grad_norm": 1.634117352035441,
+      "learning_rate": 5.193876856284085e-06,
+      "loss": 0.9328,
+      "step": 1455
+    },
+    {
+      "epoch": 1.0814362391264112,
+      "grad_norm": 1.4385431388650558,
+      "learning_rate": 5.161576425371554e-06,
+      "loss": 0.9479,
+      "step": 1460
+    },
+    {
+      "epoch": 1.0851378863594299,
+      "grad_norm": 1.5072383725336713,
+      "learning_rate": 5.1292692429058824e-06,
+      "loss": 0.9861,
+      "step": 1465
+    },
+    {
+      "epoch": 1.0888395335924486,
+      "grad_norm": 1.4625718771019833,
+      "learning_rate": 5.096956658859122e-06,
+      "loss": 0.9557,
+      "step": 1470
+    },
+    {
+      "epoch": 1.0925411808254673,
+      "grad_norm": 1.4758402229370147,
+      "learning_rate": 5.064640023429042e-06,
+      "loss": 0.944,
+      "step": 1475
+    },
+    {
+      "epoch": 1.096242828058486,
+      "grad_norm": 1.51962588520605,
+      "learning_rate": 5.032320686982697e-06,
+      "loss": 0.9213,
+      "step": 1480
+    },
+    {
+      "epoch": 1.0999444752915046,
+      "grad_norm": 1.5103027827969508,
+      "learning_rate": 5e-06,
+      "loss": 0.9411,
+      "step": 1485
+    },
+    {
+      "epoch": 1.1036461225245233,
+      "grad_norm": 1.5224293264101985,
+      "learning_rate": 4.967679313017304e-06,
+      "loss": 0.9547,
+      "step": 1490
+    },
+    {
+      "epoch": 1.107347769757542,
+      "grad_norm": 1.5430895119123675,
+      "learning_rate": 4.9353599765709585e-06,
+      "loss": 0.9118,
+      "step": 1495
+    },
+    {
+      "epoch": 1.1110494169905607,
+      "grad_norm": 1.481637878176428,
+      "learning_rate": 4.903043341140879e-06,
+      "loss": 0.9564,
+      "step": 1500
+    },
+    {
+      "epoch": 1.1110494169905607,
+      "eval_loss": 1.0229578018188477,
+      "eval_runtime": 8.679,
+      "eval_samples_per_second": 58.993,
+      "eval_steps_per_second": 14.748,
+      "step": 1500
+    },
+    {
+      "epoch": 1.1147510642235794,
+      "grad_norm": 1.5115835881244595,
+      "learning_rate": 4.870730757094121e-06,
+      "loss": 0.9338,
+      "step": 1505
+    },
+    {
+      "epoch": 1.118452711456598,
+      "grad_norm": 1.5449975680141834,
+      "learning_rate": 4.838423574628447e-06,
+      "loss": 0.9661,
+      "step": 1510
+    },
+    {
+      "epoch": 1.1221543586896168,
+      "grad_norm": 1.5861346370310576,
+      "learning_rate": 4.806123143715916e-06,
+      "loss": 0.9246,
+      "step": 1515
+    },
+    {
+      "epoch": 1.1258560059226355,
+      "grad_norm": 1.5449553152899103,
+      "learning_rate": 4.773830814046469e-06,
+      "loss": 0.9599,
+      "step": 1520
+    },
+    {
+      "epoch": 1.1295576531556542,
+      "grad_norm": 1.5015297636472575,
+      "learning_rate": 4.741547934971528e-06,
+      "loss": 0.9462,
+      "step": 1525
+    },
+    {
+      "epoch": 1.1332593003886728,
+      "grad_norm": 1.5234680183519724,
+      "learning_rate": 4.7092758554476215e-06,
+      "loss": 0.9413,
+      "step": 1530
+    },
+    {
+      "epoch": 1.1369609476216915,
+      "grad_norm": 1.4388073264946348,
+      "learning_rate": 4.677015923980012e-06,
+      "loss": 0.9286,
+      "step": 1535
+    },
+    {
+      "epoch": 1.1406625948547104,
+      "grad_norm": 1.5248461875650898,
+      "learning_rate": 4.644769488566351e-06,
+      "loss": 0.9435,
+      "step": 1540
+    },
+    {
+      "epoch": 1.1443642420877291,
+      "grad_norm": 1.5303284099305183,
+      "learning_rate": 4.6125378966403465e-06,
+      "loss": 0.9522,
+      "step": 1545
+    },
+    {
+      "epoch": 1.1480658893207478,
+      "grad_norm": 1.4682502346893527,
+      "learning_rate": 4.580322495015466e-06,
+      "loss": 0.9384,
+      "step": 1550
+    },
+    {
+      "epoch": 1.1517675365537665,
+      "grad_norm": 1.509753203698729,
+      "learning_rate": 4.548124629828661e-06,
+      "loss": 0.9573,
+      "step": 1555
+    },
+    {
+      "epoch": 1.1554691837867852,
+      "grad_norm": 1.4247250737749155,
+      "learning_rate": 4.515945646484105e-06,
+      "loss": 0.934,
+      "step": 1560
+    },
+    {
+      "epoch": 1.159170831019804,
+      "grad_norm": 1.4971184840487013,
+      "learning_rate": 4.483786889596993e-06,
+      "loss": 0.9446,
+      "step": 1565
+    },
+    {
+      "epoch": 1.1628724782528226,
+      "grad_norm": 1.4886733261293026,
+      "learning_rate": 4.451649702937343e-06,
+      "loss": 0.9591,
+      "step": 1570
+    },
+    {
+      "epoch": 1.1665741254858413,
+      "grad_norm": 1.517255771510319,
+      "learning_rate": 4.4195354293738484e-06,
+      "loss": 0.9446,
+      "step": 1575
+    },
+    {
+      "epoch": 1.17027577271886,
+      "grad_norm": 1.5799027115378548,
+      "learning_rate": 4.387445410817774e-06,
+      "loss": 0.9598,
+      "step": 1580
+    },
+    {
+      "epoch": 1.1739774199518787,
+      "grad_norm": 1.5335978469668663,
+      "learning_rate": 4.355380988166867e-06,
+      "loss": 0.9526,
+      "step": 1585
+    },
+    {
+      "epoch": 1.1776790671848973,
+      "grad_norm": 1.4421944530376716,
+      "learning_rate": 4.323343501249346e-06,
+      "loss": 0.936,
+      "step": 1590
+    },
+    {
+      "epoch": 1.181380714417916,
+      "grad_norm": 1.4733228432144814,
+      "learning_rate": 4.291334288767899e-06,
+      "loss": 0.9249,
+      "step": 1595
+    },
+    {
+      "epoch": 1.1850823616509347,
+      "grad_norm": 1.4103746479202812,
+      "learning_rate": 4.259354688243758e-06,
+      "loss": 0.9551,
+      "step": 1600
+    },
+    {
+      "epoch": 1.1850823616509347,
+      "eval_loss": 1.0203170776367188,
+      "eval_runtime": 8.6729,
+      "eval_samples_per_second": 59.035,
+      "eval_steps_per_second": 14.759,
+      "step": 1600
+    },
+    {
+      "epoch": 1.1887840088839534,
+      "grad_norm": 1.4794301636693654,
+      "learning_rate": 4.227406035960798e-06,
+      "loss": 0.928,
+      "step": 1605
+    },
+    {
+      "epoch": 1.192485656116972,
+      "grad_norm": 1.5324684655198086,
+      "learning_rate": 4.195489666909714e-06,
+      "loss": 0.9556,
+      "step": 1610
+    },
+    {
+      "epoch": 1.1961873033499908,
+      "grad_norm": 1.5550207701455285,
+      "learning_rate": 4.163606914732224e-06,
+      "loss": 0.9024,
+      "step": 1615
+    },
+    {
+      "epoch": 1.1998889505830095,
+      "grad_norm": 1.6078242471595632,
+      "learning_rate": 4.131759111665349e-06,
+      "loss": 0.9371,
+      "step": 1620
+    },
+    {
+      "epoch": 1.2035905978160282,
+      "grad_norm": 1.5130235230070461,
+      "learning_rate": 4.099947588485744e-06,
+      "loss": 0.9193,
+      "step": 1625
+    },
+    {
+      "epoch": 1.2072922450490469,
+      "grad_norm": 1.5880211247444604,
+      "learning_rate": 4.06817367445409e-06,
+      "loss": 0.9338,
+      "step": 1630
+    },
+    {
+      "epoch": 1.2109938922820656,
+      "grad_norm": 1.5601507103852923,
+      "learning_rate": 4.036438697259551e-06,
+      "loss": 0.946,
+      "step": 1635
+    },
+    {
+      "epoch": 1.2146955395150842,
+      "grad_norm": 1.460584421184121,
+      "learning_rate": 4.004743982964298e-06,
+      "loss": 0.9468,
+      "step": 1640
+    },
+    {
+      "epoch": 1.218397186748103,
+      "grad_norm": 1.5301475424382758,
+      "learning_rate": 3.9730908559480904e-06,
+      "loss": 0.9304,
+      "step": 1645
+    },
+    {
+      "epoch": 1.2220988339811216,
+      "grad_norm": 1.4152802250426284,
+      "learning_rate": 3.941480638852948e-06,
+      "loss": 0.9288,
+      "step": 1650
+    },
+    {
+      "epoch": 1.2258004812141403,
+      "grad_norm": 1.4558762027477072,
+      "learning_rate": 3.909914652527872e-06,
+      "loss": 0.9532,
+      "step": 1655
+    },
+    {
+      "epoch": 1.229502128447159,
+      "grad_norm": 1.475608066439469,
+      "learning_rate": 3.878394215973663e-06,
+      "loss": 0.9267,
+      "step": 1660
+    },
+    {
+      "epoch": 1.2332037756801777,
+      "grad_norm": 1.4527083725574788,
+      "learning_rate": 3.8469206462878e-06,
+      "loss": 0.9397,
+      "step": 1665
+    },
+    {
+      "epoch": 1.2369054229131964,
+      "grad_norm": 1.4252791523516992,
+      "learning_rate": 3.815495258609404e-06,
+      "loss": 0.9515,
+      "step": 1670
+    },
+    {
+      "epoch": 1.240607070146215,
+      "grad_norm": 1.458412822434971,
+      "learning_rate": 3.784119366064293e-06,
+      "loss": 0.9627,
+      "step": 1675
+    },
+    {
+      "epoch": 1.2443087173792338,
+      "grad_norm": 1.4371535265538804,
+      "learning_rate": 3.752794279710094e-06,
+      "loss": 0.9492,
+      "step": 1680
+    },
+    {
+      "epoch": 1.2480103646122525,
+      "grad_norm": 1.5956163025607901,
+      "learning_rate": 3.721521308481483e-06,
+      "loss": 0.941,
+      "step": 1685
+    },
+    {
+      "epoch": 1.2517120118452711,
+      "grad_norm": 1.4606215041268646,
+      "learning_rate": 3.690301759135471e-06,
+      "loss": 0.9218,
+      "step": 1690
+    },
+    {
+      "epoch": 1.2554136590782898,
+      "grad_norm": 1.551540798338896,
+      "learning_rate": 3.6591369361968127e-06,
+      "loss": 0.9452,
+      "step": 1695
+    },
+    {
+      "epoch": 1.2591153063113085,
+      "grad_norm": 1.438162871091009,
+      "learning_rate": 3.6280281419034934e-06,
+      "loss": 0.9515,
+      "step": 1700
+    },
+    {
+      "epoch": 1.2591153063113085,
+      "eval_loss": 1.018484354019165,
+      "eval_runtime": 8.675,
+      "eval_samples_per_second": 59.02,
+      "eval_steps_per_second": 14.755,
+      "step": 1700
+    },
+    {
+      "epoch": 1.2628169535443272,
+      "grad_norm": 1.5280110385731542,
+      "learning_rate": 3.596976676152306e-06,
+      "loss": 0.9312,
+      "step": 1705
+    },
+    {
+      "epoch": 1.266518600777346,
+      "grad_norm": 1.439021134519739,
+      "learning_rate": 3.5659838364445505e-06,
+      "loss": 0.9502,
+      "step": 1710
+    },
+    {
+      "epoch": 1.2702202480103646,
+      "grad_norm": 1.542129828765787,
+      "learning_rate": 3.535050917831797e-06,
+      "loss": 0.9245,
+      "step": 1715
+    },
+    {
+      "epoch": 1.2739218952433833,
+      "grad_norm": 1.4387624494665334,
+      "learning_rate": 3.504179212861793e-06,
+      "loss": 0.9646,
+      "step": 1720
+    },
+    {
+      "epoch": 1.277623542476402,
+      "grad_norm": 1.4576756291542614,
+      "learning_rate": 3.473370011524435e-06,
+      "loss": 0.9444,
+      "step": 1725
+    },
+    {
+      "epoch": 1.2813251897094207,
+      "grad_norm": 1.4116630998926993,
+      "learning_rate": 3.442624601197877e-06,
+      "loss": 0.935,
+      "step": 1730
+    },
+    {
+      "epoch": 1.2850268369424394,
+      "grad_norm": 1.448732573638826,
+      "learning_rate": 3.4119442665947346e-06,
+      "loss": 0.9205,
+      "step": 1735
+    },
+    {
+      "epoch": 1.288728484175458,
+      "grad_norm": 1.4616328188755885,
+      "learning_rate": 3.3813302897083955e-06,
+      "loss": 0.9515,
+      "step": 1740
+    },
+    {
+      "epoch": 1.2924301314084767,
+      "grad_norm": 1.4994123055203914,
+      "learning_rate": 3.350783949759462e-06,
+      "loss": 0.9406,
+      "step": 1745
+    },
+    {
+      "epoch": 1.2961317786414954,
+      "grad_norm": 1.5510537992943896,
+      "learning_rate": 3.3203065231422904e-06,
+      "loss": 0.9536,
+      "step": 1750
+    },
+    {
+      "epoch": 1.2998334258745141,
+      "grad_norm": 1.4979569186605917,
+      "learning_rate": 3.289899283371657e-06,
+      "loss": 0.9364,
+      "step": 1755
+    },
+    {
+      "epoch": 1.3035350731075328,
+      "grad_norm": 1.4342400603002827,
+      "learning_rate": 3.259563501029548e-06,
+      "loss": 0.9661,
+      "step": 1760
+    },
+    {
+      "epoch": 1.3072367203405515,
+      "grad_norm": 1.605308165753949,
+      "learning_rate": 3.2293004437120622e-06,
+      "loss": 0.9419,
+      "step": 1765
+    },
+    {
+      "epoch": 1.3109383675735702,
+      "grad_norm": 1.4535747016942286,
+      "learning_rate": 3.1991113759764493e-06,
+      "loss": 0.9399,
+      "step": 1770
+    },
+    {
+      "epoch": 1.3146400148065889,
+      "grad_norm": 1.5045929424819182,
+      "learning_rate": 3.1689975592882603e-06,
+      "loss": 0.9493,
+      "step": 1775
+    },
+    {
+      "epoch": 1.3183416620396076,
+      "grad_norm": 1.4298316802164306,
+      "learning_rate": 3.1389602519686515e-06,
+      "loss": 0.9684,
+      "step": 1780
+    },
+    {
+      "epoch": 1.3220433092726263,
+      "grad_norm": 1.448065974522092,
+      "learning_rate": 3.1090007091417884e-06,
+      "loss": 0.9337,
+      "step": 1785
+    },
+    {
+      "epoch": 1.325744956505645,
+      "grad_norm": 1.5957876876234358,
+      "learning_rate": 3.0791201826824117e-06,
+      "loss": 0.9697,
+      "step": 1790
+    },
+    {
+      "epoch": 1.3294466037386636,
+      "grad_norm": 1.439183790018425,
+      "learning_rate": 3.049319921163526e-06,
+      "loss": 0.9246,
+      "step": 1795
+    },
+    {
+      "epoch": 1.3331482509716823,
+      "grad_norm": 1.5233611691430629,
+      "learning_rate": 3.019601169804216e-06,
+      "loss": 0.9507,
+      "step": 1800
+    },
+    {
+      "epoch": 1.3331482509716823,
+      "eval_loss": 1.0166221857070923,
+      "eval_runtime": 8.6954,
+      "eval_samples_per_second": 58.882,
+      "eval_steps_per_second": 14.72,
+      "step": 1800
+    },
+    {
+      "epoch": 1.336849898204701,
+      "grad_norm": 1.535149119238128,
+      "learning_rate": 2.9899651704176324e-06,
+      "loss": 0.9488,
+      "step": 1805
+    },
+    {
+      "epoch": 1.3405515454377197,
+      "grad_norm": 1.8013917126510002,
+      "learning_rate": 2.9604131613590825e-06,
+      "loss": 0.9525,
+      "step": 1810
+    },
+    {
+      "epoch": 1.3442531926707386,
+      "grad_norm": 1.5758419968483395,
+      "learning_rate": 2.9309463774743047e-06,
+      "loss": 0.9296,
+      "step": 1815
+    },
+    {
+      "epoch": 1.3479548399037573,
+      "grad_norm": 1.6434309252154637,
+      "learning_rate": 2.901566050047855e-06,
+      "loss": 0.972,
+      "step": 1820
+    },
+    {
+      "epoch": 1.351656487136776,
+      "grad_norm": 1.4945819293763394,
+      "learning_rate": 2.8722734067516637e-06,
+      "loss": 0.9678,
+      "step": 1825
+    },
+    {
+      "epoch": 1.3553581343697947,
+      "grad_norm": 1.5747674704361272,
+      "learning_rate": 2.843069671593734e-06,
+      "loss": 0.9461,
+      "step": 1830
+    },
+    {
+      "epoch": 1.3590597816028134,
+      "grad_norm": 1.4849849381301548,
+      "learning_rate": 2.813956064866996e-06,
+      "loss": 0.9547,
+      "step": 1835
+    },
+    {
+      "epoch": 1.362761428835832,
+      "grad_norm": 1.4549463612438158,
+      "learning_rate": 2.784933803098326e-06,
+      "loss": 0.9395,
+      "step": 1840
+    },
+    {
+      "epoch": 1.3664630760688508,
+      "grad_norm": 1.465774668348825,
+      "learning_rate": 2.7560040989976894e-06,
+      "loss": 0.946,
+      "step": 1845
+    },
+    {
+      "epoch": 1.3701647233018694,
+      "grad_norm": 1.3769248284299485,
+      "learning_rate": 2.7271681614074973e-06,
+      "loss": 0.9447,
+      "step": 1850
+    },
+    {
+      "epoch": 1.3738663705348881,
+      "grad_norm": 1.486107645046795,
+      "learning_rate": 2.6984271952520723e-06,
+      "loss": 0.9292,
+      "step": 1855
+    },
+    {
+      "epoch": 1.3775680177679068,
+      "grad_norm": 1.4626507230942989,
+      "learning_rate": 2.6697824014873076e-06,
+      "loss": 0.9514,
+      "step": 1860
+    },
+    {
+      "epoch": 1.3812696650009255,
+      "grad_norm": 1.6022770155906472,
+      "learning_rate": 2.641234977050484e-06,
+      "loss": 0.9117,
+      "step": 1865
+    },
+    {
+      "epoch": 1.3849713122339442,
+      "grad_norm": 1.4831861982927594,
+      "learning_rate": 2.6127861148102552e-06,
+      "loss": 0.9551,
+      "step": 1870
+    },
+    {
+      "epoch": 1.388672959466963,
+      "grad_norm": 1.5391905518945717,
+      "learning_rate": 2.5844370035168077e-06,
+      "loss": 0.9673,
+      "step": 1875
+    },
+    {
+      "epoch": 1.3923746066999816,
+      "grad_norm": 1.4232605239839096,
+      "learning_rate": 2.5561888277521797e-06,
+      "loss": 0.9494,
+      "step": 1880
+    },
+    {
+      "epoch": 1.3960762539330003,
+      "grad_norm": 1.4657072720630537,
+      "learning_rate": 2.528042767880766e-06,
+      "loss": 0.948,
+      "step": 1885
+    },
+    {
+      "epoch": 1.399777901166019,
+      "grad_norm": 1.5609370972114847,
+      "learning_rate": 2.5000000000000015e-06,
+      "loss": 0.95,
+      "step": 1890
+    },
+    {
+      "epoch": 1.4034795483990377,
+      "grad_norm": 1.4134009339502485,
+      "learning_rate": 2.4720616958912054e-06,
+      "loss": 0.9312,
+      "step": 1895
+    },
+    {
+      "epoch": 1.4071811956320563,
+      "grad_norm": 1.4265922385575842,
+      "learning_rate": 2.4442290229706344e-06,
+      "loss": 0.9208,
+      "step": 1900
+    },
+    {
+      "epoch": 1.4071811956320563,
+      "eval_loss": 1.0145511627197266,
+      "eval_runtime": 8.723,
+      "eval_samples_per_second": 58.696,
+      "eval_steps_per_second": 14.674,
+      "step": 1900
+    },
+    {
+      "epoch": 1.410882842865075,
+      "grad_norm": 1.5398189516335716,
+      "learning_rate": 2.4165031442406857e-06,
+      "loss": 0.934,
+      "step": 1905
+    },
+    {
+      "epoch": 1.4145844900980937,
+      "grad_norm": 1.4291248478737046,
+      "learning_rate": 2.3888852182413087e-06,
+      "loss": 0.9396,
+      "step": 1910
+    },
+    {
+      "epoch": 1.4182861373311124,
+      "grad_norm": 1.4682471847679497,
+      "learning_rate": 2.361376399001592e-06,
+      "loss": 0.929,
+      "step": 1915
+    },
+    {
+      "epoch": 1.421987784564131,
+      "grad_norm": 1.83568046169287,
+      "learning_rate": 2.333977835991545e-06,
+      "loss": 0.9679,
+      "step": 1920
+    },
+    {
+      "epoch": 1.4256894317971498,
+      "grad_norm": 1.4338451747010847,
+      "learning_rate": 2.3066906740740626e-06,
+      "loss": 0.9503,
+      "step": 1925
+    },
+    {
+      "epoch": 1.4293910790301685,
+      "grad_norm": 1.5848101818149134,
+      "learning_rate": 2.2795160534570866e-06,
+      "loss": 0.9386,
+      "step": 1930
+    },
+    {
+      "epoch": 1.4330927262631872,
+      "grad_norm": 1.5568704183626663,
+      "learning_rate": 2.2524551096459703e-06,
+      "loss": 0.9176,
+      "step": 1935
+    },
+    {
+      "epoch": 1.4367943734962059,
+      "grad_norm": 1.4342074642791622,
+      "learning_rate": 2.2255089733960162e-06,
+      "loss": 0.9268,
+      "step": 1940
+    },
+    {
+      "epoch": 1.4404960207292246,
+      "grad_norm": 1.5264043520803225,
+      "learning_rate": 2.1986787706652377e-06,
+      "loss": 0.9182,
+      "step": 1945
+    },
+    {
+      "epoch": 1.4441976679622432,
+      "grad_norm": 1.4394261424854324,
+      "learning_rate": 2.171965622567308e-06,
+      "loss": 0.9577,
+      "step": 1950
+    },
+    {
+      "epoch": 1.447899315195262,
+      "grad_norm": 1.6049354461149483,
+      "learning_rate": 2.1453706453247088e-06,
+      "loss": 0.9287,
+      "step": 1955
+    },
+    {
+      "epoch": 1.4516009624282806,
+      "grad_norm": 1.6106070007653557,
+      "learning_rate": 2.1188949502220987e-06,
+      "loss": 0.9276,
+      "step": 1960
+    },
+    {
+      "epoch": 1.4553026096612993,
+      "grad_norm": 1.4890495009473046,
+      "learning_rate": 2.0925396435598665e-06,
+      "loss": 0.9531,
+      "step": 1965
+    },
+    {
+      "epoch": 1.459004256894318,
+      "grad_norm": 1.4462853644582758,
+      "learning_rate": 2.066305826607911e-06,
+      "loss": 0.9197,
+      "step": 1970
+    },
+    {
+      "epoch": 1.4627059041273367,
+      "grad_norm": 1.4880943901640924,
+      "learning_rate": 2.0401945955596206e-06,
+      "loss": 0.9592,
+      "step": 1975
+    },
+    {
+      "epoch": 1.4664075513603554,
+      "grad_norm": 1.5124073104411913,
+      "learning_rate": 2.0142070414860704e-06,
+      "loss": 0.9504,
+      "step": 1980
+    },
+    {
+      "epoch": 1.470109198593374,
+      "grad_norm": 1.5207220675714455,
+      "learning_rate": 1.9883442502904284e-06,
+      "loss": 0.9408,
+      "step": 1985
+    },
+    {
+      "epoch": 1.4738108458263928,
+      "grad_norm": 1.4024329836356682,
+      "learning_rate": 1.962607302662582e-06,
+      "loss": 0.9414,
+      "step": 1990
+    },
+    {
+      "epoch": 1.4775124930594115,
+      "grad_norm": 1.598071795929375,
+      "learning_rate": 1.936997274033986e-06,
+      "loss": 0.9547,
+      "step": 1995
+    },
+    {
+      "epoch": 1.4812141402924301,
+      "grad_norm": 1.534264128372943,
+      "learning_rate": 1.9115152345327154e-06,
+      "loss": 0.9442,
+      "step": 2000
+    },
+    {
+      "epoch": 1.4812141402924301,
+      "eval_loss": 1.0129976272583008,
+      "eval_runtime": 8.7217,
+      "eval_samples_per_second": 58.704,
+      "eval_steps_per_second": 14.676,
+      "step": 2000
+    },
+    {
+      "epoch": 1.4849157875254488,
+      "grad_norm": 1.4981898335706558,
+      "learning_rate": 1.8861622489387555e-06,
+      "loss": 0.9297,
+      "step": 2005
+    },
+    {
+      "epoch": 1.4886174347584675,
+      "grad_norm": 1.4632628358653164,
+      "learning_rate": 1.8609393766395083e-06,
+      "loss": 0.9607,
+      "step": 2010
+    },
+    {
+      "epoch": 1.4923190819914862,
+      "grad_norm": 1.4831829971571602,
+      "learning_rate": 1.8358476715855262e-06,
+      "loss": 0.9172,
+      "step": 2015
+    },
+    {
+      "epoch": 1.496020729224505,
+      "grad_norm": 1.4304163837532902,
+      "learning_rate": 1.8108881822464697e-06,
+      "loss": 0.9579,
+      "step": 2020
+    },
+    {
+      "epoch": 1.4997223764575236,
+      "grad_norm": 1.4641568790604755,
+      "learning_rate": 1.7860619515673034e-06,
+      "loss": 0.9592,
+      "step": 2025
+    },
+    {
+      "epoch": 1.5034240236905423,
+      "grad_norm": 1.4108335100882108,
+      "learning_rate": 1.7613700169247055e-06,
+      "loss": 0.9559,
+      "step": 2030
+    },
+    {
+      "epoch": 1.507125670923561,
+      "grad_norm": 1.3960221127799861,
+      "learning_rate": 1.7368134100837286e-06,
+      "loss": 0.9445,
+      "step": 2035
+    },
+    {
+      "epoch": 1.5108273181565797,
+      "grad_norm": 1.629241663992989,
+      "learning_rate": 1.7123931571546826e-06,
+      "loss": 0.9475,
+      "step": 2040
+    },
+    {
+      "epoch": 1.5145289653895984,
+      "grad_norm": 1.4769846296544475,
+      "learning_rate": 1.6881102785502618e-06,
+      "loss": 0.9241,
+      "step": 2045
+    },
+    {
+      "epoch": 1.518230612622617,
+      "grad_norm": 1.4450985025765188,
+      "learning_rate": 1.6639657889429017e-06,
+      "loss": 0.9312,
+      "step": 2050
+    },
+    {
+      "epoch": 1.5219322598556357,
+      "grad_norm": 1.5147785679068462,
+      "learning_rate": 1.639960697222388e-06,
+      "loss": 0.9539,
+      "step": 2055
+    },
+    {
+      "epoch": 1.5256339070886544,
+      "grad_norm": 1.4133261880328718,
+      "learning_rate": 1.6160960064536907e-06,
+      "loss": 0.9289,
+      "step": 2060
+    },
+    {
+      "epoch": 1.5293355543216731,
+      "grad_norm": 1.4899070137565427,
+      "learning_rate": 1.5923727138350548e-06,
+      "loss": 0.9452,
+      "step": 2065
+    },
+    {
+      "epoch": 1.5330372015546918,
+      "grad_norm": 1.604006362692692,
+      "learning_rate": 1.5687918106563326e-06,
+      "loss": 0.9676,
+      "step": 2070
+    },
+    {
+      "epoch": 1.5367388487877105,
+      "grad_norm": 1.573211074541449,
+      "learning_rate": 1.5453542822575624e-06,
+      "loss": 0.9245,
+      "step": 2075
+    },
+    {
+      "epoch": 1.5404404960207292,
+      "grad_norm": 1.4149807137951171,
+      "learning_rate": 1.52206110798779e-06,
+      "loss": 0.9213,
+      "step": 2080
+    },
+    {
+      "epoch": 1.5441421432537479,
+      "grad_norm": 1.6082035265990422,
+      "learning_rate": 1.4989132611641576e-06,
+      "loss": 0.9516,
+      "step": 2085
+    },
+    {
+      "epoch": 1.5478437904867666,
+      "grad_norm": 1.4214335025262166,
+      "learning_rate": 1.4759117090312197e-06,
+      "loss": 0.9676,
+      "step": 2090
+    },
+    {
+      "epoch": 1.5515454377197853,
+      "grad_norm": 1.4659084277987675,
+      "learning_rate": 1.453057412720536e-06,
+      "loss": 0.9635,
+      "step": 2095
+    },
+    {
+      "epoch": 1.555247084952804,
+      "grad_norm": 1.406907426871313,
+      "learning_rate": 1.4303513272105057e-06,
+      "loss": 0.9307,
+      "step": 2100
+    },
+    {
+      "epoch": 1.555247084952804,
+      "eval_loss": 1.0118414163589478,
+      "eval_runtime": 8.6982,
+      "eval_samples_per_second": 58.863,
+      "eval_steps_per_second": 14.716,
+      "step": 2100
+    },
+    {
+      "epoch": 1.5589487321858226,
+      "grad_norm": 1.5213179697546635,
+      "learning_rate": 1.4077944012864636e-06,
+      "loss": 0.925,
+      "step": 2105
+    },
+    {
+      "epoch": 1.5626503794188413,
+      "grad_norm": 1.4785816893254842,
+      "learning_rate": 1.3853875775010355e-06,
+      "loss": 0.9335,
+      "step": 2110
+    },
+    {
+      "epoch": 1.56635202665186,
+      "grad_norm": 1.6319160798858328,
+      "learning_rate": 1.3631317921347564e-06,
+      "loss": 0.9659,
+      "step": 2115
+    },
+    {
+      "epoch": 1.5700536738848787,
+      "grad_norm": 1.4378863109702957,
+      "learning_rate": 1.3410279751569399e-06,
+      "loss": 0.9543,
+      "step": 2120
+    },
+    {
+      "epoch": 1.5737553211178974,
+      "grad_norm": 1.5220881879950408,
+      "learning_rate": 1.3190770501868243e-06,
+      "loss": 0.9489,
+      "step": 2125
+    },
+    {
+      "epoch": 1.577456968350916,
+      "grad_norm": 1.5579237503089478,
+      "learning_rate": 1.297279934454978e-06,
+      "loss": 0.9388,
+      "step": 2130
+    },
+    {
+      "epoch": 1.5811586155839348,
+      "grad_norm": 1.4591792161712915,
+      "learning_rate": 1.2756375387649717e-06,
+      "loss": 0.9192,
+      "step": 2135
+    },
+    {
+      "epoch": 1.5848602628169535,
+      "grad_norm": 1.462271376262188,
+      "learning_rate": 1.25415076745532e-06,
+      "loss": 0.9513,
+      "step": 2140
+    },
+    {
+      "epoch": 1.5885619100499722,
+      "grad_norm": 1.4585813420478615,
+      "learning_rate": 1.2328205183616964e-06,
+      "loss": 0.9305,
+      "step": 2145
+    },
+    {
+      "epoch": 1.5922635572829908,
+      "grad_norm": 1.3996273344722394,
+      "learning_rate": 1.2116476827794104e-06,
+      "loss": 0.946,
+      "step": 2150
+    },
+    {
+      "epoch": 1.5959652045160095,
+      "grad_norm": 1.4264637319705211,
+      "learning_rate": 1.1906331454261704e-06,
+      "loss": 0.9572,
+      "step": 2155
+    },
+    {
+      "epoch": 1.5996668517490282,
+      "grad_norm": 1.4973661367660114,
+      "learning_rate": 1.1697777844051105e-06,
+      "loss": 0.9275,
+      "step": 2160
+    },
+    {
+      "epoch": 1.603368498982047,
+      "grad_norm": 1.4060812708874741,
+      "learning_rate": 1.1490824711681026e-06,
+      "loss": 0.9268,
+      "step": 2165
+    },
+    {
+      "epoch": 1.6070701462150656,
+      "grad_norm": 1.491081236302123,
+      "learning_rate": 1.1285480704793378e-06,
+      "loss": 0.9279,
+      "step": 2170
+    },
+    {
+      "epoch": 1.6107717934480843,
+      "grad_norm": 1.5268525956740233,
+      "learning_rate": 1.1081754403792e-06,
+      "loss": 0.9523,
+      "step": 2175
+    },
+    {
+      "epoch": 1.614473440681103,
+      "grad_norm": 1.4071150743669196,
+      "learning_rate": 1.0879654321484012e-06,
+      "loss": 0.9361,
+      "step": 2180
+    },
+    {
+      "epoch": 1.6181750879141217,
+      "grad_norm": 1.438776125548561,
+      "learning_rate": 1.067918890272419e-06,
+      "loss": 0.9657,
+      "step": 2185
+    },
+    {
+      "epoch": 1.6218767351471404,
+      "grad_norm": 1.474475325920584,
+      "learning_rate": 1.0480366524062041e-06,
+      "loss": 0.8989,
+      "step": 2190
+    },
+    {
+      "epoch": 1.625578382380159,
+      "grad_norm": 1.5065652638225282,
+      "learning_rate": 1.0283195493391823e-06,
+      "loss": 0.9269,
+      "step": 2195
+    },
+    {
+      "epoch": 1.6292800296131777,
+      "grad_norm": 1.380340972623135,
+      "learning_rate": 1.008768404960535e-06,
+      "loss": 0.9382,
+      "step": 2200
+    },
+    {
+      "epoch": 1.6292800296131777,
+      "eval_loss": 1.0108721256256104,
+      "eval_runtime": 8.6862,
+      "eval_samples_per_second": 58.944,
+      "eval_steps_per_second": 14.736,
+      "step": 2200
+    },
+    {
+      "epoch": 1.6329816768461964,
+      "grad_norm": 1.508384357350366,
+      "learning_rate": 9.893840362247809e-07,
+      "loss": 0.9536,
+      "step": 2205
+    },
+    {
+      "epoch": 1.6366833240792151,
+      "grad_norm": 1.4302336284783967,
+      "learning_rate": 9.701672531176287e-07,
+      "loss": 0.9472,
+      "step": 2210
+    },
+    {
+      "epoch": 1.6403849713122338,
+      "grad_norm": 1.5021914595466455,
+      "learning_rate": 9.511188586221376e-07,
+      "loss": 0.9528,
+      "step": 2215
+    },
+    {
+      "epoch": 1.6440866185452525,
+      "grad_norm": 1.4896957386140668,
+      "learning_rate": 9.322396486851626e-07,
+      "loss": 0.9191,
+      "step": 2220
+    },
+    {
+      "epoch": 1.6477882657782712,
+      "grad_norm": 1.410027647026084,
+      "learning_rate": 9.135304121840976e-07,
+      "loss": 0.9528,
+      "step": 2225
+    },
+    {
+      "epoch": 1.65148991301129,
+      "grad_norm": 1.4647072795828986,
+      "learning_rate": 8.949919308939081e-07,
+      "loss": 0.9406,
+      "step": 2230
+    },
+    {
+      "epoch": 1.6551915602443086,
+      "grad_norm": 1.4421036811256112,
+      "learning_rate": 8.766249794544662e-07,
+      "loss": 0.9417,
+      "step": 2235
+    },
+    {
+      "epoch": 1.6588932074773273,
+      "grad_norm": 1.5154978969386064,
+      "learning_rate": 8.584303253381848e-07,
+      "loss": 0.9561,
+      "step": 2240
+    },
+    {
+      "epoch": 1.662594854710346,
+      "grad_norm": 1.4562633895782477,
+      "learning_rate": 8.404087288179425e-07,
+      "loss": 0.9466,
+      "step": 2245
+    },
+    {
+      "epoch": 1.6662965019433646,
+      "grad_norm": 1.3887325189281265,
+      "learning_rate": 8.225609429353187e-07,
+      "loss": 0.93,
+      "step": 2250
+    },
+    {
+      "epoch": 1.6699981491763833,
+      "grad_norm": 1.4325023361891722,
+      "learning_rate": 8.048877134691269e-07,
+      "loss": 0.9187,
+      "step": 2255
+    },
+    {
+      "epoch": 1.673699796409402,
+      "grad_norm": 1.42483840932513,
+      "learning_rate": 7.873897789042523e-07,
+      "loss": 0.9397,
+      "step": 2260
+    },
+    {
+      "epoch": 1.6774014436424207,
+      "grad_norm": 1.436754914963747,
+      "learning_rate": 7.700678704007947e-07,
+      "loss": 0.9232,
+      "step": 2265
+    },
+    {
+      "epoch": 1.6811030908754394,
+      "grad_norm": 1.4213278153304933,
+      "learning_rate": 7.529227117635135e-07,
+      "loss": 0.9438,
+      "step": 2270
+    },
+    {
+      "epoch": 1.684804738108458,
+      "grad_norm": 1.4376904196493825,
+      "learning_rate": 7.35955019411585e-07,
+      "loss": 0.9474,
+      "step": 2275
+    },
+    {
+      "epoch": 1.688506385341477,
+      "grad_norm": 1.440330895903481,
+      "learning_rate": 7.191655023486682e-07,
+      "loss": 0.9309,
+      "step": 2280
+    },
+    {
+      "epoch": 1.6922080325744957,
+      "grad_norm": 1.4504946046355287,
+      "learning_rate": 7.02554862133275e-07,
+      "loss": 0.9232,
+      "step": 2285
+    },
+    {
+      "epoch": 1.6959096798075144,
+      "grad_norm": 1.406976117754757,
+      "learning_rate": 6.86123792849458e-07,
+      "loss": 0.9404,
+      "step": 2290
+    },
+    {
+      "epoch": 1.699611327040533,
+      "grad_norm": 1.3998459625736135,
+      "learning_rate": 6.698729810778065e-07,
+      "loss": 0.9004,
+      "step": 2295
+    },
+    {
+      "epoch": 1.7033129742735518,
+      "grad_norm": 1.4323508272233076,
+      "learning_rate": 6.53803105866761e-07,
+      "loss": 0.9224,
+      "step": 2300
+    },
+    {
+      "epoch": 1.7033129742735518,
+      "eval_loss": 1.0102295875549316,
+      "eval_runtime": 8.7111,
+      "eval_samples_per_second": 58.775,
+      "eval_steps_per_second": 14.694,
+      "step": 2300
+    },
+    {
+      "epoch": 1.7070146215065705,
+      "grad_norm": 1.4397884549696236,
+      "learning_rate": 6.379148387042317e-07,
+      "loss": 0.9172,
+      "step": 2305
+    },
+    {
+      "epoch": 1.7107162687395892,
+      "grad_norm": 1.3919854340288227,
+      "learning_rate": 6.222088434895462e-07,
+      "loss": 0.9426,
+      "step": 2310
+    },
+    {
+      "epoch": 1.7144179159726078,
+      "grad_norm": 1.5011837528149663,
+      "learning_rate": 6.066857765057055e-07,
+      "loss": 0.91,
+      "step": 2315
+    },
+    {
+      "epoch": 1.7181195632056265,
+      "grad_norm": 1.4347878112668406,
+      "learning_rate": 5.9134628639196e-07,
+      "loss": 0.9355,
+      "step": 2320
+    },
+    {
+      "epoch": 1.7218212104386452,
+      "grad_norm": 1.3752936187697653,
+      "learning_rate": 5.76191014116711e-07,
+      "loss": 0.9344,
+      "step": 2325
+    },
+    {
+      "epoch": 1.725522857671664,
+      "grad_norm": 1.6317281996837152,
+      "learning_rate": 5.612205929507209e-07,
+      "loss": 0.9579,
+      "step": 2330
+    },
+    {
+      "epoch": 1.7292245049046826,
+      "grad_norm": 1.410242165503811,
+      "learning_rate": 5.464356484406535e-07,
+      "loss": 0.9463,
+      "step": 2335
+    },
+    {
+      "epoch": 1.7329261521377013,
+      "grad_norm": 1.6054001196121293,
+      "learning_rate": 5.318367983829393e-07,
+      "loss": 0.9412,
+      "step": 2340
+    },
+    {
+      "epoch": 1.73662779937072,
+      "grad_norm": 1.5108735566791855,
+      "learning_rate": 5.174246527979532e-07,
+      "loss": 0.9336,
+      "step": 2345
+    },
+    {
+      "epoch": 1.7403294466037387,
+      "grad_norm": 1.3879969337954057,
+      "learning_rate": 5.031998139045352e-07,
+      "loss": 0.9711,
+      "step": 2350
+    },
+    {
+      "epoch": 1.7440310938367574,
+      "grad_norm": 1.4140765269828401,
+      "learning_rate": 4.891628760948114e-07,
+      "loss": 0.9361,
+      "step": 2355
+    },
+    {
+      "epoch": 1.747732741069776,
+      "grad_norm": 1.380062448755359,
+      "learning_rate": 4.753144259093734e-07,
+      "loss": 0.9169,
+      "step": 2360
+    },
+    {
+      "epoch": 1.7514343883027947,
+      "grad_norm": 1.4140375581071665,
+      "learning_rate": 4.6165504201275635e-07,
+      "loss": 0.9439,
+      "step": 2365
+    },
+    {
+      "epoch": 1.7551360355358134,
+      "grad_norm": 1.4112810465379273,
+      "learning_rate": 4.481852951692672e-07,
+      "loss": 0.9548,
+      "step": 2370
+    },
+    {
+      "epoch": 1.7588376827688321,
+      "grad_norm": 1.4942125652112794,
+      "learning_rate": 4.349057482191299e-07,
+      "loss": 0.9697,
+      "step": 2375
+    },
+    {
+      "epoch": 1.7625393300018508,
+      "grad_norm": 1.4925795445437877,
+      "learning_rate": 4.2181695605497066e-07,
+      "loss": 0.9428,
+      "step": 2380
+    },
+    {
+      "epoch": 1.7662409772348695,
+      "grad_norm": 1.42841396976241,
+      "learning_rate": 4.089194655986306e-07,
+      "loss": 0.961,
+      "step": 2385
+    },
+    {
+      "epoch": 1.7699426244678882,
+      "grad_norm": 1.4682730700276878,
+      "learning_rate": 3.9621381577830855e-07,
+      "loss": 0.9654,
+      "step": 2390
+    },
+    {
+      "epoch": 1.7736442717009069,
+      "grad_norm": 1.4896297456091034,
+      "learning_rate": 3.837005375060482e-07,
+      "loss": 0.9333,
+      "step": 2395
+    },
+    {
+      "epoch": 1.7773459189339256,
+      "grad_norm": 1.4361351794217565,
+      "learning_rate": 3.7138015365554834e-07,
+      "loss": 0.9107,
+      "step": 2400
+    },
+    {
+      "epoch": 1.7773459189339256,
+      "eval_loss": 1.0095593929290771,
+      "eval_runtime": 8.7265,
+      "eval_samples_per_second": 58.672,
+      "eval_steps_per_second": 14.668,
+      "step": 2400
+    },
+    {
+      "epoch": 1.7810475661669443,
+      "grad_norm": 1.391141743720589,
+      "learning_rate": 3.592531790403159e-07,
+      "loss": 0.9338,
+      "step": 2405
+    },
+    {
+      "epoch": 1.784749213399963,
+      "grad_norm": 1.4629443997006013,
+      "learning_rate": 3.473201203921578e-07,
+      "loss": 0.9281,
+      "step": 2410
+    },
+    {
+      "epoch": 1.7884508606329816,
+      "grad_norm": 1.3944795908599104,
+      "learning_rate": 3.355814763399973e-07,
+      "loss": 0.9217,
+      "step": 2415
+    },
+    {
+      "epoch": 1.7921525078660003,
+      "grad_norm": 1.5108952168010161,
+      "learning_rate": 3.2403773738905185e-07,
+      "loss": 0.9312,
+      "step": 2420
+    },
+    {
+      "epoch": 1.795854155099019,
+      "grad_norm": 1.4747389522523875,
+      "learning_rate": 3.1268938590032495e-07,
+      "loss": 0.9405,
+      "step": 2425
+    },
+    {
+      "epoch": 1.7995558023320377,
+      "grad_norm": 1.4670095692146419,
+      "learning_rate": 3.015368960704584e-07,
+      "loss": 0.9341,
+      "step": 2430
+    },
+    {
+      "epoch": 1.8032574495650564,
+      "grad_norm": 1.420938494793171,
+      "learning_rate": 2.905807339119138e-07,
+      "loss": 0.9133,
+      "step": 2435
+    },
+    {
+      "epoch": 1.806959096798075,
+      "grad_norm": 1.485589398600447,
+      "learning_rate": 2.798213572335001e-07,
+      "loss": 0.9477,
+      "step": 2440
+    },
+    {
+      "epoch": 1.8106607440310938,
+      "grad_norm": 1.398836941771861,
+      "learning_rate": 2.6925921562124867e-07,
+      "loss": 0.9559,
+      "step": 2445
+    },
+    {
+      "epoch": 1.8143623912641127,
+      "grad_norm": 1.460425549307081,
+      "learning_rate": 2.5889475041961767e-07,
+      "loss": 0.912,
+      "step": 2450
+    },
+    {
+      "epoch": 1.8180640384971314,
+      "grad_norm": 1.374269506100965,
+      "learning_rate": 2.487283947130609e-07,
+      "loss": 0.9398,
+      "step": 2455
+    },
+    {
+      "epoch": 1.82176568573015,
+      "grad_norm": 1.4896413268024193,
+      "learning_rate": 2.3876057330792344e-07,
+      "loss": 0.941,
+      "step": 2460
+    },
+    {
+      "epoch": 1.8254673329631688,
+      "grad_norm": 1.4690641033644323,
+      "learning_rate": 2.289917027146943e-07,
+      "loss": 0.9454,
+      "step": 2465
+    },
+    {
+      "epoch": 1.8291689801961875,
+      "grad_norm": 1.477863619064755,
+      "learning_rate": 2.1942219113060215e-07,
+      "loss": 0.9168,
+      "step": 2470
+    },
+    {
+      "epoch": 1.8328706274292061,
+      "grad_norm": 1.470521817548036,
+      "learning_rate": 2.1005243842255552e-07,
+      "loss": 0.9643,
+      "step": 2475
+    },
+    {
+      "epoch": 1.8365722746622248,
+      "grad_norm": 1.4220441215199895,
+      "learning_rate": 2.0088283611044034e-07,
+      "loss": 0.9661,
+      "step": 2480
+    },
+    {
+      "epoch": 1.8402739218952435,
+      "grad_norm": 1.425688228229182,
+      "learning_rate": 1.919137673507543e-07,
+      "loss": 0.9178,
+      "step": 2485
+    },
+    {
+      "epoch": 1.8439755691282622,
+      "grad_norm": 1.4518435265086809,
+      "learning_rate": 1.8314560692059836e-07,
+      "loss": 0.955,
+      "step": 2490
+    },
+    {
+      "epoch": 1.847677216361281,
+      "grad_norm": 1.3744149843020026,
+      "learning_rate": 1.745787212020178e-07,
+      "loss": 0.9461,
+      "step": 2495
+    },
+    {
+      "epoch": 1.8513788635942996,
+      "grad_norm": 1.3888947471272055,
+      "learning_rate": 1.6621346816668993e-07,
+      "loss": 0.9126,
+      "step": 2500
+    },
+    {
+      "epoch": 1.8513788635942996,
+      "eval_loss": 1.009551763534546,
+      "eval_runtime": 8.7021,
+      "eval_samples_per_second": 58.836,
+      "eval_steps_per_second": 14.709,
+      "step": 2500
+    },
+    {
+      "epoch": 1.8550805108273183,
+      "grad_norm": 1.4437352870170306,
+      "learning_rate": 1.5805019736097105e-07,
+      "loss": 0.964,
+      "step": 2505
+    },
+    {
+      "epoch": 1.858782158060337,
+      "grad_norm": 1.489377218702996,
+      "learning_rate": 1.500892498912826e-07,
+      "loss": 0.953,
+      "step": 2510
+    },
+    {
+      "epoch": 1.8624838052933557,
+      "grad_norm": 1.4083382533340174,
+      "learning_rate": 1.4233095840986756e-07,
+      "loss": 0.9395,
+      "step": 2515
+    },
+    {
+      "epoch": 1.8661854525263744,
+      "grad_norm": 1.534772930422051,
+      "learning_rate": 1.3477564710088097e-07,
+      "loss": 0.9518,
+      "step": 2520
+    },
+    {
+      "epoch": 1.869887099759393,
+      "grad_norm": 1.4635427384257957,
+      "learning_rate": 1.2742363166685035e-07,
+      "loss": 0.9302,
+      "step": 2525
+    },
+    {
+      "epoch": 1.8735887469924117,
+      "grad_norm": 1.4435798379334432,
+      "learning_rate": 1.2027521931548214e-07,
+      "loss": 0.9367,
+      "step": 2530
+    },
+    {
+      "epoch": 1.8772903942254304,
+      "grad_norm": 1.5934946790816538,
+      "learning_rate": 1.1333070874682217e-07,
+      "loss": 0.9569,
+      "step": 2535
+    },
+    {
+      "epoch": 1.8809920414584491,
+      "grad_norm": 1.5279098622590994,
+      "learning_rate": 1.0659039014077943e-07,
+      "loss": 0.946,
+      "step": 2540
+    },
+    {
+      "epoch": 1.8846936886914678,
+      "grad_norm": 1.4305237726798707,
+      "learning_rate": 1.0005454514499413e-07,
+      "loss": 0.9393,
+      "step": 2545
+    },
+    {
+      "epoch": 1.8883953359244865,
+      "grad_norm": 1.3607031449762257,
+      "learning_rate": 9.372344686307655e-08,
+      "loss": 0.9208,
+      "step": 2550
+    },
+    {
+      "epoch": 1.8920969831575052,
+      "grad_norm": 1.4516808837913941,
+      "learning_rate": 8.759735984318896e-08,
+      "loss": 0.935,
+      "step": 2555
+    },
+    {
+      "epoch": 1.8957986303905239,
+      "grad_norm": 1.4761229374330997,
+      "learning_rate": 8.167654006699444e-08,
+      "loss": 0.9256,
+      "step": 2560
+    },
+    {
+      "epoch": 1.8995002776235426,
+      "grad_norm": 1.3922043238331803,
+      "learning_rate": 7.59612349389599e-08,
+      "loss": 0.9267,
+      "step": 2565
+    },
+    {
+      "epoch": 1.9032019248565613,
+      "grad_norm": 1.4640748538799848,
+      "learning_rate": 7.04516832760177e-08,
+      "loss": 0.9353,
+      "step": 2570
+    },
+    {
+      "epoch": 1.90690357208958,
+      "grad_norm": 1.4570342845008322,
+      "learning_rate": 6.514811529758747e-08,
+      "loss": 0.9688,
+      "step": 2575
+    },
+    {
+      "epoch": 1.9106052193225986,
+      "grad_norm": 1.5068310410001753,
+      "learning_rate": 6.005075261595495e-08,
+      "loss": 0.9386,
+      "step": 2580
+    },
+    {
+      "epoch": 1.9143068665556173,
+      "grad_norm": 1.6357739595126155,
+      "learning_rate": 5.515980822701439e-08,
+      "loss": 0.9142,
+      "step": 2585
+    },
+    {
+      "epoch": 1.918008513788636,
+      "grad_norm": 1.4936931467730468,
+      "learning_rate": 5.047548650136513e-08,
+      "loss": 0.9457,
+      "step": 2590
+    },
+    {
+      "epoch": 1.9217101610216547,
+      "grad_norm": 1.4313994134689743,
+      "learning_rate": 4.599798317577342e-08,
+      "loss": 0.9445,
+      "step": 2595
+    },
+    {
+      "epoch": 1.9254118082546734,
+      "grad_norm": 1.3922872350713908,
+      "learning_rate": 4.172748534499449e-08,
+      "loss": 0.9436,
+      "step": 2600
+    },
+    {
+      "epoch": 1.9254118082546734,
+      "eval_loss": 1.0094008445739746,
+      "eval_runtime": 8.7208,
+      "eval_samples_per_second": 58.71,
+      "eval_steps_per_second": 14.678,
+      "step": 2600
+    },
+    {
+      "epoch": 1.929113455487692,
+      "grad_norm": 1.4644306273860928,
+      "learning_rate": 3.766417145395218e-08,
+      "loss": 0.8973,
+      "step": 2605
+    },
+    {
+      "epoch": 1.9328151027207108,
+      "grad_norm": 1.4664829295420516,
+      "learning_rate": 3.3808211290284886e-08,
+      "loss": 0.9474,
+      "step": 2610
+    },
+    {
+      "epoch": 1.9365167499537295,
+      "grad_norm": 1.6215118114126437,
+      "learning_rate": 3.015976597725068e-08,
+      "loss": 0.9375,
+      "step": 2615
+    },
+    {
+      "epoch": 1.9402183971867482,
+      "grad_norm": 1.5227133782361597,
+      "learning_rate": 2.6718987966992683e-08,
+      "loss": 0.9311,
+      "step": 2620
+    },
+    {
+      "epoch": 1.9439200444197668,
+      "grad_norm": 1.4393447693954848,
+      "learning_rate": 2.3486021034170857e-08,
+      "loss": 0.9631,
+      "step": 2625
+    },
+    {
+      "epoch": 1.9476216916527855,
+      "grad_norm": 1.5109187749460615,
+      "learning_rate": 2.0461000269953457e-08,
+      "loss": 0.9268,
+      "step": 2630
+    },
+    {
+      "epoch": 1.9513233388858042,
+      "grad_norm": 1.4311275838119613,
+      "learning_rate": 1.7644052076371544e-08,
+      "loss": 0.9497,
+      "step": 2635
+    },
+    {
+      "epoch": 1.955024986118823,
+      "grad_norm": 1.5072494573524384,
+      "learning_rate": 1.5035294161039882e-08,
+      "loss": 0.9454,
+      "step": 2640
+    },
+    {
+      "epoch": 1.9587266333518416,
+      "grad_norm": 1.377355135058928,
+      "learning_rate": 1.2634835532233658e-08,
+      "loss": 0.9502,
+      "step": 2645
+    },
+    {
+      "epoch": 1.9624282805848603,
+      "grad_norm": 1.4159739187815894,
+      "learning_rate": 1.044277649433989e-08,
+      "loss": 0.9266,
+      "step": 2650
+    },
+    {
+      "epoch": 1.966129927817879,
+      "grad_norm": 1.4394933242463066,
+      "learning_rate": 8.459208643659122e-09,
+      "loss": 0.9562,
+      "step": 2655
+    },
+    {
+      "epoch": 1.9698315750508977,
+      "grad_norm": 1.5178638738982297,
+      "learning_rate": 6.6842148645840374e-09,
+      "loss": 0.9454,
+      "step": 2660
+    },
+    {
+      "epoch": 1.9735332222839164,
+      "grad_norm": 1.4200491669164597,
+      "learning_rate": 5.11786932613223e-09,
+      "loss": 0.9401,
+      "step": 2665
+    },
+    {
+      "epoch": 1.977234869516935,
+      "grad_norm": 1.4450080285866973,
+      "learning_rate": 3.760237478849793e-09,
+      "loss": 0.941,
+      "step": 2670
+    },
+    {
+      "epoch": 1.9809365167499537,
+      "grad_norm": 1.4766549402543119,
+      "learning_rate": 2.611376052073511e-09,
+      "loss": 0.9267,
+      "step": 2675
+    },
+    {
+      "epoch": 1.9846381639829724,
+      "grad_norm": 1.4163540389673057,
+      "learning_rate": 1.6713330515627512e-09,
+      "loss": 0.9383,
+      "step": 2680
+    },
+    {
+      "epoch": 1.9883398112159911,
+      "grad_norm": 1.6265977059825938,
+      "learning_rate": 9.401477574932927e-10,
+      "loss": 0.9718,
+      "step": 2685
+    },
+    {
+      "epoch": 1.9920414584490098,
+      "grad_norm": 1.4536437823952244,
+      "learning_rate": 4.178507228136397e-10,
+      "loss": 0.9253,
+      "step": 2690
+    },
+    {
+      "epoch": 1.9957431056820285,
+      "grad_norm": 1.4668631253091544,
+      "learning_rate": 1.0446377197104174e-10,
+      "loss": 0.9352,
+      "step": 2695
+    },
+    {
+      "epoch": 1.9994447529150472,
+      "grad_norm": 1.4405225324032018,
+      "learning_rate": 0.0,
+      "loss": 0.9419,
+      "step": 2700
+    },
+    {
+      "epoch": 1.9994447529150472,
+      "eval_loss": 1.0093477964401245,
+      "eval_runtime": 8.6956,
+      "eval_samples_per_second": 58.88,
+      "eval_steps_per_second": 14.72,
+      "step": 2700
+    },
+    {
+      "epoch": 1.9994447529150472,
+      "step": 2700,
+      "total_flos": 75972590174208.0,
+      "train_loss": 1.0003288196634363,
+      "train_runtime": 4442.862,
+      "train_samples_per_second": 38.911,
+      "train_steps_per_second": 0.608
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 2700,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 75972590174208.0,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}