diff --git "a/checkpoint-11700/trainer_state.json" "b/checkpoint-11700/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/checkpoint-11700/trainer_state.json"
@@ -0,0 +1,9159 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.951564076690212,
+  "eval_steps": 100,
+  "global_step": 11700,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0025227043390514633,
+      "grad_norm": 2.0247561931610107,
+      "learning_rate": 4.995795492768248e-05,
+      "loss": 3.4981,
+      "step": 10
+    },
+    {
+      "epoch": 0.005045408678102927,
+      "grad_norm": 1.5878010988235474,
+      "learning_rate": 4.991590985536496e-05,
+      "loss": 0.2065,
+      "step": 20
+    },
+    {
+      "epoch": 0.0075681130171543895,
+      "grad_norm": 1.1973843574523926,
+      "learning_rate": 4.987386478304743e-05,
+      "loss": 0.1051,
+      "step": 30
+    },
+    {
+      "epoch": 0.010090817356205853,
+      "grad_norm": 1.2424113750457764,
+      "learning_rate": 4.983181971072991e-05,
+      "loss": 0.0835,
+      "step": 40
+    },
+    {
+      "epoch": 0.012613521695257316,
+      "grad_norm": 0.5049477219581604,
+      "learning_rate": 4.9789774638412376e-05,
+      "loss": 0.0712,
+      "step": 50
+    },
+    {
+      "epoch": 0.015136226034308779,
+      "grad_norm": 0.6395494937896729,
+      "learning_rate": 4.974772956609486e-05,
+      "loss": 0.0777,
+      "step": 60
+    },
+    {
+      "epoch": 0.017658930373360242,
+      "grad_norm": 0.5573010444641113,
+      "learning_rate": 4.970568449377733e-05,
+      "loss": 0.0627,
+      "step": 70
+    },
+    {
+      "epoch": 0.020181634712411706,
+      "grad_norm": 0.5474572777748108,
+      "learning_rate": 4.9663639421459806e-05,
+      "loss": 0.0553,
+      "step": 80
+    },
+    {
+      "epoch": 0.022704339051463168,
+      "grad_norm": 0.4932677149772644,
+      "learning_rate": 4.962159434914228e-05,
+      "loss": 0.0455,
+      "step": 90
+    },
+    {
+      "epoch": 0.025227043390514632,
+      "grad_norm": 0.4285220205783844,
+      "learning_rate": 4.957954927682476e-05,
+      "loss": 0.0403,
+      "step": 100
+    },
+    {
+      "epoch": 0.025227043390514632,
+      "eval_loss": 0.04899341240525246,
+      "eval_runtime": 21.0493,
+      "eval_samples_per_second": 83.708,
+      "eval_steps_per_second": 20.951,
+      "step": 100
+    },
+    {
+      "epoch": 0.027749747729566093,
+      "grad_norm": 0.5163519978523254,
+      "learning_rate": 4.953750420450724e-05,
+      "loss": 0.0511,
+      "step": 110
+    },
+    {
+      "epoch": 0.030272452068617558,
+      "grad_norm": 0.42003411054611206,
+      "learning_rate": 4.949545913218971e-05,
+      "loss": 0.0353,
+      "step": 120
+    },
+    {
+      "epoch": 0.03279515640766902,
+      "grad_norm": 0.35371342301368713,
+      "learning_rate": 4.945341405987218e-05,
+      "loss": 0.035,
+      "step": 130
+    },
+    {
+      "epoch": 0.035317860746720484,
+      "grad_norm": 0.4225226938724518,
+      "learning_rate": 4.941136898755466e-05,
+      "loss": 0.0368,
+      "step": 140
+    },
+    {
+      "epoch": 0.037840565085771945,
+      "grad_norm": 0.5139334201812744,
+      "learning_rate": 4.9369323915237136e-05,
+      "loss": 0.0411,
+      "step": 150
+    },
+    {
+      "epoch": 0.04036326942482341,
+      "grad_norm": 0.44391313195228577,
+      "learning_rate": 4.932727884291961e-05,
+      "loss": 0.0336,
+      "step": 160
+    },
+    {
+      "epoch": 0.042885973763874874,
+      "grad_norm": 0.4110734462738037,
+      "learning_rate": 4.928523377060209e-05,
+      "loss": 0.0353,
+      "step": 170
+    },
+    {
+      "epoch": 0.045408678102926335,
+      "grad_norm": 0.4065672755241394,
+      "learning_rate": 4.924318869828457e-05,
+      "loss": 0.0302,
+      "step": 180
+    },
+    {
+      "epoch": 0.0479313824419778,
+      "grad_norm": 0.3082253336906433,
+      "learning_rate": 4.920114362596704e-05,
+      "loss": 0.0335,
+      "step": 190
+    },
+    {
+      "epoch": 0.050454086781029264,
+      "grad_norm": 0.36431366205215454,
+      "learning_rate": 4.9159098553649516e-05,
+      "loss": 0.0359,
+      "step": 200
+    },
+    {
+      "epoch": 0.050454086781029264,
+      "eval_loss": 0.03114187717437744,
+      "eval_runtime": 20.9213,
+      "eval_samples_per_second": 84.22,
+      "eval_steps_per_second": 21.079,
+      "step": 200
+    },
+    {
+      "epoch": 0.052976791120080725,
+      "grad_norm": 0.3635391294956207,
+      "learning_rate": 4.911705348133199e-05,
+      "loss": 0.0314,
+      "step": 210
+    },
+    {
+      "epoch": 0.055499495459132187,
+      "grad_norm": 0.3692784905433655,
+      "learning_rate": 4.9075008409014465e-05,
+      "loss": 0.0355,
+      "step": 220
+    },
+    {
+      "epoch": 0.058022199798183655,
+      "grad_norm": 0.3028589189052582,
+      "learning_rate": 4.903296333669694e-05,
+      "loss": 0.0331,
+      "step": 230
+    },
+    {
+      "epoch": 0.060544904137235116,
+      "grad_norm": 0.31779277324676514,
+      "learning_rate": 4.8990918264379415e-05,
+      "loss": 0.0218,
+      "step": 240
+    },
+    {
+      "epoch": 0.06306760847628658,
+      "grad_norm": 0.36319565773010254,
+      "learning_rate": 4.8948873192061896e-05,
+      "loss": 0.02,
+      "step": 250
+    },
+    {
+      "epoch": 0.06559031281533804,
+      "grad_norm": 0.30493369698524475,
+      "learning_rate": 4.890682811974437e-05,
+      "loss": 0.0321,
+      "step": 260
+    },
+    {
+      "epoch": 0.0681130171543895,
+      "grad_norm": 0.4685748815536499,
+      "learning_rate": 4.8864783047426845e-05,
+      "loss": 0.0273,
+      "step": 270
+    },
+    {
+      "epoch": 0.07063572149344097,
+      "grad_norm": 0.45751261711120605,
+      "learning_rate": 4.882273797510932e-05,
+      "loss": 0.0293,
+      "step": 280
+    },
+    {
+      "epoch": 0.07315842583249244,
+      "grad_norm": 0.3978378474712372,
+      "learning_rate": 4.8780692902791795e-05,
+      "loss": 0.0279,
+      "step": 290
+    },
+    {
+      "epoch": 0.07568113017154389,
+      "grad_norm": 0.22009262442588806,
+      "learning_rate": 4.873864783047427e-05,
+      "loss": 0.0236,
+      "step": 300
+    },
+    {
+      "epoch": 0.07568113017154389,
+      "eval_loss": 0.027077585458755493,
+      "eval_runtime": 20.9736,
+      "eval_samples_per_second": 84.01,
+      "eval_steps_per_second": 21.026,
+      "step": 300
+    },
+    {
+      "epoch": 0.07820383451059536,
+      "grad_norm": 0.4900023937225342,
+      "learning_rate": 4.8696602758156744e-05,
+      "loss": 0.0269,
+      "step": 310
+    },
+    {
+      "epoch": 0.08072653884964683,
+      "grad_norm": 0.4146521985530853,
+      "learning_rate": 4.865455768583922e-05,
+      "loss": 0.0427,
+      "step": 320
+    },
+    {
+      "epoch": 0.08324924318869828,
+      "grad_norm": 0.3285127580165863,
+      "learning_rate": 4.86125126135217e-05,
+      "loss": 0.0272,
+      "step": 330
+    },
+    {
+      "epoch": 0.08577194752774975,
+      "grad_norm": 0.49365487694740295,
+      "learning_rate": 4.8570467541204175e-05,
+      "loss": 0.0263,
+      "step": 340
+    },
+    {
+      "epoch": 0.08829465186680122,
+      "grad_norm": 0.3131512701511383,
+      "learning_rate": 4.852842246888665e-05,
+      "loss": 0.0193,
+      "step": 350
+    },
+    {
+      "epoch": 0.09081735620585267,
+      "grad_norm": 0.29098790884017944,
+      "learning_rate": 4.8486377396569124e-05,
+      "loss": 0.0262,
+      "step": 360
+    },
+    {
+      "epoch": 0.09334006054490414,
+      "grad_norm": 0.2346099615097046,
+      "learning_rate": 4.84443323242516e-05,
+      "loss": 0.0203,
+      "step": 370
+    },
+    {
+      "epoch": 0.0958627648839556,
+      "grad_norm": 0.31069958209991455,
+      "learning_rate": 4.8402287251934074e-05,
+      "loss": 0.0199,
+      "step": 380
+    },
+    {
+      "epoch": 0.09838546922300706,
+      "grad_norm": 0.36899533867836,
+      "learning_rate": 4.836024217961655e-05,
+      "loss": 0.0236,
+      "step": 390
+    },
+    {
+      "epoch": 0.10090817356205853,
+      "grad_norm": 0.2813776135444641,
+      "learning_rate": 4.831819710729903e-05,
+      "loss": 0.019,
+      "step": 400
+    },
+    {
+      "epoch": 0.10090817356205853,
+      "eval_loss": 0.02430903911590576,
+      "eval_runtime": 20.9604,
+      "eval_samples_per_second": 84.063,
+      "eval_steps_per_second": 21.04,
+      "step": 400
+    },
+    {
+      "epoch": 0.10343087790111,
+      "grad_norm": 0.4171730577945709,
+      "learning_rate": 4.8276152034981504e-05,
+      "loss": 0.0179,
+      "step": 410
+    },
+    {
+      "epoch": 0.10595358224016145,
+      "grad_norm": 0.30979079008102417,
+      "learning_rate": 4.823410696266398e-05,
+      "loss": 0.0238,
+      "step": 420
+    },
+    {
+      "epoch": 0.10847628657921292,
+      "grad_norm": 0.2565608620643616,
+      "learning_rate": 4.8192061890346454e-05,
+      "loss": 0.0216,
+      "step": 430
+    },
+    {
+      "epoch": 0.11099899091826437,
+      "grad_norm": 0.2515753507614136,
+      "learning_rate": 4.815001681802893e-05,
+      "loss": 0.0187,
+      "step": 440
+    },
+    {
+      "epoch": 0.11352169525731584,
+      "grad_norm": 0.49704504013061523,
+      "learning_rate": 4.81079717457114e-05,
+      "loss": 0.0276,
+      "step": 450
+    },
+    {
+      "epoch": 0.11604439959636731,
+      "grad_norm": 0.215419739484787,
+      "learning_rate": 4.806592667339388e-05,
+      "loss": 0.0174,
+      "step": 460
+    },
+    {
+      "epoch": 0.11856710393541876,
+      "grad_norm": 0.3299199044704437,
+      "learning_rate": 4.802388160107635e-05,
+      "loss": 0.0229,
+      "step": 470
+    },
+    {
+      "epoch": 0.12108980827447023,
+      "grad_norm": 0.2970931828022003,
+      "learning_rate": 4.7981836528758834e-05,
+      "loss": 0.026,
+      "step": 480
+    },
+    {
+      "epoch": 0.1236125126135217,
+      "grad_norm": 0.250384658575058,
+      "learning_rate": 4.793979145644131e-05,
+      "loss": 0.0189,
+      "step": 490
+    },
+    {
+      "epoch": 0.12613521695257315,
+      "grad_norm": 0.2535875141620636,
+      "learning_rate": 4.789774638412378e-05,
+      "loss": 0.0202,
+      "step": 500
+    },
+    {
+      "epoch": 0.12613521695257315,
+      "eval_loss": 0.023427454754710197,
+      "eval_runtime": 20.8805,
+      "eval_samples_per_second": 84.385,
+      "eval_steps_per_second": 21.12,
+      "step": 500
+    },
+    {
+      "epoch": 0.12865792129162462,
+      "grad_norm": 0.3403398096561432,
+      "learning_rate": 4.7855701311806265e-05,
+      "loss": 0.0217,
+      "step": 510
+    },
+    {
+      "epoch": 0.1311806256306761,
+      "grad_norm": 0.3057946264743805,
+      "learning_rate": 4.781365623948873e-05,
+      "loss": 0.0191,
+      "step": 520
+    },
+    {
+      "epoch": 0.13370332996972756,
+      "grad_norm": 0.25819921493530273,
+      "learning_rate": 4.777161116717121e-05,
+      "loss": 0.0193,
+      "step": 530
+    },
+    {
+      "epoch": 0.136226034308779,
+      "grad_norm": 0.29612287878990173,
+      "learning_rate": 4.772956609485368e-05,
+      "loss": 0.0208,
+      "step": 540
+    },
+    {
+      "epoch": 0.13874873864783047,
+      "grad_norm": 0.2910136580467224,
+      "learning_rate": 4.768752102253616e-05,
+      "loss": 0.0173,
+      "step": 550
+    },
+    {
+      "epoch": 0.14127144298688193,
+      "grad_norm": 0.3340837359428406,
+      "learning_rate": 4.764547595021864e-05,
+      "loss": 0.0151,
+      "step": 560
+    },
+    {
+      "epoch": 0.1437941473259334,
+      "grad_norm": 0.3069872260093689,
+      "learning_rate": 4.760343087790111e-05,
+      "loss": 0.0166,
+      "step": 570
+    },
+    {
+      "epoch": 0.14631685166498487,
+      "grad_norm": 0.24297013878822327,
+      "learning_rate": 4.756138580558359e-05,
+      "loss": 0.0228,
+      "step": 580
+    },
+    {
+      "epoch": 0.14883955600403634,
+      "grad_norm": 0.28086113929748535,
+      "learning_rate": 4.751934073326607e-05,
+      "loss": 0.0221,
+      "step": 590
+    },
+    {
+      "epoch": 0.15136226034308778,
+      "grad_norm": 0.26318562030792236,
+      "learning_rate": 4.7477295660948536e-05,
+      "loss": 0.0281,
+      "step": 600
+    },
+    {
+      "epoch": 0.15136226034308778,
+      "eval_loss": 0.02024998515844345,
+      "eval_runtime": 20.8969,
+      "eval_samples_per_second": 84.319,
+      "eval_steps_per_second": 21.104,
+      "step": 600
+    },
+    {
+      "epoch": 0.15388496468213925,
+      "grad_norm": 0.27950412034988403,
+      "learning_rate": 4.743525058863101e-05,
+      "loss": 0.0169,
+      "step": 610
+    },
+    {
+      "epoch": 0.15640766902119072,
+      "grad_norm": 0.20953460037708282,
+      "learning_rate": 4.7393205516313486e-05,
+      "loss": 0.0213,
+      "step": 620
+    },
+    {
+      "epoch": 0.15893037336024218,
+      "grad_norm": 0.2499765157699585,
+      "learning_rate": 4.735116044399597e-05,
+      "loss": 0.0191,
+      "step": 630
+    },
+    {
+      "epoch": 0.16145307769929365,
+      "grad_norm": 0.3006986975669861,
+      "learning_rate": 4.730911537167844e-05,
+      "loss": 0.0269,
+      "step": 640
+    },
+    {
+      "epoch": 0.16397578203834512,
+      "grad_norm": 0.24447965621948242,
+      "learning_rate": 4.7267070299360917e-05,
+      "loss": 0.0193,
+      "step": 650
+    },
+    {
+      "epoch": 0.16649848637739656,
+      "grad_norm": 0.319516122341156,
+      "learning_rate": 4.722502522704339e-05,
+      "loss": 0.0222,
+      "step": 660
+    },
+    {
+      "epoch": 0.16902119071644803,
+      "grad_norm": 0.30482372641563416,
+      "learning_rate": 4.718298015472587e-05,
+      "loss": 0.0165,
+      "step": 670
+    },
+    {
+      "epoch": 0.1715438950554995,
+      "grad_norm": 0.18806371092796326,
+      "learning_rate": 4.714093508240835e-05,
+      "loss": 0.014,
+      "step": 680
+    },
+    {
+      "epoch": 0.17406659939455096,
+      "grad_norm": 0.21826079487800598,
+      "learning_rate": 4.7098890010090815e-05,
+      "loss": 0.0192,
+      "step": 690
+    },
+    {
+      "epoch": 0.17658930373360243,
+      "grad_norm": 0.2127252221107483,
+      "learning_rate": 4.70568449377733e-05,
+      "loss": 0.0142,
+      "step": 700
+    },
+    {
+      "epoch": 0.17658930373360243,
+      "eval_loss": 0.018979934975504875,
+      "eval_runtime": 20.8994,
+      "eval_samples_per_second": 84.309,
+      "eval_steps_per_second": 21.101,
+      "step": 700
+    },
+    {
+      "epoch": 0.17911200807265387,
+      "grad_norm": 0.23581889271736145,
+      "learning_rate": 4.701479986545577e-05,
+      "loss": 0.0199,
+      "step": 710
+    },
+    {
+      "epoch": 0.18163471241170534,
+      "grad_norm": 0.18842558562755585,
+      "learning_rate": 4.6972754793138246e-05,
+      "loss": 0.0194,
+      "step": 720
+    },
+    {
+      "epoch": 0.1841574167507568,
+      "grad_norm": 0.29515010118484497,
+      "learning_rate": 4.693070972082072e-05,
+      "loss": 0.0299,
+      "step": 730
+    },
+    {
+      "epoch": 0.18668012108980828,
+      "grad_norm": 0.27162402868270874,
+      "learning_rate": 4.68886646485032e-05,
+      "loss": 0.0227,
+      "step": 740
+    },
+    {
+      "epoch": 0.18920282542885974,
+      "grad_norm": 0.18802249431610107,
+      "learning_rate": 4.684661957618568e-05,
+      "loss": 0.0169,
+      "step": 750
+    },
+    {
+      "epoch": 0.1917255297679112,
+      "grad_norm": 0.34699660539627075,
+      "learning_rate": 4.680457450386815e-05,
+      "loss": 0.0159,
+      "step": 760
+    },
+    {
+      "epoch": 0.19424823410696265,
+      "grad_norm": 0.3048790693283081,
+      "learning_rate": 4.676252943155062e-05,
+      "loss": 0.0178,
+      "step": 770
+    },
+    {
+      "epoch": 0.19677093844601412,
+      "grad_norm": 0.2703554332256317,
+      "learning_rate": 4.67204843592331e-05,
+      "loss": 0.0136,
+      "step": 780
+    },
+    {
+      "epoch": 0.1992936427850656,
+      "grad_norm": 0.18560905754566193,
+      "learning_rate": 4.6678439286915575e-05,
+      "loss": 0.0202,
+      "step": 790
+    },
+    {
+      "epoch": 0.20181634712411706,
+      "grad_norm": 0.2768602967262268,
+      "learning_rate": 4.663639421459805e-05,
+      "loss": 0.0283,
+      "step": 800
+    },
+    {
+      "epoch": 0.20181634712411706,
+      "eval_loss": 0.01911783590912819,
+      "eval_runtime": 20.9061,
+      "eval_samples_per_second": 84.282,
+      "eval_steps_per_second": 21.094,
+      "step": 800
+    },
+    {
+      "epoch": 0.20433905146316853,
+      "grad_norm": 0.18893299996852875,
+      "learning_rate": 4.6594349142280525e-05,
+      "loss": 0.0205,
+      "step": 810
+    },
+    {
+      "epoch": 0.20686175580222,
+      "grad_norm": 0.24870939552783966,
+      "learning_rate": 4.6552304069963006e-05,
+      "loss": 0.0185,
+      "step": 820
+    },
+    {
+      "epoch": 0.20938446014127143,
+      "grad_norm": 0.2561938464641571,
+      "learning_rate": 4.651025899764548e-05,
+      "loss": 0.0144,
+      "step": 830
+    },
+    {
+      "epoch": 0.2119071644803229,
+      "grad_norm": 0.22478680312633514,
+      "learning_rate": 4.6468213925327956e-05,
+      "loss": 0.0213,
+      "step": 840
+    },
+    {
+      "epoch": 0.21442986881937437,
+      "grad_norm": 0.30591025948524475,
+      "learning_rate": 4.642616885301043e-05,
+      "loss": 0.0129,
+      "step": 850
+    },
+    {
+      "epoch": 0.21695257315842584,
+      "grad_norm": 0.21737821400165558,
+      "learning_rate": 4.6384123780692905e-05,
+      "loss": 0.0181,
+      "step": 860
+    },
+    {
+      "epoch": 0.2194752774974773,
+      "grad_norm": 0.20260506868362427,
+      "learning_rate": 4.634207870837538e-05,
+      "loss": 0.0144,
+      "step": 870
+    },
+    {
+      "epoch": 0.22199798183652875,
+      "grad_norm": 0.21997040510177612,
+      "learning_rate": 4.6300033636057854e-05,
+      "loss": 0.0215,
+      "step": 880
+    },
+    {
+      "epoch": 0.22452068617558021,
+      "grad_norm": 0.2595633864402771,
+      "learning_rate": 4.6257988563740336e-05,
+      "loss": 0.0159,
+      "step": 890
+    },
+    {
+      "epoch": 0.22704339051463168,
+      "grad_norm": 0.16551759839057922,
+      "learning_rate": 4.621594349142281e-05,
+      "loss": 0.0217,
+      "step": 900
+    },
+    {
+      "epoch": 0.22704339051463168,
+      "eval_loss": 0.017788389697670937,
+      "eval_runtime": 20.9214,
+      "eval_samples_per_second": 84.22,
+      "eval_steps_per_second": 21.079,
+      "step": 900
+    },
+    {
+      "epoch": 0.22956609485368315,
+      "grad_norm": 0.27989163994789124,
+      "learning_rate": 4.6173898419105285e-05,
+      "loss": 0.0166,
+      "step": 910
+    },
+    {
+      "epoch": 0.23208879919273462,
+      "grad_norm": 0.1843036413192749,
+      "learning_rate": 4.613185334678776e-05,
+      "loss": 0.0148,
+      "step": 920
+    },
+    {
+      "epoch": 0.2346115035317861,
+      "grad_norm": 0.2792811691761017,
+      "learning_rate": 4.6089808274470234e-05,
+      "loss": 0.014,
+      "step": 930
+    },
+    {
+      "epoch": 0.23713420787083753,
+      "grad_norm": 0.4182822108268738,
+      "learning_rate": 4.604776320215271e-05,
+      "loss": 0.013,
+      "step": 940
+    },
+    {
+      "epoch": 0.239656912209889,
+      "grad_norm": 0.17877671122550964,
+      "learning_rate": 4.6005718129835184e-05,
+      "loss": 0.0207,
+      "step": 950
+    },
+    {
+      "epoch": 0.24217961654894046,
+      "grad_norm": 0.21961210668087006,
+      "learning_rate": 4.596367305751766e-05,
+      "loss": 0.0168,
+      "step": 960
+    },
+    {
+      "epoch": 0.24470232088799193,
+      "grad_norm": 0.12489340454339981,
+      "learning_rate": 4.592162798520014e-05,
+      "loss": 0.0177,
+      "step": 970
+    },
+    {
+      "epoch": 0.2472250252270434,
+      "grad_norm": 0.24905265867710114,
+      "learning_rate": 4.5879582912882614e-05,
+      "loss": 0.0126,
+      "step": 980
+    },
+    {
+      "epoch": 0.24974772956609487,
+      "grad_norm": 0.14141976833343506,
+      "learning_rate": 4.583753784056509e-05,
+      "loss": 0.0102,
+      "step": 990
+    },
+    {
+      "epoch": 0.2522704339051463,
+      "grad_norm": 0.19035248458385468,
+      "learning_rate": 4.5795492768247564e-05,
+      "loss": 0.0143,
+      "step": 1000
+    },
+    {
+      "epoch": 0.2522704339051463,
+      "eval_loss": 0.016077525913715363,
+      "eval_runtime": 21.0175,
+      "eval_samples_per_second": 83.835,
+      "eval_steps_per_second": 20.983,
+      "step": 1000
+    },
+    {
+      "epoch": 0.2547931382441978,
+      "grad_norm": 0.2033461332321167,
+      "learning_rate": 4.575344769593004e-05,
+      "loss": 0.0172,
+      "step": 1010
+    },
+    {
+      "epoch": 0.25731584258324924,
+      "grad_norm": 0.17932486534118652,
+      "learning_rate": 4.571140262361251e-05,
+      "loss": 0.0223,
+      "step": 1020
+    },
+    {
+      "epoch": 0.2598385469223007,
+      "grad_norm": 0.16702575981616974,
+      "learning_rate": 4.566935755129499e-05,
+      "loss": 0.0117,
+      "step": 1030
+    },
+    {
+      "epoch": 0.2623612512613522,
+      "grad_norm": 0.24906021356582642,
+      "learning_rate": 4.562731247897747e-05,
+      "loss": 0.0136,
+      "step": 1040
+    },
+    {
+      "epoch": 0.2648839556004036,
+      "grad_norm": 0.2807481586933136,
+      "learning_rate": 4.5585267406659944e-05,
+      "loss": 0.0161,
+      "step": 1050
+    },
+    {
+      "epoch": 0.2674066599394551,
+      "grad_norm": 0.25573644042015076,
+      "learning_rate": 4.554322233434242e-05,
+      "loss": 0.0161,
+      "step": 1060
+    },
+    {
+      "epoch": 0.26992936427850656,
+      "grad_norm": 0.20996974408626556,
+      "learning_rate": 4.550117726202489e-05,
+      "loss": 0.0099,
+      "step": 1070
+    },
+    {
+      "epoch": 0.272452068617558,
+      "grad_norm": 0.18074114620685577,
+      "learning_rate": 4.545913218970737e-05,
+      "loss": 0.0142,
+      "step": 1080
+    },
+    {
+      "epoch": 0.2749747729566095,
+      "grad_norm": 0.11202214658260345,
+      "learning_rate": 4.541708711738984e-05,
+      "loss": 0.0148,
+      "step": 1090
+    },
+    {
+      "epoch": 0.27749747729566093,
+      "grad_norm": 1.3392621278762817,
+      "learning_rate": 4.537504204507232e-05,
+      "loss": 0.028,
+      "step": 1100
+    },
+    {
+      "epoch": 0.27749747729566093,
+      "eval_loss": 0.015062345191836357,
+      "eval_runtime": 20.9469,
+      "eval_samples_per_second": 84.117,
+      "eval_steps_per_second": 21.053,
+      "step": 1100
+    },
+    {
+      "epoch": 0.28002018163471243,
+      "grad_norm": 0.26927411556243896,
+      "learning_rate": 4.533299697275479e-05,
+      "loss": 0.0149,
+      "step": 1110
+    },
+    {
+      "epoch": 0.28254288597376387,
+      "grad_norm": 0.25918543338775635,
+      "learning_rate": 4.529095190043727e-05,
+      "loss": 0.0157,
+      "step": 1120
+    },
+    {
+      "epoch": 0.28506559031281536,
+      "grad_norm": 0.10137899965047836,
+      "learning_rate": 4.524890682811975e-05,
+      "loss": 0.0117,
+      "step": 1130
+    },
+    {
+      "epoch": 0.2875882946518668,
+      "grad_norm": 0.1916513592004776,
+      "learning_rate": 4.520686175580222e-05,
+      "loss": 0.0176,
+      "step": 1140
+    },
+    {
+      "epoch": 0.29011099899091825,
+      "grad_norm": 0.3005896210670471,
+      "learning_rate": 4.51648166834847e-05,
+      "loss": 0.0122,
+      "step": 1150
+    },
+    {
+      "epoch": 0.29263370332996974,
+      "grad_norm": 0.24127791821956635,
+      "learning_rate": 4.512277161116717e-05,
+      "loss": 0.0108,
+      "step": 1160
+    },
+    {
+      "epoch": 0.2951564076690212,
+      "grad_norm": 0.28272244334220886,
+      "learning_rate": 4.5080726538849647e-05,
+      "loss": 0.0119,
+      "step": 1170
+    },
+    {
+      "epoch": 0.2976791120080727,
+      "grad_norm": 0.36542513966560364,
+      "learning_rate": 4.503868146653212e-05,
+      "loss": 0.0122,
+      "step": 1180
+    },
+    {
+      "epoch": 0.3002018163471241,
+      "grad_norm": 0.26852190494537354,
+      "learning_rate": 4.49966363942146e-05,
+      "loss": 0.0162,
+      "step": 1190
+    },
+    {
+      "epoch": 0.30272452068617556,
+      "grad_norm": 0.26203736662864685,
+      "learning_rate": 4.495459132189708e-05,
+      "loss": 0.0125,
+      "step": 1200
+    },
+    {
+      "epoch": 0.30272452068617556,
+      "eval_loss": 0.0145474998280406,
+      "eval_runtime": 20.9732,
+      "eval_samples_per_second": 84.012,
+      "eval_steps_per_second": 21.027,
+      "step": 1200
+    },
+    {
+      "epoch": 0.30524722502522705,
+      "grad_norm": 0.31206783652305603,
+      "learning_rate": 4.491254624957955e-05,
+      "loss": 0.0241,
+      "step": 1210
+    },
+    {
+      "epoch": 0.3077699293642785,
+      "grad_norm": 0.17130957543849945,
+      "learning_rate": 4.487050117726203e-05,
+      "loss": 0.0174,
+      "step": 1220
+    },
+    {
+      "epoch": 0.31029263370333,
+      "grad_norm": 0.3070640563964844,
+      "learning_rate": 4.482845610494451e-05,
+      "loss": 0.023,
+      "step": 1230
+    },
+    {
+      "epoch": 0.31281533804238143,
+      "grad_norm": 0.5285329818725586,
+      "learning_rate": 4.4786411032626976e-05,
+      "loss": 0.0115,
+      "step": 1240
+    },
+    {
+      "epoch": 0.31533804238143287,
+      "grad_norm": 0.21449489891529083,
+      "learning_rate": 4.474436596030945e-05,
+      "loss": 0.0205,
+      "step": 1250
+    },
+    {
+      "epoch": 0.31786074672048437,
+      "grad_norm": 0.18218982219696045,
+      "learning_rate": 4.4702320887991925e-05,
+      "loss": 0.0062,
+      "step": 1260
+    },
+    {
+      "epoch": 0.3203834510595358,
+      "grad_norm": 0.03409017622470856,
+      "learning_rate": 4.466027581567441e-05,
+      "loss": 0.02,
+      "step": 1270
+    },
+    {
+      "epoch": 0.3229061553985873,
+      "grad_norm": 0.2536049783229828,
+      "learning_rate": 4.461823074335688e-05,
+      "loss": 0.0146,
+      "step": 1280
+    },
+    {
+      "epoch": 0.32542885973763874,
+      "grad_norm": 0.17619676887989044,
+      "learning_rate": 4.4576185671039356e-05,
+      "loss": 0.0074,
+      "step": 1290
+    },
+    {
+      "epoch": 0.32795156407669024,
+      "grad_norm": 0.1441410630941391,
+      "learning_rate": 4.453414059872183e-05,
+      "loss": 0.013,
+      "step": 1300
+    },
+    {
+      "epoch": 0.32795156407669024,
+      "eval_loss": 0.012744620442390442,
+      "eval_runtime": 21.0058,
+      "eval_samples_per_second": 83.882,
+      "eval_steps_per_second": 20.994,
+      "step": 1300
+    },
+    {
+      "epoch": 0.3304742684157417,
+      "grad_norm": 0.18683987855911255,
+      "learning_rate": 4.449209552640431e-05,
+      "loss": 0.0119,
+      "step": 1310
+    },
+    {
+      "epoch": 0.3329969727547931,
+      "grad_norm": 0.16165736317634583,
+      "learning_rate": 4.445005045408678e-05,
+      "loss": 0.0113,
+      "step": 1320
+    },
+    {
+      "epoch": 0.3355196770938446,
+      "grad_norm": 0.1178312599658966,
+      "learning_rate": 4.4408005381769255e-05,
+      "loss": 0.0113,
+      "step": 1330
+    },
+    {
+      "epoch": 0.33804238143289606,
+      "grad_norm": 0.1859322488307953,
+      "learning_rate": 4.436596030945173e-05,
+      "loss": 0.0104,
+      "step": 1340
+    },
+    {
+      "epoch": 0.34056508577194755,
+      "grad_norm": 0.49083656072616577,
+      "learning_rate": 4.432391523713421e-05,
+      "loss": 0.0095,
+      "step": 1350
+    },
+    {
+      "epoch": 0.343087790110999,
+      "grad_norm": 0.14915814995765686,
+      "learning_rate": 4.4281870164816686e-05,
+      "loss": 0.013,
+      "step": 1360
+    },
+    {
+      "epoch": 0.34561049445005043,
+      "grad_norm": 0.16166740655899048,
+      "learning_rate": 4.423982509249916e-05,
+      "loss": 0.0104,
+      "step": 1370
+    },
+    {
+      "epoch": 0.3481331987891019,
+      "grad_norm": 0.19710753858089447,
+      "learning_rate": 4.419778002018164e-05,
+      "loss": 0.0094,
+      "step": 1380
+    },
+    {
+      "epoch": 0.35065590312815337,
+      "grad_norm": 0.12713222205638885,
+      "learning_rate": 4.4155734947864116e-05,
+      "loss": 0.0122,
+      "step": 1390
+    },
+    {
+      "epoch": 0.35317860746720486,
+      "grad_norm": 0.11732326447963715,
+      "learning_rate": 4.411368987554659e-05,
+      "loss": 0.0127,
+      "step": 1400
+    },
+    {
+      "epoch": 0.35317860746720486,
+      "eval_loss": 0.010630101896822453,
+      "eval_runtime": 20.9885,
+      "eval_samples_per_second": 83.951,
+      "eval_steps_per_second": 21.012,
+      "step": 1400
+    },
+    {
+      "epoch": 0.3557013118062563,
+      "grad_norm": 0.16016925871372223,
+      "learning_rate": 4.407164480322906e-05,
+      "loss": 0.0035,
+      "step": 1410
+    },
+    {
+      "epoch": 0.35822401614530774,
+      "grad_norm": 0.11872086673974991,
+      "learning_rate": 4.402959973091154e-05,
+      "loss": 0.0049,
+      "step": 1420
+    },
+    {
+      "epoch": 0.36074672048435924,
+      "grad_norm": 0.15516729652881622,
+      "learning_rate": 4.3987554658594015e-05,
+      "loss": 0.0144,
+      "step": 1430
+    },
+    {
+      "epoch": 0.3632694248234107,
+      "grad_norm": 0.18500037491321564,
+      "learning_rate": 4.394550958627649e-05,
+      "loss": 0.0113,
+      "step": 1440
+    },
+    {
+      "epoch": 0.3657921291624622,
+      "grad_norm": 0.17393891513347626,
+      "learning_rate": 4.3903464513958964e-05,
+      "loss": 0.0102,
+      "step": 1450
+    },
+    {
+      "epoch": 0.3683148335015136,
+      "grad_norm": 0.24622410535812378,
+      "learning_rate": 4.3861419441641446e-05,
+      "loss": 0.0099,
+      "step": 1460
+    },
+    {
+      "epoch": 0.3708375378405651,
+      "grad_norm": 0.18613703548908234,
+      "learning_rate": 4.381937436932392e-05,
+      "loss": 0.0111,
+      "step": 1470
+    },
+    {
+      "epoch": 0.37336024217961655,
+      "grad_norm": 0.23599150776863098,
+      "learning_rate": 4.3777329297006395e-05,
+      "loss": 0.0152,
+      "step": 1480
+    },
+    {
+      "epoch": 0.375882946518668,
+      "grad_norm": 0.08963891863822937,
+      "learning_rate": 4.373528422468886e-05,
+      "loss": 0.0156,
+      "step": 1490
+    },
+    {
+      "epoch": 0.3784056508577195,
+      "grad_norm": 0.26133468747138977,
+      "learning_rate": 4.3693239152371344e-05,
+      "loss": 0.0185,
+      "step": 1500
+    },
+    {
+      "epoch": 0.3784056508577195,
+      "eval_loss": 0.010692655108869076,
+      "eval_runtime": 21.0428,
+      "eval_samples_per_second": 83.734,
+      "eval_steps_per_second": 20.957,
+      "step": 1500
+    },
+    {
+      "epoch": 0.38092835519677093,
+      "grad_norm": 0.07590801268815994,
+      "learning_rate": 4.365119408005382e-05,
+      "loss": 0.0102,
+      "step": 1510
+    },
+    {
+      "epoch": 0.3834510595358224,
+      "grad_norm": 0.047652024775743484,
+      "learning_rate": 4.3609149007736294e-05,
+      "loss": 0.0095,
+      "step": 1520
+    },
+    {
+      "epoch": 0.38597376387487387,
+      "grad_norm": 0.23399275541305542,
+      "learning_rate": 4.3567103935418775e-05,
+      "loss": 0.0089,
+      "step": 1530
+    },
+    {
+      "epoch": 0.3884964682139253,
+      "grad_norm": 0.2155078798532486,
+      "learning_rate": 4.352505886310125e-05,
+      "loss": 0.0115,
+      "step": 1540
+    },
+    {
+      "epoch": 0.3910191725529768,
+      "grad_norm": 0.09053190052509308,
+      "learning_rate": 4.3483013790783725e-05,
+      "loss": 0.0088,
+      "step": 1550
+    },
+    {
+      "epoch": 0.39354187689202824,
+      "grad_norm": 0.2110535055398941,
+      "learning_rate": 4.34409687184662e-05,
+      "loss": 0.0098,
+      "step": 1560
+    },
+    {
+      "epoch": 0.39606458123107974,
+      "grad_norm": 0.1765887439250946,
+      "learning_rate": 4.3398923646148674e-05,
+      "loss": 0.0072,
+      "step": 1570
+    },
+    {
+      "epoch": 0.3985872855701312,
+      "grad_norm": 0.3545493483543396,
+      "learning_rate": 4.335687857383115e-05,
+      "loss": 0.0132,
+      "step": 1580
+    },
+    {
+      "epoch": 0.4011099899091826,
+      "grad_norm": 0.06623344123363495,
+      "learning_rate": 4.331483350151362e-05,
+      "loss": 0.0069,
+      "step": 1590
+    },
+    {
+      "epoch": 0.4036326942482341,
+      "grad_norm": 0.16485914587974548,
+      "learning_rate": 4.32727884291961e-05,
+      "loss": 0.007,
+      "step": 1600
+    },
+    {
+      "epoch": 0.4036326942482341,
+      "eval_loss": 0.01017470471560955,
+      "eval_runtime": 20.8918,
+      "eval_samples_per_second": 84.339,
+      "eval_steps_per_second": 21.109,
+      "step": 1600
+    },
+    {
+      "epoch": 0.40615539858728555,
+      "grad_norm": 0.15467578172683716,
+      "learning_rate": 4.323074335687858e-05,
+      "loss": 0.0081,
+      "step": 1610
+    },
+    {
+      "epoch": 0.40867810292633705,
+      "grad_norm": 0.2580385208129883,
+      "learning_rate": 4.3188698284561054e-05,
+      "loss": 0.0112,
+      "step": 1620
+    },
+    {
+      "epoch": 0.4112008072653885,
+      "grad_norm": 0.010637140832841396,
+      "learning_rate": 4.314665321224353e-05,
+      "loss": 0.008,
+      "step": 1630
+    },
+    {
+      "epoch": 0.41372351160444,
+      "grad_norm": 0.26677659153938293,
+      "learning_rate": 4.3104608139926e-05,
+      "loss": 0.0094,
+      "step": 1640
+    },
+    {
+      "epoch": 0.4162462159434914,
+      "grad_norm": 0.2677707374095917,
+      "learning_rate": 4.306256306760848e-05,
+      "loss": 0.0127,
+      "step": 1650
+    },
+    {
+      "epoch": 0.41876892028254287,
+      "grad_norm": 0.13589175045490265,
+      "learning_rate": 4.302051799529095e-05,
+      "loss": 0.0056,
+      "step": 1660
+    },
+    {
+      "epoch": 0.42129162462159436,
+      "grad_norm": 0.10289867222309113,
+      "learning_rate": 4.297847292297343e-05,
+      "loss": 0.0109,
+      "step": 1670
+    },
+    {
+      "epoch": 0.4238143289606458,
+      "grad_norm": 0.07062846422195435,
+      "learning_rate": 4.29364278506559e-05,
+      "loss": 0.0048,
+      "step": 1680
+    },
+    {
+      "epoch": 0.4263370332996973,
+      "grad_norm": 0.16123530268669128,
+      "learning_rate": 4.289438277833838e-05,
+      "loss": 0.0052,
+      "step": 1690
+    },
+    {
+      "epoch": 0.42885973763874874,
+      "grad_norm": 0.13397027552127838,
+      "learning_rate": 4.285233770602086e-05,
+      "loss": 0.0048,
+      "step": 1700
+    },
+    {
+      "epoch": 0.42885973763874874,
+      "eval_loss": 0.010062881745398045,
+      "eval_runtime": 20.9947,
+      "eval_samples_per_second": 83.926,
+      "eval_steps_per_second": 21.005,
+      "step": 1700
+    },
+    {
+      "epoch": 0.4313824419778002,
+      "grad_norm": 0.04028566554188728,
+      "learning_rate": 4.281029263370333e-05,
+      "loss": 0.0101,
+      "step": 1710
+    },
+    {
+      "epoch": 0.4339051463168517,
+      "grad_norm": 0.06560038775205612,
+      "learning_rate": 4.276824756138581e-05,
+      "loss": 0.0033,
+      "step": 1720
+    },
+    {
+      "epoch": 0.4364278506559031,
+      "grad_norm": 0.16810742020606995,
+      "learning_rate": 4.272620248906828e-05,
+      "loss": 0.0054,
+      "step": 1730
+    },
+    {
+      "epoch": 0.4389505549949546,
+      "grad_norm": 0.015353145077824593,
+      "learning_rate": 4.268415741675076e-05,
+      "loss": 0.0058,
+      "step": 1740
+    },
+    {
+      "epoch": 0.44147325933400605,
+      "grad_norm": 0.2716507911682129,
+      "learning_rate": 4.264211234443323e-05,
+      "loss": 0.0114,
+      "step": 1750
+    },
+    {
+      "epoch": 0.4439959636730575,
+      "grad_norm": 0.10725341737270355,
+      "learning_rate": 4.260006727211571e-05,
+      "loss": 0.0095,
+      "step": 1760
+    },
+    {
+      "epoch": 0.446518668012109,
+      "grad_norm": 0.21090877056121826,
+      "learning_rate": 4.255802219979819e-05,
+      "loss": 0.0171,
+      "step": 1770
+    },
+    {
+      "epoch": 0.44904137235116043,
+      "grad_norm": 0.08791640400886536,
+      "learning_rate": 4.251597712748066e-05,
+      "loss": 0.0111,
+      "step": 1780
+    },
+    {
+      "epoch": 0.4515640766902119,
+      "grad_norm": 0.29180845618247986,
+      "learning_rate": 4.247393205516314e-05,
+      "loss": 0.0093,
+      "step": 1790
+    },
+    {
+      "epoch": 0.45408678102926336,
+      "grad_norm": 0.21628066897392273,
+      "learning_rate": 4.243188698284561e-05,
+      "loss": 0.0056,
+      "step": 1800
+    },
+    {
+      "epoch": 0.45408678102926336,
+      "eval_loss": 0.010144516825675964,
+      "eval_runtime": 20.9537,
+      "eval_samples_per_second": 84.09,
+      "eval_steps_per_second": 21.046,
+      "step": 1800
+    },
+    {
+      "epoch": 0.45660948536831486,
+      "grad_norm": 0.17846959829330444,
+      "learning_rate": 4.2389841910528086e-05,
+      "loss": 0.008,
+      "step": 1810
+    },
+    {
+      "epoch": 0.4591321897073663,
+      "grad_norm": 0.18932151794433594,
+      "learning_rate": 4.234779683821056e-05,
+      "loss": 0.0098,
+      "step": 1820
+    },
+    {
+      "epoch": 0.46165489404641774,
+      "grad_norm": 0.005480750929564238,
+      "learning_rate": 4.2305751765893035e-05,
+      "loss": 0.0099,
+      "step": 1830
+    },
+    {
+      "epoch": 0.46417759838546924,
+      "grad_norm": 0.02110099606215954,
+      "learning_rate": 4.226370669357552e-05,
+      "loss": 0.0089,
+      "step": 1840
+    },
+    {
+      "epoch": 0.4667003027245207,
+      "grad_norm": 0.12439311295747757,
+      "learning_rate": 4.222166162125799e-05,
+      "loss": 0.006,
+      "step": 1850
+    },
+    {
+      "epoch": 0.4692230070635722,
+      "grad_norm": 0.12683548033237457,
+      "learning_rate": 4.2179616548940466e-05,
+      "loss": 0.0062,
+      "step": 1860
+    },
+    {
+      "epoch": 0.4717457114026236,
+      "grad_norm": 0.10005199909210205,
+      "learning_rate": 4.213757147662295e-05,
+      "loss": 0.0064,
+      "step": 1870
+    },
+    {
+      "epoch": 0.47426841574167505,
+      "grad_norm": 0.101644366979599,
+      "learning_rate": 4.2095526404305416e-05,
+      "loss": 0.0076,
+      "step": 1880
+    },
+    {
+      "epoch": 0.47679112008072655,
+      "grad_norm": 0.09989798069000244,
+      "learning_rate": 4.205348133198789e-05,
+      "loss": 0.0048,
+      "step": 1890
+    },
+    {
+      "epoch": 0.479313824419778,
+      "grad_norm": 0.12589283287525177,
+      "learning_rate": 4.2011436259670365e-05,
+      "loss": 0.008,
+      "step": 1900
+    },
+    {
+      "epoch": 0.479313824419778,
+      "eval_loss": 0.00869656726717949,
+      "eval_runtime": 20.8804,
+      "eval_samples_per_second": 84.385,
+      "eval_steps_per_second": 21.12,
+      "step": 1900
+    },
+    {
+      "epoch": 0.4818365287588295,
+      "grad_norm": 0.3338044583797455,
+      "learning_rate": 4.1969391187352846e-05,
+      "loss": 0.014,
+      "step": 1910
+    },
+    {
+      "epoch": 0.4843592330978809,
+      "grad_norm": 0.1185823529958725,
+      "learning_rate": 4.192734611503532e-05,
+      "loss": 0.0072,
+      "step": 1920
+    },
+    {
+      "epoch": 0.48688193743693237,
+      "grad_norm": 0.2536753714084625,
+      "learning_rate": 4.1885301042717796e-05,
+      "loss": 0.0177,
+      "step": 1930
+    },
+    {
+      "epoch": 0.48940464177598386,
+      "grad_norm": 0.1340733915567398,
+      "learning_rate": 4.184325597040027e-05,
+      "loss": 0.0052,
+      "step": 1940
+    },
+    {
+      "epoch": 0.4919273461150353,
+      "grad_norm": 0.09943121671676636,
+      "learning_rate": 4.180121089808275e-05,
+      "loss": 0.0054,
+      "step": 1950
+    },
+    {
+      "epoch": 0.4944500504540868,
+      "grad_norm": 0.17324581742286682,
+      "learning_rate": 4.175916582576522e-05,
+      "loss": 0.0061,
+      "step": 1960
+    },
+    {
+      "epoch": 0.49697275479313824,
+      "grad_norm": 0.027863750234246254,
+      "learning_rate": 4.1717120753447694e-05,
+      "loss": 0.0093,
+      "step": 1970
+    },
+    {
+      "epoch": 0.49949545913218973,
+      "grad_norm": 0.016479160636663437,
+      "learning_rate": 4.167507568113017e-05,
+      "loss": 0.0036,
+      "step": 1980
+    },
+    {
+      "epoch": 0.5020181634712412,
+      "grad_norm": 0.06331757456064224,
+      "learning_rate": 4.163303060881265e-05,
+      "loss": 0.0074,
+      "step": 1990
+    },
+    {
+      "epoch": 0.5045408678102926,
+      "grad_norm": 0.028800033032894135,
+      "learning_rate": 4.1590985536495125e-05,
+      "loss": 0.0092,
+      "step": 2000
+    },
+    {
+      "epoch": 0.5045408678102926,
+      "eval_loss": 0.008615074679255486,
+      "eval_runtime": 20.9704,
+      "eval_samples_per_second": 84.023,
+      "eval_steps_per_second": 21.03,
+      "step": 2000
+    },
+    {
+      "epoch": 0.507063572149344,
+      "grad_norm": 0.11708183586597443,
+      "learning_rate": 4.15489404641776e-05,
+      "loss": 0.0028,
+      "step": 2010
+    },
+    {
+      "epoch": 0.5095862764883956,
+      "grad_norm": 0.0794278159737587,
+      "learning_rate": 4.1506895391860074e-05,
+      "loss": 0.0097,
+      "step": 2020
+    },
+    {
+      "epoch": 0.512108980827447,
+      "grad_norm": 0.06485351175069809,
+      "learning_rate": 4.1464850319542556e-05,
+      "loss": 0.0076,
+      "step": 2030
+    },
+    {
+      "epoch": 0.5146316851664985,
+      "grad_norm": 0.15527907013893127,
+      "learning_rate": 4.142280524722503e-05,
+      "loss": 0.013,
+      "step": 2040
+    },
+    {
+      "epoch": 0.5171543895055499,
+      "grad_norm": 0.20114894211292267,
+      "learning_rate": 4.13807601749075e-05,
+      "loss": 0.0117,
+      "step": 2050
+    },
+    {
+      "epoch": 0.5196770938446014,
+      "grad_norm": 0.12151603400707245,
+      "learning_rate": 4.133871510258998e-05,
+      "loss": 0.015,
+      "step": 2060
+    },
+    {
+      "epoch": 0.5221997981836529,
+      "grad_norm": 0.055018067359924316,
+      "learning_rate": 4.1296670030272455e-05,
+      "loss": 0.0079,
+      "step": 2070
+    },
+    {
+      "epoch": 0.5247225025227044,
+      "grad_norm": 0.16336438059806824,
+      "learning_rate": 4.125462495795493e-05,
+      "loss": 0.0072,
+      "step": 2080
+    },
+    {
+      "epoch": 0.5272452068617558,
+      "grad_norm": 0.2550767660140991,
+      "learning_rate": 4.1212579885637404e-05,
+      "loss": 0.013,
+      "step": 2090
+    },
+    {
+      "epoch": 0.5297679112008072,
+      "grad_norm": 0.040902867913246155,
+      "learning_rate": 4.1170534813319885e-05,
+      "loss": 0.0033,
+      "step": 2100
+    },
+    {
+      "epoch": 0.5297679112008072,
+      "eval_loss": 0.00901652593165636,
+      "eval_runtime": 21.0472,
+      "eval_samples_per_second": 83.717,
+      "eval_steps_per_second": 20.953,
+      "step": 2100
+    },
+    {
+      "epoch": 0.5322906155398587,
+      "grad_norm": 0.03393733501434326,
+      "learning_rate": 4.112848974100236e-05,
+      "loss": 0.0107,
+      "step": 2110
+    },
+    {
+      "epoch": 0.5348133198789102,
+      "grad_norm": 0.10513912886381149,
+      "learning_rate": 4.1086444668684835e-05,
+      "loss": 0.0069,
+      "step": 2120
+    },
+    {
+      "epoch": 0.5373360242179617,
+      "grad_norm": 0.012084727175533772,
+      "learning_rate": 4.10443995963673e-05,
+      "loss": 0.0082,
+      "step": 2130
+    },
+    {
+      "epoch": 0.5398587285570131,
+      "grad_norm": 0.07994495332241058,
+      "learning_rate": 4.1002354524049784e-05,
+      "loss": 0.0109,
+      "step": 2140
+    },
+    {
+      "epoch": 0.5423814328960646,
+      "grad_norm": 0.13017794489860535,
+      "learning_rate": 4.096030945173226e-05,
+      "loss": 0.0041,
+      "step": 2150
+    },
+    {
+      "epoch": 0.544904137235116,
+      "grad_norm": 0.0023468900471925735,
+      "learning_rate": 4.091826437941473e-05,
+      "loss": 0.0085,
+      "step": 2160
+    },
+    {
+      "epoch": 0.5474268415741675,
+      "grad_norm": 0.061518047004938126,
+      "learning_rate": 4.087621930709721e-05,
+      "loss": 0.011,
+      "step": 2170
+    },
+    {
+      "epoch": 0.549949545913219,
+      "grad_norm": 0.07088392227888107,
+      "learning_rate": 4.083417423477969e-05,
+      "loss": 0.0056,
+      "step": 2180
+    },
+    {
+      "epoch": 0.5524722502522704,
+      "grad_norm": 0.09153332561254501,
+      "learning_rate": 4.0792129162462164e-05,
+      "loss": 0.0059,
+      "step": 2190
+    },
+    {
+      "epoch": 0.5549949545913219,
+      "grad_norm": 0.15585757791996002,
+      "learning_rate": 4.075008409014464e-05,
+      "loss": 0.0041,
+      "step": 2200
+    },
+    {
+      "epoch": 0.5549949545913219,
+      "eval_loss": 0.008859611116349697,
+      "eval_runtime": 20.9351,
+      "eval_samples_per_second": 84.165,
+      "eval_steps_per_second": 21.065,
+      "step": 2200
+    },
+    {
+      "epoch": 0.5575176589303733,
+      "grad_norm": 0.14012502133846283,
+      "learning_rate": 4.070803901782711e-05,
+      "loss": 0.0052,
+      "step": 2210
+    },
+    {
+      "epoch": 0.5600403632694249,
+      "grad_norm": 0.18286050856113434,
+      "learning_rate": 4.066599394550959e-05,
+      "loss": 0.0117,
+      "step": 2220
+    },
+    {
+      "epoch": 0.5625630676084763,
+      "grad_norm": 0.12133604288101196,
+      "learning_rate": 4.062394887319206e-05,
+      "loss": 0.0064,
+      "step": 2230
+    },
+    {
+      "epoch": 0.5650857719475277,
+      "grad_norm": 0.006398872472345829,
+      "learning_rate": 4.058190380087454e-05,
+      "loss": 0.0032,
+      "step": 2240
+    },
+    {
+      "epoch": 0.5676084762865792,
+      "grad_norm": 0.0005545477033592761,
+      "learning_rate": 4.053985872855702e-05,
+      "loss": 0.004,
+      "step": 2250
+    },
+    {
+      "epoch": 0.5701311806256307,
+      "grad_norm": 0.16576875746250153,
+      "learning_rate": 4.0497813656239493e-05,
+      "loss": 0.0041,
+      "step": 2260
+    },
+    {
+      "epoch": 0.5726538849646822,
+      "grad_norm": 0.034229591488838196,
+      "learning_rate": 4.045576858392197e-05,
+      "loss": 0.0051,
+      "step": 2270
+    },
+    {
+      "epoch": 0.5751765893037336,
+      "grad_norm": 0.13495758175849915,
+      "learning_rate": 4.041372351160444e-05,
+      "loss": 0.0081,
+      "step": 2280
+    },
+    {
+      "epoch": 0.577699293642785,
+      "grad_norm": 0.20754534006118774,
+      "learning_rate": 4.037167843928692e-05,
+      "loss": 0.0129,
+      "step": 2290
+    },
+    {
+      "epoch": 0.5802219979818365,
+      "grad_norm": 0.12224958837032318,
+      "learning_rate": 4.032963336696939e-05,
+      "loss": 0.007,
+      "step": 2300
+    },
+    {
+      "epoch": 0.5802219979818365,
+      "eval_loss": 0.008081664331257343,
+      "eval_runtime": 20.9045,
+      "eval_samples_per_second": 84.288,
+      "eval_steps_per_second": 21.096,
+      "step": 2300
+    },
+    {
+      "epoch": 0.582744702320888,
+      "grad_norm": 0.20963284373283386,
+      "learning_rate": 4.028758829465187e-05,
+      "loss": 0.011,
+      "step": 2310
+    },
+    {
+      "epoch": 0.5852674066599395,
+      "grad_norm": 0.1182667464017868,
+      "learning_rate": 4.024554322233434e-05,
+      "loss": 0.0085,
+      "step": 2320
+    },
+    {
+      "epoch": 0.5877901109989909,
+      "grad_norm": 0.1626705825328827,
+      "learning_rate": 4.020349815001682e-05,
+      "loss": 0.0091,
+      "step": 2330
+    },
+    {
+      "epoch": 0.5903128153380424,
+      "grad_norm": 0.10798126459121704,
+      "learning_rate": 4.01614530776993e-05,
+      "loss": 0.009,
+      "step": 2340
+    },
+    {
+      "epoch": 0.5928355196770938,
+      "grad_norm": 0.03671824559569359,
+      "learning_rate": 4.011940800538177e-05,
+      "loss": 0.005,
+      "step": 2350
+    },
+    {
+      "epoch": 0.5953582240161454,
+      "grad_norm": 0.019325584173202515,
+      "learning_rate": 4.007736293306425e-05,
+      "loss": 0.0082,
+      "step": 2360
+    },
+    {
+      "epoch": 0.5978809283551968,
+      "grad_norm": 0.04128754511475563,
+      "learning_rate": 4.003531786074672e-05,
+      "loss": 0.0046,
+      "step": 2370
+    },
+    {
+      "epoch": 0.6004036326942482,
+      "grad_norm": 0.07875852286815643,
+      "learning_rate": 3.9993272788429196e-05,
+      "loss": 0.0283,
+      "step": 2380
+    },
+    {
+      "epoch": 0.6029263370332997,
+      "grad_norm": 0.11841381341218948,
+      "learning_rate": 3.995122771611167e-05,
+      "loss": 0.0052,
+      "step": 2390
+    },
+    {
+      "epoch": 0.6054490413723511,
+      "grad_norm": 0.14310500025749207,
+      "learning_rate": 3.990918264379415e-05,
+      "loss": 0.0027,
+      "step": 2400
+    },
+    {
+      "epoch": 0.6054490413723511,
+      "eval_loss": 0.008280658163130283,
+      "eval_runtime": 20.9355,
+      "eval_samples_per_second": 84.163,
+      "eval_steps_per_second": 21.065,
+      "step": 2400
+    },
+    {
+      "epoch": 0.6079717457114027,
+      "grad_norm": 0.1013203114271164,
+      "learning_rate": 3.986713757147663e-05,
+      "loss": 0.0071,
+      "step": 2410
+    },
+    {
+      "epoch": 0.6104944500504541,
+      "grad_norm": 0.09219915419816971,
+      "learning_rate": 3.98250924991591e-05,
+      "loss": 0.0128,
+      "step": 2420
+    },
+    {
+      "epoch": 0.6130171543895055,
+      "grad_norm": 0.21949277818202972,
+      "learning_rate": 3.9783047426841576e-05,
+      "loss": 0.0125,
+      "step": 2430
+    },
+    {
+      "epoch": 0.615539858728557,
+      "grad_norm": 0.04883907735347748,
+      "learning_rate": 3.974100235452405e-05,
+      "loss": 0.0133,
+      "step": 2440
+    },
+    {
+      "epoch": 0.6180625630676084,
+      "grad_norm": 0.28083309531211853,
+      "learning_rate": 3.9698957282206526e-05,
+      "loss": 0.0094,
+      "step": 2450
+    },
+    {
+      "epoch": 0.62058526740666,
+      "grad_norm": 0.1395656317472458,
+      "learning_rate": 3.9656912209889e-05,
+      "loss": 0.008,
+      "step": 2460
+    },
+    {
+      "epoch": 0.6231079717457114,
+      "grad_norm": 0.3387027084827423,
+      "learning_rate": 3.9614867137571475e-05,
+      "loss": 0.0108,
+      "step": 2470
+    },
+    {
+      "epoch": 0.6256306760847629,
+      "grad_norm": 0.12317987531423569,
+      "learning_rate": 3.9572822065253956e-05,
+      "loss": 0.0106,
+      "step": 2480
+    },
+    {
+      "epoch": 0.6281533804238143,
+      "grad_norm": 0.11516406387090683,
+      "learning_rate": 3.953077699293643e-05,
+      "loss": 0.0109,
+      "step": 2490
+    },
+    {
+      "epoch": 0.6306760847628657,
+      "grad_norm": 0.3164563775062561,
+      "learning_rate": 3.9488731920618906e-05,
+      "loss": 0.0122,
+      "step": 2500
+    },
+    {
+      "epoch": 0.6306760847628657,
+      "eval_loss": 0.0077254436910152435,
+      "eval_runtime": 20.9125,
+      "eval_samples_per_second": 84.256,
+      "eval_steps_per_second": 21.088,
+      "step": 2500
+    },
+    {
+      "epoch": 0.6331987891019173,
+      "grad_norm": 0.1707511991262436,
+      "learning_rate": 3.944668684830138e-05,
+      "loss": 0.009,
+      "step": 2510
+    },
+    {
+      "epoch": 0.6357214934409687,
+      "grad_norm": 0.08108045160770416,
+      "learning_rate": 3.9404641775983855e-05,
+      "loss": 0.012,
+      "step": 2520
+    },
+    {
+      "epoch": 0.6382441977800202,
+      "grad_norm": 0.1104462668299675,
+      "learning_rate": 3.936259670366633e-05,
+      "loss": 0.004,
+      "step": 2530
+    },
+    {
+      "epoch": 0.6407669021190716,
+      "grad_norm": 0.17339076101779938,
+      "learning_rate": 3.9320551631348804e-05,
+      "loss": 0.0056,
+      "step": 2540
+    },
+    {
+      "epoch": 0.643289606458123,
+      "grad_norm": 0.10303635895252228,
+      "learning_rate": 3.9278506559031286e-05,
+      "loss": 0.0078,
+      "step": 2550
+    },
+    {
+      "epoch": 0.6458123107971746,
+      "grad_norm": 0.009340761229395866,
+      "learning_rate": 3.923646148671376e-05,
+      "loss": 0.0071,
+      "step": 2560
+    },
+    {
+      "epoch": 0.648335015136226,
+      "grad_norm": 0.11290521174669266,
+      "learning_rate": 3.9194416414396235e-05,
+      "loss": 0.0064,
+      "step": 2570
+    },
+    {
+      "epoch": 0.6508577194752775,
+      "grad_norm": 0.13023380935192108,
+      "learning_rate": 3.915237134207871e-05,
+      "loss": 0.0047,
+      "step": 2580
+    },
+    {
+      "epoch": 0.6533804238143289,
+      "grad_norm": 0.027826808393001556,
+      "learning_rate": 3.911032626976119e-05,
+      "loss": 0.004,
+      "step": 2590
+    },
+    {
+      "epoch": 0.6559031281533805,
+      "grad_norm": 0.12674188613891602,
+      "learning_rate": 3.906828119744366e-05,
+      "loss": 0.0087,
+      "step": 2600
+    },
+    {
+      "epoch": 0.6559031281533805,
+      "eval_loss": 0.007544004824012518,
+      "eval_runtime": 20.9279,
+      "eval_samples_per_second": 84.194,
+      "eval_steps_per_second": 21.072,
+      "step": 2600
+    },
+    {
+      "epoch": 0.6584258324924319,
+      "grad_norm": 0.05906185507774353,
+      "learning_rate": 3.9026236125126134e-05,
+      "loss": 0.0112,
+      "step": 2610
+    },
+    {
+      "epoch": 0.6609485368314834,
+      "grad_norm": 0.02223772369325161,
+      "learning_rate": 3.898419105280861e-05,
+      "loss": 0.0061,
+      "step": 2620
+    },
+    {
+      "epoch": 0.6634712411705348,
+      "grad_norm": 0.1578211635351181,
+      "learning_rate": 3.894214598049109e-05,
+      "loss": 0.0065,
+      "step": 2630
+    },
+    {
+      "epoch": 0.6659939455095862,
+      "grad_norm": 0.0348033532500267,
+      "learning_rate": 3.8900100908173565e-05,
+      "loss": 0.0106,
+      "step": 2640
+    },
+    {
+      "epoch": 0.6685166498486378,
+      "grad_norm": 0.09289257973432541,
+      "learning_rate": 3.885805583585604e-05,
+      "loss": 0.0055,
+      "step": 2650
+    },
+    {
+      "epoch": 0.6710393541876892,
+      "grad_norm": 0.0011186335468664765,
+      "learning_rate": 3.8816010763538514e-05,
+      "loss": 0.0043,
+      "step": 2660
+    },
+    {
+      "epoch": 0.6735620585267407,
+      "grad_norm": 0.04303692653775215,
+      "learning_rate": 3.8773965691220995e-05,
+      "loss": 0.003,
+      "step": 2670
+    },
+    {
+      "epoch": 0.6760847628657921,
+      "grad_norm": 0.02356291376054287,
+      "learning_rate": 3.873192061890347e-05,
+      "loss": 0.004,
+      "step": 2680
+    },
+    {
+      "epoch": 0.6786074672048436,
+      "grad_norm": 0.23490223288536072,
+      "learning_rate": 3.868987554658594e-05,
+      "loss": 0.0087,
+      "step": 2690
+    },
+    {
+      "epoch": 0.6811301715438951,
+      "grad_norm": 0.18736770749092102,
+      "learning_rate": 3.864783047426841e-05,
+      "loss": 0.0101,
+      "step": 2700
+    },
+    {
+      "epoch": 0.6811301715438951,
+      "eval_loss": 0.007495530880987644,
+      "eval_runtime": 20.9691,
+      "eval_samples_per_second": 84.029,
+      "eval_steps_per_second": 21.031,
+      "step": 2700
+    },
+    {
+      "epoch": 0.6836528758829465,
+      "grad_norm": 0.06488362699747086,
+      "learning_rate": 3.8605785401950894e-05,
+      "loss": 0.0023,
+      "step": 2710
+    },
+    {
+      "epoch": 0.686175580221998,
+      "grad_norm": 0.11341580748558044,
+      "learning_rate": 3.856374032963337e-05,
+      "loss": 0.0075,
+      "step": 2720
+    },
+    {
+      "epoch": 0.6886982845610494,
+      "grad_norm": 0.018855459988117218,
+      "learning_rate": 3.852169525731584e-05,
+      "loss": 0.0072,
+      "step": 2730
+    },
+    {
+      "epoch": 0.6912209889001009,
+      "grad_norm": 0.002237241482362151,
+      "learning_rate": 3.8479650184998325e-05,
+      "loss": 0.0046,
+      "step": 2740
+    },
+    {
+      "epoch": 0.6937436932391524,
+      "grad_norm": 0.2180403620004654,
+      "learning_rate": 3.84376051126808e-05,
+      "loss": 0.008,
+      "step": 2750
+    },
+    {
+      "epoch": 0.6962663975782039,
+      "grad_norm": 0.09451308846473694,
+      "learning_rate": 3.8395560040363274e-05,
+      "loss": 0.0065,
+      "step": 2760
+    },
+    {
+      "epoch": 0.6987891019172553,
+      "grad_norm": 0.14188626408576965,
+      "learning_rate": 3.835351496804574e-05,
+      "loss": 0.0106,
+      "step": 2770
+    },
+    {
+      "epoch": 0.7013118062563067,
+      "grad_norm": 0.10723700374364853,
+      "learning_rate": 3.8311469895728223e-05,
+      "loss": 0.0101,
+      "step": 2780
+    },
+    {
+      "epoch": 0.7038345105953582,
+      "grad_norm": 0.09538406878709793,
+      "learning_rate": 3.82694248234107e-05,
+      "loss": 0.0101,
+      "step": 2790
+    },
+    {
+      "epoch": 0.7063572149344097,
+      "grad_norm": 0.013723093084990978,
+      "learning_rate": 3.822737975109317e-05,
+      "loss": 0.0064,
+      "step": 2800
+    },
+    {
+      "epoch": 0.7063572149344097,
+      "eval_loss": 0.007626931183040142,
+      "eval_runtime": 20.9136,
+      "eval_samples_per_second": 84.252,
+      "eval_steps_per_second": 21.087,
+      "step": 2800
+    },
+    {
+      "epoch": 0.7088799192734612,
+      "grad_norm": 0.08048822730779648,
+      "learning_rate": 3.818533467877565e-05,
+      "loss": 0.0046,
+      "step": 2810
+    },
+    {
+      "epoch": 0.7114026236125126,
+      "grad_norm": 0.2678566873073578,
+      "learning_rate": 3.814328960645813e-05,
+      "loss": 0.0111,
+      "step": 2820
+    },
+    {
+      "epoch": 0.713925327951564,
+      "grad_norm": 0.016534708440303802,
+      "learning_rate": 3.8101244534140604e-05,
+      "loss": 0.0105,
+      "step": 2830
+    },
+    {
+      "epoch": 0.7164480322906155,
+      "grad_norm": 0.17189861834049225,
+      "learning_rate": 3.805919946182308e-05,
+      "loss": 0.0124,
+      "step": 2840
+    },
+    {
+      "epoch": 0.718970736629667,
+      "grad_norm": 0.004572316538542509,
+      "learning_rate": 3.8017154389505546e-05,
+      "loss": 0.0054,
+      "step": 2850
+    },
+    {
+      "epoch": 0.7214934409687185,
+      "grad_norm": 0.02059135213494301,
+      "learning_rate": 3.797510931718803e-05,
+      "loss": 0.0108,
+      "step": 2860
+    },
+    {
+      "epoch": 0.7240161453077699,
+      "grad_norm": 0.11188461631536484,
+      "learning_rate": 3.79330642448705e-05,
+      "loss": 0.0052,
+      "step": 2870
+    },
+    {
+      "epoch": 0.7265388496468214,
+      "grad_norm": 0.0961654856801033,
+      "learning_rate": 3.789101917255298e-05,
+      "loss": 0.0092,
+      "step": 2880
+    },
+    {
+      "epoch": 0.7290615539858728,
+      "grad_norm": 0.004565075505524874,
+      "learning_rate": 3.784897410023546e-05,
+      "loss": 0.005,
+      "step": 2890
+    },
+    {
+      "epoch": 0.7315842583249244,
+      "grad_norm": 0.058100827038288116,
+      "learning_rate": 3.780692902791793e-05,
+      "loss": 0.0196,
+      "step": 2900
+    },
+    {
+      "epoch": 0.7315842583249244,
+      "eval_loss": 0.007003966718912125,
+      "eval_runtime": 20.9017,
+      "eval_samples_per_second": 84.299,
+      "eval_steps_per_second": 21.099,
+      "step": 2900
+    },
+    {
+      "epoch": 0.7341069626639758,
+      "grad_norm": 0.08321081846952438,
+      "learning_rate": 3.776488395560041e-05,
+      "loss": 0.0042,
+      "step": 2910
+    },
+    {
+      "epoch": 0.7366296670030272,
+      "grad_norm": 0.12256285548210144,
+      "learning_rate": 3.772283888328288e-05,
+      "loss": 0.0101,
+      "step": 2920
+    },
+    {
+      "epoch": 0.7391523713420787,
+      "grad_norm": 0.18364761769771576,
+      "learning_rate": 3.768079381096536e-05,
+      "loss": 0.0136,
+      "step": 2930
+    },
+    {
+      "epoch": 0.7416750756811302,
+      "grad_norm": 0.3442452847957611,
+      "learning_rate": 3.763874873864783e-05,
+      "loss": 0.0094,
+      "step": 2940
+    },
+    {
+      "epoch": 0.7441977800201817,
+      "grad_norm": 0.1985316276550293,
+      "learning_rate": 3.7596703666330306e-05,
+      "loss": 0.0058,
+      "step": 2950
+    },
+    {
+      "epoch": 0.7467204843592331,
+      "grad_norm": 0.000595409597735852,
+      "learning_rate": 3.755465859401278e-05,
+      "loss": 0.0212,
+      "step": 2960
+    },
+    {
+      "epoch": 0.7492431886982845,
+      "grad_norm": 0.23967699706554413,
+      "learning_rate": 3.751261352169526e-05,
+      "loss": 0.0064,
+      "step": 2970
+    },
+    {
+      "epoch": 0.751765893037336,
+      "grad_norm": 0.0012721189996227622,
+      "learning_rate": 3.747056844937774e-05,
+      "loss": 0.003,
+      "step": 2980
+    },
+    {
+      "epoch": 0.7542885973763875,
+      "grad_norm": 0.0369889996945858,
+      "learning_rate": 3.742852337706021e-05,
+      "loss": 0.0072,
+      "step": 2990
+    },
+    {
+      "epoch": 0.756811301715439,
+      "grad_norm": 0.07918387651443481,
+      "learning_rate": 3.7386478304742686e-05,
+      "loss": 0.0031,
+      "step": 3000
+    },
+    {
+      "epoch": 0.756811301715439,
+      "eval_loss": 0.007090387400239706,
+      "eval_runtime": 20.9252,
+      "eval_samples_per_second": 84.205,
+      "eval_steps_per_second": 21.075,
+      "step": 3000
+    },
+    {
+      "epoch": 0.7593340060544904,
+      "grad_norm": 0.08373123407363892,
+      "learning_rate": 3.734443323242516e-05,
+      "loss": 0.0033,
+      "step": 3010
+    },
+    {
+      "epoch": 0.7618567103935419,
+      "grad_norm": 0.06391701102256775,
+      "learning_rate": 3.7302388160107636e-05,
+      "loss": 0.0084,
+      "step": 3020
+    },
+    {
+      "epoch": 0.7643794147325933,
+      "grad_norm": 0.11146340519189835,
+      "learning_rate": 3.726034308779011e-05,
+      "loss": 0.0054,
+      "step": 3030
+    },
+    {
+      "epoch": 0.7669021190716448,
+      "grad_norm": 0.1086326614022255,
+      "learning_rate": 3.7218298015472585e-05,
+      "loss": 0.0028,
+      "step": 3040
+    },
+    {
+      "epoch": 0.7694248234106963,
+      "grad_norm": 0.029285268858075142,
+      "learning_rate": 3.7176252943155067e-05,
+      "loss": 0.0065,
+      "step": 3050
+    },
+    {
+      "epoch": 0.7719475277497477,
+      "grad_norm": 0.13605491816997528,
+      "learning_rate": 3.713420787083754e-05,
+      "loss": 0.0083,
+      "step": 3060
+    },
+    {
+      "epoch": 0.7744702320887992,
+      "grad_norm": 0.0959770679473877,
+      "learning_rate": 3.7092162798520016e-05,
+      "loss": 0.0078,
+      "step": 3070
+    },
+    {
+      "epoch": 0.7769929364278506,
+      "grad_norm": 0.022328553721308708,
+      "learning_rate": 3.705011772620249e-05,
+      "loss": 0.0065,
+      "step": 3080
+    },
+    {
+      "epoch": 0.7795156407669022,
+      "grad_norm": 0.0018502280581742525,
+      "learning_rate": 3.7008072653884965e-05,
+      "loss": 0.0107,
+      "step": 3090
+    },
+    {
+      "epoch": 0.7820383451059536,
+      "grad_norm": 0.020223820582032204,
+      "learning_rate": 3.696602758156744e-05,
+      "loss": 0.0014,
+      "step": 3100
+    },
+    {
+      "epoch": 0.7820383451059536,
+      "eval_loss": 0.007150140590965748,
+      "eval_runtime": 20.9003,
+      "eval_samples_per_second": 84.305,
+      "eval_steps_per_second": 21.1,
+      "step": 3100
+    },
+    {
+      "epoch": 0.784561049445005,
+      "grad_norm": 0.17186589539051056,
+      "learning_rate": 3.6923982509249915e-05,
+      "loss": 0.0043,
+      "step": 3110
+    },
+    {
+      "epoch": 0.7870837537840565,
+      "grad_norm": 0.054055992513895035,
+      "learning_rate": 3.6881937436932396e-05,
+      "loss": 0.0036,
+      "step": 3120
+    },
+    {
+      "epoch": 0.7896064581231079,
+      "grad_norm": 0.24574770033359528,
+      "learning_rate": 3.683989236461487e-05,
+      "loss": 0.0125,
+      "step": 3130
+    },
+    {
+      "epoch": 0.7921291624621595,
+      "grad_norm": 0.09889545291662216,
+      "learning_rate": 3.6797847292297345e-05,
+      "loss": 0.0071,
+      "step": 3140
+    },
+    {
+      "epoch": 0.7946518668012109,
+      "grad_norm": 0.1626003533601761,
+      "learning_rate": 3.675580221997982e-05,
+      "loss": 0.0037,
+      "step": 3150
+    },
+    {
+      "epoch": 0.7971745711402624,
+      "grad_norm": 0.1329105943441391,
+      "learning_rate": 3.6713757147662295e-05,
+      "loss": 0.007,
+      "step": 3160
+    },
+    {
+      "epoch": 0.7996972754793138,
+      "grad_norm": 0.18876679241657257,
+      "learning_rate": 3.667171207534477e-05,
+      "loss": 0.0121,
+      "step": 3170
+    },
+    {
+      "epoch": 0.8022199798183652,
+      "grad_norm": 0.1532873511314392,
+      "learning_rate": 3.6629667003027244e-05,
+      "loss": 0.0061,
+      "step": 3180
+    },
+    {
+      "epoch": 0.8047426841574168,
+      "grad_norm": 0.14677301049232483,
+      "learning_rate": 3.658762193070972e-05,
+      "loss": 0.0046,
+      "step": 3190
+    },
+    {
+      "epoch": 0.8072653884964682,
+      "grad_norm": 0.1844077706336975,
+      "learning_rate": 3.65455768583922e-05,
+      "loss": 0.0075,
+      "step": 3200
+    },
+    {
+      "epoch": 0.8072653884964682,
+      "eval_loss": 0.006863369140774012,
+      "eval_runtime": 20.9061,
+      "eval_samples_per_second": 84.282,
+      "eval_steps_per_second": 21.094,
+      "step": 3200
+    },
+    {
+      "epoch": 0.8097880928355197,
+      "grad_norm": 0.07923123240470886,
+      "learning_rate": 3.6503531786074675e-05,
+      "loss": 0.0023,
+      "step": 3210
+    },
+    {
+      "epoch": 0.8123107971745711,
+      "grad_norm": 0.003948994446545839,
+      "learning_rate": 3.646148671375715e-05,
+      "loss": 0.0036,
+      "step": 3220
+    },
+    {
+      "epoch": 0.8148335015136225,
+      "grad_norm": 0.18977274000644684,
+      "learning_rate": 3.641944164143963e-05,
+      "loss": 0.0058,
+      "step": 3230
+    },
+    {
+      "epoch": 0.8173562058526741,
+      "grad_norm": 0.13241952657699585,
+      "learning_rate": 3.63773965691221e-05,
+      "loss": 0.0048,
+      "step": 3240
+    },
+    {
+      "epoch": 0.8198789101917255,
+      "grad_norm": 0.10835881531238556,
+      "learning_rate": 3.6335351496804573e-05,
+      "loss": 0.0065,
+      "step": 3250
+    },
+    {
+      "epoch": 0.822401614530777,
+      "grad_norm": 0.10177090018987656,
+      "learning_rate": 3.629330642448705e-05,
+      "loss": 0.0089,
+      "step": 3260
+    },
+    {
+      "epoch": 0.8249243188698284,
+      "grad_norm": 0.11547684669494629,
+      "learning_rate": 3.625126135216953e-05,
+      "loss": 0.0069,
+      "step": 3270
+    },
+    {
+      "epoch": 0.82744702320888,
+      "grad_norm": 0.22250983119010925,
+      "learning_rate": 3.6209216279852004e-05,
+      "loss": 0.0128,
+      "step": 3280
+    },
+    {
+      "epoch": 0.8299697275479314,
+      "grad_norm": 0.05016797035932541,
+      "learning_rate": 3.616717120753448e-05,
+      "loss": 0.0041,
+      "step": 3290
+    },
+    {
+      "epoch": 0.8324924318869829,
+      "grad_norm": 0.19384223222732544,
+      "learning_rate": 3.6125126135216953e-05,
+      "loss": 0.0125,
+      "step": 3300
+    },
+    {
+      "epoch": 0.8324924318869829,
+      "eval_loss": 0.006563546601682901,
+      "eval_runtime": 20.8989,
+      "eval_samples_per_second": 84.311,
+      "eval_steps_per_second": 21.102,
+      "step": 3300
+    },
+    {
+      "epoch": 0.8350151362260343,
+      "grad_norm": 0.09639815986156464,
+      "learning_rate": 3.6083081062899435e-05,
+      "loss": 0.0064,
+      "step": 3310
+    },
+    {
+      "epoch": 0.8375378405650857,
+      "grad_norm": 0.006088436581194401,
+      "learning_rate": 3.60410359905819e-05,
+      "loss": 0.0072,
+      "step": 3320
+    },
+    {
+      "epoch": 0.8400605449041373,
+      "grad_norm": 0.20799516141414642,
+      "learning_rate": 3.599899091826438e-05,
+      "loss": 0.0094,
+      "step": 3330
+    },
+    {
+      "epoch": 0.8425832492431887,
+      "grad_norm": 0.0011494633508846164,
+      "learning_rate": 3.595694584594685e-05,
+      "loss": 0.0064,
+      "step": 3340
+    },
+    {
+      "epoch": 0.8451059535822402,
+      "grad_norm": 0.012169969268143177,
+      "learning_rate": 3.5914900773629334e-05,
+      "loss": 0.0077,
+      "step": 3350
+    },
+    {
+      "epoch": 0.8476286579212916,
+      "grad_norm": 0.15517696738243103,
+      "learning_rate": 3.587285570131181e-05,
+      "loss": 0.004,
+      "step": 3360
+    },
+    {
+      "epoch": 0.850151362260343,
+      "grad_norm": 0.1262129694223404,
+      "learning_rate": 3.583081062899428e-05,
+      "loss": 0.0123,
+      "step": 3370
+    },
+    {
+      "epoch": 0.8526740665993946,
+      "grad_norm": 0.05431267246603966,
+      "learning_rate": 3.578876555667676e-05,
+      "loss": 0.0012,
+      "step": 3380
+    },
+    {
+      "epoch": 0.855196770938446,
+      "grad_norm": 0.27199476957321167,
+      "learning_rate": 3.574672048435924e-05,
+      "loss": 0.0076,
+      "step": 3390
+    },
+    {
+      "epoch": 0.8577194752774975,
+      "grad_norm": 0.20499233901500702,
+      "learning_rate": 3.5704675412041714e-05,
+      "loss": 0.0045,
+      "step": 3400
+    },
+    {
+      "epoch": 0.8577194752774975,
+      "eval_loss": 0.006544772535562515,
+      "eval_runtime": 20.8969,
+      "eval_samples_per_second": 84.319,
+      "eval_steps_per_second": 21.104,
+      "step": 3400
+    },
+    {
+      "epoch": 0.8602421796165489,
+      "grad_norm": 0.08625713735818863,
+      "learning_rate": 3.566263033972418e-05,
+      "loss": 0.0099,
+      "step": 3410
+    },
+    {
+      "epoch": 0.8627648839556004,
+      "grad_norm": 0.11165639013051987,
+      "learning_rate": 3.562058526740666e-05,
+      "loss": 0.0026,
+      "step": 3420
+    },
+    {
+      "epoch": 0.8652875882946519,
+      "grad_norm": 0.0018256891053169966,
+      "learning_rate": 3.557854019508914e-05,
+      "loss": 0.0048,
+      "step": 3430
+    },
+    {
+      "epoch": 0.8678102926337034,
+      "grad_norm": 0.19064132869243622,
+      "learning_rate": 3.553649512277161e-05,
+      "loss": 0.0021,
+      "step": 3440
+    },
+    {
+      "epoch": 0.8703329969727548,
+      "grad_norm": 0.2267286479473114,
+      "learning_rate": 3.549445005045409e-05,
+      "loss": 0.0091,
+      "step": 3450
+    },
+    {
+      "epoch": 0.8728557013118062,
+      "grad_norm": 0.006103180348873138,
+      "learning_rate": 3.545240497813657e-05,
+      "loss": 0.0103,
+      "step": 3460
+    },
+    {
+      "epoch": 0.8753784056508577,
+      "grad_norm": 0.0026887240819633007,
+      "learning_rate": 3.541035990581904e-05,
+      "loss": 0.0048,
+      "step": 3470
+    },
+    {
+      "epoch": 0.8779011099899092,
+      "grad_norm": 0.06880487501621246,
+      "learning_rate": 3.536831483350152e-05,
+      "loss": 0.0031,
+      "step": 3480
+    },
+    {
+      "epoch": 0.8804238143289607,
+      "grad_norm": 0.14154627919197083,
+      "learning_rate": 3.5326269761183986e-05,
+      "loss": 0.0145,
+      "step": 3490
+    },
+    {
+      "epoch": 0.8829465186680121,
+      "grad_norm": 0.0016246146988123655,
+      "learning_rate": 3.528422468886647e-05,
+      "loss": 0.0026,
+      "step": 3500
+    },
+    {
+      "epoch": 0.8829465186680121,
+      "eval_loss": 0.007226438261568546,
+      "eval_runtime": 20.911,
+      "eval_samples_per_second": 84.262,
+      "eval_steps_per_second": 21.089,
+      "step": 3500
+    },
+    {
+      "epoch": 0.8854692230070635,
+      "grad_norm": 0.0016699236584827304,
+      "learning_rate": 3.524217961654894e-05,
+      "loss": 0.0058,
+      "step": 3510
+    },
+    {
+      "epoch": 0.887991927346115,
+      "grad_norm": 0.2493603378534317,
+      "learning_rate": 3.5200134544231416e-05,
+      "loss": 0.0113,
+      "step": 3520
+    },
+    {
+      "epoch": 0.8905146316851665,
+      "grad_norm": 0.1181974858045578,
+      "learning_rate": 3.515808947191389e-05,
+      "loss": 0.0059,
+      "step": 3530
+    },
+    {
+      "epoch": 0.893037336024218,
+      "grad_norm": 0.11245719343423843,
+      "learning_rate": 3.511604439959637e-05,
+      "loss": 0.0045,
+      "step": 3540
+    },
+    {
+      "epoch": 0.8955600403632694,
+      "grad_norm": 0.13200731575489044,
+      "learning_rate": 3.507399932727885e-05,
+      "loss": 0.0054,
+      "step": 3550
+    },
+    {
+      "epoch": 0.8980827447023209,
+      "grad_norm": 0.195307195186615,
+      "learning_rate": 3.503195425496132e-05,
+      "loss": 0.0055,
+      "step": 3560
+    },
+    {
+      "epoch": 0.9006054490413723,
+      "grad_norm": 0.08665880560874939,
+      "learning_rate": 3.4989909182643797e-05,
+      "loss": 0.0074,
+      "step": 3570
+    },
+    {
+      "epoch": 0.9031281533804238,
+      "grad_norm": 0.00980606209486723,
+      "learning_rate": 3.494786411032627e-05,
+      "loss": 0.0049,
+      "step": 3580
+    },
+    {
+      "epoch": 0.9056508577194753,
+      "grad_norm": 0.1497032195329666,
+      "learning_rate": 3.4905819038008746e-05,
+      "loss": 0.0055,
+      "step": 3590
+    },
+    {
+      "epoch": 0.9081735620585267,
+      "grad_norm": 0.09247948974370956,
+      "learning_rate": 3.486377396569122e-05,
+      "loss": 0.0022,
+      "step": 3600
+    },
+    {
+      "epoch": 0.9081735620585267,
+      "eval_loss": 0.0062293908558785915,
+      "eval_runtime": 20.8987,
+      "eval_samples_per_second": 84.312,
+      "eval_steps_per_second": 21.102,
+      "step": 3600
+    },
+    {
+      "epoch": 0.9106962663975782,
+      "grad_norm": 0.09304177761077881,
+      "learning_rate": 3.48217288933737e-05,
+      "loss": 0.0101,
+      "step": 3610
+    },
+    {
+      "epoch": 0.9132189707366297,
+      "grad_norm": 0.004028341267257929,
+      "learning_rate": 3.477968382105618e-05,
+      "loss": 0.0097,
+      "step": 3620
+    },
+    {
+      "epoch": 0.9157416750756812,
+      "grad_norm": 0.03493291139602661,
+      "learning_rate": 3.473763874873865e-05,
+      "loss": 0.0051,
+      "step": 3630
+    },
+    {
+      "epoch": 0.9182643794147326,
+      "grad_norm": 0.12278582155704498,
+      "learning_rate": 3.4695593676421126e-05,
+      "loss": 0.0106,
+      "step": 3640
+    },
+    {
+      "epoch": 0.920787083753784,
+      "grad_norm": 0.18783220648765564,
+      "learning_rate": 3.46535486041036e-05,
+      "loss": 0.0043,
+      "step": 3650
+    },
+    {
+      "epoch": 0.9233097880928355,
+      "grad_norm": 0.013721932657063007,
+      "learning_rate": 3.4611503531786075e-05,
+      "loss": 0.0031,
+      "step": 3660
+    },
+    {
+      "epoch": 0.925832492431887,
+      "grad_norm": 0.01255789864808321,
+      "learning_rate": 3.456945845946855e-05,
+      "loss": 0.0043,
+      "step": 3670
+    },
+    {
+      "epoch": 0.9283551967709385,
+      "grad_norm": 0.06713691353797913,
+      "learning_rate": 3.4527413387151025e-05,
+      "loss": 0.0024,
+      "step": 3680
+    },
+    {
+      "epoch": 0.9308779011099899,
+      "grad_norm": 0.007393176201730967,
+      "learning_rate": 3.4485368314833506e-05,
+      "loss": 0.0059,
+      "step": 3690
+    },
+    {
+      "epoch": 0.9334006054490414,
+      "grad_norm": 0.14634644985198975,
+      "learning_rate": 3.444332324251598e-05,
+      "loss": 0.0102,
+      "step": 3700
+    },
+    {
+      "epoch": 0.9334006054490414,
+      "eval_loss": 0.005810776725411415,
+      "eval_runtime": 20.8803,
+      "eval_samples_per_second": 84.386,
+      "eval_steps_per_second": 21.12,
+      "step": 3700
+    },
+    {
+      "epoch": 0.9359233097880928,
+      "grad_norm": 0.00905498769134283,
+      "learning_rate": 3.4401278170198455e-05,
+      "loss": 0.003,
+      "step": 3710
+    },
+    {
+      "epoch": 0.9384460141271443,
+      "grad_norm": 0.005104383919388056,
+      "learning_rate": 3.435923309788093e-05,
+      "loss": 0.0046,
+      "step": 3720
+    },
+    {
+      "epoch": 0.9409687184661958,
+      "grad_norm": 0.036407459527254105,
+      "learning_rate": 3.4317188025563405e-05,
+      "loss": 0.0052,
+      "step": 3730
+    },
+    {
+      "epoch": 0.9434914228052472,
+      "grad_norm": 0.12225465476512909,
+      "learning_rate": 3.427514295324588e-05,
+      "loss": 0.0011,
+      "step": 3740
+    },
+    {
+      "epoch": 0.9460141271442987,
+      "grad_norm": 0.002337078098207712,
+      "learning_rate": 3.4233097880928354e-05,
+      "loss": 0.0053,
+      "step": 3750
+    },
+    {
+      "epoch": 0.9485368314833501,
+      "grad_norm": 0.0018623026553541422,
+      "learning_rate": 3.4191052808610836e-05,
+      "loss": 0.0021,
+      "step": 3760
+    },
+    {
+      "epoch": 0.9510595358224017,
+      "grad_norm": 0.013399872928857803,
+      "learning_rate": 3.414900773629331e-05,
+      "loss": 0.0032,
+      "step": 3770
+    },
+    {
+      "epoch": 0.9535822401614531,
+      "grad_norm": 0.010270589962601662,
+      "learning_rate": 3.4106962663975785e-05,
+      "loss": 0.0083,
+      "step": 3780
+    },
+    {
+      "epoch": 0.9561049445005045,
+      "grad_norm": 0.07046973705291748,
+      "learning_rate": 3.406491759165826e-05,
+      "loss": 0.0014,
+      "step": 3790
+    },
+    {
+      "epoch": 0.958627648839556,
+      "grad_norm": 0.0009812767384573817,
+      "learning_rate": 3.4022872519340734e-05,
+      "loss": 0.0039,
+      "step": 3800
+    },
+    {
+      "epoch": 0.958627648839556,
+      "eval_loss": 0.005735259968787432,
+      "eval_runtime": 20.894,
+      "eval_samples_per_second": 84.331,
+      "eval_steps_per_second": 21.107,
+      "step": 3800
+    },
+    {
+      "epoch": 0.9611503531786074,
+      "grad_norm": 0.11425192654132843,
+      "learning_rate": 3.398082744702321e-05,
+      "loss": 0.0068,
+      "step": 3810
+    },
+    {
+      "epoch": 0.963673057517659,
+      "grad_norm": 0.07777775824069977,
+      "learning_rate": 3.3938782374705683e-05,
+      "loss": 0.0013,
+      "step": 3820
+    },
+    {
+      "epoch": 0.9661957618567104,
+      "grad_norm": 0.10662028938531876,
+      "learning_rate": 3.389673730238816e-05,
+      "loss": 0.0084,
+      "step": 3830
+    },
+    {
+      "epoch": 0.9687184661957619,
+      "grad_norm": 0.07375224679708481,
+      "learning_rate": 3.385469223007064e-05,
+      "loss": 0.008,
+      "step": 3840
+    },
+    {
+      "epoch": 0.9712411705348133,
+      "grad_norm": 0.03361163288354874,
+      "learning_rate": 3.3812647157753114e-05,
+      "loss": 0.0059,
+      "step": 3850
+    },
+    {
+      "epoch": 0.9737638748738647,
+      "grad_norm": 0.012914448976516724,
+      "learning_rate": 3.377060208543559e-05,
+      "loss": 0.0009,
+      "step": 3860
+    },
+    {
+      "epoch": 0.9762865792129163,
+      "grad_norm": 0.15875528752803802,
+      "learning_rate": 3.3728557013118064e-05,
+      "loss": 0.0089,
+      "step": 3870
+    },
+    {
+      "epoch": 0.9788092835519677,
+      "grad_norm": 0.08293774724006653,
+      "learning_rate": 3.368651194080054e-05,
+      "loss": 0.0032,
+      "step": 3880
+    },
+    {
+      "epoch": 0.9813319878910192,
+      "grad_norm": 0.176809623837471,
+      "learning_rate": 3.364446686848301e-05,
+      "loss": 0.0095,
+      "step": 3890
+    },
+    {
+      "epoch": 0.9838546922300706,
+      "grad_norm": 0.15428629517555237,
+      "learning_rate": 3.360242179616549e-05,
+      "loss": 0.009,
+      "step": 3900
+    },
+    {
+      "epoch": 0.9838546922300706,
+      "eval_loss": 0.005954666528850794,
+      "eval_runtime": 20.8681,
+      "eval_samples_per_second": 84.435,
+      "eval_steps_per_second": 21.133,
+      "step": 3900
+    },
+    {
+      "epoch": 0.986377396569122,
+      "grad_norm": 0.03709837794303894,
+      "learning_rate": 3.356037672384797e-05,
+      "loss": 0.0062,
+      "step": 3910
+    },
+    {
+      "epoch": 0.9889001009081736,
+      "grad_norm": 0.01939135603606701,
+      "learning_rate": 3.3518331651530444e-05,
+      "loss": 0.0045,
+      "step": 3920
+    },
+    {
+      "epoch": 0.991422805247225,
+      "grad_norm": 0.12339838594198227,
+      "learning_rate": 3.347628657921292e-05,
+      "loss": 0.0107,
+      "step": 3930
+    },
+    {
+      "epoch": 0.9939455095862765,
+      "grad_norm": 0.11743946373462677,
+      "learning_rate": 3.343424150689539e-05,
+      "loss": 0.0026,
+      "step": 3940
+    },
+    {
+      "epoch": 0.9964682139253279,
+      "grad_norm": 0.0008224299526773393,
+      "learning_rate": 3.3392196434577874e-05,
+      "loss": 0.0029,
+      "step": 3950
+    },
+    {
+      "epoch": 0.9989909182643795,
+      "grad_norm": 0.1629364788532257,
+      "learning_rate": 3.335015136226034e-05,
+      "loss": 0.0045,
+      "step": 3960
+    },
+    {
+      "epoch": 1.001513622603431,
+      "grad_norm": 0.015702659264206886,
+      "learning_rate": 3.330810628994282e-05,
+      "loss": 0.0018,
+      "step": 3970
+    },
+    {
+      "epoch": 1.0040363269424823,
+      "grad_norm": 0.010090239346027374,
+      "learning_rate": 3.326606121762529e-05,
+      "loss": 0.0016,
+      "step": 3980
+    },
+    {
+      "epoch": 1.0065590312815338,
+      "grad_norm": 0.1662328988313675,
+      "learning_rate": 3.322401614530777e-05,
+      "loss": 0.0028,
+      "step": 3990
+    },
+    {
+      "epoch": 1.0090817356205852,
+      "grad_norm": 0.028376251459121704,
+      "learning_rate": 3.318197107299025e-05,
+      "loss": 0.0012,
+      "step": 4000
+    },
+    {
+      "epoch": 1.0090817356205852,
+      "eval_loss": 0.005606195889413357,
+      "eval_runtime": 20.8958,
+      "eval_samples_per_second": 84.323,
+      "eval_steps_per_second": 21.105,
+      "step": 4000
+    },
+    {
+      "epoch": 1.0116044399596367,
+      "grad_norm": 0.18040278553962708,
+      "learning_rate": 3.313992600067272e-05,
+      "loss": 0.0031,
+      "step": 4010
+    },
+    {
+      "epoch": 1.014127144298688,
+      "grad_norm": 0.09627419710159302,
+      "learning_rate": 3.30978809283552e-05,
+      "loss": 0.005,
+      "step": 4020
+    },
+    {
+      "epoch": 1.0166498486377396,
+      "grad_norm": 0.02057558484375477,
+      "learning_rate": 3.305583585603768e-05,
+      "loss": 0.0017,
+      "step": 4030
+    },
+    {
+      "epoch": 1.0191725529767912,
+      "grad_norm": 0.0033118566498160362,
+      "learning_rate": 3.301379078372015e-05,
+      "loss": 0.0013,
+      "step": 4040
+    },
+    {
+      "epoch": 1.0216952573158427,
+      "grad_norm": 0.13773638010025024,
+      "learning_rate": 3.297174571140262e-05,
+      "loss": 0.0032,
+      "step": 4050
+    },
+    {
+      "epoch": 1.024217961654894,
+      "grad_norm": 0.11453539878129959,
+      "learning_rate": 3.2929700639085096e-05,
+      "loss": 0.0031,
+      "step": 4060
+    },
+    {
+      "epoch": 1.0267406659939455,
+      "grad_norm": 0.002913431962952018,
+      "learning_rate": 3.288765556676758e-05,
+      "loss": 0.0011,
+      "step": 4070
+    },
+    {
+      "epoch": 1.029263370332997,
+      "grad_norm": 0.1419718712568283,
+      "learning_rate": 3.284561049445005e-05,
+      "loss": 0.0091,
+      "step": 4080
+    },
+    {
+      "epoch": 1.0317860746720484,
+      "grad_norm": 0.007667609490454197,
+      "learning_rate": 3.2803565422132527e-05,
+      "loss": 0.0022,
+      "step": 4090
+    },
+    {
+      "epoch": 1.0343087790110999,
+      "grad_norm": 0.11158380657434464,
+      "learning_rate": 3.276152034981501e-05,
+      "loss": 0.0029,
+      "step": 4100
+    },
+    {
+      "epoch": 1.0343087790110999,
+      "eval_loss": 0.005591338500380516,
+      "eval_runtime": 20.8942,
+      "eval_samples_per_second": 84.33,
+      "eval_steps_per_second": 21.106,
+      "step": 4100
+    },
+    {
+      "epoch": 1.0368314833501513,
+      "grad_norm": 0.005254568066447973,
+      "learning_rate": 3.271947527749748e-05,
+      "loss": 0.0037,
+      "step": 4110
+    },
+    {
+      "epoch": 1.0393541876892027,
+      "grad_norm": 0.13042157888412476,
+      "learning_rate": 3.267743020517996e-05,
+      "loss": 0.0052,
+      "step": 4120
+    },
+    {
+      "epoch": 1.0418768920282542,
+      "grad_norm": 0.007648650091141462,
+      "learning_rate": 3.2635385132862425e-05,
+      "loss": 0.0037,
+      "step": 4130
+    },
+    {
+      "epoch": 1.0443995963673058,
+      "grad_norm": 0.18600983917713165,
+      "learning_rate": 3.259334006054491e-05,
+      "loss": 0.0036,
+      "step": 4140
+    },
+    {
+      "epoch": 1.0469223007063573,
+      "grad_norm": 0.00177775660995394,
+      "learning_rate": 3.255129498822738e-05,
+      "loss": 0.0026,
+      "step": 4150
+    },
+    {
+      "epoch": 1.0494450050454087,
+      "grad_norm": 0.055269479751586914,
+      "learning_rate": 3.2509249915909856e-05,
+      "loss": 0.0044,
+      "step": 4160
+    },
+    {
+      "epoch": 1.0519677093844602,
+      "grad_norm": 0.007563919294625521,
+      "learning_rate": 3.246720484359233e-05,
+      "loss": 0.0073,
+      "step": 4170
+    },
+    {
+      "epoch": 1.0544904137235116,
+      "grad_norm": 0.12277320772409439,
+      "learning_rate": 3.242515977127481e-05,
+      "loss": 0.0034,
+      "step": 4180
+    },
+    {
+      "epoch": 1.057013118062563,
+      "grad_norm": 0.1538102626800537,
+      "learning_rate": 3.238311469895729e-05,
+      "loss": 0.0029,
+      "step": 4190
+    },
+    {
+      "epoch": 1.0595358224016145,
+      "grad_norm": 0.23897789418697357,
+      "learning_rate": 3.234106962663976e-05,
+      "loss": 0.0026,
+      "step": 4200
+    },
+    {
+      "epoch": 1.0595358224016145,
+      "eval_loss": 0.005667256191372871,
+      "eval_runtime": 20.9021,
+      "eval_samples_per_second": 84.298,
+      "eval_steps_per_second": 21.098,
+      "step": 4200
+    },
+    {
+      "epoch": 1.062058526740666,
+      "grad_norm": 0.059796128422021866,
+      "learning_rate": 3.229902455432223e-05,
+      "loss": 0.0036,
+      "step": 4210
+    },
+    {
+      "epoch": 1.0645812310797174,
+      "grad_norm": 0.11254319548606873,
+      "learning_rate": 3.225697948200471e-05,
+      "loss": 0.0052,
+      "step": 4220
+    },
+    {
+      "epoch": 1.067103935418769,
+      "grad_norm": 0.008323262445628643,
+      "learning_rate": 3.2214934409687185e-05,
+      "loss": 0.0033,
+      "step": 4230
+    },
+    {
+      "epoch": 1.0696266397578205,
+      "grad_norm": 4.037764301756397e-05,
+      "learning_rate": 3.217288933736966e-05,
+      "loss": 0.0018,
+      "step": 4240
+    },
+    {
+      "epoch": 1.072149344096872,
+      "grad_norm": 0.07235367596149445,
+      "learning_rate": 3.213084426505214e-05,
+      "loss": 0.001,
+      "step": 4250
+    },
+    {
+      "epoch": 1.0746720484359233,
+      "grad_norm": 0.14224140346050262,
+      "learning_rate": 3.2088799192734616e-05,
+      "loss": 0.0021,
+      "step": 4260
+    },
+    {
+      "epoch": 1.0771947527749748,
+      "grad_norm": 0.15408071875572205,
+      "learning_rate": 3.204675412041709e-05,
+      "loss": 0.0036,
+      "step": 4270
+    },
+    {
+      "epoch": 1.0797174571140262,
+      "grad_norm": 0.0037913790438324213,
+      "learning_rate": 3.2004709048099566e-05,
+      "loss": 0.0053,
+      "step": 4280
+    },
+    {
+      "epoch": 1.0822401614530777,
+      "grad_norm": 0.13299870491027832,
+      "learning_rate": 3.196266397578204e-05,
+      "loss": 0.0028,
+      "step": 4290
+    },
+    {
+      "epoch": 1.084762865792129,
+      "grad_norm": 0.12634998559951782,
+      "learning_rate": 3.1920618903464515e-05,
+      "loss": 0.0041,
+      "step": 4300
+    },
+    {
+      "epoch": 1.084762865792129,
+      "eval_loss": 0.005686003249138594,
+      "eval_runtime": 20.8959,
+      "eval_samples_per_second": 84.323,
+      "eval_steps_per_second": 21.105,
+      "step": 4300
+    },
+    {
+      "epoch": 1.0872855701311805,
+      "grad_norm": 0.16738834977149963,
+      "learning_rate": 3.187857383114699e-05,
+      "loss": 0.003,
+      "step": 4310
+    },
+    {
+      "epoch": 1.089808274470232,
+      "grad_norm": 0.037017568945884705,
+      "learning_rate": 3.1836528758829464e-05,
+      "loss": 0.0025,
+      "step": 4320
+    },
+    {
+      "epoch": 1.0923309788092836,
+      "grad_norm": 0.07605406641960144,
+      "learning_rate": 3.1794483686511946e-05,
+      "loss": 0.0024,
+      "step": 4330
+    },
+    {
+      "epoch": 1.094853683148335,
+      "grad_norm": 0.011002608574926853,
+      "learning_rate": 3.175243861419442e-05,
+      "loss": 0.0017,
+      "step": 4340
+    },
+    {
+      "epoch": 1.0973763874873865,
+      "grad_norm": 0.0014461104292422533,
+      "learning_rate": 3.1710393541876895e-05,
+      "loss": 0.001,
+      "step": 4350
+    },
+    {
+      "epoch": 1.099899091826438,
+      "grad_norm": 0.04258690029382706,
+      "learning_rate": 3.166834846955937e-05,
+      "loss": 0.001,
+      "step": 4360
+    },
+    {
+      "epoch": 1.1024217961654894,
+      "grad_norm": 0.1580243706703186,
+      "learning_rate": 3.1626303397241844e-05,
+      "loss": 0.0063,
+      "step": 4370
+    },
+    {
+      "epoch": 1.1049445005045408,
+      "grad_norm": 0.0013237325474619865,
+      "learning_rate": 3.158425832492432e-05,
+      "loss": 0.0062,
+      "step": 4380
+    },
+    {
+      "epoch": 1.1074672048435923,
+      "grad_norm": 0.0012392470380291343,
+      "learning_rate": 3.1542213252606794e-05,
+      "loss": 0.0025,
+      "step": 4390
+    },
+    {
+      "epoch": 1.1099899091826437,
+      "grad_norm": 0.04316063970327377,
+      "learning_rate": 3.150016818028927e-05,
+      "loss": 0.0015,
+      "step": 4400
+    },
+    {
+      "epoch": 1.1099899091826437,
+      "eval_loss": 0.005799129139631987,
+      "eval_runtime": 20.8943,
+      "eval_samples_per_second": 84.329,
+      "eval_steps_per_second": 21.106,
+      "step": 4400
+    },
+    {
+      "epoch": 1.1125126135216952,
+      "grad_norm": 0.12705808877944946,
+      "learning_rate": 3.145812310797175e-05,
+      "loss": 0.0038,
+      "step": 4410
+    },
+    {
+      "epoch": 1.1150353178607468,
+      "grad_norm": 0.01606024242937565,
+      "learning_rate": 3.1416078035654224e-05,
+      "loss": 0.0035,
+      "step": 4420
+    },
+    {
+      "epoch": 1.1175580221997983,
+      "grad_norm": 0.0025201209355145693,
+      "learning_rate": 3.13740329633367e-05,
+      "loss": 0.0053,
+      "step": 4430
+    },
+    {
+      "epoch": 1.1200807265388497,
+      "grad_norm": 0.0011012004688382149,
+      "learning_rate": 3.1331987891019174e-05,
+      "loss": 0.0044,
+      "step": 4440
+    },
+    {
+      "epoch": 1.1226034308779012,
+      "grad_norm": 0.00029570693732239306,
+      "learning_rate": 3.128994281870165e-05,
+      "loss": 0.0037,
+      "step": 4450
+    },
+    {
+      "epoch": 1.1251261352169526,
+      "grad_norm": 0.028565967455506325,
+      "learning_rate": 3.124789774638412e-05,
+      "loss": 0.005,
+      "step": 4460
+    },
+    {
+      "epoch": 1.127648839556004,
+      "grad_norm": 0.09666335582733154,
+      "learning_rate": 3.12058526740666e-05,
+      "loss": 0.0025,
+      "step": 4470
+    },
+    {
+      "epoch": 1.1301715438950555,
+      "grad_norm": 0.0027206747326999903,
+      "learning_rate": 3.116380760174908e-05,
+      "loss": 0.0011,
+      "step": 4480
+    },
+    {
+      "epoch": 1.132694248234107,
+      "grad_norm": 0.012391073629260063,
+      "learning_rate": 3.1121762529431554e-05,
+      "loss": 0.0002,
+      "step": 4490
+    },
+    {
+      "epoch": 1.1352169525731584,
+      "grad_norm": 0.05354852229356766,
+      "learning_rate": 3.107971745711403e-05,
+      "loss": 0.0014,
+      "step": 4500
+    },
+    {
+      "epoch": 1.1352169525731584,
+      "eval_loss": 0.005535118281841278,
+      "eval_runtime": 20.8683,
+      "eval_samples_per_second": 84.434,
+      "eval_steps_per_second": 21.132,
+      "step": 4500
+    },
+    {
+      "epoch": 1.1377396569122098,
+      "grad_norm": 0.03663089498877525,
+      "learning_rate": 3.10376723847965e-05,
+      "loss": 0.0028,
+      "step": 4510
+    },
+    {
+      "epoch": 1.1402623612512612,
+      "grad_norm": 0.20328471064567566,
+      "learning_rate": 3.099562731247898e-05,
+      "loss": 0.0014,
+      "step": 4520
+    },
+    {
+      "epoch": 1.142785065590313,
+      "grad_norm": 0.19447293877601624,
+      "learning_rate": 3.095358224016145e-05,
+      "loss": 0.0046,
+      "step": 4530
+    },
+    {
+      "epoch": 1.1453077699293643,
+      "grad_norm": 0.02381107583642006,
+      "learning_rate": 3.091153716784393e-05,
+      "loss": 0.0049,
+      "step": 4540
+    },
+    {
+      "epoch": 1.1478304742684158,
+      "grad_norm": 0.010867373086512089,
+      "learning_rate": 3.08694920955264e-05,
+      "loss": 0.0014,
+      "step": 4550
+    },
+    {
+      "epoch": 1.1503531786074672,
+      "grad_norm": 0.09643906354904175,
+      "learning_rate": 3.082744702320888e-05,
+      "loss": 0.0077,
+      "step": 4560
+    },
+    {
+      "epoch": 1.1528758829465187,
+      "grad_norm": 0.0748005211353302,
+      "learning_rate": 3.078540195089136e-05,
+      "loss": 0.0006,
+      "step": 4570
+    },
+    {
+      "epoch": 1.15539858728557,
+      "grad_norm": 0.007943224161863327,
+      "learning_rate": 3.074335687857383e-05,
+      "loss": 0.0009,
+      "step": 4580
+    },
+    {
+      "epoch": 1.1579212916246215,
+      "grad_norm": 0.0026662801392376423,
+      "learning_rate": 3.0701311806256314e-05,
+      "loss": 0.0006,
+      "step": 4590
+    },
+    {
+      "epoch": 1.160443995963673,
+      "grad_norm": 0.007000184152275324,
+      "learning_rate": 3.065926673393878e-05,
+      "loss": 0.0029,
+      "step": 4600
+    },
+    {
+      "epoch": 1.160443995963673,
+      "eval_loss": 0.005535861011594534,
+      "eval_runtime": 20.911,
+      "eval_samples_per_second": 84.262,
+      "eval_steps_per_second": 21.089,
+      "step": 4600
+    },
+    {
+      "epoch": 1.1629667003027246,
+      "grad_norm": 0.010504338890314102,
+      "learning_rate": 3.0617221661621257e-05,
+      "loss": 0.0021,
+      "step": 4610
+    },
+    {
+      "epoch": 1.165489404641776,
+      "grad_norm": 0.0042596235871315,
+      "learning_rate": 3.057517658930373e-05,
+      "loss": 0.0025,
+      "step": 4620
+    },
+    {
+      "epoch": 1.1680121089808275,
+      "grad_norm": 0.1560107320547104,
+      "learning_rate": 3.053313151698621e-05,
+      "loss": 0.0053,
+      "step": 4630
+    },
+    {
+      "epoch": 1.170534813319879,
+      "grad_norm": 0.03363262489438057,
+      "learning_rate": 3.0491086444668687e-05,
+      "loss": 0.0021,
+      "step": 4640
+    },
+    {
+      "epoch": 1.1730575176589304,
+      "grad_norm": 0.024774545803666115,
+      "learning_rate": 3.0449041372351162e-05,
+      "loss": 0.0085,
+      "step": 4650
+    },
+    {
+      "epoch": 1.1755802219979818,
+      "grad_norm": 0.021040301769971848,
+      "learning_rate": 3.0406996300033637e-05,
+      "loss": 0.0015,
+      "step": 4660
+    },
+    {
+      "epoch": 1.1781029263370333,
+      "grad_norm": 0.09655909985303879,
+      "learning_rate": 3.0364951227716115e-05,
+      "loss": 0.0021,
+      "step": 4670
+    },
+    {
+      "epoch": 1.1806256306760847,
+      "grad_norm": 0.13116657733917236,
+      "learning_rate": 3.032290615539859e-05,
+      "loss": 0.0035,
+      "step": 4680
+    },
+    {
+      "epoch": 1.1831483350151362,
+      "grad_norm": 0.11077822744846344,
+      "learning_rate": 3.0280861083081064e-05,
+      "loss": 0.0019,
+      "step": 4690
+    },
+    {
+      "epoch": 1.1856710393541876,
+      "grad_norm": 0.0015139818424358964,
+      "learning_rate": 3.023881601076354e-05,
+      "loss": 0.0019,
+      "step": 4700
+    },
+    {
+      "epoch": 1.1856710393541876,
+      "eval_loss": 0.005410597659647465,
+      "eval_runtime": 20.9117,
+      "eval_samples_per_second": 84.259,
+      "eval_steps_per_second": 21.089,
+      "step": 4700
+    },
+    {
+      "epoch": 1.188193743693239,
+      "grad_norm": 0.24234654009342194,
+      "learning_rate": 3.0196770938446017e-05,
+      "loss": 0.0029,
+      "step": 4710
+    },
+    {
+      "epoch": 1.1907164480322907,
+      "grad_norm": 0.13478586077690125,
+      "learning_rate": 3.015472586612849e-05,
+      "loss": 0.0054,
+      "step": 4720
+    },
+    {
+      "epoch": 1.1932391523713421,
+      "grad_norm": 0.00044387555681169033,
+      "learning_rate": 3.0112680793810966e-05,
+      "loss": 0.0024,
+      "step": 4730
+    },
+    {
+      "epoch": 1.1957618567103936,
+      "grad_norm": 0.005823497194796801,
+      "learning_rate": 3.007063572149344e-05,
+      "loss": 0.0018,
+      "step": 4740
+    },
+    {
+      "epoch": 1.198284561049445,
+      "grad_norm": 0.03638460114598274,
+      "learning_rate": 3.002859064917592e-05,
+      "loss": 0.0029,
+      "step": 4750
+    },
+    {
+      "epoch": 1.2008072653884965,
+      "grad_norm": 0.019287308678030968,
+      "learning_rate": 2.9986545576858393e-05,
+      "loss": 0.0021,
+      "step": 4760
+    },
+    {
+      "epoch": 1.203329969727548,
+      "grad_norm": 0.10308881103992462,
+      "learning_rate": 2.9944500504540868e-05,
+      "loss": 0.0065,
+      "step": 4770
+    },
+    {
+      "epoch": 1.2058526740665994,
+      "grad_norm": 0.061113424599170685,
+      "learning_rate": 2.9902455432223346e-05,
+      "loss": 0.0017,
+      "step": 4780
+    },
+    {
+      "epoch": 1.2083753784056508,
+      "grad_norm": 0.07032662630081177,
+      "learning_rate": 2.986041035990582e-05,
+      "loss": 0.0029,
+      "step": 4790
+    },
+    {
+      "epoch": 1.2108980827447022,
+      "grad_norm": 0.09232667833566666,
+      "learning_rate": 2.9818365287588296e-05,
+      "loss": 0.0005,
+      "step": 4800
+    },
+    {
+      "epoch": 1.2108980827447022,
+      "eval_loss": 0.0057380907237529755,
+      "eval_runtime": 20.8889,
+      "eval_samples_per_second": 84.351,
+      "eval_steps_per_second": 21.112,
+      "step": 4800
+    },
+    {
+      "epoch": 1.213420787083754,
+      "grad_norm": 0.0020024082623422146,
+      "learning_rate": 2.977632021527077e-05,
+      "loss": 0.0027,
+      "step": 4810
+    },
+    {
+      "epoch": 1.2159434914228053,
+      "grad_norm": 0.02490418404340744,
+      "learning_rate": 2.9734275142953248e-05,
+      "loss": 0.001,
+      "step": 4820
+    },
+    {
+      "epoch": 1.2184661957618568,
+      "grad_norm": 0.16153936088085175,
+      "learning_rate": 2.9692230070635723e-05,
+      "loss": 0.0021,
+      "step": 4830
+    },
+    {
+      "epoch": 1.2209889001009082,
+      "grad_norm": 0.1121373400092125,
+      "learning_rate": 2.9650184998318198e-05,
+      "loss": 0.002,
+      "step": 4840
+    },
+    {
+      "epoch": 1.2235116044399597,
+      "grad_norm": 0.0005092213395982981,
+      "learning_rate": 2.9608139926000672e-05,
+      "loss": 0.0031,
+      "step": 4850
+    },
+    {
+      "epoch": 1.226034308779011,
+      "grad_norm": 0.004732844419777393,
+      "learning_rate": 2.956609485368315e-05,
+      "loss": 0.0008,
+      "step": 4860
+    },
+    {
+      "epoch": 1.2285570131180625,
+      "grad_norm": 0.21401868760585785,
+      "learning_rate": 2.9524049781365625e-05,
+      "loss": 0.0103,
+      "step": 4870
+    },
+    {
+      "epoch": 1.231079717457114,
+      "grad_norm": 0.08474498987197876,
+      "learning_rate": 2.94820047090481e-05,
+      "loss": 0.0012,
+      "step": 4880
+    },
+    {
+      "epoch": 1.2336024217961654,
+      "grad_norm": 0.005474930163472891,
+      "learning_rate": 2.9439959636730574e-05,
+      "loss": 0.0018,
+      "step": 4890
+    },
+    {
+      "epoch": 1.2361251261352169,
+      "grad_norm": 0.20003733038902283,
+      "learning_rate": 2.9397914564413052e-05,
+      "loss": 0.0038,
+      "step": 4900
+    },
+    {
+      "epoch": 1.2361251261352169,
+      "eval_loss": 0.005515508819371462,
+      "eval_runtime": 20.9039,
+      "eval_samples_per_second": 84.29,
+      "eval_steps_per_second": 21.097,
+      "step": 4900
+    },
+    {
+      "epoch": 1.2386478304742683,
+      "grad_norm": 0.16607780754566193,
+      "learning_rate": 2.9355869492095527e-05,
+      "loss": 0.0037,
+      "step": 4910
+    },
+    {
+      "epoch": 1.24117053481332,
+      "grad_norm": 0.21575786173343658,
+      "learning_rate": 2.9313824419778e-05,
+      "loss": 0.0049,
+      "step": 4920
+    },
+    {
+      "epoch": 1.2436932391523714,
+      "grad_norm": 0.1419685035943985,
+      "learning_rate": 2.9271779347460483e-05,
+      "loss": 0.0056,
+      "step": 4930
+    },
+    {
+      "epoch": 1.2462159434914228,
+      "grad_norm": 0.0009386079618707299,
+      "learning_rate": 2.9229734275142954e-05,
+      "loss": 0.0015,
+      "step": 4940
+    },
+    {
+      "epoch": 1.2487386478304743,
+      "grad_norm": 0.011205712333321571,
+      "learning_rate": 2.918768920282543e-05,
+      "loss": 0.0026,
+      "step": 4950
+    },
+    {
+      "epoch": 1.2512613521695257,
+      "grad_norm": 0.21776536107063293,
+      "learning_rate": 2.9145644130507904e-05,
+      "loss": 0.0039,
+      "step": 4960
+    },
+    {
+      "epoch": 1.2537840565085772,
+      "grad_norm": 0.09818103164434433,
+      "learning_rate": 2.9103599058190385e-05,
+      "loss": 0.0011,
+      "step": 4970
+    },
+    {
+      "epoch": 1.2563067608476286,
+      "grad_norm": 0.16241490840911865,
+      "learning_rate": 2.9061553985872856e-05,
+      "loss": 0.002,
+      "step": 4980
+    },
+    {
+      "epoch": 1.25882946518668,
+      "grad_norm": 0.0026818953920155764,
+      "learning_rate": 2.901950891355533e-05,
+      "loss": 0.0021,
+      "step": 4990
+    },
+    {
+      "epoch": 1.2613521695257317,
+      "grad_norm": 0.026470551267266273,
+      "learning_rate": 2.8977463841237806e-05,
+      "loss": 0.006,
+      "step": 5000
+    },
+    {
+      "epoch": 1.2613521695257317,
+      "eval_loss": 0.00518006319180131,
+      "eval_runtime": 20.9271,
+      "eval_samples_per_second": 84.197,
+      "eval_steps_per_second": 21.073,
+      "step": 5000
+    },
+    {
+      "epoch": 1.2638748738647831,
+      "grad_norm": 0.0004209143517073244,
+      "learning_rate": 2.8935418768920287e-05,
+      "loss": 0.0033,
+      "step": 5010
+    },
+    {
+      "epoch": 1.2663975782038346,
+      "grad_norm": 0.0030910836067050695,
+      "learning_rate": 2.8893373696602762e-05,
+      "loss": 0.0024,
+      "step": 5020
+    },
+    {
+      "epoch": 1.268920282542886,
+      "grad_norm": 0.013859076425433159,
+      "learning_rate": 2.8851328624285233e-05,
+      "loss": 0.0073,
+      "step": 5030
+    },
+    {
+      "epoch": 1.2714429868819375,
+      "grad_norm": 0.0023835492320358753,
+      "learning_rate": 2.8809283551967708e-05,
+      "loss": 0.0023,
+      "step": 5040
+    },
+    {
+      "epoch": 1.273965691220989,
+      "grad_norm": 0.0017705514328554273,
+      "learning_rate": 2.876723847965019e-05,
+      "loss": 0.0009,
+      "step": 5050
+    },
+    {
+      "epoch": 1.2764883955600403,
+      "grad_norm": 0.09929084032773972,
+      "learning_rate": 2.8725193407332664e-05,
+      "loss": 0.0024,
+      "step": 5060
+    },
+    {
+      "epoch": 1.2790110998990918,
+      "grad_norm": 0.21266485750675201,
+      "learning_rate": 2.8683148335015135e-05,
+      "loss": 0.0051,
+      "step": 5070
+    },
+    {
+      "epoch": 1.2815338042381432,
+      "grad_norm": 0.045401476323604584,
+      "learning_rate": 2.864110326269761e-05,
+      "loss": 0.001,
+      "step": 5080
+    },
+    {
+      "epoch": 1.2840565085771947,
+      "grad_norm": 0.010040095075964928,
+      "learning_rate": 2.859905819038009e-05,
+      "loss": 0.0049,
+      "step": 5090
+    },
+    {
+      "epoch": 1.286579212916246,
+      "grad_norm": 0.1340843141078949,
+      "learning_rate": 2.8557013118062566e-05,
+      "loss": 0.0038,
+      "step": 5100
+    },
+    {
+      "epoch": 1.286579212916246,
+      "eval_loss": 0.0053547462448477745,
+      "eval_runtime": 20.8929,
+      "eval_samples_per_second": 84.335,
+      "eval_steps_per_second": 21.108,
+      "step": 5100
+    },
+    {
+      "epoch": 1.2891019172552975,
+      "grad_norm": 0.060051582753658295,
+      "learning_rate": 2.8514968045745037e-05,
+      "loss": 0.0005,
+      "step": 5110
+    },
+    {
+      "epoch": 1.2916246215943492,
+      "grad_norm": 0.000854416866786778,
+      "learning_rate": 2.847292297342752e-05,
+      "loss": 0.0023,
+      "step": 5120
+    },
+    {
+      "epoch": 1.2941473259334006,
+      "grad_norm": 0.002330298302695155,
+      "learning_rate": 2.8430877901109993e-05,
+      "loss": 0.0031,
+      "step": 5130
+    },
+    {
+      "epoch": 1.296670030272452,
+      "grad_norm": 0.08991765975952148,
+      "learning_rate": 2.8388832828792468e-05,
+      "loss": 0.001,
+      "step": 5140
+    },
+    {
+      "epoch": 1.2991927346115035,
+      "grad_norm": 0.1184747964143753,
+      "learning_rate": 2.834678775647494e-05,
+      "loss": 0.0023,
+      "step": 5150
+    },
+    {
+      "epoch": 1.301715438950555,
+      "grad_norm": 0.023154448717832565,
+      "learning_rate": 2.830474268415742e-05,
+      "loss": 0.0024,
+      "step": 5160
+    },
+    {
+      "epoch": 1.3042381432896064,
+      "grad_norm": 0.0035342529881745577,
+      "learning_rate": 2.8262697611839895e-05,
+      "loss": 0.0033,
+      "step": 5170
+    },
+    {
+      "epoch": 1.3067608476286579,
+      "grad_norm": 0.09643299877643585,
+      "learning_rate": 2.822065253952237e-05,
+      "loss": 0.0068,
+      "step": 5180
+    },
+    {
+      "epoch": 1.3092835519677095,
+      "grad_norm": 0.0010538576170802116,
+      "learning_rate": 2.817860746720484e-05,
+      "loss": 0.0032,
+      "step": 5190
+    },
+    {
+      "epoch": 1.311806256306761,
+      "grad_norm": 0.004331338219344616,
+      "learning_rate": 2.8136562394887323e-05,
+      "loss": 0.0065,
+      "step": 5200
+    },
+    {
+      "epoch": 1.311806256306761,
+      "eval_loss": 0.005469166673719883,
+      "eval_runtime": 20.887,
+      "eval_samples_per_second": 84.359,
+      "eval_steps_per_second": 21.114,
+      "step": 5200
+    },
+    {
+      "epoch": 1.3143289606458124,
+      "grad_norm": 0.11001147329807281,
+      "learning_rate": 2.8094517322569797e-05,
+      "loss": 0.0027,
+      "step": 5210
+    },
+    {
+      "epoch": 1.3168516649848638,
+      "grad_norm": 0.006123987026512623,
+      "learning_rate": 2.8052472250252272e-05,
+      "loss": 0.0048,
+      "step": 5220
+    },
+    {
+      "epoch": 1.3193743693239153,
+      "grad_norm": 0.018299918621778488,
+      "learning_rate": 2.8010427177934743e-05,
+      "loss": 0.0046,
+      "step": 5230
+    },
+    {
+      "epoch": 1.3218970736629667,
+      "grad_norm": 0.04792286828160286,
+      "learning_rate": 2.7968382105617225e-05,
+      "loss": 0.0026,
+      "step": 5240
+    },
+    {
+      "epoch": 1.3244197780020182,
+      "grad_norm": 0.0024629354011267424,
+      "learning_rate": 2.79263370332997e-05,
+      "loss": 0.0021,
+      "step": 5250
+    },
+    {
+      "epoch": 1.3269424823410696,
+      "grad_norm": 0.00013681373093277216,
+      "learning_rate": 2.7884291960982174e-05,
+      "loss": 0.0044,
+      "step": 5260
+    },
+    {
+      "epoch": 1.329465186680121,
+      "grad_norm": 0.10849064588546753,
+      "learning_rate": 2.7842246888664652e-05,
+      "loss": 0.0034,
+      "step": 5270
+    },
+    {
+      "epoch": 1.3319878910191725,
+      "grad_norm": 0.0731433853507042,
+      "learning_rate": 2.7800201816347127e-05,
+      "loss": 0.0026,
+      "step": 5280
+    },
+    {
+      "epoch": 1.334510595358224,
+      "grad_norm": 0.0010674018412828445,
+      "learning_rate": 2.77581567440296e-05,
+      "loss": 0.0009,
+      "step": 5290
+    },
+    {
+      "epoch": 1.3370332996972754,
+      "grad_norm": 0.17949962615966797,
+      "learning_rate": 2.7716111671712076e-05,
+      "loss": 0.0015,
+      "step": 5300
+    },
+    {
+      "epoch": 1.3370332996972754,
+      "eval_loss": 0.005221678409725428,
+      "eval_runtime": 20.8945,
+      "eval_samples_per_second": 84.328,
+      "eval_steps_per_second": 21.106,
+      "step": 5300
+    },
+    {
+      "epoch": 1.339556004036327,
+      "grad_norm": 0.0010082372464239597,
+      "learning_rate": 2.7674066599394554e-05,
+      "loss": 0.0003,
+      "step": 5310
+    },
+    {
+      "epoch": 1.3420787083753785,
+      "grad_norm": 0.00024729970027692616,
+      "learning_rate": 2.763202152707703e-05,
+      "loss": 0.0012,
+      "step": 5320
+    },
+    {
+      "epoch": 1.34460141271443,
+      "grad_norm": 0.09433750808238983,
+      "learning_rate": 2.7589976454759504e-05,
+      "loss": 0.0016,
+      "step": 5330
+    },
+    {
+      "epoch": 1.3471241170534813,
+      "grad_norm": 0.001336489338427782,
+      "learning_rate": 2.7547931382441978e-05,
+      "loss": 0.0024,
+      "step": 5340
+    },
+    {
+      "epoch": 1.3496468213925328,
+      "grad_norm": 0.012806025333702564,
+      "learning_rate": 2.7505886310124456e-05,
+      "loss": 0.0033,
+      "step": 5350
+    },
+    {
+      "epoch": 1.3521695257315842,
+      "grad_norm": 0.16509069502353668,
+      "learning_rate": 2.746384123780693e-05,
+      "loss": 0.0086,
+      "step": 5360
+    },
+    {
+      "epoch": 1.3546922300706357,
+      "grad_norm": 0.0008099581464193761,
+      "learning_rate": 2.7421796165489406e-05,
+      "loss": 0.0024,
+      "step": 5370
+    },
+    {
+      "epoch": 1.357214934409687,
+      "grad_norm": 0.004303140100091696,
+      "learning_rate": 2.737975109317188e-05,
+      "loss": 0.0027,
+      "step": 5380
+    },
+    {
+      "epoch": 1.3597376387487388,
+      "grad_norm": 0.00023327719827648252,
+      "learning_rate": 2.733770602085436e-05,
+      "loss": 0.0013,
+      "step": 5390
+    },
+    {
+      "epoch": 1.3622603430877902,
+      "grad_norm": 0.003809950314462185,
+      "learning_rate": 2.7295660948536833e-05,
+      "loss": 0.0005,
+      "step": 5400
+    },
+    {
+      "epoch": 1.3622603430877902,
+      "eval_loss": 0.005022699944674969,
+      "eval_runtime": 21.0476,
+      "eval_samples_per_second": 83.715,
+      "eval_steps_per_second": 20.952,
+      "step": 5400
+    },
+    {
+      "epoch": 1.3647830474268416,
+      "grad_norm": 0.00021514434774871916,
+      "learning_rate": 2.7253615876219308e-05,
+      "loss": 0.0028,
+      "step": 5410
+    },
+    {
+      "epoch": 1.367305751765893,
+      "grad_norm": 0.17706815898418427,
+      "learning_rate": 2.7211570803901782e-05,
+      "loss": 0.0014,
+      "step": 5420
+    },
+    {
+      "epoch": 1.3698284561049445,
+      "grad_norm": 0.004937909543514252,
+      "learning_rate": 2.716952573158426e-05,
+      "loss": 0.0031,
+      "step": 5430
+    },
+    {
+      "epoch": 1.372351160443996,
+      "grad_norm": 0.0958208441734314,
+      "learning_rate": 2.7127480659266735e-05,
+      "loss": 0.0033,
+      "step": 5440
+    },
+    {
+      "epoch": 1.3748738647830474,
+      "grad_norm": 0.06263504922389984,
+      "learning_rate": 2.708543558694921e-05,
+      "loss": 0.0001,
+      "step": 5450
+    },
+    {
+      "epoch": 1.3773965691220988,
+      "grad_norm": 0.003332935506477952,
+      "learning_rate": 2.7043390514631688e-05,
+      "loss": 0.0002,
+      "step": 5460
+    },
+    {
+      "epoch": 1.3799192734611503,
+      "grad_norm": 0.16171465814113617,
+      "learning_rate": 2.7001345442314162e-05,
+      "loss": 0.004,
+      "step": 5470
+    },
+    {
+      "epoch": 1.3824419778002017,
+      "grad_norm": 0.04109754040837288,
+      "learning_rate": 2.6959300369996637e-05,
+      "loss": 0.0055,
+      "step": 5480
+    },
+    {
+      "epoch": 1.3849646821392532,
+      "grad_norm": 0.0015252727316692472,
+      "learning_rate": 2.6917255297679112e-05,
+      "loss": 0.0038,
+      "step": 5490
+    },
+    {
+      "epoch": 1.3874873864783046,
+      "grad_norm": 0.0015239976346492767,
+      "learning_rate": 2.687521022536159e-05,
+      "loss": 0.001,
+      "step": 5500
+    },
+    {
+      "epoch": 1.3874873864783046,
+      "eval_loss": 0.005176006816327572,
+      "eval_runtime": 21.011,
+      "eval_samples_per_second": 83.861,
+      "eval_steps_per_second": 20.989,
+      "step": 5500
+    },
+    {
+      "epoch": 1.3900100908173563,
+      "grad_norm": 0.08292572945356369,
+      "learning_rate": 2.6833165153044064e-05,
+      "loss": 0.0015,
+      "step": 5510
+    },
+    {
+      "epoch": 1.3925327951564077,
+      "grad_norm": 0.011006727814674377,
+      "learning_rate": 2.679112008072654e-05,
+      "loss": 0.0019,
+      "step": 5520
+    },
+    {
+      "epoch": 1.3950554994954592,
+      "grad_norm": 0.15567320585250854,
+      "learning_rate": 2.6749075008409014e-05,
+      "loss": 0.0021,
+      "step": 5530
+    },
+    {
+      "epoch": 1.3975782038345106,
+      "grad_norm": 0.09949897229671478,
+      "learning_rate": 2.6707029936091492e-05,
+      "loss": 0.002,
+      "step": 5540
+    },
+    {
+      "epoch": 1.400100908173562,
+      "grad_norm": 0.07961593568325043,
+      "learning_rate": 2.6664984863773967e-05,
+      "loss": 0.0009,
+      "step": 5550
+    },
+    {
+      "epoch": 1.4026236125126135,
+      "grad_norm": 0.15322332084178925,
+      "learning_rate": 2.662293979145644e-05,
+      "loss": 0.0024,
+      "step": 5560
+    },
+    {
+      "epoch": 1.405146316851665,
+      "grad_norm": 0.1159447729587555,
+      "learning_rate": 2.6580894719138916e-05,
+      "loss": 0.0019,
+      "step": 5570
+    },
+    {
+      "epoch": 1.4076690211907166,
+      "grad_norm": 0.0029101588297635317,
+      "learning_rate": 2.6538849646821394e-05,
+      "loss": 0.0008,
+      "step": 5580
+    },
+    {
+      "epoch": 1.410191725529768,
+      "grad_norm": 0.0002611145027913153,
+      "learning_rate": 2.649680457450387e-05,
+      "loss": 0.0055,
+      "step": 5590
+    },
+    {
+      "epoch": 1.4127144298688195,
+      "grad_norm": 0.14663146436214447,
+      "learning_rate": 2.6454759502186343e-05,
+      "loss": 0.0039,
+      "step": 5600
+    },
+    {
+      "epoch": 1.4127144298688195,
+      "eval_loss": 0.005056018941104412,
+      "eval_runtime": 20.9323,
+      "eval_samples_per_second": 84.176,
+      "eval_steps_per_second": 21.068,
+      "step": 5600
+    },
+    {
+      "epoch": 1.415237134207871,
+      "grad_norm": 0.0049458956345915794,
+      "learning_rate": 2.6412714429868825e-05,
+      "loss": 0.001,
+      "step": 5610
+    },
+    {
+      "epoch": 1.4177598385469223,
+      "grad_norm": 0.3080619275569916,
+      "learning_rate": 2.6370669357551296e-05,
+      "loss": 0.0071,
+      "step": 5620
+    },
+    {
+      "epoch": 1.4202825428859738,
+      "grad_norm": 0.0023910084273666143,
+      "learning_rate": 2.632862428523377e-05,
+      "loss": 0.0017,
+      "step": 5630
+    },
+    {
+      "epoch": 1.4228052472250252,
+      "grad_norm": 0.0009933901019394398,
+      "learning_rate": 2.6286579212916245e-05,
+      "loss": 0.0044,
+      "step": 5640
+    },
+    {
+      "epoch": 1.4253279515640767,
+      "grad_norm": 0.02665986306965351,
+      "learning_rate": 2.6244534140598727e-05,
+      "loss": 0.0009,
+      "step": 5650
+    },
+    {
+      "epoch": 1.427850655903128,
+      "grad_norm": 0.17384563386440277,
+      "learning_rate": 2.6202489068281198e-05,
+      "loss": 0.0065,
+      "step": 5660
+    },
+    {
+      "epoch": 1.4303733602421795,
+      "grad_norm": 0.05648142844438553,
+      "learning_rate": 2.6160443995963673e-05,
+      "loss": 0.0016,
+      "step": 5670
+    },
+    {
+      "epoch": 1.432896064581231,
+      "grad_norm": 0.004266271833330393,
+      "learning_rate": 2.6118398923646147e-05,
+      "loss": 0.0028,
+      "step": 5680
+    },
+    {
+      "epoch": 1.4354187689202824,
+      "grad_norm": 0.020753854885697365,
+      "learning_rate": 2.607635385132863e-05,
+      "loss": 0.002,
+      "step": 5690
+    },
+    {
+      "epoch": 1.437941473259334,
+      "grad_norm": 0.08341605216264725,
+      "learning_rate": 2.6034308779011103e-05,
+      "loss": 0.0023,
+      "step": 5700
+    },
+    {
+      "epoch": 1.437941473259334,
+      "eval_loss": 0.004923286382108927,
+      "eval_runtime": 20.9235,
+      "eval_samples_per_second": 84.212,
+      "eval_steps_per_second": 21.077,
+      "step": 5700
+    },
+    {
+      "epoch": 1.4404641775983855,
+      "grad_norm": 0.007267744280397892,
+      "learning_rate": 2.5992263706693575e-05,
+      "loss": 0.003,
+      "step": 5710
+    },
+    {
+      "epoch": 1.442986881937437,
+      "grad_norm": 0.00982646644115448,
+      "learning_rate": 2.595021863437605e-05,
+      "loss": 0.0001,
+      "step": 5720
+    },
+    {
+      "epoch": 1.4455095862764884,
+      "grad_norm": 0.0013306884793564677,
+      "learning_rate": 2.590817356205853e-05,
+      "loss": 0.0019,
+      "step": 5730
+    },
+    {
+      "epoch": 1.4480322906155398,
+      "grad_norm": 0.037949543446302414,
+      "learning_rate": 2.5866128489741005e-05,
+      "loss": 0.0014,
+      "step": 5740
+    },
+    {
+      "epoch": 1.4505549949545913,
+      "grad_norm": 0.0034357199911028147,
+      "learning_rate": 2.5824083417423477e-05,
+      "loss": 0.0025,
+      "step": 5750
+    },
+    {
+      "epoch": 1.4530776992936427,
+      "grad_norm": 0.08490198105573654,
+      "learning_rate": 2.578203834510595e-05,
+      "loss": 0.0016,
+      "step": 5760
+    },
+    {
+      "epoch": 1.4556004036326944,
+      "grad_norm": 0.09188306331634521,
+      "learning_rate": 2.5739993272788433e-05,
+      "loss": 0.0006,
+      "step": 5770
+    },
+    {
+      "epoch": 1.4581231079717458,
+      "grad_norm": 0.3032228350639343,
+      "learning_rate": 2.5697948200470908e-05,
+      "loss": 0.0037,
+      "step": 5780
+    },
+    {
+      "epoch": 1.4606458123107973,
+      "grad_norm": 0.0033124187029898167,
+      "learning_rate": 2.565590312815338e-05,
+      "loss": 0.0003,
+      "step": 5790
+    },
+    {
+      "epoch": 1.4631685166498487,
+      "grad_norm": 0.18161827325820923,
+      "learning_rate": 2.561385805583586e-05,
+      "loss": 0.0035,
+      "step": 5800
+    },
+    {
+      "epoch": 1.4631685166498487,
+      "eval_loss": 0.0052693067118525505,
+      "eval_runtime": 20.897,
+      "eval_samples_per_second": 84.318,
+      "eval_steps_per_second": 21.103,
+      "step": 5800
+    },
+    {
+      "epoch": 1.4656912209889001,
+      "grad_norm": 0.005931831430643797,
+      "learning_rate": 2.5571812983518335e-05,
+      "loss": 0.0028,
+      "step": 5810
+    },
+    {
+      "epoch": 1.4682139253279516,
+      "grad_norm": 0.0015284974360838532,
+      "learning_rate": 2.552976791120081e-05,
+      "loss": 0.0014,
+      "step": 5820
+    },
+    {
+      "epoch": 1.470736629667003,
+      "grad_norm": 0.035359546542167664,
+      "learning_rate": 2.548772283888328e-05,
+      "loss": 0.0017,
+      "step": 5830
+    },
+    {
+      "epoch": 1.4732593340060545,
+      "grad_norm": 0.0004528906138148159,
+      "learning_rate": 2.5445677766565762e-05,
+      "loss": 0.003,
+      "step": 5840
+    },
+    {
+      "epoch": 1.475782038345106,
+      "grad_norm": 0.0017564162844792008,
+      "learning_rate": 2.5403632694248237e-05,
+      "loss": 0.0016,
+      "step": 5850
+    },
+    {
+      "epoch": 1.4783047426841573,
+      "grad_norm": 0.1506708413362503,
+      "learning_rate": 2.536158762193071e-05,
+      "loss": 0.0008,
+      "step": 5860
+    },
+    {
+      "epoch": 1.4808274470232088,
+      "grad_norm": 0.0023614871315658092,
+      "learning_rate": 2.5319542549613183e-05,
+      "loss": 0.0022,
+      "step": 5870
+    },
+    {
+      "epoch": 1.4833501513622602,
+      "grad_norm": 0.00034669501474127173,
+      "learning_rate": 2.5277497477295664e-05,
+      "loss": 0.0032,
+      "step": 5880
+    },
+    {
+      "epoch": 1.4858728557013117,
+      "grad_norm": 0.0016111385775730014,
+      "learning_rate": 2.523545240497814e-05,
+      "loss": 0.0037,
+      "step": 5890
+    },
+    {
+      "epoch": 1.4883955600403633,
+      "grad_norm": 0.00014663147157989442,
+      "learning_rate": 2.5193407332660614e-05,
+      "loss": 0.0008,
+      "step": 5900
+    },
+    {
+      "epoch": 1.4883955600403633,
+      "eval_loss": 0.004971860907971859,
+      "eval_runtime": 20.9024,
+      "eval_samples_per_second": 84.296,
+      "eval_steps_per_second": 21.098,
+      "step": 5900
+    },
+    {
+      "epoch": 1.4909182643794148,
+      "grad_norm": 0.15087057650089264,
+      "learning_rate": 2.5151362260343085e-05,
+      "loss": 0.0027,
+      "step": 5910
+    },
+    {
+      "epoch": 1.4934409687184662,
+      "grad_norm": 0.18127664923667908,
+      "learning_rate": 2.5109317188025566e-05,
+      "loss": 0.0042,
+      "step": 5920
+    },
+    {
+      "epoch": 1.4959636730575177,
+      "grad_norm": 0.069893017411232,
+      "learning_rate": 2.506727211570804e-05,
+      "loss": 0.0022,
+      "step": 5930
+    },
+    {
+      "epoch": 1.498486377396569,
+      "grad_norm": 0.00019991624867543578,
+      "learning_rate": 2.5025227043390516e-05,
+      "loss": 0.0017,
+      "step": 5940
+    },
+    {
+      "epoch": 1.5010090817356205,
+      "grad_norm": 0.09269930422306061,
+      "learning_rate": 2.498318197107299e-05,
+      "loss": 0.0028,
+      "step": 5950
+    },
+    {
+      "epoch": 1.5035317860746722,
+      "grad_norm": 0.06926806271076202,
+      "learning_rate": 2.494113689875547e-05,
+      "loss": 0.0039,
+      "step": 5960
+    },
+    {
+      "epoch": 1.5060544904137236,
+      "grad_norm": 0.03350943699479103,
+      "learning_rate": 2.4899091826437943e-05,
+      "loss": 0.0052,
+      "step": 5970
+    },
+    {
+      "epoch": 1.508577194752775,
+      "grad_norm": 0.008175240829586983,
+      "learning_rate": 2.4857046754120418e-05,
+      "loss": 0.0023,
+      "step": 5980
+    },
+    {
+      "epoch": 1.5110998990918265,
+      "grad_norm": 0.1838151216506958,
+      "learning_rate": 2.4815001681802892e-05,
+      "loss": 0.0031,
+      "step": 5990
+    },
+    {
+      "epoch": 1.513622603430878,
+      "grad_norm": 0.11169478297233582,
+      "learning_rate": 2.477295660948537e-05,
+      "loss": 0.0042,
+      "step": 6000
+    },
+    {
+      "epoch": 1.513622603430878,
+      "eval_loss": 0.004873940721154213,
+      "eval_runtime": 20.8974,
+      "eval_samples_per_second": 84.317,
+      "eval_steps_per_second": 21.103,
+      "step": 6000
+    },
+    {
+      "epoch": 1.5161453077699294,
+      "grad_norm": 0.0018095527775585651,
+      "learning_rate": 2.4730911537167845e-05,
+      "loss": 0.0023,
+      "step": 6010
+    },
+    {
+      "epoch": 1.5186680121089808,
+      "grad_norm": 0.0017755021108314395,
+      "learning_rate": 2.468886646485032e-05,
+      "loss": 0.0007,
+      "step": 6020
+    },
+    {
+      "epoch": 1.5211907164480323,
+      "grad_norm": 0.1006636768579483,
+      "learning_rate": 2.4646821392532794e-05,
+      "loss": 0.0054,
+      "step": 6030
+    },
+    {
+      "epoch": 1.5237134207870837,
+      "grad_norm": 0.17334707081317902,
+      "learning_rate": 2.4604776320215273e-05,
+      "loss": 0.0031,
+      "step": 6040
+    },
+    {
+      "epoch": 1.5262361251261352,
+      "grad_norm": 0.004185323137789965,
+      "learning_rate": 2.4562731247897747e-05,
+      "loss": 0.0018,
+      "step": 6050
+    },
+    {
+      "epoch": 1.5287588294651866,
+      "grad_norm": 0.06221470236778259,
+      "learning_rate": 2.4520686175580225e-05,
+      "loss": 0.0032,
+      "step": 6060
+    },
+    {
+      "epoch": 1.531281533804238,
+      "grad_norm": 0.10330460220575333,
+      "learning_rate": 2.4478641103262697e-05,
+      "loss": 0.0035,
+      "step": 6070
+    },
+    {
+      "epoch": 1.5338042381432895,
+      "grad_norm": 0.13589535653591156,
+      "learning_rate": 2.4436596030945175e-05,
+      "loss": 0.0016,
+      "step": 6080
+    },
+    {
+      "epoch": 1.536326942482341,
+      "grad_norm": 0.19307217001914978,
+      "learning_rate": 2.439455095862765e-05,
+      "loss": 0.0039,
+      "step": 6090
+    },
+    {
+      "epoch": 1.5388496468213926,
+      "grad_norm": 0.07404123246669769,
+      "learning_rate": 2.4352505886310127e-05,
+      "loss": 0.0008,
+      "step": 6100
+    },
+    {
+      "epoch": 1.5388496468213926,
+      "eval_loss": 0.005088960751891136,
+      "eval_runtime": 20.9119,
+      "eval_samples_per_second": 84.258,
+      "eval_steps_per_second": 21.088,
+      "step": 6100
+    },
+    {
+      "epoch": 1.541372351160444,
+      "grad_norm": 0.06749244034290314,
+      "learning_rate": 2.43104608139926e-05,
+      "loss": 0.0045,
+      "step": 6110
+    },
+    {
+      "epoch": 1.5438950554994955,
+      "grad_norm": 0.00822696927934885,
+      "learning_rate": 2.4268415741675077e-05,
+      "loss": 0.0022,
+      "step": 6120
+    },
+    {
+      "epoch": 1.546417759838547,
+      "grad_norm": 0.0362759605050087,
+      "learning_rate": 2.4226370669357555e-05,
+      "loss": 0.0018,
+      "step": 6130
+    },
+    {
+      "epoch": 1.5489404641775983,
+      "grad_norm": 0.0018353847553953528,
+      "learning_rate": 2.418432559704003e-05,
+      "loss": 0.0033,
+      "step": 6140
+    },
+    {
+      "epoch": 1.55146316851665,
+      "grad_norm": 0.0009886518819257617,
+      "learning_rate": 2.4142280524722504e-05,
+      "loss": 0.0033,
+      "step": 6150
+    },
+    {
+      "epoch": 1.5539858728557014,
+      "grad_norm": 0.005221598315984011,
+      "learning_rate": 2.410023545240498e-05,
+      "loss": 0.0042,
+      "step": 6160
+    },
+    {
+      "epoch": 1.5565085771947529,
+      "grad_norm": 0.02474922128021717,
+      "learning_rate": 2.4058190380087457e-05,
+      "loss": 0.0059,
+      "step": 6170
+    },
+    {
+      "epoch": 1.5590312815338043,
+      "grad_norm": 0.005371175706386566,
+      "learning_rate": 2.401614530776993e-05,
+      "loss": 0.004,
+      "step": 6180
+    },
+    {
+      "epoch": 1.5615539858728558,
+      "grad_norm": 0.1124267429113388,
+      "learning_rate": 2.3974100235452406e-05,
+      "loss": 0.0007,
+      "step": 6190
+    },
+    {
+      "epoch": 1.5640766902119072,
+      "grad_norm": 0.009999338537454605,
+      "learning_rate": 2.393205516313488e-05,
+      "loss": 0.0018,
+      "step": 6200
+    },
+    {
+      "epoch": 1.5640766902119072,
+      "eval_loss": 0.004953174851834774,
+      "eval_runtime": 20.8923,
+      "eval_samples_per_second": 84.337,
+      "eval_steps_per_second": 21.108,
+      "step": 6200
+    },
+    {
+      "epoch": 1.5665993945509586,
+      "grad_norm": 0.028597630560398102,
+      "learning_rate": 2.389001009081736e-05,
+      "loss": 0.0044,
+      "step": 6210
+    },
+    {
+      "epoch": 1.56912209889001,
+      "grad_norm": 0.0002847505093086511,
+      "learning_rate": 2.3847965018499833e-05,
+      "loss": 0.0013,
+      "step": 6220
+    },
+    {
+      "epoch": 1.5716448032290615,
+      "grad_norm": 0.12137818336486816,
+      "learning_rate": 2.3805919946182308e-05,
+      "loss": 0.0025,
+      "step": 6230
+    },
+    {
+      "epoch": 1.574167507568113,
+      "grad_norm": 0.05651724711060524,
+      "learning_rate": 2.3763874873864783e-05,
+      "loss": 0.0022,
+      "step": 6240
+    },
+    {
+      "epoch": 1.5766902119071644,
+      "grad_norm": 0.225179061293602,
+      "learning_rate": 2.372182980154726e-05,
+      "loss": 0.0012,
+      "step": 6250
+    },
+    {
+      "epoch": 1.5792129162462158,
+      "grad_norm": 0.0041250442154705524,
+      "learning_rate": 2.3679784729229735e-05,
+      "loss": 0.0055,
+      "step": 6260
+    },
+    {
+      "epoch": 1.5817356205852673,
+      "grad_norm": 0.008712096139788628,
+      "learning_rate": 2.363773965691221e-05,
+      "loss": 0.0014,
+      "step": 6270
+    },
+    {
+      "epoch": 1.5842583249243187,
+      "grad_norm": 0.007221329025924206,
+      "learning_rate": 2.3595694584594685e-05,
+      "loss": 0.0022,
+      "step": 6280
+    },
+    {
+      "epoch": 1.5867810292633702,
+      "grad_norm": 0.012200511991977692,
+      "learning_rate": 2.3553649512277163e-05,
+      "loss": 0.0012,
+      "step": 6290
+    },
+    {
+      "epoch": 1.5893037336024218,
+      "grad_norm": 0.0005704654031433165,
+      "learning_rate": 2.3511604439959638e-05,
+      "loss": 0.0013,
+      "step": 6300
+    },
+    {
+      "epoch": 1.5893037336024218,
+      "eval_loss": 0.0048631117679178715,
+      "eval_runtime": 20.8908,
+      "eval_samples_per_second": 84.344,
+      "eval_steps_per_second": 21.11,
+      "step": 6300
+    },
+    {
+      "epoch": 1.5918264379414733,
+      "grad_norm": 0.002790976082906127,
+      "learning_rate": 2.3469559367642112e-05,
+      "loss": 0.0013,
+      "step": 6310
+    },
+    {
+      "epoch": 1.5943491422805247,
+      "grad_norm": 0.0014387964038178325,
+      "learning_rate": 2.342751429532459e-05,
+      "loss": 0.0007,
+      "step": 6320
+    },
+    {
+      "epoch": 1.5968718466195762,
+      "grad_norm": 0.00022175043704919517,
+      "learning_rate": 2.3385469223007065e-05,
+      "loss": 0.0035,
+      "step": 6330
+    },
+    {
+      "epoch": 1.5993945509586278,
+      "grad_norm": 0.11953356117010117,
+      "learning_rate": 2.334342415068954e-05,
+      "loss": 0.0048,
+      "step": 6340
+    },
+    {
+      "epoch": 1.6019172552976793,
+      "grad_norm": 0.18491606414318085,
+      "learning_rate": 2.3301379078372014e-05,
+      "loss": 0.0038,
+      "step": 6350
+    },
+    {
+      "epoch": 1.6044399596367307,
+      "grad_norm": 0.19568416476249695,
+      "learning_rate": 2.3259334006054492e-05,
+      "loss": 0.0027,
+      "step": 6360
+    },
+    {
+      "epoch": 1.6069626639757821,
+      "grad_norm": 0.08327057212591171,
+      "learning_rate": 2.3217288933736967e-05,
+      "loss": 0.0012,
+      "step": 6370
+    },
+    {
+      "epoch": 1.6094853683148336,
+      "grad_norm": 0.03957786411046982,
+      "learning_rate": 2.3175243861419445e-05,
+      "loss": 0.0037,
+      "step": 6380
+    },
+    {
+      "epoch": 1.612008072653885,
+      "grad_norm": 0.003976314328610897,
+      "learning_rate": 2.3133198789101916e-05,
+      "loss": 0.0015,
+      "step": 6390
+    },
+    {
+      "epoch": 1.6145307769929365,
+      "grad_norm": 0.05154380202293396,
+      "learning_rate": 2.3091153716784394e-05,
+      "loss": 0.0003,
+      "step": 6400
+    },
+    {
+      "epoch": 1.6145307769929365,
+      "eval_loss": 0.0047143567353487015,
+      "eval_runtime": 20.8928,
+      "eval_samples_per_second": 84.335,
+      "eval_steps_per_second": 21.108,
+      "step": 6400
+    },
+    {
+      "epoch": 1.617053481331988,
+      "grad_norm": 0.0036455143708735704,
+      "learning_rate": 2.304910864446687e-05,
+      "loss": 0.0023,
+      "step": 6410
+    },
+    {
+      "epoch": 1.6195761856710393,
+      "grad_norm": 0.0702298954129219,
+      "learning_rate": 2.3007063572149347e-05,
+      "loss": 0.0019,
+      "step": 6420
+    },
+    {
+      "epoch": 1.6220988900100908,
+      "grad_norm": 0.002678563119843602,
+      "learning_rate": 2.296501849983182e-05,
+      "loss": 0.0023,
+      "step": 6430
+    },
+    {
+      "epoch": 1.6246215943491422,
+      "grad_norm": 0.002132556401193142,
+      "learning_rate": 2.2922973427514296e-05,
+      "loss": 0.0018,
+      "step": 6440
+    },
+    {
+      "epoch": 1.6271442986881937,
+      "grad_norm": 0.2592438757419586,
+      "learning_rate": 2.288092835519677e-05,
+      "loss": 0.0039,
+      "step": 6450
+    },
+    {
+      "epoch": 1.629667003027245,
+      "grad_norm": 0.07999824732542038,
+      "learning_rate": 2.283888328287925e-05,
+      "loss": 0.005,
+      "step": 6460
+    },
+    {
+      "epoch": 1.6321897073662965,
+      "grad_norm": 0.0010637440718710423,
+      "learning_rate": 2.2796838210561724e-05,
+      "loss": 0.0008,
+      "step": 6470
+    },
+    {
+      "epoch": 1.634712411705348,
+      "grad_norm": 0.2309955358505249,
+      "learning_rate": 2.27547931382442e-05,
+      "loss": 0.0033,
+      "step": 6480
+    },
+    {
+      "epoch": 1.6372351160443996,
+      "grad_norm": 0.003076382912695408,
+      "learning_rate": 2.2712748065926676e-05,
+      "loss": 0.0004,
+      "step": 6490
+    },
+    {
+      "epoch": 1.639757820383451,
+      "grad_norm": 0.13771465420722961,
+      "learning_rate": 2.267070299360915e-05,
+      "loss": 0.0016,
+      "step": 6500
+    },
+    {
+      "epoch": 1.639757820383451,
+      "eval_loss": 0.00458995345979929,
+      "eval_runtime": 20.8904,
+      "eval_samples_per_second": 84.345,
+      "eval_steps_per_second": 21.11,
+      "step": 6500
+    },
+    {
+      "epoch": 1.6422805247225025,
+      "grad_norm": 0.0014351740246638656,
+      "learning_rate": 2.2628657921291626e-05,
+      "loss": 0.0014,
+      "step": 6510
+    },
+    {
+      "epoch": 1.644803229061554,
+      "grad_norm": 0.08296415954828262,
+      "learning_rate": 2.25866128489741e-05,
+      "loss": 0.0012,
+      "step": 6520
+    },
+    {
+      "epoch": 1.6473259334006054,
+      "grad_norm": 0.00042011673212982714,
+      "learning_rate": 2.254456777665658e-05,
+      "loss": 0.0087,
+      "step": 6530
+    },
+    {
+      "epoch": 1.649848637739657,
+      "grad_norm": 0.01929500512778759,
+      "learning_rate": 2.2502522704339053e-05,
+      "loss": 0.0015,
+      "step": 6540
+    },
+    {
+      "epoch": 1.6523713420787085,
+      "grad_norm": 0.14432388544082642,
+      "learning_rate": 2.2460477632021528e-05,
+      "loss": 0.001,
+      "step": 6550
+    },
+    {
+      "epoch": 1.65489404641776,
+      "grad_norm": 0.0028279852122068405,
+      "learning_rate": 2.2418432559704003e-05,
+      "loss": 0.0022,
+      "step": 6560
+    },
+    {
+      "epoch": 1.6574167507568114,
+      "grad_norm": 0.12562096118927002,
+      "learning_rate": 2.237638748738648e-05,
+      "loss": 0.0023,
+      "step": 6570
+    },
+    {
+      "epoch": 1.6599394550958628,
+      "grad_norm": 0.12296324968338013,
+      "learning_rate": 2.2334342415068955e-05,
+      "loss": 0.0031,
+      "step": 6580
+    },
+    {
+      "epoch": 1.6624621594349143,
+      "grad_norm": 0.023628313094377518,
+      "learning_rate": 2.229229734275143e-05,
+      "loss": 0.0049,
+      "step": 6590
+    },
+    {
+      "epoch": 1.6649848637739657,
+      "grad_norm": 0.06037677451968193,
+      "learning_rate": 2.2250252270433905e-05,
+      "loss": 0.0015,
+      "step": 6600
+    },
+    {
+      "epoch": 1.6649848637739657,
+      "eval_loss": 0.004487240687012672,
+      "eval_runtime": 20.9059,
+      "eval_samples_per_second": 84.282,
+      "eval_steps_per_second": 21.094,
+      "step": 6600
+    },
+    {
+      "epoch": 1.6675075681130171,
+      "grad_norm": 0.038203973323106766,
+      "learning_rate": 2.2208207198116383e-05,
+      "loss": 0.0024,
+      "step": 6610
+    },
+    {
+      "epoch": 1.6700302724520686,
+      "grad_norm": 0.0006242411327548325,
+      "learning_rate": 2.2166162125798857e-05,
+      "loss": 0.0016,
+      "step": 6620
+    },
+    {
+      "epoch": 1.67255297679112,
+      "grad_norm": 0.24198785424232483,
+      "learning_rate": 2.2124117053481332e-05,
+      "loss": 0.0042,
+      "step": 6630
+    },
+    {
+      "epoch": 1.6750756811301715,
+      "grad_norm": 0.008689168840646744,
+      "learning_rate": 2.208207198116381e-05,
+      "loss": 0.0014,
+      "step": 6640
+    },
+    {
+      "epoch": 1.677598385469223,
+      "grad_norm": 0.0001791265094652772,
+      "learning_rate": 2.2040026908846285e-05,
+      "loss": 0.001,
+      "step": 6650
+    },
+    {
+      "epoch": 1.6801210898082743,
+      "grad_norm": 0.1187405064702034,
+      "learning_rate": 2.199798183652876e-05,
+      "loss": 0.0012,
+      "step": 6660
+    },
+    {
+      "epoch": 1.6826437941473258,
+      "grad_norm": 0.17541439831256866,
+      "learning_rate": 2.1955936764211234e-05,
+      "loss": 0.0015,
+      "step": 6670
+    },
+    {
+      "epoch": 1.6851664984863775,
+      "grad_norm": 0.025754814967513084,
+      "learning_rate": 2.1913891691893712e-05,
+      "loss": 0.0024,
+      "step": 6680
+    },
+    {
+      "epoch": 1.687689202825429,
+      "grad_norm": 0.16264697909355164,
+      "learning_rate": 2.1871846619576187e-05,
+      "loss": 0.0025,
+      "step": 6690
+    },
+    {
+      "epoch": 1.6902119071644803,
+      "grad_norm": 0.0034603734966367483,
+      "learning_rate": 2.1829801547258665e-05,
+      "loss": 0.0005,
+      "step": 6700
+    },
+    {
+      "epoch": 1.6902119071644803,
+      "eval_loss": 0.00464710732921958,
+      "eval_runtime": 20.9042,
+      "eval_samples_per_second": 84.289,
+      "eval_steps_per_second": 21.096,
+      "step": 6700
+    },
+    {
+      "epoch": 1.6927346115035318,
+      "grad_norm": 0.005144911352545023,
+      "learning_rate": 2.1787756474941136e-05,
+      "loss": 0.0005,
+      "step": 6710
+    },
+    {
+      "epoch": 1.6952573158425832,
+      "grad_norm": 0.2386636584997177,
+      "learning_rate": 2.1745711402623614e-05,
+      "loss": 0.0045,
+      "step": 6720
+    },
+    {
+      "epoch": 1.6977800201816349,
+      "grad_norm": 0.0006302748224698007,
+      "learning_rate": 2.170366633030609e-05,
+      "loss": 0.0003,
+      "step": 6730
+    },
+    {
+      "epoch": 1.7003027245206863,
+      "grad_norm": 0.04393825680017471,
+      "learning_rate": 2.1661621257988567e-05,
+      "loss": 0.0015,
+      "step": 6740
+    },
+    {
+      "epoch": 1.7028254288597378,
+      "grad_norm": 0.002832000143826008,
+      "learning_rate": 2.1619576185671038e-05,
+      "loss": 0.0029,
+      "step": 6750
+    },
+    {
+      "epoch": 1.7053481331987892,
+      "grad_norm": 0.16164688766002655,
+      "learning_rate": 2.1577531113353516e-05,
+      "loss": 0.0049,
+      "step": 6760
+    },
+    {
+      "epoch": 1.7078708375378406,
+      "grad_norm": 0.1447678804397583,
+      "learning_rate": 2.153548604103599e-05,
+      "loss": 0.0026,
+      "step": 6770
+    },
+    {
+      "epoch": 1.710393541876892,
+      "grad_norm": 0.005429640877991915,
+      "learning_rate": 2.149344096871847e-05,
+      "loss": 0.003,
+      "step": 6780
+    },
+    {
+      "epoch": 1.7129162462159435,
+      "grad_norm": 0.0007169700693339109,
+      "learning_rate": 2.145139589640094e-05,
+      "loss": 0.0012,
+      "step": 6790
+    },
+    {
+      "epoch": 1.715438950554995,
+      "grad_norm": 0.0009801093256101012,
+      "learning_rate": 2.1409350824083418e-05,
+      "loss": 0.0023,
+      "step": 6800
+    },
+    {
+      "epoch": 1.715438950554995,
+      "eval_loss": 0.0046570939011871815,
+      "eval_runtime": 20.8879,
+      "eval_samples_per_second": 84.355,
+      "eval_steps_per_second": 21.113,
+      "step": 6800
+    },
+    {
+      "epoch": 1.7179616548940464,
+      "grad_norm": 0.009192834608256817,
+      "learning_rate": 2.1367305751765896e-05,
+      "loss": 0.004,
+      "step": 6810
+    },
+    {
+      "epoch": 1.7204843592330978,
+      "grad_norm": 0.004604650661349297,
+      "learning_rate": 2.132526067944837e-05,
+      "loss": 0.0012,
+      "step": 6820
+    },
+    {
+      "epoch": 1.7230070635721493,
+      "grad_norm": 0.0017496418440714478,
+      "learning_rate": 2.1283215607130846e-05,
+      "loss": 0.0027,
+      "step": 6830
+    },
+    {
+      "epoch": 1.7255297679112007,
+      "grad_norm": 0.003268537111580372,
+      "learning_rate": 2.124117053481332e-05,
+      "loss": 0.0027,
+      "step": 6840
+    },
+    {
+      "epoch": 1.7280524722502522,
+      "grad_norm": 0.00015593957505188882,
+      "learning_rate": 2.11991254624958e-05,
+      "loss": 0.0034,
+      "step": 6850
+    },
+    {
+      "epoch": 1.7305751765893036,
+      "grad_norm": 0.0808212012052536,
+      "learning_rate": 2.1157080390178273e-05,
+      "loss": 0.0048,
+      "step": 6860
+    },
+    {
+      "epoch": 1.733097880928355,
+      "grad_norm": 0.24864982068538666,
+      "learning_rate": 2.1115035317860748e-05,
+      "loss": 0.0018,
+      "step": 6870
+    },
+    {
+      "epoch": 1.7356205852674067,
+      "grad_norm": 0.004026748705655336,
+      "learning_rate": 2.1072990245543222e-05,
+      "loss": 0.0033,
+      "step": 6880
+    },
+    {
+      "epoch": 1.7381432896064581,
+      "grad_norm": 0.017659470438957214,
+      "learning_rate": 2.10309451732257e-05,
+      "loss": 0.0016,
+      "step": 6890
+    },
+    {
+      "epoch": 1.7406659939455096,
+      "grad_norm": 0.004599269945174456,
+      "learning_rate": 2.0988900100908175e-05,
+      "loss": 0.0011,
+      "step": 6900
+    },
+    {
+      "epoch": 1.7406659939455096,
+      "eval_loss": 0.004562552087008953,
+      "eval_runtime": 20.8736,
+      "eval_samples_per_second": 84.413,
+      "eval_steps_per_second": 21.127,
+      "step": 6900
+    },
+    {
+      "epoch": 1.743188698284561,
+      "grad_norm": 0.00346059980802238,
+      "learning_rate": 2.094685502859065e-05,
+      "loss": 0.0026,
+      "step": 6910
+    },
+    {
+      "epoch": 1.7457114026236125,
+      "grad_norm": 0.03819597512483597,
+      "learning_rate": 2.0904809956273124e-05,
+      "loss": 0.0009,
+      "step": 6920
+    },
+    {
+      "epoch": 1.7482341069626641,
+      "grad_norm": 0.2355988621711731,
+      "learning_rate": 2.0862764883955602e-05,
+      "loss": 0.0041,
+      "step": 6930
+    },
+    {
+      "epoch": 1.7507568113017156,
+      "grad_norm": 0.0024086080957204103,
+      "learning_rate": 2.0820719811638077e-05,
+      "loss": 0.0029,
+      "step": 6940
+    },
+    {
+      "epoch": 1.753279515640767,
+      "grad_norm": 0.25141242146492004,
+      "learning_rate": 2.0778674739320552e-05,
+      "loss": 0.0027,
+      "step": 6950
+    },
+    {
+      "epoch": 1.7558022199798184,
+      "grad_norm": 0.08421680331230164,
+      "learning_rate": 2.0736629667003026e-05,
+      "loss": 0.0005,
+      "step": 6960
+    },
+    {
+      "epoch": 1.7583249243188699,
+      "grad_norm": 0.19552625715732574,
+      "learning_rate": 2.0694584594685504e-05,
+      "loss": 0.0014,
+      "step": 6970
+    },
+    {
+      "epoch": 1.7608476286579213,
+      "grad_norm": 0.0034374226815998554,
+      "learning_rate": 2.065253952236798e-05,
+      "loss": 0.0009,
+      "step": 6980
+    },
+    {
+      "epoch": 1.7633703329969728,
+      "grad_norm": 0.0022041252814233303,
+      "learning_rate": 2.0610494450050454e-05,
+      "loss": 0.0007,
+      "step": 6990
+    },
+    {
+      "epoch": 1.7658930373360242,
+      "grad_norm": 0.2155584841966629,
+      "learning_rate": 2.0568449377732932e-05,
+      "loss": 0.0025,
+      "step": 7000
+    },
+    {
+      "epoch": 1.7658930373360242,
+      "eval_loss": 0.004664108622819185,
+      "eval_runtime": 20.9248,
+      "eval_samples_per_second": 84.206,
+      "eval_steps_per_second": 21.075,
+      "step": 7000
+    },
+    {
+      "epoch": 1.7684157416750756,
+      "grad_norm": 0.00033274645102210343,
+      "learning_rate": 2.0526404305415406e-05,
+      "loss": 0.0011,
+      "step": 7010
+    },
+    {
+      "epoch": 1.770938446014127,
+      "grad_norm": 0.004436755087226629,
+      "learning_rate": 2.048435923309788e-05,
+      "loss": 0.0023,
+      "step": 7020
+    },
+    {
+      "epoch": 1.7734611503531785,
+      "grad_norm": 0.12980465590953827,
+      "learning_rate": 2.0442314160780356e-05,
+      "loss": 0.0012,
+      "step": 7030
+    },
+    {
+      "epoch": 1.77598385469223,
+      "grad_norm": 0.0001751563249854371,
+      "learning_rate": 2.0400269088462834e-05,
+      "loss": 0.0009,
+      "step": 7040
+    },
+    {
+      "epoch": 1.7785065590312814,
+      "grad_norm": 0.18885697424411774,
+      "learning_rate": 2.035822401614531e-05,
+      "loss": 0.0037,
+      "step": 7050
+    },
+    {
+      "epoch": 1.7810292633703328,
+      "grad_norm": 0.10697660595178604,
+      "learning_rate": 2.0316178943827787e-05,
+      "loss": 0.0016,
+      "step": 7060
+    },
+    {
+      "epoch": 1.7835519677093845,
+      "grad_norm": 0.02362459897994995,
+      "learning_rate": 2.0274133871510258e-05,
+      "loss": 0.0007,
+      "step": 7070
+    },
+    {
+      "epoch": 1.786074672048436,
+      "grad_norm": 8.634651749162003e-05,
+      "learning_rate": 2.0232088799192736e-05,
+      "loss": 0.0015,
+      "step": 7080
+    },
+    {
+      "epoch": 1.7885973763874874,
+      "grad_norm": 0.14645344018936157,
+      "learning_rate": 2.019004372687521e-05,
+      "loss": 0.0019,
+      "step": 7090
+    },
+    {
+      "epoch": 1.7911200807265388,
+      "grad_norm": 0.0008572082151658833,
+      "learning_rate": 2.014799865455769e-05,
+      "loss": 0.0022,
+      "step": 7100
+    },
+    {
+      "epoch": 1.7911200807265388,
+      "eval_loss": 0.0045646862126886845,
+      "eval_runtime": 20.8857,
+      "eval_samples_per_second": 84.364,
+      "eval_steps_per_second": 21.115,
+      "step": 7100
+    },
+    {
+      "epoch": 1.7936427850655903,
+      "grad_norm": 0.0013892538845539093,
+      "learning_rate": 2.010595358224016e-05,
+      "loss": 0.0028,
+      "step": 7110
+    },
+    {
+      "epoch": 1.796165489404642,
+      "grad_norm": 0.17493274807929993,
+      "learning_rate": 2.0063908509922638e-05,
+      "loss": 0.0026,
+      "step": 7120
+    },
+    {
+      "epoch": 1.7986881937436934,
+      "grad_norm": 0.0053392620757222176,
+      "learning_rate": 2.0021863437605113e-05,
+      "loss": 0.0006,
+      "step": 7130
+    },
+    {
+      "epoch": 1.8012108980827448,
+      "grad_norm": 0.05420933663845062,
+      "learning_rate": 1.997981836528759e-05,
+      "loss": 0.0003,
+      "step": 7140
+    },
+    {
+      "epoch": 1.8037336024217963,
+      "grad_norm": 0.00458677439019084,
+      "learning_rate": 1.9937773292970065e-05,
+      "loss": 0.0014,
+      "step": 7150
+    },
+    {
+      "epoch": 1.8062563067608477,
+      "grad_norm": 0.0370076522231102,
+      "learning_rate": 1.989572822065254e-05,
+      "loss": 0.0031,
+      "step": 7160
+    },
+    {
+      "epoch": 1.8087790110998991,
+      "grad_norm": 0.007424044422805309,
+      "learning_rate": 1.9853683148335018e-05,
+      "loss": 0.0027,
+      "step": 7170
+    },
+    {
+      "epoch": 1.8113017154389506,
+      "grad_norm": 0.03529626876115799,
+      "learning_rate": 1.9811638076017493e-05,
+      "loss": 0.0009,
+      "step": 7180
+    },
+    {
+      "epoch": 1.813824419778002,
+      "grad_norm": 0.15175025165081024,
+      "learning_rate": 1.9769593003699967e-05,
+      "loss": 0.0015,
+      "step": 7190
+    },
+    {
+      "epoch": 1.8163471241170535,
+      "grad_norm": 0.00071295554516837,
+      "learning_rate": 1.9727547931382442e-05,
+      "loss": 0.0006,
+      "step": 7200
+    },
+    {
+      "epoch": 1.8163471241170535,
+      "eval_loss": 0.004439685959368944,
+      "eval_runtime": 20.8976,
+      "eval_samples_per_second": 84.316,
+      "eval_steps_per_second": 21.103,
+      "step": 7200
+    },
+    {
+      "epoch": 1.818869828456105,
+      "grad_norm": 0.13802039623260498,
+      "learning_rate": 1.968550285906492e-05,
+      "loss": 0.0029,
+      "step": 7210
+    },
+    {
+      "epoch": 1.8213925327951563,
+      "grad_norm": 0.001995902741327882,
+      "learning_rate": 1.9643457786747395e-05,
+      "loss": 0.0021,
+      "step": 7220
+    },
+    {
+      "epoch": 1.8239152371342078,
+      "grad_norm": 0.0005990486242808402,
+      "learning_rate": 1.960141271442987e-05,
+      "loss": 0.0006,
+      "step": 7230
+    },
+    {
+      "epoch": 1.8264379414732592,
+      "grad_norm": 0.08083165436983109,
+      "learning_rate": 1.9559367642112344e-05,
+      "loss": 0.0025,
+      "step": 7240
+    },
+    {
+      "epoch": 1.8289606458123107,
+      "grad_norm": 0.15096262097358704,
+      "learning_rate": 1.9517322569794822e-05,
+      "loss": 0.0024,
+      "step": 7250
+    },
+    {
+      "epoch": 1.831483350151362,
+      "grad_norm": 0.0005141481524333358,
+      "learning_rate": 1.9475277497477297e-05,
+      "loss": 0.0047,
+      "step": 7260
+    },
+    {
+      "epoch": 1.8340060544904138,
+      "grad_norm": 0.0009411073406226933,
+      "learning_rate": 1.943323242515977e-05,
+      "loss": 0.0033,
+      "step": 7270
+    },
+    {
+      "epoch": 1.8365287588294652,
+      "grad_norm": 0.005096075590699911,
+      "learning_rate": 1.9391187352842246e-05,
+      "loss": 0.0023,
+      "step": 7280
+    },
+    {
+      "epoch": 1.8390514631685166,
+      "grad_norm": 0.0007901940844021738,
+      "learning_rate": 1.9349142280524724e-05,
+      "loss": 0.0012,
+      "step": 7290
+    },
+    {
+      "epoch": 1.841574167507568,
+      "grad_norm": 0.00019426460494287312,
+      "learning_rate": 1.93070972082072e-05,
+      "loss": 0.0007,
+      "step": 7300
+    },
+    {
+      "epoch": 1.841574167507568,
+      "eval_loss": 0.004412606358528137,
+      "eval_runtime": 20.9146,
+      "eval_samples_per_second": 84.247,
+      "eval_steps_per_second": 21.086,
+      "step": 7300
+    },
+    {
+      "epoch": 1.8440968718466195,
+      "grad_norm": 0.1782449334859848,
+      "learning_rate": 1.9265052135889674e-05,
+      "loss": 0.0031,
+      "step": 7310
+    },
+    {
+      "epoch": 1.8466195761856712,
+      "grad_norm": 0.01708620972931385,
+      "learning_rate": 1.922300706357215e-05,
+      "loss": 0.0014,
+      "step": 7320
+    },
+    {
+      "epoch": 1.8491422805247226,
+      "grad_norm": 0.06721550226211548,
+      "learning_rate": 1.9180961991254626e-05,
+      "loss": 0.0006,
+      "step": 7330
+    },
+    {
+      "epoch": 1.851664984863774,
+      "grad_norm": 0.16179241240024567,
+      "learning_rate": 1.91389169189371e-05,
+      "loss": 0.0031,
+      "step": 7340
+    },
+    {
+      "epoch": 1.8541876892028255,
+      "grad_norm": 0.0004393104463815689,
+      "learning_rate": 1.9096871846619576e-05,
+      "loss": 0.0006,
+      "step": 7350
+    },
+    {
+      "epoch": 1.856710393541877,
+      "grad_norm": 3.2668671337887645e-05,
+      "learning_rate": 1.9054826774302054e-05,
+      "loss": 0.0034,
+      "step": 7360
+    },
+    {
+      "epoch": 1.8592330978809284,
+      "grad_norm": 0.1319074034690857,
+      "learning_rate": 1.901278170198453e-05,
+      "loss": 0.0007,
+      "step": 7370
+    },
+    {
+      "epoch": 1.8617558022199798,
+      "grad_norm": 0.18538399040699005,
+      "learning_rate": 1.8970736629667006e-05,
+      "loss": 0.0022,
+      "step": 7380
+    },
+    {
+      "epoch": 1.8642785065590313,
+      "grad_norm": 0.05137573927640915,
+      "learning_rate": 1.8928691557349478e-05,
+      "loss": 0.0028,
+      "step": 7390
+    },
+    {
+      "epoch": 1.8668012108980827,
+      "grad_norm": 0.12319748103618622,
+      "learning_rate": 1.8886646485031956e-05,
+      "loss": 0.002,
+      "step": 7400
+    },
+    {
+      "epoch": 1.8668012108980827,
+      "eval_loss": 0.004506191238760948,
+      "eval_runtime": 20.8931,
+      "eval_samples_per_second": 84.334,
+      "eval_steps_per_second": 21.107,
+      "step": 7400
+    },
+    {
+      "epoch": 1.8693239152371341,
+      "grad_norm": 0.0010132059687748551,
+      "learning_rate": 1.884460141271443e-05,
+      "loss": 0.003,
+      "step": 7410
+    },
+    {
+      "epoch": 1.8718466195761856,
+      "grad_norm": 0.005542921368032694,
+      "learning_rate": 1.880255634039691e-05,
+      "loss": 0.0031,
+      "step": 7420
+    },
+    {
+      "epoch": 1.874369323915237,
+      "grad_norm": 0.06616313755512238,
+      "learning_rate": 1.876051126807938e-05,
+      "loss": 0.003,
+      "step": 7430
+    },
+    {
+      "epoch": 1.8768920282542885,
+      "grad_norm": 0.13730089366436005,
+      "learning_rate": 1.8718466195761858e-05,
+      "loss": 0.0035,
+      "step": 7440
+    },
+    {
+      "epoch": 1.87941473259334,
+      "grad_norm": 0.00045170748489908874,
+      "learning_rate": 1.8676421123444332e-05,
+      "loss": 0.0006,
+      "step": 7450
+    },
+    {
+      "epoch": 1.8819374369323916,
+      "grad_norm": 0.00024056418624240905,
+      "learning_rate": 1.863437605112681e-05,
+      "loss": 0.0032,
+      "step": 7460
+    },
+    {
+      "epoch": 1.884460141271443,
+      "grad_norm": 0.11974132061004639,
+      "learning_rate": 1.8592330978809282e-05,
+      "loss": 0.0003,
+      "step": 7470
+    },
+    {
+      "epoch": 1.8869828456104945,
+      "grad_norm": 0.17995594441890717,
+      "learning_rate": 1.855028590649176e-05,
+      "loss": 0.0026,
+      "step": 7480
+    },
+    {
+      "epoch": 1.889505549949546,
+      "grad_norm": 0.16754589974880219,
+      "learning_rate": 1.8508240834174238e-05,
+      "loss": 0.0024,
+      "step": 7490
+    },
+    {
+      "epoch": 1.8920282542885973,
+      "grad_norm": 0.0004354271513875574,
+      "learning_rate": 1.8466195761856713e-05,
+      "loss": 0.0054,
+      "step": 7500
+    },
+    {
+      "epoch": 1.8920282542885973,
+      "eval_loss": 0.004418503027409315,
+      "eval_runtime": 20.8963,
+      "eval_samples_per_second": 84.321,
+      "eval_steps_per_second": 21.104,
+      "step": 7500
+    },
+    {
+      "epoch": 1.894550958627649,
+      "grad_norm": 0.08341953903436661,
+      "learning_rate": 1.8424150689539187e-05,
+      "loss": 0.0018,
+      "step": 7510
+    },
+    {
+      "epoch": 1.8970736629667004,
+      "grad_norm": 0.027765339240431786,
+      "learning_rate": 1.8382105617221662e-05,
+      "loss": 0.0019,
+      "step": 7520
+    },
+    {
+      "epoch": 1.8995963673057519,
+      "grad_norm": 0.08241262286901474,
+      "learning_rate": 1.834006054490414e-05,
+      "loss": 0.0033,
+      "step": 7530
+    },
+    {
+      "epoch": 1.9021190716448033,
+      "grad_norm": 0.0011641500750556588,
+      "learning_rate": 1.8298015472586615e-05,
+      "loss": 0.0018,
+      "step": 7540
+    },
+    {
+      "epoch": 1.9046417759838548,
+      "grad_norm": 0.15536606311798096,
+      "learning_rate": 1.825597040026909e-05,
+      "loss": 0.0029,
+      "step": 7550
+    },
+    {
+      "epoch": 1.9071644803229062,
+      "grad_norm": 0.08957267552614212,
+      "learning_rate": 1.8213925327951564e-05,
+      "loss": 0.0015,
+      "step": 7560
+    },
+    {
+      "epoch": 1.9096871846619576,
+      "grad_norm": 0.001939677633345127,
+      "learning_rate": 1.8171880255634042e-05,
+      "loss": 0.0023,
+      "step": 7570
+    },
+    {
+      "epoch": 1.912209889001009,
+      "grad_norm": 0.0007268782937899232,
+      "learning_rate": 1.8129835183316517e-05,
+      "loss": 0.0009,
+      "step": 7580
+    },
+    {
+      "epoch": 1.9147325933400605,
+      "grad_norm": 0.0003874763788189739,
+      "learning_rate": 1.808779011099899e-05,
+      "loss": 0.0016,
+      "step": 7590
+    },
+    {
+      "epoch": 1.917255297679112,
+      "grad_norm": 0.077357217669487,
+      "learning_rate": 1.8045745038681466e-05,
+      "loss": 0.0021,
+      "step": 7600
+    },
+    {
+      "epoch": 1.917255297679112,
+      "eval_loss": 0.00432681106030941,
+      "eval_runtime": 20.8834,
+      "eval_samples_per_second": 84.373,
+      "eval_steps_per_second": 21.117,
+      "step": 7600
+    },
+    {
+      "epoch": 1.9197780020181634,
+      "grad_norm": 0.006311261095106602,
+      "learning_rate": 1.8003699966363944e-05,
+      "loss": 0.0035,
+      "step": 7610
+    },
+    {
+      "epoch": 1.9223007063572148,
+      "grad_norm": 0.0030232472345232964,
+      "learning_rate": 1.796165489404642e-05,
+      "loss": 0.0025,
+      "step": 7620
+    },
+    {
+      "epoch": 1.9248234106962663,
+      "grad_norm": 0.201024129986763,
+      "learning_rate": 1.7919609821728893e-05,
+      "loss": 0.0015,
+      "step": 7630
+    },
+    {
+      "epoch": 1.9273461150353177,
+      "grad_norm": 0.002884042216464877,
+      "learning_rate": 1.7877564749411368e-05,
+      "loss": 0.0018,
+      "step": 7640
+    },
+    {
+      "epoch": 1.9298688193743692,
+      "grad_norm": 0.23472699522972107,
+      "learning_rate": 1.7835519677093846e-05,
+      "loss": 0.0029,
+      "step": 7650
+    },
+    {
+      "epoch": 1.9323915237134208,
+      "grad_norm": 0.0487983413040638,
+      "learning_rate": 1.779347460477632e-05,
+      "loss": 0.0016,
+      "step": 7660
+    },
+    {
+      "epoch": 1.9349142280524723,
+      "grad_norm": 0.1196766048669815,
+      "learning_rate": 1.7751429532458795e-05,
+      "loss": 0.002,
+      "step": 7670
+    },
+    {
+      "epoch": 1.9374369323915237,
+      "grad_norm": 0.11802256107330322,
+      "learning_rate": 1.7709384460141273e-05,
+      "loss": 0.0016,
+      "step": 7680
+    },
+    {
+      "epoch": 1.9399596367305751,
+      "grad_norm": 0.00048595041153021157,
+      "learning_rate": 1.7667339387823748e-05,
+      "loss": 0.0021,
+      "step": 7690
+    },
+    {
+      "epoch": 1.9424823410696268,
+      "grad_norm": 0.0009480112348683178,
+      "learning_rate": 1.7625294315506226e-05,
+      "loss": 0.0025,
+      "step": 7700
+    },
+    {
+      "epoch": 1.9424823410696268,
+      "eval_loss": 0.0042783478274941444,
+      "eval_runtime": 20.8947,
+      "eval_samples_per_second": 84.328,
+      "eval_steps_per_second": 21.106,
+      "step": 7700
+    },
+    {
+      "epoch": 1.9450050454086782,
+      "grad_norm": 0.0010953915771096945,
+      "learning_rate": 1.7583249243188697e-05,
+      "loss": 0.0017,
+      "step": 7710
+    },
+    {
+      "epoch": 1.9475277497477297,
+      "grad_norm": 0.004912779200822115,
+      "learning_rate": 1.7541204170871175e-05,
+      "loss": 0.003,
+      "step": 7720
+    },
+    {
+      "epoch": 1.9500504540867811,
+      "grad_norm": 0.05038010701537132,
+      "learning_rate": 1.749915909855365e-05,
+      "loss": 0.0017,
+      "step": 7730
+    },
+    {
+      "epoch": 1.9525731584258326,
+      "grad_norm": 0.0019162135431542993,
+      "learning_rate": 1.7457114026236128e-05,
+      "loss": 0.0012,
+      "step": 7740
+    },
+    {
+      "epoch": 1.955095862764884,
+      "grad_norm": 0.09494713693857193,
+      "learning_rate": 1.74150689539186e-05,
+      "loss": 0.0008,
+      "step": 7750
+    },
+    {
+      "epoch": 1.9576185671039354,
+      "grad_norm": 0.0007395916618406773,
+      "learning_rate": 1.7373023881601078e-05,
+      "loss": 0.0,
+      "step": 7760
+    },
+    {
+      "epoch": 1.9601412714429869,
+      "grad_norm": 0.10083262622356415,
+      "learning_rate": 1.7330978809283552e-05,
+      "loss": 0.0012,
+      "step": 7770
+    },
+    {
+      "epoch": 1.9626639757820383,
+      "grad_norm": 0.06073877960443497,
+      "learning_rate": 1.728893373696603e-05,
+      "loss": 0.0021,
+      "step": 7780
+    },
+    {
+      "epoch": 1.9651866801210898,
+      "grad_norm": 0.0009476240957155824,
+      "learning_rate": 1.72468886646485e-05,
+      "loss": 0.0029,
+      "step": 7790
+    },
+    {
+      "epoch": 1.9677093844601412,
+      "grad_norm": 0.11249450594186783,
+      "learning_rate": 1.720484359233098e-05,
+      "loss": 0.0019,
+      "step": 7800
+    },
+    {
+      "epoch": 1.9677093844601412,
+      "eval_loss": 0.004121602047234774,
+      "eval_runtime": 20.9133,
+      "eval_samples_per_second": 84.253,
+      "eval_steps_per_second": 21.087,
+      "step": 7800
+    },
+    {
+      "epoch": 1.9702320887991926,
+      "grad_norm": 0.10980529338121414,
+      "learning_rate": 1.7162798520013454e-05,
+      "loss": 0.0031,
+      "step": 7810
+    },
+    {
+      "epoch": 1.972754793138244,
+      "grad_norm": 0.09414640069007874,
+      "learning_rate": 1.7120753447695932e-05,
+      "loss": 0.0012,
+      "step": 7820
+    },
+    {
+      "epoch": 1.9752774974772955,
+      "grad_norm": 0.1049361452460289,
+      "learning_rate": 1.7078708375378407e-05,
+      "loss": 0.0052,
+      "step": 7830
+    },
+    {
+      "epoch": 1.977800201816347,
+      "grad_norm": 0.06330663710832596,
+      "learning_rate": 1.703666330306088e-05,
+      "loss": 0.0043,
+      "step": 7840
+    },
+    {
+      "epoch": 1.9803229061553986,
+      "grad_norm": 0.000840139458887279,
+      "learning_rate": 1.699461823074336e-05,
+      "loss": 0.0007,
+      "step": 7850
+    },
+    {
+      "epoch": 1.98284561049445,
+      "grad_norm": 0.0014659571461379528,
+      "learning_rate": 1.6952573158425834e-05,
+      "loss": 0.0015,
+      "step": 7860
+    },
+    {
+      "epoch": 1.9853683148335015,
+      "grad_norm": 0.0011587137123569846,
+      "learning_rate": 1.691052808610831e-05,
+      "loss": 0.0009,
+      "step": 7870
+    },
+    {
+      "epoch": 1.987891019172553,
+      "grad_norm": 0.19755807518959045,
+      "learning_rate": 1.6868483013790784e-05,
+      "loss": 0.0028,
+      "step": 7880
+    },
+    {
+      "epoch": 1.9904137235116044,
+      "grad_norm": 0.08327340334653854,
+      "learning_rate": 1.6826437941473262e-05,
+      "loss": 0.0012,
+      "step": 7890
+    },
+    {
+      "epoch": 1.992936427850656,
+      "grad_norm": 0.0017620738362893462,
+      "learning_rate": 1.6784392869155736e-05,
+      "loss": 0.0013,
+      "step": 7900
+    },
+    {
+      "epoch": 1.992936427850656,
+      "eval_loss": 0.004206486977636814,
+      "eval_runtime": 20.8838,
+      "eval_samples_per_second": 84.372,
+      "eval_steps_per_second": 21.117,
+      "step": 7900
+    },
+    {
+      "epoch": 1.9954591321897075,
+      "grad_norm": 0.002144803060218692,
+      "learning_rate": 1.674234779683821e-05,
+      "loss": 0.0004,
+      "step": 7910
+    },
+    {
+      "epoch": 1.997981836528759,
+      "grad_norm": 0.21179014444351196,
+      "learning_rate": 1.6700302724520686e-05,
+      "loss": 0.0029,
+      "step": 7920
+    },
+    {
+      "epoch": 2.0005045408678104,
+      "grad_norm": 0.029519561678171158,
+      "learning_rate": 1.6658257652203164e-05,
+      "loss": 0.0002,
+      "step": 7930
+    },
+    {
+      "epoch": 2.003027245206862,
+      "grad_norm": 0.0001509381690993905,
+      "learning_rate": 1.661621257988564e-05,
+      "loss": 0.0008,
+      "step": 7940
+    },
+    {
+      "epoch": 2.0055499495459133,
+      "grad_norm": 0.0014306082157418132,
+      "learning_rate": 1.6574167507568113e-05,
+      "loss": 0.0007,
+      "step": 7950
+    },
+    {
+      "epoch": 2.0080726538849647,
+      "grad_norm": 0.0010387469083070755,
+      "learning_rate": 1.6532122435250588e-05,
+      "loss": 0.0005,
+      "step": 7960
+    },
+    {
+      "epoch": 2.010595358224016,
+      "grad_norm": 0.1410035490989685,
+      "learning_rate": 1.6490077362933066e-05,
+      "loss": 0.0002,
+      "step": 7970
+    },
+    {
+      "epoch": 2.0131180625630676,
+      "grad_norm": 0.00042714065057225525,
+      "learning_rate": 1.644803229061554e-05,
+      "loss": 0.0008,
+      "step": 7980
+    },
+    {
+      "epoch": 2.015640766902119,
+      "grad_norm": 0.0003534654970280826,
+      "learning_rate": 1.6405987218298015e-05,
+      "loss": 0.0014,
+      "step": 7990
+    },
+    {
+      "epoch": 2.0181634712411705,
+      "grad_norm": 0.13387076556682587,
+      "learning_rate": 1.6363942145980493e-05,
+      "loss": 0.0039,
+      "step": 8000
+    },
+    {
+      "epoch": 2.0181634712411705,
+      "eval_loss": 0.004306289833039045,
+      "eval_runtime": 20.8954,
+      "eval_samples_per_second": 84.325,
+      "eval_steps_per_second": 21.105,
+      "step": 8000
+    },
+    {
+      "epoch": 2.020686175580222,
+      "grad_norm": 0.12466511130332947,
+      "learning_rate": 1.6321897073662968e-05,
+      "loss": 0.001,
+      "step": 8010
+    },
+    {
+      "epoch": 2.0232088799192733,
+      "grad_norm": 0.00024663680233061314,
+      "learning_rate": 1.6279852001345443e-05,
+      "loss": 0.0002,
+      "step": 8020
+    },
+    {
+      "epoch": 2.025731584258325,
+      "grad_norm": 0.00923093967139721,
+      "learning_rate": 1.6237806929027917e-05,
+      "loss": 0.0006,
+      "step": 8030
+    },
+    {
+      "epoch": 2.028254288597376,
+      "grad_norm": 0.00024134966952260584,
+      "learning_rate": 1.6195761856710395e-05,
+      "loss": 0.0013,
+      "step": 8040
+    },
+    {
+      "epoch": 2.0307769929364277,
+      "grad_norm": 0.14056503772735596,
+      "learning_rate": 1.615371678439287e-05,
+      "loss": 0.0023,
+      "step": 8050
+    },
+    {
+      "epoch": 2.033299697275479,
+      "grad_norm": 0.17755192518234253,
+      "learning_rate": 1.6111671712075348e-05,
+      "loss": 0.0018,
+      "step": 8060
+    },
+    {
+      "epoch": 2.035822401614531,
+      "grad_norm": 0.07552599161863327,
+      "learning_rate": 1.606962663975782e-05,
+      "loss": 0.001,
+      "step": 8070
+    },
+    {
+      "epoch": 2.0383451059535824,
+      "grad_norm": 0.05230065807700157,
+      "learning_rate": 1.6027581567440297e-05,
+      "loss": 0.0006,
+      "step": 8080
+    },
+    {
+      "epoch": 2.040867810292634,
+      "grad_norm": 0.011521569453179836,
+      "learning_rate": 1.5985536495122772e-05,
+      "loss": 0.0005,
+      "step": 8090
+    },
+    {
+      "epoch": 2.0433905146316853,
+      "grad_norm": 0.00038396025775000453,
+      "learning_rate": 1.594349142280525e-05,
+      "loss": 0.0008,
+      "step": 8100
+    },
+    {
+      "epoch": 2.0433905146316853,
+      "eval_loss": 0.0042536817491054535,
+      "eval_runtime": 20.8933,
+      "eval_samples_per_second": 84.333,
+      "eval_steps_per_second": 21.107,
+      "step": 8100
+    },
+    {
+      "epoch": 2.0459132189707367,
+      "grad_norm": 0.0009189638658426702,
+      "learning_rate": 1.590144635048772e-05,
+      "loss": 0.0002,
+      "step": 8110
+    },
+    {
+      "epoch": 2.048435923309788,
+      "grad_norm": 0.00038111425237730145,
+      "learning_rate": 1.58594012781702e-05,
+      "loss": 0.0009,
+      "step": 8120
+    },
+    {
+      "epoch": 2.0509586276488396,
+      "grad_norm": 0.002369858091697097,
+      "learning_rate": 1.5817356205852674e-05,
+      "loss": 0.0001,
+      "step": 8130
+    },
+    {
+      "epoch": 2.053481331987891,
+      "grad_norm": 0.07348914444446564,
+      "learning_rate": 1.5775311133535152e-05,
+      "loss": 0.0018,
+      "step": 8140
+    },
+    {
+      "epoch": 2.0560040363269425,
+      "grad_norm": 0.0008833975298330188,
+      "learning_rate": 1.5733266061217623e-05,
+      "loss": 0.0005,
+      "step": 8150
+    },
+    {
+      "epoch": 2.058526740665994,
+      "grad_norm": 0.0008616661070846021,
+      "learning_rate": 1.56912209889001e-05,
+      "loss": 0.0002,
+      "step": 8160
+    },
+    {
+      "epoch": 2.0610494450050454,
+      "grad_norm": 0.003825935535132885,
+      "learning_rate": 1.564917591658258e-05,
+      "loss": 0.0009,
+      "step": 8170
+    },
+    {
+      "epoch": 2.063572149344097,
+      "grad_norm": 0.0037789177149534225,
+      "learning_rate": 1.5607130844265054e-05,
+      "loss": 0.0012,
+      "step": 8180
+    },
+    {
+      "epoch": 2.0660948536831483,
+      "grad_norm": 0.002533160848543048,
+      "learning_rate": 1.556508577194753e-05,
+      "loss": 0.0007,
+      "step": 8190
+    },
+    {
+      "epoch": 2.0686175580221997,
+      "grad_norm": 0.005100834183394909,
+      "learning_rate": 1.5523040699630003e-05,
+      "loss": 0.0005,
+      "step": 8200
+    },
+    {
+      "epoch": 2.0686175580221997,
+      "eval_loss": 0.004342484753578901,
+      "eval_runtime": 20.9079,
+      "eval_samples_per_second": 84.274,
+      "eval_steps_per_second": 21.093,
+      "step": 8200
+    },
+    {
+      "epoch": 2.071140262361251,
+      "grad_norm": 0.003723128465935588,
+      "learning_rate": 1.548099562731248e-05,
+      "loss": 0.0005,
+      "step": 8210
+    },
+    {
+      "epoch": 2.0736629667003026,
+      "grad_norm": 0.0664861872792244,
+      "learning_rate": 1.5438950554994956e-05,
+      "loss": 0.0007,
+      "step": 8220
+    },
+    {
+      "epoch": 2.076185671039354,
+      "grad_norm": 0.003986823838204145,
+      "learning_rate": 1.539690548267743e-05,
+      "loss": 0.0008,
+      "step": 8230
+    },
+    {
+      "epoch": 2.0787083753784055,
+      "grad_norm": 0.03816875070333481,
+      "learning_rate": 1.5354860410359905e-05,
+      "loss": 0.0017,
+      "step": 8240
+    },
+    {
+      "epoch": 2.081231079717457,
+      "grad_norm": 0.22215162217617035,
+      "learning_rate": 1.5312815338042384e-05,
+      "loss": 0.0027,
+      "step": 8250
+    },
+    {
+      "epoch": 2.0837537840565084,
+      "grad_norm": 0.13046610355377197,
+      "learning_rate": 1.5270770265724858e-05,
+      "loss": 0.0031,
+      "step": 8260
+    },
+    {
+      "epoch": 2.0862764883955602,
+      "grad_norm": 0.16013352572917938,
+      "learning_rate": 1.5228725193407335e-05,
+      "loss": 0.0013,
+      "step": 8270
+    },
+    {
+      "epoch": 2.0887991927346117,
+      "grad_norm": 0.01975584402680397,
+      "learning_rate": 1.5186680121089808e-05,
+      "loss": 0.0002,
+      "step": 8280
+    },
+    {
+      "epoch": 2.091321897073663,
+      "grad_norm": 0.02759338729083538,
+      "learning_rate": 1.5144635048772286e-05,
+      "loss": 0.0009,
+      "step": 8290
+    },
+    {
+      "epoch": 2.0938446014127146,
+      "grad_norm": 0.0007609634776599705,
+      "learning_rate": 1.5102589976454759e-05,
+      "loss": 0.0003,
+      "step": 8300
+    },
+    {
+      "epoch": 2.0938446014127146,
+      "eval_loss": 0.004117065574973822,
+      "eval_runtime": 20.9041,
+      "eval_samples_per_second": 84.29,
+      "eval_steps_per_second": 21.096,
+      "step": 8300
+    },
+    {
+      "epoch": 2.096367305751766,
+      "grad_norm": 0.0008400371880270541,
+      "learning_rate": 1.5060544904137237e-05,
+      "loss": 0.0014,
+      "step": 8310
+    },
+    {
+      "epoch": 2.0988900100908174,
+      "grad_norm": 0.0029490781016647816,
+      "learning_rate": 1.501849983181971e-05,
+      "loss": 0.0017,
+      "step": 8320
+    },
+    {
+      "epoch": 2.101412714429869,
+      "grad_norm": 0.14210307598114014,
+      "learning_rate": 1.4976454759502188e-05,
+      "loss": 0.0012,
+      "step": 8330
+    },
+    {
+      "epoch": 2.1039354187689203,
+      "grad_norm": 0.0019058181205764413,
+      "learning_rate": 1.4934409687184664e-05,
+      "loss": 0.0006,
+      "step": 8340
+    },
+    {
+      "epoch": 2.1064581231079718,
+      "grad_norm": 0.0012054074322804809,
+      "learning_rate": 1.4892364614867139e-05,
+      "loss": 0.0018,
+      "step": 8350
+    },
+    {
+      "epoch": 2.108980827447023,
+      "grad_norm": 0.0005894547794014215,
+      "learning_rate": 1.4850319542549615e-05,
+      "loss": 0.0025,
+      "step": 8360
+    },
+    {
+      "epoch": 2.1115035317860746,
+      "grad_norm": 0.0017818346386775374,
+      "learning_rate": 1.480827447023209e-05,
+      "loss": 0.0017,
+      "step": 8370
+    },
+    {
+      "epoch": 2.114026236125126,
+      "grad_norm": 0.0991104245185852,
+      "learning_rate": 1.4766229397914566e-05,
+      "loss": 0.001,
+      "step": 8380
+    },
+    {
+      "epoch": 2.1165489404641775,
+      "grad_norm": 0.0006472957320511341,
+      "learning_rate": 1.472418432559704e-05,
+      "loss": 0.0004,
+      "step": 8390
+    },
+    {
+      "epoch": 2.119071644803229,
+      "grad_norm": 0.03154408931732178,
+      "learning_rate": 1.4682139253279517e-05,
+      "loss": 0.001,
+      "step": 8400
+    },
+    {
+      "epoch": 2.119071644803229,
+      "eval_loss": 0.004066385794430971,
+      "eval_runtime": 20.8985,
+      "eval_samples_per_second": 84.312,
+      "eval_steps_per_second": 21.102,
+      "step": 8400
+    },
+    {
+      "epoch": 2.1215943491422804,
+      "grad_norm": 0.0002149187057511881,
+      "learning_rate": 1.4640094180961992e-05,
+      "loss": 0.0013,
+      "step": 8410
+    },
+    {
+      "epoch": 2.124117053481332,
+      "grad_norm": 0.11510289460420609,
+      "learning_rate": 1.4598049108644468e-05,
+      "loss": 0.0006,
+      "step": 8420
+    },
+    {
+      "epoch": 2.1266397578203833,
+      "grad_norm": 0.11964685469865799,
+      "learning_rate": 1.4556004036326943e-05,
+      "loss": 0.0003,
+      "step": 8430
+    },
+    {
+      "epoch": 2.1291624621594347,
+      "grad_norm": 0.1316743791103363,
+      "learning_rate": 1.4513958964009419e-05,
+      "loss": 0.0005,
+      "step": 8440
+    },
+    {
+      "epoch": 2.131685166498486,
+      "grad_norm": 3.9832641050452366e-05,
+      "learning_rate": 1.4471913891691894e-05,
+      "loss": 0.0,
+      "step": 8450
+    },
+    {
+      "epoch": 2.134207870837538,
+      "grad_norm": 0.00014946168812457472,
+      "learning_rate": 1.442986881937437e-05,
+      "loss": 0.0008,
+      "step": 8460
+    },
+    {
+      "epoch": 2.1367305751765895,
+      "grad_norm": 0.040535129606723785,
+      "learning_rate": 1.4387823747056845e-05,
+      "loss": 0.0007,
+      "step": 8470
+    },
+    {
+      "epoch": 2.139253279515641,
+      "grad_norm": 9.963886986952275e-05,
+      "learning_rate": 1.4345778674739321e-05,
+      "loss": 0.0008,
+      "step": 8480
+    },
+    {
+      "epoch": 2.1417759838546924,
+      "grad_norm": 0.00044994213385507464,
+      "learning_rate": 1.4303733602421796e-05,
+      "loss": 0.0004,
+      "step": 8490
+    },
+    {
+      "epoch": 2.144298688193744,
+      "grad_norm": 0.09494508057832718,
+      "learning_rate": 1.4261688530104272e-05,
+      "loss": 0.0008,
+      "step": 8500
+    },
+    {
+      "epoch": 2.144298688193744,
+      "eval_loss": 0.004108693916350603,
+      "eval_runtime": 20.9,
+      "eval_samples_per_second": 84.306,
+      "eval_steps_per_second": 21.1,
+      "step": 8500
+    },
+    {
+      "epoch": 2.1468213925327952,
+      "grad_norm": 0.001810372225008905,
+      "learning_rate": 1.4219643457786749e-05,
+      "loss": 0.0002,
+      "step": 8510
+    },
+    {
+      "epoch": 2.1493440968718467,
+      "grad_norm": 0.00013946890248917043,
+      "learning_rate": 1.4177598385469223e-05,
+      "loss": 0.0007,
+      "step": 8520
+    },
+    {
+      "epoch": 2.151866801210898,
+      "grad_norm": 0.003362849121913314,
+      "learning_rate": 1.4135553313151701e-05,
+      "loss": 0.0002,
+      "step": 8530
+    },
+    {
+      "epoch": 2.1543895055499496,
+      "grad_norm": 0.01551423966884613,
+      "learning_rate": 1.4093508240834174e-05,
+      "loss": 0.0003,
+      "step": 8540
+    },
+    {
+      "epoch": 2.156912209889001,
+      "grad_norm": 0.0713684931397438,
+      "learning_rate": 1.4051463168516652e-05,
+      "loss": 0.0005,
+      "step": 8550
+    },
+    {
+      "epoch": 2.1594349142280524,
+      "grad_norm": 0.0008997659897431731,
+      "learning_rate": 1.4009418096199125e-05,
+      "loss": 0.0008,
+      "step": 8560
+    },
+    {
+      "epoch": 2.161957618567104,
+      "grad_norm": 0.00035184502485208213,
+      "learning_rate": 1.3967373023881603e-05,
+      "loss": 0.0004,
+      "step": 8570
+    },
+    {
+      "epoch": 2.1644803229061553,
+      "grad_norm": 0.08870701491832733,
+      "learning_rate": 1.3925327951564076e-05,
+      "loss": 0.002,
+      "step": 8580
+    },
+    {
+      "epoch": 2.1670030272452068,
+      "grad_norm": 0.032671812921762466,
+      "learning_rate": 1.3883282879246554e-05,
+      "loss": 0.0001,
+      "step": 8590
+    },
+    {
+      "epoch": 2.169525731584258,
+      "grad_norm": 0.01390095055103302,
+      "learning_rate": 1.3841237806929027e-05,
+      "loss": 0.0016,
+      "step": 8600
+    },
+    {
+      "epoch": 2.169525731584258,
+      "eval_loss": 0.004325889516621828,
+      "eval_runtime": 20.8969,
+      "eval_samples_per_second": 84.319,
+      "eval_steps_per_second": 21.104,
+      "step": 8600
+    },
+    {
+      "epoch": 2.1720484359233097,
+      "grad_norm": 0.04760993644595146,
+      "learning_rate": 1.3799192734611505e-05,
+      "loss": 0.0016,
+      "step": 8610
+    },
+    {
+      "epoch": 2.174571140262361,
+      "grad_norm": 0.006453169509768486,
+      "learning_rate": 1.3757147662293978e-05,
+      "loss": 0.0002,
+      "step": 8620
+    },
+    {
+      "epoch": 2.1770938446014125,
+      "grad_norm": 0.04346233606338501,
+      "learning_rate": 1.3715102589976456e-05,
+      "loss": 0.0012,
+      "step": 8630
+    },
+    {
+      "epoch": 2.179616548940464,
+      "grad_norm": 0.010294480249285698,
+      "learning_rate": 1.367305751765893e-05,
+      "loss": 0.0003,
+      "step": 8640
+    },
+    {
+      "epoch": 2.182139253279516,
+      "grad_norm": 0.00014406938862521201,
+      "learning_rate": 1.3631012445341407e-05,
+      "loss": 0.0001,
+      "step": 8650
+    },
+    {
+      "epoch": 2.1846619576185673,
+      "grad_norm": 0.10597830265760422,
+      "learning_rate": 1.358896737302388e-05,
+      "loss": 0.0005,
+      "step": 8660
+    },
+    {
+      "epoch": 2.1871846619576187,
+      "grad_norm": 0.125259131193161,
+      "learning_rate": 1.3546922300706358e-05,
+      "loss": 0.0004,
+      "step": 8670
+    },
+    {
+      "epoch": 2.18970736629667,
+      "grad_norm": 0.00017470364400651306,
+      "learning_rate": 1.3504877228388835e-05,
+      "loss": 0.0005,
+      "step": 8680
+    },
+    {
+      "epoch": 2.1922300706357216,
+      "grad_norm": 0.1178673580288887,
+      "learning_rate": 1.346283215607131e-05,
+      "loss": 0.0005,
+      "step": 8690
+    },
+    {
+      "epoch": 2.194752774974773,
+      "grad_norm": 0.00032886656117625535,
+      "learning_rate": 1.3420787083753786e-05,
+      "loss": 0.0016,
+      "step": 8700
+    },
+    {
+      "epoch": 2.194752774974773,
+      "eval_loss": 0.004205956123769283,
+      "eval_runtime": 20.8455,
+      "eval_samples_per_second": 84.527,
+      "eval_steps_per_second": 21.156,
+      "step": 8700
+    },
+    {
+      "epoch": 2.1972754793138245,
+      "grad_norm": 0.00314329843968153,
+      "learning_rate": 1.337874201143626e-05,
+      "loss": 0.0009,
+      "step": 8710
+    },
+    {
+      "epoch": 2.199798183652876,
+      "grad_norm": 0.0048809046857059,
+      "learning_rate": 1.3336696939118737e-05,
+      "loss": 0.0013,
+      "step": 8720
+    },
+    {
+      "epoch": 2.2023208879919274,
+      "grad_norm": 0.0004060929059050977,
+      "learning_rate": 1.3294651866801211e-05,
+      "loss": 0.0001,
+      "step": 8730
+    },
+    {
+      "epoch": 2.204843592330979,
+      "grad_norm": 0.0005921365809626877,
+      "learning_rate": 1.3252606794483688e-05,
+      "loss": 0.0004,
+      "step": 8740
+    },
+    {
+      "epoch": 2.2073662966700303,
+      "grad_norm": 0.00011614049435593188,
+      "learning_rate": 1.3210561722166163e-05,
+      "loss": 0.0013,
+      "step": 8750
+    },
+    {
+      "epoch": 2.2098890010090817,
+      "grad_norm": 0.010343813337385654,
+      "learning_rate": 1.3168516649848639e-05,
+      "loss": 0.0003,
+      "step": 8760
+    },
+    {
+      "epoch": 2.212411705348133,
+      "grad_norm": 0.06460902839899063,
+      "learning_rate": 1.3126471577531114e-05,
+      "loss": 0.0004,
+      "step": 8770
+    },
+    {
+      "epoch": 2.2149344096871846,
+      "grad_norm": 0.0001900464267237112,
+      "learning_rate": 1.308442650521359e-05,
+      "loss": 0.0005,
+      "step": 8780
+    },
+    {
+      "epoch": 2.217457114026236,
+      "grad_norm": 0.24223244190216064,
+      "learning_rate": 1.3042381432896065e-05,
+      "loss": 0.0009,
+      "step": 8790
+    },
+    {
+      "epoch": 2.2199798183652875,
+      "grad_norm": 0.012591979466378689,
+      "learning_rate": 1.3000336360578541e-05,
+      "loss": 0.0015,
+      "step": 8800
+    },
+    {
+      "epoch": 2.2199798183652875,
+      "eval_loss": 0.00407541636377573,
+      "eval_runtime": 20.8574,
+      "eval_samples_per_second": 84.478,
+      "eval_steps_per_second": 21.144,
+      "step": 8800
+    },
+    {
+      "epoch": 2.222502522704339,
+      "grad_norm": 0.008851522579789162,
+      "learning_rate": 1.2958291288261016e-05,
+      "loss": 0.0001,
+      "step": 8810
+    },
+    {
+      "epoch": 2.2250252270433903,
+      "grad_norm": 0.0007781846798025072,
+      "learning_rate": 1.2916246215943492e-05,
+      "loss": 0.0002,
+      "step": 8820
+    },
+    {
+      "epoch": 2.227547931382442,
+      "grad_norm": 0.00011876798089360818,
+      "learning_rate": 1.2874201143625967e-05,
+      "loss": 0.0004,
+      "step": 8830
+    },
+    {
+      "epoch": 2.2300706357214937,
+      "grad_norm": 7.863504288252443e-05,
+      "learning_rate": 1.2832156071308443e-05,
+      "loss": 0.0006,
+      "step": 8840
+    },
+    {
+      "epoch": 2.232593340060545,
+      "grad_norm": 0.03496154770255089,
+      "learning_rate": 1.279011099899092e-05,
+      "loss": 0.0008,
+      "step": 8850
+    },
+    {
+      "epoch": 2.2351160443995965,
+      "grad_norm": 0.0036200936883687973,
+      "learning_rate": 1.2748065926673394e-05,
+      "loss": 0.0013,
+      "step": 8860
+    },
+    {
+      "epoch": 2.237638748738648,
+      "grad_norm": 0.1222803145647049,
+      "learning_rate": 1.2706020854355872e-05,
+      "loss": 0.0024,
+      "step": 8870
+    },
+    {
+      "epoch": 2.2401614530776994,
+      "grad_norm": 0.006895432714372873,
+      "learning_rate": 1.2663975782038345e-05,
+      "loss": 0.0005,
+      "step": 8880
+    },
+    {
+      "epoch": 2.242684157416751,
+      "grad_norm": 0.0022829826921224594,
+      "learning_rate": 1.2621930709720823e-05,
+      "loss": 0.0002,
+      "step": 8890
+    },
+    {
+      "epoch": 2.2452068617558023,
+      "grad_norm": 0.0012163568753749132,
+      "learning_rate": 1.2579885637403296e-05,
+      "loss": 0.0008,
+      "step": 8900
+    },
+    {
+      "epoch": 2.2452068617558023,
+      "eval_loss": 0.004117515403777361,
+      "eval_runtime": 20.8601,
+      "eval_samples_per_second": 84.468,
+      "eval_steps_per_second": 21.141,
+      "step": 8900
+    },
+    {
+      "epoch": 2.2477295660948537,
+      "grad_norm": 0.00023967861488927156,
+      "learning_rate": 1.2537840565085774e-05,
+      "loss": 0.0001,
+      "step": 8910
+    },
+    {
+      "epoch": 2.250252270433905,
+      "grad_norm": 0.002785157412290573,
+      "learning_rate": 1.2495795492768249e-05,
+      "loss": 0.0008,
+      "step": 8920
+    },
+    {
+      "epoch": 2.2527749747729566,
+      "grad_norm": 0.002094635972753167,
+      "learning_rate": 1.2453750420450725e-05,
+      "loss": 0.001,
+      "step": 8930
+    },
+    {
+      "epoch": 2.255297679112008,
+      "grad_norm": 0.17214207351207733,
+      "learning_rate": 1.24117053481332e-05,
+      "loss": 0.001,
+      "step": 8940
+    },
+    {
+      "epoch": 2.2578203834510595,
+      "grad_norm": 0.14640472829341888,
+      "learning_rate": 1.2369660275815676e-05,
+      "loss": 0.0009,
+      "step": 8950
+    },
+    {
+      "epoch": 2.260343087790111,
+      "grad_norm": 0.05824064090847969,
+      "learning_rate": 1.232761520349815e-05,
+      "loss": 0.0004,
+      "step": 8960
+    },
+    {
+      "epoch": 2.2628657921291624,
+      "grad_norm": 0.0001570479798829183,
+      "learning_rate": 1.2285570131180627e-05,
+      "loss": 0.0003,
+      "step": 8970
+    },
+    {
+      "epoch": 2.265388496468214,
+      "grad_norm": 0.008577616885304451,
+      "learning_rate": 1.2243525058863102e-05,
+      "loss": 0.0001,
+      "step": 8980
+    },
+    {
+      "epoch": 2.2679112008072653,
+      "grad_norm": 0.1447947919368744,
+      "learning_rate": 1.2201479986545578e-05,
+      "loss": 0.0011,
+      "step": 8990
+    },
+    {
+      "epoch": 2.2704339051463167,
+      "grad_norm": 0.00018846993043553084,
+      "learning_rate": 1.2159434914228053e-05,
+      "loss": 0.0002,
+      "step": 9000
+    },
+    {
+      "epoch": 2.2704339051463167,
+      "eval_loss": 0.004093192983418703,
+      "eval_runtime": 20.8777,
+      "eval_samples_per_second": 84.396,
+      "eval_steps_per_second": 21.123,
+      "step": 9000
+    },
+    {
+      "epoch": 2.272956609485368,
+      "grad_norm": 0.0007275242242030799,
+      "learning_rate": 1.211738984191053e-05,
+      "loss": 0.0011,
+      "step": 9010
+    },
+    {
+      "epoch": 2.2754793138244196,
+      "grad_norm": 0.010818258859217167,
+      "learning_rate": 1.2075344769593004e-05,
+      "loss": 0.0002,
+      "step": 9020
+    },
+    {
+      "epoch": 2.2780020181634715,
+      "grad_norm": 0.019404212012887,
+      "learning_rate": 1.203329969727548e-05,
+      "loss": 0.0007,
+      "step": 9030
+    },
+    {
+      "epoch": 2.2805247225025225,
+      "grad_norm": 0.07261942327022552,
+      "learning_rate": 1.1991254624957955e-05,
+      "loss": 0.0002,
+      "step": 9040
+    },
+    {
+      "epoch": 2.2830474268415744,
+      "grad_norm": 0.0001016618189169094,
+      "learning_rate": 1.1949209552640431e-05,
+      "loss": 0.0019,
+      "step": 9050
+    },
+    {
+      "epoch": 2.285570131180626,
+      "grad_norm": 0.00039705351809971035,
+      "learning_rate": 1.1907164480322906e-05,
+      "loss": 0.0002,
+      "step": 9060
+    },
+    {
+      "epoch": 2.2880928355196772,
+      "grad_norm": 0.019509321078658104,
+      "learning_rate": 1.1865119408005382e-05,
+      "loss": 0.0007,
+      "step": 9070
+    },
+    {
+      "epoch": 2.2906155398587287,
+      "grad_norm": 0.00039067715988494456,
+      "learning_rate": 1.1823074335687857e-05,
+      "loss": 0.0007,
+      "step": 9080
+    },
+    {
+      "epoch": 2.29313824419778,
+      "grad_norm": 0.006623209919780493,
+      "learning_rate": 1.1781029263370335e-05,
+      "loss": 0.0017,
+      "step": 9090
+    },
+    {
+      "epoch": 2.2956609485368316,
+      "grad_norm": 0.09274378418922424,
+      "learning_rate": 1.173898419105281e-05,
+      "loss": 0.0007,
+      "step": 9100
+    },
+    {
+      "epoch": 2.2956609485368316,
+      "eval_loss": 0.004082069266587496,
+      "eval_runtime": 20.8844,
+      "eval_samples_per_second": 84.369,
+      "eval_steps_per_second": 21.116,
+      "step": 9100
+    },
+    {
+      "epoch": 2.298183652875883,
+      "grad_norm": 0.00021790213941130787,
+      "learning_rate": 1.1696939118735286e-05,
+      "loss": 0.0006,
+      "step": 9110
+    },
+    {
+      "epoch": 2.3007063572149344,
+      "grad_norm": 0.11509310454130173,
+      "learning_rate": 1.165489404641776e-05,
+      "loss": 0.0008,
+      "step": 9120
+    },
+    {
+      "epoch": 2.303229061553986,
+      "grad_norm": 0.0007341225282289088,
+      "learning_rate": 1.1612848974100237e-05,
+      "loss": 0.0004,
+      "step": 9130
+    },
+    {
+      "epoch": 2.3057517658930373,
+      "grad_norm": 0.14291776716709137,
+      "learning_rate": 1.1570803901782712e-05,
+      "loss": 0.0023,
+      "step": 9140
+    },
+    {
+      "epoch": 2.3082744702320888,
+      "grad_norm": 9.565720392856747e-05,
+      "learning_rate": 1.1528758829465188e-05,
+      "loss": 0.0007,
+      "step": 9150
+    },
+    {
+      "epoch": 2.31079717457114,
+      "grad_norm": 3.3541025914018974e-05,
+      "learning_rate": 1.1486713757147663e-05,
+      "loss": 0.0013,
+      "step": 9160
+    },
+    {
+      "epoch": 2.3133198789101916,
+      "grad_norm": 0.0004649158217944205,
+      "learning_rate": 1.1444668684830139e-05,
+      "loss": 0.0013,
+      "step": 9170
+    },
+    {
+      "epoch": 2.315842583249243,
+      "grad_norm": 0.19510401785373688,
+      "learning_rate": 1.1402623612512614e-05,
+      "loss": 0.0009,
+      "step": 9180
+    },
+    {
+      "epoch": 2.3183652875882945,
+      "grad_norm": 0.06924453377723694,
+      "learning_rate": 1.136057854019509e-05,
+      "loss": 0.0009,
+      "step": 9190
+    },
+    {
+      "epoch": 2.320887991927346,
+      "grad_norm": 0.006778767332434654,
+      "learning_rate": 1.1318533467877565e-05,
+      "loss": 0.0,
+      "step": 9200
+    },
+    {
+      "epoch": 2.320887991927346,
+      "eval_loss": 0.004074608441442251,
+      "eval_runtime": 20.8629,
+      "eval_samples_per_second": 84.456,
+      "eval_steps_per_second": 21.138,
+      "step": 9200
+    },
+    {
+      "epoch": 2.3234106962663974,
+      "grad_norm": 0.0005497061647474766,
+      "learning_rate": 1.1276488395560041e-05,
+      "loss": 0.0007,
+      "step": 9210
+    },
+    {
+      "epoch": 2.3259334006054493,
+      "grad_norm": 0.2050006240606308,
+      "learning_rate": 1.1234443323242516e-05,
+      "loss": 0.0025,
+      "step": 9220
+    },
+    {
+      "epoch": 2.3284561049445003,
+      "grad_norm": 0.002956175012513995,
+      "learning_rate": 1.1192398250924992e-05,
+      "loss": 0.0013,
+      "step": 9230
+    },
+    {
+      "epoch": 2.330978809283552,
+      "grad_norm": 0.0013219810789451003,
+      "learning_rate": 1.1150353178607467e-05,
+      "loss": 0.0016,
+      "step": 9240
+    },
+    {
+      "epoch": 2.3335015136226036,
+      "grad_norm": 0.0003643881937023252,
+      "learning_rate": 1.1108308106289943e-05,
+      "loss": 0.0003,
+      "step": 9250
+    },
+    {
+      "epoch": 2.336024217961655,
+      "grad_norm": 0.00021895192912779748,
+      "learning_rate": 1.106626303397242e-05,
+      "loss": 0.0005,
+      "step": 9260
+    },
+    {
+      "epoch": 2.3385469223007065,
+      "grad_norm": 0.07140027731657028,
+      "learning_rate": 1.1024217961654896e-05,
+      "loss": 0.0003,
+      "step": 9270
+    },
+    {
+      "epoch": 2.341069626639758,
+      "grad_norm": 0.004080440849065781,
+      "learning_rate": 1.098217288933737e-05,
+      "loss": 0.0007,
+      "step": 9280
+    },
+    {
+      "epoch": 2.3435923309788094,
+      "grad_norm": 0.00017156251124106348,
+      "learning_rate": 1.0940127817019847e-05,
+      "loss": 0.001,
+      "step": 9290
+    },
+    {
+      "epoch": 2.346115035317861,
+      "grad_norm": 0.00018193376308772713,
+      "learning_rate": 1.0898082744702322e-05,
+      "loss": 0.0019,
+      "step": 9300
+    },
+    {
+      "epoch": 2.346115035317861,
+      "eval_loss": 0.00402724277228117,
+      "eval_runtime": 20.8513,
+      "eval_samples_per_second": 84.503,
+      "eval_steps_per_second": 21.15,
+      "step": 9300
+    },
+    {
+      "epoch": 2.3486377396569122,
+      "grad_norm": 3.1227758881868795e-05,
+      "learning_rate": 1.0856037672384798e-05,
+      "loss": 0.0001,
+      "step": 9310
+    },
+    {
+      "epoch": 2.3511604439959637,
+      "grad_norm": 0.0005630968371406198,
+      "learning_rate": 1.0813992600067273e-05,
+      "loss": 0.0013,
+      "step": 9320
+    },
+    {
+      "epoch": 2.353683148335015,
+      "grad_norm": 0.011226714588701725,
+      "learning_rate": 1.0771947527749749e-05,
+      "loss": 0.0026,
+      "step": 9330
+    },
+    {
+      "epoch": 2.3562058526740666,
+      "grad_norm": 0.0001727965282043442,
+      "learning_rate": 1.0729902455432224e-05,
+      "loss": 0.0007,
+      "step": 9340
+    },
+    {
+      "epoch": 2.358728557013118,
+      "grad_norm": 0.00026375442394055426,
+      "learning_rate": 1.06878573831147e-05,
+      "loss": 0.0005,
+      "step": 9350
+    },
+    {
+      "epoch": 2.3612512613521695,
+      "grad_norm": 0.058445390313863754,
+      "learning_rate": 1.0645812310797175e-05,
+      "loss": 0.0002,
+      "step": 9360
+    },
+    {
+      "epoch": 2.363773965691221,
+      "grad_norm": 0.0032288488000631332,
+      "learning_rate": 1.0603767238479651e-05,
+      "loss": 0.0009,
+      "step": 9370
+    },
+    {
+      "epoch": 2.3662966700302723,
+      "grad_norm": 0.0001218500838149339,
+      "learning_rate": 1.0561722166162126e-05,
+      "loss": 0.0003,
+      "step": 9380
+    },
+    {
+      "epoch": 2.3688193743693238,
+      "grad_norm": 0.0034417565912008286,
+      "learning_rate": 1.0519677093844602e-05,
+      "loss": 0.0001,
+      "step": 9390
+    },
+    {
+      "epoch": 2.371342078708375,
+      "grad_norm": 0.0020040010567754507,
+      "learning_rate": 1.0477632021527077e-05,
+      "loss": 0.0007,
+      "step": 9400
+    },
+    {
+      "epoch": 2.371342078708375,
+      "eval_loss": 0.00405073631554842,
+      "eval_runtime": 20.8534,
+      "eval_samples_per_second": 84.495,
+      "eval_steps_per_second": 21.148,
+      "step": 9400
+    },
+    {
+      "epoch": 2.3738647830474267,
+      "grad_norm": 0.158976748585701,
+      "learning_rate": 1.0435586949209553e-05,
+      "loss": 0.0011,
+      "step": 9410
+    },
+    {
+      "epoch": 2.376387487386478,
+      "grad_norm": 0.0006932442774996161,
+      "learning_rate": 1.0393541876892028e-05,
+      "loss": 0.0004,
+      "step": 9420
+    },
+    {
+      "epoch": 2.37891019172553,
+      "grad_norm": 0.000254453276284039,
+      "learning_rate": 1.0351496804574506e-05,
+      "loss": 0.0003,
+      "step": 9430
+    },
+    {
+      "epoch": 2.3814328960645814,
+      "grad_norm": 0.001546886982396245,
+      "learning_rate": 1.030945173225698e-05,
+      "loss": 0.0001,
+      "step": 9440
+    },
+    {
+      "epoch": 2.383955600403633,
+      "grad_norm": 0.004422788508236408,
+      "learning_rate": 1.0267406659939457e-05,
+      "loss": 0.0001,
+      "step": 9450
+    },
+    {
+      "epoch": 2.3864783047426843,
+      "grad_norm": 0.08441481739282608,
+      "learning_rate": 1.0225361587621931e-05,
+      "loss": 0.001,
+      "step": 9460
+    },
+    {
+      "epoch": 2.3890010090817357,
+      "grad_norm": 0.0022749004419893026,
+      "learning_rate": 1.0183316515304408e-05,
+      "loss": 0.0002,
+      "step": 9470
+    },
+    {
+      "epoch": 2.391523713420787,
+      "grad_norm": 0.0009072842076420784,
+      "learning_rate": 1.0141271442986882e-05,
+      "loss": 0.0007,
+      "step": 9480
+    },
+    {
+      "epoch": 2.3940464177598386,
+      "grad_norm": 0.0009312007459811866,
+      "learning_rate": 1.0099226370669359e-05,
+      "loss": 0.0004,
+      "step": 9490
+    },
+    {
+      "epoch": 2.39656912209889,
+      "grad_norm": 0.00628532562404871,
+      "learning_rate": 1.0057181298351834e-05,
+      "loss": 0.0004,
+      "step": 9500
+    },
+    {
+      "epoch": 2.39656912209889,
+      "eval_loss": 0.004011793062090874,
+      "eval_runtime": 20.8472,
+      "eval_samples_per_second": 84.52,
+      "eval_steps_per_second": 21.154,
+      "step": 9500
+    },
+    {
+      "epoch": 2.3990918264379415,
+      "grad_norm": 0.01351741049438715,
+      "learning_rate": 1.001513622603431e-05,
+      "loss": 0.0002,
+      "step": 9510
+    },
+    {
+      "epoch": 2.401614530776993,
+      "grad_norm": 0.1305786371231079,
+      "learning_rate": 9.973091153716785e-06,
+      "loss": 0.0007,
+      "step": 9520
+    },
+    {
+      "epoch": 2.4041372351160444,
+      "grad_norm": 0.0029213367961347103,
+      "learning_rate": 9.931046081399261e-06,
+      "loss": 0.0003,
+      "step": 9530
+    },
+    {
+      "epoch": 2.406659939455096,
+      "grad_norm": 0.004418868105858564,
+      "learning_rate": 9.889001009081736e-06,
+      "loss": 0.0007,
+      "step": 9540
+    },
+    {
+      "epoch": 2.4091826437941473,
+      "grad_norm": 0.022457575425505638,
+      "learning_rate": 9.846955936764212e-06,
+      "loss": 0.0005,
+      "step": 9550
+    },
+    {
+      "epoch": 2.4117053481331987,
+      "grad_norm": 0.0011523871216922998,
+      "learning_rate": 9.804910864446687e-06,
+      "loss": 0.0008,
+      "step": 9560
+    },
+    {
+      "epoch": 2.41422805247225,
+      "grad_norm": 0.00031501849298365414,
+      "learning_rate": 9.762865792129163e-06,
+      "loss": 0.0005,
+      "step": 9570
+    },
+    {
+      "epoch": 2.4167507568113016,
+      "grad_norm": 0.0037088736426085234,
+      "learning_rate": 9.720820719811638e-06,
+      "loss": 0.0012,
+      "step": 9580
+    },
+    {
+      "epoch": 2.419273461150353,
+      "grad_norm": 6.174742884468287e-05,
+      "learning_rate": 9.678775647494114e-06,
+      "loss": 0.0001,
+      "step": 9590
+    },
+    {
+      "epoch": 2.4217961654894045,
+      "grad_norm": 0.05361416935920715,
+      "learning_rate": 9.63673057517659e-06,
+      "loss": 0.0014,
+      "step": 9600
+    },
+    {
+      "epoch": 2.4217961654894045,
+      "eval_loss": 0.003927647601813078,
+      "eval_runtime": 20.8667,
+      "eval_samples_per_second": 84.441,
+      "eval_steps_per_second": 21.134,
+      "step": 9600
+    },
+    {
+      "epoch": 2.424318869828456,
+      "grad_norm": 0.0005672819679602981,
+      "learning_rate": 9.594685502859067e-06,
+      "loss": 0.0002,
+      "step": 9610
+    },
+    {
+      "epoch": 2.426841574167508,
+      "grad_norm": 0.03695020079612732,
+      "learning_rate": 9.552640430541541e-06,
+      "loss": 0.004,
+      "step": 9620
+    },
+    {
+      "epoch": 2.429364278506559,
+      "grad_norm": 0.17845889925956726,
+      "learning_rate": 9.510595358224018e-06,
+      "loss": 0.0011,
+      "step": 9630
+    },
+    {
+      "epoch": 2.4318869828456107,
+      "grad_norm": 0.0009999609319493175,
+      "learning_rate": 9.468550285906492e-06,
+      "loss": 0.0008,
+      "step": 9640
+    },
+    {
+      "epoch": 2.434409687184662,
+      "grad_norm": 0.10458512604236603,
+      "learning_rate": 9.426505213588969e-06,
+      "loss": 0.0007,
+      "step": 9650
+    },
+    {
+      "epoch": 2.4369323915237135,
+      "grad_norm": 0.00014382805966306478,
+      "learning_rate": 9.384460141271443e-06,
+      "loss": 0.0006,
+      "step": 9660
+    },
+    {
+      "epoch": 2.439455095862765,
+      "grad_norm": 0.2015506774187088,
+      "learning_rate": 9.34241506895392e-06,
+      "loss": 0.0022,
+      "step": 9670
+    },
+    {
+      "epoch": 2.4419778002018164,
+      "grad_norm": 0.0007017810712568462,
+      "learning_rate": 9.300369996636394e-06,
+      "loss": 0.0003,
+      "step": 9680
+    },
+    {
+      "epoch": 2.444500504540868,
+      "grad_norm": 0.003951243124902248,
+      "learning_rate": 9.25832492431887e-06,
+      "loss": 0.0016,
+      "step": 9690
+    },
+    {
+      "epoch": 2.4470232088799193,
+      "grad_norm": 0.0021652476862072945,
+      "learning_rate": 9.216279852001345e-06,
+      "loss": 0.0002,
+      "step": 9700
+    },
+    {
+      "epoch": 2.4470232088799193,
+      "eval_loss": 0.004020575433969498,
+      "eval_runtime": 20.8681,
+      "eval_samples_per_second": 84.435,
+      "eval_steps_per_second": 21.133,
+      "step": 9700
+    },
+    {
+      "epoch": 2.4495459132189707,
+      "grad_norm": 0.0001469567941967398,
+      "learning_rate": 9.174234779683822e-06,
+      "loss": 0.0004,
+      "step": 9710
+    },
+    {
+      "epoch": 2.452068617558022,
+      "grad_norm": 0.06326698511838913,
+      "learning_rate": 9.132189707366296e-06,
+      "loss": 0.0002,
+      "step": 9720
+    },
+    {
+      "epoch": 2.4545913218970736,
+      "grad_norm": 0.006055818870663643,
+      "learning_rate": 9.090144635048773e-06,
+      "loss": 0.0001,
+      "step": 9730
+    },
+    {
+      "epoch": 2.457114026236125,
+      "grad_norm": 0.006732003763318062,
+      "learning_rate": 9.048099562731247e-06,
+      "loss": 0.0003,
+      "step": 9740
+    },
+    {
+      "epoch": 2.4596367305751765,
+      "grad_norm": 0.0013023527571931481,
+      "learning_rate": 9.006054490413724e-06,
+      "loss": 0.0003,
+      "step": 9750
+    },
+    {
+      "epoch": 2.462159434914228,
+      "grad_norm": 0.00027046047034673393,
+      "learning_rate": 8.964009418096199e-06,
+      "loss": 0.0005,
+      "step": 9760
+    },
+    {
+      "epoch": 2.4646821392532794,
+      "grad_norm": 0.0663028210401535,
+      "learning_rate": 8.921964345778677e-06,
+      "loss": 0.0017,
+      "step": 9770
+    },
+    {
+      "epoch": 2.467204843592331,
+      "grad_norm": 0.06608086824417114,
+      "learning_rate": 8.879919273461151e-06,
+      "loss": 0.0016,
+      "step": 9780
+    },
+    {
+      "epoch": 2.4697275479313823,
+      "grad_norm": 0.1625903695821762,
+      "learning_rate": 8.837874201143628e-06,
+      "loss": 0.0028,
+      "step": 9790
+    },
+    {
+      "epoch": 2.4722502522704337,
+      "grad_norm": 0.0005290044355206192,
+      "learning_rate": 8.795829128826102e-06,
+      "loss": 0.0013,
+      "step": 9800
+    },
+    {
+      "epoch": 2.4722502522704337,
+      "eval_loss": 0.00388718512840569,
+      "eval_runtime": 20.8863,
+      "eval_samples_per_second": 84.362,
+      "eval_steps_per_second": 21.114,
+      "step": 9800
+    },
+    {
+      "epoch": 2.4747729566094856,
+      "grad_norm": 0.0725766122341156,
+      "learning_rate": 8.753784056508579e-06,
+      "loss": 0.0003,
+      "step": 9810
+    },
+    {
+      "epoch": 2.4772956609485366,
+      "grad_norm": 0.00025911492411978543,
+      "learning_rate": 8.711738984191053e-06,
+      "loss": 0.0007,
+      "step": 9820
+    },
+    {
+      "epoch": 2.4798183652875885,
+      "grad_norm": 0.005303604993969202,
+      "learning_rate": 8.66969391187353e-06,
+      "loss": 0.0005,
+      "step": 9830
+    },
+    {
+      "epoch": 2.48234106962664,
+      "grad_norm": 0.00039688186370767653,
+      "learning_rate": 8.627648839556004e-06,
+      "loss": 0.0006,
+      "step": 9840
+    },
+    {
+      "epoch": 2.4848637739656914,
+      "grad_norm": 0.010475796647369862,
+      "learning_rate": 8.58560376723848e-06,
+      "loss": 0.0005,
+      "step": 9850
+    },
+    {
+      "epoch": 2.487386478304743,
+      "grad_norm": 0.0014096908271312714,
+      "learning_rate": 8.543558694920955e-06,
+      "loss": 0.0009,
+      "step": 9860
+    },
+    {
+      "epoch": 2.4899091826437942,
+      "grad_norm": 0.0016413936391472816,
+      "learning_rate": 8.501513622603432e-06,
+      "loss": 0.0011,
+      "step": 9870
+    },
+    {
+      "epoch": 2.4924318869828457,
+      "grad_norm": 0.00043976728920824826,
+      "learning_rate": 8.459468550285906e-06,
+      "loss": 0.0001,
+      "step": 9880
+    },
+    {
+      "epoch": 2.494954591321897,
+      "grad_norm": 0.00024035267415456474,
+      "learning_rate": 8.417423477968383e-06,
+      "loss": 0.0023,
+      "step": 9890
+    },
+    {
+      "epoch": 2.4974772956609486,
+      "grad_norm": 0.0011502320412546396,
+      "learning_rate": 8.375378405650857e-06,
+      "loss": 0.0006,
+      "step": 9900
+    },
+    {
+      "epoch": 2.4974772956609486,
+      "eval_loss": 0.00393605325371027,
+      "eval_runtime": 20.8485,
+      "eval_samples_per_second": 84.515,
+      "eval_steps_per_second": 21.153,
+      "step": 9900
+    },
+    {
+      "epoch": 2.5,
+      "grad_norm": 0.00509544787928462,
+      "learning_rate": 8.333333333333334e-06,
+      "loss": 0.0002,
+      "step": 9910
+    },
+    {
+      "epoch": 2.5025227043390514,
+      "grad_norm": 5.717107706004754e-05,
+      "learning_rate": 8.291288261015808e-06,
+      "loss": 0.0001,
+      "step": 9920
+    },
+    {
+      "epoch": 2.505045408678103,
+      "grad_norm": 0.08509433269500732,
+      "learning_rate": 8.249243188698285e-06,
+      "loss": 0.0003,
+      "step": 9930
+    },
+    {
+      "epoch": 2.5075681130171543,
+      "grad_norm": 0.0029674407560378313,
+      "learning_rate": 8.207198116380761e-06,
+      "loss": 0.0003,
+      "step": 9940
+    },
+    {
+      "epoch": 2.5100908173562058,
+      "grad_norm": 0.0014874001499265432,
+      "learning_rate": 8.165153044063237e-06,
+      "loss": 0.0012,
+      "step": 9950
+    },
+    {
+      "epoch": 2.512613521695257,
+      "grad_norm": 0.0054718488827347755,
+      "learning_rate": 8.123107971745712e-06,
+      "loss": 0.0004,
+      "step": 9960
+    },
+    {
+      "epoch": 2.5151362260343086,
+      "grad_norm": 0.021269412711262703,
+      "learning_rate": 8.081062899428188e-06,
+      "loss": 0.0005,
+      "step": 9970
+    },
+    {
+      "epoch": 2.51765893037336,
+      "grad_norm": 0.00045099278213456273,
+      "learning_rate": 8.039017827110663e-06,
+      "loss": 0.0003,
+      "step": 9980
+    },
+    {
+      "epoch": 2.5201816347124115,
+      "grad_norm": 5.915598012506962e-05,
+      "learning_rate": 7.99697275479314e-06,
+      "loss": 0.0002,
+      "step": 9990
+    },
+    {
+      "epoch": 2.5227043390514634,
+      "grad_norm": 9.444829629501328e-05,
+      "learning_rate": 7.954927682475614e-06,
+      "loss": 0.0001,
+      "step": 10000
+    },
+    {
+      "epoch": 2.5227043390514634,
+      "eval_loss": 0.0038506174460053444,
+      "eval_runtime": 20.868,
+      "eval_samples_per_second": 84.436,
+      "eval_steps_per_second": 21.133,
+      "step": 10000
+    },
+    {
+      "epoch": 2.5252270433905144,
+      "grad_norm": 0.00031684929854236543,
+      "learning_rate": 7.91288261015809e-06,
+      "loss": 0.0,
+      "step": 10010
+    },
+    {
+      "epoch": 2.5277497477295663,
+      "grad_norm": 0.00030053374939598143,
+      "learning_rate": 7.870837537840565e-06,
+      "loss": 0.0002,
+      "step": 10020
+    },
+    {
+      "epoch": 2.5302724520686173,
+      "grad_norm": 0.07218382507562637,
+      "learning_rate": 7.828792465523042e-06,
+      "loss": 0.0002,
+      "step": 10030
+    },
+    {
+      "epoch": 2.532795156407669,
+      "grad_norm": 9.184365626424551e-05,
+      "learning_rate": 7.786747393205516e-06,
+      "loss": 0.0,
+      "step": 10040
+    },
+    {
+      "epoch": 2.5353178607467206,
+      "grad_norm": 0.00037872057873755693,
+      "learning_rate": 7.744702320887993e-06,
+      "loss": 0.0028,
+      "step": 10050
+    },
+    {
+      "epoch": 2.537840565085772,
+      "grad_norm": 0.00029898545471951365,
+      "learning_rate": 7.702657248570467e-06,
+      "loss": 0.0009,
+      "step": 10060
+    },
+    {
+      "epoch": 2.5403632694248235,
+      "grad_norm": 0.0002952404029201716,
+      "learning_rate": 7.660612176252944e-06,
+      "loss": 0.0001,
+      "step": 10070
+    },
+    {
+      "epoch": 2.542885973763875,
+      "grad_norm": 0.06380714476108551,
+      "learning_rate": 7.618567103935418e-06,
+      "loss": 0.0005,
+      "step": 10080
+    },
+    {
+      "epoch": 2.5454086781029264,
+      "grad_norm": 0.2544499933719635,
+      "learning_rate": 7.576522031617894e-06,
+      "loss": 0.0029,
+      "step": 10090
+    },
+    {
+      "epoch": 2.547931382441978,
+      "grad_norm": 0.0004265220195520669,
+      "learning_rate": 7.53447695930037e-06,
+      "loss": 0.0001,
+      "step": 10100
+    },
+    {
+      "epoch": 2.547931382441978,
+      "eval_loss": 0.00382298999466002,
+      "eval_runtime": 20.8338,
+      "eval_samples_per_second": 84.574,
+      "eval_steps_per_second": 21.168,
+      "step": 10100
+    },
+    {
+      "epoch": 2.5504540867810293,
+      "grad_norm": 0.00022531530703417957,
+      "learning_rate": 7.4924318869828465e-06,
+      "loss": 0.0005,
+      "step": 10110
+    },
+    {
+      "epoch": 2.5529767911200807,
+      "grad_norm": 0.0006108383531682193,
+      "learning_rate": 7.450386814665322e-06,
+      "loss": 0.0012,
+      "step": 10120
+    },
+    {
+      "epoch": 2.555499495459132,
+      "grad_norm": 0.007859915494918823,
+      "learning_rate": 7.4083417423477975e-06,
+      "loss": 0.0002,
+      "step": 10130
+    },
+    {
+      "epoch": 2.5580221997981836,
+      "grad_norm": 0.07120391726493835,
+      "learning_rate": 7.366296670030273e-06,
+      "loss": 0.0018,
+      "step": 10140
+    },
+    {
+      "epoch": 2.560544904137235,
+      "grad_norm": 3.911245221388526e-05,
+      "learning_rate": 7.3242515977127486e-06,
+      "loss": 0.0001,
+      "step": 10150
+    },
+    {
+      "epoch": 2.5630676084762865,
+      "grad_norm": 0.00011251613614149392,
+      "learning_rate": 7.282206525395224e-06,
+      "loss": 0.0003,
+      "step": 10160
+    },
+    {
+      "epoch": 2.565590312815338,
+      "grad_norm": 2.7329329896019772e-05,
+      "learning_rate": 7.2401614530777e-06,
+      "loss": 0.0,
+      "step": 10170
+    },
+    {
+      "epoch": 2.5681130171543893,
+      "grad_norm": 0.015592537820339203,
+      "learning_rate": 7.198116380760175e-06,
+      "loss": 0.0001,
+      "step": 10180
+    },
+    {
+      "epoch": 2.570635721493441,
+      "grad_norm": 0.004030313808470964,
+      "learning_rate": 7.156071308442651e-06,
+      "loss": 0.0006,
+      "step": 10190
+    },
+    {
+      "epoch": 2.573158425832492,
+      "grad_norm": 0.00028404564363881946,
+      "learning_rate": 7.114026236125126e-06,
+      "loss": 0.0011,
+      "step": 10200
+    },
+    {
+      "epoch": 2.573158425832492,
+      "eval_loss": 0.0038917113561183214,
+      "eval_runtime": 20.8627,
+      "eval_samples_per_second": 84.457,
+      "eval_steps_per_second": 21.138,
+      "step": 10200
+    },
+    {
+      "epoch": 2.575681130171544,
+      "grad_norm": 0.015596185810863972,
+      "learning_rate": 7.071981163807602e-06,
+      "loss": 0.0008,
+      "step": 10210
+    },
+    {
+      "epoch": 2.578203834510595,
+      "grad_norm": 0.000240606430452317,
+      "learning_rate": 7.029936091490077e-06,
+      "loss": 0.0003,
+      "step": 10220
+    },
+    {
+      "epoch": 2.580726538849647,
+      "grad_norm": 0.03709765151143074,
+      "learning_rate": 6.987891019172553e-06,
+      "loss": 0.0015,
+      "step": 10230
+    },
+    {
+      "epoch": 2.5832492431886984,
+      "grad_norm": 0.038254085928201675,
+      "learning_rate": 6.945845946855028e-06,
+      "loss": 0.0004,
+      "step": 10240
+    },
+    {
+      "epoch": 2.58577194752775,
+      "grad_norm": 0.0966513380408287,
+      "learning_rate": 6.903800874537504e-06,
+      "loss": 0.0012,
+      "step": 10250
+    },
+    {
+      "epoch": 2.5882946518668013,
+      "grad_norm": 0.002003374509513378,
+      "learning_rate": 6.861755802219979e-06,
+      "loss": 0.0015,
+      "step": 10260
+    },
+    {
+      "epoch": 2.5908173562058527,
+      "grad_norm": 0.2246997058391571,
+      "learning_rate": 6.8197107299024555e-06,
+      "loss": 0.0013,
+      "step": 10270
+    },
+    {
+      "epoch": 2.593340060544904,
+      "grad_norm": 0.046293605118989944,
+      "learning_rate": 6.777665657584932e-06,
+      "loss": 0.0013,
+      "step": 10280
+    },
+    {
+      "epoch": 2.5958627648839556,
+      "grad_norm": 0.04472287371754646,
+      "learning_rate": 6.735620585267407e-06,
+      "loss": 0.0024,
+      "step": 10290
+    },
+    {
+      "epoch": 2.598385469223007,
+      "grad_norm": 0.013854089193046093,
+      "learning_rate": 6.693575512949883e-06,
+      "loss": 0.0007,
+      "step": 10300
+    },
+    {
+      "epoch": 2.598385469223007,
+      "eval_loss": 0.003853140166029334,
+      "eval_runtime": 20.8889,
+      "eval_samples_per_second": 84.351,
+      "eval_steps_per_second": 21.112,
+      "step": 10300
+    },
+    {
+      "epoch": 2.6009081735620585,
+      "grad_norm": 6.33889067103155e-05,
+      "learning_rate": 6.6515304406323584e-06,
+      "loss": 0.0,
+      "step": 10310
+    },
+    {
+      "epoch": 2.60343087790111,
+      "grad_norm": 0.028591223061084747,
+      "learning_rate": 6.609485368314834e-06,
+      "loss": 0.0007,
+      "step": 10320
+    },
+    {
+      "epoch": 2.6059535822401614,
+      "grad_norm": 0.015871459618210793,
+      "learning_rate": 6.5674402959973095e-06,
+      "loss": 0.0008,
+      "step": 10330
+    },
+    {
+      "epoch": 2.608476286579213,
+      "grad_norm": 0.0005443996633403003,
+      "learning_rate": 6.525395223679785e-06,
+      "loss": 0.002,
+      "step": 10340
+    },
+    {
+      "epoch": 2.6109989909182643,
+      "grad_norm": 0.11164157837629318,
+      "learning_rate": 6.4833501513622605e-06,
+      "loss": 0.0005,
+      "step": 10350
+    },
+    {
+      "epoch": 2.6135216952573157,
+      "grad_norm": 0.005891601089388132,
+      "learning_rate": 6.441305079044736e-06,
+      "loss": 0.0002,
+      "step": 10360
+    },
+    {
+      "epoch": 2.616044399596367,
+      "grad_norm": 0.00020742563356179744,
+      "learning_rate": 6.3992600067272115e-06,
+      "loss": 0.0014,
+      "step": 10370
+    },
+    {
+      "epoch": 2.618567103935419,
+      "grad_norm": 6.0241254686843604e-05,
+      "learning_rate": 6.357214934409687e-06,
+      "loss": 0.0002,
+      "step": 10380
+    },
+    {
+      "epoch": 2.62108980827447,
+      "grad_norm": 0.00019894012075383216,
+      "learning_rate": 6.3151698620921625e-06,
+      "loss": 0.0005,
+      "step": 10390
+    },
+    {
+      "epoch": 2.623612512613522,
+      "grad_norm": 0.13243666291236877,
+      "learning_rate": 6.273124789774638e-06,
+      "loss": 0.001,
+      "step": 10400
+    },
+    {
+      "epoch": 2.623612512613522,
+      "eval_loss": 0.0038242663722485304,
+      "eval_runtime": 20.9018,
+      "eval_samples_per_second": 84.299,
+      "eval_steps_per_second": 21.099,
+      "step": 10400
+    },
+    {
+      "epoch": 2.626135216952573,
+      "grad_norm": 0.12121517211198807,
+      "learning_rate": 6.231079717457114e-06,
+      "loss": 0.0008,
+      "step": 10410
+    },
+    {
+      "epoch": 2.628657921291625,
+      "grad_norm": 0.008915661834180355,
+      "learning_rate": 6.18903464513959e-06,
+      "loss": 0.0,
+      "step": 10420
+    },
+    {
+      "epoch": 2.6311806256306762,
+      "grad_norm": 0.005935797467827797,
+      "learning_rate": 6.1469895728220654e-06,
+      "loss": 0.0002,
+      "step": 10430
+    },
+    {
+      "epoch": 2.6337033299697277,
+      "grad_norm": 0.030999109148979187,
+      "learning_rate": 6.104944500504541e-06,
+      "loss": 0.0002,
+      "step": 10440
+    },
+    {
+      "epoch": 2.636226034308779,
+      "grad_norm": 0.13484105467796326,
+      "learning_rate": 6.0628994281870165e-06,
+      "loss": 0.0005,
+      "step": 10450
+    },
+    {
+      "epoch": 2.6387487386478305,
+      "grad_norm": 0.0004957011551596224,
+      "learning_rate": 6.020854355869492e-06,
+      "loss": 0.0006,
+      "step": 10460
+    },
+    {
+      "epoch": 2.641271442986882,
+      "grad_norm": 0.12348122149705887,
+      "learning_rate": 5.9788092835519675e-06,
+      "loss": 0.0005,
+      "step": 10470
+    },
+    {
+      "epoch": 2.6437941473259334,
+      "grad_norm": 9.011943620862439e-05,
+      "learning_rate": 5.936764211234443e-06,
+      "loss": 0.0006,
+      "step": 10480
+    },
+    {
+      "epoch": 2.646316851664985,
+      "grad_norm": 0.00026155789964832366,
+      "learning_rate": 5.8947191389169185e-06,
+      "loss": 0.0008,
+      "step": 10490
+    },
+    {
+      "epoch": 2.6488395560040363,
+      "grad_norm": 0.0001088874414563179,
+      "learning_rate": 5.852674066599395e-06,
+      "loss": 0.0011,
+      "step": 10500
+    },
+    {
+      "epoch": 2.6488395560040363,
+      "eval_loss": 0.0038149829488247633,
+      "eval_runtime": 20.9282,
+      "eval_samples_per_second": 84.193,
+      "eval_steps_per_second": 21.072,
+      "step": 10500
+    },
+    {
+      "epoch": 2.6513622603430878,
+      "grad_norm": 4.513943349593319e-05,
+      "learning_rate": 5.81062899428187e-06,
+      "loss": 0.0008,
+      "step": 10510
+    },
+    {
+      "epoch": 2.653884964682139,
+      "grad_norm": 0.00011352117144269869,
+      "learning_rate": 5.768583921964346e-06,
+      "loss": 0.0004,
+      "step": 10520
+    },
+    {
+      "epoch": 2.6564076690211906,
+      "grad_norm": 8.327852265210822e-05,
+      "learning_rate": 5.726538849646821e-06,
+      "loss": 0.0003,
+      "step": 10530
+    },
+    {
+      "epoch": 2.658930373360242,
+      "grad_norm": 0.08921755105257034,
+      "learning_rate": 5.684493777329297e-06,
+      "loss": 0.0014,
+      "step": 10540
+    },
+    {
+      "epoch": 2.6614530776992935,
+      "grad_norm": 0.00016910290287341923,
+      "learning_rate": 5.642448705011772e-06,
+      "loss": 0.0001,
+      "step": 10550
+    },
+    {
+      "epoch": 2.663975782038345,
+      "grad_norm": 7.930315769044682e-05,
+      "learning_rate": 5.600403632694248e-06,
+      "loss": 0.0001,
+      "step": 10560
+    },
+    {
+      "epoch": 2.666498486377397,
+      "grad_norm": 9.457457781536505e-05,
+      "learning_rate": 5.5583585603767234e-06,
+      "loss": 0.0,
+      "step": 10570
+    },
+    {
+      "epoch": 2.669021190716448,
+      "grad_norm": 0.00021707512496504933,
+      "learning_rate": 5.5163134880592e-06,
+      "loss": 0.0003,
+      "step": 10580
+    },
+    {
+      "epoch": 2.6715438950554997,
+      "grad_norm": 0.001966067822650075,
+      "learning_rate": 5.474268415741675e-06,
+      "loss": 0.0007,
+      "step": 10590
+    },
+    {
+      "epoch": 2.6740665993945507,
+      "grad_norm": 0.004528351593762636,
+      "learning_rate": 5.432223343424151e-06,
+      "loss": 0.0005,
+      "step": 10600
+    },
+    {
+      "epoch": 2.6740665993945507,
+      "eval_loss": 0.003807534696534276,
+      "eval_runtime": 20.87,
+      "eval_samples_per_second": 84.427,
+      "eval_steps_per_second": 21.131,
+      "step": 10600
+    },
+    {
+      "epoch": 2.6765893037336026,
+      "grad_norm": 0.00017803607624955475,
+      "learning_rate": 5.390178271106626e-06,
+      "loss": 0.0005,
+      "step": 10610
+    },
+    {
+      "epoch": 2.679112008072654,
+      "grad_norm": 3.115162326139398e-05,
+      "learning_rate": 5.348133198789102e-06,
+      "loss": 0.0,
+      "step": 10620
+    },
+    {
+      "epoch": 2.6816347124117055,
+      "grad_norm": 0.07494215667247772,
+      "learning_rate": 5.306088126471577e-06,
+      "loss": 0.0008,
+      "step": 10630
+    },
+    {
+      "epoch": 2.684157416750757,
+      "grad_norm": 0.07868482917547226,
+      "learning_rate": 5.264043054154053e-06,
+      "loss": 0.0005,
+      "step": 10640
+    },
+    {
+      "epoch": 2.6866801210898084,
+      "grad_norm": 0.01250834483653307,
+      "learning_rate": 5.221997981836528e-06,
+      "loss": 0.0001,
+      "step": 10650
+    },
+    {
+      "epoch": 2.68920282542886,
+      "grad_norm": 0.10575691610574722,
+      "learning_rate": 5.179952909519004e-06,
+      "loss": 0.001,
+      "step": 10660
+    },
+    {
+      "epoch": 2.6917255297679112,
+      "grad_norm": 0.16051237285137177,
+      "learning_rate": 5.13790783720148e-06,
+      "loss": 0.0006,
+      "step": 10670
+    },
+    {
+      "epoch": 2.6942482341069627,
+      "grad_norm": 4.039399209432304e-05,
+      "learning_rate": 5.095862764883956e-06,
+      "loss": 0.0001,
+      "step": 10680
+    },
+    {
+      "epoch": 2.696770938446014,
+      "grad_norm": 9.023944585351273e-05,
+      "learning_rate": 5.053817692566431e-06,
+      "loss": 0.0005,
+      "step": 10690
+    },
+    {
+      "epoch": 2.6992936427850656,
+      "grad_norm": 0.20219027996063232,
+      "learning_rate": 5.011772620248907e-06,
+      "loss": 0.0003,
+      "step": 10700
+    },
+    {
+      "epoch": 2.6992936427850656,
+      "eval_loss": 0.0038071214221417904,
+      "eval_runtime": 20.8909,
+      "eval_samples_per_second": 84.343,
+      "eval_steps_per_second": 21.11,
+      "step": 10700
+    },
+    {
+      "epoch": 2.701816347124117,
+      "grad_norm": 0.12794020771980286,
+      "learning_rate": 4.969727547931382e-06,
+      "loss": 0.0014,
+      "step": 10710
+    },
+    {
+      "epoch": 2.7043390514631684,
+      "grad_norm": 0.000275536032859236,
+      "learning_rate": 4.927682475613858e-06,
+      "loss": 0.0006,
+      "step": 10720
+    },
+    {
+      "epoch": 2.70686175580222,
+      "grad_norm": 0.0002630538656376302,
+      "learning_rate": 4.885637403296333e-06,
+      "loss": 0.0016,
+      "step": 10730
+    },
+    {
+      "epoch": 2.7093844601412713,
+      "grad_norm": 0.042821742594242096,
+      "learning_rate": 4.843592330978809e-06,
+      "loss": 0.0001,
+      "step": 10740
+    },
+    {
+      "epoch": 2.7119071644803228,
+      "grad_norm": 0.10874561965465546,
+      "learning_rate": 4.801547258661285e-06,
+      "loss": 0.0006,
+      "step": 10750
+    },
+    {
+      "epoch": 2.714429868819374,
+      "grad_norm": 0.00025562438531778753,
+      "learning_rate": 4.759502186343761e-06,
+      "loss": 0.0001,
+      "step": 10760
+    },
+    {
+      "epoch": 2.7169525731584256,
+      "grad_norm": 0.006827104836702347,
+      "learning_rate": 4.717457114026236e-06,
+      "loss": 0.001,
+      "step": 10770
+    },
+    {
+      "epoch": 2.7194752774974775,
+      "grad_norm": 0.005648414604365826,
+      "learning_rate": 4.675412041708712e-06,
+      "loss": 0.0004,
+      "step": 10780
+    },
+    {
+      "epoch": 2.7219979818365285,
+      "grad_norm": 0.001025490928441286,
+      "learning_rate": 4.633366969391187e-06,
+      "loss": 0.0018,
+      "step": 10790
+    },
+    {
+      "epoch": 2.7245206861755804,
+      "grad_norm": 0.0006745181744918227,
+      "learning_rate": 4.591321897073663e-06,
+      "loss": 0.0002,
+      "step": 10800
+    },
+    {
+      "epoch": 2.7245206861755804,
+      "eval_loss": 0.0037325455341488123,
+      "eval_runtime": 20.8462,
+      "eval_samples_per_second": 84.524,
+      "eval_steps_per_second": 21.155,
+      "step": 10800
+    },
+    {
+      "epoch": 2.727043390514632,
+      "grad_norm": 0.0065965172834694386,
+      "learning_rate": 4.549276824756138e-06,
+      "loss": 0.0001,
+      "step": 10810
+    },
+    {
+      "epoch": 2.7295660948536833,
+      "grad_norm": 0.0002903965360019356,
+      "learning_rate": 4.507231752438614e-06,
+      "loss": 0.0003,
+      "step": 10820
+    },
+    {
+      "epoch": 2.7320887991927347,
+      "grad_norm": 0.16553114354610443,
+      "learning_rate": 4.465186680121089e-06,
+      "loss": 0.0006,
+      "step": 10830
+    },
+    {
+      "epoch": 2.734611503531786,
+      "grad_norm": 0.0074982005171477795,
+      "learning_rate": 4.423141607803566e-06,
+      "loss": 0.0001,
+      "step": 10840
+    },
+    {
+      "epoch": 2.7371342078708376,
+      "grad_norm": 0.0002544449525885284,
+      "learning_rate": 4.381096535486041e-06,
+      "loss": 0.0005,
+      "step": 10850
+    },
+    {
+      "epoch": 2.739656912209889,
+      "grad_norm": 0.07092459499835968,
+      "learning_rate": 4.339051463168517e-06,
+      "loss": 0.0005,
+      "step": 10860
+    },
+    {
+      "epoch": 2.7421796165489405,
+      "grad_norm": 0.03416803479194641,
+      "learning_rate": 4.297006390850992e-06,
+      "loss": 0.0004,
+      "step": 10870
+    },
+    {
+      "epoch": 2.744702320887992,
+      "grad_norm": 0.0010244250297546387,
+      "learning_rate": 4.254961318533468e-06,
+      "loss": 0.0003,
+      "step": 10880
+    },
+    {
+      "epoch": 2.7472250252270434,
+      "grad_norm": 0.0003882810124196112,
+      "learning_rate": 4.212916246215943e-06,
+      "loss": 0.0005,
+      "step": 10890
+    },
+    {
+      "epoch": 2.749747729566095,
+      "grad_norm": 0.0006057489081285894,
+      "learning_rate": 4.170871173898419e-06,
+      "loss": 0.0009,
+      "step": 10900
+    },
+    {
+      "epoch": 2.749747729566095,
+      "eval_loss": 0.0038078054785728455,
+      "eval_runtime": 20.8522,
+      "eval_samples_per_second": 84.499,
+      "eval_steps_per_second": 21.149,
+      "step": 10900
+    },
+    {
+      "epoch": 2.7522704339051463,
+      "grad_norm": 0.00148275145329535,
+      "learning_rate": 4.128826101580894e-06,
+      "loss": 0.0001,
+      "step": 10910
+    },
+    {
+      "epoch": 2.7547931382441977,
+      "grad_norm": 0.062449101358652115,
+      "learning_rate": 4.086781029263371e-06,
+      "loss": 0.0017,
+      "step": 10920
+    },
+    {
+      "epoch": 2.757315842583249,
+      "grad_norm": 0.011098313145339489,
+      "learning_rate": 4.044735956945846e-06,
+      "loss": 0.0003,
+      "step": 10930
+    },
+    {
+      "epoch": 2.7598385469223006,
+      "grad_norm": 0.0001871915883384645,
+      "learning_rate": 4.002690884628322e-06,
+      "loss": 0.0001,
+      "step": 10940
+    },
+    {
+      "epoch": 2.762361251261352,
+      "grad_norm": 0.00037126371171325445,
+      "learning_rate": 3.960645812310797e-06,
+      "loss": 0.0,
+      "step": 10950
+    },
+    {
+      "epoch": 2.7648839556004035,
+      "grad_norm": 0.0006047156057320535,
+      "learning_rate": 3.918600739993273e-06,
+      "loss": 0.0012,
+      "step": 10960
+    },
+    {
+      "epoch": 2.7674066599394553,
+      "grad_norm": 0.00014337015454657376,
+      "learning_rate": 3.876555667675748e-06,
+      "loss": 0.0004,
+      "step": 10970
+    },
+    {
+      "epoch": 2.7699293642785063,
+      "grad_norm": 0.12640614807605743,
+      "learning_rate": 3.834510595358224e-06,
+      "loss": 0.0026,
+      "step": 10980
+    },
+    {
+      "epoch": 2.772452068617558,
+      "grad_norm": 0.00037311791675165296,
+      "learning_rate": 3.7924655230406996e-06,
+      "loss": 0.0,
+      "step": 10990
+    },
+    {
+      "epoch": 2.774974772956609,
+      "grad_norm": 0.00015324597188737243,
+      "learning_rate": 3.750420450723175e-06,
+      "loss": 0.0009,
+      "step": 11000
+    },
+    {
+      "epoch": 2.774974772956609,
+      "eval_loss": 0.0037996473256498575,
+      "eval_runtime": 20.8643,
+      "eval_samples_per_second": 84.451,
+      "eval_steps_per_second": 21.137,
+      "step": 11000
+    },
+    {
+      "epoch": 2.777497477295661,
+      "grad_norm": 0.10045702010393143,
+      "learning_rate": 3.7083753784056515e-06,
+      "loss": 0.0002,
+      "step": 11010
+    },
+    {
+      "epoch": 2.7800201816347125,
+      "grad_norm": 0.00010195528011536226,
+      "learning_rate": 3.666330306088127e-06,
+      "loss": 0.0012,
+      "step": 11020
+    },
+    {
+      "epoch": 2.782542885973764,
+      "grad_norm": 0.00022041058400645852,
+      "learning_rate": 3.6242852337706025e-06,
+      "loss": 0.0002,
+      "step": 11030
+    },
+    {
+      "epoch": 2.7850655903128154,
+      "grad_norm": 0.18306997418403625,
+      "learning_rate": 3.582240161453078e-06,
+      "loss": 0.0016,
+      "step": 11040
+    },
+    {
+      "epoch": 2.787588294651867,
+      "grad_norm": 6.177197064971551e-05,
+      "learning_rate": 3.5401950891355535e-06,
+      "loss": 0.0,
+      "step": 11050
+    },
+    {
+      "epoch": 2.7901109989909183,
+      "grad_norm": 0.0007134904735721648,
+      "learning_rate": 3.498150016818029e-06,
+      "loss": 0.0001,
+      "step": 11060
+    },
+    {
+      "epoch": 2.7926337033299697,
+      "grad_norm": 0.026096561923623085,
+      "learning_rate": 3.4561049445005045e-06,
+      "loss": 0.0004,
+      "step": 11070
+    },
+    {
+      "epoch": 2.795156407669021,
+      "grad_norm": 0.0005445599090307951,
+      "learning_rate": 3.41405987218298e-06,
+      "loss": 0.0,
+      "step": 11080
+    },
+    {
+      "epoch": 2.7976791120080726,
+      "grad_norm": 0.00022800432634539902,
+      "learning_rate": 3.3720147998654564e-06,
+      "loss": 0.0,
+      "step": 11090
+    },
+    {
+      "epoch": 2.800201816347124,
+      "grad_norm": 0.00023530615726485848,
+      "learning_rate": 3.329969727547932e-06,
+      "loss": 0.0004,
+      "step": 11100
+    },
+    {
+      "epoch": 2.800201816347124,
+      "eval_loss": 0.003767798189073801,
+      "eval_runtime": 20.8684,
+      "eval_samples_per_second": 84.434,
+      "eval_steps_per_second": 21.132,
+      "step": 11100
+    },
+    {
+      "epoch": 2.8027245206861755,
+      "grad_norm": 9.487225906923413e-05,
+      "learning_rate": 3.2879246552304074e-06,
+      "loss": 0.0,
+      "step": 11110
+    },
+    {
+      "epoch": 2.805247225025227,
+      "grad_norm": 0.07703667134046555,
+      "learning_rate": 3.245879582912883e-06,
+      "loss": 0.0009,
+      "step": 11120
+    },
+    {
+      "epoch": 2.8077699293642784,
+      "grad_norm": 0.09232014417648315,
+      "learning_rate": 3.2038345105953585e-06,
+      "loss": 0.0003,
+      "step": 11130
+    },
+    {
+      "epoch": 2.81029263370333,
+      "grad_norm": 0.0033814776688814163,
+      "learning_rate": 3.161789438277834e-06,
+      "loss": 0.0022,
+      "step": 11140
+    },
+    {
+      "epoch": 2.8128153380423813,
+      "grad_norm": 0.003383078845217824,
+      "learning_rate": 3.11974436596031e-06,
+      "loss": 0.0018,
+      "step": 11150
+    },
+    {
+      "epoch": 2.815338042381433,
+      "grad_norm": 0.0075791082344949245,
+      "learning_rate": 3.0776992936427854e-06,
+      "loss": 0.0001,
+      "step": 11160
+    },
+    {
+      "epoch": 2.817860746720484,
+      "grad_norm": 0.0005082746502012014,
+      "learning_rate": 3.035654221325261e-06,
+      "loss": 0.0004,
+      "step": 11170
+    },
+    {
+      "epoch": 2.820383451059536,
+      "grad_norm": 1.3200211469666101e-05,
+      "learning_rate": 2.9936091490077364e-06,
+      "loss": 0.0003,
+      "step": 11180
+    },
+    {
+      "epoch": 2.822906155398587,
+      "grad_norm": 0.0003332770138513297,
+      "learning_rate": 2.951564076690212e-06,
+      "loss": 0.0,
+      "step": 11190
+    },
+    {
+      "epoch": 2.825428859737639,
+      "grad_norm": 0.15700918436050415,
+      "learning_rate": 2.909519004372688e-06,
+      "loss": 0.0019,
+      "step": 11200
+    },
+    {
+      "epoch": 2.825428859737639,
+      "eval_loss": 0.0037862148601561785,
+      "eval_runtime": 20.8605,
+      "eval_samples_per_second": 84.466,
+      "eval_steps_per_second": 21.14,
+      "step": 11200
+    },
+    {
+      "epoch": 2.8279515640766903,
+      "grad_norm": 0.00048214950948022306,
+      "learning_rate": 2.8674739320551634e-06,
+      "loss": 0.0001,
+      "step": 11210
+    },
+    {
+      "epoch": 2.830474268415742,
+      "grad_norm": 0.0002856640494428575,
+      "learning_rate": 2.825428859737639e-06,
+      "loss": 0.0009,
+      "step": 11220
+    },
+    {
+      "epoch": 2.8329969727547932,
+      "grad_norm": 0.00010505354293854907,
+      "learning_rate": 2.7833837874201144e-06,
+      "loss": 0.0001,
+      "step": 11230
+    },
+    {
+      "epoch": 2.8355196770938447,
+      "grad_norm": 0.0005396956112235785,
+      "learning_rate": 2.7413387151025904e-06,
+      "loss": 0.0006,
+      "step": 11240
+    },
+    {
+      "epoch": 2.838042381432896,
+      "grad_norm": 0.03256835415959358,
+      "learning_rate": 2.699293642785066e-06,
+      "loss": 0.0006,
+      "step": 11250
+    },
+    {
+      "epoch": 2.8405650857719476,
+      "grad_norm": 4.0616308979224414e-05,
+      "learning_rate": 2.6572485704675414e-06,
+      "loss": 0.0005,
+      "step": 11260
+    },
+    {
+      "epoch": 2.843087790110999,
+      "grad_norm": 0.09430497884750366,
+      "learning_rate": 2.615203498150017e-06,
+      "loss": 0.0003,
+      "step": 11270
+    },
+    {
+      "epoch": 2.8456104944500504,
+      "grad_norm": 0.0003849182394333184,
+      "learning_rate": 2.573158425832493e-06,
+      "loss": 0.0,
+      "step": 11280
+    },
+    {
+      "epoch": 2.848133198789102,
+      "grad_norm": 0.0575651191174984,
+      "learning_rate": 2.5311133535149683e-06,
+      "loss": 0.0001,
+      "step": 11290
+    },
+    {
+      "epoch": 2.8506559031281533,
+      "grad_norm": 0.0015473919920623302,
+      "learning_rate": 2.489068281197444e-06,
+      "loss": 0.0007,
+      "step": 11300
+    },
+    {
+      "epoch": 2.8506559031281533,
+      "eval_loss": 0.0037519715260714293,
+      "eval_runtime": 20.8758,
+      "eval_samples_per_second": 84.404,
+      "eval_steps_per_second": 21.125,
+      "step": 11300
+    },
+    {
+      "epoch": 2.8531786074672048,
+      "grad_norm": 0.000955607567448169,
+      "learning_rate": 2.4470232088799194e-06,
+      "loss": 0.0011,
+      "step": 11310
+    },
+    {
+      "epoch": 2.855701311806256,
+      "grad_norm": 4.5967324695084244e-05,
+      "learning_rate": 2.4049781365623953e-06,
+      "loss": 0.0003,
+      "step": 11320
+    },
+    {
+      "epoch": 2.8582240161453076,
+      "grad_norm": 0.0006287918658927083,
+      "learning_rate": 2.362933064244871e-06,
+      "loss": 0.0008,
+      "step": 11330
+    },
+    {
+      "epoch": 2.860746720484359,
+      "grad_norm": 0.0023038501385599375,
+      "learning_rate": 2.3208879919273463e-06,
+      "loss": 0.0002,
+      "step": 11340
+    },
+    {
+      "epoch": 2.863269424823411,
+      "grad_norm": 0.00017235818086192012,
+      "learning_rate": 2.278842919609822e-06,
+      "loss": 0.0001,
+      "step": 11350
+    },
+    {
+      "epoch": 2.865792129162462,
+      "grad_norm": 0.0003580110496841371,
+      "learning_rate": 2.2367978472922973e-06,
+      "loss": 0.0001,
+      "step": 11360
+    },
+    {
+      "epoch": 2.868314833501514,
+      "grad_norm": 0.0013928780099377036,
+      "learning_rate": 2.1947527749747733e-06,
+      "loss": 0.0003,
+      "step": 11370
+    },
+    {
+      "epoch": 2.870837537840565,
+      "grad_norm": 0.11936355382204056,
+      "learning_rate": 2.152707702657249e-06,
+      "loss": 0.001,
+      "step": 11380
+    },
+    {
+      "epoch": 2.8733602421796167,
+      "grad_norm": 0.0003929549129679799,
+      "learning_rate": 2.1106626303397243e-06,
+      "loss": 0.0,
+      "step": 11390
+    },
+    {
+      "epoch": 2.875882946518668,
+      "grad_norm": 0.00011808017734438181,
+      "learning_rate": 2.0686175580222e-06,
+      "loss": 0.001,
+      "step": 11400
+    },
+    {
+      "epoch": 2.875882946518668,
+      "eval_loss": 0.0037574958987534046,
+      "eval_runtime": 20.8424,
+      "eval_samples_per_second": 84.539,
+      "eval_steps_per_second": 21.159,
+      "step": 11400
+    },
+    {
+      "epoch": 2.8784056508577196,
+      "grad_norm": 0.1294584721326828,
+      "learning_rate": 2.0265724857046758e-06,
+      "loss": 0.0006,
+      "step": 11410
+    },
+    {
+      "epoch": 2.880928355196771,
+      "grad_norm": 0.0016598176443949342,
+      "learning_rate": 1.9845274133871513e-06,
+      "loss": 0.0009,
+      "step": 11420
+    },
+    {
+      "epoch": 2.8834510595358225,
+      "grad_norm": 1.5323972547776066e-05,
+      "learning_rate": 1.9424823410696268e-06,
+      "loss": 0.0,
+      "step": 11430
+    },
+    {
+      "epoch": 2.885973763874874,
+      "grad_norm": 0.000323007203405723,
+      "learning_rate": 1.9004372687521023e-06,
+      "loss": 0.0001,
+      "step": 11440
+    },
+    {
+      "epoch": 2.8884964682139254,
+      "grad_norm": 0.0010820828611031175,
+      "learning_rate": 1.858392196434578e-06,
+      "loss": 0.001,
+      "step": 11450
+    },
+    {
+      "epoch": 2.891019172552977,
+      "grad_norm": 0.009386632591485977,
+      "learning_rate": 1.8163471241170535e-06,
+      "loss": 0.0,
+      "step": 11460
+    },
+    {
+      "epoch": 2.8935418768920282,
+      "grad_norm": 0.005150569602847099,
+      "learning_rate": 1.7743020517995292e-06,
+      "loss": 0.0001,
+      "step": 11470
+    },
+    {
+      "epoch": 2.8960645812310797,
+      "grad_norm": 5.517240060726181e-05,
+      "learning_rate": 1.7322569794820048e-06,
+      "loss": 0.0011,
+      "step": 11480
+    },
+    {
+      "epoch": 2.898587285570131,
+      "grad_norm": 0.16206330060958862,
+      "learning_rate": 1.6902119071644805e-06,
+      "loss": 0.001,
+      "step": 11490
+    },
+    {
+      "epoch": 2.9011099899091826,
+      "grad_norm": 0.00011850109876831993,
+      "learning_rate": 1.648166834846956e-06,
+      "loss": 0.001,
+      "step": 11500
+    },
+    {
+      "epoch": 2.9011099899091826,
+      "eval_loss": 0.0037340966518968344,
+      "eval_runtime": 20.8481,
+      "eval_samples_per_second": 84.516,
+      "eval_steps_per_second": 21.153,
+      "step": 11500
+    },
+    {
+      "epoch": 2.903632694248234,
+      "grad_norm": 0.0002364334650337696,
+      "learning_rate": 1.6061217625294317e-06,
+      "loss": 0.0,
+      "step": 11510
+    },
+    {
+      "epoch": 2.9061553985872854,
+      "grad_norm": 0.005173501092940569,
+      "learning_rate": 1.5640766902119072e-06,
+      "loss": 0.0005,
+      "step": 11520
+    },
+    {
+      "epoch": 2.908678102926337,
+      "grad_norm": 0.08454468101263046,
+      "learning_rate": 1.522031617894383e-06,
+      "loss": 0.0001,
+      "step": 11530
+    },
+    {
+      "epoch": 2.9112008072653888,
+      "grad_norm": 0.018247609958052635,
+      "learning_rate": 1.4799865455768585e-06,
+      "loss": 0.0004,
+      "step": 11540
+    },
+    {
+      "epoch": 2.9137235116044398,
+      "grad_norm": 0.08170945197343826,
+      "learning_rate": 1.4379414732593342e-06,
+      "loss": 0.0005,
+      "step": 11550
+    },
+    {
+      "epoch": 2.9162462159434916,
+      "grad_norm": 8.340697240782902e-05,
+      "learning_rate": 1.3958964009418097e-06,
+      "loss": 0.0006,
+      "step": 11560
+    },
+    {
+      "epoch": 2.9187689202825426,
+      "grad_norm": 0.0001018949769786559,
+      "learning_rate": 1.3538513286242854e-06,
+      "loss": 0.0009,
+      "step": 11570
+    },
+    {
+      "epoch": 2.9212916246215945,
+      "grad_norm": 0.0001912551961140707,
+      "learning_rate": 1.311806256306761e-06,
+      "loss": 0.0002,
+      "step": 11580
+    },
+    {
+      "epoch": 2.923814328960646,
+      "grad_norm": 0.0019354906398802996,
+      "learning_rate": 1.2697611839892367e-06,
+      "loss": 0.0,
+      "step": 11590
+    },
+    {
+      "epoch": 2.9263370332996974,
+      "grad_norm": 0.05732259526848793,
+      "learning_rate": 1.2277161116717122e-06,
+      "loss": 0.0013,
+      "step": 11600
+    },
+    {
+      "epoch": 2.9263370332996974,
+      "eval_loss": 0.0037303089629858732,
+      "eval_runtime": 20.8696,
+      "eval_samples_per_second": 84.429,
+      "eval_steps_per_second": 21.131,
+      "step": 11600
+    },
+    {
+      "epoch": 2.928859737638749,
+      "grad_norm": 0.04633721709251404,
+      "learning_rate": 1.185671039354188e-06,
+      "loss": 0.0002,
+      "step": 11610
+    },
+    {
+      "epoch": 2.9313824419778003,
+      "grad_norm": 0.000182849689736031,
+      "learning_rate": 1.1436259670366634e-06,
+      "loss": 0.0,
+      "step": 11620
+    },
+    {
+      "epoch": 2.9339051463168517,
+      "grad_norm": 9.736415813677013e-05,
+      "learning_rate": 1.1015808947191391e-06,
+      "loss": 0.0002,
+      "step": 11630
+    },
+    {
+      "epoch": 2.936427850655903,
+      "grad_norm": 0.00046163774095475674,
+      "learning_rate": 1.0595358224016146e-06,
+      "loss": 0.0011,
+      "step": 11640
+    },
+    {
+      "epoch": 2.9389505549949546,
+      "grad_norm": 5.9953119489364326e-05,
+      "learning_rate": 1.0174907500840902e-06,
+      "loss": 0.0003,
+      "step": 11650
+    },
+    {
+      "epoch": 2.941473259334006,
+      "grad_norm": 0.00025017280131578445,
+      "learning_rate": 9.754456777665659e-07,
+      "loss": 0.0011,
+      "step": 11660
+    },
+    {
+      "epoch": 2.9439959636730575,
+      "grad_norm": 0.142095148563385,
+      "learning_rate": 9.334006054490415e-07,
+      "loss": 0.0005,
+      "step": 11670
+    },
+    {
+      "epoch": 2.946518668012109,
+      "grad_norm": 0.00014950388867873698,
+      "learning_rate": 8.91355533131517e-07,
+      "loss": 0.0011,
+      "step": 11680
+    },
+    {
+      "epoch": 2.9490413723511604,
+      "grad_norm": 7.325205660890788e-05,
+      "learning_rate": 8.493104608139925e-07,
+      "loss": 0.0014,
+      "step": 11690
+    },
+    {
+      "epoch": 2.951564076690212,
+      "grad_norm": 0.00012016925757052377,
+      "learning_rate": 8.072653884964682e-07,
+      "loss": 0.0014,
+      "step": 11700
+    },
+    {
+      "epoch": 2.951564076690212,
+      "eval_loss": 0.0037244223058223724,
+      "eval_runtime": 20.8534,
+      "eval_samples_per_second": 84.495,
+      "eval_steps_per_second": 21.148,
+      "step": 11700
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 11892,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}